1 #ifndef UTIL___ROW_READER_EXCEL_CSV__HPP 2 #define UTIL___ROW_READER_EXCEL_CSV__HPP 3 4 /* $Id: row_reader_excel_csv.hpp 564203 2018-05-23 12:13:23Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Authors: Denis Vakatov, Sergey Satskiy 30 * 31 * File Description: 32 * Implementation of the CRowReader<> traits for MS EXCEL CSV 33 * 34 * =========================================================================== 35 */ 36 37 #include <util/row_reader_char_delimited.hpp> 38 39 40 BEGIN_NCBI_SCOPE 41 42 43 /// Note 1: Empty rows are allowed and treated as 0 fields rows 44 /// Note 2: Both CRLF and LF are allowed 45 /// Note 3: Number of fields is not enforced 46 /// Note 4: There is no formal MS Excel CSV spec. So the implementation is 47 /// based on experiments made on MS Excel 2013. 48 /// See the description in JIRA: CXX-9221 49 /// Note 5: Two field cases in a data source 50 /// - empty, i.e. ,, 51 /// - "", i.e. ,"", 52 /// are translated to a Null field 53 /// Note 6: trailing Null fields in a data source are stripped 54 55 56 const CTempString kNullFieldRepresentation = CTempString("\"\"", 2); 57 58 59 /// MS Excel CSV traits. 60 class CRowReaderStream_Excel_CSV : public TRowReaderStream_SingleCommaDelimited 61 { 62 public: CRowReaderStream_Excel_CSV()63 CRowReaderStream_Excel_CSV() 64 { 65 m_LineSeparator.reserve(2); 66 m_PreviousLineSeparator.reserve(2); 67 } 68 69 // It could be more than one raw line in one row ReadRowData(CNcbiIstream & is,string * data)70 size_t ReadRowData(CNcbiIstream& is, string* data) 71 { 72 data->clear(); 73 m_Tokens.clear(); 74 75 size_t current_index= 0; 76 size_t token_begin_index = 0; 77 size_t lines_read = 0; 78 bool in_quotes = false; 79 for (;;) { 80 x_ReadOneLine(is, data, lines_read > 0); 81 ++lines_read; 82 83 while (current_index < data->size()) { 84 auto current_char = (*data)[current_index]; 85 if (current_char == ',') { 86 if (!in_quotes) { 87 m_Tokens.emplace_back(token_begin_index); 88 token_begin_index = current_index + 1; 89 } 90 } else if (current_char == '"') { 91 if (token_begin_index == current_index) { 92 in_quotes = true; 93 } else { 94 if (in_quotes) { 95 if (current_index + 1 < data->size() && 96 (*data)[current_index + 1] == '"') { 97 ++current_index; 98 } else { 99 in_quotes = false; 100 } 101 } 102 } 103 } 104 105 ++current_index; 106 } 107 108 if (!in_quotes) 109 break; 110 111 // Here: need to read one more line because of the double quotes. 112 // So check if we still can read. 113 if (!bool(is)) 114 break; 115 } 116 117 m_Tokens.push_back(token_begin_index); 118 return lines_read; 119 } 120 OnNextLine(CTempString)121 ERR_Action OnNextLine(CTempString /* raw_line */) 122 { 123 return eRR_Continue_Data; 124 } 125 126 127 // The tokenization is actually done in the ReadRowData() member Tokenize(const CTempString raw_line,vector<CTempString> & tokens)128 ERR_Action Tokenize(const CTempString raw_line, 129 vector<CTempString>& tokens) 130 { 131 // Special case in accordance with CXX-9221: empty line => no fields 132 if (!raw_line.empty()) { 133 size_t field_size; 134 for (TFieldNo field_no = 0; 135 field_no < m_Tokens.size(); ++field_no) { 136 if (field_no + 1 < m_Tokens.size()) 137 field_size = m_Tokens[field_no + 1] - m_Tokens[field_no] - 1; 138 else 139 field_size = raw_line.size() - m_Tokens[field_no]; 140 tokens.emplace_back(raw_line.data() + m_Tokens[field_no], 141 field_size); 142 } 143 144 x_StripTrailingNullFields(tokens); 145 } 146 return eRR_Continue_Data; 147 } 148 Validate(CTempString raw_line,ERR_FieldValidationMode field_validation_mode)149 ERR_Action Validate(CTempString raw_line, 150 ERR_FieldValidationMode field_validation_mode) 151 { 152 if (field_validation_mode == eRR_NoFieldValidation) 153 return eRR_Skip; 154 if (m_FieldsToValidate.empty()) 155 return eRR_Skip; 156 157 if (raw_line.empty()) 158 return eRR_Skip; 159 160 // Here: the field values need to be validated and there is some type 161 // information 162 m_ValidationTokens.clear(); 163 ERR_Action action = this->Tokenize(raw_line, m_ValidationTokens); 164 165 if (action == eRR_Skip) 166 return eRR_Skip; 167 168 for (const auto& info : m_FieldsToValidate) { 169 if (info.first < m_Tokens.size()) { 170 string translated; 171 ERR_TranslationResult translation_result = 172 this->Translate((TFieldNo)info.first, m_ValidationTokens[info.first], translated); 173 if (translation_result == eRR_UseOriginal) { 174 CRR_Util::ValidateBasicTypeFieldValue( 175 m_ValidationTokens[info.first], 176 info.second.first, info.second.second); 177 } else { 178 CRR_Util::ValidateBasicTypeFieldValue( 179 translated, info.second.first, info.second.second); 180 } 181 } 182 } 183 return eRR_Skip; 184 } 185 Translate(TFieldNo,const CTempString raw_value,string & translated_value)186 ERR_TranslationResult Translate(TFieldNo /* field_no */, 187 const CTempString raw_value, 188 string& translated_value) 189 { 190 if (x_IsNull(raw_value)) 191 return eRR_Null; 192 193 if (raw_value[0] == '=') { 194 size_t dbl_quote_cnt = 0; 195 for (size_t index = 0; index < raw_value.size(); ++index) 196 if (raw_value[index] == '"') 197 ++dbl_quote_cnt; 198 199 if (dbl_quote_cnt == 0) { 200 translated_value = string(raw_value.data() + 1, 201 raw_value.size() - 1); 202 return eRR_Translated; 203 } 204 205 // Here: there are " in the field. They may need to be stripped 206 // together with = if: 207 // - " follows = immediately 208 // - " is the last character in a field 209 // - there is an even number of " 210 // If so then "" need to be translated into " inside the field 211 // as well 212 if (dbl_quote_cnt % 2 == 0) { 213 if (raw_value[1] == '"' && 214 raw_value[raw_value.size() - 1] == '"') { 215 // Balanced double quote and poperly surround the field 216 // value => strip the = and surrounding " plus replace 217 // "" with " 218 translated_value = string(raw_value.data() + 2, 219 raw_value.size() - 3); 220 NStr::ReplaceInPlace(translated_value, "\"\"", "\""); 221 return eRR_Translated; 222 } 223 } 224 225 // Non balanced double quotes or they are not surrounding the 226 // value after = 227 // There is no translation for this case 228 return eRR_UseOriginal; 229 } 230 231 if (raw_value[0] == '"') { 232 size_t match_index = 1; 233 for (; match_index < raw_value.size(); ++match_index) { 234 if (raw_value[match_index] == '"') { 235 if (match_index + 1< raw_value.size() && 236 raw_value[match_index + 1] == '"') 237 ++match_index; 238 else 239 break; 240 } 241 } 242 243 // Here: match_index points beyond of the field or to a 244 // matching " 245 if (match_index < raw_value.size()) { 246 // matching " found 247 translated_value = string(raw_value.data() + 1, 248 match_index - 1); 249 NStr::ReplaceInPlace(translated_value, "\"\"", "\""); 250 if (match_index < raw_value.size() - 1) { 251 // tail of the field needs to ba attached as is 252 translated_value.append( 253 raw_value.data() + match_index + 1, 254 raw_value.size() - match_index - 1); 255 } 256 } else { 257 // Unbalanced " case 258 translated_value = string(raw_value.data() + 1, 259 raw_value.size() - 1); 260 } 261 262 // This could be a case with a leading = which may need to be 263 // stripped as well... 264 if (!translated_value.empty()) { 265 if (translated_value[0] == '=') { 266 size_t dbl_quote_cnt = 0; 267 for (size_t index = 0; 268 index < translated_value.size(); ++index) 269 if (translated_value[index] == '"') 270 ++dbl_quote_cnt; 271 272 if (dbl_quote_cnt > 0 && (dbl_quote_cnt % 2 == 0)) { 273 if (translated_value[1] == '"' && 274 translated_value[translated_value.size() - 1] == '"') { 275 translated_value = translated_value.substr(2, translated_value.size() - 3); 276 } 277 } 278 } 279 } 280 281 return eRR_Translated; 282 } 283 return eRR_UseOriginal; 284 } 285 OnEvent(ERR_Event event,ERR_EventMode event_mode)286 ERR_EventAction OnEvent(ERR_Event event, 287 ERR_EventMode event_mode) 288 { 289 switch (event) { 290 case eRR_Event_SourceBegin: 291 GetMyStream().x_ClearTraitsProvidedFieldsInfo(); 292 293 if (event_mode == eRR_EventMode_Validating) 294 x_GetFieldTypesToValidate(); 295 296 // fall through 297 case eRR_Event_SourceEnd: 298 case eRR_Event_SourceError: 299 default: 300 ; 301 } 302 return eRR_EventAction_Default; 303 } 304 305 private: x_ReadOneLine(CNcbiIstream & is,string * data,bool joining)306 void x_ReadOneLine(CNcbiIstream& is, string* data, bool joining) 307 { 308 m_RawLine.clear(); 309 std::getline(is, m_RawLine); 310 m_LineSeparator = "\n"; 311 if(!m_RawLine.empty() && m_RawLine.back() == '\r') { 312 m_RawLine.pop_back(); 313 m_LineSeparator = "\r\n"; 314 } 315 316 if (joining) 317 data->append(m_PreviousLineSeparator); 318 data->append(m_RawLine); 319 320 m_PreviousLineSeparator = m_LineSeparator; 321 } 322 x_GetFieldTypesToValidate(void)323 void x_GetFieldTypesToValidate(void) 324 { 325 m_FieldsToValidate.clear(); 326 for (const auto& info : GetMyStream().GetFieldsMetaInfo()) { 327 if (info.is_type_initialized) { 328 auto field_type = info.type.GetType(); 329 if (field_type == eRR_Boolean || field_type == eRR_Integer || 330 field_type == eRR_Double || field_type == eRR_DateTime) 331 m_FieldsToValidate[info.field_no] = 332 make_pair(field_type, info.type.GetProps()); 333 } 334 } 335 } 336 x_IsNull(const CTempString & raw_field_value)337 bool x_IsNull(const CTempString& raw_field_value) 338 { 339 return raw_field_value.empty() || 340 (raw_field_value == kNullFieldRepresentation); 341 } 342 x_StripTrailingNullFields(vector<CTempString> & tokens)343 void x_StripTrailingNullFields(vector<CTempString>& tokens) 344 { 345 while (!tokens.empty()) { 346 if (x_IsNull(tokens.back())) 347 tokens.pop_back(); 348 else 349 break; 350 } 351 } 352 353 private: 354 vector<size_t> m_Tokens; 355 string m_LineSeparator; 356 string m_PreviousLineSeparator; 357 string m_RawLine; 358 359 map<size_t, pair<ERR_FieldType, string>> m_FieldsToValidate; 360 vector<CTempString> m_ValidationTokens; 361 362 RR_TRAITS_PARENT_STREAM(CRowReaderStream_Excel_CSV); 363 }; 364 365 366 367 END_NCBI_SCOPE 368 369 #endif /* UTIL___ROW_READER_EXCEL_CSV__HPP */ 370