1 /********************************************************************\ 2 * gnc-tokenizer.hpp - base class for converting a text file into a * 3 * two-dimensional vector of strings (table) * 4 * * 5 * This program is free software; you can redistribute it and/or * 6 * modify it under the terms of the GNU General Public License as * 7 * published by the Free Software Foundation; either version 2 of * 8 * the License, or (at your option) any later version. * 9 * * 10 * This program is distributed in the hope that it will be useful, * 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 13 * GNU General Public License for more details. * 14 * * 15 * You should have received a copy of the GNU General Public License* 16 * along with this program; if not, contact: * 17 * * 18 * Free Software Foundation Voice: +1-617-542-5942 * 19 * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 * 20 * Boston, MA 02110-1301, USA gnu@gnu.org * 21 \********************************************************************/ 22 23 /** @file 24 @brief Class convert a file into vector of string vectors. 25 This is a generic base class that holds the functionality common 26 to different specializations (eg a csv file parser, a fixed-width 27 file parser,...) 28 The child classes have to override the tokenize function to 29 create a full tokenizer class. 30 * 31 gnc-tokenizer.hpp 32 @author Copyright (c) 2015 Geert Janssens <geert@kobaltwit.be> 33 */ 34 35 #ifndef GNC_TOKENIZER_HPP 36 #define GNC_TOKENIZER_HPP 37 38 extern "C" { 39 #include <config.h> 40 } 41 42 #include <iostream> 43 #include <fstream> // fstream 44 #include <vector> 45 #include <string> 46 #include <memory> 47 48 using StrVec = std::vector<std::string>; 49 50 /** Enumeration for file formats supported by this importer. */ 51 enum class GncImpFileFormat { 52 UNKNOWN, 53 CSV, 54 FIXED_WIDTH 55 }; 56 57 class GncTokenizerTest; 58 59 class GncTokenizer 60 { 61 friend GncTokenizerTest; 62 public: 63 GncTokenizer() = default; // default constructor 64 GncTokenizer(const GncTokenizer&) = default; // copy constructor 65 GncTokenizer& operator=(const GncTokenizer&) = default; // copy assignment 66 GncTokenizer(GncTokenizer&&) = default; // move constructor 67 GncTokenizer& operator=(GncTokenizer&&) = default; // move assignment 68 virtual ~GncTokenizer() = default; // destructor 69 70 virtual void load_file(const std::string& path); 71 const std::string& current_file(); 72 void encoding(const std::string& encoding); 73 const std::string& encoding(); 74 virtual int tokenize() = 0; 75 const std::vector<StrVec>& get_tokens(); 76 77 protected: 78 std::string m_utf8_contents; 79 std::vector<StrVec> m_tokenized_contents; 80 81 private: 82 std::string m_imp_file_str; 83 std::string m_raw_contents; 84 std::string m_enc_str; 85 }; 86 87 88 // Function to instantiate specializations of the GncTokenizer 89 std::unique_ptr<GncTokenizer> gnc_tokenizer_factory(GncImpFileFormat fmt); 90 91 #endif 92