1 /********************************************************************\
2  * gnc-tokenizer.hpp - base class for converting a text file into a *
3  *                     two-dimensional vector of strings (table)    *
4  *                                                                  *
5  * This program is free software; you can redistribute it and/or    *
6  * modify it under the terms of the GNU General Public License as   *
7  * published by the Free Software Foundation; either version 2 of   *
8  * the License, or (at your option) any later version.              *
9  *                                                                  *
10  * This program is distributed in the hope that it will be useful,  *
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of   *
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    *
13  * GNU General Public License for more details.                     *
14  *                                                                  *
15  * You should have received a copy of the GNU General Public License*
16  * along with this program; if not, contact:                        *
17  *                                                                  *
18  * Free Software Foundation           Voice:  +1-617-542-5942       *
19  * 51 Franklin Street, Fifth Floor    Fax:    +1-617-542-2652       *
20  * Boston, MA  02110-1301,  USA       gnu@gnu.org                   *
21 \********************************************************************/
22 
23 /** @file
     @brief Class to convert a file into a vector of string vectors.
     This is a generic base class that holds the functionality common
     to the different specializations (e.g. a CSV file parser, a
     fixed-width file parser, ...).
     Child classes have to override the tokenize function to
     create a full tokenizer class.
30      *
31      gnc-tokenizer.hpp
32      @author Copyright (c) 2015 Geert Janssens <geert@kobaltwit.be>
33  */
34 
35 #ifndef GNC_TOKENIZER_HPP
36 #define GNC_TOKENIZER_HPP
37 
38 extern "C" {
39 #include <config.h>
40 }
41 
42 #include <iostream>
43 #include <fstream>      // fstream
44 #include <vector>
45 #include <string>
46 #include <memory>
47 
/** A sequence of strings; used below as the element type of the
 *  two-dimensional token table (std::vector<StrVec>). */
using StrVec = std::vector<std::string>;
49 
/** File formats known to this importer.
 *
 *  Passed to gnc_tokenizer_factory() to select a tokenizer.
 */
enum class GncImpFileFormat {
    UNKNOWN,      ///< None of the formats listed below.
    CSV,          ///< CSV file.
    FIXED_WIDTH   ///< Fixed-width file.
};
56 
57 class GncTokenizerTest;
58 
59 class GncTokenizer
60 {
61 friend GncTokenizerTest;
62 public:
63     GncTokenizer() = default;                               // default constructor
64     GncTokenizer(const GncTokenizer&) = default;            // copy constructor
65     GncTokenizer& operator=(const GncTokenizer&) = default; // copy assignment
66     GncTokenizer(GncTokenizer&&) = default;                 // move constructor
67     GncTokenizer& operator=(GncTokenizer&&) = default;      // move assignment
68     virtual ~GncTokenizer() = default;                      // destructor
69 
70     virtual void load_file(const std::string& path);
71     const std::string& current_file();
72     void encoding(const std::string& encoding);
73     const std::string& encoding();
74     virtual int  tokenize() = 0;
75     const std::vector<StrVec>& get_tokens();
76 
77 protected:
78     std::string m_utf8_contents;
79     std::vector<StrVec> m_tokenized_contents;
80 
81 private:
82     std::string m_imp_file_str;
83     std::string m_raw_contents;
84     std::string m_enc_str;
85 };
86 
87 
88 // Function to instantiate specializations of the GncTokenizer
89 std::unique_ptr<GncTokenizer> gnc_tokenizer_factory(GncImpFileFormat fmt);
90 
91 #endif
92