1 /***************************************************************************
2  *   Copyright (c) 2005-2020 by Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6)  *
3  *   info_at_agrum_dot_org                                               *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20 /** @file
21  * @brief Class for fast parsing of CSV file (never more than one line in
22  * application memory)
23  *
24  * Typical use :
25  * @code
26  * // open the CSV file
 * std::string filename = "foo.csv";
 * std::ifstream in(filename.c_str());
 * gum::learning::CSVParser<> csvp(in, filename);
30  *
31  * // read each line in the CSV file
32  * while (csvp.next()) {
33  *   csvp.current ();
34  * }
35  *
36  * in.close();
37  * @endcode
38  *
39  * @author Pierre-Henri WUILLEMIN(_at_LIP6) & Christophe GONZALES(_at_AMU)
40  *
41  */
42 
43 #ifndef GUM_CSV_PARSER_H
44 #define GUM_CSV_PARSER_H
45 
46 #include <istream>
47 #include <string>
48 #include <vector>
49 
50 #include <agrum/agrum.h>
51 
namespace gum {

  namespace learning {

    /** @class CSVParser
     * @ingroup learning_database
     * @headerfile CSVParser.h <agrum/tools/database/CSVParser.h>
     * @brief Class for fast parsing of CSV data read from an input stream
     * (never more than one line in application memory)
     *
     * Each call to next() fetches and parses one more line of the stream;
     * the resulting tokens are then available through current().
     *
     * Typical use:
     * @code
     * // open the CSV file
     * std::string   filename = "foo.csv";
     * std::ifstream in(filename.c_str());
     * gum::learning::CSVParser<> csvp(in, filename);
     *
     * // read each line in the CSV file
     * while (csvp.next()) {
     *   const auto& fields = csvp.current();
     *   // ... use fields ...
     * }
     *
     * in.close();
     * @endcode
     *
     * @tparam ALLOC the allocator template used by the vector that stores
     * the strings of the current parsed line
     */
    template < template < typename > class ALLOC = std::allocator >
    class CSVParser {
      public:
      /// type for the allocators passed in arguments of methods
      using allocator_type = ALLOC< std::string >;


      // ##########################################################################
      /// @name Constructors / Destructors
      // ##########################################################################
      /// @{

      /// constructor attaching the parser to an input stream
      /** There is no default constructor: both the stream and the filename
       * must be provided.
       *
       * @param in an input stream containing the CSV
       * NOTE(review): the parser stores a std::istream pointer, so the
       * stream presumably has to outlive the parser -- confirm in
       * CSVParser_tpl.h.
       * @param filename the name associated with the stream. It is stored in
       * a const member, hence it is fixed for the whole lifetime of the
       * parser. NOTE(review): presumably used in error messages -- confirm
       * in CSVParser_tpl.h.
       * @param delimiter the string that acts as the column separator in
       * the CSV. NOTE(review): whether a multi-character string is treated
       * as one separator or as a set of separator characters is decided in
       * CSVParser_tpl.h.
       * @param commentmarker the character that marks the beginning of a comment
       * @param quoteMarker the character that is used to quote the sentences
       * in the CSV
       * @param alloc the allocator used by all the methods
       */
      CSVParser(std::istream&         in,
                const std::string&    filename,
                const std::string&    delimiter     = ",",
                const char            commentmarker = '#',
                const char            quoteMarker   = '"',
                const allocator_type& alloc         = allocator_type());

      /// destructor
      virtual ~CSVParser();

      /// @}


      // ########################################################################
      /// @name Accessors / Modifiers
      // ########################################################################
      /// @{

      /// gets the next line of the csv stream and parses it
      /** @return false if there is no next line
       */
      bool next();

      /// returns the current parsed line
      /** @return the strings of the line parsed by the last call to next()
       * @throw NullElement is raised if there is no data
       */
      const std::vector< std::string, ALLOC< std::string > >& current() const;

      /// returns the current line number within the stream
      /** NOTE(review): the top-level const on the by-value return type has no
       * effect for callers. It is kept as is because the out-of-line
       * definition has to match this declaration exactly.
       */
      const std::size_t nbLine() const;

      /// reopens a new input stream to parse
      /** This method takes no filename: the filename given at construction is
       * a const member and therefore cannot be updated here.
       *
       * @param in the new input stream containing the CSV
       * @param delimiter the string that acts as the column separator in
       * the CSV
       * @param commentmarker the character that marks the beginning of a comment
       * @param quoteMarker the character that is used to quote the sentences
       * in the CSV
       */
      void useNewStream(std::istream&      in,
                        const std::string& delimiter     = ",",
                        const char         commentmarker = '#',
                        const char         quoteMarker   = '"');

      /// @}


#ifndef DOXYGEN_SHOULD_SKIP_THIS

      private:
      // Parsing helpers. NOTE(review): their exact semantics are defined in
      // CSVParser_tpl.h, which is not part of this header.
      void _getNextTriplet_(const std::string& str,
                            std::size_t&       first_letter_token,
                            std::size_t&       next_token,
                            std::size_t&       last_letter_token,
                            std::size_t        from) const;

      void _tokenize_(const std::string& str);

      std::size_t _correspondingQuoteMarker_(const std::string& str, std::size_t pos) const;


      std::string _line_;
      std::string _delimiter_;
      std::string _spaces_;
      std::string _delimiterPlusSpaces_;

      // The scalar and pointer members below carry default member
      // initializers so that they never hold an indeterminate value; any
      // constructor mem-initializer still takes precedence over them. The
      // marker defaults mirror the default arguments of the constructor.
      std::size_t _nbLine_{0};
      char        _commentMarker_{'#'};
      char        _quoteMarker_{'"'};
      bool        _emptyData_{true};

      // non-owning-looking raw pointer to the stream being parsed
      // (NOTE(review): ownership is not visible from this header)
      std::istream*                                    _instream_{nullptr};
      std::vector< std::string, ALLOC< std::string > > _data_;
      // const: fixed at construction, which also disables copy assignment
      const std::string                                _filename_;

#endif /* DOXYGEN_SHOULD_SKIP_THIS */
    };

  }   // namespace learning

}   // namespace gum
171 
172 #include <agrum/tools/database/CSVParser_tpl.h>
173 
174 #endif   // GUM_CSV_PARSER_H
175