1 /***************************************************************************
2  *   Copyright (c) 2005-2020 by Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6)  *
3  *   info_at_agrum_dot_org                                               *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20 /** @file
21  * @brief The class for parsing DatabaseTable rows and generating output rows
22  *
23  * @author Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6)
24  */
25 #ifndef GUM_LEARNING_DB_ROW_GENERATOR_PARSER_H
26 #define GUM_LEARNING_DB_ROW_GENERATOR_PARSER_H
27 
28 #include <limits>
29 
30 #include <agrum/agrum.h>
31 #include <agrum/tools/database/DBHandler.h>
32 #include <agrum/tools/database/databaseTable.h>
33 #include <agrum/tools/database/DBRowGeneratorSet.h>
34 
35 namespace gum {
36 
37   namespace learning {
38 
39     /** @class DBRowGeneratorParser
40      * @headerfile DBRowGeneratorParser.h <agrum/tools/database/DBRowGeneratorParser.h>
41      * @ingroup learning_database
42      * @brief the class used to read a row in the database and to transform it
43      * into a set of DBRow instances that can be used for learning.
44      *
45      * A DBRowGeneratorParser contains a handler on a DatabaseTable that enables
46      * it to parse DBRows contained in the DatabaseTable. It also contains a
47      * DBRowGeneratorSet that is used to create output rows for each parsed
48      * DBRow. Note that if the DBRowGeneratorSet is empty, then
49      * DBRowGeneratorParser simply outputs each parsed DBRow without additional
50      * processing. To understand the difference between a DBRowGeneratorParser
51      * and a DBRowGeneratorSet, the latter is designed to take as input only
52      * one DBRow instance and to produce some output DBRow instances, whereas
53      * the former is designed to parse the content of a DatabaseTable and to
54      * produce from them some output DBRow instances.
55      *
56      * @par Usage example:
57      * @code
58      * // create and fill a database
59      * gum::learning::DatabaseTable<> database ( ... );
60      * .....
61      *
62      * // create a vector with the types of the columns of database
63      * const std::vector<gum::learning::DBTranslatedValueType>
64      *   col_types ( 10, gum::learning::DBTranslatedValueType::DISCRETE );
65      *
66      * // create a generator set
67      * gum::learning::MyGenerator<>  generator1 ( col_types, 6 );
68      * gum::learning::MyGenerator2<> generator2 ( col_types, 4 );
69      * gum::learning::DBRowGeneratorSet<> genset;
70      * genset.insertGenerator ( generator1 );
71      * genset.insertGenerator ( generator2 );
72      *
73      * // create the DBRowGeneratorParser
74      * gum::learning::DBRowGeneratorParser<>
75      *   parser ( database.handler (), genset );
76      *
77      * // use the parser to parse all the database and to apply all the
78      * // transformations induced by generator1 and generator2
79      * while ( parser.hasRows () ) {
80      *   const auto& dbrow = parser.row();
81      *   // do something with dbrow
82      * }
83      * @endcode
84      */
85     template <template<typename> class ALLOC = std::allocator>
86     class DBRowGeneratorParser {
87       public:
88 
89       /// type for the allocators passed in arguments of methods
90       using allocator_type = ALLOC<DBTranslatedValue>;
91 
92       // ##########################################################################
93       /// @name Constructors / Destructors
94       // ##########################################################################
95 
96       /// @{
97 
98       /// constructor from a database handler and a set of row generators
99       DBRowGeneratorParser( const typename DatabaseTable<ALLOC>::Handler& handler,
100                             const DBRowGeneratorSet<ALLOC>& generator_set,
101                             const allocator_type& alloc = allocator_type () );
102 
103       /// copy constructor
104       DBRowGeneratorParser( const DBRowGeneratorParser<ALLOC>& from );
105 
106       /// copy constructor with a given allocator
107       DBRowGeneratorParser( const DBRowGeneratorParser<ALLOC>& from,
108                             const allocator_type& alloc );
109 
110       /// move constructor
111       DBRowGeneratorParser(DBRowGeneratorParser<ALLOC>&& filter);
112 
113       /// move constructor with a given allocator
114       DBRowGeneratorParser(DBRowGeneratorParser<ALLOC>&& filter,
115                            const allocator_type& alloc );
116 
117       /// virtual copy constructor
118       virtual DBRowGeneratorParser<ALLOC>* clone () const;
119 
120       /// virtual copy constructor with a given allocator
121       virtual DBRowGeneratorParser<ALLOC>*
122       clone (const allocator_type& alloc) const;
123 
124       /// destructor
125       virtual ~DBRowGeneratorParser();
126 
127       /// @}
128 
129       // ##########################################################################
130       /// @name Operators
131       // ##########################################################################
132 
133       /// @{
134 
135       /// copy operator
136       DBRowGeneratorParser<ALLOC>&
137       operator=(const DBRowGeneratorParser<ALLOC>& from );
138 
139       /// move operator
140       DBRowGeneratorParser<ALLOC>&
141       operator=(DBRowGeneratorParser<ALLOC>&& from );
142 
143       /// @}
144 
145       // ##########################################################################
146       /// @name Accessors / Modifiers
147       // ##########################################################################
148 
149       /// @{
150 
151       /** @brief returns true if there are still rows that can be output by the
152        * DBRowGeneratorParser
153        *
154        * The usual way of calling this method is to encapsulate it into a while
155        * loop whose stopping condition is when the handler has no more rows.
156        * This loop shall be inside a try-catch statement that makes it possible
157        * to stop the loop properly when the NotFound exception is raised. In most
158        * practical cases, this exception will never be raised, but if you use
159        * a row generator that may return 0 rows (say, for instance, an
160        * intelligent EM that does not return any row when there are too many
161        * missing data) and if the last rows of the database are such that this
162        * generator will return no row, then the exception will be raised.
163        * Actually, it is not efficient to parse all the database to detect such
164        * a case before trying to return the rows, especially because this
165        * situation is very unlikely to occur. So correct code using methods
166        * hasRows () and row () looks like:
167        * @code
168        * try {
169        *   while ( parser.hasRows () ) {
170        *     const auto& row = parser.row ();
171        *     do_whatever_you_want_with_the_row... ;
172        *   }
173        * }
174        * catch ( NotFound& ) { } // stop, there are no more rows to process
175        * @endcode
176        */
177       bool hasRows();
178 
179       /// returns a new output row with its corresponding weight
180       /** The usual way of calling this method is to encapsulate it into a while
181        * loop whose stopping condition is when the handler has no more rows.
182        * This loop shall be inside a try-catch statement that makes it possible
183        * to stop the loop properly when the NotFound exception is raised. In most
184        * practical cases, this exception will never be raised, but if you use
185        * a row generator that may return 0 rows (say, for instance, an
186        * intelligent EM that does not return any row when there are too many
187        * missing data) and if the last rows of the database are such that this
188        * generator will return no row, then the exception will be raised.
189        * Actually, it is not efficient to parse all the database to detect such
190        * a case before trying to return the rows, especially because this
191        * situation is very unlikely to occur. So correct code using methods
192        * hasRows () and row () looks like:
193        * @code
194        * try {
195        *   while ( parser.hasRows () ) {
196        *     const auto& row = parser.row ();
197        *     do_whatever_you_want_with_the_row... ;
198        *   }
199        * }
200        * catch ( NotFound& ) { } // stop, there are no more rows to process
201        * @endcode
202        */
203       const DBRow<DBTranslatedValue,ALLOC>& row ();
204 
205       /// resets the parser
206       void reset();
207 
208       /// returns the handler used by the parser
209       typename DatabaseTable<ALLOC>::Handler& handler();
210 
211       /// returns the handler used by the parser (read-only access)
212       const typename DatabaseTable<ALLOC>::Handler& handler() const;
213 
214       /// returns a reference on the database
215       const DatabaseTable<ALLOC>& database () const;
216 
217       /// returns the generator set that is actually used
218       DBRowGeneratorSet<ALLOC>& generatorSet();
219 
220       /// returns the generator set that is actually used (read-only access)
221       const DBRowGeneratorSet<ALLOC>& generatorSet() const;
222 
223       /// sets the area in the database the handler will handle
224       /** In addition to setting the area that will be parsed by the handler,
225        * this method makes the handler point to the beginning of the area.
226        * @param begin the first row to be handled
227        * @param end the handler handles rows in interval [begin,end). Thus,
228        * the endth row is not included in the set of rows handled.
229        * @warning if begin is greater than end, these values are swapped.
230        * @throw NullElement is raised if the handler does not point to
231        * any database
232        * @throw SizeError is raised if end is greater than the number of
233        * rows of the database */
234       void setRange(std::size_t begin, std::size_t end);
235 
236       /** @brief sets the columns of interest: the output DBRow needs only
237        * contain values for these columns
238        *
239        * This method is useful, e.g., for EM-like algorithms that need to know
240        * which unobserved variables/values need to be filled.
241        *
242        * @throw OperationNotAllowed is raised if the generator set has already
243        * started generating output rows and is currently in a state where the
244        * generation is not completed yet (i.e., we still need to call the
245        * generate() method to complete it). */
246       void setColumnsOfInterest (
247         const std::vector<std::size_t,ALLOC<std::size_t>>& cols_of_interest );
248 
249       /** @brief sets the columns of interest: the output DBRow needs only
250        * contain values for these columns
251        *
252        * This method is useful, e.g., for EM-like algorithms that need to know
253        * which unobserved variables/values need to be filled.
254        *
255        * @throw OperationNotAllowed is raised if the generator set has already
256        * started generating output rows and is currently in a state where the
257        * generation is not completed yet (i.e., we still need to call the
258        * generate() method to complete it). */
259       void setColumnsOfInterest (
260         std::vector<std::size_t,ALLOC<std::size_t>>&& cols_of_interest );
261 
262       /// assigns a new Bayes net to all the generators that depend on a BN
263       /** Typically, generators based on EM or K-means depend on a model to
264        * compute correctly their outputs. Method setBayesNet makes it possible to
265        * update their BN model.
266        * @warning if one generator that relies on Bayes nets cannot be assigned
267        * new_bn, then no generator is updated and an exception is raised. */
268       template < typename GUM_SCALAR >
269       void setBayesNet (const BayesNet<GUM_SCALAR>& new_bn);
270 
271       /// returns the allocator used
272       allocator_type getAllocator () const;
273 
274       /// @}
275 
276 
277     private:
278 
279       /// the handler that is really used to parse the database
280       typename DatabaseTable<ALLOC>::Handler  _handler_;
281 
282       /// the set of DBRow generators (might be empty)
283       DBRowGeneratorSet<ALLOC>  _generator_set_;
284 
285       /// the size of the generator set
286       std::size_t  _generator_size_;
287 
288     };
289 
290   } /* namespace learning */
291 
292 } /* namespace gum */
293 
294 // always include the template implementation
295 #include <agrum/tools/database/DBRowGeneratorParser_tpl.h>
296 
297 #endif /* GUM_LEARNING_DB_ROW_GENERATOR_PARSER_H */
298