1 /**
2  *
3  *   Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(_at_LIP6) & Christophe GONZALES(_at_AMU)
4  *   info_at_agrum_dot_org
5  *
6  *  This library is free software: you can redistribute it and/or modify
7  *  it under the terms of the GNU Lesser General Public License as published by
8  *  the Free Software Foundation, either version 3 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This library is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU Lesser General Public License for more details.
15  *
16  *  You should have received a copy of the GNU Lesser General Public License
17  *  along with this library.  If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The databases' cell translators for continuous variables
24  *
25  * @author Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6)
26  */
27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_CONTINUOUS_VARIABLE_H
28 #define GUM_LEARNING_DB_TRANSLATOR_4_CONTINUOUS_VARIABLE_H
29 
30 #include <string>
31 
32 #include <agrum/agrum.h>
33 #include <agrum/tools/core/hashTable.h>
34 #include <agrum/tools/database/DBTranslator.h>
35 #include <agrum/tools/variables/continuousVariable.h>
36 
37 
38 namespace gum {
39 
40   namespace learning {
41 
42 
43     /** @class DBTranslator4ContinuousVariable
44      * @headerfile DBTranslator4ContinuousVariable.h <agrum/tools/database/DBTranslator4ContinuousVariable.h>
45      * @brief The databases' cell translators for continuous variables
46      *
47      * Translators are used by DatabaseTable instances to transform datasets'
48      * strings into DBTranslatedValue instances. The point is that strings are
49      * not adequate for fast learning, they need to be preprocessed into a type
50      * that can be analyzed quickly (the so-called DBTranslatedValue type).
51      *
52      * A DBTranslator4ContinuousVariable is a translator that contains and
53      * exploits a ContinuousVariable for translations. Each time a string needs
     * to be translated, we ask the ContinuousVariable whether it belongs to its
55      * domain (which is supposed to be of type [x_min,x_max]). If this is the
56      * case, then the DBTranslatedValue corresponding to the translation of the
57      * string contains the floating point number specified in the string.
58      *
59      * @par Here is an example of how to use this class:
60      * @code
61      * // create the translator, with possible missing symbols: "N/A" and "???"
62      * // i.e., each time the translator reads a "N/A" or a "???" string, it
63      * // won't translate it into a number but into a missing value.
64      * std::vector<std::string> missing { "N/A", "???" };
65      * gum::learning::DBTranslator4ContinuousVariable<> translator ( missing );
66      *
67      * // gets the DBTranslatedValue corresponding to some strings
68      * auto val1 = translator.translate("5");   // val1 = DBTranslatedValue {5.0f}
     * auto val2 = translator.translate("4.2"); // val2 = DBTranslatedValue {4.2f}
70      * auto val3 = translator << "3.4";         // val3 = DBTranslatedValue {3.4f}
71      *
72      * // add the numbers assigned to val1, val2, val3
73      * float sum = val1.cont_val + val2.cont_val + val3.cont_val;
74      *
75      * // translate missing values: val4 and val5 will be equal to:
76      * // DBTranslatedValue { std::numeric_limits<float>::max () }
77      * auto val4 = translator << "N/A";
78      * auto val5 = translator.translate ( "???" );
79      *
80      * // the following instructions raise TypeError exceptions because the
81      * // strings cannot be translated into real numbers
82      * auto val6 = translator << "4.22x";
83      * auto val7 = translator.translate ( "xxx" );
84      *
85      * // given a DBTranslatedValue that is supposed to contain a float, get
86      * // the corresponding string. The strings should be equivalent to those
87      * // indicated below (maybe they could contain more zeroes after the dot).
88      * std::string str;
89      * str = translator.translateBack ( val1 );        // str ~ "5.0"
90      * str = translator >> val2;                       // str ~ "4.2"
91      * str = translator >> gum::learning::DBTranslatedValue {7.2e3f};
92      *                                                 // str ~ "7.2 e3"
93      *
     * // translate back missing values: the string will correspond to one of
     * // the missing symbols known to the translator
96      * str = translator >> val4; // str = "N/A" or "???"
97      * str = translator >> val5; // str = "N/A" or "???"
98      *
99      * // get the variable stored within the translator
     * const gum::ContinuousVariable<float>* stored_var =
     *   dynamic_cast<const gum::ContinuousVariable<float>*>
     *     ( translator.variable () );
103      *
104      * // it is possible to create a translator for an already known variable.
105      * // In this case, by default, the translator is not in editable mode, but
106      * // this behavior can be changed passing the right arguments to the
107      * // constructor of the translator, or using the setEditableDictionaryMode
108      * // method. Here, we create a continuous variable whose domain is [-2,10]
109      * gum::ContinuousVariable<float> var ( "X", "", -2, 10 );
110      * gum::learning::DBTranslator4ContinuousVariable<> translator2 (var,missing);
111      *
112      * float xval1 = translator2.translate ( "-1.4" ).cont_val; // xval1 = -1.4
113      * float xval2 = translator2.translate ( "7" ).cont_val;    // xval2 = 7
114      * float xval3 = translator2.translate ( "N/A" ).cont_val;
115      * // here xval3 corresponds to a missing value, hence it is equal to
116      * // std::numeric_limits<float>::max ()
117      *
     * // trying to translate a string which is outside the domain of var will
     * // raise an UnknownLabelInDatabase exception (translator2 is not in
     * // editable dictionary mode)
     * translator2.translate ( "20" ); // UnknownLabelInDatabase
121      * @endcode
122      *
123      * @ingroup learning_database
124      */
125     template < template < typename > class ALLOC = std::allocator >
126     class DBTranslator4ContinuousVariable: public DBTranslator< ALLOC > {
127       public:
128       /// type for the allocators passed in arguments of methods
129       using allocator_type = typename DBTranslator< ALLOC >::allocator_type;
130 
131 
132       // ##########################################################################
133       /// @name Constructors / Destructors
134       // ##########################################################################
135 
136       /// @{
137 
138       /// default constructor without any initial variable
139       /** When using this constructor, it is assumed implicitly that the
140        * continuous variable has a range from minus infinity to plus infinity.
141        * If the fit_range parameter is on, the range of the variable is updated
142        * so that it precisely fits the range of the observed values in the
143        * database.
144        * @param  missing_symbols the set of symbols in the database
145        * representing missing values
146        * @param fit_range if true, the range of the variable is updated
147        * so that it precisely fits the range of the observed values in the
148        * database, else the range is kept to (-inf,inf)
149        * @param alloc The allocator used to allocate memory for all the
150        * fields of the DBTranslator4ContinuousVariable
151        */
152       template < template < typename > class XALLOC >
153       DBTranslator4ContinuousVariable(
154          const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
155          const bool                                               fit_range = false,
156          const allocator_type&                                    alloc     = allocator_type());
157 
158       /// default constructor without any initial variable nor missing symbol
159       /** When using this constructor, it is assumed implicitly that the
160        * continuous variable has a range from minus infinity to plus infinity.
161        * If the fit_range parameter is on, the range of the variable is updated
162        * so that it precisely fits the range of the observed values in the
163        * database.
164        * @param fit_range if true, the range of the variable is updated
165        * so that it precisely fits the range of the observed values in the
166        * database, else the range is kept to (-inf,inf)
167        * @param alloc The allocator used to allocate memory for all the
168        * fields of the DBTranslator4ContinuousVariable
169        */
170       DBTranslator4ContinuousVariable(const bool            fit_range = false,
171                                       const allocator_type& alloc     = allocator_type());
172 
173       /// default constructor with a continuous variable as translator
174       /** @param var a continuous variable that will be used for
175        * translations. The translator keeps a copy of this variable
176        * @param  missing_symbols the set of symbols in the database
177        * representing missing values
178        * @param fit_range if true, the range of the variable is updated
179        * so that it precisely fits the range of the observed values in the
180        * database, else the range is kept to (-inf,inf)
181        * @param alloc The allocator used to allocate memory for all the
182        * fields of the DBTranslator4ContinuousVariable
183        * @warning If a missing value symbol is a number included in the range
184        * of the continuous variable, it will be discarded. If the fit_range
185        * parameter is on, the range of the variable is updated so that it
186        * can contain the range of the observed values in the database. */
187       template < typename GUM_SCALAR, template < typename > class XALLOC >
188       DBTranslator4ContinuousVariable(
189          const ContinuousVariable< GUM_SCALAR >&                  var,
190          const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
191          const bool                                               fit_range = false,
192          const allocator_type&                                    alloc     = allocator_type());
193 
194       /** @brief default constructor with a continuous variable as translator
195        * but without missing symbol
196        *
197        * @param var a continuous variable that will be used for
198        * translations. The translator keeps a copy of this variable
199        * @param fit_range if true, the range of the variable is updated
200        * so that it precisely fits the range of the observed values in the
201        * database, else the range is kept to (-inf,inf)
202        * @param alloc The allocator used to allocate memory for all the
203        * fields of the DBTranslator4ContinuousVariable
204        * @warning If a missing value symbol is a number included in the range
205        * of the continuous variable, it will be discarded. If the fit_range
206        * parameter is on, the range of the variable is updated so that it
207        * can contain the range of the observed values in the database. */
208       template < typename GUM_SCALAR >
209       DBTranslator4ContinuousVariable(const ContinuousVariable< GUM_SCALAR >& var,
210                                       const bool                              fit_range = false,
211                                       const allocator_type& alloc = allocator_type());
212 
213       /// default constructor with a IContinuous variable as translator
214       /** @param var a IContinuous variable that will be used for
215        * translations. The translator keeps a copy of this variable
216        * @param  missing_symbols the set of symbols in the database
217        * representing missing values
218        * @param fit_range if true, the range of the variable is updated
219        * so that it precisely fits the range of the observed values in the
220        * database, else the range is kept to (-inf,inf)
221        * @param alloc The allocator used to allocate memory for all the
222        * fields of the DBTranslator4ContinuousVariable
223        * @warning If a missing value symbol is a number included in the range
224        * of the continuous variable, it will be discarded. If the fit_range
225        * parameter is on, the range of the variable is updated so that it
226        * can contain the range of the observed values in the database. */
227       template < template < typename > class XALLOC >
228       DBTranslator4ContinuousVariable(
229          const IContinuousVariable&                               var,
230          const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
231          const bool                                               fit_range = false,
232          const allocator_type&                                    alloc     = allocator_type());
233 
234       /** @brief default constructor with a IContinuous variable as translator
235        * but without missing symbol
236        *
237        * @param var a IContinuous variable that will be used for
238        * translations. The translator keeps a copy of this variable
239        * @param fit_range if true, the range of the variable is updated
240        * so that it precisely fits the range of the observed values in the
241        * database, else the range is kept to (-inf,inf)
242        * @param alloc The allocator used to allocate memory for all the
243        * fields of the DBTranslator4ContinuousVariable
244        * @warning If a missing value symbol is a number included in the range
245        * of the continuous variable, it will be discarded. If the fit_range
246        * parameter is on, the range of the variable is updated so that it
247        * can contain the range of the observed values in the database. */
248       DBTranslator4ContinuousVariable(const IContinuousVariable& var,
249                                       const bool                 fit_range = false,
250                                       const allocator_type&      alloc     = allocator_type());
251 
252       /// copy constructor
253       DBTranslator4ContinuousVariable(const DBTranslator4ContinuousVariable< ALLOC >& from);
254 
255       /// copy constructor with a given allocator
256       DBTranslator4ContinuousVariable(const DBTranslator4ContinuousVariable< ALLOC >& from,
257                                       const allocator_type&                           alloc);
258 
259       /// move constructor
260       DBTranslator4ContinuousVariable(DBTranslator4ContinuousVariable< ALLOC >&& from);
261 
262       /// move constructor with a given allocator
263       DBTranslator4ContinuousVariable(DBTranslator4ContinuousVariable< ALLOC >&& from,
264                                       const allocator_type&                      alloc);
265 
266       /// virtual copy constructor
267       virtual DBTranslator4ContinuousVariable< ALLOC >* clone() const;
268 
269       /// virtual copy constructor with a given allocator
270       virtual DBTranslator4ContinuousVariable< ALLOC >* clone(const allocator_type& alloc) const;
271 
272       /// destructor
273       virtual ~DBTranslator4ContinuousVariable();
274 
275       /// @}
276 
277 
278       // ##########################################################################
279       /// @name Operators
280       // ##########################################################################
281 
282       /// @{
283 
284       /// copy operator
285       DBTranslator4ContinuousVariable< ALLOC >&
286          operator=(const DBTranslator4ContinuousVariable< ALLOC >& from);
287 
288       /// move operator
289       DBTranslator4ContinuousVariable< ALLOC >&
290          operator=(DBTranslator4ContinuousVariable< ALLOC >&& from);
291 
292       /// @}
293 
294 
295       // ##########################################################################
296       /// @name Accessors / Modifiers
297       // ##########################################################################
298 
299       /// @{
300 
301       /// returns the translation of a string
302       /** This method tries to translate a given string into the
303        * DBTranslatedValue that should be stored into a DatabaseTable. If the
304        * translator cannot find the translation in its current dictionary, then
305        * two situations can obtain:
306        * -# if the translator is not in an editable dictionary mode, then the
307        *    translator raises a NotFound exception.
308        * -# if the translator is in an editable dictionary mode, i.e., it is
309        *    allowed to update its dictionary, then it tries to update the range
310        *    of its dictionary to include the new value. Upon success, it returns
311        *    the translated value, otherwise, it raises either:
312        *    - a TypeError exception if the string cannot be converted into a
313        *      value that can be inserted into the dictionary
314        *    - an OperationNotAllowed exception if the translation would induce
315        *      incoherent behavior (e.g., a DBTranslator4ContinuousVariable that
316        *      contains a variable whose domain is [x,y] as well as a missing
317        *      value symbol z \f$\in\f$ [x,y]).
318 
319        * @warning Note that missing values (i.e., string encoded as missing
320        * symbols) are translated as std::numeric_limits<float>::max ().
321        * @warning If the variable contained into the translator has a value in
322        * its domain equal to a missing value symbol, this value will be taken
323        * into account in the translations, not the missing value.
324        * @return the translated value of the string to be stored into a
325        * DatabaseTable
326        * @throws UnknownLabelInDatabase is raised if the number represented by
327        * the string is out of the range of the continuous variable and the
328        * translator is not in an editable dictionary mode.
329        * @throws OperationNotAllowed exception is raised if the translation
330        * cannot be found and the insertion of the string into the translator's
331        * dictionary fails because it would induce incoherent behavior (e.g.,
332        * a DBTranslator4ContinuousVariable that contains a variable whose domain
333        * is [x,y] as well as a missing value symbol z \f$\in\f$ [x,y]).
334        * @throws TypeError is raised if the translation cannot be found and
335        * the insertion of the string into the translator's dictionary fails
336        * due to str being impossible to be converted into an appropriate type. */
337       virtual DBTranslatedValue translate(const std::string& str) final;
338 
339       /// returns the original value for a given translation
340       /** @return the string that was translated into a given DBTranslatedValue.
341        * @throws UnknownLabelInDatabase is raised if this original value is
342        * outside the domain of the continuous variable stored within the
343        * translator */
344       virtual std::string translateBack(const DBTranslatedValue translated_val) const final;
345 
346       /// returns std::numeric_limits<std::size_t>::max ()
347       virtual std::size_t domainSize() const final;
348 
349       /// indicates that the translations should never be reordered
350       virtual bool needsReordering() const final;
351 
352       /** @brief returns an empty mapping, indicating that old tanslations
353        * are equal to the newly reordered ones. */
354       virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > >
355          reorder() final;
356 
357       /// returns the variable stored into the translator
358       virtual const IContinuousVariable* variable() const final;
359 
360       /// returns the translation of a missing value
361       virtual DBTranslatedValue missingValue() const final;
362 
363       /// @}
364 
365 
366 #ifndef DOXYGEN_SHOULD_SKIP_THIS
367 
368       private:
369       // the ContinuousVariable really used by the translator. As its values
370       // are floats, this speeds-up translations
371       ContinuousVariable< float > _variable_;
372 
373       // the ContinuousVariablee returned by method variable ()
374       // We must return a IContinuousVariable because the user may have
375       // saved into the translator a ContinuousVariable<X>, with X != float
376       IContinuousVariable* _real_variable_;
377 
378       // assign to each float missing symbol a Boolean indicating whether
379       // we already translated it or not. If we translated it, then we cannot
380       // change the range of the variable so that this range contains the symbol.
381       HashTable< std::string, bool, ALLOC< std::pair< float, bool > > >
382          _status_float_missing_symbols_;
383 
384       // a string containing a non real missing symbol
385       // (useful for back translations)
386       std::string _nonfloat_missing_symbol_;
387 
388       // indicates whether we should fit the range of the observed values
389       bool _fit_range_;
390 
391 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
392     };
393 
394   } /* namespace learning */
395 
396 } /* namespace gum */
397 
398 
399 // always include the template implementation
400 #include <agrum/tools/database/DBTranslator4ContinuousVariable_tpl.h>
401 
402 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_CONTINUOUS_VARIABLE_H */
403