1 /**
2  *
3  *   Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(_at_LIP6) & Christophe GONZALES(_at_AMU)
4  *   info_at_agrum_dot_org
5  *
6  *  This library is free software: you can redistribute it and/or modify
7  *  it under the terms of the GNU Lesser General Public License as published by
8  *  the Free Software Foundation, either version 3 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This library is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU Lesser General Public License for more details.
15  *
16  *  You should have received a copy of the GNU Lesser General Public License
17  *  along with this library.  If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The databases' cell translators for range variables
24  *
25  * @author Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6)
26  */
27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H
28 #define GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H
29 
30 #include <agrum/agrum.h>
31 #include <agrum/tools/core/hashTable.h>
32 #include <agrum/tools/core/set.h>
33 #include <agrum/tools/database/DBTranslator.h>
34 #include <agrum/tools/variables/rangeVariable.h>
35 
36 
37 namespace gum {
38 
39   namespace learning {
40 
41 
42     /** @class DBTranslator4RangeVariable
43      * @headerfile DBTranslator4RangeVariable.h <agrum/tools/database/DBTranslator4RangeVariable.h>
44      * @brief The databases' cell translators for range variables
45      *
46      * Translators are used by DatabaseTable instances to transform datasets'
47      * strings into DBTranslatedValue instances. The point is that strings are
48      * not adequate for fast learning, they need to be preprocessed into a type
49      * that can be analyzed quickly (the so-called DBTranslatedValue type).
50      *
51      * A DBTranslator4RangeVariable is a translator that contains and
52      * exploits a RangeVariable for translations. Each time a string needs
53      * to be translated, we ask the RangeVariable whether its domain contains
54      * the integer value represented in the string. If this is the case, then
55      * the DBTranslatedValue corresponding to the translation of the string
56      * contains in its discr_val field the index assigned to this integer value.
57      *
58      * @par Here is an example of how to use this class:
59      * @code
60      * // create the translator, with possible missing symbols: "N/A" and "???"
61      * // i.e., each time the translator reads a "N/A" or a "???" string, it
62      * // won't translate it into a number but into a missing value.
63      * std::vector<std::string> missing { "N/A", "???" };
64      * gum::learning::DBTranslator4RangeVariable<> translator ( missing );
65      *
66      * // gets the DBTranslatedValue corresponding to some strings
67      * auto val1 = translator.translate("5");
68      * auto val2 = translator.translate("4");
69      * // at this point, val1 and val2 are equal to
70      * // gum::learning::DBTranslatedValue { std::size_t(0) } and
71      * // gum::learning::DBTranslatedValue { std::size_t(1) } respectively.
72      * // In addition, the RangeVariable stored into the translator has
73      * // a domain equal to {4,5}.
74      * auto val3 = translator << "7";
75      * // val3 is encoded as gum::learning::DBTranslatedValue { std::size_t(3) }
76      * // because string "6" is implicitly encoded as
77      * // gum::learning::DBTranslatedValue { std::size_t(2) }.
78      * // In addition, the domain of the range variable is expanded to {4,5,6,7}.
79      *
80      * // add the numbers assigned to val1, val2, val3
81      * std::size_t sum = val1.discr_val + val2.discr_val + val3.discr_val;
82      *
83      * // translate missing values: val4 and val5 will be equal to:
84      * // DBTranslatedValue { std::numeric_limits<std::size_t>::max () }
85      * auto val4 = translator << "N/A";
86      * auto val5 = translator.translate ( "???" );
87      *
88      * // the following instructions raise TypeError exceptions because the
89      * // strings cannot be translated into integers
90      * auto val6 = translator << "422x";
91      * auto val7 = translator.translate ( "xxx" );
92      *
93      * // given a DBTranslatedValue that is supposed to contain an integer in
94      * // the range of the RangeVariable, get the corresponding string.
95      * std::string str;
96      * str = translator.translateBack ( val1 );        // str = "5"
97      * str = translator >> val2;                       // str = "4"
98      * str = translator >> gum::learning::DBTranslatedValue {std::size_t(2)};
99      *                                                 // str = "6"
100      *
101      * // translate back missing values: the string will correspond to one of
102      * // the missing symbols known to the translator
103      * str = translator >> val4; // str = "N/A" or "???"
104      * str = translator >> val5; // str = "N/A" or "???"
105      *
106      * // get the variable stored within the translator
107      * const gum::RangeVariable* var =
108      *   dynamic_cast<const gum::RangeVariable*> ( translator.variable () );
109      *
110      * // it is possible to create a translator for an already known variable.
111      * // In this case, by default, the translator is not in editable mode, but
112      * // this behavior can be changed passing the right arguments to the
113      * // constructor of the translator, or using the setEditableDictionaryMode
114      * // method. Here, we create a range variable whose domain is {-2,...,10}
115      * gum::RangeVariable var ( "X", "", -2, 10 );
116      * gum::learning::DBTranslator4RangeVariable<> translator2 ( var, missing );
117      *
118      * auto xval1 = translator2.translate ( "-1" ).discr_val; // xval1 = 1
119      * auto xval2 = translator2.translate ( "7" ).discr_val;  // xval2 = 9
120      * auto xval3 = translator2.translate ( "N/A" ).discr_val;
121      * // here xval3 corresponds to a missing value, hence it is equal to
122      * // std::numeric_limits<size_t>::max ()
123      *
124      * // trying to translate a string which is outside the domain of var will
125      * // raise Exception NotFound
126      * translator2.translate ( "20" ); // NotFound
127      * @endcode
128      *
129      * @ingroup learning_database
130      */
131     template < template < typename > class ALLOC = std::allocator >
132     class DBTranslator4RangeVariable: public DBTranslator< ALLOC > {
133       public:
134       /// type for the allocators passed in arguments of methods
135       using allocator_type = typename DBTranslator< ALLOC >::allocator_type;
136 
137 
138       // ##########################################################################
139       /// @name Constructors / Destructors
140       // ##########################################################################
141 
142       /// @{
143 
144       /// default constructor without any initial variable
145       /** When using this constructor, it is assumed implicitly that the
146        * dictionary contained into the translator is editable. So, when reading
147        * the database, if we observe a value that has not been encountered
148        * before, we update the range of the dictionary of the translator (hence
149        * that of the variable contained by the translator).
150        * @param missing_symbols the set of symbols in the dataset
151        * representing missing values
152        * @param max_dico_entries the max number of entries that the dictionary
153        * can contain. Trying to add entries beyond this maximum is
154        * considered an error and a SizeError exception is raised
155        * @param alloc The allocator used to allocate memory for all the
156        * fields of the DBTranslator4RangeVariable
157        */
158       template < template < typename > class XALLOC >
159       DBTranslator4RangeVariable(
160          const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
161          std::size_t           max_dico_entries = std::numeric_limits< std::size_t >::max(),
162          const allocator_type& alloc            = allocator_type());
163 
164       /// default constructor without any initial variable nor missing symbols
165       /** When using this constructor, it is assumed implicitly that the
166        * dictionary contained into the translator is editable. So, when reading
167        * the database, if we observe a value that has not been encountered
168        * before, we update the range of the dictionary of the translator (hence
169        * that of the variable contained by the translator).
170        * @param max_dico_entries the max number of entries that the dictionary
171        * can contain. Trying to add entries beyond this maximum is
172        * considered an error and a SizeError exception is raised
173        * @param alloc The allocator used to allocate memory for all the
174        * fields of the DBTranslator4RangeVariable
175        */
176       DBTranslator4RangeVariable(std::size_t max_dico_entries
177                                  = std::numeric_limits< std::size_t >::max(),
178                                  const allocator_type& alloc = allocator_type());
179 
180       /// default constructor with a range variable as translator
181       /** @param var a range variable which will be used for translations.
182        * The translator keeps a copy of this variable
183        * @param missing_symbols the set of symbols in the dataset
184        * representing missing values
185        * @param editable_dictionary the mode in which the translator will perform
186        * translations: when false (the default), the translation of a string
187        * that does not correspond to an integer within the range of var will
188        * raise a NotFound exception; when true, the translator will try to
189        * expand the domain of the RangeVariable so that the number represented in
190        * the string belongs to this domain (and therefore to the dictionary)
191        * @param max_dico_entries the max number of entries that the dictionary
192        * can contain. Trying to add entries beyond this maximum is
193        * considered an error and a SizeError exception is raised
194        * @param alloc The allocator used to allocate memory for all the
195        * fields of the DBTranslator4RangeVariable
196        * @warning If the variable contained into the translator has a value in
197        * the range that is equal to a missing value symbol, the range value will
198        * be taken into account in the translations, not the missing value.
199        */
200       template < template < typename > class XALLOC >
201       DBTranslator4RangeVariable(
202          const RangeVariable&                                     var,
203          const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
204          const bool                                               editable_dictionary = false,
205          std::size_t           max_dico_entries = std::numeric_limits< std::size_t >::max(),
206          const allocator_type& alloc            = allocator_type());
207 
208       /** @brief default constructor with a range variable as translator
209        * but without missing symbols
210        *
211        * @param var a range variable which will be used for translations.
212        * The translator keeps a copy of this variable
213        * @param editable_dictionary the mode in which the translator will perform
214        * translations: when false (the default), the translation of a string
215        * that does not correspond to an integer within the range of var will
216        * raise a NotFound exception; when true, the translator will try to
217        * expand the domain of the RangeVariable so that the number represented in
218        * the string belongs to this domain (and therefore to the dictionary)
219        * @param max_dico_entries the max number of entries that the dictionary
220        * can contain. Trying to add entries beyond this maximum is
221        * considered an error and a SizeError exception is raised
222        * @param alloc The allocator used to allocate memory for all the
223        * fields of the DBTranslator4RangeVariable
224        * @warning If the variable contained into the translator has a value in
225        * the range that is equal to a missing value symbol, the range value will
226        * be taken into account in the translations, not the missing value.
227        */
228       DBTranslator4RangeVariable(const RangeVariable& var,
229                                  const bool           editable_dictionary = false,
230                                  std::size_t          max_dico_entries
231                                  = std::numeric_limits< std::size_t >::max(),
232                                  const allocator_type& alloc = allocator_type());
233 
234       /// copy constructor
235       DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC >& from);
236 
237       /// copy constructor with a given allocator
238       DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC >& from,
239                                  const allocator_type&                      alloc);
240 
241       /// move constructor
242       DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC >&& from);
243 
244       /// move constructor with a given allocator
245       DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC >&& from,
246                                  const allocator_type&                 alloc);
247 
248       /// virtual copy constructor (returns a pointer to the newly created translator)
249       virtual DBTranslator4RangeVariable< ALLOC >* clone() const;
250 
251       /// virtual copy constructor with a given allocator (returns a pointer to the new translator)
252       virtual DBTranslator4RangeVariable< ALLOC >* clone(const allocator_type& alloc) const;
253 
254       /// destructor
255       virtual ~DBTranslator4RangeVariable();
256 
257       /// @}
258 
259 
260       // ##########################################################################
261       /// @name Operators
262       // ##########################################################################
263 
264       /// @{
265 
266       /// copy operator
267       DBTranslator4RangeVariable< ALLOC >&
268          operator=(const DBTranslator4RangeVariable< ALLOC >& from);
269 
270       /// move operator
271       DBTranslator4RangeVariable< ALLOC >& operator=(DBTranslator4RangeVariable< ALLOC >&& from);
272 
273       /// @}
274 
275 
276       // ##########################################################################
277       /// @name Accessors / Modifiers
278       // ##########################################################################
279 
280       /// @{
281 
282       /// returns the translation of a string
283       /** This method tries to translate a given string into the
284        * DBTranslatedValue that should be stored into a DatabaseTable. If the
285        * translator cannot find the translation in its current dictionary, then
286        * two situations can occur:
287        * -# if the translator is not in an editable dictionary mode, then the translator raises
288        *    a NotFound exception (NOTE(review): @throws below says UnknownLabelInDatabase; confirm).
289        * -# if the translator is in an editable dictionary mode, i.e., it is
290        *    allowed to update its dictionary, then it tries to update the range
291        *    of its dictionary to include the new value. Upon success, it returns
292        *    the translated value, otherwise, it raises either:
293        *    - a TypeError exception if the string cannot be converted into a
294        *      value that can be inserted into the dictionary
295        *    - an OperationNotAllowed exception if the translation would induce
296        *      incoherent behavior (e.g., a translator that
297        *      contains a variable whose domain is [x,y] as well as a missing
298        *      value symbol z \f$\in\f$ [x,y]).
299        *    - a SizeError exception if the number of entries in the dictionary,
300        *      i.e., the domain size of the RangeVariable, has already reached
301        *      its maximum.
302        *
303        * @warning Note that missing values (i.e., strings encoded as missing
304        * symbols) are translated as std::numeric_limits<std::size_t>::max ().
305        * @warning If the variable contained into the translator has a value in
306        * its range equal to a missing value symbol, then this value will be
307        * taken into account in the translation, not the missing value.
308        * @return the translated value of the string to be stored into a
309        * DatabaseTable
310        * @throws UnknownLabelInDatabase is raised if the translation cannot
311        * be found and the translator is not in an editable dictionary mode.
312        * @throws SizeError is raised if the number of entries (the range) in
313        * the dictionary has already reached its maximum.
314        * @throws TypeError is raised if the translation cannot be found and
315        * the translator is in an editable dictionary mode and the string does
316        * not correspond to an integer.
317        * @throws OperationNotAllowed exception is raised if the translation
318        * cannot be found and the insertion of the string into the translator's
319        * dictionary fails because it would induce incoherent behavior (e.g.,
320        * a translator that contains a variable whose domain is {x,y,z,t} as
321        * well as a missing value symbol z).
322        */
323       virtual DBTranslatedValue translate(const std::string& str) final;
324 
325       /// returns the original value for a given translation
326       /** @return the string that was translated into a given DBTranslatedValue.
327        * @throws UnknownLabelInDatabase is raised if this original value cannot
328        * be found */
329       virtual std::string translateBack(const DBTranslatedValue translated_val) const final;
330 
331       /// returns the domain size of a variable corresponding to the translations
332       /** Returns the size of the range of the variable. */
333       virtual std::size_t domainSize() const final;
334 
335       /** @brief indicates whether a reordering is needed to make the
336        * translations sorted by increasing numbers
337        *
338        * When dynamically constructing its dictionary, the translator may
339        * assign wrong DBTranslatedValue values to strings. For instance, a
340        * translator reading sequentially integer strings 2, 1, 3, may map
341        * 2 into DBTranslatedValue{std::size_t(0)},
342        * 1 into DBTranslatedValue{std::size_t(1)} and
343        * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
344        * having domain {2,1,3}. The user may prefer having domain {1,2,3}, i.e.,
345        * a domain specified with increasing values. This requires a
346        * reordering. Method needsReordering() returns a Boolean indicating
347        * whether such a reordering should be performed or whether the current
348        * order is OK.
349        */
350       virtual bool needsReordering() const final;
351 
352       /** @brief performs a reordering of the dictionary and returns a mapping
353        * from the old translated values to the new ones.
354        *
355        * When a reordering is needed, i.e., string values must be translated
356        * differently, Method reorder() computes how the translations should be
357        * changed. It updates accordingly the dictionary and returns the mapping
358        * that enables changing the old dictionary values into the new ones.
359        */
360       virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > >
361          reorder() final;
362 
363       /// returns the variable stored into the translator
364       virtual const RangeVariable* variable() const final;
365 
366       /// returns the translation of a missing value
367       virtual DBTranslatedValue missingValue() const final;
368 
369       /// @}
370 
371 
372 #ifndef DOXYGEN_SHOULD_SKIP_THIS
373 
374       private:
375       // the RangeVariable assigned to the translator, if any
376       RangeVariable _variable_;
377 
378       // assign to each integer missing symbol a Boolean indicating whether
379       // we already translated it or not. If we translated it, then we cannot
380       // change the range of the variable so that this range contains the symbol.
381       HashTable< std::string, bool, ALLOC< std::pair< std::string, bool > > >
382          _status_int_missing_symbols_;
383 
384       // the set of translations of the integer missing symbols found so far
385       Set< long, ALLOC< long > > _translated_int_missing_symbols_;
386 
387       // a string containing a non int missing symbol
388       // (useful for back translations)
389       std::string _nonint_missing_symbol_;
390 
391 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
392     };
393 
394 
395   } /* namespace learning */
396 
397 } /* namespace gum */
398 
399 
400 // always include the template implementation
401 #include <agrum/tools/database/DBTranslator4RangeVariable_tpl.h>
402 
403 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H */
404