1 /** 2 * 3 * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(_at_LIP6) & Christophe GONZALES(_at_AMU) 4 * info_at_agrum_dot_org 5 * 6 * This library is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU Lesser General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public License 17 * along with this library. If not, see <http://www.gnu.org/licenses/>. 18 * 19 */ 20 21 22 /** @file 23 * @brief The databases' cell translators for range variables 24 * 25 * @author Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6) 26 */ 27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H 28 #define GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H 29 30 #include <agrum/agrum.h> 31 #include <agrum/tools/core/hashTable.h> 32 #include <agrum/tools/core/set.h> 33 #include <agrum/tools/database/DBTranslator.h> 34 #include <agrum/tools/variables/rangeVariable.h> 35 36 37 namespace gum { 38 39 namespace learning { 40 41 42 /** @class DBTranslator4RangeVariable 43 * @headerfile DBTranslator4RangeVariable.h <agrum/tools/database/DBTranslator4RangeVariable.h> 44 * @brief The databases' cell translators for range variables 45 * 46 * Translators are used by DatabaseTable instances to transform datasets' 47 * strings into DBTranslatedValue instances. The point is that strings are 48 * not adequate for fast learning, they need to be preprocessed into a type 49 * that can be analyzed quickly (the so-called DBTranslatedValue type). 50 * 51 * A DBTranslator4RangeVariable is a translator that contains and 52 * exploits a RangeVariable for translations. Each time a string needs 53 * be translated, we ask the RangeVariable whether its domain contains 54 * the integer value represented in the string. If this is the case, then 55 * the DBTranslatedValue corresponding to the translation of the string 56 * contains in its discr_val field this integer value. 57 * 58 * @par Here is an example of how to use this class: 59 * @code 60 * // create the translator, with possible missing symbols: "N/A" and "???" 61 * // i.e., each time the translator reads a "N/A" or a "???" string, it 62 * // won't translate it into a number but into a missing value. 63 * std::vector<std::string> missing { "N/A", "???" }; 64 * gum::learning::DBTranslator4RangeVariable<> translator ( missing ); 65 * 66 * // gets the DBTranslatedValue corresponding to some strings 67 * auto val1 = translator.translate("5"); 68 * auto val2 = translator.translate("4"); 69 * // at this point, val1 and val2 are equal to 70 * // gum::learning::DBTranslatedValue { std::size_t(0) } and 71 * // gum::learning::DBTranslatedValue { std::size_t(1) } respectively. 72 * // In addition, the RangeVariable stored into the translator has 73 * // a domain equal to {4,5}. 74 * auto val3 = translator << "7"; 75 * // val3 is encoded as gum::learning::DBTranslatedValue { std::size_t(3) } 76 * // because string "6" is implicitly encoded as 77 * // gum::learning::DBTranslatedValue { std::size_t(3) }. 78 * // In addition, the domain of the range variable is expanded to {4,5,6,7}. 79 * 80 * // add the numbers assigned to val1, val2, val3 81 * std::size_t sum = val1.discr_val + val2.discr_val + val3.discr_val; 82 * 83 * // translate missing values: val4 and val5 will be equal to: 84 * // DBTranslatedValue { std::numeric_limits<float>::max () } 85 * auto val4 = translator << "N/A"; 86 * auto val5 = translator.translate ( "???" ); 87 * 88 * // the following instructions raise TypeError exceptions because the 89 * // strings cannot be translated into integers 90 * auto val6 = translator << "422x"; 91 * auto val7 = translator.translate ( "xxx" ); 92 * 93 * // given a DBTranslatedValue that is supposed to contain an integer in 94 * // the range of the RangeVariable, get the corresponding string. 95 * std::string str; 96 * str = translator.translateBack ( val1 ); // str = "5" 97 * str = translator >> val2; // str = "4" 98 * str = translator >> gum::learning::DBTranslatedValue {std::size_t(2)}; 99 * // str = "6" 100 * 101 * // translate back missing values: the string will corresponds to one of 102 * // the missing symbols known to the translator 103 * str = translator >> val4; // str = "N/A" or "???" 104 * str = translator >> val5; // str = "N/A" or "???" 105 * 106 * // get the variable stored within the translator 107 * const gum::RangeVariable* var = 108 * dynamic_cast<const gum::RangeVariable*> ( translator.variable () ); 109 * 110 * // it is possible to create a translator for an already known variable. 111 * // In this case, by default, the translator is not in editable mode, but 112 * // this behavior can be changed passing the right arguments to the 113 * // constructor of the translator, or using the setEditableDictionaryMode 114 * // method. Here, we create a range variable whose domain is {-2,...,10} 115 * gum::RangeVariable var ( "X", "", -2, 10 ); 116 * gum::learning::DBTranslator4RangeVariable<> translator2 ( var, missing ); 117 * 118 * auto xval1 = translator2.translate ( "-1" ).discr_val; // xval1 = 1 119 * auto xval2 = translator2.translate ( "7" ).discr_val; // xval2 = 9 120 * auto xval3 = translator2.translate ( "N/A" ).discr_val; 121 * // here xval3 corresponds to a missing value, hence it is equal to 122 * // std::numeric_limits<size_t>::max () 123 * 124 * // trying to translate a string which is outside the domain of var will 125 * // raise Exception NotFound 126 * translator2.translate ( "20" ); // NotFound 127 * @endcode 128 * 129 * @ingroup learning_database 130 */ 131 template < template < typename > class ALLOC = std::allocator > 132 class DBTranslator4RangeVariable: public DBTranslator< ALLOC > { 133 public: 134 /// type for the allocators passed in arguments of methods 135 using allocator_type = typename DBTranslator< ALLOC >::allocator_type; 136 137 138 // ########################################################################## 139 /// @name Constructors / Destructors 140 // ########################################################################## 141 142 /// @{ 143 144 /// default constructor without any initial variable 145 /** When using this constructor, it is assumed implicitly that the 146 * dictionary contained into the translator is editable. So, when reading 147 * the database, if we observe a value that has not been encountered 148 * before, we update the range of the dictionary of the translator (hence 149 * that of the variable contained by the translator). 150 * @param missing_symbols the set of symbols in the dataset 151 * representing missing values 152 * @param max_dico_entries the max number of entries that the dictionary 153 * can contain. If we try to add new entries in the dictionary, this will 154 * be considered as an error and a SizeError exception will be raised 155 * @param alloc The allocator used to allocate memory for all the 156 * fields of the DBTranslator4RangeVariable 157 */ 158 template < template < typename > class XALLOC > 159 DBTranslator4RangeVariable( 160 const std::vector< std::string, XALLOC< std::string > >& missing_symbols, 161 std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(), 162 const allocator_type& alloc = allocator_type()); 163 164 /// default constructor without any initial variable nor missing symbols 165 /** When using this constructor, it is assumed implicitly that the 166 * dictionary contained into the translator is editable. So, when reading 167 * the database, if we observe a value that has not been encountered 168 * before, we update the range of the dictionary of the translator (hence 169 * that of the variable contained by the translator). 170 * @param max_dico_entries the max number of entries that the dictionary 171 * can contain. If we try to add new entries in the dictionary, this will 172 * be considered as an error and a SizeError exception will be raised 173 * @param alloc The allocator used to allocate memory for all the 174 * fields of the DBTranslator4RangeVariable 175 */ 176 DBTranslator4RangeVariable(std::size_t max_dico_entries 177 = std::numeric_limits< std::size_t >::max(), 178 const allocator_type& alloc = allocator_type()); 179 180 /// default constructor with a range variable as translator 181 /** @param var a range variable which will be used for translations. 182 * The translator keeps a copy of this variable 183 * @param missing_symbols the set of symbols in the dataset 184 * representing missing values 185 * @param editable_dictionary the mode in which the translator will perform 186 * translations: when false (the default), the translation of a string 187 * that does not correspond to an integer within the range of var will 188 * raise a NotFound exception; when true, the translator will try to 189 * expand the domain of the RangeVariable so that the number represented in 190 * the string belongs to this domain ((and therefore to the dictionary) 191 * @param max_dico_entries the max number of entries that the dictionary 192 * can contain. If we try to add new entries in the dictionary, this will 193 * be considered as an error and a SizeError exception will be raised 194 * @param alloc The allocator used to allocate memory for all the 195 * fields of the DBTranslator4RangeVariable 196 * @warning If the variable contained into the translator has a value in 197 * the range that is equal to a missing value symbol, the range value will 198 * be taken into account in the translations, not the missing value. 199 */ 200 template < template < typename > class XALLOC > 201 DBTranslator4RangeVariable( 202 const RangeVariable& var, 203 const std::vector< std::string, XALLOC< std::string > >& missing_symbols, 204 const bool editable_dictionary = false, 205 std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(), 206 const allocator_type& alloc = allocator_type()); 207 208 /** @brief default constructor with a range variable as translator 209 * but without missing symbols 210 * 211 * @param var a range variable which will be used for translations. 212 * The translator keeps a copy of this variable 213 * @param editable_dictionary the mode in which the translator will perform 214 * translations: when false (the default), the translation of a string 215 * that does not correspond to an integer within the range of var will 216 * raise a NotFound exception; when true, the translator will try to 217 * expand the domain of the RangeVariable so that the number represented in 218 * the string belongs to this domain ((and therefore to the dictionary) 219 * @param max_dico_entries the max number of entries that the dictionary 220 * can contain. If we try to add new entries in the dictionary, this will 221 * be considered as an error and a SizeError exception will be raised 222 * @param alloc The allocator used to allocate memory for all the 223 * fields of the DBTranslator4RangeVariable 224 * @warning If the variable contained into the translator has a value in 225 * the range that is equal to a missing value symbol, the range value will 226 * be taken into account in the translations, not the missing value. 227 */ 228 DBTranslator4RangeVariable(const RangeVariable& var, 229 const bool editable_dictionary = false, 230 std::size_t max_dico_entries 231 = std::numeric_limits< std::size_t >::max(), 232 const allocator_type& alloc = allocator_type()); 233 234 /// copy constructor 235 DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC >& from); 236 237 /// copy constructor with a given translator 238 DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC >& from, 239 const allocator_type& alloc); 240 241 /// move constructor 242 DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC >&& from); 243 244 /// move constructor with a given allocator 245 DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC >&& from, 246 const allocator_type& alloc); 247 248 /// virtual copy constructor 249 virtual DBTranslator4RangeVariable< ALLOC >* clone() const; 250 251 /// virtual copy constructor with a given allocator 252 virtual DBTranslator4RangeVariable< ALLOC >* clone(const allocator_type& alloc) const; 253 254 /// destructor 255 virtual ~DBTranslator4RangeVariable(); 256 257 /// @} 258 259 260 // ########################################################################## 261 /// @name Operators 262 // ########################################################################## 263 264 /// @{ 265 266 /// copy operator 267 DBTranslator4RangeVariable< ALLOC >& 268 operator=(const DBTranslator4RangeVariable< ALLOC >& from); 269 270 /// move operator 271 DBTranslator4RangeVariable< ALLOC >& operator=(DBTranslator4RangeVariable< ALLOC >&& from); 272 273 /// @} 274 275 276 // ########################################################################## 277 /// @name Accessors / Modifiers 278 // ########################################################################## 279 280 /// @{ 281 282 /// returns the translation of a string 283 /** This method tries to translate a given string into the 284 * DBTranslatedValue that should be stored into a databaseTable. If the 285 * translator cannot find the translation in its current dictionary, then 286 * two situations can obtain: 287 * -# if the translator is not in an editable dictionary mode, then the 288 * translator raises a NotFound exception. 289 * -# if the translator is in an editable dictionary mode, i.e., it is 290 * allowed to update its dictionary, then it tries to update the range 291 * of its dictionary to include the new value. Upon success, it returns 292 * the translated value, otherwise, it raises either: 293 * - a TypeError exception if the string cannot be converted into a 294 * value that can be inserted into the dictionary 295 * - an OperationNotAllowed exception if the translation would induce 296 * incoherent behavior (e.g., a translator that 297 * contains a variable whose domain is [x,y] as well as a missing 298 * value symbol z \f$\in\f$ [x,y]). 299 * - a SizeError exception if the number of entries in the dictionary, 300 * i.e., the domain size of the RangeVariable, has already reached 301 * its maximum. 302 * 303 * @warning Note that missing values (i.e., string encoded as missing 304 * symbols) are translated as std::numeric_limits<std::size_t>::max (). 305 * @warning If the variable contained into the translator has a value in 306 * its range equal to a missing value symbol, then this value will be 307 * taken into account in the translation, not the missing value. 308 * @return the translated value of the string to be stored into a 309 * DatabaseTable 310 * @throws UnknownLabelInDatabase is raised if the translation cannot 311 * be found and the translator is not in an editable dictionary mode. 312 * @throws SizeError is raised if the number of entries (the range) in 313 * the dictionary has already reached its maximum. 314 * @throws TypeError is raised if the translation cannot be found and 315 * the translator is in an editable dictionary mode and the string does 316 * not correspond to an integer. 317 * @throws OperationNotAllowed exception is raised if the translation 318 * cannot be found and the insertion of the string into the translator's 319 * dictionary fails because it would induce incoherent behavior (e.g., 320 * a translator that contains a variable whose domain is {x,y,z,t} as 321 * well as a missing value symbol z). 322 */ 323 virtual DBTranslatedValue translate(const std::string& str) final; 324 325 /// returns the original value for a given translation 326 /** @return the string that was translated into a given DBTranslatedValue. 327 * @throws UnknownLabelInDatabase is raised if this original value cannot 328 * be found */ 329 virtual std::string translateBack(const DBTranslatedValue translated_val) const final; 330 331 /// returns the domain size of a variable corresponding to the translations 332 /** Returns the size of the range of the variable. */ 333 virtual std::size_t domainSize() const final; 334 335 /** @brief indicates whether a reordering is needed to make the 336 * translations sorted by increasing numbers 337 * 338 * When constructing dynamically its dictionary, the translator may 339 * assign wrong DBTranslatedValue values to strings. For instance, a 340 * translator reading sequentially integer strings 2, 1, 3, may map 341 * 2 into DBTranslatedValue{std::size_t(0)}, 342 * 1 into DBTranslatedValue{std::size_t(1)} and 343 * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables 344 * having domain {2,1,3}. The user may prefer having domain {1,2,3}, i.e., 345 * a domain specified with increasing values. This requires a 346 * reordering. Method needsReodering() returns a Boolean indicating 347 * whether such a reordering should be performed or whether the current 348 * order is OK. 349 */ 350 virtual bool needsReordering() const final; 351 352 /** @brief performs a reordering of the dictionary and returns a mapping 353 * from the old translated values to the new ones. 354 * 355 * When a reordering is needed, i.e., string values must be translated 356 * differently, Method reorder() computes how the translations should be 357 * changed. It updates accordingly the dictionary and returns the mapping 358 * that enables changing the old dictionary values into the new ones. 359 */ 360 virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > 361 reorder() final; 362 363 /// returns the variable stored into the translator 364 virtual const RangeVariable* variable() const final; 365 366 /// returns the translation of a missing value 367 virtual DBTranslatedValue missingValue() const final; 368 369 /// @} 370 371 372 #ifndef DOXYGEN_SHOULD_SKIP_THIS 373 374 private: 375 // the RangeVariable assigned to the translator, if any 376 RangeVariable _variable_; 377 378 // assign to each integer missing symbol a Boolean indicating whether 379 // we already translated it or not. If we translated it, then we cannot 380 // change the range of the variable so that this range contains the symbol. 381 HashTable< std::string, bool, ALLOC< std::pair< std::string, bool > > > 382 _status_int_missing_symbols_; 383 384 // the set of translations of the integer missing symbols found so far 385 Set< long, ALLOC< long > > _translated_int_missing_symbols_; 386 387 // a string containing a non int missing symbol 388 // (useful for back translations) 389 std::string _nonint_missing_symbol_; 390 391 #endif /* DOXYGEN_SHOULD_SKIP_THIS */ 392 }; 393 394 395 } /* namespace learning */ 396 397 } /* namespace gum */ 398 399 400 // always include the template implementation 401 #include <agrum/tools/database/DBTranslator4RangeVariable_tpl.h> 402 403 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H */ 404