1 /** 2 * 3 * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(_at_LIP6) & Christophe GONZALES(_at_AMU) 4 * info_at_agrum_dot_org 5 * 6 * This library is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU Lesser General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public License 17 * along with this library. If not, see <http://www.gnu.org/licenses/>. 18 * 19 */ 20 21 22 /** @file 23 * @brief The databases' cell translators for continuous variables 24 * 25 * @author Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6) 26 */ 27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_CONTINUOUS_VARIABLE_H 28 #define GUM_LEARNING_DB_TRANSLATOR_4_CONTINUOUS_VARIABLE_H 29 30 #include <string> 31 32 #include <agrum/agrum.h> 33 #include <agrum/tools/core/hashTable.h> 34 #include <agrum/tools/database/DBTranslator.h> 35 #include <agrum/tools/variables/continuousVariable.h> 36 37 38 namespace gum { 39 40 namespace learning { 41 42 43 /** @class DBTranslator4ContinuousVariable 44 * @headerfile DBTranslator4ContinuousVariable.h <agrum/tools/database/DBTranslator4ContinuousVariable.h> 45 * @brief The databases' cell translators for continuous variables 46 * 47 * Translators are used by DatabaseTable instances to transform datasets' 48 * strings into DBTranslatedValue instances. The point is that strings are 49 * not adequate for fast learning, they need to be preprocessed into a type 50 * that can be analyzed quickly (the so-called DBTranslatedValue type). 51 * 52 * A DBTranslator4ContinuousVariable is a translator that contains and 53 * exploits a ContinuousVariable for translations. Each time a string needs 54 * be translated, we ask the ContinuousVariable whether it belongs to its 55 * domain (which is supposed to be of type [x_min,x_max]). If this is the 56 * case, then the DBTranslatedValue corresponding to the translation of the 57 * string contains the floating point number specified in the string. 58 * 59 * @par Here is an example of how to use this class: 60 * @code 61 * // create the translator, with possible missing symbols: "N/A" and "???" 62 * // i.e., each time the translator reads a "N/A" or a "???" string, it 63 * // won't translate it into a number but into a missing value. 64 * std::vector<std::string> missing { "N/A", "???" }; 65 * gum::learning::DBTranslator4ContinuousVariable<> translator ( missing ); 66 * 67 * // gets the DBTranslatedValue corresponding to some strings 68 * auto val1 = translator.translate("5"); // val1 = DBTranslatedValue {5.0f} 69 * auto val2 = translator.translate("4.2"); // val2 = DBTRanslatedValue {4.2f} 70 * auto val3 = translator << "3.4"; // val3 = DBTranslatedValue {3.4f} 71 * 72 * // add the numbers assigned to val1, val2, val3 73 * float sum = val1.cont_val + val2.cont_val + val3.cont_val; 74 * 75 * // translate missing values: val4 and val5 will be equal to: 76 * // DBTranslatedValue { std::numeric_limits<float>::max () } 77 * auto val4 = translator << "N/A"; 78 * auto val5 = translator.translate ( "???" ); 79 * 80 * // the following instructions raise TypeError exceptions because the 81 * // strings cannot be translated into real numbers 82 * auto val6 = translator << "4.22x"; 83 * auto val7 = translator.translate ( "xxx" ); 84 * 85 * // given a DBTranslatedValue that is supposed to contain a float, get 86 * // the corresponding string. The strings should be equivalent to those 87 * // indicated below (maybe they could contain more zeroes after the dot). 88 * std::string str; 89 * str = translator.translateBack ( val1 ); // str ~ "5.0" 90 * str = translator >> val2; // str ~ "4.2" 91 * str = translator >> gum::learning::DBTranslatedValue {7.2e3f}; 92 * // str ~ "7.2 e3" 93 * 94 * // translate back missing values: the string will corresponds to one of 95 * // the missing symbols known to the translator 96 * str = translator >> val4; // str = "N/A" or "???" 97 * str = translator >> val5; // str = "N/A" or "???" 98 * 99 * // get the variable stored within the translator 100 * const gum::ContinuousVariable<float>* var = 101 * dynamic_cast<const gum::ContinuousVariable<float>*> 102 * ( translator.variable () ); 103 * 104 * // it is possible to create a translator for an already known variable. 105 * // In this case, by default, the translator is not in editable mode, but 106 * // this behavior can be changed passing the right arguments to the 107 * // constructor of the translator, or using the setEditableDictionaryMode 108 * // method. Here, we create a continuous variable whose domain is [-2,10] 109 * gum::ContinuousVariable<float> var ( "X", "", -2, 10 ); 110 * gum::learning::DBTranslator4ContinuousVariable<> translator2 (var,missing); 111 * 112 * float xval1 = translator2.translate ( "-1.4" ).cont_val; // xval1 = -1.4 113 * float xval2 = translator2.translate ( "7" ).cont_val; // xval2 = 7 114 * float xval3 = translator2.translate ( "N/A" ).cont_val; 115 * // here xval3 corresponds to a missing value, hence it is equal to 116 * // std::numeric_limits<float>::max () 117 * 118 * // trying to translate a string which is outside the domain of var will 119 * // raise Exception NotFound 120 * translator2.translate ( "20" ); // NotFound 121 * @endcode 122 * 123 * @ingroup learning_database 124 */ 125 template < template < typename > class ALLOC = std::allocator > 126 class DBTranslator4ContinuousVariable: public DBTranslator< ALLOC > { 127 public: 128 /// type for the allocators passed in arguments of methods 129 using allocator_type = typename DBTranslator< ALLOC >::allocator_type; 130 131 132 // ########################################################################## 133 /// @name Constructors / Destructors 134 // ########################################################################## 135 136 /// @{ 137 138 /// default constructor without any initial variable 139 /** When using this constructor, it is assumed implicitly that the 140 * continuous variable has a range from minus infinity to plus infinity. 141 * If the fit_range parameter is on, the range of the variable is updated 142 * so that it precisely fits the range of the observed values in the 143 * database. 144 * @param missing_symbols the set of symbols in the database 145 * representing missing values 146 * @param fit_range if true, the range of the variable is updated 147 * so that it precisely fits the range of the observed values in the 148 * database, else the range is kept to (-inf,inf) 149 * @param alloc The allocator used to allocate memory for all the 150 * fields of the DBTranslator4ContinuousVariable 151 */ 152 template < template < typename > class XALLOC > 153 DBTranslator4ContinuousVariable( 154 const std::vector< std::string, XALLOC< std::string > >& missing_symbols, 155 const bool fit_range = false, 156 const allocator_type& alloc = allocator_type()); 157 158 /// default constructor without any initial variable nor missing symbol 159 /** When using this constructor, it is assumed implicitly that the 160 * continuous variable has a range from minus infinity to plus infinity. 161 * If the fit_range parameter is on, the range of the variable is updated 162 * so that it precisely fits the range of the observed values in the 163 * database. 164 * @param fit_range if true, the range of the variable is updated 165 * so that it precisely fits the range of the observed values in the 166 * database, else the range is kept to (-inf,inf) 167 * @param alloc The allocator used to allocate memory for all the 168 * fields of the DBTranslator4ContinuousVariable 169 */ 170 DBTranslator4ContinuousVariable(const bool fit_range = false, 171 const allocator_type& alloc = allocator_type()); 172 173 /// default constructor with a continuous variable as translator 174 /** @param var a continuous variable that will be used for 175 * translations. The translator keeps a copy of this variable 176 * @param missing_symbols the set of symbols in the database 177 * representing missing values 178 * @param fit_range if true, the range of the variable is updated 179 * so that it precisely fits the range of the observed values in the 180 * database, else the range is kept to (-inf,inf) 181 * @param alloc The allocator used to allocate memory for all the 182 * fields of the DBTranslator4ContinuousVariable 183 * @warning If a missing value symbol is a number included in the range 184 * of the continuous variable, it will be discarded. If the fit_range 185 * parameter is on, the range of the variable is updated so that it 186 * can contain the range of the observed values in the database. */ 187 template < typename GUM_SCALAR, template < typename > class XALLOC > 188 DBTranslator4ContinuousVariable( 189 const ContinuousVariable< GUM_SCALAR >& var, 190 const std::vector< std::string, XALLOC< std::string > >& missing_symbols, 191 const bool fit_range = false, 192 const allocator_type& alloc = allocator_type()); 193 194 /** @brief default constructor with a continuous variable as translator 195 * but without missing symbol 196 * 197 * @param var a continuous variable that will be used for 198 * translations. The translator keeps a copy of this variable 199 * @param fit_range if true, the range of the variable is updated 200 * so that it precisely fits the range of the observed values in the 201 * database, else the range is kept to (-inf,inf) 202 * @param alloc The allocator used to allocate memory for all the 203 * fields of the DBTranslator4ContinuousVariable 204 * @warning If a missing value symbol is a number included in the range 205 * of the continuous variable, it will be discarded. If the fit_range 206 * parameter is on, the range of the variable is updated so that it 207 * can contain the range of the observed values in the database. */ 208 template < typename GUM_SCALAR > 209 DBTranslator4ContinuousVariable(const ContinuousVariable< GUM_SCALAR >& var, 210 const bool fit_range = false, 211 const allocator_type& alloc = allocator_type()); 212 213 /// default constructor with a IContinuous variable as translator 214 /** @param var a IContinuous variable that will be used for 215 * translations. The translator keeps a copy of this variable 216 * @param missing_symbols the set of symbols in the database 217 * representing missing values 218 * @param fit_range if true, the range of the variable is updated 219 * so that it precisely fits the range of the observed values in the 220 * database, else the range is kept to (-inf,inf) 221 * @param alloc The allocator used to allocate memory for all the 222 * fields of the DBTranslator4ContinuousVariable 223 * @warning If a missing value symbol is a number included in the range 224 * of the continuous variable, it will be discarded. If the fit_range 225 * parameter is on, the range of the variable is updated so that it 226 * can contain the range of the observed values in the database. */ 227 template < template < typename > class XALLOC > 228 DBTranslator4ContinuousVariable( 229 const IContinuousVariable& var, 230 const std::vector< std::string, XALLOC< std::string > >& missing_symbols, 231 const bool fit_range = false, 232 const allocator_type& alloc = allocator_type()); 233 234 /** @brief default constructor with a IContinuous variable as translator 235 * but without missing symbol 236 * 237 * @param var a IContinuous variable that will be used for 238 * translations. The translator keeps a copy of this variable 239 * @param fit_range if true, the range of the variable is updated 240 * so that it precisely fits the range of the observed values in the 241 * database, else the range is kept to (-inf,inf) 242 * @param alloc The allocator used to allocate memory for all the 243 * fields of the DBTranslator4ContinuousVariable 244 * @warning If a missing value symbol is a number included in the range 245 * of the continuous variable, it will be discarded. If the fit_range 246 * parameter is on, the range of the variable is updated so that it 247 * can contain the range of the observed values in the database. */ 248 DBTranslator4ContinuousVariable(const IContinuousVariable& var, 249 const bool fit_range = false, 250 const allocator_type& alloc = allocator_type()); 251 252 /// copy constructor 253 DBTranslator4ContinuousVariable(const DBTranslator4ContinuousVariable< ALLOC >& from); 254 255 /// copy constructor with a given allocator 256 DBTranslator4ContinuousVariable(const DBTranslator4ContinuousVariable< ALLOC >& from, 257 const allocator_type& alloc); 258 259 /// move constructor 260 DBTranslator4ContinuousVariable(DBTranslator4ContinuousVariable< ALLOC >&& from); 261 262 /// move constructor with a given allocator 263 DBTranslator4ContinuousVariable(DBTranslator4ContinuousVariable< ALLOC >&& from, 264 const allocator_type& alloc); 265 266 /// virtual copy constructor 267 virtual DBTranslator4ContinuousVariable< ALLOC >* clone() const; 268 269 /// virtual copy constructor with a given allocator 270 virtual DBTranslator4ContinuousVariable< ALLOC >* clone(const allocator_type& alloc) const; 271 272 /// destructor 273 virtual ~DBTranslator4ContinuousVariable(); 274 275 /// @} 276 277 278 // ########################################################################## 279 /// @name Operators 280 // ########################################################################## 281 282 /// @{ 283 284 /// copy operator 285 DBTranslator4ContinuousVariable< ALLOC >& 286 operator=(const DBTranslator4ContinuousVariable< ALLOC >& from); 287 288 /// move operator 289 DBTranslator4ContinuousVariable< ALLOC >& 290 operator=(DBTranslator4ContinuousVariable< ALLOC >&& from); 291 292 /// @} 293 294 295 // ########################################################################## 296 /// @name Accessors / Modifiers 297 // ########################################################################## 298 299 /// @{ 300 301 /// returns the translation of a string 302 /** This method tries to translate a given string into the 303 * DBTranslatedValue that should be stored into a DatabaseTable. If the 304 * translator cannot find the translation in its current dictionary, then 305 * two situations can obtain: 306 * -# if the translator is not in an editable dictionary mode, then the 307 * translator raises a NotFound exception. 308 * -# if the translator is in an editable dictionary mode, i.e., it is 309 * allowed to update its dictionary, then it tries to update the range 310 * of its dictionary to include the new value. Upon success, it returns 311 * the translated value, otherwise, it raises either: 312 * - a TypeError exception if the string cannot be converted into a 313 * value that can be inserted into the dictionary 314 * - an OperationNotAllowed exception if the translation would induce 315 * incoherent behavior (e.g., a DBTranslator4ContinuousVariable that 316 * contains a variable whose domain is [x,y] as well as a missing 317 * value symbol z \f$\in\f$ [x,y]). 318 319 * @warning Note that missing values (i.e., string encoded as missing 320 * symbols) are translated as std::numeric_limits<float>::max (). 321 * @warning If the variable contained into the translator has a value in 322 * its domain equal to a missing value symbol, this value will be taken 323 * into account in the translations, not the missing value. 324 * @return the translated value of the string to be stored into a 325 * DatabaseTable 326 * @throws UnknownLabelInDatabase is raised if the number represented by 327 * the string is out of the range of the continuous variable and the 328 * translator is not in an editable dictionary mode. 329 * @throws OperationNotAllowed exception is raised if the translation 330 * cannot be found and the insertion of the string into the translator's 331 * dictionary fails because it would induce incoherent behavior (e.g., 332 * a DBTranslator4ContinuousVariable that contains a variable whose domain 333 * is [x,y] as well as a missing value symbol z \f$\in\f$ [x,y]). 334 * @throws TypeError is raised if the translation cannot be found and 335 * the insertion of the string into the translator's dictionary fails 336 * due to str being impossible to be converted into an appropriate type. */ 337 virtual DBTranslatedValue translate(const std::string& str) final; 338 339 /// returns the original value for a given translation 340 /** @return the string that was translated into a given DBTranslatedValue. 341 * @throws UnknownLabelInDatabase is raised if this original value is 342 * outside the domain of the continuous variable stored within the 343 * translator */ 344 virtual std::string translateBack(const DBTranslatedValue translated_val) const final; 345 346 /// returns std::numeric_limits<std::size_t>::max () 347 virtual std::size_t domainSize() const final; 348 349 /// indicates that the translations should never be reordered 350 virtual bool needsReordering() const final; 351 352 /** @brief returns an empty mapping, indicating that old tanslations 353 * are equal to the newly reordered ones. */ 354 virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > 355 reorder() final; 356 357 /// returns the variable stored into the translator 358 virtual const IContinuousVariable* variable() const final; 359 360 /// returns the translation of a missing value 361 virtual DBTranslatedValue missingValue() const final; 362 363 /// @} 364 365 366 #ifndef DOXYGEN_SHOULD_SKIP_THIS 367 368 private: 369 // the ContinuousVariable really used by the translator. As its values 370 // are floats, this speeds-up translations 371 ContinuousVariable< float > _variable_; 372 373 // the ContinuousVariablee returned by method variable () 374 // We must return a IContinuousVariable because the user may have 375 // saved into the translator a ContinuousVariable<X>, with X != float 376 IContinuousVariable* _real_variable_; 377 378 // assign to each float missing symbol a Boolean indicating whether 379 // we already translated it or not. If we translated it, then we cannot 380 // change the range of the variable so that this range contains the symbol. 381 HashTable< std::string, bool, ALLOC< std::pair< float, bool > > > 382 _status_float_missing_symbols_; 383 384 // a string containing a non real missing symbol 385 // (useful for back translations) 386 std::string _nonfloat_missing_symbol_; 387 388 // indicates whether we should fit the range of the observed values 389 bool _fit_range_; 390 391 #endif /* DOXYGEN_SHOULD_SKIP_THIS */ 392 }; 393 394 } /* namespace learning */ 395 396 } /* namespace gum */ 397 398 399 // always include the template implementation 400 #include <agrum/tools/database/DBTranslator4ContinuousVariable_tpl.h> 401 402 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_CONTINUOUS_VARIABLE_H */ 403