1 // OpenNN: Open Neural Networks Library 2 // www.opennn.net 3 // 4 // D A T A S E T C L A S S H E A D E R 5 // 6 // Artificial Intelligence Techniques SL 7 // artelnics@artelnics.com 8 9 #ifndef DATASET_H 10 #define DATASET_H 11 12 // System includes 13 14 #include <iostream> 15 #include <fstream> 16 #include <string> 17 #include <sstream> 18 #include <cmath> 19 #include <algorithm> 20 #include <cstdlib> 21 #include <stdexcept> 22 #include <ctime> 23 #include <exception> 24 #include <random> 25 #include <regex> 26 #include <map> 27 #include <stdlib.h> 28 #include <stdio.h> 29 #include <limits.h> 30 31 // OpenNN includes 32 33 #include "config.h" 34 #include "statistics.h" 35 #include "correlations.h" 36 #include "opennn_strings.h" 37 38 using namespace std; 39 using namespace Eigen; 40 41 namespace OpenNN 42 { 43 44 /// This class represents the concept of data set for data modelling problems, such as approximation, classification or forecasting. 45 46 /// 47 /// It basically consists of a data Matrix separated by columns. 48 /// These columns can take different categories depending on the data hosted in them. 49 /// 50 /// With OpenNN DataSet class you can edit the data to prepare your model, such as eliminating missing values, 51 /// calculating correlations between variables (inputs and targets), not using certain variables or samples, etc \dots. 52 53 class DataSet 54 { 55 56 public: 57 58 // Constructors 59 60 explicit DataSet(); 61 62 explicit DataSet(const Tensor<type, 2>&); 63 64 explicit DataSet(const Index&, const Index&); 65 66 explicit DataSet(const Index&, const Index&, const Index&); 67 68 explicit DataSet(const string&, const char&, const bool&); 69 70 // Destructor 71 72 virtual ~DataSet(); 73 74 // Enumerations 75 76 /// Enumeration of available separators for the data file. 77 78 enum Separator{Space, Tab, Comma, Semicolon}; 79 80 /// Enumeration of available methods for missing values in the data. 81 82 enum MissingValuesMethod{Unuse, Mean, Median}; 83 84 /// Enumeration of available methods for scaling and unscaling the data. 85 86 enum ScalingUnscalingMethod{NoScaling, NoUnscaling, MinimumMaximum, MeanStandardDeviation, StandardDeviation, Logarithmic}; 87 88 /// Enumeration of the learning tasks. 89 90 enum ProjectType{Approximation, Classification, Forecasting, ImageApproximation, ImageClassification}; 91 92 /// This enumeration represents the possible uses of an sample 93 /// (training, selection, testing or unused). 94 95 enum SampleUse{Training, Selection, Testing, UnusedSample}; 96 97 /// This enumeration represents the possible uses of an variable 98 /// (input, target, time or unused). 99 100 enum VariableUse{Id, Input, Target, Time, UnusedVariable}; 101 102 /// This enumeration represents the data type of a column 103 /// (numeric, binary, categorical or time). 104 105 enum ColumnType{Numeric, Binary, Categorical, DateTime, Constant}; 106 107 // Structs 108 109 /// This structure represents the columns of the DataSet. 110 111 struct Column 112 { 113 /// Default constructor. 114 115 Column(); 116 117 /// Values constructor 118 119 Column(const string&, const VariableUse&, const ColumnType& = Numeric, const Tensor<string, 1>& = Tensor<string, 1>(), const Tensor<VariableUse, 1>& = Tensor<VariableUse, 1>()); 120 121 /// Destructor. 122 123 virtual ~Column(); 124 125 /// Column name. 126 127 string name; 128 129 /// Column use. 130 131 VariableUse column_use; 132 133 /// Column type. 134 135 ColumnType type; 136 137 /// Categories within the column. 138 139 Tensor<string, 1> categories; 140 141 /// Categories use. 142 143 Tensor<VariableUse, 1> categories_uses; 144 145 // Methods 146 147 Index get_categories_number() const; 148 Index get_used_categories_number() const; 149 150 Tensor<string, 1> get_used_variables_names() const; 151 152 void set_use(const VariableUse&); 153 void set_use(const string&); 154 155 void set_type(const string&); 156 157 void add_category(const string&); 158 159 void set_categories_uses(const Tensor<string, 1>&); 160 void set_categories_uses(const VariableUse&); 161 162 bool is_used(); 163 bool is_unused(); 164 165 void from_XML(const tinyxml2::XMLDocument&); 166 void write_XML(tinyxml2::XMLPrinter&) const; 167 }; 168 169 170 struct Batch 171 { 172 /// Default constructor. 173 BatchBatch174 Batch() {} 175 176 Batch(const Index& new_samples_number, DataSet* new_data_set_pointer); 177 178 /// Destructor. 179 ~BatchBatch180 virtual ~Batch() {} 181 182 Index get_samples_number() const; 183 184 void print(); 185 186 void fill(const Tensor<Index, 1>& samples, const Tensor<Index, 1>& inputs, const Tensor<Index, 1>& targets); 187 188 // void fill_submatrix(const Tensor<type, 2>& matrix, 189 // const Tensor<Index, 1>& rows_indices, 190 // const Tensor<Index, 1>& columns_indices, Tensor<type, 2>& submatrix); 191 192 193 Index samples_number = 0; 194 195 DataSet* data_set_pointer = nullptr; 196 197 Tensor<type, 2> inputs_2d; 198 Tensor<type, 4> inputs_4d; 199 200 Tensor<type, 2> targets_2d; 201 }; 202 203 204 // Samples get methods 205 get_samples_number()206 inline Index get_samples_number() const {return samples_uses.size();} 207 208 Index get_training_samples_number() const; 209 Index get_selection_samples_number() const; 210 Index get_testing_samples_number() const; 211 212 Index get_used_samples_number() const; 213 Index get_unused_samples_number() const; 214 215 Tensor<Index, 1> get_training_samples_indices() const; 216 Tensor<Index, 1> get_selection_samples_indices() const; 217 Tensor<Index, 1> get_testing_samples_indices() const; 218 219 Tensor<Index, 1> get_used_samples_indices() const; 220 Tensor<Index, 1> get_unused_samples_indices() const; 221 222 SampleUse get_sample_use(const Index&) const; 223 const Tensor<SampleUse, 1>& get_samples_uses() const; 224 225 Tensor<Index, 1> get_samples_uses_numbers() const; 226 Tensor<type, 1> get_samples_uses_percentages() const; 227 228 string get_sample_string(const Index&, const string& = ",") const; 229 230 // Columns get methods 231 232 Tensor<Column, 1> get_columns() const; 233 Tensor<Column, 1> get_time_series_columns() const; 234 Tensor<Column, 1> get_input_columns() const; 235 Tensor<Column, 1> get_target_columns() const; 236 Tensor<Column, 1> get_used_columns() const; 237 238 Index get_columns_number() const; 239 240 Index get_input_columns_number() const; 241 Index get_target_columns_number() const; 242 Index get_time_columns_number() const; 243 Index get_unused_columns_number() const; 244 Index get_used_columns_number() const; 245 246 Index get_column_index(const string&) const; 247 Index get_column_index(const Index&) const; 248 249 Tensor<Index, 1> get_input_columns_indices() const; 250 Tensor<Index, 1> get_target_columns_indices() const; 251 Tensor<Index, 1> get_unused_columns_indices() const; 252 Tensor<Index, 1> get_used_columns_indices() const; 253 254 Tensor<string, 1> get_columns_names() const; 255 256 Tensor<string, 1> get_input_columns_names() const; 257 Tensor<string, 1> get_target_columns_names() const; 258 Tensor<string, 1> get_used_columns_names() const; 259 get_column_type(const Index & index)260 ColumnType get_column_type(const Index& index) const {return columns[index].type;} 261 262 VariableUse get_column_use(const Index &) const; 263 Tensor<VariableUse, 1> get_columns_uses() const; 264 265 // Variables get methods 266 267 Index get_variables_number() const; 268 269 Index get_input_variables_number() const; 270 Index get_target_variables_number() const; 271 Index get_unused_variables_number() const; 272 Index get_used_variables_number() const; 273 274 string get_variable_name(const Index&) const; 275 Tensor<string, 1> get_variables_names() const; 276 277 Tensor<string, 1> get_input_variables_names() const; 278 Tensor<string, 1> get_target_variables_names() const; 279 280 Index get_variable_index(const string&name) const; 281 282 Tensor<Index, 1> get_variable_indices(const Index&) const; 283 Tensor<Index, 1> get_unused_variables_indices() const; 284 Tensor<Index, 1> get_used_variables_indices() const; 285 Tensor<Index, 1> get_input_variables_indices() const; 286 Tensor<Index, 1> get_target_variables_indices() const; 287 288 VariableUse get_variable_use(const Index&) const; 289 Tensor<VariableUse, 1> get_variables_uses() const; 290 291 const Tensor<Index, 1>& get_input_variables_dimensions() const; 292 293 // Batches get methods 294 295 Tensor<Index, 2> get_batches(const Tensor<Index,1>&, const Index&, const bool&, const Index& buffer_size= 100) const; 296 297 // Data get methods 298 299 const Tensor<type, 2>& get_data() const; 300 Tensor<type, 2>* get_data_pointer(); 301 302 const Tensor<type, 2>& get_time_series_data() const; 303 304 Tensor<type, 2> get_training_data() const; 305 Tensor<type, 2> get_selection_data() const; 306 Tensor<type, 2> get_testing_data() const; 307 Tensor<string, 1> get_time_series_columns_names() const; 308 Index get_time_series_columns_number() const; 309 310 Tensor<type, 2> get_input_data() const; 311 Tensor<type, 2> get_target_data() const; 312 313 Tensor<type, 2> get_input_data(const Tensor<Index, 1>&) const; 314 Tensor<type, 2> get_target_data(const Tensor<Index, 1>&) const; 315 316 Tensor<type, 2> get_training_input_data() const; 317 Tensor<type, 2> get_training_target_data() const; 318 319 Tensor<type, 2> get_selection_input_data() const; 320 Tensor<type, 2> get_selection_target_data() const; 321 322 Tensor<type, 2> get_testing_input_data() const; 323 Tensor<type, 2> get_testing_target_data() const; 324 325 Tensor<type, 1> get_sample_data(const Index&) const; 326 Tensor<type, 1> get_sample_data(const Index&, const Tensor<Index, 1>&) const; 327 Tensor<type, 2> get_sample_input_data(const Index&) const; 328 Tensor<type, 2> get_sample_target_data(const Index&) const; 329 330 Tensor<type, 2> get_column_data(const Index&) const; 331 Tensor<type, 2> get_column_data(const Index&, Tensor<Index, 1>&) const; 332 Tensor<type, 2> get_column_data(const Tensor<Index, 1>&) const; 333 Tensor<type, 2> get_column_data(const string&) const; 334 335 Tensor<type, 1> get_variable_data(const Index&) const; 336 Tensor<type, 1> get_variable_data(const string&) const; 337 338 Tensor<type, 1> get_variable_data(const Index&, const Tensor<Index, 1>&) const; 339 Tensor<type, 1> get_variable_data(const string&, const Tensor<Index, 1>&) const; 340 341 Tensor<Tensor<string, 1>, 1> get_data_file_preview() const; 342 343 Tensor<type, 2> get_subtensor_data(const Tensor<Index, 1>&, const Tensor<Index, 1>&) const; 344 345 // Members get methods 346 347 MissingValuesMethod get_missing_values_method() const; 348 349 const string& get_data_file_name() const; 350 351 const bool& get_header_line() const; 352 const bool& get_rows_label() const; 353 354 Tensor<string, 1> get_rows_label_tensor() const; 355 Tensor<string, 1> get_selection_rows_label_tensor(); 356 Tensor<string, 1> get_testing_rows_label_tensor(); 357 358 const Separator& get_separator() const; 359 char get_separator_char() const; 360 string get_separator_string() const; 361 362 const string& get_missing_values_label() const; 363 364 const Index& get_lags_number() const; 365 const Index& get_steps_ahead() const; 366 const Index& get_time_index() const; 367 368 static Tensor<string, 1> get_default_columns_names(const Index&); 369 370 static ScalingUnscalingMethod get_scaling_unscaling_method(const string&); 371 372 Index get_gmt() const; 373 374 const bool& get_display() const; 375 376 // Set methods 377 378 void set(); 379 void set(const Tensor<type, 2>&); 380 void set(const Index&, const Index&); 381 void set(const Index&, const Index&, const Index&); 382 void set(const DataSet&); 383 void set(const tinyxml2::XMLDocument&); 384 void set(const string&); 385 386 void set_default(); 387 388 void set_threads_number(const int&); 389 390 // Samples set methods 391 392 void set_samples_number(const Index&); 393 394 void set_training(); 395 void set_selection(); 396 void set_testing(); 397 398 void set_training(const Tensor<Index, 1>&); 399 void set_selection(const Tensor<Index, 1>&); 400 void set_testing(const Tensor<Index, 1>&); 401 402 void set_samples_unused(); 403 void set_samples_unused(const Tensor<Index, 1>&); 404 405 void set_sample_use(const Index&, const SampleUse&); 406 void set_sample_use(const Index&, const string&); 407 408 void set_samples_uses(const Tensor<SampleUse, 1>&); 409 void set_samples_uses(const Tensor<string, 1>&); 410 411 void set_k_fold_cross_validation_samples_uses(const Index&, const Index&); 412 413 // Columns set methods 414 415 void set_default_columns_uses(); 416 void set_default_classification_columns_uses(); 417 418 void set_default_columns_names(); 419 420 void set_column_name(const Index&, const string&); 421 422 void set_columns_uses(const Tensor<string, 1>&); 423 void set_columns_uses(const Tensor<VariableUse, 1>&); 424 void set_columns_unused(); 425 void set_input_columns_unused(); 426 427 void set_column_use(const Index&, const VariableUse&); 428 void set_column_use(const string&, const VariableUse&); 429 430 void set_columns_names(const Tensor<string, 1>&); 431 432 void set_columns_number(const Index&); 433 434 void set_binary_simple_columns(); 435 436 void binarize_input_data(const type&); 437 438 // Columns other methods 439 440 Tensor<type,2> transform_binary_column(const Tensor<type,1>&) const; 441 442 // Variables set methods 443 444 void set_variables_names(const Tensor<string, 1>&); 445 void set_variable_name(const Index&, const string&); 446 447 void set_input(); 448 void set_target(); 449 void set_variables_unused(); 450 451 void set_input_variables_dimensions(const Tensor<Index, 1>&); 452 453 // Data set methods 454 455 void set_data(const Tensor<type, 2>&); 456 457 // Members set methods 458 459 void set_data_file_name(const string&); 460 461 void set_has_columns_names(const bool&); 462 void set_has_rows_label(const bool&); 463 464 void set_separator(const Separator&); 465 void set_separator(const string&); 466 void set_separator(const char&); 467 468 void set_missing_values_label(const string&); 469 void set_missing_values_method(const MissingValuesMethod&); 470 void set_missing_values_method(const string&); 471 472 void set_lags_number(const Index&); 473 void set_steps_ahead_number(const Index&); 474 void set_time_index(const Index&); 475 476 void set_gmt(Index&); 477 478 void set_display(const bool&); 479 480 // Check methods 481 482 bool is_binary_classification() const; 483 bool is_multiple_classification() const; 484 485 bool is_empty() const; 486 487 bool is_less_than(const Tensor<type, 1>&, const type&) const; 488 489 bool is_sample_used(const Index&) const; 490 bool is_sample_unused(const Index&) const; 491 492 bool has_data() const; 493 494 bool has_binary_columns() const; 495 bool has_categorical_columns() const; 496 bool has_time_columns() const; 497 498 bool has_selection() const; 499 500 // Splitting methods 501 502 void split_samples_sequential(const type& training_ratio = static_cast<type>(0.6), 503 const type& selection_ratio = static_cast<type>(0.2), 504 const type& testing_ratio = static_cast<type>(0.2)); 505 506 void split_samples_random(const type& training_ratio = static_cast<type>(0.6), 507 const type& selection_ratio = static_cast<type>(0.2), 508 const type& testing_ratio = static_cast<type>(0.2)); 509 510 // Unusing methods 511 512 Tensor<string, 1> unuse_constant_columns(); 513 514 Tensor<Index, 1> unuse_repeated_samples(); 515 516 Tensor<string, 1> unuse_uncorrelated_columns(const type& = 0.25); 517 518 // Initialization methods 519 520 void initialize_data(const type&); 521 522 void set_data_random(); 523 void set_data_binary_random(); 524 525 // Descriptives methods 526 527 Tensor<Descriptives, 1> calculate_variables_descriptives() const; 528 Tensor<Descriptives, 1> calculate_used_variables_descriptives() const; 529 530 Tensor<Descriptives, 1> calculate_columns_descriptives_positive_samples() const; 531 Tensor<Descriptives, 1> calculate_columns_descriptives_negative_samples() const; 532 Tensor<Descriptives, 1> calculate_columns_descriptives_categories(const Index&) const; 533 534 Tensor<Descriptives, 1> calculate_columns_descriptives_training_samples() const; 535 Tensor<Descriptives, 1> calculate_columns_descriptives_selection_samples() const; 536 537 Tensor<Descriptives, 1> calculate_input_variables_descriptives() const; 538 Tensor<Descriptives, 1> calculate_target_variables_descriptives() const; 539 540 Tensor<type, 1> calculate_input_variables_minimums() const; 541 Tensor<type, 1> calculate_target_variables_minimums() const; 542 Tensor<type, 1> calculate_input_variables_maximums() const; 543 Tensor<type, 1> calculate_target_variables_maximums() const; 544 545 Tensor<type, 1> calculate_variables_means(const Tensor<Index, 1>&) const; 546 Tensor<type, 1> calculate_used_variables_minimums() const; 547 548 Descriptives calculate_input_descriptives(const Index&) const; 549 550 Tensor<type, 1> calculate_used_targets_mean() const; 551 Tensor<type, 1> calculate_selection_targets_mean() const; 552 553 Index calculate_used_negatives(const Index&) const; 554 Index calculate_training_negatives(const Index&) const; 555 Index calculate_selection_negatives(const Index&) const; 556 Index calculate_testing_negatives(const Index&) const; 557 558 // Distribution methods 559 560 Tensor<Histogram, 1> calculate_columns_distribution(const Index& = 10) const; 561 562 // Box and whiskers 563 564 Tensor<BoxPlot, 1> calculate_columns_box_plots() const; 565 566 // Inputs correlations 567 568 Tensor<type, 2> calculate_input_columns_correlations() const; 569 570 void print_inputs_correlations() const; 571 572 void print_top_inputs_correlations(const Index& = 10) const; 573 574 // Inputs-targets correlations 575 576 Tensor<CorrelationResults, 2> calculate_input_target_columns_correlations() const; 577 Tensor<type, 2> calculate_input_target_columns_correlations_values() const; 578 579 void print_input_target_columns_correlations() const; 580 581 void print_top_input_target_columns_correlations(const Index& = 10) const; 582 583 // Inputs-targets regressions 584 585 Tensor<RegressionResults, 2> calculate_input_target_columns_regressions() const; 586 587 // Principal components 588 589 Tensor<type, 2> calculate_covariance_matrix() const; 590 591 Tensor<type, 2> perform_principal_components_analysis(const type& = 0.0); 592 593 Tensor<type, 2> perform_principal_components_analysis(const Tensor<type, 2>&, const Tensor<type, 1>&, const type& = 0.0); 594 595 void transform_principal_components_data(const Tensor<type, 2>&); 596 597 void subtract_inputs_mean(); 598 599 // Filtering methods 600 601 Tensor<Index, 1> filter_column(const Index&, const type&, const type&); 602 Tensor<Index, 1> filter_column(const string&, const type&, const type&); 603 604 Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&); 605 606 // Data scaling 607 608 Tensor<string, 1> calculate_default_scaling_methods() const; 609 Tensor<string, 1> calculate_default_unscaling_methods() const; 610 void scale_data_minimum_maximum(const Tensor<Descriptives, 1>&); 611 void scale_minimum_maximum_binary(const type&, const type&, const Index&); 612 void scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>&); 613 Tensor<Descriptives, 1> scale_data_minimum_maximum(); 614 Tensor<Descriptives, 1> scale_data_mean_standard_deviation(); 615 616 // Input variables scaling 617 618 void scale_input_mean_standard_deviation(const Descriptives&, const Index&); 619 Descriptives scale_input_mean_standard_deviation(const Index&); 620 621 void scale_input_standard_deviation(const Descriptives&, const Index&); 622 Descriptives scale_input_standard_deviation(const Index&); 623 624 void scale_input_minimum_maximum(const Descriptives&, const Index&); 625 Descriptives scale_input_minimum_maximum(const Index&); 626 627 void scale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>&); 628 Tensor<Descriptives, 1> scale_input_variables_minimum_maximum(); 629 630 void unscale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>&); 631 632 Tensor<Descriptives, 1> scale_input_variables(const Tensor<string, 1>&); 633 634 // Target variables scaling 635 636 void scale_target_minimum_maximum(const Descriptives&, const Index&); 637 void scale_target_mean_standard_deviation(const Descriptives&, const Index&); 638 void scale_target_logarithmic(const Descriptives&, const Index&); 639 640 void scale_target_variables_minimum_maximum(const Tensor<Descriptives, 1>&); 641 Tensor<Descriptives, 1> scale_target_variables_minimum_maximum(); 642 643 void scale_target_variables_mean_standard_deviation(const Tensor<Descriptives, 1>&); 644 Tensor<Descriptives, 1> scale_target_variables_mean_standard_deviation(); 645 646 void scale_target_variables_logarithm(const Tensor<Descriptives, 1>&); 647 Tensor<Descriptives, 1> scale_target_variables_logarithm(); 648 649 Tensor<Descriptives, 1> scale_target_variables(const string&); 650 Tensor<Descriptives, 1> scale_target_variables(const Tensor<string, 1>&); 651 652 // Data unscaling 653 654 void unscale_input_variable_minimum_maximum(const Descriptives&, const Index&); 655 void unscale_input_mean_standard_deviation(const Descriptives&, const Index&); 656 void unscale_input_variable_standard_deviation(const Descriptives&, const Index&); 657 void unscale_input_variables(const Tensor<string,1>&, const Tensor<Descriptives, 1>&); 658 659 void unscale_target_minimum_maximum(const Descriptives&, const Index&); 660 void unscale_target_mean_standard_deviation(const Descriptives&, const Index&); 661 void unscale_target_logarithmic(const Descriptives&, const Index&); 662 void unscale_target_variables(const Tensor<string,1>&, const Tensor<Descriptives, 1>&); 663 664 // Classification methods 665 666 Tensor<Index, 1> calculate_target_distribution() const; 667 668 // Outlier detection 669 670 Tensor<Tensor<Index, 1>, 1> calculate_Tukey_outliers(const type& = 1.5) const; 671 672 void unuse_Tukey_outliers(const type& = 1.5); 673 674 // Time series methods 675 676 void transform_time_series_columns(); 677 void transform_time_series_data(); 678 void get_time_series_columns_number(const Index&); 679 void set_time_series_data(const Tensor<type, 2>&); 680 681 Tensor<type, 2> get_time_series_column_data(const Index&) const; 682 Tensor<type, 2> calculate_autocorrelations(const Index& = 10) const; 683 Tensor<Tensor<type, 1>, 2> calculate_cross_correlations(const Index& = 10) const; 684 Tensor<type, 2> calculate_lag_plot() const; 685 Tensor<type, 2> calculate_lag_plot(const Index&); 686 687 // Data generation 688 689 void generate_constant_data(const Index&, const Index&); 690 void generate_random_data(const Index&, const Index&); 691 void generate_sequential_data(const Index&, const Index&); 692 void generate_paraboloid_data(const Index&, const Index&); 693 void generate_Rosenbrock_data(const Index&, const Index&); 694 void generate_inputs_selection_data(const Index&, const Index&); 695 void generate_sum_data(const Index&, const Index&); 696 697 void generate_data_binary_classification(const Index&, const Index&); 698 void generate_data_multiple_classification(const Index&, const Index&, const Index&); 699 700 // Serialization methods 701 702 void print() const; 703 void print_summary() const; 704 705 void from_XML(const tinyxml2::XMLDocument&); 706 void write_XML(tinyxml2::XMLPrinter&) const; 707 708 void save(const string&) const; 709 void load(const string&); 710 711 void print_columns_types() const; 712 713 void print_data() const; 714 void print_data_preview() const; 715 716 void print_data_file_preview() const; 717 718 void save_data() const; 719 720 void save_data_binary(const string&) const; 721 722 // Data load methods 723 724 void read_csv(); 725 726 void load_data_binary(); 727 728 void load_time_series_data_binary(); 729 730 void check_input_csv(const string&, const char&) const; 731 Tensor<type, 2> read_input_csv(const string&, const char&, const string&, const bool&, const bool&) const; 732 733 // Trasform methods 734 735 void transform_time_series(); 736 void transform_association(); 737 738 void fill_time_series(const Index&); 739 740 void numeric_to_categorical(const Index&); 741 742 // Missing values 743 744 bool has_nan() const; 745 746 bool has_nan_row(const Index&) const; 747 748 void print_missing_values_information() const; 749 750 void impute_missing_values_unuse(); 751 void impute_missing_values_mean(); 752 void impute_missing_values_median(); 753 754 void scrub_missing_values(); 755 756 Tensor<Index, 1> count_nan_columns() const; 757 Index count_rows_with_nan() const; 758 Index count_nan() const; 759 760 void set_missing_values_number(const Index&); 761 void set_missing_values_number(); 762 763 void set_columns_missing_values_number(const Tensor<Index, 1>&); 764 void set_columns_missing_values_number(); 765 766 void set_rows_missing_values_number(const Index&); 767 void set_rows_missing_values_number(); 768 769 // Other methods 770 771 void fix_repeated_names(); 772 773 // scaling 774 775 void set_min_max_range(const type min, const type max); 776 777 // Eigen methods 778 779 Tensor<Index, 1> push_back(const Tensor<Index, 1>&, const Index&) const; 780 Tensor<string, 1> push_back(const Tensor<string, 1>&, const string&) const; 781 782 void initialize_sequential_eigen_tensor(Tensor<Index, 1>&, const Index&, const Index&, const Index&) const; 783 void intialize_sequential_eigen_type_tensor(Tensor<type, 1>&, const type&, const type&, const type&) const; 784 785 Tensor<Index, 2> split_samples(const Tensor<Index, 1>&, const Index&) const; 786 787 void fill_submatrix(const Tensor<type, 2>& matrix, 788 const Tensor<Index, 1>& rows_indices, 789 const Tensor<Index, 1>& columns_indices, type*submatrix); 790 791 bool get_has_rows_labels() const; 792 793 void shuffle(); 794 795 private: 796 797 NonBlockingThreadPool* non_blocking_thread_pool = nullptr; 798 ThreadPoolDevice* thread_pool_device = nullptr; 799 800 /// Data file name. 801 802 string data_file_name; 803 804 /// Separator character. 805 806 Separator separator = Comma; 807 808 /// Missing values label. 809 810 string missing_values_label = "NA"; 811 812 /// Number of lags. 813 814 Index lags_number; 815 816 /// Number of steps ahead. 817 818 Index steps_ahead; 819 820 /// Min Max Range Scaling 821 822 type min_range = -1; 823 type max_range = 1; 824 825 /// Data Matrix. 826 /// The number of rows is the number of samples. 827 /// The number of columns is the number of variables. 828 829 Tensor<type, 2> data; 830 831 /// Time series data matrix. 832 /// The number of rows is the number of samples before time series transfomration. 833 /// The number of columns is the number of variables before time series transformation. 834 835 Tensor<type, 2> time_series_data; 836 837 Tensor<Column, 1> time_series_columns; 838 839 /// Display messages to screen. 840 841 bool display = true; 842 843 /// Index where time variable is located for forecasting applications. 844 845 Index time_index; 846 847 /// Missing values method object. 848 849 MissingValuesMethod missing_values_method = Unuse; 850 851 // Samples 852 853 Tensor<SampleUse, 1> samples_uses; 854 855 // Variables 856 857 // Reader 858 859 void read_csv_1(); 860 861 void read_csv_2_simple(); 862 void read_csv_3_simple(); 863 864 void read_csv_2_complete(); 865 void read_csv_3_complete(); 866 867 void check_separators(const string&) const; 868 869 void check_special_characters(const string&) const; 870 871 /// Header which contains variables name. 872 873 bool has_columns_names = false; 874 875 Tensor<Index, 1> input_variables_dimensions; 876 877 Tensor<Column, 1> columns; 878 879 /// Header wihch contains the rows label. 880 881 bool has_rows_labels = false; 882 883 Tensor<string, 1> rows_labels; 884 885 Index gmt = 0; 886 887 Tensor<Tensor<string, 1>, 1> data_file_preview; 888 889 Eigen::array<IndexPair<Index>, 1> product_vector_vector = {IndexPair<Index>(0, 0)}; // Vector product, (0,0) first vector is transpose 890 891 /// Missing values 892 893 Index missing_values_number; 894 895 Tensor<Index, 1> columns_missing_values_number; 896 897 Index rows_missing_values_number; 898 899 #ifdef OPENNN_CUDA 900 #include "../../opennn-cuda/opennn_cuda/data_set_cuda.h" 901 #endif 902 903 }; 904 905 } 906 907 #endif 908 909 // OpenNN: Open Neural Networks Library. 910 // Copyright(C) 2005-2020 Artificial Intelligence Techniques, SL. 911 // 912 // This library is free software; you can redistribute it and/or 913 // modify it under the terms of the GNU Lesser General Public 914 // License as published by the Free Software Foundation; either 915 // version 2.1 of the License, or any later version. 916 // 917 // This library is distributed in the hope that it will be useful, 918 // but WITHOUT ANY WARRANTY; without even the implied warranty of 919 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 920 // Lesser General Public License for more details. 921 922 // You should have received a copy of the GNU Lesser General Public 923 // License along with this library; if not, write to the Free Software 924 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 925