1 //   OpenNN: Open Neural Networks Library
2 //   www.opennn.net
3 //
4 //   D A T A   S E T   C L A S S   H E A D E R
5 //
6 //   Artificial Intelligence Techniques SL
7 //   artelnics@artelnics.com
8 
9 #ifndef DATASET_H
10 #define DATASET_H
11 
12 // System includes
13 
14 #include <iostream>
15 #include <fstream>
16 #include <string>
17 #include <sstream>
18 #include <cmath>
19 #include <algorithm>
20 #include <cstdlib>
21 #include <stdexcept>
22 #include <ctime>
23 #include <exception>
24 #include <random>
25 #include <regex>
26 #include <map>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <limits.h>
30 
31 // OpenNN includes
32 
33 #include "config.h"
34 #include "statistics.h"
35 #include "correlations.h"
36 #include "opennn_strings.h"
37 
38 using namespace std;
39 using namespace Eigen;
40 
41 namespace OpenNN
42 {
43 
/// This class represents the concept of a data set for data modelling problems, such as approximation, classification or forecasting.

///
/// It basically consists of a data matrix separated into columns.
/// These columns can take different categories depending on the data hosted in them.
///
/// With the OpenNN DataSet class you can edit the data to prepare your model, such as eliminating missing values,
/// calculating correlations between variables (inputs and targets), not using certain variables or samples, etc.
52 
53 class DataSet
54 {
55 
56 public:
57 
58    // Constructors
59 
60    explicit DataSet();
61 
62    explicit DataSet(const Tensor<type, 2>&);
63 
64    explicit DataSet(const Index&, const Index&);
65 
66    explicit DataSet(const Index&, const Index&, const Index&);
67 
68    explicit DataSet(const string&, const char&, const bool&);
69 
70    // Destructor
71 
72    virtual ~DataSet();
73 
74    // Enumerations
75 
76    /// Enumeration of available separators for the data file.
77 
78    enum Separator{Space, Tab, Comma, Semicolon};
79 
80    /// Enumeration of available methods for missing values in the data.
81 
82    enum MissingValuesMethod{Unuse, Mean, Median};
83 
84    /// Enumeration of available methods for scaling and unscaling the data.
85 
86    enum ScalingUnscalingMethod{NoScaling, NoUnscaling, MinimumMaximum, MeanStandardDeviation, StandardDeviation, Logarithmic};
87 
88    /// Enumeration of the learning tasks.
89 
90    enum ProjectType{Approximation, Classification, Forecasting, ImageApproximation, ImageClassification};
91 
92    /// This enumeration represents the possible uses of an sample
93    /// (training, selection, testing or unused).
94 
95    enum SampleUse{Training, Selection, Testing, UnusedSample};
96 
97    /// This enumeration represents the possible uses of an variable
98    /// (input, target, time or unused).
99 
100    enum VariableUse{Id, Input, Target, Time, UnusedVariable};
101 
102    /// This enumeration represents the data type of a column
103    /// (numeric, binary, categorical or time).
104 
105    enum ColumnType{Numeric, Binary, Categorical, DateTime, Constant};
106 
107    // Structs
108 
109    /// This structure represents the columns of the DataSet.
110 
111    struct Column
112    {
113        /// Default constructor.
114 
115        Column();
116 
117        /// Values constructor
118 
119        Column(const string&, const VariableUse&, const ColumnType& = Numeric, const Tensor<string, 1>& = Tensor<string, 1>(), const Tensor<VariableUse, 1>& = Tensor<VariableUse, 1>());
120 
121        /// Destructor.
122 
123        virtual ~Column();
124 
125        /// Column name.
126 
127        string name;
128 
129        /// Column use.
130 
131        VariableUse column_use;
132 
133        /// Column type.
134 
135        ColumnType type;
136 
137        /// Categories within the column.
138 
139        Tensor<string, 1> categories;
140 
141        /// Categories use.
142 
143        Tensor<VariableUse, 1> categories_uses;
144 
145        // Methods
146 
147        Index get_categories_number() const;
148        Index get_used_categories_number() const;
149 
150        Tensor<string, 1> get_used_variables_names() const;
151 
152        void set_use(const VariableUse&);
153        void set_use(const string&);
154 
155        void set_type(const string&);
156 
157        void add_category(const string&);
158 
159        void set_categories_uses(const Tensor<string, 1>&);
160        void set_categories_uses(const VariableUse&);
161 
162        bool is_used();
163        bool is_unused();
164 
165        void from_XML(const tinyxml2::XMLDocument&);
166        void write_XML(tinyxml2::XMLPrinter&) const;
167    };
168 
169 
170    struct Batch
171    {
172        /// Default constructor.
173 
BatchBatch174        Batch() {}
175 
176        Batch(const Index& new_samples_number, DataSet* new_data_set_pointer);
177 
178        /// Destructor.
179 
~BatchBatch180        virtual ~Batch() {}
181 
182        Index get_samples_number() const;
183 
184        void print();
185 
186        void fill(const Tensor<Index, 1>& samples, const Tensor<Index, 1>& inputs, const Tensor<Index, 1>& targets);
187 
188 //       void fill_submatrix(const Tensor<type, 2>& matrix,
189 //                 const Tensor<Index, 1>& rows_indices,
190 //                 const Tensor<Index, 1>& columns_indices, Tensor<type, 2>& submatrix);
191 
192 
193        Index samples_number = 0;
194 
195        DataSet* data_set_pointer = nullptr;
196 
197        Tensor<type, 2> inputs_2d;
198        Tensor<type, 4> inputs_4d;
199 
200        Tensor<type, 2> targets_2d;
201    };
202 
203 
204    // Samples get methods
205 
get_samples_number()206    inline Index get_samples_number() const {return samples_uses.size();}
207 
208    Index get_training_samples_number() const;
209    Index get_selection_samples_number() const;
210    Index get_testing_samples_number() const;
211 
212    Index get_used_samples_number() const;
213    Index get_unused_samples_number() const;
214 
215    Tensor<Index, 1> get_training_samples_indices() const;
216    Tensor<Index, 1> get_selection_samples_indices() const;
217    Tensor<Index, 1> get_testing_samples_indices() const;
218 
219    Tensor<Index, 1> get_used_samples_indices() const;
220    Tensor<Index, 1> get_unused_samples_indices() const;
221 
222    SampleUse get_sample_use(const Index&) const;
223    const Tensor<SampleUse, 1>& get_samples_uses() const;
224 
225    Tensor<Index, 1> get_samples_uses_numbers() const;
226    Tensor<type, 1> get_samples_uses_percentages() const;
227 
228    string get_sample_string(const Index&, const string& = ",") const;
229 
230    // Columns get methods
231 
232    Tensor<Column, 1> get_columns() const;
233    Tensor<Column, 1> get_time_series_columns() const;
234    Tensor<Column, 1> get_input_columns() const;
235    Tensor<Column, 1> get_target_columns() const;
236    Tensor<Column, 1> get_used_columns() const;
237 
238    Index get_columns_number() const;
239 
240    Index get_input_columns_number() const;
241    Index get_target_columns_number() const;
242    Index get_time_columns_number() const;
243    Index get_unused_columns_number() const;
244    Index get_used_columns_number() const;
245 
246    Index get_column_index(const string&) const;
247    Index get_column_index(const Index&) const;
248 
249    Tensor<Index, 1> get_input_columns_indices() const;
250    Tensor<Index, 1> get_target_columns_indices() const;
251    Tensor<Index, 1> get_unused_columns_indices() const;
252    Tensor<Index, 1> get_used_columns_indices() const;
253 
254    Tensor<string, 1> get_columns_names() const;
255 
256    Tensor<string, 1> get_input_columns_names() const;
257    Tensor<string, 1> get_target_columns_names() const;
258    Tensor<string, 1> get_used_columns_names() const;
259 
get_column_type(const Index & index)260    ColumnType get_column_type(const Index& index) const {return columns[index].type;}
261 
262    VariableUse get_column_use(const Index &) const;
263    Tensor<VariableUse, 1> get_columns_uses() const;
264 
265    // Variables get methods
266 
267    Index get_variables_number() const;
268 
269    Index get_input_variables_number() const;
270    Index get_target_variables_number() const;
271    Index get_unused_variables_number() const;
272    Index get_used_variables_number() const;
273 
274    string get_variable_name(const Index&) const;
275    Tensor<string, 1> get_variables_names() const;
276 
277    Tensor<string, 1> get_input_variables_names() const;
278    Tensor<string, 1> get_target_variables_names() const;
279 
280    Index get_variable_index(const string&name) const;
281 
282    Tensor<Index, 1> get_variable_indices(const Index&) const;
283    Tensor<Index, 1> get_unused_variables_indices() const;
284    Tensor<Index, 1> get_used_variables_indices() const;
285    Tensor<Index, 1> get_input_variables_indices() const;
286    Tensor<Index, 1> get_target_variables_indices() const;
287 
288    VariableUse get_variable_use(const Index&) const;
289    Tensor<VariableUse, 1> get_variables_uses() const;
290 
291    const Tensor<Index, 1>& get_input_variables_dimensions() const;
292 
293    // Batches get methods
294 
295    Tensor<Index, 2> get_batches(const Tensor<Index,1>&, const Index&, const bool&, const Index& buffer_size= 100) const;
296 
297    // Data get methods
298 
299    const Tensor<type, 2>& get_data() const;
300    Tensor<type, 2>* get_data_pointer();
301 
302    const Tensor<type, 2>& get_time_series_data() const;
303 
304    Tensor<type, 2> get_training_data() const;
305    Tensor<type, 2> get_selection_data() const;
306    Tensor<type, 2> get_testing_data() const;
307    Tensor<string, 1> get_time_series_columns_names() const;
308    Index get_time_series_columns_number() const;
309 
310    Tensor<type, 2> get_input_data() const;
311    Tensor<type, 2> get_target_data() const;
312 
313    Tensor<type, 2> get_input_data(const Tensor<Index, 1>&) const;
314    Tensor<type, 2> get_target_data(const Tensor<Index, 1>&) const;
315 
316    Tensor<type, 2> get_training_input_data() const;
317    Tensor<type, 2> get_training_target_data() const;
318 
319    Tensor<type, 2> get_selection_input_data() const;
320    Tensor<type, 2> get_selection_target_data() const;
321 
322    Tensor<type, 2> get_testing_input_data() const;
323    Tensor<type, 2> get_testing_target_data() const;
324 
325    Tensor<type, 1> get_sample_data(const Index&) const;
326    Tensor<type, 1> get_sample_data(const Index&, const Tensor<Index, 1>&) const;
327    Tensor<type, 2> get_sample_input_data(const Index&) const;
328    Tensor<type, 2> get_sample_target_data(const Index&) const;
329 
330    Tensor<type, 2> get_column_data(const Index&) const;
331    Tensor<type, 2> get_column_data(const Index&, Tensor<Index, 1>&) const;
332    Tensor<type, 2> get_column_data(const Tensor<Index, 1>&) const;
333    Tensor<type, 2> get_column_data(const string&) const;
334 
335    Tensor<type, 1> get_variable_data(const Index&) const;
336    Tensor<type, 1> get_variable_data(const string&) const;
337 
338    Tensor<type, 1> get_variable_data(const Index&, const Tensor<Index, 1>&) const;
339    Tensor<type, 1> get_variable_data(const string&, const Tensor<Index, 1>&) const;
340 
341    Tensor<Tensor<string, 1>, 1> get_data_file_preview() const;
342 
343    Tensor<type, 2> get_subtensor_data(const Tensor<Index, 1>&, const Tensor<Index, 1>&) const;
344 
345    // Members get methods
346 
347    MissingValuesMethod get_missing_values_method() const;
348 
349    const string& get_data_file_name() const;
350 
351    const bool& get_header_line() const;
352    const bool& get_rows_label() const;
353 
354    Tensor<string, 1> get_rows_label_tensor() const;
355    Tensor<string, 1> get_selection_rows_label_tensor();
356    Tensor<string, 1> get_testing_rows_label_tensor();
357 
358    const Separator& get_separator() const;
359    char get_separator_char() const;
360    string get_separator_string() const;
361 
362    const string& get_missing_values_label() const;
363 
364    const Index& get_lags_number() const;
365    const Index& get_steps_ahead() const;
366    const Index& get_time_index() const;
367 
368    static Tensor<string, 1> get_default_columns_names(const Index&);
369 
370    static ScalingUnscalingMethod get_scaling_unscaling_method(const string&);
371 
372    Index get_gmt() const;
373 
374    const bool& get_display() const;
375 
376    // Set methods
377 
378    void set();
379    void set(const Tensor<type, 2>&);
380    void set(const Index&, const Index&);
381    void set(const Index&, const Index&, const Index&);
382    void set(const DataSet&);
383    void set(const tinyxml2::XMLDocument&);
384    void set(const string&);
385 
386    void set_default();
387 
388    void set_threads_number(const int&);
389 
390    // Samples set methods
391 
392    void set_samples_number(const Index&);
393 
394    void set_training();
395    void set_selection();
396    void set_testing();
397 
398    void set_training(const Tensor<Index, 1>&);
399    void set_selection(const Tensor<Index, 1>&);
400    void set_testing(const Tensor<Index, 1>&);
401 
402    void set_samples_unused();
403    void set_samples_unused(const Tensor<Index, 1>&);
404 
405    void set_sample_use(const Index&, const SampleUse&);
406    void set_sample_use(const Index&, const string&);
407 
408    void set_samples_uses(const Tensor<SampleUse, 1>&);
409    void set_samples_uses(const Tensor<string, 1>&);
410 
411    void set_k_fold_cross_validation_samples_uses(const Index&, const Index&);
412 
413    // Columns set methods
414 
415    void set_default_columns_uses();
416    void set_default_classification_columns_uses();
417 
418    void set_default_columns_names();
419 
420    void set_column_name(const Index&, const string&);
421 
422    void set_columns_uses(const Tensor<string, 1>&);
423    void set_columns_uses(const Tensor<VariableUse, 1>&);
424    void set_columns_unused();
425    void set_input_columns_unused();
426 
427    void set_column_use(const Index&, const VariableUse&);
428    void set_column_use(const string&, const VariableUse&);
429 
430    void set_columns_names(const Tensor<string, 1>&);
431 
432    void set_columns_number(const Index&);
433 
434    void set_binary_simple_columns();
435 
436    void binarize_input_data(const type&);
437 
438    // Columns other methods
439 
440    Tensor<type,2> transform_binary_column(const Tensor<type,1>&) const;
441 
442    // Variables set methods
443 
444    void set_variables_names(const Tensor<string, 1>&);
445    void set_variable_name(const Index&, const string&);
446 
447    void set_input();
448    void set_target();
449    void set_variables_unused();
450 
451    void set_input_variables_dimensions(const Tensor<Index, 1>&);
452 
453    // Data set methods
454 
455    void set_data(const Tensor<type, 2>&);
456 
457    // Members set methods
458 
459    void set_data_file_name(const string&);
460 
461    void set_has_columns_names(const bool&);
462    void set_has_rows_label(const bool&);
463 
464    void set_separator(const Separator&);
465    void set_separator(const string&);
466    void set_separator(const char&);
467 
468    void set_missing_values_label(const string&);
469    void set_missing_values_method(const MissingValuesMethod&);
470    void set_missing_values_method(const string&);
471 
472    void set_lags_number(const Index&);
473    void set_steps_ahead_number(const Index&);
474    void set_time_index(const Index&);
475 
476    void set_gmt(Index&);
477 
478    void set_display(const bool&);
479 
480    // Check methods
481 
482    bool is_binary_classification() const;
483    bool is_multiple_classification() const;
484 
485    bool is_empty() const;
486 
487    bool is_less_than(const Tensor<type, 1>&, const type&) const;
488 
489    bool is_sample_used(const Index&) const;
490    bool is_sample_unused(const Index&) const;
491 
492    bool has_data() const;
493 
494    bool has_binary_columns() const;
495    bool has_categorical_columns() const;
496    bool has_time_columns() const;
497 
498    bool has_selection() const;
499 
500    // Splitting methods
501 
502    void split_samples_sequential(const type& training_ratio = static_cast<type>(0.6),
503                                    const type& selection_ratio = static_cast<type>(0.2),
504                                    const type& testing_ratio = static_cast<type>(0.2));
505 
506    void split_samples_random(const type& training_ratio = static_cast<type>(0.6),
507                                const type& selection_ratio = static_cast<type>(0.2),
508                                const type& testing_ratio = static_cast<type>(0.2));
509 
510    // Unusing methods
511 
512    Tensor<string, 1> unuse_constant_columns();
513 
514    Tensor<Index, 1> unuse_repeated_samples();
515 
516    Tensor<string, 1> unuse_uncorrelated_columns(const type& = 0.25);
517 
518    // Initialization methods
519 
520    void initialize_data(const type&);
521 
522    void set_data_random();
523    void set_data_binary_random();
524 
525    // Descriptives methods
526 
527    Tensor<Descriptives, 1> calculate_variables_descriptives() const;
528    Tensor<Descriptives, 1> calculate_used_variables_descriptives() const;
529 
530    Tensor<Descriptives, 1> calculate_columns_descriptives_positive_samples() const;
531    Tensor<Descriptives, 1> calculate_columns_descriptives_negative_samples() const;
532    Tensor<Descriptives, 1> calculate_columns_descriptives_categories(const Index&) const;
533 
534    Tensor<Descriptives, 1> calculate_columns_descriptives_training_samples() const;
535    Tensor<Descriptives, 1> calculate_columns_descriptives_selection_samples() const;
536 
537    Tensor<Descriptives, 1> calculate_input_variables_descriptives() const;
538    Tensor<Descriptives, 1> calculate_target_variables_descriptives() const;
539 
540    Tensor<type, 1> calculate_input_variables_minimums() const;
541    Tensor<type, 1> calculate_target_variables_minimums() const;
542    Tensor<type, 1> calculate_input_variables_maximums() const;
543    Tensor<type, 1> calculate_target_variables_maximums() const;
544 
545    Tensor<type, 1> calculate_variables_means(const Tensor<Index, 1>&) const;
546    Tensor<type, 1> calculate_used_variables_minimums() const;
547 
548    Descriptives calculate_input_descriptives(const Index&) const;
549 
550    Tensor<type, 1> calculate_used_targets_mean() const;
551    Tensor<type, 1> calculate_selection_targets_mean() const;
552 
553    Index calculate_used_negatives(const Index&) const;
554    Index calculate_training_negatives(const Index&) const;
555    Index calculate_selection_negatives(const Index&) const;
556    Index calculate_testing_negatives(const Index&) const;
557 
558    // Distribution methods
559 
560    Tensor<Histogram, 1> calculate_columns_distribution(const Index& = 10) const;
561 
562    // Box and whiskers
563 
564    Tensor<BoxPlot, 1> calculate_columns_box_plots() const;
565 
566    // Inputs correlations
567 
568    Tensor<type, 2> calculate_input_columns_correlations() const;
569 
570    void print_inputs_correlations() const;
571 
572    void print_top_inputs_correlations(const Index& = 10) const;
573 
574    // Inputs-targets correlations
575 
576    Tensor<CorrelationResults, 2> calculate_input_target_columns_correlations() const;
577    Tensor<type, 2> calculate_input_target_columns_correlations_values() const;
578 
579    void print_input_target_columns_correlations() const;
580 
581    void print_top_input_target_columns_correlations(const Index& = 10) const;
582 
583    // Inputs-targets regressions
584 
585    Tensor<RegressionResults, 2> calculate_input_target_columns_regressions() const;
586 
587    // Principal components
588 
589    Tensor<type, 2> calculate_covariance_matrix() const;
590 
591    Tensor<type, 2> perform_principal_components_analysis(const type& = 0.0);
592 
593    Tensor<type, 2> perform_principal_components_analysis(const Tensor<type, 2>&, const Tensor<type, 1>&, const type& = 0.0);
594 
595    void transform_principal_components_data(const Tensor<type, 2>&);
596 
597    void subtract_inputs_mean();
598 
599    // Filtering methods
600 
601    Tensor<Index, 1> filter_column(const Index&, const type&, const type&);
602    Tensor<Index, 1> filter_column(const string&, const type&, const type&);
603 
604    Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&);
605 
606    // Data scaling
607 
608    Tensor<string, 1> calculate_default_scaling_methods() const;
609    Tensor<string, 1> calculate_default_unscaling_methods() const;
610    void scale_data_minimum_maximum(const Tensor<Descriptives, 1>&);
611    void scale_minimum_maximum_binary(const type&, const type&, const Index&);
612    void scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>&);
613    Tensor<Descriptives, 1> scale_data_minimum_maximum();
614    Tensor<Descriptives, 1> scale_data_mean_standard_deviation();
615 
616    // Input variables scaling
617 
618    void scale_input_mean_standard_deviation(const Descriptives&, const Index&);
619    Descriptives scale_input_mean_standard_deviation(const Index&);
620 
621    void scale_input_standard_deviation(const Descriptives&, const Index&);
622    Descriptives scale_input_standard_deviation(const Index&);
623 
624    void scale_input_minimum_maximum(const Descriptives&, const Index&);
625    Descriptives scale_input_minimum_maximum(const Index&);
626 
627    void scale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>&);
628    Tensor<Descriptives, 1> scale_input_variables_minimum_maximum();
629 
630    void unscale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>&);
631 
632    Tensor<Descriptives, 1> scale_input_variables(const Tensor<string, 1>&);
633 
634    // Target variables scaling
635 
636    void scale_target_minimum_maximum(const Descriptives&, const Index&);
637    void scale_target_mean_standard_deviation(const Descriptives&, const Index&);
638    void scale_target_logarithmic(const Descriptives&, const Index&);
639 
640    void scale_target_variables_minimum_maximum(const Tensor<Descriptives, 1>&);
641    Tensor<Descriptives, 1> scale_target_variables_minimum_maximum();
642 
643    void scale_target_variables_mean_standard_deviation(const Tensor<Descriptives, 1>&);
644    Tensor<Descriptives, 1> scale_target_variables_mean_standard_deviation();
645 
646    void scale_target_variables_logarithm(const Tensor<Descriptives, 1>&);
647    Tensor<Descriptives, 1> scale_target_variables_logarithm();
648 
649    Tensor<Descriptives, 1> scale_target_variables(const string&);
650    Tensor<Descriptives, 1> scale_target_variables(const Tensor<string, 1>&);
651 
652    // Data unscaling
653 
654    void unscale_input_variable_minimum_maximum(const Descriptives&, const Index&);
655    void unscale_input_mean_standard_deviation(const Descriptives&, const Index&);
656    void unscale_input_variable_standard_deviation(const Descriptives&, const Index&);
657    void unscale_input_variables(const Tensor<string,1>&, const Tensor<Descriptives, 1>&);
658 
659    void unscale_target_minimum_maximum(const Descriptives&, const Index&);
660    void unscale_target_mean_standard_deviation(const Descriptives&, const Index&);
661    void unscale_target_logarithmic(const Descriptives&, const Index&);
662    void unscale_target_variables(const Tensor<string,1>&, const Tensor<Descriptives, 1>&);
663 
664    // Classification methods
665 
666    Tensor<Index, 1> calculate_target_distribution() const;
667 
668    // Outlier detection
669 
670    Tensor<Tensor<Index, 1>, 1> calculate_Tukey_outliers(const type& = 1.5) const;
671 
672    void unuse_Tukey_outliers(const type& = 1.5);
673 
674    // Time series methods
675 
676    void transform_time_series_columns();
677    void transform_time_series_data();
678    void get_time_series_columns_number(const Index&);
679    void set_time_series_data(const Tensor<type, 2>&);
680 
681    Tensor<type, 2> get_time_series_column_data(const Index&) const;
682    Tensor<type, 2> calculate_autocorrelations(const Index& = 10) const;
683    Tensor<Tensor<type, 1>, 2> calculate_cross_correlations(const Index& = 10) const;
684    Tensor<type, 2> calculate_lag_plot() const;
685    Tensor<type, 2> calculate_lag_plot(const Index&);
686 
687    // Data generation
688 
689    void generate_constant_data(const Index&, const Index&);
690    void generate_random_data(const Index&, const Index&);
691    void generate_sequential_data(const Index&, const Index&);
692    void generate_paraboloid_data(const Index&, const Index&);
693    void generate_Rosenbrock_data(const Index&, const Index&);
694    void generate_inputs_selection_data(const Index&, const Index&);
695    void generate_sum_data(const Index&, const Index&);
696 
697    void generate_data_binary_classification(const Index&, const Index&);
698    void generate_data_multiple_classification(const Index&, const Index&, const Index&);
699 
700    // Serialization methods
701 
702    void print() const;
703    void print_summary() const;
704 
705    void from_XML(const tinyxml2::XMLDocument&);
706    void write_XML(tinyxml2::XMLPrinter&) const;
707 
708    void save(const string&) const;
709    void load(const string&);
710 
711    void print_columns_types() const;
712 
713    void print_data() const;
714    void print_data_preview() const;
715 
716    void print_data_file_preview() const;
717 
718    void save_data() const;
719 
720    void save_data_binary(const string&) const;
721 
722    // Data load methods
723 
724    void read_csv();
725 
726    void load_data_binary();
727 
728    void load_time_series_data_binary();
729 
730    void check_input_csv(const string&, const char&) const;
731    Tensor<type, 2> read_input_csv(const string&, const char&, const string&, const bool&, const bool&) const;
732 
733    // Trasform methods
734 
735    void transform_time_series();
736    void transform_association();
737 
738    void fill_time_series(const Index&);
739 
740    void numeric_to_categorical(const Index&);
741 
742    // Missing values
743 
744    bool has_nan() const;
745 
746    bool has_nan_row(const Index&) const;
747 
748    void print_missing_values_information() const;
749 
750    void impute_missing_values_unuse();
751    void impute_missing_values_mean();
752    void impute_missing_values_median();
753 
754    void scrub_missing_values();
755 
756    Tensor<Index, 1> count_nan_columns() const;
757    Index count_rows_with_nan() const;
758    Index count_nan() const;
759 
760    void set_missing_values_number(const Index&);
761    void set_missing_values_number();
762 
763    void set_columns_missing_values_number(const Tensor<Index, 1>&);
764    void set_columns_missing_values_number();
765 
766    void set_rows_missing_values_number(const Index&);
767    void set_rows_missing_values_number();
768 
769    // Other methods
770 
771    void fix_repeated_names();
772 
773    // scaling
774 
775    void set_min_max_range(const type min, const type max);
776 
777    // Eigen methods
778 
779    Tensor<Index, 1> push_back(const Tensor<Index, 1>&, const Index&) const;
780    Tensor<string, 1> push_back(const Tensor<string, 1>&, const string&) const;
781 
782    void initialize_sequential_eigen_tensor(Tensor<Index, 1>&, const Index&, const Index&, const Index&) const;
783    void intialize_sequential_eigen_type_tensor(Tensor<type, 1>&, const type&, const type&, const type&) const;
784 
785    Tensor<Index, 2> split_samples(const Tensor<Index, 1>&, const Index&) const;
786 
787    void fill_submatrix(const Tensor<type, 2>& matrix,
788              const Tensor<Index, 1>& rows_indices,
789              const Tensor<Index, 1>& columns_indices, type*submatrix);
790 
791    bool get_has_rows_labels() const;
792 
793    void shuffle();
794 
795 private:
796 
797    NonBlockingThreadPool* non_blocking_thread_pool = nullptr;
798    ThreadPoolDevice* thread_pool_device = nullptr;
799 
800    /// Data file name.
801 
802    string data_file_name;
803 
804    /// Separator character.
805 
806    Separator separator = Comma;
807 
808    /// Missing values label.
809 
810    string missing_values_label = "NA";
811 
812    /// Number of lags.
813 
814    Index lags_number;
815 
816    /// Number of steps ahead.
817 
818    Index steps_ahead;
819 
820    /// Min Max Range Scaling
821 
822    type min_range = -1;
823    type max_range = 1;
824 
825    /// Data Matrix.
826    /// The number of rows is the number of samples.
827    /// The number of columns is the number of variables.
828 
829    Tensor<type, 2> data;
830 
831    /// Time series data matrix.
832    /// The number of rows is the number of samples before time series transfomration.
833    /// The number of columns is the number of variables before time series transformation.
834 
835    Tensor<type, 2> time_series_data;
836 
837    Tensor<Column, 1> time_series_columns;
838 
839    /// Display messages to screen.
840 
841    bool display = true;
842 
843    /// Index where time variable is located for forecasting applications.
844 
845    Index time_index;
846 
847    /// Missing values method object.
848 
849    MissingValuesMethod missing_values_method = Unuse;
850 
851    // Samples
852 
853    Tensor<SampleUse, 1> samples_uses;
854 
855    // Variables
856 
857    // Reader
858 
859    void read_csv_1();
860 
861    void read_csv_2_simple();
862    void read_csv_3_simple();
863 
864    void read_csv_2_complete();
865    void read_csv_3_complete();
866 
867    void check_separators(const string&) const;
868 
869    void check_special_characters(const string&) const;
870 
871    /// Header which contains variables name.
872 
873    bool has_columns_names = false;
874 
875    Tensor<Index, 1> input_variables_dimensions;
876 
877    Tensor<Column, 1> columns;
878 
879    /// Header wihch contains the rows label.
880 
881    bool has_rows_labels = false;
882 
883    Tensor<string, 1> rows_labels;
884 
885    Index gmt = 0;
886 
887    Tensor<Tensor<string, 1>, 1> data_file_preview;
888 
889    Eigen::array<IndexPair<Index>, 1> product_vector_vector = {IndexPair<Index>(0, 0)}; // Vector product, (0,0) first vector is transpose
890 
891    /// Missing values
892 
893    Index missing_values_number;
894 
895    Tensor<Index, 1> columns_missing_values_number;
896 
897    Index rows_missing_values_number;
898 
899 #ifdef OPENNN_CUDA
900     #include "../../opennn-cuda/opennn_cuda/data_set_cuda.h"
901 #endif
902 
903 };
904 
905 }
906 
907 #endif
908 
909 // OpenNN: Open Neural Networks Library.
910 // Copyright(C) 2005-2020 Artificial Intelligence Techniques, SL.
911 //
912 // This library is free software; you can redistribute it and/or
913 // modify it under the terms of the GNU Lesser General Public
914 // License as published by the Free Software Foundation; either
915 // version 2.1 of the License, or any later version.
916 //
917 // This library is distributed in the hope that it will be useful,
918 // but WITHOUT ANY WARRANTY; without even the implied warranty of
919 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
920 // Lesser General Public License for more details.
921 
922 // You should have received a copy of the GNU Lesser General Public
923 // License along with this library; if not, write to the Free Software
924 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
925