1 //   OpenNN: Open Neural Networks Library
2 //   www.opennn.net
3 //
4 //   D A T A   S E T   C L A S S
5 //
6 //   Artificial Intelligence Techniques SL
7 //   artelnics@artelnics.com
8 
9 #include "data_set.h"
10 #include <omp.h>
11 
12 using namespace  OpenNN;
13 
14 namespace OpenNN
15 {
16 
17 /// Default constructor.
18 /// It creates a data set object with zero samples and zero inputs and target variables.
19 /// It also initializes the rest of class members to their default values.
20 
DataSet::DataSet()
{
    // Build an empty data set, then apply the class defaults.

    set();

    set_default();
}
27 
28 
29 /// Default constructor. It creates a data set object from data Eigen Matrix.
30 /// It also initializes the rest of class members to their default values.
31 /// @param data Data Tensor<type, 2>.
32 
DataSet::DataSet(const Tensor<type, 2>& data)
{
    // Initialize the data set from the given matrix (rows = samples,
    // columns = variables), then apply the class defaults.

    set(data);

    set_default();
}
39 
40 
41 /// Samples and variables number constructor.
42 /// It creates a data set object with given samples and variables numbers.
43 /// All the variables are set as inputs.
44 /// It also initializes the rest of class members to their default values.
45 /// @param new_samples_number Number of samples in the data set.
46 /// @param new_variables_number Number of variables.
47 
DataSet::DataSet(const Index& new_samples_number, const Index& new_variables_number)
{
    // Allocate the requested samples x variables layout, then apply defaults.

    set(new_samples_number, new_variables_number);

    set_default();
}
54 
55 
56 /// Samples number, input variables number and target variables number constructor.
57 /// It creates a data set object with given samples and inputs and target variables numbers.
58 /// It also initializes the rest of class members to their default values.
59 /// @param new_samples_number Number of samples in the data set.
60 /// @param new_inputs_number Number of input variables.
61 /// @param new_targets_number Number of target variables.
62 
DataSet::DataSet(const Index& new_samples_number, const Index& new_inputs_number, const Index& new_targets_number)
{
    // Allocate the requested layout with an explicit input/target split,
    // then apply the class defaults.

    set(new_samples_number, new_inputs_number, new_targets_number);

    set_default();
}
69 
70 
71 /// File and separator constructor. It creates a data set object by loading the object members from a data file.
72 /// It also sets a separator.
73 /// Please mind about the file format. This is specified in the User's Guide.
/// @param data_file_name Data file name.
/// @param separator Separator character used in the data file.
76 
DataSet::DataSet(const string& data_file_name, const char& separator, const bool& new_has_columns_names)
{
    set();

    set_default();

    // The file name, separator and header flag must all be configured
    // before read_csv() parses the file.

    set_data_file_name(data_file_name);

    set_separator(separator);

    set_has_columns_names(new_has_columns_names);

    read_csv();
}
91 
92 
93 /// Destructor.
94 
DataSet::~DataSet()
{
    // Release the Eigen thread-pool objects owned by this data set.
    delete non_blocking_thread_pool;
    delete thread_pool_device;
}
100 
101 
102 /// Returns true if messages from this class can be displayed on the screen,
103 /// or false if messages from this class can't be displayed on the screen.
104 
const bool& DataSet::get_display() const
{
    // Read-only accessor for the display flag.
    return display;
}
109 
110 
111 /// Column default constructor
112 
Column()113 DataSet::Column::Column()
114 {
115     name = "";
116     column_use = Input;
117     type = Numeric;
118     categories.resize(0);
119     categories_uses.resize(0);
120 }
121 
122 
123 /// Column default constructor
124 
Column(const string & new_name,const VariableUse & new_column_use,const ColumnType & new_type,const Tensor<string,1> & new_categories,const Tensor<VariableUse,1> & new_categories_uses)125 DataSet::Column::Column(const string& new_name,
126                         const VariableUse& new_column_use,
127                         const ColumnType& new_type,
128                         const Tensor<string, 1>& new_categories,
129                         const Tensor<VariableUse, 1>& new_categories_uses)
130 {
131     name = new_name;
132     column_use = new_column_use;
133     type = new_type;
134     categories = new_categories;
135     categories_uses = new_categories_uses;
136 }
137 
138 /// Column destructor.
139 
DataSet::Column::~Column()
{
    // Nothing to release: all members clean up themselves.
}
142 
143 
144 /// Sets the use of the column and of the categories.
145 /// @param new_column_use New use of the column.
146 
set_use(const VariableUse & new_column_use)147 void DataSet::Column::set_use(const VariableUse& new_column_use)
148 {
149     column_use = new_column_use;
150 
151     for(Index i = 0; i < categories_uses.size(); i ++)
152     {
153         categories_uses(i) = new_column_use;
154     }
155 }
156 
157 
158 /// Sets the use of the column and of the categories.
159 /// @param new_column_use New use of the column in string format.
160 
set_use(const string & new_column_use)161 void DataSet::Column::set_use(const string& new_column_use)
162 {
163     if(new_column_use == "Input")
164     {
165         set_use(Input);
166     }
167     else if(new_column_use == "Target")
168     {
169         set_use(Target);
170     }
171     else if(new_column_use == "Time")
172     {
173         set_use(Time);
174     }
175     else if(new_column_use == "Unused")
176     {
177         set_use(UnusedVariable);
178     }
179     else
180     {
181         ostringstream buffer;
182 
183         buffer << "OpenNN Exception DataSet class.\n"
184                << "void set_use(const string&) method.\n"
185                << "Unknown use: " << new_column_use << "\n";
186 
187         throw logic_error(buffer.str());
188     }
189 }
190 
191 
192 /// Sets the column type.
193 /// @param new_column_type Column type in string format.
194 
set_type(const string & new_column_type)195 void DataSet::Column::set_type(const string& new_column_type)
196 {
197     if(new_column_type == "Numeric")
198     {
199         type = Numeric;
200     }
201     else if(new_column_type == "Binary")
202     {
203         type = Binary;
204     }
205     else if(new_column_type == "Categorical")
206     {
207         type = Categorical;
208     }
209     else if(new_column_type == "DateTime")
210     {
211         type = DateTime;
212     }
213     else if(new_column_type == "Constant")
214     {
215         type = Constant;
216     }
217     else
218     {
219         ostringstream buffer;
220 
221         buffer << "OpenNN Exception: DataSet class.\n"
222                << "void Column::set_type(const string&) method.\n"
223                << "Column type not valid (" << new_column_type << ").\n";
224 
225         throw logic_error(buffer.str());
226 
227     }
228 }
229 
230 
231 /// Adds a category to the categories vector of this column.
232 /// It also adds a default use for the category
233 /// @param new_category String that contains the name of the new category
234 
add_category(const string & new_category)235 void DataSet::Column::add_category(const string & new_category)
236 {
237     const Index old_categories_number = categories.size();
238 
239     Tensor<string, 1> old_categories = categories;
240     Tensor<VariableUse, 1> old_categories_uses = categories_uses;
241 
242     categories.resize(old_categories_number+1);
243     categories_uses.resize(old_categories_number+1);
244 
245     for(Index category_index = 0; category_index < old_categories_number; category_index++)
246     {
247         categories(category_index) = old_categories(category_index);
248         categories_uses(category_index) = column_use;
249     }
250 
251     categories(old_categories_number) = new_category;
252     categories_uses(old_categories_number) = column_use;
253 }
254 
255 
256 /// Sets the categories uses in the data set.
257 /// @param new_categories_uses String vector that contains the new categories of the data set.
258 
set_categories_uses(const Tensor<string,1> & new_categories_uses)259 void DataSet::Column::set_categories_uses(const Tensor<string, 1>& new_categories_uses)
260 {
261     const Index new_categories_uses_number = new_categories_uses.size();
262 
263     categories_uses.resize(new_categories_uses_number);
264 
265     for(Index i = 0; i < new_categories_uses.size(); i++)
266     {
267         if(new_categories_uses(i) == "Input")
268         {
269             categories_uses(i) = Input;
270         }
271         else if(new_categories_uses(i) == "Target")
272         {
273             categories_uses(i) = Target;
274         }
275         else if(new_categories_uses(i) == "Time")
276         {
277             categories_uses(i) = Time;
278         }
279         else if(new_categories_uses(i) == "Unused"
280                 || new_categories_uses(i) == "UnusedVariable")
281         {
282             categories_uses(i) = UnusedVariable;
283         }
284         else
285         {
286             ostringstream buffer;
287 
288             buffer << "OpenNN Exception: DataSet class.\n"
289                    << "void Column::set_categories_uses(const Tensor<string, 1>&) method.\n"
290                    << "Category use not valid (" << new_categories_uses(i) << ").\n";
291 
292             throw logic_error(buffer.str());
293 
294         }
295     }
296 }
297 
298 
299 /// Sets the categories uses in the data set.
300 /// @param new_categories_use New categories use
301 
void DataSet::Column::set_categories_uses(const VariableUse& new_categories_use)
{
    // Give every category of this column the same use.
    categories_uses.setConstant(new_categories_use);
}
306 
307 
/// Loads this column's members (name, use, type and, for categorical
/// columns, categories and their uses) from an XML document.
/// Throws logic_error when a required element is missing.

void DataSet::Column::from_XML(const tinyxml2::XMLDocument& column_document)
{
    ostringstream buffer;

    // Name

    const tinyxml2::XMLElement* name_element = column_document.FirstChildElement("Name");

    if(!name_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Name element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    // Empty elements yield a null GetText(); the current value is kept then.

    if(name_element->GetText())
    {
        const string new_name = name_element->GetText();

        name = new_name;
    }

    // Column use

    const tinyxml2::XMLElement* column_use_element = column_document.FirstChildElement("ColumnUse");

    if(!column_use_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Column use element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(column_use_element->GetText())
    {
        const string new_column_use = column_use_element->GetText();

        set_use(new_column_use);
    }

    // Type

    const tinyxml2::XMLElement* type_element = column_document.FirstChildElement("Type");

    if(!type_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Type element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(type_element->GetText())
    {
        const string new_type = type_element->GetText();
        set_type(new_type);
    }

    // Categories and their uses are only present for categorical columns.

    if(type == Categorical)
    {
        // Categories

        const tinyxml2::XMLElement* categories_element = column_document.FirstChildElement("Categories");

        if(!categories_element)
        {
            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
                   << "Categories element is nullptr.\n";

            throw logic_error(buffer.str());
        }

        if(categories_element->GetText())
        {
            const string new_categories = categories_element->GetText();

            // Categories are serialized as a ';'-separated list (see write_XML).

            categories = get_tokens(new_categories, ';');
        }

        // Categories uses

        const tinyxml2::XMLElement* categories_uses_element = column_document.FirstChildElement("CategoriesUses");

        if(!categories_uses_element)
        {
            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
                   << "Categories uses element is nullptr.\n";

            throw logic_error(buffer.str());
        }

        if(categories_uses_element->GetText())
        {
            const string new_categories_uses = categories_uses_element->GetText();

            set_categories_uses(get_tokens(new_categories_uses, ';'));
        }
    }
}
414 
415 
/// Serializes this column (name, use, type and, when applicable,
/// categories and their uses) into the given XML printer.
/// Mirrors the format read back by from_XML.

void DataSet::Column::write_XML(tinyxml2::XMLPrinter& file_stream) const
{
    // Name

    file_stream.OpenElement("Name");

    file_stream.PushText(name.c_str());

    file_stream.CloseElement();

    // Column use

    file_stream.OpenElement("ColumnUse");

    if(column_use == Input)
    {
        file_stream.PushText("Input");
    }
    else if (column_use == Target)
    {
        file_stream.PushText("Target");
    }
    else if (column_use == UnusedVariable)
    {
        file_stream.PushText("Unused");
    }
    else
    {
        // Remaining use is Time.
        file_stream.PushText("Time");
    }

    file_stream.CloseElement();

    // Type

    file_stream.OpenElement("Type");

    if(type == Numeric)
    {
        file_stream.PushText("Numeric");
    }
    else if (type == Binary)
    {
        file_stream.PushText("Binary");
    }
    else if (type == Categorical)
    {
        file_stream.PushText("Categorical");
    }
    else if(type == Constant)
    {
        file_stream.PushText("Constant");
    }
    else
    {
        // Remaining type is DateTime.
        file_stream.PushText("DateTime");
    }

    file_stream.CloseElement();

    // Both categorical and binary columns may carry categories;
    // binary columns with no categories are written without them.

    if(type == Categorical || type == Binary)
    {
        if(categories.size() == 0) return;

        // Categories, as a ';'-separated list

        file_stream.OpenElement("Categories");

        for(Index i = 0; i < categories.size(); i++)
        {
            file_stream.PushText(categories(i).c_str());

            if(i != categories.size()-1)
            {
                file_stream.PushText(";");
            }
        }

        file_stream.CloseElement();

        // Categories uses, as a ';'-separated list

        file_stream.OpenElement("CategoriesUses");

        for(Index i = 0; i < categories_uses.size(); i++)
        {
            if(categories_uses(i) == Input)
            {
                file_stream.PushText("Input");
            }
            else if (categories_uses(i) == Target)
            {
                file_stream.PushText("Target");
            }
            else if (categories_uses(i) == Time)
            {
                file_stream.PushText("Time");
            }
            else
            {
                file_stream.PushText("Unused");
            }

            if(i != categories_uses.size()-1)
            {
                file_stream.PushText(";");
            }
        }

        file_stream.CloseElement();
    }
    // The dedicated Binary branch below was superseded by the combined
    // (Categorical || Binary) branch above; kept for reference.
    /*else if(type == Binary)
    {
        if(categories.size() > 0)
        {
            // Categories

            file_stream.OpenElement("Categories");
            file_stream.PushText(categories(0).c_str());
            file_stream.PushText(";");
            file_stream.PushText(categories(1).c_str());
            file_stream.CloseElement();

            // Categories uses

            file_stream.OpenElement("CategoriesUses");

            if(categories_uses(0) == Input)
            {
                file_stream.PushText("Input");
            }
            else if (categories_uses(0) == Target)
            {
                file_stream.PushText("Target");
            }
            else if (categories_uses(0) == Time)
            {
                file_stream.PushText("Time");
            }
            else
            {
                file_stream.PushText("Unused");
            }

            file_stream.PushText(";");

            if(categories_uses(1) == Input)
            {
                file_stream.PushText("Input");
            }
            else if (categories_uses(1) == Target)
            {
                file_stream.PushText("Target");
            }
            else if (categories_uses(1) == Time)
            {
                file_stream.PushText("Time");
            }
            else
            {
                file_stream.PushText("Unused");
            }

            file_stream.CloseElement();
        }
    }*/
}
583 
584 
585 /// Returns the number of categories.
586 
Index DataSet::Column::get_categories_number() const
{
    // Number of categories of this column (zero for non-categorical columns).
    return categories.size();
}
591 
592 
593 /// Returns the number of used categories.
594 
get_used_categories_number() const595 Index DataSet::Column::get_used_categories_number() const
596 {
597     Index used_categories_number = 0;
598 
599     for(Index i = 0; i < categories.size(); i++)
600     {
601         if(categories_uses(i) != UnusedVariable) used_categories_number++;
602     }
603 
604     return used_categories_number;
605 }
606 
607 
608 /// Returns a string vector that contains the names of the used variables in the data set.
609 
get_used_variables_names() const610 Tensor<string, 1> DataSet::Column::get_used_variables_names() const
611 {
612     Tensor<string, 1> used_variables_names;
613 
614     if(type != Categorical && column_use != UnusedVariable)
615     {
616         used_variables_names.resize(1);
617         used_variables_names.setConstant(name);
618     }
619     else if(type == Categorical)
620     {
621         used_variables_names.resize(get_used_categories_number());
622 
623         Index category_index = 0;
624 
625         for(Index i = 0; i < categories.size(); i++)
626         {
627             if(categories_uses(i) != UnusedVariable)
628             {
629                 used_variables_names(category_index) = categories(i);
630 
631                 category_index++;
632             }
633         }
634     }
635 
636     return used_variables_names;
637 }
638 
639 
640 /// This method transforms the columns into time series for forecasting problems.
641 
/// This method transforms the columns into time series for forecasting problems.
/// Each original (non-DateTime) column is expanded into lags_number lagged
/// input columns ("name_lag_k") followed by steps_ahead target columns
/// ("name_ahead_k"); the result replaces the columns member.

void DataSet::transform_time_series_columns()
{
    const Index columns_number = get_columns_number();

    Tensor<Column, 1> new_columns;

    // A time column is dropped from the expansion, hence one fewer column.

    if(has_time_columns())
    {
        new_columns.resize((columns_number-1)*(lags_number+steps_ahead));
    }
    else
    {
        new_columns.resize(columns_number*(lags_number+steps_ahead));
    }

    Index lag_index = lags_number - 1;
    Index ahead_index = 0;
    Index column_index = 0;
    Index new_column_index = 0;

    for(Index i = 0; i < columns_number*(lags_number+steps_ahead); i++)
    {
        column_index = i%columns_number;

        // DateTime columns are skipped entirely.
        // NOTE(review): this `continue` also skips the lag_index/ahead_index
        // update at the bottom of the loop when the DateTime column is the
        // last one — confirm that is intended.

        if(time_series_columns(column_index).type == DateTime)
        {
            continue;
        }

        // The first lags_number*columns_number iterations produce lagged
        // inputs; the remaining iterations produce ahead targets.

        if(i < lags_number*columns_number)
        {
            new_columns(new_column_index).name = columns(column_index).name + "_lag_" + to_string(lag_index);
            new_columns(new_column_index).set_use(Input);

            new_columns(new_column_index).type = columns(column_index).type;
            new_columns(new_column_index).categories = columns(column_index).categories;
            new_columns(new_column_index).categories_uses = columns(column_index).categories_uses;

            new_column_index++;
        }
        else
        {
            new_columns(new_column_index).name = columns(column_index).name + "_ahead_" + to_string(ahead_index);
            new_columns(new_column_index).set_use(Target);

            new_columns(new_column_index).type = columns(column_index).type;
            new_columns(new_column_index).categories = columns(column_index).categories;
            new_columns(new_column_index).categories_uses = columns(column_index).categories_uses;

            new_column_index++;
        }

        // After finishing a full pass over the columns, move to the next
        // lag (while lags remain) or to the next ahead step.

        if(lag_index > 0 && column_index == columns_number - 1)
        {
            lag_index--;
        }
        else if(column_index == columns_number - 1)
        {
            ahead_index++;
        }
    }

    columns = new_columns;
}
706 
707 
708 /// Returns true if a given sample is to be used for training, selection or testing,
709 /// and false if it is to be unused.
710 /// @param index Sample index.
711 
is_sample_used(const Index & index) const712 bool DataSet::is_sample_used(const Index& index) const
713 {
714     if(samples_uses(index) == UnusedSample)
715     {
716         return false;
717     }
718     else
719     {
720         return true;
721     }
722 }
723 
724 
725 /// Returns true if a given sample is to be unused and false in other case.
726 /// @param index Sample index.
727 
is_sample_unused(const Index & index) const728 bool DataSet::is_sample_unused(const Index& index) const
729 {
730     if(samples_uses(index) == UnusedSample)
731     {
732         return true;
733     }
734     else
735     {
736         return false;
737     }
738 }
739 
740 
741 /// Returns a vector with the number of training, selection, testing
742 /// and unused samples.
743 /// The size of that vector is therefore four.
744 
get_samples_uses_numbers() const745 Tensor<Index, 1> DataSet::get_samples_uses_numbers() const
746 {
747     Tensor<Index, 1> count(4);
748 
749     const Index samples_number = get_samples_number();
750 
751     for(Index i = 0; i < samples_number; i++)
752     {
753         if(samples_uses(i) == Training)
754         {
755             count(0)++;
756         }
757         else if(samples_uses(i) == Selection)
758         {
759             count(1)++;
760         }
761         else if(samples_uses(i) == Testing)
762         {
763             count(2)++;
764         }
765         else
766         {
767             count(3)++;
768         }
769     }
770 
771     return count;
772 }
773 
774 
775 /// Returns a vector with the uses of the samples in percentages of the data set.
776 /// Uses: training, selection, testing and unused samples.
777 /// Note that the vector size is four.
778 
get_samples_uses_percentages() const779 Tensor<type, 1> DataSet::get_samples_uses_percentages() const
780 {
781     const Index samples_number = get_samples_number();
782     const Index training_samples_number = get_training_samples_number();
783     const Index selection_samples_number = get_selection_samples_number();
784     const Index testing_samples_number = get_testing_samples_number();
785     const Index unused_samples_number = get_unused_samples_number();
786 
787     const type training_samples_percentage = training_samples_number*100/static_cast<type>(samples_number);
788     const type selection_samples_percentage = selection_samples_number*100/static_cast<type>(samples_number);
789     const type testing_samples_percentage = testing_samples_number*100/static_cast<type>(samples_number);
790     const type unused_samples_percentage = unused_samples_number*100/static_cast<type>(samples_number);
791 
792     Tensor<type, 1> samples_uses_percentage(4);
793 
794     samples_uses_percentage.setValues({training_samples_percentage,
795                                          selection_samples_percentage,
796                                          testing_samples_percentage,
797                                          unused_samples_percentage});
798 
799     return samples_uses_percentage;
800 }
801 
802 
803 /// Returns a string with the values of the sample corresponding to the given index.
804 /// The values will be separated by the given separator char.
805 /// @param sample_index Index of the sample.
806 /// @param separator Separator.
807 
get_sample_string(const Index & sample_index,const string & separator) const808 string DataSet::get_sample_string(const Index& sample_index, const string& separator) const
809 {
810     const Tensor<type, 1> sample = data.chip(sample_index, 0);
811 
812     string sample_string = "";
813 
814     const Index columns_number = get_columns_number();
815 
816     Index variable_index = 0;
817 
818     for(Index i = 0; i < columns_number; i++)
819     {
820         if(columns(i).type == Numeric)
821         {
822             if(::isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
823             else sample_string += std::to_string(data(sample_index, variable_index));
824 
825             variable_index++;
826         }
827         else if(columns(i).type == Binary)
828         {
829             if(::isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
830             else sample_string += columns(i).categories(static_cast<Index>(data(sample_index, variable_index)));
831 
832             variable_index++;
833         }
834         else if(columns(i).type == DateTime)
835         {
836             // @todo do something
837 
838             if(::isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
839             else sample_string += std::to_string(data(sample_index, variable_index));
840 
841             variable_index++;
842         }
843         else if(columns(i).type == Categorical)
844         {
845             if(::isnan(data(sample_index, variable_index)))
846             {
847                 sample_string += missing_values_label;
848             }
849             else
850             {
851                 const Index categories_number = columns(i).get_categories_number();
852 
853                 for(Index j = 0; j < categories_number; j++)
854                 {
855                     if(abs(data(sample_index, variable_index+j) - static_cast<type>(1)) < std::numeric_limits<type>::min())
856                     {
857                         sample_string += columns(i).categories(j);
858                         break;
859                     }
860                 }
861 
862                 variable_index += categories_number;
863             }
864         }
865 
866         if(i != columns_number-1) sample_string += separator + " ";
867     }
868 
869     return sample_string;
870 }
871 
872 
873 /// Returns the indices of the samples which will be used for training.
874 
get_training_samples_indices() const875 Tensor<Index, 1> DataSet::get_training_samples_indices() const
876 {
877     const Index samples_number = get_samples_number();
878 
879     const Index training_samples_number = get_training_samples_number();
880 
881     Tensor<Index, 1> training_indices(training_samples_number);
882 
883     Index count = 0;
884 
885     for(Index i = 0; i < samples_number; i++)
886     {
887         if(samples_uses(i) == Training)
888         {
889             training_indices(count) = i;
890             count++;
891         }
892     }
893     return training_indices;
894 }
895 
896 
897 /// Returns the indices of the samples which will be used for selection.
898 
get_selection_samples_indices() const899 Tensor<Index, 1> DataSet::get_selection_samples_indices() const
900 {
901     const Index samples_number = get_samples_number();
902 
903     const Index selection_samples_number = get_selection_samples_number();
904 
905     Tensor<Index, 1> selection_indices(selection_samples_number);
906 
907     Index count = 0;
908 
909     for(Index i = 0; i < samples_number; i++)
910     {
911         if(samples_uses(i) == Selection)
912         {
913             selection_indices(count) = i;
914             count++;
915         }
916     }
917 
918     return selection_indices;
919 }
920 
921 
922 /// Returns the indices of the samples which will be used for testing.
923 
get_testing_samples_indices() const924 Tensor<Index, 1> DataSet::get_testing_samples_indices() const
925 {
926     const Index samples_number = get_samples_number();
927 
928     const Index testing_samples_number = get_testing_samples_number();
929 
930     Tensor<Index, 1> testing_indices(testing_samples_number);
931 
932     Index count = 0;
933 
934     for(Index i = 0; i < samples_number; i++)
935     {
936         if(samples_uses(i) == Testing)
937         {
938             testing_indices(count) = i;
939             count++;
940         }
941     }
942 
943     return testing_indices;
944 }
945 
946 
947 /// Returns the indices of the used samples(those which are not set unused).
948 
get_used_samples_indices() const949 Tensor<Index, 1> DataSet::get_used_samples_indices() const
950 {
951     const Index samples_number = get_samples_number();
952 
953     const Index used_samples_number = samples_number - get_unused_samples_number();
954 
955     Tensor<Index, 1> used_indices(used_samples_number);
956 
957     Index index = 0;
958 
959     for(Index i = 0; i < samples_number; i++)
960     {
961         if(samples_uses(i) != UnusedSample)
962         {
963             used_indices(index) = i;
964             index++;
965         }
966     }
967 
968     return used_indices;
969 }
970 
971 
972 /// Returns the indices of the samples set unused.
973 
get_unused_samples_indices() const974 Tensor<Index, 1> DataSet::get_unused_samples_indices() const
975 {
976     const Index samples_number = get_samples_number();
977 
978     const Index unused_samples_number = get_unused_samples_number();
979 
980     Tensor<Index, 1> unused_indices(unused_samples_number);
981 
982     Index count = 0;
983 
984     for(Index i = 0; i < samples_number; i++)
985     {
986         if(samples_uses(i) == UnusedSample)
987         {
988             unused_indices(count) = i;
989             count++;
990         }
991     }
992 
993     return unused_indices;
994 }
995 
996 
997 /// Returns the use of a single sample.
998 /// @param index Sample index.
999 
DataSet::SampleUse DataSet::get_sample_use(const Index& index) const
{
    // Use (training/selection/testing/unused) of one sample.
    return samples_uses(index);
}
1004 
1005 
1006 /// Returns the use of every sample (training, selection, testing or unused) in a vector.
1007 
const Tensor<DataSet::SampleUse,1 >& DataSet::get_samples_uses() const
{
    // Read-only view of the per-sample uses vector.
    return samples_uses;
}
1012 
1013 
1014 /// Returns a vector, where each element is a vector that contains the indices of the different batches of the training samples.
/// @param shuffle Is a boolean.
1016 /// If shuffle is true, then the indices are shuffled into batches, and false otherwise
1017 /// @todo In forecasting must be false.
1018 
get_batches(const Tensor<Index,1> & samples_indices,const Index & batch_samples_number,const bool & shuffle,const Index & new_buffer_size) const1019 Tensor<Index, 2> DataSet::get_batches(const Tensor<Index,1>& samples_indices,
1020                                       const Index& batch_samples_number,
1021                                       const bool& shuffle,
1022                                       const Index& new_buffer_size) const
1023 {
1024     if(!shuffle) return split_samples(samples_indices, batch_samples_number);
1025 
1026     const Index samples_number = samples_indices.size();
1027 
1028     Index buffer_size = new_buffer_size;
1029     Index batches_number;
1030     Index batch_size = batch_samples_number;
1031 
1032     // Check batch size and samples number
1033 
1034     if(samples_number < batch_size)
1035     {
1036         batches_number = 1;
1037         batch_size = samples_number;
1038         buffer_size = batch_size;
1039 
1040         Tensor<Index,1> samples_copy(samples_indices);
1041 
1042         Tensor<Index, 2> batches(batches_number, batch_size);
1043 
1044         // Shuffle
1045 
1046         random_shuffle(samples_copy.data(), samples_copy.data() + samples_copy.size());
1047 
1048         for(Index i = 0; i < batch_size; i++)
1049             batches(0,i) = samples_copy(i);
1050 
1051         return batches;
1052 
1053     }
1054     else
1055     {
1056         batches_number = samples_number / batch_size;
1057     }
1058 
1059 
1060     Tensor<Index, 2> batches(batches_number, batch_size);
1061 
1062     Tensor<Index, 1> buffer(buffer_size);
1063     for(Index i = 0; i < buffer_size; i++) buffer(i) = i;
1064 
1065     Index next_index = buffer_size;
1066     Index random_index = 0;
1067 
1068     // Heuristic cases for batch shuffling
1069 
1070     if(batch_size < buffer_size)
1071     {
1072         Index diff = buffer_size/ batch_size;
1073 
1074         // Main Loop
1075 
1076         for(Index i = 0; i < batches_number; i++)
1077         {
1078             // Last batch
1079 
1080             if(i == batches_number-diff)
1081             {
1082                 Index buffer_index = 0;
1083 
1084                 for(Index k = batches_number-diff; k < batches_number; k++)
1085                 {
1086                     for(Index j = 0; j < batch_size; j++)
1087                     {
1088                         batches(k,j) = buffer(buffer_index);
1089 
1090                         buffer_index++;
1091                     }
1092                 }
1093 
1094                 break;
1095             }
1096 
1097             // Shuffle batches
1098 
1099             for(Index j = 0; j < batch_size; j++)
1100             {
1101                 random_index = static_cast<Index>(rand()%buffer_size);
1102 
1103                 batches(i, j) = buffer(random_index);
1104 
1105                 buffer(random_index) = samples_indices(next_index);
1106 
1107                 next_index++;
1108             }
1109         }
1110 
1111         return batches;
1112     }
1113     else // buffer_size <= batch_size
1114     {
1115 
1116         // Main Loop
1117 
1118         for(Index i = 0; i < batches_number; i++)
1119         {
1120             // Last batch
1121 
1122             if(i == batches_number-1)
1123             {
1124                 random_shuffle(buffer.data(), buffer.data() +  buffer.size());
1125 
1126                 if(batch_size <= buffer_size)
1127                 {
1128                     for(Index j = 0; j < batch_size;j++)
1129                     {
1130                         batches(i,j) = buffer(j);
1131                     }
1132                 }
1133                 else //buffer_size < batch_size
1134                 {
1135                     for(Index j = 0; j < buffer_size; j++)
1136                     {
1137                         batches(i,j) = buffer(j);
1138                     }
1139 
1140                     for(Index j = buffer_size; j < batch_size; j++)
1141                     {
1142                         batches(i,j) = samples_indices(next_index);
1143 
1144                         next_index++;
1145                     }
1146                 }
1147 
1148                 break;
1149             }
1150 
1151             // Shuffle batches
1152 
1153             for(Index j = 0; j < batch_size; j++)
1154             {
1155                 random_index = static_cast<Index>(rand()%buffer_size);
1156 
1157                 batches(i, j) = buffer(random_index);
1158 
1159                 buffer(random_index) = samples_indices(next_index);
1160 
1161                 next_index++;
1162 
1163             }
1164         }
1165 
1166         return batches;
1167     }
1168 }
1169 
1170 
1171 /// Returns the number of samples in the data set which will be used for training.
1172 
get_training_samples_number() const1173 Index DataSet::get_training_samples_number() const
1174 {
1175     const Index samples_number = get_samples_number();
1176 
1177     Index training_samples_number = 0;
1178 
1179     for(Index i = 0; i < samples_number; i++)
1180     {
1181         if(samples_uses(i) == Training)
1182         {
1183             training_samples_number++;
1184         }
1185     }
1186 
1187     return training_samples_number;
1188 }
1189 
1190 
1191 /// Returns the number of samples in the data set which will be used for selection.
1192 
get_selection_samples_number() const1193 Index DataSet::get_selection_samples_number() const
1194 {
1195     const Index samples_number = get_samples_number();
1196 
1197     Index selection_samples_number = 0;
1198 
1199     for(Index i = 0; i < samples_number; i++)
1200     {
1201         if(samples_uses(i) == Selection)
1202         {
1203             selection_samples_number++;
1204         }
1205     }
1206 
1207     return selection_samples_number;
1208 }
1209 
1210 
1211 /// Returns the number of samples in the data set which will be used for testing.
1212 
get_testing_samples_number() const1213 Index DataSet::get_testing_samples_number() const
1214 {
1215     const Index samples_number = get_samples_number();
1216 
1217     Index testing_samples_number = 0;
1218 
1219     for(Index i = 0; i < samples_number; i++)
1220     {
1221         if(samples_uses(i) == Testing)
1222         {
1223             testing_samples_number++;
1224         }
1225     }
1226 
1227     return testing_samples_number;
1228 }
1229 
1230 
1231 /// Returns the total number of training, selection and testing samples,
1232 /// i.e. those which are not "Unused".
1233 
get_used_samples_number() const1234 Index DataSet::get_used_samples_number() const
1235 {
1236     const Index samples_number = get_samples_number();
1237     const Index unused_samples_number = get_unused_samples_number();
1238 
1239     return (samples_number - unused_samples_number);
1240 }
1241 
1242 
1243 /// Returns the number of samples in the data set which will neither be used
1244 /// for training, selection or testing.
1245 
get_unused_samples_number() const1246 Index DataSet::get_unused_samples_number() const
1247 {
1248     const Index samples_number = get_samples_number();
1249 
1250     Index unused_samples_number = 0;
1251 
1252     for(Index i = 0; i < samples_number; i++)
1253     {
1254         if(samples_uses(i) == UnusedSample)
1255         {
1256             unused_samples_number++;
1257         }
1258     }
1259 
1260     return unused_samples_number;
1261 }
1262 
1263 
1264 /// Sets all the samples in the data set for training.
1265 
set_training()1266 void DataSet::set_training()
1267 {
1268     const Index samples_number = get_samples_number();
1269 
1270     for(Index i = 0; i < samples_number; i++)
1271     {
1272         samples_uses(i) = Training;
1273     }
1274 }
1275 
1276 
1277 /// Sets all the samples in the data set for selection.
1278 
set_selection()1279 void DataSet::set_selection()
1280 {
1281     const Index samples_number = get_samples_number();
1282 
1283     for(Index i = 0; i < samples_number; i++)
1284     {
1285         samples_uses(i) = Selection;
1286     }
1287 }
1288 
1289 
1290 /// Sets all the samples in the data set for testing.
1291 
set_testing()1292 void DataSet::set_testing()
1293 {
1294     const Index samples_number = get_samples_number();
1295 
1296     for(Index i = 0; i < samples_number; i++)
1297     {
1298         samples_uses(i) = Testing;
1299     }
1300 }
1301 
1302 
1303 /// Sets samples with given indices in the data set for training.
1304 /// @param indices Indices vector with the index of samples in the data set for training.
1305 
set_training(const Tensor<Index,1> & indices)1306 void DataSet::set_training(const Tensor<Index, 1>& indices)
1307 {
1308     Index index = 0;
1309 
1310     for(Index i = 0; i < indices.size(); i++)
1311     {
1312         index = indices(i);
1313 
1314         samples_uses(index) = Training;
1315     }
1316 }
1317 
1318 
1319 /// Sets samples with given indices in the data set for selection.
1320 /// @param indices Indices vector with the index of samples in the data set for selection.
1321 
set_selection(const Tensor<Index,1> & indices)1322 void DataSet::set_selection(const Tensor<Index, 1>& indices)
1323 {
1324     Index index = 0;
1325 
1326     for(Index i = 0; i < indices.size(); i++)
1327     {
1328         index = indices(i);
1329 
1330         samples_uses(index) = Selection;
1331     }
1332 }
1333 
1334 
1335 /// Sets samples with given indices in the data set for testing.
1336 /// @param indices Indices vector with the index of samples in the data set for testing.
1337 
set_testing(const Tensor<Index,1> & indices)1338 void DataSet::set_testing(const Tensor<Index, 1>& indices)
1339 {
1340     Index index = 0;
1341 
1342     for(Index i = 0; i < indices.size(); i++)
1343     {
1344         index = indices(i);
1345 
1346         samples_uses(index) = Testing;
1347     }
1348 }
1349 
1350 
1351 /// Sets all the samples in the data set for unused.
1352 
set_samples_unused()1353 void DataSet::set_samples_unused()
1354 {
1355     const Index samples_number = get_samples_number();
1356 
1357     for(Index i = 0; i < samples_number; i++)
1358     {
1359         samples_uses(i) = UnusedSample;
1360     }
1361 }
1362 
1363 
1364 /// Sets samples with given indices in the data set for unused.
1365 /// @param indices Indices vector with the index of samples in the data set for unused.
1366 
set_samples_unused(const Tensor<Index,1> & indices)1367 void DataSet::set_samples_unused(const Tensor<Index, 1>& indices)
1368 {
1369     for(Index i = 0; i < static_cast<Index>(indices.size()); i++)
1370     {
1371         const Index index = indices(i);
1372 
1373         samples_uses(index) = UnusedSample;
1374     }
1375 }
1376 
1377 
/// Sets the use of a single sample.
/// @param index Index of sample.
/// @param new_use Use for that sample.
/// Note: no bounds checking is performed on @a index.

void DataSet::set_sample_use(const Index& index, const SampleUse& new_use)
{
    samples_uses(index) = new_use;

}
1387 
1388 
1389 /// Sets the use of a single sample from a string.
1390 /// @param index Index of sample.
1391 /// @param new_use String with the use name("Training", "Selection", "Testing" or "Unused")
1392 
set_sample_use(const Index & index,const string & new_use)1393 void DataSet::set_sample_use(const Index& index, const string& new_use)
1394 {
1395     if(new_use == "Training")
1396     {
1397         samples_uses(index) = Training;
1398     }
1399     else if(new_use == "Selection")
1400     {
1401         samples_uses(index) = Selection;
1402     }
1403     else if(new_use == "Testing")
1404     {
1405         samples_uses(index) = Testing;
1406     }
1407     else if(new_use == "Unused")
1408     {
1409         samples_uses(index) = UnusedSample;
1410     }
1411     else
1412     {
1413         ostringstream buffer;
1414 
1415         buffer << "OpenNN Exception DataSet class.\n"
1416                << "void set_sample_use(const string&) method.\n"
1417                << "Unknown use: " << new_use << "\n";
1418 
1419         throw logic_error(buffer.str());
1420     }
1421 }
1422 
1423 
1424 /// Sets new uses to all the samples from a single vector.
1425 /// @param new_uses vector of use structures.
1426 /// The size of given vector must be equal to the number of samples.
1427 
set_samples_uses(const Tensor<SampleUse,1> & new_uses)1428 void DataSet::set_samples_uses(const Tensor<SampleUse, 1>& new_uses)
1429 {
1430     const Index samples_number = get_samples_number();
1431 
1432 #ifdef __OPENNN_DEBUG__
1433 
1434     const Index new_uses_size = new_uses.size();
1435 
1436     if(new_uses_size != samples_number)
1437     {
1438         ostringstream buffer;
1439 
1440         buffer << "OpenNN Exception: DataSet class.\n"
1441                << "void set_samples_uses(const Tensor<SampleUse, 1>&) method.\n"
1442                << "Size of uses(" << new_uses_size << ") must be equal to number of samples(" << samples_number << ").\n";
1443 
1444         throw logic_error(buffer.str());
1445     }
1446 
1447 #endif
1448 
1449     for(Index i = 0; i < samples_number; i++)
1450     {
1451         samples_uses(i) = new_uses(i);
1452     }
1453 }
1454 
1455 
1456 /// Sets new uses to all the samples from a single vector of strings.
1457 /// @param new_uses vector of use strings.
1458 /// Possible values for the elements are "Training", "Selection", "Testing" and "Unused".
1459 /// The size of given vector must be equal to the number of samples.
1460 
set_samples_uses(const Tensor<string,1> & new_uses)1461 void DataSet::set_samples_uses(const Tensor<string, 1>& new_uses)
1462 {
1463     const Index samples_number = get_samples_number();
1464 
1465     ostringstream buffer;
1466 
1467 #ifdef __OPENNN_DEBUG__
1468 
1469     const Index new_uses_size = new_uses.size();
1470 
1471     if(new_uses_size != samples_number)
1472     {
1473         buffer << "OpenNN Exception: DataSet class.\n"
1474                << "void set_samples_uses(const Tensor<string, 1>&) method.\n"
1475                << "Size of uses(" << new_uses_size << ") must be equal to number of samples(" << samples_number << ").\n";
1476 
1477         throw logic_error(buffer.str());
1478     }
1479 
1480 #endif
1481 
1482     for(Index i = 0; i < samples_number; i++)
1483     {
1484         if(new_uses(i).compare("Training") == 0 || new_uses(i).compare("0") == 0)
1485         {
1486             samples_uses(i) = Training;
1487         }
1488         else if(new_uses(i).compare("Selection") == 0 || new_uses(i).compare("1") == 0)
1489         {
1490             samples_uses(i) = Selection;
1491         }
1492         else if(new_uses(i).compare("Testing") == 0 || new_uses(i).compare("2") == 0)
1493         {
1494             samples_uses(i) = Testing;
1495         }
1496         else if(new_uses(i).compare("Unused") == 0 || new_uses(i).compare("3") == 0)
1497         {
1498             samples_uses(i) = UnusedSample;
1499         }
1500         else
1501         {
1502             buffer << "OpenNN Exception DataSet class.\n"
1503                    << "void set_samples_uses(const Tensor<string, 1>&) method.\n"
1504                    << "Unknown use: " << new_uses(i) << ".\n";
1505 
1506             throw logic_error(buffer.str());
1507         }
1508     }
1509 }
1510 
1511 
1512 /// Creates new training, selection and testing indices at random.
1513 /// @param training_samples_ratio Ratio of training samples in the data set.
1514 /// @param selection_samples_ratio Ratio of selection samples in the data set.
1515 /// @param testing_samples_ratio Ratio of testing samples in the data set.
1516 
split_samples_random(const type & training_samples_ratio,const type & selection_samples_ratio,const type & testing_samples_ratio)1517 void DataSet::split_samples_random(const type& training_samples_ratio,
1518                                      const type& selection_samples_ratio,
1519                                      const type& testing_samples_ratio)
1520 {
1521 
1522     const Index used_samples_number = get_used_samples_number();
1523 
1524     if(used_samples_number == 0) return;
1525 
1526     const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1527 
1528     // Get number of samples for training, selection and testing
1529 
1530     const Index selection_samples_number = static_cast<Index>(selection_samples_ratio*used_samples_number/total_ratio);
1531     const Index testing_samples_number = static_cast<Index>(testing_samples_ratio*used_samples_number/total_ratio);
1532     const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1533 
1534     const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1535 
1536     if(sum_samples_number != used_samples_number)
1537     {
1538         ostringstream buffer;
1539 
1540         buffer << "OpenNN Warning: DataSet class.\n"
1541                << "void split_samples_random(const type&, const type&, const type&) method.\n"
1542                << "Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1543 
1544         throw logic_error(buffer.str());
1545     }
1546 
1547     const Index samples_number = get_samples_number();
1548 
1549     Tensor<Index, 1> indices;
1550 
1551     initialize_sequential_eigen_tensor(indices, 0, 1, samples_number-1);
1552 
1553     random_shuffle(indices.data(), indices.data() + indices.size());
1554 
1555     Index count = 0;
1556 
1557     for(Index i = 0; i < samples_uses.size(); i++)
1558     {
1559         if(samples_uses(i) == UnusedSample) count ++;
1560     }
1561 
1562     Index i = 0;
1563     Index index;
1564 
1565     // Training
1566 
1567     Index count_training = 0;
1568 
1569     while(count_training != training_samples_number)
1570     {
1571         index = indices(i);
1572 
1573         if(samples_uses(index) != UnusedSample)
1574         {
1575             samples_uses(index)= Training;
1576             count_training++;
1577         }
1578 
1579         i++;
1580     }
1581 
1582     // Selection
1583 
1584     Index count_selection = 0;
1585 
1586     while(count_selection != selection_samples_number)
1587     {
1588         index = indices(i);
1589 
1590         if(samples_uses(index) != UnusedSample)
1591         {
1592             samples_uses(index) = Selection;
1593             count_selection++;
1594         }
1595 
1596         i++;
1597     }
1598 
1599     // Testing
1600 
1601 
1602     Index count_testing = 0;
1603 
1604     while(count_testing != testing_samples_number)
1605     {
1606         index = indices(i);
1607 
1608         if(samples_uses(index) != UnusedSample)
1609         {
1610             samples_uses(index) = Testing;
1611             count_testing++;
1612         }
1613 
1614         i++;
1615     }
1616 
1617     for(Index i = 0; i < samples_uses.size(); i++)
1618     {
1619         if(samples_uses(i) == UnusedSample)
1620         {
1621             cout << "Sample " << i << " is unused" << endl;
1622         }
1623     }
1624 
1625 
1626 }
1627 
1628 
1629 /// Creates new training, selection and testing indices with sequential indices.
1630 /// @param training_samples_ratio Ratio of training samples in the data set.
1631 /// @param selection_samples_ratio Ratio of selection samples in the data set.
1632 /// @param testing_samples_ratio Ratio of testing samples in the data set.
1633 
split_samples_sequential(const type & training_samples_ratio,const type & selection_samples_ratio,const type & testing_samples_ratio)1634 void DataSet::split_samples_sequential(const type& training_samples_ratio,
1635         const type& selection_samples_ratio,
1636         const type& testing_samples_ratio)
1637 {
1638     const Index used_samples_number = get_used_samples_number();
1639 
1640     if(used_samples_number == 0) return;
1641 
1642     const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1643 
1644     // Get number of samples for training, selection and testing
1645 
1646     const Index selection_samples_number = static_cast<Index>(selection_samples_ratio*used_samples_number/total_ratio);
1647     const Index testing_samples_number = static_cast<Index>(testing_samples_ratio*used_samples_number/total_ratio);
1648     const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1649 
1650     const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1651 
1652     if(sum_samples_number != used_samples_number)
1653     {
1654         ostringstream buffer;
1655 
1656         buffer << "OpenNN Warning: Samples class.\n"
1657                << "void split_samples_sequential(const type&, const type&, const type&) method.\n"
1658                << "Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1659 
1660         throw logic_error(buffer.str());
1661     }
1662 
1663     Index i = 0;
1664 
1665     // Training
1666 
1667     Index count_training = 0;
1668 
1669     while(count_training != training_samples_number)
1670     {
1671         if(samples_uses(i) != UnusedSample)
1672         {
1673             samples_uses(i) = Training;
1674             count_training++;
1675         }
1676 
1677         i++;
1678     }
1679 
1680     // Selection
1681 
1682     Index count_selection = 0;
1683 
1684     while(count_selection != selection_samples_number)
1685     {
1686         if(samples_uses(i) != UnusedSample)
1687         {
1688             samples_uses(i) = Selection;
1689             count_selection++;
1690         }
1691 
1692         i++;
1693     }
1694 
1695     // Testing
1696 
1697     Index count_testing = 0;
1698 
1699     while(count_testing != testing_samples_number)
1700     {
1701         if(samples_uses(i) != UnusedSample)
1702         {
1703             samples_uses(i) = Testing;
1704             count_testing++;
1705         }
1706         i++;
1707     }
1708 }
1709 
1710 
1711 /// This method separates the dataset into n-groups to validate a model with limited data.
1712 /// @param k Number of folds that a given data sample is given to be split into.
1713 /// @param fold_index.
1714 /// @todo Low priority
1715 
set_k_fold_cross_validation_samples_uses(const Index & k,const Index & fold_index)1716 void DataSet::set_k_fold_cross_validation_samples_uses(const Index& k, const Index& fold_index)
1717 {
1718     const Index samples_number = get_samples_number();
1719 
1720     const Index fold_size = samples_number/k;
1721 
1722     const Index start = fold_index*fold_size;
1723     const Index end = start + fold_size;
1724 
1725     split_samples_random(1, 0, 0);
1726 
1727     for(Index i = start; i < end; i++)
1728     {
1729         samples_uses(i) = Testing;
1730     }
1731 }
1732 
1733 
1734 /// This method sets the n columns of the dataset by default,
1735 /// i.e. until column n-1 are Input and column n is Target.
1736 
set_default_columns_uses()1737 void DataSet::set_default_columns_uses()
1738 {
1739     const Index size = columns.size();
1740 
1741     if(size == 0)
1742     {
1743         return;
1744     }
1745     else if(size == 1)
1746     {
1747         columns(0).set_use(UnusedVariable);
1748     }
1749     else
1750     {
1751         set_input();
1752 
1753         for(Index i = columns.size()-1; i >= 0; i--)
1754         {
1755             if(columns(i).type == Constant) continue;
1756             if(columns(i).type == Binary) continue;
1757             if(columns(i).type == Categorical) continue;
1758 
1759             columns(i).set_use(Target);
1760             break;
1761         }
1762 
1763         input_variables_dimensions.resize(1);
1764     }
1765 }
1766 
1767 
1768 /// This method sets the n columns of the dataset by default,
1769 /// i.e. until column n-1 are Input and column n is Target.
1770 
set_default_classification_columns_uses()1771 void DataSet::set_default_classification_columns_uses()
1772 {
1773     const Index size = columns.size();
1774 
1775     if(size == 0)
1776     {
1777         return;
1778     }
1779     else if(size == 1)
1780     {
1781         columns(0).set_use(UnusedVariable);
1782     }
1783     else
1784     {
1785         set_input();
1786 
1787         for(Index i = columns.size()-1; i >= 0; i--)
1788         {
1789             if(columns(i).type == Constant) continue;
1790 
1791             if(columns(i).type == Binary)
1792             {
1793                 columns(i).set_use(Target);
1794                 break;
1795             }
1796             else if(columns(i).type == Categorical)
1797             {
1798                 columns(i).set_use(Target);
1799                 break;
1800             }
1801         }
1802 
1803         input_variables_dimensions.resize(1);
1804     }
1805 }
1806 
1807 
1808 /// This method puts the names of the columns in the dataset.
1809 /// This is used when the dataset does not have a header,
1810 /// the default names are: column_0, column_1, ..., column_n.
1811 
set_default_columns_names()1812 void DataSet::set_default_columns_names()
1813 {
1814     const Index size = columns.size();
1815 
1816     if(size == 0)
1817     {
1818         return;
1819     }
1820     else if(size == 1)
1821     {
1822         return;
1823     }
1824     else
1825     {
1826         Index input_index = 1;
1827         Index target_index = 2;
1828 
1829         for(Index i = 0; i < size; i++)
1830         {
1831             if(columns(i).column_use == Input)
1832             {
1833                 columns(i).name = "input_" + std::to_string(input_index+1);
1834                 input_index++;
1835             }
1836             else if(columns(i).column_use == Target)
1837             {
1838                 columns(i).name = "target_" + std::to_string(target_index+1);
1839                 target_index++;
1840             }
1841         }
1842     }
1843 }
1844 
1845 
/// Sets the name of a single column.
/// @param column_index Index of the column.
/// @param new_name New name for that column.

void DataSet::set_column_name(const Index& column_index, const string& new_name)
{
    columns(column_index).name = new_name;
}
1854 
1855 
/// Returns the use of a single variable.
/// @param index Index of variable.
/// Note: builds the full variables-uses vector internally, so this call is
/// O(variables); prefer get_variables_uses() when querying many indices.

DataSet::VariableUse DataSet::get_variable_use(const Index& index) const
{
    return get_variables_uses()(index);
}
1863 
1864 
/// Returns the use of a single column, without taking into account the categories.
/// @param index Index of the column.

DataSet::VariableUse DataSet::get_column_use(const Index & index) const
{
    return columns(index).column_use;
}
1871 
1872 
1873 /// Returns the uses of each columns of the data set.
1874 
get_columns_uses() const1875 Tensor<DataSet::VariableUse, 1> DataSet::get_columns_uses() const
1876 {
1877     const Index columns_number = get_columns_number();
1878 
1879     Tensor<DataSet::VariableUse, 1> columns_uses(columns_number);
1880 
1881     for (Index i = 0; i < columns_number; i++)
1882     {
1883         columns_uses(i) = columns(i).column_use;
1884     }
1885 
1886     return columns_uses;
1887 }
1888 
1889 
1890 /// Returns a vector containing the use of each column, including the categories.
1891 /// The size of the vector is equal to the number of variables.
1892 
get_variables_uses() const1893 Tensor<DataSet::VariableUse, 1> DataSet::get_variables_uses() const
1894 {
1895     const Index columns_number = get_columns_number();
1896     const Index variables_number = get_variables_number();
1897 
1898     Tensor<VariableUse, 1> variables_uses(variables_number);
1899 
1900     Index index = 0;
1901 
1902     for(Index i = 0; i < columns_number; i++)
1903     {
1904         if(columns(i).type == Categorical)
1905         {
1906             for(Index i = 0; i < (columns(i).categories_uses).size(); i++)
1907             {
1908                 variables_uses(i + index) = (columns(i).categories_uses)(i);
1909             }
1910             index += columns(i).categories.size();
1911         }
1912         else
1913         {
1914             variables_uses(index) = columns(i).column_use;
1915             index++;
1916         }
1917     }
1918 
1919     return variables_uses;
1920 }
1921 
1922 
1923 /// Returns the name of a single variable in the data set.
1924 /// @param index Index of variable.
1925 
get_variable_name(const Index & variable_index) const1926 string DataSet::get_variable_name(const Index& variable_index) const
1927 {
1928 #ifdef __OPENNN_DEBUG__
1929 
1930     const Index variables_number = get_variables_number();
1931 
1932     if(variable_index >= variables_number)
1933     {
1934         ostringstream buffer;
1935 
1936         buffer << "OpenNN Exception: DataSet class.\n"
1937                << "string& get_variable_name(const Index) method.\n"
1938                << "Index of variable("<<variable_index<<") must be less than number of variables("<<variables_number<<").\n";
1939 
1940         throw logic_error(buffer.str());
1941     }
1942 
1943 #endif
1944 
1945     const Index columns_number = get_columns_number();
1946 
1947     Index index = 0;
1948 
1949     for(Index i = 0; i < columns_number; i++)
1950     {
1951         if(columns(i).type == Categorical)
1952         {
1953             for(Index j = 0; j < columns(i).get_categories_number(); j++)
1954             {
1955                 if(index == variable_index)
1956                 {
1957                     return columns(i).categories(j);
1958                 }
1959                 else
1960                 {
1961                     index++;
1962                 }
1963             }
1964         }
1965         else
1966         {
1967             if(index == variable_index)
1968             {
1969                 return columns(i).name;
1970             }
1971             else
1972             {
1973                 index++;
1974             }
1975         }
1976     }
1977 
1978     return string();
1979 }
1980 
1981 
1982 /// Returns a string vector with the names of all the variables in the data set.
1983 /// The size of the vector is the number of variables.
1984 
get_variables_names() const1985 Tensor<string, 1> DataSet::get_variables_names() const
1986 {
1987     const Index variables_number = get_variables_number();
1988 
1989     Tensor<string, 1> variables_names(variables_number);
1990 
1991     Index index = 0;
1992 
1993     for(Index i = 0; i < columns.size(); i++)
1994     {
1995         if(columns(i).type == Categorical)
1996         {
1997             for(Index j = 0; j < columns(i).categories.size(); j++)
1998             {
1999                 variables_names(index) = columns(i).categories(j);
2000 
2001                 index++;
2002             }
2003         }
2004         else
2005         {
2006             variables_names(index) = columns(i).name;
2007             index++;
2008         }
2009     }
2010 
2011     return variables_names;
2012 }
2013 
2014 
2015 /// Returns the names of the input variables in the data set.
2016 /// The size of the vector is the number of input variables.
2017 
get_input_variables_names() const2018 Tensor<string, 1> DataSet::get_input_variables_names() const
2019 {
2020     const Index input_variables_number = get_input_variables_number();
2021 
2022     const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
2023 
2024     Tensor<string, 1> input_variables_names(input_variables_number);
2025 
2026     Index index = 0;
2027 
2028     for(Index i = 0; i < input_columns_indices.size(); i++)
2029     {
2030         Index input_index = input_columns_indices(i);
2031 
2032         const Tensor<string, 1> current_used_variables_names = columns(input_index).get_used_variables_names();
2033 
2034         for(Index j = 0; j < current_used_variables_names.size(); j++)
2035         {
2036             input_variables_names(index + j) = current_used_variables_names(j);
2037         }
2038 
2039         index += current_used_variables_names.size();
2040     }
2041 
2042     return input_variables_names;
2043 }
2044 
2045 
2046 /// Returns the names of the target variables in the data set.
2047 /// The size of the vector is the number of target variables.
2048 
get_target_variables_names() const2049 Tensor<string, 1> DataSet::get_target_variables_names() const
2050 {
2051     const Index target_variables_number = get_target_variables_number();
2052 
2053     const Tensor<Index, 1> target_columns_indices = get_target_columns_indices();
2054 
2055     Tensor<string, 1> target_variables_names(target_variables_number);
2056 
2057     Index index = 0;
2058 
2059     for(Index i = 0; i < target_columns_indices.size(); i++)
2060     {
2061         Index target_index = target_columns_indices(i);
2062 
2063         const Tensor<string, 1> current_used_variables_names = columns(target_index).get_used_variables_names();
2064 
2065         for(Index j = 0; j < current_used_variables_names.size(); j++)
2066         {
2067             target_variables_names(index + j) = current_used_variables_names(j);
2068         }
2069 
2070         index += current_used_variables_names.size();
2071     }
2072 
2073     return target_variables_names;
2074 }
2075 
2076 
2077 /// Returns the dimensions of the input variables.
2078 
const Tensor<Index, 1>& DataSet::get_input_variables_dimensions() const
{
    // Returns a reference to the stored member; valid as long as this DataSet lives.
    return input_variables_dimensions;
}
2083 
2084 
2085 /// Returns the number of variables which are either input nor target.
2086 
get_used_variables_number() const2087 Index DataSet::get_used_variables_number() const
2088 {
2089     const Index variables_number = get_variables_number();
2090 
2091     const Index unused_variables_number = get_unused_variables_number();
2092 
2093     return (variables_number - unused_variables_number);
2094 }
2095 
2096 
2097 /// Returns a indices vector with the positions of the inputs.
2098 
get_input_columns_indices() const2099 Tensor<Index, 1> DataSet::get_input_columns_indices() const
2100 {
2101     const Index input_columns_number = get_input_columns_number();
2102 
2103     Tensor<Index, 1> input_columns_indices(input_columns_number);
2104 
2105     Index index = 0;
2106 
2107     for(Index i = 0; i < columns.size(); i++)
2108     {
2109         if(columns(i).column_use == Input)
2110         {
2111             input_columns_indices(index) = i;
2112             index++;
2113         }
2114     }
2115 
2116     return input_columns_indices;
2117 }
2118 
2119 
2120 /// Returns a indices vector with the positions of the targets.
2121 
get_target_columns_indices() const2122 Tensor<Index, 1> DataSet::get_target_columns_indices() const
2123 {
2124     const Index target_columns_number = get_target_columns_number();
2125 
2126     Tensor<Index, 1> target_columns_indices(target_columns_number);
2127 
2128     Index index = 0;
2129 
2130     for(Index i = 0; i < columns.size(); i++)
2131     {
2132         if(columns(i).column_use == Target)
2133         {
2134             target_columns_indices(index) = i;
2135             index++;
2136         }
2137     }
2138 
2139     return target_columns_indices;
2140 }
2141 
2142 
2143 /// Returns a indices vector with the positions of the unused columns.
2144 
get_unused_columns_indices() const2145 Tensor<Index, 1> DataSet::get_unused_columns_indices() const
2146 {
2147     const Index unused_columns_number = get_unused_columns_number();
2148 
2149     Tensor<Index, 1> unused_columns_indices(unused_columns_number);
2150 
2151     Index index = 0;
2152 
2153     for(Index i = 0; i < unused_columns_number; i++)
2154     {
2155 
2156         if(columns(i).column_use == UnusedVariable)
2157         {
2158             unused_columns_indices(index) = i;
2159             index++;
2160         }
2161     }
2162 
2163     return unused_columns_indices;
2164 }
2165 
2166 
2167 /// Returns a indices vector with the positions of the used columns.
2168 
get_used_columns_indices() const2169 Tensor<Index, 1> DataSet::get_used_columns_indices() const
2170 {
2171     const Index variables_number = get_variables_number();
2172 
2173     const Index used_variables_number = get_used_variables_number();
2174 
2175     Tensor<Index, 1> used_indices(used_variables_number);
2176 
2177     Index index = 0;
2178 
2179     for(Index i = 0; i < variables_number; i++)
2180     {
2181         if(columns(i).column_use  == Input
2182                 || columns(i).column_use  == Target
2183                 || columns(i).column_use  == Time)
2184         {
2185             used_indices(index) = i;
2186             index++;
2187         }
2188     }
2189 
2190     return used_indices;
2191 }
2192 
2193 
2194 /// Returns a string vector that contains the names of the columns.
2195 
get_columns_names() const2196 Tensor<string, 1> DataSet::get_columns_names() const
2197 {
2198     const Index columns_number = get_columns_number();
2199 
2200     Tensor<string, 1> columns_names(columns_number);
2201 
2202     for(Index i = 0; i < columns_number; i++)
2203     {
2204         columns_names(i) = columns(i).name;
2205     }
2206 
2207     return columns_names;
2208 }
2209 
2210 
get_time_series_columns_names() const2211 Tensor<string, 1> DataSet::get_time_series_columns_names() const
2212 {
2213     const Index columns_number = get_time_series_columns_number();
2214 
2215     Tensor<string, 1> columns_names(columns_number);
2216 
2217     for(Index i = 0; i < columns_number; i++)
2218     {
2219         columns_names(i) = time_series_columns(i).name;
2220     }
2221 
2222     return columns_names;
2223 }
2224 
2225 /// Returns a string vector that contains the names of the columns whose uses are Input.
2226 
get_input_columns_names() const2227 Tensor<string, 1> DataSet::get_input_columns_names() const
2228 {
2229     const Index input_columns_number = get_input_columns_number();
2230 
2231     Tensor<string, 1> input_columns_names(input_columns_number);
2232 
2233     Index index = 0;
2234 
2235     for(Index i = 0; i < columns.size(); i++)
2236     {
2237         if(columns(i).column_use == Input)
2238         {
2239             input_columns_names(index) = columns(i).name;
2240             index++;
2241         }
2242     }
2243 
2244     return input_columns_names;
2245 }
2246 
2247 
2248 /// Returns a string vector which contains the names of the columns whose uses are Target.
2249 
get_target_columns_names() const2250 Tensor<string, 1> DataSet::get_target_columns_names() const
2251 {
2252     const Index target_columns_number = get_target_columns_number();
2253 
2254     Tensor<string, 1> target_columns_names(target_columns_number);
2255 
2256     Index index = 0;
2257 
2258     for(Index i = 0; i < columns.size(); i++)
2259     {
2260         if(columns(i).column_use == Target)
2261         {
2262             target_columns_names(index) = columns(i).name;
2263             index++;
2264         }
2265     }
2266 
2267     return target_columns_names;
2268 
2269 }
2270 
2271 
2272 /// Returns a string vector which contains the names of the columns used whether Input, Target or Time.
2273 
get_used_columns_names() const2274 Tensor<string, 1> DataSet::get_used_columns_names() const
2275 {
2276     const Index columns_number = get_columns_number();
2277     const Index used_columns_number = get_used_columns_number();
2278 
2279     Tensor<string, 1> names(used_columns_number);
2280 
2281     Index index = 0 ;
2282 
2283     for(Index i = 0; i < columns_number; i++)
2284     {
2285         if(columns(i).column_use != UnusedVariable)
2286         {
2287             names(index) = columns(i).name;
2288             index++;
2289         }
2290     }
2291 
2292     return names;
2293 }
2294 
2295 
2296 /// Returns the number of columns whose uses are Input.
2297 
get_input_columns_number() const2298 Index DataSet::get_input_columns_number() const
2299 {
2300     Index input_columns_number = 0;
2301 
2302     for(Index i = 0; i < columns.size(); i++)
2303     {
2304         if(columns(i).column_use == Input)
2305         {
2306             input_columns_number++;
2307         }
2308     }
2309 
2310     return input_columns_number;
2311 }
2312 
2313 
2314 /// Returns the number of columns whose uses are Target.
2315 
get_target_columns_number() const2316 Index DataSet::get_target_columns_number() const
2317 {
2318     Index target_columns_number = 0;
2319 
2320     for(Index i = 0; i < columns.size(); i++)
2321     {
2322         if(columns(i).column_use == Target)
2323         {
2324             target_columns_number++;
2325         }
2326     }
2327 
2328     return target_columns_number;
2329 }
2330 
2331 
2332 /// Returns the number of columns whose uses are Time
2333 
get_time_columns_number() const2334 Index DataSet::get_time_columns_number() const
2335 {
2336     Index time_columns_number = 0;
2337 
2338     for(Index i = 0; i < columns.size(); i++)
2339     {
2340         if(columns(i).column_use == Time)
2341         {
2342             time_columns_number++;
2343         }
2344     }
2345 
2346     return time_columns_number;
2347 }
2348 
2349 
2350 /// Returns the number of columns that are not used.
2351 
get_unused_columns_number() const2352 Index DataSet::get_unused_columns_number() const
2353 {
2354     Index unused_columns_number = 0;
2355 
2356     for(Index i = 0; i < columns.size(); i++)
2357     {
2358         if(columns(i).column_use == UnusedVariable)
2359         {
2360             unused_columns_number++;
2361         }
2362     }
2363 
2364     return unused_columns_number;
2365 }
2366 
2367 
2368 /// Returns the number of columns that are used.
2369 
get_used_columns_number() const2370 Index DataSet::get_used_columns_number() const
2371 {
2372     Index used_columns_number = 0;
2373 
2374     for(Index i = 0; i < columns.size(); i++)
2375     {
2376         if(columns(i).column_use != UnusedVariable)
2377         {
2378             used_columns_number++;
2379         }
2380     }
2381 
2382     return used_columns_number;
2383 }
2384 
2385 
2386 /// Returns the columns of the data set.
2387 
Tensor<DataSet::Column, 1> DataSet::get_columns() const
{
    // Returns a copy of the column metadata vector.
    return columns;
}
2392 
/// Returns the time series columns of the data set.

Tensor<DataSet::Column, 1> DataSet::get_time_series_columns() const
{
    // Returns a copy of the time series column metadata vector.
    return time_series_columns;
}
2397 
2398 /// Returns the input columns of the data set.
2399 
get_input_columns() const2400 Tensor<DataSet::Column, 1> DataSet::get_input_columns() const
2401 {
2402     const Index inputs_number = get_input_columns_number();
2403 
2404     Tensor<Column, 1> input_columns(inputs_number);
2405     Index input_index = 0;
2406 
2407     for(Index i = 0; i < columns.size(); i++)
2408     {
2409         if(columns(i).column_use == Input)
2410         {
2411             input_columns(input_index) = columns(i);
2412             input_index++;
2413         }
2414     }
2415 
2416     return input_columns;
2417 }
2418 
2419 
2420 /// Returns the target columns of the data set.
2421 
get_target_columns() const2422 Tensor<DataSet::Column, 1> DataSet::get_target_columns() const
2423 {
2424     const Index targets_number = get_target_columns_number();
2425 
2426     Tensor<Column, 1> target_columns(targets_number);
2427     Index target_index = 0;
2428 
2429     for(Index i = 0; i < columns.size(); i++)
2430     {
2431         if(columns(i).column_use == Target)
2432         {
2433             target_columns(target_index) = columns(i);
2434             target_index++;
2435         }
2436     }
2437 
2438     return target_columns;
2439 }
2440 
2441 
2442 /// Returns the used columns of the data set.
2443 /// @todo
2444 
get_used_columns() const2445 Tensor<DataSet::Column, 1> DataSet::get_used_columns() const
2446 {
2447     const Tensor<Index, 1> used_columns_indices = get_used_columns_indices();
2448 
2449 //    return columns.get_subvector(used_columns_indices);
2450 
2451     return Tensor<DataSet::Column, 1>();
2452 }
2453 
2454 
2455 /// Returns the number of columns in the data set.
2456 
Index DataSet::get_columns_number() const
{
    // Number of columns, NOT variables: a categorical column counts once here.
    return columns.size();
}
2461 
2462 /// Returns the number of columns in the time series.
2463 
Index DataSet::get_time_series_columns_number() const
{
    // Size of the time series column metadata vector.
    return time_series_columns.size();
}
2468 
2469 /// Returns the number of variables in the data set.
2470 
get_variables_number() const2471 Index DataSet::get_variables_number() const
2472 {
2473     Index variables_number = 0;
2474 
2475     for(Index i = 0; i < columns.size(); i++)
2476     {
2477         if(columns(i).type == Categorical)
2478         {
2479             variables_number += columns(i).categories.size();
2480         }
2481         else
2482         {
2483             variables_number++;
2484         }
2485     }
2486 
2487     return variables_number;
2488 }
2489 
2490 
2491 /// Returns the number of input variables of the data set.
2492 /// Note that the number of variables does not have to equal the number of columns in the data set,
2493 /// because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.
2494 
get_input_variables_number() const2495 Index DataSet::get_input_variables_number() const
2496 {
2497     Index inputs_number = 0;
2498 
2499     for(Index i = 0; i < columns.size(); i++)
2500     {
2501         if(columns(i).type == Categorical)
2502         {
2503             for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2504             {
2505                 if(columns(i).categories_uses(j) == Input) inputs_number++;
2506             }
2507         }
2508         else if(columns(i).column_use == Input)
2509         {
2510             inputs_number++;
2511         }
2512     }
2513 
2514     return inputs_number;
2515 }
2516 
2517 
2518 /// Returns the number of target variables of the data set.
2519 
get_target_variables_number() const2520 Index DataSet::get_target_variables_number() const
2521 {
2522     Index targets_number = 0;
2523 
2524     for(Index i = 0; i < columns.size(); i++)
2525     {
2526         if(columns(i).type == Categorical)
2527         {
2528             for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2529             {
2530                 if(columns(i).categories_uses(j) == Target) targets_number++;
2531             }
2532 
2533         }
2534         else if(columns(i).column_use == Target)
2535         {
2536             targets_number++;
2537         }
2538     }
2539 
2540     return targets_number;
2541 }
2542 
2543 
2544 /// Returns the number of variables which will neither be used as input nor as target.
2545 
get_unused_variables_number() const2546 Index DataSet::get_unused_variables_number() const
2547 {
2548     Index unused_number = 0;
2549 
2550     for(Index i = 0; i < columns.size(); i++)
2551     {
2552         if(columns(i).type == Categorical)
2553         {
2554             for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2555             {
2556                 if(columns(i).categories_uses(j) == UnusedVariable) unused_number++;
2557             }
2558 
2559         }
2560         else if(columns(i).column_use == UnusedVariable)
2561         {
2562             unused_number++;
2563         }
2564     }
2565 
2566     return unused_number;
2567 }
2568 
2569 
2570 /// Returns a variable index in the data set with given name.
2571 /// @param name Name of variable.
2572 
Index DataSet::get_variable_index(const string& name) const
{
    const Index variables_number = get_variables_number();

    // Variable names include expanded category names for categorical columns.
    const Tensor<string, 1> variables_names = get_variables_names();

    for(Index i = 0; i < variables_number; i++)
    {
        if(variables_names(i) == name) return i;
    }

    // NOTE(review): returns 0 when the name is not found, which is
    // indistinguishable from a genuine match at index 0 — consider restoring
    // the exception below instead. Changing this now would affect callers.
    return 0;

//    throw exception("Exception: Index DataSet::get_variable_index(const string& name) const");
}
2588 
2589 
2590 /// Returns the indices of the unused variables.
2591 
get_unused_variables_indices() const2592 Tensor<Index, 1> DataSet::get_unused_variables_indices() const
2593 {
2594     const Index unused_number = get_unused_variables_number();
2595 
2596     const Tensor<Index, 1> unused_columns_indices = get_unused_columns_indices();
2597 
2598     Tensor<Index, 1> unused_indices(unused_number);
2599 
2600     Index unused_index = 0;
2601     Index unused_variable_index = 0;
2602 
2603     for(Index i = 0; i < columns.size(); i++)
2604     {
2605         if(columns(i).type == Categorical)
2606         {
2607             const Index current_categories_number = columns(i).get_categories_number();
2608 
2609             for(Index j = 0; j < current_categories_number; j++)
2610             {
2611                 if(columns(i).categories_uses(j) == UnusedVariable)
2612                 {
2613                     unused_indices(unused_index) = unused_variable_index;
2614                     unused_index++;
2615                 }
2616 
2617                 unused_variable_index++;
2618             }
2619         }
2620         else if(columns(i).column_use == UnusedVariable)
2621         {
2622             unused_indices(unused_index) = i;
2623             unused_index++;
2624             unused_variable_index++;
2625         }
2626         else
2627         {
2628             unused_variable_index++;
2629         }
2630     }
2631 
2632     return unused_indices;
2633 }
2634 
2635 
2636 /// Returns the indices of the used variables.
2637 
get_used_variables_indices() const2638 Tensor<Index, 1> DataSet::get_used_variables_indices() const
2639 {
2640     const Index used_number = get_used_variables_number();
2641 
2642     Tensor<Index, 1> used_indices(used_number);
2643 
2644     Index used_index = 0;
2645     Index used_variable_index = 0;
2646 
2647     for(Index i = 0; i < columns.size(); i++)
2648     {
2649         if(columns(i).type == Categorical)
2650         {
2651             const Index current_categories_number = columns(i).get_categories_number();
2652 
2653             for(Index j = 0; j < current_categories_number; j++)
2654             {
2655                 if(columns(i).categories_uses(j) != UnusedVariable)
2656                 {
2657                     used_indices(used_index) = used_variable_index;
2658                     used_index++;
2659                 }
2660 
2661                 used_variable_index++;
2662             }
2663         }
2664         else if(columns(i).column_use != UnusedVariable)
2665         {
2666             used_indices(used_index) = used_variable_index;
2667             used_index++;
2668             used_variable_index++;
2669         }
2670         else
2671         {
2672             used_variable_index++;
2673         }
2674     }
2675 
2676     return used_indices;
2677 }
2678 
2679 
2680 
2681 /// Returns the indices of the input variables.
2682 
get_input_variables_indices() const2683 Tensor<Index, 1> DataSet::get_input_variables_indices() const
2684 {
2685     const Index inputs_number = get_input_variables_number();
2686 
2687     const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
2688 
2689     Tensor<Index, 1> input_variables_indices(inputs_number);
2690 
2691     Index input_index = 0;
2692     Index input_variable_index = 0;
2693 
2694     for(Index i = 0; i < columns.size(); i++)
2695     {
2696 
2697         if(columns(i).type == Categorical)
2698         {
2699             const Index current_categories_number = columns(i).get_categories_number();
2700 
2701             for(Index j = 0; j < current_categories_number; j++)
2702             {
2703                 if(columns(i).categories_uses(j) == Input)
2704                 {
2705                     input_variables_indices(input_index) = input_variable_index;
2706                     input_index++;
2707                 }
2708 
2709                 input_variable_index++;
2710             }
2711         }
2712         else if(columns(i).column_use == Input) // Binary, numeric
2713         {
2714             input_variables_indices(input_index) = input_variable_index;
2715             input_index++;
2716             input_variable_index++;
2717         }
2718         else
2719         {
2720             input_variable_index++;
2721         }
2722     }
2723 
2724     return input_variables_indices;
2725 }
2726 
2727 
2728 /// Returns the indices of the target variables.
2729 
get_target_variables_indices() const2730 Tensor<Index, 1> DataSet::get_target_variables_indices() const
2731 {
2732     const Index targets_number = get_target_variables_number();
2733 
2734     const Tensor<Index, 1> target_columns_indices = get_target_columns_indices();
2735 
2736     Tensor<Index, 1> target_variables_indices(targets_number);
2737 
2738     Index target_index = 0;
2739     Index target_variable_index = 0;
2740 
2741     for(Index i = 0; i < columns.size(); i++)
2742     {
2743         if(columns(i).type == Categorical)
2744         {
2745             const Index current_categories_number = columns(i).get_categories_number();
2746 
2747             for(Index j = 0; j < current_categories_number; j++)
2748             {
2749                 if(columns(i).categories_uses(j) == Target)
2750                 {
2751                     target_variables_indices(target_index) = target_variable_index;
2752                     target_index++;
2753                 }
2754 
2755                 target_variable_index++;
2756             }
2757         }
2758         else if(columns(i).column_use == Target) // Binary, numeric
2759         {
2760             target_variables_indices(target_index) = target_variable_index;
2761             target_index++;
2762             target_variable_index++;
2763         }
2764         else
2765         {
2766             target_variable_index++;
2767         }
2768     }
2769 
2770     return target_variables_indices;
2771 }
2772 
2773 
2774 /// Sets the uses of the data set columns.
2775 /// @param new_columns_uses String vector that contains the new uses to be set,
2776 /// note that this vector needs to be the size of the number of columns in the data set.
2777 
set_columns_uses(const Tensor<string,1> & new_columns_uses)2778 void DataSet::set_columns_uses(const Tensor<string, 1>& new_columns_uses)
2779 {
2780     const Index new_columns_uses_size = new_columns_uses.size();
2781 
2782     if(new_columns_uses_size != columns.size())
2783     {
2784         ostringstream buffer;
2785 
2786         buffer << "OpenNN Exception DataSet class.\n"
2787                << "void set_columns_uses(const Tensor<string, 1>&) method.\n"
2788                << "Size of columns uses ("
2789                << new_columns_uses_size << ") must be equal to columns size ("
2790                << columns.size() << "). \n";
2791 
2792         throw logic_error(buffer.str());
2793     }
2794 
2795     for(Index i = 0; i < new_columns_uses.size(); i++)
2796     {
2797         columns(i).set_use(new_columns_uses(i));
2798     }
2799 
2800     input_variables_dimensions.resize(1);
2801     input_variables_dimensions.setConstant(get_input_variables_number());
2802 }
2803 
2804 
2805 /// Sets the uses of the data set columns.
2806 /// @param new_columns_uses DataSet::VariableUse vector that contains the new uses to be set,
2807 /// note that this vector needs to be the size of the number of columns in the data set.
2808 
set_columns_uses(const Tensor<VariableUse,1> & new_columns_uses)2809 void DataSet::set_columns_uses(const Tensor<VariableUse, 1>& new_columns_uses)
2810 {
2811     const Index new_columns_uses_size = new_columns_uses.size();
2812 
2813     if(new_columns_uses_size != columns.size())
2814     {
2815         ostringstream buffer;
2816 
2817         buffer << "OpenNN Exception DataSet class.\n"
2818                << "void set_columns_uses(const Tensor<string, 1>&) method.\n"
2819                << "Size of columns uses (" << new_columns_uses_size << ") must be equal to columns size (" << columns.size() << "). \n";
2820 
2821         throw logic_error(buffer.str());
2822     }
2823 
2824     for(Index i = 0; i < new_columns_uses.size(); i++)
2825     {
2826         columns(i).set_use(new_columns_uses(i));
2827     }
2828 
2829     input_variables_dimensions.resize(1);
2830     input_variables_dimensions.setConstant(get_input_variables_number());
2831 }
2832 
2833 
2834 /// Sets all columns in the dataset as unused columns.
2835 
set_columns_unused()2836 void DataSet::set_columns_unused()
2837 {
2838     const Index columns_number = get_columns_number();
2839 
2840     for(Index i = 0; i < columns_number; i++)
2841     {
2842         set_column_use(i, UnusedVariable);
2843     }
2844 }
2845 
2846 
2847 /// Sets all input columns in the dataset as unused columns.
2848 
set_input_columns_unused()2849 void DataSet::set_input_columns_unused()
2850 {
2851     const Index columns_number = get_columns_number();
2852 
2853     for(Index i = 0; i < columns_number; i++)
2854     {
2855         if(columns(i).column_use == DataSet::Input) set_column_use(i, UnusedVariable);
2856     }
2857 }
2858 
2859 
2860 /// Sets the use of a single column.
2861 /// @param index Index of column.
2862 /// @param new_use Use for that column.
2863 
set_column_use(const Index & index,const VariableUse & new_use)2864 void DataSet::set_column_use(const Index& index, const VariableUse& new_use)
2865 {
2866     columns(index).column_use = new_use;
2867 
2868     if(columns(index).type == Categorical)
2869     {
2870         columns(index).set_categories_uses(new_use);
2871     }
2872 }
2873 
2874 
2875 /// Sets the use of a single column.
2876 /// @param name Name of column.
2877 /// @param new_use Use for that column.
2878 
set_column_use(const string & name,const VariableUse & new_use)2879 void DataSet::set_column_use(const string& name, const VariableUse& new_use)
2880 {
2881     const Index index = get_column_index(name);
2882 
2883     set_column_use(index, new_use);
2884 }
2885 
2886 
2887 /// This method set the name of a single variable.
2888 /// @param index Index of variable.
2889 /// @param new_name Name of variable.
2890 
set_variable_name(const Index & variable_index,const string & new_variable_name)2891 void DataSet::set_variable_name(const Index& variable_index, const string& new_variable_name)
2892 {
2893 #ifdef __OPENNN_DEBUG__
2894 
2895     const Index variables_number = get_variables_number();
2896 
2897     if(variable_index >= variables_number)
2898     {
2899         ostringstream buffer;
2900 
2901         buffer << "OpenNN Exception: Variables class.\n"
2902                << "void set_name(const Index&, const string&) method.\n"
2903                << "Index of variable must be less than number of variables.\n";
2904 
2905         throw logic_error(buffer.str());
2906     }
2907 
2908 #endif
2909 
2910     const Index columns_number = get_columns_number();
2911 
2912     Index index = 0;
2913 
2914     for(Index i = 0; i < columns_number; i++)
2915     {
2916         if(columns(i).type == Categorical)
2917         {
2918             for(Index j = 0; j < columns(i).get_categories_number(); j++)
2919             {
2920                 if(index == variable_index)
2921                 {
2922                     columns(i).categories(j) = new_variable_name;
2923                     return;
2924                 }
2925                 else
2926                 {
2927                     index++;
2928                 }
2929             }
2930         }
2931         else
2932         {
2933             if(index == variable_index)
2934             {
2935                 columns(i).name = new_variable_name;
2936                 return;
2937             }
2938             else
2939             {
2940                 index++;
2941             }
2942         }
2943     }
2944 }
2945 
2946 
2947 /// Sets new names for the variables in the data set from a vector of strings.
2948 /// The size of that vector must be equal to the total number of variables.
2949 /// @param new_names Name of variables.
2950 
set_variables_names(const Tensor<string,1> & new_variables_names)2951 void DataSet::set_variables_names(const Tensor<string, 1>& new_variables_names)
2952 {
2953 #ifdef __OPENNN_DEBUG__
2954 
2955     const Index variables_number = get_variables_number();
2956 
2957     const Index size = new_variables_names.size();
2958 
2959     if(size != variables_number)
2960     {
2961         ostringstream buffer;
2962 
2963         buffer << "OpenNN Exception: Variables class.\n"
2964                << "void set_names(const Tensor<string, 1>&) method.\n"
2965                << "Size (" << size << ") must be equal to number of variables (" << variables_number << ").\n";
2966 
2967         throw logic_error(buffer.str());
2968     }
2969 
2970 #endif
2971 
2972     const Index columns_number = get_columns_number();
2973 
2974     Index index = 0;
2975 
2976     for(Index i = 0; i < columns_number; i++)
2977     {
2978         if(columns(i).type == Categorical)
2979         {
2980             for(Index j = 0; j < columns(i).get_categories_number(); j++)
2981             {
2982                 columns(i).categories(j) = new_variables_names(index);
2983                 index++;
2984             }
2985         }
2986         else
2987         {
2988             columns(i).name = new_variables_names(index);
2989             index++;
2990         }
2991     }
2992 }
2993 
2994 
2995 /// Sets new names for the columns in the data set from a vector of strings.
2996 /// The size of that vector must be equal to the total number of variables.
2997 /// @param new_names Name of variables.
2998 
set_columns_names(const Tensor<string,1> & new_names)2999 void DataSet::set_columns_names(const Tensor<string, 1>& new_names)
3000 {
3001     const Index new_names_size = new_names.size();
3002     const Index columns_number = get_columns_number();
3003 
3004     if(new_names_size != columns_number)
3005     {
3006         ostringstream buffer;
3007 
3008         buffer << "OpenNN Exception: DataSet class.\n"
3009                << "void set_columns_names(const Tensor<string, 1>&).\n"
3010                << "Size of names (" << new_names.size() << ") is not equal to columns number (" << columns_number << ").\n";
3011 
3012         throw logic_error(buffer.str());
3013     }
3014 
3015     for(Index i = 0; i < columns_number; i++)
3016     {
3017         columns(i).name = new_names(i);
3018     }
3019 }
3020 
3021 
3022 /// Sets all the variables in the data set as input variables.
3023 
set_input()3024 void DataSet::set_input()
3025 {
3026     for(Index i = 0; i < columns.size(); i++)
3027     {
3028         if(columns(i).type == Constant) continue;
3029 
3030         columns(i).set_use(Input);
3031     }
3032 }
3033 
3034 
3035 /// Sets all the variables in the data set as target variables.
3036 
set_target()3037 void DataSet::set_target()
3038 {
3039     for(Index i = 0; i < columns.size(); i++)
3040     {
3041         columns(i).set_use(Target);
3042     }
3043 }
3044 
3045 
3046 /// Sets all the variables in the data set as unused variables.
3047 
set_variables_unused()3048 void DataSet::set_variables_unused()
3049 {
3050     for(Index i = 0; i < columns.size(); i++)
3051     {
3052         columns(i).set_use(UnusedVariable);
3053     }
3054 }
3055 
3056 
3057 /// Sets a new number of variables in the variables object.
3058 /// All variables are set as inputs but the last one, which is set as targets.
3059 /// @param new_variables_number Number of variables.
3060 
/// Sets a new number of columns in the data set.
/// @param new_variables_number New number of columns.

void DataSet::set_columns_number(const Index& new_variables_number)
{
    columns.resize(new_variables_number);

    // Reassign the default input/target uses for the new column count.
    set_default_columns_uses();
}
3067 
3068 
binarize_input_data(const type & threshold)3069 void DataSet::binarize_input_data(const type& threshold)
3070 {
3071     const Index samples_number = get_samples_number();
3072 
3073     const Index input_variables_number = get_input_variables_number();
3074 
3075     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3076 
3077     for(Index i = 0; i < samples_number; i++)
3078     {
3079         for(Index j = 0; j < input_variables_number; i++)
3080         {
3081             const Index input_variable_index = input_variables_indices[j];
3082 
3083             data(i,input_variable_index) < threshold
3084                     ? data(i,input_variable_index) = 0
3085                     : data(i,input_variable_index) = 1;
3086         }
3087     }
3088 }
3089 
3090 
/// Expands a binary column into a two-column one-hot representation.
/// Row i gets (0,1) when the value is exactly 1, (1,0) when it is exactly 0,
/// and (NAN,NAN) for any other value.
/// @param column Column values, expected to contain only 0s and 1s.
/// @return Matrix with the same number of rows and two columns.

Tensor<type,2> DataSet::transform_binary_column(const Tensor<type,1>& column) const
{
    const Index rows_number = column.dimension(0);

    Tensor<type, 2> new_column(rows_number, 2);
    new_column.setZero();

    for(Index i = 0; i < rows_number; i++)
    {
        // Fix: qualify std::abs so the floating-point overload is guaranteed;
        // an unqualified abs can resolve to the integer ::abs and truncate.
        // NOTE(review): numeric_limits<type>::min() is the smallest positive
        // normal, not a rounding tolerance — effectively an exact comparison;
        // kept as-is to preserve behavior.
        if(std::abs(column(i) - static_cast<type>(1)) < std::numeric_limits<type>::min())
        {
            new_column(i,1) = static_cast<type>(1);
        }
        else if(std::abs(column(i) - static_cast<type>(0)) < std::numeric_limits<type>::min())
        {
            new_column(i,0) = static_cast<type>(1);
        }
        else
        {
            new_column(i,0) = NAN;
            new_column(i,1) = NAN;
        }
    }

    return new_column;
}
3117 
3118 
set_binary_simple_columns()3119 void DataSet::set_binary_simple_columns()
3120 {
3121     bool is_binary = true;
3122 
3123     Index variable_index = 0;
3124 
3125     Index different_values = 0;
3126 
3127     for(Index column_index = 0; column_index < columns.size(); column_index++)
3128     {
3129         if(columns(column_index).type == Numeric)
3130         {
3131             Tensor<type, 1> values(3);
3132             values.setRandom();
3133             different_values = 0;
3134             is_binary = true;
3135 
3136             for(Index row_index = 0; row_index < data.dimension(0); row_index++)
3137             {
3138                 if(!::isnan(data(row_index, variable_index))
3139                 && data(row_index, variable_index) != values(0)
3140                 && data(row_index, variable_index) != values(1))
3141                 {
3142                     values(different_values) = data(row_index, variable_index);
3143 
3144                     different_values++;
3145                 }
3146 
3147                 if(row_index == (data.dimension(0)-1)){
3148                     if(different_values==1){
3149                         is_binary = false;
3150                         break;
3151                     }
3152                 }
3153 
3154                 if(different_values > 2)
3155                 {
3156                     is_binary = false;
3157                     break;
3158                 }
3159             }
3160 
3161             if(is_binary)
3162             {
3163                 columns(column_index).type = Binary;
3164                 scale_minimum_maximum_binary(values(0), values(1), column_index);
3165                 columns(column_index).categories.resize(2);
3166 
3167                 if(values(0) == 0 && values(1) == 1)
3168                 {
3169                     columns(column_index).categories(0) = "Negative (0)";
3170                     columns(column_index).categories(1) = "Positive (1)";
3171                 }
3172                 else if(values(0) == 1 && values(1) == 0)
3173                 {
3174                     columns(column_index).categories(0) = "Positive (1)";
3175                     columns(column_index).categories(1) = "Negative (0)";
3176                 }
3177                 else
3178                 {
3179                     columns(column_index).categories(0) = "Class_1";// + std::to_string(values(0));
3180                     columns(column_index).categories(1) = "Class_2";// + std::to_string(values(1));
3181                 }
3182 
3183                 const VariableUse column_use = columns(column_index).column_use;
3184                 columns(column_index).categories_uses.resize(2);
3185                 columns(column_index).categories_uses(0) = column_use;
3186                 columns(column_index).categories_uses(1) = column_use;
3187             }
3188 
3189             variable_index++;
3190         }
3191         else if(columns(column_index).type == Categorical)
3192         {
3193             variable_index += columns(column_index).get_categories_number();
3194         }
3195         else
3196         {
3197             variable_index++;
3198         }
3199     }
3200 }
3201 
3202 
3203 /// Sets new input dimensions in the data set.
3204 
/// Sets new input variables dimensions in the data set.
/// @param new_inputs_dimensions Dimensions tensor, stored as given.

void DataSet::set_input_variables_dimensions(const Tensor<Index, 1>& new_inputs_dimensions)
{
    input_variables_dimensions = new_inputs_dimensions;
}
3209 
3210 
3211 /// Returns true if the data set is a binary classification problem, false otherwise.
3212 /// @todo
3213 
is_binary_classification() const3214 bool DataSet::is_binary_classification() const
3215 {
3216     if(get_target_variables_number() != 1)
3217     {
3218         return false;
3219     }
3220 
3221     return true;
3222 }
3223 
3224 
3225 /// Returns true if the data set is a multiple classification problem, false otherwise.
3226 /// @todo
3227 
bool DataSet::is_multiple_classification() const
{
    // @todo Stub: always returns true; target variables are not yet inspected.
    return true;
}
3232 
3233 
3234 /// Returns true if the data matrix is empty, and false otherwise.
3235 
is_empty() const3236 bool DataSet::is_empty() const
3237 {
3238     if(data.dimension(0) == 0 || data.dimension(1) == 0)
3239     {
3240         return true;
3241     }
3242 
3243     return false;
3244 }
3245 
3246 
3247 /// Returns true if any value is less or equal than a given value, and false otherwise.
3248 
is_less_than(const Tensor<type,1> & column,const type & value) const3249 bool DataSet::is_less_than(const Tensor<type, 1>& column, const type& value) const
3250 {
3251     Tensor<bool, 1> if_sentence = column <= column.constant(value);
3252 
3253     Tensor<bool, 1> sentence(column.size());
3254     sentence.setConstant(true);
3255 
3256     Tensor<bool, 1> else_sentence(column.size());
3257     else_sentence.setConstant(false);
3258 
3259     Tensor<bool, 0> is_less = (if_sentence.select(sentence, else_sentence)).any();
3260 
3261     return is_less(0);
3262 }
3263 
3264 
3265 /// Returns a reference to the data matrix in the data set.
3266 /// The number of rows is equal to the number of samples.
3267 /// The number of columns is equal to the number of variables.
3268 
/// Returns a reference to the data matrix in the data set.
/// The number of rows is equal to the number of samples.
/// The number of columns is equal to the number of variables.

const Tensor<type, 2>& DataSet::get_data() const
{
    return data;
}
3273 
3274 
/// Returns a raw, non-owning pointer to the data matrix.
/// The pointer is valid only while this DataSet object is alive.

Tensor<type, 2>* DataSet::get_data_pointer()
{
    return &data;
}
3279 
3280 
3281 /// Returns a reference to the time series data matrix in the data set.
3282 /// Only for time series problems.
3283 
/// Returns a reference to the time series data matrix in the data set.
/// Only meaningful for time series problems.

const Tensor<type, 2>& DataSet::get_time_series_data() const
{
    return time_series_data;
}
3288 
3289 
3290 /// Returns a string with the method used.
3291 
/// Returns the missing values method currently configured.

DataSet::MissingValuesMethod DataSet::get_missing_values_method() const
{
    return missing_values_method;
}
3296 
3297 
3298 /// Returns the name of the data file.
3299 
/// Returns the name of the data file.

const string& DataSet::get_data_file_name() const
{
    return data_file_name;
}
3304 
3305 
3306 /// Returns true if the first line of the data file has a header with the names of the variables, and false otherwise.
3307 
/// Returns true if the first line of the data file has a header with the
/// names of the variables, and false otherwise.

const bool& DataSet::get_header_line() const
{
    return has_columns_names;
}
3312 
3313 
3314 /// Returns true if the data file has rows label, and false otherwise.
3315 
/// Returns true if the data file has row labels, and false otherwise.

const bool& DataSet::get_rows_label() const
{
    return has_rows_labels;
}
3320 
3321 
/// Returns the labels of all the rows in the data set.

Tensor<string, 1> DataSet::get_rows_label_tensor() const
{
    return rows_labels;
}
3326 
get_testing_rows_label_tensor()3327 Tensor<string, 1> DataSet::get_testing_rows_label_tensor()
3328 {
3329     const Index testing_samples_number = get_testing_samples_number();
3330     const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3331     Tensor<string, 1> testing_rows_label(testing_samples_number);
3332 
3333     for(Index i = 0; i < testing_samples_number; i++)
3334     {
3335         testing_rows_label(i) = rows_labels(testing_indices(i));
3336     }
3337 
3338     return testing_rows_label;
3339 }
3340 
3341 
get_selection_rows_label_tensor()3342 Tensor<string, 1> DataSet::get_selection_rows_label_tensor()
3343 {
3344     const Index selection_samples_number = get_selection_samples_number();
3345     const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3346     Tensor<string, 1> selection_rows_label(selection_samples_number);
3347 
3348     for(Index i = 0; i < selection_samples_number; i++)
3349     {
3350         selection_rows_label(i) = rows_labels(selection_indices(i));
3351     }
3352 
3353     return selection_rows_label;
3354 }
3355 
3356 
3357 /// Returns the separator to be used in the data file.
3358 
/// Returns the separator to be used in the data file.

const DataSet::Separator& DataSet::get_separator() const
{
    return separator;
}
3363 
3364 
3365 /// Returns the string which will be used as separator in the data file.
3366 
get_separator_char() const3367 char DataSet::get_separator_char() const
3368 {
3369     switch(separator)
3370     {
3371     case Space:
3372         return ' ';
3373 
3374     case Tab:
3375         return '\t';
3376 
3377     case Comma:
3378         return ',';
3379 
3380     case Semicolon:
3381         return ';';
3382     }
3383 
3384     return char();
3385 }
3386 
3387 
3388 /// Returns the string which will be used as separator in the data file.
3389 
get_separator_string() const3390 string DataSet::get_separator_string() const
3391 {
3392     switch(separator)
3393     {
3394     case Space:
3395         return "Space";
3396 
3397     case Tab:
3398         return "Tab";
3399 
3400     case Comma:
3401         return "Comma";
3402 
3403     case Semicolon:
3404         return "Semicolon";
3405     }
3406 
3407     return string();
3408 }
3409 
3410 
3411 /// Returns the string which will be used as label for the missing values in the data file.
3412 
/// Returns the string used as label for the missing values in the data file.

const string& DataSet::get_missing_values_label() const
{
    return missing_values_label;
}
3417 
3418 
3419 /// Returns the number of lags to be used in a time series prediction application.
3420 
/// Returns the number of lags to be used in a time series prediction application.

const Index& DataSet::get_lags_number() const
{
    return lags_number;
}
3425 
3426 
3427 /// Returns the number of steps ahead to be used in a time series prediction application.
3428 
/// Returns the number of steps ahead to be used in a time series prediction application.

const Index& DataSet::get_steps_ahead() const
{
    return steps_ahead;
}
3433 
3434 
3435 /// Returns the indices of the time variables in the data set.
3436 
/// Returns the index of the time variable in the data set.

const Index& DataSet::get_time_index() const
{
    return time_index;
}
3441 
3442 
3443 /// Returns a value of the scaling-unscaling method enumeration from a string containing the name of that method.
3444 /// @param scaling_unscaling_method String with the name of the scaling and unscaling method.
3445 
get_scaling_unscaling_method(const string & scaling_unscaling_method)3446 DataSet::ScalingUnscalingMethod DataSet::get_scaling_unscaling_method(const string& scaling_unscaling_method)
3447 {
3448     if(scaling_unscaling_method == "NoScaling")
3449     {
3450         return NoScaling;
3451     }
3452     else if(scaling_unscaling_method == "NoUnscaling")
3453     {
3454         return NoUnscaling;
3455     }
3456     else if(scaling_unscaling_method == "MinimumMaximum")
3457     {
3458         return MinimumMaximum;
3459     }
3460     else if(scaling_unscaling_method == "Logarithmic")
3461     {
3462         return Logarithmic;
3463     }
3464     else if(scaling_unscaling_method == "MeanStandardDeviation")
3465     {
3466         return MeanStandardDeviation;
3467     }
3468     else if(scaling_unscaling_method == "StandardDeviation")
3469     {
3470         return StandardDeviation;
3471     }
3472     else
3473     {
3474         ostringstream buffer;
3475 
3476         buffer << "OpenNN Exception: DataSet class.\n"
3477                << "static ScalingUnscalingMethod get_scaling_unscaling_method(const string).\n"
3478                << "Unknown scaling-unscaling method: " << scaling_unscaling_method << ".\n";
3479 
3480         throw logic_error(buffer.str());
3481     }
3482 }
3483 
3484 
3485 /// Returns a matrix with the training samples in the data set.
3486 /// The number of rows is the number of training
3487 /// The number of columns is the number of variables.
3488 
get_training_data() const3489 Tensor<type, 2> DataSet::get_training_data() const
3490 {
3491 
3492 //       const Index variables_number = get_variables_number();
3493 
3494 //       Tensor<Index, 1> variables_indices(0, 1, variables_number-1);
3495 
3496        Tensor<Index, 1> variables_indices = get_used_variables_indices();
3497 
3498        const Tensor<Index, 1> training_indices = get_training_samples_indices();
3499 
3500        return get_subtensor_data(training_indices, variables_indices);
3501 
3502 //    return Tensor<type,2>();
3503 }
3504 
3505 
3506 /// Returns a matrix with the selection samples in the data set.
3507 /// The number of rows is the number of selection
3508 /// The number of columns is the number of variables.
3509 
get_selection_data() const3510 Tensor<type, 2> DataSet::get_selection_data() const
3511 {
3512     const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3513 
3514     const Index variables_number = get_variables_number();
3515 
3516     Tensor<Index, 1> variables_indices;
3517     initialize_sequential_eigen_tensor(variables_indices, 0, 1, variables_number-1);
3518 
3519     return get_subtensor_data(selection_indices, variables_indices);
3520 }
3521 
3522 
3523 /// Returns a matrix with the testing samples in the data set.
3524 /// The number of rows is the number of testing
3525 /// The number of columns is the number of variables.
3526 
get_testing_data() const3527 Tensor<type, 2> DataSet::get_testing_data() const
3528 {
3529     const Index variables_number = get_variables_number();
3530 
3531     Tensor<Index, 1> variables_indices;
3532     initialize_sequential_eigen_tensor(variables_indices, 0, 1, variables_number-1);
3533 
3534     const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3535 
3536     return get_subtensor_data(testing_indices, variables_indices);
3537 }
3538 
3539 
3540 /// Returns a matrix with the input variables in the data set.
3541 /// The number of rows is the number of
3542 /// The number of columns is the number of input variables.
3543 
get_input_data() const3544 Tensor<type, 2> DataSet::get_input_data() const
3545 {
3546     const Index samples_number = get_samples_number();
3547 
3548     Tensor<Index, 1> indices;
3549     initialize_sequential_eigen_tensor(indices, 0, 1, samples_number-1);
3550 
3551     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3552 
3553     return get_subtensor_data(indices, input_variables_indices);
3554 }
3555 
3556 
3557 /// Returns a matrix with the target variables in the data set.
3558 /// The number of rows is the number of
3559 /// The number of columns is the number of target variables.
3560 
get_target_data() const3561 Tensor<type, 2> DataSet::get_target_data() const
3562 {
3563     const Tensor<Index, 1> indices = get_used_samples_indices();
3564 
3565     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3566 
3567     return get_subtensor_data(indices, target_variables_indices);
3568 }
3569 
3570 
3571 /// Returns a tensor with the input variables in the data set.
3572 /// The number of rows is the number of
3573 /// The number of columns is the number of input variables.
3574 
/// Returns a tensor with the input variables of the given samples.
/// @param samples_indices Indices of the samples to extract.

Tensor<type, 2> DataSet::get_input_data(const Tensor<Index, 1>& samples_indices) const
{
    return get_subtensor_data(samples_indices, get_input_variables_indices());
}
3581 
3582 
3583 /// Returns a tensor with the target variables in the data set.
3584 /// The number of rows is the number of
3585 /// The number of columns is the number of input variables.
3586 
/// Returns a tensor with the target variables of the given samples.
/// @param samples_indices Indices of the samples to extract.

Tensor<type, 2> DataSet::get_target_data(const Tensor<Index, 1>& samples_indices) const
{
    return get_subtensor_data(samples_indices, get_target_variables_indices());
}
3593 
3594 
3595 /// Returns a matrix with training samples and input variables.
3596 /// The number of rows is the number of training
3597 /// The number of columns is the number of input variables.
3598 
get_training_input_data() const3599 Tensor<type, 2> DataSet::get_training_input_data() const
3600 {
3601     const Tensor<Index, 1> training_indices = get_training_samples_indices();
3602 
3603     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3604 
3605     return get_subtensor_data(training_indices, input_variables_indices);
3606 }
3607 
3608 
3609 /// Returns a tensor with training samples and target variables.
3610 /// The number of rows is the number of training
3611 /// The number of columns is the number of target variables.
3612 
get_training_target_data() const3613 Tensor<type, 2> DataSet::get_training_target_data() const
3614 {
3615     const Tensor<Index, 1> training_indices = get_training_samples_indices();
3616 
3617     const Tensor<Index, 1>& target_variables_indices = get_target_variables_indices();
3618 
3619     return get_subtensor_data(training_indices, target_variables_indices);
3620 }
3621 
3622 
3623 /// Returns a tensor with selection samples and input variables.
3624 /// The number of rows is the number of selection
3625 /// The number of columns is the number of input variables.
3626 
get_selection_input_data() const3627 Tensor<type, 2> DataSet::get_selection_input_data() const
3628 {
3629     const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3630 
3631     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3632 
3633     return get_subtensor_data(selection_indices, input_variables_indices);
3634 }
3635 
3636 
3637 /// Returns a tensor with selection samples and target variables.
3638 /// The number of rows is the number of selection
3639 /// The number of columns is the number of target variables.
3640 
get_selection_target_data() const3641 Tensor<type, 2> DataSet::get_selection_target_data() const
3642 {
3643     const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3644 
3645     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3646 
3647     return get_subtensor_data(selection_indices, target_variables_indices);
3648 }
3649 
3650 
3651 /// Returns a tensor with testing samples and input variables.
3652 /// The number of rows is the number of testing
3653 /// The number of columns is the number of input variables.
3654 
get_testing_input_data() const3655 Tensor<type, 2> DataSet::get_testing_input_data() const
3656 {
3657     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3658 
3659     const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3660 
3661     return get_subtensor_data(testing_indices, input_variables_indices);
3662 }
3663 
3664 
3665 /// Returns a tensor with testing samples and target variables.
3666 /// The number of rows is the number of testing
3667 /// The number of columns is the number of target variables.
3668 
get_testing_target_data() const3669 Tensor<type, 2> DataSet::get_testing_target_data() const
3670 {
3671     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3672 
3673     const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3674 
3675     return get_subtensor_data(testing_indices, target_variables_indices);
3676 }
3677 
3678 
3679 /// Returns the inputs and target values of a single sample in the data set.
3680 /// @param index Index of the sample.
3681 
/// Returns the inputs and target values of a single sample in the data set.
/// @param index Index of the sample.
/// @throws logic_error In debug builds, if the index is out of range.

Tensor<type, 1> DataSet::get_sample_data(const Index& index) const
{

#ifdef __OPENNN_DEBUG__

    const Index samples_number = get_samples_number();

    if(index >= samples_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 1> get_sample(const Index&) const method.\n"
               << "Index of sample (" << index << ") must be less than number of samples (" << samples_number << ").\n";

        throw logic_error(buffer.str());
    }

#endif

    // chip(index, 0) extracts row `index` of the data matrix as a 1-D tensor.

    return data.chip(index,0);
}
3706 
3707 
3708 /// Returns the inputs and target values of a single sample in the data set.
3709 /// @param sample_index Index of the sample.
3710 /// @param variables_indices Indices of the variables.
3711 
/// Returns selected variable values of a single sample in the data set.
/// @param sample_index Index of the sample.
/// @param variables_indices Indices of the variables to extract.
/// @throws logic_error In debug builds, if the sample index is out of range.

Tensor<type, 1> DataSet::get_sample_data(const Index& sample_index, const Tensor<Index, 1>& variables_indices) const
{
#ifdef __OPENNN_DEBUG__

    const Index samples_number = get_samples_number();

    if(sample_index >= samples_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 1> get_sample(const Index&, const Tensor<Index, 1>&) const method.\n"
               << "Index of sample must be less than number of \n";

        throw logic_error(buffer.str());
    }

#endif

    const Index variables_number = variables_indices.size();

    Tensor<type, 1> sample_row(variables_number);

    for(Index j = 0; j < variables_number; j++)
    {
        sample_row(j) = data(sample_index, variables_indices(j));
    }

    return sample_row;
}
3747 
3748 
3749 /// Returns the inputs values of a single sample in the data set.
3750 /// @param sample_index Index of the sample.
3751 
get_sample_input_data(const Index & sample_index) const3752 Tensor<type, 2> DataSet::get_sample_input_data(const Index & sample_index) const
3753 {
3754     const Index input_variables_number = get_input_variables_number();
3755 
3756     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3757 
3758     Tensor<type, 2> inputs(1, input_variables_number);
3759 
3760     for(Index i = 0; i < input_variables_number; i++)
3761         inputs(0, i) = data(sample_index, input_variables_indices(i));
3762 
3763     return inputs;
3764 }
3765 
3766 
3767 /// Returns the target values of a single sample in the data set.
3768 /// @param sample_index Index of the sample.
3769 
get_sample_target_data(const Index & sample_index) const3770 Tensor<type, 2> DataSet::get_sample_target_data(const Index & sample_index) const
3771 {
3772     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3773 
3774     return get_subtensor_data(Tensor<Index, 1>(sample_index), target_variables_indices);
3775 }
3776 
3777 
3778 /// Returns the index of the column with the given name.
3779 /// @param column_name Name of the column to be found.
3780 
get_column_index(const string & column_name) const3781 Index DataSet::get_column_index(const string& column_name) const
3782 {
3783     const Index columns_number = get_columns_number();
3784 
3785     for(Index i = 0; i < columns_number; i++)
3786     {
3787         if(columns(i).name == column_name) return i;
3788     }
3789 
3790     ostringstream buffer;
3791 
3792     buffer << "OpenNN Exception: DataSet class.\n"
3793            << "Index get_column_index(const string&&) const method.\n"
3794            << "Cannot find " << column_name << "\n";
3795 
3796     throw logic_error(buffer.str());
3797 }
3798 
3799 
3800 /// Returns the index of the column to which a variable index belongs.
3801 /// @param variable_index Index of the variable to be found.
3802 
/// Returns the index of the column to which a variable index belongs.
/// Categorical columns span one variable per category.
/// @param variable_index Index of the variable to be found.
/// @throws logic_error If the variable index exceeds the total variables.

Index DataSet::get_column_index(const Index& variable_index) const
{
    const Index columns_number = get_columns_number();

    Index cumulated_variables = 0;

    for(Index i = 0; i < columns_number; i++)
    {
        cumulated_variables += (columns(i).type == Categorical)
                ? columns(i).get_categories_number()
                : 1;

        if(variable_index < cumulated_variables) return i;
    }

    ostringstream buffer;

    buffer << "OpenNN Exception: DataSet class.\n"
           << "Index get_column_index(const type&) const method.\n"
           << "Cannot find variable index: " << variable_index << ".\n";

    throw logic_error(buffer.str());
}
3831 
3832 
3833 /// Returns the indices of a variable in the data set.
3834 /// Note that the number of variables does not have to equal the number of columns in the data set,
3835 /// because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.
3836 
get_variable_indices(const Index & column_index) const3837 Tensor<Index, 1> DataSet::get_variable_indices(const Index& column_index) const
3838 {
3839     Index index = 0;
3840 
3841     for(Index i = 0; i < column_index; i++)
3842     {
3843         if(columns(i).type == Categorical)
3844         {
3845             index += columns(i).categories.size();
3846         }
3847         else
3848         {
3849             index++;
3850         }
3851     }
3852 
3853     if(columns(column_index).type == Categorical)
3854     {
3855         Tensor<Index, 1> variable_indices(columns(column_index).categories.size());
3856 
3857         for (Index j = 0; j<columns(column_index).categories.size(); j++)
3858         {
3859             variable_indices(j) = index+j;
3860         }
3861 
3862         return variable_indices;
3863     }
3864     else
3865     {
3866         Tensor<Index, 1> indices(1);
3867         indices.setConstant(index);
3868 
3869         return indices;
3870     }
3871 }
3872 
3873 
3874 /// Returns the data from the data set of the given variables indices.
3875 /// @param variables_indices Variable indices.
3876 /// @todo
3877 
/// Returns the data of the given variables for all the rows of the data set.
/// Previously an unimplemented @todo stub that returned an empty tensor.
/// @param variables_indices Indices of the variables to extract.

Tensor<type, 2> DataSet::get_column_data(const Tensor<Index, 1>& variables_indices) const
{
    const Index rows_number = data.dimension(0);

    Tensor<Index, 1> rows_indices(rows_number);

    for(Index i = 0; i < rows_number; i++) rows_indices(i) = i;

    return get_subtensor_data(rows_indices, variables_indices);
}
3884 
3885 
3886 /// Returns the data from the data set column with a given index,
3887 /// these data can be stored in a matrix or a vector depending on whether the column is categorical or not(respectively).
3888 /// @param column_index Index of the column.
3889 
get_column_data(const Index & column_index) const3890 Tensor<type, 2> DataSet::get_column_data(const Index& column_index) const
3891 {
3892     Index columns_number = 1;
3893     const Index rows_number = data.dimension(0);
3894 
3895     if(columns(column_index).type == Categorical)
3896     {
3897         columns_number = columns(column_index).get_categories_number();
3898     }
3899 
3900     Eigen::array<Index, 2> extents = {rows_number, columns_number};
3901     Eigen::array<Index, 2> offsets = {0, get_variable_indices(column_index)(0)};
3902 
3903     return data.slice(offsets, extents);
3904 }
3905 
3906 /// Returns the data from the time series column with a given index,
3907 /// @param column_index Index of the column.
3908 
get_time_series_column_data(const Index & column_index) const3909 Tensor<type, 2> DataSet::get_time_series_column_data(const Index& column_index) const
3910 {
3911     Index columns_number = 1;
3912     const Index rows_number = data.dimension(0);
3913 
3914     if(time_series_columns(column_index).type == Categorical)
3915     {
3916         columns_number = time_series_columns(column_index).get_categories_number();
3917     }
3918 
3919     Eigen::array<Index, 2> extents = {rows_number, columns_number};
3920     Eigen::array<Index, 2> offsets = {0, get_variable_indices(column_index)(0)};
3921 
3922     return time_series_data.slice(offsets, extents);
3923 }
3924 
3925 
3926 /// Returns the data from the data set column with a given index,
3927 /// these data can be stored in a matrix or a vector depending on whether the column is categorical or not(respectively).
3928 /// @param column_index Index of the column.
3929 /// @param rows_indices Rows of the indices.
3930 
/// Returns the data of the column with the given index, restricted to the
/// given rows.
/// @param column_index Index of the column.
/// @param rows_indices Indices of the rows to extract.

Tensor<type, 2> DataSet::get_column_data(const Index& column_index, Tensor<Index, 1>& rows_indices) const
{
    return get_subtensor_data(rows_indices, get_variable_indices(column_index));
}
3935 
3936 
3937 /// Returns the data from the data set column with a given name,
3938 /// these data can be stored in a matrix or a vector depending on whether the column is categorical or not(respectively).
3939 /// @param column_name Name of the column.
3940 
get_column_data(const string & column_name) const3941 Tensor<type, 2> DataSet::get_column_data(const string& column_name) const
3942 {
3943     const Index column_index = get_column_index(column_name);
3944 
3945     return get_column_data(column_index);
3946 }
3947 
3948 
3949 /// Returns all the samples of a single variable in the data set.
3950 /// @param index Index of the variable.
3951 
/// Returns all the samples of a single variable in the data set.
/// @param index Index of the variable.
/// @throws logic_error In debug builds, if the index is out of range.

Tensor<type, 1> DataSet::get_variable_data(const Index& index) const
{

#ifdef __OPENNN_DEBUG__

    const Index variables_number = get_variables_number();

    if(index >= variables_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 1> get_variable(const Index&) const method.\n"
               << "Index of variable must be less than number of \n";

        throw logic_error(buffer.str());
    }

#endif

    // chip(index, 1) extracts column `index` of the data matrix as a 1-D tensor.
    return data.chip(index, 1);
}
3974 
3975 
3976 /// Returns all the samples of a single variable in the data set.
3977 /// @param variable_name Name of the variable.
3978 
get_variable_data(const string & variable_name) const3979 Tensor<type, 1> DataSet::get_variable_data(const string& variable_name) const
3980 {
3981 
3982     const Tensor<string, 1> variable_names = get_variables_names();
3983 
3984     Index size = 0;
3985 
3986     for(Index i = 0; i < variable_names.size(); i++)
3987     {
3988         if(variable_names(i) ==  variable_name) size++;
3989     }
3990 
3991     Tensor<Index, 1> variable_index(size);
3992 
3993     Index index = 0;
3994 
3995     for(Index i = 0; i < variable_names.size(); i++)
3996     {
3997         if(variable_names(i) ==  variable_name)
3998         {
3999             variable_index(index) = i;
4000 
4001             index++;
4002         }
4003     }
4004 
4005 #ifdef __OPENNN_DEBUG__
4006 
4007     const Index variables_size = variable_index.size();
4008 
4009     if(variables_size == 0)
4010     {
4011         ostringstream buffer;
4012 
4013         buffer << "OpenNN Exception: DataSet class.\n"
4014                << "Tensor<type, 1> get_variable(const string&) const method.\n"
4015                << "Variable: " << variable_name << " does not exist.\n";
4016 
4017         throw logic_error(buffer.str());
4018     }
4019 
4020     if(variables_size > 1)
4021     {
4022         ostringstream buffer;
4023 
4024         buffer << "OpenNN Exception: DataSet class.\n"
4025                << "Tensor<type, 1> get_variable(const string&) const method.\n"
4026                << "Variable: " << variable_name << " appears more than once in the data set.\n";
4027 
4028         throw logic_error(buffer.str());
4029     }
4030 
4031 #endif
4032 
4033     return data.chip(variable_index(0), 1);
4034 }
4035 
4036 
4037 /// Returns a given set of samples of a single variable in the data set.
4038 /// @param variable_index Index of the variable.
4039 /// @param samples_indices Indices of the
4040 
get_variable_data(const Index & variable_index,const Tensor<Index,1> & samples_indices) const4041 Tensor<type, 1> DataSet::get_variable_data(const Index& variable_index, const Tensor<Index, 1>& samples_indices) const
4042 {
4043 
4044 #ifdef __OPENNN_DEBUG__
4045 
4046     const Index variables_number = get_variables_number();
4047 
4048     if(variable_index >= variables_number)
4049     {
4050         ostringstream buffer;
4051 
4052         buffer << "OpenNN Exception: DataSet class.\n"
4053                << "Tensor<type, 1> get_variable(const Index&, const Tensor<Index, 1>&) const method.\n"
4054                << "Index of variable must be less than number of \n";
4055 
4056         throw logic_error(buffer.str());
4057     }
4058 
4059 #endif
4060 
4061     const Index samples_indices_size = samples_indices.size();
4062 
4063     Tensor<type, 1 > column(samples_indices_size);
4064 
4065     for(Index i = 0; i < samples_indices_size; i++)
4066     {
4067         Index sample_index = samples_indices(i);
4068 
4069         column(i) = data(sample_index, variable_index);
4070     }
4071 
4072     return column;
4073 }
4074 
4075 
4076 /// Returns a given set of samples of a single variable in the data set.
4077 /// @param variable_name Name of the variable.
4078 /// @param samples_indices Indices of the
4079 
get_variable_data(const string & variable_name,const Tensor<Index,1> & samples_indices) const4080 Tensor<type, 1> DataSet::get_variable_data(const string& variable_name, const Tensor<Index, 1>& samples_indices) const
4081 {
4082 
4083     const Tensor<string, 1> variable_names = get_variables_names();
4084 
4085     Index size = 0;
4086 
4087     for(Index i = 0; i < variable_names.size(); i++)
4088     {
4089         if(variable_names(i) ==  variable_name) size++;
4090     }
4091 
4092     Tensor<Index, 1> variable_index(size);
4093 
4094     Index index = 0;
4095 
4096     for(Index i = 0; i < variable_names.size(); i++)
4097     {
4098         if(variable_names(i) ==  variable_name)
4099         {
4100             variable_index(index) = i;
4101 
4102             index++;
4103         }
4104     }
4105 
4106 #ifdef __OPENNN_DEBUG__
4107 
4108     const Index variables_size = variable_index.size();
4109 
4110     if(variables_size == 0)
4111     {
4112         ostringstream buffer;
4113 
4114         buffer << "OpenNN Exception: DataSet class.\n"
4115                << "Tensor<type, 1> get_variable(const string&) const method.\n"
4116                << "Variable: " << variable_name << " does not exist.\n";
4117 
4118         throw logic_error(buffer.str());
4119     }
4120 
4121     if(variables_size > 1)
4122     {
4123         ostringstream buffer;
4124 
4125         buffer << "OpenNN Exception: DataSet class.\n"
4126                << "Tensor<type, 1> get_variable(const string&, const Tensor<Index, 1>&) const method.\n"
4127                << "Variable: " << variable_name << " appears more than once in the data set.\n";
4128 
4129         throw logic_error(buffer.str());
4130     }
4131 
4132 #endif
4133 
4134     const Index samples_indices_size = samples_indices.size();
4135 
4136     Tensor<type, 1 > column(samples_indices_size);
4137 
4138     for(Index i = 0; i < samples_indices_size; i++)
4139     {
4140         Index sample_index = samples_indices(i);
4141 
4142         column(i) = data(sample_index, variable_index(0));
4143     }
4144 
4145     return column;
4146 }
4147 
4148 
/// Returns the preview of the data file (the first few rows read when the file was parsed).

Tensor<Tensor<string, 1>, 1> DataSet::get_data_file_preview() const
{
    return data_file_preview;
}
4153 
4154 
/// Returns a submatrix of the data, gathering the given rows and the given variables.
/// @param rows_indices Row positions to extract.
/// @param variables_indices Variable (column) positions to extract.

Tensor<type, 2> DataSet::get_subtensor_data(const Tensor<Index, 1> & rows_indices, const Tensor<Index, 1> & variables_indices) const
{
    const Index rows_number = rows_indices.size();
    const Index variables_number = variables_indices.size();

    const Tensor<type, 2>& data = get_data();

    Tensor<type, 2> subtensor(rows_number, variables_number);

    // Element-wise gather: subtensor(i, j) = data(rows_indices(i), variables_indices(j)).
    for(Index i = 0; i < rows_number; i++)
    {
        const Index row_index = rows_indices(i);

        for(Index j = 0; j < variables_number; j++)
        {
            subtensor(i, j) = data(row_index, variables_indices(j));
        }
    }

    return subtensor;
}
4181 
4182 
4183 /// Sets zero samples and zero variables in the data set.
4184 
set()4185 void DataSet::set()
4186 {
4187     data_file_name = "";
4188 
4189     data.resize(0,0);
4190 }
4191 
4192 
4193 /// Sets all variables from a data matrix.
4194 /// @param new_data Data matrix.
4195 
set(const Tensor<type,2> & new_data)4196 void DataSet::set(const Tensor<type, 2>& new_data)
4197 {
4198     data_file_name = "";
4199 
4200     const Index variables_number = new_data.dimension(1);
4201     const Index samples_number = new_data.dimension(0);
4202 
4203     set(samples_number, variables_number);
4204 
4205     data = new_data;
4206 
4207     set_default_columns_uses();
4208 }
4209 
4210 
4211 /// Sets new numbers of samples and variables in the inputs targets data set.
4212 /// All the samples are set for training.
4213 /// All the variables are set as inputs.
4214 /// @param new_samples_number Number of
4215 /// @param new_variables_number Number of variables.
4216 
set(const Index & new_samples_number,const Index & new_variables_number)4217 void DataSet::set(const Index& new_samples_number, const Index& new_variables_number)
4218 {
4219 #ifdef __OPENNN_DEBUG__
4220 
4221     if(new_samples_number == 0)
4222     {
4223         ostringstream buffer;
4224 
4225         buffer << "OpenNN Exception: DataSet class.\n"
4226                << "void set(const Index&, const Index&) method.\n"
4227                << "Number of samples must be greater than zero.\n";
4228 
4229         throw logic_error(buffer.str());
4230     }
4231 
4232     if(new_variables_number == 0)
4233     {
4234         ostringstream buffer;
4235 
4236         buffer << "OpenNN Exception: DataSet class.\n"
4237                << "void set(const Index&, const Index&) method.\n"
4238                << "Number of variables must be greater than zero.\n";
4239 
4240         throw logic_error(buffer.str());
4241     }
4242 
4243 #endif
4244 
4245     data.resize(new_samples_number, new_variables_number);
4246 
4247     columns.resize(new_variables_number);
4248 
4249     for(Index index = 0; index < new_variables_number-1; index++)
4250     {
4251         columns(index).name = "column_" + to_string(index+1);
4252         columns(index).column_use = Input;
4253         columns(index).type = Numeric;
4254     }
4255 
4256     columns(new_variables_number-1).name = "column_" + to_string(new_variables_number);
4257     columns(new_variables_number-1).column_use = Target;
4258     columns(new_variables_number-1).type = Numeric;
4259 
4260     samples_uses.resize(new_samples_number);
4261     split_samples_random();
4262 }
4263 
4264 
4265 /// Sets new numbers of samples and inputs and target variables in the data set.
4266 /// The variables in the data set are the number of inputs plus the number of targets.
4267 /// @param new_samples_number Number of
4268 /// @param new_inputs_number Number of input variables.
4269 /// @param new_targets_number Number of target variables.
4270 
set(const Index & new_samples_number,const Index & new_inputs_number,const Index & new_targets_number)4271 void DataSet::set(const Index& new_samples_number,
4272                   const Index& new_inputs_number,
4273                   const Index& new_targets_number)
4274 {
4275 
4276     data_file_name = "";
4277 
4278     const Index new_variables_number = new_inputs_number + new_targets_number;
4279 
4280     data.resize(new_samples_number, new_variables_number);
4281 
4282     columns.resize(new_variables_number);
4283 
4284     for(Index i = 0; i < new_variables_number; i++)
4285     {
4286         if(i < new_inputs_number)
4287         {
4288             columns(i).name = "column_" + to_string(i+1);
4289             columns(i).column_use = Input;
4290             columns(i).type = Numeric;
4291         }
4292         else
4293         {
4294             columns(i).name = "column_" + to_string(i+1);
4295             columns(i).column_use = Target;
4296             columns(i).type = Numeric;
4297         }
4298     }
4299 
4300     input_variables_dimensions.resize(new_inputs_number);
4301 
4302     samples_uses.resize(new_samples_number);
4303     split_samples_random();
4304 }
4305 
4306 
/// Sets the members of this data set object with those from another data set object.
/// @param other_data_set Data set object to be copied.

void DataSet::set(const DataSet& other_data_set)
{
    // NOTE(review): only a subset of the members is copied here — e.g. samples_uses,
    // missing_values_method, lags_number and steps_ahead are left untouched.
    // Confirm this partial copy is intentional.

    data_file_name = other_data_set.data_file_name;

    has_columns_names = other_data_set.has_columns_names;

    separator = other_data_set.separator;

    missing_values_label = other_data_set.missing_values_label;

    data = other_data_set.data;

    columns = other_data_set.columns;

    display = other_data_set.display;
}
4326 
4327 
/// Sets the data set members from a XML document.
/// @param data_set_document TinyXML document containing the member data.

void DataSet::set(const tinyxml2::XMLDocument& data_set_document)
{
    // Reset members to defaults first so anything absent from the XML keeps a sane value.
    set_default();

    from_XML(data_set_document);
}
4337 
4338 
/// Sets the data set members by loading them from a XML file.
/// @param file_name Data set XML file_name.

void DataSet::set(const string& file_name)
{
    load(file_name);
}
4346 
/// Sets a new display value.
/// If it is set to true messages from this class are to be displayed on the screen;
/// if it is set to false messages from this class are not to be displayed on the screen.
/// @param new_display Display value.

void DataSet::set_display(const bool& new_display)
{
    display = new_display;
}
4356 
4357 
4358 /// Sets the default member values:
4359 /// <ul>
4360 /// <li> Display: True.
4361 /// </ul>
4362 
set_default()4363 void DataSet::set_default()
4364 {
4365     delete non_blocking_thread_pool;
4366     delete thread_pool_device;
4367 
4368     const int n = omp_get_max_threads();
4369     non_blocking_thread_pool = new NonBlockingThreadPool(n);
4370     thread_pool_device = new ThreadPoolDevice(non_blocking_thread_pool, n);
4371 
4372     has_columns_names = false;
4373 
4374     separator = Comma;
4375 
4376     missing_values_label = "NA";
4377 
4378     lags_number = 0;
4379 
4380     steps_ahead = 0;
4381 
4382     set_default_columns_uses();
4383 
4384     set_default_columns_names();
4385 
4386     input_variables_dimensions.resize(1);
4387 
4388     input_variables_dimensions.setConstant(get_input_variables_number());
4389 
4390 }
4391 
4392 
4393 /// Sets a new data matrix.
4394 /// The number of rows must be equal to the number of
4395 /// The number of columns must be equal to the number of variables.
4396 /// Indices of all training, selection and testing samples and inputs and target variables do not change.
4397 /// @param new_data Data matrix.
4398 
set_data(const Tensor<type,2> & new_data)4399 void DataSet::set_data(const Tensor<type, 2>& new_data)
4400 {
4401 
4402     const Index samples_number = new_data.dimension(0);
4403     const Index variables_number = new_data.dimension(1);
4404 
4405     set(samples_number, variables_number);
4406 
4407     data = new_data;
4408 }
4409 
/// Sets a new time series data matrix, replacing the previous one.
/// @param new_data Time series data matrix.

void DataSet::set_time_series_data(const Tensor<type, 2>& new_data)
{
    time_series_data = new_data;
}
4414 
/// Sets the name of the data file.
/// Note that this only stores the name; it does not load the file
/// (loading happens elsewhere, e.g. in read_csv/load).
/// @param new_data_file_name Name of the file containing the data.

void DataSet::set_data_file_name(const string& new_data_file_name)
{
    data_file_name = new_data_file_name;
}
4424 
4425 
/// Sets if the data file contains a header with the names of the columns.
/// @param new_has_columns_names True if the first row of the data file is a header.

void DataSet::set_has_columns_names(const bool& new_has_columns_names)
{
    has_columns_names = new_has_columns_names;
}
4432 
4433 
/// Sets if the data file contains rows label.
/// @param new_has_rows_label True if the first column of the data file holds row labels.

void DataSet::set_has_rows_label(const bool& new_has_rows_label)
{
    has_rows_labels = new_has_rows_label;
}
4440 
4441 
/// Sets a new separator.
/// @param new_separator Separator value.

void DataSet::set_separator(const Separator& new_separator)
{
    separator = new_separator;
}
4449 
4450 
4451 /// Sets a new separator from a char.
4452 /// @param new_separator Char with the separator value.
4453 
set_separator(const char & new_separator)4454 void DataSet::set_separator(const char& new_separator)
4455 {
4456     if(new_separator == ' ')
4457     {
4458         separator = Space;
4459     }
4460     else if(new_separator == '\t')
4461     {
4462         separator = Tab;
4463     }
4464     else if(new_separator == ',')
4465     {
4466         separator = Comma;
4467     }
4468     else if(new_separator == ';')
4469     {
4470         separator = Semicolon;
4471     }
4472     else
4473     {
4474         ostringstream buffer;
4475 
4476         buffer << "OpenNN Exception: DataSet class.\n"
4477                << "void set_separator(const char&) method.\n"
4478                << "Unknown separator: " << new_separator << ".\n";
4479 
4480         throw logic_error(buffer.str());
4481     }
4482 }
4483 
4484 
4485 /// Sets a new separator from a string.
4486 /// @param new_separator Char with the separator value.
4487 
set_separator(const string & new_separator_string)4488 void DataSet::set_separator(const string& new_separator_string)
4489 {
4490     if(new_separator_string == "Space")
4491     {
4492         separator = Space;
4493     }
4494     else if(new_separator_string == "Tab")
4495     {
4496         separator = Tab;
4497     }
4498     else if(new_separator_string == "Comma")
4499     {
4500         separator = Comma;
4501     }
4502     else if(new_separator_string == "Semicolon")
4503     {
4504         separator = Semicolon;
4505     }
4506     else
4507     {
4508         ostringstream buffer;
4509 
4510         buffer << "OpenNN Exception: DataSet class.\n"
4511                << "void set_separator(const string&) method.\n"
4512                << "Unknown separator: " << new_separator_string << ".\n";
4513 
4514         throw logic_error(buffer.str());
4515     }
4516 }
4517 
4518 
4519 
4520 /// Sets a new label for the missing values.
4521 /// @param new_missing_values_label Label for the missing values.
4522 
set_missing_values_label(const string & new_missing_values_label)4523 void DataSet::set_missing_values_label(const string& new_missing_values_label)
4524 {
4525 #ifdef __OPENNN_DEBUG__
4526 
4527     if(get_trimmed(new_missing_values_label).empty())
4528     {
4529        ostringstream buffer;
4530 
4531        buffer << "OpenNN Exception: DataSet class.\n"
4532               << "void set_missing_values_label(const string&) method.\n"
4533               << "Missing values label cannot be empty.\n";
4534 
4535        throw logic_error(buffer.str());
4536     }
4537 
4538 #endif
4539 
4540     missing_values_label = new_missing_values_label;
4541 }
4542 
4543 
/// Sets a new method for the missing values.
/// @param new_missing_values_method Method for the missing values.

void DataSet::set_missing_values_method(const DataSet::MissingValuesMethod& new_missing_values_method)
{
    missing_values_method = new_missing_values_method;
}
4551 
4552 
set_missing_values_method(const string & new_missing_values_method)4553 void DataSet::set_missing_values_method(const string & new_missing_values_method)
4554 {
4555     if(new_missing_values_method == "Unuse")
4556     {
4557         missing_values_method = Unuse;
4558     }
4559     else if(new_missing_values_method == "Mean")
4560     {
4561         missing_values_method = Mean;
4562     }
4563     else if(new_missing_values_method == "Median")
4564     {
4565         missing_values_method = Median;
4566     }
4567     else
4568     {
4569         ostringstream buffer;
4570 
4571         buffer << "OpenNN Exception: DataSet class.\n"
4572                << "void set_missing_values_method(const string & method.\n"
4573                << "Not known method type.\n";
4574 
4575         throw logic_error(buffer.str());
4576     }
4577 }
4578 
4579 
/// Sets a new number of lags to be defined for a time series prediction application.
/// When loading the data file, the time series data will be modified according to this number.
/// @param new_lags_number Number of lags(x-1, ..., x-l) to be used.

void DataSet::set_lags_number(const Index& new_lags_number)
{
    lags_number = new_lags_number;
}
4588 
4589 
/// Sets a new number of steps ahead to be defined for a time series prediction application.
/// When loading the data file, the time series data will be modified according to this number.
/// @param new_steps_ahead_number Number of steps ahead to be used.

void DataSet::set_steps_ahead_number(const Index& new_steps_ahead_number)
{
    steps_ahead = new_steps_ahead_number;
}
4598 
4599 
/// Sets the new position where the time data is located in the data set.
/// @param new_time_index Position where the time data is located.

void DataSet::set_time_index(const Index& new_time_index)
{
    time_index = new_time_index;
}
4607 
4608 
set_threads_number(const int & new_threads_number)4609 void DataSet::set_threads_number(const int& new_threads_number)
4610 {
4611     if(non_blocking_thread_pool != nullptr) delete non_blocking_thread_pool;
4612     if(thread_pool_device != nullptr) delete thread_pool_device;
4613 
4614     non_blocking_thread_pool = new NonBlockingThreadPool(new_threads_number);
4615     thread_pool_device = new ThreadPoolDevice(non_blocking_thread_pool, new_threads_number);
4616 }
4617 
4618 
/// Sets a new number of samples in the data set.
/// All samples are also set for training.
/// The indices of the inputs and target variables do not change.
/// @param new_samples_number Number of samples.

void DataSet::set_samples_number(const Index& new_samples_number)
{
    // Keep the current variables count; only the samples dimension changes.
    const Index variables_number = get_variables_number();

    set(new_samples_number,variables_number);
}
4630 
4631 
4632 /// Removes the input of target indices of that variables with zero standard deviation.
4633 /// It might change the size of the vectors containing the inputs and targets indices.
4634 
unuse_constant_columns()4635 Tensor<string, 1> DataSet::unuse_constant_columns()
4636 {
4637     const Index columns_number = get_columns_number();
4638 
4639 #ifdef __OPENNN_DEBUG__
4640 
4641     if(columns_number == 0)
4642     {
4643         ostringstream buffer;
4644 
4645         buffer << "OpenNN Exception: DataSet class.\n"
4646                << "Tensor<string, 1> unuse_constant_columns() method.\n"
4647                << "Number of columns is zero.\n";
4648 
4649         throw logic_error(buffer.str());
4650     }
4651 
4652 #endif
4653 
4654     Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
4655 
4656     Tensor<string, 1> constant_columns(0);
4657 
4658     Index variable_index = 0;
4659 
4660     for(Index i = 0; i < columns_number; i++)
4661     {
4662 
4663 
4664         if(columns(i).column_use == Input)
4665         {
4666 
4667             if(columns(i).type == Categorical)
4668             {
4669 
4670                 const Index categories_number = columns(i).categories.size();
4671 
4672                 bool is_constant = true;
4673 
4674                 for(Index j = 0; j < categories_number; j++)
4675                 {
4676 
4677                     const type column_standard_deviation = standard_deviation(data.chip(variable_index+j,1), used_samples_indices);
4678                     if((column_standard_deviation - 0) > numeric_limits<type>::min())
4679                     {
4680                         is_constant = false;
4681                         break;
4682                     }
4683 
4684                 }
4685 
4686                 if(is_constant) columns(i).set_use(UnusedVariable);
4687 
4688                 constant_columns = push_back(constant_columns, columns(i).name);
4689 
4690             }
4691             else
4692             {
4693 
4694                 const type column_standard_deviation = standard_deviation(data.chip(variable_index,1), used_samples_indices);
4695 
4696                 if((column_standard_deviation - 0) < numeric_limits<type>::min())
4697 
4698                 {
4699                     columns(i).set_use(UnusedVariable);
4700 
4701                     constant_columns = push_back(constant_columns, columns(i).name);
4702 
4703                 }
4704             }
4705         }
4706 
4707         columns(i).type == Categorical ? variable_index += columns(i).categories.size() : variable_index++;
4708 
4709     }
4710     return constant_columns;
4711 }
4712 
4713 
4714 /// Removes the training, selection and testing indices of that samples which are repeated in the data matrix.
4715 /// It might change the size of the vectors containing the training, selection and testing indices.
4716 
unuse_repeated_samples()4717 Tensor<Index, 1> DataSet::unuse_repeated_samples()
4718 {
4719     const Index samples_number = get_samples_number();
4720 
4721 #ifdef __OPENNN_DEBUG__
4722 
4723     if(samples_number == 0)
4724     {
4725         ostringstream buffer;
4726 
4727         buffer << "OpenNN Exception: DataSet class.\n"
4728                << "Tensor<Index, 1> unuse_repeated_samples() method.\n"
4729                << "Number of samples is zero.\n";
4730 
4731         throw logic_error(buffer.str());
4732     }
4733 
4734 #endif
4735 
4736     Tensor<Index, 1> repeated_samples;
4737 
4738     Tensor<type, 1> sample_i;
4739     Tensor<type, 1> sample_j;
4740 
4741     #pragma omp parallel for private(sample_i, sample_j) schedule(dynamic)
4742 
4743     for(Index i = 0; i < static_cast<Index>(samples_number); i++)
4744     {
4745         sample_i = get_sample_data(i);
4746 
4747         for(Index j = static_cast<Index>(i+1); j < samples_number; j++)
4748         {
4749             sample_j = get_sample_data(j);
4750 
4751             if(get_sample_use(j) != UnusedSample
4752                     && std::equal(sample_i.data(), sample_i.data()+sample_i.size(), sample_j.data()))
4753             {
4754                 set_sample_use(j, UnusedSample);
4755 
4756                 repeated_samples = push_back(repeated_samples, j);
4757             }
4758         }
4759     }
4760 
4761     return repeated_samples;
4762 }
4763 
4764 
4765 /// Return unused variables without correlation.
4766 /// @param minimum_correlation Minimum correlation between variables.
4767 
unuse_uncorrelated_columns(const type & minimum_correlation)4768 Tensor<string, 1> DataSet::unuse_uncorrelated_columns(const type& minimum_correlation)
4769 {
4770     Tensor<string, 1> unused_columns;
4771 
4772     const Tensor<CorrelationResults, 2> correlations = calculate_input_target_columns_correlations();
4773 
4774     const Index input_columns_number = get_input_columns_number();
4775     const Index target_columns_number = get_target_columns_number();
4776 
4777     const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
4778 
4779     for(Index i = 0; i < input_columns_number; i++)
4780     {
4781         const Index index = input_columns_indices(i);
4782 
4783         for(Index j = 0; j < target_columns_number; j++)
4784         {
4785             if(columns(index).column_use != UnusedVariable && abs(correlations(i,j).correlation) < minimum_correlation)
4786             {
4787                 columns(index).set_use(UnusedVariable);
4788 
4789                 unused_columns = push_back(unused_columns, columns(index).name);
4790             }
4791         }
4792     }
4793 
4794     return unused_columns;
4795 }
4796 
4797 
/// Returns the distribution of each of the columns. In the case of numeric columns, it returns a
/// histogram, for the case of categorical columns, it returns the frequencies of each category and for the
/// binary columns it returns the frequencies of the positives and negatives.
/// The default number of bins is 10.
/// @param bins_number Number of bins.

Tensor<Histogram, 1> DataSet::calculate_columns_distribution(const Index& bins_number) const
{
    const Index columns_number = columns.size();
    const Index used_columns_number = get_used_columns_number();
    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Index used_samples_number = used_samples_indices.size();

    Tensor<Histogram, 1> histograms(used_columns_number);

    // variable_index walks the data matrix columns (categorical columns expand to one
    // dummy variable per category); used_column_index walks the histograms output.
    Index variable_index = 0;
    Index used_column_index = 0;

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).type == Numeric)
        {
            if(columns(i).column_use == UnusedVariable)
            {
                // Skip the variable but keep the data-matrix position in sync.
                variable_index++;
            }
            else
            {
                // Collect the used samples of this variable and bin them.
                Tensor<type, 1> column(used_samples_number);

                for(Index j = 0; j < used_samples_number; j++)
                {
                    column(j) = data(used_samples_indices(j), variable_index);
                }

                histograms(used_column_index) = histogram(column, bins_number);

                variable_index++;
                used_column_index++;
            }
        }
        else if(columns(i).type == Categorical)
        {
            const Index categories_number = columns(i).get_categories_number();

            if(columns(i).column_use == UnusedVariable)
            {
                // Skip all dummy variables of the unused categorical column.
                variable_index += categories_number;
            }
            else
            {
                // One frequency per category: count the 1s in each dummy variable.
                Tensor<Index, 1> categories_frequencies(categories_number);
                categories_frequencies.setZero();
                Tensor<type, 1> centers(categories_number);

                for(Index j = 0; j < categories_number; j++)
                {
                    for(Index k = 0; k < used_samples_number; k++)
                    {
                        if(abs(data(used_samples_indices(k), variable_index) - 1) < numeric_limits<type>::min())
                        {
                            categories_frequencies(j)++;
                        }
                    }

                    // Category j is placed at center j on the histogram axis.
                    centers(j) = static_cast<type>(j);

                    variable_index++;
                }

                histograms(used_column_index).frequencies = categories_frequencies;
                histograms(used_column_index).centers = centers;

                used_column_index++;
            }
        }
        else if(columns(i).type == Binary)
        {
            if(columns(i).column_use == UnusedVariable)
            {
                variable_index++;
            }
            else
            {
                // Slot 0 counts positives (value == 1), slot 1 everything else.
                Tensor<Index, 1> binary_frequencies(2);
                binary_frequencies.setZero();

                for(Index j = 0; j < used_samples_number; j++)
                {
                    if(fabsf(data(used_samples_indices(j), variable_index) - 1) < numeric_limits<type>::min())
                    {
                        binary_frequencies(0)++;
                    }
                    else
                    {
                        binary_frequencies(1)++;
                    }
                }

                // NOTE(review): unlike the categorical branch, centers is never set here —
                // confirm whether callers rely on centers for binary histograms.
                histograms(used_column_index).frequencies = binary_frequencies;
                variable_index++;
                used_column_index++;
            }
        }
        else // Time @todo
        {
            // NOTE(review): a *used* Time column advances variable_index but not
            // used_column_index, leaving an empty histogram slot and shifting all later
            // entries — confirm whether Time columns are ever counted as used.
            variable_index++;
        }
    }

    return histograms;
}
4910 
4911 
4912 /// Returns a vector of subvectors with the values of a box and whiskers plot.
4913 /// The size of the vector is equal to the number of used variables.
4914 /// The size of the subvectors is 5 and they consist of:
4915 /// <ul>
4916 /// <li> Minimum
4917 /// <li> First quartile
4918 /// <li> Second quartile
4919 /// <li> Third quartile
4920 /// <li> Maximum
4921 /// </ul>
4922 
calculate_columns_box_plots() const4923 Tensor<BoxPlot, 1> DataSet::calculate_columns_box_plots() const
4924 {
4925     Index used_columns_number = get_used_columns_number();
4926 
4927     Index columns_number = get_columns_number();
4928 
4929     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
4930 
4931     Tensor<BoxPlot, 1> box_plots(used_columns_number);
4932 
4933     Index used_column_index = 0;
4934     Index variable_index = 0;
4935 
4936     for(Index i = 0; i < columns_number; i++)
4937     {
4938         if(columns(i).type == Numeric || columns(i).type == Binary)
4939         {
4940             if(columns(i).column_use != UnusedVariable)
4941             {
4942                 cout << "Column: " << columns(i).name << endl;
4943 
4944                 box_plots(used_column_index) = box_plot(data.chip(variable_index, 1), used_samples_indices);
4945 
4946                 cout << "min: " << box_plots(used_column_index).minimum << endl;
4947                 cout << "max: " << box_plots(used_column_index).maximum << endl;
4948 
4949 
4950                 used_column_index++;
4951             }
4952 
4953             variable_index++;
4954         }
4955         else if(columns(i).type == Categorical)
4956         {
4957             variable_index += columns(i).get_categories_number();
4958         }
4959         else
4960         {
4961             variable_index++;
4962         }
4963     }
4964 
4965     return box_plots;
4966 }
4967 
4968 
4969 /// Counts the number of used negatives of the selected target.
4970 /// @param target_index Index of the target to evaluate.
4971 
Index DataSet::calculate_used_negatives(const Index& target_index) const
{
    /// Counts the used samples whose target value is 0 (negatives).
    /// Throws if a target value is neither ~0 nor ~1, since this method
    /// only makes sense for binary classification targets.
    /// Fix: abs() instead of fabsf(), which silently truncated to float
    /// when the library is built with type == double.

    Index negatives = 0;

    const Tensor<Index, 1> used_indices = get_used_samples_indices();

    const Index used_samples_number = used_indices.size();

    for(Index i = 0; i < used_samples_number; i++)
    {
        const Index training_index = used_indices(i);

        if(abs(data(training_index, target_index)) < numeric_limits<type>::min())
        {
            negatives++;
        }
        else if(abs(data(training_index, target_index) - static_cast<type>(1)) > static_cast<type>(1.0e-3))
        {
            ostringstream buffer;

            buffer << "OpenNN Exception: DataSet class.\n"
                   << "Index calculate_used_negatives(const Index&) const method.\n"
                   << "Training sample is neither a positive nor a negative: " << data(training_index, target_index) << endl;

            throw logic_error(buffer.str());
        }
    }

    return negatives;
}
5002 
5003 
5004 /// Counts the number of negatives of the selected target in the training data.
5005 /// @param target_index Index of the target to evaluate.
5006 
Index DataSet::calculate_training_negatives(const Index& target_index) const
{
    /// Counts the training samples whose target value is 0 (negatives).
    /// Throws if a target value is neither ~0 nor ~1 (binary targets expected).
    /// Fix: abs() instead of fabsf(), which silently truncated to float
    /// when the library is built with type == double.

    Index negatives = 0;

    const Tensor<Index, 1> training_indices = get_training_samples_indices();

    const Index training_samples_number = training_indices.size();

    for(Index i = 0; i < training_samples_number; i++)
    {
        const Index training_index = training_indices(i);

        if(abs(data(training_index, target_index)) < numeric_limits<type>::min())
        {
            negatives++;
        }
        else if(abs(data(training_index, target_index) - static_cast<type>(1)) > static_cast<type>(1.0e-3))
        {
            ostringstream buffer;

            buffer << "OpenNN Exception: DataSet class.\n"
                   << "Index calculate_training_negatives(const Index&) const method.\n"
                   << "Training sample is neither a positive nor a negative: " << data(training_index, target_index) << endl;

            throw logic_error(buffer.str());
        }
    }

    return negatives;
}
5037 
5038 
5039 /// Counts the number of negatives of the selected target in the selection data.
5040 /// @param target_index Index of the target to evaluate.
5041 
Index DataSet::calculate_selection_negatives(const Index& target_index) const
{
    /// Counts the selection samples whose target value is 0 (negatives).
    /// Fixes: the exception message previously named calculate_testing_negatives;
    /// fabsf() replaced by abs() (precision loss when type == double); the
    /// "is it a positive?" tolerance is now 1.0e-3, consistent with the
    /// used/training counterparts (numeric_limits::min() was effectively
    /// an exact-equality test).

    Index negatives = 0;

    const Index selection_samples_number = get_selection_samples_number();

    const Tensor<Index, 1> selection_indices = get_selection_samples_indices();

    for(Index i = 0; i < selection_samples_number; i++)
    {
        const Index selection_index = selection_indices(i);

        if(abs(data(selection_index, target_index)) < numeric_limits<type>::min())
        {
            negatives++;
        }
        else if(abs(data(selection_index, target_index) - static_cast<type>(1)) > static_cast<type>(1.0e-3))
        {
            ostringstream buffer;

            buffer << "OpenNN Exception: DataSet class.\n"
                   << "Index calculate_selection_negatives(const Index&) const method.\n"
                   << "Selection sample is neither a positive nor a negative: " << data(selection_index, target_index) << endl;

            throw logic_error(buffer.str());
        }
    }

    return negatives;
}
5072 
5073 
5074 /// Counts the number of negatives of the selected target in the testing data.
5075 /// @param target_index Index of the target to evaluate.
5076 
Index DataSet::calculate_testing_negatives(const Index& target_index) const
{
    /// Counts the testing samples whose target value is 0 (negatives).
    /// Fix: the zero test now uses abs(), consistent with the sibling
    /// counters; the bare comparison also counted any negative value.
    /// Note: unlike the siblings, this method deliberately does not throw
    /// on values that are neither 0 nor 1 (behavior kept lenient).

    Index negatives = 0;

    const Index testing_samples_number = get_testing_samples_number();

    const Tensor<Index, 1> testing_indices = get_testing_samples_indices();

    for(Index i = 0; i < testing_samples_number; i++)
    {
        const Index testing_index = testing_indices(i);

        if(abs(data(testing_index, target_index)) < numeric_limits<type>::min())
        {
            negatives++;
        }
    }

    return negatives;
}
5097 
5098 
5099 /// Returns a vector of vectors containing some basic descriptives of all the variables in the data set.
5100 /// The size of this vector is four. The subvectors are:
5101 /// <ul>
5102 /// <li> Minimum.
5103 /// <li> Maximum.
5104 /// <li> Mean.
5105 /// <li> Standard deviation.
5106 /// </ul>
5107 
Tensor<Descriptives, 1> DataSet::calculate_variables_descriptives() const
{
    // Delegates to the statistics helper over the full data matrix
    // (all samples and all variables, regardless of their use).
    return descriptives(data);
}
5112 
5113 
5114 /// Returns a vector of vectors containing some basic descriptives of the used variables and samples
5115 /// The size of this vector is four. The subvectors are:
5116 /// <ul>
5117 /// <li> Minimum.
5118 /// <li> Maximum.
5119 /// <li> Mean.
5120 /// <li> Standard deviation.
5121 /// </ul>
5122 
calculate_used_variables_descriptives() const5123 Tensor<Descriptives, 1> DataSet::calculate_used_variables_descriptives() const
5124 {
5125     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5126     const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();
5127 
5128     return descriptives(data, used_samples_indices, used_variables_indices);
5129 }
5130 
5131 
5132 /// Calculate the descriptives of the samples with positive targets in binary classification problems.
5133 /// @todo Low priority.
5134 
calculate_columns_descriptives_positive_samples() const5135 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_positive_samples() const
5136 {
5137 
5138 #ifdef __OPENNN_DEBUG__
5139 
5140     const Index targets_number = get_target_variables_number();
5141 
5142     if(targets_number != 1)
5143     {
5144         ostringstream buffer;
5145 
5146         buffer << "OpenNN Exception: DataSet class.\n"
5147                << "Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
5148                << "Number of targets muste be 1.\n";
5149 
5150         throw logic_error(buffer.str());
5151     }
5152 #endif
5153 
5154     const Index target_index = get_target_variables_indices()(0);
5155 
5156 
5157     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5158     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5159 
5160     const Index samples_number = used_samples_indices.size();
5161 
5162     // Count used positive samples
5163 
5164     Index positive_samples_number = 0;
5165 
5166     for (Index i = 0; i < samples_number; i++)
5167     {
5168         Index sample_index = used_samples_indices(i);
5169 
5170         if(abs(data(sample_index, target_index) - 1) < numeric_limits<type>::min()) positive_samples_number++;
5171     }
5172 
5173         // Get used positive samples indices
5174 
5175     Tensor<Index, 1> positive_used_samples_indices(positive_samples_number);
5176     Index positive_sample_index = 0;
5177 
5178     for(Index i = 0; i < samples_number; i++)
5179     {
5180         Index sample_index = used_samples_indices(i);
5181 
5182         if(abs(data(sample_index, target_index) - 1) < numeric_limits<type>::min())
5183         {
5184             positive_used_samples_indices(positive_sample_index) = sample_index;
5185             positive_sample_index++;
5186         }
5187     }
5188     return descriptives(data, positive_used_samples_indices, input_variables_indices);
5189 }
5190 
5191 
5192 /// Calculate the descriptives of the samples with negative targets in binary classification problems.
5193 /// @todo Low priority.
5194 
calculate_columns_descriptives_negative_samples() const5195 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_negative_samples() const
5196 {
5197 
5198 #ifdef __OPENNN_DEBUG__
5199 
5200     const Index targets_number = get_target_variables_number();
5201 
5202     if(targets_number != 1)
5203     {
5204         ostringstream buffer;
5205 
5206         buffer << "OpenNN Exception: DataSet class.\n"
5207                << "Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
5208                << "Number of targets muste be 1.\n";
5209 
5210         throw logic_error(buffer.str());
5211     }
5212 #endif
5213 
5214     const Index target_index = get_target_variables_indices()(0);
5215 
5216     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5217     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5218 
5219     const Index samples_number = used_samples_indices.size();
5220 
5221     // Count used negative samples
5222 
5223     Index negative_samples_number = 0;
5224 
5225     for (Index i = 0; i < samples_number; i++)
5226     {
5227         Index sample_index = used_samples_indices(i);
5228 
5229         if(data(sample_index, target_index) < numeric_limits<type>::min()) negative_samples_number++;
5230     }
5231 
5232     // Get used negative samples indices
5233 
5234     Tensor<Index, 1> negative_used_samples_indices(negative_samples_number);
5235     Index negative_sample_index = 0;
5236 
5237     for(Index i = 0; i < samples_number; i++)
5238     {
5239         Index sample_index = used_samples_indices(i);
5240 
5241         if(data(sample_index, target_index) < numeric_limits<type>::min())
5242         {
5243             negative_used_samples_indices(negative_sample_index) = sample_index;
5244             negative_sample_index++;
5245         }
5246 
5247     }
5248 
5249     return descriptives(data, negative_used_samples_indices, input_variables_indices);
5250 }
5251 
5252 
5253 /// Returns a matrix with the data set descriptive statistics.
5254 /// @param class_index Data set index number to make the descriptive statistics.
5255 
calculate_columns_descriptives_categories(const Index & class_index) const5256 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_categories(const Index& class_index) const
5257 {
5258     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5259     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5260 
5261     const Index samples_number = used_samples_indices.size();
5262 
5263     // Count used class samples
5264 
5265     Index class_samples_number = 0;
5266 
5267     for (Index i = 0; i < samples_number; i++)
5268     {
5269         Index sample_index = used_samples_indices(i);
5270 
5271         if(abs(data(sample_index, class_index) - 1) < numeric_limits<type>::min()) class_samples_number++;
5272     }
5273 
5274     // Get used class samples indices
5275 
5276     Tensor<Index, 1> class_used_samples_indices(class_samples_number);
5277     class_used_samples_indices.setZero();
5278     Index class_sample_index = 0;
5279 
5280     for(Index i = 0; i < samples_number; i++)
5281     {
5282         Index sample_index = used_samples_indices(i);
5283 
5284         if(abs(data(sample_index, class_index) - 1) < numeric_limits<type>::min())
5285         {
5286             class_used_samples_indices(class_sample_index) = sample_index;
5287             class_sample_index++;
5288         }
5289     }
5290 
5291     return descriptives(data, class_used_samples_indices, input_variables_indices);
5292 }
5293 
5294 
5295 /// Returns a vector of vectors containing some basic descriptives of all variables on the training samples.
5296 /// The size of this vector is four. The subvectors are:
5297 /// <ul>
5298 /// <li> Training data minimum.
5299 /// <li> Training data maximum.
5300 /// <li> Training data mean.
5301 /// <li> Training data standard deviation.
5302 /// </ul>
5303 
calculate_columns_descriptives_training_samples() const5304 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_training_samples() const
5305 {
5306     const Tensor<Index, 1> training_indices = get_training_samples_indices();
5307 
5308     const Tensor<Index, 1> used_indices = get_used_columns_indices();
5309 
5310     return descriptives(data, training_indices, used_indices);
5311 }
5312 
5313 
5314 /// Returns a vector of vectors containing some basic descriptives of all variables on the selection samples.
5315 /// The size of this vector is four. The subvectors are:
5316 /// <ul>
5317 /// <li> Selection data minimum.
5318 /// <li> Selection data maximum.
5319 /// <li> Selection data mean.
5320 /// <li> Selection data standard deviation.
5321 /// </ul>
5322 
calculate_columns_descriptives_selection_samples() const5323 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_selection_samples() const
5324 {
5325     const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
5326 
5327     const Tensor<Index, 1> used_indices = get_used_columns_indices();
5328 
5329     return descriptives(data, selection_indices, used_indices);
5330 }
5331 
5332 
5333 /// Returns a vector of Descriptives structures with some basic statistics of the input variables on the used samples.
5334 /// This includes the minimum, maximum, mean and standard deviation.
5335 /// The size of this vector is the number of inputs.
5336 
calculate_input_variables_descriptives() const5337 Tensor<Descriptives, 1> DataSet::calculate_input_variables_descriptives() const
5338 {
5339     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5340 
5341     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5342 
5343     return descriptives(data, used_samples_indices, input_variables_indices);
5344 }
5345 
5346 
5347 /// Returns a vector of vectors with some basic descriptives of the target variables on all
5348 /// The size of this vector is four. The subvectors are:
5349 /// <ul>
5350 /// <li> Target variables minimum.
5351 /// <li> Target variables maximum.
5352 /// <li> Target variables mean.
5353 /// <li> Target variables standard deviation.
5354 /// </ul>
5355 
calculate_target_variables_descriptives() const5356 Tensor<Descriptives, 1> DataSet::calculate_target_variables_descriptives() const
5357 {
5358     const Tensor<Index, 1> used_indices = get_used_samples_indices();
5359 
5360     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5361 
5362     return descriptives(data, used_indices, target_variables_indices);
5363 }
5364 
5365 
5366 /// Returns a vector containing the minimums of the input variables.
5367 
calculate_input_variables_minimums() const5368 Tensor<type, 1> DataSet::calculate_input_variables_minimums() const
5369 {
5370     return columns_minimums(data, get_used_samples_indices(), get_input_variables_indices());
5371 }
5372 
5373 
5374 /// Returns a vector containing the minimums of the target variables.
5375 
calculate_target_variables_minimums() const5376 Tensor<type, 1> DataSet::calculate_target_variables_minimums() const
5377 {
5378     return columns_minimums(data, get_used_samples_indices(), get_target_variables_indices());
5379 }
5380 
5381 
5382 
5383 /// Returns a vector containing the maximums of the input variables.
5384 
calculate_input_variables_maximums() const5385 Tensor<type, 1> DataSet::calculate_input_variables_maximums() const
5386 {
5387     return columns_maximums(data, get_used_samples_indices(), get_input_variables_indices());
5388 }
5389 
5390 
5391 /// Returns a vector containing the maximums of the target variables.
5392 
calculate_target_variables_maximums() const5393 Tensor<type, 1> DataSet::calculate_target_variables_maximums() const
5394 {
5395     return columns_maximums(data, get_used_samples_indices(), get_target_variables_indices());
5396 }
5397 
5398 
5399 /// Returns a vector containing the minimums of the used variables.
5400 
calculate_used_variables_minimums() const5401 Tensor<type, 1> DataSet::calculate_used_variables_minimums() const
5402 {
5403     return columns_minimums(data, get_used_samples_indices(), get_used_variables_indices());
5404 }
5405 
5406 /// Returns a vector containing the means of a set of given variables.
5407 /// @param variables_indices Indices of the variables.
5408 
Tensor<type, 1> DataSet::calculate_variables_means(const Tensor<Index, 1>& variables_indices) const
{
    /// Returns the mean of each requested variable, computed over ALL rows of
    /// the data matrix (sample usage is not taken into account here).

    const Index variables_number = variables_indices.size();

    Tensor<type, 1> means(variables_number);

    #pragma omp parallel for

    for(Index position = 0; position < variables_number; position++)
    {
        // chip(index, 1) extracts the data column of the variable.

        const Tensor<type, 0> column_mean = data.chip(variables_indices(position), 1).mean();

        means(position) = column_mean(0);
    }

    return means;
}
5428 
5429 
5430 /// Returns a vector with some basic descriptives of the given input variable on all
5431 /// The size of this vector is four:
5432 /// <ul>
5433 /// <li> Input variable minimum.
5434 /// <li> Input variable maximum.
5435 /// <li> Input variable mean.
5436 /// <li> Input variable standard deviation.
5437 /// </ul>
5438 /// @todo
5439 
Descriptives DataSet::calculate_input_descriptives(const Index& input_index) const
{
    // @todo Not implemented: input_index is currently ignored and a
    // default-constructed Descriptives object is returned. The intended
    // implementation (missing-values-aware descriptives of one column)
    // is kept below for reference.
//    return descriptives_missing_values(data.chip(input_index,1));

    return Descriptives();
}
5446 
5447 
calculate_used_targets_mean() const5448 Tensor<type, 1> DataSet::calculate_used_targets_mean() const
5449 {
5450     const Tensor<Index, 1> used_indices = get_used_samples_indices();
5451 
5452     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5453 
5454     return mean(data, used_indices, target_variables_indices);
5455 }
5456 
5457 
5458 
5459 /// Returns the mean values of the target variables on the selection
5460 
calculate_selection_targets_mean() const5461 Tensor<type, 1> DataSet::calculate_selection_targets_mean() const
5462 {
5463     const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
5464 
5465     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5466 
5467     return mean(data, selection_indices, target_variables_indices);
5468 }
5469 
5470 
5471 /// Returns the value of the gmt that has the data set, by default it is 0.
5472 /// This is recommended to use in forecasting problems.
5473 
Index DataSet::get_gmt() const
{
    // Greenwich Mean Time offset used in forecasting problems (0 by default).
    return gmt;
}
5478 
5479 
5480 /// Sets the value of the gmt, by default it is 0.
5481 /// This is recommended to use in forecasting problems.
5482 
set_gmt(Index & new_gmt)5483 void DataSet::set_gmt(Index& new_gmt)
5484 {
5485     gmt = new_gmt;
5486 }
5487 
5488 
5489 /// Calculates the correlations between all outputs and all inputs.
5490 /// It returns a matrix with the data stored in CorrelationsResults format, where the number of rows is the input number
5491 /// and number of columns is the target number.
5492 /// Each element contains the correlation between a single input and a single target.
5493 
Tensor<CorrelationResults, 2> DataSet::calculate_input_target_columns_correlations() const
{
    // Result is indexed (input column, target column); each entry holds the
    // correlation between one input column and one target column, computed
    // with a method chosen from the pair of column types below.

    const Index input_columns_number = get_input_columns_number();
    const Index target_columns_number = get_target_columns_number();

    const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
    Tensor<Index, 1> target_columns_indices = get_target_columns_indices();

    Tensor<CorrelationResults, 2> correlations(input_columns_number, target_columns_number);

//    #pragma omp parallel for

    for(Index i = 0; i < input_columns_number; i++)
    {
        const Index input_index = input_columns_indices(i);

        // One data column for Numeric/Binary; one column per category for Categorical.

        Tensor<type, 2> input = get_column_data(input_index);

        const ColumnType input_type = columns(input_index).type;

        for(Index j = 0; j < target_columns_number; j++)
        {
            const Index target_index = target_columns_indices(j);

            Tensor<type, 2> target = get_column_data(target_index);

            const ColumnType target_type = columns(target_index).type;

            // Progress reporting (intentional user feedback, as elsewhere in OpenNN).

            cout << "Calculating " << columns(input_index).name << " - " << columns(target_index).name << " correlations. \n" ;

            if(input_type == Numeric && target_type == Numeric)
            {
                // Numeric-numeric: try several functional forms and keep the
                // strongest correlation in absolute value.

                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                const CorrelationResults linear_correlation = linear_correlations(thread_pool_device, input_column, target_column);
                const CorrelationResults exponential_correlation = exponential_correlations(thread_pool_device, input_column, target_column);
                const CorrelationResults logarithmic_correlation = logarithmic_correlations(thread_pool_device, input_column, target_column);
                const CorrelationResults power_correlation = power_correlations(thread_pool_device, input_column, target_column);

                CorrelationResults strongest_correlation = linear_correlation;

                if(abs(exponential_correlation.correlation) > abs(strongest_correlation.correlation)) strongest_correlation = exponential_correlation;
                if(abs(logarithmic_correlation.correlation) > abs(strongest_correlation.correlation)) strongest_correlation = logarithmic_correlation;
                if(abs(power_correlation.correlation) > abs(strongest_correlation.correlation)) strongest_correlation = power_correlation;

                correlations(i,j) = strongest_correlation;
            }
            else if(input_type == Binary && target_type == Binary)
            {
                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                correlations(i,j) = linear_correlations(thread_pool_device, input_column, target_column);
            }
            else if(input_type == Categorical && target_type == Categorical)
            {
                // @todo
                correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target);

//                correlations(i,j) = karl_pearson_correlations(thread_pool_device, input, target);
            }
            else if(input_type == Numeric && target_type == Binary)
            {
                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                correlations(i,j) = logistic_correlations(thread_pool_device, input_column, target_column);
            }
            else if(input_type == Binary && target_type == Numeric)
            {
                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                correlations(i,j) = logistic_correlations(thread_pool_device, input_column, target_column);
            }
            else if(input_type == Categorical && target_type == Numeric)
            {
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target/*target_column*/);
            }
            else if(input_type == Numeric && target_type == Categorical)
            {
                // Note: arguments are swapped so the categorical block is
                // always the first (multi-column) operand.

                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));

                correlations(i,j) = multiple_logistic_correlations(thread_pool_device, target, input/*input_column*/);
            }
            else if(input_type == Binary && target_type == Categorical)
            {
                const TensorMap<Tensor<type, 1>> input_column(input.data(), input.dimension(0));

                correlations(i,j) = multiple_logistic_correlations(thread_pool_device, target, input/*input_column*/);

//                correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target);

//                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));

//                Tensor<type, 2> new_input = transform_binary_column(input_column);

//                correlations(i,j) = karl_pearson_correlations(thread_pool_device, new_input, target);
            }
            else if(input_type == Categorical && target_type == Binary)
            {
                correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target);

//                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

//                Tensor<type, 2> new_target = transform_binary_column(target_column);

//                correlations(i,j) = karl_pearson_correlations(thread_pool_device, input, new_target);
            }
            else if(input_type == DateTime || target_type == DateTime)
            {
                // Time columns are not correlated; report zero.

                correlations(i,j).correlation = 0;
            }
            else
            {
                ostringstream buffer;

                buffer << "OpenNN Exception: DataSet class.\n"
                       << "Tensor<type, 2> calculate_input_target_columns_correlations() const method.\n"
                       << "Case not found: Column " << columns(input_index).name << " and Column " << columns(target_index).name << ".\n";

                throw logic_error(buffer.str());
            }

            cout << "Correlation: " << correlations(i,j).correlation << endl;

        }
    }

    return correlations;
}
5628 
5629 
5630 /// Calculates the correlations between all outputs and all inputs.
5631 /// It returns a matrix with the number of rows is the input number
5632 /// and number of columns is the target number.
5633 /// Each element contains the correlation between a single input and a single target.
5634 
calculate_input_target_columns_correlations_values() const5635 Tensor<type, 2> DataSet::calculate_input_target_columns_correlations_values() const
5636 {
5637     Tensor<CorrelationResults, 2> correlations = calculate_input_target_columns_correlations();
5638 
5639     const Index rows_number = correlations.dimension(0);
5640     const Index columns_number = correlations.dimension(1);
5641 
5642     Tensor<type, 2> correlations_values(rows_number, columns_number);
5643 
5644     for(Index i = 0; i < rows_number; i++)
5645     {
5646         for(Index j = 0; j < columns_number; j++)
5647         {
5648             correlations_values(i,j) = correlations(i,j).correlation;
5649         }
5650     }
5651 
5652     return correlations_values;
5653 }
5654 
5655 
5656 /// Returns true if the data contain missing values.
5657 
has_nan() const5658 bool DataSet::has_nan() const
5659 {
5660     for(Index i = 0; i < data.size(); i++) if(::isnan(data(i))) return true;
5661 
5662     return false;
5663 }
5664 
5665 
5666 /// Returns true if the given row contains missing values.
5667 
has_nan_row(const Index & row_index) const5668 bool DataSet::has_nan_row(const Index& row_index) const
5669 {
5670     for(Index j = 0; j < data.dimension(1); j++)
5671     {
5672         if(::isnan(data(row_index,j))) return true;
5673     }
5674 
5675     return false;
5676 }
5677 
5678 
5679 /// Print on screen the information about the missing values in the data set.
5680 /// <ul>
5681 /// <li> Total number of missing values.
5682 /// <li> Number of variables with missing values.
5683 /// <li> Number of samples with missing values.
5684 /// </ul>
5685 /// @todo implement with indices of variables and samples?
5686 
void DataSet::print_missing_values_information() const
{
    // @todo Not implemented: the whole body is disabled because the Tensor
    // helpers it relied on (count_nan, count_columns_with_nan,
    // count_rows_with_nan) are not available. Currently a no-op.

//    const Index missing_values_number = data.count_nan();

//    cout << "Missing values number: " << missing_values_number << " (" << missing_values_number*100/data.size() << "%)" << endl;

//    const Index variables_with_missing_values = data.count_columns_with_nan();

//    cout << "Variables with missing values: " << variables_with_missing_values << " (" << variables_with_missing_values*100/data.dimension(1) << "%)" << endl;

//    const Index samples_with_missing_values = data.count_rows_with_nan();

//    cout << "Samples with missing values: " << samples_with_missing_values << " (" << samples_with_missing_values*100/data.dimension(0) << "%)" << endl;
}
5701 
5702 
5703 /// Print on screen the correlation between targets and inputs.
5704 
print_input_target_columns_correlations() const5705 void DataSet::print_input_target_columns_correlations() const
5706 {
5707     const Index inputs_number = get_input_variables_number();
5708     const Index targets_number = get_target_variables_number();
5709 
5710     const Tensor<string, 1> inputs_names = get_input_variables_names();
5711     const Tensor<string, 1> targets_name = get_target_variables_names();
5712 
5713     const Tensor<RegressionResults, 2> correlations;// = calculate_input_target_columns_correlations();
5714 
5715     for(Index j = 0; j < targets_number; j++)
5716     {
5717         for(Index i = 0; i < inputs_number; i++)
5718         {
5719             cout << targets_name(j) << " - " << inputs_names(i) << ": " << correlations(i,j).correlation << endl;
5720         }
5721     }
5722 }
5723 
5724 
/// Prints on screen the correlations between inputs and targets.
5726 /// @param number Number of variables to be printed.
5727 /// @todo
5728 
void DataSet::print_top_input_target_columns_correlations(const Index& number) const
{
    // @todo Incomplete implementation: the correlations tensor is default-constructed
    // (empty) because the computing call is commented out, and the map insertion below
    // is also disabled, so this method currently prints nothing.
    // The number parameter (intended count of pairs to print) is presently unused.

    const Index inputs_number = get_input_columns_number();
    const Index targets_number = get_target_columns_number();

    const Tensor<string, 1> inputs_names = get_input_variables_names();
    const Tensor<string, 1> targets_name = get_target_variables_names();

    const Tensor<RegressionResults, 2> correlations;// = calculate_input_target_columns_correlations();

    // Unused until the computation above is re-enabled.

    Tensor<type, 1> target_correlations(inputs_number);

    Tensor<string, 2> top_correlations(inputs_number, 2);

    // NOTE(review): a map keyed by the correlation value silently drops pairs that
    // share the same correlation; consider multimap when this is re-enabled.

    map<type,string> top_correlation;

    for(Index i = 0 ; i < inputs_number; i++)
    {
        for(Index j = 0 ; j < targets_number ; j++)
        {
//            top_correlation.insert(pair<type,string>(correlations(i,j), inputs_names(i) + " - " + targets_name(j)));
        }
    }

    map<type,string>::iterator it;

    // Iterates in ascending correlation order (std::map ordering).

    for(it = top_correlation.begin(); it!=top_correlation.end(); it++)
    {
        cout << "Correlation:  " << (*it).first << "  between  " << (*it).second << "" << endl;
    }
}
5760 
5761 
5762 /// Calculates the regressions between all outputs and all inputs.
5763 /// It returns a matrix with the data stored in RegressionResults format, where the number of rows is the input number
5764 /// and number of columns is the target number.
5765 /// Each element contains the correlation between a single input and a single target.
5766 
Tensor<RegressionResults, 2> DataSet::calculate_input_target_columns_regressions() const
{
    const Index input_columns_number = get_input_columns_number();
    const Index target_columns_number = get_target_columns_number();

    const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
    Tensor<Index, 1> target_columns_indices = get_target_columns_indices();

    // Result matrix: one row per input column, one column per target column.

    Tensor<RegressionResults, 2> regressions(input_columns_number, target_columns_number);

    //@todo check pragma, if uncommented, for does not work well.
//#pragma omp parallel for

    for(Index i = 0; i < input_columns_number; i++)
    {
        cout << endl;

        const Index input_index = input_columns_indices(i);

        // Column data as a samples x 1 matrix (categorical columns may span
        // several variables — presumably one per category; TODO confirm).

        Tensor<type, 2> input = get_column_data(input_index);

        const ColumnType input_type = columns(input_index).type;

        cout << "Calculating " << columns(input_index).name;

        for(Index j = 0; j < target_columns_number; j++)
        {
            const Index target_index = target_columns_indices(j);

            Tensor<type, 2> target = get_column_data(target_index);

            const ColumnType target_type = columns(target_index).type;

            cout << " - " << columns(target_columns_indices(j)).name << " regression. \n" ;

            if(input_type == Numeric && target_type == Numeric)
            {
                // View the single-column matrices as vectors (no copy).

                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                // Try four functional forms and keep the one with the strongest
                // correlation (by absolute value).

                const RegressionResults linear_regression = OpenNN::linear_regression(thread_pool_device, input_column, target_column);
                const RegressionResults exponential_regression = OpenNN::exponential_regression(thread_pool_device, input_column, target_column);
                const RegressionResults logarithmic_regression = OpenNN::logarithmic_regression(thread_pool_device, input_column, target_column);
                const RegressionResults power_regression = OpenNN::power_regression(thread_pool_device, input_column, target_column);

                RegressionResults strongest_regression = linear_regression;

                if(abs(exponential_regression.correlation) > abs(strongest_regression.correlation)) strongest_regression = exponential_regression;
                if(abs(logarithmic_regression.correlation) > abs(strongest_regression.correlation)) strongest_regression = logarithmic_regression;
                if(abs(power_regression.correlation) > abs(strongest_regression.correlation)) strongest_regression = power_regression;

                regressions(i,j) = strongest_regression;
            }
            else if(input_type == Binary && target_type == Binary)
            {
                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                regressions(i,j) = linear_regression(thread_pool_device, input_column, target_column);
            }
            else if(input_type == Numeric && target_type == Binary)
            {
                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                regressions(i,j) = logistic_regression(thread_pool_device, input_column, target_column);
            }
            else if(input_type == Binary && target_type == Numeric)
            {
                const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
                const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));

                regressions(i,j) = logistic_regression(thread_pool_device, input_column, target_column);
            }
            // For the categorical combinations below no regression is computed:
            // only a and b are zeroed. NOTE(review): the correlation member is
            // left default-initialized in these cases — confirm that callers
            // reading .correlation handle this.
            else if(input_type == Categorical && target_type == Categorical)
            {
                // Nothing

                regressions(i,j).a = 0;
                regressions(i,j).b = 0;
            }
            else if(input_type == Categorical && target_type == Numeric)
            {
                // Nothing

                regressions(i,j).a = 0;
                regressions(i,j).b = 0;
            }
            else if(input_type == Numeric && target_type == Categorical)
            {
                // Nothing

                regressions(i,j).a = 0;
                regressions(i,j).b = 0;
            }
            else if(input_type == Binary && target_type == Categorical)
            {
                // nothing

                regressions(i,j).a = 0;
                regressions(i,j).b = 0;
            }
            else if(input_type == Categorical && target_type == Binary)
            {
                // nothing

                regressions(i,j).a = 0;
                regressions(i,j).b = 0;
            }
            else
            {
                // Unknown column-type combination.

                ostringstream buffer;

                buffer << "OpenNN Exception: DataSet class.\n"
                       << "Tensor<type, 2> calculate_input_target_columns_regressions() const method.\n"
                       << "Case not found: Column " << columns(input_index).name << " and Column " << columns(target_index).name << ".\n";

                throw logic_error(buffer.str());
            }
        }
    }

    return regressions;
}
5891 
5892 
5893 /// Calculate the correlation between each input in the data set.
5894 /// Returns a matrix with the correlation values between variables in the data set.
5895 
calculate_input_columns_correlations() const5896 Tensor<type, 2> DataSet::calculate_input_columns_correlations() const
5897 {
5898     const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
5899 
5900     const Index input_columns_number = get_input_columns_number();
5901 
5902     Tensor<type, 2> correlations(input_columns_number, input_columns_number);
5903     correlations.setConstant(1);
5904 
5905     for(Index i = 0; i < input_columns_number; i++)
5906     {
5907         const Index current_input_index_i = input_columns_indices(i);
5908 
5909         const ColumnType type_i = columns(current_input_index_i).type;
5910 
5911         Tensor<type, 2> input_i = get_column_data(current_input_index_i);
5912 
5913         cout << "Calculating " << columns(current_input_index_i).name << " correlations. " << endl;
5914 
5915         #pragma omp parallel for
5916 
5917         for(Index j = i; j < input_columns_number; j++)
5918         {
5919             const Index current_input_index_j = input_columns_indices(j);
5920 
5921             const ColumnType type_j = columns(current_input_index_j).type;
5922 
5923             Tensor<type, 2> input_j = get_column_data(current_input_index_j);
5924 
5925             if(current_input_index_i == current_input_index_j)
5926             {
5927                 correlations(i,j) = 1;
5928                 continue;
5929             }
5930 
5931             if(type_i == Numeric && type_j == Numeric)
5932             {
5933                 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5934                 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5935 
5936                 const type linear_correlation = OpenNN::linear_correlation(thread_pool_device, current_input_i, current_input_j);
5937                 const type exponential_correlation = OpenNN::exponential_correlation(thread_pool_device, current_input_i, current_input_j);
5938                 const type logarithmic_correlation = OpenNN::logarithmic_correlation(thread_pool_device, current_input_i, current_input_j);
5939                 const type power_correlation = OpenNN::power_correlation(thread_pool_device, current_input_i, current_input_j);
5940 
5941                 type strongest_correlation = linear_correlation;
5942 
5943                 if(fabsf(exponential_correlation) > fabsf(strongest_correlation)) strongest_correlation = exponential_correlation;
5944                 if(fabsf(logarithmic_correlation) > fabsf(strongest_correlation)) strongest_correlation = logarithmic_correlation;
5945                 if(fabsf(power_correlation) > fabsf(strongest_correlation)) strongest_correlation = power_correlation;
5946 
5947                 correlations(i,j) = strongest_correlation;
5948             }
5949             else if(type_i == Binary && type_j == Binary)
5950             {
5951                 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5952                 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5953 
5954                 correlations(i,j) = linear_correlation(thread_pool_device, current_input_i, current_input_j);
5955             }
5956             else if(type_i == Categorical && type_j == Categorical)
5957             {
5958                 correlations(i,j) = karl_pearson_correlation(thread_pool_device, input_i, input_j);
5959             }
5960             else if(type_i == Numeric && type_j == Binary)
5961             {
5962                 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5963                 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5964 
5965                 correlations(i,j) = logistic_correlations(thread_pool_device, current_input_i, current_input_j).correlation;
5966             }
5967             else if(type_i == Binary && type_j == Numeric)
5968             {
5969                 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5970                 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5971 
5972                 correlations(i,j) = logistic_correlations(thread_pool_device, current_input_i, current_input_j).correlation;
5973             }
5974             else if(type_i == Categorical && type_j == Numeric)
5975             {
5976                 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5977 
5978                 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_i, input_j/*current_input_j*/).correlation;
5979             }
5980             else if(type_i == Numeric && type_j == Categorical)
5981             {
5982                 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5983 
5984                 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_j, input_i/*current_input_i*/).correlation;
5985             }
5986             else if(type_i == Categorical && type_j == Binary)
5987             {
5988                 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5989 
5990                 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_i, input_j/*current_input_j*/).correlation;
5991             }
5992             else if(type_i == Binary && type_j == Categorical)
5993             {
5994                 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5995 
5996                 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_j, input_i/*current_input_i*/).correlation;
5997             }
5998             else
5999             {
6000                 ostringstream buffer;
6001 
6002                 buffer << "OpenNN Exception: DataSet class.\n"
6003                        << "Tensor<type, 2> calculate_inputs_correlations() const method.\n"
6004                        << "Case not found: Column " << columns(input_columns_indices(i)).name << " and Column " << columns(input_columns_indices(j)).name << ".\n";
6005 
6006                 throw logic_error(buffer.str());
6007             }
6008 
6009         }
6010     }
6011 
6012     for(Index i = 0; i < input_columns_number; i++)
6013     {
6014         for(Index j = 0; j < i; j++)
6015         {
6016             correlations(i,j) = correlations(j,i);
6017         }
6018     }
6019 
6020     return correlations;
6021 
6022 }
6023 
6024 
6025 /// Print on screen the correlation between variables in the data set.
6026 
print_inputs_correlations() const6027 void DataSet::print_inputs_correlations() const
6028 {
6029     const Tensor<type, 2> inputs_correlations = calculate_input_columns_correlations();
6030 
6031     cout << inputs_correlations << endl;
6032 }
6033 
6034 
print_data_file_preview() const6035 void DataSet::print_data_file_preview() const
6036 {
6037     const Index size = data_file_preview.size();
6038 
6039     for(Index i = 0;  i < size; i++)
6040     {
6041         for(Index j = 0; j < data_file_preview(i).size(); j++)
6042         {
6043             cout << data_file_preview(i)(j) << " ";
6044         }
6045 
6046         cout << endl;
6047     }
6048 }
6049 
6050 
/// Prints on screen the correlations between input variables.
6052 /// @param number Number of variables to be printed.
6053 /// @todo Low priority.
6054 
print_top_inputs_correlations(const Index & number) const6055 void DataSet::print_top_inputs_correlations(const Index& number) const
6056 {
6057     const Index variables_number = get_input_variables_number();
6058 
6059     const Tensor<string, 1> variables_name = get_input_variables_names();
6060 
6061     const Tensor<type, 2> variables_correlations = calculate_input_columns_correlations();
6062 
6063     const Index correlations_number = variables_number*(variables_number-1)/2;
6064 
6065     Tensor<string, 2> top_correlations(correlations_number, 3);
6066 
6067     map<type, string> top_correlation;
6068 
6069     for(Index i = 0; i < variables_number; i++)
6070     {
6071         for(Index j = i; j < variables_number; j++)
6072         {
6073             if(i == j) continue;
6074 
6075             top_correlation.insert(pair<type,string>(variables_correlations(i,j), variables_name(i) + " - " + variables_name(j)));
6076          }
6077      }
6078 
6079     map<type,string> :: iterator it;
6080 
6081     for(it=top_correlation.begin(); it!=top_correlation.end(); it++)
6082     {
6083         cout << "Correlation: " << (*it).first << "  between  " << (*it).second << "" << endl;
6084     }
6085 }
6086 
6087 
6088 /// Returns the covariance matrix for the input data set.
6089 /// The number of rows of the matrix is the number of inputs.
6090 /// The number of columns of the matrix is the number of inputs.
6091 /// @todo
6092 
calculate_covariance_matrix() const6093 Tensor<type, 2> DataSet::calculate_covariance_matrix() const
6094 {
6095     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6096     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
6097 
6098     const Index inputs_number = get_input_variables_number();
6099 
6100     Tensor<type, 2> covariance_matrix(inputs_number, inputs_number);
6101 
6102     for(Index i = 0; i < static_cast<Index>(inputs_number); i++)
6103     {
6104         const Index first_input_index = input_variables_indices(i);
6105 
6106 //        const Tensor<type, 1> first_inputs = data.get_column(first_input_index, used_samples_indices);
6107 
6108         for(Index j = i; j < inputs_number; j++)
6109         {
6110             const Index second_input_index = input_variables_indices(j);
6111 
6112 //            const Tensor<type, 1> second_inputs = data.get_column(second_input_index, used_samples_indices);
6113 
6114 //            covariance_matrix(i,j) = covariance(first_inputs, second_inputs);
6115             covariance_matrix(j,i) = covariance_matrix(i,j);
6116         }
6117     }
6118 
6119     return covariance_matrix;
6120 }
6121 
6122 
6123 /// Performs the principal components analysis of the inputs.
/// It returns a matrix containing the principal components arranged in rows.
6125 /// This method deletes the unused samples of the original data set.
6126 /// @param minimum_explained_variance Minimum percentage of variance used to select a principal component.
6127 
Tensor<type, 2> DataSet::perform_principal_components_analysis(const type& minimum_explained_variance)
{
    // @todo Most of this method is disabled (eigendecomposition helpers are
    // commented out). In its current state it centers the inputs and then
    // returns an empty tensor: principal_components_indices is never filled, so
    // principal_components_number is always 0.

    // Subtract off the mean

    subtract_inputs_mean();

    // Calculate covariance matrix

    const Tensor<type, 2> covariance_matrix = this->calculate_covariance_matrix();

    // Calculate eigenvectors

//    const Tensor<type, 2> eigenvectors = OpenNN::eigenvectors(covariance_matrix);

    // Calculate eigenvalues

//    const Tensor<type, 2> eigenvalues = OpenNN::eigenvalues(covariance_matrix);

    // Calculate explained variance

//    const Tensor<type, 1> explained_variance = OpenNN::explained_variance(eigenvalues.chip(0,1));

    // Sort principal components

//    const Tensor<Index, 1> sorted_principal_components_indices = explained_variance.sort_descending_indices();

    // Choose eigenvectors

    const Index inputs_number = covariance_matrix.dimension(1);

    Tensor<Index, 1> principal_components_indices;

    // Unused until the sorting code above is re-enabled.

    Index index;

    // NOTE(review): with the conditional commented out, both blocks below are
    // plain scopes — the loop body is effectively a no-op.

    for(Index i = 0; i < inputs_number; i++)
    {
//        index = sorted_principal_components_indices(i);

//        if(explained_variance(index) >= minimum_explained_variance)
        {
//            principal_components_indices.push_back(i);
        }
//        else
        {
            continue;
        }
    }

    const Index principal_components_number = principal_components_indices.size();

    // Arrange principal components matrix

    Tensor<type, 2> principal_components;

    if(principal_components_number == 0)
    {
        return principal_components;
    }
    else
    {
        principal_components.resize(principal_components_number, inputs_number);
    }

    for(Index i = 0; i < principal_components_number; i++)
    {
//        index = sorted_principal_components_indices(i);

//        principal_components.set_row(i, eigenvectors.chip(index,1));
    }

    // Return feature matrix

//    return principal_components.get_submatrix_rows(principal_components_indices);

    return Tensor<type, 2>();
}
6204 
6205 
6206 /// Performs the principal components analysis of the inputs.
6207 /// It returns a matrix containing the principal components arranged in rows.
6208 /// This method deletes the unused samples of the original data set.
6209 /// @param covariance_matrix Matrix of covariances.
6210 /// @param explained_variance vector of the explained variances of the variables.
6211 /// @param minimum_explained_variance Minimum percentage of variance used to select a principal component.
6212 /// @todo
6213 
Tensor<type, 2> DataSet::perform_principal_components_analysis(const Tensor<type, 2>& covariance_matrix,
        const Tensor<type, 1>& explained_variance,
        const type& minimum_explained_variance)
{
        // @todo Most of this overload is disabled (eigendecomposition helpers are
        // commented out). In its current state it centers the inputs and then
        // returns an empty tensor: principal_components_indices is never filled,
        // so principal_components_number is always 0.

        // Subtract off the mean

        subtract_inputs_mean();

        // Calculate eigenvectors

//        const Tensor<type, 2> eigenvectors = OpenNN::eigenvectors(covariance_matrix);

        // Sort principal components

//        const Tensor<Index, 1> sorted_principal_components_indices = explained_variance.sort_descending_indices();

        // Choose eigenvectors

        const Index inputs_number = covariance_matrix.dimension(1);

        Tensor<Index, 1> principal_components_indices;

        // Unused until the sorting code above is re-enabled.

        Index index;

        // NOTE(review): with the conditional commented out, both blocks below are
        // plain scopes — the loop body is effectively a no-op.

        for(Index i = 0; i < inputs_number; i++)
        {
//            index = sorted_principal_components_indices(i);

//            if(explained_variance(index) >= minimum_explained_variance)
            {
//                principal_components_indices.push_back(i);
            }
//            else
            {
                continue;
            }
        }

        const Index principal_components_number = principal_components_indices.size();

        // Arrange principal components matrix

        Tensor<type, 2> principal_components;

        if(principal_components_number == 0)
        {
            return principal_components;
        }
        else
        {
            principal_components.resize(principal_components_number, inputs_number);
        }

        for(Index i = 0; i < principal_components_number; i++)
        {
//            index = sorted_principal_components_indices(i);

//            principal_components.set_row(i, eigenvectors.chip(index,1));
        }

        // Return feature matrix

//        return principal_components.get_submatrix_rows(principal_components_indices);

    return Tensor<type, 2>();
}
6280 
6281 
6282 /// Transforms the data according to the principal components.
6283 /// @param principal_components Matrix containing the principal components.
6284 /// @todo
6285 
void DataSet::transform_principal_components_data(const Tensor<type, 2>& principal_components)
{
    // @todo Incomplete: new_data is computed but never written back to the data
    // member because the assemble_columns call at the end is commented out, so
    // the only lasting effect is the mean subtraction.

    const Tensor<type, 2> targets = get_target_data();

    subtract_inputs_mean();

    const Index principal_components_number = principal_components.dimension(0);

    // Transform data

    const Tensor<Index, 1> used_samples = get_used_samples_indices();

    const Index new_samples_number = get_used_samples_number();

    const Tensor<type, 2> inputs = get_input_data();

    Tensor<type, 2> new_data(new_samples_number, principal_components_number);

    Index sample_index;

    for(Index i = 0; i < new_samples_number; i++)
    {
        sample_index = used_samples(i);

        // Project each used sample onto each principal component
        // (dot product of the sample row and the component row).

        for(Index j = 0; j < principal_components_number; j++)
        {
            Tensor<type, 0> dot = (inputs.chip(sample_index, 0)).contract(principal_components.chip(j,0),product_vector_vector);

            new_data(i,j) = dot(0);
//            new_data(i,j) = dot(inputs.chip(sample_index, 0), principal_components.chip(j, 0));
        }
    }

//        data = new_data.assemble_columns(targets);

}
6322 
6323 
6324 /// Scales the data matrix with given mean and standard deviation values.
6325 /// It updates the data matrix.
6326 /// @param data_descriptives vector of descriptives structures for all the variables in the data set.
6327 /// The size of that vector must be equal to the number of variables.
6328 /// @todo
6329 
void DataSet::scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>& data_descriptives)
{
   // @todo Incomplete: the actual scaling call at the bottom is commented out,
   // so this method currently only validates sizes (in debug builds) and warns
   // about zero-standard-deviation variables.

   #ifdef __OPENNN_DEBUG__

   ostringstream buffer;

   const Index columns_number = data.dimension(1);

   const Index descriptives_size = data_descriptives.size();

   if(descriptives_size != columns_number)
   {
      buffer << "OpenNN Exception: DataSet class.\n"
             << "void scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>&) method.\n"
             << "Size of descriptives must be equal to number of columns.\n";

      throw logic_error(buffer.str());
   }

   #endif

   const Index variables_number = get_variables_number();

   // Warn (when display is enabled) about variables whose standard deviation is
   // effectively zero: they cannot be scaled by this method.

   for(Index i = 0; i < variables_number; i++)
   {
       if(display && abs(data_descriptives(i).standard_deviation) < numeric_limits<type>::min())
       {
          cout << "OpenNN Warning: DataSet class.\n"
                    << "void scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>&) method.\n"
                    << "Standard deviation of variable " <<  i << " is zero.\n"
                    << "That variable won't be scaled.\n";
        }
    }

//   scale_mean_standard_deviation(data, data_descriptives);

}
6368 
6369 
6370 /// Scales the data using the minimum and maximum method,
6371 /// and the minimum and maximum values calculated from the data matrix.
6372 /// It also returns the descriptives from all columns.
6373 
scale_data_minimum_maximum()6374 Tensor<Descriptives, 1> DataSet::scale_data_minimum_maximum()
6375 {
6376     const Tensor<Descriptives, 1> data_descriptives = calculate_variables_descriptives();
6377 
6378     scale_data_minimum_maximum(data_descriptives);
6379 
6380     return data_descriptives;
6381 }
6382 
6383 
6384 /// Scales the data using the mean and standard deviation method,
6385 /// and the mean and standard deviation values calculated from the data matrix.
6386 /// It also returns the descriptives from all columns.
6387 
scale_data_mean_standard_deviation()6388 Tensor<Descriptives, 1> DataSet::scale_data_mean_standard_deviation()
6389 {
6390     const Tensor<Descriptives, 1> data_descriptives = calculate_variables_descriptives();
6391 
6392     scale_data_mean_standard_deviation(data_descriptives);
6393 
6394     return data_descriptives;
6395 }
6396 
6397 
scale_minimum_maximum_binary(const type & value_1,const type & value_2,const Index & column_index)6398 void DataSet::scale_minimum_maximum_binary(const type& value_1, const type& value_2,const Index& column_index)
6399 {
6400     const Index rows_number = data.dimension(0);
6401 
6402     type slope = 0;
6403     type intercept = 0;
6404 
6405     if(value_1>value_2){
6406         slope = 1/(value_1-value_2);
6407         intercept = -value_2/(value_1-value_2);
6408     }else{
6409         slope = 1/(value_2-value_1);
6410         intercept = -value_1/(value_2-value_1);
6411     }
6412 
6413     for(Index i = 0; i < rows_number; i++)
6414     {
6415         data(i, column_index) = slope*data(i, column_index)+intercept;
6416     }
6417 
6418 }
6419 
/// Subtracts the mean from each of the input variables.
6421 
subtract_inputs_mean()6422 void DataSet::subtract_inputs_mean()
6423 {
6424     Tensor<Descriptives, 1> input_statistics = calculate_input_variables_descriptives();
6425 
6426     Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6427     Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
6428 
6429     Index input_index;
6430     Index sample_index;
6431 
6432     type input_mean;
6433 
6434     for(Index i = 0; i < input_variables_indices.size(); i++)
6435     {
6436         input_index = input_variables_indices(i);
6437 
6438         input_mean = input_statistics(i).mean;
6439 
6440         for(Index j = 0; j < used_samples_indices.size(); j++)
6441         {
6442             sample_index = used_samples_indices(j);
6443 
6444             data(sample_index,input_index) -= input_mean;
6445         }
6446     }
6447 }
6448 
6449 
6450 /// Returns a vector of strings containing the scaling method that best fits each
6451 /// of the input variables.
6452 
calculate_default_scaling_methods() const6453 Tensor<string, 1> DataSet::calculate_default_scaling_methods() const
6454 {
6455     const Tensor<Index, 1> used_inputs_indices = get_input_variables_indices();
6456     const Index used_inputs_number = used_inputs_indices.size();
6457 
6458     Index current_distribution;
6459     Tensor<string, 1> scaling_methods(used_inputs_number);
6460 
6461     #pragma omp parallel for private(current_distribution)
6462 
6463     for(Index i = 0; i < static_cast<Index>(used_inputs_number); i++)
6464     {
6465         current_distribution = perform_distribution_distance_analysis(data.chip(used_inputs_indices(i),1));
6466 
6467         if(current_distribution == 0) // Normal distribution
6468         {
6469             scaling_methods(i) = "MeanStandardDeviation";
6470         }
6471         else if(current_distribution == 1) // Uniform distribution
6472         {
6473             scaling_methods(i) = "MinimumMaximum";
6474         }
6475         else // Default
6476         {
6477             scaling_methods(i) = "MinimumMaximum";
6478         }
6479     }
6480 
6481     return scaling_methods;
6482 }
6483 
6484 
6485 /// Returns a vector of strings containing the scaling method that best fits each
6486 /// of the target variables.
6487 
calculate_default_unscaling_methods() const6488 Tensor<string, 1> DataSet::calculate_default_unscaling_methods() const
6489 {
6490     const Tensor<Index, 1> used_targets_indices = get_target_variables_indices();
6491     const Index used_targets_number = used_targets_indices.size();
6492 
6493     Index current_distribution;
6494     Tensor<string, 1> scaling_methods(used_targets_number);
6495 
6496     #pragma omp parallel for private(current_distribution)
6497 
6498     for(Index i = 0; i < static_cast<Index>(used_targets_number); i++)
6499     {
6500         current_distribution = perform_distribution_distance_analysis(data.chip(used_targets_indices(i),1));
6501 
6502         if(current_distribution == 0) // Normal distribution
6503         {
6504             scaling_methods(i) = "MeanStandardDeviation";
6505         }
6506         else if(current_distribution == 1) // Uniform distribution
6507         {
6508             scaling_methods(i) = "MinimumMaximum";
6509         }
6510         else // Default
6511         {
6512             scaling_methods(i) = "MinimumMaximum";
6513         }
6514     }
6515 
6516     return scaling_methods;
6517 }
6518 
6519 
6520 /// Scales the data matrix with given minimum and maximum values.
6521 /// It updates the data matrix.
6522 /// @param data_descriptives vector of descriptives structures for all the variables in the data set.
6523 /// The size of that vector must be equal to the number of variables.
6524 /// @todo
6525 
scale_data_minimum_maximum(const Tensor<Descriptives,1> & data_descriptives)6526 void DataSet::scale_data_minimum_maximum(const Tensor<Descriptives, 1>& data_descriptives)
6527 {
6528     const Index variables_number = get_variables_number();
6529 
6530 #ifdef __OPENNN_DEBUG__
6531 
6532     ostringstream buffer;
6533 
6534     const Index descriptives_size = data_descriptives.size();
6535 
6536     if(descriptives_size != variables_number)
6537     {
6538         buffer << "OpenNN Exception: DataSet class.\n"
6539                << "void scale_data_minimum_maximum(const Tensor<Descriptives, 1>&) method.\n"
6540                << "Size of data descriptives must be equal to number of variables.\n";
6541 
6542         throw logic_error(buffer.str());
6543     }
6544 
6545 #endif
6546 
6547     for(Index i = 0; i < variables_number; i++)
6548     {
6549         if(display
6550                 && abs(data_descriptives(i).maximum - data_descriptives(i).minimum) < numeric_limits<type>::min())
6551         {
6552             cout << "OpenNN Warning: DataSet class.\n"
6553                  << "void scale_data_minimum_maximum(const Tensor<Descriptives, 1>&) method.\n"
6554                  << "Range of variable " <<  i << " is zero.\n"
6555                  << "That variable won't be scaled.\n";
6556         }
6557     }
6558 
6559 //       scale_minimum_maximum(data, data_descriptives);
6560 }
6561 
6562 
6563 /// Scales the given input variables with given mean and standard deviation values.
6564 /// It updates the input variable of the data matrix.
6565 /// @param input_statistics vector of descriptives structures for the input variables.
6566 /// @param input_index Index of the input to be scaled.
6567 
scale_input_mean_standard_deviation(const Descriptives & input_statistics,const Index & input_index)6568 void DataSet::scale_input_mean_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
6569 {
6570     const type slope = (input_statistics.standard_deviation -0) < static_cast<type>(1e-3) ?
6571                 0 :
6572                 static_cast<type>(1)/input_statistics.standard_deviation;
6573 
6574     const type intercept = (input_statistics.standard_deviation -0) < static_cast<type>(1e-3) ?
6575                 0 :
6576                 -static_cast<type>(1)*input_statistics.mean/input_statistics.standard_deviation;
6577 
6578     for(Index i = 0; i < data.dimension(0); i++)
6579     {
6580         data(i, input_index) = data(i, input_index)*slope + intercept;
6581     }
6582 }
6583 
6584 
6585 /// Scales the given input variables with the calculated mean and standard deviation values from the data matrix.
6586 /// It updates the input variables of the data matrix.
6587 /// It also returns a vector with the variables descriptives.
6588 /// @param input_index Index of the input to be scaled.
6589 
scale_input_mean_standard_deviation(const Index & input_index)6590 Descriptives DataSet::scale_input_mean_standard_deviation(const Index& input_index)
6591 {
6592 #ifdef __OPENNN_DEBUG__
6593 
6594     if(is_empty())
6595     {
6596         ostringstream buffer;
6597 
6598         buffer << "OpenNN Exception: DataSet class.\n"
6599                << "Descriptives scale_input_mean_standard_deviation(const Index&) method.\n"
6600                << "Data file is not loaded.\n";
6601 
6602         throw logic_error(buffer.str());
6603     }
6604 
6605 #endif
6606 
6607     const Descriptives input_statistics = calculate_input_descriptives(input_index);
6608 
6609     scale_input_mean_standard_deviation(input_statistics, input_index);
6610 
6611     return input_statistics;
6612 }
6613 
6614 
6615 /// Scales the given input variables with given standard deviation values.
6616 /// It updates the input variable of the data matrix.
6617 /// @param inputs_statistics vector of descriptives structures for the input variables.
6618 /// @param input_index Index of the input to be scaled.
6619 
scale_input_standard_deviation(const Descriptives & input_statistics,const Index & input_index)6620 void DataSet::scale_input_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
6621 {
6622     for(Index i = 0; i < data.dimension(0); i++)
6623     {
6624         data(i, input_index) = static_cast<type>(2)*(data(i, input_index)) / input_statistics.standard_deviation;
6625     }
6626 }
6627 
6628 
6629 /// Scales the given input variables with the calculated standard deviation values from the data matrix.
6630 /// It updates the input variables of the data matrix.
6631 /// It also returns a vector with the variables descriptives.
6632 /// @param input_index Index of the input to be scaled.
6633 
scale_input_standard_deviation(const Index & input_index)6634 Descriptives DataSet::scale_input_standard_deviation(const Index& input_index)
6635 {
6636 #ifdef __OPENNN_DEBUG__
6637 
6638     if(is_empty())
6639     {
6640         ostringstream buffer;
6641 
6642         buffer << "OpenNN Exception: DataSet class.\n"
6643                << "Descriptives scale_input_standard_deviation(const Index&) method.\n"
6644                << "Data file is not loaded.\n";
6645 
6646         throw logic_error(buffer.str());
6647     }
6648 
6649 #endif
6650 
6651     const Descriptives input_statistics = calculate_input_descriptives(input_index);
6652 
6653     scale_input_standard_deviation(input_statistics, input_index);
6654 
6655     return input_statistics;
6656 }
6657 
6658 
6659 /// Scales the given input variable with given minimum and maximum values.
6660 /// It updates the input variables of the data matrix.
6661 /// @param input_statistics vector with the descriptives of the input variable.
6662 /// @param input_index Index of the input to be scaled.
6663 
scale_input_minimum_maximum(const Descriptives & input_statistics,const Index & input_index)6664 void DataSet::scale_input_minimum_maximum(const Descriptives& input_statistics, const Index& input_index)
6665 {
6666     const type slope = std::abs(input_statistics.maximum-input_statistics.minimum) < static_cast<type>(1e-3) ?
6667                 0 :
6668                 (max_range-min_range)/(input_statistics.maximum-input_statistics.minimum);
6669 
6670     const type intercept = std::abs(input_statistics.maximum-input_statistics.minimum) < static_cast<type>(1e-3) ?
6671                 0 :
6672                 (min_range*input_statistics.maximum-max_range*input_statistics.minimum)/(input_statistics.maximum-input_statistics.minimum);
6673 
6674     for(Index i = 0; i < data.dimension(0); i++)
6675     {
6676         data(i, input_index) = data(i, input_index)*slope + intercept;
6677     }
6678 }
6679 
6680 
6681 /// Scales the given input variable with the calculated minimum and maximum values from the data matrix.
6682 /// It updates the input variable of the data matrix.
6683 /// It also returns a vector with the minimum and maximum values of the input variables.
6684 
scale_input_minimum_maximum(const Index & input_index)6685 Descriptives DataSet::scale_input_minimum_maximum(const Index& input_index)
6686 {
6687 #ifdef __OPENNN_DEBUG__
6688 
6689     if(is_empty())
6690     {
6691         ostringstream buffer;
6692 
6693         buffer << "OpenNN Exception: DataSet class.\n"
6694                << "Descriptives scale_input_minimum_maximum(const Index&) method.\n"
6695                << "Data file is not loaded.\n";
6696 
6697         throw logic_error(buffer.str());
6698     }
6699 
6700 #endif
6701 
6702     const Descriptives input_statistics = calculate_input_descriptives(input_index);
6703 
6704     scale_input_minimum_maximum(input_statistics, input_index);
6705 
6706     return input_statistics;
6707 }
6708 
6709 
scale_input_variables_minimum_maximum(const Tensor<Descriptives,1> & inputs_descriptives)6710 void DataSet::scale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>& inputs_descriptives)
6711 {
6712     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6713 
6714     const Index input_variables_number = input_variables_indices.size();
6715 
6716     for(Index i = 0; i < input_variables_number; i++)
6717     {
6718         scale_input_minimum_maximum(inputs_descriptives[i], input_variables_indices[i]);
6719     }
6720 }
6721 
6722 
scale_input_variables_minimum_maximum()6723 Tensor<Descriptives, 1> DataSet::scale_input_variables_minimum_maximum()
6724 {
6725     const Tensor<Descriptives, 1> inputs_descriptives = calculate_input_variables_descriptives();
6726 
6727     scale_input_variables_minimum_maximum(inputs_descriptives);
6728 
6729     return inputs_descriptives;
6730 
6731 }
6732 
6733 
unscale_input_variables_minimum_maximum(const Tensor<Descriptives,1> & inputs_descriptives)6734 void DataSet::unscale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>& inputs_descriptives)
6735 {
6736     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6737 
6738     const Index input_variables_number = input_variables_indices.size();
6739 
6740     for(Index i = 0; i < input_variables_number; i++)
6741     {
6742         unscale_input_variable_minimum_maximum(inputs_descriptives[i], input_variables_indices[i]);
6743     }
6744 }
6745 
6746 
6747 /// It scales every input variable with the given method.
6748 /// The method to be used is that in the scaling and unscaling method variable.
6749 
scale_input_variables(const Tensor<string,1> & scaling_unscaling_methods)6750 Tensor<Descriptives, 1> DataSet::scale_input_variables(const Tensor<string, 1>& scaling_unscaling_methods)
6751 {
6752     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6753 
6754     const Tensor<Descriptives, 1> inputs_descriptives = calculate_input_variables_descriptives();
6755 
6756     for(Index i = 0; i < scaling_unscaling_methods.dimension(0); i++)
6757     {
6758         switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
6759         {
6760         case NoScaling:
6761         {
6762             // Do nothing
6763         }
6764         break;
6765 
6766         case MinimumMaximum:
6767         {
6768             scale_input_minimum_maximum(inputs_descriptives(i), input_variables_indices(i));
6769         }
6770         break;
6771 
6772         case MeanStandardDeviation:
6773         {
6774             scale_input_mean_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
6775         }
6776         break;
6777 
6778         case StandardDeviation:
6779         {
6780             scale_input_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
6781         }
6782         break;
6783 
6784         default:
6785         {
6786             ostringstream buffer;
6787 
6788             buffer << "OpenNN Exception: DataSet class\n"
6789                    << "void scale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6790                    << "Unknown scaling and unscaling method: " << scaling_unscaling_methods(i) << "\n";
6791 
6792             throw logic_error(buffer.str());
6793         }
6794         }
6795     }
6796 
6797     return inputs_descriptives;
6798 }
6799 
6800 
6801 /// Scales the target variables with given mean and standard deviation values.
6802 /// It updates the target variables of the data matrix.
6803 /// @param targets_descriptives vector of descriptives structures for all the targets in the data set.
6804 /// The size of that vector must be equal to the number of target variables.
6805 
scale_target_variables_mean_standard_deviation(const Tensor<Descriptives,1> & targets_descriptives)6806 void DataSet::scale_target_variables_mean_standard_deviation(const Tensor<Descriptives, 1>& targets_descriptives)
6807 {
6808     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6809     const Index target_variables_number = target_variables_indices.size();
6810 
6811     Index variable_index;
6812 
6813     for(Index i = 0; i < data.dimension(0); i++)
6814     {
6815         for(Index j = 0; j < target_variables_number; j++)
6816         {
6817             variable_index = target_variables_indices(j);
6818 
6819             if(!::isnan(data(i,variable_index)))
6820             {
6821                 data(i, variable_index) =
6822                         (data(i, variable_index)-targets_descriptives(j).mean)/(targets_descriptives(j).standard_deviation);
6823             }
6824         }
6825     }
6826 }
6827 
6828 
6829 /// Scales the target variables with the calculated mean and standard deviation values from the data matrix.
6830 /// It updates the target variables of the data matrix.
6831 /// It also returns a vector of descriptives structures with the basic descriptives of all the variables.
6832 
scale_target_variables_mean_standard_deviation()6833 Tensor<Descriptives, 1> DataSet::scale_target_variables_mean_standard_deviation()
6834 {
6835 #ifdef __OPENNN_DEBUG__
6836 
6837     if(is_empty())
6838     {
6839         ostringstream buffer;
6840 
6841         buffer << "OpenNN Exception: DataSet class.\n"
6842                << "Tensor<Descriptives, 1> scale_target_variables_mean_standard_deviation() method.\n"
6843                << "Data file is not loaded.\n";
6844 
6845         throw logic_error(buffer.str());
6846     }
6847 
6848 #endif
6849 
6850     const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
6851 
6852     scale_target_variables_mean_standard_deviation(targets_descriptives);
6853 
6854     return targets_descriptives;
6855 }
6856 
6857 
6858 /// Scales the target variables with given minimum and maximum values.
6859 /// It updates the target variables of the data matrix.
6860 /// @param targets_descriptives vector of descriptives structures for all the targets in the data set.
6861 /// The size of that vector must be equal to the number of target variables.
6862 
scale_target_variables_minimum_maximum(const Tensor<Descriptives,1> & targets_descriptives)6863 void DataSet::scale_target_variables_minimum_maximum(const Tensor<Descriptives, 1>& targets_descriptives)
6864 {
6865 #ifdef __OPENNN_DEBUG__
6866 
6867     if(is_empty())
6868     {
6869         ostringstream buffer;
6870 
6871         buffer << "OpenNN Exception: DataSet class.\n"
6872                << "Tensor<Descriptives, 1> scale_target_variables_minimum_maximum() method.\n"
6873                << "Data file is not loaded.\n";
6874 
6875         throw logic_error(buffer.str());
6876     }
6877 
6878 #endif
6879 
6880 //    const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6881 //    const Index target_variables_number = target_variables_indices.size();
6882 
6883 //    Index variable_index;
6884 
6885 //    for(Index i = 0; i < data.dimension(0); i++)
6886 //    {
6887 //        for(Index j = 0; j < target_variables_number; j++)
6888 //        {
6889 //            variable_index = target_variables_indices(j);
6890 
6891 //            if(!::isnan(data(i,variable_index)))
6892 //            {
6893 //                data(i, variable_index) =
6894 //                        static_cast<type>(2.0)*(data(i, variable_index)-targets_descriptives(j).minimum)/(targets_descriptives(j).maximum-targets_descriptives(j).minimum)-static_cast<type>(1.0);
6895 //            }
6896 //        }
6897 //    }
6898 
6899     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6900     const Index target_variables_number = target_variables_indices.size();
6901 
6902     for(Index i = 0; i < target_variables_number; i++)
6903     {
6904         scale_target_minimum_maximum(targets_descriptives[i], target_variables_indices[i]);
6905     }
6906 }
6907 
6908 
6909 /// Scales the target variables with the calculated minimum and maximum values from the data matrix.
6910 /// It updates the target variables of the data matrix.
6911 /// It also returns a vector of vectors with the descriptives of the input target variables.
6912 
scale_target_variables_minimum_maximum()6913 Tensor<Descriptives, 1> DataSet::scale_target_variables_minimum_maximum()
6914 {
6915     const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
6916 
6917     scale_target_variables_minimum_maximum(targets_descriptives);
6918 
6919     return targets_descriptives;
6920 }
6921 
6922 
6923 /// Scales the target variables with the logarithmic scale using the given minimum and maximum values.
6924 /// It updates the target variables of the data matrix.
6925 /// @param targets_descriptives vector of descriptives structures for all the targets in the data set.
6926 /// The size of that vector must be equal to the number of target variables.
6927 
scale_target_variables_logarithm(const Tensor<Descriptives,1> & targets_descriptives)6928 void DataSet::scale_target_variables_logarithm(const Tensor<Descriptives, 1>& targets_descriptives)
6929 {
6930 #ifdef __OPENNN_DEBUG__
6931 
6932     if(is_empty())
6933     {
6934         ostringstream buffer;
6935 
6936         buffer << "OpenNN Exception: DataSet class.\n"
6937                << "Tensor<Descriptives, 1> scale_target_variables_logarithm() method.\n"
6938                << "Data file is not loaded.\n";
6939 
6940         throw logic_error(buffer.str());
6941     }
6942 
6943 #endif
6944 
6945     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6946     const Index target_variables_number = target_variables_indices.size();
6947 
6948     Index variable_index;
6949 
6950     for(Index i = 0; i < data.dimension(0); i++)
6951     {
6952         for(Index j = 0; j < target_variables_number; j++)
6953         {
6954             variable_index = target_variables_indices(j);
6955 
6956             if(!::isnan(data(i,variable_index)))
6957             {
6958                 data(i, variable_index) =
6959                         static_cast<type>(0.5)*(exp(data(i, variable_index)))*(targets_descriptives(j).maximum-targets_descriptives(j).minimum)+ targets_descriptives(j).minimum;
6960             }
6961         }
6962     }
6963 }
6964 
6965 
6966 /// Scales the target variables with the logarithmic scale using the calculated minimum and maximum values
6967 /// from the data matrix.
6968 /// It updates the target variables of the data matrix.
6969 /// It also returns a vector of vectors with the descriptives of the input target variables.
6970 
scale_target_variables_logarithm()6971 Tensor<Descriptives, 1> DataSet::scale_target_variables_logarithm()
6972 {
6973     const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
6974 
6975     scale_target_variables_logarithm(targets_descriptives);
6976 
6977     return targets_descriptives;
6978 }
6979 
6980 
6981 /// Calculates the input and target variables descriptives.
6982 /// Then it scales the target variables with those values.
6983 /// The method to be used is that in the scaling and unscaling method variable.
6984 /// Finally, it returns the descriptives.
6985 
scale_target_variables(const string & scaling_unscaling_method)6986 Tensor<Descriptives, 1> DataSet::scale_target_variables(const string& scaling_unscaling_method)
6987 {
6988     switch(get_scaling_unscaling_method(scaling_unscaling_method))
6989     {
6990     case NoUnscaling:
6991     {
6992         return calculate_target_variables_descriptives();
6993     }
6994 
6995     case MinimumMaximum:
6996     {
6997         return scale_target_variables_minimum_maximum();
6998     }
6999 
7000     case Logarithmic:
7001     {
7002         return scale_target_variables_logarithm();
7003     }
7004 
7005     case MeanStandardDeviation:
7006     {
7007         return scale_target_variables_mean_standard_deviation();
7008     }
7009 
7010     default:
7011     {
7012         ostringstream buffer;
7013 
7014         buffer << "OpenNN Exception: DataSet class\n"
7015                << "Tensor<Descriptives, 1> scale_target_variables(const string&) method.\n"
7016                << "Unknown scaling and unscaling method.\n";
7017 
7018         throw logic_error(buffer.str());
7019     }
7020     }
7021 }
7022 
7023 
scale_target_minimum_maximum(const Descriptives & target_statistics,const Index & target_index)7024 void DataSet::scale_target_minimum_maximum(const Descriptives& target_statistics, const Index& target_index)
7025 {
7026     const type slope = std::abs(target_statistics.maximum-target_statistics.minimum) < static_cast<type>(1e-3) ?
7027                 0 :
7028                 (max_range-min_range)/(target_statistics.maximum-target_statistics.minimum);
7029 
7030     const type intercept = std::abs(target_statistics.maximum-target_statistics.minimum) < static_cast<type>(1e-3) ?
7031                 0 :
7032                 (min_range*target_statistics.maximum-max_range*target_statistics.minimum)/(target_statistics.maximum-target_statistics.minimum);
7033 
7034     for(Index i = 0; i < data.dimension(0); i++)
7035     {
7036         data(i, target_index) = data(i, target_index)*slope + intercept;
7037     }
7038 }
7039 
7040 
scale_target_mean_standard_deviation(const Descriptives & target_statistics,const Index & target_index)7041 void DataSet::scale_target_mean_standard_deviation(const Descriptives& target_statistics, const Index& target_index)
7042 {
7043     const type slope = std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3) ?
7044                 0 :
7045                 static_cast<type>(1)/target_statistics.standard_deviation;
7046 
7047     const type intercept = std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3) ?
7048                 0 :
7049                 -target_statistics.mean/target_statistics.standard_deviation;
7050 
7051     for(Index i = 0; i < data.dimension(0); i++)
7052     {
7053         data(i, target_index) = data(i, target_index)*slope + intercept;
7054     }
7055 }
7056 
7057 
scale_target_logarithmic(const Descriptives & target_statistics,const Index & target_index)7058 void DataSet::scale_target_logarithmic(const Descriptives& target_statistics, const Index& target_index)
7059 {
7060     for(Index i = 0; i < data.dimension(0); i++)
7061     {
7062         if(std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3))
7063         {
7064             data(i, target_index) = 0;
7065         }
7066         else
7067         {
7068             data(i, target_index) = static_cast<type>(0.5)*(exp(data(i,target_index)-1))*(target_statistics.maximum-target_statistics.minimum) + target_statistics.minimum;
7069         }
7070     }
7071 }
7072 
7073 
7074 /// It scales the input variables with that values.
7075 /// The method to be used is that in the scaling and unscaling method variable.
7076 
scale_target_variables(const Tensor<string,1> & scaling_unscaling_methods)7077 Tensor<Descriptives, 1> DataSet::scale_target_variables(const Tensor<string, 1>& scaling_unscaling_methods)
7078 {
7079     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
7080     const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
7081 
7082 //    Index column_index;
7083 
7084     for (Index i = 0; i < scaling_unscaling_methods.size(); i++)
7085     {
7086 //        column_index = get_column_index(target_variables_indices(i));
7087 
7088 //        if(columns(column_index).type == Binary || columns(column_index).type == Categorical) continue;
7089 
7090         switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
7091         {
7092         case NoUnscaling:
7093             break;
7094 
7095         case MinimumMaximum:
7096             scale_target_minimum_maximum(targets_descriptives(i), target_variables_indices(i));
7097             break;
7098 
7099         case MeanStandardDeviation:
7100             scale_target_mean_standard_deviation(targets_descriptives(i), target_variables_indices(i));
7101             break;
7102 
7103         case Logarithmic:
7104             scale_target_logarithmic(targets_descriptives(i), target_variables_indices(i));
7105             break;
7106 
7107         default:
7108         {
7109             ostringstream buffer;
7110 
7111             buffer << "OpenNN Exception: DataSet class\n"
7112                    << "void scale_target_variables(const string&, const Tensor<Descriptives, 1>&) method.\n"
7113                    << "Unknown scaling and unscaling method.\n";
7114 
7115             throw logic_error(buffer.str());
7116         }
7117         }
7118     }
7119     return targets_descriptives;
7120 }
7121 
7122 
7123 /// Unscales the given input variable with given minimum and maximum values.
7124 /// It updates the input variables of the data matrix.
7125 /// @param input_statistics vector with the descriptives of the input variable.
7126 /// @param input_index Index of the input to be scaled.
7127 
unscale_input_variable_minimum_maximum(const Descriptives & input_statistics,const Index & input_index)7128 void DataSet::unscale_input_variable_minimum_maximum(const Descriptives& input_statistics, const Index & input_index)
7129 {
7130     const type slope = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : (input_statistics.maximum-input_statistics.minimum)/(max_range-min_range);
7131 
7132     const type intercept = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : -(min_range*input_statistics.maximum-max_range*input_statistics.minimum)/(max_range-min_range);
7133 
7134     for(Index i = 0; i < data.dimension(0); i++)
7135     {
7136         data(i, input_index) = data(i, input_index)*slope + intercept;
7137     }
7138 }
7139 
7140 
7141 /// Uncales the given input variables with given mean and standard deviation values.
7142 /// It updates the input variable of the data matrix.
7143 /// @param input_statistics vector of descriptives structures for the input variables.
7144 /// @param input_index Index of the input to be scaled.
7145 
unscale_input_mean_standard_deviation(const Descriptives & input_statistics,const Index & input_index)7146 void DataSet::unscale_input_mean_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
7147 {
7148     const type slope = std::abs(input_statistics.mean - 0) < static_cast<type>(1e-3) ? 0 : input_statistics.standard_deviation/static_cast<type>(2);
7149 
7150     const type intercept = std::abs(input_statistics.mean-0) < static_cast<type>(1e-3) ? input_statistics.minimum : input_statistics.mean;
7151 
7152     for(Index i = 0; i < data.dimension(0); i++)
7153     {
7154         data(i, input_index) = data(i, input_index)*slope + intercept;
7155     }
7156 }
7157 
7158 
7159 /// Unscales the given input variables with given standard deviation values.
7160 /// It updates the input variable of the data matrix.
7161 /// @param inputs_statistics vector of descriptives structures for the input variables.
7162 /// @param input_index Index of the input to be scaled.
7163 
unscale_input_variable_standard_deviation(const Descriptives & input_statistics,const Index & input_index)7164 void DataSet::unscale_input_variable_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
7165 {
7166     const type slope = std::abs(input_statistics.mean-0) < static_cast<type>(1e-3) ? 0 : input_statistics.standard_deviation/static_cast<type>(2);
7167 
7168     const type intercept = std::abs(input_statistics.mean-0) < static_cast<type>(1e-3) ? input_statistics.minimum : 0;
7169 
7170     for(Index i = 0; i < data.dimension(0); i++)
7171     {
7172         data(i, input_index) = data(i, input_index)*slope + intercept;
7173     }
7174 }
7175 
7176 
7177 /// It unscales every input variable with the given method.
7178 /// The method to be used is that in the scaling and unscaling method variable.
7179 
unscale_input_variables(const Tensor<string,1> & scaling_unscaling_methods,const Tensor<Descriptives,1> & inputs_descriptives)7180 void DataSet::unscale_input_variables(const Tensor<string, 1>& scaling_unscaling_methods, const Tensor<Descriptives, 1>& inputs_descriptives)
7181 {
7182     const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
7183 
7184     for(Index i = 0; i < scaling_unscaling_methods.size(); i++)
7185     {
7186         switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
7187         {
7188         case NoScaling:
7189         {
7190             // Do nothing
7191         }
7192         break;
7193 
7194         case MinimumMaximum:
7195         {
7196             unscale_input_variable_minimum_maximum(inputs_descriptives(i), input_variables_indices(i));
7197         }
7198         break;
7199 
7200         case MeanStandardDeviation:
7201         {
7202             unscale_input_mean_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
7203         }
7204         break;
7205 
7206         case StandardDeviation:
7207         {
7208             unscale_input_variable_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
7209         }
7210         break;
7211 
7212         default:
7213         {
7214             ostringstream buffer;
7215 
7216             buffer << "OpenNN Exception: DataSet class\n"
7217                    << "void unscale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
7218                    << "Unknown unscaling and unscaling method: " << scaling_unscaling_methods(i) << "\n";
7219 
7220             throw logic_error(buffer.str());
7221         }
7222         }
7223     }
7224 }
7225 
7226 
unscale_target_minimum_maximum(const Descriptives & target_statistics,const Index & target_index)7227 void DataSet::unscale_target_minimum_maximum(const Descriptives& target_statistics, const Index& target_index)
7228 {
7229     const type slope = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : (target_statistics.maximum-target_statistics.minimum)/(max_range-min_range);
7230 
7231     const type intercept = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : -(min_range*target_statistics.maximum-max_range*target_statistics.minimum)/(max_range-min_range);
7232 
7233     for(Index i = 0; i < data.dimension(0); i++)
7234     {
7235         data(i, target_index) = data(i, target_index)*slope + intercept;
7236     }
7237 }
7238 
7239 
/// Undoes mean/standard-deviation scaling of one target column in the data matrix.
/// @param target_statistics Descriptives (mean/standard deviation) of the original target.
/// @param target_index Column index of the target variable in the data matrix.

void DataSet::unscale_target_mean_standard_deviation(const Descriptives& target_statistics, const Index& target_index)
{
    // Guard against a (near) zero standard deviation; the column then collapses to the mean.
    // NOTE(review): the slope divides the standard deviation by 2 — presumably this mirrors
    // a factor of 2 in the corresponding scaling method; confirm against the scale counterpart.
    const type slope = std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3) ?
                0 :
                target_statistics.standard_deviation/static_cast<type>(2);

    const type intercept = target_statistics.mean;

    // Apply the affine inverse row by row: value*slope + mean.
    for(Index i = 0; i < data.dimension(0); i++)
    {
        data(i, target_index) = data(i, target_index)*slope + intercept;
    }
}
7253 
7254 
/// Undoes logarithmic scaling of one target column in the data matrix.
/// @param target_statistics Descriptives (minimum/maximum) of the original target.
/// @param target_index Column index of the target variable in the data matrix.
/// NOTE(review): this applies log(), which looks like the forward (scaling)
/// transform rather than its inverse (exp-based); verify against the
/// corresponding scale_target_logarithmic method before relying on round trips.

void DataSet::unscale_target_logarithmic(const Descriptives& target_statistics, const Index& target_index)
{
    for(Index i = 0; i < data.dimension(0); i++)
    {
        // Degenerate statistics range: avoid division by ~0 by pinning to the minimum.
        if(std::abs(target_statistics.maximum - target_statistics.minimum) < static_cast<type>(1e-3))
        {
            data(i, target_index) = target_statistics.minimum;
        }
        else
        {
            data(i, target_index) = log(static_cast<type>(2)*(data(i,target_index)-target_statistics.minimum)/(target_statistics.maximum-target_statistics.minimum));
        }
    }
}
7269 
7270 
/// It unscales the target variables with those values.
/// The method to be used for each variable is given in the scaling and unscaling methods tensor.
7273 
unscale_target_variables(const Tensor<string,1> & scaling_unscaling_methods,const Tensor<Descriptives,1> & targets_descriptives)7274 void DataSet::unscale_target_variables(const Tensor<string, 1>& scaling_unscaling_methods, const Tensor<Descriptives, 1>& targets_descriptives)
7275 {
7276     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
7277 
7278     for (Index i = 0; i < scaling_unscaling_methods.size(); i++)
7279     {
7280         switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
7281         {
7282         case NoUnscaling:
7283             break;
7284 
7285         case MinimumMaximum:
7286             unscale_target_minimum_maximum(targets_descriptives(i), target_variables_indices(i));
7287             break;
7288 
7289         case MeanStandardDeviation:
7290             unscale_target_mean_standard_deviation(targets_descriptives(i), target_variables_indices(i));
7291             break;
7292 
7293         case Logarithmic:
7294             unscale_target_logarithmic(targets_descriptives(i), target_variables_indices(i));
7295             break;
7296 
7297         default:
7298         {
7299             ostringstream buffer;
7300 
7301             buffer << "OpenNN Exception: DataSet class\n"
7302                    << "void unscale_targets(const string&, const Tensor<Descriptives, 1>&) method.\n"
7303                    << "Unknown unscaling and unscaling method.\n";
7304 
7305             throw logic_error(buffer.str());
7306         }
7307         }
7308     }
7309 }
7310 
7311 
7312 
7313 
7314 /// Initializes the data matrix with a given value.
7315 /// @param new_value Initialization value.
7316 
initialize_data(const type & new_value)7317 void DataSet::initialize_data(const type& new_value)
7318 {
7319     data.setConstant(new_value);
7320 }
7321 
7322 
/// Initializes the data matrix with random values chosen from a uniform distribution.
7325 
/// Fills the whole data matrix with uniformly distributed random values
/// (via Eigen's setRandom; no range parameters are taken).
void DataSet::set_data_random()
{
    data.setRandom();
}
7330 
7331 
/// Initializes the data matrix with random values chosen from a uniform distribution.
/// The target variables are then overwritten with random binary (0/1) values.
7334 
set_data_binary_random()7335 void DataSet::set_data_binary_random()
7336 {
7337     data.setRandom();
7338 
7339     const Index samples_number = data.dimension(0);
7340     const Index variables_number = data.dimension(1);
7341 
7342     const Index input_variables_number = get_input_variables_number();
7343 
7344     for(Index i = 0; i < samples_number; i++)
7345     {
7346         for(Index j = input_variables_number; j < variables_number; j++)
7347         {
7348             data(i,j) = (1+static_cast<type>(pow((-1),rand())))/2;
7349         }
7350     }
7351 }
7352 
7353 
/// Sets the minimum and maximum of the scaling range used by minimum-maximum scaling.
/// @param min Lower bound of the scaling range.
/// @param max Upper bound of the scaling range.
7356 
set_min_max_range(const type min,const type max)7357 void DataSet::set_min_max_range(const type min, const type max)
7358 {
7359     min_range = min;
7360     max_range = max;
7361 }
7362 
/// Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree in memory.
7364 
write_XML(tinyxml2::XMLPrinter & file_stream) const7365 void DataSet::write_XML(tinyxml2::XMLPrinter& file_stream) const
7366 {
7367     ostringstream buffer;
7368 
7369     file_stream.OpenElement("DataSet");
7370 
7371     // Data file
7372 
7373     file_stream.OpenElement("DataFile");
7374 
7375     // File type ?
7376 
7377     {
7378         file_stream.OpenElement("FileType");
7379 
7380         file_stream.PushText("csv");
7381 
7382         file_stream.CloseElement();
7383     }
7384 
7385     // Data file name
7386     {
7387         file_stream.OpenElement("DataFileName");
7388 
7389         file_stream.PushText(data_file_name.c_str());
7390 
7391         file_stream.CloseElement();
7392     }
7393 
7394     // Separator
7395     {
7396         file_stream.OpenElement("Separator");
7397 
7398         file_stream.PushText(get_separator_string().c_str());
7399 
7400         file_stream.CloseElement();
7401     }
7402 
7403     // Columns names
7404     {
7405         file_stream.OpenElement("ColumnsNames");
7406 
7407         buffer.str("");
7408         buffer << has_columns_names;
7409 
7410         file_stream.PushText(buffer.str().c_str());
7411 
7412         file_stream.CloseElement();
7413     }
7414 
7415     // Rows labels
7416     {
7417         file_stream.OpenElement("RowsLabels");
7418 
7419         buffer.str("");
7420         buffer << has_rows_labels;
7421 
7422         file_stream.PushText(buffer.str().c_str());
7423 
7424         file_stream.CloseElement();
7425     }
7426 
7427     // Missing values label
7428     {
7429         file_stream.OpenElement("MissingValuesLabel");
7430 
7431         file_stream.PushText(missing_values_label.c_str());
7432 
7433         file_stream.CloseElement();
7434     }
7435 
7436     // Lags number
7437     {
7438         file_stream.OpenElement("LagsNumber");
7439 
7440         buffer.str("");
7441         buffer << get_lags_number();
7442 
7443         file_stream.PushText(buffer.str().c_str());
7444 
7445         file_stream.CloseElement();
7446     }
7447 
7448     // Steps Ahead
7449     {
7450         file_stream.OpenElement("StepsAhead");
7451 
7452         buffer.str("");
7453         buffer << get_steps_ahead();
7454 
7455         file_stream.PushText(buffer.str().c_str());
7456 
7457         file_stream.CloseElement();
7458     }
7459 
7460     // Time Index
7461     {
7462         file_stream.OpenElement("TimeIndex");
7463 
7464         buffer.str("");
7465         buffer << get_time_index();
7466 
7467         file_stream.PushText(buffer.str().c_str());
7468 
7469         file_stream.CloseElement();
7470     }
7471     // Close DataFile
7472 
7473     file_stream.CloseElement();
7474 
7475     // Columns
7476 
7477     file_stream.OpenElement("Columns");
7478 
7479     // Columns number
7480     {
7481         file_stream.OpenElement("ColumnsNumber");
7482 
7483         buffer.str("");
7484         buffer << get_columns_number();
7485 
7486         file_stream.PushText(buffer.str().c_str());
7487 
7488         file_stream.CloseElement();
7489     }
7490 
7491     // Columns items
7492 
7493     {
7494         const Index columns_number = get_columns_number();
7495 
7496         for(Index i = 0; i < columns_number; i++)
7497         {
7498             file_stream.OpenElement("Column");
7499 
7500             file_stream.PushAttribute("Item", to_string(i+1).c_str());
7501 
7502             columns(i).write_XML(file_stream);
7503 
7504             file_stream.CloseElement();
7505         }
7506     }
7507 
7508     // Close columns
7509 
7510     file_stream.CloseElement();
7511 
7512     // Rows labels
7513 
7514     if(has_rows_labels)
7515     {
7516         const Index rows_labels_number = rows_labels.dimension(0);
7517 
7518         file_stream.OpenElement("RowsLabels");
7519 
7520         buffer.str("");
7521 
7522         for(Index i = 0; i < rows_labels_number; i++)
7523         {
7524             buffer << rows_labels(i);
7525 
7526             if(i != rows_labels_number-1) buffer << ",";
7527         }
7528 
7529         file_stream.PushText(buffer.str().c_str());
7530 
7531         file_stream.CloseElement();
7532     }
7533 
7534     // Samples
7535 
7536     file_stream.OpenElement("Samples");
7537 
7538     // Samples number
7539     {
7540         file_stream.OpenElement("SamplesNumber");
7541 
7542         buffer.str("");
7543         buffer << get_samples_number();
7544 
7545         file_stream.PushText(buffer.str().c_str());
7546 
7547         file_stream.CloseElement();
7548     }
7549 
7550     // Samples uses
7551 
7552     {
7553         file_stream.OpenElement("SamplesUses");
7554 
7555         buffer.str("");
7556 
7557         const Index samples_number = get_samples_number();
7558 
7559         for(Index i = 0; i < samples_number; i++)
7560         {
7561             buffer << samples_uses(i);
7562 
7563             if(i < (samples_number-1)) buffer << " ";
7564         }
7565 
7566         file_stream.PushText(buffer.str().c_str());
7567 
7568         file_stream.CloseElement();
7569     }
7570 
7571     // Close samples
7572 
7573     file_stream.CloseElement();
7574 
7575     // Missing values
7576 
7577     file_stream.OpenElement("MissingValues");
7578 
7579     // Missing values method
7580 
7581     {
7582         file_stream.OpenElement("MissingValuesMethod");
7583 
7584         if(missing_values_method == Mean)
7585         {
7586             file_stream.PushText("Mean");
7587         }
7588         else if(missing_values_method == Median)
7589         {
7590             file_stream.PushText("Median");
7591         }
7592         else
7593         {
7594             file_stream.PushText("Unuse");
7595         }
7596 
7597         file_stream.CloseElement();
7598     }
7599 
7600     // Missing values number
7601 
7602     {
7603         file_stream.OpenElement("MissingValuesNumber");
7604 
7605         buffer.str("");
7606         buffer << missing_values_number;
7607 
7608         file_stream.PushText(buffer.str().c_str());
7609 
7610         file_stream.CloseElement();
7611     }
7612 
7613     if(missing_values_number > 0)
7614     {
7615         // Columns missing values number
7616 
7617         {
7618             file_stream.OpenElement("ColumnsMissingValuesNumber");
7619 
7620             cout << "count nan columns" << endl;
7621             const Index columns_number = columns_missing_values_number.size();
7622 
7623             buffer.str("");
7624 
7625             for (Index i = 0; i < columns_number; i++)
7626             {
7627                 buffer << columns_missing_values_number(i);
7628 
7629                 if(i != (columns_number-1)) buffer << " ";
7630             }
7631 
7632             file_stream.PushText(buffer.str().c_str());
7633 
7634             file_stream.CloseElement();
7635         }
7636 
7637         // Rows missing values number
7638 
7639         {
7640             file_stream.OpenElement("RowsMissingValuesNumber");
7641 
7642             buffer.str("");
7643             buffer << rows_missing_values_number;
7644 
7645             file_stream.PushText(buffer.str().c_str());
7646 
7647             file_stream.CloseElement();
7648         }
7649     }
7650 
7651     // Missing values
7652 
7653     file_stream.CloseElement();
7654 
7655     // Preview data
7656 
7657     file_stream.OpenElement("PreviewData");
7658 
7659     file_stream.OpenElement("PreviewSize");
7660 
7661     buffer.str("");
7662     buffer << data_file_preview.size();
7663 
7664     file_stream.PushText(buffer.str().c_str());
7665 
7666     file_stream.CloseElement();
7667 
7668     for(Index i = 0; i < data_file_preview.size(); i++)
7669     {
7670         file_stream.OpenElement("Row");
7671 
7672         file_stream.PushAttribute("Item", to_string(i+1).c_str());
7673 
7674         for(Index j = 0; j < data_file_preview(i).size(); j++)
7675         {
7676             file_stream.PushText(data_file_preview(i)(j).c_str());
7677 
7678             if(j != data_file_preview(i).size()-1)
7679             {
7680                 file_stream.PushText(",");
7681             }
7682         }
7683 
7684         file_stream.CloseElement();
7685     }
7686 
7687     // Close preview data
7688 
7689     file_stream.CloseElement();
7690 
7691     // Close data set
7692 
7693     file_stream.CloseElement();
7694 }
7695 
7696 
from_XML(const tinyxml2::XMLDocument & data_set_document)7697 void DataSet::from_XML(const tinyxml2::XMLDocument& data_set_document)
7698 {
7699     ostringstream buffer;
7700 
7701     // Data set element
7702 
7703     const tinyxml2::XMLElement* data_set_element = data_set_document.FirstChildElement("DataSet");
7704 
7705     if(!data_set_element)
7706     {
7707         buffer << "OpenNN Exception: DataSet class.\n"
7708                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7709                << "Data set element is nullptr.\n";
7710 
7711         throw logic_error(buffer.str());
7712     }
7713 
7714     // Data file
7715 
7716     const tinyxml2::XMLElement* data_file_element = data_set_element->FirstChildElement("DataFile");
7717 
7718     if(!data_file_element)
7719     {
7720         buffer << "OpenNN Exception: DataSet class.\n"
7721                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7722                << "Data file element is nullptr.\n";
7723 
7724         throw logic_error(buffer.str());
7725     }
7726 
7727     // Data file name
7728 
7729     const tinyxml2::XMLElement* data_file_name_element = data_file_element->FirstChildElement("DataFileName");
7730 
7731     if(!data_file_name_element)
7732     {
7733         buffer << "OpenNN Exception: DataSet class.\n"
7734                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7735                << "DataFileName element is nullptr.\n";
7736 
7737         throw logic_error(buffer.str());
7738     }
7739 
7740     if(data_file_name_element->GetText())
7741     {
7742         const string new_data_file_name = data_file_name_element->GetText();
7743 
7744         set_data_file_name(new_data_file_name);
7745     }
7746 
7747     // Separator
7748 
7749     const tinyxml2::XMLElement* separator_element = data_file_element->FirstChildElement("Separator");
7750 
7751     if(separator_element)
7752     {
7753         if(separator_element->GetText())
7754         {
7755             const string new_separator = separator_element->GetText();
7756 
7757             set_separator(new_separator);
7758         }
7759         else
7760         {
7761             set_separator("Comma");
7762         }
7763     }
7764     else
7765     {
7766         set_separator("Comma");
7767     }
7768 
7769     // Has columns names
7770 
7771     const tinyxml2::XMLElement* columns_names_element = data_file_element->FirstChildElement("ColumnsNames");
7772 
7773     if(columns_names_element)
7774     {
7775         const string new_columns_names_string = columns_names_element->GetText();
7776 
7777         try
7778         {
7779             set_has_columns_names(new_columns_names_string == "1");
7780         }
7781         catch(const logic_error& e)
7782         {
7783             cerr << e.what() << endl;
7784         }
7785     }
7786 
7787     // Rows labels
7788 
7789     const tinyxml2::XMLElement* rows_label_element = data_file_element->FirstChildElement("RowsLabels");
7790 
7791     if(rows_label_element)
7792     {
7793         const string new_rows_label_string = rows_label_element->GetText();
7794 
7795         try
7796         {
7797             set_has_rows_label(new_rows_label_string == "1");
7798         }
7799         catch(const logic_error& e)
7800         {
7801             cerr << e.what() << endl;
7802         }
7803     }
7804 
7805     // Missing values label
7806 
7807     const tinyxml2::XMLElement* missing_values_label_element = data_file_element->FirstChildElement("MissingValuesLabel");
7808 
7809     if(missing_values_label_element)
7810     {
7811         if(missing_values_label_element->GetText())
7812         {
7813             const string new_missing_values_label = missing_values_label_element->GetText();
7814 
7815             set_missing_values_label(new_missing_values_label);
7816         }
7817         else
7818         {
7819             set_missing_values_label("NA");
7820         }
7821     }
7822     else
7823     {
7824         set_missing_values_label("NA");
7825     }
7826 
7827     // Forecasting
7828 
7829     // Lags number
7830 
7831     const tinyxml2::XMLElement* lags_number_element = data_file_element->FirstChildElement("LagsNumber");
7832 
7833     if(!lags_number_element)
7834     {
7835         buffer << "OpenNN Exception: DataSet class.\n"
7836                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7837                << "Lags number element is nullptr.\n";
7838 
7839         throw logic_error(buffer.str());
7840     }
7841 
7842     if(lags_number_element->GetText())
7843     {
7844         const Index new_lags_number = static_cast<Index>(atoi(lags_number_element->GetText()));
7845 
7846         set_lags_number(new_lags_number);
7847     }
7848 
7849     // Steps ahead
7850 
7851     const tinyxml2::XMLElement* steps_ahead_element = data_file_element->FirstChildElement("StepsAhead");
7852 
7853     if(!steps_ahead_element)
7854     {
7855         buffer << "OpenNN Exception: DataSet class.\n"
7856                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7857                << "Steps ahead element is nullptr.\n";
7858 
7859         throw logic_error(buffer.str());
7860     }
7861 
7862     if(steps_ahead_element->GetText())
7863     {
7864         const Index new_steps_ahead = static_cast<Index>(atoi(steps_ahead_element->GetText()));
7865 
7866         set_steps_ahead_number(new_steps_ahead);
7867     }
7868 
7869     // Time index
7870 
7871     const tinyxml2::XMLElement* time_index_element = data_file_element->FirstChildElement("TimeIndex");
7872 
7873     if(!time_index_element)
7874     {
7875         buffer << "OpenNN Exception: DataSet class.\n"
7876                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7877                << "Time index element is nullptr.\n";
7878 
7879         throw logic_error(buffer.str());
7880     }
7881 
7882     if(time_index_element->GetText())
7883     {
7884         const Index new_time_index = static_cast<Index>(atoi(time_index_element->GetText()));
7885 
7886         set_time_index(new_time_index);
7887     }
7888 
7889     // Columns
7890 
7891     const tinyxml2::XMLElement* columns_element = data_set_element->FirstChildElement("Columns");
7892 
7893     if(!columns_element)
7894     {
7895         buffer << "OpenNN Exception: DataSet class.\n"
7896                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7897                << "Columns element is nullptr.\n";
7898 
7899         throw logic_error(buffer.str());
7900     }
7901 
7902     // Columns number
7903 
7904     const tinyxml2::XMLElement* columns_number_element = columns_element->FirstChildElement("ColumnsNumber");
7905 
7906     if(!columns_number_element)
7907     {
7908         buffer << "OpenNN Exception: DataSet class.\n"
7909                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7910                << "Columns number element is nullptr.\n";
7911 
7912         throw logic_error(buffer.str());
7913     }
7914 
7915     Index new_columns_number = 0;
7916 
7917     if(columns_number_element->GetText())
7918     {
7919         new_columns_number = static_cast<Index>(atoi(columns_number_element->GetText()));
7920 
7921         set_columns_number(new_columns_number);
7922     }
7923 
7924     // Columns
7925 
7926     const tinyxml2::XMLElement* start_element = columns_number_element;
7927 
7928     if(new_columns_number > 0)
7929     {
7930         for(Index i = 0; i < new_columns_number; i++)
7931         {
7932             const tinyxml2::XMLElement* column_element = start_element->NextSiblingElement("Column");
7933             start_element = column_element;
7934 
7935             if(column_element->Attribute("Item") != std::to_string(i+1))
7936             {
7937                 buffer << "OpenNN Exception: DataSet class.\n"
7938                        << "void DataSet:from_XML(const tinyxml2::XMLDocument&) method.\n"
7939                        << "Column item number (" << i+1 << ") does not match (" << column_element->Attribute("Item") << ").\n";
7940 
7941                 throw logic_error(buffer.str());
7942             }
7943 
7944             // Name
7945 
7946             const tinyxml2::XMLElement* name_element = column_element->FirstChildElement("Name");
7947 
7948             if(!name_element)
7949             {
7950                 buffer << "OpenNN Exception: DataSet class.\n"
7951                        << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7952                        << "Name element is nullptr.\n";
7953 
7954                 throw logic_error(buffer.str());
7955             }
7956 
7957             if(name_element->GetText())
7958             {
7959                 const string new_name = name_element->GetText();
7960 
7961                 columns(i).name = new_name;
7962             }
7963 
7964             // Column use
7965 
7966             const tinyxml2::XMLElement* column_use_element = column_element->FirstChildElement("ColumnUse");
7967 
7968             if(!column_use_element)
7969             {
7970                 buffer << "OpenNN Exception: DataSet class.\n"
7971                        << "void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7972                        << "Column use element is nullptr.\n";
7973 
7974                 throw logic_error(buffer.str());
7975             }
7976 
7977             if(column_use_element->GetText())
7978             {
7979                 const string new_column_use = column_use_element->GetText();
7980 
7981                 columns(i).set_use(new_column_use);
7982             }
7983 
7984             // Type
7985 
7986             const tinyxml2::XMLElement* type_element = column_element->FirstChildElement("Type");
7987 
7988             if(!type_element)
7989             {
7990                 buffer << "OpenNN Exception: DataSet class.\n"
7991                        << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7992                        << "Type element is nullptr.\n";
7993 
7994                 throw logic_error(buffer.str());
7995             }
7996 
7997             if(type_element->GetText())
7998             {
7999                 const string new_type = type_element->GetText();
8000                 columns(i).set_type(new_type);
8001             }
8002 
8003             if(columns(i).type == Categorical || columns(i).type == Binary)
8004             {
8005                 // Categories
8006 
8007                 const tinyxml2::XMLElement* categories_element = column_element->FirstChildElement("Categories");
8008 
8009                 if(!categories_element)
8010                 {
8011                     buffer << "OpenNN Exception: DataSet class.\n"
8012                            << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
8013                            << "Categories element is nullptr.\n";
8014 
8015                     throw logic_error(buffer.str());
8016                 }
8017 
8018                 if(categories_element->GetText())
8019                 {
8020                     const string new_categories = categories_element->GetText();
8021 
8022                     columns(i).categories = get_tokens(new_categories, ';');
8023                 }
8024 
8025                 // Categories uses
8026 
8027                 const tinyxml2::XMLElement* categories_uses_element = column_element->FirstChildElement("CategoriesUses");
8028 
8029                 if(!categories_uses_element)
8030                 {
8031                     buffer << "OpenNN Exception: DataSet class.\n"
8032                            << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
8033                            << "Categories uses element is nullptr.\n";
8034 
8035                     throw logic_error(buffer.str());
8036                 }
8037 
8038                 if(categories_uses_element->GetText())
8039                 {
8040                     const string new_categories_uses = categories_uses_element->GetText();
8041 
8042                     columns(i).set_categories_uses(get_tokens(new_categories_uses, ';'));
8043                 }
8044             }
8045         }
8046     }
8047 
8048     // Rows label
8049 
8050     if(has_rows_labels)
8051     {
8052         // Rows labels begin tag
8053 
8054         const tinyxml2::XMLElement* rows_labels_element = data_set_element->FirstChildElement("RowsLabels");
8055 
8056         if(!rows_labels_element)
8057         {
8058             buffer << "OpenNN Exception: DataSet class.\n"
8059                    << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8060                    << "Rows labels element is nullptr.\n";
8061 
8062             throw logic_error(buffer.str());
8063         }
8064 
8065         // Rows labels
8066 
8067         if(rows_labels_element->GetText())
8068         {
8069             const string new_rows_labels = rows_labels_element->GetText();
8070 
8071             rows_labels = get_tokens(new_rows_labels, ',');
8072         }
8073 
8074 
8075     }
8076 
8077     // Samples
8078 
8079     const tinyxml2::XMLElement* samples_element = data_set_element->FirstChildElement("Samples");
8080 
8081     if(!samples_element)
8082     {
8083         buffer << "OpenNN Exception: DataSet class.\n"
8084                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8085                << "Samples element is nullptr.\n";
8086 
8087         throw logic_error(buffer.str());
8088     }
8089 
8090     // Samples number
8091 
8092     const tinyxml2::XMLElement* samples_number_element = samples_element->FirstChildElement("SamplesNumber");
8093 
8094     if(!samples_number_element)
8095     {
8096         buffer << "OpenNN Exception: DataSet class.\n"
8097                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8098                << "Samples number element is nullptr.\n";
8099 
8100         throw logic_error(buffer.str());
8101     }
8102 
8103     if(samples_number_element->GetText())
8104     {
8105         const Index new_samples_number = static_cast<Index>(atoi(samples_number_element->GetText()));
8106 
8107         samples_uses.resize(new_samples_number);
8108     }
8109 
8110     // Samples uses
8111 
8112     const tinyxml2::XMLElement* samples_uses_element = samples_element->FirstChildElement("SamplesUses");
8113 
8114     if(!samples_uses_element)
8115     {
8116         buffer << "OpenNN Exception: DataSet class.\n"
8117                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8118                << "Samples uses element is nullptr.\n";
8119 
8120         throw logic_error(buffer.str());
8121     }
8122 
8123     if(samples_uses_element->GetText())
8124     {
8125         set_samples_uses(get_tokens(samples_uses_element->GetText(), ' '));
8126     }
8127 
8128     // Missing values
8129 
8130     const tinyxml2::XMLElement* missing_values_element = data_set_element->FirstChildElement("MissingValues");
8131 
8132     if(!missing_values_element)
8133     {
8134         buffer << "OpenNN Exception: DataSet class.\n"
8135                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8136                << "Missing values element is nullptr.\n";
8137 
8138         throw logic_error(buffer.str());
8139     }
8140 
8141     // Missing values method
8142 
8143     const tinyxml2::XMLElement* missing_values_method_element = missing_values_element->FirstChildElement("MissingValuesMethod");
8144 
8145     if(!missing_values_method_element)
8146     {
8147         buffer << "OpenNN Exception: DataSet class.\n"
8148                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8149                << "Missing values method element is nullptr.\n";
8150 
8151         throw logic_error(buffer.str());
8152     }
8153 
8154     if(missing_values_method_element->GetText())
8155     {
8156         set_missing_values_method(missing_values_method_element->GetText());
8157     }
8158 
8159     // Missing values number
8160 
8161     const tinyxml2::XMLElement* missing_values_number_element = missing_values_element->FirstChildElement("MissingValuesNumber");
8162 
8163     if(!missing_values_number_element)
8164     {
8165         buffer << "OpenNN Exception: DataSet class.\n"
8166                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8167                << "Missing values number element is nullptr.\n";
8168 
8169         throw logic_error(buffer.str());
8170     }
8171 
8172     if(missing_values_number_element->GetText())
8173     {
8174         missing_values_number = static_cast<Index>(atoi(missing_values_number_element->GetText()));
8175     }
8176 
8177     if(missing_values_number > 0)
8178     {
8179         // Columns Missing values number
8180 
8181         const tinyxml2::XMLElement* columns_missing_values_number_element = missing_values_element->FirstChildElement("ColumnsMissingValuesNumber");
8182 
8183         if(!columns_missing_values_number_element)
8184         {
8185             buffer << "OpenNN Exception: DataSet class.\n"
8186                    << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8187                    << "Columns missing values number element is nullptr.\n";
8188 
8189             throw logic_error(buffer.str());
8190         }
8191 
8192         if(columns_missing_values_number_element->GetText())
8193         {
8194             Tensor<string, 1> new_columns_missing_values_number = get_tokens(columns_missing_values_number_element->GetText(), ' ');
8195 
8196             columns_missing_values_number.resize(new_columns_missing_values_number.size());
8197 
8198             for(Index i = 0; i < new_columns_missing_values_number.size(); i++)
8199             {
8200                 columns_missing_values_number(i) = atoi(new_columns_missing_values_number(i).c_str());
8201             }
8202         }
8203 
8204         // Rows missing values number
8205 
8206         const tinyxml2::XMLElement* rows_missing_values_number_element = missing_values_element->FirstChildElement("RowsMissingValuesNumber");
8207 
8208         if(!rows_missing_values_number_element)
8209         {
8210             buffer << "OpenNN Exception: DataSet class.\n"
8211                    << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8212                    << "Rows missing values number element is nullptr.\n";
8213 
8214             throw logic_error(buffer.str());
8215         }
8216 
8217         if(rows_missing_values_number_element->GetText())
8218         {
8219             rows_missing_values_number = static_cast<Index>(atoi(rows_missing_values_number_element->GetText()));
8220         }
8221     }
8222 
8223     // Preview data
8224 
8225     const tinyxml2::XMLElement* preview_data_element = data_set_element->FirstChildElement("PreviewData");
8226 
8227     if(!preview_data_element)
8228     {
8229         buffer << "OpenNN Exception: DataSet class.\n"
8230                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8231                << "Preview data element is nullptr.\n";
8232 
8233         throw logic_error(buffer.str());
8234     }
8235 
8236     // Preview size
8237 
8238     const tinyxml2::XMLElement* preview_size_element = preview_data_element->FirstChildElement("PreviewSize");
8239 
8240     if(!preview_size_element)
8241     {
8242         buffer << "OpenNN Exception: DataSet class.\n"
8243                << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8244                << "Preview size element is nullptr.\n";
8245 
8246         throw logic_error(buffer.str());
8247     }
8248 
8249     Index new_preview_size = 0;
8250 
8251     if(preview_size_element->GetText())
8252     {
8253         new_preview_size = static_cast<Index>(atoi(preview_size_element->GetText()));
8254 
8255         if(new_preview_size > 0) data_file_preview.resize(new_preview_size);
8256     }
8257 
8258     // Preview data
8259 
8260     start_element = preview_size_element;
8261 
8262     for(Index i = 0; i < new_preview_size; i++)
8263     {
8264         const tinyxml2::XMLElement* row_element = start_element->NextSiblingElement("Row");
8265         start_element = row_element;
8266 
8267         if(row_element->Attribute("Item") != std::to_string(i+1))
8268         {
8269             buffer << "OpenNN Exception: DataSet class.\n"
8270                    << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8271                    << "Row item number (" << i+1 << ") does not match (" << row_element->Attribute("Item") << ").\n";
8272 
8273             throw logic_error(buffer.str());
8274         }
8275 
8276         if(row_element->GetText())
8277         {
8278             data_file_preview(i) = get_tokens(row_element->GetText(), ',');
8279         }
8280     }
8281 
8282     // Display
8283 
8284     const tinyxml2::XMLElement* display_element = data_set_element->FirstChildElement("Display");
8285 
8286     if(display_element)
8287     {
8288         const string new_display_string = display_element->GetText();
8289 
8290         try
8291         {
8292             set_display(new_display_string != "0");
8293         }
8294         catch(const logic_error& e)
8295         {
8296             cerr << e.what() << endl;
8297         }
8298     }
8299 }
8300 
8301 
8302 /// Prints to the screen in text format the main numbers from the data set object.
8303 
print_summary() const8304 void DataSet::print_summary() const
8305 {
8306     if(display)
8307     {
8308         const Index variables_number = get_variables_number();
8309         const Index samples_number = get_samples_number();
8310 
8311         cout << "Data set object summary:\n"
8312              << "Number of variables: " << variables_number << "\n"
8313              << "Number of samples: " << samples_number << "\n";
8314     }
8315 }
8316 
8317 
8318 /// Saves the members of a data set object to a XML-type file in an XML-type format.
8319 /// @param file_name Name of data set XML-type file.
8320 ///
8321 /// @todo
8322 
save(const string & file_name) const8323 void DataSet::save(const string& file_name) const
8324 {
8325     FILE *pFile;
8326 //    int err;
8327 
8328 //    err = fopen_s(&pFile, file_name.c_str(), "w");
8329     pFile = fopen(file_name.c_str(), "w");
8330 
8331     tinyxml2::XMLPrinter document(pFile);
8332 
8333     write_XML(document);
8334 
8335     fclose(pFile);
8336 }
8337 
8338 
8339 /// Loads the members of a data set object from a XML-type file:
8340 /// <ul>
8341 /// <li> Samples number.
8342 /// <li> Training samples number.
8343 /// <li> Training samples indices.
8344 /// <li> Selection samples number.
8345 /// <li> Selection samples indices.
8346 /// <li> Testing samples number.
8347 /// <li> Testing samples indices.
8348 /// <li> Input variables number.
8349 /// <li> Input variables indices.
8350 /// <li> Target variables number.
8351 /// <li> Target variables indices.
8352 /// <li> Input variables name.
8353 /// <li> Target variables name.
8354 /// <li> Input variables description.
8355 /// <li> Target variables description.
8356 /// <li> Display.
8357 /// <li> Data.
8358 /// </ul>
8359 /// Please mind about the file format. This is specified in the User's Guide.
8360 /// @param file_name Name of data set XML-type file.
8361 
load(const string & file_name)8362 void DataSet::load(const string& file_name)
8363 {
8364     tinyxml2::XMLDocument document;
8365 
8366     if(document.LoadFile(file_name.c_str()))
8367     {
8368         ostringstream buffer;
8369 
8370         buffer << "OpenNN Exception: DataSet class.\n"
8371                << "void load(const string&) method.\n"
8372                << "Cannot load XML file " << file_name << ".\n";
8373 
8374         throw logic_error(buffer.str());
8375     }
8376 
8377     from_XML(document);
8378 }
8379 
8380 
print_columns_types() const8381 void DataSet::print_columns_types() const
8382 {
8383     const Index columns_number = get_columns_number();
8384 
8385     for(Index i = 0; i < columns_number; i++)
8386     {
8387         if(columns(i).type == Numeric) cout << "Numeric ";
8388         else if(columns(i).type == Binary) cout << "Binary ";
8389         else if(columns(i).type == Categorical) cout << "Categorical ";
8390         else if(columns(i).type == DateTime) cout << "DateTime ";
8391         else if(columns(i).type == Constant) cout << "Constant ";
8392 
8393     }
8394 
8395     cout << endl;
8396 }
8397 
8398 
8399 /// Prints to the screen the values of the data matrix.
8400 
print_data() const8401 void DataSet::print_data() const
8402 {
8403     if(display) cout << data << endl;
8404 }
8405 
8406 
/// Prints to the screen a preview of the data matrix,
/// i.e., the first, second and last samples.
8409 
print_data_preview() const8410 void DataSet::print_data_preview() const
8411 {
8412     if(!display) return;
8413 
8414     const Index samples_number = get_samples_number();
8415 
8416     if(samples_number > 0)
8417     {
8418         const Tensor<type, 1> first_sample = data.chip(0, 0);
8419 
8420         cout << "First sample:  \n";
8421 
8422         for(int i = 0; i< first_sample.dimension(0); i++)
8423         {
8424             cout  << first_sample(i) << "  ";
8425         }
8426 
8427     cout << endl;
8428     }
8429 
8430     if(samples_number > 1)
8431     {
8432         const Tensor<type, 1> second_sample = data.chip(1, 0);
8433 
8434         cout << "Second sample:  \n";
8435 
8436         for(int i = 0; i< second_sample.dimension(0); i++)
8437         {
8438 
8439             cout  << second_sample(i) << "  ";
8440         }
8441 
8442         cout << endl;
8443     }
8444 
8445 
8446     if(samples_number > 2)
8447     {
8448         const Tensor<type, 1> last_sample = data.chip(samples_number-1, 0);
8449 
8450         cout << "Last sample:  \n";
8451 
8452         for(int i = 0; i< last_sample.dimension(0); i++)
8453         {
8454 
8455             cout  << last_sample(i) << "  ";
8456         }
8457 
8458         cout << endl;
8459     }
8460 }
8461 
8462 
8463 /// Saves to the data file the values of the data matrix.
8464 
save_data() const8465 void DataSet::save_data() const
8466 {
8467     ofstream file(data_file_name.c_str());
8468 
8469     if(!file.is_open())
8470     {
8471       ostringstream buffer;
8472 
8473       buffer << "OpenNN Exception: Matrix template." << endl
8474              << "void save_csv(const string&, const char&, const Vector<string>&, const Vector<string>&) method." << endl
8475              << "Cannot open matrix data file: " << data_file_name << endl;
8476 
8477       throw logic_error(buffer.str());
8478     }
8479 
8480     file.precision(20);
8481 
8482     const Index samples_number = get_samples_number();
8483     const Index variables_number = get_variables_number();
8484 
8485     const Tensor<string, 1> variables_names = get_variables_names();
8486 
8487     char separator_char = ',';//get_separator_char();
8488 
8489     if(this->has_rows_labels)
8490     {
8491         file << "id" << separator_char;
8492     }
8493     for(Index j = 0; j < variables_number; j++)
8494     {
8495         file << variables_names[j];
8496 
8497         if(j != variables_number-1)
8498         {
8499             file << separator_char;
8500         }
8501     }
8502 
8503     file << endl;
8504 
8505     for(Index i = 0; i < samples_number; i++)
8506     {
8507         if(this->has_rows_labels)
8508         {
8509             file << rows_labels(i) << separator_char;
8510         }
8511        for(Index j = 0; j < variables_number; j++)
8512        {
8513            file << data(i,j);
8514 
8515            if(j != variables_number-1)
8516            {
8517                file << separator_char;
8518            }
8519        }
8520 
8521        file << endl;
8522     }
8523 
8524     file.close();
8525 }
8526 
8527 
8528 /// Saves to the data file the values of the data matrix in binary format.
8529 
save_data_binary(const string & binary_data_file_name) const8530 void DataSet::save_data_binary(const string& binary_data_file_name) const
8531 {
8532     ofstream file(binary_data_file_name.c_str(), ios::binary);
8533 
8534     if(!file.is_open())
8535     {
8536         ostringstream buffer;
8537 
8538         buffer << "OpenNN Exception: DataSet template." << endl
8539                << "void save_data_binary(const string) method." << endl
8540                << "Cannot open data binary file." << endl;
8541 
8542         throw logic_error(buffer.str());
8543     }
8544 
8545     // Write data
8546 
8547     streamsize size = sizeof(Index);
8548 
8549     Index columns_number = data.dimension(1);
8550     Index rows_number = data.dimension(0);
8551 
8552     cout << "Rows number: " << rows_number << endl;
8553     cout << "Columns number: " << columns_number << endl;
8554 
8555     cout << "Saving binary data file..." << endl;
8556 
8557     file.write(reinterpret_cast<char*>(&columns_number), size);
8558     file.write(reinterpret_cast<char*>(&rows_number), size);
8559 
8560     size = sizeof(type);
8561 
8562     type value;
8563 
8564     for(int i = 0; i < columns_number; i++)
8565     {
8566         for(int j = 0; j < rows_number; j++)
8567         {
8568             value = data(j,i);
8569 
8570             file.write(reinterpret_cast<char*>(&value), size);
8571         }
8572     }
8573 
8574     file.close();
8575 
8576 
8577 /*
8578     file.write(reinterpret_cast<char*>(&columns_number), size);
8579     file.write(reinterpret_cast<char*>(&rows_number), size);
8580 
8581     size = sizeof(type);
8582 
8583     type value;
8584 
8585     for(int i = 0; i < columns_number*rows_number; i++)
8586     {
8587 //        for(int j = 0; j < rows_number; j++)
8588 //        {
8589             value = data(i);
8590 
8591             file.write(reinterpret_cast<char*>(&value), size);
8592 //        }
8593     }
8594 
8595     file.close();
8596 */
8597 
8598 
8599     cout << "Binary data file saved." << endl;
8600 }
8601 
8602 
8603 /// Arranges an input-target DataSet from a time series matrix, according to the number of lags.
8604 
transform_time_series()8605 void DataSet::transform_time_series()
8606 {
8607     if(lags_number == 0) return;
8608 
8609     const Index variables_number = get_variables_number();
8610     const Index samples_number = get_samples_number();
8611 
8612     time_series_data = data;
8613 
8614     time_series_columns = columns;
8615 
8616     transform_time_series_columns();
8617 
8618     const Index time_series_samples_number = get_samples_number()-(lags_number-1+steps_ahead);
8619     const Index time_series_variables_number = get_columns_number();
8620 
8621     data.resize(time_series_samples_number, time_series_variables_number);
8622 
8623     Tensor<type, 2> new_data(time_series_samples_number, time_series_variables_number);
8624     Tensor<type, 1> variable_data;
8625 
8626     Index new_data_variable = 0;
8627 
8628     Index time_series_variable= 0;
8629 
8630 
8631 // lags
8632 
8633     for(Index lag = lags_number; lag > 0; lag--)
8634     {
8635 
8636         for(Index variable = 0; variable < variables_number; variable++)
8637             {
8638 
8639             variable_data = time_series_data.chip(variable, 1);
8640 
8641             for(Index j = 0; j <= time_series_samples_number; j++)
8642             {
8643 
8644                 new_data(j, time_series_variable) = variable_data(j+lags_number-lag);
8645             }
8646             time_series_variable++;
8647         }
8648     }
8649 
8650 // steps ahead
8651     for(Index ahead = 1; ahead <= steps_ahead; ahead++)
8652     {
8653         for(Index variable = 0; variable < variables_number; variable++)
8654             {
8655             variable_data = time_series_data.chip(variable, 1);
8656 
8657             for(Index j = 0; j < time_series_samples_number; j++)
8658             {
8659                 new_data(j, time_series_variable) = variable_data(j+ahead+lags_number-1);
8660             }
8661 
8662             time_series_variable++;
8663         }
8664     }
8665 
8666     set_data(new_data);
8667 }
8668 
8669 
8670 /// Arranges the data set for association.
8671 /// @todo Low priority. Variables and samples.
8672 
void DataSet::transform_association()
{
    // Stub: the association transformation is not implemented (see the
    // @todo above); the commented line shows the intended delegation.
// OpenNN::transform_association(data);
}
8677 
8678 
8679 /// @todo
8680 
void DataSet::fill_time_series(const Index& period )
{
    // Number of rows of the regular series: the span between the last and
    // first timestamps (column 0) divided by the period, inclusive.
    // NOTE(review): assumes column 0 holds timestamps sorted in ascending
    // order - confirm with callers.

    Index rows = static_cast<Index>((data(data.dimension(0)- 1, 0)- data(0,0)) / period) + 1 ;

    Tensor<type, 2> new_data(rows, data.dimension(1));

    // Every cell starts as missing.

    new_data.setConstant(static_cast<type>(NAN));

    // j walks the source rows while i walks the regular time grid.

    Index j = 1;

//    new_data.set_row(0, data.chip(0, 0));

    cout.precision(20);

    for (Index i = 1; i < rows ; i++)
    {
      // Skip a duplicated timestamp in the source data.
      // NOTE(review): j is never bounds-checked against data.dimension(0),
      // and the row-copy below is commented out, so this function looks
      // unfinished (consistent with the @todo above) - verify before use.

      if(static_cast<Index>(data(j, 0)) == static_cast<Index>(data(j - 1, 0)))
      {

          j = j + 1;
      }
      // Source row matches the expected grid timestamp: consume it.

      if(static_cast<Index>(data(j, 0)) == static_cast<Index>(data(0,0) + i * period))
      {
//          new_data.set_row(i, data.chip(j, 0));

          j = j + 1;
      }
      else
      {
          // Missing timestamp: synthesize it; the remaining columns of the
          // row stay NAN.

          new_data(i,0) = data(0,0) + i * period;
      }
    }

    time_series_data = new_data;

    data = new_data;
}
8718 
8719 
8720 /// This method loads the data from a binary data file.
8721 
load_data_binary()8722 void DataSet::load_data_binary()
8723 {
8724     ifstream file;
8725 
8726     file.open(data_file_name.c_str(), ios::binary);
8727 
8728     if(!file.is_open())
8729     {
8730         ostringstream buffer;
8731 
8732         buffer << "OpenNN Exception: DataSet template.\n"
8733                << "void load_binary(const string&) method.\n"
8734                << "Cannot open binary file: " << data_file_name << "\n";
8735 
8736         throw logic_error(buffer.str());
8737     }
8738 
8739     streamsize size = sizeof(Index);
8740 
8741     Index columns_number;
8742     Index rows_number;
8743 
8744     file.read(reinterpret_cast<char*>(&columns_number), size);
8745     file.read(reinterpret_cast<char*>(&rows_number), size);
8746 
8747     size = sizeof(type);
8748 
8749     type value;
8750 
8751     data.resize(rows_number, columns_number);
8752 
8753 //    Index row_index = 0;
8754 //    Index column_index = 0;
8755 
8756     for(Index i = 0; i < rows_number*columns_number; i++)
8757     {
8758         file.read(reinterpret_cast<char*>(&value), size);
8759 
8760         data(i) = value;
8761 /*
8762         data(row_index, column_index) = value;
8763 
8764         row_index++;
8765 
8766         if((i+1)%rows_number == 0)
8767         {
8768             row_index = 0;
8769             column_index++;
8770         }
8771 */
8772     }
8773 
8774     file.close();
8775 }
8776 
8777 
/// This method loads data from a binary data file for time series prediction methods.
8779 /// @todo
8780 
void DataSet::load_time_series_data_binary()
{
    // Stub (see @todo above): loading of the time series binary file is
    // not implemented; the commented line shows the intended call.
//    time_series_data.load_binary(data_file_name);
}
8785 
8786 
8787 /// This method checks if the input data file has the correct format. Returns an error message.
8788 
check_input_csv(const string & input_data_file_name,const char & separator_char) const8789 void DataSet::check_input_csv(const string & input_data_file_name, const char & separator_char) const
8790 {
8791     ifstream file(input_data_file_name.c_str());
8792 
8793     if(!file.is_open())
8794     {
8795         ostringstream buffer;
8796 
8797         buffer << "OpenNN Exception: DataSet class.\n"
8798                << "void check_input_csv() method.\n"
8799                << "Cannot open input data file: " << input_data_file_name << "\n";
8800 
8801         throw logic_error(buffer.str());
8802     }
8803 
8804     string line;
8805     Index line_number = 0;
8806     Index total_lines = 0;
8807 
8808     Index tokens_count;
8809 
8810     const Index columns_number = get_columns_number() - get_target_columns_number();
8811 
8812     while(file.good())
8813     {
8814         line_number++;
8815 
8816         getline(file, line);
8817 
8818         trim(line);
8819 
8820         erase(line, '"');
8821 
8822         if(line.empty()) continue;
8823 
8824         total_lines++;
8825 
8826         tokens_count = count_tokens(line, separator_char);
8827 
8828         if(tokens_count != columns_number)
8829         {
8830             ostringstream buffer;
8831 
8832             buffer << "OpenNN Exception: DataSet class.\n"
8833                    << "void check_input_csv() method.\n"
8834                    << "Line " << line_number << ": Size of tokens in input file ("
8835                    << tokens_count << ") is not equal to number of columns("
8836                    << columns_number << "). \n"
8837                    << "Input csv must contain values for all the variables except the target. \n";
8838 
8839             throw logic_error(buffer.str());
8840         }
8841     }
8842 
8843     file.close();
8844 
8845     if(total_lines == 0)
8846     {
8847         ostringstream buffer;
8848 
8849         buffer << "OpenNN Exception: DataSet class.\n"
8850                << "void check_input_csv() method.\n"
8851                << "Input data file is empty. \n";
8852 
8853         throw logic_error(buffer.str());
8854     }
8855 }
8856 
8857 
8858 /// This method loads data from a file and returns a matrix containing the input columns.
8859 
Tensor<type, 2> DataSet::read_input_csv(const string& input_data_file_name,
                                        const char& separator_char,
                                        const string& missing_values_label,
                                        const bool& has_columns_name,
                                        const bool& has_rows_label) const
{
    ifstream file(input_data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_input_csv() method.\n"
               << "Cannot open input data file: " << input_data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    // Count samples number
    // First pass: count non-empty lines and validate that each has one
    // token per non-target column.

    Index input_samples_count = 0;

    string line;
    Index line_number = 0;

    Index tokens_count;

    // Expected tokens per line: every column except the targets.

    const Index columns_number = get_columns_number() - get_target_columns_number();

    while(file.good())
    {
        line_number++;

        getline(file, line);

        trim(line);

        erase(line, '"');

        if(line.empty()) continue;

        tokens_count = count_tokens(line, separator_char);

        if(tokens_count != columns_number)
        {
            ostringstream buffer;

            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void read_input_csv() method.\n"
                   << "Line " << line_number << ": Size of tokens("
                   << tokens_count << ") is not equal to number of columns("
                   << columns_number << ").\n";

            throw logic_error(buffer.str());
        }

        input_samples_count++;
    }

    file.close();

    Index variables_number = get_input_variables_number();

    // The header line, if present, is not a sample.

    if(has_columns_name) input_samples_count--;

    Tensor<type, 2> input_data(input_samples_count, variables_number);

    // Fill input data
    // Second pass: reopen the file and parse every line into input_data.

    file.open(input_data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_input_csv() method.\n"
               << "Cannot open input data file: " << input_data_file_name << " for filling input data file. \n";

        throw logic_error(buffer.str());
    }

    // Read first line
    // Skip the header by consuming lines until the first non-empty one.

    if(has_columns_name)
    {
        while(file.good())
        {
            getline(file, line);

            if(line.empty()) continue;

            break;
        }
    }

    // Read rest of the lines

    Tensor<string, 1> tokens;

    line_number = 0;             // reused from here on as the destination row index
    Index variable_index = 0;    // column index into input_data
    Index token_index = 0;       // token index within the csv line
    bool is_ID = has_rows_label; // true while the row-label token is still pending

    const bool is_float = is_same<type, float>::value;
    bool has_missing_values = false;

    while(file.good())
    {
        getline(file, line);

        trim(line);

        erase(line, '"');

        if(line.empty()) continue;

        tokens = get_tokens(line, separator_char);

        variable_index = 0;
        token_index = 0;
        is_ID = has_rows_label;

        for(Index i = 0; i < columns.size(); i++)
        {
            // The row label occupies the first token but maps to no column.

            if(is_ID)
            {
                is_ID = false;
                continue;
            }

            // Unused columns consume a token; used non-input columns
            // (targets) have no token in this file at all.

            if(columns(i).column_use == UnusedVariable)
            {
                token_index++;
                continue;
            }
            else if(columns(i).column_use != Input)
            {
                continue;
            }

            if(columns(i).type == Numeric)
            {
                // Missing label or empty token becomes NAN; otherwise parse
                // with the routine matching the numeric type in use.

                if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else if(is_float)
                {
                    input_data(line_number, variable_index) = strtof(tokens(token_index).data(), NULL);
                }
                else
                {
                    input_data(line_number, variable_index) = stof(tokens(token_index));
                }

                variable_index++;
            }
            else if(columns(i).type == Binary)
            {
                // 1 when the token matches the first category (or the column
                // name); non-matching tokens leave the cell untouched.
                // NOTE(review): cells are never explicitly set to 0 here -
                // confirm input_data is zero-initialized upstream.

                if(tokens(token_index) == missing_values_label)
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else if(columns(i).categories.size() > 0 && tokens(token_index) == columns(i).categories(0))
                {
                    input_data(line_number, variable_index) = 1.0;
                }
                else if(tokens(token_index) == columns(i).name)
                {
                    input_data(line_number, variable_index) = 1.0;
                }

                variable_index++;
            }
            else if(columns(i).type == Categorical)
            {
                // One-hot encoding: one variable per category, 1 on the
                // matching category.

                for(Index k = 0; k < columns(i).get_categories_number(); k++)
                {
                    if(tokens(token_index) == missing_values_label)
                    {
                        has_missing_values = true;
                        input_data(line_number, variable_index) = static_cast<type>(NAN);
                    }
                    else if(tokens(token_index) == columns(i).categories(k))
                    {
                        input_data(line_number, variable_index) = 1.0;
                    }

                    variable_index++;
                }
            }
            else if(columns(i).type == DateTime)
            {
                // Dates are stored as timestamps.

                if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else
                {
                    input_data(line_number, variable_index) = static_cast<type>(date_to_timestamp(tokens(token_index), gmt));
                }

                variable_index++;
            }
            else if(columns(i).type == Constant)
            {
                // Parsed exactly like Numeric.

                if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else if(is_float)
                {
                    input_data(line_number, variable_index) = strtof(tokens(token_index).data(), NULL);
                }
                else
                {
                    input_data(line_number, variable_index) = stof(tokens(token_index));
                }

                variable_index++;
            }

            token_index++;
        }

        line_number++;
    }

    file.close();

    if(!has_missing_values)
    {
        return input_data;
    }
    else
    {
        // Scrub missing values
        // Replace NANs column-wise with the column mean (Unuse/Mean
        // methods) or the column median (otherwise).
        // NOTE(review): Unuse falls back to mean substitution here rather
        // than discarding samples - confirm this is intended.

        const MissingValuesMethod missing_values_method = get_missing_values_method();

        if(missing_values_method == MissingValuesMethod::Unuse || missing_values_method == MissingValuesMethod::Mean)
        {
            const Tensor<type, 1> means = mean(input_data);

            const Index samples_number = input_data.dimension(0);
            const Index variables_number = input_data.dimension(1);

        #pragma omp parallel for schedule(dynamic)

            for(Index j = 0; j < variables_number; j++)
            {
                for(Index i = 0; i < samples_number; i++)
                {
                    if(::isnan(input_data(i, j)))
                    {
                        input_data(i,j) = means(j);
                    }
                }
            }
        }
        else
        {
            const Tensor<type, 1> medians = median(input_data);

            const Index samples_number = input_data.dimension(0);
            const Index variables_number = input_data.dimension(1);

        #pragma omp parallel for schedule(dynamic)

            for(Index j = 0; j < variables_number; j++)
            {
                for(Index i = 0; i < samples_number; i++)
                {
                    if(::isnan(input_data(i, j)))
                    {
                        input_data(i,j) = medians(j);
                    }
                }
            }
        }

        return input_data;
    }
}
9151 
9152 
9153 /// Returns a vector containing the number of samples of each class in the data set.
9154 /// If the number of target variables is one then the number of classes is two.
9155 /// If the number of target variables is greater than one then the number of classes is equal to the number
9156 /// of target variables.
9157 /// @todo Low priority. Return class_distribution is wrong
9158 
calculate_target_distribution() const9159 Tensor<Index, 1> DataSet::calculate_target_distribution() const
9160 {
9161     const Index samples_number = get_samples_number();
9162     const Index targets_number = get_target_variables_number();
9163     const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
9164 
9165     Tensor<Index, 1> class_distribution;
9166 
9167     if(targets_number == 1) // Two classes
9168     {
9169         class_distribution = Tensor<Index, 1>(2);
9170 
9171         Index target_index = target_variables_indices(0);
9172 
9173         Index positives = 0;
9174         Index negatives = 0;
9175 
9176         for(Index sample_index = 0; sample_index < static_cast<Index>(samples_number); sample_index++)
9177         {
9178             if(!::isnan(data(static_cast<Index>(sample_index),target_index)))
9179             {
9180                 if(data(static_cast<Index>(sample_index),target_index) < static_cast<type>(0.5))
9181                 {
9182                     negatives++;
9183                 }
9184                 else
9185                 {
9186                     positives++;
9187                 }
9188             }
9189         }
9190 
9191         class_distribution(0) = negatives;
9192         class_distribution(1) = positives;
9193     }
9194     else // More than two classes
9195     {
9196         class_distribution = Tensor<Index, 1>(targets_number);
9197 
9198         for(Index i = 0; i < samples_number; i++)
9199         {
9200             if(get_sample_use(i) != UnusedSample)
9201             {
9202                 for(Index j = 0; j < targets_number; j++)
9203                 {
9204                     if(data(i,target_variables_indices(j)) == static_cast<type>(NAN)) continue;
9205 
9206                     if(data(i,target_variables_indices(j)) > 0.5) class_distribution(j)++;
9207                 }
9208             }
9209         }
9210     }
9211 
9212     return class_distribution;
9213 }
9214 
9215 
9216 /// Calculate the outliers from the data set using the Tukey's test.
9217 /// @param cleaning_parameter Parameter used to detect outliers.
9218 /// @todo Low priority.
9219 
calculate_Tukey_outliers(const type & cleaning_parameter) const9220 Tensor<Tensor<Index, 1>, 1> DataSet::calculate_Tukey_outliers(const type& cleaning_parameter) const
9221 {
9222     const Index samples_number = get_used_samples_number();
9223     const Tensor<Index, 1> samples_indices = get_used_samples_indices();
9224 
9225     const Index columns_number = get_columns_number();
9226     const Index used_columns_number = get_used_columns_number();
9227     const Tensor<Index, 1> used_columns_indices = get_used_columns_indices();
9228 
9229     Tensor<Tensor<Index, 1>, 1> return_values(2);
9230 
9231     return_values(0) = Tensor<Index, 1>(samples_number);
9232     return_values(1) = Tensor<Index, 1>(used_columns_number);
9233 
9234     return_values(0).setZero();
9235     return_values(1).setZero();
9236 
9237     Tensor<BoxPlot, 1> box_plots = calculate_columns_box_plots();
9238 
9239     Index used_column_index = 0;
9240     Index variable_index = 0;
9241 
9242     #pragma omp parallel for
9243 
9244     for(Index i = 0; i < columns_number; i++)
9245     {
9246         if(columns(i).column_use == UnusedVariable && columns(i).type == Categorical)
9247         {
9248             variable_index += columns(i).get_categories_number();
9249             continue;
9250         }
9251         else if(columns(i).column_use == UnusedVariable) // Numeric, Binary or DateTime
9252         {
9253             variable_index++;
9254             continue;
9255         }
9256 
9257         if(columns(i).type == Categorical || columns(i).type == Binary || columns(i).type == DateTime)
9258         {
9259             used_column_index++;
9260             columns(i).get_categories_number() == 0 ? variable_index++ : variable_index += columns(i).get_categories_number();
9261             continue;
9262         }
9263         else // Numeric
9264         {
9265             const type interquartile_range = box_plots(used_column_index).third_quartile - box_plots(used_column_index).first_quartile;
9266 
9267             if(interquartile_range < numeric_limits<type>::epsilon())
9268             {
9269                 used_column_index++;
9270                 variable_index++;
9271                 continue;
9272             }
9273 
9274             Index columns_outliers = 0;
9275 
9276             for(Index j = 0; j < samples_number; j++)
9277             {
9278                 const Tensor<type, 1> sample = get_sample_data(samples_indices(static_cast<Index>(j)));
9279 
9280                 if(sample(variable_index) <(box_plots(used_column_index).first_quartile - cleaning_parameter*interquartile_range) ||
9281                         sample(variable_index) >(box_plots(used_column_index).third_quartile + cleaning_parameter*interquartile_range))
9282                 {
9283                     return_values(0)(static_cast<Index>(j)) = 1;
9284 
9285                     columns_outliers++;
9286                 }
9287             }
9288 
9289             return_values(1)(used_column_index) = columns_outliers;
9290 
9291             used_column_index++;
9292             variable_index++;
9293         }
9294     }
9295 
9296     return return_values;
9297 }
9298 
9299 
9300 /// Calculate the outliers from the data set using the Tukey's test and sets in samples object.
9301 /// @param cleaning_parameter Parameter used to detect outliers
9302 /// @todo
9303 
unuse_Tukey_outliers(const type & cleaning_parameter)9304 void DataSet::unuse_Tukey_outliers(const type& cleaning_parameter)
9305 {
9306     const Tensor<Tensor<Index, 1>, 1> outliers_indices = calculate_Tukey_outliers(cleaning_parameter);
9307 
9308 //    const Tensor<Index, 1> outliers_samples = outliers_indices(0).get_indices_greater_than(0);
9309 
9310 //    set_samples_unused(outliers_samples);
9311 
9312 }
9313 
9314 
9315 /// Returns a matrix with the values of autocorrelation for every variable in the data set.
/// The number of rows is equal to the number of variables.
9317 /// The number of columns is the maximum lags number.
9318 /// @param maximum_lags_number Maximum lags number for which autocorrelation is calculated.
9319 /// @todo
9320 
calculate_autocorrelations(const Index & maximum_lags_number) const9321 Tensor<type, 2> DataSet::calculate_autocorrelations(const Index& maximum_lags_number) const
9322 {
9323     if(maximum_lags_number > get_used_samples_number())
9324     {
9325         ostringstream buffer;
9326 
9327         buffer << "OpenNN Exception: DataSet class.\n"
9328                << "Tensor<type, 2> autocorrelations(const Index&) method.\n"
9329                << "Maximum lags number(" << maximum_lags_number << ") is greater than the number of samples("
9330                << get_used_samples_number() <<") \n";
9331 
9332         throw logic_error(buffer.str());
9333     }
9334 
9335     const Index variables_number = data.dimension(1);
9336 
9337     Tensor<type, 2> autocorrelations(variables_number, maximum_lags_number);
9338 
9339     for(Index j = 0; j < variables_number; j++)
9340     {
9341 //        autocorrelations.set_row(j, OpenNN::autocorrelations(data.chip(j,1), maximum_lags_number));
9342     }
9343 
9344     return autocorrelations;
9345 }
9346 
9347 
9348 /// Calculates the cross-correlation between all the variables in the data set.
9349 
calculate_cross_correlations(const Index & lags_number) const9350 Tensor<Tensor<type, 1>, 2> DataSet::calculate_cross_correlations(const Index& lags_number) const
9351 {
9352     const Index variables_number = get_variables_number();
9353 
9354     Tensor<Tensor<type, 1>, 2> cross_correlations(variables_number, variables_number);
9355 
9356     Tensor<type, 1> actual_column;
9357 
9358     for(Index i = 0; i < variables_number; i++)
9359     {
9360         actual_column = data.chip(i,1);
9361 
9362         for(Index j = 0; j < variables_number; j++)
9363         {
9364             cross_correlations(i,j) = OpenNN::cross_correlations(actual_column, data.chip(j,1), lags_number);
9365         }
9366     }
9367 
9368     return cross_correlations;
9369 }
9370 
9371 
9372 /// @todo
9373 
calculate_lag_plot() const9374 Tensor<type, 2> DataSet::calculate_lag_plot() const
9375 {
9376     const Index samples_number = get_used_samples_number();
9377 
9378     const Index columns_number = data.dimension(1) - 1;
9379 
9380     Tensor<type, 2> lag_plot(samples_number, columns_number);
9381 
9382 //    lag_plot = data.get_submatrix_columns(columns_indices);
9383 
9384     return lag_plot;
9385 }
9386 
9387 
9388 /// @todo, check
9389 
calculate_lag_plot(const Index & maximum_lags_number)9390 Tensor<type, 2> DataSet::calculate_lag_plot(const Index& maximum_lags_number)
9391 {
9392     const Index samples_number = get_used_samples_number();
9393 
9394     if(maximum_lags_number > samples_number)
9395     {
9396         ostringstream buffer;
9397 
9398         buffer << "OpenNN Exception: DataSet class.\n"
9399                << "Tensor<type, 2> calculate_lag_plot(const Index&) method.\n"
9400                << "Maximum lags number(" << maximum_lags_number
9401                << ") is greater than the number of samples("
9402                << samples_number << ") \n";
9403 
9404         throw logic_error(buffer.str());
9405     }
9406 
9407     //const Tensor<type, 2> lag_plot = time_series_data.calculate_lag_plot(maximum_lags_number, time_index);
9408 
9409 //    return lag_plot;
9410 
9411     return Tensor<type, 2>();
9412 }
9413 
9414 
9415 /// Generates an artificial dataset with a given number of samples and number of variables
9416 /// by constant data.
9417 /// @param samples_number Number of samples in the dataset.
9418 /// @param variables_number Number of variables in the dataset.
9419 /// @todo
9420 
generate_constant_data(const Index & samples_number,const Index & variables_number)9421 void DataSet::generate_constant_data(const Index& samples_number, const Index& variables_number)
9422 {
9423     set(samples_number, variables_number);
9424 
9425 //    data.setRandom(-5.12, 5.12);
9426 
9427     for(Index i = 0; i < samples_number; i++)
9428     {
9429         data(i, variables_number-1) = 0;
9430     }
9431 
9432     scale_minimum_maximum(data);
9433 
9434     set_default_columns_uses();
9435 }
9436 
9437 
9438 /// Generates an artificial dataset with a given number of samples and number of variables
9439 /// using random data.
9440 /// @param samples_number Number of samples in the dataset.
9441 /// @param variables_number Number of variables in the dataset.
9442 /// @todo
9443 
generate_random_data(const Index & samples_number,const Index & variables_number)9444 void DataSet::generate_random_data(const Index& samples_number, const Index& variables_number)
9445 {
9446     set(samples_number, variables_number);
9447 
9448     data.setRandom();
9449 
9450 //        data.setRandom(0.0, 1.0);
9451 
9452 }
9453 
9454 
9455 /// Generates an artificial dataset with a given number of samples and number of variables
9456 /// using a sequential data.
9457 /// @param samples_number Number of samples in the dataset.
9458 /// @param variables_number Number of variables in the dataset.
9459 
generate_sequential_data(const Index & samples_number,const Index & variables_number)9460 void DataSet::generate_sequential_data(const Index& samples_number, const Index& variables_number)
9461 {
9462     set(samples_number, variables_number);
9463 
9464     for(Index i = 0; i < samples_number; i++)
9465     {
9466         for(Index j = 0; j < variables_number; j++)
9467         {
9468             data(i,j) = static_cast<type>(j);
9469         }
9470     }
9471 }
9472 
9473 
9474 /// Generates an artificial dataset with a given number of samples and number of variables
9475 /// using a paraboloid data.
9476 /// @param samples_number Number of samples in the dataset.
9477 /// @param variables_number Number of variables in the dataset.
9478 /// @todo
9479 
generate_paraboloid_data(const Index & samples_number,const Index & variables_number)9480 void DataSet::generate_paraboloid_data(const Index& samples_number, const Index& variables_number)
9481 {
9482     const Index inputs_number = variables_number-1;
9483 
9484     set(samples_number, variables_number);
9485 
9486 //    data.setRandom();
9487 
9488     data.setRandom();
9489 
9490     for(Index i = 0; i < samples_number; i++)
9491     {
9492 //        const type norm = l2_norm(data.chip(i, 0).delete_last(1));
9493 
9494 //        data(i, inputs_number) = norm*norm;
9495     }
9496 
9497     scale_minimum_maximum(data);
9498 }
9499 
9500 
9501 /// Generates an artificial dataset with a given number of samples and number of variables
9502 /// using the Rosenbrock function.
9503 /// @param samples_number Number of samples in the dataset.
9504 /// @param variables_number Number of variables in the dataset.
9505 /// @todo
9506 
generate_Rosenbrock_data(const Index & samples_number,const Index & variables_number)9507 void DataSet::generate_Rosenbrock_data(const Index& samples_number, const Index& variables_number)
9508 {
9509     const Index inputs_number = variables_number-1;
9510 
9511     set(samples_number, variables_number);
9512 
9513     data.setRandom();
9514 
9515     #pragma omp parallel for
9516 
9517     for(Index i = 0; i < samples_number; i++)
9518     {
9519         type rosenbrock = 0;
9520 
9521         for(Index j = 0; j < inputs_number-1; j++)
9522         {
9523             const type value = data(i,j);
9524             const type next_value = data(i,j+1);
9525 
9526             rosenbrock += (1 - value)*(1 - value)
9527                 + 100*(next_value-value*value)*(next_value-value*value);
9528         }
9529 
9530         data(i, inputs_number) = rosenbrock;
9531     }
9532 
9533     set_default_columns_uses();
9534 }
9535 
9536 
9537 /// @todo
9538 
generate_inputs_selection_data(const Index & samples_number,const Index & variables_number)9539 void DataSet::generate_inputs_selection_data(const Index& samples_number, const Index& variables_number)
9540 {
9541     set(samples_number,variables_number);
9542 
9543     data.setRandom();
9544 
9545     for(Index i = 0; i < samples_number; i++)
9546     {
9547         for(Index j = 0; j < variables_number-2; j++)
9548         {
9549             data(i,variables_number-1) += data(i,j);
9550         }
9551     }
9552 
9553     set_default_columns_uses();
9554 }
9555 
9556 
generate_sum_data(const Index & samples_number,const Index & variables_number)9557 void DataSet::generate_sum_data(const Index& samples_number, const Index& variables_number)
9558 {
9559     set(samples_number,variables_number);
9560 
9561     data.setRandom();
9562 
9563     for(Index i = 0; i < samples_number; i++)
9564     {
9565         for(Index j = 0; j < variables_number-1; j++)
9566         {
9567             data(i,variables_number-1) += data(i,j);
9568         }
9569     }
9570 
9571     set_default();
9572 
9573     scale_data_mean_standard_deviation();
9574 
9575 }
9576 
9577 
9578 /// Generate artificial data for a binary classification problem with a given number of samples and inputs.
9579 /// @param samples_number Number of the samples to generate.
9580 /// @param inputs_number Number of the variables that the data set will have.
9581 /// @todo
9582 
generate_data_binary_classification(const Index & samples_number,const Index & inputs_number)9583 void DataSet::generate_data_binary_classification(const Index& samples_number, const Index& inputs_number)
9584 {
9585     const Index negatives = samples_number/2;
9586     const Index positives = samples_number - negatives;
9587 
9588     // Negatives data
9589 
9590     Tensor<type, 1> target_0(negatives);
9591 
9592     Tensor<type, 2> class_0(negatives, inputs_number+1);
9593 
9594 //        class_0.setRandom(-0.5, 1.0);
9595 
9596 //        class_0.set_column(inputs_number, target_0, "");
9597 
9598         // Positives data
9599 
9600 //        Tensor<type, 1> target_1(positives, 1.0);
9601 
9602 //        Tensor<type, 2> class_1(positives, inputs_number+1);
9603 
9604 //        class_1.setRandom(0.5, 1.0);
9605 
9606 //        class_1.set_column(inputs_number, target_1, "");
9607 
9608         // Assemble
9609 
9610 //        set(class_0.assemble_rows(class_1));
9611 }
9612 
9613 
9614 /// @todo Low priority.
9615 
generate_data_multiple_classification(const Index & samples_number,const Index & inputs_number,const Index & outputs_number)9616 void DataSet::generate_data_multiple_classification(const Index& samples_number, const Index& inputs_number, const Index& outputs_number)
9617 {
9618     Tensor<type, 2> new_data(samples_number, inputs_number);
9619 
9620     new_data.setRandom();
9621 
9622     Tensor<type, 2> targets(samples_number, outputs_number);
9623 
9624     Index target_index = 0;
9625 
9626     for(Index i = 0; i < samples_number; i ++)
9627     {
9628         target_index = static_cast<unsigned>(rand())%outputs_number;
9629 
9630         targets(i, target_index) = 1.0;
9631     }
9632 
9633 //        set(new_data.assemble_columns(targets));
9634 }
9635 
9636 
9637 /// Returns true if the data matrix is not empty(it has not been loaded),
9638 /// and false otherwise.
9639 
has_data() const9640 bool DataSet::has_data() const
9641 {
9642     if(is_empty())
9643     {
9644         return false;
9645     }
9646     else
9647     {
9648         return true;
9649     }
9650 }
9651 
9652 
9653 /// Unuses those samples with values outside a defined range.
9654 /// @param minimums vector of minimum values in the range.
9655 /// The size must be equal to the number of variables.
9656 /// @param maximums vector of maximum values in the range.
9657 /// The size must be equal to the number of variables.
9658 /// @todo Low priority.
9659 
filter_data(const Tensor<type,1> & minimums,const Tensor<type,1> & maximums)9660 Tensor<Index, 1> DataSet::filter_data(const Tensor<type, 1>& minimums, const Tensor<type, 1>& maximums)
9661 {
9662     const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();
9663 
9664     const Index used_variables_number = used_variables_indices.size();
9665 
9666 #ifdef __OPENNN_DEBUG__
9667 
9668     if(minimums.size() != used_variables_number)
9669     {
9670         ostringstream buffer;
9671 
9672         buffer << "OpenNN Exception: DataSet class.\n"
9673                << "Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
9674                << "Size of minimums(" << minimums.size() << ") is not equal to number of variables(" << used_variables_number << ").\n";
9675 
9676         throw logic_error(buffer.str());
9677     }
9678 
9679     if(maximums.size() != used_variables_number)
9680     {
9681         ostringstream buffer;
9682 
9683         buffer << "OpenNN Exception: DataSet class.\n"
9684                << "Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
9685                << "Size of maximums(" << maximums.size() << ") is not equal to number of variables(" << used_variables_number << ").\n";
9686 
9687         throw logic_error(buffer.str());
9688     }
9689 
9690 #endif
9691 
9692     const Index samples_number = get_samples_number();
9693 
9694     Tensor<type, 1> filtered_indices(samples_number);
9695     filtered_indices.setZero();
9696 
9697     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9698     const Index used_samples_number = used_samples_indices.size();
9699 
9700     Index sample_index = 0;
9701 
9702     for(Index i = 0; i < used_variables_number; i++)
9703     {
9704         const Index variable_index = used_variables_indices(i);
9705 
9706         for(Index j = 0; j < used_samples_number; j++)
9707         {
9708             sample_index = used_samples_indices(j);
9709 
9710             if(get_sample_use(sample_index) == UnusedSample) continue;
9711 
9712             if(isnan(data(sample_index, variable_index))) continue;
9713 
9714             if(fabsf(data(sample_index, variable_index) - minimums(i)) <= static_cast<type>(1e-3)
9715                     || fabsf(data(sample_index, variable_index) - maximums(i)) <= static_cast<type>(1e-3)) continue;
9716 
9717             if(data(sample_index,variable_index) < minimums(i)
9718                     || data(sample_index,variable_index) > maximums(i))
9719             {
9720                 filtered_indices(sample_index) = 1.0;
9721 
9722                 set_sample_use(sample_index, UnusedSample);
9723             }
9724         }
9725     }
9726 
9727     Index filtered_samples_number =
9728             static_cast<Index>(std::count_if(filtered_indices.data(), filtered_indices.data()+filtered_indices.size(), [](type value) {return value > static_cast<type>(0.5);}));
9729 
9730     Tensor<Index, 1> filtered_samples_indices(filtered_samples_number);
9731     Index index = 0;
9732 
9733     for(Index i = 0; i < samples_number; i++)
9734     {
9735         if(filtered_indices(i) > static_cast<type>(0.5))
9736         {
9737             filtered_samples_indices(index) = i;
9738             index++;
9739         }
9740     }
9741 
9742     return filtered_samples_indices;
9743 }
9744 
9745 
9746 /// Filter data set variable using a rank.
9747 /// The values within the variable must be between minimum and maximum.
9748 /// @param variable_index Index number where the variable to be filtered is located.
9749 /// @param minimum Value that determine the lower limit.
9750 /// @param maximum Value that determine the upper limit.
9751 /// Returns a indices vector.
9752 /// @todo
9753 
filter_column(const Index & variable_index,const type & minimum,const type & maximum)9754 Tensor<Index, 1> DataSet::filter_column(const Index& variable_index, const type& minimum, const type& maximum)
9755 {
9756     const Index samples_number = get_samples_number();
9757 
9758     Tensor<type, 1> filtered_indices(samples_number);
9759 
9760     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9761 
9762     const Tensor<Index, 1> current_samples_indices = used_samples_indices;
9763 
9764     const Index current_samples_number = current_samples_indices.size();
9765 
9766     for(Index i = 0; i < current_samples_number; i++)
9767     {
9768         const Index index = current_samples_indices(i);
9769 
9770         if(data(index,variable_index) < minimum || data(index,variable_index) > maximum)
9771         {
9772             filtered_indices(index) = 1.0;
9773 
9774             set_sample_use(index, UnusedSample);
9775         }
9776     }
9777 
9778 //        return filtered_indices.get_indices_greater_than(0.5);
9779 
9780     return Tensor<Index, 1>();
9781 }
9782 
9783 
9784 /// Filter data set variable using a rank.
9785 /// The values within the variable must be between minimum and maximum.
9786 /// @param variable_name String name where the variable to be filtered is located.
9787 /// @param minimum Value that determine the lower limit.
9788 /// @param maximum Value that determine the upper limit.
9789 /// Returns a indices vector.
9790 /// @todo
9791 
filter_column(const string & variable_name,const type & minimum,const type & maximum)9792 Tensor<Index, 1> DataSet::filter_column(const string& variable_name, const type& minimum, const type& maximum)
9793 {
9794     const Index variable_index = get_variable_index(variable_name);
9795 
9796     const Index samples_number = get_samples_number();
9797 
9798     Tensor<type, 1> filtered_indices(samples_number);
9799 
9800     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9801 
9802     const Index current_samples_number = used_samples_indices.size();
9803 
9804     for(Index i = 0; i < current_samples_number; i++)
9805     {
9806         const Index index = used_samples_indices(i);
9807 
9808         if(data(index,variable_index) < minimum || data(index,variable_index) > maximum)
9809         {
9810             filtered_indices(index) = 1.0;
9811 
9812             set_sample_use(index, UnusedSample);
9813         }
9814     }
9815 
9816 //        return filtered_indices.get_indices_greater_than(0.5);
9817 
9818     return Tensor<Index, 1>();
9819 }
9820 
9821 
9822 /// This method converts a numerical variable into categorical.
9823 /// Note that this method resizes the dataset.
9824 /// @param variable_index Index of the variable to be converted.
9825 
numeric_to_categorical(const Index & variable_index)9826 void DataSet::numeric_to_categorical(const Index& variable_index)
9827 {
9828 #ifdef __OPENNN_DEBUG__
9829 
9830     const Index variables_number = get_variables_number();
9831 
9832     if(variable_index >= variables_number)
9833     {
9834         ostringstream buffer;
9835 
9836         buffer << "OpenNN Exception: DataSet class.\n"
9837                << "void convert_categorical_variable(const Index&) method.\n"
9838                << "Index of variable(" << variable_index << ") must be less than number of variables (" << variables_number << ").\n";
9839 
9840         throw logic_error(buffer.str());
9841     }
9842 
9843 #endif
9844 
9845 //    const Tensor<type, 1> categories = data.get_column(variable_index).get_unique_elements();
9846 
9847 //    data = data.to_categorical(variable_index);
9848 
9849 //    columns(variable_index).categories_uses = Tensor<VariableUse, 1>(categories.size(), columns(variable_index).column_use);
9850 //    columns(variable_index).type = Categorical;
9851 //    columns(variable_index).categories = categories.to_string_vector();
9852 }
9853 
9854 
9855 /// Sets all the samples with missing values to "Unused".
9856 
impute_missing_values_unuse()9857 void DataSet::impute_missing_values_unuse()
9858 {
9859     const Index samples_number = get_samples_number();
9860 
9861     #pragma omp parallel for
9862 
9863     for(Index i = 0; i <samples_number; i++)
9864     {
9865         if(has_nan_row(i))
9866         {
9867             set_sample_use(i, "Unused");
9868         }
9869     }
9870 }
9871 
9872 /// Substitutes all the missing values by the mean of the corresponding variable.
9873 
impute_missing_values_mean()9874 void DataSet::impute_missing_values_mean()
9875 {
9876     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9877     const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();
9878 
9879     const Tensor<type, 1> means = mean(data, used_samples_indices, used_variables_indices);
9880 
9881     const Index samples_number = used_samples_indices.size();
9882     const Index variables_number = used_variables_indices.size();
9883 
9884     Index current_variable;
9885     Index current_sample;
9886 
9887 #pragma omp parallel for schedule(dynamic)
9888 
9889     for(Index j = 0; j < variables_number; j++)
9890     {
9891         current_variable = used_variables_indices(j);
9892 
9893         for(Index i = 0; i < samples_number; i++)
9894         {
9895             current_sample = used_samples_indices(i);
9896 
9897             if(::isnan(data(current_sample, current_variable)))
9898             {
9899                 data(current_sample,current_variable) = means(j);
9900             }
9901         }
9902     }
9903 }
9904 
9905 
9906 /// Substitutes all the missing values by the median of the corresponding variable.
9907 
impute_missing_values_median()9908 void DataSet::impute_missing_values_median()
9909 {
9910     const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9911     const Tensor<Index, 1> used_variables_indices = get_used_columns_indices();
9912 
9913     const Tensor<type, 1> medians = median(data, used_samples_indices, used_variables_indices);
9914 
9915     const Index variables_number = used_variables_indices.size();
9916     const Index samples_number = used_samples_indices.size();
9917 
9918 #pragma omp parallel for schedule(dynamic)
9919 
9920     for(Index j = 0; j < variables_number; j++)
9921     {
9922         for(Index i = 0 ; i < samples_number ; i++)
9923         {
9924             if(::isnan(data(used_samples_indices(i),used_variables_indices(j)))) data(used_samples_indices(i),used_variables_indices(j)) = medians(j);
9925         }
9926     }
9927 }
9928 
9929 
9930 /// General method for dealing with missing values.
9931 /// It switches among the different scrubbing methods available,
9932 /// according to the corresponding value in the missing values object.
9933 
scrub_missing_values()9934 void DataSet::scrub_missing_values()
9935 {
9936     switch(missing_values_method)
9937     {
9938     case Unuse:
9939     {
9940         impute_missing_values_unuse();
9941     }
9942         break;
9943 
9944     case Mean:
9945     {
9946         impute_missing_values_mean();
9947     }
9948         break;
9949 
9950     case Median:
9951     {
9952         impute_missing_values_median();
9953     }
9954         break;
9955     }
9956 }
9957 
9958 
9959 /// @todo Time series stuff?
9960 
read_csv()9961 void DataSet::read_csv()
9962 {
9963     read_csv_1();
9964 
9965     if(!has_time_columns() && !has_categorical_columns())
9966     {
9967         read_csv_2_simple();
9968 
9969         read_csv_3_simple();
9970     }
9971     else
9972     {
9973 
9974     //  categorical data
9975 
9976         read_csv_2_complete();
9977 
9978         read_csv_3_complete();
9979     }
9980 }
9981 
9982 
get_default_columns_names(const Index & columns_number)9983 Tensor<string, 1> DataSet::get_default_columns_names(const Index& columns_number)
9984 {
9985     Tensor<string, 1> columns_names(columns_number);
9986 
9987     for(Index i = 0; i < columns_number; i++)
9988     {
9989         ostringstream buffer;
9990 
9991         buffer << "column_" << i+1;
9992 
9993         columns_names(i) = buffer.str();
9994     }
9995 
9996     return columns_names;
9997 }
9998 
9999 
read_csv_1()10000 void DataSet::read_csv_1()
10001 {
10002     ifstream file(data_file_name.c_str());
10003 
10004     if(!file.is_open())
10005     {
10006         ostringstream buffer;
10007 
10008         buffer << "OpenNN Exception: DataSet class.\n"
10009                << "void read_csv() method.\n"
10010                << "Cannot open data file: " << data_file_name << "\n";
10011 
10012         throw logic_error(buffer.str());
10013     }
10014 
10015     const char separator_char = get_separator_char();
10016 
10017     cout << "Setting data file preview..." << endl;
10018 
10019     Index lines_number = has_columns_names ? 4 : 3;
10020 
10021     data_file_preview.resize(lines_number);
10022 
10023     string line;
10024 
10025     Index lines_count = 0;
10026 
10027     while(file.good())
10028     {
10029         getline(file, line);
10030 
10031         trim(line);
10032 
10033         erase(line, '"');
10034 
10035         if(line.empty()) continue;
10036 
10037         check_separators(line);
10038 
10039         check_special_characters(line);
10040 
10041         data_file_preview(lines_count) = get_tokens(line, separator_char);
10042 
10043         lines_count++;
10044 
10045         if(lines_count == lines_number) break;
10046     }
10047 
10048     file.close();
10049 
10050     // Check empty file    @todo, size() methods returns 0
10051 
10052     if(data_file_preview(0).size() == 0)
10053     {
10054         ostringstream buffer;
10055 
10056         buffer << "OpenNN Exception: DataSet class.\n"
10057                << "void read_csv_1() method.\n"
10058                << "File " << data_file_name << " is empty.\n";
10059 
10060         throw logic_error(buffer.str());
10061     }
10062 
10063     // Set rows labels and columns names
10064 
10065     cout << "Setting rows labels..." << endl;
10066 
10067     string first_name = data_file_preview(0)(0);
10068     transform(first_name.begin(), first_name.end(), first_name.begin(), ::tolower);
10069 
10070     if(contains_substring(first_name, "id"))
10071     {
10072         has_rows_labels = true;
10073     }
10074 
10075     const Index columns_number = has_rows_labels ? data_file_preview(0).size()-1 : data_file_preview(0).size();
10076 
10077     columns.resize(columns_number);
10078 
10079     // Check if header has numeric value
10080 
10081     if(has_columns_names && has_numbers(data_file_preview(0)))
10082     {
10083         ostringstream buffer;
10084 
10085         buffer << "OpenNN Exception: DataSet class.\n"
10086                << "void read_csv_1() method.\n"
10087                << "Some columns names are numeric.\n";
10088 
10089         throw logic_error(buffer.str());
10090     }
10091 
10092     // Columns names
10093 
10094     cout << "Setting columns names..." << endl;
10095 
10096     if(has_columns_names)
10097     {
10098         has_rows_labels ? set_columns_names(data_file_preview(0).slice(Eigen::array<Eigen::Index, 1>({1}), Eigen::array<Eigen::Index, 1>({data_file_preview(0).size()-1})))
10099                         : set_columns_names(data_file_preview(0));
10100     }
10101     else
10102     {
10103         set_columns_names(get_default_columns_names(columns_number));
10104     }
10105 
10106     // Columns types
10107 
10108     cout << "Setting columns types..." << endl;
10109 
10110     Index column_index = 0;
10111 
10112     for(Index i = 0; i < data_file_preview(0).dimension(0); i++)
10113     {
10114         if(has_rows_labels && i == 0) continue;
10115 
10116         if(((is_numeric_string(data_file_preview(1)(i)) && data_file_preview(1)(i) != missing_values_label) || data_file_preview(1)(i).empty())
10117         || ((is_numeric_string(data_file_preview(2)(i)) && data_file_preview(2)(i) != missing_values_label) || data_file_preview(1)(i).empty())
10118         || ((is_numeric_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) != missing_values_label) || data_file_preview(1)(i).empty())
10119         || ((is_numeric_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) != missing_values_label) || data_file_preview(1)(i).empty()))
10120         {
10121             columns(column_index).type = Numeric;
10122             column_index++;
10123         }
10124         else if((is_date_time_string(data_file_preview(1)(i)) && data_file_preview(1)(i) != missing_values_label)
10125              || (is_date_time_string(data_file_preview(2)(i)) && data_file_preview(2)(i) != missing_values_label)
10126              || (is_date_time_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) != missing_values_label)
10127              || (is_date_time_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) != missing_values_label))
10128         {
10129             columns(column_index).type = DateTime;
10130             column_index++;
10131         }
10132         else
10133         {
10134             columns(column_index).type = Categorical;
10135             column_index++;
10136         }
10137     }
10138 
10139 
10140 }
10141 
10142 
read_csv_2_simple()10143 void DataSet::read_csv_2_simple()
10144 {
10145     ifstream file(data_file_name.c_str());
10146 
10147     if(!file.is_open())
10148     {
10149         ostringstream buffer;
10150 
10151         buffer << "OpenNN Exception: DataSet class.\n"
10152                << "void read_csv_2_simple() method.\n"
10153                << "Cannot open data file: " << data_file_name << "\n";
10154 
10155         throw logic_error(buffer.str());
10156     }
10157 
10158     string line;
10159     Index line_number = 0;
10160 
10161     if(has_columns_names)
10162     {
10163         while(file.good())
10164         {
10165             line_number++;
10166 
10167             getline(file, line);
10168 
10169             trim(line);
10170 
10171             erase(line, '"');
10172 
10173             if(line.empty()) continue;
10174 
10175             break;
10176         }
10177     }
10178 
10179     Index samples_count = 0;
10180 
10181     Index tokens_count;
10182 
10183     cout << "Setting data dimensions..." << endl;
10184 
10185     const char separator_char = get_separator_char();
10186 
10187     const Index columns_number = get_columns_number();
10188     const Index raw_columns_number = has_rows_labels ? columns_number + 1 : columns_number;
10189 
10190     while(file.good())
10191     {
10192         line_number++;
10193 
10194         getline(file, line);
10195 
10196         trim(line);
10197 
10198         //erase(line, '"');
10199 
10200         if(line.empty()) continue;
10201 
10202         tokens_count = count_tokens(line, separator_char);
10203 
10204         if(tokens_count != raw_columns_number)
10205         {
10206             ostringstream buffer;
10207 
10208             buffer << "OpenNN Exception: DataSet class.\n"
10209                    << "void read_csv_2_simple() method.\n"
10210                    << "Line " << line_number << ": Size of tokens("
10211                    << tokens_count << ") is not equal to number of columns("
10212                    << raw_columns_number << ").\n";
10213 
10214             throw logic_error(buffer.str());
10215         }
10216 
10217         samples_count++;
10218     }
10219 
10220     file.close();
10221 
10222     data.resize(samples_count, columns_number);
10223 
10224     set_default_columns_uses();
10225 
10226     samples_uses.resize(samples_count);
10227     samples_uses.setConstant(Training);
10228 
10229     split_samples_random();
10230 }
10231 
10232 
read_csv_3_simple()10233 void DataSet::read_csv_3_simple()
10234 {
10235     ifstream file(data_file_name.c_str());
10236 
10237     if(!file.is_open())
10238     {
10239         ostringstream buffer;
10240 
10241         buffer << "OpenNN Exception: DataSet class.\n"
10242                << "void read_csv_2_simple() method.\n"
10243                << "Cannot open data file: " << data_file_name << "\n";
10244 
10245         throw logic_error(buffer.str());
10246     }
10247 
10248     const bool is_float = is_same<type, float>::value;
10249 
10250     const char separator_char = get_separator_char();
10251 
10252     string line;
10253 
10254     // Read header
10255 
10256     if(has_columns_names)
10257     {
10258         while(file.good())
10259         {
10260             getline(file, line);
10261 
10262             if(line.empty()) continue;
10263 
10264             break;
10265         }
10266     }
10267 
10268 
10269     // Read data
10270 
10271     Index j = 0;
10272 
10273     //???
10274 
10275     const Index raw_columns_number = has_rows_labels ? get_columns_number() + 1 : get_columns_number();
10276 
10277     Tensor<string, 1> tokens(raw_columns_number);
10278 
10279     const Index samples_number = data.dimension(0);
10280 
10281     if(has_rows_labels) rows_labels.resize(samples_number);
10282 
10283     cout << "Reading data..." << endl;
10284 
10285     Index sample_index = 0;
10286     Index column_index = 0;
10287 
10288 
10289     while(file.good())
10290     {
10291         getline(file, line);
10292 
10293         trim(line);
10294 
10295         erase(line, '"');
10296 
10297         if(line.empty()) continue;
10298 
10299         fill_tokens(line, separator_char, tokens);
10300 
10301         for(j = 0; j < raw_columns_number; j++)
10302         {
10303             trim(tokens(j));
10304 
10305             if(has_rows_labels && j == 0)
10306             {
10307                 rows_labels(sample_index) = tokens(j);
10308             }
10309             else if(tokens(j) == missing_values_label || tokens(j).empty())
10310             {
10311                 data(sample_index, column_index) = static_cast<type>(NAN);
10312                 column_index++;
10313             }
10314             else if(is_float)
10315             {
10316                 data(sample_index, column_index) = strtof(tokens(j).data(), NULL);
10317                 column_index++;
10318             }
10319             else
10320             {
10321                 data(sample_index, column_index) = stof(tokens(j));
10322                 column_index++;
10323             }
10324         }
10325 
10326         column_index = 0;
10327         sample_index++;
10328     }
10329 
10330     const Index data_file_preview_index = has_columns_names ? 3 : 2;
10331 
10332     data_file_preview(data_file_preview_index) = tokens;
10333 
10334     file.close();
10335 
10336     cout << "Data read succesfully..." << endl;
10337 
10338     // Check Constant
10339 
10340 
10341     cout << "Checking constant columns..." << endl;
10342 
10343     Index variable_index = 0;
10344 
10345     for(Index column = 0; column < get_columns_number(); column++)
10346     {
10347         if(columns(column).type == Numeric)
10348         {
10349             // @todo avoid chip
10350 
10351 //            if(is_constant_numeric(data.chip(variable_index, 1)))
10352 //            {
10353 //                columns(column).type = Constant;
10354 //                columns(column).column_use = UnusedVariable;
10355 //            }
10356 
10357             const type a = data(0, variable_index);
10358 
10359             bool constant = true;
10360 
10361             for (int i = 1; i < data.dimension(0); i++)
10362             {
10363                 if (abs(data(i, variable_index)-a) > 1e-3 || ::isnan(data(i, variable_index)) || ::isnan(a))
10364                     constant = false;
10365             }
10366 
10367             if(constant)
10368             {
10369                 columns(column).type = Constant;
10370                 columns(column).column_use = UnusedVariable;
10371             }
10372 
10373             variable_index++;
10374         }
10375         else if(columns(column).type == DateTime)
10376         {
10377             columns(column).column_use = UnusedVariable;
10378             variable_index++;
10379         }
10380         else if(columns(column).type == Constant)
10381         {
10382             variable_index++;
10383         }
10384         else if(columns(column).type == Binary)
10385         {
10386             if(columns(column).get_categories_number() == 1)
10387             {
10388                 columns(column).type = Constant;
10389                 columns(column).column_use = UnusedVariable;
10390             }
10391 
10392             variable_index++;
10393         }
10394         else if(columns(column).type == Categorical)
10395         {
10396             if(columns(column).get_categories_number() == 1)
10397             {
10398                 columns(column).type = Constant;
10399                 columns(column).column_use = UnusedVariable;
10400             }
10401 
10402             variable_index += columns(column).get_categories_number();
10403         }
10404 
10405 //        if(is_constant_numeric(data.chip(column, 1)) && columns(column).type!=DateTime)
10406 //        {
10407 //            columns(column).type = Constant;
10408 //            columns(column).column_use = UnusedVariable;
10409 //        }
10410     }
10411     // Check Binary
10412 
10413     cout << "Checking binary columns..." << endl;
10414 
10415     set_binary_simple_columns();
10416 
10417 
10418 }
10419 
10420 
read_csv_2_complete()10421 void DataSet::read_csv_2_complete()
10422 {
10423     ifstream file(data_file_name.c_str());
10424 
10425     if(!file.is_open())
10426     {
10427         ostringstream buffer;
10428 
10429         buffer << "OpenNN Exception: DataSet class.\n"
10430                << "void read_csv_2_complete() method.\n"
10431                << "Cannot open data file: " << data_file_name << "\n";
10432 
10433         throw logic_error(buffer.str());
10434     }
10435 
10436     const char separator_char = get_separator_char();
10437 
10438     string line;
10439 
10440     Tensor<string, 1> tokens;
10441 
10442     Index lines_count = 0;
10443     Index tokens_count;
10444 
10445     const Index columns_number = columns.size();
10446 
10447     for(unsigned j = 0; j < columns_number; j++)
10448     {
10449         if(columns(j).type != Categorical)
10450         {
10451             columns(j).column_use = Input;
10452         }
10453     }
10454 
10455     // Skip header
10456 
10457     if(has_columns_names)
10458     {
10459         while(file.good())
10460         {
10461             getline(file, line);
10462 
10463             trim(line);
10464 
10465             if(line.empty()) continue;
10466 
10467             break;
10468         }
10469     }
10470 
10471     // Read data
10472 
10473     cout << "Setting data dimensions..." << endl;
10474 
10475     const Index raw_columns_number = has_rows_labels ? columns_number + 1 : columns_number;
10476 
10477     Index column_index = 0;
10478 
10479     while(file.good())
10480     {
10481         getline(file, line);
10482 
10483         trim(line);
10484 
10485         if(line.empty()) continue;
10486 
10487         tokens = get_tokens(line, separator_char);
10488 
10489         tokens_count = tokens.size();
10490 
10491         if(static_cast<unsigned>(tokens_count) != raw_columns_number)
10492         {
10493             const string message =
10494                 "Sample " + to_string(lines_count+1) + " error:\n"
10495                 "Size of tokens (" + to_string(tokens_count) + ") is not equal to number of columns (" + to_string(raw_columns_number) + ").\n"
10496                 "Please check the format of the data file (e.g: Use of commas both as decimal and column separator)";
10497 
10498             throw logic_error(message);
10499         }
10500 
10501         for(unsigned j = 0; j < raw_columns_number; j++)
10502         {
10503             if(has_rows_labels && j == 0)
10504             {
10505                 continue;
10506             }
10507 
10508             trim(tokens(j));
10509 
10510             if(columns(column_index).type == Categorical)
10511             {
10512                 if(find(columns(column_index).categories.data(), columns(column_index).categories.data() + columns(column_index).categories.size(), tokens(j)) == (columns(column_index).categories.data() + columns(column_index).categories.size()))
10513                 {
10514                     if(tokens(j) == missing_values_label)
10515                     {
10516                         column_index++;
10517                         continue;
10518                     }
10519 
10520                     columns(column_index).add_category(tokens(j));
10521                 }
10522             }
10523 
10524             column_index++;
10525         }
10526 
10527         column_index = 0;
10528 
10529         lines_count++;
10530     }
10531 
10532 
10533     cout << "Setting types..." << endl;
10534 
10535     for(Index j = 0; j < columns_number; j++)
10536     {
10537         if(columns(j).type == Categorical)
10538         {
10539             if(columns(j).categories.size() == 2)
10540             {
10541                 columns(j).type = Binary;
10542             }
10543         }
10544     }
10545 
10546     file.close();
10547 
10548     const Index samples_number = static_cast<unsigned>(lines_count);
10549 
10550     const Index variables_number = get_variables_number();
10551 
10552     data.resize(static_cast<Index>(samples_number), variables_number);
10553     data.setZero();
10554 
10555     if(has_rows_labels) rows_labels.resize(samples_number);
10556 
10557     set_default_columns_uses();
10558 
10559     samples_uses.resize(static_cast<Index>(samples_number));
10560 
10561     samples_uses.setConstant(Training);
10562 
10563     split_samples_random();
10564 }
10565 
10566 
read_csv_3_complete()10567 void DataSet::read_csv_3_complete()
10568 {
10569     ifstream file(data_file_name.c_str());
10570 
10571     if(!file.is_open())
10572     {
10573         ostringstream buffer;
10574 
10575         buffer << "OpenNN Exception: DataSet class.\n"
10576                << "void read_csv_3_complete() method.\n"
10577                << "Cannot open data file: " << data_file_name << "\n";
10578 
10579         throw logic_error(buffer.str());
10580     }
10581 
10582 
10583 
10584     const char separator_char = get_separator_char();
10585 
10586     const Index columns_number = columns.size();
10587 
10588     const Index raw_columns_number = has_rows_labels ? columns_number+1 : columns_number;
10589 
10590     string line;
10591 
10592     Tensor<string, 1> tokens;
10593 
10594     string token;
10595 
10596     unsigned sample_index = 0;
10597     unsigned variable_index = 0;
10598     unsigned column_index = 0;
10599 
10600     // Skip header
10601 
10602     if(has_columns_names)
10603     {
10604         while(file.good())
10605         {
10606             getline(file, line);
10607 
10608             trim(line);
10609 
10610             if(line.empty()) continue;
10611 
10612             break;
10613         }
10614     }
10615 
10616 
10617     // Read data
10618 
10619     cout << "Reading data..." << endl;
10620 
10621     while(file.good())
10622     {
10623         getline(file, line);
10624 
10625         trim(line);
10626 
10627         erase(line, '"');
10628 
10629         if(line.empty()) continue;
10630 
10631         tokens = get_tokens(line, separator_char);
10632 
10633         variable_index = 0;
10634         column_index = 0;
10635 
10636         for(Index j = 0; j < raw_columns_number; j++)
10637         {
10638 
10639             trim(tokens(j));
10640 
10641             if(has_rows_labels && j ==0)
10642             {
10643                 rows_labels(sample_index) = tokens(j);
10644                 continue;
10645             }
10646             else if(columns(column_index).type == Numeric)
10647             {
10648 
10649                 if(tokens(j) == missing_values_label || tokens(j).empty())
10650                 {
10651                     data(sample_index, variable_index) = static_cast<type>(NAN);
10652                     variable_index++;
10653                 }
10654                 else
10655                 {
10656                     try
10657                     {
10658 
10659                         data(sample_index, variable_index) = static_cast<type>(stod(tokens(j)));
10660                         variable_index++;
10661                     }
10662                     catch (invalid_argument)
10663                     {
10664                         ostringstream buffer;
10665 
10666                         buffer << "OpenNN Exception: DataSet class.\n"
10667                                << "void read_csv_3_complete() method.\n"
10668                                << "Sample " << sample_index << "; Invalid number: " << tokens(j) << "\n";
10669 
10670                         throw logic_error(buffer.str());
10671                     }
10672                 }
10673             }
10674             else if(columns(column_index).type == DateTime)
10675             {
10676                 if(tokens(j) == missing_values_label || tokens(j).empty())
10677                 {
10678                     data(sample_index, variable_index) = static_cast<type>(NAN);
10679                     variable_index++;
10680                 }
10681                 else
10682                 {
10683                     data(sample_index, variable_index) = static_cast<type>(date_to_timestamp(tokens(j), gmt));
10684                     variable_index++;
10685                 }
10686             }
10687             else if(columns(column_index).type == Categorical)
10688             {
10689                 for(Index k = 0; k < columns(column_index).get_categories_number(); k++)
10690                 {
10691                     if(tokens(j) == missing_values_label)
10692                     {
10693                         data(sample_index, variable_index) = static_cast<type>(NAN);
10694                     }
10695                     else if(tokens(j) == columns(column_index).categories(k))
10696                     {
10697                         data(sample_index, variable_index) = 1.0;
10698                     }
10699 
10700                     variable_index++;
10701                 }
10702             }
10703             else if(columns(column_index).type == Binary)
10704             {
10705                 if(tokens(j) == missing_values_label)
10706                 {
10707                     data(sample_index, variable_index) = static_cast<type>(NAN);
10708                 }
10709                 else if(columns(column_index).categories.size() > 0 && tokens(j) == columns(column_index).categories(0))
10710                 {
10711                     data(sample_index, variable_index) = 1.0;
10712                 }
10713                 else if(tokens(j) == columns(column_index).name)
10714                 {
10715                     data(sample_index, variable_index) = 1.0;
10716                 }
10717 
10718                 variable_index++;
10719             }
10720 
10721             column_index++;
10722         }
10723 
10724         sample_index++;
10725     }
10726 
10727     const Index data_file_preview_index = has_columns_names ? 3 : 2;
10728 
10729     data_file_preview(data_file_preview_index) = tokens;
10730 
10731     cout << "Data read succesfully..." << endl;
10732 
10733     file.close();
10734 
10735     // Check binary
10736     cout << "Checking binary columns..." << endl;
10737 
10738     set_binary_simple_columns();
10739 
10740     // Check Constant and DateTime to unused
10741 
10742     cout << "Checking constant columns..." << endl;
10743 
10744     variable_index = 0;
10745 
10746     for(Index column = 0; column < get_columns_number(); column++)
10747     {
10748         if(columns(column).type == Numeric)
10749         {
10750             const Tensor<type, 1> numeric_column = data.chip(variable_index, 1);
10751 
10752             if(standard_deviation(numeric_column) - static_cast<type>(0) < static_cast<type>(1.0-3))
10753             {
10754 
10755                 columns(column).type = Constant;
10756                 columns(column).column_use = UnusedVariable;
10757             }
10758 
10759             variable_index++;
10760         }
10761         else if(columns(column).type == DateTime)
10762         {
10763             columns(column).column_use = UnusedVariable;
10764             variable_index++;
10765         }
10766         else if(columns(column).type == Constant)
10767         {
10768             columns(column).column_use = UnusedVariable;
10769 
10770             variable_index++;
10771         }
10772         else if(columns(column).type == Binary)
10773         {
10774             if(columns(column).get_categories_number() == 1)
10775             {
10776                 columns(column).type = Constant;
10777                 columns(column).column_use = UnusedVariable;
10778                 columns(column).set_categories_uses(UnusedVariable);
10779             }
10780 
10781             variable_index++;
10782         }
10783         else if(columns(column).type == Categorical)
10784         {
10785             if(columns(column).get_categories_number() == 1)
10786             {
10787                 columns(column).type = Constant;
10788                 columns(column).column_use = UnusedVariable;
10789                 columns(column).set_categories_uses(UnusedVariable);
10790             }
10791 
10792             variable_index += columns(column).get_categories_number();
10793         }
10794     }
10795 }
10796 
10797 
check_separators(const string & line) const10798 void DataSet::check_separators(const string& line) const
10799 {
10800     if(line.find(',') == string::npos
10801             && line.find(';') == string::npos
10802             && line.find(' ') == string::npos
10803             && line.find('\t') == string::npos)
10804     {
10805         return;
10806     }
10807 
10808     const char separator_char = get_separator_char();
10809 
10810     if(line.find(separator_char) == string::npos)
10811     {
10812         const string message =
10813             "Error: " + get_separator_string() + " separator not found in data file " + data_file_name + ".";
10814 
10815         throw logic_error(message);
10816     }
10817 
10818     if(separator == Space)
10819     {
10820         if(line.find(',') != string::npos)
10821         {
10822             const string message =
10823                 "Error: Found comma (',') in data file " + data_file_name + ", but separator is space (' ').";
10824 
10825             throw logic_error(message);
10826         }
10827         if(line.find(';') != string::npos)
10828         {
10829             const string message =
10830                 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is space (' ').";
10831 
10832             throw logic_error(message);
10833         }
10834     }
10835     else if(separator == Tab)
10836     {
10837         if(line.find(',') != string::npos)
10838         {
10839             const string message =
10840                 "Error: Found comma (',') in data file " + data_file_name + ", but separator is tab ('   ').";
10841 
10842             throw logic_error(message);
10843         }
10844         if(line.find(';') != string::npos)
10845         {
10846             const string message =
10847                 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is tab ('   ').";
10848 
10849             throw logic_error(message);
10850         }
10851     }
10852     else if(separator == Comma)
10853     {
10854         if(line.find(";") != string::npos)
10855         {
10856             const string message =
10857                 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is comma (',').";
10858 
10859             throw logic_error(message);
10860         }
10861     }
10862     else if(separator == Semicolon)
10863     {
10864         if(line.find(",") != string::npos)
10865         {
10866             const string message =
10867                 "Error: Found comma (',') in data file " + data_file_name + ", but separator is semicolon (';'). " + line;
10868 
10869             throw logic_error(message);
10870         }
10871     }
10872 }
10873 
10874 
check_special_characters(const string & line) const10875 void DataSet::check_special_characters(const string & line) const
10876 {
10877     if(line.find_first_of("|@#~€¬^*") != std::string::npos)
10878     {
10879         const string message =
10880             "Error: found special characters in line: " + line + ". Please, review the document.";
10881 
10882         throw logic_error(message);
10883     }
10884 
10885 #ifdef __unix__
10886     if(line.find("\r") != std::string::npos)
10887     {
10888         const string message =
10889                 "Error: mixed break line characters in line: " + line + ". Please, review the document.";
10890         throw logic_error(message);
10891     }
10892 #endif
10893 }
10894 
10895 
has_binary_columns() const10896 bool DataSet::has_binary_columns() const
10897 {
10898     const Index variables_number = columns.size();
10899 
10900     for(Index i = 0; i < variables_number; i++)
10901     {
10902         if(columns(i).type == Binary) return true;
10903     }
10904 
10905     return false;
10906 }
10907 
10908 
has_categorical_columns() const10909 bool DataSet::has_categorical_columns() const
10910 {
10911     const Index variables_number = columns.size();
10912 
10913     for(Index i = 0; i < variables_number; i++)
10914     {
10915         if(columns(i).type == Categorical) return true;
10916     }
10917 
10918     return false;
10919 }
10920 
10921 
has_time_columns() const10922 bool DataSet::has_time_columns() const
10923 {
10924     const Index variables_number = columns.size();
10925 
10926     for(Index i = 0; i < variables_number; i++)
10927     {
10928         if(columns(i).type == DateTime) return true;
10929     }
10930 
10931     return false;
10932 }
10933 
10934 
has_selection() const10935 bool DataSet::has_selection() const
10936 {
10937     if(get_selection_samples_number() == 0) return false;
10938 
10939     return true;
10940 }
10941 
10942 
count_nan_columns() const10943 Tensor<Index, 1> DataSet::count_nan_columns() const
10944 {
10945     const Index columns_number = get_columns_number();
10946     const Index rows_number = get_samples_number();
10947 
10948     Tensor<Index, 1> nan_columns(get_columns_number());
10949     nan_columns.setZero();
10950 
10951     for(Index column_index = 0; column_index < columns_number; column_index++)
10952     {
10953         const Index current_variable_index = get_variable_indices(column_index)(0);
10954 
10955         for(Index row_index = 0; row_index < rows_number; row_index++)
10956         {
10957             if(isnan(data(row_index,current_variable_index)))
10958             {
10959                 nan_columns(column_index) = nan_columns(column_index) + 1;
10960             }
10961         }
10962     }
10963 
10964     return nan_columns;
10965 }
10966 
10967 
count_rows_with_nan() const10968 Index DataSet::count_rows_with_nan() const
10969 {
10970     Index rows_with_nan = 0;
10971 
10972     const Index rows_number = data.dimension(0);
10973     const Index columns_number = data.dimension(1);
10974 
10975     bool has_nan = true;
10976 
10977     for(Index row_index = 0; row_index < rows_number; row_index++)
10978     {
10979         has_nan = false;
10980 
10981         for(Index column_index = 0; column_index < columns_number; column_index++)
10982         {
10983             if(isnan(data(row_index, column_index)))
10984             {
10985                 has_nan = true;
10986                 break;
10987             }
10988         }
10989 
10990         if(has_nan) rows_with_nan++;
10991     }
10992 
10993     return rows_with_nan;
10994 }
10995 
10996 
count_nan() const10997 Index DataSet::count_nan() const
10998 {
10999     const Index rows_number = data.dimension(0);
11000     const Index columns_number = data.dimension(1);
11001 
11002     Index count = 0;
11003 
11004     #pragma omp parallel for reduction(+: count)
11005 
11006     for(Index row_index = 0; row_index < rows_number; row_index++)
11007     {
11008         for(Index column_index = 0; column_index < columns_number; column_index++)
11009         {
11010             if(isnan(data(row_index, column_index))) count++;
11011         }
11012     }
11013 
11014     return count;
11015 }
11016 
11017 
/// Sets the stored total number of missing values.
/// @param new_missing_values_number New total number of missing (NaN) values.

void DataSet::set_missing_values_number(const Index& new_missing_values_number)
{
    missing_values_number = new_missing_values_number;
}
11022 
11023 
/// Recounts the NaN values in the data matrix and stores the total.

void DataSet::set_missing_values_number()
{
    missing_values_number = count_nan();
}
11028 
11029 
/// Sets the stored per-column missing-values counts.
/// @param new_columns_missing_values_number Vector with one count per column.

void DataSet::set_columns_missing_values_number(const Tensor<Index, 1>& new_columns_missing_values_number)
{
    columns_missing_values_number = new_columns_missing_values_number;
}
11034 
11035 
/// Recounts the NaN values per column and stores the resulting vector.

void DataSet::set_columns_missing_values_number()
{
    columns_missing_values_number = count_nan_columns();
}
11040 
11041 
/// Sets the stored number of rows that contain missing values.
/// @param new_rows_missing_values_number New number of rows with at least one NaN.

void DataSet::set_rows_missing_values_number(const Index& new_rows_missing_values_number)
{
    rows_missing_values_number = new_rows_missing_values_number;
}
11046 
11047 
/// Recounts the rows containing at least one NaN and stores the total.

void DataSet::set_rows_missing_values_number()
{
    rows_missing_values_number = count_rows_with_nan();
}
11052 
11053 
/// Renames repeated columns and variables so that all names become unique.
/// Repeated column names are suffixed "_1", "_2", ...; note that EVERY
/// occurrence is renamed, including the first, so the original bare name
/// disappears. Repeated variable names (possible once categorical/binary
/// columns expand into several variables) are disambiguated by appending
/// the owning column's name — but only for categorical columns.

void DataSet::fix_repeated_names()
{
    // Fix columns names

    const Index columns_number = columns.size();

    // Count occurrences of each column name

    std::map<std::string, Index> columns_count_map;

    for(Index i = 0; i < columns_number; i++)
    {
        auto result = columns_count_map.insert(std::pair<std::string, Index>(columns(i).name, 1));

        // insert() fails when the name is already present; bump its count
        if (!result.second) result.first->second++;
    }

    for (auto & element : columns_count_map)
    {
        if(element.second > 1)
        {
            const string repeated_name = element.first;
            Index repeated_index = 1;

            // Append a running suffix to every column sharing this name

            for(Index i = 0; i < columns.size(); i++)
            {
                if(columns(i).name == repeated_name)
                {
                    columns(i).name = columns(i).name + "_" + std::to_string(repeated_index);
                    repeated_index++;
                }
            }
        }
    }

    // Fix variables names

    if(has_categorical_columns() || has_binary_columns())
    {
        Tensor<string, 1> variables_names = get_variables_names();

        const Index variables_number = variables_names.size();

        // Count occurrences of each variable name

        std::map<std::string, Index> variables_count_map;

        for(Index i = 0; i < variables_number; i++)
        {
            auto result = variables_count_map.insert(std::pair<std::string, Index>(variables_names(i), 1));

            if (!result.second) result.first->second++;
        }

        for (auto & element : variables_count_map)
        {
            if(element.second > 1)
            {
                const string repeated_name = element.first;

                for(Index i = 0; i < variables_number; i++)
                {
                    if(variables_names(i) == repeated_name)
                    {
                        const Index column_index = get_column_index(i);

                        // Only variables of categorical columns are disambiguated
                        if(columns(column_index).type != Categorical) continue;

                        variables_names(i) = variables_names(i) + "_" + columns(column_index).name;
                    }
                }
            }
        }

        set_variables_names(variables_names);
    }
}
11127 
11128 
/// Returns a copy of @p old_vector enlarged by one element, with
/// @p new_string (an index value, despite the parameter name) placed last.
/// @param old_vector Vector of indices to copy.
/// @param new_string Index appended at the end of the returned vector.

Tensor<Index, 1> DataSet::push_back(const Tensor<Index, 1>& old_vector, const Index& new_string) const
{
    const Index previous_size = old_vector.size();

    Tensor<Index, 1> extended_vector(previous_size + 1);

    for(Index position = 0; position < previous_size; position++)
    {
        extended_vector(position) = old_vector(position);
    }

    extended_vector(previous_size) = new_string;

    return extended_vector;
}
11143 
11144 
/// Returns a copy of @p old_vector enlarged by one element, with
/// @p new_string placed last.
/// @param old_vector Vector of strings to copy.
/// @param new_string String appended at the end of the returned vector.

Tensor<string, 1> DataSet::push_back(const Tensor<string, 1>& old_vector, const string& new_string) const
{
    const Index previous_size = old_vector.size();

    Tensor<string, 1> extended_vector(previous_size + 1);

    for(Index position = 0; position < previous_size; position++)
    {
        extended_vector(position) = old_vector(position);
    }

    extended_vector(previous_size) = new_string;

    return extended_vector;
}
11159 
11160 
initialize_sequential_eigen_tensor(Tensor<Index,1> & new_tensor,const Index & start,const Index & step,const Index & end) const11161 void DataSet::initialize_sequential_eigen_tensor(Tensor<Index, 1>& new_tensor,
11162         const Index& start, const Index& step, const Index& end) const
11163 {
11164     const Index new_size = (end-start)/step+1;
11165 
11166     new_tensor.resize(new_size);
11167     new_tensor(0) = start;
11168 
11169     for(Index i = 1; i < new_size-1; i++)
11170     {
11171         new_tensor(i) = new_tensor(i-1)+step;
11172     }
11173 
11174     new_tensor(new_size-1) = end;
11175 }
11176 
11177 
intialize_sequential_eigen_type_tensor(Tensor<type,1> & new_tensor,const type & start,const type & step,const type & end) const11178 void DataSet::intialize_sequential_eigen_type_tensor(Tensor<type, 1>& new_tensor,
11179         const type& start, const type& step, const type& end) const
11180 {
11181     const Index new_size = (end-start)/step+1;
11182 
11183     new_tensor.resize(new_size);
11184     new_tensor(0) = start;
11185 
11186     for(Index i = 1; i < new_size-1; i++)
11187     {
11188         new_tensor(i) = new_tensor(i-1)+step;
11189     }
11190 
11191     new_tensor(new_size-1) = end;
11192 }
11193 
11194 
/// Splits the given sample indices into consecutive batches of equal size.
/// When there are fewer samples than the requested batch size, a single batch
/// holding every sample is returned. Otherwise samples_number/batch_size full
/// batches are produced and any trailing remainder samples are discarded.
/// @param samples_indices Indices of the samples to distribute.
/// @param new_batch_size Requested number of samples per batch.
/// @return Matrix of shape (batches_number, batch_size) with sample indices.

Tensor<Index, 2> DataSet::split_samples(const Tensor<Index, 1>& samples_indices, const Index & new_batch_size) const
{
    const Index samples_number = samples_indices.dimension(0);

    const bool undersized = samples_number < new_batch_size;

    const Index batch_size = undersized ? samples_number : new_batch_size;

    const Index batches_number = undersized ? 1 : samples_number/batch_size;

    Tensor<Index, 2> batches(batches_number, batch_size);

    Index next_sample = 0;

    for(Index batch = 0; batch < batches_number; ++batch)
    {
        for(Index position = 0; position < batch_size; ++position)
        {
            batches(batch, position) = samples_indices(next_sample);

            ++next_sample;
        }
    }

    return batches;
}
11230 
fill_submatrix(const Tensor<type,2> & matrix,const Tensor<Index,1> & rows_indices,const Tensor<Index,1> & columns_indices,type * submatrix_pointer)11231 void DataSet::fill_submatrix(const Tensor<type, 2>& matrix,
11232           const Tensor<Index, 1>& rows_indices,
11233           const Tensor<Index, 1>& columns_indices, type* submatrix_pointer)
11234 {
11235     const Index rows_number = rows_indices.size();
11236     const Index columns_number = columns_indices.size();
11237 
11238     const type* matrix_pointer = matrix.data();
11239 
11240 #pragma omp parallel for
11241 
11242     for(Index j = 0; j < columns_number; j++)
11243     {
11244         const type* matrix_column_pointer = matrix_pointer + matrix.dimension(0)*columns_indices[j];
11245         type* submatrix_column_pointer = submatrix_pointer + rows_number*j;
11246 
11247         const type* value_pointer = nullptr;
11248         const Index* rows_indices_pointer = rows_indices.data();
11249         for(Index i = 0; i < rows_number; i++)
11250         {
11251             value_pointer = matrix_column_pointer + *rows_indices_pointer;
11252             rows_indices_pointer++;
11253             *submatrix_column_pointer = *value_pointer;
11254             submatrix_column_pointer++;
11255         }
11256     }
11257 }
11258 
11259 
/// Fills the batch buffers with the given samples of the data set.
/// Inputs are copied into inputs_2d when the input dimensions are flat (rank 1);
/// the rank-3 (image) path is currently disabled. Targets are always copied
/// into targets_2d.
/// @param samples Row indices of the samples to load.
/// @param inputs Column indices of the input variables.
/// @param targets Column indices of the target variables.

void DataSet::Batch::fill(const Tensor<Index, 1>& samples,
                          const Tensor<Index, 1>& inputs,
                          const Tensor<Index, 1>& targets)
{
    const Tensor<type, 2>& data = data_set_pointer->get_data();

    const Tensor<Index, 1>& input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();

    if(input_variables_dimensions.size() == 1)
    {
        data_set_pointer->fill_submatrix(data, samples, inputs, inputs_2d.data());
    }
    else if(input_variables_dimensions.size() == 3)
    {
        // NOTE(review): the 4-D image path below is commented out, so rank-3
        // inputs currently leave inputs_4d untouched here — confirm intended.
/*
        const Index channels_number = input_variables_dimensions(0);
        const Index rows_number = input_variables_dimensions(1);
        const Index columns_number = input_variables_dimensions(2);
        inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);
        Index index = 0;
        for(Index image = 0; image < samples_number; image++)
        {
            index = 0;
            for(Index channel = 0; channel < channels_number; channel++)
            {
                for(Index row = 0; row < rows_number; row++)
                {
                    for(Index column = 0; column < columns_number; column++)
                    {
                        inputs_4d(image, channel, row, column) = data(image, index);
                        index++;
                    }
                }
            }
        }
*/
    }
    data_set_pointer->fill_submatrix(data, samples, targets, targets_2d.data());
}
11299 
11300 
Batch(const Index & new_samples_number,DataSet * new_data_set_pointer)11301 DataSet::Batch::Batch(const Index& new_samples_number, DataSet* new_data_set_pointer)
11302 {
11303     samples_number = new_samples_number;
11304 
11305     data_set_pointer = new_data_set_pointer;
11306 
11307     const Index input_variables_number = data_set_pointer->get_input_variables_number();
11308     const Index target_variables_number = data_set_pointer->get_target_variables_number();
11309 
11310     const Tensor<Index, 1> input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();
11311 
11312     if(input_variables_dimensions.rank() == 1)
11313     {
11314         inputs_2d.resize(samples_number, input_variables_number);
11315     }
11316     else if(input_variables_dimensions.rank() == 3)
11317     {
11318         const Index channels_number = input_variables_dimensions(0);
11319         const Index rows_number = input_variables_dimensions(1);
11320         const Index columns_number = input_variables_dimensions(2);
11321 
11322         inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);
11323     }
11324 
11325 
11326     targets_2d.resize(samples_number, target_variables_number);
11327 }
11328 
11329 
get_samples_number() const11330 Index DataSet::Batch::get_samples_number() const
11331 {
11332     return samples_number;
11333 }
11334 
11335 
print()11336 void DataSet::Batch::print()
11337 {
11338     cout << "Batch structure" << endl;
11339 
11340     cout << "Inputs:" << endl;
11341     cout << inputs_2d << endl;
11342 
11343     cout << "Targets:" << endl;
11344     cout << targets_2d << endl;
11345 }
11346 
11347 
shuffle()11348 void DataSet::shuffle()
11349 {
11350     const Index data_rows = data.dimension(0);
11351     const Index data_columns = data.dimension(1);
11352 
11353     Tensor<Index, 1> indices(data_rows);
11354 
11355     for(Index i = 0; i < data_rows; i++) indices(i) = i;
11356 
11357     random_shuffle(&indices(0), &indices(data_rows-1));
11358 
11359     Tensor<type, 2> new_data(data_rows, data_columns);
11360     Tensor<string, 1> new_rows_labels(data_rows);
11361 
11362     Index index = 0;
11363 
11364     for(Index i = 0; i < data_rows; i++)
11365     {
11366         index = indices(i);
11367 
11368         new_rows_labels(i) = rows_labels(index);
11369 
11370         for(Index j = 0; j < data_columns; j++)
11371         {
11372             new_data(i,j) = data(index,j);
11373         }
11374     }
11375 
11376     data = new_data;
11377     rows_labels = new_rows_labels;
11378 }
11379 
11380 
get_has_rows_labels() const11381 bool DataSet::get_has_rows_labels() const
11382 {
11383     return this->has_rows_labels;
11384 }
11385 
11386 }
11387 
11388 
11389 // OpenNN: Open Neural Networks Library.
11390 // Copyright(C) 2005-2020 Artificial Intelligence Techniques, SL.
11391 //
11392 // This library is free software; you can redistribute it and/or
11393 // modify it under the terms of the GNU Lesser General Public
11394 // License as published by the Free Software Foundation; either
11395 // version 2.1 of the License, or any later version.
11396 //
11397 // This library is distributed in the hope that it will be useful,
11398 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11399 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11400 // Lesser General Public License for more details.
11401 
11402 // You should have received a copy of the GNU Lesser General Public
11403 // License along with this library; if not, write to the Free Software
11404 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
11405