1 // OpenNN: Open Neural Networks Library
2 // www.opennn.net
3 //
4 // D A T A S E T C L A S S
5 //
6 // Artificial Intelligence Techniques SL
7 // artelnics@artelnics.com
8
9 #include "data_set.h"
10 #include <omp.h>
11
12 using namespace OpenNN;
13
14 namespace OpenNN
15 {
16
17 /// Default constructor.
18 /// It creates a data set object with zero samples and zero inputs and target variables.
19 /// It also initializes the rest of class members to their default values.
20
DataSet()21 DataSet::DataSet()
22 {
23 set();
24
25 set_default();
26 }
27
28
29 /// Default constructor. It creates a data set object from data Eigen Matrix.
30 /// It also initializes the rest of class members to their default values.
31 /// @param data Data Tensor<type, 2>.
32
DataSet(const Tensor<type,2> & data)33 DataSet::DataSet(const Tensor<type, 2>& data)
34 {
35 set(data);
36
37 set_default();
38 }
39
40
41 /// Samples and variables number constructor.
42 /// It creates a data set object with given samples and variables numbers.
43 /// All the variables are set as inputs.
44 /// It also initializes the rest of class members to their default values.
45 /// @param new_samples_number Number of samples in the data set.
46 /// @param new_variables_number Number of variables.
47
DataSet(const Index & new_samples_number,const Index & new_variables_number)48 DataSet::DataSet(const Index& new_samples_number, const Index& new_variables_number)
49 {
50 set(new_samples_number, new_variables_number);
51
52 set_default();
53 }
54
55
56 /// Samples number, input variables number and target variables number constructor.
57 /// It creates a data set object with given samples and inputs and target variables numbers.
58 /// It also initializes the rest of class members to their default values.
59 /// @param new_samples_number Number of samples in the data set.
60 /// @param new_inputs_number Number of input variables.
61 /// @param new_targets_number Number of target variables.
62
DataSet(const Index & new_samples_number,const Index & new_inputs_number,const Index & new_targets_number)63 DataSet::DataSet(const Index& new_samples_number, const Index& new_inputs_number, const Index& new_targets_number)
64 {
65 set(new_samples_number, new_inputs_number, new_targets_number);
66
67 set_default();
68 }
69
70
71 /// File and separator constructor. It creates a data set object by loading the object members from a data file.
72 /// It also sets a separator.
73 /// Please mind about the file format. This is specified in the User's Guide.
74 /// @param data_file_name Data file file name.
75 /// @param separator Data file file name.
76
DataSet(const string & data_file_name,const char & separator,const bool & new_has_columns_names)77 DataSet::DataSet(const string& data_file_name, const char& separator, const bool& new_has_columns_names)
78 {
79 set();
80
81 set_default();
82
83 set_data_file_name(data_file_name);
84
85 set_separator(separator);
86
87 set_has_columns_names(new_has_columns_names);
88
89 read_csv();
90 }
91
92
/// Destructor.
/// Releases the thread pool and the thread pool device owned by this data set.

DataSet::~DataSet()
{
    delete non_blocking_thread_pool;
    delete thread_pool_device;
}
100
101
/// Returns true if messages from this class can be displayed on the screen,
/// or false if messages from this class can't be displayed on the screen.

const bool& DataSet::get_display() const
{
    return display;
}
109
110
111 /// Column default constructor
112
Column()113 DataSet::Column::Column()
114 {
115 name = "";
116 column_use = Input;
117 type = Numeric;
118 categories.resize(0);
119 categories_uses.resize(0);
120 }
121
122
123 /// Column default constructor
124
Column(const string & new_name,const VariableUse & new_column_use,const ColumnType & new_type,const Tensor<string,1> & new_categories,const Tensor<VariableUse,1> & new_categories_uses)125 DataSet::Column::Column(const string& new_name,
126 const VariableUse& new_column_use,
127 const ColumnType& new_type,
128 const Tensor<string, 1>& new_categories,
129 const Tensor<VariableUse, 1>& new_categories_uses)
130 {
131 name = new_name;
132 column_use = new_column_use;
133 type = new_type;
134 categories = new_categories;
135 categories_uses = new_categories_uses;
136 }
137
/// Column destructor.
/// The column owns no raw resources, so nothing needs to be released.

DataSet::Column::~Column()
{}
142
143
144 /// Sets the use of the column and of the categories.
145 /// @param new_column_use New use of the column.
146
set_use(const VariableUse & new_column_use)147 void DataSet::Column::set_use(const VariableUse& new_column_use)
148 {
149 column_use = new_column_use;
150
151 for(Index i = 0; i < categories_uses.size(); i ++)
152 {
153 categories_uses(i) = new_column_use;
154 }
155 }
156
157
158 /// Sets the use of the column and of the categories.
159 /// @param new_column_use New use of the column in string format.
160
set_use(const string & new_column_use)161 void DataSet::Column::set_use(const string& new_column_use)
162 {
163 if(new_column_use == "Input")
164 {
165 set_use(Input);
166 }
167 else if(new_column_use == "Target")
168 {
169 set_use(Target);
170 }
171 else if(new_column_use == "Time")
172 {
173 set_use(Time);
174 }
175 else if(new_column_use == "Unused")
176 {
177 set_use(UnusedVariable);
178 }
179 else
180 {
181 ostringstream buffer;
182
183 buffer << "OpenNN Exception DataSet class.\n"
184 << "void set_use(const string&) method.\n"
185 << "Unknown use: " << new_column_use << "\n";
186
187 throw logic_error(buffer.str());
188 }
189 }
190
191
192 /// Sets the column type.
193 /// @param new_column_type Column type in string format.
194
set_type(const string & new_column_type)195 void DataSet::Column::set_type(const string& new_column_type)
196 {
197 if(new_column_type == "Numeric")
198 {
199 type = Numeric;
200 }
201 else if(new_column_type == "Binary")
202 {
203 type = Binary;
204 }
205 else if(new_column_type == "Categorical")
206 {
207 type = Categorical;
208 }
209 else if(new_column_type == "DateTime")
210 {
211 type = DateTime;
212 }
213 else if(new_column_type == "Constant")
214 {
215 type = Constant;
216 }
217 else
218 {
219 ostringstream buffer;
220
221 buffer << "OpenNN Exception: DataSet class.\n"
222 << "void Column::set_type(const string&) method.\n"
223 << "Column type not valid (" << new_column_type << ").\n";
224
225 throw logic_error(buffer.str());
226
227 }
228 }
229
230
231 /// Adds a category to the categories vector of this column.
232 /// It also adds a default use for the category
233 /// @param new_category String that contains the name of the new category
234
add_category(const string & new_category)235 void DataSet::Column::add_category(const string & new_category)
236 {
237 const Index old_categories_number = categories.size();
238
239 Tensor<string, 1> old_categories = categories;
240 Tensor<VariableUse, 1> old_categories_uses = categories_uses;
241
242 categories.resize(old_categories_number+1);
243 categories_uses.resize(old_categories_number+1);
244
245 for(Index category_index = 0; category_index < old_categories_number; category_index++)
246 {
247 categories(category_index) = old_categories(category_index);
248 categories_uses(category_index) = column_use;
249 }
250
251 categories(old_categories_number) = new_category;
252 categories_uses(old_categories_number) = column_use;
253 }
254
255
256 /// Sets the categories uses in the data set.
257 /// @param new_categories_uses String vector that contains the new categories of the data set.
258
set_categories_uses(const Tensor<string,1> & new_categories_uses)259 void DataSet::Column::set_categories_uses(const Tensor<string, 1>& new_categories_uses)
260 {
261 const Index new_categories_uses_number = new_categories_uses.size();
262
263 categories_uses.resize(new_categories_uses_number);
264
265 for(Index i = 0; i < new_categories_uses.size(); i++)
266 {
267 if(new_categories_uses(i) == "Input")
268 {
269 categories_uses(i) = Input;
270 }
271 else if(new_categories_uses(i) == "Target")
272 {
273 categories_uses(i) = Target;
274 }
275 else if(new_categories_uses(i) == "Time")
276 {
277 categories_uses(i) = Time;
278 }
279 else if(new_categories_uses(i) == "Unused"
280 || new_categories_uses(i) == "UnusedVariable")
281 {
282 categories_uses(i) = UnusedVariable;
283 }
284 else
285 {
286 ostringstream buffer;
287
288 buffer << "OpenNN Exception: DataSet class.\n"
289 << "void Column::set_categories_uses(const Tensor<string, 1>&) method.\n"
290 << "Category use not valid (" << new_categories_uses(i) << ").\n";
291
292 throw logic_error(buffer.str());
293
294 }
295 }
296 }
297
298
/// Sets the same use for every category of this column.
/// @param new_categories_use New use for all the categories.

void DataSet::Column::set_categories_uses(const VariableUse& new_categories_use)
{
    categories_uses.setConstant(new_categories_use);
}
306
307
/// Loads the members of this column (name, use, type and, for categorical
/// columns, the categories and their uses) from an XML document.
/// @param column_document XML document with Name, ColumnUse, Type and,
/// when the type is Categorical, Categories and CategoriesUses elements.
/// @throws logic_error If any required element is missing.

void DataSet::Column::from_XML(const tinyxml2::XMLDocument& column_document)
{
    ostringstream buffer;

    // Name

    const tinyxml2::XMLElement* name_element = column_document.FirstChildElement("Name");

    if(!name_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Name element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(name_element->GetText())
    {
        const string new_name = name_element->GetText();

        name = new_name;
    }

    // Column use

    const tinyxml2::XMLElement* column_use_element = column_document.FirstChildElement("ColumnUse");

    if(!column_use_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Column use element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(column_use_element->GetText())
    {
        const string new_column_use = column_use_element->GetText();

        set_use(new_column_use);
    }

    // Type

    const tinyxml2::XMLElement* type_element = column_document.FirstChildElement("Type");

    if(!type_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Type element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(type_element->GetText())
    {
        const string new_type = type_element->GetText();
        set_type(new_type);
    }

    // NOTE(review): write_XML emits Categories/CategoriesUses for Binary columns
    // too, but this reader only parses them for Categorical ones -- confirm that
    // Binary columns are expected to lose their categories on a round trip.

    if(type == Categorical)
    {
        // Categories

        const tinyxml2::XMLElement* categories_element = column_document.FirstChildElement("Categories");

        if(!categories_element)
        {
            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
                   << "Categories element is nullptr.\n";

            throw logic_error(buffer.str());
        }

        if(categories_element->GetText())
        {
            const string new_categories = categories_element->GetText();

            // Categories are serialized as a single ';'-separated string.

            categories = get_tokens(new_categories, ';');
        }

        // Categories uses

        const tinyxml2::XMLElement* categories_uses_element = column_document.FirstChildElement("CategoriesUses");

        if(!categories_uses_element)
        {
            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
                   << "Categories uses element is nullptr.\n";

            throw logic_error(buffer.str());
        }

        if(categories_uses_element->GetText())
        {
            const string new_categories_uses = categories_uses_element->GetText();

            set_categories_uses(get_tokens(new_categories_uses, ';'));
        }
    }
}
414
415
/// Serializes this column (name, use, type and, for categorical or binary
/// columns, the categories and their uses) into the given XML printer.
/// Categories and their uses are written as ';'-separated lists.
/// @param file_stream XML printer that receives the elements.

void DataSet::Column::write_XML(tinyxml2::XMLPrinter& file_stream) const
{
    // Name

    file_stream.OpenElement("Name");

    file_stream.PushText(name.c_str());

    file_stream.CloseElement();

    // Column use

    file_stream.OpenElement("ColumnUse");

    if(column_use == Input)
    {
        file_stream.PushText("Input");
    }
    else if (column_use == Target)
    {
        file_stream.PushText("Target");
    }
    else if (column_use == UnusedVariable)
    {
        file_stream.PushText("Unused");
    }
    else
    {
        // Any remaining use is written as "Time".

        file_stream.PushText("Time");
    }

    file_stream.CloseElement();

    // Type

    file_stream.OpenElement("Type");

    if(type == Numeric)
    {
        file_stream.PushText("Numeric");
    }
    else if (type == Binary)
    {
        file_stream.PushText("Binary");
    }
    else if (type == Categorical)
    {
        file_stream.PushText("Categorical");
    }
    else if(type == Constant)
    {
        file_stream.PushText("Constant");
    }
    else
    {
        // Any remaining type is written as "DateTime".

        file_stream.PushText("DateTime");
    }

    file_stream.CloseElement();

    if(type == Categorical || type == Binary)
    {
        // Binary columns without explicit categories produce no
        // Categories/CategoriesUses elements at all.

        if(categories.size() == 0) return;

        // Categories

        file_stream.OpenElement("Categories");

        for(Index i = 0; i < categories.size(); i++)
        {
            file_stream.PushText(categories(i).c_str());

            if(i != categories.size()-1)
            {
                file_stream.PushText(";");
            }
        }

        file_stream.CloseElement();

        // Categories uses

        file_stream.OpenElement("CategoriesUses");

        for(Index i = 0; i < categories_uses.size(); i++)
        {
            if(categories_uses(i) == Input)
            {
                file_stream.PushText("Input");
            }
            else if (categories_uses(i) == Target)
            {
                file_stream.PushText("Target");
            }
            else if (categories_uses(i) == Time)
            {
                file_stream.PushText("Time");
            }
            else
            {
                file_stream.PushText("Unused");
            }

            if(i != categories_uses.size()-1)
            {
                file_stream.PushText(";");
            }
        }

        file_stream.CloseElement();
    }
    /*else if(type == Binary)
    {
        if(categories.size() > 0)
        {
            // Categories

            file_stream.OpenElement("Categories");
            file_stream.PushText(categories(0).c_str());
            file_stream.PushText(";");
            file_stream.PushText(categories(1).c_str());
            file_stream.CloseElement();

            // Categories uses

            file_stream.OpenElement("CategoriesUses");

            if(categories_uses(0) == Input)
            {
                file_stream.PushText("Input");
            }
            else if (categories_uses(0) == Target)
            {
                file_stream.PushText("Target");
            }
            else if (categories_uses(0) == Time)
            {
                file_stream.PushText("Time");
            }
            else
            {
                file_stream.PushText("Unused");
            }

            file_stream.PushText(";");

            if(categories_uses(1) == Input)
            {
                file_stream.PushText("Input");
            }
            else if (categories_uses(1) == Target)
            {
                file_stream.PushText("Target");
            }
            else if (categories_uses(1) == Time)
            {
                file_stream.PushText("Time");
            }
            else
            {
                file_stream.PushText("Unused");
            }

            file_stream.CloseElement();
        }
    }*/
}
583
584
/// Returns the number of categories of this column.

Index DataSet::Column::get_categories_number() const
{
    return categories.size();
}
591
592
593 /// Returns the number of used categories.
594
get_used_categories_number() const595 Index DataSet::Column::get_used_categories_number() const
596 {
597 Index used_categories_number = 0;
598
599 for(Index i = 0; i < categories.size(); i++)
600 {
601 if(categories_uses(i) != UnusedVariable) used_categories_number++;
602 }
603
604 return used_categories_number;
605 }
606
607
608 /// Returns a string vector that contains the names of the used variables in the data set.
609
get_used_variables_names() const610 Tensor<string, 1> DataSet::Column::get_used_variables_names() const
611 {
612 Tensor<string, 1> used_variables_names;
613
614 if(type != Categorical && column_use != UnusedVariable)
615 {
616 used_variables_names.resize(1);
617 used_variables_names.setConstant(name);
618 }
619 else if(type == Categorical)
620 {
621 used_variables_names.resize(get_used_categories_number());
622
623 Index category_index = 0;
624
625 for(Index i = 0; i < categories.size(); i++)
626 {
627 if(categories_uses(i) != UnusedVariable)
628 {
629 used_variables_names(category_index) = categories(i);
630
631 category_index++;
632 }
633 }
634 }
635
636 return used_variables_names;
637 }
638
639
/// This method transforms the columns into time series for forecasting problems.
/// Each original column (except DateTime ones) is expanded into lags_number
/// input columns named "name_lag_k" followed by steps_ahead target columns
/// named "name_ahead_k".

void DataSet::transform_time_series_columns()
{
    const Index columns_number = get_columns_number();

    Tensor<Column, 1> new_columns;

    // A time column is dropped from the transformed layout, so one fewer
    // original column is replicated.

    if(has_time_columns())
    {
        new_columns.resize((columns_number-1)*(lags_number+steps_ahead));
    }
    else
    {
        new_columns.resize(columns_number*(lags_number+steps_ahead));
    }

    Index lag_index = lags_number - 1;
    Index ahead_index = 0;
    Index column_index = 0;
    Index new_column_index = 0;

    for(Index i = 0; i < columns_number*(lags_number+steps_ahead); i++)
    {
        column_index = i%columns_number;

        // NOTE(review): this `continue` also skips the lag/ahead counter update
        // at the bottom of the loop when the DateTime column is the last one --
        // confirm that DateTime columns are never in the last position.

        if(time_series_columns(column_index).type == DateTime)
        {
            continue;
        }

        if(i < lags_number*columns_number)
        {
            // Lagged copy of the column, used as input.

            new_columns(new_column_index).name = columns(column_index).name + "_lag_" + to_string(lag_index);
            new_columns(new_column_index).set_use(Input);

            new_columns(new_column_index).type = columns(column_index).type;
            new_columns(new_column_index).categories = columns(column_index).categories;
            new_columns(new_column_index).categories_uses = columns(column_index).categories_uses;

            new_column_index++;
        }
        else
        {
            // Ahead copy of the column, used as target.

            new_columns(new_column_index).name = columns(column_index).name + "_ahead_" + to_string(ahead_index);
            new_columns(new_column_index).set_use(Target);

            new_columns(new_column_index).type = columns(column_index).type;
            new_columns(new_column_index).categories = columns(column_index).categories;
            new_columns(new_column_index).categories_uses = columns(column_index).categories_uses;

            new_column_index++;
        }

        // Advance the lag counter once per full pass over the columns,
        // then start advancing the ahead counter.

        if(lag_index > 0 && column_index == columns_number - 1)
        {
            lag_index--;
        }
        else if(column_index == columns_number - 1)
        {
            ahead_index++;
        }
    }

    columns = new_columns;
}
706
707
708 /// Returns true if a given sample is to be used for training, selection or testing,
709 /// and false if it is to be unused.
710 /// @param index Sample index.
711
is_sample_used(const Index & index) const712 bool DataSet::is_sample_used(const Index& index) const
713 {
714 if(samples_uses(index) == UnusedSample)
715 {
716 return false;
717 }
718 else
719 {
720 return true;
721 }
722 }
723
724
725 /// Returns true if a given sample is to be unused and false in other case.
726 /// @param index Sample index.
727
is_sample_unused(const Index & index) const728 bool DataSet::is_sample_unused(const Index& index) const
729 {
730 if(samples_uses(index) == UnusedSample)
731 {
732 return true;
733 }
734 else
735 {
736 return false;
737 }
738 }
739
740
741 /// Returns a vector with the number of training, selection, testing
742 /// and unused samples.
743 /// The size of that vector is therefore four.
744
get_samples_uses_numbers() const745 Tensor<Index, 1> DataSet::get_samples_uses_numbers() const
746 {
747 Tensor<Index, 1> count(4);
748
749 const Index samples_number = get_samples_number();
750
751 for(Index i = 0; i < samples_number; i++)
752 {
753 if(samples_uses(i) == Training)
754 {
755 count(0)++;
756 }
757 else if(samples_uses(i) == Selection)
758 {
759 count(1)++;
760 }
761 else if(samples_uses(i) == Testing)
762 {
763 count(2)++;
764 }
765 else
766 {
767 count(3)++;
768 }
769 }
770
771 return count;
772 }
773
774
775 /// Returns a vector with the uses of the samples in percentages of the data set.
776 /// Uses: training, selection, testing and unused samples.
777 /// Note that the vector size is four.
778
get_samples_uses_percentages() const779 Tensor<type, 1> DataSet::get_samples_uses_percentages() const
780 {
781 const Index samples_number = get_samples_number();
782 const Index training_samples_number = get_training_samples_number();
783 const Index selection_samples_number = get_selection_samples_number();
784 const Index testing_samples_number = get_testing_samples_number();
785 const Index unused_samples_number = get_unused_samples_number();
786
787 const type training_samples_percentage = training_samples_number*100/static_cast<type>(samples_number);
788 const type selection_samples_percentage = selection_samples_number*100/static_cast<type>(samples_number);
789 const type testing_samples_percentage = testing_samples_number*100/static_cast<type>(samples_number);
790 const type unused_samples_percentage = unused_samples_number*100/static_cast<type>(samples_number);
791
792 Tensor<type, 1> samples_uses_percentage(4);
793
794 samples_uses_percentage.setValues({training_samples_percentage,
795 selection_samples_percentage,
796 testing_samples_percentage,
797 unused_samples_percentage});
798
799 return samples_uses_percentage;
800 }
801
802
803 /// Returns a string with the values of the sample corresponding to the given index.
804 /// The values will be separated by the given separator char.
805 /// @param sample_index Index of the sample.
806 /// @param separator Separator.
807
get_sample_string(const Index & sample_index,const string & separator) const808 string DataSet::get_sample_string(const Index& sample_index, const string& separator) const
809 {
810 const Tensor<type, 1> sample = data.chip(sample_index, 0);
811
812 string sample_string = "";
813
814 const Index columns_number = get_columns_number();
815
816 Index variable_index = 0;
817
818 for(Index i = 0; i < columns_number; i++)
819 {
820 if(columns(i).type == Numeric)
821 {
822 if(::isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
823 else sample_string += std::to_string(data(sample_index, variable_index));
824
825 variable_index++;
826 }
827 else if(columns(i).type == Binary)
828 {
829 if(::isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
830 else sample_string += columns(i).categories(static_cast<Index>(data(sample_index, variable_index)));
831
832 variable_index++;
833 }
834 else if(columns(i).type == DateTime)
835 {
836 // @todo do something
837
838 if(::isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
839 else sample_string += std::to_string(data(sample_index, variable_index));
840
841 variable_index++;
842 }
843 else if(columns(i).type == Categorical)
844 {
845 if(::isnan(data(sample_index, variable_index)))
846 {
847 sample_string += missing_values_label;
848 }
849 else
850 {
851 const Index categories_number = columns(i).get_categories_number();
852
853 for(Index j = 0; j < categories_number; j++)
854 {
855 if(abs(data(sample_index, variable_index+j) - static_cast<type>(1)) < std::numeric_limits<type>::min())
856 {
857 sample_string += columns(i).categories(j);
858 break;
859 }
860 }
861
862 variable_index += categories_number;
863 }
864 }
865
866 if(i != columns_number-1) sample_string += separator + " ";
867 }
868
869 return sample_string;
870 }
871
872
873 /// Returns the indices of the samples which will be used for training.
874
get_training_samples_indices() const875 Tensor<Index, 1> DataSet::get_training_samples_indices() const
876 {
877 const Index samples_number = get_samples_number();
878
879 const Index training_samples_number = get_training_samples_number();
880
881 Tensor<Index, 1> training_indices(training_samples_number);
882
883 Index count = 0;
884
885 for(Index i = 0; i < samples_number; i++)
886 {
887 if(samples_uses(i) == Training)
888 {
889 training_indices(count) = i;
890 count++;
891 }
892 }
893 return training_indices;
894 }
895
896
897 /// Returns the indices of the samples which will be used for selection.
898
get_selection_samples_indices() const899 Tensor<Index, 1> DataSet::get_selection_samples_indices() const
900 {
901 const Index samples_number = get_samples_number();
902
903 const Index selection_samples_number = get_selection_samples_number();
904
905 Tensor<Index, 1> selection_indices(selection_samples_number);
906
907 Index count = 0;
908
909 for(Index i = 0; i < samples_number; i++)
910 {
911 if(samples_uses(i) == Selection)
912 {
913 selection_indices(count) = i;
914 count++;
915 }
916 }
917
918 return selection_indices;
919 }
920
921
922 /// Returns the indices of the samples which will be used for testing.
923
get_testing_samples_indices() const924 Tensor<Index, 1> DataSet::get_testing_samples_indices() const
925 {
926 const Index samples_number = get_samples_number();
927
928 const Index testing_samples_number = get_testing_samples_number();
929
930 Tensor<Index, 1> testing_indices(testing_samples_number);
931
932 Index count = 0;
933
934 for(Index i = 0; i < samples_number; i++)
935 {
936 if(samples_uses(i) == Testing)
937 {
938 testing_indices(count) = i;
939 count++;
940 }
941 }
942
943 return testing_indices;
944 }
945
946
947 /// Returns the indices of the used samples(those which are not set unused).
948
get_used_samples_indices() const949 Tensor<Index, 1> DataSet::get_used_samples_indices() const
950 {
951 const Index samples_number = get_samples_number();
952
953 const Index used_samples_number = samples_number - get_unused_samples_number();
954
955 Tensor<Index, 1> used_indices(used_samples_number);
956
957 Index index = 0;
958
959 for(Index i = 0; i < samples_number; i++)
960 {
961 if(samples_uses(i) != UnusedSample)
962 {
963 used_indices(index) = i;
964 index++;
965 }
966 }
967
968 return used_indices;
969 }
970
971
972 /// Returns the indices of the samples set unused.
973
get_unused_samples_indices() const974 Tensor<Index, 1> DataSet::get_unused_samples_indices() const
975 {
976 const Index samples_number = get_samples_number();
977
978 const Index unused_samples_number = get_unused_samples_number();
979
980 Tensor<Index, 1> unused_indices(unused_samples_number);
981
982 Index count = 0;
983
984 for(Index i = 0; i < samples_number; i++)
985 {
986 if(samples_uses(i) == UnusedSample)
987 {
988 unused_indices(count) = i;
989 count++;
990 }
991 }
992
993 return unused_indices;
994 }
995
996
/// Returns the use of a single sample.
/// @param index Sample index.

DataSet::SampleUse DataSet::get_sample_use(const Index& index) const
{
    return samples_uses(index);
}
1004
1005
/// Returns the use of every sample (training, selection, testing or unused) in a vector.

const Tensor<DataSet::SampleUse,1 >& DataSet::get_samples_uses() const
{
    return samples_uses;
}
1012
1013
/// Returns a matrix where each row contains the indices of one batch of samples.
/// @param samples_indices Indices of the samples to split into batches.
/// @param batch_samples_number Requested number of samples per batch.
/// @param shuffle Is a boleean.
/// If shuffle is true, then the indices are shuffled into batches, and false otherwise
/// @param new_buffer_size Size of the intermediate buffer used by the streaming shuffle.
/// @todo In forecasting must be false.

Tensor<Index, 2> DataSet::get_batches(const Tensor<Index,1>& samples_indices,
                                      const Index& batch_samples_number,
                                      const bool& shuffle,
                                      const Index& new_buffer_size) const
{
    // Without shuffling, the indices are simply split in order.

    if(!shuffle) return split_samples(samples_indices, batch_samples_number);

    const Index samples_number = samples_indices.size();

    Index buffer_size = new_buffer_size;
    Index batches_number;
    Index batch_size = batch_samples_number;

    // Check batch size and samples number

    if(samples_number < batch_size)
    {
        // Fewer samples than one batch: return a single shuffled batch
        // containing all the samples.

        batches_number = 1;
        batch_size = samples_number;
        buffer_size = batch_size;

        Tensor<Index,1> samples_copy(samples_indices);

        Tensor<Index, 2> batches(batches_number, batch_size);

        // Shuffle

        // NOTE(review): std::random_shuffle is deprecated in C++14 and removed
        // in C++17, and rand() is never seeded here -- confirm the build's
        // language standard and the intended randomness source.

        random_shuffle(samples_copy.data(), samples_copy.data() + samples_copy.size());

        for(Index i = 0; i < batch_size; i++)
            batches(0,i) = samples_copy(i);

        return batches;

    }
    else
    {
        // Trailing samples that do not fill a whole batch are dropped.

        batches_number = samples_number / batch_size;
    }


    Tensor<Index, 2> batches(batches_number, batch_size);

    // The buffer initially holds the first buffer_size sample positions;
    // drawing from it and refilling from samples_indices yields a streaming
    // (reservoir-style) shuffle.

    Tensor<Index, 1> buffer(buffer_size);
    for(Index i = 0; i < buffer_size; i++) buffer(i) = i;

    Index next_index = buffer_size;
    Index random_index = 0;

    // Heuristic cases for batch shuffling

    if(batch_size < buffer_size)
    {
        Index diff = buffer_size/ batch_size;

        // Main Loop

        for(Index i = 0; i < batches_number; i++)
        {
            // Last batch

            // Once the remaining batches fit inside the buffer, drain the
            // buffer sequentially into them.

            if(i == batches_number-diff)
            {
                Index buffer_index = 0;

                for(Index k = batches_number-diff; k < batches_number; k++)
                {
                    for(Index j = 0; j < batch_size; j++)
                    {
                        batches(k,j) = buffer(buffer_index);

                        buffer_index++;
                    }
                }

                break;
            }

            // Shuffle batches

            for(Index j = 0; j < batch_size; j++)
            {
                random_index = static_cast<Index>(rand()%buffer_size);

                batches(i, j) = buffer(random_index);

                buffer(random_index) = samples_indices(next_index);

                next_index++;
            }
        }

        return batches;
    }
    else // buffer_size <= batch_size
    {

        // Main Loop

        for(Index i = 0; i < batches_number; i++)
        {
            // Last batch

            if(i == batches_number-1)
            {
                // Drain the shuffled buffer into the last batch, then top it
                // up with the remaining untouched indices.

                random_shuffle(buffer.data(), buffer.data() + buffer.size());

                if(batch_size <= buffer_size)
                {
                    for(Index j = 0; j < batch_size;j++)
                    {
                        batches(i,j) = buffer(j);
                    }
                }
                else //buffer_size < batch_size
                {
                    for(Index j = 0; j < buffer_size; j++)
                    {
                        batches(i,j) = buffer(j);
                    }

                    for(Index j = buffer_size; j < batch_size; j++)
                    {
                        batches(i,j) = samples_indices(next_index);

                        next_index++;
                    }
                }

                break;
            }

            // Shuffle batches

            for(Index j = 0; j < batch_size; j++)
            {
                random_index = static_cast<Index>(rand()%buffer_size);

                batches(i, j) = buffer(random_index);

                buffer(random_index) = samples_indices(next_index);

                next_index++;

            }
        }

        return batches;
    }
}
1169
1170
1171 /// Returns the number of samples in the data set which will be used for training.
1172
get_training_samples_number() const1173 Index DataSet::get_training_samples_number() const
1174 {
1175 const Index samples_number = get_samples_number();
1176
1177 Index training_samples_number = 0;
1178
1179 for(Index i = 0; i < samples_number; i++)
1180 {
1181 if(samples_uses(i) == Training)
1182 {
1183 training_samples_number++;
1184 }
1185 }
1186
1187 return training_samples_number;
1188 }
1189
1190
1191 /// Returns the number of samples in the data set which will be used for selection.
1192
get_selection_samples_number() const1193 Index DataSet::get_selection_samples_number() const
1194 {
1195 const Index samples_number = get_samples_number();
1196
1197 Index selection_samples_number = 0;
1198
1199 for(Index i = 0; i < samples_number; i++)
1200 {
1201 if(samples_uses(i) == Selection)
1202 {
1203 selection_samples_number++;
1204 }
1205 }
1206
1207 return selection_samples_number;
1208 }
1209
1210
1211 /// Returns the number of samples in the data set which will be used for testing.
1212
get_testing_samples_number() const1213 Index DataSet::get_testing_samples_number() const
1214 {
1215 const Index samples_number = get_samples_number();
1216
1217 Index testing_samples_number = 0;
1218
1219 for(Index i = 0; i < samples_number; i++)
1220 {
1221 if(samples_uses(i) == Testing)
1222 {
1223 testing_samples_number++;
1224 }
1225 }
1226
1227 return testing_samples_number;
1228 }
1229
1230
1231 /// Returns the total number of training, selection and testing samples,
1232 /// i.e. those which are not "Unused".
1233
get_used_samples_number() const1234 Index DataSet::get_used_samples_number() const
1235 {
1236 const Index samples_number = get_samples_number();
1237 const Index unused_samples_number = get_unused_samples_number();
1238
1239 return (samples_number - unused_samples_number);
1240 }
1241
1242
1243 /// Returns the number of samples in the data set which will neither be used
1244 /// for training, selection or testing.
1245
get_unused_samples_number() const1246 Index DataSet::get_unused_samples_number() const
1247 {
1248 const Index samples_number = get_samples_number();
1249
1250 Index unused_samples_number = 0;
1251
1252 for(Index i = 0; i < samples_number; i++)
1253 {
1254 if(samples_uses(i) == UnusedSample)
1255 {
1256 unused_samples_number++;
1257 }
1258 }
1259
1260 return unused_samples_number;
1261 }
1262
1263
1264 /// Sets all the samples in the data set for training.
1265
set_training()1266 void DataSet::set_training()
1267 {
1268 const Index samples_number = get_samples_number();
1269
1270 for(Index i = 0; i < samples_number; i++)
1271 {
1272 samples_uses(i) = Training;
1273 }
1274 }
1275
1276
1277 /// Sets all the samples in the data set for selection.
1278
set_selection()1279 void DataSet::set_selection()
1280 {
1281 const Index samples_number = get_samples_number();
1282
1283 for(Index i = 0; i < samples_number; i++)
1284 {
1285 samples_uses(i) = Selection;
1286 }
1287 }
1288
1289
1290 /// Sets all the samples in the data set for testing.
1291
set_testing()1292 void DataSet::set_testing()
1293 {
1294 const Index samples_number = get_samples_number();
1295
1296 for(Index i = 0; i < samples_number; i++)
1297 {
1298 samples_uses(i) = Testing;
1299 }
1300 }
1301
1302
1303 /// Sets samples with given indices in the data set for training.
1304 /// @param indices Indices vector with the index of samples in the data set for training.
1305
set_training(const Tensor<Index,1> & indices)1306 void DataSet::set_training(const Tensor<Index, 1>& indices)
1307 {
1308 Index index = 0;
1309
1310 for(Index i = 0; i < indices.size(); i++)
1311 {
1312 index = indices(i);
1313
1314 samples_uses(index) = Training;
1315 }
1316 }
1317
1318
1319 /// Sets samples with given indices in the data set for selection.
1320 /// @param indices Indices vector with the index of samples in the data set for selection.
1321
set_selection(const Tensor<Index,1> & indices)1322 void DataSet::set_selection(const Tensor<Index, 1>& indices)
1323 {
1324 Index index = 0;
1325
1326 for(Index i = 0; i < indices.size(); i++)
1327 {
1328 index = indices(i);
1329
1330 samples_uses(index) = Selection;
1331 }
1332 }
1333
1334
1335 /// Sets samples with given indices in the data set for testing.
1336 /// @param indices Indices vector with the index of samples in the data set for testing.
1337
set_testing(const Tensor<Index,1> & indices)1338 void DataSet::set_testing(const Tensor<Index, 1>& indices)
1339 {
1340 Index index = 0;
1341
1342 for(Index i = 0; i < indices.size(); i++)
1343 {
1344 index = indices(i);
1345
1346 samples_uses(index) = Testing;
1347 }
1348 }
1349
1350
1351 /// Sets all the samples in the data set for unused.
1352
set_samples_unused()1353 void DataSet::set_samples_unused()
1354 {
1355 const Index samples_number = get_samples_number();
1356
1357 for(Index i = 0; i < samples_number; i++)
1358 {
1359 samples_uses(i) = UnusedSample;
1360 }
1361 }
1362
1363
1364 /// Sets samples with given indices in the data set for unused.
1365 /// @param indices Indices vector with the index of samples in the data set for unused.
1366
set_samples_unused(const Tensor<Index,1> & indices)1367 void DataSet::set_samples_unused(const Tensor<Index, 1>& indices)
1368 {
1369 for(Index i = 0; i < static_cast<Index>(indices.size()); i++)
1370 {
1371 const Index index = indices(i);
1372
1373 samples_uses(index) = UnusedSample;
1374 }
1375 }
1376
1377
1378 /// Sets the use of a single sample.
1379 /// @param index Index of sample.
1380 /// @param new_use Use for that sample.
1381
set_sample_use(const Index & index,const SampleUse & new_use)1382 void DataSet::set_sample_use(const Index& index, const SampleUse& new_use)
1383 {
1384 samples_uses(index) = new_use;
1385
1386 }
1387
1388
1389 /// Sets the use of a single sample from a string.
1390 /// @param index Index of sample.
1391 /// @param new_use String with the use name("Training", "Selection", "Testing" or "Unused")
1392
set_sample_use(const Index & index,const string & new_use)1393 void DataSet::set_sample_use(const Index& index, const string& new_use)
1394 {
1395 if(new_use == "Training")
1396 {
1397 samples_uses(index) = Training;
1398 }
1399 else if(new_use == "Selection")
1400 {
1401 samples_uses(index) = Selection;
1402 }
1403 else if(new_use == "Testing")
1404 {
1405 samples_uses(index) = Testing;
1406 }
1407 else if(new_use == "Unused")
1408 {
1409 samples_uses(index) = UnusedSample;
1410 }
1411 else
1412 {
1413 ostringstream buffer;
1414
1415 buffer << "OpenNN Exception DataSet class.\n"
1416 << "void set_sample_use(const string&) method.\n"
1417 << "Unknown use: " << new_use << "\n";
1418
1419 throw logic_error(buffer.str());
1420 }
1421 }
1422
1423
1424 /// Sets new uses to all the samples from a single vector.
1425 /// @param new_uses vector of use structures.
1426 /// The size of given vector must be equal to the number of samples.
1427
set_samples_uses(const Tensor<SampleUse,1> & new_uses)1428 void DataSet::set_samples_uses(const Tensor<SampleUse, 1>& new_uses)
1429 {
1430 const Index samples_number = get_samples_number();
1431
1432 #ifdef __OPENNN_DEBUG__
1433
1434 const Index new_uses_size = new_uses.size();
1435
1436 if(new_uses_size != samples_number)
1437 {
1438 ostringstream buffer;
1439
1440 buffer << "OpenNN Exception: DataSet class.\n"
1441 << "void set_samples_uses(const Tensor<SampleUse, 1>&) method.\n"
1442 << "Size of uses(" << new_uses_size << ") must be equal to number of samples(" << samples_number << ").\n";
1443
1444 throw logic_error(buffer.str());
1445 }
1446
1447 #endif
1448
1449 for(Index i = 0; i < samples_number; i++)
1450 {
1451 samples_uses(i) = new_uses(i);
1452 }
1453 }
1454
1455
1456 /// Sets new uses to all the samples from a single vector of strings.
1457 /// @param new_uses vector of use strings.
1458 /// Possible values for the elements are "Training", "Selection", "Testing" and "Unused".
1459 /// The size of given vector must be equal to the number of samples.
1460
set_samples_uses(const Tensor<string,1> & new_uses)1461 void DataSet::set_samples_uses(const Tensor<string, 1>& new_uses)
1462 {
1463 const Index samples_number = get_samples_number();
1464
1465 ostringstream buffer;
1466
1467 #ifdef __OPENNN_DEBUG__
1468
1469 const Index new_uses_size = new_uses.size();
1470
1471 if(new_uses_size != samples_number)
1472 {
1473 buffer << "OpenNN Exception: DataSet class.\n"
1474 << "void set_samples_uses(const Tensor<string, 1>&) method.\n"
1475 << "Size of uses(" << new_uses_size << ") must be equal to number of samples(" << samples_number << ").\n";
1476
1477 throw logic_error(buffer.str());
1478 }
1479
1480 #endif
1481
1482 for(Index i = 0; i < samples_number; i++)
1483 {
1484 if(new_uses(i).compare("Training") == 0 || new_uses(i).compare("0") == 0)
1485 {
1486 samples_uses(i) = Training;
1487 }
1488 else if(new_uses(i).compare("Selection") == 0 || new_uses(i).compare("1") == 0)
1489 {
1490 samples_uses(i) = Selection;
1491 }
1492 else if(new_uses(i).compare("Testing") == 0 || new_uses(i).compare("2") == 0)
1493 {
1494 samples_uses(i) = Testing;
1495 }
1496 else if(new_uses(i).compare("Unused") == 0 || new_uses(i).compare("3") == 0)
1497 {
1498 samples_uses(i) = UnusedSample;
1499 }
1500 else
1501 {
1502 buffer << "OpenNN Exception DataSet class.\n"
1503 << "void set_samples_uses(const Tensor<string, 1>&) method.\n"
1504 << "Unknown use: " << new_uses(i) << ".\n";
1505
1506 throw logic_error(buffer.str());
1507 }
1508 }
1509 }
1510
1511
1512 /// Creates new training, selection and testing indices at random.
1513 /// @param training_samples_ratio Ratio of training samples in the data set.
1514 /// @param selection_samples_ratio Ratio of selection samples in the data set.
1515 /// @param testing_samples_ratio Ratio of testing samples in the data set.
1516
split_samples_random(const type & training_samples_ratio,const type & selection_samples_ratio,const type & testing_samples_ratio)1517 void DataSet::split_samples_random(const type& training_samples_ratio,
1518 const type& selection_samples_ratio,
1519 const type& testing_samples_ratio)
1520 {
1521
1522 const Index used_samples_number = get_used_samples_number();
1523
1524 if(used_samples_number == 0) return;
1525
1526 const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1527
1528 // Get number of samples for training, selection and testing
1529
1530 const Index selection_samples_number = static_cast<Index>(selection_samples_ratio*used_samples_number/total_ratio);
1531 const Index testing_samples_number = static_cast<Index>(testing_samples_ratio*used_samples_number/total_ratio);
1532 const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1533
1534 const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1535
1536 if(sum_samples_number != used_samples_number)
1537 {
1538 ostringstream buffer;
1539
1540 buffer << "OpenNN Warning: DataSet class.\n"
1541 << "void split_samples_random(const type&, const type&, const type&) method.\n"
1542 << "Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1543
1544 throw logic_error(buffer.str());
1545 }
1546
1547 const Index samples_number = get_samples_number();
1548
1549 Tensor<Index, 1> indices;
1550
1551 initialize_sequential_eigen_tensor(indices, 0, 1, samples_number-1);
1552
1553 random_shuffle(indices.data(), indices.data() + indices.size());
1554
1555 Index count = 0;
1556
1557 for(Index i = 0; i < samples_uses.size(); i++)
1558 {
1559 if(samples_uses(i) == UnusedSample) count ++;
1560 }
1561
1562 Index i = 0;
1563 Index index;
1564
1565 // Training
1566
1567 Index count_training = 0;
1568
1569 while(count_training != training_samples_number)
1570 {
1571 index = indices(i);
1572
1573 if(samples_uses(index) != UnusedSample)
1574 {
1575 samples_uses(index)= Training;
1576 count_training++;
1577 }
1578
1579 i++;
1580 }
1581
1582 // Selection
1583
1584 Index count_selection = 0;
1585
1586 while(count_selection != selection_samples_number)
1587 {
1588 index = indices(i);
1589
1590 if(samples_uses(index) != UnusedSample)
1591 {
1592 samples_uses(index) = Selection;
1593 count_selection++;
1594 }
1595
1596 i++;
1597 }
1598
1599 // Testing
1600
1601
1602 Index count_testing = 0;
1603
1604 while(count_testing != testing_samples_number)
1605 {
1606 index = indices(i);
1607
1608 if(samples_uses(index) != UnusedSample)
1609 {
1610 samples_uses(index) = Testing;
1611 count_testing++;
1612 }
1613
1614 i++;
1615 }
1616
1617 for(Index i = 0; i < samples_uses.size(); i++)
1618 {
1619 if(samples_uses(i) == UnusedSample)
1620 {
1621 cout << "Sample " << i << " is unused" << endl;
1622 }
1623 }
1624
1625
1626 }
1627
1628
1629 /// Creates new training, selection and testing indices with sequential indices.
1630 /// @param training_samples_ratio Ratio of training samples in the data set.
1631 /// @param selection_samples_ratio Ratio of selection samples in the data set.
1632 /// @param testing_samples_ratio Ratio of testing samples in the data set.
1633
split_samples_sequential(const type & training_samples_ratio,const type & selection_samples_ratio,const type & testing_samples_ratio)1634 void DataSet::split_samples_sequential(const type& training_samples_ratio,
1635 const type& selection_samples_ratio,
1636 const type& testing_samples_ratio)
1637 {
1638 const Index used_samples_number = get_used_samples_number();
1639
1640 if(used_samples_number == 0) return;
1641
1642 const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1643
1644 // Get number of samples for training, selection and testing
1645
1646 const Index selection_samples_number = static_cast<Index>(selection_samples_ratio*used_samples_number/total_ratio);
1647 const Index testing_samples_number = static_cast<Index>(testing_samples_ratio*used_samples_number/total_ratio);
1648 const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1649
1650 const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1651
1652 if(sum_samples_number != used_samples_number)
1653 {
1654 ostringstream buffer;
1655
1656 buffer << "OpenNN Warning: Samples class.\n"
1657 << "void split_samples_sequential(const type&, const type&, const type&) method.\n"
1658 << "Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1659
1660 throw logic_error(buffer.str());
1661 }
1662
1663 Index i = 0;
1664
1665 // Training
1666
1667 Index count_training = 0;
1668
1669 while(count_training != training_samples_number)
1670 {
1671 if(samples_uses(i) != UnusedSample)
1672 {
1673 samples_uses(i) = Training;
1674 count_training++;
1675 }
1676
1677 i++;
1678 }
1679
1680 // Selection
1681
1682 Index count_selection = 0;
1683
1684 while(count_selection != selection_samples_number)
1685 {
1686 if(samples_uses(i) != UnusedSample)
1687 {
1688 samples_uses(i) = Selection;
1689 count_selection++;
1690 }
1691
1692 i++;
1693 }
1694
1695 // Testing
1696
1697 Index count_testing = 0;
1698
1699 while(count_testing != testing_samples_number)
1700 {
1701 if(samples_uses(i) != UnusedSample)
1702 {
1703 samples_uses(i) = Testing;
1704 count_testing++;
1705 }
1706 i++;
1707 }
1708 }
1709
1710
1711 /// This method separates the dataset into n-groups to validate a model with limited data.
1712 /// @param k Number of folds that a given data sample is given to be split into.
1713 /// @param fold_index.
1714 /// @todo Low priority
1715
void DataSet::set_k_fold_cross_validation_samples_uses(const Index& k, const Index& fold_index)
{
    const Index samples_number = get_samples_number();

    // Integer division: when samples_number is not a multiple of k, the
    // trailing samples_number % k samples never fall into any test fold.
    const Index fold_size = samples_number/k;

    // Half-open range [start, end) of the samples reserved for testing.
    const Index start = fold_index*fold_size;
    const Index end = start + fold_size;

    // First mark every used sample for training (ratios 1/0/0)...
    split_samples_random(1, 0, 0);

    // ...then carve out the requested fold as the testing set.
    // NOTE(review): this also overwrites samples marked UnusedSample inside
    // the fold range — confirm that is intended.
    for(Index i = start; i < end; i++)
    {
        samples_uses(i) = Testing;
    }
}
1732
1733
1734 /// This method sets the n columns of the dataset by default,
1735 /// i.e. until column n-1 are Input and column n is Target.
1736
set_default_columns_uses()1737 void DataSet::set_default_columns_uses()
1738 {
1739 const Index size = columns.size();
1740
1741 if(size == 0)
1742 {
1743 return;
1744 }
1745 else if(size == 1)
1746 {
1747 columns(0).set_use(UnusedVariable);
1748 }
1749 else
1750 {
1751 set_input();
1752
1753 for(Index i = columns.size()-1; i >= 0; i--)
1754 {
1755 if(columns(i).type == Constant) continue;
1756 if(columns(i).type == Binary) continue;
1757 if(columns(i).type == Categorical) continue;
1758
1759 columns(i).set_use(Target);
1760 break;
1761 }
1762
1763 input_variables_dimensions.resize(1);
1764 }
1765 }
1766
1767
1768 /// This method sets the n columns of the dataset by default,
1769 /// i.e. until column n-1 are Input and column n is Target.
1770
set_default_classification_columns_uses()1771 void DataSet::set_default_classification_columns_uses()
1772 {
1773 const Index size = columns.size();
1774
1775 if(size == 0)
1776 {
1777 return;
1778 }
1779 else if(size == 1)
1780 {
1781 columns(0).set_use(UnusedVariable);
1782 }
1783 else
1784 {
1785 set_input();
1786
1787 for(Index i = columns.size()-1; i >= 0; i--)
1788 {
1789 if(columns(i).type == Constant) continue;
1790
1791 if(columns(i).type == Binary)
1792 {
1793 columns(i).set_use(Target);
1794 break;
1795 }
1796 else if(columns(i).type == Categorical)
1797 {
1798 columns(i).set_use(Target);
1799 break;
1800 }
1801 }
1802
1803 input_variables_dimensions.resize(1);
1804 }
1805 }
1806
1807
1808 /// This method puts the names of the columns in the dataset.
1809 /// This is used when the dataset does not have a header,
1810 /// the default names are: column_0, column_1, ..., column_n.
1811
set_default_columns_names()1812 void DataSet::set_default_columns_names()
1813 {
1814 const Index size = columns.size();
1815
1816 if(size == 0)
1817 {
1818 return;
1819 }
1820 else if(size == 1)
1821 {
1822 return;
1823 }
1824 else
1825 {
1826 Index input_index = 1;
1827 Index target_index = 2;
1828
1829 for(Index i = 0; i < size; i++)
1830 {
1831 if(columns(i).column_use == Input)
1832 {
1833 columns(i).name = "input_" + std::to_string(input_index+1);
1834 input_index++;
1835 }
1836 else if(columns(i).column_use == Target)
1837 {
1838 columns(i).name = "target_" + std::to_string(target_index+1);
1839 target_index++;
1840 }
1841 }
1842 }
1843 }
1844
1845
/// Sets the name of a single column.
/// @param column_index Index of the column whose name is set.
/// @param new_name New name for that column.
1849
void DataSet::set_column_name(const Index& column_index, const string& new_name)
{
    // Overwrites the stored name of the column at the given position.
    columns(column_index).name = new_name;
}
1854
1855
1856 /// Returns the use of a single variable.
1857 /// @param index Index of variable.
1858
get_variable_use(const Index & index) const1859 DataSet::VariableUse DataSet::get_variable_use(const Index& index) const
1860 {
1861 return get_variables_uses()(index);
1862 }
1863
1864
1865 /// Returns a vector containing the use of the column, without taking into account the categories.
1866
DataSet::VariableUse DataSet::get_column_use(const Index & index) const
{
    // The use stored on the column itself, ignoring per-category uses.
    return columns(index).column_use;
}
1871
1872
1873 /// Returns the uses of each columns of the data set.
1874
get_columns_uses() const1875 Tensor<DataSet::VariableUse, 1> DataSet::get_columns_uses() const
1876 {
1877 const Index columns_number = get_columns_number();
1878
1879 Tensor<DataSet::VariableUse, 1> columns_uses(columns_number);
1880
1881 for (Index i = 0; i < columns_number; i++)
1882 {
1883 columns_uses(i) = columns(i).column_use;
1884 }
1885
1886 return columns_uses;
1887 }
1888
1889
1890 /// Returns a vector containing the use of each column, including the categories.
1891 /// The size of the vector is equal to the number of variables.
1892
get_variables_uses() const1893 Tensor<DataSet::VariableUse, 1> DataSet::get_variables_uses() const
1894 {
1895 const Index columns_number = get_columns_number();
1896 const Index variables_number = get_variables_number();
1897
1898 Tensor<VariableUse, 1> variables_uses(variables_number);
1899
1900 Index index = 0;
1901
1902 for(Index i = 0; i < columns_number; i++)
1903 {
1904 if(columns(i).type == Categorical)
1905 {
1906 for(Index i = 0; i < (columns(i).categories_uses).size(); i++)
1907 {
1908 variables_uses(i + index) = (columns(i).categories_uses)(i);
1909 }
1910 index += columns(i).categories.size();
1911 }
1912 else
1913 {
1914 variables_uses(index) = columns(i).column_use;
1915 index++;
1916 }
1917 }
1918
1919 return variables_uses;
1920 }
1921
1922
1923 /// Returns the name of a single variable in the data set.
1924 /// @param index Index of variable.
1925
get_variable_name(const Index & variable_index) const1926 string DataSet::get_variable_name(const Index& variable_index) const
1927 {
1928 #ifdef __OPENNN_DEBUG__
1929
1930 const Index variables_number = get_variables_number();
1931
1932 if(variable_index >= variables_number)
1933 {
1934 ostringstream buffer;
1935
1936 buffer << "OpenNN Exception: DataSet class.\n"
1937 << "string& get_variable_name(const Index) method.\n"
1938 << "Index of variable("<<variable_index<<") must be less than number of variables("<<variables_number<<").\n";
1939
1940 throw logic_error(buffer.str());
1941 }
1942
1943 #endif
1944
1945 const Index columns_number = get_columns_number();
1946
1947 Index index = 0;
1948
1949 for(Index i = 0; i < columns_number; i++)
1950 {
1951 if(columns(i).type == Categorical)
1952 {
1953 for(Index j = 0; j < columns(i).get_categories_number(); j++)
1954 {
1955 if(index == variable_index)
1956 {
1957 return columns(i).categories(j);
1958 }
1959 else
1960 {
1961 index++;
1962 }
1963 }
1964 }
1965 else
1966 {
1967 if(index == variable_index)
1968 {
1969 return columns(i).name;
1970 }
1971 else
1972 {
1973 index++;
1974 }
1975 }
1976 }
1977
1978 return string();
1979 }
1980
1981
1982 /// Returns a string vector with the names of all the variables in the data set.
1983 /// The size of the vector is the number of variables.
1984
get_variables_names() const1985 Tensor<string, 1> DataSet::get_variables_names() const
1986 {
1987 const Index variables_number = get_variables_number();
1988
1989 Tensor<string, 1> variables_names(variables_number);
1990
1991 Index index = 0;
1992
1993 for(Index i = 0; i < columns.size(); i++)
1994 {
1995 if(columns(i).type == Categorical)
1996 {
1997 for(Index j = 0; j < columns(i).categories.size(); j++)
1998 {
1999 variables_names(index) = columns(i).categories(j);
2000
2001 index++;
2002 }
2003 }
2004 else
2005 {
2006 variables_names(index) = columns(i).name;
2007 index++;
2008 }
2009 }
2010
2011 return variables_names;
2012 }
2013
2014
2015 /// Returns the names of the input variables in the data set.
2016 /// The size of the vector is the number of input variables.
2017
get_input_variables_names() const2018 Tensor<string, 1> DataSet::get_input_variables_names() const
2019 {
2020 const Index input_variables_number = get_input_variables_number();
2021
2022 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
2023
2024 Tensor<string, 1> input_variables_names(input_variables_number);
2025
2026 Index index = 0;
2027
2028 for(Index i = 0; i < input_columns_indices.size(); i++)
2029 {
2030 Index input_index = input_columns_indices(i);
2031
2032 const Tensor<string, 1> current_used_variables_names = columns(input_index).get_used_variables_names();
2033
2034 for(Index j = 0; j < current_used_variables_names.size(); j++)
2035 {
2036 input_variables_names(index + j) = current_used_variables_names(j);
2037 }
2038
2039 index += current_used_variables_names.size();
2040 }
2041
2042 return input_variables_names;
2043 }
2044
2045
2046 /// Returns the names of the target variables in the data set.
2047 /// The size of the vector is the number of target variables.
2048
get_target_variables_names() const2049 Tensor<string, 1> DataSet::get_target_variables_names() const
2050 {
2051 const Index target_variables_number = get_target_variables_number();
2052
2053 const Tensor<Index, 1> target_columns_indices = get_target_columns_indices();
2054
2055 Tensor<string, 1> target_variables_names(target_variables_number);
2056
2057 Index index = 0;
2058
2059 for(Index i = 0; i < target_columns_indices.size(); i++)
2060 {
2061 Index target_index = target_columns_indices(i);
2062
2063 const Tensor<string, 1> current_used_variables_names = columns(target_index).get_used_variables_names();
2064
2065 for(Index j = 0; j < current_used_variables_names.size(); j++)
2066 {
2067 target_variables_names(index + j) = current_used_variables_names(j);
2068 }
2069
2070 index += current_used_variables_names.size();
2071 }
2072
2073 return target_variables_names;
2074 }
2075
2076
2077 /// Returns the dimensions of the input variables.
2078
const Tensor<Index, 1>& DataSet::get_input_variables_dimensions() const
{
    // Returns a reference to the member tensor; no copy is made.
    return input_variables_dimensions;
}
2083
2084
/// Returns the number of used variables, i.e. those whose use is input or target.
2086
get_used_variables_number() const2087 Index DataSet::get_used_variables_number() const
2088 {
2089 const Index variables_number = get_variables_number();
2090
2091 const Index unused_variables_number = get_unused_variables_number();
2092
2093 return (variables_number - unused_variables_number);
2094 }
2095
2096
/// Returns an indices vector with the positions of the input columns.
2098
get_input_columns_indices() const2099 Tensor<Index, 1> DataSet::get_input_columns_indices() const
2100 {
2101 const Index input_columns_number = get_input_columns_number();
2102
2103 Tensor<Index, 1> input_columns_indices(input_columns_number);
2104
2105 Index index = 0;
2106
2107 for(Index i = 0; i < columns.size(); i++)
2108 {
2109 if(columns(i).column_use == Input)
2110 {
2111 input_columns_indices(index) = i;
2112 index++;
2113 }
2114 }
2115
2116 return input_columns_indices;
2117 }
2118
2119
/// Returns an indices vector with the positions of the target columns.
2121
get_target_columns_indices() const2122 Tensor<Index, 1> DataSet::get_target_columns_indices() const
2123 {
2124 const Index target_columns_number = get_target_columns_number();
2125
2126 Tensor<Index, 1> target_columns_indices(target_columns_number);
2127
2128 Index index = 0;
2129
2130 for(Index i = 0; i < columns.size(); i++)
2131 {
2132 if(columns(i).column_use == Target)
2133 {
2134 target_columns_indices(index) = i;
2135 index++;
2136 }
2137 }
2138
2139 return target_columns_indices;
2140 }
2141
2142
/// Returns an indices vector with the positions of the unused columns.
2144
get_unused_columns_indices() const2145 Tensor<Index, 1> DataSet::get_unused_columns_indices() const
2146 {
2147 const Index unused_columns_number = get_unused_columns_number();
2148
2149 Tensor<Index, 1> unused_columns_indices(unused_columns_number);
2150
2151 Index index = 0;
2152
2153 for(Index i = 0; i < unused_columns_number; i++)
2154 {
2155
2156 if(columns(i).column_use == UnusedVariable)
2157 {
2158 unused_columns_indices(index) = i;
2159 index++;
2160 }
2161 }
2162
2163 return unused_columns_indices;
2164 }
2165
2166
/// Returns an indices vector with the positions of the used columns.
2168
get_used_columns_indices() const2169 Tensor<Index, 1> DataSet::get_used_columns_indices() const
2170 {
2171 const Index variables_number = get_variables_number();
2172
2173 const Index used_variables_number = get_used_variables_number();
2174
2175 Tensor<Index, 1> used_indices(used_variables_number);
2176
2177 Index index = 0;
2178
2179 for(Index i = 0; i < variables_number; i++)
2180 {
2181 if(columns(i).column_use == Input
2182 || columns(i).column_use == Target
2183 || columns(i).column_use == Time)
2184 {
2185 used_indices(index) = i;
2186 index++;
2187 }
2188 }
2189
2190 return used_indices;
2191 }
2192
2193
2194 /// Returns a string vector that contains the names of the columns.
2195
get_columns_names() const2196 Tensor<string, 1> DataSet::get_columns_names() const
2197 {
2198 const Index columns_number = get_columns_number();
2199
2200 Tensor<string, 1> columns_names(columns_number);
2201
2202 for(Index i = 0; i < columns_number; i++)
2203 {
2204 columns_names(i) = columns(i).name;
2205 }
2206
2207 return columns_names;
2208 }
2209
2210
get_time_series_columns_names() const2211 Tensor<string, 1> DataSet::get_time_series_columns_names() const
2212 {
2213 const Index columns_number = get_time_series_columns_number();
2214
2215 Tensor<string, 1> columns_names(columns_number);
2216
2217 for(Index i = 0; i < columns_number; i++)
2218 {
2219 columns_names(i) = time_series_columns(i).name;
2220 }
2221
2222 return columns_names;
2223 }
2224
2225 /// Returns a string vector that contains the names of the columns whose uses are Input.
2226
get_input_columns_names() const2227 Tensor<string, 1> DataSet::get_input_columns_names() const
2228 {
2229 const Index input_columns_number = get_input_columns_number();
2230
2231 Tensor<string, 1> input_columns_names(input_columns_number);
2232
2233 Index index = 0;
2234
2235 for(Index i = 0; i < columns.size(); i++)
2236 {
2237 if(columns(i).column_use == Input)
2238 {
2239 input_columns_names(index) = columns(i).name;
2240 index++;
2241 }
2242 }
2243
2244 return input_columns_names;
2245 }
2246
2247
2248 /// Returns a string vector which contains the names of the columns whose uses are Target.
2249
get_target_columns_names() const2250 Tensor<string, 1> DataSet::get_target_columns_names() const
2251 {
2252 const Index target_columns_number = get_target_columns_number();
2253
2254 Tensor<string, 1> target_columns_names(target_columns_number);
2255
2256 Index index = 0;
2257
2258 for(Index i = 0; i < columns.size(); i++)
2259 {
2260 if(columns(i).column_use == Target)
2261 {
2262 target_columns_names(index) = columns(i).name;
2263 index++;
2264 }
2265 }
2266
2267 return target_columns_names;
2268
2269 }
2270
2271
2272 /// Returns a string vector which contains the names of the columns used whether Input, Target or Time.
2273
get_used_columns_names() const2274 Tensor<string, 1> DataSet::get_used_columns_names() const
2275 {
2276 const Index columns_number = get_columns_number();
2277 const Index used_columns_number = get_used_columns_number();
2278
2279 Tensor<string, 1> names(used_columns_number);
2280
2281 Index index = 0 ;
2282
2283 for(Index i = 0; i < columns_number; i++)
2284 {
2285 if(columns(i).column_use != UnusedVariable)
2286 {
2287 names(index) = columns(i).name;
2288 index++;
2289 }
2290 }
2291
2292 return names;
2293 }
2294
2295
2296 /// Returns the number of columns whose uses are Input.
2297
get_input_columns_number() const2298 Index DataSet::get_input_columns_number() const
2299 {
2300 Index input_columns_number = 0;
2301
2302 for(Index i = 0; i < columns.size(); i++)
2303 {
2304 if(columns(i).column_use == Input)
2305 {
2306 input_columns_number++;
2307 }
2308 }
2309
2310 return input_columns_number;
2311 }
2312
2313
2314 /// Returns the number of columns whose uses are Target.
2315
get_target_columns_number() const2316 Index DataSet::get_target_columns_number() const
2317 {
2318 Index target_columns_number = 0;
2319
2320 for(Index i = 0; i < columns.size(); i++)
2321 {
2322 if(columns(i).column_use == Target)
2323 {
2324 target_columns_number++;
2325 }
2326 }
2327
2328 return target_columns_number;
2329 }
2330
2331
2332 /// Returns the number of columns whose uses are Time
2333
get_time_columns_number() const2334 Index DataSet::get_time_columns_number() const
2335 {
2336 Index time_columns_number = 0;
2337
2338 for(Index i = 0; i < columns.size(); i++)
2339 {
2340 if(columns(i).column_use == Time)
2341 {
2342 time_columns_number++;
2343 }
2344 }
2345
2346 return time_columns_number;
2347 }
2348
2349
2350 /// Returns the number of columns that are not used.
2351
get_unused_columns_number() const2352 Index DataSet::get_unused_columns_number() const
2353 {
2354 Index unused_columns_number = 0;
2355
2356 for(Index i = 0; i < columns.size(); i++)
2357 {
2358 if(columns(i).column_use == UnusedVariable)
2359 {
2360 unused_columns_number++;
2361 }
2362 }
2363
2364 return unused_columns_number;
2365 }
2366
2367
2368 /// Returns the number of columns that are used.
2369
get_used_columns_number() const2370 Index DataSet::get_used_columns_number() const
2371 {
2372 Index used_columns_number = 0;
2373
2374 for(Index i = 0; i < columns.size(); i++)
2375 {
2376 if(columns(i).column_use != UnusedVariable)
2377 {
2378 used_columns_number++;
2379 }
2380 }
2381
2382 return used_columns_number;
2383 }
2384
2385
2386 /// Returns the columns of the data set.
2387
Tensor<DataSet::Column, 1> DataSet::get_columns() const
{
    // Returns a copy of the whole columns tensor.
    return columns;
}
2392
/// Returns the time series columns of the data set.

Tensor<DataSet::Column, 1> DataSet::get_time_series_columns() const
{
    // Returns a copy of the time series columns tensor.
    return time_series_columns;
}
2397
2398 /// Returns the input columns of the data set.
2399
get_input_columns() const2400 Tensor<DataSet::Column, 1> DataSet::get_input_columns() const
2401 {
2402 const Index inputs_number = get_input_columns_number();
2403
2404 Tensor<Column, 1> input_columns(inputs_number);
2405 Index input_index = 0;
2406
2407 for(Index i = 0; i < columns.size(); i++)
2408 {
2409 if(columns(i).column_use == Input)
2410 {
2411 input_columns(input_index) = columns(i);
2412 input_index++;
2413 }
2414 }
2415
2416 return input_columns;
2417 }
2418
2419
2420 /// Returns the target columns of the data set.
2421
get_target_columns() const2422 Tensor<DataSet::Column, 1> DataSet::get_target_columns() const
2423 {
2424 const Index targets_number = get_target_columns_number();
2425
2426 Tensor<Column, 1> target_columns(targets_number);
2427 Index target_index = 0;
2428
2429 for(Index i = 0; i < columns.size(); i++)
2430 {
2431 if(columns(i).column_use == Target)
2432 {
2433 target_columns(target_index) = columns(i);
2434 target_index++;
2435 }
2436 }
2437
2438 return target_columns;
2439 }
2440
2441
2442 /// Returns the used columns of the data set.
2443 /// @todo
2444
Tensor<DataSet::Column, 1> DataSet::get_used_columns() const
{
    const Tensor<Index, 1> used_columns_indices = get_used_columns_indices();

    // TODO(review): not implemented — the subvector extraction below is commented
    // out, so this method always returns an empty tensor regardless of which
    // columns are actually used. Callers should not rely on its result yet.
    // return columns.get_subvector(used_columns_indices);

    return Tensor<DataSet::Column, 1>();
}
2453
2454
2455 /// Returns the number of columns in the data set.
2456
Index DataSet::get_columns_number() const
{
    // Number of columns, not variables: a categorical column counts once here.
    return columns.size();
}
2461
2462 /// Returns the number of columns in the time series.
2463
Index DataSet::get_time_series_columns_number() const
{
    return time_series_columns.size();
}
2468
2469 /// Returns the number of variables in the data set.
2470
get_variables_number() const2471 Index DataSet::get_variables_number() const
2472 {
2473 Index variables_number = 0;
2474
2475 for(Index i = 0; i < columns.size(); i++)
2476 {
2477 if(columns(i).type == Categorical)
2478 {
2479 variables_number += columns(i).categories.size();
2480 }
2481 else
2482 {
2483 variables_number++;
2484 }
2485 }
2486
2487 return variables_number;
2488 }
2489
2490
2491 /// Returns the number of input variables of the data set.
2492 /// Note that the number of variables does not have to equal the number of columns in the data set,
2493 /// because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.
2494
get_input_variables_number() const2495 Index DataSet::get_input_variables_number() const
2496 {
2497 Index inputs_number = 0;
2498
2499 for(Index i = 0; i < columns.size(); i++)
2500 {
2501 if(columns(i).type == Categorical)
2502 {
2503 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2504 {
2505 if(columns(i).categories_uses(j) == Input) inputs_number++;
2506 }
2507 }
2508 else if(columns(i).column_use == Input)
2509 {
2510 inputs_number++;
2511 }
2512 }
2513
2514 return inputs_number;
2515 }
2516
2517
2518 /// Returns the number of target variables of the data set.
2519
get_target_variables_number() const2520 Index DataSet::get_target_variables_number() const
2521 {
2522 Index targets_number = 0;
2523
2524 for(Index i = 0; i < columns.size(); i++)
2525 {
2526 if(columns(i).type == Categorical)
2527 {
2528 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2529 {
2530 if(columns(i).categories_uses(j) == Target) targets_number++;
2531 }
2532
2533 }
2534 else if(columns(i).column_use == Target)
2535 {
2536 targets_number++;
2537 }
2538 }
2539
2540 return targets_number;
2541 }
2542
2543
2544 /// Returns the number of variables which will neither be used as input nor as target.
2545
get_unused_variables_number() const2546 Index DataSet::get_unused_variables_number() const
2547 {
2548 Index unused_number = 0;
2549
2550 for(Index i = 0; i < columns.size(); i++)
2551 {
2552 if(columns(i).type == Categorical)
2553 {
2554 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2555 {
2556 if(columns(i).categories_uses(j) == UnusedVariable) unused_number++;
2557 }
2558
2559 }
2560 else if(columns(i).column_use == UnusedVariable)
2561 {
2562 unused_number++;
2563 }
2564 }
2565
2566 return unused_number;
2567 }
2568
2569
2570 /// Returns a variable index in the data set with given name.
2571 /// @param name Name of variable.
2572
Index DataSet::get_variable_index(const string& name) const
{
    const Index variables_number = get_variables_number();

    const Tensor<string, 1> variables_names = get_variables_names();

    for(Index i = 0; i < variables_number; i++)
    {
        if(variables_names(i) == name) return i;
    }

    // NOTE(review): a name that is not found falls through to 0, which is
    // indistinguishable from the index of the first variable. Consider
    // restoring the exception below so lookup failures are detectable.
    return 0;

    // throw exception("Exception: Index DataSet::get_variable_index(const string& name) const");
}
2588
2589
2590 /// Returns the indices of the unused variables.
2591
get_unused_variables_indices() const2592 Tensor<Index, 1> DataSet::get_unused_variables_indices() const
2593 {
2594 const Index unused_number = get_unused_variables_number();
2595
2596 const Tensor<Index, 1> unused_columns_indices = get_unused_columns_indices();
2597
2598 Tensor<Index, 1> unused_indices(unused_number);
2599
2600 Index unused_index = 0;
2601 Index unused_variable_index = 0;
2602
2603 for(Index i = 0; i < columns.size(); i++)
2604 {
2605 if(columns(i).type == Categorical)
2606 {
2607 const Index current_categories_number = columns(i).get_categories_number();
2608
2609 for(Index j = 0; j < current_categories_number; j++)
2610 {
2611 if(columns(i).categories_uses(j) == UnusedVariable)
2612 {
2613 unused_indices(unused_index) = unused_variable_index;
2614 unused_index++;
2615 }
2616
2617 unused_variable_index++;
2618 }
2619 }
2620 else if(columns(i).column_use == UnusedVariable)
2621 {
2622 unused_indices(unused_index) = i;
2623 unused_index++;
2624 unused_variable_index++;
2625 }
2626 else
2627 {
2628 unused_variable_index++;
2629 }
2630 }
2631
2632 return unused_indices;
2633 }
2634
2635
2636 /// Returns the indices of the used variables.
2637
get_used_variables_indices() const2638 Tensor<Index, 1> DataSet::get_used_variables_indices() const
2639 {
2640 const Index used_number = get_used_variables_number();
2641
2642 Tensor<Index, 1> used_indices(used_number);
2643
2644 Index used_index = 0;
2645 Index used_variable_index = 0;
2646
2647 for(Index i = 0; i < columns.size(); i++)
2648 {
2649 if(columns(i).type == Categorical)
2650 {
2651 const Index current_categories_number = columns(i).get_categories_number();
2652
2653 for(Index j = 0; j < current_categories_number; j++)
2654 {
2655 if(columns(i).categories_uses(j) != UnusedVariable)
2656 {
2657 used_indices(used_index) = used_variable_index;
2658 used_index++;
2659 }
2660
2661 used_variable_index++;
2662 }
2663 }
2664 else if(columns(i).column_use != UnusedVariable)
2665 {
2666 used_indices(used_index) = used_variable_index;
2667 used_index++;
2668 used_variable_index++;
2669 }
2670 else
2671 {
2672 used_variable_index++;
2673 }
2674 }
2675
2676 return used_indices;
2677 }
2678
2679
2680
2681 /// Returns the indices of the input variables.
2682
get_input_variables_indices() const2683 Tensor<Index, 1> DataSet::get_input_variables_indices() const
2684 {
2685 const Index inputs_number = get_input_variables_number();
2686
2687 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
2688
2689 Tensor<Index, 1> input_variables_indices(inputs_number);
2690
2691 Index input_index = 0;
2692 Index input_variable_index = 0;
2693
2694 for(Index i = 0; i < columns.size(); i++)
2695 {
2696
2697 if(columns(i).type == Categorical)
2698 {
2699 const Index current_categories_number = columns(i).get_categories_number();
2700
2701 for(Index j = 0; j < current_categories_number; j++)
2702 {
2703 if(columns(i).categories_uses(j) == Input)
2704 {
2705 input_variables_indices(input_index) = input_variable_index;
2706 input_index++;
2707 }
2708
2709 input_variable_index++;
2710 }
2711 }
2712 else if(columns(i).column_use == Input) // Binary, numeric
2713 {
2714 input_variables_indices(input_index) = input_variable_index;
2715 input_index++;
2716 input_variable_index++;
2717 }
2718 else
2719 {
2720 input_variable_index++;
2721 }
2722 }
2723
2724 return input_variables_indices;
2725 }
2726
2727
2728 /// Returns the indices of the target variables.
2729
get_target_variables_indices() const2730 Tensor<Index, 1> DataSet::get_target_variables_indices() const
2731 {
2732 const Index targets_number = get_target_variables_number();
2733
2734 const Tensor<Index, 1> target_columns_indices = get_target_columns_indices();
2735
2736 Tensor<Index, 1> target_variables_indices(targets_number);
2737
2738 Index target_index = 0;
2739 Index target_variable_index = 0;
2740
2741 for(Index i = 0; i < columns.size(); i++)
2742 {
2743 if(columns(i).type == Categorical)
2744 {
2745 const Index current_categories_number = columns(i).get_categories_number();
2746
2747 for(Index j = 0; j < current_categories_number; j++)
2748 {
2749 if(columns(i).categories_uses(j) == Target)
2750 {
2751 target_variables_indices(target_index) = target_variable_index;
2752 target_index++;
2753 }
2754
2755 target_variable_index++;
2756 }
2757 }
2758 else if(columns(i).column_use == Target) // Binary, numeric
2759 {
2760 target_variables_indices(target_index) = target_variable_index;
2761 target_index++;
2762 target_variable_index++;
2763 }
2764 else
2765 {
2766 target_variable_index++;
2767 }
2768 }
2769
2770 return target_variables_indices;
2771 }
2772
2773
2774 /// Sets the uses of the data set columns.
2775 /// @param new_columns_uses String vector that contains the new uses to be set,
2776 /// note that this vector needs to be the size of the number of columns in the data set.
2777
set_columns_uses(const Tensor<string,1> & new_columns_uses)2778 void DataSet::set_columns_uses(const Tensor<string, 1>& new_columns_uses)
2779 {
2780 const Index new_columns_uses_size = new_columns_uses.size();
2781
2782 if(new_columns_uses_size != columns.size())
2783 {
2784 ostringstream buffer;
2785
2786 buffer << "OpenNN Exception DataSet class.\n"
2787 << "void set_columns_uses(const Tensor<string, 1>&) method.\n"
2788 << "Size of columns uses ("
2789 << new_columns_uses_size << ") must be equal to columns size ("
2790 << columns.size() << "). \n";
2791
2792 throw logic_error(buffer.str());
2793 }
2794
2795 for(Index i = 0; i < new_columns_uses.size(); i++)
2796 {
2797 columns(i).set_use(new_columns_uses(i));
2798 }
2799
2800 input_variables_dimensions.resize(1);
2801 input_variables_dimensions.setConstant(get_input_variables_number());
2802 }
2803
2804
2805 /// Sets the uses of the data set columns.
2806 /// @param new_columns_uses DataSet::VariableUse vector that contains the new uses to be set,
2807 /// note that this vector needs to be the size of the number of columns in the data set.
2808
set_columns_uses(const Tensor<VariableUse,1> & new_columns_uses)2809 void DataSet::set_columns_uses(const Tensor<VariableUse, 1>& new_columns_uses)
2810 {
2811 const Index new_columns_uses_size = new_columns_uses.size();
2812
2813 if(new_columns_uses_size != columns.size())
2814 {
2815 ostringstream buffer;
2816
2817 buffer << "OpenNN Exception DataSet class.\n"
2818 << "void set_columns_uses(const Tensor<string, 1>&) method.\n"
2819 << "Size of columns uses (" << new_columns_uses_size << ") must be equal to columns size (" << columns.size() << "). \n";
2820
2821 throw logic_error(buffer.str());
2822 }
2823
2824 for(Index i = 0; i < new_columns_uses.size(); i++)
2825 {
2826 columns(i).set_use(new_columns_uses(i));
2827 }
2828
2829 input_variables_dimensions.resize(1);
2830 input_variables_dimensions.setConstant(get_input_variables_number());
2831 }
2832
2833
2834 /// Sets all columns in the dataset as unused columns.
2835
set_columns_unused()2836 void DataSet::set_columns_unused()
2837 {
2838 const Index columns_number = get_columns_number();
2839
2840 for(Index i = 0; i < columns_number; i++)
2841 {
2842 set_column_use(i, UnusedVariable);
2843 }
2844 }
2845
2846
2847 /// Sets all input columns in the dataset as unused columns.
2848
set_input_columns_unused()2849 void DataSet::set_input_columns_unused()
2850 {
2851 const Index columns_number = get_columns_number();
2852
2853 for(Index i = 0; i < columns_number; i++)
2854 {
2855 if(columns(i).column_use == DataSet::Input) set_column_use(i, UnusedVariable);
2856 }
2857 }
2858
2859
2860 /// Sets the use of a single column.
2861 /// @param index Index of column.
2862 /// @param new_use Use for that column.
2863
set_column_use(const Index & index,const VariableUse & new_use)2864 void DataSet::set_column_use(const Index& index, const VariableUse& new_use)
2865 {
2866 columns(index).column_use = new_use;
2867
2868 if(columns(index).type == Categorical)
2869 {
2870 columns(index).set_categories_uses(new_use);
2871 }
2872 }
2873
2874
2875 /// Sets the use of a single column.
2876 /// @param name Name of column.
2877 /// @param new_use Use for that column.
2878
set_column_use(const string & name,const VariableUse & new_use)2879 void DataSet::set_column_use(const string& name, const VariableUse& new_use)
2880 {
2881 const Index index = get_column_index(name);
2882
2883 set_column_use(index, new_use);
2884 }
2885
2886
2887 /// This method set the name of a single variable.
2888 /// @param index Index of variable.
2889 /// @param new_name Name of variable.
2890
set_variable_name(const Index & variable_index,const string & new_variable_name)2891 void DataSet::set_variable_name(const Index& variable_index, const string& new_variable_name)
2892 {
2893 #ifdef __OPENNN_DEBUG__
2894
2895 const Index variables_number = get_variables_number();
2896
2897 if(variable_index >= variables_number)
2898 {
2899 ostringstream buffer;
2900
2901 buffer << "OpenNN Exception: Variables class.\n"
2902 << "void set_name(const Index&, const string&) method.\n"
2903 << "Index of variable must be less than number of variables.\n";
2904
2905 throw logic_error(buffer.str());
2906 }
2907
2908 #endif
2909
2910 const Index columns_number = get_columns_number();
2911
2912 Index index = 0;
2913
2914 for(Index i = 0; i < columns_number; i++)
2915 {
2916 if(columns(i).type == Categorical)
2917 {
2918 for(Index j = 0; j < columns(i).get_categories_number(); j++)
2919 {
2920 if(index == variable_index)
2921 {
2922 columns(i).categories(j) = new_variable_name;
2923 return;
2924 }
2925 else
2926 {
2927 index++;
2928 }
2929 }
2930 }
2931 else
2932 {
2933 if(index == variable_index)
2934 {
2935 columns(i).name = new_variable_name;
2936 return;
2937 }
2938 else
2939 {
2940 index++;
2941 }
2942 }
2943 }
2944 }
2945
2946
2947 /// Sets new names for the variables in the data set from a vector of strings.
2948 /// The size of that vector must be equal to the total number of variables.
2949 /// @param new_names Name of variables.
2950
set_variables_names(const Tensor<string,1> & new_variables_names)2951 void DataSet::set_variables_names(const Tensor<string, 1>& new_variables_names)
2952 {
2953 #ifdef __OPENNN_DEBUG__
2954
2955 const Index variables_number = get_variables_number();
2956
2957 const Index size = new_variables_names.size();
2958
2959 if(size != variables_number)
2960 {
2961 ostringstream buffer;
2962
2963 buffer << "OpenNN Exception: Variables class.\n"
2964 << "void set_names(const Tensor<string, 1>&) method.\n"
2965 << "Size (" << size << ") must be equal to number of variables (" << variables_number << ").\n";
2966
2967 throw logic_error(buffer.str());
2968 }
2969
2970 #endif
2971
2972 const Index columns_number = get_columns_number();
2973
2974 Index index = 0;
2975
2976 for(Index i = 0; i < columns_number; i++)
2977 {
2978 if(columns(i).type == Categorical)
2979 {
2980 for(Index j = 0; j < columns(i).get_categories_number(); j++)
2981 {
2982 columns(i).categories(j) = new_variables_names(index);
2983 index++;
2984 }
2985 }
2986 else
2987 {
2988 columns(i).name = new_variables_names(index);
2989 index++;
2990 }
2991 }
2992 }
2993
2994
2995 /// Sets new names for the columns in the data set from a vector of strings.
2996 /// The size of that vector must be equal to the total number of variables.
2997 /// @param new_names Name of variables.
2998
set_columns_names(const Tensor<string,1> & new_names)2999 void DataSet::set_columns_names(const Tensor<string, 1>& new_names)
3000 {
3001 const Index new_names_size = new_names.size();
3002 const Index columns_number = get_columns_number();
3003
3004 if(new_names_size != columns_number)
3005 {
3006 ostringstream buffer;
3007
3008 buffer << "OpenNN Exception: DataSet class.\n"
3009 << "void set_columns_names(const Tensor<string, 1>&).\n"
3010 << "Size of names (" << new_names.size() << ") is not equal to columns number (" << columns_number << ").\n";
3011
3012 throw logic_error(buffer.str());
3013 }
3014
3015 for(Index i = 0; i < columns_number; i++)
3016 {
3017 columns(i).name = new_names(i);
3018 }
3019 }
3020
3021
3022 /// Sets all the variables in the data set as input variables.
3023
set_input()3024 void DataSet::set_input()
3025 {
3026 for(Index i = 0; i < columns.size(); i++)
3027 {
3028 if(columns(i).type == Constant) continue;
3029
3030 columns(i).set_use(Input);
3031 }
3032 }
3033
3034
3035 /// Sets all the variables in the data set as target variables.
3036
set_target()3037 void DataSet::set_target()
3038 {
3039 for(Index i = 0; i < columns.size(); i++)
3040 {
3041 columns(i).set_use(Target);
3042 }
3043 }
3044
3045
3046 /// Sets all the variables in the data set as unused variables.
3047
set_variables_unused()3048 void DataSet::set_variables_unused()
3049 {
3050 for(Index i = 0; i < columns.size(); i++)
3051 {
3052 columns(i).set_use(UnusedVariable);
3053 }
3054 }
3055
3056
3057 /// Sets a new number of variables in the variables object.
3058 /// All variables are set as inputs but the last one, which is set as targets.
3059 /// @param new_variables_number Number of variables.
3060
void DataSet::set_columns_number(const Index& new_variables_number)
{
    // Resize discards existing column metadata; uses are re-derived below.
    columns.resize(new_variables_number);

    set_default_columns_uses();
}
3067
3068
binarize_input_data(const type & threshold)3069 void DataSet::binarize_input_data(const type& threshold)
3070 {
3071 const Index samples_number = get_samples_number();
3072
3073 const Index input_variables_number = get_input_variables_number();
3074
3075 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3076
3077 for(Index i = 0; i < samples_number; i++)
3078 {
3079 for(Index j = 0; j < input_variables_number; i++)
3080 {
3081 const Index input_variable_index = input_variables_indices[j];
3082
3083 data(i,input_variable_index) < threshold
3084 ? data(i,input_variable_index) = 0
3085 : data(i,input_variable_index) = 1;
3086 }
3087 }
3088 }
3089
3090
/// Expands a binary column into a two-column one-hot encoding.
/// Value 1 maps to (0,1), value 0 maps to (1,0), anything else maps to (NAN,NAN).
/// @param column Binary column values to transform.

Tensor<type, 2> DataSet::transform_binary_column(const Tensor<type, 1>& column) const
{
    const Index rows_number = column.dimension(0);

    Tensor<type, 2> one_hot(rows_number, 2);
    one_hot.setZero();

    for(Index i = 0; i < rows_number; i++)
    {
        const type value = column(i);

        // Comparisons use the smallest positive normalized value as tolerance.
        if(abs(value - static_cast<type>(1)) < std::numeric_limits<type>::min())
        {
            one_hot(i, 1) = static_cast<type>(1);
        }
        else if(abs(value - static_cast<type>(0)) < std::numeric_limits<type>::min())
        {
            one_hot(i, 0) = static_cast<type>(1);
        }
        else
        {
            // Neither 0 nor 1: mark the row as missing in both columns.
            one_hot(i, 0) = NAN;
            one_hot(i, 1) = NAN;
        }
    }

    return one_hot;
}
3117
3118
/// Scans every Numeric column and, when it holds exactly two distinct non-NaN
/// values, retypes it as Binary, rescales it to {0,1} and attaches category
/// names and uses. Categorical columns are skipped (they already expand into
/// category variables); other column types are left untouched.

void DataSet::set_binary_simple_columns()
{
    bool is_binary = true;

    Index variable_index = 0;

    Index different_values = 0;

    for(Index column_index = 0; column_index < columns.size(); column_index++)
    {
        if(columns(column_index).type == Numeric)
        {
            // values(0)/values(1) collect the (up to) two distinct data values;
            // values(2) absorbs a third distinct value so the counter can reach 3.
            // setRandom() seeds them with values presumably unlikely to collide
            // with real data — NOTE(review): a collision would miscount; confirm.
            Tensor<type, 1> values(3);
            values.setRandom();
            different_values = 0;
            is_binary = true;

            for(Index row_index = 0; row_index < data.dimension(0); row_index++)
            {
                // Record a value the first time it is seen (NaNs are ignored).
                if(!::isnan(data(row_index, variable_index))
                && data(row_index, variable_index) != values(0)
                && data(row_index, variable_index) != values(1))
                {
                    values(different_values) = data(row_index, variable_index);

                    different_values++;
                }

                // A column with a single distinct value is constant, not binary.
                if(row_index == (data.dimension(0)-1)){
                    if(different_values==1){
                        is_binary = false;
                        break;
                    }
                }

                // More than two distinct values: give up early.
                if(different_values > 2)
                {
                    is_binary = false;
                    break;
                }
            }

            if(is_binary)
            {
                columns(column_index).type = Binary;
                // Map the two observed values onto {0,1} in the data matrix.
                scale_minimum_maximum_binary(values(0), values(1), column_index);
                columns(column_index).categories.resize(2);

                // Category labels follow the order in which the values appeared.
                if(values(0) == 0 && values(1) == 1)
                {
                    columns(column_index).categories(0) = "Negative (0)";
                    columns(column_index).categories(1) = "Positive (1)";
                }
                else if(values(0) == 1 && values(1) == 0)
                {
                    columns(column_index).categories(0) = "Positive (1)";
                    columns(column_index).categories(1) = "Negative (0)";
                }
                else
                {
                    columns(column_index).categories(0) = "Class_1";// + std::to_string(values(0));
                    columns(column_index).categories(1) = "Class_2";// + std::to_string(values(1));
                }

                // Both category variables inherit the column's current use.
                const VariableUse column_use = columns(column_index).column_use;
                columns(column_index).categories_uses.resize(2);
                columns(column_index).categories_uses(0) = column_use;
                columns(column_index).categories_uses(1) = column_use;
            }

            variable_index++;
        }
        else if(columns(column_index).type == Categorical)
        {
            // Skip over all category variables of this column in the data matrix.
            variable_index += columns(column_index).get_categories_number();
        }
        else
        {
            variable_index++;
        }
    }
}
3201
3202
3203 /// Sets new input dimensions in the data set.
3204
void DataSet::set_input_variables_dimensions(const Tensor<Index, 1>& new_inputs_dimensions)
{
    // No validation is performed against the actual number of input variables.
    input_variables_dimensions = new_inputs_dimensions;
}
3209
3210
3211 /// Returns true if the data set is a binary classification problem, false otherwise.
3212 /// @todo
3213
is_binary_classification() const3214 bool DataSet::is_binary_classification() const
3215 {
3216 if(get_target_variables_number() != 1)
3217 {
3218 return false;
3219 }
3220
3221 return true;
3222 }
3223
3224
3225 /// Returns true if the data set is a multiple classification problem, false otherwise.
3226 /// @todo
3227
bool DataSet::is_multiple_classification() const
{
    // TODO(review): stub — always returns true regardless of the data set;
    // no classification check is implemented yet (see the @todo above).
    return true;
}
3232
3233
3234 /// Returns true if the data matrix is empty, and false otherwise.
3235
is_empty() const3236 bool DataSet::is_empty() const
3237 {
3238 if(data.dimension(0) == 0 || data.dimension(1) == 0)
3239 {
3240 return true;
3241 }
3242
3243 return false;
3244 }
3245
3246
3247 /// Returns true if any value is less or equal than a given value, and false otherwise.
3248
is_less_than(const Tensor<type,1> & column,const type & value) const3249 bool DataSet::is_less_than(const Tensor<type, 1>& column, const type& value) const
3250 {
3251 Tensor<bool, 1> if_sentence = column <= column.constant(value);
3252
3253 Tensor<bool, 1> sentence(column.size());
3254 sentence.setConstant(true);
3255
3256 Tensor<bool, 1> else_sentence(column.size());
3257 else_sentence.setConstant(false);
3258
3259 Tensor<bool, 0> is_less = (if_sentence.select(sentence, else_sentence)).any();
3260
3261 return is_less(0);
3262 }
3263
3264
3265 /// Returns a reference to the data matrix in the data set.
3266 /// The number of rows is equal to the number of samples.
3267 /// The number of columns is equal to the number of variables.
3268
const Tensor<type, 2>& DataSet::get_data() const
{
    // Read-only view; rows are samples, columns are variables.
    return data;
}
3273
3274
/// Returns a mutable pointer to the data matrix; the pointee is owned by this
/// DataSet and remains valid while the object lives.

Tensor<type, 2>* DataSet::get_data_pointer()
{
    return &data;
}
3279
3280
3281 /// Returns a reference to the time series data matrix in the data set.
3282 /// Only for time series problems.
3283
const Tensor<type, 2>& DataSet::get_time_series_data() const
{
    return time_series_data;
}
3288
3289
3290 /// Returns a string with the method used.
3291
DataSet::MissingValuesMethod DataSet::get_missing_values_method() const
{
    return missing_values_method;
}
3296
3297
3298 /// Returns the name of the data file.
3299
const string& DataSet::get_data_file_name() const
{
    return data_file_name;
}
3304
3305
3306 /// Returns true if the first line of the data file has a header with the names of the variables, and false otherwise.
3307
const bool& DataSet::get_header_line() const
{
    return has_columns_names;
}
3312
3313
3314 /// Returns true if the data file has rows label, and false otherwise.
3315
const bool& DataSet::get_rows_label() const
{
    return has_rows_labels;
}
3320
3321
/// Returns the labels of all rows in the data set.

Tensor<string, 1> DataSet::get_rows_label_tensor() const
{
    return rows_labels;
}
3326
get_testing_rows_label_tensor()3327 Tensor<string, 1> DataSet::get_testing_rows_label_tensor()
3328 {
3329 const Index testing_samples_number = get_testing_samples_number();
3330 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3331 Tensor<string, 1> testing_rows_label(testing_samples_number);
3332
3333 for(Index i = 0; i < testing_samples_number; i++)
3334 {
3335 testing_rows_label(i) = rows_labels(testing_indices(i));
3336 }
3337
3338 return testing_rows_label;
3339 }
3340
3341
get_selection_rows_label_tensor()3342 Tensor<string, 1> DataSet::get_selection_rows_label_tensor()
3343 {
3344 const Index selection_samples_number = get_selection_samples_number();
3345 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3346 Tensor<string, 1> selection_rows_label(selection_samples_number);
3347
3348 for(Index i = 0; i < selection_samples_number; i++)
3349 {
3350 selection_rows_label(i) = rows_labels(selection_indices(i));
3351 }
3352
3353 return selection_rows_label;
3354 }
3355
3356
3357 /// Returns the separator to be used in the data file.
3358
const DataSet::Separator& DataSet::get_separator() const
{
    return separator;
}
3363
3364
3365 /// Returns the string which will be used as separator in the data file.
3366
get_separator_char() const3367 char DataSet::get_separator_char() const
3368 {
3369 switch(separator)
3370 {
3371 case Space:
3372 return ' ';
3373
3374 case Tab:
3375 return '\t';
3376
3377 case Comma:
3378 return ',';
3379
3380 case Semicolon:
3381 return ';';
3382 }
3383
3384 return char();
3385 }
3386
3387
3388 /// Returns the string which will be used as separator in the data file.
3389
get_separator_string() const3390 string DataSet::get_separator_string() const
3391 {
3392 switch(separator)
3393 {
3394 case Space:
3395 return "Space";
3396
3397 case Tab:
3398 return "Tab";
3399
3400 case Comma:
3401 return "Comma";
3402
3403 case Semicolon:
3404 return "Semicolon";
3405 }
3406
3407 return string();
3408 }
3409
3410
3411 /// Returns the string which will be used as label for the missing values in the data file.
3412
const string& DataSet::get_missing_values_label() const
{
    return missing_values_label;
}
3417
3418
3419 /// Returns the number of lags to be used in a time series prediction application.
3420
const Index& DataSet::get_lags_number() const
{
    return lags_number;
}
3425
3426
3427 /// Returns the number of steps ahead to be used in a time series prediction application.
3428
const Index& DataSet::get_steps_ahead() const
{
    return steps_ahead;
}
3433
3434
3435 /// Returns the indices of the time variables in the data set.
3436
const Index& DataSet::get_time_index() const
{
    return time_index;
}
3441
3442
3443 /// Returns a value of the scaling-unscaling method enumeration from a string containing the name of that method.
3444 /// @param scaling_unscaling_method String with the name of the scaling and unscaling method.
3445
get_scaling_unscaling_method(const string & scaling_unscaling_method)3446 DataSet::ScalingUnscalingMethod DataSet::get_scaling_unscaling_method(const string& scaling_unscaling_method)
3447 {
3448 if(scaling_unscaling_method == "NoScaling")
3449 {
3450 return NoScaling;
3451 }
3452 else if(scaling_unscaling_method == "NoUnscaling")
3453 {
3454 return NoUnscaling;
3455 }
3456 else if(scaling_unscaling_method == "MinimumMaximum")
3457 {
3458 return MinimumMaximum;
3459 }
3460 else if(scaling_unscaling_method == "Logarithmic")
3461 {
3462 return Logarithmic;
3463 }
3464 else if(scaling_unscaling_method == "MeanStandardDeviation")
3465 {
3466 return MeanStandardDeviation;
3467 }
3468 else if(scaling_unscaling_method == "StandardDeviation")
3469 {
3470 return StandardDeviation;
3471 }
3472 else
3473 {
3474 ostringstream buffer;
3475
3476 buffer << "OpenNN Exception: DataSet class.\n"
3477 << "static ScalingUnscalingMethod get_scaling_unscaling_method(const string).\n"
3478 << "Unknown scaling-unscaling method: " << scaling_unscaling_method << ".\n";
3479
3480 throw logic_error(buffer.str());
3481 }
3482 }
3483
3484
3485 /// Returns a matrix with the training samples in the data set.
3486 /// The number of rows is the number of training
3487 /// The number of columns is the number of variables.
3488
get_training_data() const3489 Tensor<type, 2> DataSet::get_training_data() const
3490 {
3491
3492 // const Index variables_number = get_variables_number();
3493
3494 // Tensor<Index, 1> variables_indices(0, 1, variables_number-1);
3495
3496 Tensor<Index, 1> variables_indices = get_used_variables_indices();
3497
3498 const Tensor<Index, 1> training_indices = get_training_samples_indices();
3499
3500 return get_subtensor_data(training_indices, variables_indices);
3501
3502 // return Tensor<type,2>();
3503 }
3504
3505
3506 /// Returns a matrix with the selection samples in the data set.
3507 /// The number of rows is the number of selection
3508 /// The number of columns is the number of variables.
3509
get_selection_data() const3510 Tensor<type, 2> DataSet::get_selection_data() const
3511 {
3512 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3513
3514 const Index variables_number = get_variables_number();
3515
3516 Tensor<Index, 1> variables_indices;
3517 initialize_sequential_eigen_tensor(variables_indices, 0, 1, variables_number-1);
3518
3519 return get_subtensor_data(selection_indices, variables_indices);
3520 }
3521
3522
3523 /// Returns a matrix with the testing samples in the data set.
3524 /// The number of rows is the number of testing
3525 /// The number of columns is the number of variables.
3526
get_testing_data() const3527 Tensor<type, 2> DataSet::get_testing_data() const
3528 {
3529 const Index variables_number = get_variables_number();
3530
3531 Tensor<Index, 1> variables_indices;
3532 initialize_sequential_eigen_tensor(variables_indices, 0, 1, variables_number-1);
3533
3534 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3535
3536 return get_subtensor_data(testing_indices, variables_indices);
3537 }
3538
3539
3540 /// Returns a matrix with the input variables in the data set.
3541 /// The number of rows is the number of
3542 /// The number of columns is the number of input variables.
3543
get_input_data() const3544 Tensor<type, 2> DataSet::get_input_data() const
3545 {
3546 const Index samples_number = get_samples_number();
3547
3548 Tensor<Index, 1> indices;
3549 initialize_sequential_eigen_tensor(indices, 0, 1, samples_number-1);
3550
3551 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3552
3553 return get_subtensor_data(indices, input_variables_indices);
3554 }
3555
3556
3557 /// Returns a matrix with the target variables in the data set.
3558 /// The number of rows is the number of
3559 /// The number of columns is the number of target variables.
3560
get_target_data() const3561 Tensor<type, 2> DataSet::get_target_data() const
3562 {
3563 const Tensor<Index, 1> indices = get_used_samples_indices();
3564
3565 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3566
3567 return get_subtensor_data(indices, target_variables_indices);
3568 }
3569
3570
3571 /// Returns a tensor with the input variables in the data set.
3572 /// The number of rows is the number of
3573 /// The number of columns is the number of input variables.
3574
get_input_data(const Tensor<Index,1> & samples_indices) const3575 Tensor<type, 2> DataSet::get_input_data(const Tensor<Index, 1>& samples_indices) const
3576 {
3577 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3578
3579 return get_subtensor_data(samples_indices, input_variables_indices);
3580 }
3581
3582
3583 /// Returns a tensor with the target variables in the data set.
3584 /// The number of rows is the number of
3585 /// The number of columns is the number of input variables.
3586
get_target_data(const Tensor<Index,1> & samples_indices) const3587 Tensor<type, 2> DataSet::get_target_data(const Tensor<Index, 1>& samples_indices) const
3588 {
3589 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3590
3591 return get_subtensor_data(samples_indices, target_variables_indices);
3592 }
3593
3594
3595 /// Returns a matrix with training samples and input variables.
3596 /// The number of rows is the number of training
3597 /// The number of columns is the number of input variables.
3598
get_training_input_data() const3599 Tensor<type, 2> DataSet::get_training_input_data() const
3600 {
3601 const Tensor<Index, 1> training_indices = get_training_samples_indices();
3602
3603 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3604
3605 return get_subtensor_data(training_indices, input_variables_indices);
3606 }
3607
3608
3609 /// Returns a tensor with training samples and target variables.
3610 /// The number of rows is the number of training
3611 /// The number of columns is the number of target variables.
3612
get_training_target_data() const3613 Tensor<type, 2> DataSet::get_training_target_data() const
3614 {
3615 const Tensor<Index, 1> training_indices = get_training_samples_indices();
3616
3617 const Tensor<Index, 1>& target_variables_indices = get_target_variables_indices();
3618
3619 return get_subtensor_data(training_indices, target_variables_indices);
3620 }
3621
3622
3623 /// Returns a tensor with selection samples and input variables.
3624 /// The number of rows is the number of selection
3625 /// The number of columns is the number of input variables.
3626
get_selection_input_data() const3627 Tensor<type, 2> DataSet::get_selection_input_data() const
3628 {
3629 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3630
3631 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3632
3633 return get_subtensor_data(selection_indices, input_variables_indices);
3634 }
3635
3636
3637 /// Returns a tensor with selection samples and target variables.
3638 /// The number of rows is the number of selection
3639 /// The number of columns is the number of target variables.
3640
get_selection_target_data() const3641 Tensor<type, 2> DataSet::get_selection_target_data() const
3642 {
3643 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3644
3645 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3646
3647 return get_subtensor_data(selection_indices, target_variables_indices);
3648 }
3649
3650
3651 /// Returns a tensor with testing samples and input variables.
3652 /// The number of rows is the number of testing
3653 /// The number of columns is the number of input variables.
3654
get_testing_input_data() const3655 Tensor<type, 2> DataSet::get_testing_input_data() const
3656 {
3657 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3658
3659 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3660
3661 return get_subtensor_data(testing_indices, input_variables_indices);
3662 }
3663
3664
3665 /// Returns a tensor with testing samples and target variables.
3666 /// The number of rows is the number of testing
3667 /// The number of columns is the number of target variables.
3668
get_testing_target_data() const3669 Tensor<type, 2> DataSet::get_testing_target_data() const
3670 {
3671 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3672
3673 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3674
3675 return get_subtensor_data(testing_indices, target_variables_indices);
3676 }
3677
3678
/// Returns the inputs and target values of a single sample in the data set.
/// @param index Index of the sample.
/// @throws logic_error (debug builds only) if index is out of range.

Tensor<type, 1> DataSet::get_sample_data(const Index& index) const
{

#ifdef __OPENNN_DEBUG__

    const Index samples_number = get_samples_number();

    if(index >= samples_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 1> get_sample(const Index&) const method.\n"
               << "Index of sample (" << index << ") must be less than number of samples (" << samples_number << ").\n";

        throw logic_error(buffer.str());
    }

#endif

    // Row `index` of the data matrix holds both inputs and targets.

    return data.chip(index,0);
}
3706
3707
3708 /// Returns the inputs and target values of a single sample in the data set.
3709 /// @param sample_index Index of the sample.
3710 /// @param variables_indices Indices of the variables.
3711
get_sample_data(const Index & sample_index,const Tensor<Index,1> & variables_indices) const3712 Tensor<type, 1> DataSet::get_sample_data(const Index& sample_index, const Tensor<Index, 1>& variables_indices) const
3713 {
3714 #ifdef __OPENNN_DEBUG__
3715
3716 const Index samples_number = get_samples_number();
3717
3718 if(sample_index >= samples_number)
3719 {
3720 ostringstream buffer;
3721
3722 buffer << "OpenNN Exception: DataSet class.\n"
3723 << "Tensor<type, 1> get_sample(const Index&, const Tensor<Index, 1>&) const method.\n"
3724 << "Index of sample must be less than number of \n";
3725
3726 throw logic_error(buffer.str());
3727 }
3728
3729 #endif
3730
3731 const Index variables_number = variables_indices.size();
3732
3733 Tensor<type, 1 > row(variables_number);
3734
3735 for(Index i = 0; i < variables_number; i++)
3736 {
3737 Index variable_index = variables_indices(i);
3738
3739 row(i) = data(sample_index, variable_index);
3740 }
3741
3742 return row;
3743
3744 //return data.get_row(sample_index, variables_indices);
3745
3746 }
3747
3748
3749 /// Returns the inputs values of a single sample in the data set.
3750 /// @param sample_index Index of the sample.
3751
get_sample_input_data(const Index & sample_index) const3752 Tensor<type, 2> DataSet::get_sample_input_data(const Index & sample_index) const
3753 {
3754 const Index input_variables_number = get_input_variables_number();
3755
3756 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3757
3758 Tensor<type, 2> inputs(1, input_variables_number);
3759
3760 for(Index i = 0; i < input_variables_number; i++)
3761 inputs(0, i) = data(sample_index, input_variables_indices(i));
3762
3763 return inputs;
3764 }
3765
3766
3767 /// Returns the target values of a single sample in the data set.
3768 /// @param sample_index Index of the sample.
3769
get_sample_target_data(const Index & sample_index) const3770 Tensor<type, 2> DataSet::get_sample_target_data(const Index & sample_index) const
3771 {
3772 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3773
3774 return get_subtensor_data(Tensor<Index, 1>(sample_index), target_variables_indices);
3775 }
3776
3777
3778 /// Returns the index of the column with the given name.
3779 /// @param column_name Name of the column to be found.
3780
get_column_index(const string & column_name) const3781 Index DataSet::get_column_index(const string& column_name) const
3782 {
3783 const Index columns_number = get_columns_number();
3784
3785 for(Index i = 0; i < columns_number; i++)
3786 {
3787 if(columns(i).name == column_name) return i;
3788 }
3789
3790 ostringstream buffer;
3791
3792 buffer << "OpenNN Exception: DataSet class.\n"
3793 << "Index get_column_index(const string&&) const method.\n"
3794 << "Cannot find " << column_name << "\n";
3795
3796 throw logic_error(buffer.str());
3797 }
3798
3799
3800 /// Returns the index of the column to which a variable index belongs.
3801 /// @param variable_index Index of the variable to be found.
3802
get_column_index(const Index & variable_index) const3803 Index DataSet::get_column_index(const Index& variable_index) const
3804 {
3805 const Index columns_number = get_columns_number();
3806
3807 Index total_variables_number = 0;
3808
3809 for(Index i = 0; i < columns_number; i++)
3810 {
3811 if(columns(i).type == Categorical)
3812 {
3813 total_variables_number += columns(i).get_categories_number();
3814 }
3815 else
3816 {
3817 total_variables_number++;
3818 }
3819
3820 if((variable_index+1) <= total_variables_number) return i;
3821 }
3822
3823 ostringstream buffer;
3824
3825 buffer << "OpenNN Exception: DataSet class.\n"
3826 << "Index get_column_index(const type&) const method.\n"
3827 << "Cannot find variable index: " << variable_index << ".\n";
3828
3829 throw logic_error(buffer.str());
3830 }
3831
3832
3833 /// Returns the indices of a variable in the data set.
3834 /// Note that the number of variables does not have to equal the number of columns in the data set,
3835 /// because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.
3836
get_variable_indices(const Index & column_index) const3837 Tensor<Index, 1> DataSet::get_variable_indices(const Index& column_index) const
3838 {
3839 Index index = 0;
3840
3841 for(Index i = 0; i < column_index; i++)
3842 {
3843 if(columns(i).type == Categorical)
3844 {
3845 index += columns(i).categories.size();
3846 }
3847 else
3848 {
3849 index++;
3850 }
3851 }
3852
3853 if(columns(column_index).type == Categorical)
3854 {
3855 Tensor<Index, 1> variable_indices(columns(column_index).categories.size());
3856
3857 for (Index j = 0; j<columns(column_index).categories.size(); j++)
3858 {
3859 variable_indices(j) = index+j;
3860 }
3861
3862 return variable_indices;
3863 }
3864 else
3865 {
3866 Tensor<Index, 1> indices(1);
3867 indices.setConstant(index);
3868
3869 return indices;
3870 }
3871 }
3872
3873
3874 /// Returns the data from the data set of the given variables indices.
3875 /// @param variables_indices Variable indices.
3876 /// @todo
3877
get_column_data(const Tensor<Index,1> & variables_indices) const3878 Tensor<type, 2> DataSet::get_column_data(const Tensor<Index, 1>& variables_indices) const
3879 {
3880 // return data.get_submatrix_columns(variables_indices);
3881
3882 return Tensor<type, 2>();
3883 }
3884
3885
3886 /// Returns the data from the data set column with a given index,
3887 /// these data can be stored in a matrix or a vector depending on whether the column is categorical or not(respectively).
3888 /// @param column_index Index of the column.
3889
get_column_data(const Index & column_index) const3890 Tensor<type, 2> DataSet::get_column_data(const Index& column_index) const
3891 {
3892 Index columns_number = 1;
3893 const Index rows_number = data.dimension(0);
3894
3895 if(columns(column_index).type == Categorical)
3896 {
3897 columns_number = columns(column_index).get_categories_number();
3898 }
3899
3900 Eigen::array<Index, 2> extents = {rows_number, columns_number};
3901 Eigen::array<Index, 2> offsets = {0, get_variable_indices(column_index)(0)};
3902
3903 return data.slice(offsets, extents);
3904 }
3905
/// Returns the data from the time series column with a given index.
/// @param column_index Index of the column.

Tensor<type, 2> DataSet::get_time_series_column_data(const Index& column_index) const
{
    Index columns_number = 1;
    // NOTE(review): row count is taken from `data`, not `time_series_data` —
    // confirm both matrices have the same number of rows when this is called.
    const Index rows_number = data.dimension(0);

    // A categorical column spans one data column per category.

    if(time_series_columns(column_index).type == Categorical)
    {
        columns_number = time_series_columns(column_index).get_categories_number();
    }

    // NOTE(review): get_variable_indices() maps through `columns`, not
    // `time_series_columns` — confirm both layouts agree before relying on
    // this offset.

    Eigen::array<Index, 2> extents = {rows_number, columns_number};
    Eigen::array<Index, 2> offsets = {0, get_variable_indices(column_index)(0)};

    return time_series_data.slice(offsets, extents);
}
3924
3925
/// Returns the data of a given column, restricted to the given rows.
/// A categorical column yields one data column per category.
/// @param column_index Index of the column.
/// @param rows_indices Indices of the rows (samples) to extract.

Tensor<type, 2> DataSet::get_column_data(const Index& column_index, Tensor<Index, 1>& rows_indices) const
{
    return get_subtensor_data(rows_indices, get_variable_indices(column_index));
}
3935
3936
3937 /// Returns the data from the data set column with a given name,
3938 /// these data can be stored in a matrix or a vector depending on whether the column is categorical or not(respectively).
3939 /// @param column_name Name of the column.
3940
get_column_data(const string & column_name) const3941 Tensor<type, 2> DataSet::get_column_data(const string& column_name) const
3942 {
3943 const Index column_index = get_column_index(column_name);
3944
3945 return get_column_data(column_index);
3946 }
3947
3948
3949 /// Returns all the samples of a single variable in the data set.
3950 /// @param index Index of the variable.
3951
get_variable_data(const Index & index) const3952 Tensor<type, 1> DataSet::get_variable_data(const Index& index) const
3953 {
3954
3955 #ifdef __OPENNN_DEBUG__
3956
3957 const Index variables_number = get_variables_number();
3958
3959 if(index >= variables_number)
3960 {
3961 ostringstream buffer;
3962
3963 buffer << "OpenNN Exception: DataSet class.\n"
3964 << "Tensor<type, 1> get_variable(const Index&) const method.\n"
3965 << "Index of variable must be less than number of \n";
3966
3967 throw logic_error(buffer.str());
3968 }
3969
3970 #endif
3971
3972 return data.chip(index, 1);
3973 }
3974
3975
3976 /// Returns all the samples of a single variable in the data set.
3977 /// @param variable_name Name of the variable.
3978
get_variable_data(const string & variable_name) const3979 Tensor<type, 1> DataSet::get_variable_data(const string& variable_name) const
3980 {
3981
3982 const Tensor<string, 1> variable_names = get_variables_names();
3983
3984 Index size = 0;
3985
3986 for(Index i = 0; i < variable_names.size(); i++)
3987 {
3988 if(variable_names(i) == variable_name) size++;
3989 }
3990
3991 Tensor<Index, 1> variable_index(size);
3992
3993 Index index = 0;
3994
3995 for(Index i = 0; i < variable_names.size(); i++)
3996 {
3997 if(variable_names(i) == variable_name)
3998 {
3999 variable_index(index) = i;
4000
4001 index++;
4002 }
4003 }
4004
4005 #ifdef __OPENNN_DEBUG__
4006
4007 const Index variables_size = variable_index.size();
4008
4009 if(variables_size == 0)
4010 {
4011 ostringstream buffer;
4012
4013 buffer << "OpenNN Exception: DataSet class.\n"
4014 << "Tensor<type, 1> get_variable(const string&) const method.\n"
4015 << "Variable: " << variable_name << " does not exist.\n";
4016
4017 throw logic_error(buffer.str());
4018 }
4019
4020 if(variables_size > 1)
4021 {
4022 ostringstream buffer;
4023
4024 buffer << "OpenNN Exception: DataSet class.\n"
4025 << "Tensor<type, 1> get_variable(const string&) const method.\n"
4026 << "Variable: " << variable_name << " appears more than once in the data set.\n";
4027
4028 throw logic_error(buffer.str());
4029 }
4030
4031 #endif
4032
4033 return data.chip(variable_index(0), 1);
4034 }
4035
4036
4037 /// Returns a given set of samples of a single variable in the data set.
4038 /// @param variable_index Index of the variable.
4039 /// @param samples_indices Indices of the
4040
get_variable_data(const Index & variable_index,const Tensor<Index,1> & samples_indices) const4041 Tensor<type, 1> DataSet::get_variable_data(const Index& variable_index, const Tensor<Index, 1>& samples_indices) const
4042 {
4043
4044 #ifdef __OPENNN_DEBUG__
4045
4046 const Index variables_number = get_variables_number();
4047
4048 if(variable_index >= variables_number)
4049 {
4050 ostringstream buffer;
4051
4052 buffer << "OpenNN Exception: DataSet class.\n"
4053 << "Tensor<type, 1> get_variable(const Index&, const Tensor<Index, 1>&) const method.\n"
4054 << "Index of variable must be less than number of \n";
4055
4056 throw logic_error(buffer.str());
4057 }
4058
4059 #endif
4060
4061 const Index samples_indices_size = samples_indices.size();
4062
4063 Tensor<type, 1 > column(samples_indices_size);
4064
4065 for(Index i = 0; i < samples_indices_size; i++)
4066 {
4067 Index sample_index = samples_indices(i);
4068
4069 column(i) = data(sample_index, variable_index);
4070 }
4071
4072 return column;
4073 }
4074
4075
4076 /// Returns a given set of samples of a single variable in the data set.
4077 /// @param variable_name Name of the variable.
4078 /// @param samples_indices Indices of the
4079
get_variable_data(const string & variable_name,const Tensor<Index,1> & samples_indices) const4080 Tensor<type, 1> DataSet::get_variable_data(const string& variable_name, const Tensor<Index, 1>& samples_indices) const
4081 {
4082
4083 const Tensor<string, 1> variable_names = get_variables_names();
4084
4085 Index size = 0;
4086
4087 for(Index i = 0; i < variable_names.size(); i++)
4088 {
4089 if(variable_names(i) == variable_name) size++;
4090 }
4091
4092 Tensor<Index, 1> variable_index(size);
4093
4094 Index index = 0;
4095
4096 for(Index i = 0; i < variable_names.size(); i++)
4097 {
4098 if(variable_names(i) == variable_name)
4099 {
4100 variable_index(index) = i;
4101
4102 index++;
4103 }
4104 }
4105
4106 #ifdef __OPENNN_DEBUG__
4107
4108 const Index variables_size = variable_index.size();
4109
4110 if(variables_size == 0)
4111 {
4112 ostringstream buffer;
4113
4114 buffer << "OpenNN Exception: DataSet class.\n"
4115 << "Tensor<type, 1> get_variable(const string&) const method.\n"
4116 << "Variable: " << variable_name << " does not exist.\n";
4117
4118 throw logic_error(buffer.str());
4119 }
4120
4121 if(variables_size > 1)
4122 {
4123 ostringstream buffer;
4124
4125 buffer << "OpenNN Exception: DataSet class.\n"
4126 << "Tensor<type, 1> get_variable(const string&, const Tensor<Index, 1>&) const method.\n"
4127 << "Variable: " << variable_name << " appears more than once in the data set.\n";
4128
4129 throw logic_error(buffer.str());
4130 }
4131
4132 #endif
4133
4134 const Index samples_indices_size = samples_indices.size();
4135
4136 Tensor<type, 1 > column(samples_indices_size);
4137
4138 for(Index i = 0; i < samples_indices_size; i++)
4139 {
4140 Index sample_index = samples_indices(i);
4141
4142 column(i) = data(sample_index, variable_index(0));
4143 }
4144
4145 return column;
4146 }
4147
4148
get_data_file_preview() const4149 Tensor<Tensor<string, 1>, 1> DataSet::get_data_file_preview() const
4150 {
4151 return data_file_preview;
4152 }
4153
4154
get_subtensor_data(const Tensor<Index,1> & rows_indices,const Tensor<Index,1> & variables_indices) const4155 Tensor<type, 2> DataSet::get_subtensor_data(const Tensor<Index, 1> & rows_indices, const Tensor<Index, 1> & variables_indices) const
4156 {
4157 const Index rows_number = rows_indices.size();
4158 const Index variables_number = variables_indices.size();
4159
4160 Tensor<type, 2> subtensor(rows_number, variables_number);
4161
4162 Index row_index;
4163 Index variable_index;
4164
4165 const Tensor<type, 2>& data = get_data();
4166
4167 for(Index i = 0; i < rows_number; i++)
4168 {
4169 row_index = rows_indices(i);
4170
4171 for(Index j = 0; j < variables_number; j++)
4172 {
4173 variable_index = variables_indices(j);
4174
4175 subtensor(i, j) = data(row_index, variable_index);
4176 }
4177 }
4178
4179 return subtensor;
4180 }
4181
4182
4183 /// Sets zero samples and zero variables in the data set.
4184
set()4185 void DataSet::set()
4186 {
4187 data_file_name = "";
4188
4189 data.resize(0,0);
4190 }
4191
4192
4193 /// Sets all variables from a data matrix.
4194 /// @param new_data Data matrix.
4195
set(const Tensor<type,2> & new_data)4196 void DataSet::set(const Tensor<type, 2>& new_data)
4197 {
4198 data_file_name = "";
4199
4200 const Index variables_number = new_data.dimension(1);
4201 const Index samples_number = new_data.dimension(0);
4202
4203 set(samples_number, variables_number);
4204
4205 data = new_data;
4206
4207 set_default_columns_uses();
4208 }
4209
4210
4211 /// Sets new numbers of samples and variables in the inputs targets data set.
4212 /// All the samples are set for training.
4213 /// All the variables are set as inputs.
4214 /// @param new_samples_number Number of
4215 /// @param new_variables_number Number of variables.
4216
set(const Index & new_samples_number,const Index & new_variables_number)4217 void DataSet::set(const Index& new_samples_number, const Index& new_variables_number)
4218 {
4219 #ifdef __OPENNN_DEBUG__
4220
4221 if(new_samples_number == 0)
4222 {
4223 ostringstream buffer;
4224
4225 buffer << "OpenNN Exception: DataSet class.\n"
4226 << "void set(const Index&, const Index&) method.\n"
4227 << "Number of samples must be greater than zero.\n";
4228
4229 throw logic_error(buffer.str());
4230 }
4231
4232 if(new_variables_number == 0)
4233 {
4234 ostringstream buffer;
4235
4236 buffer << "OpenNN Exception: DataSet class.\n"
4237 << "void set(const Index&, const Index&) method.\n"
4238 << "Number of variables must be greater than zero.\n";
4239
4240 throw logic_error(buffer.str());
4241 }
4242
4243 #endif
4244
4245 data.resize(new_samples_number, new_variables_number);
4246
4247 columns.resize(new_variables_number);
4248
4249 for(Index index = 0; index < new_variables_number-1; index++)
4250 {
4251 columns(index).name = "column_" + to_string(index+1);
4252 columns(index).column_use = Input;
4253 columns(index).type = Numeric;
4254 }
4255
4256 columns(new_variables_number-1).name = "column_" + to_string(new_variables_number);
4257 columns(new_variables_number-1).column_use = Target;
4258 columns(new_variables_number-1).type = Numeric;
4259
4260 samples_uses.resize(new_samples_number);
4261 split_samples_random();
4262 }
4263
4264
4265 /// Sets new numbers of samples and inputs and target variables in the data set.
4266 /// The variables in the data set are the number of inputs plus the number of targets.
4267 /// @param new_samples_number Number of
4268 /// @param new_inputs_number Number of input variables.
4269 /// @param new_targets_number Number of target variables.
4270
set(const Index & new_samples_number,const Index & new_inputs_number,const Index & new_targets_number)4271 void DataSet::set(const Index& new_samples_number,
4272 const Index& new_inputs_number,
4273 const Index& new_targets_number)
4274 {
4275
4276 data_file_name = "";
4277
4278 const Index new_variables_number = new_inputs_number + new_targets_number;
4279
4280 data.resize(new_samples_number, new_variables_number);
4281
4282 columns.resize(new_variables_number);
4283
4284 for(Index i = 0; i < new_variables_number; i++)
4285 {
4286 if(i < new_inputs_number)
4287 {
4288 columns(i).name = "column_" + to_string(i+1);
4289 columns(i).column_use = Input;
4290 columns(i).type = Numeric;
4291 }
4292 else
4293 {
4294 columns(i).name = "column_" + to_string(i+1);
4295 columns(i).column_use = Target;
4296 columns(i).type = Numeric;
4297 }
4298 }
4299
4300 input_variables_dimensions.resize(new_inputs_number);
4301
4302 samples_uses.resize(new_samples_number);
4303 split_samples_random();
4304 }
4305
4306
/// Sets the members of this data set object with those from another data set object.
/// @param other_data_set Data set object to be copied.

void DataSet::set(const DataSet& other_data_set)
{
    data_file_name = other_data_set.data_file_name;

    has_columns_names = other_data_set.has_columns_names;

    separator = other_data_set.separator;

    missing_values_label = other_data_set.missing_values_label;

    data = other_data_set.data;

    columns = other_data_set.columns;

    display = other_data_set.display;

    // NOTE(review): other members (e.g. samples_uses, time series settings,
    // missing_values_method) are not copied here — confirm this partial copy
    // is intentional.
}
4326
4327
4328 /// Sets the data set members from a XML document.
4329 /// @param data_set_document TinyXML document containing the member data.
4330
set(const tinyxml2::XMLDocument & data_set_document)4331 void DataSet::set(const tinyxml2::XMLDocument& data_set_document)
4332 {
4333 set_default();
4334
4335 from_XML(data_set_document);
4336 }
4337
4338
/// Sets the data set members by loading them from a XML file.
/// @param file_name Data set XML file_name.

void DataSet::set(const string& file_name)
{
    load(file_name);
}
4346
/// Sets a new display value.
/// If it is set to true messages from this class are to be displayed on the screen;
/// if it is set to false messages from this class are not to be displayed on the screen.
/// @param new_display Display value.

void DataSet::set_display(const bool& new_display)
{
    display = new_display;
}
4356
4357
/// Sets the default member values:
/// <ul>
/// <li> Display: True.
/// </ul>

void DataSet::set_default()
{
    // Rebuild the Eigen thread pool sized to the available OpenMP threads.
    // NOTE(review): assumes both pointers are either valid or nullptr here;
    // deleting an uninitialized pointer is undefined behavior — confirm they
    // are initialized in the class definition.

    delete non_blocking_thread_pool;
    delete thread_pool_device;

    const int n = omp_get_max_threads();
    non_blocking_thread_pool = new NonBlockingThreadPool(n);
    thread_pool_device = new ThreadPoolDevice(non_blocking_thread_pool, n);

    has_columns_names = false;

    separator = Comma;

    missing_values_label = "NA";

    lags_number = 0;

    steps_ahead = 0;

    set_default_columns_uses();

    set_default_columns_names();

    // Single input dimension holding the current number of input variables.

    input_variables_dimensions.resize(1);

    input_variables_dimensions.setConstant(get_input_variables_number());

}
4391
4392
4393 /// Sets a new data matrix.
4394 /// The number of rows must be equal to the number of
4395 /// The number of columns must be equal to the number of variables.
4396 /// Indices of all training, selection and testing samples and inputs and target variables do not change.
4397 /// @param new_data Data matrix.
4398
set_data(const Tensor<type,2> & new_data)4399 void DataSet::set_data(const Tensor<type, 2>& new_data)
4400 {
4401
4402 const Index samples_number = new_data.dimension(0);
4403 const Index variables_number = new_data.dimension(1);
4404
4405 set(samples_number, variables_number);
4406
4407 data = new_data;
4408 }
4409
set_time_series_data(const Tensor<type,2> & new_data)4410 void DataSet::set_time_series_data(const Tensor<type, 2>& new_data)
4411 {
4412 time_series_data = new_data;
4413 }
4414
/// Sets the name of the data file.
/// Note: this only stores the name; it does not load the file.
/// @param new_data_file_name Name of the file containing the data.

void DataSet::set_data_file_name(const string& new_data_file_name)
{
    data_file_name = new_data_file_name;
}
4424
4425
/// Sets if the data file contains a header with the names of the columns.
/// @param new_has_columns_names True if the file has a header row.

void DataSet::set_has_columns_names(const bool& new_has_columns_names)
{
    has_columns_names = new_has_columns_names;
}
4432
4433
/// Sets if the data file contains rows label.
/// @param new_has_rows_label True if the file has a row-label column.

void DataSet::set_has_rows_label(const bool& new_has_rows_label)
{
    has_rows_labels = new_has_rows_label;
}
4440
4441
/// Sets a new separator.
/// @param new_separator Separator value.

void DataSet::set_separator(const Separator& new_separator)
{
    separator = new_separator;
}
4449
4450
4451 /// Sets a new separator from a char.
4452 /// @param new_separator Char with the separator value.
4453
set_separator(const char & new_separator)4454 void DataSet::set_separator(const char& new_separator)
4455 {
4456 if(new_separator == ' ')
4457 {
4458 separator = Space;
4459 }
4460 else if(new_separator == '\t')
4461 {
4462 separator = Tab;
4463 }
4464 else if(new_separator == ',')
4465 {
4466 separator = Comma;
4467 }
4468 else if(new_separator == ';')
4469 {
4470 separator = Semicolon;
4471 }
4472 else
4473 {
4474 ostringstream buffer;
4475
4476 buffer << "OpenNN Exception: DataSet class.\n"
4477 << "void set_separator(const char&) method.\n"
4478 << "Unknown separator: " << new_separator << ".\n";
4479
4480 throw logic_error(buffer.str());
4481 }
4482 }
4483
4484
4485 /// Sets a new separator from a string.
4486 /// @param new_separator Char with the separator value.
4487
set_separator(const string & new_separator_string)4488 void DataSet::set_separator(const string& new_separator_string)
4489 {
4490 if(new_separator_string == "Space")
4491 {
4492 separator = Space;
4493 }
4494 else if(new_separator_string == "Tab")
4495 {
4496 separator = Tab;
4497 }
4498 else if(new_separator_string == "Comma")
4499 {
4500 separator = Comma;
4501 }
4502 else if(new_separator_string == "Semicolon")
4503 {
4504 separator = Semicolon;
4505 }
4506 else
4507 {
4508 ostringstream buffer;
4509
4510 buffer << "OpenNN Exception: DataSet class.\n"
4511 << "void set_separator(const string&) method.\n"
4512 << "Unknown separator: " << new_separator_string << ".\n";
4513
4514 throw logic_error(buffer.str());
4515 }
4516 }
4517
4518
4519
/// Sets a new label for the missing values.
/// @param new_missing_values_label Label for the missing values.
/// @throws logic_error (debug builds only) if the trimmed label is empty.

void DataSet::set_missing_values_label(const string& new_missing_values_label)
{
#ifdef __OPENNN_DEBUG__

    if(get_trimmed(new_missing_values_label).empty())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void set_missing_values_label(const string&) method.\n"
               << "Missing values label cannot be empty.\n";

        throw logic_error(buffer.str());
    }

#endif

    missing_values_label = new_missing_values_label;
}
4542
4543
/// Sets a new method for the missing values.
/// @param new_missing_values_method Method for the missing values.

void DataSet::set_missing_values_method(const DataSet::MissingValuesMethod& new_missing_values_method)
{
    missing_values_method = new_missing_values_method;
}
4551
4552
set_missing_values_method(const string & new_missing_values_method)4553 void DataSet::set_missing_values_method(const string & new_missing_values_method)
4554 {
4555 if(new_missing_values_method == "Unuse")
4556 {
4557 missing_values_method = Unuse;
4558 }
4559 else if(new_missing_values_method == "Mean")
4560 {
4561 missing_values_method = Mean;
4562 }
4563 else if(new_missing_values_method == "Median")
4564 {
4565 missing_values_method = Median;
4566 }
4567 else
4568 {
4569 ostringstream buffer;
4570
4571 buffer << "OpenNN Exception: DataSet class.\n"
4572 << "void set_missing_values_method(const string & method.\n"
4573 << "Not known method type.\n";
4574
4575 throw logic_error(buffer.str());
4576 }
4577 }
4578
4579
4580 /// Sets a new number of lags to be defined for a time series prediction application.
4581 /// When loading the data file, the time series data will be modified according to this number.
4582 /// @param new_lags_number Number of lags(x-1, ..., x-l) to be used.
4583
set_lags_number(const Index & new_lags_number)4584 void DataSet::set_lags_number(const Index& new_lags_number)
4585 {
4586 lags_number = new_lags_number;
4587 }
4588
4589
4590 /// Sets a new number of steps ahead to be defined for a time series prediction application.
4591 /// When loading the data file, the time series data will be modified according to this number.
4592 /// @param new_steps_ahead_number Number of steps ahead to be used.
4593
set_steps_ahead_number(const Index & new_steps_ahead_number)4594 void DataSet::set_steps_ahead_number(const Index& new_steps_ahead_number)
4595 {
4596 steps_ahead = new_steps_ahead_number;
4597 }
4598
4599
4600 /// Sets the new position where the time data is located in the data set.
4601 /// @param new_time_index Position where the time data is located.
4602
set_time_index(const Index & new_time_index)4603 void DataSet::set_time_index(const Index& new_time_index)
4604 {
4605 time_index = new_time_index;
4606 }
4607
4608
set_threads_number(const int & new_threads_number)4609 void DataSet::set_threads_number(const int& new_threads_number)
4610 {
4611 if(non_blocking_thread_pool != nullptr) delete non_blocking_thread_pool;
4612 if(thread_pool_device != nullptr) delete thread_pool_device;
4613
4614 non_blocking_thread_pool = new NonBlockingThreadPool(new_threads_number);
4615 thread_pool_device = new ThreadPoolDevice(non_blocking_thread_pool, new_threads_number);
4616 }
4617
4618
4619 /// Sets a new number of samples in the data set.
4620 /// All samples are also set for training.
4621 /// The indices of the inputs and target variables do not change.
4622 /// @param new_samples_number Number of samples.
4623
set_samples_number(const Index & new_samples_number)4624 void DataSet::set_samples_number(const Index& new_samples_number)
4625 {
4626 const Index variables_number = get_variables_number();
4627
4628 set(new_samples_number,variables_number);
4629 }
4630
4631
4632 /// Removes the input of target indices of that variables with zero standard deviation.
4633 /// It might change the size of the vectors containing the inputs and targets indices.
4634
unuse_constant_columns()4635 Tensor<string, 1> DataSet::unuse_constant_columns()
4636 {
4637 const Index columns_number = get_columns_number();
4638
4639 #ifdef __OPENNN_DEBUG__
4640
4641 if(columns_number == 0)
4642 {
4643 ostringstream buffer;
4644
4645 buffer << "OpenNN Exception: DataSet class.\n"
4646 << "Tensor<string, 1> unuse_constant_columns() method.\n"
4647 << "Number of columns is zero.\n";
4648
4649 throw logic_error(buffer.str());
4650 }
4651
4652 #endif
4653
4654 Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
4655
4656 Tensor<string, 1> constant_columns(0);
4657
4658 Index variable_index = 0;
4659
4660 for(Index i = 0; i < columns_number; i++)
4661 {
4662
4663
4664 if(columns(i).column_use == Input)
4665 {
4666
4667 if(columns(i).type == Categorical)
4668 {
4669
4670 const Index categories_number = columns(i).categories.size();
4671
4672 bool is_constant = true;
4673
4674 for(Index j = 0; j < categories_number; j++)
4675 {
4676
4677 const type column_standard_deviation = standard_deviation(data.chip(variable_index+j,1), used_samples_indices);
4678 if((column_standard_deviation - 0) > numeric_limits<type>::min())
4679 {
4680 is_constant = false;
4681 break;
4682 }
4683
4684 }
4685
4686 if(is_constant) columns(i).set_use(UnusedVariable);
4687
4688 constant_columns = push_back(constant_columns, columns(i).name);
4689
4690 }
4691 else
4692 {
4693
4694 const type column_standard_deviation = standard_deviation(data.chip(variable_index,1), used_samples_indices);
4695
4696 if((column_standard_deviation - 0) < numeric_limits<type>::min())
4697
4698 {
4699 columns(i).set_use(UnusedVariable);
4700
4701 constant_columns = push_back(constant_columns, columns(i).name);
4702
4703 }
4704 }
4705 }
4706
4707 columns(i).type == Categorical ? variable_index += columns(i).categories.size() : variable_index++;
4708
4709 }
4710 return constant_columns;
4711 }
4712
4713
4714 /// Removes the training, selection and testing indices of that samples which are repeated in the data matrix.
4715 /// It might change the size of the vectors containing the training, selection and testing indices.
4716
unuse_repeated_samples()4717 Tensor<Index, 1> DataSet::unuse_repeated_samples()
4718 {
4719 const Index samples_number = get_samples_number();
4720
4721 #ifdef __OPENNN_DEBUG__
4722
4723 if(samples_number == 0)
4724 {
4725 ostringstream buffer;
4726
4727 buffer << "OpenNN Exception: DataSet class.\n"
4728 << "Tensor<Index, 1> unuse_repeated_samples() method.\n"
4729 << "Number of samples is zero.\n";
4730
4731 throw logic_error(buffer.str());
4732 }
4733
4734 #endif
4735
4736 Tensor<Index, 1> repeated_samples;
4737
4738 Tensor<type, 1> sample_i;
4739 Tensor<type, 1> sample_j;
4740
4741 #pragma omp parallel for private(sample_i, sample_j) schedule(dynamic)
4742
4743 for(Index i = 0; i < static_cast<Index>(samples_number); i++)
4744 {
4745 sample_i = get_sample_data(i);
4746
4747 for(Index j = static_cast<Index>(i+1); j < samples_number; j++)
4748 {
4749 sample_j = get_sample_data(j);
4750
4751 if(get_sample_use(j) != UnusedSample
4752 && std::equal(sample_i.data(), sample_i.data()+sample_i.size(), sample_j.data()))
4753 {
4754 set_sample_use(j, UnusedSample);
4755
4756 repeated_samples = push_back(repeated_samples, j);
4757 }
4758 }
4759 }
4760
4761 return repeated_samples;
4762 }
4763
4764
4765 /// Return unused variables without correlation.
4766 /// @param minimum_correlation Minimum correlation between variables.
4767
unuse_uncorrelated_columns(const type & minimum_correlation)4768 Tensor<string, 1> DataSet::unuse_uncorrelated_columns(const type& minimum_correlation)
4769 {
4770 Tensor<string, 1> unused_columns;
4771
4772 const Tensor<CorrelationResults, 2> correlations = calculate_input_target_columns_correlations();
4773
4774 const Index input_columns_number = get_input_columns_number();
4775 const Index target_columns_number = get_target_columns_number();
4776
4777 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
4778
4779 for(Index i = 0; i < input_columns_number; i++)
4780 {
4781 const Index index = input_columns_indices(i);
4782
4783 for(Index j = 0; j < target_columns_number; j++)
4784 {
4785 if(columns(index).column_use != UnusedVariable && abs(correlations(i,j).correlation) < minimum_correlation)
4786 {
4787 columns(index).set_use(UnusedVariable);
4788
4789 unused_columns = push_back(unused_columns, columns(index).name);
4790 }
4791 }
4792 }
4793
4794 return unused_columns;
4795 }
4796
4797
/// Returns the distribution of each of the columns. In the case of numeric columns, it returns a
/// histogram, for the case of categorical columns, it returns the frequencies of each category and for the
/// binary columns it returns the frequencies of the positives and negatives.
/// The default number of bins is 10.
/// Only used columns produce an entry in the result; unused columns are skipped
/// but their variables still advance the internal variable index.
/// @param bins_number Number of bins.
/// @return One Histogram per used column, in column order.

Tensor<Histogram, 1> DataSet::calculate_columns_distribution(const Index& bins_number) const
{
    const Index columns_number = columns.size();
    const Index used_columns_number = get_used_columns_number();
    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Index used_samples_number = used_samples_indices.size();

    Tensor<Histogram, 1> histograms(used_columns_number);

    // variable_index walks the flat variable layout of "data" (a categorical
    // column spans one variable per category); used_column_index walks the result.
    Index variable_index = 0;
    Index used_column_index = 0;

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).type == Numeric)
        {
            if(columns(i).column_use == UnusedVariable)
            {
                variable_index++;
            }
            else
            {
                // Gather the used samples of this variable and build a histogram.
                Tensor<type, 1> column(used_samples_number);

                for(Index j = 0; j < used_samples_number; j++)
                {
                    column(j) = data(used_samples_indices(j), variable_index);
                }

                histograms(used_column_index) = histogram(column, bins_number);

                variable_index++;
                used_column_index++;
            }
        }
        else if(columns(i).type == Categorical)
        {
            const Index categories_number = columns(i).get_categories_number();

            if(columns(i).column_use == UnusedVariable)
            {
                variable_index += categories_number;
            }
            else
            {
                // One frequency per category: count the used samples whose
                // one-hot variable equals 1.
                Tensor<Index, 1> categories_frequencies(categories_number);
                categories_frequencies.setZero();
                Tensor<type, 1> centers(categories_number);

                for(Index j = 0; j < categories_number; j++)
                {
                    for(Index k = 0; k < used_samples_number; k++)
                    {
                        if(abs(data(used_samples_indices(k), variable_index) - 1) < numeric_limits<type>::min())
                        {
                            categories_frequencies(j)++;
                        }
                    }

                    // Centers are just the category ordinals.
                    centers(j) = static_cast<type>(j);

                    variable_index++;
                }

                histograms(used_column_index).frequencies = categories_frequencies;
                histograms(used_column_index).centers = centers;

                used_column_index++;
            }
        }
        else if(columns(i).type == Binary)
        {
            if(columns(i).column_use == UnusedVariable)
            {
                variable_index++;
            }
            else
            {
                // frequencies(0) counts positives (value == 1), frequencies(1)
                // counts everything else.
                Tensor<Index, 1> binary_frequencies(2);
                binary_frequencies.setZero();

                for(Index j = 0; j < used_samples_number; j++)
                {
                    if(fabsf(data(used_samples_indices(j), variable_index) - 1) < numeric_limits<type>::min())
                    {
                        binary_frequencies(0)++;
                    }
                    else
                    {
                        binary_frequencies(1)++;
                    }
                }

                histograms(used_column_index).frequencies = binary_frequencies;
                variable_index++;
                used_column_index++;
            }
        }
        else // Time @todo
        {
            // Time columns are not histogrammed yet; just skip the variable.
            variable_index++;
        }
    }

    return histograms;
}
4910
4911
4912 /// Returns a vector of subvectors with the values of a box and whiskers plot.
4913 /// The size of the vector is equal to the number of used variables.
4914 /// The size of the subvectors is 5 and they consist on:
4915 /// <ul>
4916 /// <li> Minimum
4917 /// <li> First quartile
4918 /// <li> Second quartile
4919 /// <li> Third quartile
4920 /// <li> Maximum
4921 /// </ul>
4922
calculate_columns_box_plots() const4923 Tensor<BoxPlot, 1> DataSet::calculate_columns_box_plots() const
4924 {
4925 Index used_columns_number = get_used_columns_number();
4926
4927 Index columns_number = get_columns_number();
4928
4929 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
4930
4931 Tensor<BoxPlot, 1> box_plots(used_columns_number);
4932
4933 Index used_column_index = 0;
4934 Index variable_index = 0;
4935
4936 for(Index i = 0; i < columns_number; i++)
4937 {
4938 if(columns(i).type == Numeric || columns(i).type == Binary)
4939 {
4940 if(columns(i).column_use != UnusedVariable)
4941 {
4942 cout << "Column: " << columns(i).name << endl;
4943
4944 box_plots(used_column_index) = box_plot(data.chip(variable_index, 1), used_samples_indices);
4945
4946 cout << "min: " << box_plots(used_column_index).minimum << endl;
4947 cout << "max: " << box_plots(used_column_index).maximum << endl;
4948
4949
4950 used_column_index++;
4951 }
4952
4953 variable_index++;
4954 }
4955 else if(columns(i).type == Categorical)
4956 {
4957 variable_index += columns(i).get_categories_number();
4958 }
4959 else
4960 {
4961 variable_index++;
4962 }
4963 }
4964
4965 return box_plots;
4966 }
4967
4968
4969 /// Counts the number of used negatives of the selected target.
4970 /// @param target_index Index of the target to evaluate.
4971
calculate_used_negatives(const Index & target_index) const4972 Index DataSet::calculate_used_negatives(const Index& target_index) const
4973 {
4974 Index negatives = 0;
4975
4976 const Tensor<Index, 1> used_indices = get_used_samples_indices();
4977
4978 const Index used_samples_number = used_indices.size();
4979
4980 for(Index i = 0; i < used_samples_number; i++)
4981 {
4982 const Index training_index = used_indices(i);
4983
4984 if(fabsf(data(training_index, target_index)) < numeric_limits<type>::min())
4985 {
4986 negatives++;
4987 }
4988 else if(fabsf(data(training_index, target_index) - static_cast<type>(1)) > static_cast<type>(1.0e-3))
4989 {
4990 ostringstream buffer;
4991
4992 buffer << "OpenNN Exception: DataSet class.\n"
4993 << "Index calculate_used_negatives(const Index&) const method.\n"
4994 << "Training sample is neither a positive nor a negative: " << data(training_index, target_index) << endl;
4995
4996 throw logic_error(buffer.str());
4997 }
4998 }
4999
5000 return negatives;
5001 }
5002
5003
5004 /// Counts the number of negatives of the selected target in the training data.
5005 /// @param target_index Index of the target to evaluate.
5006
calculate_training_negatives(const Index & target_index) const5007 Index DataSet::calculate_training_negatives(const Index& target_index) const
5008 {
5009 Index negatives = 0;
5010
5011 const Tensor<Index, 1> training_indices = get_training_samples_indices();
5012
5013 const Index training_samples_number = training_indices.size();
5014
5015 for(Index i = 0; i < training_samples_number; i++)
5016 {
5017 const Index training_index = training_indices(i);
5018
5019 if(fabsf(data(training_index, target_index)) < numeric_limits<type>::min())
5020 {
5021 negatives++;
5022 }
5023 else if(fabsf(data(training_index, target_index) - static_cast<type>(1)) > static_cast<type>(1.0e-3))
5024 {
5025 ostringstream buffer;
5026
5027 buffer << "OpenNN Exception: DataSet class.\n"
5028 << "Index calculate_training_negatives(const Index&) const method.\n"
5029 << "Training sample is neither a positive nor a negative: " << data(training_index, target_index) << endl;
5030
5031 throw logic_error(buffer.str());
5032 }
5033 }
5034
5035 return negatives;
5036 }
5037
5038
5039 /// Counts the number of negatives of the selected target in the selection data.
5040 /// @param target_index Index of the target to evaluate.
5041
calculate_selection_negatives(const Index & target_index) const5042 Index DataSet::calculate_selection_negatives(const Index& target_index) const
5043 {
5044 Index negatives = 0;
5045
5046 const Index selection_samples_number = get_selection_samples_number();
5047
5048 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
5049
5050 for(Index i = 0; i < static_cast<Index>(selection_samples_number); i++)
5051 {
5052 const Index selection_index = selection_indices(i);
5053
5054 if(fabsf(data(selection_index, target_index)) < numeric_limits<type>::min())
5055 {
5056 negatives++;
5057 }
5058 else if(fabsf(data(selection_index, target_index) - 1) > numeric_limits<type>::min())
5059 {
5060 ostringstream buffer;
5061
5062 buffer << "OpenNN Exception: DataSet class.\n"
5063 << "Index calculate_testing_negatives(const Index&) const method.\n"
5064 << "Selection sample is neither a positive nor a negative: " << data(selection_index, target_index) << endl;
5065
5066 throw logic_error(buffer.str());
5067 }
5068 }
5069
5070 return negatives;
5071 }
5072
5073
5074 /// Counts the number of negatives of the selected target in the testing data.
5075 /// @param target_index Index of the target to evaluate.
5076
calculate_testing_negatives(const Index & target_index) const5077 Index DataSet::calculate_testing_negatives(const Index& target_index) const
5078 {
5079 Index negatives = 0;
5080
5081 const Index testing_samples_number = get_testing_samples_number();
5082
5083 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
5084
5085 for(Index i = 0; i < static_cast<Index>(testing_samples_number); i++)
5086 {
5087 const Index testing_index = testing_indices(i);
5088
5089 if(data(testing_index, target_index) < numeric_limits<type>::min())
5090 {
5091 negatives++;
5092 }
5093 }
5094
5095 return negatives;
5096 }
5097
5098
5099 /// Returns a vector of vectors containing some basic descriptives of all the variables in the data set.
5100 /// The size of this vector is four. The subvectors are:
5101 /// <ul>
5102 /// <li> Minimum.
5103 /// <li> Maximum.
5104 /// <li> Mean.
5105 /// <li> Standard deviation.
5106 /// </ul>
5107
calculate_variables_descriptives() const5108 Tensor<Descriptives, 1> DataSet::calculate_variables_descriptives() const
5109 {
5110 return descriptives(data);
5111 }
5112
5113
5114 /// Returns a vector of vectors containing some basic descriptives of the used variables and samples
5115 /// The size of this vector is four. The subvectors are:
5116 /// <ul>
5117 /// <li> Minimum.
5118 /// <li> Maximum.
5119 /// <li> Mean.
5120 /// <li> Standard deviation.
5121 /// </ul>
5122
calculate_used_variables_descriptives() const5123 Tensor<Descriptives, 1> DataSet::calculate_used_variables_descriptives() const
5124 {
5125 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5126 const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();
5127
5128 return descriptives(data, used_samples_indices, used_variables_indices);
5129 }
5130
5131
5132 /// Calculate the descriptives of the samples with positive targets in binary classification problems.
5133 /// @todo Low priority.
5134
calculate_columns_descriptives_positive_samples() const5135 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_positive_samples() const
5136 {
5137
5138 #ifdef __OPENNN_DEBUG__
5139
5140 const Index targets_number = get_target_variables_number();
5141
5142 if(targets_number != 1)
5143 {
5144 ostringstream buffer;
5145
5146 buffer << "OpenNN Exception: DataSet class.\n"
5147 << "Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
5148 << "Number of targets muste be 1.\n";
5149
5150 throw logic_error(buffer.str());
5151 }
5152 #endif
5153
5154 const Index target_index = get_target_variables_indices()(0);
5155
5156
5157 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5158 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5159
5160 const Index samples_number = used_samples_indices.size();
5161
5162 // Count used positive samples
5163
5164 Index positive_samples_number = 0;
5165
5166 for (Index i = 0; i < samples_number; i++)
5167 {
5168 Index sample_index = used_samples_indices(i);
5169
5170 if(abs(data(sample_index, target_index) - 1) < numeric_limits<type>::min()) positive_samples_number++;
5171 }
5172
5173 // Get used positive samples indices
5174
5175 Tensor<Index, 1> positive_used_samples_indices(positive_samples_number);
5176 Index positive_sample_index = 0;
5177
5178 for(Index i = 0; i < samples_number; i++)
5179 {
5180 Index sample_index = used_samples_indices(i);
5181
5182 if(abs(data(sample_index, target_index) - 1) < numeric_limits<type>::min())
5183 {
5184 positive_used_samples_indices(positive_sample_index) = sample_index;
5185 positive_sample_index++;
5186 }
5187 }
5188 return descriptives(data, positive_used_samples_indices, input_variables_indices);
5189 }
5190
5191
5192 /// Calculate the descriptives of the samples with neagtive targets in binary classification problems.
5193 /// @todo Low priority.
5194
calculate_columns_descriptives_negative_samples() const5195 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_negative_samples() const
5196 {
5197
5198 #ifdef __OPENNN_DEBUG__
5199
5200 const Index targets_number = get_target_variables_number();
5201
5202 if(targets_number != 1)
5203 {
5204 ostringstream buffer;
5205
5206 buffer << "OpenNN Exception: DataSet class.\n"
5207 << "Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
5208 << "Number of targets muste be 1.\n";
5209
5210 throw logic_error(buffer.str());
5211 }
5212 #endif
5213
5214 const Index target_index = get_target_variables_indices()(0);
5215
5216 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5217 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5218
5219 const Index samples_number = used_samples_indices.size();
5220
5221 // Count used negative samples
5222
5223 Index negative_samples_number = 0;
5224
5225 for (Index i = 0; i < samples_number; i++)
5226 {
5227 Index sample_index = used_samples_indices(i);
5228
5229 if(data(sample_index, target_index) < numeric_limits<type>::min()) negative_samples_number++;
5230 }
5231
5232 // Get used negative samples indices
5233
5234 Tensor<Index, 1> negative_used_samples_indices(negative_samples_number);
5235 Index negative_sample_index = 0;
5236
5237 for(Index i = 0; i < samples_number; i++)
5238 {
5239 Index sample_index = used_samples_indices(i);
5240
5241 if(data(sample_index, target_index) < numeric_limits<type>::min())
5242 {
5243 negative_used_samples_indices(negative_sample_index) = sample_index;
5244 negative_sample_index++;
5245 }
5246
5247 }
5248
5249 return descriptives(data, negative_used_samples_indices, input_variables_indices);
5250 }
5251
5252
5253 /// Returns a matrix with the data set descriptive statistics.
5254 /// @param class_index Data set index number to make the descriptive statistics.
5255
calculate_columns_descriptives_categories(const Index & class_index) const5256 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_categories(const Index& class_index) const
5257 {
5258 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5259 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5260
5261 const Index samples_number = used_samples_indices.size();
5262
5263 // Count used class samples
5264
5265 Index class_samples_number = 0;
5266
5267 for (Index i = 0; i < samples_number; i++)
5268 {
5269 Index sample_index = used_samples_indices(i);
5270
5271 if(abs(data(sample_index, class_index) - 1) < numeric_limits<type>::min()) class_samples_number++;
5272 }
5273
5274 // Get used class samples indices
5275
5276 Tensor<Index, 1> class_used_samples_indices(class_samples_number);
5277 class_used_samples_indices.setZero();
5278 Index class_sample_index = 0;
5279
5280 for(Index i = 0; i < samples_number; i++)
5281 {
5282 Index sample_index = used_samples_indices(i);
5283
5284 if(abs(data(sample_index, class_index) - 1) < numeric_limits<type>::min())
5285 {
5286 class_used_samples_indices(class_sample_index) = sample_index;
5287 class_sample_index++;
5288 }
5289 }
5290
5291 return descriptives(data, class_used_samples_indices, input_variables_indices);
5292 }
5293
5294
5295 /// Returns a vector of vectors containing some basic descriptives of all variables on the training
5296 /// The size of this vector is two. The subvectors are:
5297 /// <ul>
5298 /// <li> Training data minimum.
5299 /// <li> Training data maximum.
5300 /// <li> Training data mean.
5301 /// <li> Training data standard deviation.
5302 /// </ul>
5303
calculate_columns_descriptives_training_samples() const5304 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_training_samples() const
5305 {
5306 const Tensor<Index, 1> training_indices = get_training_samples_indices();
5307
5308 const Tensor<Index, 1> used_indices = get_used_columns_indices();
5309
5310 return descriptives(data, training_indices, used_indices);
5311 }
5312
5313
5314 /// Returns a vector of vectors containing some basic descriptives of all variables on the selection
5315 /// The size of this vector is two. The subvectors are:
5316 /// <ul>
5317 /// <li> Selection data minimum.
5318 /// <li> Selection data maximum.
5319 /// <li> Selection data mean.
5320 /// <li> Selection data standard deviation.
5321 /// </ul>
5322
calculate_columns_descriptives_selection_samples() const5323 Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_selection_samples() const
5324 {
5325 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
5326
5327 const Tensor<Index, 1> used_indices = get_used_columns_indices();
5328
5329 return descriptives(data, selection_indices, used_indices);
5330 }
5331
5332
5333 /// Returns a vector of Descriptives structures with some basic statistics of the input variables on the used
5334 /// This includes the minimum, maximum, mean and standard deviation.
5335 /// The size of this vector is the number of inputs.
5336
calculate_input_variables_descriptives() const5337 Tensor<Descriptives, 1> DataSet::calculate_input_variables_descriptives() const
5338 {
5339 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5340
5341 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5342
5343 return descriptives(data, used_samples_indices, input_variables_indices);
5344 }
5345
5346
5347 /// Returns a vector of vectors with some basic descriptives of the target variables on all
5348 /// The size of this vector is four. The subvectors are:
5349 /// <ul>
5350 /// <li> Target variables minimum.
5351 /// <li> Target variables maximum.
5352 /// <li> Target variables mean.
5353 /// <li> Target variables standard deviation.
5354 /// </ul>
5355
calculate_target_variables_descriptives() const5356 Tensor<Descriptives, 1> DataSet::calculate_target_variables_descriptives() const
5357 {
5358 const Tensor<Index, 1> used_indices = get_used_samples_indices();
5359
5360 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5361
5362 return descriptives(data, used_indices, target_variables_indices);
5363 }
5364
5365
5366 /// Returns a vector containing the minimums of the input variables.
5367
calculate_input_variables_minimums() const5368 Tensor<type, 1> DataSet::calculate_input_variables_minimums() const
5369 {
5370 return columns_minimums(data, get_used_samples_indices(), get_input_variables_indices());
5371 }
5372
5373
5374 /// Returns a vector containing the minimums of the target variables.
5375
calculate_target_variables_minimums() const5376 Tensor<type, 1> DataSet::calculate_target_variables_minimums() const
5377 {
5378 return columns_minimums(data, get_used_samples_indices(), get_target_variables_indices());
5379 }
5380
5381
5382
5383 /// Returns a vector containing the maximums of the input variables.
5384
calculate_input_variables_maximums() const5385 Tensor<type, 1> DataSet::calculate_input_variables_maximums() const
5386 {
5387 return columns_maximums(data, get_used_samples_indices(), get_input_variables_indices());
5388 }
5389
5390
5391 /// Returns a vector containing the maximums of the target variables.
5392
calculate_target_variables_maximums() const5393 Tensor<type, 1> DataSet::calculate_target_variables_maximums() const
5394 {
5395 return columns_maximums(data, get_used_samples_indices(), get_target_variables_indices());
5396 }
5397
5398
/// Returns a vector containing the minimums of the used variables.
/// (Comment corrected: it previously said "maximum" although the function
/// delegates to columns_minimums.)

Tensor<type, 1> DataSet::calculate_used_variables_minimums() const
{
    return columns_minimums(data, get_used_samples_indices(), get_used_variables_indices());
}
5405
5406 /// Returns a vector containing the means of a set of given variables.
5407 /// @param variables_indices Indices of the variables.
5408
calculate_variables_means(const Tensor<Index,1> & variables_indices) const5409 Tensor<type, 1> DataSet::calculate_variables_means(const Tensor<Index, 1>& variables_indices) const
5410 {
5411 const Index variables_number = variables_indices.size();
5412
5413 Tensor<type, 1> means(variables_number);
5414
5415 #pragma omp parallel for
5416
5417 for(Index i = 0; i < variables_number; i++)
5418 {
5419 const Index variable_index = variables_indices(i);
5420
5421 const Tensor<type, 0> mean = data.chip(variable_index,1).mean();
5422
5423 means(i) = mean(0);
5424 }
5425
5426 return means;
5427 }
5428
5429
/// Returns a vector with some basic descriptives of the given input variable on all samples.
/// The size of this vector is four:
/// <ul>
/// <li> Input variable minimum.
/// <li> Input variable maximum.
/// <li> Input variable mean.
/// <li> Input variable standard deviation.
/// </ul>
/// @todo Not implemented: the intended computation is commented out and a
/// default-constructed Descriptives is returned, so @param input_index is
/// currently ignored.

Descriptives DataSet::calculate_input_descriptives(const Index& input_index) const
{
    // return descriptives_missing_values(data.chip(input_index,1));

    return Descriptives();
}
5446
5447
calculate_used_targets_mean() const5448 Tensor<type, 1> DataSet::calculate_used_targets_mean() const
5449 {
5450 const Tensor<Index, 1> used_indices = get_used_samples_indices();
5451
5452 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5453
5454 return mean(data, used_indices, target_variables_indices);
5455 }
5456
5457
5458
5459 /// Returns the mean values of the target variables on the selection
5460
calculate_selection_targets_mean() const5461 Tensor<type, 1> DataSet::calculate_selection_targets_mean() const
5462 {
5463 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
5464
5465 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5466
5467 return mean(data, selection_indices, target_variables_indices);
5468 }
5469
5470
5471 /// Returns the value of the gmt that has the data set, by default it is 0.
5472 /// This is recommended to use in forecasting problems.
5473
get_gmt() const5474 Index DataSet::get_gmt() const
5475 {
5476 return gmt;
5477 }
5478
5479
/// Sets the value of the gmt, by default it is 0.
/// This is recommended to use in forecasting problems.
/// @param new_gmt New Greenwich Mean Time offset.
// NOTE(review): the parameter is a non-const reference although it is never
// modified; sibling setters take const references. Changing it here would
// require updating the matching declaration in data_set.h, so it is left as-is.

void DataSet::set_gmt(Index& new_gmt)
{
    gmt = new_gmt;
}
5487
5488
5489 /// Calculates the correlations between all outputs and all inputs.
5490 /// It returns a matrix with the data stored in CorrelationsResults format, where the number of rows is the input number
5491 /// and number of columns is the target number.
5492 /// Each element contains the correlation between a single input and a single target.
5493
calculate_input_target_columns_correlations() const5494 Tensor<CorrelationResults, 2> DataSet::calculate_input_target_columns_correlations() const
5495 {
5496 const Index input_columns_number = get_input_columns_number();
5497 const Index target_columns_number = get_target_columns_number();
5498
5499 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
5500 Tensor<Index, 1> target_columns_indices = get_target_columns_indices();
5501
5502 Tensor<CorrelationResults, 2> correlations(input_columns_number, target_columns_number);
5503
5504 // #pragma omp parallel for
5505
5506 for(Index i = 0; i < input_columns_number; i++)
5507 {
5508 const Index input_index = input_columns_indices(i);
5509
5510 Tensor<type, 2> input = get_column_data(input_index);
5511
5512 const ColumnType input_type = columns(input_index).type;
5513
5514 for(Index j = 0; j < target_columns_number; j++)
5515 {
5516 const Index target_index = target_columns_indices(j);
5517
5518 Tensor<type, 2> target = get_column_data(target_index);
5519
5520 const ColumnType target_type = columns(target_index).type;
5521
5522 cout << "Calculating " << columns(input_index).name << " - " << columns(target_index).name << " correlations. \n" ;
5523
5524 if(input_type == Numeric && target_type == Numeric)
5525 {
5526 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5527 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5528
5529 const CorrelationResults linear_correlation = linear_correlations(thread_pool_device, input_column, target_column);
5530 const CorrelationResults exponential_correlation = exponential_correlations(thread_pool_device, input_column, target_column);
5531 const CorrelationResults logarithmic_correlation = logarithmic_correlations(thread_pool_device, input_column, target_column);
5532 const CorrelationResults power_correlation = power_correlations(thread_pool_device, input_column, target_column);
5533
5534 CorrelationResults strongest_correlation = linear_correlation;
5535
5536 if(abs(exponential_correlation.correlation) > abs(strongest_correlation.correlation)) strongest_correlation = exponential_correlation;
5537 if(abs(logarithmic_correlation.correlation) > abs(strongest_correlation.correlation)) strongest_correlation = logarithmic_correlation;
5538 if(abs(power_correlation.correlation) > abs(strongest_correlation.correlation)) strongest_correlation = power_correlation;
5539
5540 correlations(i,j) = strongest_correlation;
5541 }
5542 else if(input_type == Binary && target_type == Binary)
5543 {
5544 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5545 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5546
5547 correlations(i,j) = linear_correlations(thread_pool_device, input_column, target_column);
5548 }
5549 else if(input_type == Categorical && target_type == Categorical)
5550 {
5551 // @todo
5552 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target);
5553
5554 // correlations(i,j) = karl_pearson_correlations(thread_pool_device, input, target);
5555 }
5556 else if(input_type == Numeric && target_type == Binary)
5557 {
5558 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5559 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5560
5561 correlations(i,j) = logistic_correlations(thread_pool_device, input_column, target_column);
5562 }
5563 else if(input_type == Binary && target_type == Numeric)
5564 {
5565 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5566 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5567
5568 correlations(i,j) = logistic_correlations(thread_pool_device, input_column, target_column);
5569 }
5570 else if(input_type == Categorical && target_type == Numeric)
5571 {
5572 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5573
5574 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target/*target_column*/);
5575 }
5576 else if(input_type == Numeric && target_type == Categorical)
5577 {
5578 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5579
5580 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, target, input/*input_column*/);
5581 }
5582 else if(input_type == Binary && target_type == Categorical)
5583 {
5584 const TensorMap<Tensor<type, 1>> input_column(input.data(), input.dimension(0));
5585
5586 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, target, input/*input_column*/);
5587
5588 // correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target);
5589
5590 // const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5591
5592 // Tensor<type, 2> new_input = transform_binary_column(input_column);
5593
5594 // correlations(i,j) = karl_pearson_correlations(thread_pool_device, new_input, target);
5595 }
5596 else if(input_type == Categorical && target_type == Binary)
5597 {
5598 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input, target);
5599
5600 // const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5601
5602 // Tensor<type, 2> new_target = transform_binary_column(target_column);
5603
5604 // correlations(i,j) = karl_pearson_correlations(thread_pool_device, input, new_target);
5605 }
5606 else if(input_type == DateTime || target_type == DateTime)
5607 {
5608 correlations(i,j).correlation = 0;
5609 }
5610 else
5611 {
5612 ostringstream buffer;
5613
5614 buffer << "OpenNN Exception: DataSet class.\n"
5615 << "Tensor<type, 2> calculate_input_target_columns_correlations() const method.\n"
5616 << "Case not found: Column " << columns(input_index).name << " and Column " << columns(target_index).name << ".\n";
5617
5618 throw logic_error(buffer.str());
5619 }
5620
5621 cout << "Correlation: " << correlations(i,j).correlation << endl;
5622
5623 }
5624 }
5625
5626 return correlations;
5627 }
5628
5629
5630 /// Calculates the correlations between all outputs and all inputs.
5631 /// It returns a matrix with the number of rows is the input number
5632 /// and number of columns is the target number.
5633 /// Each element contains the correlation between a single input and a single target.
5634
calculate_input_target_columns_correlations_values() const5635 Tensor<type, 2> DataSet::calculate_input_target_columns_correlations_values() const
5636 {
5637 Tensor<CorrelationResults, 2> correlations = calculate_input_target_columns_correlations();
5638
5639 const Index rows_number = correlations.dimension(0);
5640 const Index columns_number = correlations.dimension(1);
5641
5642 Tensor<type, 2> correlations_values(rows_number, columns_number);
5643
5644 for(Index i = 0; i < rows_number; i++)
5645 {
5646 for(Index j = 0; j < columns_number; j++)
5647 {
5648 correlations_values(i,j) = correlations(i,j).correlation;
5649 }
5650 }
5651
5652 return correlations_values;
5653 }
5654
5655
5656 /// Returns true if the data contain missing values.
5657
has_nan() const5658 bool DataSet::has_nan() const
5659 {
5660 for(Index i = 0; i < data.size(); i++) if(::isnan(data(i))) return true;
5661
5662 return false;
5663 }
5664
5665
5666 /// Returns true if the given row contains missing values.
5667
has_nan_row(const Index & row_index) const5668 bool DataSet::has_nan_row(const Index& row_index) const
5669 {
5670 for(Index j = 0; j < data.dimension(1); j++)
5671 {
5672 if(::isnan(data(row_index,j))) return true;
5673 }
5674
5675 return false;
5676 }
5677
5678
/// Print on screen the information about the missing values in the data set.
/// <ul>
/// <li> Total number of missing values.
/// <li> Number of variables with missing values.
/// <li> Number of samples with missing values.
/// </ul>
/// NOTE(review): this method is currently a stub — every statement below is
/// commented out, so calling it prints nothing.
/// @todo implement with indices of variables and samples?

void DataSet::print_missing_values_information() const
{
    // const Index missing_values_number = data.count_nan();

    // cout << "Missing values number: " << missing_values_number << " (" << missing_values_number*100/data.size() << "%)" << endl;

    // const Index variables_with_missing_values = data.count_columns_with_nan();

    // cout << "Variables with missing values: " << variables_with_missing_values << " (" << variables_with_missing_values*100/data.dimension(1) << "%)" << endl;

    // const Index samples_with_missing_values = data.count_rows_with_nan();

    // cout << "Samples with missing values: " << samples_with_missing_values << " (" << samples_with_missing_values*100/data.dimension(0) << "%)" << endl;
}
5701
5702
5703 /// Print on screen the correlation between targets and inputs.
5704
print_input_target_columns_correlations() const5705 void DataSet::print_input_target_columns_correlations() const
5706 {
5707 const Index inputs_number = get_input_variables_number();
5708 const Index targets_number = get_target_variables_number();
5709
5710 const Tensor<string, 1> inputs_names = get_input_variables_names();
5711 const Tensor<string, 1> targets_name = get_target_variables_names();
5712
5713 const Tensor<RegressionResults, 2> correlations;// = calculate_input_target_columns_correlations();
5714
5715 for(Index j = 0; j < targets_number; j++)
5716 {
5717 for(Index i = 0; i < inputs_number; i++)
5718 {
5719 cout << targets_name(j) << " - " << inputs_names(i) << ": " << correlations(i,j).correlation << endl;
5720 }
5721 }
5722 }
5723
5724
5725 /// This method print on screen the corretaliont between inputs and targets.
5726 /// @param number Number of variables to be printed.
5727 /// @todo
5728
print_top_input_target_columns_correlations(const Index & number) const5729 void DataSet::print_top_input_target_columns_correlations(const Index& number) const
5730 {
5731 const Index inputs_number = get_input_columns_number();
5732 const Index targets_number = get_target_columns_number();
5733
5734 const Tensor<string, 1> inputs_names = get_input_variables_names();
5735 const Tensor<string, 1> targets_name = get_target_variables_names();
5736
5737 const Tensor<RegressionResults, 2> correlations;// = calculate_input_target_columns_correlations();
5738
5739 Tensor<type, 1> target_correlations(inputs_number);
5740
5741 Tensor<string, 2> top_correlations(inputs_number, 2);
5742
5743 map<type,string> top_correlation;
5744
5745 for(Index i = 0 ; i < inputs_number; i++)
5746 {
5747 for(Index j = 0 ; j < targets_number ; j++)
5748 {
5749 // top_correlation.insert(pair<type,string>(correlations(i,j), inputs_names(i) + " - " + targets_name(j)));
5750 }
5751 }
5752
5753 map<type,string>::iterator it;
5754
5755 for(it = top_correlation.begin(); it!=top_correlation.end(); it++)
5756 {
5757 cout << "Correlation: " << (*it).first << " between " << (*it).second << "" << endl;
5758 }
5759 }
5760
5761
5762 /// Calculates the regressions between all outputs and all inputs.
5763 /// It returns a matrix with the data stored in RegressionResults format, where the number of rows is the input number
5764 /// and number of columns is the target number.
5765 /// Each element contains the correlation between a single input and a single target.
5766
calculate_input_target_columns_regressions() const5767 Tensor<RegressionResults, 2> DataSet::calculate_input_target_columns_regressions() const
5768 {
5769 const Index input_columns_number = get_input_columns_number();
5770 const Index target_columns_number = get_target_columns_number();
5771
5772 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
5773 Tensor<Index, 1> target_columns_indices = get_target_columns_indices();
5774
5775 Tensor<RegressionResults, 2> regressions(input_columns_number, target_columns_number);
5776
5777 //@todo check pragma, if uncommented, for does not work well.
5778 //#pragma omp parallel for
5779
5780 for(Index i = 0; i < input_columns_number; i++)
5781 {
5782 cout << endl;
5783
5784 const Index input_index = input_columns_indices(i);
5785
5786 Tensor<type, 2> input = get_column_data(input_index);
5787
5788 const ColumnType input_type = columns(input_index).type;
5789
5790 cout << "Calculating " << columns(input_index).name;
5791
5792 for(Index j = 0; j < target_columns_number; j++)
5793 {
5794 const Index target_index = target_columns_indices(j);
5795
5796 Tensor<type, 2> target = get_column_data(target_index);
5797
5798 const ColumnType target_type = columns(target_index).type;
5799
5800 cout << " - " << columns(target_columns_indices(j)).name << " regression. \n" ;
5801
5802 if(input_type == Numeric && target_type == Numeric)
5803 {
5804 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5805 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5806
5807 const RegressionResults linear_regression = OpenNN::linear_regression(thread_pool_device, input_column, target_column);
5808 const RegressionResults exponential_regression = OpenNN::exponential_regression(thread_pool_device, input_column, target_column);
5809 const RegressionResults logarithmic_regression = OpenNN::logarithmic_regression(thread_pool_device, input_column, target_column);
5810 const RegressionResults power_regression = OpenNN::power_regression(thread_pool_device, input_column, target_column);
5811
5812 RegressionResults strongest_regression = linear_regression;
5813
5814 if(abs(exponential_regression.correlation) > abs(strongest_regression.correlation)) strongest_regression = exponential_regression;
5815 if(abs(logarithmic_regression.correlation) > abs(strongest_regression.correlation)) strongest_regression = logarithmic_regression;
5816 if(abs(power_regression.correlation) > abs(strongest_regression.correlation)) strongest_regression = power_regression;
5817
5818 regressions(i,j) = strongest_regression;
5819 }
5820 else if(input_type == Binary && target_type == Binary)
5821 {
5822 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5823 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5824
5825 regressions(i,j) = linear_regression(thread_pool_device, input_column, target_column);
5826 }
5827 else if(input_type == Numeric && target_type == Binary)
5828 {
5829 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5830 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5831
5832 regressions(i,j) = logistic_regression(thread_pool_device, input_column, target_column);
5833 }
5834 else if(input_type == Binary && target_type == Numeric)
5835 {
5836 const TensorMap<Tensor<type,1>> input_column(input.data(), input.dimension(0));
5837 const TensorMap<Tensor<type,1>> target_column(target.data(), target.dimension(0));
5838
5839 regressions(i,j) = logistic_regression(thread_pool_device, input_column, target_column);
5840 }
5841 else if(input_type == Categorical && target_type == Categorical)
5842 {
5843 // Nothing
5844
5845 regressions(i,j).a = 0;
5846 regressions(i,j).b = 0;
5847 }
5848 else if(input_type == Categorical && target_type == Numeric)
5849 {
5850 // Nothing
5851
5852 regressions(i,j).a = 0;
5853 regressions(i,j).b = 0;
5854 }
5855 else if(input_type == Numeric && target_type == Categorical)
5856 {
5857 // Nothing
5858
5859 regressions(i,j).a = 0;
5860 regressions(i,j).b = 0;
5861 }
5862 else if(input_type == Binary && target_type == Categorical)
5863 {
5864 // nothing
5865
5866 regressions(i,j).a = 0;
5867 regressions(i,j).b = 0;
5868 }
5869 else if(input_type == Categorical && target_type == Binary)
5870 {
5871 // nothing
5872
5873 regressions(i,j).a = 0;
5874 regressions(i,j).b = 0;
5875 }
5876 else
5877 {
5878 ostringstream buffer;
5879
5880 buffer << "OpenNN Exception: DataSet class.\n"
5881 << "Tensor<type, 2> calculate_input_target_columns_regressions() const method.\n"
5882 << "Case not found: Column " << columns(input_index).name << " and Column " << columns(target_index).name << ".\n";
5883
5884 throw logic_error(buffer.str());
5885 }
5886 }
5887 }
5888
5889 return regressions;
5890 }
5891
5892
5893 /// Calculate the correlation between each input in the data set.
5894 /// Returns a matrix with the correlation values between variables in the data set.
5895
calculate_input_columns_correlations() const5896 Tensor<type, 2> DataSet::calculate_input_columns_correlations() const
5897 {
5898 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
5899
5900 const Index input_columns_number = get_input_columns_number();
5901
5902 Tensor<type, 2> correlations(input_columns_number, input_columns_number);
5903 correlations.setConstant(1);
5904
5905 for(Index i = 0; i < input_columns_number; i++)
5906 {
5907 const Index current_input_index_i = input_columns_indices(i);
5908
5909 const ColumnType type_i = columns(current_input_index_i).type;
5910
5911 Tensor<type, 2> input_i = get_column_data(current_input_index_i);
5912
5913 cout << "Calculating " << columns(current_input_index_i).name << " correlations. " << endl;
5914
5915 #pragma omp parallel for
5916
5917 for(Index j = i; j < input_columns_number; j++)
5918 {
5919 const Index current_input_index_j = input_columns_indices(j);
5920
5921 const ColumnType type_j = columns(current_input_index_j).type;
5922
5923 Tensor<type, 2> input_j = get_column_data(current_input_index_j);
5924
5925 if(current_input_index_i == current_input_index_j)
5926 {
5927 correlations(i,j) = 1;
5928 continue;
5929 }
5930
5931 if(type_i == Numeric && type_j == Numeric)
5932 {
5933 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5934 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5935
5936 const type linear_correlation = OpenNN::linear_correlation(thread_pool_device, current_input_i, current_input_j);
5937 const type exponential_correlation = OpenNN::exponential_correlation(thread_pool_device, current_input_i, current_input_j);
5938 const type logarithmic_correlation = OpenNN::logarithmic_correlation(thread_pool_device, current_input_i, current_input_j);
5939 const type power_correlation = OpenNN::power_correlation(thread_pool_device, current_input_i, current_input_j);
5940
5941 type strongest_correlation = linear_correlation;
5942
5943 if(fabsf(exponential_correlation) > fabsf(strongest_correlation)) strongest_correlation = exponential_correlation;
5944 if(fabsf(logarithmic_correlation) > fabsf(strongest_correlation)) strongest_correlation = logarithmic_correlation;
5945 if(fabsf(power_correlation) > fabsf(strongest_correlation)) strongest_correlation = power_correlation;
5946
5947 correlations(i,j) = strongest_correlation;
5948 }
5949 else if(type_i == Binary && type_j == Binary)
5950 {
5951 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5952 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5953
5954 correlations(i,j) = linear_correlation(thread_pool_device, current_input_i, current_input_j);
5955 }
5956 else if(type_i == Categorical && type_j == Categorical)
5957 {
5958 correlations(i,j) = karl_pearson_correlation(thread_pool_device, input_i, input_j);
5959 }
5960 else if(type_i == Numeric && type_j == Binary)
5961 {
5962 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5963 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5964
5965 correlations(i,j) = logistic_correlations(thread_pool_device, current_input_i, current_input_j).correlation;
5966 }
5967 else if(type_i == Binary && type_j == Numeric)
5968 {
5969 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5970 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5971
5972 correlations(i,j) = logistic_correlations(thread_pool_device, current_input_i, current_input_j).correlation;
5973 }
5974 else if(type_i == Categorical && type_j == Numeric)
5975 {
5976 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5977
5978 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_i, input_j/*current_input_j*/).correlation;
5979 }
5980 else if(type_i == Numeric && type_j == Categorical)
5981 {
5982 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5983
5984 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_j, input_i/*current_input_i*/).correlation;
5985 }
5986 else if(type_i == Categorical && type_j == Binary)
5987 {
5988 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
5989
5990 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_i, input_j/*current_input_j*/).correlation;
5991 }
5992 else if(type_i == Binary && type_j == Categorical)
5993 {
5994 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
5995
5996 correlations(i,j) = multiple_logistic_correlations(thread_pool_device, input_j, input_i/*current_input_i*/).correlation;
5997 }
5998 else
5999 {
6000 ostringstream buffer;
6001
6002 buffer << "OpenNN Exception: DataSet class.\n"
6003 << "Tensor<type, 2> calculate_inputs_correlations() const method.\n"
6004 << "Case not found: Column " << columns(input_columns_indices(i)).name << " and Column " << columns(input_columns_indices(j)).name << ".\n";
6005
6006 throw logic_error(buffer.str());
6007 }
6008
6009 }
6010 }
6011
6012 for(Index i = 0; i < input_columns_number; i++)
6013 {
6014 for(Index j = 0; j < i; j++)
6015 {
6016 correlations(i,j) = correlations(j,i);
6017 }
6018 }
6019
6020 return correlations;
6021
6022 }
6023
6024
6025 /// Print on screen the correlation between variables in the data set.
6026
print_inputs_correlations() const6027 void DataSet::print_inputs_correlations() const
6028 {
6029 const Tensor<type, 2> inputs_correlations = calculate_input_columns_correlations();
6030
6031 cout << inputs_correlations << endl;
6032 }
6033
6034
print_data_file_preview() const6035 void DataSet::print_data_file_preview() const
6036 {
6037 const Index size = data_file_preview.size();
6038
6039 for(Index i = 0; i < size; i++)
6040 {
6041 for(Index j = 0; j < data_file_preview(i).size(); j++)
6042 {
6043 cout << data_file_preview(i)(j) << " ";
6044 }
6045
6046 cout << endl;
6047 }
6048 }
6049
6050
6051 /// This method print on screen the corretaliont between variables.
6052 /// @param number Number of variables to be printed.
6053 /// @todo Low priority.
6054
print_top_inputs_correlations(const Index & number) const6055 void DataSet::print_top_inputs_correlations(const Index& number) const
6056 {
6057 const Index variables_number = get_input_variables_number();
6058
6059 const Tensor<string, 1> variables_name = get_input_variables_names();
6060
6061 const Tensor<type, 2> variables_correlations = calculate_input_columns_correlations();
6062
6063 const Index correlations_number = variables_number*(variables_number-1)/2;
6064
6065 Tensor<string, 2> top_correlations(correlations_number, 3);
6066
6067 map<type, string> top_correlation;
6068
6069 for(Index i = 0; i < variables_number; i++)
6070 {
6071 for(Index j = i; j < variables_number; j++)
6072 {
6073 if(i == j) continue;
6074
6075 top_correlation.insert(pair<type,string>(variables_correlations(i,j), variables_name(i) + " - " + variables_name(j)));
6076 }
6077 }
6078
6079 map<type,string> :: iterator it;
6080
6081 for(it=top_correlation.begin(); it!=top_correlation.end(); it++)
6082 {
6083 cout << "Correlation: " << (*it).first << " between " << (*it).second << "" << endl;
6084 }
6085 }
6086
6087
6088 /// Returns the covariance matrix for the input data set.
6089 /// The number of rows of the matrix is the number of inputs.
6090 /// The number of columns of the matrix is the number of inputs.
6091 /// @todo
6092
calculate_covariance_matrix() const6093 Tensor<type, 2> DataSet::calculate_covariance_matrix() const
6094 {
6095 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6096 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
6097
6098 const Index inputs_number = get_input_variables_number();
6099
6100 Tensor<type, 2> covariance_matrix(inputs_number, inputs_number);
6101
6102 for(Index i = 0; i < static_cast<Index>(inputs_number); i++)
6103 {
6104 const Index first_input_index = input_variables_indices(i);
6105
6106 // const Tensor<type, 1> first_inputs = data.get_column(first_input_index, used_samples_indices);
6107
6108 for(Index j = i; j < inputs_number; j++)
6109 {
6110 const Index second_input_index = input_variables_indices(j);
6111
6112 // const Tensor<type, 1> second_inputs = data.get_column(second_input_index, used_samples_indices);
6113
6114 // covariance_matrix(i,j) = covariance(first_inputs, second_inputs);
6115 covariance_matrix(j,i) = covariance_matrix(i,j);
6116 }
6117 }
6118
6119 return covariance_matrix;
6120 }
6121
6122
/// Performs the principal components analysis of the inputs.
/// It returns a matrix containing the principal components arranged in rows.
/// This method deletes the unused samples of the original data set.
/// NOTE(review): the analysis is still unimplemented — everything past the
/// covariance computation is commented out, so this method currently only
/// centers the inputs and returns an empty matrix.
/// @param minimum_explained_variance Minimum percentage of variance used to select a principal component.

Tensor<type, 2> DataSet::perform_principal_components_analysis(const type& minimum_explained_variance)
{
    // Subtract off the mean

    subtract_inputs_mean();

    // Calculate covariance matrix

    const Tensor<type, 2> covariance_matrix = this->calculate_covariance_matrix();

    // Calculate eigenvectors

    // const Tensor<type, 2> eigenvectors = OpenNN::eigenvectors(covariance_matrix);

    // Calculate eigenvalues

    // const Tensor<type, 2> eigenvalues = OpenNN::eigenvalues(covariance_matrix);

    // Calculate explained variance

    // const Tensor<type, 1> explained_variance = OpenNN::explained_variance(eigenvalues.chip(0,1));

    // Sort principal components

    // const Tensor<Index, 1> sorted_principal_components_indices = explained_variance.sort_descending_indices();

    // Choose eigenvectors

    const Index inputs_number = covariance_matrix.dimension(1);

    Tensor<Index, 1> principal_components_indices;

    Index index;  // placeholder for the sorted component index (see commented code)

    for(Index i = 0; i < inputs_number; i++)
    {
        // index = sorted_principal_components_indices(i);

        // if(explained_variance(index) >= minimum_explained_variance)
        {
            // principal_components_indices.push_back(i);
        }
        // else
        {
            // NOTE(review): with the selection commented out, this branch
            // always runs, so no component is ever selected.
            continue;
        }
    }

    const Index principal_components_number = principal_components_indices.size();

    // Arrange principal components matrix

    Tensor<type, 2> principal_components;

    if(principal_components_number == 0)
    {
        return principal_components;
    }
    else
    {
        principal_components.resize(principal_components_number, inputs_number);
    }

    for(Index i = 0; i < principal_components_number; i++)
    {
        // index = sorted_principal_components_indices(i);

        // principal_components.set_row(i, eigenvectors.chip(index,1));
    }

    // Return feature matrix

    // return principal_components.get_submatrix_rows(principal_components_indices);

    return Tensor<type, 2>();
}
6204
6205
/// Performs the principal components analysis of the inputs.
/// It returns a matrix containing the principal components arranged in rows.
/// This method deletes the unused samples of the original data set.
/// NOTE(review): the analysis is still unimplemented — the eigenvector and
/// selection logic is commented out, so this method currently only centers
/// the inputs and returns an empty matrix.
/// @param covariance_matrix Matrix of covariances.
/// @param explained_variance vector of the explained variances of the variables.
/// @param minimum_explained_variance Minimum percentage of variance used to select a principal component.
/// @todo

Tensor<type, 2> DataSet::perform_principal_components_analysis(const Tensor<type, 2>& covariance_matrix,
        const Tensor<type, 1>& explained_variance,
        const type& minimum_explained_variance)
{
    // Subtract off the mean

    subtract_inputs_mean();

    // Calculate eigenvectors

    // const Tensor<type, 2> eigenvectors = OpenNN::eigenvectors(covariance_matrix);

    // Sort principal components

    // const Tensor<Index, 1> sorted_principal_components_indices = explained_variance.sort_descending_indices();

    // Choose eigenvectors

    const Index inputs_number = covariance_matrix.dimension(1);

    Tensor<Index, 1> principal_components_indices;

    Index index;  // placeholder for the sorted component index (see commented code)

    for(Index i = 0; i < inputs_number; i++)
    {
        // index = sorted_principal_components_indices(i);

        // if(explained_variance(index) >= minimum_explained_variance)
        {
            // principal_components_indices.push_back(i);
        }
        // else
        {
            // NOTE(review): with the selection commented out, this branch
            // always runs, so no component is ever selected.
            continue;
        }
    }

    const Index principal_components_number = principal_components_indices.size();

    // Arrange principal components matrix

    Tensor<type, 2> principal_components;

    if(principal_components_number == 0)
    {
        return principal_components;
    }
    else
    {
        principal_components.resize(principal_components_number, inputs_number);
    }

    for(Index i = 0; i < principal_components_number; i++)
    {
        // index = sorted_principal_components_indices(i);

        // principal_components.set_row(i, eigenvectors.chip(index,1));
    }

    // Return feature matrix

    // return principal_components.get_submatrix_rows(principal_components_indices);

    return Tensor<type, 2>();
}
6280
6281
/// Transforms the data according to the principal components.
/// NOTE(review): the final write-back to the data member is commented out, so
/// this method currently computes the projected matrix but does not modify the
/// data set (it does center the inputs via subtract_inputs_mean).
/// @param principal_components Matrix containing the principal components.
/// @todo

void DataSet::transform_principal_components_data(const Tensor<type, 2>& principal_components)
{
    // Kept for the pending assemble_columns step below (currently unused).
    const Tensor<type, 2> targets = get_target_data();

    subtract_inputs_mean();

    const Index principal_components_number = principal_components.dimension(0);

    // Transform data

    const Tensor<Index, 1> used_samples = get_used_samples_indices();

    const Index new_samples_number = get_used_samples_number();

    const Tensor<type, 2> inputs = get_input_data();

    Tensor<type, 2> new_data(new_samples_number, principal_components_number);

    Index sample_index;

    for(Index i = 0; i < new_samples_number; i++)
    {
        sample_index = used_samples(i);

        for(Index j = 0; j < principal_components_number; j++)
        {
            // Project the sample onto the j-th principal component.
            Tensor<type, 0> dot = (inputs.chip(sample_index, 0)).contract(principal_components.chip(j,0),product_vector_vector);

            new_data(i,j) = dot(0);
        }
    }

    // data = new_data.assemble_columns(targets);

}
6322
6323
/// Scales the data matrix with given mean and standard deviation values.
/// It updates the data matrix.
/// NOTE(review): the actual scaling call is commented out below, so this
/// method currently only emits the zero-standard-deviation warnings and
/// leaves the data unchanged.
/// @param data_descriptives vector of descriptives structures for all the variables in the data set.
/// The size of that vector must be equal to the number of variables.
/// @todo

void DataSet::scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>& data_descriptives)
{

#ifdef __OPENNN_DEBUG__

    ostringstream buffer;

    const Index columns_number = data.dimension(1);

    const Index descriptives_size = data_descriptives.size();

    if(descriptives_size != columns_number)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>&) method.\n"
               << "Size of descriptives must be equal to number of columns.\n";

        throw logic_error(buffer.str());
    }

#endif

    const Index variables_number = get_variables_number();

    // Warn about variables that cannot be scaled (near-zero standard deviation).
    for(Index i = 0; i < variables_number; i++)
    {
        if(display && abs(data_descriptives(i).standard_deviation) < numeric_limits<type>::min())
        {
            cout << "OpenNN Warning: DataSet class.\n"
                 << "void scale_data_mean_standard_deviation(const Tensor<Descriptives, 1>&) method.\n"
                 << "Standard deviation of variable " << i << " is zero.\n"
                 << "That variable won't be scaled.\n";
        }
    }

    // scale_mean_standard_deviation(data, data_descriptives);

}
6368
6369
6370 /// Scales the data using the minimum and maximum method,
6371 /// and the minimum and maximum values calculated from the data matrix.
6372 /// It also returns the descriptives from all columns.
6373
scale_data_minimum_maximum()6374 Tensor<Descriptives, 1> DataSet::scale_data_minimum_maximum()
6375 {
6376 const Tensor<Descriptives, 1> data_descriptives = calculate_variables_descriptives();
6377
6378 scale_data_minimum_maximum(data_descriptives);
6379
6380 return data_descriptives;
6381 }
6382
6383
6384 /// Scales the data using the mean and standard deviation method,
6385 /// and the mean and standard deviation values calculated from the data matrix.
6386 /// It also returns the descriptives from all columns.
6387
scale_data_mean_standard_deviation()6388 Tensor<Descriptives, 1> DataSet::scale_data_mean_standard_deviation()
6389 {
6390 const Tensor<Descriptives, 1> data_descriptives = calculate_variables_descriptives();
6391
6392 scale_data_mean_standard_deviation(data_descriptives);
6393
6394 return data_descriptives;
6395 }
6396
6397
scale_minimum_maximum_binary(const type & value_1,const type & value_2,const Index & column_index)6398 void DataSet::scale_minimum_maximum_binary(const type& value_1, const type& value_2,const Index& column_index)
6399 {
6400 const Index rows_number = data.dimension(0);
6401
6402 type slope = 0;
6403 type intercept = 0;
6404
6405 if(value_1>value_2){
6406 slope = 1/(value_1-value_2);
6407 intercept = -value_2/(value_1-value_2);
6408 }else{
6409 slope = 1/(value_2-value_1);
6410 intercept = -value_1/(value_2-value_1);
6411 }
6412
6413 for(Index i = 0; i < rows_number; i++)
6414 {
6415 data(i, column_index) = slope*data(i, column_index)+intercept;
6416 }
6417
6418 }
6419
6420 /// Subtracts off the mean to every of the input variables.
6421
subtract_inputs_mean()6422 void DataSet::subtract_inputs_mean()
6423 {
6424 Tensor<Descriptives, 1> input_statistics = calculate_input_variables_descriptives();
6425
6426 Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6427 Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
6428
6429 Index input_index;
6430 Index sample_index;
6431
6432 type input_mean;
6433
6434 for(Index i = 0; i < input_variables_indices.size(); i++)
6435 {
6436 input_index = input_variables_indices(i);
6437
6438 input_mean = input_statistics(i).mean;
6439
6440 for(Index j = 0; j < used_samples_indices.size(); j++)
6441 {
6442 sample_index = used_samples_indices(j);
6443
6444 data(sample_index,input_index) -= input_mean;
6445 }
6446 }
6447 }
6448
6449
6450 /// Returns a vector of strings containing the scaling method that best fits each
6451 /// of the input variables.
6452
calculate_default_scaling_methods() const6453 Tensor<string, 1> DataSet::calculate_default_scaling_methods() const
6454 {
6455 const Tensor<Index, 1> used_inputs_indices = get_input_variables_indices();
6456 const Index used_inputs_number = used_inputs_indices.size();
6457
6458 Index current_distribution;
6459 Tensor<string, 1> scaling_methods(used_inputs_number);
6460
6461 #pragma omp parallel for private(current_distribution)
6462
6463 for(Index i = 0; i < static_cast<Index>(used_inputs_number); i++)
6464 {
6465 current_distribution = perform_distribution_distance_analysis(data.chip(used_inputs_indices(i),1));
6466
6467 if(current_distribution == 0) // Normal distribution
6468 {
6469 scaling_methods(i) = "MeanStandardDeviation";
6470 }
6471 else if(current_distribution == 1) // Uniform distribution
6472 {
6473 scaling_methods(i) = "MinimumMaximum";
6474 }
6475 else // Default
6476 {
6477 scaling_methods(i) = "MinimumMaximum";
6478 }
6479 }
6480
6481 return scaling_methods;
6482 }
6483
6484
6485 /// Returns a vector of strings containing the scaling method that best fits each
6486 /// of the target variables.
6487
calculate_default_unscaling_methods() const6488 Tensor<string, 1> DataSet::calculate_default_unscaling_methods() const
6489 {
6490 const Tensor<Index, 1> used_targets_indices = get_target_variables_indices();
6491 const Index used_targets_number = used_targets_indices.size();
6492
6493 Index current_distribution;
6494 Tensor<string, 1> scaling_methods(used_targets_number);
6495
6496 #pragma omp parallel for private(current_distribution)
6497
6498 for(Index i = 0; i < static_cast<Index>(used_targets_number); i++)
6499 {
6500 current_distribution = perform_distribution_distance_analysis(data.chip(used_targets_indices(i),1));
6501
6502 if(current_distribution == 0) // Normal distribution
6503 {
6504 scaling_methods(i) = "MeanStandardDeviation";
6505 }
6506 else if(current_distribution == 1) // Uniform distribution
6507 {
6508 scaling_methods(i) = "MinimumMaximum";
6509 }
6510 else // Default
6511 {
6512 scaling_methods(i) = "MinimumMaximum";
6513 }
6514 }
6515
6516 return scaling_methods;
6517 }
6518
6519
6520 /// Scales the data matrix with given minimum and maximum values.
6521 /// It updates the data matrix.
6522 /// @param data_descriptives vector of descriptives structures for all the variables in the data set.
6523 /// The size of that vector must be equal to the number of variables.
6524 /// @todo
6525
scale_data_minimum_maximum(const Tensor<Descriptives,1> & data_descriptives)6526 void DataSet::scale_data_minimum_maximum(const Tensor<Descriptives, 1>& data_descriptives)
6527 {
6528 const Index variables_number = get_variables_number();
6529
6530 #ifdef __OPENNN_DEBUG__
6531
6532 ostringstream buffer;
6533
6534 const Index descriptives_size = data_descriptives.size();
6535
6536 if(descriptives_size != variables_number)
6537 {
6538 buffer << "OpenNN Exception: DataSet class.\n"
6539 << "void scale_data_minimum_maximum(const Tensor<Descriptives, 1>&) method.\n"
6540 << "Size of data descriptives must be equal to number of variables.\n";
6541
6542 throw logic_error(buffer.str());
6543 }
6544
6545 #endif
6546
6547 for(Index i = 0; i < variables_number; i++)
6548 {
6549 if(display
6550 && abs(data_descriptives(i).maximum - data_descriptives(i).minimum) < numeric_limits<type>::min())
6551 {
6552 cout << "OpenNN Warning: DataSet class.\n"
6553 << "void scale_data_minimum_maximum(const Tensor<Descriptives, 1>&) method.\n"
6554 << "Range of variable " << i << " is zero.\n"
6555 << "That variable won't be scaled.\n";
6556 }
6557 }
6558
6559 // scale_minimum_maximum(data, data_descriptives);
6560 }
6561
6562
6563 /// Scales the given input variables with given mean and standard deviation values.
6564 /// It updates the input variable of the data matrix.
6565 /// @param input_statistics vector of descriptives structures for the input variables.
6566 /// @param input_index Index of the input to be scaled.
6567
scale_input_mean_standard_deviation(const Descriptives & input_statistics,const Index & input_index)6568 void DataSet::scale_input_mean_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
6569 {
6570 const type slope = (input_statistics.standard_deviation -0) < static_cast<type>(1e-3) ?
6571 0 :
6572 static_cast<type>(1)/input_statistics.standard_deviation;
6573
6574 const type intercept = (input_statistics.standard_deviation -0) < static_cast<type>(1e-3) ?
6575 0 :
6576 -static_cast<type>(1)*input_statistics.mean/input_statistics.standard_deviation;
6577
6578 for(Index i = 0; i < data.dimension(0); i++)
6579 {
6580 data(i, input_index) = data(i, input_index)*slope + intercept;
6581 }
6582 }
6583
6584
6585 /// Scales the given input variables with the calculated mean and standard deviation values from the data matrix.
6586 /// It updates the input variables of the data matrix.
6587 /// It also returns a vector with the variables descriptives.
6588 /// @param input_index Index of the input to be scaled.
6589
scale_input_mean_standard_deviation(const Index & input_index)6590 Descriptives DataSet::scale_input_mean_standard_deviation(const Index& input_index)
6591 {
6592 #ifdef __OPENNN_DEBUG__
6593
6594 if(is_empty())
6595 {
6596 ostringstream buffer;
6597
6598 buffer << "OpenNN Exception: DataSet class.\n"
6599 << "Descriptives scale_input_mean_standard_deviation(const Index&) method.\n"
6600 << "Data file is not loaded.\n";
6601
6602 throw logic_error(buffer.str());
6603 }
6604
6605 #endif
6606
6607 const Descriptives input_statistics = calculate_input_descriptives(input_index);
6608
6609 scale_input_mean_standard_deviation(input_statistics, input_index);
6610
6611 return input_statistics;
6612 }
6613
6614
6615 /// Scales the given input variables with given standard deviation values.
6616 /// It updates the input variable of the data matrix.
6617 /// @param inputs_statistics vector of descriptives structures for the input variables.
6618 /// @param input_index Index of the input to be scaled.
6619
scale_input_standard_deviation(const Descriptives & input_statistics,const Index & input_index)6620 void DataSet::scale_input_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
6621 {
6622 for(Index i = 0; i < data.dimension(0); i++)
6623 {
6624 data(i, input_index) = static_cast<type>(2)*(data(i, input_index)) / input_statistics.standard_deviation;
6625 }
6626 }
6627
6628
6629 /// Scales the given input variables with the calculated standard deviation values from the data matrix.
6630 /// It updates the input variables of the data matrix.
6631 /// It also returns a vector with the variables descriptives.
6632 /// @param input_index Index of the input to be scaled.
6633
scale_input_standard_deviation(const Index & input_index)6634 Descriptives DataSet::scale_input_standard_deviation(const Index& input_index)
6635 {
6636 #ifdef __OPENNN_DEBUG__
6637
6638 if(is_empty())
6639 {
6640 ostringstream buffer;
6641
6642 buffer << "OpenNN Exception: DataSet class.\n"
6643 << "Descriptives scale_input_standard_deviation(const Index&) method.\n"
6644 << "Data file is not loaded.\n";
6645
6646 throw logic_error(buffer.str());
6647 }
6648
6649 #endif
6650
6651 const Descriptives input_statistics = calculate_input_descriptives(input_index);
6652
6653 scale_input_standard_deviation(input_statistics, input_index);
6654
6655 return input_statistics;
6656 }
6657
6658
6659 /// Scales the given input variable with given minimum and maximum values.
6660 /// It updates the input variables of the data matrix.
6661 /// @param input_statistics vector with the descriptives of the input variable.
6662 /// @param input_index Index of the input to be scaled.
6663
scale_input_minimum_maximum(const Descriptives & input_statistics,const Index & input_index)6664 void DataSet::scale_input_minimum_maximum(const Descriptives& input_statistics, const Index& input_index)
6665 {
6666 const type slope = std::abs(input_statistics.maximum-input_statistics.minimum) < static_cast<type>(1e-3) ?
6667 0 :
6668 (max_range-min_range)/(input_statistics.maximum-input_statistics.minimum);
6669
6670 const type intercept = std::abs(input_statistics.maximum-input_statistics.minimum) < static_cast<type>(1e-3) ?
6671 0 :
6672 (min_range*input_statistics.maximum-max_range*input_statistics.minimum)/(input_statistics.maximum-input_statistics.minimum);
6673
6674 for(Index i = 0; i < data.dimension(0); i++)
6675 {
6676 data(i, input_index) = data(i, input_index)*slope + intercept;
6677 }
6678 }
6679
6680
6681 /// Scales the given input variable with the calculated minimum and maximum values from the data matrix.
6682 /// It updates the input variable of the data matrix.
6683 /// It also returns a vector with the minimum and maximum values of the input variables.
6684
scale_input_minimum_maximum(const Index & input_index)6685 Descriptives DataSet::scale_input_minimum_maximum(const Index& input_index)
6686 {
6687 #ifdef __OPENNN_DEBUG__
6688
6689 if(is_empty())
6690 {
6691 ostringstream buffer;
6692
6693 buffer << "OpenNN Exception: DataSet class.\n"
6694 << "Descriptives scale_input_minimum_maximum(const Index&) method.\n"
6695 << "Data file is not loaded.\n";
6696
6697 throw logic_error(buffer.str());
6698 }
6699
6700 #endif
6701
6702 const Descriptives input_statistics = calculate_input_descriptives(input_index);
6703
6704 scale_input_minimum_maximum(input_statistics, input_index);
6705
6706 return input_statistics;
6707 }
6708
6709
scale_input_variables_minimum_maximum(const Tensor<Descriptives,1> & inputs_descriptives)6710 void DataSet::scale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>& inputs_descriptives)
6711 {
6712 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6713
6714 const Index input_variables_number = input_variables_indices.size();
6715
6716 for(Index i = 0; i < input_variables_number; i++)
6717 {
6718 scale_input_minimum_maximum(inputs_descriptives[i], input_variables_indices[i]);
6719 }
6720 }
6721
6722
scale_input_variables_minimum_maximum()6723 Tensor<Descriptives, 1> DataSet::scale_input_variables_minimum_maximum()
6724 {
6725 const Tensor<Descriptives, 1> inputs_descriptives = calculate_input_variables_descriptives();
6726
6727 scale_input_variables_minimum_maximum(inputs_descriptives);
6728
6729 return inputs_descriptives;
6730
6731 }
6732
6733
unscale_input_variables_minimum_maximum(const Tensor<Descriptives,1> & inputs_descriptives)6734 void DataSet::unscale_input_variables_minimum_maximum(const Tensor<Descriptives, 1>& inputs_descriptives)
6735 {
6736 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6737
6738 const Index input_variables_number = input_variables_indices.size();
6739
6740 for(Index i = 0; i < input_variables_number; i++)
6741 {
6742 unscale_input_variable_minimum_maximum(inputs_descriptives[i], input_variables_indices[i]);
6743 }
6744 }
6745
6746
6747 /// It scales every input variable with the given method.
6748 /// The method to be used is that in the scaling and unscaling method variable.
6749
scale_input_variables(const Tensor<string,1> & scaling_unscaling_methods)6750 Tensor<Descriptives, 1> DataSet::scale_input_variables(const Tensor<string, 1>& scaling_unscaling_methods)
6751 {
6752 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6753
6754 const Tensor<Descriptives, 1> inputs_descriptives = calculate_input_variables_descriptives();
6755
6756 for(Index i = 0; i < scaling_unscaling_methods.dimension(0); i++)
6757 {
6758 switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
6759 {
6760 case NoScaling:
6761 {
6762 // Do nothing
6763 }
6764 break;
6765
6766 case MinimumMaximum:
6767 {
6768 scale_input_minimum_maximum(inputs_descriptives(i), input_variables_indices(i));
6769 }
6770 break;
6771
6772 case MeanStandardDeviation:
6773 {
6774 scale_input_mean_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
6775 }
6776 break;
6777
6778 case StandardDeviation:
6779 {
6780 scale_input_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
6781 }
6782 break;
6783
6784 default:
6785 {
6786 ostringstream buffer;
6787
6788 buffer << "OpenNN Exception: DataSet class\n"
6789 << "void scale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6790 << "Unknown scaling and unscaling method: " << scaling_unscaling_methods(i) << "\n";
6791
6792 throw logic_error(buffer.str());
6793 }
6794 }
6795 }
6796
6797 return inputs_descriptives;
6798 }
6799
6800
6801 /// Scales the target variables with given mean and standard deviation values.
6802 /// It updates the target variables of the data matrix.
6803 /// @param targets_descriptives vector of descriptives structures for all the targets in the data set.
6804 /// The size of that vector must be equal to the number of target variables.
6805
scale_target_variables_mean_standard_deviation(const Tensor<Descriptives,1> & targets_descriptives)6806 void DataSet::scale_target_variables_mean_standard_deviation(const Tensor<Descriptives, 1>& targets_descriptives)
6807 {
6808 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6809 const Index target_variables_number = target_variables_indices.size();
6810
6811 Index variable_index;
6812
6813 for(Index i = 0; i < data.dimension(0); i++)
6814 {
6815 for(Index j = 0; j < target_variables_number; j++)
6816 {
6817 variable_index = target_variables_indices(j);
6818
6819 if(!::isnan(data(i,variable_index)))
6820 {
6821 data(i, variable_index) =
6822 (data(i, variable_index)-targets_descriptives(j).mean)/(targets_descriptives(j).standard_deviation);
6823 }
6824 }
6825 }
6826 }
6827
6828
6829 /// Scales the target variables with the calculated mean and standard deviation values from the data matrix.
6830 /// It updates the target variables of the data matrix.
6831 /// It also returns a vector of descriptives structures with the basic descriptives of all the variables.
6832
scale_target_variables_mean_standard_deviation()6833 Tensor<Descriptives, 1> DataSet::scale_target_variables_mean_standard_deviation()
6834 {
6835 #ifdef __OPENNN_DEBUG__
6836
6837 if(is_empty())
6838 {
6839 ostringstream buffer;
6840
6841 buffer << "OpenNN Exception: DataSet class.\n"
6842 << "Tensor<Descriptives, 1> scale_target_variables_mean_standard_deviation() method.\n"
6843 << "Data file is not loaded.\n";
6844
6845 throw logic_error(buffer.str());
6846 }
6847
6848 #endif
6849
6850 const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
6851
6852 scale_target_variables_mean_standard_deviation(targets_descriptives);
6853
6854 return targets_descriptives;
6855 }
6856
6857
6858 /// Scales the target variables with given minimum and maximum values.
6859 /// It updates the target variables of the data matrix.
6860 /// @param targets_descriptives vector of descriptives structures for all the targets in the data set.
6861 /// The size of that vector must be equal to the number of target variables.
6862
scale_target_variables_minimum_maximum(const Tensor<Descriptives,1> & targets_descriptives)6863 void DataSet::scale_target_variables_minimum_maximum(const Tensor<Descriptives, 1>& targets_descriptives)
6864 {
6865 #ifdef __OPENNN_DEBUG__
6866
6867 if(is_empty())
6868 {
6869 ostringstream buffer;
6870
6871 buffer << "OpenNN Exception: DataSet class.\n"
6872 << "Tensor<Descriptives, 1> scale_target_variables_minimum_maximum() method.\n"
6873 << "Data file is not loaded.\n";
6874
6875 throw logic_error(buffer.str());
6876 }
6877
6878 #endif
6879
6880 // const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6881 // const Index target_variables_number = target_variables_indices.size();
6882
6883 // Index variable_index;
6884
6885 // for(Index i = 0; i < data.dimension(0); i++)
6886 // {
6887 // for(Index j = 0; j < target_variables_number; j++)
6888 // {
6889 // variable_index = target_variables_indices(j);
6890
6891 // if(!::isnan(data(i,variable_index)))
6892 // {
6893 // data(i, variable_index) =
6894 // static_cast<type>(2.0)*(data(i, variable_index)-targets_descriptives(j).minimum)/(targets_descriptives(j).maximum-targets_descriptives(j).minimum)-static_cast<type>(1.0);
6895 // }
6896 // }
6897 // }
6898
6899 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6900 const Index target_variables_number = target_variables_indices.size();
6901
6902 for(Index i = 0; i < target_variables_number; i++)
6903 {
6904 scale_target_minimum_maximum(targets_descriptives[i], target_variables_indices[i]);
6905 }
6906 }
6907
6908
6909 /// Scales the target variables with the calculated minimum and maximum values from the data matrix.
6910 /// It updates the target variables of the data matrix.
6911 /// It also returns a vector of vectors with the descriptives of the input target variables.
6912
scale_target_variables_minimum_maximum()6913 Tensor<Descriptives, 1> DataSet::scale_target_variables_minimum_maximum()
6914 {
6915 const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
6916
6917 scale_target_variables_minimum_maximum(targets_descriptives);
6918
6919 return targets_descriptives;
6920 }
6921
6922
6923 /// Scales the target variables with the logarithmic scale using the given minimum and maximum values.
6924 /// It updates the target variables of the data matrix.
6925 /// @param targets_descriptives vector of descriptives structures for all the targets in the data set.
6926 /// The size of that vector must be equal to the number of target variables.
6927
scale_target_variables_logarithm(const Tensor<Descriptives,1> & targets_descriptives)6928 void DataSet::scale_target_variables_logarithm(const Tensor<Descriptives, 1>& targets_descriptives)
6929 {
6930 #ifdef __OPENNN_DEBUG__
6931
6932 if(is_empty())
6933 {
6934 ostringstream buffer;
6935
6936 buffer << "OpenNN Exception: DataSet class.\n"
6937 << "Tensor<Descriptives, 1> scale_target_variables_logarithm() method.\n"
6938 << "Data file is not loaded.\n";
6939
6940 throw logic_error(buffer.str());
6941 }
6942
6943 #endif
6944
6945 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6946 const Index target_variables_number = target_variables_indices.size();
6947
6948 Index variable_index;
6949
6950 for(Index i = 0; i < data.dimension(0); i++)
6951 {
6952 for(Index j = 0; j < target_variables_number; j++)
6953 {
6954 variable_index = target_variables_indices(j);
6955
6956 if(!::isnan(data(i,variable_index)))
6957 {
6958 data(i, variable_index) =
6959 static_cast<type>(0.5)*(exp(data(i, variable_index)))*(targets_descriptives(j).maximum-targets_descriptives(j).minimum)+ targets_descriptives(j).minimum;
6960 }
6961 }
6962 }
6963 }
6964
6965
6966 /// Scales the target variables with the logarithmic scale using the calculated minimum and maximum values
6967 /// from the data matrix.
6968 /// It updates the target variables of the data matrix.
6969 /// It also returns a vector of vectors with the descriptives of the input target variables.
6970
scale_target_variables_logarithm()6971 Tensor<Descriptives, 1> DataSet::scale_target_variables_logarithm()
6972 {
6973 const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
6974
6975 scale_target_variables_logarithm(targets_descriptives);
6976
6977 return targets_descriptives;
6978 }
6979
6980
6981 /// Calculates the input and target variables descriptives.
6982 /// Then it scales the target variables with those values.
6983 /// The method to be used is that in the scaling and unscaling method variable.
6984 /// Finally, it returns the descriptives.
6985
scale_target_variables(const string & scaling_unscaling_method)6986 Tensor<Descriptives, 1> DataSet::scale_target_variables(const string& scaling_unscaling_method)
6987 {
6988 switch(get_scaling_unscaling_method(scaling_unscaling_method))
6989 {
6990 case NoUnscaling:
6991 {
6992 return calculate_target_variables_descriptives();
6993 }
6994
6995 case MinimumMaximum:
6996 {
6997 return scale_target_variables_minimum_maximum();
6998 }
6999
7000 case Logarithmic:
7001 {
7002 return scale_target_variables_logarithm();
7003 }
7004
7005 case MeanStandardDeviation:
7006 {
7007 return scale_target_variables_mean_standard_deviation();
7008 }
7009
7010 default:
7011 {
7012 ostringstream buffer;
7013
7014 buffer << "OpenNN Exception: DataSet class\n"
7015 << "Tensor<Descriptives, 1> scale_target_variables(const string&) method.\n"
7016 << "Unknown scaling and unscaling method.\n";
7017
7018 throw logic_error(buffer.str());
7019 }
7020 }
7021 }
7022
7023
scale_target_minimum_maximum(const Descriptives & target_statistics,const Index & target_index)7024 void DataSet::scale_target_minimum_maximum(const Descriptives& target_statistics, const Index& target_index)
7025 {
7026 const type slope = std::abs(target_statistics.maximum-target_statistics.minimum) < static_cast<type>(1e-3) ?
7027 0 :
7028 (max_range-min_range)/(target_statistics.maximum-target_statistics.minimum);
7029
7030 const type intercept = std::abs(target_statistics.maximum-target_statistics.minimum) < static_cast<type>(1e-3) ?
7031 0 :
7032 (min_range*target_statistics.maximum-max_range*target_statistics.minimum)/(target_statistics.maximum-target_statistics.minimum);
7033
7034 for(Index i = 0; i < data.dimension(0); i++)
7035 {
7036 data(i, target_index) = data(i, target_index)*slope + intercept;
7037 }
7038 }
7039
7040
scale_target_mean_standard_deviation(const Descriptives & target_statistics,const Index & target_index)7041 void DataSet::scale_target_mean_standard_deviation(const Descriptives& target_statistics, const Index& target_index)
7042 {
7043 const type slope = std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3) ?
7044 0 :
7045 static_cast<type>(1)/target_statistics.standard_deviation;
7046
7047 const type intercept = std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3) ?
7048 0 :
7049 -target_statistics.mean/target_statistics.standard_deviation;
7050
7051 for(Index i = 0; i < data.dimension(0); i++)
7052 {
7053 data(i, target_index) = data(i, target_index)*slope + intercept;
7054 }
7055 }
7056
7057
scale_target_logarithmic(const Descriptives & target_statistics,const Index & target_index)7058 void DataSet::scale_target_logarithmic(const Descriptives& target_statistics, const Index& target_index)
7059 {
7060 for(Index i = 0; i < data.dimension(0); i++)
7061 {
7062 if(std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3))
7063 {
7064 data(i, target_index) = 0;
7065 }
7066 else
7067 {
7068 data(i, target_index) = static_cast<type>(0.5)*(exp(data(i,target_index)-1))*(target_statistics.maximum-target_statistics.minimum) + target_statistics.minimum;
7069 }
7070 }
7071 }
7072
7073
7074 /// It scales the input variables with that values.
7075 /// The method to be used is that in the scaling and unscaling method variable.
7076
scale_target_variables(const Tensor<string,1> & scaling_unscaling_methods)7077 Tensor<Descriptives, 1> DataSet::scale_target_variables(const Tensor<string, 1>& scaling_unscaling_methods)
7078 {
7079 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
7080 const Tensor<Descriptives, 1> targets_descriptives = calculate_target_variables_descriptives();
7081
7082 // Index column_index;
7083
7084 for (Index i = 0; i < scaling_unscaling_methods.size(); i++)
7085 {
7086 // column_index = get_column_index(target_variables_indices(i));
7087
7088 // if(columns(column_index).type == Binary || columns(column_index).type == Categorical) continue;
7089
7090 switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
7091 {
7092 case NoUnscaling:
7093 break;
7094
7095 case MinimumMaximum:
7096 scale_target_minimum_maximum(targets_descriptives(i), target_variables_indices(i));
7097 break;
7098
7099 case MeanStandardDeviation:
7100 scale_target_mean_standard_deviation(targets_descriptives(i), target_variables_indices(i));
7101 break;
7102
7103 case Logarithmic:
7104 scale_target_logarithmic(targets_descriptives(i), target_variables_indices(i));
7105 break;
7106
7107 default:
7108 {
7109 ostringstream buffer;
7110
7111 buffer << "OpenNN Exception: DataSet class\n"
7112 << "void scale_target_variables(const string&, const Tensor<Descriptives, 1>&) method.\n"
7113 << "Unknown scaling and unscaling method.\n";
7114
7115 throw logic_error(buffer.str());
7116 }
7117 }
7118 }
7119 return targets_descriptives;
7120 }
7121
7122
7123 /// Unscales the given input variable with given minimum and maximum values.
7124 /// It updates the input variables of the data matrix.
7125 /// @param input_statistics vector with the descriptives of the input variable.
7126 /// @param input_index Index of the input to be scaled.
7127
unscale_input_variable_minimum_maximum(const Descriptives & input_statistics,const Index & input_index)7128 void DataSet::unscale_input_variable_minimum_maximum(const Descriptives& input_statistics, const Index & input_index)
7129 {
7130 const type slope = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : (input_statistics.maximum-input_statistics.minimum)/(max_range-min_range);
7131
7132 const type intercept = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : -(min_range*input_statistics.maximum-max_range*input_statistics.minimum)/(max_range-min_range);
7133
7134 for(Index i = 0; i < data.dimension(0); i++)
7135 {
7136 data(i, input_index) = data(i, input_index)*slope + intercept;
7137 }
7138 }
7139
7140
7141 /// Uncales the given input variables with given mean and standard deviation values.
7142 /// It updates the input variable of the data matrix.
7143 /// @param input_statistics vector of descriptives structures for the input variables.
7144 /// @param input_index Index of the input to be scaled.
7145
unscale_input_mean_standard_deviation(const Descriptives & input_statistics,const Index & input_index)7146 void DataSet::unscale_input_mean_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
7147 {
7148 const type slope = std::abs(input_statistics.mean - 0) < static_cast<type>(1e-3) ? 0 : input_statistics.standard_deviation/static_cast<type>(2);
7149
7150 const type intercept = std::abs(input_statistics.mean-0) < static_cast<type>(1e-3) ? input_statistics.minimum : input_statistics.mean;
7151
7152 for(Index i = 0; i < data.dimension(0); i++)
7153 {
7154 data(i, input_index) = data(i, input_index)*slope + intercept;
7155 }
7156 }
7157
7158
7159 /// Unscales the given input variables with given standard deviation values.
7160 /// It updates the input variable of the data matrix.
7161 /// @param inputs_statistics vector of descriptives structures for the input variables.
7162 /// @param input_index Index of the input to be scaled.
7163
unscale_input_variable_standard_deviation(const Descriptives & input_statistics,const Index & input_index)7164 void DataSet::unscale_input_variable_standard_deviation(const Descriptives& input_statistics, const Index& input_index)
7165 {
7166 const type slope = std::abs(input_statistics.mean-0) < static_cast<type>(1e-3) ? 0 : input_statistics.standard_deviation/static_cast<type>(2);
7167
7168 const type intercept = std::abs(input_statistics.mean-0) < static_cast<type>(1e-3) ? input_statistics.minimum : 0;
7169
7170 for(Index i = 0; i < data.dimension(0); i++)
7171 {
7172 data(i, input_index) = data(i, input_index)*slope + intercept;
7173 }
7174 }
7175
7176
7177 /// It unscales every input variable with the given method.
7178 /// The method to be used is that in the scaling and unscaling method variable.
7179
unscale_input_variables(const Tensor<string,1> & scaling_unscaling_methods,const Tensor<Descriptives,1> & inputs_descriptives)7180 void DataSet::unscale_input_variables(const Tensor<string, 1>& scaling_unscaling_methods, const Tensor<Descriptives, 1>& inputs_descriptives)
7181 {
7182 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
7183
7184 for(Index i = 0; i < scaling_unscaling_methods.size(); i++)
7185 {
7186 switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
7187 {
7188 case NoScaling:
7189 {
7190 // Do nothing
7191 }
7192 break;
7193
7194 case MinimumMaximum:
7195 {
7196 unscale_input_variable_minimum_maximum(inputs_descriptives(i), input_variables_indices(i));
7197 }
7198 break;
7199
7200 case MeanStandardDeviation:
7201 {
7202 unscale_input_mean_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
7203 }
7204 break;
7205
7206 case StandardDeviation:
7207 {
7208 unscale_input_variable_standard_deviation(inputs_descriptives(i), input_variables_indices(i));
7209 }
7210 break;
7211
7212 default:
7213 {
7214 ostringstream buffer;
7215
7216 buffer << "OpenNN Exception: DataSet class\n"
7217 << "void unscale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
7218 << "Unknown unscaling and unscaling method: " << scaling_unscaling_methods(i) << "\n";
7219
7220 throw logic_error(buffer.str());
7221 }
7222 }
7223 }
7224 }
7225
7226
unscale_target_minimum_maximum(const Descriptives & target_statistics,const Index & target_index)7227 void DataSet::unscale_target_minimum_maximum(const Descriptives& target_statistics, const Index& target_index)
7228 {
7229 const type slope = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : (target_statistics.maximum-target_statistics.minimum)/(max_range-min_range);
7230
7231 const type intercept = std::abs(max_range-min_range) < static_cast<type>(1e-3) ? 0 : -(min_range*target_statistics.maximum-max_range*target_statistics.minimum)/(max_range-min_range);
7232
7233 for(Index i = 0; i < data.dimension(0); i++)
7234 {
7235 data(i, target_index) = data(i, target_index)*slope + intercept;
7236 }
7237 }
7238
7239
unscale_target_mean_standard_deviation(const Descriptives & target_statistics,const Index & target_index)7240 void DataSet::unscale_target_mean_standard_deviation(const Descriptives& target_statistics, const Index& target_index)
7241 {
7242 const type slope = std::abs(target_statistics.standard_deviation-0) < static_cast<type>(1e-3) ?
7243 0 :
7244 target_statistics.standard_deviation/static_cast<type>(2);
7245
7246 const type intercept = target_statistics.mean;
7247
7248 for(Index i = 0; i < data.dimension(0); i++)
7249 {
7250 data(i, target_index) = data(i, target_index)*slope + intercept;
7251 }
7252 }
7253
7254
unscale_target_logarithmic(const Descriptives & target_statistics,const Index & target_index)7255 void DataSet::unscale_target_logarithmic(const Descriptives& target_statistics, const Index& target_index)
7256 {
7257 for(Index i = 0; i < data.dimension(0); i++)
7258 {
7259 if(std::abs(target_statistics.maximum - target_statistics.minimum) < static_cast<type>(1e-3))
7260 {
7261 data(i, target_index) = target_statistics.minimum;
7262 }
7263 else
7264 {
7265 data(i, target_index) = log(static_cast<type>(2)*(data(i,target_index)-target_statistics.minimum)/(target_statistics.maximum-target_statistics.minimum));
7266 }
7267 }
7268 }
7269
7270
7271 /// It unscales the input variables with that values.
7272 /// The method to be used is that in the scaling and unscaling method variable.
7273
unscale_target_variables(const Tensor<string,1> & scaling_unscaling_methods,const Tensor<Descriptives,1> & targets_descriptives)7274 void DataSet::unscale_target_variables(const Tensor<string, 1>& scaling_unscaling_methods, const Tensor<Descriptives, 1>& targets_descriptives)
7275 {
7276 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
7277
7278 for (Index i = 0; i < scaling_unscaling_methods.size(); i++)
7279 {
7280 switch(get_scaling_unscaling_method(scaling_unscaling_methods(i)))
7281 {
7282 case NoUnscaling:
7283 break;
7284
7285 case MinimumMaximum:
7286 unscale_target_minimum_maximum(targets_descriptives(i), target_variables_indices(i));
7287 break;
7288
7289 case MeanStandardDeviation:
7290 unscale_target_mean_standard_deviation(targets_descriptives(i), target_variables_indices(i));
7291 break;
7292
7293 case Logarithmic:
7294 unscale_target_logarithmic(targets_descriptives(i), target_variables_indices(i));
7295 break;
7296
7297 default:
7298 {
7299 ostringstream buffer;
7300
7301 buffer << "OpenNN Exception: DataSet class\n"
7302 << "void unscale_targets(const string&, const Tensor<Descriptives, 1>&) method.\n"
7303 << "Unknown unscaling and unscaling method.\n";
7304
7305 throw logic_error(buffer.str());
7306 }
7307 }
7308 }
7309 }
7310
7311
7312
7313
/// Initializes the data matrix with a given value.
/// Every entry of the data tensor is overwritten; dimensions are unchanged.
/// @param new_value Initialization value.

void DataSet::initialize_data(const type& new_value)
{
    data.setConstant(new_value);
}
7321
7322
/// Initializes the data matrix with random values chosen from a uniform distribution.
/// No minimum/maximum parameters are taken: Eigen's setRandom() uses its default range.

void DataSet::set_data_random()
{
    data.setRandom();
}
7330
7331
7332 /// Initializes the data matrix with random values chosen from a uniform distribution
7333 /// with given minimum and maximum. The targets will be binary randoms.
7334
set_data_binary_random()7335 void DataSet::set_data_binary_random()
7336 {
7337 data.setRandom();
7338
7339 const Index samples_number = data.dimension(0);
7340 const Index variables_number = data.dimension(1);
7341
7342 const Index input_variables_number = get_input_variables_number();
7343
7344 for(Index i = 0; i < samples_number; i++)
7345 {
7346 for(Index j = input_variables_number; j < variables_number; j++)
7347 {
7348 data(i,j) = (1+static_cast<type>(pow((-1),rand())))/2;
7349 }
7350 }
7351 }
7352
7353
/// Sets max and min scaling range for minmaxscaling.
/// @param min Minimum value of the scaling range.
/// @param max Maximum value of the scaling range.

void DataSet::set_min_max_range(const type min, const type max)
{
    min_range = min;
    max_range = max;
}
7362
7363 /// Serializes the data set object into a XML document of the TinyXML library without keep the DOM tree in memory.
7364
write_XML(tinyxml2::XMLPrinter & file_stream) const7365 void DataSet::write_XML(tinyxml2::XMLPrinter& file_stream) const
7366 {
7367 ostringstream buffer;
7368
7369 file_stream.OpenElement("DataSet");
7370
7371 // Data file
7372
7373 file_stream.OpenElement("DataFile");
7374
7375 // File type ?
7376
7377 {
7378 file_stream.OpenElement("FileType");
7379
7380 file_stream.PushText("csv");
7381
7382 file_stream.CloseElement();
7383 }
7384
7385 // Data file name
7386 {
7387 file_stream.OpenElement("DataFileName");
7388
7389 file_stream.PushText(data_file_name.c_str());
7390
7391 file_stream.CloseElement();
7392 }
7393
7394 // Separator
7395 {
7396 file_stream.OpenElement("Separator");
7397
7398 file_stream.PushText(get_separator_string().c_str());
7399
7400 file_stream.CloseElement();
7401 }
7402
7403 // Columns names
7404 {
7405 file_stream.OpenElement("ColumnsNames");
7406
7407 buffer.str("");
7408 buffer << has_columns_names;
7409
7410 file_stream.PushText(buffer.str().c_str());
7411
7412 file_stream.CloseElement();
7413 }
7414
7415 // Rows labels
7416 {
7417 file_stream.OpenElement("RowsLabels");
7418
7419 buffer.str("");
7420 buffer << has_rows_labels;
7421
7422 file_stream.PushText(buffer.str().c_str());
7423
7424 file_stream.CloseElement();
7425 }
7426
7427 // Missing values label
7428 {
7429 file_stream.OpenElement("MissingValuesLabel");
7430
7431 file_stream.PushText(missing_values_label.c_str());
7432
7433 file_stream.CloseElement();
7434 }
7435
7436 // Lags number
7437 {
7438 file_stream.OpenElement("LagsNumber");
7439
7440 buffer.str("");
7441 buffer << get_lags_number();
7442
7443 file_stream.PushText(buffer.str().c_str());
7444
7445 file_stream.CloseElement();
7446 }
7447
7448 // Steps Ahead
7449 {
7450 file_stream.OpenElement("StepsAhead");
7451
7452 buffer.str("");
7453 buffer << get_steps_ahead();
7454
7455 file_stream.PushText(buffer.str().c_str());
7456
7457 file_stream.CloseElement();
7458 }
7459
7460 // Time Index
7461 {
7462 file_stream.OpenElement("TimeIndex");
7463
7464 buffer.str("");
7465 buffer << get_time_index();
7466
7467 file_stream.PushText(buffer.str().c_str());
7468
7469 file_stream.CloseElement();
7470 }
7471 // Close DataFile
7472
7473 file_stream.CloseElement();
7474
7475 // Columns
7476
7477 file_stream.OpenElement("Columns");
7478
7479 // Columns number
7480 {
7481 file_stream.OpenElement("ColumnsNumber");
7482
7483 buffer.str("");
7484 buffer << get_columns_number();
7485
7486 file_stream.PushText(buffer.str().c_str());
7487
7488 file_stream.CloseElement();
7489 }
7490
7491 // Columns items
7492
7493 {
7494 const Index columns_number = get_columns_number();
7495
7496 for(Index i = 0; i < columns_number; i++)
7497 {
7498 file_stream.OpenElement("Column");
7499
7500 file_stream.PushAttribute("Item", to_string(i+1).c_str());
7501
7502 columns(i).write_XML(file_stream);
7503
7504 file_stream.CloseElement();
7505 }
7506 }
7507
7508 // Close columns
7509
7510 file_stream.CloseElement();
7511
7512 // Rows labels
7513
7514 if(has_rows_labels)
7515 {
7516 const Index rows_labels_number = rows_labels.dimension(0);
7517
7518 file_stream.OpenElement("RowsLabels");
7519
7520 buffer.str("");
7521
7522 for(Index i = 0; i < rows_labels_number; i++)
7523 {
7524 buffer << rows_labels(i);
7525
7526 if(i != rows_labels_number-1) buffer << ",";
7527 }
7528
7529 file_stream.PushText(buffer.str().c_str());
7530
7531 file_stream.CloseElement();
7532 }
7533
7534 // Samples
7535
7536 file_stream.OpenElement("Samples");
7537
7538 // Samples number
7539 {
7540 file_stream.OpenElement("SamplesNumber");
7541
7542 buffer.str("");
7543 buffer << get_samples_number();
7544
7545 file_stream.PushText(buffer.str().c_str());
7546
7547 file_stream.CloseElement();
7548 }
7549
7550 // Samples uses
7551
7552 {
7553 file_stream.OpenElement("SamplesUses");
7554
7555 buffer.str("");
7556
7557 const Index samples_number = get_samples_number();
7558
7559 for(Index i = 0; i < samples_number; i++)
7560 {
7561 buffer << samples_uses(i);
7562
7563 if(i < (samples_number-1)) buffer << " ";
7564 }
7565
7566 file_stream.PushText(buffer.str().c_str());
7567
7568 file_stream.CloseElement();
7569 }
7570
7571 // Close samples
7572
7573 file_stream.CloseElement();
7574
7575 // Missing values
7576
7577 file_stream.OpenElement("MissingValues");
7578
7579 // Missing values method
7580
7581 {
7582 file_stream.OpenElement("MissingValuesMethod");
7583
7584 if(missing_values_method == Mean)
7585 {
7586 file_stream.PushText("Mean");
7587 }
7588 else if(missing_values_method == Median)
7589 {
7590 file_stream.PushText("Median");
7591 }
7592 else
7593 {
7594 file_stream.PushText("Unuse");
7595 }
7596
7597 file_stream.CloseElement();
7598 }
7599
7600 // Missing values number
7601
7602 {
7603 file_stream.OpenElement("MissingValuesNumber");
7604
7605 buffer.str("");
7606 buffer << missing_values_number;
7607
7608 file_stream.PushText(buffer.str().c_str());
7609
7610 file_stream.CloseElement();
7611 }
7612
7613 if(missing_values_number > 0)
7614 {
7615 // Columns missing values number
7616
7617 {
7618 file_stream.OpenElement("ColumnsMissingValuesNumber");
7619
7620 cout << "count nan columns" << endl;
7621 const Index columns_number = columns_missing_values_number.size();
7622
7623 buffer.str("");
7624
7625 for (Index i = 0; i < columns_number; i++)
7626 {
7627 buffer << columns_missing_values_number(i);
7628
7629 if(i != (columns_number-1)) buffer << " ";
7630 }
7631
7632 file_stream.PushText(buffer.str().c_str());
7633
7634 file_stream.CloseElement();
7635 }
7636
7637 // Rows missing values number
7638
7639 {
7640 file_stream.OpenElement("RowsMissingValuesNumber");
7641
7642 buffer.str("");
7643 buffer << rows_missing_values_number;
7644
7645 file_stream.PushText(buffer.str().c_str());
7646
7647 file_stream.CloseElement();
7648 }
7649 }
7650
7651 // Missing values
7652
7653 file_stream.CloseElement();
7654
7655 // Preview data
7656
7657 file_stream.OpenElement("PreviewData");
7658
7659 file_stream.OpenElement("PreviewSize");
7660
7661 buffer.str("");
7662 buffer << data_file_preview.size();
7663
7664 file_stream.PushText(buffer.str().c_str());
7665
7666 file_stream.CloseElement();
7667
7668 for(Index i = 0; i < data_file_preview.size(); i++)
7669 {
7670 file_stream.OpenElement("Row");
7671
7672 file_stream.PushAttribute("Item", to_string(i+1).c_str());
7673
7674 for(Index j = 0; j < data_file_preview(i).size(); j++)
7675 {
7676 file_stream.PushText(data_file_preview(i)(j).c_str());
7677
7678 if(j != data_file_preview(i).size()-1)
7679 {
7680 file_stream.PushText(",");
7681 }
7682 }
7683
7684 file_stream.CloseElement();
7685 }
7686
7687 // Close preview data
7688
7689 file_stream.CloseElement();
7690
7691 // Close data set
7692
7693 file_stream.CloseElement();
7694 }
7695
7696
from_XML(const tinyxml2::XMLDocument & data_set_document)7697 void DataSet::from_XML(const tinyxml2::XMLDocument& data_set_document)
7698 {
7699 ostringstream buffer;
7700
7701 // Data set element
7702
7703 const tinyxml2::XMLElement* data_set_element = data_set_document.FirstChildElement("DataSet");
7704
7705 if(!data_set_element)
7706 {
7707 buffer << "OpenNN Exception: DataSet class.\n"
7708 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7709 << "Data set element is nullptr.\n";
7710
7711 throw logic_error(buffer.str());
7712 }
7713
7714 // Data file
7715
7716 const tinyxml2::XMLElement* data_file_element = data_set_element->FirstChildElement("DataFile");
7717
7718 if(!data_file_element)
7719 {
7720 buffer << "OpenNN Exception: DataSet class.\n"
7721 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7722 << "Data file element is nullptr.\n";
7723
7724 throw logic_error(buffer.str());
7725 }
7726
7727 // Data file name
7728
7729 const tinyxml2::XMLElement* data_file_name_element = data_file_element->FirstChildElement("DataFileName");
7730
7731 if(!data_file_name_element)
7732 {
7733 buffer << "OpenNN Exception: DataSet class.\n"
7734 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7735 << "DataFileName element is nullptr.\n";
7736
7737 throw logic_error(buffer.str());
7738 }
7739
7740 if(data_file_name_element->GetText())
7741 {
7742 const string new_data_file_name = data_file_name_element->GetText();
7743
7744 set_data_file_name(new_data_file_name);
7745 }
7746
7747 // Separator
7748
7749 const tinyxml2::XMLElement* separator_element = data_file_element->FirstChildElement("Separator");
7750
7751 if(separator_element)
7752 {
7753 if(separator_element->GetText())
7754 {
7755 const string new_separator = separator_element->GetText();
7756
7757 set_separator(new_separator);
7758 }
7759 else
7760 {
7761 set_separator("Comma");
7762 }
7763 }
7764 else
7765 {
7766 set_separator("Comma");
7767 }
7768
7769 // Has columns names
7770
7771 const tinyxml2::XMLElement* columns_names_element = data_file_element->FirstChildElement("ColumnsNames");
7772
7773 if(columns_names_element)
7774 {
7775 const string new_columns_names_string = columns_names_element->GetText();
7776
7777 try
7778 {
7779 set_has_columns_names(new_columns_names_string == "1");
7780 }
7781 catch(const logic_error& e)
7782 {
7783 cerr << e.what() << endl;
7784 }
7785 }
7786
7787 // Rows labels
7788
7789 const tinyxml2::XMLElement* rows_label_element = data_file_element->FirstChildElement("RowsLabels");
7790
7791 if(rows_label_element)
7792 {
7793 const string new_rows_label_string = rows_label_element->GetText();
7794
7795 try
7796 {
7797 set_has_rows_label(new_rows_label_string == "1");
7798 }
7799 catch(const logic_error& e)
7800 {
7801 cerr << e.what() << endl;
7802 }
7803 }
7804
7805 // Missing values label
7806
7807 const tinyxml2::XMLElement* missing_values_label_element = data_file_element->FirstChildElement("MissingValuesLabel");
7808
7809 if(missing_values_label_element)
7810 {
7811 if(missing_values_label_element->GetText())
7812 {
7813 const string new_missing_values_label = missing_values_label_element->GetText();
7814
7815 set_missing_values_label(new_missing_values_label);
7816 }
7817 else
7818 {
7819 set_missing_values_label("NA");
7820 }
7821 }
7822 else
7823 {
7824 set_missing_values_label("NA");
7825 }
7826
7827 // Forecasting
7828
7829 // Lags number
7830
7831 const tinyxml2::XMLElement* lags_number_element = data_file_element->FirstChildElement("LagsNumber");
7832
7833 if(!lags_number_element)
7834 {
7835 buffer << "OpenNN Exception: DataSet class.\n"
7836 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7837 << "Lags number element is nullptr.\n";
7838
7839 throw logic_error(buffer.str());
7840 }
7841
7842 if(lags_number_element->GetText())
7843 {
7844 const Index new_lags_number = static_cast<Index>(atoi(lags_number_element->GetText()));
7845
7846 set_lags_number(new_lags_number);
7847 }
7848
7849 // Steps ahead
7850
7851 const tinyxml2::XMLElement* steps_ahead_element = data_file_element->FirstChildElement("StepsAhead");
7852
7853 if(!steps_ahead_element)
7854 {
7855 buffer << "OpenNN Exception: DataSet class.\n"
7856 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7857 << "Steps ahead element is nullptr.\n";
7858
7859 throw logic_error(buffer.str());
7860 }
7861
7862 if(steps_ahead_element->GetText())
7863 {
7864 const Index new_steps_ahead = static_cast<Index>(atoi(steps_ahead_element->GetText()));
7865
7866 set_steps_ahead_number(new_steps_ahead);
7867 }
7868
7869 // Time index
7870
7871 const tinyxml2::XMLElement* time_index_element = data_file_element->FirstChildElement("TimeIndex");
7872
7873 if(!time_index_element)
7874 {
7875 buffer << "OpenNN Exception: DataSet class.\n"
7876 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7877 << "Time index element is nullptr.\n";
7878
7879 throw logic_error(buffer.str());
7880 }
7881
7882 if(time_index_element->GetText())
7883 {
7884 const Index new_time_index = static_cast<Index>(atoi(time_index_element->GetText()));
7885
7886 set_time_index(new_time_index);
7887 }
7888
7889 // Columns
7890
7891 const tinyxml2::XMLElement* columns_element = data_set_element->FirstChildElement("Columns");
7892
7893 if(!columns_element)
7894 {
7895 buffer << "OpenNN Exception: DataSet class.\n"
7896 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7897 << "Columns element is nullptr.\n";
7898
7899 throw logic_error(buffer.str());
7900 }
7901
7902 // Columns number
7903
7904 const tinyxml2::XMLElement* columns_number_element = columns_element->FirstChildElement("ColumnsNumber");
7905
7906 if(!columns_number_element)
7907 {
7908 buffer << "OpenNN Exception: DataSet class.\n"
7909 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7910 << "Columns number element is nullptr.\n";
7911
7912 throw logic_error(buffer.str());
7913 }
7914
7915 Index new_columns_number = 0;
7916
7917 if(columns_number_element->GetText())
7918 {
7919 new_columns_number = static_cast<Index>(atoi(columns_number_element->GetText()));
7920
7921 set_columns_number(new_columns_number);
7922 }
7923
7924 // Columns
7925
7926 const tinyxml2::XMLElement* start_element = columns_number_element;
7927
7928 if(new_columns_number > 0)
7929 {
7930 for(Index i = 0; i < new_columns_number; i++)
7931 {
7932 const tinyxml2::XMLElement* column_element = start_element->NextSiblingElement("Column");
7933 start_element = column_element;
7934
7935 if(column_element->Attribute("Item") != std::to_string(i+1))
7936 {
7937 buffer << "OpenNN Exception: DataSet class.\n"
7938 << "void DataSet:from_XML(const tinyxml2::XMLDocument&) method.\n"
7939 << "Column item number (" << i+1 << ") does not match (" << column_element->Attribute("Item") << ").\n";
7940
7941 throw logic_error(buffer.str());
7942 }
7943
7944 // Name
7945
7946 const tinyxml2::XMLElement* name_element = column_element->FirstChildElement("Name");
7947
7948 if(!name_element)
7949 {
7950 buffer << "OpenNN Exception: DataSet class.\n"
7951 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7952 << "Name element is nullptr.\n";
7953
7954 throw logic_error(buffer.str());
7955 }
7956
7957 if(name_element->GetText())
7958 {
7959 const string new_name = name_element->GetText();
7960
7961 columns(i).name = new_name;
7962 }
7963
7964 // Column use
7965
7966 const tinyxml2::XMLElement* column_use_element = column_element->FirstChildElement("ColumnUse");
7967
7968 if(!column_use_element)
7969 {
7970 buffer << "OpenNN Exception: DataSet class.\n"
7971 << "void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7972 << "Column use element is nullptr.\n";
7973
7974 throw logic_error(buffer.str());
7975 }
7976
7977 if(column_use_element->GetText())
7978 {
7979 const string new_column_use = column_use_element->GetText();
7980
7981 columns(i).set_use(new_column_use);
7982 }
7983
7984 // Type
7985
7986 const tinyxml2::XMLElement* type_element = column_element->FirstChildElement("Type");
7987
7988 if(!type_element)
7989 {
7990 buffer << "OpenNN Exception: DataSet class.\n"
7991 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7992 << "Type element is nullptr.\n";
7993
7994 throw logic_error(buffer.str());
7995 }
7996
7997 if(type_element->GetText())
7998 {
7999 const string new_type = type_element->GetText();
8000 columns(i).set_type(new_type);
8001 }
8002
8003 if(columns(i).type == Categorical || columns(i).type == Binary)
8004 {
8005 // Categories
8006
8007 const tinyxml2::XMLElement* categories_element = column_element->FirstChildElement("Categories");
8008
8009 if(!categories_element)
8010 {
8011 buffer << "OpenNN Exception: DataSet class.\n"
8012 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
8013 << "Categories element is nullptr.\n";
8014
8015 throw logic_error(buffer.str());
8016 }
8017
8018 if(categories_element->GetText())
8019 {
8020 const string new_categories = categories_element->GetText();
8021
8022 columns(i).categories = get_tokens(new_categories, ';');
8023 }
8024
8025 // Categories uses
8026
8027 const tinyxml2::XMLElement* categories_uses_element = column_element->FirstChildElement("CategoriesUses");
8028
8029 if(!categories_uses_element)
8030 {
8031 buffer << "OpenNN Exception: DataSet class.\n"
8032 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
8033 << "Categories uses element is nullptr.\n";
8034
8035 throw logic_error(buffer.str());
8036 }
8037
8038 if(categories_uses_element->GetText())
8039 {
8040 const string new_categories_uses = categories_uses_element->GetText();
8041
8042 columns(i).set_categories_uses(get_tokens(new_categories_uses, ';'));
8043 }
8044 }
8045 }
8046 }
8047
8048 // Rows label
8049
8050 if(has_rows_labels)
8051 {
8052 // Rows labels begin tag
8053
8054 const tinyxml2::XMLElement* rows_labels_element = data_set_element->FirstChildElement("RowsLabels");
8055
8056 if(!rows_labels_element)
8057 {
8058 buffer << "OpenNN Exception: DataSet class.\n"
8059 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8060 << "Rows labels element is nullptr.\n";
8061
8062 throw logic_error(buffer.str());
8063 }
8064
8065 // Rows labels
8066
8067 if(rows_labels_element->GetText())
8068 {
8069 const string new_rows_labels = rows_labels_element->GetText();
8070
8071 rows_labels = get_tokens(new_rows_labels, ',');
8072 }
8073
8074
8075 }
8076
8077 // Samples
8078
8079 const tinyxml2::XMLElement* samples_element = data_set_element->FirstChildElement("Samples");
8080
8081 if(!samples_element)
8082 {
8083 buffer << "OpenNN Exception: DataSet class.\n"
8084 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8085 << "Samples element is nullptr.\n";
8086
8087 throw logic_error(buffer.str());
8088 }
8089
8090 // Samples number
8091
8092 const tinyxml2::XMLElement* samples_number_element = samples_element->FirstChildElement("SamplesNumber");
8093
8094 if(!samples_number_element)
8095 {
8096 buffer << "OpenNN Exception: DataSet class.\n"
8097 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8098 << "Samples number element is nullptr.\n";
8099
8100 throw logic_error(buffer.str());
8101 }
8102
8103 if(samples_number_element->GetText())
8104 {
8105 const Index new_samples_number = static_cast<Index>(atoi(samples_number_element->GetText()));
8106
8107 samples_uses.resize(new_samples_number);
8108 }
8109
8110 // Samples uses
8111
8112 const tinyxml2::XMLElement* samples_uses_element = samples_element->FirstChildElement("SamplesUses");
8113
8114 if(!samples_uses_element)
8115 {
8116 buffer << "OpenNN Exception: DataSet class.\n"
8117 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8118 << "Samples uses element is nullptr.\n";
8119
8120 throw logic_error(buffer.str());
8121 }
8122
8123 if(samples_uses_element->GetText())
8124 {
8125 set_samples_uses(get_tokens(samples_uses_element->GetText(), ' '));
8126 }
8127
8128 // Missing values
8129
8130 const tinyxml2::XMLElement* missing_values_element = data_set_element->FirstChildElement("MissingValues");
8131
8132 if(!missing_values_element)
8133 {
8134 buffer << "OpenNN Exception: DataSet class.\n"
8135 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8136 << "Missing values element is nullptr.\n";
8137
8138 throw logic_error(buffer.str());
8139 }
8140
8141 // Missing values method
8142
8143 const tinyxml2::XMLElement* missing_values_method_element = missing_values_element->FirstChildElement("MissingValuesMethod");
8144
8145 if(!missing_values_method_element)
8146 {
8147 buffer << "OpenNN Exception: DataSet class.\n"
8148 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8149 << "Missing values method element is nullptr.\n";
8150
8151 throw logic_error(buffer.str());
8152 }
8153
8154 if(missing_values_method_element->GetText())
8155 {
8156 set_missing_values_method(missing_values_method_element->GetText());
8157 }
8158
8159 // Missing values number
8160
8161 const tinyxml2::XMLElement* missing_values_number_element = missing_values_element->FirstChildElement("MissingValuesNumber");
8162
8163 if(!missing_values_number_element)
8164 {
8165 buffer << "OpenNN Exception: DataSet class.\n"
8166 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8167 << "Missing values number element is nullptr.\n";
8168
8169 throw logic_error(buffer.str());
8170 }
8171
8172 if(missing_values_number_element->GetText())
8173 {
8174 missing_values_number = static_cast<Index>(atoi(missing_values_number_element->GetText()));
8175 }
8176
8177 if(missing_values_number > 0)
8178 {
8179 // Columns Missing values number
8180
8181 const tinyxml2::XMLElement* columns_missing_values_number_element = missing_values_element->FirstChildElement("ColumnsMissingValuesNumber");
8182
8183 if(!columns_missing_values_number_element)
8184 {
8185 buffer << "OpenNN Exception: DataSet class.\n"
8186 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8187 << "Columns missing values number element is nullptr.\n";
8188
8189 throw logic_error(buffer.str());
8190 }
8191
8192 if(columns_missing_values_number_element->GetText())
8193 {
8194 Tensor<string, 1> new_columns_missing_values_number = get_tokens(columns_missing_values_number_element->GetText(), ' ');
8195
8196 columns_missing_values_number.resize(new_columns_missing_values_number.size());
8197
8198 for(Index i = 0; i < new_columns_missing_values_number.size(); i++)
8199 {
8200 columns_missing_values_number(i) = atoi(new_columns_missing_values_number(i).c_str());
8201 }
8202 }
8203
8204 // Rows missing values number
8205
8206 const tinyxml2::XMLElement* rows_missing_values_number_element = missing_values_element->FirstChildElement("RowsMissingValuesNumber");
8207
8208 if(!rows_missing_values_number_element)
8209 {
8210 buffer << "OpenNN Exception: DataSet class.\n"
8211 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8212 << "Rows missing values number element is nullptr.\n";
8213
8214 throw logic_error(buffer.str());
8215 }
8216
8217 if(rows_missing_values_number_element->GetText())
8218 {
8219 rows_missing_values_number = static_cast<Index>(atoi(rows_missing_values_number_element->GetText()));
8220 }
8221 }
8222
8223 // Preview data
8224
8225 const tinyxml2::XMLElement* preview_data_element = data_set_element->FirstChildElement("PreviewData");
8226
8227 if(!preview_data_element)
8228 {
8229 buffer << "OpenNN Exception: DataSet class.\n"
8230 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8231 << "Preview data element is nullptr.\n";
8232
8233 throw logic_error(buffer.str());
8234 }
8235
8236 // Preview size
8237
8238 const tinyxml2::XMLElement* preview_size_element = preview_data_element->FirstChildElement("PreviewSize");
8239
8240 if(!preview_size_element)
8241 {
8242 buffer << "OpenNN Exception: DataSet class.\n"
8243 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8244 << "Preview size element is nullptr.\n";
8245
8246 throw logic_error(buffer.str());
8247 }
8248
8249 Index new_preview_size = 0;
8250
8251 if(preview_size_element->GetText())
8252 {
8253 new_preview_size = static_cast<Index>(atoi(preview_size_element->GetText()));
8254
8255 if(new_preview_size > 0) data_file_preview.resize(new_preview_size);
8256 }
8257
8258 // Preview data
8259
8260 start_element = preview_size_element;
8261
8262 for(Index i = 0; i < new_preview_size; i++)
8263 {
8264 const tinyxml2::XMLElement* row_element = start_element->NextSiblingElement("Row");
8265 start_element = row_element;
8266
8267 if(row_element->Attribute("Item") != std::to_string(i+1))
8268 {
8269 buffer << "OpenNN Exception: DataSet class.\n"
8270 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
8271 << "Row item number (" << i+1 << ") does not match (" << row_element->Attribute("Item") << ").\n";
8272
8273 throw logic_error(buffer.str());
8274 }
8275
8276 if(row_element->GetText())
8277 {
8278 data_file_preview(i) = get_tokens(row_element->GetText(), ',');
8279 }
8280 }
8281
8282 // Display
8283
8284 const tinyxml2::XMLElement* display_element = data_set_element->FirstChildElement("Display");
8285
8286 if(display_element)
8287 {
8288 const string new_display_string = display_element->GetText();
8289
8290 try
8291 {
8292 set_display(new_display_string != "0");
8293 }
8294 catch(const logic_error& e)
8295 {
8296 cerr << e.what() << endl;
8297 }
8298 }
8299 }
8300
8301
8302 /// Prints to the screen in text format the main numbers from the data set object.
8303
print_summary() const8304 void DataSet::print_summary() const
8305 {
8306 if(display)
8307 {
8308 const Index variables_number = get_variables_number();
8309 const Index samples_number = get_samples_number();
8310
8311 cout << "Data set object summary:\n"
8312 << "Number of variables: " << variables_number << "\n"
8313 << "Number of samples: " << samples_number << "\n";
8314 }
8315 }
8316
8317
8318 /// Saves the members of a data set object to a XML-type file in an XML-type format.
8319 /// @param file_name Name of data set XML-type file.
8320 ///
8321 /// @todo
8322
save(const string & file_name) const8323 void DataSet::save(const string& file_name) const
8324 {
8325 FILE *pFile;
8326 // int err;
8327
8328 // err = fopen_s(&pFile, file_name.c_str(), "w");
8329 pFile = fopen(file_name.c_str(), "w");
8330
8331 tinyxml2::XMLPrinter document(pFile);
8332
8333 write_XML(document);
8334
8335 fclose(pFile);
8336 }
8337
8338
8339 /// Loads the members of a data set object from a XML-type file:
8340 /// <ul>
8341 /// <li> Samples number.
8342 /// <li> Training samples number.
8343 /// <li> Training samples indices.
8344 /// <li> Selection samples number.
8345 /// <li> Selection samples indices.
8346 /// <li> Testing samples number.
8347 /// <li> Testing samples indices.
8348 /// <li> Input variables number.
8349 /// <li> Input variables indices.
8350 /// <li> Target variables number.
8351 /// <li> Target variables indices.
8352 /// <li> Input variables name.
8353 /// <li> Target variables name.
8354 /// <li> Input variables description.
8355 /// <li> Target variables description.
8356 /// <li> Display.
8357 /// <li> Data.
8358 /// </ul>
8359 /// Please mind about the file format. This is specified in the User's Guide.
8360 /// @param file_name Name of data set XML-type file.
8361
load(const string & file_name)8362 void DataSet::load(const string& file_name)
8363 {
8364 tinyxml2::XMLDocument document;
8365
8366 if(document.LoadFile(file_name.c_str()))
8367 {
8368 ostringstream buffer;
8369
8370 buffer << "OpenNN Exception: DataSet class.\n"
8371 << "void load(const string&) method.\n"
8372 << "Cannot load XML file " << file_name << ".\n";
8373
8374 throw logic_error(buffer.str());
8375 }
8376
8377 from_XML(document);
8378 }
8379
8380
print_columns_types() const8381 void DataSet::print_columns_types() const
8382 {
8383 const Index columns_number = get_columns_number();
8384
8385 for(Index i = 0; i < columns_number; i++)
8386 {
8387 if(columns(i).type == Numeric) cout << "Numeric ";
8388 else if(columns(i).type == Binary) cout << "Binary ";
8389 else if(columns(i).type == Categorical) cout << "Categorical ";
8390 else if(columns(i).type == DateTime) cout << "DateTime ";
8391 else if(columns(i).type == Constant) cout << "Constant ";
8392
8393 }
8394
8395 cout << endl;
8396 }
8397
8398
/// Prints to the screen the values of the data matrix.
/// Output is suppressed when the display flag is false.

void DataSet::print_data() const
{
    if(display) cout << data << endl;
}
8405
8406
8407 /// Prints to the sceen a preview of the data matrix,
8408 /// i.e., the first, second and last samples
8409
print_data_preview() const8410 void DataSet::print_data_preview() const
8411 {
8412 if(!display) return;
8413
8414 const Index samples_number = get_samples_number();
8415
8416 if(samples_number > 0)
8417 {
8418 const Tensor<type, 1> first_sample = data.chip(0, 0);
8419
8420 cout << "First sample: \n";
8421
8422 for(int i = 0; i< first_sample.dimension(0); i++)
8423 {
8424 cout << first_sample(i) << " ";
8425 }
8426
8427 cout << endl;
8428 }
8429
8430 if(samples_number > 1)
8431 {
8432 const Tensor<type, 1> second_sample = data.chip(1, 0);
8433
8434 cout << "Second sample: \n";
8435
8436 for(int i = 0; i< second_sample.dimension(0); i++)
8437 {
8438
8439 cout << second_sample(i) << " ";
8440 }
8441
8442 cout << endl;
8443 }
8444
8445
8446 if(samples_number > 2)
8447 {
8448 const Tensor<type, 1> last_sample = data.chip(samples_number-1, 0);
8449
8450 cout << "Last sample: \n";
8451
8452 for(int i = 0; i< last_sample.dimension(0); i++)
8453 {
8454
8455 cout << last_sample(i) << " ";
8456 }
8457
8458 cout << endl;
8459 }
8460 }
8461
8462
8463 /// Saves to the data file the values of the data matrix.
8464
save_data() const8465 void DataSet::save_data() const
8466 {
8467 ofstream file(data_file_name.c_str());
8468
8469 if(!file.is_open())
8470 {
8471 ostringstream buffer;
8472
8473 buffer << "OpenNN Exception: Matrix template." << endl
8474 << "void save_csv(const string&, const char&, const Vector<string>&, const Vector<string>&) method." << endl
8475 << "Cannot open matrix data file: " << data_file_name << endl;
8476
8477 throw logic_error(buffer.str());
8478 }
8479
8480 file.precision(20);
8481
8482 const Index samples_number = get_samples_number();
8483 const Index variables_number = get_variables_number();
8484
8485 const Tensor<string, 1> variables_names = get_variables_names();
8486
8487 char separator_char = ',';//get_separator_char();
8488
8489 if(this->has_rows_labels)
8490 {
8491 file << "id" << separator_char;
8492 }
8493 for(Index j = 0; j < variables_number; j++)
8494 {
8495 file << variables_names[j];
8496
8497 if(j != variables_number-1)
8498 {
8499 file << separator_char;
8500 }
8501 }
8502
8503 file << endl;
8504
8505 for(Index i = 0; i < samples_number; i++)
8506 {
8507 if(this->has_rows_labels)
8508 {
8509 file << rows_labels(i) << separator_char;
8510 }
8511 for(Index j = 0; j < variables_number; j++)
8512 {
8513 file << data(i,j);
8514
8515 if(j != variables_number-1)
8516 {
8517 file << separator_char;
8518 }
8519 }
8520
8521 file << endl;
8522 }
8523
8524 file.close();
8525 }
8526
8527
8528 /// Saves to the data file the values of the data matrix in binary format.
8529
save_data_binary(const string & binary_data_file_name) const8530 void DataSet::save_data_binary(const string& binary_data_file_name) const
8531 {
8532 ofstream file(binary_data_file_name.c_str(), ios::binary);
8533
8534 if(!file.is_open())
8535 {
8536 ostringstream buffer;
8537
8538 buffer << "OpenNN Exception: DataSet template." << endl
8539 << "void save_data_binary(const string) method." << endl
8540 << "Cannot open data binary file." << endl;
8541
8542 throw logic_error(buffer.str());
8543 }
8544
8545 // Write data
8546
8547 streamsize size = sizeof(Index);
8548
8549 Index columns_number = data.dimension(1);
8550 Index rows_number = data.dimension(0);
8551
8552 cout << "Rows number: " << rows_number << endl;
8553 cout << "Columns number: " << columns_number << endl;
8554
8555 cout << "Saving binary data file..." << endl;
8556
8557 file.write(reinterpret_cast<char*>(&columns_number), size);
8558 file.write(reinterpret_cast<char*>(&rows_number), size);
8559
8560 size = sizeof(type);
8561
8562 type value;
8563
8564 for(int i = 0; i < columns_number; i++)
8565 {
8566 for(int j = 0; j < rows_number; j++)
8567 {
8568 value = data(j,i);
8569
8570 file.write(reinterpret_cast<char*>(&value), size);
8571 }
8572 }
8573
8574 file.close();
8575
8576
8577 /*
8578 file.write(reinterpret_cast<char*>(&columns_number), size);
8579 file.write(reinterpret_cast<char*>(&rows_number), size);
8580
8581 size = sizeof(type);
8582
8583 type value;
8584
8585 for(int i = 0; i < columns_number*rows_number; i++)
8586 {
8587 // for(int j = 0; j < rows_number; j++)
8588 // {
8589 value = data(i);
8590
8591 file.write(reinterpret_cast<char*>(&value), size);
8592 // }
8593 }
8594
8595 file.close();
8596 */
8597
8598
8599 cout << "Binary data file saved." << endl;
8600 }
8601
8602
8603 /// Arranges an input-target DataSet from a time series matrix, according to the number of lags.
8604
transform_time_series()8605 void DataSet::transform_time_series()
8606 {
8607 if(lags_number == 0) return;
8608
8609 const Index variables_number = get_variables_number();
8610 const Index samples_number = get_samples_number();
8611
8612 time_series_data = data;
8613
8614 time_series_columns = columns;
8615
8616 transform_time_series_columns();
8617
8618 const Index time_series_samples_number = get_samples_number()-(lags_number-1+steps_ahead);
8619 const Index time_series_variables_number = get_columns_number();
8620
8621 data.resize(time_series_samples_number, time_series_variables_number);
8622
8623 Tensor<type, 2> new_data(time_series_samples_number, time_series_variables_number);
8624 Tensor<type, 1> variable_data;
8625
8626 Index new_data_variable = 0;
8627
8628 Index time_series_variable= 0;
8629
8630
8631 // lags
8632
8633 for(Index lag = lags_number; lag > 0; lag--)
8634 {
8635
8636 for(Index variable = 0; variable < variables_number; variable++)
8637 {
8638
8639 variable_data = time_series_data.chip(variable, 1);
8640
8641 for(Index j = 0; j <= time_series_samples_number; j++)
8642 {
8643
8644 new_data(j, time_series_variable) = variable_data(j+lags_number-lag);
8645 }
8646 time_series_variable++;
8647 }
8648 }
8649
8650 // steps ahead
8651 for(Index ahead = 1; ahead <= steps_ahead; ahead++)
8652 {
8653 for(Index variable = 0; variable < variables_number; variable++)
8654 {
8655 variable_data = time_series_data.chip(variable, 1);
8656
8657 for(Index j = 0; j < time_series_samples_number; j++)
8658 {
8659 new_data(j, time_series_variable) = variable_data(j+ahead+lags_number-1);
8660 }
8661
8662 time_series_variable++;
8663 }
8664 }
8665
8666 set_data(new_data);
8667 }
8668
8669
8670 /// Arranges the data set for association.
8671 /// @todo Low priority. Variables and samples.
8672
void DataSet::transform_association()
{
    // Not implemented: the former implementation delegated to the free function below.
    // OpenNN::transform_association(data);
}
8677
8678
8679 /// @todo
8680
void DataSet::fill_time_series(const Index& period )
{
    // Number of rows a complete series would have if it were sampled exactly every
    // 'period' time units between the first and last timestamps (column 0 holds timestamps).
    Index rows = static_cast<Index>((data(data.dimension(0)- 1, 0)- data(0,0)) / period) + 1 ;

    Tensor<type, 2> new_data(rows, data.dimension(1));

    // Rows with no matching sample stay NaN except for the reconstructed timestamp.
    new_data.setConstant(static_cast<type>(NAN));

    // j walks the original (possibly gapped/duplicated) rows.
    Index j = 1;

    // new_data.set_row(0, data.chip(0, 0));

    cout.precision(20);

    for (Index i = 1; i < rows ; i++)
    {
        // Skip a duplicated timestamp in the source data.
        // NOTE(review): j is never bounds-checked against data.dimension(0) — confirm
        // callers guarantee the source covers the whole [first, last] timestamp range.
        if(static_cast<Index>(data(j, 0)) == static_cast<Index>(data(j - 1, 0)))
        {

            j = j + 1;
        }
        // Source row matches the expected timestamp for period i: consume it.
        if(static_cast<Index>(data(j, 0)) == static_cast<Index>(data(0,0) + i * period))
        {
            // new_data.set_row(i, data.chip(j, 0));

            j = j + 1;
        }
        else
        {
            // Missing period: store only the expected timestamp, values remain NaN.
            new_data(i,0) = data(0,0) + i * period;
        }
    }

    time_series_data = new_data;

    data = new_data;
}
8718
8719
8720 /// This method loads the data from a binary data file.
8721
load_data_binary()8722 void DataSet::load_data_binary()
8723 {
8724 ifstream file;
8725
8726 file.open(data_file_name.c_str(), ios::binary);
8727
8728 if(!file.is_open())
8729 {
8730 ostringstream buffer;
8731
8732 buffer << "OpenNN Exception: DataSet template.\n"
8733 << "void load_binary(const string&) method.\n"
8734 << "Cannot open binary file: " << data_file_name << "\n";
8735
8736 throw logic_error(buffer.str());
8737 }
8738
8739 streamsize size = sizeof(Index);
8740
8741 Index columns_number;
8742 Index rows_number;
8743
8744 file.read(reinterpret_cast<char*>(&columns_number), size);
8745 file.read(reinterpret_cast<char*>(&rows_number), size);
8746
8747 size = sizeof(type);
8748
8749 type value;
8750
8751 data.resize(rows_number, columns_number);
8752
8753 // Index row_index = 0;
8754 // Index column_index = 0;
8755
8756 for(Index i = 0; i < rows_number*columns_number; i++)
8757 {
8758 file.read(reinterpret_cast<char*>(&value), size);
8759
8760 data(i) = value;
8761 /*
8762 data(row_index, column_index) = value;
8763
8764 row_index++;
8765
8766 if((i+1)%rows_number == 0)
8767 {
8768 row_index = 0;
8769 column_index++;
8770 }
8771 */
8772 }
8773
8774 file.close();
8775 }
8776
8777
/// This method loads data from a binary data file for time series prediction methods.
/// @todo
8780
void DataSet::load_time_series_data_binary()
{
    // Not implemented: the former Matrix-based call is kept for reference.
    // time_series_data.load_binary(data_file_name);
}
8785
8786
8787 /// This method checks if the input data file has the correct format. Returns an error message.
8788
check_input_csv(const string & input_data_file_name,const char & separator_char) const8789 void DataSet::check_input_csv(const string & input_data_file_name, const char & separator_char) const
8790 {
8791 ifstream file(input_data_file_name.c_str());
8792
8793 if(!file.is_open())
8794 {
8795 ostringstream buffer;
8796
8797 buffer << "OpenNN Exception: DataSet class.\n"
8798 << "void check_input_csv() method.\n"
8799 << "Cannot open input data file: " << input_data_file_name << "\n";
8800
8801 throw logic_error(buffer.str());
8802 }
8803
8804 string line;
8805 Index line_number = 0;
8806 Index total_lines = 0;
8807
8808 Index tokens_count;
8809
8810 const Index columns_number = get_columns_number() - get_target_columns_number();
8811
8812 while(file.good())
8813 {
8814 line_number++;
8815
8816 getline(file, line);
8817
8818 trim(line);
8819
8820 erase(line, '"');
8821
8822 if(line.empty()) continue;
8823
8824 total_lines++;
8825
8826 tokens_count = count_tokens(line, separator_char);
8827
8828 if(tokens_count != columns_number)
8829 {
8830 ostringstream buffer;
8831
8832 buffer << "OpenNN Exception: DataSet class.\n"
8833 << "void check_input_csv() method.\n"
8834 << "Line " << line_number << ": Size of tokens in input file ("
8835 << tokens_count << ") is not equal to number of columns("
8836 << columns_number << "). \n"
8837 << "Input csv must contain values for all the variables except the target. \n";
8838
8839 throw logic_error(buffer.str());
8840 }
8841 }
8842
8843 file.close();
8844
8845 if(total_lines == 0)
8846 {
8847 ostringstream buffer;
8848
8849 buffer << "OpenNN Exception: DataSet class.\n"
8850 << "void check_input_csv() method.\n"
8851 << "Input data file is empty. \n";
8852
8853 throw logic_error(buffer.str());
8854 }
8855 }
8856
8857
8858 /// This method loads data from a file and returns a matrix containing the input columns.
8859
Tensor<type, 2> DataSet::read_input_csv(const string& input_data_file_name,
                                        const char& separator_char,
                                        const string& missing_values_label,
                                        const bool& has_columns_name,
                                        const bool& has_rows_label) const
{
    ifstream file(input_data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_input_csv() method.\n"
               << "Cannot open input data file: " << input_data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    // First pass: count the data lines and validate the token count of each one.

    Index input_samples_count = 0;

    string line;
    Index line_number = 0;

    Index tokens_count;

    // Expected tokens per line: all columns except the target ones.
    const Index columns_number = get_columns_number() - get_target_columns_number();

    while(file.good())
    {
        line_number++;

        getline(file, line);

        trim(line);

        erase(line, '"');

        if(line.empty()) continue;

        tokens_count = count_tokens(line, separator_char);

        if(tokens_count != columns_number)
        {
            ostringstream buffer;

            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void read_input_csv() method.\n"
                   << "Line " << line_number << ": Size of tokens("
                   << tokens_count << ") is not equal to number of columns("
                   << columns_number << ").\n";

            throw logic_error(buffer.str());
        }

        input_samples_count++;
    }

    file.close();

    Index variables_number = get_input_variables_number();

    // The header line, if present, is not a sample.
    if(has_columns_name) input_samples_count--;

    Tensor<type, 2> input_data(input_samples_count, variables_number);

    // Second pass: fill input data

    file.open(input_data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_input_csv() method.\n"
               << "Cannot open input data file: " << input_data_file_name << " for filling input data file. \n";

        throw logic_error(buffer.str());
    }

    // Skip the header: read up to the first non-empty line.

    if(has_columns_name)
    {
        while(file.good())
        {
            getline(file, line);

            if(line.empty()) continue;

            break;
        }
    }

    // Read the remaining lines, mapping file tokens to input variables.

    Tensor<string, 1> tokens;

    line_number = 0;          // reused here as the destination row index
    Index variable_index = 0; // column index into input_data
    Index token_index = 0;    // token index into the current line
    bool is_ID = has_rows_label;

    const bool is_float = is_same<type, float>::value;
    bool has_missing_values = false;

    while(file.good())
    {
        getline(file, line);

        trim(line);

        erase(line, '"');

        if(line.empty()) continue;

        tokens = get_tokens(line, separator_char);

        variable_index = 0;
        token_index = 0;
        is_ID = has_rows_label;

        for(Index i = 0; i < columns.size(); i++)
        {
            // The row-label column, if any, is skipped once per line.
            if(is_ID)
            {
                is_ID = false;
                continue;
            }

            // Unused columns consume a token but produce no variable.
            if(columns(i).column_use == UnusedVariable)
            {
                token_index++;
                continue;
            }
            else if(columns(i).column_use != Input)
            {
                // Target columns are absent from the input file: no token to consume.
                continue;
            }

            if(columns(i).type == Numeric)
            {
                if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else if(is_float)
                {
                    input_data(line_number, variable_index) = strtof(tokens(token_index).data(), NULL);
                }
                else
                {
                    input_data(line_number, variable_index) = stof(tokens(token_index));
                }

                variable_index++;
            }
            else if(columns(i).type == Binary)
            {
                if(tokens(token_index) == missing_values_label)
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else if(columns(i).categories.size() > 0 && tokens(token_index) == columns(i).categories(0))
                {
                    input_data(line_number, variable_index) = 1.0;
                }
                else if(tokens(token_index) == columns(i).name)
                {
                    input_data(line_number, variable_index) = 1.0;
                }
                // NOTE(review): any other token leaves the cell at its previous
                // (uninitialized) value rather than 0 — confirm intended.

                variable_index++;
            }
            else if(columns(i).type == Categorical)
            {
                // One-hot: one input variable per category.
                for(Index k = 0; k < columns(i).get_categories_number(); k++)
                {
                    if(tokens(token_index) == missing_values_label)
                    {
                        has_missing_values = true;
                        input_data(line_number, variable_index) = static_cast<type>(NAN);
                    }
                    else if(tokens(token_index) == columns(i).categories(k))
                    {
                        input_data(line_number, variable_index) = 1.0;
                    }

                    variable_index++;
                }
            }
            else if(columns(i).type == DateTime)
            {
                if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else
                {
                    input_data(line_number, variable_index) = static_cast<type>(date_to_timestamp(tokens(token_index), gmt));
                }

                variable_index++;
            }
            else if(columns(i).type == Constant)
            {
                if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
                {
                    has_missing_values = true;
                    input_data(line_number, variable_index) = static_cast<type>(NAN);
                }
                else if(is_float)
                {
                    input_data(line_number, variable_index) = strtof(tokens(token_index).data(), NULL);
                }
                else
                {
                    input_data(line_number, variable_index) = stof(tokens(token_index));
                }

                variable_index++;
            }

            token_index++;
        }

        line_number++;
    }

    file.close();

    if(!has_missing_values)
    {
        return input_data;
    }
    else
    {
        // Scrub missing values

        const MissingValuesMethod missing_values_method = get_missing_values_method();

        // NOTE(review): Unuse falls through to mean imputation here — confirm intended.
        if(missing_values_method == MissingValuesMethod::Unuse || missing_values_method == MissingValuesMethod::Mean)
        {
            const Tensor<type, 1> means = mean(input_data);

            const Index samples_number = input_data.dimension(0);
            const Index variables_number = input_data.dimension(1);

#pragma omp parallel for schedule(dynamic)

            for(Index j = 0; j < variables_number; j++)
            {
                for(Index i = 0; i < samples_number; i++)
                {
                    if(::isnan(input_data(i, j)))
                    {
                        input_data(i,j) = means(j);
                    }
                }
            }
        }
        else
        {
            // Median imputation for the remaining methods.
            const Tensor<type, 1> medians = median(input_data);

            const Index samples_number = input_data.dimension(0);
            const Index variables_number = input_data.dimension(1);

#pragma omp parallel for schedule(dynamic)

            for(Index j = 0; j < variables_number; j++)
            {
                for(Index i = 0; i < samples_number; i++)
                {
                    if(::isnan(input_data(i, j)))
                    {
                        input_data(i,j) = medians(j);
                    }
                }
            }
        }

        return input_data;
    }
}
9151
9152
9153 /// Returns a vector containing the number of samples of each class in the data set.
9154 /// If the number of target variables is one then the number of classes is two.
9155 /// If the number of target variables is greater than one then the number of classes is equal to the number
9156 /// of target variables.
9157 /// @todo Low priority. Return class_distribution is wrong
9158
calculate_target_distribution() const9159 Tensor<Index, 1> DataSet::calculate_target_distribution() const
9160 {
9161 const Index samples_number = get_samples_number();
9162 const Index targets_number = get_target_variables_number();
9163 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
9164
9165 Tensor<Index, 1> class_distribution;
9166
9167 if(targets_number == 1) // Two classes
9168 {
9169 class_distribution = Tensor<Index, 1>(2);
9170
9171 Index target_index = target_variables_indices(0);
9172
9173 Index positives = 0;
9174 Index negatives = 0;
9175
9176 for(Index sample_index = 0; sample_index < static_cast<Index>(samples_number); sample_index++)
9177 {
9178 if(!::isnan(data(static_cast<Index>(sample_index),target_index)))
9179 {
9180 if(data(static_cast<Index>(sample_index),target_index) < static_cast<type>(0.5))
9181 {
9182 negatives++;
9183 }
9184 else
9185 {
9186 positives++;
9187 }
9188 }
9189 }
9190
9191 class_distribution(0) = negatives;
9192 class_distribution(1) = positives;
9193 }
9194 else // More than two classes
9195 {
9196 class_distribution = Tensor<Index, 1>(targets_number);
9197
9198 for(Index i = 0; i < samples_number; i++)
9199 {
9200 if(get_sample_use(i) != UnusedSample)
9201 {
9202 for(Index j = 0; j < targets_number; j++)
9203 {
9204 if(data(i,target_variables_indices(j)) == static_cast<type>(NAN)) continue;
9205
9206 if(data(i,target_variables_indices(j)) > 0.5) class_distribution(j)++;
9207 }
9208 }
9209 }
9210 }
9211
9212 return class_distribution;
9213 }
9214
9215
9216 /// Calculate the outliers from the data set using the Tukey's test.
9217 /// @param cleaning_parameter Parameter used to detect outliers.
9218 /// @todo Low priority.
9219
calculate_Tukey_outliers(const type & cleaning_parameter) const9220 Tensor<Tensor<Index, 1>, 1> DataSet::calculate_Tukey_outliers(const type& cleaning_parameter) const
9221 {
9222 const Index samples_number = get_used_samples_number();
9223 const Tensor<Index, 1> samples_indices = get_used_samples_indices();
9224
9225 const Index columns_number = get_columns_number();
9226 const Index used_columns_number = get_used_columns_number();
9227 const Tensor<Index, 1> used_columns_indices = get_used_columns_indices();
9228
9229 Tensor<Tensor<Index, 1>, 1> return_values(2);
9230
9231 return_values(0) = Tensor<Index, 1>(samples_number);
9232 return_values(1) = Tensor<Index, 1>(used_columns_number);
9233
9234 return_values(0).setZero();
9235 return_values(1).setZero();
9236
9237 Tensor<BoxPlot, 1> box_plots = calculate_columns_box_plots();
9238
9239 Index used_column_index = 0;
9240 Index variable_index = 0;
9241
9242 #pragma omp parallel for
9243
9244 for(Index i = 0; i < columns_number; i++)
9245 {
9246 if(columns(i).column_use == UnusedVariable && columns(i).type == Categorical)
9247 {
9248 variable_index += columns(i).get_categories_number();
9249 continue;
9250 }
9251 else if(columns(i).column_use == UnusedVariable) // Numeric, Binary or DateTime
9252 {
9253 variable_index++;
9254 continue;
9255 }
9256
9257 if(columns(i).type == Categorical || columns(i).type == Binary || columns(i).type == DateTime)
9258 {
9259 used_column_index++;
9260 columns(i).get_categories_number() == 0 ? variable_index++ : variable_index += columns(i).get_categories_number();
9261 continue;
9262 }
9263 else // Numeric
9264 {
9265 const type interquartile_range = box_plots(used_column_index).third_quartile - box_plots(used_column_index).first_quartile;
9266
9267 if(interquartile_range < numeric_limits<type>::epsilon())
9268 {
9269 used_column_index++;
9270 variable_index++;
9271 continue;
9272 }
9273
9274 Index columns_outliers = 0;
9275
9276 for(Index j = 0; j < samples_number; j++)
9277 {
9278 const Tensor<type, 1> sample = get_sample_data(samples_indices(static_cast<Index>(j)));
9279
9280 if(sample(variable_index) <(box_plots(used_column_index).first_quartile - cleaning_parameter*interquartile_range) ||
9281 sample(variable_index) >(box_plots(used_column_index).third_quartile + cleaning_parameter*interquartile_range))
9282 {
9283 return_values(0)(static_cast<Index>(j)) = 1;
9284
9285 columns_outliers++;
9286 }
9287 }
9288
9289 return_values(1)(used_column_index) = columns_outliers;
9290
9291 used_column_index++;
9292 variable_index++;
9293 }
9294 }
9295
9296 return return_values;
9297 }
9298
9299
9300 /// Calculate the outliers from the data set using the Tukey's test and sets in samples object.
9301 /// @param cleaning_parameter Parameter used to detect outliers
9302 /// @todo
9303
void DataSet::unuse_Tukey_outliers(const type& cleaning_parameter)
{
    // Element (0) flags each used sample with 1 when it is an outlier in some column.
    const Tensor<Tensor<Index, 1>, 1> outliers_indices = calculate_Tukey_outliers(cleaning_parameter);

    // @todo The flagged samples are not yet marked as unused;
    // the former Matrix-based code is kept for reference.
    // const Tensor<Index, 1> outliers_samples = outliers_indices(0).get_indices_greater_than(0);

    // set_samples_unused(outliers_samples);

}
9313
9314
9315 /// Returns a matrix with the values of autocorrelation for every variable in the data set.
/// The number of rows is equal to the number of variables.
/// The number of columns is the maximum lags number.
9318 /// @param maximum_lags_number Maximum lags number for which autocorrelation is calculated.
9319 /// @todo
9320
calculate_autocorrelations(const Index & maximum_lags_number) const9321 Tensor<type, 2> DataSet::calculate_autocorrelations(const Index& maximum_lags_number) const
9322 {
9323 if(maximum_lags_number > get_used_samples_number())
9324 {
9325 ostringstream buffer;
9326
9327 buffer << "OpenNN Exception: DataSet class.\n"
9328 << "Tensor<type, 2> autocorrelations(const Index&) method.\n"
9329 << "Maximum lags number(" << maximum_lags_number << ") is greater than the number of samples("
9330 << get_used_samples_number() <<") \n";
9331
9332 throw logic_error(buffer.str());
9333 }
9334
9335 const Index variables_number = data.dimension(1);
9336
9337 Tensor<type, 2> autocorrelations(variables_number, maximum_lags_number);
9338
9339 for(Index j = 0; j < variables_number; j++)
9340 {
9341 // autocorrelations.set_row(j, OpenNN::autocorrelations(data.chip(j,1), maximum_lags_number));
9342 }
9343
9344 return autocorrelations;
9345 }
9346
9347
9348 /// Calculates the cross-correlation between all the variables in the data set.
9349
calculate_cross_correlations(const Index & lags_number) const9350 Tensor<Tensor<type, 1>, 2> DataSet::calculate_cross_correlations(const Index& lags_number) const
9351 {
9352 const Index variables_number = get_variables_number();
9353
9354 Tensor<Tensor<type, 1>, 2> cross_correlations(variables_number, variables_number);
9355
9356 Tensor<type, 1> actual_column;
9357
9358 for(Index i = 0; i < variables_number; i++)
9359 {
9360 actual_column = data.chip(i,1);
9361
9362 for(Index j = 0; j < variables_number; j++)
9363 {
9364 cross_correlations(i,j) = OpenNN::cross_correlations(actual_column, data.chip(j,1), lags_number);
9365 }
9366 }
9367
9368 return cross_correlations;
9369 }
9370
9371
9372 /// @todo
9373
Tensor<type, 2> DataSet::calculate_lag_plot() const
{
    const Index samples_number = get_used_samples_number();

    const Index columns_number = data.dimension(1) - 1;

    // NOTE(review): the returned tensor is allocated but never filled, so its
    // contents are uninitialized; the former Matrix-based code is kept for reference.
    Tensor<type, 2> lag_plot(samples_number, columns_number);

    // lag_plot = data.get_submatrix_columns(columns_indices);

    return lag_plot;
}
9386
9387
9388 /// @todo, check
9389
calculate_lag_plot(const Index & maximum_lags_number)9390 Tensor<type, 2> DataSet::calculate_lag_plot(const Index& maximum_lags_number)
9391 {
9392 const Index samples_number = get_used_samples_number();
9393
9394 if(maximum_lags_number > samples_number)
9395 {
9396 ostringstream buffer;
9397
9398 buffer << "OpenNN Exception: DataSet class.\n"
9399 << "Tensor<type, 2> calculate_lag_plot(const Index&) method.\n"
9400 << "Maximum lags number(" << maximum_lags_number
9401 << ") is greater than the number of samples("
9402 << samples_number << ") \n";
9403
9404 throw logic_error(buffer.str());
9405 }
9406
9407 //const Tensor<type, 2> lag_plot = time_series_data.calculate_lag_plot(maximum_lags_number, time_index);
9408
9409 // return lag_plot;
9410
9411 return Tensor<type, 2>();
9412 }
9413
9414
9415 /// Generates an artificial dataset with a given number of samples and number of variables
9416 /// by constant data.
9417 /// @param samples_number Number of samples in the dataset.
9418 /// @param variables_number Number of variables in the dataset.
9419 /// @todo
9420
generate_constant_data(const Index & samples_number,const Index & variables_number)9421 void DataSet::generate_constant_data(const Index& samples_number, const Index& variables_number)
9422 {
9423 set(samples_number, variables_number);
9424
9425 // data.setRandom(-5.12, 5.12);
9426
9427 for(Index i = 0; i < samples_number; i++)
9428 {
9429 data(i, variables_number-1) = 0;
9430 }
9431
9432 scale_minimum_maximum(data);
9433
9434 set_default_columns_uses();
9435 }
9436
9437
9438 /// Generates an artificial dataset with a given number of samples and number of variables
9439 /// using random data.
9440 /// @param samples_number Number of samples in the dataset.
9441 /// @param variables_number Number of variables in the dataset.
9442 /// @todo
9443
generate_random_data(const Index & samples_number,const Index & variables_number)9444 void DataSet::generate_random_data(const Index& samples_number, const Index& variables_number)
9445 {
9446 set(samples_number, variables_number);
9447
9448 data.setRandom();
9449
9450 // data.setRandom(0.0, 1.0);
9451
9452 }
9453
9454
9455 /// Generates an artificial dataset with a given number of samples and number of variables
9456 /// using a sequential data.
9457 /// @param samples_number Number of samples in the dataset.
9458 /// @param variables_number Number of variables in the dataset.
9459
generate_sequential_data(const Index & samples_number,const Index & variables_number)9460 void DataSet::generate_sequential_data(const Index& samples_number, const Index& variables_number)
9461 {
9462 set(samples_number, variables_number);
9463
9464 for(Index i = 0; i < samples_number; i++)
9465 {
9466 for(Index j = 0; j < variables_number; j++)
9467 {
9468 data(i,j) = static_cast<type>(j);
9469 }
9470 }
9471 }
9472
9473
9474 /// Generates an artificial dataset with a given number of samples and number of variables
9475 /// using a paraboloid data.
9476 /// @param samples_number Number of samples in the dataset.
9477 /// @param variables_number Number of variables in the dataset.
9478 /// @todo
9479
generate_paraboloid_data(const Index & samples_number,const Index & variables_number)9480 void DataSet::generate_paraboloid_data(const Index& samples_number, const Index& variables_number)
9481 {
9482 const Index inputs_number = variables_number-1;
9483
9484 set(samples_number, variables_number);
9485
9486 // data.setRandom();
9487
9488 data.setRandom();
9489
9490 for(Index i = 0; i < samples_number; i++)
9491 {
9492 // const type norm = l2_norm(data.chip(i, 0).delete_last(1));
9493
9494 // data(i, inputs_number) = norm*norm;
9495 }
9496
9497 scale_minimum_maximum(data);
9498 }
9499
9500
9501 /// Generates an artificial dataset with a given number of samples and number of variables
9502 /// using the Rosenbrock function.
9503 /// @param samples_number Number of samples in the dataset.
9504 /// @param variables_number Number of variables in the dataset.
9505 /// @todo
9506
generate_Rosenbrock_data(const Index & samples_number,const Index & variables_number)9507 void DataSet::generate_Rosenbrock_data(const Index& samples_number, const Index& variables_number)
9508 {
9509 const Index inputs_number = variables_number-1;
9510
9511 set(samples_number, variables_number);
9512
9513 data.setRandom();
9514
9515 #pragma omp parallel for
9516
9517 for(Index i = 0; i < samples_number; i++)
9518 {
9519 type rosenbrock = 0;
9520
9521 for(Index j = 0; j < inputs_number-1; j++)
9522 {
9523 const type value = data(i,j);
9524 const type next_value = data(i,j+1);
9525
9526 rosenbrock += (1 - value)*(1 - value)
9527 + 100*(next_value-value*value)*(next_value-value*value);
9528 }
9529
9530 data(i, inputs_number) = rosenbrock;
9531 }
9532
9533 set_default_columns_uses();
9534 }
9535
9536
9537 /// @todo
9538
generate_inputs_selection_data(const Index & samples_number,const Index & variables_number)9539 void DataSet::generate_inputs_selection_data(const Index& samples_number, const Index& variables_number)
9540 {
9541 set(samples_number,variables_number);
9542
9543 data.setRandom();
9544
9545 for(Index i = 0; i < samples_number; i++)
9546 {
9547 for(Index j = 0; j < variables_number-2; j++)
9548 {
9549 data(i,variables_number-1) += data(i,j);
9550 }
9551 }
9552
9553 set_default_columns_uses();
9554 }
9555
9556
generate_sum_data(const Index & samples_number,const Index & variables_number)9557 void DataSet::generate_sum_data(const Index& samples_number, const Index& variables_number)
9558 {
9559 set(samples_number,variables_number);
9560
9561 data.setRandom();
9562
9563 for(Index i = 0; i < samples_number; i++)
9564 {
9565 for(Index j = 0; j < variables_number-1; j++)
9566 {
9567 data(i,variables_number-1) += data(i,j);
9568 }
9569 }
9570
9571 set_default();
9572
9573 scale_data_mean_standard_deviation();
9574
9575 }
9576
9577
9578 /// Generate artificial data for a binary classification problem with a given number of samples and inputs.
9579 /// @param samples_number Number of the samples to generate.
9580 /// @param inputs_number Number of the variables that the data set will have.
9581 /// @todo
9582
generate_data_binary_classification(const Index & samples_number,const Index & inputs_number)9583 void DataSet::generate_data_binary_classification(const Index& samples_number, const Index& inputs_number)
9584 {
9585 const Index negatives = samples_number/2;
9586 const Index positives = samples_number - negatives;
9587
9588 // Negatives data
9589
9590 Tensor<type, 1> target_0(negatives);
9591
9592 Tensor<type, 2> class_0(negatives, inputs_number+1);
9593
9594 // class_0.setRandom(-0.5, 1.0);
9595
9596 // class_0.set_column(inputs_number, target_0, "");
9597
9598 // Positives data
9599
9600 // Tensor<type, 1> target_1(positives, 1.0);
9601
9602 // Tensor<type, 2> class_1(positives, inputs_number+1);
9603
9604 // class_1.setRandom(0.5, 1.0);
9605
9606 // class_1.set_column(inputs_number, target_1, "");
9607
9608 // Assemble
9609
9610 // set(class_0.assemble_rows(class_1));
9611 }
9612
9613
9614 /// @todo Low priority.
9615
generate_data_multiple_classification(const Index & samples_number,const Index & inputs_number,const Index & outputs_number)9616 void DataSet::generate_data_multiple_classification(const Index& samples_number, const Index& inputs_number, const Index& outputs_number)
9617 {
9618 Tensor<type, 2> new_data(samples_number, inputs_number);
9619
9620 new_data.setRandom();
9621
9622 Tensor<type, 2> targets(samples_number, outputs_number);
9623
9624 Index target_index = 0;
9625
9626 for(Index i = 0; i < samples_number; i ++)
9627 {
9628 target_index = static_cast<unsigned>(rand())%outputs_number;
9629
9630 targets(i, target_index) = 1.0;
9631 }
9632
9633 // set(new_data.assemble_columns(targets));
9634 }
9635
9636
9637 /// Returns true if the data matrix is not empty(it has not been loaded),
9638 /// and false otherwise.
9639
has_data() const9640 bool DataSet::has_data() const
9641 {
9642 if(is_empty())
9643 {
9644 return false;
9645 }
9646 else
9647 {
9648 return true;
9649 }
9650 }
9651
9652
9653 /// Unuses those samples with values outside a defined range.
9654 /// @param minimums vector of minimum values in the range.
9655 /// The size must be equal to the number of variables.
9656 /// @param maximums vector of maximum values in the range.
9657 /// The size must be equal to the number of variables.
9658 /// @todo Low priority.
9659
/// Unuses those samples with values outside a defined range.
/// Scans every used variable over every used sample; samples whose value falls
/// strictly outside [minimum, maximum] for any variable are marked Unused.
/// @param minimums vector of minimum values in the range.
/// The size must be equal to the number of used variables.
/// @param maximums vector of maximum values in the range.
/// The size must be equal to the number of used variables.
/// @return Indices of the samples that were filtered out (set to Unused).
/// @todo Low priority.

Tensor<Index, 1> DataSet::filter_data(const Tensor<type, 1>& minimums, const Tensor<type, 1>& maximums)
{
    const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();

    const Index used_variables_number = used_variables_indices.size();

#ifdef __OPENNN_DEBUG__

    if(minimums.size() != used_variables_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
               << "Size of minimums(" << minimums.size() << ") is not equal to number of variables(" << used_variables_number << ").\n";

        throw logic_error(buffer.str());
    }

    if(maximums.size() != used_variables_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
               << "Size of maximums(" << maximums.size() << ") is not equal to number of variables(" << used_variables_number << ").\n";

        throw logic_error(buffer.str());
    }

#endif

    const Index samples_number = get_samples_number();

    // 1.0 marks a sample as filtered out; indexed by absolute sample index.

    Tensor<type, 1> filtered_indices(samples_number);
    filtered_indices.setZero();

    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Index used_samples_number = used_samples_indices.size();

    Index sample_index = 0;

    for(Index i = 0; i < used_variables_number; i++)
    {
        const Index variable_index = used_variables_indices(i);

        for(Index j = 0; j < used_samples_number; j++)
        {
            sample_index = used_samples_indices(j);

            // A sample already unused (possibly by an earlier variable in this
            // same scan) is skipped, as are missing values.

            if(get_sample_use(sample_index) == UnusedSample) continue;

            if(isnan(data(sample_index, variable_index))) continue;

            // Values within 1e-3 of either bound are tolerated (kept).

            if(fabsf(data(sample_index, variable_index) - minimums(i)) <= static_cast<type>(1e-3)
                    || fabsf(data(sample_index, variable_index) - maximums(i)) <= static_cast<type>(1e-3)) continue;

            if(data(sample_index,variable_index) < minimums(i)
                    || data(sample_index,variable_index) > maximums(i))
            {
                filtered_indices(sample_index) = 1.0;

                set_sample_use(sample_index, UnusedSample);
            }
        }
    }

    // Collect the absolute indices of the filtered samples.

    Index filtered_samples_number =
            static_cast<Index>(std::count_if(filtered_indices.data(), filtered_indices.data()+filtered_indices.size(), [](type value) {return value > static_cast<type>(0.5);}));

    Tensor<Index, 1> filtered_samples_indices(filtered_samples_number);
    Index index = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(filtered_indices(i) > static_cast<type>(0.5))
        {
            filtered_samples_indices(index) = i;
            index++;
        }
    }

    return filtered_samples_indices;
}
9744
9745
9746 /// Filter data set variable using a rank.
9747 /// The values within the variable must be between minimum and maximum.
9748 /// @param variable_index Index number where the variable to be filtered is located.
9749 /// @param minimum Value that determine the lower limit.
9750 /// @param maximum Value that determine the upper limit.
9751 /// Returns a indices vector.
9752 /// @todo
9753
filter_column(const Index & variable_index,const type & minimum,const type & maximum)9754 Tensor<Index, 1> DataSet::filter_column(const Index& variable_index, const type& minimum, const type& maximum)
9755 {
9756 const Index samples_number = get_samples_number();
9757
9758 Tensor<type, 1> filtered_indices(samples_number);
9759
9760 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9761
9762 const Tensor<Index, 1> current_samples_indices = used_samples_indices;
9763
9764 const Index current_samples_number = current_samples_indices.size();
9765
9766 for(Index i = 0; i < current_samples_number; i++)
9767 {
9768 const Index index = current_samples_indices(i);
9769
9770 if(data(index,variable_index) < minimum || data(index,variable_index) > maximum)
9771 {
9772 filtered_indices(index) = 1.0;
9773
9774 set_sample_use(index, UnusedSample);
9775 }
9776 }
9777
9778 // return filtered_indices.get_indices_greater_than(0.5);
9779
9780 return Tensor<Index, 1>();
9781 }
9782
9783
9784 /// Filter data set variable using a rank.
9785 /// The values within the variable must be between minimum and maximum.
9786 /// @param variable_name String name where the variable to be filtered is located.
9787 /// @param minimum Value that determine the lower limit.
9788 /// @param maximum Value that determine the upper limit.
9789 /// Returns a indices vector.
9790 /// @todo
9791
filter_column(const string & variable_name,const type & minimum,const type & maximum)9792 Tensor<Index, 1> DataSet::filter_column(const string& variable_name, const type& minimum, const type& maximum)
9793 {
9794 const Index variable_index = get_variable_index(variable_name);
9795
9796 const Index samples_number = get_samples_number();
9797
9798 Tensor<type, 1> filtered_indices(samples_number);
9799
9800 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9801
9802 const Index current_samples_number = used_samples_indices.size();
9803
9804 for(Index i = 0; i < current_samples_number; i++)
9805 {
9806 const Index index = used_samples_indices(i);
9807
9808 if(data(index,variable_index) < minimum || data(index,variable_index) > maximum)
9809 {
9810 filtered_indices(index) = 1.0;
9811
9812 set_sample_use(index, UnusedSample);
9813 }
9814 }
9815
9816 // return filtered_indices.get_indices_greater_than(0.5);
9817
9818 return Tensor<Index, 1>();
9819 }
9820
9821
9822 /// This method converts a numerical variable into categorical.
9823 /// Note that this method resizes the dataset.
9824 /// @param variable_index Index of the variable to be converted.
9825
/// This method converts a numerical variable into categorical.
/// Note that this method resizes the dataset.
/// @param variable_index Index of the variable to be converted.
/// NOTE(review): the implementation is currently commented out — only the
/// debug bounds check runs; calling this method has no effect on the data.

void DataSet::numeric_to_categorical(const Index& variable_index)
{
#ifdef __OPENNN_DEBUG__

    const Index variables_number = get_variables_number();

    if(variable_index >= variables_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void convert_categorical_variable(const Index&) method.\n"
               << "Index of variable(" << variable_index << ") must be less than number of variables (" << variables_number << ").\n";

        throw logic_error(buffer.str());
    }

#endif

    // Pending implementation: gather unique values, one-hot expand the column
    // and update the column metadata accordingly.

    // const Tensor<type, 1> categories = data.get_column(variable_index).get_unique_elements();

    // data = data.to_categorical(variable_index);

    // columns(variable_index).categories_uses = Tensor<VariableUse, 1>(categories.size(), columns(variable_index).column_use);
    // columns(variable_index).type = Categorical;
    // columns(variable_index).categories = categories.to_string_vector();
}
9853
9854
9855 /// Sets all the samples with missing values to "Unused".
9856
impute_missing_values_unuse()9857 void DataSet::impute_missing_values_unuse()
9858 {
9859 const Index samples_number = get_samples_number();
9860
9861 #pragma omp parallel for
9862
9863 for(Index i = 0; i <samples_number; i++)
9864 {
9865 if(has_nan_row(i))
9866 {
9867 set_sample_use(i, "Unused");
9868 }
9869 }
9870 }
9871
9872 /// Substitutes all the missing values by the mean of the corresponding variable.
9873
impute_missing_values_mean()9874 void DataSet::impute_missing_values_mean()
9875 {
9876 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9877 const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();
9878
9879 const Tensor<type, 1> means = mean(data, used_samples_indices, used_variables_indices);
9880
9881 const Index samples_number = used_samples_indices.size();
9882 const Index variables_number = used_variables_indices.size();
9883
9884 Index current_variable;
9885 Index current_sample;
9886
9887 #pragma omp parallel for schedule(dynamic)
9888
9889 for(Index j = 0; j < variables_number; j++)
9890 {
9891 current_variable = used_variables_indices(j);
9892
9893 for(Index i = 0; i < samples_number; i++)
9894 {
9895 current_sample = used_samples_indices(i);
9896
9897 if(::isnan(data(current_sample, current_variable)))
9898 {
9899 data(current_sample,current_variable) = means(j);
9900 }
9901 }
9902 }
9903 }
9904
9905
9906 /// Substitutes all the missing values by the median of the corresponding variable.
9907
impute_missing_values_median()9908 void DataSet::impute_missing_values_median()
9909 {
9910 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9911 const Tensor<Index, 1> used_variables_indices = get_used_columns_indices();
9912
9913 const Tensor<type, 1> medians = median(data, used_samples_indices, used_variables_indices);
9914
9915 const Index variables_number = used_variables_indices.size();
9916 const Index samples_number = used_samples_indices.size();
9917
9918 #pragma omp parallel for schedule(dynamic)
9919
9920 for(Index j = 0; j < variables_number; j++)
9921 {
9922 for(Index i = 0 ; i < samples_number ; i++)
9923 {
9924 if(::isnan(data(used_samples_indices(i),used_variables_indices(j)))) data(used_samples_indices(i),used_variables_indices(j)) = medians(j);
9925 }
9926 }
9927 }
9928
9929
9930 /// General method for dealing with missing values.
9931 /// It switches among the different scrubbing methods available,
9932 /// according to the corresponding value in the missing values object.
9933
scrub_missing_values()9934 void DataSet::scrub_missing_values()
9935 {
9936 switch(missing_values_method)
9937 {
9938 case Unuse:
9939 {
9940 impute_missing_values_unuse();
9941 }
9942 break;
9943
9944 case Mean:
9945 {
9946 impute_missing_values_mean();
9947 }
9948 break;
9949
9950 case Median:
9951 {
9952 impute_missing_values_median();
9953 }
9954 break;
9955 }
9956 }
9957
9958
9959 /// @todo Time series stuff?
9960
read_csv()9961 void DataSet::read_csv()
9962 {
9963 read_csv_1();
9964
9965 if(!has_time_columns() && !has_categorical_columns())
9966 {
9967 read_csv_2_simple();
9968
9969 read_csv_3_simple();
9970 }
9971 else
9972 {
9973
9974 // categorical data
9975
9976 read_csv_2_complete();
9977
9978 read_csv_3_complete();
9979 }
9980 }
9981
9982
get_default_columns_names(const Index & columns_number)9983 Tensor<string, 1> DataSet::get_default_columns_names(const Index& columns_number)
9984 {
9985 Tensor<string, 1> columns_names(columns_number);
9986
9987 for(Index i = 0; i < columns_number; i++)
9988 {
9989 ostringstream buffer;
9990
9991 buffer << "column_" << i+1;
9992
9993 columns_names(i) = buffer.str();
9994 }
9995
9996 return columns_names;
9997 }
9998
9999
read_csv_1()10000 void DataSet::read_csv_1()
10001 {
10002 ifstream file(data_file_name.c_str());
10003
10004 if(!file.is_open())
10005 {
10006 ostringstream buffer;
10007
10008 buffer << "OpenNN Exception: DataSet class.\n"
10009 << "void read_csv() method.\n"
10010 << "Cannot open data file: " << data_file_name << "\n";
10011
10012 throw logic_error(buffer.str());
10013 }
10014
10015 const char separator_char = get_separator_char();
10016
10017 cout << "Setting data file preview..." << endl;
10018
10019 Index lines_number = has_columns_names ? 4 : 3;
10020
10021 data_file_preview.resize(lines_number);
10022
10023 string line;
10024
10025 Index lines_count = 0;
10026
10027 while(file.good())
10028 {
10029 getline(file, line);
10030
10031 trim(line);
10032
10033 erase(line, '"');
10034
10035 if(line.empty()) continue;
10036
10037 check_separators(line);
10038
10039 check_special_characters(line);
10040
10041 data_file_preview(lines_count) = get_tokens(line, separator_char);
10042
10043 lines_count++;
10044
10045 if(lines_count == lines_number) break;
10046 }
10047
10048 file.close();
10049
10050 // Check empty file @todo, size() methods returns 0
10051
10052 if(data_file_preview(0).size() == 0)
10053 {
10054 ostringstream buffer;
10055
10056 buffer << "OpenNN Exception: DataSet class.\n"
10057 << "void read_csv_1() method.\n"
10058 << "File " << data_file_name << " is empty.\n";
10059
10060 throw logic_error(buffer.str());
10061 }
10062
10063 // Set rows labels and columns names
10064
10065 cout << "Setting rows labels..." << endl;
10066
10067 string first_name = data_file_preview(0)(0);
10068 transform(first_name.begin(), first_name.end(), first_name.begin(), ::tolower);
10069
10070 if(contains_substring(first_name, "id"))
10071 {
10072 has_rows_labels = true;
10073 }
10074
10075 const Index columns_number = has_rows_labels ? data_file_preview(0).size()-1 : data_file_preview(0).size();
10076
10077 columns.resize(columns_number);
10078
10079 // Check if header has numeric value
10080
10081 if(has_columns_names && has_numbers(data_file_preview(0)))
10082 {
10083 ostringstream buffer;
10084
10085 buffer << "OpenNN Exception: DataSet class.\n"
10086 << "void read_csv_1() method.\n"
10087 << "Some columns names are numeric.\n";
10088
10089 throw logic_error(buffer.str());
10090 }
10091
10092 // Columns names
10093
10094 cout << "Setting columns names..." << endl;
10095
10096 if(has_columns_names)
10097 {
10098 has_rows_labels ? set_columns_names(data_file_preview(0).slice(Eigen::array<Eigen::Index, 1>({1}), Eigen::array<Eigen::Index, 1>({data_file_preview(0).size()-1})))
10099 : set_columns_names(data_file_preview(0));
10100 }
10101 else
10102 {
10103 set_columns_names(get_default_columns_names(columns_number));
10104 }
10105
10106 // Columns types
10107
10108 cout << "Setting columns types..." << endl;
10109
10110 Index column_index = 0;
10111
10112 for(Index i = 0; i < data_file_preview(0).dimension(0); i++)
10113 {
10114 if(has_rows_labels && i == 0) continue;
10115
10116 if(((is_numeric_string(data_file_preview(1)(i)) && data_file_preview(1)(i) != missing_values_label) || data_file_preview(1)(i).empty())
10117 || ((is_numeric_string(data_file_preview(2)(i)) && data_file_preview(2)(i) != missing_values_label) || data_file_preview(1)(i).empty())
10118 || ((is_numeric_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) != missing_values_label) || data_file_preview(1)(i).empty())
10119 || ((is_numeric_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) != missing_values_label) || data_file_preview(1)(i).empty()))
10120 {
10121 columns(column_index).type = Numeric;
10122 column_index++;
10123 }
10124 else if((is_date_time_string(data_file_preview(1)(i)) && data_file_preview(1)(i) != missing_values_label)
10125 || (is_date_time_string(data_file_preview(2)(i)) && data_file_preview(2)(i) != missing_values_label)
10126 || (is_date_time_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) != missing_values_label)
10127 || (is_date_time_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) != missing_values_label))
10128 {
10129 columns(column_index).type = DateTime;
10130 column_index++;
10131 }
10132 else
10133 {
10134 columns(column_index).type = Categorical;
10135 column_index++;
10136 }
10137 }
10138
10139
10140 }
10141
10142
/// Second pass of the simple CSV reader (numeric-only files): counts the data
/// rows, validates the token count of every line, sizes the data matrix and
/// initializes the sample uses with a random split.
/// @throws logic_error if the file cannot be opened or a line has a wrong
/// number of tokens.

void DataSet::read_csv_2_simple()
{
    ifstream file(data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_2_simple() method.\n"
               << "Cannot open data file: " << data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    string line;
    Index line_number = 0;

    // Skip the header line (the first non-empty line) when present.

    if(has_columns_names)
    {
        while(file.good())
        {
            line_number++;

            getline(file, line);

            trim(line);

            erase(line, '"');

            if(line.empty()) continue;

            break;
        }
    }

    Index samples_count = 0;

    Index tokens_count;

    cout << "Setting data dimensions..." << endl;

    const char separator_char = get_separator_char();

    const Index columns_number = get_columns_number();

    // Raw files carry one extra token per line when rows have labels.

    const Index raw_columns_number = has_rows_labels ? columns_number + 1 : columns_number;

    while(file.good())
    {
        line_number++;

        getline(file, line);

        trim(line);

        //erase(line, '"');

        if(line.empty()) continue;

        tokens_count = count_tokens(line, separator_char);

        if(tokens_count != raw_columns_number)
        {
            ostringstream buffer;

            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void read_csv_2_simple() method.\n"
                   << "Line " << line_number << ": Size of tokens("
                   << tokens_count << ") is not equal to number of columns("
                   << raw_columns_number << ").\n";

            throw logic_error(buffer.str());
        }

        samples_count++;
    }

    file.close();

    data.resize(samples_count, columns_number);

    set_default_columns_uses();

    // All samples start as Training; the random split reassigns them.

    samples_uses.resize(samples_count);
    samples_uses.setConstant(Training);

    split_samples_random();
}
10231
10232
read_csv_3_simple()10233 void DataSet::read_csv_3_simple()
10234 {
10235 ifstream file(data_file_name.c_str());
10236
10237 if(!file.is_open())
10238 {
10239 ostringstream buffer;
10240
10241 buffer << "OpenNN Exception: DataSet class.\n"
10242 << "void read_csv_2_simple() method.\n"
10243 << "Cannot open data file: " << data_file_name << "\n";
10244
10245 throw logic_error(buffer.str());
10246 }
10247
10248 const bool is_float = is_same<type, float>::value;
10249
10250 const char separator_char = get_separator_char();
10251
10252 string line;
10253
10254 // Read header
10255
10256 if(has_columns_names)
10257 {
10258 while(file.good())
10259 {
10260 getline(file, line);
10261
10262 if(line.empty()) continue;
10263
10264 break;
10265 }
10266 }
10267
10268
10269 // Read data
10270
10271 Index j = 0;
10272
10273 //???
10274
10275 const Index raw_columns_number = has_rows_labels ? get_columns_number() + 1 : get_columns_number();
10276
10277 Tensor<string, 1> tokens(raw_columns_number);
10278
10279 const Index samples_number = data.dimension(0);
10280
10281 if(has_rows_labels) rows_labels.resize(samples_number);
10282
10283 cout << "Reading data..." << endl;
10284
10285 Index sample_index = 0;
10286 Index column_index = 0;
10287
10288
10289 while(file.good())
10290 {
10291 getline(file, line);
10292
10293 trim(line);
10294
10295 erase(line, '"');
10296
10297 if(line.empty()) continue;
10298
10299 fill_tokens(line, separator_char, tokens);
10300
10301 for(j = 0; j < raw_columns_number; j++)
10302 {
10303 trim(tokens(j));
10304
10305 if(has_rows_labels && j == 0)
10306 {
10307 rows_labels(sample_index) = tokens(j);
10308 }
10309 else if(tokens(j) == missing_values_label || tokens(j).empty())
10310 {
10311 data(sample_index, column_index) = static_cast<type>(NAN);
10312 column_index++;
10313 }
10314 else if(is_float)
10315 {
10316 data(sample_index, column_index) = strtof(tokens(j).data(), NULL);
10317 column_index++;
10318 }
10319 else
10320 {
10321 data(sample_index, column_index) = stof(tokens(j));
10322 column_index++;
10323 }
10324 }
10325
10326 column_index = 0;
10327 sample_index++;
10328 }
10329
10330 const Index data_file_preview_index = has_columns_names ? 3 : 2;
10331
10332 data_file_preview(data_file_preview_index) = tokens;
10333
10334 file.close();
10335
10336 cout << "Data read succesfully..." << endl;
10337
10338 // Check Constant
10339
10340
10341 cout << "Checking constant columns..." << endl;
10342
10343 Index variable_index = 0;
10344
10345 for(Index column = 0; column < get_columns_number(); column++)
10346 {
10347 if(columns(column).type == Numeric)
10348 {
10349 // @todo avoid chip
10350
10351 // if(is_constant_numeric(data.chip(variable_index, 1)))
10352 // {
10353 // columns(column).type = Constant;
10354 // columns(column).column_use = UnusedVariable;
10355 // }
10356
10357 const type a = data(0, variable_index);
10358
10359 bool constant = true;
10360
10361 for (int i = 1; i < data.dimension(0); i++)
10362 {
10363 if (abs(data(i, variable_index)-a) > 1e-3 || ::isnan(data(i, variable_index)) || ::isnan(a))
10364 constant = false;
10365 }
10366
10367 if(constant)
10368 {
10369 columns(column).type = Constant;
10370 columns(column).column_use = UnusedVariable;
10371 }
10372
10373 variable_index++;
10374 }
10375 else if(columns(column).type == DateTime)
10376 {
10377 columns(column).column_use = UnusedVariable;
10378 variable_index++;
10379 }
10380 else if(columns(column).type == Constant)
10381 {
10382 variable_index++;
10383 }
10384 else if(columns(column).type == Binary)
10385 {
10386 if(columns(column).get_categories_number() == 1)
10387 {
10388 columns(column).type = Constant;
10389 columns(column).column_use = UnusedVariable;
10390 }
10391
10392 variable_index++;
10393 }
10394 else if(columns(column).type == Categorical)
10395 {
10396 if(columns(column).get_categories_number() == 1)
10397 {
10398 columns(column).type = Constant;
10399 columns(column).column_use = UnusedVariable;
10400 }
10401
10402 variable_index += columns(column).get_categories_number();
10403 }
10404
10405 // if(is_constant_numeric(data.chip(column, 1)) && columns(column).type!=DateTime)
10406 // {
10407 // columns(column).type = Constant;
10408 // columns(column).column_use = UnusedVariable;
10409 // }
10410 }
10411 // Check Binary
10412
10413 cout << "Checking binary columns..." << endl;
10414
10415 set_binary_simple_columns();
10416
10417
10418 }
10419
10420
/// Second pass of the complete CSV reader (files with time and/or categorical
/// columns): counts the data rows, accumulates the category sets of the
/// categorical columns, turns two-category columns into Binary, sizes the data
/// matrix and initializes the sample uses with a random split.
/// @throws logic_error if the file cannot be opened or a line has a wrong
/// number of tokens.
/// NOTE(review): the loops index columns with `unsigned` while counts are the
/// signed Index type — mixed signed/unsigned comparisons; confirm harmless.

void DataSet::read_csv_2_complete()
{
    ifstream file(data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_2_complete() method.\n"
               << "Cannot open data file: " << data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    const char separator_char = get_separator_char();

    string line;

    Tensor<string, 1> tokens;

    Index lines_count = 0;
    Index tokens_count;

    const Index columns_number = columns.size();

    // Non-categorical columns default to Input; categorical uses are decided later.

    for(unsigned j = 0; j < columns_number; j++)
    {
        if(columns(j).type != Categorical)
        {
            columns(j).column_use = Input;
        }
    }

    // Skip header

    if(has_columns_names)
    {
        while(file.good())
        {
            getline(file, line);

            trim(line);

            if(line.empty()) continue;

            break;
        }
    }

    // Read data

    cout << "Setting data dimensions..." << endl;

    // Raw files carry one extra token per line when rows have labels.

    const Index raw_columns_number = has_rows_labels ? columns_number + 1 : columns_number;

    Index column_index = 0;

    while(file.good())
    {
        getline(file, line);

        trim(line);

        if(line.empty()) continue;

        tokens = get_tokens(line, separator_char);

        tokens_count = tokens.size();

        if(static_cast<unsigned>(tokens_count) != raw_columns_number)
        {
            const string message =
                "Sample " + to_string(lines_count+1) + " error:\n"
                "Size of tokens (" + to_string(tokens_count) + ") is not equal to number of columns (" + to_string(raw_columns_number) + ").\n"
                "Please check the format of the data file (e.g: Use of commas both as decimal and column separator)";

            throw logic_error(message);
        }

        for(unsigned j = 0; j < raw_columns_number; j++)
        {
            if(has_rows_labels && j == 0)
            {
                continue;
            }

            trim(tokens(j));

            // Collect previously unseen categories; the missing-values label
            // is never registered as a category.

            if(columns(column_index).type == Categorical)
            {
                if(find(columns(column_index).categories.data(), columns(column_index).categories.data() + columns(column_index).categories.size(), tokens(j)) == (columns(column_index).categories.data() + columns(column_index).categories.size()))
                {
                    if(tokens(j) == missing_values_label)
                    {
                        column_index++;
                        continue;
                    }

                    columns(column_index).add_category(tokens(j));
                }
            }

            column_index++;
        }

        column_index = 0;

        lines_count++;
    }

    cout << "Setting types..." << endl;

    // Exactly two categories means the column is really binary.

    for(Index j = 0; j < columns_number; j++)
    {
        if(columns(j).type == Categorical)
        {
            if(columns(j).categories.size() == 2)
            {
                columns(j).type = Binary;
            }
        }
    }

    file.close();

    const Index samples_number = static_cast<unsigned>(lines_count);

    const Index variables_number = get_variables_number();

    data.resize(static_cast<Index>(samples_number), variables_number);
    data.setZero();

    if(has_rows_labels) rows_labels.resize(samples_number);

    set_default_columns_uses();

    // All samples start as Training; the random split reassigns them.

    samples_uses.resize(static_cast<Index>(samples_number));

    samples_uses.setConstant(Training);

    split_samples_random();
}
10565
10566
read_csv_3_complete()10567 void DataSet::read_csv_3_complete()
10568 {
10569 ifstream file(data_file_name.c_str());
10570
10571 if(!file.is_open())
10572 {
10573 ostringstream buffer;
10574
10575 buffer << "OpenNN Exception: DataSet class.\n"
10576 << "void read_csv_3_complete() method.\n"
10577 << "Cannot open data file: " << data_file_name << "\n";
10578
10579 throw logic_error(buffer.str());
10580 }
10581
10582
10583
10584 const char separator_char = get_separator_char();
10585
10586 const Index columns_number = columns.size();
10587
10588 const Index raw_columns_number = has_rows_labels ? columns_number+1 : columns_number;
10589
10590 string line;
10591
10592 Tensor<string, 1> tokens;
10593
10594 string token;
10595
10596 unsigned sample_index = 0;
10597 unsigned variable_index = 0;
10598 unsigned column_index = 0;
10599
10600 // Skip header
10601
10602 if(has_columns_names)
10603 {
10604 while(file.good())
10605 {
10606 getline(file, line);
10607
10608 trim(line);
10609
10610 if(line.empty()) continue;
10611
10612 break;
10613 }
10614 }
10615
10616
10617 // Read data
10618
10619 cout << "Reading data..." << endl;
10620
10621 while(file.good())
10622 {
10623 getline(file, line);
10624
10625 trim(line);
10626
10627 erase(line, '"');
10628
10629 if(line.empty()) continue;
10630
10631 tokens = get_tokens(line, separator_char);
10632
10633 variable_index = 0;
10634 column_index = 0;
10635
10636 for(Index j = 0; j < raw_columns_number; j++)
10637 {
10638
10639 trim(tokens(j));
10640
10641 if(has_rows_labels && j ==0)
10642 {
10643 rows_labels(sample_index) = tokens(j);
10644 continue;
10645 }
10646 else if(columns(column_index).type == Numeric)
10647 {
10648
10649 if(tokens(j) == missing_values_label || tokens(j).empty())
10650 {
10651 data(sample_index, variable_index) = static_cast<type>(NAN);
10652 variable_index++;
10653 }
10654 else
10655 {
10656 try
10657 {
10658
10659 data(sample_index, variable_index) = static_cast<type>(stod(tokens(j)));
10660 variable_index++;
10661 }
10662 catch (invalid_argument)
10663 {
10664 ostringstream buffer;
10665
10666 buffer << "OpenNN Exception: DataSet class.\n"
10667 << "void read_csv_3_complete() method.\n"
10668 << "Sample " << sample_index << "; Invalid number: " << tokens(j) << "\n";
10669
10670 throw logic_error(buffer.str());
10671 }
10672 }
10673 }
10674 else if(columns(column_index).type == DateTime)
10675 {
10676 if(tokens(j) == missing_values_label || tokens(j).empty())
10677 {
10678 data(sample_index, variable_index) = static_cast<type>(NAN);
10679 variable_index++;
10680 }
10681 else
10682 {
10683 data(sample_index, variable_index) = static_cast<type>(date_to_timestamp(tokens(j), gmt));
10684 variable_index++;
10685 }
10686 }
10687 else if(columns(column_index).type == Categorical)
10688 {
10689 for(Index k = 0; k < columns(column_index).get_categories_number(); k++)
10690 {
10691 if(tokens(j) == missing_values_label)
10692 {
10693 data(sample_index, variable_index) = static_cast<type>(NAN);
10694 }
10695 else if(tokens(j) == columns(column_index).categories(k))
10696 {
10697 data(sample_index, variable_index) = 1.0;
10698 }
10699
10700 variable_index++;
10701 }
10702 }
10703 else if(columns(column_index).type == Binary)
10704 {
10705 if(tokens(j) == missing_values_label)
10706 {
10707 data(sample_index, variable_index) = static_cast<type>(NAN);
10708 }
10709 else if(columns(column_index).categories.size() > 0 && tokens(j) == columns(column_index).categories(0))
10710 {
10711 data(sample_index, variable_index) = 1.0;
10712 }
10713 else if(tokens(j) == columns(column_index).name)
10714 {
10715 data(sample_index, variable_index) = 1.0;
10716 }
10717
10718 variable_index++;
10719 }
10720
10721 column_index++;
10722 }
10723
10724 sample_index++;
10725 }
10726
10727 const Index data_file_preview_index = has_columns_names ? 3 : 2;
10728
10729 data_file_preview(data_file_preview_index) = tokens;
10730
10731 cout << "Data read succesfully..." << endl;
10732
10733 file.close();
10734
10735 // Check binary
10736 cout << "Checking binary columns..." << endl;
10737
10738 set_binary_simple_columns();
10739
10740 // Check Constant and DateTime to unused
10741
10742 cout << "Checking constant columns..." << endl;
10743
10744 variable_index = 0;
10745
10746 for(Index column = 0; column < get_columns_number(); column++)
10747 {
10748 if(columns(column).type == Numeric)
10749 {
10750 const Tensor<type, 1> numeric_column = data.chip(variable_index, 1);
10751
10752 if(standard_deviation(numeric_column) - static_cast<type>(0) < static_cast<type>(1.0-3))
10753 {
10754
10755 columns(column).type = Constant;
10756 columns(column).column_use = UnusedVariable;
10757 }
10758
10759 variable_index++;
10760 }
10761 else if(columns(column).type == DateTime)
10762 {
10763 columns(column).column_use = UnusedVariable;
10764 variable_index++;
10765 }
10766 else if(columns(column).type == Constant)
10767 {
10768 columns(column).column_use = UnusedVariable;
10769
10770 variable_index++;
10771 }
10772 else if(columns(column).type == Binary)
10773 {
10774 if(columns(column).get_categories_number() == 1)
10775 {
10776 columns(column).type = Constant;
10777 columns(column).column_use = UnusedVariable;
10778 columns(column).set_categories_uses(UnusedVariable);
10779 }
10780
10781 variable_index++;
10782 }
10783 else if(columns(column).type == Categorical)
10784 {
10785 if(columns(column).get_categories_number() == 1)
10786 {
10787 columns(column).type = Constant;
10788 columns(column).column_use = UnusedVariable;
10789 columns(column).set_categories_uses(UnusedVariable);
10790 }
10791
10792 variable_index += columns(column).get_categories_number();
10793 }
10794 }
10795 }
10796
10797
check_separators(const string & line) const10798 void DataSet::check_separators(const string& line) const
10799 {
10800 if(line.find(',') == string::npos
10801 && line.find(';') == string::npos
10802 && line.find(' ') == string::npos
10803 && line.find('\t') == string::npos)
10804 {
10805 return;
10806 }
10807
10808 const char separator_char = get_separator_char();
10809
10810 if(line.find(separator_char) == string::npos)
10811 {
10812 const string message =
10813 "Error: " + get_separator_string() + " separator not found in data file " + data_file_name + ".";
10814
10815 throw logic_error(message);
10816 }
10817
10818 if(separator == Space)
10819 {
10820 if(line.find(',') != string::npos)
10821 {
10822 const string message =
10823 "Error: Found comma (',') in data file " + data_file_name + ", but separator is space (' ').";
10824
10825 throw logic_error(message);
10826 }
10827 if(line.find(';') != string::npos)
10828 {
10829 const string message =
10830 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is space (' ').";
10831
10832 throw logic_error(message);
10833 }
10834 }
10835 else if(separator == Tab)
10836 {
10837 if(line.find(',') != string::npos)
10838 {
10839 const string message =
10840 "Error: Found comma (',') in data file " + data_file_name + ", but separator is tab (' ').";
10841
10842 throw logic_error(message);
10843 }
10844 if(line.find(';') != string::npos)
10845 {
10846 const string message =
10847 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is tab (' ').";
10848
10849 throw logic_error(message);
10850 }
10851 }
10852 else if(separator == Comma)
10853 {
10854 if(line.find(";") != string::npos)
10855 {
10856 const string message =
10857 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is comma (',').";
10858
10859 throw logic_error(message);
10860 }
10861 }
10862 else if(separator == Semicolon)
10863 {
10864 if(line.find(",") != string::npos)
10865 {
10866 const string message =
10867 "Error: Found comma (',') in data file " + data_file_name + ", but separator is semicolon (';'). " + line;
10868
10869 throw logic_error(message);
10870 }
10871 }
10872 }
10873
10874
check_special_characters(const string & line) const10875 void DataSet::check_special_characters(const string & line) const
10876 {
10877 if(line.find_first_of("|@#~€¬^*") != std::string::npos)
10878 {
10879 const string message =
10880 "Error: found special characters in line: " + line + ". Please, review the document.";
10881
10882 throw logic_error(message);
10883 }
10884
10885 #ifdef __unix__
10886 if(line.find("\r") != std::string::npos)
10887 {
10888 const string message =
10889 "Error: mixed break line characters in line: " + line + ". Please, review the document.";
10890 throw logic_error(message);
10891 }
10892 #endif
10893 }
10894
10895
has_binary_columns() const10896 bool DataSet::has_binary_columns() const
10897 {
10898 const Index variables_number = columns.size();
10899
10900 for(Index i = 0; i < variables_number; i++)
10901 {
10902 if(columns(i).type == Binary) return true;
10903 }
10904
10905 return false;
10906 }
10907
10908
has_categorical_columns() const10909 bool DataSet::has_categorical_columns() const
10910 {
10911 const Index variables_number = columns.size();
10912
10913 for(Index i = 0; i < variables_number; i++)
10914 {
10915 if(columns(i).type == Categorical) return true;
10916 }
10917
10918 return false;
10919 }
10920
10921
has_time_columns() const10922 bool DataSet::has_time_columns() const
10923 {
10924 const Index variables_number = columns.size();
10925
10926 for(Index i = 0; i < variables_number; i++)
10927 {
10928 if(columns(i).type == DateTime) return true;
10929 }
10930
10931 return false;
10932 }
10933
10934
has_selection() const10935 bool DataSet::has_selection() const
10936 {
10937 if(get_selection_samples_number() == 0) return false;
10938
10939 return true;
10940 }
10941
10942
count_nan_columns() const10943 Tensor<Index, 1> DataSet::count_nan_columns() const
10944 {
10945 const Index columns_number = get_columns_number();
10946 const Index rows_number = get_samples_number();
10947
10948 Tensor<Index, 1> nan_columns(get_columns_number());
10949 nan_columns.setZero();
10950
10951 for(Index column_index = 0; column_index < columns_number; column_index++)
10952 {
10953 const Index current_variable_index = get_variable_indices(column_index)(0);
10954
10955 for(Index row_index = 0; row_index < rows_number; row_index++)
10956 {
10957 if(isnan(data(row_index,current_variable_index)))
10958 {
10959 nan_columns(column_index) = nan_columns(column_index) + 1;
10960 }
10961 }
10962 }
10963
10964 return nan_columns;
10965 }
10966
10967
count_rows_with_nan() const10968 Index DataSet::count_rows_with_nan() const
10969 {
10970 Index rows_with_nan = 0;
10971
10972 const Index rows_number = data.dimension(0);
10973 const Index columns_number = data.dimension(1);
10974
10975 bool has_nan = true;
10976
10977 for(Index row_index = 0; row_index < rows_number; row_index++)
10978 {
10979 has_nan = false;
10980
10981 for(Index column_index = 0; column_index < columns_number; column_index++)
10982 {
10983 if(isnan(data(row_index, column_index)))
10984 {
10985 has_nan = true;
10986 break;
10987 }
10988 }
10989
10990 if(has_nan) rows_with_nan++;
10991 }
10992
10993 return rows_with_nan;
10994 }
10995
10996
count_nan() const10997 Index DataSet::count_nan() const
10998 {
10999 const Index rows_number = data.dimension(0);
11000 const Index columns_number = data.dimension(1);
11001
11002 Index count = 0;
11003
11004 #pragma omp parallel for reduction(+: count)
11005
11006 for(Index row_index = 0; row_index < rows_number; row_index++)
11007 {
11008 for(Index column_index = 0; column_index < columns_number; column_index++)
11009 {
11010 if(isnan(data(row_index, column_index))) count++;
11011 }
11012 }
11013
11014 return count;
11015 }
11016
11017
/// Sets the number of missing values in the data set to the given value.
/// @param new_missing_values_number Number of missing values.

void DataSet::set_missing_values_number(const Index& new_missing_values_number)
{
    missing_values_number = new_missing_values_number;
}
11022
11023
/// Recomputes and stores the number of missing values by counting the NaN
/// entries of the data matrix.

void DataSet::set_missing_values_number()
{
    missing_values_number = count_nan();
}
11028
11029
/// Sets the per-column missing values counts to the given vector.
/// @param new_columns_missing_values_number Missing values count for each column.

void DataSet::set_columns_missing_values_number(const Tensor<Index, 1>& new_columns_missing_values_number)
{
    columns_missing_values_number = new_columns_missing_values_number;
}
11034
11035
/// Recomputes and stores the per-column missing values counts from the data
/// matrix.

void DataSet::set_columns_missing_values_number()
{
    columns_missing_values_number = count_nan_columns();
}
11040
11041
/// Sets the number of rows with missing values to the given value.
/// @param new_rows_missing_values_number Number of rows containing missing values.

void DataSet::set_rows_missing_values_number(const Index& new_rows_missing_values_number)
{
    rows_missing_values_number = new_rows_missing_values_number;
}
11046
11047
/// Recomputes and stores the number of rows that contain at least one missing
/// (NaN) value.

void DataSet::set_rows_missing_values_number()
{
    rows_missing_values_number = count_rows_with_nan();
}
11052
11053
/// Renames duplicated column names and, when one-hot encoded variables exist,
/// duplicated variable names, so that names in the data set become unique.
/// Repeated column names get a numeric suffix ("_1", "_2", ...); repeated
/// variable names of categorical columns get the owning column's name appended.

void DataSet::fix_repeated_names()
{
    // Fix columns names

    const Index columns_number = columns.size();

    // Count how many times each column name occurs.

    std::map<std::string, Index> columns_count_map;

    for(Index i = 0; i < columns_number; i++)
    {
        auto result = columns_count_map.insert(std::pair<std::string, Index>(columns(i).name, 1));

        if (!result.second) result.first->second++;
    }

    // Append "_1", "_2", ... to every occurrence (including the first) of each
    // repeated column name.

    for (auto & element : columns_count_map)
    {
        if(element.second > 1)
        {
            const string repeated_name = element.first;
            Index repeated_index = 1;

            for(Index i = 0; i < columns.size(); i++)
            {
                if(columns(i).name == repeated_name)
                {
                    columns(i).name = columns(i).name + "_" + std::to_string(repeated_index);
                    repeated_index++;
                }
            }
        }
    }

    // Fix variables names

    // Only binary/categorical columns can expand into several variables, so
    // variable-level duplicates are possible only in that case.

    if(has_categorical_columns() || has_binary_columns())
    {
        Tensor<string, 1> variables_names = get_variables_names();

        const Index variables_number = variables_names.size();

        // Count how many times each variable name occurs.

        std::map<std::string, Index> variables_count_map;

        for(Index i = 0; i < variables_number; i++)
        {
            auto result = variables_count_map.insert(std::pair<std::string, Index>(variables_names(i), 1));

            if (!result.second) result.first->second++;
        }

        // Disambiguate repeated variable names by appending the owning column
        // name; variables of non-categorical columns are left untouched.

        for (auto & element : variables_count_map)
        {
            if(element.second > 1)
            {
                const string repeated_name = element.first;

                for(Index i = 0; i < variables_number; i++)
                {
                    if(variables_names(i) == repeated_name)
                    {
                        const Index column_index = get_column_index(i);

                        if(columns(column_index).type != Categorical) continue;

                        variables_names(i) = variables_names(i) + "_" + columns(column_index).name;
                    }
                }
            }
        }

        set_variables_names(variables_names);
    }
}
11127
11128
/// Returns a copy of the given index vector with one extra element appended
/// at the end.
/// @param old_vector Vector to copy.
/// @param new_string Index value to append.

Tensor<Index, 1> DataSet::push_back(const Tensor<Index, 1>& old_vector, const Index& new_string) const
{
    const Index old_size = old_vector.size();

    Tensor<Index, 1> new_vector(old_size + 1);

    for(Index element_index = 0; element_index < old_size; element_index++)
    {
        new_vector(element_index) = old_vector(element_index);
    }

    new_vector(old_size) = new_string;

    return new_vector;
}
11143
11144
/// Returns a copy of the given string vector with one extra element appended
/// at the end.
/// @param old_vector Vector to copy.
/// @param new_string String to append.

Tensor<string, 1> DataSet::push_back(const Tensor<string, 1>& old_vector, const string& new_string) const
{
    const Index old_size = old_vector.size();

    Tensor<string, 1> new_vector(old_size + 1);

    for(Index element_index = 0; element_index < old_size; element_index++)
    {
        new_vector(element_index) = old_vector(element_index);
    }

    new_vector(old_size) = new_string;

    return new_vector;
}
11159
11160
initialize_sequential_eigen_tensor(Tensor<Index,1> & new_tensor,const Index & start,const Index & step,const Index & end) const11161 void DataSet::initialize_sequential_eigen_tensor(Tensor<Index, 1>& new_tensor,
11162 const Index& start, const Index& step, const Index& end) const
11163 {
11164 const Index new_size = (end-start)/step+1;
11165
11166 new_tensor.resize(new_size);
11167 new_tensor(0) = start;
11168
11169 for(Index i = 1; i < new_size-1; i++)
11170 {
11171 new_tensor(i) = new_tensor(i-1)+step;
11172 }
11173
11174 new_tensor(new_size-1) = end;
11175 }
11176
11177
intialize_sequential_eigen_type_tensor(Tensor<type,1> & new_tensor,const type & start,const type & step,const type & end) const11178 void DataSet::intialize_sequential_eigen_type_tensor(Tensor<type, 1>& new_tensor,
11179 const type& start, const type& step, const type& end) const
11180 {
11181 const Index new_size = (end-start)/step+1;
11182
11183 new_tensor.resize(new_size);
11184 new_tensor(0) = start;
11185
11186 for(Index i = 1; i < new_size-1; i++)
11187 {
11188 new_tensor(i) = new_tensor(i-1)+step;
11189 }
11190
11191 new_tensor(new_size-1) = end;
11192 }
11193
11194
/// Splits the given sample indices into consecutive batches of the requested
/// size and returns them as a (batches x batch_size) matrix.
/// When there are fewer samples than the requested batch size, a single batch
/// containing all samples is produced. Trailing samples that do not fill a
/// complete batch are dropped (integer division).
/// @param samples_indices Indices of the samples to split.
/// @param new_batch_size Requested number of samples per batch.

Tensor<Index, 2> DataSet::split_samples(const Tensor<Index, 1>& samples_indices, const Index & new_batch_size) const
{
    const Index samples_number = samples_indices.dimension(0);

    Index batch_size = new_batch_size;
    Index batches_number;

    if(samples_number < batch_size)
    {
        // Not enough samples for one full batch: emit a single, smaller batch.
        batch_size = samples_number;
        batches_number = 1;
    }
    else
    {
        batches_number = samples_number/batch_size;
    }

    Tensor<Index, 2> batches(batches_number, batch_size);

    Index sample_position = 0;

    for(Index batch_index = 0; batch_index < batches_number; ++batch_index)
    {
        for(Index element_index = 0; element_index < batch_size; ++element_index)
        {
            batches(batch_index, element_index) = samples_indices(sample_position);

            ++sample_position;
        }
    }

    return batches;
}
11230
/// Copies the elements of the given matrix at the selected rows and columns
/// into a contiguous destination buffer, laid out column by column.
/// The destination must have room for rows_indices.size() * columns_indices.size()
/// elements. The pointer arithmetic assumes the source tensor is stored
/// column-major (Eigen's default layout).
/// @param matrix Source data matrix.
/// @param rows_indices Row indices to extract.
/// @param columns_indices Column indices to extract.
/// @param submatrix_pointer Pointer to the destination buffer.

void DataSet::fill_submatrix(const Tensor<type, 2>& matrix,
                             const Tensor<Index, 1>& rows_indices,
                             const Tensor<Index, 1>& columns_indices, type* submatrix_pointer)
{
    const Index rows_number = rows_indices.size();
    const Index columns_number = columns_indices.size();

    const type* matrix_pointer = matrix.data();

    // Columns are independent, so they are copied in parallel.

    #pragma omp parallel for

    for(Index j = 0; j < columns_number; j++)
    {
        // Start of source column j and of destination column j.
        const type* matrix_column_pointer = matrix_pointer + matrix.dimension(0)*columns_indices[j];
        type* submatrix_column_pointer = submatrix_pointer + rows_number*j;

        const type* value_pointer = nullptr;
        const Index* rows_indices_pointer = rows_indices.data();
        for(Index i = 0; i < rows_number; i++)
        {
            // Gather the selected rows of this column into contiguous storage.
            value_pointer = matrix_column_pointer + *rows_indices_pointer;
            rows_indices_pointer++;
            *submatrix_column_pointer = *value_pointer;
            submatrix_column_pointer++;
        }
    }
}
11258
11259
/// Fills the batch tensors with the given samples of the data set.
/// Inputs are copied into inputs_2d when the input dimensions are flat
/// (size 1); the 4-D (image) path is currently disabled, so for 3-D input
/// dimensions this method copies only the targets. Targets are always copied
/// into targets_2d.
/// @param samples Indices of the samples to copy.
/// @param inputs Indices of the input variables.
/// @param targets Indices of the target variables.

void DataSet::Batch::fill(const Tensor<Index, 1>& samples,
                          const Tensor<Index, 1>& inputs,
                          const Tensor<Index, 1>& targets)
{
    const Tensor<type, 2>& data = data_set_pointer->get_data();

    const Tensor<Index, 1>& input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();

    if(input_variables_dimensions.size() == 1)
    {
        data_set_pointer->fill_submatrix(data, samples, inputs, inputs_2d.data());
    }
    else if(input_variables_dimensions.size() == 3)
    {
        // NOTE(review): the 4-D image fill is commented out, so this branch is
        // a no-op and inputs_4d is left untouched.
        /*
        const Index channels_number = input_variables_dimensions(0);
        const Index rows_number = input_variables_dimensions(1);
        const Index columns_number = input_variables_dimensions(2);
        inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);
        Index index = 0;
        for(Index image = 0; image < samples_number; image++)
        {
            index = 0;
            for(Index channel = 0; channel < channels_number; channel++)
            {
                for(Index row = 0; row < rows_number; row++)
                {
                    for(Index column = 0; column < columns_number; column++)
                    {
                        inputs_4d(image, channel, row, column) = data(image, index);
                        index++;
                    }
                }
            }
        }
        */
    }
    data_set_pointer->fill_submatrix(data, samples, targets, targets_2d.data());
}
11299
11300
Batch(const Index & new_samples_number,DataSet * new_data_set_pointer)11301 DataSet::Batch::Batch(const Index& new_samples_number, DataSet* new_data_set_pointer)
11302 {
11303 samples_number = new_samples_number;
11304
11305 data_set_pointer = new_data_set_pointer;
11306
11307 const Index input_variables_number = data_set_pointer->get_input_variables_number();
11308 const Index target_variables_number = data_set_pointer->get_target_variables_number();
11309
11310 const Tensor<Index, 1> input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();
11311
11312 if(input_variables_dimensions.rank() == 1)
11313 {
11314 inputs_2d.resize(samples_number, input_variables_number);
11315 }
11316 else if(input_variables_dimensions.rank() == 3)
11317 {
11318 const Index channels_number = input_variables_dimensions(0);
11319 const Index rows_number = input_variables_dimensions(1);
11320 const Index columns_number = input_variables_dimensions(2);
11321
11322 inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);
11323 }
11324
11325
11326 targets_2d.resize(samples_number, target_variables_number);
11327 }
11328
11329
/// Returns the number of samples in the batch.

Index DataSet::Batch::get_samples_number() const
{
    return samples_number;
}
11334
11335
print()11336 void DataSet::Batch::print()
11337 {
11338 cout << "Batch structure" << endl;
11339
11340 cout << "Inputs:" << endl;
11341 cout << inputs_2d << endl;
11342
11343 cout << "Targets:" << endl;
11344 cout << targets_2d << endl;
11345 }
11346
11347
shuffle()11348 void DataSet::shuffle()
11349 {
11350 const Index data_rows = data.dimension(0);
11351 const Index data_columns = data.dimension(1);
11352
11353 Tensor<Index, 1> indices(data_rows);
11354
11355 for(Index i = 0; i < data_rows; i++) indices(i) = i;
11356
11357 random_shuffle(&indices(0), &indices(data_rows-1));
11358
11359 Tensor<type, 2> new_data(data_rows, data_columns);
11360 Tensor<string, 1> new_rows_labels(data_rows);
11361
11362 Index index = 0;
11363
11364 for(Index i = 0; i < data_rows; i++)
11365 {
11366 index = indices(i);
11367
11368 new_rows_labels(i) = rows_labels(index);
11369
11370 for(Index j = 0; j < data_columns; j++)
11371 {
11372 new_data(i,j) = data(index,j);
11373 }
11374 }
11375
11376 data = new_data;
11377 rows_labels = new_rows_labels;
11378 }
11379
11380
get_has_rows_labels() const11381 bool DataSet::get_has_rows_labels() const
11382 {
11383 return this->has_rows_labels;
11384 }
11385
11386 }
11387
11388
11389 // OpenNN: Open Neural Networks Library.
11390 // Copyright(C) 2005-2020 Artificial Intelligence Techniques, SL.
11391 //
11392 // This library is free software; you can redistribute it and/or
11393 // modify it under the terms of the GNU Lesser General Public
11394 // License as published by the Free Software Foundation; either
11395 // version 2.1 of the License, or any later version.
11396 //
11397 // This library is distributed in the hope that it will be useful,
11398 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11399 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11400 // Lesser General Public License for more details.
11401
11402 // You should have received a copy of the GNU Lesser General Public
11403 // License along with this library; if not, write to the Free Software
11404 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
11405