1 #include <dmlc/data.h>
2 #include <dmlc/filesystem.h>
3 #include <string>
4 #include <iostream>
5 #include <fstream>
6 #include <vector>
7 #include <algorithm>
8 #include <random>
9 #include <future>
10 #include <cstdlib>
11 #include <gtest/gtest.h>
12 
13 namespace {
14 
CountDimensions(dmlc::Parser<uint32_t> * parser,size_t * out_num_row,size_t * out_num_col)15 inline void CountDimensions(dmlc::Parser<uint32_t>* parser,
16                             size_t* out_num_row, size_t* out_num_col) {
17   size_t num_row = 0;
18   size_t num_col = 0;
19   while (parser->Next()) {
20     const dmlc::RowBlock<uint32_t>& batch = parser->Value();
21     num_row += batch.size;
22     for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
23       const uint32_t index = batch.index[i];
24       num_col = std::max(num_col, static_cast<size_t>(index + 1));
25     }
26   }
27   *out_num_row = num_row;
28   *out_num_col = num_col;
29 }
30 
/* Fixed-size header that test_recordio expects at the start of every record.
 * The test copies sizeof(RecordIOHeader) raw bytes from the record into this
 * struct, so field order, field types and total size must stay as they are. */
struct RecordIOHeader {
  uint32_t flag;         // not inspected by any test in this file
  float label;           // test_recordio expects (record number % 2)
  uint64_t image_id[2];  // test_recordio checks image_id[0] only
};

/* 4 + 4 + 2 * 8 bytes with no padding; fail the build instead of silently
   mis-reading records if a platform lays the struct out differently. */
static_assert(sizeof(RecordIOHeader) == 24,
              "RecordIOHeader is copied from raw record bytes and must be 24 bytes");
36 
37 }  // namespace anonymous
38 
/* A directory of CSV shards, one of which has no trailing newline, must
 * still be parsed into one row per shard. */
TEST(InputSplit, test_split_csv_noeol) {
  size_t num_row, num_col;
  {
    dmlc::TemporaryDirectory tempdir;

    /* {file name, file content}; only the first shard is NOEOL
       (no '\n' at end of file) */
    const char* const shards[][2] = {
      {"/train_0.csv", "0,1,1,1"},
      {"/train_1.csv", "0,1,1,2\n"},
      {"/train_2.csv", "0,1,1,2\n"},
    };
    for (const auto& shard : shards) {
      /* The stream is closed at the end of each iteration, so every shard
         is fully written before the parser is created */
      std::ofstream of(tempdir.path + shard[0], std::ios::binary);
      of << shard[1];
    }

    /* Read the whole directory as a single partition; the parser is declared
       after tempdir, hence destroyed before the directory is removed */
    std::unique_ptr<dmlc::Parser<uint32_t> > parser(
      dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), 0, 1, "csv"));
    CountDimensions(parser.get(), &num_row, &num_col);
  }
  /* Three shards with one row of four values each: 3x4 */
  ASSERT_EQ(num_row, 3U);
  ASSERT_EQ(num_col, 4U);
}
67 
/* Two libsvm shards holding the same row; the second one has no trailing
 * newline. Both rows must be parsed: 2 rows, and since the largest feature
 * index is 124, 125 columns. */
TEST(InputSplit, test_split_libsvm_noeol) {
  {
    /* Create a test case for partitioned libsvm with NOEOL */
    dmlc::TemporaryDirectory tempdir;
    const char* line
      = "1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 "
        "77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1";
    {
      std::ofstream of(tempdir.path + "/train_0.libsvm", std::ios::binary);
      of << line << "\n";
    }
    {
      std::ofstream of(tempdir.path + "/train_1.libsvm", std::ios::binary);
      of << line;  // NOEOL (no '\n' at end of file)
    }

    /* Load the test case with InputSplit and obtain matrix dimensions */
    std::unique_ptr<dmlc::Parser<uint32_t> > parser(
      dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), 0, 1, "libsvm"));
    size_t num_row, num_col;
    CountDimensions(parser.get(), &num_row, &num_col);

    /* Check matrix dimensions: must be 2x125. Unsigned literals are used
       because num_row / num_col are size_t; plain int literals would make
       ASSERT_EQ compare signed with unsigned. */
    ASSERT_EQ(num_row, 2U);
    ASSERT_EQ(num_col, 125U);
  }
}
91 
/* Five single-row libsvm shards read as one partition must add up to
 * five rows of 125 columns. */
TEST(InputSplit, test_split_libsvm) {
  size_t num_row, num_col;
  {
    dmlc::TemporaryDirectory tempdir;

    /* Every shard holds this one newline-terminated row */
    const char* const row
      = "1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 "
        "77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1\n";
    const int shard_count = 5;
    for (int shard = 0; shard != shard_count; ++shard) {
      const std::string shard_path
        = tempdir.path + "/test_" + std::to_string(shard) + ".libsvm";
      std::ofstream of(shard_path, std::ios::binary);
      of << row;
    }

    /* Parse the directory as partition 0 of 1; the parser is declared after
       tempdir, hence destroyed before the directory is removed */
    std::unique_ptr<dmlc::Parser<uint32_t> > parser(
      dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), 0, 1, "libsvm"));
    CountDimensions(parser.get(), &num_row, &num_col);
  }
  /* One row per shard, largest feature index 124: 5x125 */
  ASSERT_EQ(num_row, 5U);
  ASSERT_EQ(num_col, 125U);
}
115 
/* Five libsvm shards (the first holds six rows, the others one row each)
 * are read as two partitions; each partition must report the expected
 * number of rows and columns. */
TEST(InputSplit, test_split_libsvm_distributed) {
  dmlc::TemporaryDirectory tempdir;

  /* Write the shards; the stream closes at the end of every iteration */
  const char* const row
    = "1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 "
      "77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1\n";
  const int shard_count = 5;
  for (int shard = 0; shard != shard_count; ++shard) {
    std::ofstream of(tempdir.path + "/test_" + std::to_string(shard) + ".libsvm",
                     std::ios::binary);
    /* Shard 0 is deliberately larger than the rest */
    int remaining = (shard == 0) ? 6 : 1;
    while (remaining-- > 0) {
      of << row;
    }
  }

  /* Read each partition on its own and compare against {rows, columns} */
  const int npart = 2;
  const size_t expected_dims[npart][2] = { {6, 125}, {4, 125} };
  for (int part_id = 0; part_id != npart; ++part_id) {
    size_t num_row, num_col;
    std::unique_ptr<dmlc::Parser<uint32_t> > parser(
      dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), part_id, npart, "libsvm"));
    CountDimensions(parser.get(), &num_row, &num_col);
    ASSERT_EQ(num_row, expected_dims[part_id][0]);
    ASSERT_EQ(num_col, expected_dims[part_id][1]);
  }
}
146 
147 #ifdef DMLC_UNIT_TESTS_USE_CMAKE
148 /* Don't run the following when CMake is not used */
149 
150 #include "./build_config.h"
151 #include <dmlc/build_config.h>
152 
153 #ifndef DMLC_CMAKE_LITTLE_ENDIAN
154   #error "DMLC_CMAKE_LITTLE_ENDIAN not defined"
155 #endif // DMLC_CMAKE_LITTLE_ENDIAN
156 
157 #if DMLC_CMAKE_LITTLE_ENDIAN
158 
/* Read every record of sample.rec and verify its header and payload.
 * Record number k (counting from 1) is expected to have:
 *   - header.label       == k % 2
 *   - header.image_id[0] == k
 *   - a payload consisting of the decimal k followed by '\n', ten times over
 */
TEST(InputSplit, test_recordio) {
  std::unique_ptr<dmlc::InputSplit> source(
    dmlc::InputSplit::Create(CMAKE_CURRENT_SOURCE_DIR "/sample.rec", 0, 1, "recordio"));
  source->BeforeFirst();

  dmlc::InputSplit::Blob rec;
  /* 1-based record number; uint64_t so it compares against image_id[0]
     without mixing signed and unsigned operands */
  uint64_t idx = 1;

  while (source->NextRecord(&rec)) {
    /* A record must hold the header plus at least one payload byte */
    RecordIOHeader header;
    ASSERT_GT(rec.size, sizeof(header));

    /* Copy the header out of the record rather than casting the pointer */
    std::memcpy(&header, rec.dptr, sizeof(header));
    const char* content = reinterpret_cast<const char*>(rec.dptr) + sizeof(header);
    const size_t content_size = rec.size - sizeof(header);

    std::string expected;
    for (int i = 0; i < 10; ++i) {
      expected += std::to_string(idx) + "\n";
    }

    /* Exact float comparison is intended: the expected label is 0 or 1 */
    ASSERT_EQ(header.label, static_cast<float>(idx % 2));
    ASSERT_EQ(header.image_id[0], idx);
    ASSERT_EQ(std::string(content, content_size), expected);

    ++idx;
  }

  /* Without this the test would pass even if no record had been read.
     NOTE(review): assumes sample.rec holds at least one record - confirm. */
  ASSERT_GT(idx, 1U) << "no records were read from sample.rec";
}
191 
192 #endif  // DMLC_CMAKE_LITTLE_ENDIAN
193 
194 #endif  // DMLC_UNIT_TESTS_USE_CMAKE
195