1 #include <dmlc/data.h>
2 #include <dmlc/filesystem.h>
3 #include <string>
4 #include <iostream>
5 #include <fstream>
6 #include <vector>
7 #include <algorithm>
8 #include <random>
9 #include <future>
10 #include <cstdlib>
11 #include <gtest/gtest.h>
12
13 namespace {
14
CountDimensions(dmlc::Parser<uint32_t> * parser,size_t * out_num_row,size_t * out_num_col)15 inline void CountDimensions(dmlc::Parser<uint32_t>* parser,
16 size_t* out_num_row, size_t* out_num_col) {
17 size_t num_row = 0;
18 size_t num_col = 0;
19 while (parser->Next()) {
20 const dmlc::RowBlock<uint32_t>& batch = parser->Value();
21 num_row += batch.size;
22 for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
23 const uint32_t index = batch.index[i];
24 num_col = std::max(num_col, static_cast<size_t>(index + 1));
25 }
26 }
27 *out_num_row = num_row;
28 *out_num_col = num_col;
29 }
30
/*
 * Fixed-size header found at the front of each record read in test_recordio.
 * The test copies the first sizeof(RecordIOHeader) bytes of a record into this
 * struct with memcpy and treats the remaining bytes as the record content.
 */
struct RecordIOHeader {
  uint32_t flag;         // not inspected by the tests in this file
  float label;           // test_recordio expects (record number % 2)
  uint64_t image_id[2];  // test_recordio expects image_id[0] == record number
};

// The header is filled by a raw memcpy from record bytes, so the fields must
// be laid out back to back (4 + 4 + 2 * 8 bytes) with no padding in between.
static_assert(sizeof(RecordIOHeader) == 24,
              "RecordIOHeader must be exactly 24 bytes with no padding");
36
37 } // namespace anonymous
38
/* A CSV data set spread over several files must be parsed correctly even when
 * one of the files does not end with a newline. */
TEST(InputSplit, test_split_csv_noeol) {
  size_t row_count = 0;
  size_t col_count = 0;
  {
    dmlc::TemporaryDirectory tempdir;

    /* {file name, file content}; only train_0.csv has no '\n' at end of file */
    const char* const partitions[][2] = {
      {"/train_0.csv", "0,1,1,1"},
      {"/train_1.csv", "0,1,1,2\n"},
      {"/train_2.csv", "0,1,1,2\n"},
    };
    for (const auto& part : partitions) {
      // the stream is closed at the end of each iteration, before parsing
      std::ofstream out(tempdir.path + part[0], std::ios::binary);
      out << part[1];
    }

    /* Parse the whole directory as a single partition and measure it.
     * The parser is declared after tempdir, so it is destroyed first. */
    std::unique_ptr<dmlc::Parser<uint32_t>> reader(
        dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), 0, 1, "csv"));
    CountDimensions(reader.get(), &row_count, &col_count);
  }
  /* Three files with one row of four values each: expect 3x4 */
  ASSERT_EQ(row_count, 3U);
  ASSERT_EQ(col_count, 4U);
}
67
/* A libsvm data set split across two files, one of which has no trailing
 * newline, must still be read as one row per file. */
TEST(InputSplit, test_split_libsvm_noeol) {
  {
    /* Create a test case for partitioned libsvm with NOEOL */
    dmlc::TemporaryDirectory tempdir;
    const char* line
      = "1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 "
        "77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1";
    {
      std::ofstream of(tempdir.path + "/train_0.libsvm", std::ios::binary);
      of << line << "\n";
    }
    {
      std::ofstream of(tempdir.path + "/train_1.libsvm", std::ios::binary);
      of << line;  // NOEOL (no '\n' at end of file)
    }
    /* Load the test case as a single partition and obtain matrix dimensions */
    std::unique_ptr<dmlc::Parser<uint32_t> > parser(
      dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), 0, 1, "libsvm"));
    size_t num_row = 0;
    size_t num_col = 0;
    CountDimensions(parser.get(), &num_row, &num_col);
    /* Two rows, largest feature index 124: must be 2x125.
     * Unsigned literals keep the comparison with size_t free of
     * signed/unsigned mismatch, as in the other tests of this file. */
    ASSERT_EQ(num_row, 2U);
    ASSERT_EQ(num_col, 125U);
  }
}
91
/* Five single-row libsvm files read as one partition must yield a 5x125
 * matrix. */
TEST(InputSplit, test_split_libsvm) {
  size_t row_count = 0;
  size_t col_count = 0;
  {
    dmlc::TemporaryDirectory tempdir;

    /* The same row (largest feature index: 124) goes into every file */
    const std::string row =
        "1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 "
        "77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1\n";
    constexpr int kNumFiles = 5;
    int file_id = 0;
    while (file_id < kNumFiles) {
      const std::string file_name
          = tempdir.path + "/test_" + std::to_string(file_id) + ".libsvm";
      std::ofstream out(file_name, std::ios::binary);
      out << row;
      ++file_id;
    }

    /* Parse the directory as a single partition and measure it.
     * The parser is declared after tempdir, so it is destroyed first. */
    std::unique_ptr<dmlc::Parser<uint32_t>> reader(
        dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), 0, 1, "libsvm"));
    CountDimensions(reader.get(), &row_count, &col_count);
  }
  /* Expect 5x125 */
  ASSERT_EQ(row_count, 5U);
  ASSERT_EQ(col_count, 125U);
}
115
/* Reads a five-file libsvm data set as two partitions and checks how many
 * rows and columns each partition sees. */
TEST(InputSplit, test_split_libsvm_distributed) {
  dmlc::TemporaryDirectory tempdir;

  /* File 0 holds six copies of the row; files 1..4 hold one copy each */
  const std::string row =
      "1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 "
      "77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1\n";
  constexpr int kNumFiles = 5;
  for (int file_id = 0; file_id < kNumFiles; ++file_id) {
    std::ofstream out(tempdir.path + "/test_" + std::to_string(file_id) + ".libsvm",
                      std::ios::binary);
    int copies = 1;
    if (file_id == 0) {
      copies = 6;
    }
    while (copies-- > 0) {
      out << row;
    }
  }

  /* Expected dimensions per partition: 6x125 for part 0, 4x125 for part 1 */
  constexpr int kNumParts = 2;
  const size_t expected_rows[kNumParts] = {6, 4};
  const size_t expected_cols[kNumParts] = {125, 125};
  for (int part = 0; part < kNumParts; ++part) {
    std::unique_ptr<dmlc::Parser<uint32_t>> reader(
        dmlc::Parser<uint32_t>::Create(tempdir.path.c_str(), part, kNumParts, "libsvm"));
    size_t row_count = 0;
    size_t col_count = 0;
    CountDimensions(reader.get(), &row_count, &col_count);
    ASSERT_EQ(row_count, expected_rows[part]);
    ASSERT_EQ(col_count, expected_cols[part]);
  }
}
146
147 #ifdef DMLC_UNIT_TESTS_USE_CMAKE
148 /* Don't run the following when CMake is not used */
149
150 #include "./build_config.h"
151 #include <dmlc/build_config.h>
152
153 #ifndef DMLC_CMAKE_LITTLE_ENDIAN
154 #error "DMLC_CMAKE_LITTLE_ENDIAN not defined"
155 #endif // DMLC_CMAKE_LITTLE_ENDIAN
156
157 #if DMLC_CMAKE_LITTLE_ENDIAN
158
/* Reads sample.rec record by record and checks each one against its expected
 * header fields and content. The N-th record (counting from 1) is expected to
 * carry label N % 2, image_id[0] == N, and a content made of the decimal text
 * of N followed by '\n', repeated ten times. */
TEST(InputSplit, test_recordio) {
  std::unique_ptr<dmlc::InputSplit> source(
    dmlc::InputSplit::Create(CMAKE_CURRENT_SOURCE_DIR "/sample.rec", 0, 1, "recordio"));

  source->BeforeFirst();
  dmlc::InputSplit::Blob rec;

  // Unsigned 64-bit counter so it compares with header.image_id[0] without a
  // signed/unsigned mismatch.
  uint64_t idx = 1;

  while (source->NextRecord(&rec)) {
    // A record must be larger than its header, otherwise there is no content
    ASSERT_GT(rec.size, sizeof(RecordIOHeader));
    RecordIOHeader header;
    std::memcpy(&header, rec.dptr, sizeof(header));
    const char* content = reinterpret_cast<const char*>(rec.dptr) + sizeof(header);
    const size_t content_size = rec.size - sizeof(header);

    std::string expected;
    for (int i = 0; i < 10; ++i) {
      expected += std::to_string(idx) + "\n";
    }

    ASSERT_EQ(header.label, static_cast<float>(idx % 2));
    ASSERT_EQ(header.image_id[0], idx);
    ASSERT_EQ(std::string(content, content_size), expected);

    ++idx;
  }

  // Guard against a vacuous pass: if nothing was read, the loop body and all
  // of its checks were skipped.
  ASSERT_GT(idx, 1U) << "no records were read from sample.rec";
}
191
192 #endif // DMLC_CMAKE_LITTLE_ENDIAN
193
194 #endif // DMLC_UNIT_TESTS_USE_CMAKE
195