1 #include "../src/data/csv_parser.h"
2 #include "../src/data/libsvm_parser.h"
3 #include "../src/data/libfm_parser.h"
4 #include <cstdio>
5 #include <cstdlib>
6 #include <dmlc/io.h>
7 #include <gtest/gtest.h>
8
9 using namespace dmlc;
10 using namespace dmlc::data;
11
12 namespace parser_test {
13 template <typename IndexType, typename DType = real_t>
14 class CSVParserTest : public CSVParser<IndexType, DType> {
15 public:
CSVParserTest(InputSplit * source,const std::map<std::string,std::string> & args,int nthread)16 explicit CSVParserTest(InputSplit *source,
17 const std::map<std::string, std::string> &args,
18 int nthread)
19 : CSVParser<IndexType, DType>(source, args, nthread) {}
CallParseBlock(char * begin,char * end,RowBlockContainer<IndexType,DType> * out)20 void CallParseBlock(char *begin, char *end,
21 RowBlockContainer<IndexType, DType> *out) {
22 CSVParser<IndexType, DType>::ParseBlock(begin, end, out);
23 }
24 };
25
26 template <typename IndexType, typename DType = real_t>
27 class LibSVMParserTest : public LibSVMParser<IndexType, DType> {
28 public:
LibSVMParserTest(InputSplit * source,const std::map<std::string,std::string> & args,int nthread)29 explicit LibSVMParserTest(InputSplit *source,
30 const std::map<std::string, std::string> &args,
31 int nthread)
32 : LibSVMParser<IndexType, DType>(source, args, nthread) {}
CallParseBlock(char * begin,char * end,RowBlockContainer<IndexType,DType> * out)33 void CallParseBlock(char *begin, char *end,
34 RowBlockContainer<IndexType, DType> *out) {
35 LibSVMParser<IndexType, DType>::ParseBlock(begin, end, out);
36 }
37 };
38
39 template <typename IndexType, typename DType = real_t>
40 class LibFMParserTest : public LibFMParser<IndexType, DType> {
41 public:
LibFMParserTest(InputSplit * source,const std::map<std::string,std::string> & args,int nthread)42 explicit LibFMParserTest(InputSplit *source,
43 const std::map<std::string, std::string> &args,
44 int nthread)
45 : LibFMParser<IndexType, DType>(source, args, nthread) {}
CallParseBlock(char * begin,char * end,RowBlockContainer<IndexType,DType> * out)46 void CallParseBlock(char *begin, char *end,
47 RowBlockContainer<IndexType, DType> *out) {
48 LibFMParser<IndexType, DType>::ParseBlock(begin, end, out);
49 }
50 };
51
52 } // namespace parser_test
53
54 namespace {
55
56 template <typename IndexType>
CountDimensions(RowBlockContainer<IndexType> * rctr,size_t * out_num_row,size_t * out_num_col)57 static inline void CountDimensions(RowBlockContainer<IndexType>* rctr,
58 size_t* out_num_row, size_t* out_num_col) {
59 size_t num_row = rctr->label.size();
60 size_t num_col = 0;
61 for (size_t i = rctr->offset[0]; i < rctr->offset[num_row]; ++i) {
62 const IndexType index = rctr->index[i];
63 num_col = std::max(num_col, static_cast<size_t>(index + 1));
64 }
65 *out_num_row = num_row;
66 *out_num_col = num_col;
67 }
68
69 } // namespace anonymous
70
// A complete UTF-8 byte-order mark (EF BB BF) at the start of a line is
// expected to be skipped; a truncated mark (EF BB) is not, and this test
// expects such a line to yield the value 0.
TEST(CSVParser, test_ignore_bom) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));

  // Case 1: line one has a full BOM before "1", line two only EF BB before "2".
  std::string data = "\xEF\xBB\xBF\x31\n\xEF\xBB\x32\n";
  char *out_data = const_cast<char *>(data.c_str());
  std::unique_ptr<RowBlockContainer<unsigned>> rctr{
      new RowBlockContainer<unsigned>()};
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Guard the indexing below so a short result fails the check instead of
  // reading past the end of the vector.
  CHECK_GE(rctr->value.size(), 2U);
  CHECK(rctr->value[0] == 1);
  CHECK(rctr->value[1] == 0);

  // Case 2: both lines carry a full BOM, so both digits must come through.
  data = "\xEF\xBB\xBF\x31\n\xEF\xBB\xBF\x32\n";
  out_data = const_cast<char *>(data.c_str());
  rctr.reset(new RowBlockContainer<unsigned>());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  CHECK_GE(rctr->value.size(), 2U);
  CHECK(rctr->value[0] == 1);
  CHECK(rctr->value[1] == 2);
}
91
// Three comma-separated rows holding 0..11 must come back as the values
// 0..11 in order.
TEST(CSVParser, test_standard_case) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned>> rctr{
      new RowBlockContainer<unsigned>()};
  std::string data = "0,1,2,3\n4,5,6,7\n8,9,10,11\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Without this the loop below would pass vacuously on an empty result.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); ++i) {
    CHECK(i == rctr->value[i]);
  }
}
106
// With DType = int32_t the twelve integers 20000000..20000011 must be
// parsed exactly and in order.
TEST(CSVParser, test_int32_parse) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned, int32_t>> parser(
      new CSVParserTest<unsigned, int32_t>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned, int32_t>> rctr{
      new RowBlockContainer<unsigned, int32_t>()};
  std::string data = "20000000,20000001,20000002,20000003\n"
                     "20000004,20000005,20000006,20000007\n"
                     "20000008,20000009,20000010,20000011\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Without this the loop below would pass vacuously on an empty result.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); ++i) {
    // Compare in the parsed type rather than casting the result to size_t.
    const int32_t expected = static_cast<int32_t>(i) + 20000000;
    CHECK_EQ(rctr->value[i], expected);
  }
}
124
// With DType = int64_t the twelve integers 2147483648..2147483659 (all above
// INT32_MAX) must be parsed exactly and in order.
TEST(CSVParser, test_int64_parse) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned, int64_t>> parser(
      new CSVParserTest<unsigned, int64_t>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned, int64_t>> rctr{
      new RowBlockContainer<unsigned, int64_t>()};
  std::string data = "2147483648,2147483649,2147483650,2147483651\n"
                     "2147483652,2147483653,2147483654,2147483655\n"
                     "2147483656,2147483657,2147483658,2147483659\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Without this the loop below would pass vacuously on an empty result.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); ++i) {
    // Compare as int64_t: casting the parsed value to size_t would truncate
    // it on platforms where size_t is 32 bits wide.
    const int64_t expected = static_cast<int64_t>(i) + 2147483648LL;
    CHECK_EQ(rctr->value[i], expected);
  }
}
142
// Rows terminated by "\r\n" must yield the same values 0..11 as rows
// terminated by "\n".
TEST(CSVParser, test_different_newlines) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned>> rctr{
      new RowBlockContainer<unsigned>()};
  std::string data = "0,1,2,3\r\n4,5,6,7\r\n8,9,10,11\r\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Without this the loop below would pass vacuously on an empty result.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); ++i) {
    CHECK(i == rctr->value[i]);
  }
}
157
// The final row has no line terminator; all twelve values 0..11 must still
// be parsed.
TEST(CSVParser, test_noeol) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned>> rctr{
      new RowBlockContainer<unsigned>()};
  std::string data = "0,1,2,3\r\n4,5,6,7\r\n8,9,10,11";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Without this the loop below would pass vacuously (or miss a dropped
  // last row) when fewer than twelve values are produced.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); ++i) {
    CHECK(i == rctr->value[i]);
  }
}
172
// With the "delimiter" argument set to a single space, space-separated rows
// must yield the values 0..11 in order.
TEST(CSVParser, test_delimiter) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{ {"delimiter", " "} };
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned>> rctr{
      new RowBlockContainer<unsigned>()};
  std::string data = "0 1 2 3\n4 5 6 7\n8 9 10 11";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Without this the loop below would pass vacuously on an empty result.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); ++i) {
    CHECK(i == rctr->value[i]);
  }
}
187
// With weight_column=2, the third field of every row (2, 6, 10) is expected
// in `weight`, and the remaining nine fields in `value`.
TEST(CSVParser, test_weight_column) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{ {"weight_column", "2"} };
  CSVParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data = "0,1,2,3\n4,5,6,7\n8,9,10,11";
  char *first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  // One weight per row: 2, 6, 10.
  CHECK_EQ(block.weight.size(), 3U);
  for (size_t row = 0; row < block.weight.size(); ++row) {
    CHECK_EQ(block.weight[row], 2.0f + 4.0f * row);
  }

  // Everything except the weight column, in row-major order.
  const std::vector<real_t> expected_values{
    0.0f, 1.0f, 3.0f,
    4.0f, 5.0f, 7.0f,
    8.0f, 9.0f, 11.0f};
  CHECK_EQ(block.value.size(), expected_values.size());
  for (size_t k = 0; k < block.value.size(); ++k) {
    CHECK_EQ(block.value[k], expected_values[k]);
  }
}
209
// Without a weight_column argument no weights are expected, and all twelve
// fields must land in `value`.
TEST(CSVParser, test_weight_column_2) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  CSVParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data = "0,1,2,3\n4,5,6,7\n8,9,10,11";
  char *first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  CHECK(block.weight.empty());
  CHECK_EQ(block.value.size(), 12U);
  for (size_t k = 0; k < block.value.size(); ++k) {
    CHECK(k == block.value[k]);
  }
}
226
test_qid(std::string data)227 void test_qid(std::string data) {
228 using namespace parser_test;
229 InputSplit *source = nullptr;
230 const std::map<std::string, std::string> args;
231 std::unique_ptr<LibSVMParserTest<unsigned>> parser(
232 new LibSVMParserTest<unsigned>(source, args, 1));
233 std::unique_ptr<RowBlockContainer<unsigned>> rctr {new RowBlockContainer<unsigned>()};
234 char* out_data = const_cast<char*>(data.c_str());
235 parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
236 const std::vector<size_t> expected_offset{
237 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
238 };
239 const std::vector<real_t> expected_label{
240 3, 2, 1, 1, 1, 2, 1, 1, 2, 3, 4, 1
241 };
242 const std::vector<uint64_t> expected_qid{
243 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
244 };
245 const std::vector<unsigned> expected_index{
246 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5,
247 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5,
248 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5
249 };
250 const std::vector<real_t> expected_value{
251 1.0f, 1.0f, 0.0f, 0.2f, 0.0f, 0.0f, 0.0f, 1.0f, 0.1f, 1.0f, 0.0f, 1.0f, 0.0f, 0.4f, 0.0f, 0.0f,
252 0.0f, 1.0f, 0.3f, 0.0f, 0.0f, 0.0f, 1.0f, 0.2f, 0.0f, 1.0f, 0.0f, 1.0f, 0.4f, 0.0f, 0.0f, 0.0f,
253 1.0f, 0.1f, 0.0f, 0.0f, 0.0f, 1.0f, 0.2f, 0.0f, 0.0f, 0.0f, 1.0f, 0.1f, 1.0f, 1.0f, 1.0f, 0.0f,
254 0.3f, 0.0f, 1.0f, 0.0f, 0.0f, 0.4f, 1.0f, 0.0f, 1.0f, 1.0f, 0.5f, 0.0f
255 };
256 CHECK(rctr->offset == expected_offset);
257 CHECK(rctr->label == expected_label);
258 CHECK(rctr->qid == expected_qid);
259 CHECK(rctr->index == expected_index);
260 CHECK(rctr->value == expected_value);
261 }
262
// Feeds twelve LibSVM rows (label, qid, five features each) spread over three
// query groups to the shared test_qid() checker. The raw string is the exact
// parser input, so its bytes must not be reformatted.
TEST(LibSVMParser, test_qid) {
  std::string data = R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
1 qid:1 1:0 2:1 3:0 4:0.4 5:0
1 qid:1 1:0 2:0 3:1 4:0.3 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:2 1:1 2:0 3:1 4:0.4 5:0
1 qid:2 1:0 2:0 3:1 4:0.1 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:3 1:0 2:0 3:1 4:0.1 5:1
3 qid:3 1:1 2:1 3:0 4:0.3 5:0
4 qid:3 1:1 2:0 3:0 4:0.4 5:1
1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
  test_qid(data);
}
278
// Same twelve rows as the plain qid test, plus a leading '#' line and two
// trailing '# ...' remarks. test_qid() checks against the same expectation,
// so the '#' text must not contribute any rows or values. The raw string is
// the exact parser input, so its bytes must not be reformatted.
TEST(LibSVMParser, test_qid_with_comment) {
  std::string data = R"qid(# what does foo bar mean anyway
3 qid:1 1:1 2:1 3:0 4:0.2 5:0 # foo
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
1 qid:1 1:0 2:1 3:0 4:0.4 5:0
1 qid:1 1:0 2:0 3:1 4:0.3 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # bar
2 qid:2 1:1 2:0 3:1 4:0.4 5:0
1 qid:2 1:0 2:0 3:1 4:0.1 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:3 1:0 2:0 3:1 4:0.1 5:1
3 qid:3 1:1 2:1 3:0 4:0.3 5:0
4 qid:3 1:1 2:0 3:0 4:0.4 5:1
1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
  test_qid(data);
}
295
// A literal padded with extra decimal digits must parse to the same value as
// its shorter spelling: features 1 and 4 must match, and so must 6 and 8.
TEST(LibSVMParser, test_excess_decimal_digits) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  LibSVMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data = "0 1:17.065995780200002000000 4:17.0659957802 "
                     "6:0.00017065995780200002 8:0.000170659957802\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 1U);
  CHECK_EQ(col_count, 9U);

  // std::vector's operator== performs an element-wise comparison
  const std::vector<unsigned> expected_index{1, 4, 6, 8};
  CHECK(block.index == expected_index);
  CHECK_EQ(block.value[0], block.value[1]);
  CHECK_EQ(block.value[2], block.value[3]);
}
318
// With no arguments supplied, feature indices are expected to come back
// exactly as written in the input (1 and 2), giving a column extent of 3.
TEST(LibSVMParser, test_indexing_mode_0_based) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  LibSVMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data = "1 1:1 2:-1\n0 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 3U);

  // std::vector's operator== performs an element-wise comparison
  const std::vector<unsigned> expected_index{1, 2, 1, 2, 1, 2, 1, 2};
  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
340
// With indexing_mode=1 the parser is expected to subtract 1 from every
// feature index, so inputs 1 and 2 come back as 0 and 1.
TEST(LibSVMParser, test_indexing_mode_1_based) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{{"indexing_mode", "1"}};
  LibSVMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data = "1 1:1 2:-1\n0 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 2U);

  // Indices are shifted down by one; values are untouched.
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // std::vector's operator== performs an element-wise comparison
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
363
// With indexing_mode=-1 the parser is expected to detect 1-based indexing
// here, because the smallest feature id in the input is 1.
TEST(LibSVMParser, test_indexing_mode_auto_detect) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  LibSVMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data = "1 1:1 2:-1\n0 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 2U);

  // Detected as 1-based, so indices are shifted down by one.
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // std::vector's operator== performs an element-wise comparison
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
386
// With indexing_mode=-1 the parser is expected to detect 0-based indexing
// here, because the second row contains feature id 0.
TEST(LibSVMParser, test_indexing_mode_auto_detect_2) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  LibSVMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data = "1 1:1 2:-1\n0 0:-2 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 3U);

  // Detected as 0-based, so indices are kept as written.
  const std::vector<unsigned> expected_index{1, 2, 0, 1, 2, 1, 2, 1, 2};
  const std::vector<real_t> expected_value{1, -1, -2, -1, 1, -1, -1, 1, 1};
  // std::vector's operator== performs an element-wise comparison
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
409
// With no arguments supplied, field and feature ids are expected to come
// back exactly as written in the input.
TEST(LibFMParser, test_indexing_mode_0_based) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  LibFMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data
    = "1 1:1:1 1:2:-1\n0 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 3U);

  const std::vector<unsigned> expected_field{1, 1, 1, 2, 2, 1, 2, 2};
  const std::vector<unsigned> expected_index{1, 2, 1, 2, 1, 2, 1, 2};
  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // std::vector's operator== performs an element-wise comparison
  CHECK(block.field == expected_field);
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
434
// With indexing_mode=1 the parser is expected to subtract 1 from every field
// id and every feature id.
TEST(LibFMParser, test_indexing_mode_1_based) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{{"indexing_mode", "1"}};
  LibFMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data
    = "1 1:1:1 1:2:-1\n0 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 2U);

  // Field and feature ids are shifted down by one; values are untouched.
  const std::vector<unsigned> expected_field{0, 0, 0, 1, 1, 0, 1, 1};
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // std::vector's operator== performs an element-wise comparison
  CHECK(block.field == expected_field);
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
460
// With indexing_mode=-1 the parser is expected to detect 1-based indexing
// here, because every field id and feature id in the input exceeds 0.
TEST(LibFMParser, test_indexing_mode_auto_detect) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  LibFMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data
    = "1 1:1:1 1:2:-1\n0 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 2U);

  // Detected as 1-based, so field and feature ids are shifted down by one.
  const std::vector<unsigned> expected_field{0, 0, 0, 1, 1, 0, 1, 1};
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // std::vector's operator== performs an element-wise comparison
  CHECK(block.field == expected_field);
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
486
// With indexing_mode=-1 the parser is expected to detect 0-based indexing
// here, because the second row contains feature id 0.
TEST(LibFMParser, test_indexing_mode_auto_detect_2) {
  using namespace parser_test;
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  LibFMParserTest<unsigned> parser(source, args, 1);
  RowBlockContainer<unsigned> block;

  std::string data
    = "1 1:1:1 1:2:-1\n0 0:0:-2 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  char* first = &data[0];
  parser.CallParseBlock(first, first + data.size(), &block);

  size_t row_count = 0;
  size_t col_count = 0;
  CountDimensions(&block, &row_count, &col_count);
  CHECK_EQ(row_count, 4U);
  CHECK_EQ(col_count, 3U);

  // Detected as 0-based, so field and feature ids are kept as written.
  const std::vector<unsigned> expected_field{1, 1, 0, 1, 2, 2, 1, 2, 2};
  const std::vector<unsigned> expected_index{1, 2, 0, 1, 2, 1, 2, 1, 2};
  const std::vector<real_t> expected_value{1, -1, -2, -1, 1, -1, -1, 1, 1};
  // std::vector's operator== performs an element-wise comparison
  CHECK(block.field == expected_field);
  CHECK(block.index == expected_index);
  CHECK(block.value == expected_value);
}
512