1 #include "../src/data/csv_parser.h"
2 #include "../src/data/libsvm_parser.h"
3 #include "../src/data/libfm_parser.h"
4 #include <cstdio>
5 #include <cstdlib>
6 #include <dmlc/io.h>
7 #include <gtest/gtest.h>
8 
9 using namespace dmlc;
10 using namespace dmlc::data;
11 
12 namespace parser_test {
13 template <typename IndexType, typename DType = real_t>
14 class CSVParserTest : public CSVParser<IndexType, DType> {
15 public:
CSVParserTest(InputSplit * source,const std::map<std::string,std::string> & args,int nthread)16   explicit CSVParserTest(InputSplit *source,
17                          const std::map<std::string, std::string> &args,
18                          int nthread)
19       : CSVParser<IndexType, DType>(source, args, nthread) {}
CallParseBlock(char * begin,char * end,RowBlockContainer<IndexType,DType> * out)20   void CallParseBlock(char *begin, char *end,
21                       RowBlockContainer<IndexType, DType> *out) {
22     CSVParser<IndexType, DType>::ParseBlock(begin, end, out);
23   }
24 };
25 
26 template <typename IndexType, typename DType = real_t>
27 class LibSVMParserTest : public LibSVMParser<IndexType, DType> {
28 public:
LibSVMParserTest(InputSplit * source,const std::map<std::string,std::string> & args,int nthread)29   explicit LibSVMParserTest(InputSplit *source,
30                             const std::map<std::string, std::string> &args,
31                             int nthread)
32       : LibSVMParser<IndexType, DType>(source, args, nthread) {}
CallParseBlock(char * begin,char * end,RowBlockContainer<IndexType,DType> * out)33   void CallParseBlock(char *begin, char *end,
34                       RowBlockContainer<IndexType, DType> *out) {
35     LibSVMParser<IndexType, DType>::ParseBlock(begin, end, out);
36   }
37 };
38 
39 template <typename IndexType, typename DType = real_t>
40 class LibFMParserTest : public LibFMParser<IndexType, DType> {
41 public:
LibFMParserTest(InputSplit * source,const std::map<std::string,std::string> & args,int nthread)42   explicit LibFMParserTest(InputSplit *source,
43                            const std::map<std::string, std::string> &args,
44                            int nthread)
45       : LibFMParser<IndexType, DType>(source, args, nthread) {}
CallParseBlock(char * begin,char * end,RowBlockContainer<IndexType,DType> * out)46   void CallParseBlock(char *begin, char *end,
47                       RowBlockContainer<IndexType, DType> *out) {
48     LibFMParser<IndexType, DType>::ParseBlock(begin, end, out);
49   }
50 };
51 
52 }  // namespace parser_test
53 
54 namespace {
55 
56 template <typename IndexType>
CountDimensions(RowBlockContainer<IndexType> * rctr,size_t * out_num_row,size_t * out_num_col)57 static inline void CountDimensions(RowBlockContainer<IndexType>* rctr,
58                                    size_t* out_num_row, size_t* out_num_col) {
59   size_t num_row = rctr->label.size();
60   size_t num_col = 0;
61   for (size_t i = rctr->offset[0]; i < rctr->offset[num_row]; ++i) {
62     const IndexType index = rctr->index[i];
63     num_col = std::max(num_col, static_cast<size_t>(index + 1));
64   }
65   *out_num_row = num_row;
66   *out_num_col = num_col;
67 }
68 
69 }  // namespace anonymous
70 
// Verifies how CSVParser treats the UTF-8 byte-order mark (EF BB BF) at the
// start of a line.
TEST(CSVParser, test_ignore_bom) {
  using namespace parser_test;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));

  // Case 1: the first line has a complete BOM followed by '1'; the second line
  // has only the first two BOM bytes followed by '2'. The test expects the
  // second value to come out as 0.
  std::string data = "\xEF\xBB\xBF\x31\n\xEF\xBB\x32\n";
  char *out_data = const_cast<char *>(data.c_str());
  std::unique_ptr<RowBlockContainer<unsigned> > rctr {new RowBlockContainer<unsigned>()};
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  CHECK(rctr->value[0] == 1);
  CHECK(rctr->value[1] == 0);

  // Case 2: both lines start with a complete BOM, so both digits are expected
  // to be parsed.
  data = "\xEF\xBB\xBF\x31\n\xEF\xBB\xBF\x32\n";
  out_data = const_cast<char *>(data.c_str());
  rctr.reset(new RowBlockContainer<unsigned>());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());

  CHECK(rctr->value[0] == 1);
  CHECK(rctr->value[1] == 2);
}
91 
// Parses three comma-separated lines of four integers each (0..11) with
// default arguments and expects the values back in file order.
TEST(CSVParser, test_standard_case) {
  using namespace parser_test;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned>> rctr { new RowBlockContainer<unsigned>() };
  std::string data = "0,1,2,3\n4,5,6,7\n8,9,10,11\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Pin the element count first; otherwise the loop below would pass
  // vacuously if the parser produced no values at all.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); i++) {
    CHECK(i == rctr->value[i]);
  }
}
106 
// Parses twelve consecutive integers starting at 20000000 with DType=int32_t
// and expects each value to be recovered exactly.
TEST(CSVParser, test_int32_parse) {
  using namespace parser_test;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned, int32_t>> parser(
      new CSVParserTest<unsigned, int32_t>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned, int32_t>> rctr {
    new RowBlockContainer<unsigned, int32_t>()};
  std::string data = "20000000,20000001,20000002,20000003\n"
                     "20000004,20000005,20000006,20000007\n"
                     "20000008,20000009,20000010,20000011\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Pin the element count first; otherwise the loop below would pass
  // vacuously if the parser produced no values at all.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); i++) {
    // Compare in the parsed type rather than converting the value to size_t.
    CHECK(rctr->value[i] == 20000000 + static_cast<int32_t>(i));
  }
}
124 
// Parses twelve consecutive integers starting at 2147483648 (one past the
// int32_t maximum) with DType=int64_t and expects each value to be recovered
// exactly.
TEST(CSVParser, test_int64_parse) {
  using namespace parser_test;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned, int64_t>> parser(
    new CSVParserTest<unsigned, int64_t>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned, int64_t> > rctr {
    new RowBlockContainer<unsigned, int64_t>()};
  std::string data = "2147483648,2147483649,2147483650,2147483651\n"
                     "2147483652,2147483653,2147483654,2147483655\n"
                     "2147483656,2147483657,2147483658,2147483659\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Pin the element count first; otherwise the loop below would pass
  // vacuously if the parser produced no values at all.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); i++) {
    // Compare as 64-bit signed integers so the check does not depend on the
    // width of size_t.
    CHECK(rctr->value[i] == 2147483648LL + static_cast<int64_t>(i));
  }
}
142 
// Same data as the standard case, but every line ends in "\r\n"; the parsed
// values are expected to be unaffected by the line-ending style.
TEST(CSVParser, test_different_newlines) {
  using namespace parser_test;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned> > rctr {new RowBlockContainer<unsigned>()};
  std::string data = "0,1,2,3\r\n4,5,6,7\r\n8,9,10,11\r\n";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Pin the element count first; otherwise the loop below would pass
  // vacuously if the parser produced no values at all.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); i++) {
    CHECK(i == rctr->value[i]);
  }
}
157 
// The final line has no terminating newline; all twelve values, including
// the last one, are still expected to be parsed.
TEST(CSVParser, test_noeol) {
  using namespace parser_test;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args;
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned> > rctr {new RowBlockContainer<unsigned>()} ;
  std::string data = "0,1,2,3\r\n4,5,6,7\r\n8,9,10,11";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Pin the element count first; otherwise the loop below would pass
  // vacuously (or miss a dropped trailing value) if fewer values came back.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); i++) {
    CHECK(i == rctr->value[i]);
  }
}
172 
// Passes delimiter=" " to the parser and feeds it space-separated values;
// the twelve values 0..11 are expected back in file order.
TEST(CSVParser, test_delimiter) {
  using namespace parser_test;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  const std::map<std::string, std::string> args{ {"delimiter", " "} };
  std::unique_ptr<CSVParserTest<unsigned>> parser(
      new CSVParserTest<unsigned>(source, args, 1));
  std::unique_ptr<RowBlockContainer<unsigned>> rctr {new RowBlockContainer<unsigned>()};
  std::string data = "0 1 2 3\n4 5 6 7\n8 9 10 11";
  char *out_data = const_cast<char *>(data.c_str());
  parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
  // Pin the element count first; otherwise the loop below would pass
  // vacuously if the parser produced no values at all.
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); i++) {
    CHECK(i == rctr->value[i]);
  }
}
187 
// With weight_column=2, the third field of every line is expected to land in
// the weight array and to be absent from the value array.
TEST(CSVParser, test_weight_column) {
  using namespace parser_test;

  std::string data = "0,1,2,3\n4,5,6,7\n8,9,10,11";
  const std::map<std::string, std::string> args{ {"weight_column", "2"} };
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  CSVParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  // One weight per line: 2, 6 and 10.
  CHECK_EQ(rctr->weight.size(), 3U);
  for (size_t i = 0; i < rctr->weight.size(); i++) {
    CHECK_EQ(rctr->weight[i], 2.0f + 4.0f * i);
  }

  // Everything except the weight column, in file order.
  const std::vector<real_t> expected_values{
      0.0f, 1.0f, 3.0f,
      4.0f, 5.0f, 7.0f,
      8.0f, 9.0f, 11.0f};
  CHECK_EQ(rctr->value.size(), expected_values.size());
  for (size_t i = 0; i < rctr->value.size(); i++) {
    CHECK_EQ(rctr->value[i], expected_values[i]);
  }
}
209 
// Without a weight_column argument, the weight array is expected to stay
// empty and all twelve fields are expected to be reported as values.
TEST(CSVParser, test_weight_column_2) {
  using namespace parser_test;

  std::string data = "0,1,2,3\n4,5,6,7\n8,9,10,11";
  const std::map<std::string, std::string> args;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  CSVParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  CHECK(rctr->weight.empty());
  CHECK_EQ(rctr->value.size(), 12U);
  for (size_t i = 0; i < rctr->value.size(); i++) {
    CHECK(i == rctr->value[i]);
  }
}
226 
test_qid(std::string data)227 void test_qid(std::string data) {
228   using namespace parser_test;
229   InputSplit *source = nullptr;
230   const std::map<std::string, std::string> args;
231   std::unique_ptr<LibSVMParserTest<unsigned>> parser(
232       new LibSVMParserTest<unsigned>(source, args, 1));
233   std::unique_ptr<RowBlockContainer<unsigned>> rctr {new RowBlockContainer<unsigned>()};
234   char* out_data = const_cast<char*>(data.c_str());
235   parser->CallParseBlock(out_data, out_data + data.size(), rctr.get());
236   const std::vector<size_t> expected_offset{
237     0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
238   };
239   const std::vector<real_t> expected_label{
240     3, 2, 1, 1, 1, 2, 1, 1, 2, 3, 4, 1
241   };
242   const std::vector<uint64_t> expected_qid{
243     1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
244   };
245   const std::vector<unsigned> expected_index{
246     1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5,
247     1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5,
248     1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5
249   };
250   const std::vector<real_t> expected_value{
251     1.0f, 1.0f, 0.0f, 0.2f, 0.0f, 0.0f, 0.0f, 1.0f, 0.1f, 1.0f, 0.0f, 1.0f, 0.0f, 0.4f, 0.0f, 0.0f,
252     0.0f, 1.0f, 0.3f, 0.0f, 0.0f, 0.0f, 1.0f, 0.2f, 0.0f, 1.0f, 0.0f, 1.0f, 0.4f, 0.0f, 0.0f, 0.0f,
253     1.0f, 0.1f, 0.0f, 0.0f, 0.0f, 1.0f, 0.2f, 0.0f, 0.0f, 0.0f, 1.0f, 0.1f, 1.0f, 1.0f, 1.0f, 0.0f,
254     0.3f, 0.0f, 1.0f, 0.0f, 0.0f, 0.4f, 1.0f, 0.0f, 1.0f, 1.0f, 0.5f, 0.0f
255   };
256   CHECK(rctr->offset == expected_offset);
257   CHECK(rctr->label == expected_label);
258   CHECK(rctr->qid == expected_qid);
259   CHECK(rctr->index == expected_index);
260   CHECK(rctr->value == expected_value);
261 }
262 
// Feeds a 12-row ranking dataset (labels, qid:<group>, five features per row)
// to the shared test_qid driver. The raw string keeps the leading indentation
// of each continuation line as part of the input.
TEST(LibSVMParser, test_qid) {
  const std::string ranking_rows = R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
                           2 qid:1 1:0 2:0 3:1 4:0.1 5:1
                           1 qid:1 1:0 2:1 3:0 4:0.4 5:0
                           1 qid:1 1:0 2:0 3:1 4:0.3 5:0
                           1 qid:2 1:0 2:0 3:1 4:0.2 5:0
                           2 qid:2 1:1 2:0 3:1 4:0.4 5:0
                           1 qid:2 1:0 2:0 3:1 4:0.1 5:0
                           1 qid:2 1:0 2:0 3:1 4:0.2 5:0
                           2 qid:3 1:0 2:0 3:1 4:0.1 5:1
                           3 qid:3 1:1 2:1 3:0 4:0.3 5:0
                           4 qid:3 1:1 2:0 3:0 4:0.4 5:1
                           1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
  // test_qid takes its argument by value, so it parses its own copy.
  test_qid(ranking_rows);
}
278 
// Same ranking dataset as test_qid, with a full-line '#' comment at the top
// and trailing '#' comments on two rows. The shared driver expects exactly the
// same parsed result, i.e. the comments must not contribute any data.
TEST(LibSVMParser, test_qid_with_comment) {
  const std::string commented_rows = R"qid(# what does foo bar mean anyway
                           3 qid:1 1:1 2:1 3:0 4:0.2 5:0 # foo
                           2 qid:1 1:0 2:0 3:1 4:0.1 5:1
                           1 qid:1 1:0 2:1 3:0 4:0.4 5:0
                           1 qid:1 1:0 2:0 3:1 4:0.3 5:0
                           1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # bar
                           2 qid:2 1:1 2:0 3:1 4:0.4 5:0
                           1 qid:2 1:0 2:0 3:1 4:0.1 5:0
                           1 qid:2 1:0 2:0 3:1 4:0.2 5:0
                           2 qid:3 1:0 2:0 3:1 4:0.1 5:1
                           3 qid:3 1:1 2:1 3:0 4:0.3 5:0
                           4 qid:3 1:1 2:0 3:0 4:0.4 5:1
                           1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
  // test_qid takes its argument by value, so it parses its own copy.
  test_qid(commented_rows);
}
295 
// Feature values written with more decimal digits than necessary are expected
// to parse to the same number as their shorter spelling.
TEST(LibSVMParser, test_excess_decimal_digits) {
  using namespace parser_test;

  // Features 1 and 4 spell the same number, as do features 6 and 8; the first
  // of each pair carries extra trailing digits.
  std::string data = "0 1:17.065995780200002000000 4:17.0659957802 "
                     "6:0.00017065995780200002 8:0.000170659957802\n";
  const std::map<std::string, std::string> args;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibSVMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 1U);
  CHECK_EQ(num_col, 9U);  // largest feature id is 8

  const std::vector<unsigned> expected_index{1, 4, 6, 8};
  CHECK(rctr->index == expected_index);  // element-wise vector comparison
  CHECK_EQ(rctr->value[0], rctr->value[1]);
  CHECK_EQ(rctr->value[2], rctr->value[3]);
}
318 
// With no indexing_mode argument, feature ids are expected to be reported
// exactly as written in the input.
TEST(LibSVMParser, test_indexing_mode_0_based) {
  using namespace parser_test;

  // Four rows, each using feature ids 1 and 2.
  std::string data = "1 1:1 2:-1\n0 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  const std::map<std::string, std::string> args;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibSVMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 3U);  // ids are kept as-is, so the largest id 2 gives 3 columns

  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  const std::vector<unsigned> expected_index{1, 2, 1, 2, 1, 2, 1, 2};
  CHECK(rctr->index == expected_index);  // element-wise vector comparison
  CHECK(rctr->value == expected_value);
}
340 
// With indexing_mode=1, every feature id is expected to come back reduced by
// one relative to the input text.
TEST(LibSVMParser, test_indexing_mode_1_based) {
  using namespace parser_test;

  // Four rows, each using feature ids 1 and 2.
  std::string data = "1 1:1 2:-1\n0 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  const std::map<std::string, std::string> args{{"indexing_mode", "1"}};
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibSVMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 2U);

  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // Input ids 1 and 2 are expected as 0 and 1 after the shift.
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  CHECK(rctr->index == expected_index);  // element-wise vector comparison
  CHECK(rctr->value == expected_value);
}
363 
// With indexing_mode=-1 and input whose smallest feature id is 1, the parser
// is expected to treat the data as 1-based and shift every id down by one.
TEST(LibSVMParser, test_indexing_mode_auto_detect) {
  using namespace parser_test;

  // Four rows, each using feature ids 1 and 2; id 0 never appears.
  std::string data = "1 1:1 2:-1\n0 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibSVMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 2U);

  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // 1-based indexing should be detected, since the least feature id is 1.
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  CHECK(rctr->index == expected_index);  // element-wise vector comparison
  CHECK(rctr->value == expected_value);
}
386 
// With indexing_mode=-1 and input that contains feature id 0, the parser is
// expected to treat the data as 0-based and leave every id untouched.
TEST(LibSVMParser, test_indexing_mode_auto_detect_2) {
  using namespace parser_test;

  // The second row carries an extra feature with id 0.
  std::string data = "1 1:1 2:-1\n0 0:-2 1:-1 2:1\n1 1:-1 2:-1\n0 1:1 2:1\n";
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibSVMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 3U);

  const std::vector<real_t> expected_value{1, -1, -2, -1, 1, -1, -1, 1, 1};
  // 0-based indexing should be detected, since the least feature id is 0.
  const std::vector<unsigned> expected_index{1, 2, 0, 1, 2, 1, 2, 1, 2};
  CHECK(rctr->index == expected_index);  // element-wise vector comparison
  CHECK(rctr->value == expected_value);
}
409 
// With no indexing_mode argument, field ids and feature ids are expected to
// be reported exactly as written in the input.
TEST(LibFMParser, test_indexing_mode_0_based) {
  using namespace parser_test;

  // Four rows of two field:feature:value triples; all ids are 1 or 2.
  std::string data
    = "1 1:1:1 1:2:-1\n0 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  const std::map<std::string, std::string> args;
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibFMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 3U);  // ids are kept as-is, so the largest id 2 gives 3 columns

  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  const std::vector<unsigned> expected_index{1, 2, 1, 2, 1, 2, 1, 2};
  const std::vector<unsigned> expected_field{1, 1, 1, 2, 2, 1, 2, 2};
  CHECK(rctr->field == expected_field);
  CHECK(rctr->index == expected_index);
  CHECK(rctr->value == expected_value);  // element-wise vector comparison
}
434 
// With indexing_mode=1, both field ids and feature ids are expected to come
// back reduced by one relative to the input text.
TEST(LibFMParser, test_indexing_mode_1_based) {
  using namespace parser_test;

  // Four rows of two field:feature:value triples; all ids are 1 or 2.
  std::string data
    = "1 1:1:1 1:2:-1\n0 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  const std::map<std::string, std::string> args{{"indexing_mode", "1"}};
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibFMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 2U);

  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // Input ids 1 and 2 are expected as 0 and 1 for fields and features alike.
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  const std::vector<unsigned> expected_field{0, 0, 0, 1, 1, 0, 1, 1};
  CHECK(rctr->field == expected_field);
  CHECK(rctr->index == expected_index);
  CHECK(rctr->value == expected_value);  // element-wise vector comparison
}
460 
// With indexing_mode=-1 and input in which every field/feature id exceeds 0,
// the parser is expected to treat the data as 1-based and shift all ids down
// by one.
TEST(LibFMParser, test_indexing_mode_auto_detect) {
  using namespace parser_test;

  // Four rows of two field:feature:value triples; id 0 never appears.
  std::string data
    = "1 1:1:1 1:2:-1\n0 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibFMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 2U);

  const std::vector<real_t> expected_value{1, -1, -1, 1, -1, -1, 1, 1};
  // 1-based indexing should be detected, since all field/feature ids exceed 0.
  const std::vector<unsigned> expected_index{0, 1, 0, 1, 0, 1, 0, 1};
  const std::vector<unsigned> expected_field{0, 0, 0, 1, 1, 0, 1, 1};
  CHECK(rctr->field == expected_field);
  CHECK(rctr->index == expected_index);
  CHECK(rctr->value == expected_value);  // element-wise vector comparison
}
486 
// With indexing_mode=-1 and input that contains id 0, the parser is expected
// to treat the data as 0-based and leave all field/feature ids untouched.
TEST(LibFMParser, test_indexing_mode_auto_detect_2) {
  using namespace parser_test;

  // The second row carries an extra triple whose field and feature id are 0.
  std::string data
    = "1 1:1:1 1:2:-1\n0 0:0:-2 1:1:-1 2:2:1\n1 2:1:-1 1:2:-1\n0 2:1:1 2:2:1\n";
  const std::map<std::string, std::string> args{{"indexing_mode", "-1"}};
  // No InputSplit is supplied: the buffer is handed to ParseBlock directly.
  InputSplit *source = nullptr;
  LibFMParserTest<unsigned> parser(source, args, 1);

  std::unique_ptr<RowBlockContainer<unsigned>> rctr(
      new RowBlockContainer<unsigned>());
  char *const first = &data[0];
  char *const last = first + data.size();
  parser.CallParseBlock(first, last, rctr.get());

  size_t num_row = 0;
  size_t num_col = 0;
  CountDimensions(rctr.get(), &num_row, &num_col);
  CHECK_EQ(num_row, 4U);
  CHECK_EQ(num_col, 3U);

  const std::vector<real_t> expected_value{1, -1, -2, -1, 1, -1, -1, 1, 1};
  // 0-based indexing should be detected, since the second row has feature id 0.
  const std::vector<unsigned> expected_index{1, 2, 0, 1, 2, 1, 2, 1, 2};
  const std::vector<unsigned> expected_field{1, 1, 0, 1, 2, 2, 1, 2, 2};
  CHECK(rctr->field == expected_field);
  CHECK(rctr->index == expected_index);
  CHECK(rctr->value == expected_value);  // element-wise vector comparison
}
512