1 // Copyright 2016-2020 by Contributors
2 #include <dmlc/io.h>
3 #include <dmlc/filesystem.h>
4 #include <xgboost/data.h>
5 #include <string>
6 #include <memory>
7 #include "../../../src/common/version.h"
8 
9 #include "../helpers.h"
10 #include "xgboost/base.h"
11 
// Feeds MetaInfo::SetInfo one buffer per supported numeric input type and
// checks that the matching field is populated, then checks that Clear()
// empties the group pointers again.
TEST(MetaInfo, GetSet) {
  xgboost::MetaInfo info;

  // Labels from a double buffer.  The dtype tag has to describe the real
  // element type of the buffer; tagging doubles as kFloat32 would make SetInfo
  // reinterpret the raw bytes, which a size-only check cannot detect.
  double double2[2] = {1.0, 2.0};
  EXPECT_EQ(info.labels_.Size(), 0);
  info.SetInfo("label", double2, xgboost::DataType::kDouble, 2);
  ASSERT_EQ(info.labels_.Size(), 2);
  EXPECT_EQ(info.labels_.HostVector()[0], 1.0f);
  EXPECT_EQ(info.labels_.HostVector()[1], 2.0f);

  // Weights from a float buffer.
  float float2[2] = {1.0f, 2.0f};
  EXPECT_EQ(info.GetWeight(1), 1.0f)
    << "When no weights are given, was expecting default value 1";
  info.SetInfo("weight", float2, xgboost::DataType::kFloat32, 2);
  EXPECT_EQ(info.GetWeight(1), 2.0f);

  // Base margin from a 32-bit unsigned buffer.
  uint32_t uint32_t2[2] = {1U, 2U};
  EXPECT_EQ(info.base_margin_.Size(), 0);
  info.SetInfo("base_margin", uint32_t2, xgboost::DataType::kUInt32, 2);
  EXPECT_EQ(info.base_margin_.Size(), 2);

  // Group sizes {1, 2} from a 64-bit unsigned buffer.  The test expects three
  // group pointers whose last entry is 3, i.e. the total of the sizes.
  uint64_t uint64_t2[2] = {1U, 2U};
  EXPECT_EQ(info.group_ptr_.size(), 0);
  info.SetInfo("group", uint64_t2, xgboost::DataType::kUInt64, 2);
  ASSERT_EQ(info.group_ptr_.size(), 3);
  EXPECT_EQ(info.group_ptr_[2], 3);

  info.Clear();
  ASSERT_EQ(info.group_ptr_.size(), 0);
}
41 
// Covers key handling, column-count checking and clearing in
// MetaInfo::SetFeatureInfo.
TEST(MetaInfo, GetSetFeature) {
  xgboost::MetaInfo info;

  // An empty key and an unrecognised key are expected to throw, while the two
  // feature keys are expected to be accepted even with no payload.
  EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error);
  EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error);
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0));
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_type", nullptr, 0));
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  ASSERT_EQ(info.feature_names.size(), 0);

  // Build kCols type names and the matching array of C string pointers.
  size_t constexpr kCols = 19;
  std::vector<std::string> type_names(kCols, u8"float");
  std::vector<char const*> type_ptrs;
  type_ptrs.reserve(type_names.size());
  for (auto const& name : type_names) {
    type_ptrs.push_back(name.c_str());
  }

  // While the info still reports zero columns, kCols types must be rejected.
  EXPECT_THROW(
      info.SetFeatureInfo(u8"feature_type", type_ptrs.data(), type_ptrs.size()),
      dmlc::Error);

  // Once the column count matches, the same call must go through.
  info.num_col_ = kCols;
  EXPECT_NO_THROW(
      info.SetFeatureInfo(u8"feature_type", type_ptrs.data(), type_ptrs.size()));

  // Passing an empty payload clears the previously stored types.
  info.SetFeatureInfo("feature_type", nullptr, 0);
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  // Other conditions are tested in `SaveLoadBinary`.
}
71 
TEST(MetaInfo,SaveLoadBinary)72 TEST(MetaInfo, SaveLoadBinary) {
73   xgboost::MetaInfo info;
74   uint64_t constexpr kRows { 64 }, kCols { 32 };
75   auto generator = []() {
76                      static float f = 0;
77                      return f++;
78                    };
79   std::vector<float> values (kRows);
80   std::generate(values.begin(), values.end(), generator);
81   info.SetInfo("label", values.data(), xgboost::DataType::kFloat32, kRows);
82   info.SetInfo("weight", values.data(), xgboost::DataType::kFloat32, kRows);
83   info.SetInfo("base_margin", values.data(), xgboost::DataType::kFloat32, kRows);
84 
85   info.num_row_ = kRows;
86   info.num_col_ = kCols;
87 
88   auto featname = u8"特征名";
89   std::vector<std::string> types(kCols, u8"float");
90   std::vector<char const*> c_types(kCols);
91   std::transform(types.cbegin(), types.cend(), c_types.begin(),
92                  [](auto const &str) { return str.c_str(); });
93   info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size());
94   std::vector<std::string> names(kCols, featname);
95   std::vector<char const*> c_names(kCols);
96   std::transform(names.cbegin(), names.cend(), c_names.begin(),
97                  [](auto const &str) { return str.c_str(); });
98   info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size());;
99 
100   dmlc::TemporaryDirectory tempdir;
101   const std::string tmp_file = tempdir.path + "/metainfo.binary";
102   {
103     std::unique_ptr<dmlc::Stream> fs {
104       dmlc::Stream::Create(tmp_file.c_str(), "w")
105     };
106     info.SaveBinary(fs.get());
107   }
108 
109   {
110     // Round-trip test
111     std::unique_ptr<dmlc::Stream> fs {
112       dmlc::Stream::Create(tmp_file.c_str(), "r")
113     };
114     xgboost::MetaInfo inforead;
115     inforead.LoadBinary(fs.get());
116     ASSERT_EQ(inforead.num_row_, kRows);
117     EXPECT_EQ(inforead.num_row_, info.num_row_);
118     EXPECT_EQ(inforead.num_col_, info.num_col_);
119     EXPECT_EQ(inforead.num_nonzero_, info.num_nonzero_);
120 
121     ASSERT_EQ(inforead.labels_.HostVector(), values);
122     EXPECT_EQ(inforead.labels_.HostVector(), info.labels_.HostVector());
123     EXPECT_EQ(inforead.group_ptr_, info.group_ptr_);
124     EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector());
125     EXPECT_EQ(inforead.base_margin_.HostVector(), info.base_margin_.HostVector());
126 
127     EXPECT_EQ(inforead.feature_type_names.size(), kCols);
128     EXPECT_EQ(inforead.feature_types.Size(), kCols);
129     EXPECT_TRUE(std::all_of(inforead.feature_type_names.cbegin(),
130                             inforead.feature_type_names.cend(),
131                             [](auto const &str) { return str == u8"float"; }));
132     auto h_ft = inforead.feature_types.HostSpan();
133     EXPECT_TRUE(std::all_of(h_ft.cbegin(), h_ft.cend(), [](auto f) {
134       return f == xgboost::FeatureType::kNumerical;
135     }));
136 
137     EXPECT_EQ(inforead.feature_names.size(), kCols);
138     EXPECT_TRUE(std::all_of(inforead.feature_names.cbegin(),
139                             inforead.feature_names.cend(),
140                             [=](auto const& str) {
141                               return str == featname;
142                             }));
143   }
144 }
145 
// Writes a small libsvm file carrying `qid` annotations, loads it through
// DMatrix::Load and checks the resulting group pointers, row offsets and
// entries.
TEST(MetaInfo, LoadQid) {
  dmlc::TemporaryDirectory tempdir;
  std::string tmp_file = tempdir.path + "/qid_test.libsvm";
  {
    std::unique_ptr<dmlc::Stream> fs(
      dmlc::Stream::Create(tmp_file.c_str(), "w"));
    dmlc::ostream os(fs.get());
    os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
                2 qid:1 1:0 2:0 3:1 4:0.1 5:1
                1 qid:1 1:0 2:1 3:0 4:0.4 5:0
                1 qid:1 1:0 2:0 3:1 4:0.3 5:0
                1 qid:2 1:0 2:0 3:1 4:0.2 5:0
                2 qid:2 1:1 2:0 3:1 4:0.4 5:0
                1 qid:2 1:0 2:0 3:1 4:0.1 5:0
                1 qid:2 1:0 2:0 3:1 4:0.2 5:0
                2 qid:3 1:0 2:0 3:1 4:0.1 5:1
                3 qid:3 1:1 2:1 3:0 4:0.3 5:0
                4 qid:3 1:1 2:0 3:0 4:0.4 5:1
                1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
    // Detach the ostream from `fs` before `fs` goes out of scope.
    os.set_stream(nullptr);
  }
  std::unique_ptr<xgboost::DMatrix> dmat(
    xgboost::DMatrix::Load(tmp_file, true, false, "libsvm"));

  // Three query ids with four rows each.
  const xgboost::MetaInfo& info = dmat->Info();
  const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
  ASSERT_EQ(info.group_ptr_, expected_group_ptr);

  // Twelve rows with five entries each.
  const std::vector<xgboost::bst_row_t> expected_offset{
    0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
  };
  // One line per input row, in file order.
  using E = xgboost::Entry;
  const std::vector<xgboost::Entry> expected_data{
      E(1, 1), E(2, 1), E(3, 0), E(4, 0.2), E(5, 0),
      E(1, 0), E(2, 0), E(3, 1), E(4, 0.1), E(5, 1),
      E(1, 0), E(2, 1), E(3, 0), E(4, 0.4), E(5, 0),
      E(1, 0), E(2, 0), E(3, 1), E(4, 0.3), E(5, 0),
      E(1, 0), E(2, 0), E(3, 1), E(4, 0.2), E(5, 0),
      E(1, 1), E(2, 0), E(3, 1), E(4, 0.4), E(5, 0),
      E(1, 0), E(2, 0), E(3, 1), E(4, 0.1), E(5, 0),
      E(1, 0), E(2, 0), E(3, 1), E(4, 0.2), E(5, 0),
      E(1, 0), E(2, 0), E(3, 1), E(4, 0.1), E(5, 1),
      E(1, 1), E(2, 1), E(3, 0), E(4, 0.3), E(5, 0),
      E(1, 1), E(2, 0), E(3, 0), E(4, 0.4), E(5, 1),
      E(1, 0), E(2, 1), E(3, 1), E(4, 0.5), E(5, 0)};

  // Count the batches so the checks below cannot pass vacuously when the
  // matrix yields no batch at all.
  size_t n_batches = 0;
  for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
    ++n_batches;
    ASSERT_EQ(batch.base_rowid, 0);
    ASSERT_EQ(batch.offset.HostVector(), expected_offset);
    // Compared via operator== so that a failure does not dump every entry.
    ASSERT_TRUE(batch.data.HostVector() == expected_data)
        << "Loaded entries differ from the contents of the libsvm file.";
  }
  ASSERT_GE(n_batches, 1u);
}
204 
// Gives every row its own query id through SetInfo("qid", ...) and checks
// that the resulting group pointers are 0, 1, ..., num_row_.
TEST(MetaInfo, CPUQid) {
  xgboost::MetaInfo info;
  info.num_row_ = 100;

  // query_ids[i] == i, so no two rows share a query.
  std::vector<uint32_t> query_ids(info.num_row_, 0);
  uint32_t next_id = 0;
  for (auto& id : query_ids) {
    id = next_id++;
  }

  info.SetInfo("qid", query_ids.data(), xgboost::DataType::kUInt32, info.num_row_);
  ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
  ASSERT_EQ(info.group_ptr_.front(), 0);
  ASSERT_EQ(info.group_ptr_.back(), info.num_row_);

  // The size was asserted above, so walking the whole vector visits exactly
  // the indices 0 .. num_row_.
  size_t expected_ptr = 0;
  for (auto const ptr : info.group_ptr_) {
    ASSERT_EQ(ptr, expected_ptr);
    ++expected_ptr;
  }
}
222 
// Checks that MetaInfo::Validate rejects inconsistent group information, a
// label vector of the wrong length and, on CUDA builds, labels placed on a
// device other than the one passed to Validate.
TEST(MetaInfo, Validate) {
  xgboost::MetaInfo info;
  info.num_row_ = 10;
  info.num_nonzero_ = 12;
  info.num_col_ = 3;

  // Eleven value-initialised (zero) group sizes for a 10-row matrix.
  // Presumably rejected because the groups do not add up to num_row_.
  std::vector<xgboost::bst_group_t> groups (11);
  info.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, 11);
  EXPECT_THROW(info.Validate(0), dmlc::Error);

  // Remove the invalid groups first; otherwise the check below would throw
  // because of them and would say nothing about the labels.
  info.group_ptr_.clear();

  // One label more than there are rows.
  std::vector<float> labels(info.num_row_ + 1);
  info.SetInfo("label", labels.data(), xgboost::DataType::kFloat32, info.num_row_ + 1);
  EXPECT_THROW(info.Validate(0), dmlc::Error);

#if defined(XGBOOST_USE_CUDA)
  // Correctly sized labels, moved to device 0 and then validated for device 1.
  labels.resize(info.num_row_);
  info.SetInfo("label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
  info.labels_.SetDevice(0);
  EXPECT_THROW(info.Validate(1), dmlc::Error);
#endif  // defined(XGBOOST_USE_CUDA)
}
244 
// Extends one host-only MetaInfo with another of the same shape and checks
// the merged row count, that labels stay readable on the host only, and that
// the merged group pointers advance in steps of the group size.
TEST(MetaInfo, HostExtend) {
  size_t const kRows = 100;
  size_t per_group = 10;

  xgboost::MetaInfo lhs;
  lhs.labels_.Resize(kRows);
  lhs.num_row_ = kRows;

  xgboost::MetaInfo rhs;
  rhs.labels_.Resize(kRows);
  rhs.num_row_ = kRows;

  ASSERT_TRUE(lhs.labels_.HostCanRead());
  ASSERT_TRUE(rhs.labels_.HostCanRead());

  // kRows / per_group groups, each holding per_group rows.
  std::vector<xgboost::bst_group_t> group_sizes(
      kRows / per_group, static_cast<xgboost::bst_group_t>(per_group));
  lhs.SetInfo("group", group_sizes.data(), xgboost::DataType::kUInt32, group_sizes.size());
  rhs.SetInfo("group", group_sizes.data(), xgboost::DataType::kUInt32, group_sizes.size());

  lhs.Extend(rhs, true, true);
  ASSERT_EQ(lhs.num_row_, kRows * 2);

  // Neither operand's labels may have become readable from a device.
  ASSERT_TRUE(lhs.labels_.HostCanRead());
  ASSERT_TRUE(rhs.labels_.HostCanRead());
  ASSERT_FALSE(lhs.labels_.DeviceCanRead());
  ASSERT_FALSE(rhs.labels_.DeviceCanRead());

  ASSERT_EQ(lhs.group_ptr_.front(), 0);
  ASSERT_EQ(lhs.group_ptr_.back(), kRows * 2);
  size_t const n_groups = kRows * 2 / per_group;
  for (size_t g = 0; g < n_groups; ++g) {
    ASSERT_EQ(lhs.group_ptr_.at(g), per_group * g);
  }
}
276