1 // Copyright 2016-2020 by Contributors
2 #include <dmlc/io.h>
3 #include <dmlc/filesystem.h>
4 #include <xgboost/data.h>
5 #include <string>
6 #include <memory>
7 #include "../../../src/common/version.h"
8
9 #include "../helpers.h"
10 #include "xgboost/base.h"
11
// Exercises MetaInfo::SetInfo for the "label", "weight", "base_margin" and
// "group" fields, feeding each one from a buffer of a different element type,
// and finally checks that Clear() empties the group pointer.
TEST(MetaInfo, GetSet) {
  xgboost::MetaInfo info;

  // The labels are stored as doubles, so the dtype tag has to be kDouble.
  // Tagging this buffer as kFloat32 would make SetInfo reinterpret the raw
  // bytes of the doubles as floats.
  double double2[2] = {1.0, 2.0};

  EXPECT_EQ(info.labels_.Size(), 0);
  info.SetInfo("label", double2, xgboost::DataType::kDouble, 2);
  EXPECT_EQ(info.labels_.Size(), 2);
  // Checking a value (not only the size) catches dtype/buffer mismatches.
  EXPECT_EQ(info.labels_.HostVector().at(1), 2.0f);

  float float2[2] = {1.0f, 2.0f};
  EXPECT_EQ(info.GetWeight(1), 1.0f)
      << "When no weights are given, was expecting default value 1";
  info.SetInfo("weight", float2, xgboost::DataType::kFloat32, 2);
  EXPECT_EQ(info.GetWeight(1), 2.0f);

  uint32_t uint32_t2[2] = {1U, 2U};
  EXPECT_EQ(info.base_margin_.Size(), 0);
  info.SetInfo("base_margin", uint32_t2, xgboost::DataType::kUInt32, 2);
  EXPECT_EQ(info.base_margin_.Size(), 2);

  // Two group sizes (1 and 2) are expected to produce three boundaries, the
  // last of which is the total of 3.
  uint64_t uint64_t2[2] = {1U, 2U};
  EXPECT_EQ(info.group_ptr_.size(), 0);
  info.SetInfo("group", uint64_t2, xgboost::DataType::kUInt64, 2);
  ASSERT_EQ(info.group_ptr_.size(), 3);
  EXPECT_EQ(info.group_ptr_[2], 3);

  info.Clear();
  ASSERT_EQ(info.group_ptr_.size(), 0);
}
41
// Checks which keys MetaInfo::SetFeatureInfo accepts, that feature types are
// rejected while the matrix has no columns, and that passing an empty array
// clears previously stored feature types.
TEST(MetaInfo, GetSetFeature) {
  xgboost::MetaInfo info;

  // Unknown keys are rejected ...
  EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error);
  EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error);
  // ... while the two known keys accept an empty input.
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0));
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_type", nullptr, 0));
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  ASSERT_EQ(info.feature_names.size(), 0);

  size_t constexpr kCols = 19;
  std::vector<std::string> type_names(kCols, u8"float");
  // SetFeatureInfo takes an array of C strings; these point into `type_names`.
  std::vector<char const*> type_ptrs;
  type_ptrs.reserve(type_names.size());
  for (auto const &name : type_names) {
    type_ptrs.push_back(name.c_str());
  }

  // The matrix still reports 0 columns, so 19 feature types must be refused.
  EXPECT_THROW(
      info.SetFeatureInfo(u8"feature_type", type_ptrs.data(), type_ptrs.size()),
      dmlc::Error);
  // Once the column count matches, the same call is accepted.
  info.num_col_ = kCols;
  EXPECT_NO_THROW(
      info.SetFeatureInfo(u8"feature_type", type_ptrs.data(), type_ptrs.size()));

  // An empty input resets the stored feature types.
  info.SetFeatureInfo("feature_type", nullptr, 0);
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  // Other conditions are tested in `SaveLoadBinary`.
}
71
// Writes a populated MetaInfo to a temporary file with SaveBinary, reads it
// back with LoadBinary and checks that the loaded object matches what was
// written (sizes, labels, weights, base margin, groups, feature names/types).
TEST(MetaInfo, SaveLoadBinary) {
  xgboost::MetaInfo info;
  uint64_t constexpr kRows { 64 }, kCols { 32 };
  // Yields 0, 1, 2, ...  The counter is owned by the closure instead of a
  // function-local static, so the generated values do not depend on how many
  // times this test body has already been executed in the process.
  auto generator = [next = 0.0f]() mutable { return next++; };
  std::vector<float> values(kRows);
  std::generate(values.begin(), values.end(), generator);
  info.SetInfo("label", values.data(), xgboost::DataType::kFloat32, kRows);
  info.SetInfo("weight", values.data(), xgboost::DataType::kFloat32, kRows);
  info.SetInfo("base_margin", values.data(), xgboost::DataType::kFloat32, kRows);

  info.num_row_ = kRows;
  info.num_col_ = kCols;

  // Non-ASCII feature name to exercise UTF-8 strings in the round trip.
  auto featname = u8"特征名";
  std::vector<std::string> types(kCols, u8"float");
  std::vector<char const*> c_types(kCols);
  std::transform(types.cbegin(), types.cend(), c_types.begin(),
                 [](auto const &str) { return str.c_str(); });
  info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size());
  std::vector<std::string> names(kCols, featname);
  std::vector<char const*> c_names(kCols);
  std::transform(names.cbegin(), names.cend(), c_names.begin(),
                 [](auto const &str) { return str.c_str(); });
  info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size());

  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_file = tempdir.path + "/metainfo.binary";
  {
    // Scoped so the write stream is destroyed before the file is reopened.
    std::unique_ptr<dmlc::Stream> fs {
      dmlc::Stream::Create(tmp_file.c_str(), "w")
    };
    info.SaveBinary(fs.get());
  }

  {
    // Round-trip test
    std::unique_ptr<dmlc::Stream> fs {
      dmlc::Stream::Create(tmp_file.c_str(), "r")
    };
    xgboost::MetaInfo inforead;
    inforead.LoadBinary(fs.get());
    ASSERT_EQ(inforead.num_row_, kRows);
    EXPECT_EQ(inforead.num_row_, info.num_row_);
    EXPECT_EQ(inforead.num_col_, info.num_col_);
    EXPECT_EQ(inforead.num_nonzero_, info.num_nonzero_);

    ASSERT_EQ(inforead.labels_.HostVector(), values);
    EXPECT_EQ(inforead.labels_.HostVector(), info.labels_.HostVector());
    EXPECT_EQ(inforead.group_ptr_, info.group_ptr_);
    EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector());
    EXPECT_EQ(inforead.base_margin_.HostVector(), info.base_margin_.HostVector());

    // Feature types: both the textual names and the parsed enum values.
    EXPECT_EQ(inforead.feature_type_names.size(), kCols);
    EXPECT_EQ(inforead.feature_types.Size(), kCols);
    EXPECT_TRUE(std::all_of(inforead.feature_type_names.cbegin(),
                            inforead.feature_type_names.cend(),
                            [](auto const &str) { return str == u8"float"; }));
    auto h_ft = inforead.feature_types.HostSpan();
    EXPECT_TRUE(std::all_of(h_ft.cbegin(), h_ft.cend(), [](auto f) {
      return f == xgboost::FeatureType::kNumerical;
    }));

    // Feature names must come back byte-for-byte.
    EXPECT_EQ(inforead.feature_names.size(), kCols);
    EXPECT_TRUE(std::all_of(inforead.feature_names.cbegin(),
                            inforead.feature_names.cend(),
                            [=](auto const& str) {
                              return str == featname;
                            }));
  }
}
145
// Writes a small libsvm file containing `qid:` annotations, loads it as a
// DMatrix and checks the resulting group pointer as well as the row offsets
// and entries of the loaded sparse page.
//
// gtest assertions are used (rather than dmlc CHECK macros) so that a mismatch
// is reported as a regular test failure, consistent with the other tests here.
TEST(MetaInfo, LoadQid) {
  dmlc::TemporaryDirectory tempdir;
  std::string tmp_file = tempdir.path + "/qid_test.libsvm";
  {
    std::unique_ptr<dmlc::Stream> fs(
      dmlc::Stream::Create(tmp_file.c_str(), "w"));
    dmlc::ostream os(fs.get());
    // 12 rows with 5 features each, split into three queries of 4 rows.
    os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
1 qid:1 1:0 2:1 3:0 4:0.4 5:0
1 qid:1 1:0 2:0 3:1 4:0.3 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:2 1:1 2:0 3:1 4:0.4 5:0
1 qid:2 1:0 2:0 3:1 4:0.1 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:3 1:0 2:0 3:1 4:0.1 5:1
3 qid:3 1:1 2:1 3:0 4:0.3 5:0
4 qid:3 1:1 2:0 3:0 4:0.4 5:1
1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
    // Detach the ostream from the stream before `fs` goes out of scope.
    os.set_stream(nullptr);
  }
  std::unique_ptr<xgboost::DMatrix> dmat(
    xgboost::DMatrix::Load(tmp_file, true, false, "libsvm"));

  const xgboost::MetaInfo& info = dmat->Info();
  const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
  ASSERT_EQ(info.group_ptr_, expected_group_ptr);

  // Five entries per row, twelve rows.
  const std::vector<xgboost::bst_row_t> expected_offset{
    0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
  };
  const std::vector<xgboost::Entry> expected_data{
    xgboost::Entry(1, 1),   xgboost::Entry(2, 1),   xgboost::Entry(3, 0),
    xgboost::Entry(4, 0.2), xgboost::Entry(5, 0),   xgboost::Entry(1, 0),
    xgboost::Entry(2, 0),   xgboost::Entry(3, 1),   xgboost::Entry(4, 0.1),
    xgboost::Entry(5, 1),   xgboost::Entry(1, 0),   xgboost::Entry(2, 1),
    xgboost::Entry(3, 0),   xgboost::Entry(4, 0.4), xgboost::Entry(5, 0),
    xgboost::Entry(1, 0),   xgboost::Entry(2, 0),   xgboost::Entry(3, 1),
    xgboost::Entry(4, 0.3), xgboost::Entry(5, 0),   xgboost::Entry(1, 0),
    xgboost::Entry(2, 0),   xgboost::Entry(3, 1),   xgboost::Entry(4, 0.2),
    xgboost::Entry(5, 0),   xgboost::Entry(1, 1),   xgboost::Entry(2, 0),
    xgboost::Entry(3, 1),   xgboost::Entry(4, 0.4), xgboost::Entry(5, 0),
    xgboost::Entry(1, 0),   xgboost::Entry(2, 0),   xgboost::Entry(3, 1),
    xgboost::Entry(4, 0.1), xgboost::Entry(5, 0),   xgboost::Entry(1, 0),
    xgboost::Entry(2, 0),   xgboost::Entry(3, 1),   xgboost::Entry(4, 0.2),
    xgboost::Entry(5, 0),   xgboost::Entry(1, 0),   xgboost::Entry(2, 0),
    xgboost::Entry(3, 1),   xgboost::Entry(4, 0.1), xgboost::Entry(5, 1),
    xgboost::Entry(1, 1),   xgboost::Entry(2, 1),   xgboost::Entry(3, 0),
    xgboost::Entry(4, 0.3), xgboost::Entry(5, 0),   xgboost::Entry(1, 1),
    xgboost::Entry(2, 0),   xgboost::Entry(3, 0),   xgboost::Entry(4, 0.4),
    xgboost::Entry(5, 1),   xgboost::Entry(1, 0),   xgboost::Entry(2, 1),
    xgboost::Entry(3, 1),   xgboost::Entry(4, 0.5), xgboost::Entry(5, 0)};

  // Count the batches so the checks below cannot pass vacuously when the
  // matrix yields no batch at all.
  size_t n_batches = 0;
  for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
    ASSERT_EQ(batch.base_rowid, 0);
    ASSERT_EQ(batch.offset.HostVector(), expected_offset);
    // Compared through operator== to avoid relying on a printer for Entry.
    ASSERT_TRUE(batch.data.HostVector() == expected_data);
    ++n_batches;
  }
  EXPECT_EQ(n_batches, 1u);
}
204
// Sets a distinct query id for each of 100 rows through SetInfo("qid", ...)
// and checks that the resulting group pointer is 0, 1, ..., num_row_.
TEST(MetaInfo, CPUQid) {
  xgboost::MetaInfo info;
  info.num_row_ = 100;

  // Query ids 0, 1, 2, ... : no two rows share an id.
  std::vector<uint32_t> query_ids(info.num_row_, 0);
  uint32_t next_id = 0;
  for (auto &id : query_ids) {
    id = next_id++;
  }

  info.SetInfo("qid", query_ids.data(), xgboost::DataType::kUInt32,
               info.num_row_);

  // One boundary more than there are rows, spanning [0, num_row_].
  auto const &group_ptr = info.group_ptr_;
  ASSERT_EQ(group_ptr.size(), info.num_row_ + 1);
  ASSERT_EQ(group_ptr.front(), 0);
  ASSERT_EQ(group_ptr.back(), info.num_row_);

  // Each boundary equals its own position.
  size_t pos = 0;
  while (pos < info.num_row_ + 1) {
    ASSERT_EQ(group_ptr[pos], pos);
    ++pos;
  }
}
222
// Checks that MetaInfo::Validate rejects inconsistent meta data: a group
// structure that does not match the number of rows, a label vector of the
// wrong length and (CUDA builds only) labels living on a different device
// than the one passed to Validate.
TEST(MetaInfo, Validate) {
  xgboost::MetaInfo info;
  info.num_row_ = 10;
  info.num_nonzero_ = 12;
  info.num_col_ = 3;
  // Eleven zero-initialised group sizes do not add up to num_row_ = 10.
  std::vector<xgboost::bst_group_t> groups(11);
  info.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, 11);
  EXPECT_THROW(info.Validate(0), dmlc::Error);

  // Remove the invalid groups first.  Otherwise the state that made the
  // previous Validate call throw is still present and the expectation below
  // would hold regardless of whether the label size is validated at all.
  info.group_ptr_.clear();
  std::vector<float> labels(info.num_row_ + 1);
  info.SetInfo("label", labels.data(), xgboost::DataType::kFloat32, info.num_row_ + 1);
  EXPECT_THROW(info.Validate(0), dmlc::Error);

#if defined(XGBOOST_USE_CUDA)
  // Correctly sized labels placed on device 0, validated against device 1.
  info.group_ptr_.clear();
  labels.resize(info.num_row_);
  info.SetInfo("label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
  info.labels_.SetDevice(0);
  EXPECT_THROW(info.Validate(1), dmlc::Error);
#endif  // defined(XGBOOST_USE_CUDA)
}
244
// Appends one MetaInfo to another with Extend() and checks that the row
// count doubles, that the label vectors remain host-readable without becoming
// device-readable, and that the group pointer is extended accordingly.
TEST(MetaInfo, HostExtend) {
  size_t const kRows = 100;
  size_t const kPerGroup = 10;

  xgboost::MetaInfo lhs;
  xgboost::MetaInfo rhs;

  // Both sides start with kRows labels.
  lhs.labels_.Resize(kRows);
  rhs.labels_.Resize(kRows);
  lhs.num_row_ = kRows;
  rhs.num_row_ = kRows;
  ASSERT_TRUE(lhs.labels_.HostCanRead());
  ASSERT_TRUE(rhs.labels_.HostCanRead());

  // Both sides use the same layout: kRows / kPerGroup groups of equal size.
  std::vector<xgboost::bst_group_t> groups(
      kRows / kPerGroup, static_cast<xgboost::bst_group_t>(kPerGroup));
  lhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
  rhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());

  lhs.Extend(rhs, true, true);
  ASSERT_EQ(lhs.num_row_, kRows * 2);
  ASSERT_TRUE(lhs.labels_.HostCanRead());
  ASSERT_TRUE(rhs.labels_.HostCanRead());
  ASSERT_FALSE(lhs.labels_.DeviceCanRead());
  ASSERT_FALSE(rhs.labels_.DeviceCanRead());

  ASSERT_EQ(lhs.group_ptr_.front(), 0);
  ASSERT_EQ(lhs.group_ptr_.back(), kRows * 2);
  // Every group start of the combined data is a multiple of the group size.
  size_t const n_groups_total = kRows * 2 / kPerGroup;
  size_t expected_begin = 0;
  for (size_t g = 0; g < n_groups_total; ++g, expected_begin += kPerGroup) {
    ASSERT_EQ(lhs.group_ptr_.at(g), expected_begin);
  }
}
276