1 /*!
2 * Copyright 2016-2019 XGBoost contributors
3 */
4 #ifndef XGBOOST_TESTS_CPP_HELPERS_H_
5 #define XGBOOST_TESTS_CPP_HELPERS_H_
6
#include <sys/stat.h>
#include <sys/types.h>

#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>
15
16 #include <gtest/gtest.h>
17
18 #include <dmlc/filesystem.h>
19 #include <xgboost/base.h>
20 #include <xgboost/json.h>
21 #include <xgboost/generic_parameters.h>
22
23 #include "../../src/common/common.h"
24 #include "../../src/gbm/gbtree_model.h"
25 #include "../../src/data/array_interface.h"
26
// Compile-time switches that let one test source serve both the host and the
// CUDA build:
//  - DeclareUnifiedTest(name) expands to `GPUname` under nvcc and to `name` otherwise.
//  - GPUIDX is 0 under nvcc and -1 otherwise.
#if defined(__CUDACC__)
#define DeclareUnifiedTest(name) GPU ## name
#define GPUIDX 0
#else
#define DeclareUnifiedTest(name) name
#define GPUIDX -1
#endif  // defined(__CUDACC__)
38
// Forward declarations of xgboost types that the helper signatures below only
// name through pointers, references or smart pointers.
namespace xgboost {
class ObjFunction;
class Metric;
struct LearnerModelParam;
class GradientBooster;
}  // namespace xgboost
45
/*!
 * \brief Compute |1 - l / r|.
 *
 * \tparam Float Must be a floating point type; enforced with a static_assert.
 * \param l Numerator of the ratio.
 * \param r Denominator of the ratio.
 *
 * \return The absolute value of (1 - l / r).
 */
template <typename Float>
Float RelError(Float l, Float r) {
  static_assert(std::is_floating_point<Float>::value, "");
  auto const ratio = l / r;
  return std::abs(1.0f - ratio);
}
51
// File helpers for tests.  Only the declarations are visible in this header, so
// the one-line notes on the first three are inferred from the names.

// Presumably reports whether `filename` exists -- TODO confirm against the definition.
bool FileExists(std::string const& filename);

// Presumably the size of `filename` -- TODO confirm the unit and the error value.
int64_t GetFileSize(std::string const& filename);

// Presumably writes a small data set to `filename` -- TODO confirm the format.
void CreateSimpleTestData(std::string const& filename);

// Create a libsvm format file with 3 entries per-row.  `zero_based` specifies
// whether the file uses 0-based indexing.
void CreateBigTestData(std::string const& filename, size_t n_entries,
                       bool zero_based = true);
61
62 void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
63 std::vector<xgboost::bst_float> preds,
64 std::vector<xgboost::bst_float> labels,
65 std::vector<xgboost::bst_float> weights,
66 std::vector<xgboost::bst_float> out_grad,
67 std::vector<xgboost::bst_float> out_hess);
68
69 xgboost::Json CheckConfigReloadImpl(xgboost::Configurable* const configurable,
70 std::string name);
71
72 template <typename T>
73 xgboost::Json CheckConfigReload(std::unique_ptr<T> const& configurable,
74 std::string name = "") {
75 return CheckConfigReloadImpl(dynamic_cast<xgboost::Configurable*>(configurable.get()),
76 name);
77 }
78
79 void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
80 std::vector<xgboost::bst_float> preds,
81 std::vector<xgboost::bst_float> labels,
82 std::vector<xgboost::bst_float> weights,
83 std::vector<xgboost::bst_uint> groups,
84 std::vector<xgboost::bst_float> out_grad,
85 std::vector<xgboost::bst_float> out_hess);
86
87 xgboost::bst_float GetMetricEval(
88 xgboost::Metric * metric,
89 xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
90 std::vector<xgboost::bst_float> labels,
91 std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
92 std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());
93
94 namespace xgboost {
95 bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
96 std::vector<xgboost::bst_float>::const_iterator _end1,
97 std::vector<xgboost::bst_float>::const_iterator _beg2);
98
/*!
 * \brief Linear congruential generator.
 *
 * The distributions defined in std are not portable: given the same seed, they
 * might produce different outputs on different platforms or with different
 * compilers.  SimpleLCG is implemented here to make sure all tests are
 * reproducible.
 */
class SimpleLCG {
 private:
  using StateType = int64_t;
  static StateType constexpr kDefaultInit = 3;
  static StateType constexpr default_alpha_ = 61;
  // 2^32 - 1.
  static StateType constexpr max_value_ = (static_cast<StateType>(1) << 32) - 1;

  StateType state_;
  StateType const alpha_;
  StateType const mod_;

  StateType seed_;

 public:
  /*!
   * \brief Build a generator from the defaults: state kDefaultInit, multiplier
   *        default_alpha_, modulo max_value_; the seed is copied from the state.
   */
  SimpleLCG()
      : state_{kDefaultInit},
        alpha_{default_alpha_},
        mod_{max_value_},
        seed_{state_} {}
  SimpleLCG(SimpleLCG const& that) = default;
  SimpleLCG(SimpleLCG&& that) = default;

  /*!
   * \brief Overwrite the stored seed.
   *
   * Only `seed_` is written; `state_` is not touched by this function.
   */
  void Seed(StateType seed) { seed_ = seed; }

  /*!
   * \brief Initialize SimpleLCG.
   *
   * \param state  Initial state, can also be considered as seed.  If set to
   *               zero, SimpleLCG will use the internal default value for the
   *               state; the stored seed keeps the value that was passed in.
   * \param alpha  multiplier
   * \param mod    modulo
   */
  explicit SimpleLCG(StateType state,
                     StateType alpha = default_alpha_,
                     StateType mod = max_value_)
      : state_{state == 0 ? kDefaultInit : state},
        alpha_{alpha},
        mod_{mod},
        seed_{state} {}

  // Defined out of line.
  StateType operator()();
  StateType Min() const;
  StateType Max() const;
};
146
/*!
 * \brief Uniform real distribution on top of a generator that exposes Min(),
 *        Max() and operator().
 *
 * \tparam ResultT Floating point result type (checked when a value is drawn).
 */
template <typename ResultT>
class SimpleRealUniformDistribution {
 private:
  ResultT const lower_;
  ResultT const upper_;

  /*!
   * \brief Over-simplified version of std::generate_canonical.
   *
   * Draws enough values from `rng` to cover `Bits` bits, accumulates them as
   * digits in base (Max - Min + 1), and divides by the total scale.
   */
  template <size_t Bits, typename GeneratorT>
  ResultT GenerateCanonical(GeneratorT* rng) const {
    static_assert(std::is_floating_point<ResultT>::value,
                  "Result type must be floating point.");
    // Number of distinct values the generator can return.
    long double const range_size = (static_cast<long double>(rng->Max())
                                    - static_cast<long double>(rng->Min())) + 1.0L;
    // Whole bits obtained from a single draw.
    auto const bits_per_draw =
        static_cast<size_t>(std::log(range_size) / std::log(2.0L));
    // Draws needed to reach `Bits` bits, at least one.
    size_t const n_draws =
        std::max<size_t>(1UL, (Bits + bits_per_draw - 1UL) / bits_per_draw);

    ResultT accumulated = 0;
    ResultT scale = 1;
    for (size_t draw = 0; draw < n_draws; ++draw) {
      accumulated += ResultT((*rng)() - rng->Min()) * scale;
      scale *= range_size;
    }

    return accumulated / scale;
  }

 public:
  /*! \brief Store the bounds `l` (lower) and `u` (upper). */
  SimpleRealUniformDistribution(ResultT l, ResultT u) : lower_{l}, upper_{u} {}

  /*!
   * \brief Draw one value by scaling a canonical sample into the stored bounds.
   *
   * \param rng Generator to draw from; advanced by the call.
   */
  template <typename GeneratorT>
  ResultT operator()(GeneratorT* rng) const {
    ResultT const canonical =
        GenerateCanonical<std::numeric_limits<ResultT>::digits, GeneratorT>(rng);
    auto const scaled = (canonical * (upper_ - lower_)) + lower_;
    // Correct floating point error: never return less than the lower bound.
    return std::max(scaled, lower_);
  }
};
186
187 template <typename T>
GetArrayInterface(HostDeviceVector<T> * storage,size_t rows,size_t cols)188 Json GetArrayInterface(HostDeviceVector<T> *storage, size_t rows, size_t cols) {
189 Json array_interface{Object()};
190 array_interface["data"] = std::vector<Json>(2);
191 if (storage->DeviceCanRead()) {
192 array_interface["data"][0] =
193 Integer(reinterpret_cast<int64_t>(storage->ConstDevicePointer()));
194 } else {
195 array_interface["data"][0] =
196 Integer(reinterpret_cast<int64_t>(storage->ConstHostPointer()));
197 }
198 array_interface["data"][1] = Boolean(false);
199
200 array_interface["shape"] = std::vector<Json>(2);
201 array_interface["shape"][0] = rows;
202 array_interface["shape"][1] = cols;
203
204 char t = ArrayInterfaceHandler::TypeChar<T>();
205 array_interface["typestr"] = String(std::string{"<"} + t + std::to_string(sizeof(T)));
206 array_interface["version"] = 1;
207 return array_interface;
208 }
209
210 // Generate in-memory random data without using DMatrix.
211 class RandomDataGenerator {
212 bst_row_t rows_;
213 size_t cols_;
214 float sparsity_;
215
216 float lower_;
217 float upper_;
218
219 int32_t device_;
220 int32_t seed_;
221 SimpleLCG lcg_;
222
223 size_t bins_;
224
225 Json ArrayInterfaceImpl(HostDeviceVector<float> *storage, size_t rows,
226 size_t cols) const;
227
228 public:
RandomDataGenerator(bst_row_t rows,size_t cols,float sparsity)229 RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
230 : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
231 device_{-1}, seed_{0}, lcg_{seed_}, bins_{0} {}
232
Lower(float v)233 RandomDataGenerator &Lower(float v) {
234 lower_ = v;
235 return *this;
236 }
Upper(float v)237 RandomDataGenerator& Upper(float v) {
238 upper_ = v;
239 return *this;
240 }
Device(int32_t d)241 RandomDataGenerator& Device(int32_t d) {
242 device_ = d;
243 return *this;
244 }
Seed(int32_t s)245 RandomDataGenerator& Seed(int32_t s) {
246 seed_ = s;
247 lcg_.Seed(seed_);
248 return *this;
249 }
Bins(size_t b)250 RandomDataGenerator& Bins(size_t b) {
251 bins_ = b;
252 return *this;
253 }
254
255 void GenerateDense(HostDeviceVector<float>* out) const;
256
257 std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;
258
259 /*!
260 * \brief Generate batches of array interface stored in consecutive memory.
261 *
262 * \param storage The consecutive momory used to store the arrays.
263 * \param batches Number of batches.
264 *
265 * \return A vector storing JSON string representation of interface for each batch, and
266 * a single JSON string representing the consecutive memory as a whole
267 * (combining all the batches).
268 */
269 std::pair<std::vector<std::string>, std::string>
270 GenerateArrayInterfaceBatch(HostDeviceVector<float> *storage,
271 size_t batches) const;
272
273 std::string GenerateColumnarArrayInterface(
274 std::vector<HostDeviceVector<float>> *data) const;
275
276 void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
277 HostDeviceVector<bst_feature_t>* columns) const;
278
279 std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false,
280 bool float_label = true,
281 size_t classes = 1) const;
282 #if defined(XGBOOST_USE_CUDA)
283 std::shared_ptr<DMatrix> GenerateDeviceDMatrix(bool with_label = false,
284 bool float_label = true,
285 size_t classes = 1);
286 #endif
287 };
288
/*!
 * \brief Generate one column of `n` random category codes.
 *
 * Every entry is drawn uniformly from [0, num_categories - 1] with a fixed
 * seed.  Afterwards the leading entries are overwritten with 0, 1, 2, ... so
 * that each category is present.  When the column is shorter than
 * `num_categories`, only the first `n` categories can be forced in.
 *
 * \param n              Number of entries in the column.
 * \param num_categories Number of distinct categories; expected to be at least 1.
 *
 * \return The generated column, with the codes stored as float.
 */
inline std::vector<float>
GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
  std::vector<float> x(n);
  std::mt19937 rng(0);
  std::uniform_int_distribution<size_t> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(),
                [&]() { return static_cast<float>(dist(rng)); });
  // Make sure each category is present.  The count is clamped to the column
  // length so that a column shorter than `num_categories` is never written
  // past its end.
  size_t const n_forced = std::min(x.size(), num_categories);
  for (size_t i = 0; i < n_forced; ++i) {
    x[i] = static_cast<float>(i);
  }
  return x;
}
301
302 std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
303 int num_rows, int num_columns);
304
305 std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");
306
307 /**
308 * \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
309 * size_t page_size);
310 *
311 * \brief Creates dmatrix with some records, each record containing random number of
312 * features in [1, n_cols]
313 *
314 * \param n_rows Number of records to create.
315 * \param n_cols Max number of features within that record.
316 * \param page_size Sparse page size for the pages within the dmatrix. If page size is 0
317 * then the entire dmatrix is resident in memory; else, multiple sparse pages
318 * of page size are created and backed to disk, which would have to be
319 * streamed in at point of use.
320 * \param deterministic The content inside the dmatrix is constant for this configuration, if true;
321 * else, the content changes every time this method is invoked
322 *
323 * \return The new dmatrix.
324 */
325 std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
326 size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
327 const dmlc::TemporaryDirectory& tempdir = dmlc::TemporaryDirectory());
328
329 gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, size_t n_classes = 1);
330
331 std::unique_ptr<GradientBooster> CreateTrainedGBM(
332 std::string name, Args kwargs, size_t kRows, size_t kCols,
333 LearnerModelParam const* learner_model_param,
334 GenericParameter const* generic_param);
335
CreateEmptyGenericParam(int gpu_id)336 inline GenericParameter CreateEmptyGenericParam(int gpu_id) {
337 xgboost::GenericParameter tparam;
338 std::vector<std::pair<std::string, std::string>> args {
339 {"gpu_id", std::to_string(gpu_id)}};
340 tparam.Init(args);
341 return tparam;
342 }
343
344 inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows,
345 float lower= 0.0f, float upper = 1.0f) {
346 xgboost::SimpleLCG gen;
347 xgboost::SimpleRealUniformDistribution<bst_float> dist(lower, upper);
348 std::vector<GradientPair> h_gpair(n_rows);
349 for (auto &gpair : h_gpair) {
350 bst_float grad = dist(&gen);
351 bst_float hess = dist(&gen);
352 gpair = GradientPair(grad, hess);
353 }
354 HostDeviceVector<GradientPair> gpair(h_gpair);
355 return gpair;
356 }
357
358 typedef void *DMatrixHandle; // NOLINT(*);
359
360 class ArrayIterForTest {
361 protected:
362 HostDeviceVector<float> data_;
363 size_t iter_ {0};
364 DMatrixHandle proxy_;
365 std::unique_ptr<RandomDataGenerator> rng_;
366
367 std::vector<std::string> batches_;
368 std::string interface_;
369 size_t rows_;
370 size_t cols_;
371 size_t n_batches_;
372
373 public:
374 size_t static constexpr kRows { 1000 };
375 size_t static constexpr kBatches { 100 };
376 size_t static constexpr kCols { 13 };
377
AsArray()378 std::string AsArray() const {
379 return interface_;
380 }
381
382 virtual int Next();
Reset()383 virtual void Reset() {
384 iter_ = 0;
385 }
Iter()386 size_t Iter() const { return iter_; }
387 auto Proxy() -> decltype(proxy_) { return proxy_; }
388
389 explicit ArrayIterForTest(float sparsity, size_t rows = kRows,
390 size_t cols = kCols, size_t batches = kBatches);
391 virtual ~ArrayIterForTest();
392 };
393
394 class CudaArrayIterForTest : public ArrayIterForTest {
395 public:
396 size_t static constexpr kRows{1000};
397 size_t static constexpr kBatches{100};
398 size_t static constexpr kCols{13};
399
400 explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
401 size_t cols = kCols, size_t batches = kBatches);
402 int Next() override;
403 ~CudaArrayIterForTest() override = default;
404 };
405
406 void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
407 std::vector<size_t> *p_row_ptr,
408 std::vector<bst_feature_t> *p_cids);
409
410 typedef void *DataIterHandle; // NOLINT(*)
411
Reset(DataIterHandle self)412 inline void Reset(DataIterHandle self) {
413 static_cast<ArrayIterForTest*>(self)->Reset();
414 }
415
Next(DataIterHandle self)416 inline int Next(DataIterHandle self) {
417 return static_cast<ArrayIterForTest*>(self)->Next();
418 }
419
// RMMAllocator is only forward declared here.  RMMAllocatorPtr owns one through
// a unique_ptr whose deleter is a plain function pointer.
class RMMAllocator;
using RMMAllocatorPtr = std::unique_ptr<RMMAllocator, void (*)(RMMAllocator*)>;

// Defined out of line; receives the command line arguments of the test binary.
RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
423
424 } // namespace xgboost
425 #endif
426