1 /*!
2  * Copyright 2016-2019 XGBoost contributors
3  */
4 #ifndef XGBOOST_TESTS_CPP_HELPERS_H_
5 #define XGBOOST_TESTS_CPP_HELPERS_H_
6 
#include <sys/stat.h>
#include <sys/types.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
15 
16 #include <gtest/gtest.h>
17 
18 #include <dmlc/filesystem.h>
19 #include <xgboost/base.h>
20 #include <xgboost/json.h>
21 #include <xgboost/generic_parameters.h>
22 
23 #include "../../src/common/common.h"
24 #include "../../src/gbm/gbtree_model.h"
25 #include "../../src/data/array_interface.h"
26 
// Compile-time switches keyed on __CUDACC__ (defined when nvcc compiles the file):
//  - DeclareUnifiedTest(name) pastes a `GPU` prefix onto `name` under nvcc and
//    expands to plain `name` otherwise.
//  - GPUIDX expands to 0 under nvcc and to -1 otherwise.
#if defined(__CUDACC__)
#define DeclareUnifiedTest(name) GPU ## name
#define GPUIDX 0
#else
#define DeclareUnifiedTest(name) name
#define GPUIDX -1
#endif  // defined(__CUDACC__)
38 
// Forward declarations of xgboost types that this header only refers to
// through pointers, references or std::unique_ptr.
namespace xgboost {
struct LearnerModelParam;
class GradientBooster;
class Metric;
class ObjFunction;
}  // namespace xgboost
45 
/*!
 * \brief Relative error of `l` with respect to `r`, computed as |1 - l / r|.
 *
 * \tparam Float A floating point type (enforced at compile time).
 * \param l Value under test.
 * \param r Reference value; it is used as the divisor and is not checked
 *          against zero.
 *
 * \return |1 - l / r| in the same floating point type.
 */
template <typename Float>
Float RelError(Float l, Float r) {
  static_assert(std::is_floating_point<Float>::value, "");
  Float const ratio = l / r;
  return std::abs(1.0f - ratio);
}
51 
// File based fixtures. Only the declarations live in this header; the notes
// below are derived from the signatures alone.

// NOTE(review): presumably reports whether `filename` exists on disk -- confirm
// against the definition.
bool FileExists(std::string const& filename);

// NOTE(review): presumably the size of `filename` in bytes; the meaning of the
// return value for a missing file is not visible from here.
int64_t GetFileSize(std::string const& filename);

// NOTE(review): presumably writes a small fixed data set to `filename` -- the
// format is decided by the definition.
void CreateSimpleTestData(std::string const& filename);

// Create a libsvm format file with 3 entries per-row. `zero_based` specifies whether it's
// 0-based indexing.
void CreateBigTestData(std::string const& filename,
                       size_t n_entries,
                       bool zero_based = true);
61 
62 void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
63                       std::vector<xgboost::bst_float> preds,
64                       std::vector<xgboost::bst_float> labels,
65                       std::vector<xgboost::bst_float> weights,
66                       std::vector<xgboost::bst_float> out_grad,
67                       std::vector<xgboost::bst_float> out_hess);
68 
69 xgboost::Json CheckConfigReloadImpl(xgboost::Configurable* const configurable,
70                                     std::string name);
71 
72 template <typename T>
73 xgboost::Json CheckConfigReload(std::unique_ptr<T> const& configurable,
74                                 std::string name = "") {
75   return CheckConfigReloadImpl(dynamic_cast<xgboost::Configurable*>(configurable.get()),
76                                name);
77 }
78 
79 void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
80                              std::vector<xgboost::bst_float> preds,
81                              std::vector<xgboost::bst_float> labels,
82                              std::vector<xgboost::bst_float> weights,
83                              std::vector<xgboost::bst_uint> groups,
84                              std::vector<xgboost::bst_float> out_grad,
85                              std::vector<xgboost::bst_float> out_hess);
86 
87 xgboost::bst_float GetMetricEval(
88   xgboost::Metric * metric,
89   xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
90   std::vector<xgboost::bst_float> labels,
91   std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
92   std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());
93 
94 namespace xgboost {
95 bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
96             std::vector<xgboost::bst_float>::const_iterator _end1,
97             std::vector<xgboost::bst_float>::const_iterator _beg2);
98 
/*!
 * \brief Linear congruential generator.
 *
 * The distributions defined in std are not portable: given the same seed, they
 * might produce different outputs on different platforms or with different
 * compilers.  SimpleLCG is implemented here to make sure all tests are
 * reproducible.
 */
class SimpleLCG {
 private:
  using StateType = int64_t;
  static StateType constexpr kDefaultInit = 3;
  static StateType constexpr default_alpha_ = 61;
  static StateType constexpr max_value_ = (static_cast<StateType>(1) << 32) - 1;

  StateType state_;
  StateType const alpha_;
  StateType const mod_;

  StateType seed_;

 public:
  /*! \brief Default generator: state and seed are both kDefaultInit, with the
   *         default multiplier and modulo. */
  SimpleLCG() : SimpleLCG(kDefaultInit) {}
  SimpleLCG(SimpleLCG const& that) = default;
  SimpleLCG(SimpleLCG&& that) = default;

  /*!
   * \brief Record a new seed.
   *
   * Only seed_ is overwritten; state_ is not touched by this call.
   * NOTE(review): how seed_ feeds into operator()/Min() is decided by their
   * out-of-line definitions -- confirm that re-seeding has the intended effect.
   */
  void Seed(StateType seed) { seed_ = seed; }

  /*!
   * \brief Initialize SimpleLCG.
   *
   * \param state  Initial state, can also be considered as seed. If set to
   *               zero, the internal default value is used for the state
   *               (the stored seed keeps the value that was passed in).
   * \param alpha  multiplier
   * \param mod    modulo
   */
  explicit SimpleLCG(StateType state,
                     StateType alpha = default_alpha_,
                     StateType mod = max_value_)
      : state_{state == 0 ? kDefaultInit : state},
        alpha_{alpha},
        mod_{mod},
        seed_{state} {}

  StateType operator()();
  StateType Min() const;
  StateType Max() const;
};
146 
/*!
 * \brief Minimal uniform real distribution over a [lower, upper] interval.
 *
 * Draws are produced from a generator object exposing `Min()`, `Max()` and
 * `operator()`; the generator is always passed by pointer.
 *
 * \tparam ResultT Floating point result type.
 */
template <typename ResultT>
class SimpleRealUniformDistribution {
 private:
  ResultT const lower_;
  ResultT const upper_;

  /*!
   * \brief Over-simplified version of std::generate_canonical.
   *
   * Combines `m` consecutive draws of `rng` into one value scaled by the
   * generator range, where `m` is chosen so that at least `Bits` bits of
   * randomness are consumed.
   *
   * \tparam Bits       Requested number of random bits.
   * \tparam GeneratorT Generator type providing Min(), Max() and operator().
   */
  template <size_t Bits, typename GeneratorT>
  ResultT GenerateCanonical(GeneratorT* rng) const {
    static_assert(std::is_floating_point<ResultT>::value,
                  "Result type must be floating point.");
    long double const r = (static_cast<long double>(rng->Max())
                           - static_cast<long double>(rng->Min())) + 1.0L;
    // Number of whole bits delivered per draw.  A generator whose range is
    // smaller than 2 would give zero here (or a negative value converted to
    // size_t), and the division below would then be a division by zero, so
    // such generators are treated as delivering a single bit.
    size_t log2r = 1;
    if (r >= 2.0L) {
      log2r = std::max<size_t>(
          1UL, static_cast<size_t>(std::log(r) / std::log(2.0L)));
    }
    size_t const m = std::max<size_t>(1UL, (Bits + log2r - 1UL) / log2r);
    ResultT sum_value = 0, r_k = 1;

    for (size_t k = m; k != 0; --k) {
      sum_value += ResultT((*rng)() - rng->Min()) * r_k;
      r_k *= r;
    }

    ResultT res = sum_value / r_k;
    return res;
  }

 public:
  /*!
   * \param l Lower bound of the generated values.
   * \param u Upper bound of the generated values.
   */
  SimpleRealUniformDistribution(ResultT l, ResultT u) :
      lower_{l}, upper_{u} {}

  /*!
   * \brief Draw one value using `rng`.
   *
   * \return The canonical draw scaled into the configured interval, never
   *         smaller than the lower bound.
   */
  template <typename GeneratorT>
  ResultT operator()(GeneratorT* rng) const {
    ResultT tmp = GenerateCanonical<std::numeric_limits<ResultT>::digits,
                                    GeneratorT>(rng);
    auto ret = (tmp * (upper_ - lower_)) + lower_;
    // Correct floating point error.
    return std::max(ret, lower_);
  }
};
186 
187 template <typename T>
GetArrayInterface(HostDeviceVector<T> * storage,size_t rows,size_t cols)188 Json GetArrayInterface(HostDeviceVector<T> *storage, size_t rows, size_t cols) {
189   Json array_interface{Object()};
190   array_interface["data"] = std::vector<Json>(2);
191   if (storage->DeviceCanRead()) {
192     array_interface["data"][0] =
193         Integer(reinterpret_cast<int64_t>(storage->ConstDevicePointer()));
194   } else {
195     array_interface["data"][0] =
196         Integer(reinterpret_cast<int64_t>(storage->ConstHostPointer()));
197   }
198   array_interface["data"][1] = Boolean(false);
199 
200   array_interface["shape"] = std::vector<Json>(2);
201   array_interface["shape"][0] = rows;
202   array_interface["shape"][1] = cols;
203 
204   char t = ArrayInterfaceHandler::TypeChar<T>();
205   array_interface["typestr"] = String(std::string{"<"} + t + std::to_string(sizeof(T)));
206   array_interface["version"] = 1;
207   return array_interface;
208 }
209 
210 // Generate in-memory random data without using DMatrix.
211 class RandomDataGenerator {
212   bst_row_t rows_;
213   size_t cols_;
214   float sparsity_;
215 
216   float lower_;
217   float upper_;
218 
219   int32_t device_;
220   int32_t seed_;
221   SimpleLCG lcg_;
222 
223   size_t bins_;
224 
225   Json ArrayInterfaceImpl(HostDeviceVector<float> *storage, size_t rows,
226                           size_t cols) const;
227 
228  public:
RandomDataGenerator(bst_row_t rows,size_t cols,float sparsity)229   RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
230       : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
231         device_{-1}, seed_{0}, lcg_{seed_}, bins_{0} {}
232 
Lower(float v)233   RandomDataGenerator &Lower(float v) {
234     lower_ = v;
235     return *this;
236   }
Upper(float v)237   RandomDataGenerator& Upper(float v) {
238     upper_ = v;
239     return *this;
240   }
Device(int32_t d)241   RandomDataGenerator& Device(int32_t d) {
242     device_ = d;
243     return *this;
244   }
Seed(int32_t s)245   RandomDataGenerator& Seed(int32_t s) {
246     seed_ = s;
247     lcg_.Seed(seed_);
248     return *this;
249   }
Bins(size_t b)250   RandomDataGenerator& Bins(size_t b) {
251     bins_ = b;
252     return *this;
253   }
254 
255   void GenerateDense(HostDeviceVector<float>* out) const;
256 
257   std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;
258 
259   /*!
260    * \brief Generate batches of array interface stored in consecutive memory.
261    *
262    * \param storage The consecutive momory used to store the arrays.
263    * \param batches Number of batches.
264    *
265    * \return A vector storing JSON string representation of interface for each batch, and
266    *         a single JSON string representing the consecutive memory as a whole
267    *         (combining all the batches).
268    */
269   std::pair<std::vector<std::string>, std::string>
270   GenerateArrayInterfaceBatch(HostDeviceVector<float> *storage,
271                               size_t batches) const;
272 
273   std::string GenerateColumnarArrayInterface(
274       std::vector<HostDeviceVector<float>> *data) const;
275 
276   void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
277                    HostDeviceVector<bst_feature_t>* columns) const;
278 
279   std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false,
280                                            bool float_label = true,
281                                            size_t classes = 1) const;
282 #if defined(XGBOOST_USE_CUDA)
283   std::shared_ptr<DMatrix> GenerateDeviceDMatrix(bool with_label = false,
284                                                  bool float_label = true,
285                                                  size_t classes = 1);
286 #endif
287 };
288 
/*!
 * \brief Generate one column of random category codes.
 *
 * Every entry is drawn from a uniform integer distribution over
 * [0, num_categories - 1] using a std::mt19937 seeded with 0.  Afterwards the
 * leading entries are overwritten with 0, 1, 2, ... so that each category is
 * present; when the column is shorter than `num_categories`, only as many
 * categories as fit are written (the column is never indexed out of bounds).
 *
 * \param n              Number of entries in the column; it is converted to
 *                       the vector's size type, so callers are expected to
 *                       pass a non-negative value.
 * \param num_categories Number of distinct categories.
 *
 * \return The generated column, categories stored as float.
 */
inline std::vector<float>
GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
  std::vector<float> x(n);
  std::mt19937 rng(0);
  std::uniform_int_distribution<size_t> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(), [&]() { return dist(rng); });
  // Make sure each category is present, as far as the column length allows.
  size_t const n_forced = std::min(x.size(), num_categories);
  for (size_t i = 0; i < n_forced; ++i) {
    x[i] = static_cast<float>(i);
  }
  return x;
}
301 
302 std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
303                                             int num_rows, int num_columns);
304 
305 std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");
306 
307 /**
308  * \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
309  *                                                            size_t page_size);
310  *
311  * \brief Creates dmatrix with some records, each record containing random number of
312  *        features in [1, n_cols]
313  *
314  * \param n_rows      Number of records to create.
315  * \param n_cols      Max number of features within that record.
316  * \param page_size   Sparse page size for the pages within the dmatrix. If page size is 0
317  *                    then the entire dmatrix is resident in memory; else, multiple sparse pages
318  *                    of page size are created and backed to disk, which would have to be
319  *                    streamed in at point of use.
320  * \param deterministic The content inside the dmatrix is constant for this configuration, if true;
321  *                      else, the content changes every time this method is invoked
322  *
323  * \return The new dmatrix.
324  */
325 std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
326     size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
327     const dmlc::TemporaryDirectory& tempdir = dmlc::TemporaryDirectory());
328 
329 gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, size_t n_classes = 1);
330 
331 std::unique_ptr<GradientBooster> CreateTrainedGBM(
332     std::string name, Args kwargs, size_t kRows, size_t kCols,
333     LearnerModelParam const* learner_model_param,
334     GenericParameter const* generic_param);
335 
CreateEmptyGenericParam(int gpu_id)336 inline GenericParameter CreateEmptyGenericParam(int gpu_id) {
337   xgboost::GenericParameter tparam;
338   std::vector<std::pair<std::string, std::string>> args {
339     {"gpu_id", std::to_string(gpu_id)}};
340   tparam.Init(args);
341   return tparam;
342 }
343 
344 inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows,
345                                                               float lower= 0.0f, float upper = 1.0f) {
346   xgboost::SimpleLCG gen;
347   xgboost::SimpleRealUniformDistribution<bst_float> dist(lower, upper);
348   std::vector<GradientPair> h_gpair(n_rows);
349   for (auto &gpair : h_gpair) {
350     bst_float grad = dist(&gen);
351     bst_float hess = dist(&gen);
352     gpair = GradientPair(grad, hess);
353   }
354   HostDeviceVector<GradientPair> gpair(h_gpair);
355   return gpair;
356 }
357 
358 typedef void *DMatrixHandle;  // NOLINT(*);
359 
360 class ArrayIterForTest {
361  protected:
362   HostDeviceVector<float> data_;
363   size_t iter_ {0};
364   DMatrixHandle proxy_;
365   std::unique_ptr<RandomDataGenerator> rng_;
366 
367   std::vector<std::string> batches_;
368   std::string interface_;
369   size_t rows_;
370   size_t cols_;
371   size_t n_batches_;
372 
373  public:
374   size_t static constexpr kRows { 1000 };
375   size_t static constexpr kBatches { 100 };
376   size_t static constexpr kCols { 13 };
377 
AsArray()378   std::string AsArray() const {
379     return interface_;
380   }
381 
382   virtual int Next();
Reset()383   virtual void Reset() {
384     iter_ = 0;
385   }
Iter()386   size_t Iter() const { return iter_; }
387   auto Proxy() -> decltype(proxy_) { return proxy_; }
388 
389   explicit ArrayIterForTest(float sparsity, size_t rows = kRows,
390                             size_t cols = kCols, size_t batches = kBatches);
391   virtual ~ArrayIterForTest();
392 };
393 
394 class CudaArrayIterForTest : public ArrayIterForTest {
395  public:
396   size_t static constexpr kRows{1000};
397   size_t static constexpr kBatches{100};
398   size_t static constexpr kCols{13};
399 
400   explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
401                                 size_t cols = kCols, size_t batches = kBatches);
402   int Next() override;
403   ~CudaArrayIterForTest() override = default;
404 };
405 
406 void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
407                   std::vector<size_t> *p_row_ptr,
408                   std::vector<bst_feature_t> *p_cids);
409 
410 typedef void *DataIterHandle;  // NOLINT(*)
411 
Reset(DataIterHandle self)412 inline void Reset(DataIterHandle self) {
413   static_cast<ArrayIterForTest*>(self)->Reset();
414 }
415 
Next(DataIterHandle self)416 inline int Next(DataIterHandle self) {
417   return static_cast<ArrayIterForTest*>(self)->Next();
418 }
419 
// RMMAllocator is only forward declared here; it is handled through a
// unique_ptr whose deleter is a plain function pointer.
class RMMAllocator;
using RMMAllocatorPtr =
    std::unique_ptr<RMMAllocator, void (*)(RMMAllocator*)>;

// Receives the program's command line arguments and returns the owning pointer
// (declaration only).
RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
423 
424 }  // namespace xgboost
425 #endif
426