1 /*!
2  * Copyright 2017 by Contributors
3  * \file column_matrix.h
4  * \brief Utility for fast column-wise access
5  * \author Philip Cho
6  */
7 
8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
10 
#include <algorithm>
#include <limits>
#include <memory>
#include <vector>

#include "hist_util.h"
#include "../data/gradient_index.h"
16 
17 namespace xgboost {
18 namespace common {
19 
20 class ColumnMatrix;
/*! \brief storage type of a single column */
enum ColumnType {
  kDenseColumn = 0,   // one slot per row; missing values tracked via flags
  kSparseColumn = 1   // one slot per nonzero; row ids stored alongside
};
26 
27 /*! \brief a column storage, to be used with ApplySplit. Note that each
28     bin id is stored as index[i] + index_base.
29     Different types of column index for each column allow
30     to reduce the memory usage. */
31 template <typename BinIdxType>
32 class Column {
33  public:
34   static constexpr int32_t kMissingId = -1;
35 
Column(ColumnType type,common::Span<const BinIdxType> index,const uint32_t index_base)36   Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
37       : type_(type),
38         index_(index),
39         index_base_(index_base) {}
40 
41   virtual ~Column() = default;
42 
GetGlobalBinIdx(size_t idx)43   uint32_t GetGlobalBinIdx(size_t idx) const {
44     return index_base_ + static_cast<uint32_t>(index_[idx]);
45   }
46 
GetFeatureBinIdx(size_t idx)47   BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
48 
GetBaseIdx()49   uint32_t GetBaseIdx() const { return index_base_; }
50 
GetFeatureBinIdxPtr()51   common::Span<const BinIdxType> GetFeatureBinIdxPtr() const { return index_; }
52 
GetType()53   ColumnType GetType() const { return type_; }
54 
55   /* returns number of elements in column */
Size()56   size_t Size() const { return index_.size(); }
57 
58  private:
59   /* type of column */
60   ColumnType type_;
61   /* bin indexes in range [0, max_bins - 1] */
62   common::Span<const BinIdxType> index_;
63   /* bin index offset for specific feature */
64   const uint32_t index_base_;
65 };
66 
67 template <typename BinIdxType>
68 class SparseColumn: public Column<BinIdxType> {
69  public:
SparseColumn(ColumnType type,common::Span<const BinIdxType> index,uint32_t index_base,common::Span<const size_t> row_ind)70   SparseColumn(ColumnType type, common::Span<const BinIdxType> index,
71               uint32_t index_base, common::Span<const size_t> row_ind)
72       : Column<BinIdxType>(type, index, index_base),
73         row_ind_(row_ind) {}
74 
GetRowData()75   const size_t* GetRowData() const { return row_ind_.data(); }
76 
GetBinIdx(size_t rid,size_t * state)77   int32_t GetBinIdx(size_t rid, size_t* state) const {
78     const size_t column_size = this->Size();
79     if (!((*state) < column_size)) {
80       return this->kMissingId;
81     }
82     while ((*state) < column_size && GetRowIdx(*state) < rid) {
83       ++(*state);
84     }
85     if (((*state) < column_size) && GetRowIdx(*state) == rid) {
86       return this->GetGlobalBinIdx(*state);
87     } else {
88       return this->kMissingId;
89     }
90   }
91 
GetInitialState(const size_t first_row_id)92   size_t GetInitialState(const size_t first_row_id) const {
93     const size_t* row_data = GetRowData();
94     const size_t column_size = this->Size();
95     // search first nonzero row with index >= rid_span.front()
96     const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id);
97     // column_size if all messing
98     return p - row_data;
99   }
100 
GetRowIdx(size_t idx)101   size_t GetRowIdx(size_t idx) const {
102     return row_ind_.data()[idx];
103   }
104 
105  private:
106   /* indexes of rows */
107   common::Span<const size_t> row_ind_;
108 };
109 
110 template <typename BinIdxType, bool any_missing>
111 class DenseColumn: public Column<BinIdxType> {
112  public:
DenseColumn(ColumnType type,common::Span<const BinIdxType> index,uint32_t index_base,const std::vector<bool> & missing_flags,size_t feature_offset)113   DenseColumn(ColumnType type, common::Span<const BinIdxType> index,
114               uint32_t index_base, const std::vector<bool>& missing_flags,
115               size_t feature_offset)
116       : Column<BinIdxType>(type, index, index_base),
117         missing_flags_(missing_flags),
118         feature_offset_(feature_offset) {}
IsMissing(size_t idx)119   bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
120 
GetBinIdx(size_t idx,size_t * state)121   int32_t GetBinIdx(size_t idx, size_t* state) const {
122     if (any_missing) {
123       return IsMissing(idx) ? this->kMissingId : this->GetGlobalBinIdx(idx);
124     } else {
125       return this->GetGlobalBinIdx(idx);
126     }
127   }
128 
GetInitialState(const size_t first_row_id)129   size_t GetInitialState(const size_t first_row_id) const {
130     return 0;
131   }
132 
133  private:
134   /* flags for missing values in dense columns */
135   const std::vector<bool>& missing_flags_;
136   size_t feature_offset_;
137 };
138 
139 /*! \brief a collection of columns, with support for construction from
140     GHistIndexMatrix. */
141 class ColumnMatrix {
142  public:
143   // get number of features
GetNumFeature()144   inline bst_uint GetNumFeature() const {
145     return static_cast<bst_uint>(type_.size());
146   }
147 
148   // construct column matrix from GHistIndexMatrix
Init(const GHistIndexMatrix & gmat,double sparse_threshold)149   inline void Init(const GHistIndexMatrix& gmat,
150                    double  sparse_threshold) {
151     const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
152     const size_t nrow = gmat.row_ptr.size() - 1;
153     // identify type of each column
154     feature_counts_.resize(nfeature);
155     type_.resize(nfeature);
156     std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
157     uint32_t max_val = std::numeric_limits<uint32_t>::max();
158     for (int32_t fid = 0; fid < nfeature; ++fid) {
159       CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
160     }
161     bool all_dense = gmat.IsDense();
162     gmat.GetFeatureCounts(&feature_counts_[0]);
163     // classify features
164     for (int32_t fid = 0; fid < nfeature; ++fid) {
165       if (static_cast<double>(feature_counts_[fid])
166                  < sparse_threshold * nrow) {
167         type_[fid] = kSparseColumn;
168         all_dense = false;
169       } else {
170         type_[fid] = kDenseColumn;
171       }
172     }
173 
174     // want to compute storage boundary for each feature
175     // using variants of prefix sum scan
176     feature_offsets_.resize(nfeature + 1);
177     size_t accum_index_ = 0;
178     feature_offsets_[0] = accum_index_;
179     for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
180       if (type_[fid - 1] == kDenseColumn) {
181         accum_index_ += static_cast<size_t>(nrow);
182       } else {
183         accum_index_ += feature_counts_[fid - 1];
184       }
185       feature_offsets_[fid] = accum_index_;
186     }
187 
188     SetTypeSize(gmat.max_num_bins);
189 
190     index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
191     if (!all_dense) {
192       row_ind_.resize(feature_offsets_[nfeature]);
193     }
194 
195     // store least bin id for each feature
196     index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
197 
198     const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
199     any_missing_ = !noMissingValues;
200 
201     if (noMissingValues) {
202       missing_flags_.resize(feature_offsets_[nfeature], false);
203     } else {
204       missing_flags_.resize(feature_offsets_[nfeature], true);
205     }
206 
207     // pre-fill index_ for dense columns
208     if (all_dense) {
209       BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
210       if (gmat_bin_size == kUint8BinsTypeSize) {
211           SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
212       } else if (gmat_bin_size == kUint16BinsTypeSize) {
213           SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
214       } else {
215           CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize);
216           SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
217       }
218     /* For sparse DMatrix gmat.index.getBinTypeSize() returns always kUint32BinsTypeSize
219        but for ColumnMatrix we still have a chance to reduce the memory consumption */
220     } else {
221       if (bins_type_size_ == kUint8BinsTypeSize) {
222           SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
223       } else if (bins_type_size_ == kUint16BinsTypeSize) {
224           SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
225       } else {
226           CHECK_EQ(bins_type_size_, kUint32BinsTypeSize);
227           SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
228       }
229     }
230   }
231 
232   /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
SetTypeSize(size_t max_num_bins)233   void SetTypeSize(size_t max_num_bins) {
234     if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
235       bins_type_size_ = kUint8BinsTypeSize;
236     } else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
237       bins_type_size_ = kUint16BinsTypeSize;
238     } else {
239       bins_type_size_ = kUint32BinsTypeSize;
240     }
241   }
242 
243   /* Fetch an individual column. This code should be used with type swith
244      to determine type of bin id's */
245   template <typename BinIdxType, bool any_missing>
GetColumn(unsigned fid)246   std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
247     CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
248 
249     const size_t feature_offset = feature_offsets_[fid];  // to get right place for certain feature
250     const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
251     common::Span<const BinIdxType> bin_index = { reinterpret_cast<const BinIdxType*>(
252                                                  &index_[feature_offset * bins_type_size_]),
253                                                  column_size };
254     std::unique_ptr<const Column<BinIdxType> > res;
255     if (type_[fid] == ColumnType::kDenseColumn) {
256       CHECK_EQ(any_missing, any_missing_);
257       res.reset(new DenseColumn<BinIdxType, any_missing>(type_[fid], bin_index, index_base_[fid],
258                                              missing_flags_, feature_offset));
259     } else {
260       res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
261                                        {&row_ind_[feature_offset], column_size}));
262     }
263     return res;
264   }
265 
266   template <typename T>
SetIndexAllDense(T * index,const GHistIndexMatrix & gmat,const size_t nrow,const size_t nfeature,const bool noMissingValues)267   inline void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat,
268                                const size_t nrow, const size_t nfeature,
269                                const bool noMissingValues) {
270     T* local_index = reinterpret_cast<T*>(&index_[0]);
271 
272     /* missing values make sense only for column with type kDenseColumn,
273        and if no missing values were observed it could be handled much faster. */
274     if (noMissingValues) {
275       ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) {
276         const size_t ibegin = rid*nfeature;
277         const size_t iend = (rid+1)*nfeature;
278         size_t j = 0;
279         for (size_t i = ibegin; i < iend; ++i, ++j) {
280             const size_t idx = feature_offsets_[j];
281             local_index[idx + rid] = index[i];
282         }
283       });
284     } else {
285       /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
286       size_t rbegin = 0;
287       for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
288         const xgboost::Entry* data_ptr = batch.data.HostVector().data();
289         const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
290         const size_t batch_size = batch.Size();
291         CHECK_LT(batch_size, offset_vec.size());
292         for (size_t rid = 0; rid < batch_size; ++rid) {
293           const size_t size = offset_vec[rid + 1] - offset_vec[rid];
294           SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
295           const size_t ibegin = gmat.row_ptr[rbegin + rid];
296           const size_t iend = gmat.row_ptr[rbegin + rid + 1];
297           CHECK_EQ(ibegin + inst.size(), iend);
298           size_t j = 0;
299           size_t fid = 0;
300           for (size_t i = ibegin; i < iend; ++i, ++j) {
301               fid = inst[j].index;
302               const size_t idx = feature_offsets_[fid];
303               /* rbegin allows to store indexes from specific SparsePage batch */
304               local_index[idx + rbegin + rid] = index[i];
305               missing_flags_[idx + rbegin + rid] = false;
306           }
307         }
308         rbegin += batch.Size();
309       }
310     }
311   }
312 
313   template<typename T>
SetIndex(uint32_t * index,const GHistIndexMatrix & gmat,const size_t nfeature)314   inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat,
315                        const size_t nfeature) {
316     std::vector<size_t> num_nonzeros;
317     num_nonzeros.resize(nfeature);
318     std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
319 
320     T* local_index = reinterpret_cast<T*>(&index_[0]);
321     size_t rbegin = 0;
322     for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
323       const xgboost::Entry* data_ptr = batch.data.HostVector().data();
324       const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
325       const size_t batch_size = batch.Size();
326       CHECK_LT(batch_size, offset_vec.size());
327       for (size_t rid = 0; rid < batch_size; ++rid) {
328         const size_t ibegin = gmat.row_ptr[rbegin + rid];
329         const size_t iend = gmat.row_ptr[rbegin + rid + 1];
330         size_t fid = 0;
331         const size_t size = offset_vec[rid + 1] - offset_vec[rid];
332         SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
333 
334         CHECK_EQ(ibegin + inst.size(), iend);
335         size_t j = 0;
336         for (size_t i = ibegin; i < iend; ++i, ++j) {
337           const uint32_t bin_id = index[i];
338 
339           fid = inst[j].index;
340           if (type_[fid] == kDenseColumn) {
341             T* begin = &local_index[feature_offsets_[fid]];
342             begin[rid + rbegin] = bin_id - index_base_[fid];
343             missing_flags_[feature_offsets_[fid] + rid + rbegin] = false;
344           } else {
345             T* begin = &local_index[feature_offsets_[fid]];
346             begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
347             row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
348             ++num_nonzeros[fid];
349           }
350         }
351       }
352       rbegin += batch.Size();
353     }
354   }
GetTypeSize()355   BinTypeSize GetTypeSize() const {
356     return bins_type_size_;
357   }
358 
359   // This is just an utility function
NoMissingValues(const size_t n_elements,const size_t n_row,const size_t n_features)360   bool NoMissingValues(const size_t n_elements,
361                              const size_t n_row, const size_t n_features) {
362     return n_elements == n_features * n_row;
363   }
364 
365   // And this returns part of state
AnyMissing()366   bool AnyMissing() const {
367     return any_missing_;
368   }
369 
370  private:
371   std::vector<uint8_t> index_;
372 
373   std::vector<size_t> feature_counts_;
374   std::vector<ColumnType> type_;
375   std::vector<size_t> row_ind_;
376   /* indicate where each column's index and row_ind is stored. */
377   std::vector<size_t> feature_offsets_;
378 
379   // index_base_[fid]: least bin id for feature fid
380   uint32_t* index_base_;
381   std::vector<bool> missing_flags_;
382   BinTypeSize bins_type_size_;
383   bool any_missing_;
384 };
385 
386 }  // namespace common
387 }  // namespace xgboost
388 #endif  // XGBOOST_COMMON_COLUMN_MATRIX_H_
389