1 /*! 2 * Copyright 2017 by Contributors 3 * \file column_matrix.h 4 * \brief Utility for fast column-wise access 5 * \author Philip Cho 6 */ 7 8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_ 9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_ 10 11 #include <limits> 12 #include <vector> 13 #include <memory> 14 #include "hist_util.h" 15 #include "../data/gradient_index.h" 16 17 namespace xgboost { 18 namespace common { 19 20 class ColumnMatrix; 21 /*! \brief column type */ 22 enum ColumnType { 23 kDenseColumn, 24 kSparseColumn 25 }; 26 27 /*! \brief a column storage, to be used with ApplySplit. Note that each 28 bin id is stored as index[i] + index_base. 29 Different types of column index for each column allow 30 to reduce the memory usage. */ 31 template <typename BinIdxType> 32 class Column { 33 public: 34 static constexpr int32_t kMissingId = -1; 35 Column(ColumnType type,common::Span<const BinIdxType> index,const uint32_t index_base)36 Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base) 37 : type_(type), 38 index_(index), 39 index_base_(index_base) {} 40 41 virtual ~Column() = default; 42 GetGlobalBinIdx(size_t idx)43 uint32_t GetGlobalBinIdx(size_t idx) const { 44 return index_base_ + static_cast<uint32_t>(index_[idx]); 45 } 46 GetFeatureBinIdx(size_t idx)47 BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; } 48 GetBaseIdx()49 uint32_t GetBaseIdx() const { return index_base_; } 50 GetFeatureBinIdxPtr()51 common::Span<const BinIdxType> GetFeatureBinIdxPtr() const { return index_; } 52 GetType()53 ColumnType GetType() const { return type_; } 54 55 /* returns number of elements in column */ Size()56 size_t Size() const { return index_.size(); } 57 58 private: 59 /* type of column */ 60 ColumnType type_; 61 /* bin indexes in range [0, max_bins - 1] */ 62 common::Span<const BinIdxType> index_; 63 /* bin index offset for specific feature */ 64 const uint32_t index_base_; 65 }; 66 67 template <typename BinIdxType> 68 class SparseColumn: public Column<BinIdxType> { 69 public: SparseColumn(ColumnType type,common::Span<const BinIdxType> index,uint32_t index_base,common::Span<const size_t> row_ind)70 SparseColumn(ColumnType type, common::Span<const BinIdxType> index, 71 uint32_t index_base, common::Span<const size_t> row_ind) 72 : Column<BinIdxType>(type, index, index_base), 73 row_ind_(row_ind) {} 74 GetRowData()75 const size_t* GetRowData() const { return row_ind_.data(); } 76 GetBinIdx(size_t rid,size_t * state)77 int32_t GetBinIdx(size_t rid, size_t* state) const { 78 const size_t column_size = this->Size(); 79 if (!((*state) < column_size)) { 80 return this->kMissingId; 81 } 82 while ((*state) < column_size && GetRowIdx(*state) < rid) { 83 ++(*state); 84 } 85 if (((*state) < column_size) && GetRowIdx(*state) == rid) { 86 return this->GetGlobalBinIdx(*state); 87 } else { 88 return this->kMissingId; 89 } 90 } 91 GetInitialState(const size_t first_row_id)92 size_t GetInitialState(const size_t first_row_id) const { 93 const size_t* row_data = GetRowData(); 94 const size_t column_size = this->Size(); 95 // search first nonzero row with index >= rid_span.front() 96 const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id); 97 // column_size if all messing 98 return p - row_data; 99 } 100 GetRowIdx(size_t idx)101 size_t GetRowIdx(size_t idx) const { 102 return row_ind_.data()[idx]; 103 } 104 105 private: 106 /* indexes of rows */ 107 common::Span<const size_t> row_ind_; 108 }; 109 110 template <typename BinIdxType, bool any_missing> 111 class DenseColumn: public Column<BinIdxType> { 112 public: DenseColumn(ColumnType type,common::Span<const BinIdxType> index,uint32_t index_base,const std::vector<bool> & missing_flags,size_t feature_offset)113 DenseColumn(ColumnType type, common::Span<const BinIdxType> index, 114 uint32_t index_base, const std::vector<bool>& missing_flags, 115 size_t feature_offset) 116 : Column<BinIdxType>(type, index, index_base), 117 missing_flags_(missing_flags), 118 feature_offset_(feature_offset) {} IsMissing(size_t idx)119 bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; } 120 GetBinIdx(size_t idx,size_t * state)121 int32_t GetBinIdx(size_t idx, size_t* state) const { 122 if (any_missing) { 123 return IsMissing(idx) ? this->kMissingId : this->GetGlobalBinIdx(idx); 124 } else { 125 return this->GetGlobalBinIdx(idx); 126 } 127 } 128 GetInitialState(const size_t first_row_id)129 size_t GetInitialState(const size_t first_row_id) const { 130 return 0; 131 } 132 133 private: 134 /* flags for missing values in dense columns */ 135 const std::vector<bool>& missing_flags_; 136 size_t feature_offset_; 137 }; 138 139 /*! \brief a collection of columns, with support for construction from 140 GHistIndexMatrix. */ 141 class ColumnMatrix { 142 public: 143 // get number of features GetNumFeature()144 inline bst_uint GetNumFeature() const { 145 return static_cast<bst_uint>(type_.size()); 146 } 147 148 // construct column matrix from GHistIndexMatrix Init(const GHistIndexMatrix & gmat,double sparse_threshold)149 inline void Init(const GHistIndexMatrix& gmat, 150 double sparse_threshold) { 151 const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1); 152 const size_t nrow = gmat.row_ptr.size() - 1; 153 // identify type of each column 154 feature_counts_.resize(nfeature); 155 type_.resize(nfeature); 156 std::fill(feature_counts_.begin(), feature_counts_.end(), 0); 157 uint32_t max_val = std::numeric_limits<uint32_t>::max(); 158 for (int32_t fid = 0; fid < nfeature; ++fid) { 159 CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val); 160 } 161 bool all_dense = gmat.IsDense(); 162 gmat.GetFeatureCounts(&feature_counts_[0]); 163 // classify features 164 for (int32_t fid = 0; fid < nfeature; ++fid) { 165 if (static_cast<double>(feature_counts_[fid]) 166 < sparse_threshold * nrow) { 167 type_[fid] = kSparseColumn; 168 all_dense = false; 169 } else { 170 type_[fid] = kDenseColumn; 171 } 172 } 173 174 // want to compute storage boundary for each feature 175 // using variants of prefix sum scan 176 feature_offsets_.resize(nfeature + 1); 177 size_t accum_index_ = 0; 178 feature_offsets_[0] = accum_index_; 179 for (int32_t fid = 1; fid < nfeature + 1; ++fid) { 180 if (type_[fid - 1] == kDenseColumn) { 181 accum_index_ += static_cast<size_t>(nrow); 182 } else { 183 accum_index_ += feature_counts_[fid - 1]; 184 } 185 feature_offsets_[fid] = accum_index_; 186 } 187 188 SetTypeSize(gmat.max_num_bins); 189 190 index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0); 191 if (!all_dense) { 192 row_ind_.resize(feature_offsets_[nfeature]); 193 } 194 195 // store least bin id for each feature 196 index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data()); 197 198 const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature); 199 any_missing_ = !noMissingValues; 200 201 if (noMissingValues) { 202 missing_flags_.resize(feature_offsets_[nfeature], false); 203 } else { 204 missing_flags_.resize(feature_offsets_[nfeature], true); 205 } 206 207 // pre-fill index_ for dense columns 208 if (all_dense) { 209 BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize(); 210 if (gmat_bin_size == kUint8BinsTypeSize) { 211 SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues); 212 } else if (gmat_bin_size == kUint16BinsTypeSize) { 213 SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues); 214 } else { 215 CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize); 216 SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues); 217 } 218 /* For sparse DMatrix gmat.index.getBinTypeSize() returns always kUint32BinsTypeSize 219 but for ColumnMatrix we still have a chance to reduce the memory consumption */ 220 } else { 221 if (bins_type_size_ == kUint8BinsTypeSize) { 222 SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nfeature); 223 } else if (bins_type_size_ == kUint16BinsTypeSize) { 224 SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nfeature); 225 } else { 226 CHECK_EQ(bins_type_size_, kUint32BinsTypeSize); 227 SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nfeature); 228 } 229 } 230 } 231 232 /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */ SetTypeSize(size_t max_num_bins)233 void SetTypeSize(size_t max_num_bins) { 234 if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) { 235 bins_type_size_ = kUint8BinsTypeSize; 236 } else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) { 237 bins_type_size_ = kUint16BinsTypeSize; 238 } else { 239 bins_type_size_ = kUint32BinsTypeSize; 240 } 241 } 242 243 /* Fetch an individual column. This code should be used with type swith 244 to determine type of bin id's */ 245 template <typename BinIdxType, bool any_missing> GetColumn(unsigned fid)246 std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const { 247 CHECK_EQ(sizeof(BinIdxType), bins_type_size_); 248 249 const size_t feature_offset = feature_offsets_[fid]; // to get right place for certain feature 250 const size_t column_size = feature_offsets_[fid + 1] - feature_offset; 251 common::Span<const BinIdxType> bin_index = { reinterpret_cast<const BinIdxType*>( 252 &index_[feature_offset * bins_type_size_]), 253 column_size }; 254 std::unique_ptr<const Column<BinIdxType> > res; 255 if (type_[fid] == ColumnType::kDenseColumn) { 256 CHECK_EQ(any_missing, any_missing_); 257 res.reset(new DenseColumn<BinIdxType, any_missing>(type_[fid], bin_index, index_base_[fid], 258 missing_flags_, feature_offset)); 259 } else { 260 res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid], 261 {&row_ind_[feature_offset], column_size})); 262 } 263 return res; 264 } 265 266 template <typename T> SetIndexAllDense(T * index,const GHistIndexMatrix & gmat,const size_t nrow,const size_t nfeature,const bool noMissingValues)267 inline void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat, 268 const size_t nrow, const size_t nfeature, 269 const bool noMissingValues) { 270 T* local_index = reinterpret_cast<T*>(&index_[0]); 271 272 /* missing values make sense only for column with type kDenseColumn, 273 and if no missing values were observed it could be handled much faster. */ 274 if (noMissingValues) { 275 ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) { 276 const size_t ibegin = rid*nfeature; 277 const size_t iend = (rid+1)*nfeature; 278 size_t j = 0; 279 for (size_t i = ibegin; i < iend; ++i, ++j) { 280 const size_t idx = feature_offsets_[j]; 281 local_index[idx + rid] = index[i]; 282 } 283 }); 284 } else { 285 /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */ 286 size_t rbegin = 0; 287 for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) { 288 const xgboost::Entry* data_ptr = batch.data.HostVector().data(); 289 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector(); 290 const size_t batch_size = batch.Size(); 291 CHECK_LT(batch_size, offset_vec.size()); 292 for (size_t rid = 0; rid < batch_size; ++rid) { 293 const size_t size = offset_vec[rid + 1] - offset_vec[rid]; 294 SparsePage::Inst inst = {data_ptr + offset_vec[rid], size}; 295 const size_t ibegin = gmat.row_ptr[rbegin + rid]; 296 const size_t iend = gmat.row_ptr[rbegin + rid + 1]; 297 CHECK_EQ(ibegin + inst.size(), iend); 298 size_t j = 0; 299 size_t fid = 0; 300 for (size_t i = ibegin; i < iend; ++i, ++j) { 301 fid = inst[j].index; 302 const size_t idx = feature_offsets_[fid]; 303 /* rbegin allows to store indexes from specific SparsePage batch */ 304 local_index[idx + rbegin + rid] = index[i]; 305 missing_flags_[idx + rbegin + rid] = false; 306 } 307 } 308 rbegin += batch.Size(); 309 } 310 } 311 } 312 313 template<typename T> SetIndex(uint32_t * index,const GHistIndexMatrix & gmat,const size_t nfeature)314 inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat, 315 const size_t nfeature) { 316 std::vector<size_t> num_nonzeros; 317 num_nonzeros.resize(nfeature); 318 std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0); 319 320 T* local_index = reinterpret_cast<T*>(&index_[0]); 321 size_t rbegin = 0; 322 for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) { 323 const xgboost::Entry* data_ptr = batch.data.HostVector().data(); 324 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector(); 325 const size_t batch_size = batch.Size(); 326 CHECK_LT(batch_size, offset_vec.size()); 327 for (size_t rid = 0; rid < batch_size; ++rid) { 328 const size_t ibegin = gmat.row_ptr[rbegin + rid]; 329 const size_t iend = gmat.row_ptr[rbegin + rid + 1]; 330 size_t fid = 0; 331 const size_t size = offset_vec[rid + 1] - offset_vec[rid]; 332 SparsePage::Inst inst = {data_ptr + offset_vec[rid], size}; 333 334 CHECK_EQ(ibegin + inst.size(), iend); 335 size_t j = 0; 336 for (size_t i = ibegin; i < iend; ++i, ++j) { 337 const uint32_t bin_id = index[i]; 338 339 fid = inst[j].index; 340 if (type_[fid] == kDenseColumn) { 341 T* begin = &local_index[feature_offsets_[fid]]; 342 begin[rid + rbegin] = bin_id - index_base_[fid]; 343 missing_flags_[feature_offsets_[fid] + rid + rbegin] = false; 344 } else { 345 T* begin = &local_index[feature_offsets_[fid]]; 346 begin[num_nonzeros[fid]] = bin_id - index_base_[fid]; 347 row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin; 348 ++num_nonzeros[fid]; 349 } 350 } 351 } 352 rbegin += batch.Size(); 353 } 354 } GetTypeSize()355 BinTypeSize GetTypeSize() const { 356 return bins_type_size_; 357 } 358 359 // This is just an utility function NoMissingValues(const size_t n_elements,const size_t n_row,const size_t n_features)360 bool NoMissingValues(const size_t n_elements, 361 const size_t n_row, const size_t n_features) { 362 return n_elements == n_features * n_row; 363 } 364 365 // And this returns part of state AnyMissing()366 bool AnyMissing() const { 367 return any_missing_; 368 } 369 370 private: 371 std::vector<uint8_t> index_; 372 373 std::vector<size_t> feature_counts_; 374 std::vector<ColumnType> type_; 375 std::vector<size_t> row_ind_; 376 /* indicate where each column's index and row_ind is stored. */ 377 std::vector<size_t> feature_offsets_; 378 379 // index_base_[fid]: least bin id for feature fid 380 uint32_t* index_base_; 381 std::vector<bool> missing_flags_; 382 BinTypeSize bins_type_size_; 383 bool any_missing_; 384 }; 385 386 } // namespace common 387 } // namespace xgboost 388 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_ 389