1 /*!
2  * Copyright 2021 by XGBoost Contributors
3  */
4 #ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
5 #define XGBOOST_TREE_HIST_HISTOGRAM_H_
6 
7 #include <algorithm>
8 #include <limits>
9 #include <vector>
10 
11 #include "rabit/rabit.h"
12 #include "xgboost/tree_model.h"
13 #include "../../common/hist_util.h"
14 
15 namespace xgboost {
16 namespace tree {
17 template <typename GradientSumT, typename ExpandEntry> class HistogramBuilder {
18   using GradientPairT = xgboost::detail::GradientPairInternal<GradientSumT>;
19   using GHistRowT = common::GHistRow<GradientSumT>;
20 
21   /*! \brief culmulative histogram of gradients. */
22   common::HistCollection<GradientSumT> hist_;
23   /*! \brief culmulative local parent histogram of gradients. */
24   common::HistCollection<GradientSumT> hist_local_worker_;
25   common::GHistBuilder<GradientSumT> builder_;
26   common::ParallelGHistBuilder<GradientSumT> buffer_;
27   rabit::Reducer<GradientPairT, GradientPairT::Reduce> reducer_;
28   int32_t max_bin_ {-1};
29   int32_t n_threads_ {-1};
30   // Whether XGBoost is running in distributed environment.
31   bool is_distributed_ {false};
32 
33  public:
34   /**
35    * \param total_bins       Total number of bins across all features
36    * \param max_bin_per_feat Maximum number of bins per feature, same as the `max_bin`
37    *                         training parameter.
38    * \param n_threads        Number of threads.
39    * \param is_distributed   Mostly used for testing to allow injecting parameters instead
40    *                         of using global rabit variable.
41    */
42   void Reset(uint32_t total_bins, int32_t max_bin_per_feat, int32_t n_threads,
43              bool is_distributed = rabit::IsDistributed()) {
44     CHECK_GE(n_threads, 1);
45     n_threads_ = n_threads;
46     CHECK_GE(max_bin_per_feat, 2);
47     max_bin_ = max_bin_per_feat;
48     hist_.Init(total_bins);
49     hist_local_worker_.Init(total_bins);
50     buffer_.Init(total_bins);
51     builder_ = common::GHistBuilder<GradientSumT>(n_threads, total_bins);
52     is_distributed_ = is_distributed;
53   }
54 
55   template <bool any_missing>
56   void
BuildLocalHistograms(DMatrix * p_fmat,std::vector<ExpandEntry> nodes_for_explicit_hist_build,common::RowSetCollection const & row_set_collection,const std::vector<GradientPair> & gpair_h)57   BuildLocalHistograms(DMatrix *p_fmat,
58                        std::vector<ExpandEntry> nodes_for_explicit_hist_build,
59                        common::RowSetCollection const &row_set_collection,
60                        const std::vector<GradientPair> &gpair_h) {
61     const size_t n_nodes = nodes_for_explicit_hist_build.size();
62 
63     // create space of size (# rows in each node)
64     common::BlockedSpace2d space(
65         n_nodes,
66         [&](size_t node) {
67           const int32_t nid = nodes_for_explicit_hist_build[node].nid;
68           return row_set_collection[nid].Size();
69         },
70         256);
71 
72     std::vector<GHistRowT> target_hists(n_nodes);
73     for (size_t i = 0; i < n_nodes; ++i) {
74       const int32_t nid = nodes_for_explicit_hist_build[i].nid;
75       target_hists[i] = hist_[nid];
76     }
77     buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
78 
79     // Parallel processing by nodes and data in each node
80     for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(
81              BatchParam{GenericParameter::kCpuId, max_bin_})) {
82       common::ParallelFor2d(
83           space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) {
84             const auto tid = static_cast<unsigned>(omp_get_thread_num());
85             const int32_t nid = nodes_for_explicit_hist_build[nid_in_set].nid;
86 
87             auto start_of_row_set = row_set_collection[nid].begin;
88             auto rid_set = common::RowSetCollection::Elem(
89                 start_of_row_set + r.begin(), start_of_row_set + r.end(), nid);
90             builder_.template BuildHist<any_missing>(
91                 gpair_h, rid_set, gmat,
92                 buffer_.GetInitializedHist(tid, nid_in_set));
93           });
94     }
95   }
96 
97   void
AddHistRows(int * starting_index,int * sync_count,std::vector<ExpandEntry> const & nodes_for_explicit_hist_build,std::vector<ExpandEntry> const & nodes_for_subtraction_trick,RegTree * p_tree)98   AddHistRows(int *starting_index, int *sync_count,
99               std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
100               std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
101               RegTree *p_tree) {
102     if (is_distributed_) {
103       this->AddHistRowsDistributed(starting_index, sync_count,
104                                    nodes_for_explicit_hist_build,
105                                    nodes_for_subtraction_trick, p_tree);
106     } else {
107       this->AddHistRowsLocal(starting_index, sync_count,
108                              nodes_for_explicit_hist_build,
109                              nodes_for_subtraction_trick);
110     }
111   }
112 
113   /* Main entry point of this class, build histogram for tree nodes. */
BuildHist(DMatrix * p_fmat,RegTree * p_tree,common::RowSetCollection const & row_set_collection,std::vector<ExpandEntry> const & nodes_for_explicit_hist_build,std::vector<ExpandEntry> const & nodes_for_subtraction_trick,std::vector<GradientPair> const & gpair)114   void BuildHist(DMatrix *p_fmat, RegTree *p_tree,
115                  common::RowSetCollection const &row_set_collection,
116                  std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
117                  std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
118                  std::vector<GradientPair> const &gpair) {
119     int starting_index = std::numeric_limits<int>::max();
120     int sync_count = 0;
121     this->AddHistRows(&starting_index, &sync_count,
122                       nodes_for_explicit_hist_build,
123                       nodes_for_subtraction_trick, p_tree);
124     if (p_fmat->IsDense()) {
125       BuildLocalHistograms<false>(p_fmat, nodes_for_explicit_hist_build,
126                                   row_set_collection, gpair);
127     } else {
128       BuildLocalHistograms<true>(p_fmat, nodes_for_explicit_hist_build,
129                                  row_set_collection, gpair);
130     }
131     if (is_distributed_) {
132       this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
133                                      nodes_for_subtraction_trick,
134                                      starting_index, sync_count);
135     } else {
136       this->SyncHistogramLocal(p_tree, nodes_for_explicit_hist_build,
137                                nodes_for_subtraction_trick, starting_index,
138                                sync_count);
139     }
140   }
141 
SyncHistogramDistributed(RegTree * p_tree,std::vector<ExpandEntry> const & nodes_for_explicit_hist_build,std::vector<ExpandEntry> const & nodes_for_subtraction_trick,int starting_index,int sync_count)142   void SyncHistogramDistributed(
143       RegTree *p_tree,
144       std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
145       std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
146       int starting_index, int sync_count) {
147     const size_t nbins = builder_.GetNumBins();
148     common::BlockedSpace2d space(
149         nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; },
150         1024);
151     common::ParallelFor2d(
152         space, n_threads_, [&](size_t node, common::Range1d r) {
153           const auto &entry = nodes_for_explicit_hist_build[node];
154           auto this_hist = this->hist_[entry.nid];
155           // Merging histograms from each thread into once
156           buffer_.ReduceHist(node, r.begin(), r.end());
157           // Store posible parent node
158           auto this_local = hist_local_worker_[entry.nid];
159           common::CopyHist(this_local, this_hist, r.begin(), r.end());
160 
161           if (!(*p_tree)[entry.nid].IsRoot()) {
162             const size_t parent_id = (*p_tree)[entry.nid].Parent();
163             const int subtraction_node_id =
164                 nodes_for_subtraction_trick[node].nid;
165             auto parent_hist = this->hist_local_worker_[parent_id];
166             auto sibling_hist = this->hist_[subtraction_node_id];
167             common::SubtractionHist(sibling_hist, parent_hist, this_hist,
168                                     r.begin(), r.end());
169             // Store posible parent node
170             auto sibling_local = hist_local_worker_[subtraction_node_id];
171             common::CopyHist(sibling_local, sibling_hist, r.begin(), r.end());
172           }
173         });
174 
175     reducer_.Allreduce(this->hist_[starting_index].data(),
176                        builder_.GetNumBins() * sync_count);
177 
178     ParallelSubtractionHist(space, nodes_for_explicit_hist_build,
179                             nodes_for_subtraction_trick, p_tree);
180 
181     common::BlockedSpace2d space2(
182         nodes_for_subtraction_trick.size(), [&](size_t) { return nbins; },
183         1024);
184     ParallelSubtractionHist(space2, nodes_for_subtraction_trick,
185                             nodes_for_explicit_hist_build, p_tree);
186   }
187 
SyncHistogramLocal(RegTree * p_tree,std::vector<ExpandEntry> const & nodes_for_explicit_hist_build,std::vector<ExpandEntry> const & nodes_for_subtraction_trick,int starting_index,int sync_count)188   void SyncHistogramLocal(
189       RegTree *p_tree,
190       std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
191       std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
192       int starting_index, int sync_count) {
193     const size_t nbins = this->builder_.GetNumBins();
194     common::BlockedSpace2d space(
195         nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; },
196         1024);
197 
198     common::ParallelFor2d(
199         space, this->n_threads_, [&](size_t node, common::Range1d r) {
200           const auto &entry = nodes_for_explicit_hist_build[node];
201           auto this_hist = this->hist_[entry.nid];
202           // Merging histograms from each thread into once
203           this->buffer_.ReduceHist(node, r.begin(), r.end());
204 
205           if (!(*p_tree)[entry.nid].IsRoot()) {
206             const size_t parent_id = (*p_tree)[entry.nid].Parent();
207             const int subtraction_node_id =
208                 nodes_for_subtraction_trick[node].nid;
209             auto parent_hist = this->hist_[parent_id];
210             auto sibling_hist = this->hist_[subtraction_node_id];
211             common::SubtractionHist(sibling_hist, parent_hist, this_hist,
212                                     r.begin(), r.end());
213           }
214         });
215   }
216 
217  public:
218   /* Getters for tests. */
Histogram()219   common::HistCollection<GradientSumT> const& Histogram() {
220     return hist_;
221   }
Buffer()222   auto& Buffer() { return buffer_; }
223 
224  private:
225   void
ParallelSubtractionHist(const common::BlockedSpace2d & space,const std::vector<ExpandEntry> & nodes,const std::vector<ExpandEntry> & subtraction_nodes,const RegTree * p_tree)226   ParallelSubtractionHist(const common::BlockedSpace2d &space,
227                           const std::vector<ExpandEntry> &nodes,
228                           const std::vector<ExpandEntry> &subtraction_nodes,
229                           const RegTree *p_tree) {
230     common::ParallelFor2d(
231         space, this->n_threads_, [&](size_t node, common::Range1d r) {
232           const auto &entry = nodes[node];
233           if (!((*p_tree)[entry.nid].IsLeftChild())) {
234             auto this_hist = this->hist_[entry.nid];
235 
236             if (!(*p_tree)[entry.nid].IsRoot()) {
237               const int subtraction_node_id = subtraction_nodes[node].nid;
238               auto parent_hist = hist_[(*p_tree)[entry.nid].Parent()];
239               auto sibling_hist = hist_[subtraction_node_id];
240               common::SubtractionHist(this_hist, parent_hist, sibling_hist,
241                                       r.begin(), r.end());
242             }
243           }
244         });
245   }
246 
247   // Add a tree node to histogram buffer in local training environment.
AddHistRowsLocal(int * starting_index,int * sync_count,std::vector<ExpandEntry> const & nodes_for_explicit_hist_build,std::vector<ExpandEntry> const & nodes_for_subtraction_trick)248   void AddHistRowsLocal(
249       int *starting_index, int *sync_count,
250       std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
251       std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
252     for (auto const &entry : nodes_for_explicit_hist_build) {
253       int nid = entry.nid;
254       this->hist_.AddHistRow(nid);
255       (*starting_index) = std::min(nid, (*starting_index));
256     }
257     (*sync_count) = nodes_for_explicit_hist_build.size();
258 
259     for (auto const &node : nodes_for_subtraction_trick) {
260       this->hist_.AddHistRow(node.nid);
261     }
262     this->hist_.AllocateAllData();
263   }
264 
AddHistRowsDistributed(int * starting_index,int * sync_count,std::vector<ExpandEntry> const & nodes_for_explicit_hist_build,std::vector<ExpandEntry> const & nodes_for_subtraction_trick,RegTree * p_tree)265   void AddHistRowsDistributed(
266       int *starting_index, int *sync_count,
267       std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
268       std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
269       RegTree *p_tree) {
270     const size_t explicit_size = nodes_for_explicit_hist_build.size();
271     const size_t subtaction_size = nodes_for_subtraction_trick.size();
272     std::vector<int> merged_node_ids(explicit_size + subtaction_size);
273     for (size_t i = 0; i < explicit_size; ++i) {
274       merged_node_ids[i] = nodes_for_explicit_hist_build[i].nid;
275     }
276     for (size_t i = 0; i < subtaction_size; ++i) {
277       merged_node_ids[explicit_size + i] = nodes_for_subtraction_trick[i].nid;
278     }
279     std::sort(merged_node_ids.begin(), merged_node_ids.end());
280     int n_left = 0;
281     for (auto const &nid : merged_node_ids) {
282       if ((*p_tree)[nid].IsLeftChild()) {
283         this->hist_.AddHistRow(nid);
284         (*starting_index) = std::min(nid, (*starting_index));
285         n_left++;
286         this->hist_local_worker_.AddHistRow(nid);
287       }
288     }
289     for (auto const &nid : merged_node_ids) {
290       if (!((*p_tree)[nid].IsLeftChild())) {
291         this->hist_.AddHistRow(nid);
292         this->hist_local_worker_.AddHistRow(nid);
293       }
294     }
295     this->hist_.AllocateAllData();
296     this->hist_local_worker_.AllocateAllData();
297     (*sync_count) = std::max(1, n_left);
298   }
299 };
300 }      // namespace tree
301 }      // namespace xgboost
302 #endif  // XGBOOST_TREE_HIST_HISTOGRAM_H_
303