1 /*******************************************************************************
2 * Copyright 2018-2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16 
17 #ifndef COMMON_MEMORY_TRACKING_HPP
18 #define COMMON_MEMORY_TRACKING_HPP
19 
20 #include <assert.h>
21 #include <unordered_map>
22 
23 #include "memory_debug.hpp"
24 #include "memory_storage.hpp"
25 #include "nstl.hpp"
26 #include "utils.hpp"
27 
28 namespace dnnl {
29 namespace impl {
30 
31 struct exec_ctx_t;
32 
33 namespace memory_tracking {
34 
35 /* Memory tracking capabilities
36  *
37  * The main purpose of this header file is to provide uniform way to register
38  * required memory for a scratchpad at a primitive descriptor creation time
39  * and then easily access it having only the base address of the scratchpad.
40  *
41  * Primitives might contain multiple disjoint parts that require temporary
42  * buffers (known as scratchpad) during their execution. A primitive descriptor
43  * should summarize all the needs into one single number -- the buffer size
44  * that would be requested from a user. At execution time, the corresponding
45  * primitive will receive a base pointer to a scratchpad. It then needs to
46  * provide each part of algorithm the corresponding piece of memory. Three main
47  * challenges here are:
48  * 1. Track correct offset (from the base scratchpad address) for each piece
49  * 2. An algorithm might require different memory pieces to be aligned, so
50  *    the scratchpad size is no longer just the sum of the sizes of the
51  *    corresponding subparts.
52  * 3. While a primitive is responsible for its scratchpad, the implementation
53  *    might use some other basic blocks (e.g. cpu_reducer) that also require
54  *    scratchpad memory. So there should be a simple way of passing the
55  *    information back and forth between the main algorithm (a primitive) and
56  *    auxiliary stuff that lives completely separately from it (e.g. reducer).
57  *
58  * To address these challenges this header file provides 3 structures:
59  * 1. registry_t  -- the class that stores the information about requested
60  *                   memory. The information includes required size and desired
61  *                   alignment for each piece. This class is also responsible
62  *                   for computing the right offset to a given piece using the
63  *                   base pointer.
64  *                   This class is basically a ledger with all entries.
65  *                   Lives in primitive descriptors.
66  *
67  * 2. registrar_t -- the interface to a registry_t to book memory. Used at
68  *                   primitive descriptor creation time only. Contains a
69  *                   reference to the corresponding *mutable* registry.
70  *                   Always modifiable.
71  *                   Allows chaining (using prefixes).
72  *
73  * 3. grantor_t   -- the interface to a registry_t to access memory. Used at
74  *                   primitive execution time only. Contains a reference to
75  *                   the corresponding *constant* registry and base pointer.
76  *                   Always constant.
77  *                   Allows chaining (using prefixes).
78  *
79  * Both registrar_t and grantor_t allow chaining with extra prefix provided.
80  * The feature is useful when a primitive offloads a part of computations to
81  * some other primitives which require their own scratchpad space
82  * (e.g. reducer). Prefixes are used to avoid key collision in cases when
83  * multiple sub-primitives (e.g. multiple reducers) are used.
84  *
85  * A short example below demonstrates how to use aforementioned classes. In it
86  * the main primitive is convolution that uses scratchpad for keeping padded
87  * bias. It also needs a reducer, that needs its own space as well.
88  *
89  *  ``` c++
90  *  struct reducer_t {
91  *      static void init(registrar_t &scratchpad) {
92  *          // reserve space for 980*1024 floats (one page aligned)
93  *          scratchpad.book<float>(key_reducer_space, 980 * 1024, 4096);
94  *      }
95  *
96  *      void exec(const grantor_t &scratchpad) {
97  *          // get the pointer to preserved space. scratchpad came from
98  *          // upper primitive (convolution in this example)
99  *          auto space = scratchpad.get<float>(key_reducer_space);
100  *
101  *          space[:] += ...;
102  *      }
103  *  };
104  *
105  *  struct conv_t {
106  *      struct pd_t {
107  *          void init() {
108  *              registrar_t scratchpad(scratchpad_registry_);
109  *
110  *              // reserve space for 128 elements which are two bytes long that
111  *              // require 4 byte alignment, but preferably have 64 byte
112  *              // alignment for performance reasons
113  *              // two alignment parameters are included for implementation
114  *              // flexibility targeted at memory debugging purposes
115  *              scratchpad.book(key_conv_padded_bias, 128, 2, 4, 64);
116  *
117  *              // create a proxy registrar for the reducer. All entries made
118  *              // by reducer would live in convolution's registry, but would
119  *              // have their own `prefix`, so no interference with conv's
120  *              // buffers.
121  *              registrar_t reducer_scratchpad(scratchpad, prefix_reducer);
122  *
123  *              reducer_t::init(reducer_scratchpad);
124  *          }
125  *
126  *          registry_t scratchpad_registry_;
127  *      }
128  *
129  *      void exec() {
130  *          // get the base pointer to a scratchpad memory from a user
131  *          void *scratchpad_ptr = this->input(DNNL_MEM_SCRATCHPAD);
132  *
133  *          // create a grantor to the scratchpad (and provide the base
134  *          // pointer).
135  *          grantor_t scratchpad(pd()->scratchpad_registry_, scratchpad_ptr);
136  *
137  *          // access the padded_bias (need only key name and the grantor)
138  *          auto padded_bias = scratchpad.get<float>(key_conv_padded_bias);
139  *
140  *          // to give the `right` grantor to reducer we need to add the
141  *          // corresponding prefix, so that reducer would be able to access
142  *          // its keys. The call is very similar to the one in pd_t::init
143  *          // with only difference in types: grantor_t vs registrar_t.
144  *          grantor_t reducer_scratchpad(scratchpad, prefix_reducer);
145  *          reducer->exec(reducer_scratchpad);
146  *      }
147  *  };
148  *  ```
149  */
150 
151 /* namespace with common keys and prefixes */
152 namespace names {
153 enum {
154     key_none = 0,
155     key_barrier,
156     key_bnorm_bf16cvt,
157     key_bnorm_tmp_mean,
158     key_bnorm_tmp_var,
159     key_bnorm_tmp_diff_ss,
160     key_bnorm_tmp_stats,
161     key_bnorm_reduction,
162     key_brgemm_primitive_batch,
163     key_brgemm_primitive_buffer,
164     key_brgemm_primitive_buffer_a,
165     key_brgemm_primitive_buffer_b,
166     key_brgemm_primitive_buffer_comp,
167     key_brgemm_primitive_zp_comp_a,
168     key_brgemm_primitive_zp_comp_b,
169     key_concat_iptrs,
170     key_concat_istrides,
171     key_concat_nelems,
172     key_concat_optrs,
173     key_concat_tent_dst,
174     key_conv_adjusted_scales,
175     key_conv_amx_inp_buffer,
176     key_conv_amx_tilecfg,
177     key_conv_amx_tile_buffer,
178     key_conv_amx_wei_buffer,
179     key_conv_amx_wsp_buffer,
180     key_conv_bia_reduction,
181     key_conv_bias_bf16_convert_wsp,
182     key_conv_cudnn,
183     key_conv_cudnn_algo,
184     key_conv_cudnn_filter,
185     key_conv_cudnn_temp,
186     key_conv_dst_bf16_convert_wsp,
187     key_conv_brgemm_addr_a,
188     key_conv_brgemm_addr_b,
189     key_conv_brgemm_batch,
190     key_conv_brgemm_buffer,
191     key_conv_brgemm_inp_buffer,
192     key_conv_brgemm_inp_buffer_mask,
193     key_conv_bwd_w_1st_bia_reorder,
194     key_conv_bwd_w_1st_wei_reorder,
195     key_conv_gemm_acc,
196     key_conv_gemm_col,
197     key_conv_gemm_imtr,
198     key_conv_gemm_zp_src_comp,
199     key_conv_int_dat_in_acc_dt,
200     key_conv_padded_bias,
201     key_conv_rtus_space,
202     key_conv_store_wsp,
203     key_conv_tails,
204     key_conv_tr_diff_dst,
205     key_conv_tr_diff_dst_bctx,
206     key_conv_tr_src,
207     key_conv_tr_src_bctx,
208     key_conv_wei_reduction,
209     key_conv_wei_bia_reduction,
210     key_conv_wei_bia_reduction_bctx,
211     key_conv_zero_point_flag,
212     key_conv_zero_point_pad,
213     key_deconv_bias,
214     key_deconv_sum,
215     key_deconv_zp,
216     key_eltwise_diff_dst,
217     key_eltwise_src,
218     key_fusion_forward_scratchpad,
219     key_fusion_inout_buffer,
220     key_gemm_int_c_in_acc_dt,
221     key_gemm_tmp_buffer,
222     key_gemm_flag,
223     key_iprod_bias_bf16_convert_wsp,
224     key_iprod_dst_bf16_convert_wsp,
225     key_iprod_dst_reorder,
226     key_iprod_int_dat_in_acc_dt,
227     key_lnorm_inv_sqrtvar,
228     key_lnorm_tmp_mean,
229     key_lnorm_tmp_var,
230     key_lnorm_tmp_diff_ss,
231     key_lnorm_reduction,
232     key_matmul_dst_in_acc_dt,
233     key_pool_dst_bf16cvt,
234     key_pool_dst_plain2blocked_cvt,
235     key_pool_ind_plain2blocked_cvt,
236     key_pool_src_bf16cvt,
237     key_pool_src_plain2blocked_cvt,
238     key_prelu_reduction,
239     key_reducer_space,
240     key_reducer_space_bctx,
241     key_reduction,
242     key_reorder_cross_space,
243     key_reorder_space,
244     key_reorder_scales,
245     key_reorder_wino_plain,
246     key_reorder_wino_transform_space,
247     key_reorder_rnn_space,
248     key_reorder_rnn_weights_bf16_cvt,
249     key_reorder_rnn_weights_quantization,
250     key_reorder_rnn_weights_reduction,
251     key_reorder_rnn_weights_transposition,
252     key_rnn_space,
253     key_rnn_cell,
254     key_rnn_gates,
255     key_rnn_gates_blocked,
256     key_rnn_src_layer_trans,
257     key_rnn_src_iter_trans,
258     key_rnn_ht,
259     key_rnn_diff_ht,
260     key_rnn_ptrs_bia,
261     key_rnn_ptrs_wei_layer,
262     key_rnn_ptrs_wei_iter,
263     key_rnn_ptrs_wei_projection,
264     key_softmax_reduction,
265     key_sum_reduction,
266     key_sum_srcs_cvt,
267     key_wino_U,
268     key_wino_V,
269     key_wino_M,
270     // These two keys should always be the last ones,
271     // even though they are not in alphabetical order
272     key_nested,
273     key_nested_multiple,
274 };
275 
276 enum {
277     prefix_none = 0,
278     prefix_fusion,
279     prefix_reducer_bia,
280     prefix_reducer_wei,
281 };
282 } // namespace names
283 
284 // level 0: 00 00 00 xxx
285 // level 1: 00 00 aa xxx
286 // level 2: 00 aa bb xxx
287 // level 3: aa bb cc xxx
288 // max # of levels: 3 + 1 (base_level)
289 // here:
290 //      xxx        : [1 ..    MAX_KEY) : key
291 //      aa, bb, cc : [1 .. MAX_PREFIX) : prefixes for levels 1, 2, and 3
292 
293 using key_t = uint32_t;
294 enum {
295     MAX_KEY = (1u << 10),
296     MAX_PREFIX = (1u << 7),
297 };
298 
299 /// generates global key based on a prefix and a local key
make_key(key_t prefix,key_t key)300 inline key_t make_key(key_t prefix, key_t key) {
301     return prefix + key;
302 }
303 
304 /// generates global prefix based on the global parent and the local ones
make_prefix(key_t parent_prefix,key_t prefix)305 inline key_t make_prefix(key_t parent_prefix, key_t prefix) {
306     return MAX_PREFIX * parent_prefix + MAX_KEY * prefix;
307 }
308 
309 struct registrar_t;
310 struct grantor_t;
311 
312 enum { default_alignment = 128 };
get_alignment(size_t alignment)313 inline size_t get_alignment(size_t alignment) {
314     size_t minimal_alignment
315             = memory_debug::is_mem_debug() ? getpagesize() : default_alignment;
316     return nstl::max<size_t>(alignment, minimal_alignment);
317 }
318 
buffer_protect_size()319 inline size_t buffer_protect_size() {
320     return memory_debug::is_mem_debug()
321             ? memory_debug::protect_size() + getpagesize()
322             : 0;
323 }
324 
325 struct registry_t {
326     struct entry_t {
327         size_t offset, size, capacity, alignment;
328 
329         // apply offset and alignment + check memory_debug (host/cpu only)
330         const void *compute_ptr(const void *base_ptr) const;
331     };
332 
333     // perf_align is the desired alignment for performance.
334     // data_align is the minimum data alignment required for functionality,
335     //    this parameter is included for memory debugging purposes.
bookdnnl::impl::memory_tracking::registry_t336     void book(const key_t &key, size_t size, size_t data_align,
337             size_t perf_align = default_alignment) {
338         if (size == 0) return;
339         assert(offset_map_.count(key) == 0);
340         size_t alignment = memory_debug::is_mem_debug()
341                 ? data_align
342                 : nstl::max(data_align, perf_align);
343 
344         if (memory_debug::is_mem_debug() && size_ == 0)
345             size_ += get_alignment(alignment) + buffer_protect_size();
346 
347         assert(alignment > 0 && (alignment & (alignment - 1)) == 0);
348         size_t capacity
349                 = size + get_alignment(alignment) + buffer_protect_size();
350         offset_map_[key] = entry_t {size_, size, capacity, alignment};
351 
352         size_ += capacity;
353     }
354 
getdnnl::impl::memory_tracking::registry_t355     entry_t get(const key_t &key) const {
356         if (size() == 0 || offset_map_.count(key) != 1)
357             return entry_t {0, 0, 0, 0};
358         return offset_map_.at(key);
359     }
360 
sizednnl::impl::memory_tracking::registry_t361     size_t size() const { return size_; }
362 
363     registrar_t registrar();
364     grantor_t grantor(const memory_storage_t *mem_storage,
365             const exec_ctx_t &exec_ctx) const;
366 
367     template <typename return_type>
368     class common_iterator_t {
369     private:
370         const void *base_ptr;
371         std::unordered_map<key_t, entry_t>::const_iterator iter;
372 
373     public:
common_iterator_t(const void * base_ptr_,const std::unordered_map<key_t,entry_t> & map,bool is_begin=true)374         common_iterator_t(const void *base_ptr_,
375                 const std::unordered_map<key_t, entry_t> &map,
376                 bool is_begin = true) {
377             base_ptr = base_ptr_;
378             if (is_begin) {
379                 iter = map.cbegin();
380             } else {
381                 iter = map.cend();
382             }
383         }
operator ++(int)384         common_iterator_t &operator++(int) {
385             iter++;
386             return *this;
387         }
operator ==(const common_iterator_t & rhs) const388         bool operator==(const common_iterator_t &rhs) const {
389             return iter == rhs.iter;
390         }
operator !=(const common_iterator_t & rhs) const391         bool operator!=(const common_iterator_t &rhs) const {
392             return iter != rhs.iter;
393         }
operator *() const394         std::pair<return_type, size_t> operator*() const {
395             const entry_t &entry = iter->second;
396             const void *ptr_start = entry.compute_ptr(base_ptr);
397             return std::pair<return_type, size_t> {
398                     (return_type)ptr_start, entry.size};
399         }
400     };
401     typedef common_iterator_t<void *> iterator;
402     typedef common_iterator_t<const void *> const_iterator;
begindnnl::impl::memory_tracking::registry_t403     iterator begin(void *base_ptr_) const {
404         return iterator(base_ptr_, offset_map_);
405     }
enddnnl::impl::memory_tracking::registry_t406     iterator end(void *base_ptr_) const {
407         return iterator(base_ptr_, offset_map_, false);
408     }
cbegindnnl::impl::memory_tracking::registry_t409     const_iterator cbegin(const void *base_ptr_) const {
410         return const_iterator(base_ptr_, offset_map_);
411     }
cenddnnl::impl::memory_tracking::registry_t412     const_iterator cend(const void *base_ptr_) const {
413         return const_iterator(base_ptr_, offset_map_, false);
414     }
415 
416 protected:
417     std::unordered_map<key_t, entry_t> offset_map_;
418     size_t size_ = 0;
419 };
420 
421 struct registrar_t {
registrar_tdnnl::impl::memory_tracking::registrar_t422     registrar_t(registry_t &registry) : registry_(registry), prefix_(0) {}
registrar_tdnnl::impl::memory_tracking::registrar_t423     registrar_t(registrar_t &parent, const key_t &prefix)
424         : registry_(parent.registry_)
425         , prefix_(make_prefix(parent.prefix_, prefix)) {}
426 
bookdnnl::impl::memory_tracking::registrar_t427     void book(const key_t &key, size_t nelems, size_t data_size,
428             size_t data_align = 0, size_t perf_align = default_alignment) {
429         if (data_align == 0) data_align = data_size;
430         registry_.book(make_key(prefix_, key), nelems * data_size, data_align,
431                 perf_align);
432     }
433     template <typename T>
bookdnnl::impl::memory_tracking::registrar_t434     void book(const key_t &key, size_t nelems,
435             size_t perf_align = default_alignment) {
436         registry_.book(make_key(prefix_, key), nelems * sizeof(T), alignof(T),
437                 perf_align);
438     }
439 
bookdnnl::impl::memory_tracking::registrar_t440     void book(const key_t &key, const registry_t &registry,
441             size_t perf_align = default_alignment) {
442         registry_.book(make_key(prefix_, key), registry.size(), 1, perf_align);
443     }
444 
sizednnl::impl::memory_tracking::registrar_t445     size_t size() const { return registry_.size(); }
446 
447 protected:
448     registry_t &registry_;
449     const key_t prefix_;
450 };
451 
452 struct grantor_t {
grantor_tdnnl::impl::memory_tracking::grantor_t453     grantor_t(const registry_t &registry,
454             const memory_storage_t *base_mem_storage,
455             const exec_ctx_t &exec_ctx)
456         : registry_(registry)
457         , prefix_(0)
458         , base_mem_storage_(base_mem_storage)
459         , exec_ctx_(&exec_ctx) {}
grantor_tdnnl::impl::memory_tracking::grantor_t460     grantor_t(const grantor_t &parent, const key_t &prefix)
461         : registry_(parent.registry_)
462         , prefix_(make_prefix(parent.prefix_, prefix))
463         , base_mem_storage_(parent.base_mem_storage_)
464         , exec_ctx_(parent.exec_ctx_) {}
465 
466     template <typename T = void>
getdnnl::impl::memory_tracking::grantor_t467     T *get(const key_t &key) const {
468         if (!base_mem_storage_) {
469             assert(registry_.size() == 0);
470             return nullptr;
471         }
472         auto e = registry_.get(make_key(prefix_, key));
473         if (e.size == 0) return nullptr;
474 
475         char *host_storage_ptr = get_host_storage_ptr(base_mem_storage_);
476         char *base_ptr = host_storage_ptr + base_mem_storage_->base_offset();
477         return (T *)e.compute_ptr(base_ptr);
478     }
479 
get_memory_storagednnl::impl::memory_tracking::grantor_t480     std::unique_ptr<memory_storage_t> get_memory_storage(
481             const key_t &key) const {
482         if (!base_mem_storage_) {
483             assert(registry_.size() == 0);
484             return nullptr;
485         }
486         auto e = registry_.get(make_key(prefix_, key));
487         if (e.size == 0) return nullptr;
488 
489         if (is_cpu_engine(base_mem_storage_)) {
490             char *host_storage_ptr = get_host_storage_ptr(base_mem_storage_);
491             char *base_ptr
492                     = host_storage_ptr + base_mem_storage_->base_offset();
493             char *aligned_ptr = (char *)e.compute_ptr(base_ptr);
494             size_t aligned_offset = size_t(aligned_ptr - host_storage_ptr);
495             return base_mem_storage_->get_sub_storage(aligned_offset, e.size);
496         }
497 
498         const size_t aligned_offset
499                 = reinterpret_cast<size_t>(utils::align_ptr<char>(
500                         reinterpret_cast<char *>(e.offset), e.alignment));
501         assert(aligned_offset + e.size <= registry_.size());
502         return base_mem_storage_->get_sub_storage(aligned_offset, e.size);
503     }
504 
get_base_storagednnl::impl::memory_tracking::grantor_t505     const memory_storage_t *get_base_storage() const {
506         return base_mem_storage_;
507     }
get_registrydnnl::impl::memory_tracking::grantor_t508     const registry_t &get_registry() const { return registry_; }
509 
510 protected:
511     const registry_t &registry_;
512     const key_t prefix_;
513     const memory_storage_t *base_mem_storage_;
514     const exec_ctx_t *exec_ctx_;
515 
516 private:
517     char *get_host_storage_ptr(const memory_storage_t *storage) const;
518     bool is_cpu_engine(const memory_storage_t *mem_storage) const;
519 };
520 
registrar()521 inline registrar_t registry_t::registrar() {
522     return registrar_t(*this);
523 }
grantor(const memory_storage_t * mem_storage,const exec_ctx_t & exec_ctx) const524 inline grantor_t registry_t::grantor(
525         const memory_storage_t *mem_storage, const exec_ctx_t &exec_ctx) const {
526     return grantor_t(*this, mem_storage, exec_ctx);
527 }
528 
529 } // namespace memory_tracking
530 } // namespace impl
531 } // namespace dnnl
532 
533 #endif
534