1 /*******************************************************************************
2 * Copyright 2018-2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16
17 #ifndef COMMON_MEMORY_TRACKING_HPP
18 #define COMMON_MEMORY_TRACKING_HPP
19
20 #include <assert.h>
21 #include <unordered_map>
22
23 #include "memory_debug.hpp"
24 #include "memory_storage.hpp"
25 #include "nstl.hpp"
26 #include "utils.hpp"
27
28 namespace dnnl {
29 namespace impl {
30
31 struct exec_ctx_t;
32
33 namespace memory_tracking {
34
35 /* Memory tracking capabilities
36 *
37 * The main purpose of this header file is to provide uniform way to register
38 * required memory for a scratchpad at a primitive descriptor creation time
39 * and then easily access it having only the base address of the scratchpad.
40 *
41 * Primitives might contain multiple disjoint parts that require temporary
42 * buffers (known as scratchpad) during their execution. A primitive descriptor
43 * should summarize all the needs into one single number -- the buffer size
44 * that would be requested from a user. At execution time, the corresponding
45 * primitive will receive a base pointer to a scratchpad. It then needs to
 * provide each part of the algorithm the corresponding piece of memory. Three
 * main challenges here are:
 *
48 * 1. Track correct offset (from the base scratchpad address) for each piece
49 * 2. Algorithm might require that different memory pieces to be aligned, so
50 * the scratchpad size is no more just a sum of size of the corresponding
51 * subparts.
52 * 3. While a primitive is responsible for its scratchpad, the implementation
53 * might use some other basic blocks (e.g. cpu_reducer) that also require
54 * scratchpad memory. So there should be a simple way of passing the
 *    information back and forth between the main algorithm (a primitive) and
56 * auxiliary stuff that lives completely separately from it (e.g. reducer).
57 *
58 * To address these challenges this header file provides 3 structures:
 *  1. registry_t -- the class that stores the information about requested
60 * memory. The information includes required size and desired
61 * alignment for each piece. This class is also responsible
62 * for computing the right offset to a given piece using the
63 * base pointer.
64 * This class is basically a ledger with all entries.
65 * Lives in primitive descriptors.
66 *
67 * 2. registrar_t -- the interface to a registry_t to book memory. Used at
68 * primitive descriptor creation time only. Contains a
69 * reference to the corresponding *mutable* registry.
70 * Always modifiable.
71 * Allows chaining (using prefixes).
72 *
73 * 3. grantor_t -- the interface to a registry_t to access memory. Used at
74 * primitive execution time only. Contains a reference to
75 * the corresponding *constant* registry and base pointer.
76 * Always constant.
77 * Allows chaining (using prefixes).
78 *
79 * Both registrar_t and grantor_t allow chaining with extra prefix provided.
 * The feature is useful when a primitive offloads a part of its computations
 * to some other primitives which require their own scratchpad space
 * (e.g. reducer). Prefixes are used to avoid key collisions in cases when
 * multiple sub-primitives (e.g. multiple reducers) are used.
84 *
85 * A short example below demonstrates how to use aforementioned classes. In it
86 * the main primitive is convolution that uses scratchpad for keeping padded
87 * bias. It also needs a reducer, that needs its own space as well.
88 *
89 * ``` c++
90 * struct reducer_t {
91 * static void init(registrar_t &scratchpad) {
92 * // reserve space for 980*1024 floats (one page aligned)
 *          scratchpad.book<float>(key_reducer_space, 980 * 1024, 4096);
94 * }
95 *
96 * void exec(const grantor_t &scratchpad) {
97 * // get the pointer to preserved space. scratchpad came from
98 * // upper primitive (convolution in this example)
99 * auto space = scratchpad.get<float>(key_reducer_space);
100 *
101 * space[:] += ...;
102 * }
103 * };
104 *
105 * struct conv_t {
106 * struct pd_t {
107 * void init() {
108 * registrar_t scratchpad(scratchpad_registry_);
109 *
110 * // reserve space for 128 elements which are two bytes long that
111 * // require 4 byte alignment, but preferably have 64 byte
112 * // alignment for performance reasons
113 * // two alignment parameters are included for implementation
114 * // flexibility targeted at memory debugging purposes
115 * scratchpad.book(key_conv_padded_bias, 128, 2, 4, 64);
116 *
117 * // create a proxy registrar for the reducer All entries made
118 * // by reducer would live in convolution's registry, but would
119 * // have their own `prefix`, so no interference with conv's
120 * // buffers.
121 * registrar_t reducer_scratchpad(scratchpad, prefix_reducer);
122 *
123 * reducer_t::init(reducer_scratchpad);
124 * }
125 *
126 * registry_t scratchpad_registry_;
127 * }
128 *
129 * void exec() {
130 * // get the base pointer to a scratchpad memory from a user
131 * void *scratchpad_ptr = this->input(DNNL_MEM_SCRATCHPAD);
132 *
133 * // create a grantor to the scratchpad (and provide the base
134 * // pointer).
135 * grantor_t scratchpad(pd()->scratchpad_registry_, scratchpad_ptr);
136 *
137 * // access the padded_bias (need only key name and the grantor)
138 * auto padded_bias = scratchpad.get<float>(key_conv_padded_bias);
139 *
140 * // to give the `right` grantor to reducer we need to add the
141 * // corresponding prefix, so that reducer would be able to access
142 * // its keys. The call is very similar to the one in pd_t::init
143 * // with only difference in types: grantor_t vs registrar_t.
144 * grantor_t reducer_scratchpad(scratchpad, prefix_reducer);
145 * reducer->exec(reducer_scratchpad);
146 * }
147 * };
148 * ```
149 */
150
151 /* namespace with common keys and prefixes */
namespace names {
// Local scratchpad keys. A global key is obtained by combining one of these
// with a prefix chain (see make_key()/make_prefix() below), so the numeric
// values are positional: do NOT reorder existing enumerators, only append
// following the alphabetical convention.
enum {
    key_none = 0,
    key_barrier,
    key_bnorm_bf16cvt,
    key_bnorm_tmp_mean,
    key_bnorm_tmp_var,
    key_bnorm_tmp_diff_ss,
    key_bnorm_tmp_stats,
    key_bnorm_reduction,
    key_brgemm_primitive_batch,
    key_brgemm_primitive_buffer,
    key_brgemm_primitive_buffer_a,
    key_brgemm_primitive_buffer_b,
    key_brgemm_primitive_buffer_comp,
    key_brgemm_primitive_zp_comp_a,
    key_brgemm_primitive_zp_comp_b,
    key_concat_iptrs,
    key_concat_istrides,
    key_concat_nelems,
    key_concat_optrs,
    key_concat_tent_dst,
    key_conv_adjusted_scales,
    key_conv_amx_inp_buffer,
    key_conv_amx_tilecfg,
    key_conv_amx_tile_buffer,
    key_conv_amx_wei_buffer,
    key_conv_amx_wsp_buffer,
    key_conv_bia_reduction,
    key_conv_bias_bf16_convert_wsp,
    key_conv_cudnn,
    key_conv_cudnn_algo,
    key_conv_cudnn_filter,
    key_conv_cudnn_temp,
    key_conv_dst_bf16_convert_wsp,
    key_conv_brgemm_addr_a,
    key_conv_brgemm_addr_b,
    key_conv_brgemm_batch,
    key_conv_brgemm_buffer,
    key_conv_brgemm_inp_buffer,
    key_conv_brgemm_inp_buffer_mask,
    key_conv_bwd_w_1st_bia_reorder,
    key_conv_bwd_w_1st_wei_reorder,
    key_conv_gemm_acc,
    key_conv_gemm_col,
    key_conv_gemm_imtr,
    key_conv_gemm_zp_src_comp,
    key_conv_int_dat_in_acc_dt,
    key_conv_padded_bias,
    key_conv_rtus_space,
    key_conv_store_wsp,
    key_conv_tails,
    key_conv_tr_diff_dst,
    key_conv_tr_diff_dst_bctx,
    key_conv_tr_src,
    key_conv_tr_src_bctx,
    key_conv_wei_reduction,
    key_conv_wei_bia_reduction,
    key_conv_wei_bia_reduction_bctx,
    key_conv_zero_point_flag,
    key_conv_zero_point_pad,
    key_deconv_bias,
    key_deconv_sum,
    key_deconv_zp,
    key_eltwise_diff_dst,
    key_eltwise_src,
    key_fusion_forward_scratchpad,
    key_fusion_inout_buffer,
    key_gemm_int_c_in_acc_dt,
    key_gemm_tmp_buffer,
    key_gemm_flag,
    key_iprod_bias_bf16_convert_wsp,
    key_iprod_dst_bf16_convert_wsp,
    key_iprod_dst_reorder,
    key_iprod_int_dat_in_acc_dt,
    key_lnorm_inv_sqrtvar,
    key_lnorm_tmp_mean,
    key_lnorm_tmp_var,
    key_lnorm_tmp_diff_ss,
    key_lnorm_reduction,
    key_matmul_dst_in_acc_dt,
    key_pool_dst_bf16cvt,
    key_pool_dst_plain2blocked_cvt,
    key_pool_ind_plain2blocked_cvt,
    key_pool_src_bf16cvt,
    key_pool_src_plain2blocked_cvt,
    key_prelu_reduction,
    key_reducer_space,
    key_reducer_space_bctx,
    key_reduction,
    key_reorder_cross_space,
    key_reorder_space,
    key_reorder_scales,
    key_reorder_wino_plain,
    key_reorder_wino_transform_space,
    key_reorder_rnn_space,
    key_reorder_rnn_weights_bf16_cvt,
    key_reorder_rnn_weights_quantization,
    key_reorder_rnn_weights_reduction,
    key_reorder_rnn_weights_transposition,
    key_rnn_space,
    key_rnn_cell,
    key_rnn_gates,
    key_rnn_gates_blocked,
    key_rnn_src_layer_trans,
    key_rnn_src_iter_trans,
    key_rnn_ht,
    key_rnn_diff_ht,
    key_rnn_ptrs_bia,
    key_rnn_ptrs_wei_layer,
    key_rnn_ptrs_wei_iter,
    key_rnn_ptrs_wei_projection,
    key_softmax_reduction,
    key_sum_reduction,
    key_sum_srcs_cvt,
    key_wino_U,
    key_wino_V,
    key_wino_M,
    // These two keys should always be the last ones,
    // even though they are not in alphabetical order
    key_nested,
    key_nested_multiple,
};

// Prefixes that disambiguate bookings made by nested/auxiliary primitives
// (e.g. reducers) that live inside a parent primitive's registry.
enum {
    prefix_none = 0,
    prefix_fusion,
    prefix_reducer_bia,
    prefix_reducer_wei,
};
} // namespace names
283
// A global key packs up to three prefix levels on top of a local key:
//   level 0:  00 00 00 xxx
//   level 1:  00 00 aa xxx
//   level 2:  00 aa bb xxx
//   level 3:  aa bb cc xxx
// where
//   xxx        : [1 .. MAX_KEY)    : local key
//   aa, bb, cc : [1 .. MAX_PREFIX) : prefixes for levels 1, 2, and 3
// (maximum number of levels: 3 + 1 base level)

using key_t = uint32_t;
enum {
    MAX_KEY = (1u << 10),
    MAX_PREFIX = (1u << 7),
};

/// Combines an already-shifted global prefix with a local key.
inline key_t make_key(key_t prefix, key_t key) {
    const key_t global_key = prefix + key;
    return global_key;
}

/// Nests a local prefix under the parent's (already combined) global prefix.
inline key_t make_prefix(key_t parent_prefix, key_t prefix) {
    return parent_prefix * MAX_PREFIX + prefix * MAX_KEY;
}
308
309 struct registrar_t;
310 struct grantor_t;
311
312 enum { default_alignment = 128 };
get_alignment(size_t alignment)313 inline size_t get_alignment(size_t alignment) {
314 size_t minimal_alignment
315 = memory_debug::is_mem_debug() ? getpagesize() : default_alignment;
316 return nstl::max<size_t>(alignment, minimal_alignment);
317 }
318
buffer_protect_size()319 inline size_t buffer_protect_size() {
320 return memory_debug::is_mem_debug()
321 ? memory_debug::protect_size() + getpagesize()
322 : 0;
323 }
324
// Ledger of all scratchpad bookings. Lives in primitive descriptors; see the
// file-level comment for the overall design.
struct registry_t {
    // One booking:
    //   offset    -- byte offset of the (unaligned) slot from the scratchpad
    //                base; alignment is applied at run time by compute_ptr()
    //   size      -- usable size requested by the caller
    //   capacity  -- total bytes reserved: size + alignment slack + guard
    //                area (the guard is non-zero only under memory debugging)
    //   alignment -- required alignment (power of two)
    struct entry_t {
        size_t offset, size, capacity, alignment;

        // apply offset and alignment + check memory_debug (host/cpu only)
        const void *compute_ptr(const void *base_ptr) const;
    };

    // perf_align is the desired alignment for performance.
    // data_align is the minimum data alignment required for functionality,
    // this parameter is included for memory debugging purposes.
    void book(const key_t &key, size_t size, size_t data_align,
            size_t perf_align = default_alignment) {
        if (size == 0) return; // zero-sized bookings are silently dropped
        assert(offset_map_.count(key) == 0); // each key may be booked once
        // Under memory debugging only the functional alignment is honored so
        // the usable region sits as close to the guard area as possible.
        size_t alignment = memory_debug::is_mem_debug()
                ? data_align
                : nstl::max(data_align, perf_align);

        // The very first booking in debug mode also reserves a leading
        // alignment + guard area in front of all entries.
        if (memory_debug::is_mem_debug() && size_ == 0)
            size_ += get_alignment(alignment) + buffer_protect_size();

        assert(alignment > 0 && (alignment & (alignment - 1)) == 0); // pow2
        // Slack for run-time pointer alignment plus the trailing guard
        // (guard is zero outside of memory-debug builds).
        size_t capacity
                = size + get_alignment(alignment) + buffer_protect_size();
        offset_map_[key] = entry_t {size_, size, capacity, alignment};

        size_ += capacity;
    }

    // Returns the entry booked under `key`, or an all-zero entry when the
    // key is unknown or nothing was booked (callers test entry.size == 0).
    entry_t get(const key_t &key) const {
        if (size() == 0 || offset_map_.count(key) != 1)
            return entry_t {0, 0, 0, 0};
        return offset_map_.at(key);
    }

    // Total scratchpad size in bytes to request from the user.
    size_t size() const { return size_; }

    registrar_t registrar();
    grantor_t grantor(const memory_storage_t *mem_storage,
            const exec_ctx_t &exec_ctx) const;

    // Iterates over all bookings as {resolved pointer, size} pairs relative
    // to a caller-supplied base pointer. Iteration order is unspecified
    // (backed by std::unordered_map).
    template <typename return_type>
    class common_iterator_t {
    private:
        const void *base_ptr;
        std::unordered_map<key_t, entry_t>::const_iterator iter;

    public:
        common_iterator_t(const void *base_ptr_,
                const std::unordered_map<key_t, entry_t> &map,
                bool is_begin = true) {
            base_ptr = base_ptr_;
            if (is_begin) {
                iter = map.cbegin();
            } else {
                iter = map.cend();
            }
        }
        // NOTE(review): post-increment signature with pre-increment
        // semantics -- it returns *this after advancing rather than a copy
        // of the previous state. Safe only where the result is discarded.
        common_iterator_t &operator++(int) {
            iter++;
            return *this;
        }
        bool operator==(const common_iterator_t &rhs) const {
            return iter == rhs.iter;
        }
        bool operator!=(const common_iterator_t &rhs) const {
            return iter != rhs.iter;
        }
        // Dereferences to {aligned pointer into the scratchpad, entry size}.
        std::pair<return_type, size_t> operator*() const {
            const entry_t &entry = iter->second;
            const void *ptr_start = entry.compute_ptr(base_ptr);
            return std::pair<return_type, size_t> {
                    (return_type)ptr_start, entry.size};
        }
    };
    typedef common_iterator_t<void *> iterator;
    typedef common_iterator_t<const void *> const_iterator;
    iterator begin(void *base_ptr_) const {
        return iterator(base_ptr_, offset_map_);
    }
    iterator end(void *base_ptr_) const {
        return iterator(base_ptr_, offset_map_, false);
    }
    const_iterator cbegin(const void *base_ptr_) const {
        return const_iterator(base_ptr_, offset_map_);
    }
    const_iterator cend(const void *base_ptr_) const {
        return const_iterator(base_ptr_, offset_map_, false);
    }

protected:
    std::unordered_map<key_t, entry_t> offset_map_;
    size_t size_ = 0; // running sum of all booked capacities
};
420
421 struct registrar_t {
registrar_tdnnl::impl::memory_tracking::registrar_t422 registrar_t(registry_t ®istry) : registry_(registry), prefix_(0) {}
registrar_tdnnl::impl::memory_tracking::registrar_t423 registrar_t(registrar_t &parent, const key_t &prefix)
424 : registry_(parent.registry_)
425 , prefix_(make_prefix(parent.prefix_, prefix)) {}
426
bookdnnl::impl::memory_tracking::registrar_t427 void book(const key_t &key, size_t nelems, size_t data_size,
428 size_t data_align = 0, size_t perf_align = default_alignment) {
429 if (data_align == 0) data_align = data_size;
430 registry_.book(make_key(prefix_, key), nelems * data_size, data_align,
431 perf_align);
432 }
433 template <typename T>
bookdnnl::impl::memory_tracking::registrar_t434 void book(const key_t &key, size_t nelems,
435 size_t perf_align = default_alignment) {
436 registry_.book(make_key(prefix_, key), nelems * sizeof(T), alignof(T),
437 perf_align);
438 }
439
bookdnnl::impl::memory_tracking::registrar_t440 void book(const key_t &key, const registry_t ®istry,
441 size_t perf_align = default_alignment) {
442 registry_.book(make_key(prefix_, key), registry.size(), 1, perf_align);
443 }
444
sizednnl::impl::memory_tracking::registrar_t445 size_t size() const { return registry_.size(); }
446
447 protected:
448 registry_t ®istry_;
449 const key_t prefix_;
450 };
451
// Read-only interface to a registry_t used at execution time: resolves keys
// to actual memory inside the scratchpad. See the file-level comment.
struct grantor_t {
    grantor_t(const registry_t &registry,
            const memory_storage_t *base_mem_storage,
            const exec_ctx_t &exec_ctx)
        : registry_(registry)
        , prefix_(0)
        , base_mem_storage_(base_mem_storage)
        , exec_ctx_(&exec_ctx) {}
    // Chaining constructor: narrows the parent grantor to a sub-prefix so a
    // nested primitive (e.g. a reducer) sees only its own keys.
    grantor_t(const grantor_t &parent, const key_t &prefix)
        : registry_(parent.registry_)
        , prefix_(make_prefix(parent.prefix_, prefix))
        , base_mem_storage_(parent.base_mem_storage_)
        , exec_ctx_(parent.exec_ctx_) {}

    // Returns an aligned host pointer for `key`, or nullptr when the key was
    // never booked or there is no scratchpad at all. Host/CPU access only
    // (goes through get_host_storage_ptr()).
    template <typename T = void>
    T *get(const key_t &key) const {
        if (!base_mem_storage_) {
            assert(registry_.size() == 0);
            return nullptr;
        }
        auto e = registry_.get(make_key(prefix_, key));
        if (e.size == 0) return nullptr;

        char *host_storage_ptr = get_host_storage_ptr(base_mem_storage_);
        char *base_ptr = host_storage_ptr + base_mem_storage_->base_offset();
        return (T *)e.compute_ptr(base_ptr);
    }

    // Returns a sub-storage covering the entry for `key` (works for both CPU
    // and non-CPU engines), or nullptr when the key was never booked.
    std::unique_ptr<memory_storage_t> get_memory_storage(
            const key_t &key) const {
        if (!base_mem_storage_) {
            assert(registry_.size() == 0);
            return nullptr;
        }
        auto e = registry_.get(make_key(prefix_, key));
        if (e.size == 0) return nullptr;

        if (is_cpu_engine(base_mem_storage_)) {
            // CPU: resolve the aligned pointer exactly like get() does, then
            // express it back as a byte offset into the base storage.
            char *host_storage_ptr = get_host_storage_ptr(base_mem_storage_);
            char *base_ptr
                    = host_storage_ptr + base_mem_storage_->base_offset();
            char *aligned_ptr = (char *)e.compute_ptr(base_ptr);
            size_t aligned_offset = size_t(aligned_ptr - host_storage_ptr);
            return base_mem_storage_->get_sub_storage(aligned_offset, e.size);
        }

        // Non-CPU: no host pointer is available, so alignment is applied
        // arithmetically to the byte offset itself (assumes the underlying
        // device allocation is at least e.alignment-aligned -- TODO confirm).
        const size_t aligned_offset
                = reinterpret_cast<size_t>(utils::align_ptr<char>(
                        reinterpret_cast<char *>(e.offset), e.alignment));
        assert(aligned_offset + e.size <= registry_.size());
        return base_mem_storage_->get_sub_storage(aligned_offset, e.size);
    }

    // Non-owning access to the underlying scratchpad storage.
    const memory_storage_t *get_base_storage() const {
        return base_mem_storage_;
    }
    // The registry this grantor resolves keys against.
    const registry_t &get_registry() const { return registry_; }

protected:
    const registry_t &registry_;
    const key_t prefix_;
    const memory_storage_t *base_mem_storage_;
    const exec_ctx_t *exec_ctx_;

private:
    char *get_host_storage_ptr(const memory_storage_t *storage) const;
    bool is_cpu_engine(const memory_storage_t *mem_storage) const;
};
520
registrar()521 inline registrar_t registry_t::registrar() {
522 return registrar_t(*this);
523 }
grantor(const memory_storage_t * mem_storage,const exec_ctx_t & exec_ctx) const524 inline grantor_t registry_t::grantor(
525 const memory_storage_t *mem_storage, const exec_ctx_t &exec_ctx) const {
526 return grantor_t(*this, mem_storage, exec_ctx);
527 }
528
529 } // namespace memory_tracking
530 } // namespace impl
531 } // namespace dnnl
532
533 #endif
534