/*******************************************************************************
* Copyright 2017-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>

#include "oneapi/dnnl/dnnl_types.h"

#include "common/dnnl_thread.hpp"
#include "common/nstl.hpp"
#include "common/utils.hpp"

#include "cpu/platform.hpp"

#include "cpu/x64/cpu_reducer.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

using namespace memory_tracking::names;

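/* Splits nthr_ threads into ngroups_ groups of nthr_per_group_ threads each;
 * every group owns at most njobs_per_group_ub_ jobs, and the threads within a
 * group split the reduction dimension. The brute-force search below minimizes
 * a per-thread cost estimate: the work on a thread's share of the reduction
 * plus, when nthr_per_group > 1, one extra pass to combine the partial
 * results. */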
void reduce_balancer_t::balance() {
    using namespace nstl;
    using namespace utils;

    assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);

    const int job_complexity = 1;

    const int min_njobs_per_group = max(1, njobs_ / nthr_);
    const int max_njobs_per_group
            = max(1, static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));

    /* initial guess */
    int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
    int nthr_per_group
            = allow_nthr_in_group_ ? min(nthr_ / ngroups, reduction_size_) : 1;
    int njobs_per_group_ub = div_up(njobs_, ngroups);

    /* rough upper-bound estimate; refined by the brute-force search below */
    size_t thread_complexity_ub = (size_t)njobs_ * job_size_ * reduction_size_;

    /* brute-force the parameters for the best balance... */
    for (int c_njobs_per_group = min_njobs_per_group;
            c_njobs_per_group < njobs_; ++c_njobs_per_group) {
        /* current assumption */
        int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
        int c_nthr_per_group = allow_nthr_in_group_
                ? min(nthr_ / c_ngroups, reduction_size_)
                : 1;
        int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);

        if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
            continue;

        int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
        size_t c_group_size_ub = (size_t)job_size_ * c_njobs_per_group_ub;
        size_t c_thread_complexity_ub = c_group_size_ub
                * (job_complexity * c_thread_reduction_ub
                        + (c_nthr_per_group != 1));

        if (c_thread_complexity_ub < thread_complexity_ub) {
            ngroups = c_ngroups;
            nthr_per_group = c_nthr_per_group;
            njobs_per_group_ub = c_njobs_per_group_ub;
            thread_complexity_ub = c_thread_complexity_ub;
        }
    }

    assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
    assert(ngroups * nthr_per_group <= nthr_);
    assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
            || nthr_per_group == 1); /* no reduction buffer overflow */
    assert(IMPLICATION(!allow_nthr_in_group_, nthr_per_group == 1));

    ngroups_ = ngroups;
    nthr_per_group_ = nthr_per_group;
    njobs_per_group_ub_ = njobs_per_group_ub;
}

/* reducer jit-ted driver */

using namespace Xbyak;

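/* The driver is a JIT-generated functor: operator()(dst, srcs, ny, nx)
 * accumulates n_src_ partial buffers, laid out src_ld_ elements apart, into
 * dst over an ny x nx region, advancing dst by dst_step_ and srcs by
 * src_step_ elements between rows. With nullify_dst_ set, dst is overwritten
 * instead of accumulated into. */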
template <impl::data_type_t data_type>
struct reducer_2d_driver_t : public jit_generator {
    using data_t = typename prec_traits<data_type>::type;

    reducer_2d_driver_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : n_src_(n_src)
        , src_ld_(src_ld)
        , src_step_(src_step)
        , dst_step_(dst_step)
        , nullify_dst_(nullify_dst) {}
    virtual void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx)
            = 0;

protected:
    int n_src_;
    size_t src_ld_, src_step_, dst_step_;
    bool nullify_dst_;
};

template <impl::data_type_t data_type, cpu_isa_t isa>
struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)

    using data_t = typename prec_traits<data_type>::type;

    void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx) override {
        jit_generator::operator()(dst, srcs, ny, nx);
    }

    /* cpu specific part */
    using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
    const AddressFrame &vmmword = (isa == avx2) ? this->yword : this->zword;
    void uni_vadd(const Xmm &x1, const Xmm &x2, const Operand &op) {
        if (data_type == data_type::f32)
            this->vaddps(x1, x2, op);
        else
            this->vpaddd(x1, x2, op);
    }
    void uni_add(const Xmm &x1, const Operand &op) {
        if (data_type == data_type::f32)
            this->addss(x1, op);
        else
            this->paddd(x1, op);
    }

    const int vlen = cpu_isa_traits<isa>::vlen;
    const int typesize
            = sizeof(typename dnnl::impl::prec_traits<data_type>::type);
    Xbyak::Reg64 reg_dst = abi_param1;
    Xbyak::Reg64 reg_src = abi_param2;
    Xbyak::Reg64 reg_ny = abi_param3;
    Xbyak::Reg64 reg_nx = abi_param4;

    Xbyak::Reg64 reg_x = this->rax;
    Xbyak::Reg64 reg_src_id = this->r10;
    Xbyak::Reg64 reg_long_offt = this->r11;

    reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : reducer_2d_driver_t<data_type>(
                n_src, src_ld, src_step, dst_step, nullify_dst) {}

    void nullify_dst(int nloads, int load_len) {
        UNUSED(load_len);
        for (int i = 0; i < nloads; ++i)
            this->uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
        /* prefetches[dst] ? */
    }

    void load_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                this->movd(Xmm(i), this->ptr[reg_dst + i * load_len]);
            else if (load_len == vlen)
                this->vmovups(Vmm(i), this->ptr[reg_dst + i * load_len]);
            else
                assert(!"unsupported");
        }
    }

    void store_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                this->movd(this->ptr[reg_dst + i * load_len], Xmm(i));
            else if (load_len == vlen)
                this->vmovups(this->ptr[reg_dst + i * load_len], Vmm(i));
            else
                assert(!"unsupported");
        }
    }

    void accumulate(int nloads, int load_len, size_t base_off) {
        for (int i = 0; i < nloads; ++i) {
            size_t off = base_off + i * load_len;

            if (load_len == typesize)
                this->uni_add(Xmm(i), this->ptr[reg_src + off]);
            else if (load_len == vlen)
                this->uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
            else
                assert(!"unsupported");
        }
    }

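    /* Walks the reg_nx bytes of a row in three passes of decreasing
     * granularity: first n_vregs full vector registers per iteration, then a
     * single vector, then scalar elements, so any tail length is covered. */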
    void loop_x() {
        const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
        const int nbranches = sizeof(nloads) / sizeof(nloads[0]);

        const int load_len[nbranches] = {vlen, vlen, typesize};
        Label loop_x_label[nbranches + 1];

        this->mov(reg_x, reg_nx);

        for (int id = 0; id < nbranches; ++id) {
            this->L(loop_x_label[id]);

            this->cmp(reg_x, nloads[id] * load_len[id]);
            this->jl(loop_x_label[id + 1], this->T_NEAR);

            if (this->nullify_dst_)
                nullify_dst(nloads[id], load_len[id]);
            else
                load_dst(nloads[id], load_len[id]);

            if (nloads[id] > 1) {
                Label loop_srcs;
                this->mov(reg_src_id, this->n_src_);
                this->L(loop_srcs);

                accumulate(nloads[id], load_len[id], 0);
                this->add(reg_src, this->src_ld_ * typesize);

                this->dec(reg_src_id);
                this->jnz(loop_srcs, this->T_NEAR);

                size_t base_off
                        = (size_t)this->n_src_ * this->src_ld_ * typesize;
                this->safe_sub(reg_src, base_off, reg_long_offt);
            } else {
                for (int src_id = 0; src_id < this->n_src_; ++src_id) {
                    const size_t base_off
                            = (size_t)src_id * this->src_ld_ * typesize;
                    accumulate(nloads[id], load_len[id], base_off);
                }
            }

            store_dst(nloads[id], load_len[id]);

            this->add(reg_src, nloads[id] * load_len[id]);
            this->add(reg_dst, nloads[id] * load_len[id]);

            this->sub(reg_x, nloads[id] * load_len[id]);

            this->jmp(loop_x_label[id], this->T_NEAR);
        }

        this->L(loop_x_label[nbranches]);

        /* restore address registers */
        this->sub(reg_src, reg_nx);
        this->sub(reg_dst, reg_nx);
    }

    void generate() override {
        assert(isa == avx2 || isa == avx512_common || isa == avx512_mic);

        this->preamble();

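        /* nx is passed in elements; convert it to bytes
         * (typesize == 4 for both f32 and s32) */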
        this->shl(reg_nx, 2);

        Label ny_loop;
        this->L(ny_loop);

        loop_x();

        this->add(reg_dst, this->dst_step_ * typesize);
        this->add(reg_src, this->src_step_ * typesize);

        this->dec(reg_ny);
        this->jnz(ny_loop, this->T_NEAR);

        this->postamble();
    }
};

template <impl::data_type_t data_type>
inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
        size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
    if (mayiuse(avx512_common))
        return new reducer_2d_driver_f_s_32_t<data_type, avx512_common>(
                n_src, src_ld, src_step, dst_step, nullify_dst);
    else if (mayiuse(avx2))
        return new reducer_2d_driver_f_s_32_t<data_type, avx2>(
                n_src, src_ld, src_step, dst_step, nullify_dst);
    assert(!"unimplemented");
    return nullptr;
}

/* cpu_reducer_t */

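/* Thread 0 of each group accumulates directly into the destination, so the
 * scratchpad books only nthr_per_group_ - 1 per-thread buffers per group,
 * plus one barrier context per group. */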
template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_
            * (balancer_.nthr_per_group_ - 1)
            * cpu_reducer_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size, PAGE_4K);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
            space_per_thread(balancer()), 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::~cpu_reducer_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

template <impl::data_type_t data_type>
typename cpu_reducer_t<data_type>::data_t *
cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);

    /* thread 0 of each group writes directly to the destination */
    if (id_in_grp == 0)
        return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;

    const int grp_id = balancer().group_id(ithr);
    const int offset_factor
            = grp_id * (balancer().nthr_per_group_ - 1) + (id_in_grp - 1);

    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

#ifdef SIMPLE_IMPL
    if (balancer().id_in_group(ithr) != 0)
        return; /* only thread 0 of each group does the reduction */

    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    data_t *d = get_local_ptr(ithr, dst, scratchpad);
    for (int id_in_grp = 1; id_in_grp < balancer().nthr_per_group_;
            ++id_in_grp) {
        const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
        for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
            d[i] += space[i];
    }
#else
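    /* cooperative path: every thread in the group takes a cache-line-aligned
     * slice of the flattened (njobs x job_size) space, and the JIT driver
     * accumulates the group's nthr_per_group_ - 1 partial buffers into
     * thread 0's destination over that slice */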
    using namespace utils;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const size_t cl = 64 / sizeof(data_t);

    const size_t reduction_size = njobs_in_grp * balancer().job_size_;
    size_t start {0}, end {0};
    balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
            id_in_grp, start, end);

    if (start == end) return;

    data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
    const data_t *space
            = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad) + start * cl;
    const size_t len = nstl::min(end * cl, reduction_size) - start * cl;

    (*drv_)(d, space, 1, len);
#endif
}

template struct cpu_reducer_t<data_type::f32>;
template struct cpu_reducer_t<data_type::s32>;

/* cpu_reducer_2d_t */

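/* Unlike cpu_reducer_t, every thread of a group (including thread 0) writes
 * its partial results to the scratchpad, so a full nthr_per_group_ buffers
 * per group are booked, and the driver is created with nullify_dst = true to
 * overwrite the destination on the first accumulation pass. */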
template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
            * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
            space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
            true);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_2d_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

template <impl::data_type_t data_type>
typename cpu_reducer_2d_t<data_type>::data_t *
cpu_reducer_2d_t<data_type>::get_local_ptr(
        int ithr, const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);
    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
int cpu_reducer_2d_t<data_type>::choose_x_blocking(
        int nx, int ny, int nthr_per_grp) const {
    // find an x blocking that balances the reduction work across threads
    assert(conf_.x_block_ > 0 && nx > conf_.x_block_
            && nx % conf_.x_block_ == 0);
    int x_blocking = nx / conf_.x_block_;
    int min_x_blocking
            = utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
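    // shrink x_blocking by factors of 2 and 3 while every thread still gets
    // at least min_x_blocking blocks of work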
    while (true) {
        if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
            x_blocking /= 2;
        else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
            x_blocking /= 3;
        else
            break;
    }
    if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
    x_blocking *= conf_.x_block_;
    return x_blocking;
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_block(const data_t *space_base,
        data_t *dst, int job, int start_y, int start_x, int ny_start,
        int nx_start, int ny_step, int nx_step) const {
    data_t *d = dst + (start_y + ny_start) * conf_.dst_x_ + start_x + nx_start;
    const data_t *space = space_base + (size_t)job * balancer().job_size_
            + (size_t)ny_start * conf_.job_size_x_ + nx_start;
#ifdef SIMPLE_IMPL
    for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
        const data_t *w = &space[idg * space_per_thread(balancer())];
        for (int y = 0; y < ny_step; ++y)
            for (int x = 0; x < nx_step; ++x) {
                d[y * conf_.dst_x_ + x]
                        = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
                        + w[y * conf_.job_size_x_ + x];
            }
    }
#else
    (*drv_)(d, space, ny_step, nx_step);
#endif
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
    const int global_job_start = balancer().ithr_job_off(ithr);

    const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);

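    /* re-partition the group's threads for the reduction itself: pr_grps
     * sub-groups split the jobs between them, and the pr_nthr_per_grp
     * threads within a sub-group split each job's ny x nx tile in
     * x_blocking-sized chunks */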
    const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
    const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;

    if (id_in_grp >= pr_grps * pr_nthr_per_grp) return; /* idle */

    const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
    const int pr_my_id = id_in_grp % pr_nthr_per_grp;

    int pr_job_start {0}, pr_job_end {0};
    balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);

    for (int j = pr_job_start; j < pr_job_end; ++j) {
        const int global_job = global_job_start + j;
        const int j_y = global_job / njobs_x;
        const int j_x = global_job % njobs_x;
        const int start_y = j_y * conf_.job_size_y_;
        const int start_x = j_x * conf_.job_size_x_;
        const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
        const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
        int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);

        int nxy_start {0}, nxy_end {0};
        balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id, nxy_start,
                nxy_end);
        if (nxy_start == nxy_end) continue;
        nxy_start *= x_blocking;
        nxy_end *= x_blocking;

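        /* the [nxy_start, nxy_end) range covers at most three blocks: the
         * tail of the first row, the full rows in the middle, and the head
         * of the last row */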
        int nxy = nxy_start;
        if (nxy % nx != 0) {
            int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nx_step);
            nxy += nx_step;
        }
        if ((nxy_end - nxy) > nx) {
            int ny_step = (nxy_end - nxy) / nx;
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, ny_step, nx);
            nxy += nx * ny_step;
        }
        if ((nxy_end - nxy) > 0) {
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nxy_end - nxy);
        }
    }
}

template struct cpu_reducer_2d_t<data_type::f32>;
template struct cpu_reducer_2d_t<data_type::s32>;

/* accumulator section */

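/* cpu_accumulator_1d_t wraps the JIT driver for the single-source,
 * single-row case: accumulate(dst, src, size) computes dst[i] += src[i]
 * for i in [0, size). */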
template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t() : drv_(nullptr) {
    drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_accumulator_1d_t<data_type>::create_kernel() {
    return drv_->create_kernel();
}

template <impl::data_type_t data_type>
void cpu_accumulator_1d_t<data_type>::accumulate(
        data_t *dst, const data_t *src, size_t size) {
    (*drv_)(dst, src, 1, size);
}

template struct cpu_accumulator_1d_t<data_type::f32>;
template struct cpu_accumulator_1d_t<data_type::s32>;

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s