/*******************************************************************************
* Copyright 2020-2021 Intel Corporation
* Copyright 2020-2021 FUJITSU LIMITED
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>

#include "dnnl_types.h"

#include "common/dnnl_thread.hpp"
#include "common/nstl.hpp"
#include "common/utils.hpp"

#include "cpu/platform.hpp"

#include "cpu/aarch64/cpu_reducer.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

using namespace memory_tracking::names;

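/* Chooses the parallel decomposition of a reduction: nthr_ threads are
 * split into ngroups_ groups of nthr_per_group_ threads each; a group owns
 * up to njobs_per_group_ub_ jobs, and the threads within a group split the
 * reduction dimension among themselves. The brute-force search below
 * minimizes a rough per-thread complexity upper bound, subject to the
 * auxiliary reduction buffers fitting into max_buffer_size_.
 *
 * A sketch with hypothetical sizes (numbers are for illustration only):
 * with nthr_ = 16, njobs_ = 4, job_size_ = 1024, reduction_size_ = 32, and
 * a sufficiently large max_buffer_size_, the search settles on ngroups_ = 4
 * and nthr_per_group_ = 4, i.e. each job is reduced cooperatively by 4
 * threads, each accumulating 8 of the 32 slices of the reduction dimension
 * before the final summation. */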
void reduce_balancer_t::balance() {
    using namespace nstl;
    using namespace utils;

    assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);

    const int job_complexity = 1;

    const int min_njobs_per_group = max(1, njobs_ / nthr_);
    const int max_njobs_per_group
            = max(1, static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));

    /* initial guess */
    int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
    int nthr_per_group
            = allow_nthr_in_group_ ? min(nthr_ / ngroups, reduction_size_) : 1;
    int njobs_per_group_ub = div_up(njobs_, ngroups);

    /* rough upper-bound estimation, will be fixed during brute force */
    size_t thread_complexity_ub = (size_t)njobs_ * job_size_ * reduction_size_;

    /* brute force parameters for the best balance... */
    for (int c_njobs_per_group = min_njobs_per_group;
            c_njobs_per_group < njobs_; ++c_njobs_per_group) {
        /* current assumption */
        int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
        int c_nthr_per_group = allow_nthr_in_group_
                ? min(nthr_ / c_ngroups, reduction_size_)
                : 1;
        int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);

        if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
            continue;

        int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
        size_t c_group_size_ub = (size_t)job_size_ * c_njobs_per_group_ub;
        size_t c_thread_complexity_ub = c_group_size_ub
                * (job_complexity * c_thread_reduction_ub
                        + (c_nthr_per_group != 1));

        if (c_thread_complexity_ub < thread_complexity_ub) {
            ngroups = c_ngroups;
            nthr_per_group = c_nthr_per_group;
            njobs_per_group_ub = c_njobs_per_group_ub;
            thread_complexity_ub = c_thread_complexity_ub;
        }
    }

    assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
    assert(ngroups * nthr_per_group <= nthr_);
    assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
            || nthr_per_group == 1); /* no reduction buffer overflow */
    assert(IMPLICATION(!allow_nthr_in_group_, nthr_per_group == 1));

    ngroups_ = ngroups;
    nthr_per_group_ = nthr_per_group;
    njobs_per_group_ub_ = njobs_per_group_ub;
}

/* reducer jit-ted driver */

using namespace Xbyak_aarch64;

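/* Abstract interface to the jitted 2D reduction kernel: the generated code
 * accumulates n_src partial-result buffers, laid out src_ld elements apart,
 * element-wise into dst over an (ny x nx) region, advancing the pointers by
 * src_step / dst_step elements per row. With nullify_dst set, dst is
 * overwritten with the sum instead of being accumulated into. */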
template <impl::data_type_t data_type>
struct reducer_2d_driver_t : public jit_generator {
    using data_t = typename prec_traits<data_type>::type;

    reducer_2d_driver_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : n_src_(n_src)
        , src_ld_(src_ld)
        , src_step_(src_step)
        , dst_step_(dst_step)
        , nullify_dst_(nullify_dst) {}
    virtual void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx)
            = 0;

protected:
    int n_src_;
    size_t src_ld_, src_step_, dst_step_;
    bool nullify_dst_;
};

template <impl::data_type_t data_type, cpu_isa_t isa>
struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)

    using data_t = typename prec_traits<data_type>::type;

    void operator()(
            data_t *dst, const data_t *srcs, size_t ny, size_t nx) override {
        jit_generator::operator()(dst, srcs, ny, nx);
    }

    /* cpu specific part */
    using Vmm = Xbyak_aarch64::ZRegS;

    const int vlen = cpu_isa_traits<isa>::vlen;
    const int typesize
            = sizeof(typename dnnl::impl::prec_traits<data_type>::type);
    XReg reg_dst = abi_param1;
    XReg reg_src = abi_param2;
    XReg reg_ny = abi_param3;
    XReg reg_nx = abi_param4;

    XReg reg_x = this->x19;
    XReg reg_src_id = this->x20;
    XReg reg_long_offt = this->x21;

    XReg reg_tmp_imm = this->x29;
    XReg reg_tmp_ptr = this->x30;

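    /* governing predicates: preg_one enables only the first 32-bit lane
     * (single-element tail), preg_all enables the full vector; both are
     * initialized once at the top of loop_x() */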
    PReg preg_one = this->p3;
    PReg preg_all = this->p4;

    reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : reducer_2d_driver_t<data_type>(
                n_src, src_ld, src_step, dst_step, nullify_dst) {}

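    /* predicated load from src + off: a single 32-bit element when
     * load_len == typesize, a full vector when load_len == vlen;
     * reg_tmp_ptr serves as a scratch address register when a non-zero
     * offset is needed (uni_store() below mirrors this for stores) */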
    void uni_load(const Vmm &z1, const XReg &src, size_t off, int load_len) {
        auto src_ptr = (off == 0) ? src : reg_tmp_ptr;
        if (off != 0) this->add_imm(src_ptr, src, off, reg_tmp_imm);

        if (load_len == typesize)
            this->ld1w(z1, preg_one.s, ptr(src_ptr));
        else if (load_len == vlen)
            this->ld1w(z1, preg_all.s, ptr(src_ptr));
        else
            assert(!"unsupported");
    }

    void uni_store(const Vmm &z1, const XReg &dst, size_t off, int load_len) {
        auto dst_ptr = (off == 0) ? dst : reg_tmp_ptr;
        if (off != 0) this->add_imm(dst_ptr, dst, off, reg_tmp_imm);

        if (load_len == typesize)
            this->st1w(z1, preg_one.s, ptr(dst_ptr));
        else if (load_len == vlen)
            this->st1w(z1, preg_all.s, ptr(dst_ptr));
        else
            assert(!"unsupported");
    }

    void nullify_dst(int nloads, int load_len) {
        UNUSED(load_len);
        for (int i = 0; i < nloads; ++i)
            this->fmov(Vmm(i)); // Zero clear
        /* prefetches[dst] ? */
    }

    void load_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i)
            uni_load(Vmm(i), reg_dst, i * load_len, load_len);
    }

    void store_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i)
            uni_store(Vmm(i), reg_dst, i * load_len, load_len);
    }

    void accumulate(int nloads, int load_len, size_t base_off) {
        for (int i = 0; i < nloads; ++i) {
            size_t off = base_off + i * load_len;
            uni_load(Vmm(cpu_isa_traits<isa>::n_vregs - 1), reg_src, off,
                    load_len);
            if (data_type == data_type::f32)
                this->fadd(
                        Vmm(i), Vmm(i), Vmm(cpu_isa_traits<isa>::n_vregs - 1));
            else
                this->add(
                        Vmm(i), Vmm(i), Vmm(cpu_isa_traits<isa>::n_vregs - 1));
        }
    }

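    /* the x-loop is emitted as three progressively narrower branches:
     * chunks of (n_vregs - 1) full vectors, then single full vectors, then
     * single elements, so any nx is handled without a remainder; each
     * branch zeroes (or loads) the destination registers, accumulates all
     * n_src_ sources, and stores the result back */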
    void loop_x() {
        const int nloads[] = {cpu_isa_traits<isa>::n_vregs - 1, 1, 1};
        const int nbranches = sizeof(nloads) / sizeof(nloads[0]);

        const int load_len[nbranches] = {vlen, vlen, typesize};
        Label loop_x_label[nbranches + 1];

        this->ptrue(preg_all.b);
        if (typesize == 4)
            this->ptrue(preg_one.s, VL1);
        else
            assert(!"unsupported typesize");

        this->mov(reg_x, reg_nx);

        for (int id = 0; id < nbranches; ++id) {
            this->L(loop_x_label[id]);

            this->cmp(reg_x, nloads[id] * load_len[id]);
            this->b(LT, loop_x_label[id + 1]);

            if (this->nullify_dst_)
                nullify_dst(nloads[id], load_len[id]);
            else
                load_dst(nloads[id], load_len[id]);

            if (nloads[id] > 1) {
                Label loop_srcs;
                this->mov_imm(reg_src_id, this->n_src_);
                this->L(loop_srcs);

                accumulate(nloads[id], load_len[id], 0);
                this->add_imm(reg_src, reg_src, this->src_ld_ * typesize,
                        reg_tmp_imm);

                this->subs(reg_src_id, reg_src_id, 1); // dec(reg_src_id)
                this->b(NE, loop_srcs);

                size_t base_off
                        = (size_t)this->n_src_ * this->src_ld_ * typesize;
                this->sub_imm(reg_src, reg_src, base_off, reg_tmp_imm);
            } else {
                for (int src_id = 0; src_id < this->n_src_; ++src_id) {
                    const size_t base_off
                            = (size_t)src_id * this->src_ld_ * typesize;
                    accumulate(nloads[id], load_len[id], base_off);
                }
            }

            store_dst(nloads[id], load_len[id]);

            this->add_imm(
                    reg_src, reg_src, nloads[id] * load_len[id], reg_tmp_imm);
            this->add_imm(
                    reg_dst, reg_dst, nloads[id] * load_len[id], reg_tmp_imm);

            this->sub_imm(reg_x, reg_x, nloads[id] * load_len[id], reg_tmp_imm);

            this->b(loop_x_label[id]);
        }

        this->L(loop_x_label[nbranches]);

        /* restore address registers */
        this->sub(reg_src, reg_src, reg_nx);
        this->sub(reg_dst, reg_dst, reg_nx);
    }

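    /* kernel entry point: after the preamble, reg_nx is converted from
     * elements to bytes and loop_x() is invoked once per row, advancing
     * the source and destination pointers by their row steps */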
    void generate() override {
        assert(isa == sve_512);

        this->preamble();

        this->lsl(reg_nx, reg_nx, 2); // nx: elements -> bytes (typesize == 4)

        Label ny_loop;
        this->L(ny_loop);

        loop_x();

        this->add_imm(
                reg_dst, reg_dst, this->dst_step_ * typesize, reg_tmp_imm);
        this->add_imm(
                reg_src, reg_src, this->src_step_ * typesize, reg_tmp_imm);

        this->subs(reg_ny, reg_ny, 1); // dec(reg_ny)
        this->b(NE, ny_loop); // jnz

        this->postamble();
    }
};

template <impl::data_type_t data_type>
inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
        size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
    if (mayiuse(sve_512))
        return new reducer_2d_driver_f_s_32_t<data_type, sve_512>(
                n_src, src_ld, src_step, dst_step, nullify_dst);
    assert(!"unimplemented");
    return nullptr;
}

/* cpu_reducer_t */

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_
            * (balancer_.nthr_per_group_ - 1)
            * cpu_reducer_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size, PAGE_4K);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
            space_per_thread(balancer()), 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::~cpu_reducer_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

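/* Returns the buffer thread ithr writes its partial result to: thread 0 of
 * a group writes straight into the user-visible dst, while the remaining
 * (nthr_per_group_ - 1) threads of the group get consecutive
 * space_per_thread() slices of the shared scratchpad. */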
template <impl::data_type_t data_type>
typename cpu_reducer_t<data_type>::data_t *
cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);

    /* thread 0 of each group writes directly to the destination */
    if (id_in_grp == 0)
        return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;

    const int grp_id = balancer().group_id(ithr);
    const int offset_factor
            = grp_id * (balancer().nthr_per_group_ - 1) + (id_in_grp - 1);

    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

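/* Final summation for one group. In the default (jitted) path the group's
 * reduction range is carved up in cache-line units via balance211(), and
 * each thread of the group adds the (nthr_per_group_ - 1) scratch copies of
 * its chunk into thread 0's destination slice with a single driver call. */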
template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

#ifdef SIMPLE_IMPL
    if (balancer().id_in_group(ithr) != 0)
        return; /* only thread 0 does the reduction */

    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    data_t *d = get_local_ptr(ithr, dst, scratchpad);
    for (int id_in_grp = 1; id_in_grp < balancer().nthr_per_group_;
            ++id_in_grp) {
        const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
        for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
            d[i] += space[i];
    }
#else
    using namespace utils;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const size_t cl = 64 / sizeof(data_t); // cache line size in elements

    const size_t reduction_size = njobs_in_grp * balancer().job_size_;
    size_t start {0}, end {0};
    balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
            id_in_grp, start, end);

    if (start == end) return;

    data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
    const data_t *space
            = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad) + start * cl;
    const size_t len = nstl::min(end * cl, reduction_size) - start * cl;

    (*drv_)(d, space, 1, len);
#endif
}


template struct cpu_reducer_t<data_type::f32>;
template struct cpu_reducer_t<data_type::s32>;

/* cpu_reducer_2d_t */

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
            * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
    scratchpad.book<data_t>(key_reducer_space, space_size);
    scratchpad.book<simple_barrier::ctx_t>(
            key_reducer_space_bctx, balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr) {
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
            space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
            true);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_reducer_2d_t<data_type>::create_kernel() {
    return (drv_) ? drv_->create_kernel() : status::success;
}

template <impl::data_type_t data_type>
typename cpu_reducer_2d_t<data_type>::data_t *
cpu_reducer_2d_t<data_type>::get_local_ptr(
        int ithr, const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);
    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

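/* Picks how many x elements a single driver call should cover within a job:
 * starting from the whole row measured in x_block_ units, the factor is
 * repeatedly divided by 2 or 3 while it stays above the per-thread minimum;
 * if no such factorization gets close enough, it falls back to the finest
 * x_block_ granularity. */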
template <impl::data_type_t data_type>
int cpu_reducer_2d_t<data_type>::choose_x_blocking(
        int nx, int ny, int nthr_per_grp) const {
    // find an x_blocking that better balances the reduction work between
    // threads
    assert(conf_.x_block_ > 0 && nx > conf_.x_block_
            && nx % conf_.x_block_ == 0);
    int x_blocking = nx / conf_.x_block_;
    int min_x_blocking
            = utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
    while (true) {
        if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
            x_blocking /= 2;
        else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
            x_blocking /= 3;
        else
            break;
    }
    if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
    x_blocking *= conf_.x_block_;
    return x_blocking;
}

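/* Accumulates one (ny_step x nx_step) tile of job `job` into dst: the
 * jitted driver sums the balancer().nthr_per_group_ per-thread copies
 * (spaced space_per_thread() apart) and, since the driver was created with
 * nullify_dst = true, overwrites the destination instead of adding to it. */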
template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_block(const data_t *space_base,
        data_t *dst, int job, int start_y, int start_x, int ny_start,
        int nx_start, int ny_step, int nx_step) const {
    data_t *d = dst + (start_y + ny_start) * conf_.dst_x_ + start_x + nx_start;
    const data_t *space = space_base + (size_t)job * balancer().job_size_
            + (size_t)ny_start * conf_.job_size_x_ + nx_start;
#ifdef SIMPLE_IMPL
    for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
        const data_t *w = &space[idg * space_per_thread(balancer())];
        for (int y = 0; y < ny_step; ++y)
            for (int x = 0; x < nx_step; ++x) {
                d[y * conf_.dst_x_ + x]
                        = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
                        + w[y * conf_.job_size_x_ + x];
            }
    }
#else
    (*drv_)(d, space, ny_step, nx_step);
#endif
}

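/* Final summation for one group in the 2D case: the group's threads are
 * re-partitioned into pr_grps sub-groups that split the jobs, and within a
 * job each thread reduces a contiguous range of x-blocks. A range that does
 * not start or end on a row boundary is processed as up to three
 * reduce_block() calls: a partial head row, a run of full rows, and a
 * partial tail row. */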
template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction
            = balancer().nthr_per_group_ == 1 || balancer().idle(ithr);
    if (redundant_reduction) return;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
    const int global_job_start = balancer().ithr_job_off(ithr);

    const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);

    const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
    const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;

    if (id_in_grp >= pr_grps * pr_nthr_per_grp) return; /* idle */

    const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
    const int pr_my_id = id_in_grp % pr_nthr_per_grp;

    int pr_job_start {0}, pr_job_end {0};
    balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);

    for (int j = pr_job_start; j < pr_job_end; ++j) {
        const int global_job = global_job_start + j;
        const int j_y = global_job / njobs_x;
        const int j_x = global_job % njobs_x;
        const int start_y = j_y * conf_.job_size_y_;
        const int start_x = j_x * conf_.job_size_x_;
        const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
        const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
        int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);

        int nxy_start {0}, nxy_end {0};
        balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id, nxy_start,
                nxy_end);
        if (nxy_start == nxy_end) continue;
        nxy_start *= x_blocking;
        nxy_end *= x_blocking;

        int nxy = nxy_start;
        if (nxy % nx != 0) {
            int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nx_step);
            nxy += nx_step;
        }
        if ((nxy_end - nxy) > nx) {
            int ny_step = (nxy_end - nxy) / nx;
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, ny_step, nx);
            nxy += nx * ny_step;
        }
        if ((nxy_end - nxy) > 0) {
            reduce_block(space_base, dst, j, start_y, start_x, nxy / nx,
                    nxy % nx, 1, nxy_end - nxy);
        }
    }
}

template struct cpu_reducer_2d_t<data_type::f32>;
template struct cpu_reducer_2d_t<data_type::s32>;

/* accumulator section */

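/* cpu_accumulator_1d_t is a thin wrapper over a single-source driver
 * (n_src = 1, nullify_dst = false): accumulate() performs
 * dst[0..size) += src[0..size) through the jitted kernel. */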
template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t() : drv_(nullptr) {
    drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
status_t cpu_accumulator_1d_t<data_type>::create_kernel() {
    return drv_->create_kernel();
}

template <impl::data_type_t data_type>
void cpu_accumulator_1d_t<data_type>::accumulate(
        data_t *dst, const data_t *src, size_t size) {
    (*drv_)(dst, src, 1, size);
}

template struct cpu_accumulator_1d_t<data_type::f32>;
template struct cpu_accumulator_1d_t<data_type::s32>;

} // namespace aarch64
} // namespace cpu
} // namespace impl
} // namespace dnnl

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s