1 //
2 // Copyright 2012 Francisco Jerez
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a
5 // copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 // OTHER DEALINGS IN THE SOFTWARE.
21 //
22 
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28 
29 using namespace clover;
30 
kernel(clover::program & prog,const std::string & name,const std::vector<binary::argument> & bargs)31 kernel::kernel(clover::program &prog, const std::string &name,
32                const std::vector<binary::argument> &bargs) :
33    program(prog), _name(name), exec(*this),
34    program_ref(prog._kernel_ref_counter) {
35    for (auto &barg : bargs) {
36       if (barg.semantic == binary::argument::general)
37          _args.emplace_back(argument::create(barg));
38    }
39    for (auto &dev : prog.devices()) {
40       auto &b = prog.build(dev).bin;
41       auto bsym = find(name_equals(name), b.syms);
42       const auto f = id_type_equals(bsym.section, binary::section::data_constant);
43       if (!any_of(f, b.secs))
44          continue;
45 
46       auto mconst = find(f, b.secs);
47       auto rb = std::make_unique<root_buffer>(prog.context(), std::vector<cl_mem_properties>(),
48                                               CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
49                                               mconst.size, mconst.data.data());
50       _constant_buffers.emplace(&dev, std::move(rb));
51    }
52 }
53 
54 template<typename V>
55 static inline std::vector<uint>
pad_vector(command_queue & q,const V & v,uint x)56 pad_vector(command_queue &q, const V &v, uint x) {
57    std::vector<uint> w { v.begin(), v.end() };
58    w.resize(q.device().max_block_size().size(), x);
59    return w;
60 }
61 
62 void
launch(command_queue & q,const std::vector<size_t> & grid_offset,const std::vector<size_t> & grid_size,const std::vector<size_t> & block_size)63 kernel::launch(command_queue &q,
64                const std::vector<size_t> &grid_offset,
65                const std::vector<size_t> &grid_size,
66                const std::vector<size_t> &block_size) {
67    const auto b = program().build(q.device()).bin;
68    const auto reduced_grid_size =
69       map(divides(), grid_size, block_size);
70 
71    if (any_of(is_zero(), grid_size))
72       return;
73 
74    void *st = exec.bind(&q, grid_offset);
75    struct pipe_grid_info info = {};
76 
77    // The handles are created during exec_context::bind(), so we need make
78    // sure to call exec_context::bind() before retrieving them.
79    std::vector<uint32_t *> g_handles = map([&](size_t h) {
80          return (uint32_t *)&exec.input[h];
81       }, exec.g_handles);
82 
83    q.pipe->bind_compute_state(q.pipe, st);
84    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
85                                0, exec.samplers.size(),
86                                exec.samplers.data());
87 
88    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
89                              exec.sviews.size(), 0, false, exec.sviews.data());
90    q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
91                              exec.iviews.size(), 0, exec.iviews.data());
92    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
93                                  exec.resources.data());
94    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
95                               exec.g_buffers.data(), g_handles.data());
96 
97    // Fill information for the launch_grid() call.
98    info.work_dim = grid_size.size();
99    copy(pad_vector(q, block_size, 1), info.block);
100    copy(pad_vector(q, reduced_grid_size, 1), info.grid);
101    info.pc = find(name_equals(_name), b.syms).offset;
102    info.input = exec.input.data();
103 
104    q.pipe->launch_grid(q.pipe, &info);
105 
106    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
107    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
108    q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
109                              0, exec.iviews.size(), NULL);
110    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
111                              0, exec.sviews.size(), false, NULL);
112    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
113                                exec.samplers.size(), NULL);
114 
115    q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
116    exec.unbind();
117 }
118 
119 size_t
mem_local() const120 kernel::mem_local() const {
121    size_t sz = 0;
122 
123    for (auto &arg : args()) {
124       if (dynamic_cast<local_argument *>(&arg))
125          sz += arg.storage();
126    }
127 
128    return sz;
129 }
130 
131 size_t
mem_private() const132 kernel::mem_private() const {
133    return 0;
134 }
135 
136 const std::string &
name() const137 kernel::name() const {
138    return _name;
139 }
140 
141 std::vector<size_t>
optimal_block_size(const command_queue & q,const std::vector<size_t> & grid_size) const142 kernel::optimal_block_size(const command_queue &q,
143                            const std::vector<size_t> &grid_size) const {
144    if (any_of(is_zero(), grid_size))
145       return grid_size;
146 
147    return factor::find_grid_optimal_factor<size_t>(
148       q.device().max_threads_per_block(), q.device().max_block_size(),
149       grid_size);
150 }
151 
152 std::vector<size_t>
required_block_size() const153 kernel::required_block_size() const {
154    return find(name_equals(_name), program().symbols()).reqd_work_group_size;
155 }
156 
157 kernel::argument_range
args()158 kernel::args() {
159    return map(derefs(), _args);
160 }
161 
162 kernel::const_argument_range
args() const163 kernel::args() const {
164    return map(derefs(), _args);
165 }
166 
167 std::vector<clover::binary::arg_info>
args_infos()168 kernel::args_infos() {
169    std::vector<clover::binary::arg_info> infos;
170    for (auto &barg: find(name_equals(_name), program().symbols()).args)
171       if (barg.semantic == clover::binary::argument::general)
172          infos.emplace_back(barg.info);
173 
174    return infos;
175 }
176 
177 const binary &
binary(const command_queue & q) const178 kernel::binary(const command_queue &q) const {
179    return program().build(q.device()).bin;
180 }
181 
exec_context(kernel & kern)182 kernel::exec_context::exec_context(kernel &kern) :
183    kern(kern), q(NULL), print_handler(), mem_local(0), st(NULL), cs() {
184 }
185 
~exec_context()186 kernel::exec_context::~exec_context() {
187    if (st)
188       q->pipe->delete_compute_state(q->pipe, st);
189 }
190 
191 void *
bind(intrusive_ptr<command_queue> _q,const std::vector<size_t> & grid_offset)192 kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
193                            const std::vector<size_t> &grid_offset) {
194    std::swap(q, _q);
195 
196    // Bind kernel arguments.
197    auto &b = kern.program().build(q->device()).bin;
198    auto bsym = find(name_equals(kern.name()), b.syms);
199    auto bargs = bsym.args;
200    auto msec = find(id_type_equals(bsym.section, binary::section::text_executable), b.secs);
201    auto explicit_arg = kern._args.begin();
202 
203    for (auto &barg : bargs) {
204       switch (barg.semantic) {
205       case binary::argument::general:
206          (*(explicit_arg++))->bind(*this, barg);
207          break;
208 
209       case binary::argument::grid_dimension: {
210          const cl_uint dimension = grid_offset.size();
211          auto arg = argument::create(barg);
212 
213          arg->set(sizeof(dimension), &dimension);
214          arg->bind(*this, barg);
215          break;
216       }
217       case binary::argument::grid_offset: {
218          for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
219             auto arg = argument::create(barg);
220 
221             arg->set(sizeof(x), &x);
222             arg->bind(*this, barg);
223          }
224          break;
225       }
226       case binary::argument::image_size: {
227          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
228          std::vector<cl_uint> image_size{
229                static_cast<cl_uint>(img->width()),
230                static_cast<cl_uint>(img->height()),
231                static_cast<cl_uint>(img->depth())};
232          for (auto x : image_size) {
233             auto arg = argument::create(barg);
234 
235             arg->set(sizeof(x), &x);
236             arg->bind(*this, barg);
237          }
238          break;
239       }
240       case binary::argument::image_format: {
241          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
242          cl_image_format fmt = img->format();
243          std::vector<cl_uint> image_format{
244                static_cast<cl_uint>(fmt.image_channel_data_type),
245                static_cast<cl_uint>(fmt.image_channel_order)};
246          for (auto x : image_format) {
247             auto arg = argument::create(barg);
248 
249             arg->set(sizeof(x), &x);
250             arg->bind(*this, barg);
251          }
252          break;
253       }
254       case binary::argument::constant_buffer: {
255          auto arg = argument::create(barg);
256          cl_mem buf = kern._constant_buffers.at(&q->device()).get();
257          arg->set(sizeof(buf), &buf);
258          arg->bind(*this, barg);
259          break;
260       }
261       case binary::argument::printf_buffer: {
262          print_handler = printf_handler::create(q, b.printf_infos,
263                                                 b.printf_strings_in_buffer,
264                                                 q->device().max_printf_buffer_size());
265          cl_mem print_mem = print_handler->get_mem();
266 
267          auto arg = argument::create(barg);
268          arg->set(sizeof(cl_mem), &print_mem);
269          arg->bind(*this, barg);
270          break;
271       }
272       }
273    }
274 
275    // Create a new compute state if anything changed.
276    if (!st || q != _q ||
277        cs.req_local_mem != mem_local ||
278        cs.req_input_mem != input.size()) {
279       if (st)
280          _q->pipe->delete_compute_state(_q->pipe, st);
281 
282       cs.ir_type = q->device().ir_format();
283       cs.prog = &(msec.data[0]);
284       cs.req_local_mem = mem_local;
285       cs.req_input_mem = input.size();
286       st = q->pipe->create_compute_state(q->pipe, &cs);
287       if (!st) {
288          unbind(); // Cleanup
289          throw error(CL_OUT_OF_RESOURCES);
290       }
291    }
292 
293    return st;
294 }
295 
296 void
unbind()297 kernel::exec_context::unbind() {
298    if (print_handler)
299       print_handler->print();
300 
301    for (auto &arg : kern.args())
302       arg.unbind(*this);
303 
304    input.clear();
305    samplers.clear();
306    sviews.clear();
307    iviews.clear();
308    resources.clear();
309    g_buffers.clear();
310    g_handles.clear();
311    mem_local = 0;
312 }
313 
314 namespace {
///
/// Return the sizeof(x) bytes that make up \a x, in the order they are
/// stored in memory.
///
template<typename T>
std::vector<uint8_t>
bytes(const T &x) {
   const auto *first = reinterpret_cast<const uint8_t *>(&x);

   return std::vector<uint8_t>(first, first + sizeof(x));
}
320 
321    ///
322    /// Transform buffer \a v from the native byte order into the byte
323    /// order specified by \a e.
324    ///
325    template<typename T>
326    void
byteswap(T & v,pipe_endian e)327    byteswap(T &v, pipe_endian e) {
328       if (PIPE_ENDIAN_NATIVE != e)
329          std::reverse(v.begin(), v.end());
330    }
331 
///
/// Resize buffer \a v to the size computed by
/// util_align_npot(v.size(), n), i.e. pad it to the next multiple of
/// \a n.
///
template<typename T>
void
align(T &v, size_t n) {
   const auto padded_size = util_align_npot(v.size(), n);

   v.resize(padded_size);
}
340 
341    bool
msb(const std::vector<uint8_t> & s)342    msb(const std::vector<uint8_t> &s) {
343       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
344          return s.back() & 0x80;
345       else
346          return s.front() & 0x80;
347    }
348 
349    ///
350    /// Resize buffer \a v to size \a n using sign or zero extension
351    /// according to \a ext.
352    ///
353    template<typename T>
354    void
extend(T & v,enum binary::argument::ext_type ext,size_t n)355    extend(T &v, enum binary::argument::ext_type ext, size_t n) {
356       const size_t m = std::min(v.size(), n);
357       const bool sign_ext = (ext == binary::argument::sign_ext);
358       const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
359       T w(n, fill);
360 
361       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
362          std::copy_n(v.begin(), m, w.begin());
363       else
364          std::copy_n(v.end() - m, m, w.end() - m);
365 
366       std::swap(v, w);
367    }
368 
///
/// Append the contents of buffer \a w to the end of buffer \a v.
///
template<typename T>
void
insert(T &v, const T &w) {
   const auto first = w.begin();
   const auto last = w.end();

   v.insert(v.end(), first, last);
}
377 
///
/// Grow buffer \a v by \a n elements at its end.
///
/// \return The size of \a v before growing, which is the index of the
///         first added element.
///
template<typename T>
size_t
allocate(T &v, size_t n) {
   const size_t old_size = v.size();

   v.resize(old_size + n);
   return old_size;
}
388 }
389 
390 std::unique_ptr<kernel::argument>
create(const binary::argument & barg)391 kernel::argument::create(const binary::argument &barg) {
392    switch (barg.type) {
393    case binary::argument::scalar:
394       return std::unique_ptr<kernel::argument>(new scalar_argument(barg.size));
395 
396    case binary::argument::global:
397       return std::unique_ptr<kernel::argument>(new global_argument);
398 
399    case binary::argument::local:
400       return std::unique_ptr<kernel::argument>(new local_argument);
401 
402    case binary::argument::constant:
403       return std::unique_ptr<kernel::argument>(new constant_argument);
404 
405    case binary::argument::image_rd:
406       return std::unique_ptr<kernel::argument>(new image_rd_argument);
407 
408    case binary::argument::image_wr:
409       return std::unique_ptr<kernel::argument>(new image_wr_argument);
410 
411    case binary::argument::sampler:
412       return std::unique_ptr<kernel::argument>(new sampler_argument);
413 
414    }
415    throw error(CL_INVALID_KERNEL_DEFINITION);
416 }
417 
argument()418 kernel::argument::argument() : _set(false) {
419 }
420 
421 bool
set() const422 kernel::argument::set() const {
423    return _set;
424 }
425 
426 size_t
storage() const427 kernel::argument::storage() const {
428    return 0;
429 }
430 
scalar_argument(size_t size)431 kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
432 }
433 
434 void
set(size_t size,const void * value)435 kernel::scalar_argument::set(size_t size, const void *value) {
436    if (!value)
437       throw error(CL_INVALID_ARG_VALUE);
438 
439    if (size != this->size)
440       throw error(CL_INVALID_ARG_SIZE);
441 
442    v = { (uint8_t *)value, (uint8_t *)value + size };
443    _set = true;
444 }
445 
446 void
bind(exec_context & ctx,const binary::argument & barg)447 kernel::scalar_argument::bind(exec_context &ctx,
448                               const binary::argument &barg) {
449    auto w = v;
450 
451    extend(w, barg.ext_type, barg.target_size);
452    byteswap(w, ctx.q->device().endianness());
453    align(ctx.input, barg.target_align);
454    insert(ctx.input, w);
455 }
456 
457 void
unbind(exec_context & ctx)458 kernel::scalar_argument::unbind(exec_context &ctx) {
459 }
460 
global_argument()461 kernel::global_argument::global_argument() : buf(nullptr), svm(nullptr) {
462 }
463 
464 void
set(size_t size,const void * value)465 kernel::global_argument::set(size_t size, const void *value) {
466    if (size != sizeof(cl_mem))
467       throw error(CL_INVALID_ARG_SIZE);
468 
469    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
470    svm = nullptr;
471    _set = true;
472 }
473 
474 void
set_svm(const void * value)475 kernel::global_argument::set_svm(const void *value) {
476    svm = value;
477    buf = nullptr;
478    _set = true;
479 }
480 
481 void
bind(exec_context & ctx,const binary::argument & barg)482 kernel::global_argument::bind(exec_context &ctx,
483                               const binary::argument &barg) {
484    align(ctx.input, barg.target_align);
485 
486    if (buf) {
487       const resource &r = buf->resource_in(*ctx.q);
488       ctx.g_handles.push_back(ctx.input.size());
489       ctx.g_buffers.push_back(r.pipe);
490 
491       // How to handle multi-demensional offsets?
492       // We don't need to.  Buffer offsets are always
493       // one-dimensional.
494       auto v = bytes(r.offset[0]);
495       extend(v, barg.ext_type, barg.target_size);
496       byteswap(v, ctx.q->device().endianness());
497       insert(ctx.input, v);
498    } else if (svm) {
499       auto v = bytes(svm);
500       extend(v, barg.ext_type, barg.target_size);
501       byteswap(v, ctx.q->device().endianness());
502       insert(ctx.input, v);
503    } else {
504       // Null pointer.
505       allocate(ctx.input, barg.target_size);
506    }
507 }
508 
509 void
unbind(exec_context & ctx)510 kernel::global_argument::unbind(exec_context &ctx) {
511 }
512 
513 size_t
storage() const514 kernel::local_argument::storage() const {
515    return _storage;
516 }
517 
518 void
set(size_t size,const void * value)519 kernel::local_argument::set(size_t size, const void *value) {
520    if (value)
521       throw error(CL_INVALID_ARG_VALUE);
522 
523    if (!size)
524       throw error(CL_INVALID_ARG_SIZE);
525 
526    _storage = size;
527    _set = true;
528 }
529 
530 void
bind(exec_context & ctx,const binary::argument & barg)531 kernel::local_argument::bind(exec_context &ctx,
532                              const binary::argument &barg) {
533    ctx.mem_local = ::align(ctx.mem_local, barg.target_align);
534    auto v = bytes(ctx.mem_local);
535 
536    extend(v, binary::argument::zero_ext, barg.target_size);
537    byteswap(v, ctx.q->device().endianness());
538    align(ctx.input, ctx.q->device().address_bits() / 8);
539    insert(ctx.input, v);
540 
541    ctx.mem_local += _storage;
542 }
543 
544 void
unbind(exec_context & ctx)545 kernel::local_argument::unbind(exec_context &ctx) {
546 }
547 
constant_argument()548 kernel::constant_argument::constant_argument() : buf(nullptr), st(nullptr) {
549 }
550 
551 void
set(size_t size,const void * value)552 kernel::constant_argument::set(size_t size, const void *value) {
553    if (size != sizeof(cl_mem))
554       throw error(CL_INVALID_ARG_SIZE);
555 
556    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
557    _set = true;
558 }
559 
560 void
bind(exec_context & ctx,const binary::argument & barg)561 kernel::constant_argument::bind(exec_context &ctx,
562                                 const binary::argument &barg) {
563    align(ctx.input, barg.target_align);
564 
565    if (buf) {
566       resource &r = buf->resource_in(*ctx.q);
567       auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);
568 
569       extend(v, binary::argument::zero_ext, barg.target_size);
570       byteswap(v, ctx.q->device().endianness());
571       insert(ctx.input, v);
572 
573       st = r.bind_surface(*ctx.q, false);
574       ctx.resources.push_back(st);
575    } else {
576       // Null pointer.
577       allocate(ctx.input, barg.target_size);
578    }
579 }
580 
581 void
unbind(exec_context & ctx)582 kernel::constant_argument::unbind(exec_context &ctx) {
583    if (buf)
584       buf->resource_in(*ctx.q).unbind_surface(*ctx.q, st);
585 }
586 
image_rd_argument()587 kernel::image_rd_argument::image_rd_argument() : st(nullptr) {
588 }
589 
590 void
set(size_t size,const void * value)591 kernel::image_rd_argument::set(size_t size, const void *value) {
592    if (!value)
593       throw error(CL_INVALID_ARG_VALUE);
594 
595    if (size != sizeof(cl_mem))
596       throw error(CL_INVALID_ARG_SIZE);
597 
598    img = &obj<image>(*(cl_mem *)value);
599    _set = true;
600 }
601 
602 void
bind(exec_context & ctx,const binary::argument & barg)603 kernel::image_rd_argument::bind(exec_context &ctx,
604                                 const binary::argument &barg) {
605    auto v = bytes(ctx.sviews.size());
606 
607    extend(v, binary::argument::zero_ext, barg.target_size);
608    byteswap(v, ctx.q->device().endianness());
609    align(ctx.input, barg.target_align);
610    insert(ctx.input, v);
611 
612    st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q);
613    ctx.sviews.push_back(st);
614 }
615 
616 void
unbind(exec_context & ctx)617 kernel::image_rd_argument::unbind(exec_context &ctx) {
618    img->resource_in(*ctx.q).unbind_sampler_view(*ctx.q, st);
619 }
620 
621 void
set(size_t size,const void * value)622 kernel::image_wr_argument::set(size_t size, const void *value) {
623    if (!value)
624       throw error(CL_INVALID_ARG_VALUE);
625 
626    if (size != sizeof(cl_mem))
627       throw error(CL_INVALID_ARG_SIZE);
628 
629    img = &obj<image>(*(cl_mem *)value);
630    _set = true;
631 }
632 
633 void
bind(exec_context & ctx,const binary::argument & barg)634 kernel::image_wr_argument::bind(exec_context &ctx,
635                                 const binary::argument &barg) {
636    auto v = bytes(ctx.iviews.size());
637 
638    extend(v, binary::argument::zero_ext, barg.target_size);
639    byteswap(v, ctx.q->device().endianness());
640    align(ctx.input, barg.target_align);
641    insert(ctx.input, v);
642    ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q));
643 }
644 
645 void
unbind(exec_context & ctx)646 kernel::image_wr_argument::unbind(exec_context &ctx) {
647 }
648 
sampler_argument()649 kernel::sampler_argument::sampler_argument() : s(nullptr), st(nullptr) {
650 }
651 
652 void
set(size_t size,const void * value)653 kernel::sampler_argument::set(size_t size, const void *value) {
654    if (!value)
655       throw error(CL_INVALID_SAMPLER);
656 
657    if (size != sizeof(cl_sampler))
658       throw error(CL_INVALID_ARG_SIZE);
659 
660    s = &obj(*(cl_sampler *)value);
661    _set = true;
662 }
663 
664 void
bind(exec_context & ctx,const binary::argument & barg)665 kernel::sampler_argument::bind(exec_context &ctx,
666                                const binary::argument &barg) {
667    st = s->bind(*ctx.q);
668    ctx.samplers.push_back(st);
669 }
670 
671 void
unbind(exec_context & ctx)672 kernel::sampler_argument::unbind(exec_context &ctx) {
673    s->unbind(*ctx.q, st);
674 }
675