//
// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
22 
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28 
29 using namespace clover;
30 
kernel(clover::program & prog,const std::string & name,const std::vector<module::argument> & margs)31 kernel::kernel(clover::program &prog, const std::string &name,
32                const std::vector<module::argument> &margs) :
33    program(prog), _name(name), exec(*this),
34    program_ref(prog._kernel_ref_counter) {
35    for (auto &marg : margs) {
36       if (marg.semantic == module::argument::general)
37          _args.emplace_back(argument::create(marg));
38    }
39 }
40 
41 template<typename V>
42 static inline std::vector<uint>
pad_vector(command_queue & q,const V & v,uint x)43 pad_vector(command_queue &q, const V &v, uint x) {
44    std::vector<uint> w { v.begin(), v.end() };
45    w.resize(q.device().max_block_size().size(), x);
46    return w;
47 }
48 
///
/// Launch this kernel on queue \a q with the given grid offset, global
/// grid size and block (work-group) size.  Serializes the arguments,
/// binds all required pipe objects, fires launch_grid() and unbinds
/// everything again.
///
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().build(q.device()).binary;
   // The pipe driver expects the grid in blocks rather than work-items.
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need make
   // sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   // Bind the compute state and every object the serialized arguments
   // refer to (samplers, sampler views, RW resources, global buffers).
   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Fill information for the launch_grid() call.
   info.work_dim = grid_size.size();
   copy(pad_vector(q, block_size, 1), info.block);
   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
   info.pc = find(name_equals(_name), m.syms).offset;
   info.input = exec.input.data();

   q.pipe->launch_grid(q.pipe, &info);

   // Unbind everything in reverse order of the binding above.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);

   // Make sure global buffer writes are visible to subsequent commands.
   q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
   exec.unbind();
}
97 
98 size_t
mem_local() const99 kernel::mem_local() const {
100    size_t sz = 0;
101 
102    for (auto &arg : args()) {
103       if (dynamic_cast<local_argument *>(&arg))
104          sz += arg.storage();
105    }
106 
107    return sz;
108 }
109 
///
/// Amount of private memory used by this kernel, in bytes.
/// Not tracked by this implementation, so it always reports zero.
///
size_t
kernel::mem_private() const {
   return 0;
}
114 
///
/// Name of this kernel as found in the module's symbol table.
///
const std::string &
kernel::name() const {
   return _name;
}
119 
120 std::vector<size_t>
optimal_block_size(const command_queue & q,const std::vector<size_t> & grid_size) const121 kernel::optimal_block_size(const command_queue &q,
122                            const std::vector<size_t> &grid_size) const {
123    return factor::find_grid_optimal_factor<size_t>(
124       q.device().max_threads_per_block(), q.device().max_block_size(),
125       grid_size);
126 }
127 
///
/// Block size required by the kernel source (reqd_work_group_size).
/// All-zero means no specific size is required.
///
std::vector<size_t>
kernel::required_block_size() const {
   return { 0, 0, 0 };
}
132 
///
/// Range of references to the explicit kernel arguments.
///
kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}
137 
///
/// Range of const references to the explicit kernel arguments.
///
kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}
142 
///
/// Binary module this kernel was built from, for the device of \a q.
///
const module &
kernel::module(const command_queue &q) const {
   return program().build(q.device()).binary;
}
147 
///
/// Create an execution context for \a kern with no queue bound yet.
///
kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}
151 
///
/// Release the cached compute state, if one was ever created.
///
kernel::exec_context::~exec_context() {
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}
156 
///
/// Prepare a launch on queue \a _q: serialize every kernel argument
/// (explicit and implicit) into the input buffer and create -- or
/// reuse -- the pipe compute state.  Returns the compute state.
/// Throws error(CL_OUT_OF_RESOURCES) if state creation fails.
///
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   // After the swap, q is the new queue and _q the previous one; the
   // old value is compared against below to detect a queue change.
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &m = kern.program().build(q->device()).binary;
   auto msym = find(name_equals(kern.name()), m.syms);
   auto margs = msym.args;
   auto msec = find(id_equals(msym.section), m.secs);
   auto explicit_arg = kern._args.begin();

   for (auto &marg : margs) {
      switch (marg.semantic) {
      case module::argument::general:
         // Explicit argument set earlier via clSetKernelArg().
         (*(explicit_arg++))->bind(*this, marg);
         break;

      case module::argument::grid_dimension: {
         // Implicit argument: number of grid dimensions.
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(marg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, marg);
         break;
      }
      case module::argument::grid_offset: {
         // Implicit argument: one component per dimension of the grid
         // offset, zero-padded to the device's dimension count.
         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_size: {
         // Implicit argument: dimensions of the image passed as the
         // immediately preceding explicit argument.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         std::vector<cl_uint> image_size{
               static_cast<cl_uint>(img->width()),
               static_cast<cl_uint>(img->height()),
               static_cast<cl_uint>(img->depth())};
         for (auto x : image_size) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_format: {
         // Implicit argument: channel data type and order of the image
         // passed as the immediately preceding explicit argument.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         cl_image_format fmt = img->format();
         std::vector<cl_uint> image_format{
               static_cast<cl_uint>(fmt.image_channel_data_type),
               static_cast<cl_uint>(fmt.image_channel_order)};
         for (auto x : image_format) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      }
   }

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.ir_type = q->device().ir_format();
      cs.prog = &(msec.data[0]);
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
      if (!st) {
         unbind(); // Cleanup
         throw error(CL_OUT_OF_RESOURCES);
      }
   }

   return st;
}
243 
///
/// Undo the work of bind(): let every argument release its per-launch
/// resources and reset the serialized input state.
///
void
kernel::exec_context::unbind() {
   for (auto &arg : kern.args())
      arg.unbind(*this);

   input.clear();
   samplers.clear();
   sviews.clear();
   resources.clear();
   g_buffers.clear();
   g_handles.clear();
   mem_local = 0;
}
257 
namespace {
   ///
   /// Serialize object \a x into a vector of bytes in the native byte
   /// order.
   ///
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      v.resize(util_align_npot(v.size(), n));
   }

   ///
   /// True if the most significant (sign) bit of the native-endian
   /// buffer \a s is set.
   ///
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum module::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == module::argument::sign_ext);
      // Sign extension fills with 0xff when the value is negative.
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      // The least significant bytes live at opposite ends of the
      // buffer depending on the host byte order.
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v, returning the
   /// offset of the first element that was added.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}
333 
334 std::unique_ptr<kernel::argument>
create(const module::argument & marg)335 kernel::argument::create(const module::argument &marg) {
336    switch (marg.type) {
337    case module::argument::scalar:
338       return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));
339 
340    case module::argument::global:
341       return std::unique_ptr<kernel::argument>(new global_argument);
342 
343    case module::argument::local:
344       return std::unique_ptr<kernel::argument>(new local_argument);
345 
346    case module::argument::constant:
347       return std::unique_ptr<kernel::argument>(new constant_argument);
348 
349    case module::argument::image2d_rd:
350    case module::argument::image3d_rd:
351       return std::unique_ptr<kernel::argument>(new image_rd_argument);
352 
353    case module::argument::image2d_wr:
354    case module::argument::image3d_wr:
355       return std::unique_ptr<kernel::argument>(new image_wr_argument);
356 
357    case module::argument::sampler:
358       return std::unique_ptr<kernel::argument>(new sampler_argument);
359 
360    }
361    throw error(CL_INVALID_KERNEL_DEFINITION);
362 }
363 
///
/// A freshly created argument has no value set yet.
///
kernel::argument::argument() : _set(false) {
}
366 
///
/// Whether a value has been assigned to this argument.
///
bool
kernel::argument::set() const {
   return _set;
}
371 
///
/// Amount of local storage this argument requests.  Zero for
/// everything except local arguments, which override this.
///
size_t
kernel::argument::storage() const {
   return 0;
}
376 
///
/// \a size is the byte size the scalar value must have when set.
///
kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}
379 
///
/// Store a copy of the scalar value.  Throws CL_INVALID_ARG_VALUE for
/// a null pointer and CL_INVALID_ARG_SIZE on a size mismatch.
///
void
kernel::scalar_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != this->size)
      throw error(CL_INVALID_ARG_SIZE);

   v = { (uint8_t *)value, (uint8_t *)value + size };
   _set = true;
}
391 
392 void
bind(exec_context & ctx,const module::argument & marg)393 kernel::scalar_argument::bind(exec_context &ctx,
394                               const module::argument &marg) {
395    auto w = v;
396 
397    extend(w, marg.ext_type, marg.target_size);
398    byteswap(w, ctx.q->device().endianness());
399    align(ctx.input, marg.target_align);
400    insert(ctx.input, w);
401 }
402 
///
/// Scalars hold no per-launch resources; nothing to release.
///
void
kernel::scalar_argument::unbind(exec_context &ctx) {
}
406 
407 void
set(size_t size,const void * value)408 kernel::global_argument::set(size_t size, const void *value) {
409    if (size != sizeof(cl_mem))
410       throw error(CL_INVALID_ARG_SIZE);
411 
412    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
413    svm = nullptr;
414    _set = true;
415 }
416 
417 void
set_svm(const void * value)418 kernel::global_argument::set_svm(const void *value) {
419    svm = value;
420    buf = nullptr;
421    _set = true;
422 }
423 
///
/// Serialize the global pointer into the input buffer.  For buffer
/// objects the driver patches the final address in later via the
/// handle recorded in g_handles; SVM pointers are written directly;
/// a NULL argument is zero-filled.
///
void
kernel::global_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      const resource &r = buf->resource(*ctx.q);
      // Record where in the input buffer the driver must relocate the
      // buffer's base address at launch_grid() time.
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How to handle multi-demensional offsets?
      // We don't need to.  Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else if (svm) {
      auto v = bytes(svm);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}
451 
///
/// Global buffers are unbound by launch(); nothing to do here.
///
void
kernel::global_argument::unbind(exec_context &ctx) {
}
455 
///
/// Number of bytes of local memory requested for this argument.
///
size_t
kernel::local_argument::storage() const {
   return _storage;
}
460 
///
/// Set the size of the local allocation.  Per the CL spec, local
/// arguments take no value (must be NULL) and a non-zero size.
///
void
kernel::local_argument::set(size_t size, const void *value) {
   if (value)
      throw error(CL_INVALID_ARG_VALUE);

   if (!size)
      throw error(CL_INVALID_ARG_SIZE);

   _storage = size;
   _set = true;
}
472 
///
/// Serialize the offset of this argument's slice of local memory into
/// the input buffer and reserve the slice by advancing the running
/// local-memory counter.
///
void
kernel::local_argument::bind(exec_context &ctx,
                             const module::argument &marg) {
   // The value passed to the kernel is the current offset into the
   // launch's local memory area.
   auto v = bytes(ctx.mem_local);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}
485 
///
/// Local memory is accounted for in exec_context::unbind(); nothing
/// to do per argument.
///
void
kernel::local_argument::unbind(exec_context &ctx) {
}
489 
///
/// Set the argument to a cl_mem buffer object holding the constant
/// data (possibly NULL).  Throws CL_INVALID_ARG_SIZE unless exactly
/// sizeof(cl_mem) bytes are passed.
///
void
kernel::constant_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
   _set = true;
}
498 
///
/// Bind the constant buffer as a compute resource and serialize a
/// packed handle -- resource index in the top byte, byte offset in
/// the low 24 bits -- into the input buffer.
///
void
kernel::constant_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      resource &r = buf->resource(*ctx.q);
      // Pack the index of the surface we are about to append together
      // with the buffer's starting offset.
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, module::argument::zero_ext, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}
519 
///
/// Release the surface bound in bind(), if any.
///
void
kernel::constant_argument::unbind(exec_context &ctx) {
   if (buf)
      buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
}
525 
///
/// Set the argument to a non-NULL cl_mem image object.  Throws
/// CL_INVALID_ARG_VALUE for NULL and CL_INVALID_ARG_SIZE on a size
/// mismatch.
///
void
kernel::image_rd_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}
537 
538 void
bind(exec_context & ctx,const module::argument & marg)539 kernel::image_rd_argument::bind(exec_context &ctx,
540                                 const module::argument &marg) {
541    auto v = bytes(ctx.sviews.size());
542 
543    extend(v, module::argument::zero_ext, marg.target_size);
544    byteswap(v, ctx.q->device().endianness());
545    align(ctx.input, marg.target_align);
546    insert(ctx.input, v);
547 
548    st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
549    ctx.sviews.push_back(st);
550 }
551 
///
/// Release the sampler view created in bind().
///
void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
}
556 
///
/// Set the argument to a non-NULL cl_mem image object.  Throws
/// CL_INVALID_ARG_VALUE for NULL and CL_INVALID_ARG_SIZE on a size
/// mismatch.
///
void
kernel::image_wr_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}
568 
569 void
bind(exec_context & ctx,const module::argument & marg)570 kernel::image_wr_argument::bind(exec_context &ctx,
571                                 const module::argument &marg) {
572    auto v = bytes(ctx.resources.size());
573 
574    extend(v, module::argument::zero_ext, marg.target_size);
575    byteswap(v, ctx.q->device().endianness());
576    align(ctx.input, marg.target_align);
577    insert(ctx.input, v);
578 
579    st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
580    ctx.resources.push_back(st);
581 }
582 
///
/// Release the surface created in bind().
///
void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
}
587 
///
/// Set the argument to a non-NULL cl_sampler object.  Throws
/// CL_INVALID_SAMPLER for NULL and CL_INVALID_ARG_SIZE on a size
/// mismatch.
///
void
kernel::sampler_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_SAMPLER);

   if (size != sizeof(cl_sampler))
      throw error(CL_INVALID_ARG_SIZE);

   s = &obj(*(cl_sampler *)value);
   _set = true;
}
599 
600 void
bind(exec_context & ctx,const module::argument & marg)601 kernel::sampler_argument::bind(exec_context &ctx,
602                                const module::argument &marg) {
603    st = s->bind(*ctx.q);
604    ctx.samplers.push_back(st);
605 }
606 
///
/// Release the sampler state created in bind().
///
void
kernel::sampler_argument::unbind(exec_context &ctx) {
   s->unbind(*ctx.q, st);
}
611