/***************************************************************************
                                 ocl_memory.h
                             -------------------
                               W. Michael Brown

  OpenCL Specific Memory Management and Vector/Matrix Containers

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Wed Jan 13 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_MEMORY_H
#define OCL_MEMORY_H

#include <iostream>
#include <cassert>
#include <cstring>
#include "ucl_types.h"

namespace ucl_opencl {

// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
struct ocl_kernel_dim {
  size_t x,y,z;
  ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
    x(_x), y(_y), z(_z) {}
  operator size_t * () { return (size_t *)this; }
  operator const size_t * () const { return (const size_t *)this; }
};
typedef ocl_kernel_dim ucl_kernel_dim;

// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef cl_mem device_ptr;

// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------

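// Allocate page-locked, device-mapped host memory of n bytes using a buffer
// created in the same context and command queue as the existing container cm.
// The access kinds control the buffer flags and the map permissions.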
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));

  cl_mem_flags buffer_perm;
  cl_map_flags map_perm;
  if (kind2==UCL_NOT_SPECIFIED) {
    if (kind==UCL_READ_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
      #else
      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
      #endif
      map_perm=CL_MAP_READ;
    } else if (kind==UCL_WRITE_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
      #else
      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
      #endif
      map_perm=CL_MAP_WRITE;
    } else {
      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
      map_perm=CL_MAP_READ | CL_MAP_WRITE;
    }
  } else {
    if (kind2==UCL_READ_ONLY)
      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
    else if (kind2==UCL_WRITE_ONLY)
      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
    else
      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;

    if (kind==UCL_READ_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
      #endif
      map_perm=CL_MAP_READ;
    } else if (kind==UCL_WRITE_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
      #endif
      map_perm=CL_MAP_WRITE;
    } else
      map_perm=CL_MAP_READ | CL_MAP_WRITE;
  }

  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,nullptr,nullptr,nullptr);

  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

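// Create a host view into an existing mapped buffer as an OpenCL sub-buffer
// starting at byte offset o with length n bytes.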
template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
                      const size_t n) {
  cl_int error_flag;
  cl_buffer_region subbuffer;
  subbuffer.origin = o;
  subbuffer.size = n;
  mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
                                 CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
                                 &error_flag);

  CL_CHECK_ERR(error_flag);
  // Adopt the command queue of the viewed container before retaining it
  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

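// Allocate page-locked, device-mapped host memory in the context of dev,
// using the device's default command queue for mapping.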
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  cl_mem_flags buffer_perm;
  cl_map_flags map_perm;
  if (kind==UCL_READ_ONLY) {
    #ifdef CL_VERSION_1_2
    buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
    #else
    buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
    #endif
    map_perm=CL_MAP_READ;
  } else if (kind==UCL_WRITE_ONLY) {
    #ifdef CL_VERSION_1_2
    buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
    #else
    buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
    #endif
    map_perm=CL_MAP_WRITE;
  } else {
    buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
    map_perm=CL_MAP_READ | CL_MAP_WRITE;
  }

  cl_int error_flag;
  mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;

  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,nullptr,nullptr,nullptr);
  mat.cq()=dev.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
  cl_int error_flag;
  mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
                              n,*mat.host_ptr(),&error_flag);
  CL_CHECK_ERR(error_flag);
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

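// Release the OpenCL buffer and command queue references held by a host
// container (no-op for zero-size containers).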
template <class mat_type>
inline void _host_free(mat_type &mat) {
  if (mat.cols()>0) {
    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
  }
}

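// Resize mapped host memory by releasing the old buffer and allocating a new
// one with the same context, buffer flags, and access kind.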
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  cl_mem_flags buffer_perm;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm),
                                  &buffer_perm,nullptr));

  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_map_flags map_perm;
  if (mat.kind()==UCL_READ_ONLY)
    map_perm=CL_MAP_READ;
  else if (mat.kind()==UCL_WRITE_ONLY)
    map_perm=CL_MAP_WRITE;
  else
    map_perm=CL_MAP_READ | CL_MAP_WRITE;

  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,nullptr,nullptr,nullptr);
  return UCL_SUCCESS;
}

// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------

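// Allocate device memory of n bytes. The first overload inherits the context
// and command queue from an existing container cm; the second uses the
// context and default queue of a UCL_Device.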
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
                         const enum UCL_MEMOPT kind) {
  cl_int error_flag;

  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  cl_mem_flags flag;
  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (kind==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                         const enum UCL_MEMOPT kind) {
  cl_int error_flag;
  cl_mem_flags flag;
  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (kind==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(dev.context(),flag,n,nullptr,
                              &error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=dev.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

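// Allocate 2D device memory with the number of columns padded up to a
// multiple of 256 (no padding on CPU devices in the UCL_Device overload);
// the resulting row pitch in bytes is returned through pitch.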
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  size_t padded_cols=cols;
  if (cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);
  return _device_alloc(mat,cm,pitch*rows,kind);
}

template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  size_t padded_cols=cols;
  if (dev.device_type()!=UCL_CPU && cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);
  return _device_alloc(mat,dev,pitch*rows,kind);
}

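// Release the OpenCL buffer and command queue references held by a device
// container (no-op for zero-size containers).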
template <class mat_type>
inline void _device_free(mat_type &mat) {
  if (mat.cols()>0) {
    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
  }
}

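// Resize device memory by releasing the old buffer and allocating a new one
// in the same context with flags derived from the container's access kind.
// The 2D overload re-applies the 256-column pitch padding.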
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
  cl_int error_flag;

  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_mem_flags flag;
  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (mat.kind()==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
                          const size_t cols, size_t &pitch) {
  size_t padded_cols=cols;
  if (cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);

  cl_int error_flag;

  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_mem_flags flag;
  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (mat.kind()==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}


// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
  memset(ptr,0,n);
}

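// Build an OpenCL program for the given device and, on failure, print the
// build log to stderr.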
inline void _ocl_build(cl_program &program, cl_device_id &device,
                       const char* options = "") {
  clBuildProgram(program,1,&device,options,nullptr,nullptr);

  cl_build_status build_status;
  CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
                                     sizeof(cl_build_status),&build_status,
                                     nullptr));
  if (build_status == CL_SUCCESS)
    return;

  size_t ms;
  CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
                                     nullptr, &ms));
  char *build_log = new char[ms];
  CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
                                     build_log, nullptr));

  std::cerr << std::endl
            << "----------------------------------------------------------\n"
            << " Error compiling OpenCL Program...\n"
            << "----------------------------------------------------------\n";
  std::cerr << build_log << std::endl;
  delete[] build_log;
}

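// Compile a kernel named function from an array of source strings and return
// it through the kernel argument.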
inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
                                    const char **source, const size_t lines,
                                    cl_kernel &kernel, const char *function,
                                    const char *options="") {
  cl_int error_flag;

  cl_program program=clCreateProgramWithSource(context,lines,source,
                                               nullptr,&error_flag);
  CL_CHECK_ERR(error_flag);
  _ocl_build(program,device,options);
  kernel=clCreateKernel(program,function,&error_flag);
  CL_CHECK_ERR(error_flag);
}

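// Zero n bytes of device memory. With OpenCL 1.2 (except on Apple platforms)
// this uses clEnqueueFillBuffer; otherwise a small kernel is compiled on the
// fly to write zeros element by element.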
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  #ifdef CL_VERSION_1_2
  #ifndef __APPLE__
  #define UCL_CL_ZERO
  #endif
  #endif

  #ifdef UCL_CL_ZERO
  cl_int zeroint=0;
  CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
                                   mat.byteoff(),n,0,nullptr,nullptr));

  #else
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  cl_device_id device;
  CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
                                sizeof(cl_device_id),&device,nullptr));

  const char * szero[3]={
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
    "__kernel void _device_zero(__global NUMTYP *a, const int offset)",
    "  { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
  };

  cl_kernel kzero;
  _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
                          _UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());

  cl_int offset=mat.offset();
  CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
  CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
  size_t kn=n/sizeof(typename mat_type::data_type);
  CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,nullptr,&kn,nullptr,0,
                                      nullptr,nullptr));
  #endif
  #ifdef GERYON_OCL_FLUSH
  ucl_flush(cq);
  #endif
}

// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------

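// Copies are dispatched at compile time on the MEM_TYPE of the destination
// and source containers: 0 = device memory, 1 = host memory, 2 = image or
// texture memory (unsupported in the OpenCL backend).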
template<int mem1, int mem2> struct _ucl_memcpy;

// Both are images
template<> struct _ucl_memcpy<2,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Destination is texture, source on device
template<> struct _ucl_memcpy<2,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Destination is texture, source on host
template<> struct _ucl_memcpy<2,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Source is texture, dest on device
template<> struct _ucl_memcpy<0,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Source is texture, dest on host
template<> struct _ucl_memcpy<1,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Neither are textures, destination on host
template <> struct _ucl_memcpy<1,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 1S\n";
      #endif
      if (block) ucl_sync(cq);
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 1NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
                                     dst.begin(),0,nullptr,nullptr));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 2S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 2NS\n";
    #endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
                                       spitch*rows,
                                       (char *)dst.begin()+dst_offset,0,nullptr,
                                       nullptr));
    else
      for (size_t i=0; i<rows; i++) {
        CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols,
                                         (char *)dst.begin()+dst_offset,0,nullptr,
                                         nullptr));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

// Neither are textures, source on host
template <> struct _ucl_memcpy<0,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 3S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 3NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
                                      src.begin(),0,nullptr,nullptr));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 4S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 4NS\n";
    #endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
                                        spitch*rows,
                                        (char *)src.begin()+src_offset,0,nullptr,
                                        nullptr));
    else
      for (size_t i=0; i<rows; i++) {
        CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,cols,
                                          (char *)src.begin()+src_offset,0,nullptr,
                                          nullptr));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

// Neither are textures, both on device
template <int mem1, int mem2> struct _ucl_memcpy {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
      CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
                                       dst_offset,n,0,nullptr,nullptr));
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 6NS\n";
      #endif
    }
    #ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 6S\n";
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7NS\n";
      #endif
      if (spitch==dpitch && dst.cols()==src.cols() &&
          src.cols()==cols/src.element_size())
        CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
                                         dst_offset,spitch*rows,0,nullptr,nullptr));
      else
        for (size_t i=0; i<rows; i++) {
          CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
                                           src_offset,dst_offset,cols,0,
                                           nullptr,nullptr));
          src_offset+=spitch;
          dst_offset+=dpitch;
        }
    }
    #ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 7S\n";
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
};

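// Generic memcpy entry points used by the UCL containers. Destination and
// source can be any mix of host and device containers; the overloads without
// an explicit queue block on the destination's default command queue, while
// the overloads taking a cl_command_queue enqueue asynchronously.
//
// A minimal usage sketch (container names are illustrative only):
//   ucl_mv_cpy(dst_vec, src_vec, n_bytes);      // blocking copy of n_bytes
//   ucl_mv_cpy(dst_vec, src_vec, n_bytes, cq);  // asynchronous copy on cq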
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,dst.cq(),CL_TRUE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
                       cl_command_queue &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq,CL_FALSE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows,dst.cq(),CL_TRUE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows,cl_command_queue &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows,cq,CL_FALSE,
                                                 dst.byteoff(),src.byteoff());
}

} // namespace ucl_opencl

#endif
