/***************************************************************************
                                ocl_memory.h
                             -------------------
                               W. Michael Brown

  OpenCL Specific Memory Management and Vector/Matrix Containers

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Wed Jan 13 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_MEMORY_H
#define OCL_MEMORY_H

#include <iostream>
#include <cassert>
#include <cstring>
#include "ucl_types.h"

namespace ucl_opencl {

// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
struct ocl_kernel_dim {
  size_t x,y,z;
  ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
    x(_x), y(_y), z(_z) {}
  operator size_t * () { return (size_t *)this; }
  operator const size_t * () const { return (const size_t *)this; }
};
typedef ocl_kernel_dim ucl_kernel_dim;
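
// The conversion operators above let a dimension triple be passed directly
// where the OpenCL C API expects a size_t array. A minimal sketch, assuming
// a valid command queue cq and kernel k (illustrative names only):
//   ucl_kernel_dim gsize(1024), lsize(64);
//   clEnqueueNDRangeKernel(cq,k,1,nullptr,gsize,lsize,0,nullptr,nullptr);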

// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef cl_mem device_ptr;

// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------

template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));

  cl_mem_flags buffer_perm;
  cl_map_flags map_perm;
  if (kind2==UCL_NOT_SPECIFIED) {
    if (kind==UCL_READ_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
      #else
      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
      #endif
      map_perm=CL_MAP_READ;
    } else if (kind==UCL_WRITE_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
      #else
      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
      #endif
      map_perm=CL_MAP_WRITE;
    } else {
      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
      map_perm=CL_MAP_READ | CL_MAP_WRITE;
    }
  } else {
    if (kind2==UCL_READ_ONLY)
      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
    else if (kind2==UCL_WRITE_ONLY)
      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
    else
      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;

    if (kind==UCL_READ_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
      #endif
      map_perm=CL_MAP_READ;
    } else if (kind==UCL_WRITE_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
      #endif
      map_perm=CL_MAP_WRITE;
    } else
      map_perm=CL_MAP_READ | CL_MAP_WRITE;
  }

  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
    clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
                       map_perm,0,n,0,nullptr,nullptr,nullptr);

  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
                      const size_t n) {
  cl_int error_flag;
  cl_buffer_region subbuffer;
  subbuffer.origin = o;
  subbuffer.size = n;
  mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
                                 CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
                                 &error_flag);

  CL_CHECK_ERR(error_flag);
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  cl_mem_flags buffer_perm;
  cl_map_flags map_perm;
  if (kind==UCL_READ_ONLY) {
    #ifdef CL_VERSION_1_2
    buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
    #else
    buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
    #endif
    map_perm=CL_MAP_READ;
  } else if (kind==UCL_WRITE_ONLY) {
    #ifdef CL_VERSION_1_2
    buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
    #else
    buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
    #endif
    map_perm=CL_MAP_WRITE;
  } else {
    buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
    map_perm=CL_MAP_READ | CL_MAP_WRITE;
  }

  cl_int error_flag;
  mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;

  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,nullptr,nullptr,nullptr);
  mat.cq()=dev.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
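
// A minimal usage sketch, assuming the UCL_H_Vec host container from the
// companion Geryon headers (an illustrative name, not defined in this file)
// that exposes the cbegin()/cq()/host_ptr() accessors used above:
//   UCL_Device dev;
//   UCL_H_Vec<float> hv;
//   if (_host_alloc(hv,dev,1000*sizeof(float),UCL_WRITE_ONLY,
//                   UCL_NOT_SPECIFIED)!=UCL_SUCCESS)
//     std::cerr << "Pinned host allocation failed.\n";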

template <class mat_type>
inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
  cl_int error_flag;
  mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
                              n,*mat.host_ptr(),&error_flag);
  CL_CHECK_ERR(error_flag);
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

template <class mat_type>
inline void _host_free(mat_type &mat) {
  if (mat.cols()>0) {
    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
  }
}

template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  cl_mem_flags buffer_perm;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm),
                                  &buffer_perm,nullptr));

  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_map_flags map_perm;
  if (mat.kind()==UCL_READ_ONLY)
    map_perm=CL_MAP_READ;
  else if (mat.kind()==UCL_WRITE_ONLY)
    map_perm=CL_MAP_WRITE;
  else
    map_perm=CL_MAP_READ | CL_MAP_WRITE;

  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,nullptr,nullptr,nullptr);
  return UCL_SUCCESS;
}

// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------

template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
                         const enum UCL_MEMOPT kind) {
  cl_int error_flag;

  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
               &context,nullptr));
  cl_mem_flags flag;
  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (kind==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                         const enum UCL_MEMOPT kind) {
  cl_int error_flag;
  cl_mem_flags flag;
  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (kind==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(dev.context(),flag,n,nullptr,
                              &error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=dev.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
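
// Usage sketch (UCL_D_Vec is an illustrative device container name from the
// companion headers, assumed to expose cbegin() and cq()):
//   UCL_D_Vec<double> dv;
//   if (_device_alloc(dv,dev,1000*sizeof(double),UCL_READ_WRITE)!=UCL_SUCCESS)
//     std::cerr << "Device allocation failed.\n";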

template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  size_t padded_cols=cols;
  if (cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);
  return _device_alloc(mat,cm,pitch*rows,kind);
}

template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  size_t padded_cols=cols;
  if (dev.device_type()!=UCL_CPU && cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);
  return _device_alloc(mat,dev,pitch*rows,kind);
}
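
// Worked example of the padding arithmetic above: for cols=1000,
// cols%256==232, so padded_cols=1000+(256-232)=1024; with float data the
// pitch is 1024*sizeof(float)=4096 bytes, so every row begins on a
// 256-element boundary.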

template <class mat_type>
inline void _device_free(mat_type &mat) {
  if (mat.cols()>0) {
    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
  }
}

template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
  cl_int error_flag;

  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
               &context,nullptr));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_mem_flags flag;
  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (mat.kind()==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
                          const size_t cols, size_t &pitch) {
  size_t padded_cols=cols;
  if (cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);

  cl_int error_flag;

  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
               &context,nullptr));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_mem_flags flag;
  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (mat.kind()==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,nullptr,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}


// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
  memset(ptr,0,n);
}

inline void _ocl_build(cl_program &program, cl_device_id &device,
                       const char* options = "") {
  clBuildProgram(program,1,&device,options,nullptr,nullptr);

  cl_build_status build_status;
  CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
                                     sizeof(cl_build_status),&build_status,
                                     nullptr));
  if (build_status == CL_BUILD_SUCCESS)
    return;

  size_t ms;
  CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0,
                                     nullptr, &ms));
  char *build_log = new char[ms];
  CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
                                     build_log, nullptr));

  std::cerr << std::endl
            << "----------------------------------------------------------\n"
            << " Error compiling OpenCL Program...\n"
            << "----------------------------------------------------------\n";
  std::cerr << build_log << std::endl;
  delete[] build_log;
}

inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
                                    const char **source, const size_t lines,
                                    cl_kernel &kernel, const char *function,
                                    const char *options="") {
  cl_int error_flag;

  cl_program program=clCreateProgramWithSource(context,lines,source,
                                               nullptr,&error_flag);
  CL_CHECK_ERR(error_flag);
  _ocl_build(program,device,options);
  kernel=clCreateKernel(program,function,&error_flag);
  CL_CHECK_ERR(error_flag);
  // The kernel retains its own reference to the program, so the local
  // reference can be released here to avoid leaking the program object.
  CL_SAFE_CALL(clReleaseProgram(program));
}
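
// A minimal sketch of building a kernel from an inline source string (the
// kernel body and names here are illustrative only):
//   const char *src[1]={
//     "__kernel void scale(__global float *a, const float f)"
//     "  { a[get_global_id(0)]*=f; }" };
//   cl_kernel k;
//   _ocl_kernel_from_source(context,device,src,1,k,"scale");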

template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  #ifdef CL_VERSION_1_2
  #ifndef __APPLE__
  #define UCL_CL_ZERO
  #endif
  #endif

  #ifdef UCL_CL_ZERO
  cl_int zeroint=0;
  CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
                                   mat.byteoff(),n,0,nullptr,nullptr));

  #else
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  cl_device_id device;
  CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
               sizeof(cl_device_id),&device,nullptr));

  const char * szero[3]={
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
    "__kernel void _device_zero(__global NUMTYP *a, const int offset)",
    "  { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
  };

  cl_kernel kzero;
  _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
                   _UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());

  cl_int offset=mat.offset();
  CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
  CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
  size_t kn=n/sizeof(typename mat_type::data_type);
  CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,nullptr,&kn,nullptr,
                                      0,nullptr,nullptr));
  #endif
  #ifdef GERYON_OCL_FLUSH
  ucl_flush(cq);
  #endif
}
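
// E.g., zeroing the first n bytes of a device container on its own queue
// (a sketch; dv as in the earlier sketches):
//   _device_zero(dv,n,dv.cq());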

// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------

template<int mem1, int mem2> struct _ucl_memcpy;

// Both are images
template<> struct _ucl_memcpy<2,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Destination is texture, source on device
template<> struct _ucl_memcpy<2,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Destination is texture, source on host
template<> struct _ucl_memcpy<2,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Source is texture, dest on device
template<> struct _ucl_memcpy<0,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Source is texture, dest on host
template<> struct _ucl_memcpy<1,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Neither is a texture, destination on host
template <> struct _ucl_memcpy<1,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 1S\n";
      #endif
      if (block) ucl_sync(cq);
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 1NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
                                     dst.begin(),0,nullptr,nullptr));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 2S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 2NS\n";
    #endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
                                       spitch*rows,
                                       (char *)dst.begin()+dst_offset,
                                       0,nullptr,nullptr));
    else
      for (size_t i=0; i<rows; i++) {
        CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols,
                                         (char *)dst.begin()+dst_offset,
                                         0,nullptr,nullptr));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

// Neither is a texture, source on host
template <> struct _ucl_memcpy<0,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 3S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 3NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
                                      src.begin(),0,nullptr,nullptr));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 4S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 4NS\n";
    #endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
                                        spitch*rows,
                                        (char *)src.begin()+src_offset,
                                        0,nullptr,nullptr));
    else
      for (size_t i=0; i<rows; i++) {
        CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,cols,
                                          (char *)src.begin()+src_offset,
                                          0,nullptr,nullptr));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

// Neither is a texture, both on device
template <int mem1, int mem2> struct _ucl_memcpy {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
      CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
                                       dst_offset,n,0,nullptr,nullptr));
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 6NS\n";
      #endif
    }
    #ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 6S\n";
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7NS\n";
      #endif
      if (spitch==dpitch && dst.cols()==src.cols() &&
          src.cols()==cols/src.element_size())
        CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
                                         src_offset,dst_offset,spitch*rows,
                                         0,nullptr,nullptr));
      else
        for (size_t i=0; i<rows; i++) {
          CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
                                           src_offset,dst_offset,cols,0,
                                           nullptr,nullptr));
          src_offset+=spitch;
          dst_offset+=dpitch;
        }
    }
    #ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 7S\n";
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
};

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,dst.cq(),CL_TRUE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
                       cl_command_queue &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq,CL_FALSE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows,dst.cq(),CL_TRUE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows, cl_command_queue &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows,cq,CL_FALSE,
                                                 dst.byteoff(),src.byteoff());
}
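
// Usage sketch: a blocking copy of n bytes from a host container to a device
// container on the destination's queue, or a non-blocking copy on an
// explicit queue (hv/dv as in the sketches above):
//   ucl_mv_cpy(dv,hv,n);
//   ucl_mv_cpy(dv,hv,n,cq);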

} // namespace ucl_opencl

#endif