1 /***************************************************************************
2                                  ucl_copy.h
3                              -------------------
4                                W. Michael Brown
5 
6   Routines for copying matrix/vector data onto and off coprocessor device
7 
8  __________________________________________________________________________
9     This file is part of the Geryon Unified Coprocessor Library (UCL)
10  __________________________________________________________________________
11 
12     begin                : Mon Jan 4 2010
13     copyright            : (C) 2010 by W. Michael Brown
14     email                : brownw@ornl.gov
15  ***************************************************************************/
16 
17 /* -----------------------------------------------------------------------
18    Copyright (2010) Sandia Corporation.  Under the terms of Contract
19    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
20    certain rights in this software.  This software is distributed under
21    the Simplified BSD License.
22    ----------------------------------------------------------------------- */
23 
24 /***************************************************************************
25    The ucl_copy and ucl_cast_copy routines provide a general prototype for
26    copying data between host and device memory (including texture memory)
27    for the matrix and vector types in nvc_memory.
28 
29    For host/host and host/device transfers, typecasting is performed
30    automatically as necessary.
31 
32    The routines are written so that all branches can be removed by the
33    compiler during template instantiation.
34 
35    The routines currently assume row-major ordering for all types.
36 
37    For asynchronous copy in the default command queue, async is boolean true;
38    For asynchronous copy in a specified command queue, async is command queue
39    Otherwise, set async to boolean false;
40 
41    When performing frequent data copies that require casting, it is more
42    efficient to allocate a casting buffer once and then pass that buffer
43    to the copy routine. This can be accomplished with the ucl_cast_copy
44    routines.
45 
46    Examples
47       (x's represent alignment padding - to maintain alignment)
48       (o's represent a larger matrix in memory)
49       (vectors represented as single row)
50    ----------------------------------------------------------------
51        dst           src            command
52    ----------------------------------------------------------------
53     0 1 2 3 4 <-- 0 1 2 3 4          ucl_copy(dst,src,async)
54 
55     0 1 2 3   <-- 0 1 2 3 4          ucl_copy(dst,src,4,async)
56 
57     0 1 2     <-- 0 1 2 3 4 5        ucl_copy(dst,src,async)
58     3 4 5
59 
60     0 1 2 3 4 5 <-- 0 1 2            ucl_copy(dst,src,async)
61                     3 4 5
62 
63     0 1 2      <--  0 1 2            ucl_copy(dst,src,async)
64     3 4 5           3 4 5
65 
66     0 1 2      <--  0 1 2            ucl_copy(dst,src,6,async)
67     3 4 5           3 4 5
68                     5 6 7
69 
70     0 1 2      <--  0  1  2  3       ucl_copy(dst,src,2,3,async)
71     4 5 6           4  5  6  7
72                     8  9  10 11
73 
74     0 1 2 x x  <--  0 1 2            ucl_copy(dst,src,async)
75     3 4 5 x x       3 4 5
76 
77     0 1 2      <--  0 1 2 x x        ucl_copy(dst,src,async)
78     3 4 5           3 4 5 x x
79 
80     0 1 2 o o  <--  0 1 2            ucl_copy(dst,src,2,3,async)
81     3 4 5 o o       3 4 5
82     o o o o o
83 
84     0 1 2 o o  <--  0 1 2 3 4 5      ucl_copy(dst,src,2,3,async)
85     3 4 5 o o
86     o o o o o
87 
88     0 1 o o o  <--  0 1 2 3 4 5      ucl_copy(dst,src,2,2,async)
89     2 3 o o o
90     o o o o o
91 
92     0 1 2 o o  <--  0  1  2  3  4    ucl_copy(dst,src,2,3,async)
93     5 6 7 o o       5  6  7  8  9
94     o o o o o       10 11 12 13 14
95 
96     0 1 2 5 6 7  <--  0  1  2  3  4  ucl_copy(dst,src,2,3,async)
97                       5  6  7  8  9
98                       10 11 12 13 14
99 
100  ***************************************************************************/
101 
102 // Only allow this file to be included by nvc_memory.h and ocl_memory.h
103 #ifdef UCL_COPY_ALLOW
104 
105 // --------------------------------------------------------------------------
106 // - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
107 // --------------------------------------------------------------------------
108 template <class mat1, class mat2>
_check_ucl_copy_perm(mat1 & dst,mat2 & src)109 inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
110   if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
111     if (dst.kind()==UCL_READ_ONLY) {
112       std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
113       assert(0==1);
114     } else if (src.kind()==UCL_WRITE_ONLY) {
115       std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
116       assert(0==1);
117     }
118   } else {
119     if (dst.kind()==UCL_WRITE_ONLY) {
120       std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
121       assert(0==1);
122     } else if (src.kind()==UCL_READ_ONLY) {
123       std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
124       assert(0==1);
125     }
126   }
127 }
128 
129 // --------------------------------------------------------------------------
130 // - HOST-HOST COPY ROUTINES
131 // --------------------------------------------------------------------------
132 
// Have to use specialization because some types don't have operator[]
// (primary template declared here; the host-host <1,1> case is specialized
// below, and the generic definition further down aborts if ever reached)
template <int host_t1, int host_t2> struct _host_host_copy;
135 
136 // Both on host
137 template <> struct _host_host_copy<1,1> {
138   template <class mat1, class mat2>
139   static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
140     #ifdef UCL_DEBUG
141     assert(mat1::PADDED==0 && mat2::PADDED==0);
142     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
143     #endif
144     if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
145       #ifdef _OCL_MAT
146       if (dst.begin()==src.begin()) {
147         #ifdef UCL_DBG_MEM_TRACE
148         std::cerr << "UCL_COPY 7S\n";
149         #endif
150         return;
151       }
152       #endif
153       memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
154       #ifdef UCL_DBG_MEM_TRACE
155       std::cerr << "UCL_COPY 7NS\n";
156       #endif
157     } else
158       for (size_t i=0; i<numel; i++)
159         dst[i]=static_cast<typename mat1::data_type>(src[i]);
160   }
161   template <class mat1, class mat2>
162   static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
163                          const size_t cols) {
164     #ifdef UCL_DEBUG
165     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
166     #endif
167     size_t dst_row_size, src_row_size;
168     if (mat1::VECTOR)
169       dst_row_size=cols;
170     else
171       dst_row_size=dst.row_size();
172     if (mat2::VECTOR)
173       src_row_size=cols;
174     else
175       src_row_size=src.row_size();
176     if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
177       #ifdef _OCL_MAT
178       if (dst.begin()==src.begin()) {
179         #ifdef UCL_DBG_MEM_TRACE
180         std::cerr << "UCL_COPY 8S\n";
181         #endif
182         return;
183       }
184       #endif
185 
186       #ifdef UCL_DBG_MEM_TRACE
187       std::cerr << "UCL_COPY 8NS\n";
188       #endif
189       for (size_t i=0; i<rows; i++)
190         memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
191                cols*sizeof(typename mat1::data_type));
192     } else
193       for (size_t j=0; j<rows; j++) {
194         size_t dst_i=j*dst_row_size;
195         size_t d_end=dst_i+cols;
196         size_t src_i=j*src_row_size;
197         for (; dst_i<d_end; dst_i++) {
198           dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
199           src_i++;
200         }
201       }
202   }
203 };
204 
// Generic definition: no host-host path exists for these memory spaces,
// so any runtime call is a logic error.
template <int host_t1, int host_t2> struct _host_host_copy {
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
    // Unsupported combination -- abort
    assert(0==1);
  }
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols) {
    // Unsupported combination -- abort
    assert(0==1);
  }
};
217 
218 // --------------------------------------------------------------------------
219 // - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING
220 // --------------------------------------------------------------------------
221 
// Helper functions for ucl_cast_copy
// Primary template declared here; the specializations below select the
// staging strategy based on which side of the transfer is in host memory.
template <int host_type1, int host_type2> struct _ucl_cast_copy;
224 
// Destination is on host
// Data is first transferred from the source into cast_buffer (host) with
// ucl_mv_cpy and then cast element-wise into dst on the host.
template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
  // 1D blocking copy: src -> cast_buffer -> dst (with per-element cast)
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type));
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  // 1D copy on a specified queue; sync() must complete before the host-side
  // cast loop may read cast_buffer, so this is effectively blocking
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer,command_queue &cq) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq);
    cast_buffer.sync();
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  // 2D blocking copy of a rows x cols tile
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      // dst is a vector: stage the tile packed (pitch == cols) and cast
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        // src is a vector: packed source pitch
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows);
      else
        // src is a matrix: use its row pitch
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows);
      // Cast into dst, skipping dst's trailing padding (doff) on each row
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }
  // 2D copy of a rows x cols tile on a specified queue; the sync() before
  // the cast loop makes this effectively blocking as well
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      // dst is a vector: stage the tile packed (pitch == cols) and cast
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
      cast_buffer.sync();
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        // src is a vector: packed source pitch
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows,cq);
      else
        // src is a matrix: use its row pitch
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows,cq);
      cast_buffer.sync();
      // Cast into dst, skipping dst's trailing padding (doff) on each row
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }
};
316 
317 // Source is on host
318 template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
319   template <class mat1, class mat2, class mat3>
320   static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
321                         mat3 &cast_buffer) {
322     for (size_t i=0; i<numel; i++)
323       cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
324     ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type));
325   }
326   template <class mat1, class mat2, class mat3>
327   static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
328                         mat3 &cast_buffer, command_queue &cq) {
329     for (size_t i=0; i<numel; i++)
330       cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
331     ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq);
332   }
333   template <class mat1, class mat2, class mat3>
334   static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
335                         const size_t cols, mat3 &cast_buffer) {
336     #ifdef UCL_DEBUG
337     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
338     assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
339     if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
340     if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
341     if (mat3::VECTOR==0) {
342       assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
343       assert(dst.rows()>=rows && dst.cols()>=cols);
344     }
345     #endif
346     if (mat2::VECTOR) {
347       if (mat3::VECTOR==0) {
348         size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
349         for (size_t i=0; i<rows; i++) {
350           for (size_t j=0; j<cols; j++) {
351             cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
352             ci++;
353             si++;
354           }
355           ci+=co;
356           si+=so;
357         }
358         ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
359                    cols*sizeof(typename mat1::data_type),rows);
360       } else {
361         for (size_t i=0; i<rows*cols; i++)
362           cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
363         ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
364                    cols*sizeof(typename mat1::data_type),
365                    cols*sizeof(typename mat1::data_type),rows);
366       }
367     } else if (mat1::VECTOR) {
368       size_t src_i=0, buf_i=0, soff=src.cols()-cols;
369       for (size_t i=0; i<rows; i++) {
370         for (size_t j=0; j<cols; j++) {
371           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
372           buf_i++;
373           src_i++;
374         }
375         src_i+=soff;
376       }
377       ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
378     } else {
379       size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
380       if (mat3::VECTOR==0) {
381         co=cast_buffer.cols()-cols;
382         spitch=cast_buffer.row_bytes();
383       } else {
384         co=0;
385         spitch=cols*sizeof(typename mat1::data_type);
386       }
387       for (size_t i=0; i<rows; i++) {
388         for (size_t j=0; j<cols; j++) {
389           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
390           buf_i++;
391           src_i++;
392         }
393         src_i+=so;
394         buf_i+=co;
395       }
396       ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
397                  cols*sizeof(typename mat1::data_type),rows);
398     }
399   }
400   template <class mat1, class mat2, class mat3>
401   static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
402                         const size_t cols, mat3 &cast_buffer,
403                         command_queue &cq) {
404     #ifdef UCL_DEBUG
405     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
406     assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
407     if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
408     if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
409     if (mat3::VECTOR==0) {
410       assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
411       assert(dst.rows()>=rows && dst.cols()>=cols);
412     }
413     #endif
414     if (mat2::VECTOR) {
415       if (mat3::VECTOR==0) {
416         size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
417         for (size_t i=0; i<rows; i++) {
418           for (size_t j=0; j<cols; j++) {
419             cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
420             ci++;
421             si++;
422           }
423           ci+=co;
424           si+=so;
425         }
426         ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
427                    cols*sizeof(typename mat1::data_type),rows);
428       } else {
429         for (size_t i=0; i<rows*cols; i++)
430           cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
431         ucl_mv_cpy(dst,dst.row_bytes(),
432                    cast_buffer,cols*sizeof(typename mat1::data_type),
433                    cols*sizeof(typename mat1::data_type),rows,cq);
434       }
435     } else if (mat1::VECTOR) {
436       size_t src_i=0, buf_i=0, soff=src.cols()-cols;
437       for (size_t i=0; i<rows; i++) {
438         for (size_t j=0; j<cols; j++) {
439           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
440           buf_i++;
441           src_i++;
442         }
443         src_i+=soff;
444       }
445       ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
446     } else {
447       size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
448       if (mat3::VECTOR==0) {
449         co=cast_buffer.cols()-cols;
450         spitch=cast_buffer.row_bytes();
451       } else {
452         co=0;
453         spitch=cols*sizeof(typename mat1::data_type);
454       }
455       for (size_t i=0; i<rows; i++) {
456         for (size_t j=0; j<cols; j++) {
457           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
458           buf_i++;
459           src_i++;
460         }
461         src_i+=so;
462         buf_i+=co;
463       }
464       ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
465                  cols*sizeof(typename mat1::data_type),rows,cq);
466     }
467   }
468 };
469 
470 // Neither on host or both on host
471 template <> struct _ucl_cast_copy<1,1> {
472   template <class mat1, class mat2, class mat3>
473   static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
474                         mat3 &cast_buffer, command_queue &cq) {
475     assert(0==1);
476   }
477   template <class mat1, class mat2, class mat3>
478   static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
479                         mat3 &cast_buffer) {
480     assert(0==1);
481   }
482   template <class mat1, class mat2, class mat3>
483   static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
484                         const size_t cols, mat3 &cast_buffer) {
485     assert(0==1);
486   }
487   template <class mat1, class mat2, class mat3>
488   static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
489                         const size_t cols, mat3 &cast_buffer,
490                         command_queue &cq) {
491     assert(0==1);
492   }
493 };
494 
495 // Neither on host or both on host
496 template <> struct _ucl_cast_copy<0,0> {
497   template <class mat1, class mat2, class mat3>
498   static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
499                         mat3 &cast_buffer, command_queue &cq) {
500     assert(0==1);
501   }
502   template <class mat1, class mat2, class mat3>
503   static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
504                         mat3 &cast_buffer) {
505     assert(0==1);
506   }
507   template <class mat1, class mat2, class mat3>
508   static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
509                         const size_t cols, mat3 &cast_buffer) {
510     assert(0==1);
511   }
512   template <class mat1, class mat2, class mat3>
513   static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
514                         const size_t cols, mat3 &cast_buffer,
515                         command_queue &cq) {
516     assert(0==1);
517   }
518 };
519 
520 // --------------------------------------------------------------------------
521 // - 1D COPY - SPECIFIED NUMBER OF BYTES
522 // --------------------------------------------------------------------------
523 
524 /// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
525 /** \param numel Number of elements (not bytes) to copy
526   * \param cast_buffer Buffer on host with enough storage for casting
527   * - If the data types for the two matrices are same, no cast performed
528   * - Padding for 2D matrices is not considered in this routine.
529   * - Currently does not handle textures **/
530 template <class mat1, class mat2, class mat3>
531 inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
532                           mat3 &cast_buffer, command_queue &cq) {
533   #ifdef UCL_DEBUG
534   assert(dst.numel()>=numel && src.numel()>=numel);
535   assert(cast_buffer.numel()>=numel);
536   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
537   #endif
538   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
539     ucl_copy(dst,src,numel,cq);
540   else {
541     #ifdef UCL_DEBUG
542     _check_ucl_copy_perm(dst,src);
543     #endif
544     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
545                                                       cast_buffer,cq);
546   }
547 }
548 
549 /// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
550 /** \param numel Number of elements (not bytes) to copy
551   * \param async Perform non-blocking copy on default stream
552   * \param cast_buffer Buffer on host with enough storage for casting
553   * - If the data types for the two matrices are same, no cast performed
554   * - Padding for 2D matrices is not considered in this routine.
555   * - Currently does not handle textures **/
556 template <class mat1, class mat2, class mat3>
557 inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
558                           mat3 &cast_buffer, const bool async) {
559   #ifdef UCL_DEBUG
560   assert(dst.numel()>=numel && src.numel()>=numel);
561   assert(cast_buffer.numel()>=numel);
562   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
563   _check_ucl_copy_perm(dst,src);
564   #endif
565   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
566     ucl_copy(dst,src,numel,async);
567   else if (async)
568     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
569                                                       cast_buffer,dst.cq());
570   else
571     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
572                                                       cast_buffer);
573 }
574 
575 /// Asynchronous copy of matrix/vector (memory already allocated)
576 /** \param numel Number of elements (not bytes) to copy
577   * - If the data types of the two matrices are not the same,
578   *   casting will be performed automatically as long as the copy is
579   *   not device to device. For host/device transfers, a temporary
580   *   buffer is created for copy. When multiple casts occur, it is
581   *   more efficient to create a permanent casting buffer that can
582   *   be passed to an alternative  copy routine.
583   * - Padding for 2D matrices is not considered in this routine.
584   * - Currently does not handle textures **/
585 template <class mat1, class mat2>
586 inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
587                      command_queue &cq) {
588   #ifdef UCL_DEBUG
589   assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
590   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
591   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
592   _check_ucl_copy_perm(dst,src);
593   #endif
594   if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
595     _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
596   else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
597       (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
598     if (mat1::MEM_TYPE==1) {
599       UCL_H_Vec<typename mat2::data_type> cast_buffer;
600       cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
601       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
602                                                         cast_buffer,cq);
603     } else {
604       UCL_H_Vec<typename mat1::data_type> cast_buffer;
605       cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
606       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
607                                                         cast_buffer,cq);
608     }
609   } else
610     ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
611 }
612 
613 /// Copy matrix/vector (memory already allocated)
614 /** \param numel Number of elements (not bytes) to copy
615   * \param async Perform non-blocking copy (ignored for host to host copy)
616   * - If the data types of the two matrices are not the same,
617   *   casting will be performed automatically as long as the copy is
618   *   not device to device. For host/device transfers, a temporary
619   *   buffer is created for copy. When multiple casts occur, it is
620   *   more efficient to create a permanent casting buffer that can
621   *   be passed to an alternative  copy routine.
622   * - Padding for 2D matrices is not considered in this routine.
623   * - The default stream is used for asynchronous copy
624   * - Currently does not handle textures **/
625 template <class mat1, class mat2>
626 inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
627                      const bool async) {
628   #ifdef UCL_DEBUG
629   assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
630   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
631   _check_ucl_copy_perm(dst,src);
632   #endif
633   if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
634     _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
635   else if (async)
636     ucl_copy(dst,src,numel,dst.cq());
637   else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
638            (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
639     if (mat1::MEM_TYPE==1) {
640       UCL_H_Vec<typename mat2::data_type> cast_buffer;
641       cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
642       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
643                                                         cast_buffer);
644     } else {
645       UCL_H_Vec<typename mat1::data_type> cast_buffer;
646       cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
647       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
648                                                         cast_buffer);
649     }
650   } else
651     ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
652 }
653 
654 // --------------------------------------------------------------------------
655 // - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS
656 // --------------------------------------------------------------------------
657 
658 /// Asynchronous copy subset matrix rows/cols with cast (Device/Host transfer)
659 /** \param async Perform non-blocking copy on default stream
660   * \param cast_buffer Buffer on host with enough storage for casting
661   * - If src is a vector, routine assumes row-major rows by cols copy
662   * - If src is a matrix, routine will copy upper left tile of matrix
663   * - If dst is a vector, routine assumes row-major rows by cols copy
664   * - If dst is a matrix, routine will copy into left tile of matrix
665   * - If the data types for the two matrices are same, no cast performed
666   * - Padding for 2D matrices is not considered in this routine.
667   * - Copy from vector to matrix and vice versa allowed
668   * - Currently does not handle textures **/
669 template <class mat1, class mat2, class mat3>
670 inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
671                           const size_t cols, mat3 &cast_buffer,
672                           const bool async) {
673   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
674     ucl_copy(dst,src,rows,cols,async);
675   else if (async)
676     ucl_copy(dst,src,rows,cols,dst.cq());
677   else {
678     #ifdef UCL_DEBUG
679     _check_ucl_copy_perm(dst,src);
680     #endif
681     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
682                                                       cast_buffer);
683   }
684 }
685 
686 /// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
687 /** \param cast_buffer Buffer on host with enough storage for casting
688   * - If src is a vector, routine assumes row-major rows by cols copy
689   * - If src is a matrix, routine will copy upper left tile of matrix
690   * - If dst is a vector, routine assumes row-major rows by cols copy
691   * - If dst is a matrix, routine will copy into upper left tile of matrix
692   * - If the data types for the two matrices are same, no cast performed
693   * - Padding for 2D matrices is not considered in this routine.
694   * - Copy from vector to matrix and vice versa allowed
695   * - Currently does not handle textures **/
696 template <class mat1, class mat2, class mat3>
697 inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
698                           const size_t cols, mat3 &cast_buffer,
699                           command_queue &cq) {
700   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
701     ucl_copy(dst,src,rows,cols,cq);
702   else {
703     #ifdef UCL_DEBUG
704     _check_ucl_copy_perm(dst,src);
705     #endif
706     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
707                                                       cast_buffer,cq);
708   }
709 }
710 
711 /// Asynchronous copy of subset matrix rows,cols (memory already allocated)
712 /** - If src is a vector, routine assumes row-major rows by cols copy
713   * - If src is a matrix, routine will copy upper left tile of matrix
714   * - If dst is a vector, routine assumes row-major rows by cols copy
715   * - If dst is a matrix, routine will copy into left tile of matrix
716   * - If the data types of the two matrices are not the same,
717   *   casting will be performed automatically as long as the copy is
718   *   not device to device. For host/device transfers, a temporary
719   *   buffer is created for copy. When multiple casts occur, it is
720   *   more efficient to create a permanent casting buffer that can
721   *   be passed to an alternative copy routine.
722   * - The copy should handle padding for 2D alignment correctly
723   * - Copy from vector to matrix and vice versa allowed
724   * - Currently does not handle textures **/
725 template <class mat1, class mat2>
726 inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
727                      const size_t cols, command_queue &cq) {
728   #ifdef UCL_DEBUG
729   _check_ucl_copy_perm(dst,src);
730   #endif
731   if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
732     _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
733   else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
734            (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
735     if (mat1::MEM_TYPE==1) {
736       UCL_H_Vec<typename mat2::data_type> cast_buffer;
737       cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
738       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
739                                                         cast_buffer,cq);
740     } else {
741       UCL_H_Vec<typename mat1::data_type> cast_buffer;
742       cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
743       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
744                                                         cast_buffer,cq);
745     }
746   // If we are here, at least one of the matrices must have VECTOR=0
747   } else if (mat1::VECTOR) {
748     #ifdef UCL_DEBUG
749     assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
750     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
751     #endif
752     ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
753                                cols*sizeof(typename mat1::data_type),rows,
754                                cq);
755   } else if (mat2::VECTOR) {
756     #ifdef UCL_DEBUG
757     assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
758     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
759     #endif
760     ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
761                cols*sizeof(typename mat1::data_type),rows,cq);
762   } else {
763     #ifdef UCL_DEBUG
764     assert(src.rows()>=rows && src.cols()>=cols);
765     assert(dst.rows()>=rows && dst.cols()>=cols);
766     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
767     #endif
768     ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
769                cols*sizeof(typename mat1::data_type),rows,cq);
770   }
771 }
772 
773 /// Copy subset of matrix rows,cols (memory already allocated)
774 /** \param async Perform non-blocking copy (ignored for host to host copy)
775   * - If src is a vector, routine assumes row-major rows by cols copy
776   * - If src is a matrix, routine will copy upper left tile of matrix
777   * - If dst is a vector, routine assumes row-major rows by cols copy
  * - If dst is a matrix, routine will copy into upper left tile of matrix
779   * - If the data types of the two matrices are not the same,
780   *   casting will be performed automatically as long as the copy is
781   *   not device to device. For host/device transfers, a temporary
782   *   buffer is created for copy. When multiple casts occur, it is
783   *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
785   * - The copy should handle padding for 2D alignment correctly
786   * - Copy from vector to matrix and vice versa allowed
787   * - The default stream is used for asynchronous copy
788   * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, const bool async) {
  #ifdef UCL_DEBUG
  _check_ucl_copy_perm(dst,src);
  #endif
  // Asynchronous requests are re-dispatched to the command-queue overload
  // using the destination's default queue.
  if (async)
    ucl_copy(dst,src,rows,cols,dst.cq());
  // MEM_TYPE==1 denotes host storage; host-to-host copies bypass the device.
  else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  // Element types differ on a host/device transfer: create a temporary host
  // buffer typed to match the device side and cast through it.
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      // Destination is on host: buffer uses the source element type.
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    } else {
      // Source is on host: buffer uses the destination element type.
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    }
  // If we are here, at least one of the matrices must have VECTOR=0
  } else if (mat1::VECTOR) {
    #ifdef UCL_DEBUG
    assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat2::VECTOR==0);
    #endif
    // Dense destination: its pitch is exactly cols elements per row.
    ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
                   cols*sizeof(typename mat1::data_type),rows);
  } else if (mat2::VECTOR) {
    #ifdef UCL_DEBUG
    assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat1::VECTOR==0);
    #endif
    // Dense source; only the destination matrix may carry row padding.
    ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
               cols*sizeof(typename mat1::data_type),rows);
  } else {
    #ifdef UCL_DEBUG
    assert(src.rows()>=rows && src.cols()>=cols);
    assert(dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    // Both operands are 2D matrices: use each side's own row pitch.
    ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows);
  }
}
839 
840 // --------------------------------------------------------------------------
841 // - 1D/2D COPY
842 // --------------------------------------------------------------------------
843 
844 /// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
845 /** \param async Perform non-blocking copy on default stream
846   * \param cast_buffer Buffer on host with enough storage for casting
847   * - If the data types for the two matrices are same, no cast performed
848   * - The number of bytes copied is determined by entire src data
849   * - Padding for 2D matrices is not considered in this routine.
850   * - Copy from vector to matrix and vice versa allowed
851   * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, const bool async) {
  // Identical element types require no casting buffer at all.
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) {
    ucl_copy(dst,src,async);
    return;
  }
  // Select the copy geometry: a padded operand forces a rows-by-cols
  // transfer; otherwise the data is treated as a flat vector of numel
  // elements.
  if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async);
  else if (mat1::PADDED==1)
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async);
  else
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,async);
}
864 
865 /// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
866 /** \param cast_buffer Buffer on host with enough storage for casting
867   * - If the data types for the two matrices are same, no cast performed
868   * - The number of bytes copied is determined by entire src data
869   * - Padding for 2D matrices is not considered in this routine.
870   * - Copy from vector to matrix and vice versa allowed
871   * - Currently does not handle textures **/
872 template <class mat1, class mat2, class mat3>
873 inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
874                           mat3 &cast_buffer, command_queue &cq) {
875   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
876     ucl_copy(dst,src,cq);
877   else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
878     ucl_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq);
879   else if (mat1::PADDED==1)
880     ucl_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq);
881   else
882     ucl_copy(dst,src,src.numel(),cast_buffer,cq);
883 }
884 
885 /// Asynchronous copy of matrix/vector (memory already allocated)
886 /** - The number of bytes copied is determined by entire src data
887   * - If the data types of the two matrices are not the same,
888   *   casting will be performed automatically as long as the copy is
889   *   not device to device. For host/device transfers, a temporary
890   *   buffer is created for copy. When multiple casts occur, it is
891   *   more efficient to create a permanent casting buffer that can
892   *   be passed to an alternative copy routine.
893   * - The copy should handle padding for 2D alignment correctly
894   * - Copy from vector to matrix and vice versa allowed
895   * - Currently does not handle textures **/
896 template <class mat1, class mat2>
897 inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) {
898   if (dst.row_bytes()==src.row_bytes() &&
899       src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
900       (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
901     ucl_copy(dst,src,src.row_size()*src.rows(),cq);
902   else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
903     ucl_copy(dst,src,src.rows(),src.cols(),cq);
904   else if (mat1::PADDED==1)
905     ucl_copy(dst,src,dst.rows(),dst.cols(),cq);
906   else
907     ucl_copy(dst,src,src.numel(),cq);
908 }
909 
910 /// Copy matrix/vector (memory already allocated)
911 /** \param async Perform non-blocking copy (ignored for host to host copy)
912   * - The number of bytes copied is determined by entire src data
913   * - If the data types of the two matrices are not the same,
914   *   casting will be performed automatically as long as the copy is
915   *   not device to device. For host/device transfers, a temporary
916   *   buffer is created for copy. When multiple casts occur, it is
917   *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
919   * - The copy should handle padding for 2D alignment correctly
920   * - Copy from vector to matrix and vice versa allowed
921   * - The default stream is used for asynchronous copy
922   * - Currently does not handle textures **/
923 template <class mat1, class mat2>
924 inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
925   if (async)
926     ucl_copy(dst,src,dst.cq());
927   else if (dst.row_bytes()==src.row_bytes() &&
928            src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
929            (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
930     ucl_copy(dst,src,src.row_size()*src.rows(),async);
931   else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
932     ucl_copy(dst,src,src.rows(),src.cols(),async);
933   else if (mat1::PADDED==1)
934     ucl_copy(dst,src,dst.rows(),dst.cols(),async);
935   else
936     ucl_copy(dst,src,src.numel(),async);
937 }
938 
939 #endif
940 
941