/***************************************************************************
                                 ucl_copy.h
                             -------------------
                               W. Michael Brown

  Routines for copying matrix/vector data onto and off coprocessor device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon Jan 4 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */
/***************************************************************************
 The ucl_copy and ucl_cast_copy routines provide a general prototype for
 copying data between host and device memory (including texture memory)
 for the matrix and vector types in nvc_memory and ocl_memory.

 For host/host and host/device transfers, typecasting is performed
 automatically as necessary.

 The routines are written so that all branches can be removed by the
 compiler during template instantiation.

 The routines currently assume row-major ordering for all types.

 For an asynchronous copy in the default command queue, set async to
 boolean true. For an asynchronous copy in a specified command queue,
 pass the command queue in place of async. Otherwise, set async to
 boolean false.

 When performing frequent data copies that require casting, it is more
 efficient to allocate a casting buffer once and then pass that buffer
 to the copy routine. This can be accomplished with the ucl_cast_copy
 routines.
 Examples
   (x's represent alignment padding)
   (o's represent elements of a larger matrix in memory)
   (vectors are represented as a single row)
 ----------------------------------------------------------------
 dst              src              command
 ----------------------------------------------------------------
 0 1 2 3 4   <--  0 1 2 3 4        ucl_copy(dst,src,async)

 0 1 2 3     <--  0 1 2 3 4        ucl_copy(dst,src,4,async)

 0 1 2       <--  0 1 2 3 4 5      ucl_copy(dst,src,async)
 3 4 5

 0 1 2 3 4 5 <--  0 1 2            ucl_copy(dst,src,async)
                  3 4 5

 0 1 2       <--  0 1 2            ucl_copy(dst,src,async)
 3 4 5            3 4 5

 0 1 2       <--  0 1 2            ucl_copy(dst,src,6,async)
 3 4 5            3 4 5
                  6 7 8

 0 1 2       <--  0 1 2 3          ucl_copy(dst,src,2,3,async)
 4 5 6            4 5 6 7
                  8 9 10 11

 0 1 2 x x   <--  0 1 2            ucl_copy(dst,src,async)
 3 4 5 x x        3 4 5

 0 1 2       <--  0 1 2 x x        ucl_copy(dst,src,async)
 3 4 5            3 4 5 x x

 0 1 2 o o   <--  0 1 2            ucl_copy(dst,src,2,3,async)
 3 4 5 o o        3 4 5
 o o o o o

 0 1 2 o o   <--  0 1 2 3 4 5      ucl_copy(dst,src,2,3,async)
 3 4 5 o o
 o o o o o

 0 1 o o o   <--  0 1 2 3 4 5      ucl_copy(dst,src,2,2,async)
 2 3 o o o
 o o o o o

 0 1 2 o o   <--  0  1  2  3  4    ucl_copy(dst,src,2,3,async)
 5 6 7 o o        5  6  7  8  9
 o o o o o        10 11 12 13 14

 0 1 2 5 6 7 <--  0  1  2  3  4    ucl_copy(dst,src,2,3,async)
                  5  6  7  8  9
                  10 11 12 13 14

 ***************************************************************************/
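
/***************************************************************************
 Usage sketch (illustrative only). The container names, alloc() calls, and
 default command queue below follow the conventions of the including memory
 headers (nvc_memory/ocl_memory); adjust to the actual types in use:

   UCL_Device dev;
   UCL_H_Vec<double> h_vec;                 // source on the host
   h_vec.alloc(6,dev,UCL_READ_WRITE);
   UCL_D_Mat<float> d_mat;                  // 2x3 destination on the device
   d_mat.alloc(2,3,dev,UCL_READ_WRITE);

   ucl_copy(d_mat,h_vec,false);             // blocking; double->float cast
   ucl_copy(d_mat,h_vec,true);              // async in default command queue

   UCL_H_Vec<float> buf;                    // persistent casting buffer
   buf.alloc(6,dev,UCL_READ_WRITE);
   ucl_cast_copy(d_mat,h_vec,buf,true);     // reuses buf for the cast
 ***************************************************************************/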

// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW

// --------------------------------------------------------------------------
// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
// --------------------------------------------------------------------------
template <class mat1, class mat2>
inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
  if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
    if (dst.kind()==UCL_READ_ONLY) {
      std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
      assert(0==1);
    } else if (src.kind()==UCL_WRITE_ONLY) {
      std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
      assert(0==1);
    }
  } else {
    if (dst.kind()==UCL_WRITE_ONLY) {
      std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
      assert(0==1);
    } else if (src.kind()==UCL_READ_ONLY) {
      std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
      assert(0==1);
    }
  }
}

// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------

// Have to use specialization because some types don't have operator[]
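// host_t1/host_t2 are the MEM_TYPE flags for the destination and source
// (1 indicates host memory); only the <1,1> specialization performs a copy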
template <int host_t1, int host_t2> struct _host_host_copy;

// Both on host
template <> struct _host_host_copy<1,1> {
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
    #ifdef UCL_DEBUG
    assert(mat1::PADDED==0 && mat2::PADDED==0);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
      #ifdef _OCL_MAT
      if (dst.begin()==src.begin()) {
        #ifdef UCL_DBG_MEM_TRACE
        std::cerr << "UCL_COPY 7S\n";
        #endif
        return;
      }
      #endif
      memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7NS\n";
      #endif
    } else
      for (size_t i=0; i<numel; i++)
        dst[i]=static_cast<typename mat1::data_type>(src[i]);
  }
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    size_t dst_row_size, src_row_size;
    if (mat1::VECTOR)
      dst_row_size=cols;
    else
      dst_row_size=dst.row_size();
    if (mat2::VECTOR)
      src_row_size=cols;
    else
      src_row_size=src.row_size();
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
      #ifdef _OCL_MAT
      if (dst.begin()==src.begin()) {
        #ifdef UCL_DBG_MEM_TRACE
        std::cerr << "UCL_COPY 8S\n";
        #endif
        return;
      }
      #endif

      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 8NS\n";
      #endif
      for (size_t i=0; i<rows; i++)
        memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
               cols*sizeof(typename mat1::data_type));
    } else
      for (size_t j=0; j<rows; j++) {
        size_t dst_i=j*dst_row_size;
        size_t d_end=dst_i+cols;
        size_t src_i=j*src_row_size;
        for (; dst_i<d_end; dst_i++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
          src_i++;
        }
      }
  }
};

// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
    assert(0==1);
  }
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols) {
    assert(0==1);
  }
};

// --------------------------------------------------------------------------
// - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING
// --------------------------------------------------------------------------

// Helper functions for ucl_cast_copy
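// host_type1/host_type2 are the MEM_TYPE flags for the destination and
// source (1 indicates host memory)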
template <int host_type1, int host_type2> struct _ucl_cast_copy;

// Destination is on host
template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type));
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq);
    cast_buffer.sync();
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows);
      else
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows);
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
      cast_buffer.sync();
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows,cq);
      else
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows,cq);
      cast_buffer.sync();
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }
};

// Source is on host
template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    for (size_t i=0; i<numel; i++)
      cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
    ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type));
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    for (size_t i=0; i<numel; i++)
      cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
    ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    if (mat3::VECTOR==0) {
      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
      assert(dst.rows()>=rows && dst.cols()>=cols);
    }
    #endif
    if (mat2::VECTOR) {
      if (mat3::VECTOR==0) {
        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
        for (size_t i=0; i<rows; i++) {
          for (size_t j=0; j<cols; j++) {
            cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
            ci++;
            si++;
          }
          ci+=co;
          si+=so;
        }
        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
                   cols*sizeof(typename mat1::data_type),rows);
      } else {
        for (size_t i=0; i<rows*cols; i++)
          cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
                   cols*sizeof(typename mat1::data_type),
                   cols*sizeof(typename mat1::data_type),rows);
      }
    } else if (mat1::VECTOR) {
      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=soff;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
    } else {
      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
      if (mat3::VECTOR==0) {
        co=cast_buffer.cols()-cols;
        spitch=cast_buffer.row_bytes();
      } else {
        co=0;
        spitch=cols*sizeof(typename mat1::data_type);
      }
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=so;
        buf_i+=co;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                 cols*sizeof(typename mat1::data_type),rows);
    }
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    if (mat3::VECTOR==0) {
      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
      assert(dst.rows()>=rows && dst.cols()>=cols);
    }
    #endif
    if (mat2::VECTOR) {
      if (mat3::VECTOR==0) {
        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
        for (size_t i=0; i<rows; i++) {
          for (size_t j=0; j<cols; j++) {
            cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
            ci++;
            si++;
          }
          ci+=co;
          si+=so;
        }
        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
                   cols*sizeof(typename mat1::data_type),rows,cq);
      } else {
        for (size_t i=0; i<rows*cols; i++)
          cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
        ucl_mv_cpy(dst,dst.row_bytes(),
                   cast_buffer,cols*sizeof(typename mat1::data_type),
                   cols*sizeof(typename mat1::data_type),rows,cq);
      }
    } else if (mat1::VECTOR) {
      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=soff;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
    } else {
      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
      if (mat3::VECTOR==0) {
        co=cast_buffer.cols()-cols;
        spitch=cast_buffer.row_bytes();
      } else {
        co=0;
        spitch=cols*sizeof(typename mat1::data_type);
      }
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=so;
        buf_i+=co;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                 cols*sizeof(typename mat1::data_type),rows,cq);
    }
  }
};

// Both on host -- should never be used (handled by _host_host_copy)
template <> struct _ucl_cast_copy<1,1> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    assert(0==1);
  }
};

// Neither on host -- device-to-device cast copy is not supported
template <> struct _ucl_cast_copy<0,0> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    assert(0==1);
  }
};

// --------------------------------------------------------------------------
// - 1D COPY - SPECIFIED NUMBER OF ELEMENTS
// --------------------------------------------------------------------------

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
 * \param cast_buffer Buffer on host with enough storage for casting
 * - If the data types for the two matrices are the same, no cast is performed
 * - Padding for 2D matrices is not considered in this routine.
 * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
                          mat3 &cast_buffer, command_queue &cq) {
  #ifdef UCL_DEBUG
  assert(dst.numel()>=numel && src.numel()>=numel);
  assert(cast_buffer.numel()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  #endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,cq);
  else {
    #ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
    #endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer,cq);
  }
}

/// Copy matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
 * \param async Perform non-blocking copy on default stream
 * \param cast_buffer Buffer on host with enough storage for casting
 * - If the data types for the two matrices are the same, no cast is performed
 * - Padding for 2D matrices is not considered in this routine.
 * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
                          mat3 &cast_buffer, const bool async) {
  #ifdef UCL_DEBUG
  assert(dst.numel()>=numel && src.numel()>=numel);
  assert(cast_buffer.numel()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
  #endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,async);
  else if (async)
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer,dst.cq());
  else
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer);
}

/// Asynchronous copy of matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
 * - If the data types of the two matrices are not the same,
 *   casting will be performed automatically as long as the copy is
 *   not device to device. For host/device transfers, a temporary
 *   buffer is created for copy. When multiple casts occur, it is
 *   more efficient to create a permanent casting buffer that can
 *   be passed to an alternative copy routine.
 * - Padding for 2D matrices is not considered in this routine.
 * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
                     command_queue &cq) {
  #ifdef UCL_DEBUG
  assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    }
  } else
    ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
}

/// Copy matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
 * \param async Perform non-blocking copy (ignored for host to host copy)
 * - If the data types of the two matrices are not the same,
 *   casting will be performed automatically as long as the copy is
 *   not device to device. For host/device transfers, a temporary
 *   buffer is created for copy. When multiple casts occur, it is
 *   more efficient to create a permanent casting buffer that can
 *   be passed to an alternative copy routine.
 * - Padding for 2D matrices is not considered in this routine.
 * - The default stream is used for asynchronous copy
 * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
                     const bool async) {
  #ifdef UCL_DEBUG
  assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
  else if (async)
    ucl_copy(dst,src,numel,dst.cq());
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    }
  } else
    ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
}
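
// Example (sketch): repeated 1D host->device copies with a persistent
// casting buffer (names and alloc() calls are illustrative):
//
//   UCL_H_Vec<double> h_vec;  h_vec.alloc(n,dev,UCL_READ_WRITE);
//   UCL_D_Vec<float>  d_vec;  d_vec.alloc(n,dev,UCL_READ_WRITE);
//   UCL_H_Vec<float>  buf;    buf.alloc(n,dev,UCL_READ_WRITE);  // once
//   ucl_cast_copy(d_vec,h_vec,n,buf,true);  // async cast copy, default queue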

// --------------------------------------------------------------------------
// - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS
// --------------------------------------------------------------------------

/// Copy subset of matrix rows/cols with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
 * \param cast_buffer Buffer on host with enough storage for casting
 * - If src is a vector, routine assumes row-major rows by cols copy
 * - If src is a matrix, routine will copy upper left tile of matrix
 * - If dst is a vector, routine assumes row-major rows by cols copy
 * - If dst is a matrix, routine will copy into upper left tile of matrix
 * - If the data types for the two matrices are the same, no cast is performed
 * - Padding for 2D matrices is not considered in this routine.
 * - Copy from vector to matrix and vice versa allowed
 * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
                          const size_t cols, mat3 &cast_buffer,
                          const bool async) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,rows,cols,async);
  else if (async)
    ucl_cast_copy(dst,src,rows,cols,cast_buffer,dst.cq());
  else {
    #ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
    #endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer);
  }
}

/// Asynchronous copy of subset matrix rows,cols with cast (Device/Host)
/** \param cast_buffer Buffer on host with enough storage for casting
 * - If src is a vector, routine assumes row-major rows by cols copy
 * - If src is a matrix, routine will copy upper left tile of matrix
 * - If dst is a vector, routine assumes row-major rows by cols copy
 * - If dst is a matrix, routine will copy into upper left tile of matrix
 * - If the data types for the two matrices are the same, no cast is performed
 * - Padding for 2D matrices is not considered in this routine.
 * - Copy from vector to matrix and vice versa allowed
 * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
                          const size_t cols, mat3 &cast_buffer,
                          command_queue &cq) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,rows,cols,cq);
  else {
    #ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
    #endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer,cq);
  }
}

/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes row-major rows by cols copy
 * - If src is a matrix, routine will copy upper left tile of matrix
 * - If dst is a vector, routine assumes row-major rows by cols copy
 * - If dst is a matrix, routine will copy into upper left tile of matrix
 * - If the data types of the two matrices are not the same,
 *   casting will be performed automatically as long as the copy is
 *   not device to device. For host/device transfers, a temporary
 *   buffer is created for copy. When multiple casts occur, it is
 *   more efficient to create a permanent casting buffer that can
 *   be passed to an alternative copy routine.
 * - The copy should handle padding for 2D alignment correctly
 * - Copy from vector to matrix and vice versa allowed
 * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, command_queue &cq) {
  #ifdef UCL_DEBUG
  _check_ucl_copy_perm(dst,src);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    }
  // If we are here, at least one of the matrices must have VECTOR=0
  } else if (mat1::VECTOR) {
    #ifdef UCL_DEBUG
    assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows,cq);
  } else if (mat2::VECTOR) {
    #ifdef UCL_DEBUG
    assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
               cols*sizeof(typename mat1::data_type),rows,cq);
  } else {
    #ifdef UCL_DEBUG
    assert(src.rows()>=rows && src.cols()>=cols);
    assert(dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows,cq);
  }
}

/// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
 * - If src is a vector, routine assumes row-major rows by cols copy
 * - If src is a matrix, routine will copy upper left tile of matrix
 * - If dst is a vector, routine assumes row-major rows by cols copy
 * - If dst is a matrix, routine will copy into upper left tile of matrix
 * - If the data types of the two matrices are not the same,
 *   casting will be performed automatically as long as the copy is
 *   not device to device. For host/device transfers, a temporary
 *   buffer is created for copy. When multiple casts occur, it is
 *   more efficient to create a permanent casting buffer that can
 *   be passed to an alternative copy routine.
 * - The copy should handle padding for 2D alignment correctly
 * - Copy from vector to matrix and vice versa allowed
 * - The default stream is used for asynchronous copy
 * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, const bool async) {
  #ifdef UCL_DEBUG
  _check_ucl_copy_perm(dst,src);
  #endif
  if (async)
    ucl_copy(dst,src,rows,cols,dst.cq());
  else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    }
  // If we are here, at least one of the matrices must have VECTOR=0
  } else if (mat1::VECTOR) {
    #ifdef UCL_DEBUG
    assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat2::VECTOR==0);
    #endif
    ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows);
  } else if (mat2::VECTOR) {
    #ifdef UCL_DEBUG
    assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat1::VECTOR==0);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
               cols*sizeof(typename mat1::data_type),rows);
  } else {
    #ifdef UCL_DEBUG
    assert(src.rows()>=rows && src.cols()>=cols);
    assert(dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows);
  }
}
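
// Example (sketch): copy the upper left 2x3 tile of a (possibly padded)
// device matrix into a densely packed host vector (names and alloc()
// calls are illustrative):
//
//   UCL_D_Mat<float> d_mat;  d_mat.alloc(4,8,dev,UCL_READ_WRITE);
//   UCL_H_Vec<float> h_vec;  h_vec.alloc(6,dev,UCL_READ_WRITE);
//   ucl_copy(h_vec,d_mat,2,3,false);  // blocking; row padding is handled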

// --------------------------------------------------------------------------
// - 1D/2D COPY
// --------------------------------------------------------------------------

/// Copy matrix/vector with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
 * \param cast_buffer Buffer on host with enough storage for casting
 * - If the data types for the two matrices are the same, no cast is performed
 * - The number of bytes copied is determined by the entire src data
 * - Padding for 2D matrices is not considered in this routine.
 * - Copy from vector to matrix and vice versa allowed
 * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, const bool async) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,async);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async);
  else if (mat1::PADDED==1)
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async);
  else
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,async);
}

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
 * - If the data types for the two matrices are the same, no cast is performed
 * - The number of bytes copied is determined by the entire src data
 * - Padding for 2D matrices is not considered in this routine.
 * - Copy from vector to matrix and vice versa allowed
 * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, command_queue &cq) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,cq);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq);
  else if (mat1::PADDED==1)
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq);
  else
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,cq);
}

/// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by the entire src data
 * - If the data types of the two matrices are not the same,
 *   casting will be performed automatically as long as the copy is
 *   not device to device. For host/device transfers, a temporary
 *   buffer is created for copy. When multiple casts occur, it is
 *   more efficient to create a permanent casting buffer that can
 *   be passed to an alternative copy routine.
 * - The copy should handle padding for 2D alignment correctly
 * - Copy from vector to matrix and vice versa allowed
 * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) {
  if (dst.row_bytes()==src.row_bytes() &&
      src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
      (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,src.row_size()*src.rows(),cq);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_copy(dst,src,src.rows(),src.cols(),cq);
  else if (mat1::PADDED==1)
    ucl_copy(dst,src,dst.rows(),dst.cols(),cq);
  else
    ucl_copy(dst,src,src.numel(),cq);
}

/// Copy matrix/vector (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
 * - The number of bytes copied is determined by the entire src data
 * - If the data types of the two matrices are not the same,
 *   casting will be performed automatically as long as the copy is
 *   not device to device. For host/device transfers, a temporary
 *   buffer is created for copy. When multiple casts occur, it is
 *   more efficient to create a permanent casting buffer that can
 *   be passed to an alternative copy routine.
 * - The copy should handle padding for 2D alignment correctly
 * - Copy from vector to matrix and vice versa allowed
 * - The default stream is used for asynchronous copy
 * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
  if (async)
    ucl_copy(dst,src,dst.cq());
  else if (dst.row_bytes()==src.row_bytes() &&
           src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
           (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,src.row_size()*src.rows(),async);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_copy(dst,src,src.rows(),src.cols(),async);
  else if (mat1::PADDED==1)
    ucl_copy(dst,src,dst.rows(),dst.cols(),async);
  else
    ucl_copy(dst,src,src.numel(),async);
}
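
// Example (sketch): whole-object copies; geometry and padding are taken
// from the container types, so a single call suffices (names illustrative):
//
//   ucl_copy(d_mat,h_mat,true);        // asynchronous, default command queue
//   ucl_copy(h_mat,d_mat,d_mat.cq());  // asynchronous in d_mat's queue
//   ucl_copy(h_mat,d_mat,false);       // blocking copy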

#endif
