/***************************************************************************
                                 ucl_d_vec.h
                             -------------------
                               W. Michael Brown

  Vector Container on Device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// Row vector on device
/** A 1-row container of device memory. The backend (CUDA-RT, CUDA driver,
  * or OpenCL) is selected by the including header via the _UCL_DEVICE_PTR_MAT
  * and _OCL_MAT macros; all allocation/copy/free work is delegated to the
  * backend helpers (_device_alloc, _device_free, _device_zero, ...). **/
template <class numtyp>
class UCL_D_Vec : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 0,
    PADDED = 0,
    ROW_MAJOR = 1,
    VECTOR = 1
  };
  typedef numtyp data_type;

  UCL_D_Vec() : _row_bytes(0), _cols(0) {}
  ~UCL_D_Vec() { _device_free(*this); }

  /// Construct with n columns
  /** \sa alloc() **/
  UCL_D_Vec(const size_t n, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
    _cols(0) { alloc(n,device,kind); }

  /// Set up device vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param cq Default command queue for operations copied from another mat
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();

    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,cq,_row_bytes,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      UCL_GERYON_EXIT;
      #endif
      _row_bytes=0;
      return err;
    }

    _kind=kind;
    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }

  /// Set up device vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,device,_row_bytes,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      UCL_GERYON_EXIT;
      #endif
      _row_bytes=0;
      return err;
    }

    _kind=kind;
    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _offset=input.offset();
    _array=input.cbegin();
    // For OpenCL, retain the buffer/queue so the view keeps them alive
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin());
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *               (ignored for a 1-row vector view) **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) { view(input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows()*input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=dev.cq();
    _array=input;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *               (ignored for a 1-row vector view) **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev)
    // BUGFIX: previously forwarded the stride (dropping dev), which bound the
    // raw pointer to the Geryon-container overload and broke on input.cq().
    { view(input,rows,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    // OpenCL cannot offset a cl_mem handle; track the element offset instead
    _array=input.begin();
    _offset=offset+input.offset();
    CL_SAFE_CALL(clRetainMemObject(input.begin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin(),offset,sizeof(numtyp));
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *               (ignored for a 1-row vector view) **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols, const size_t stride)
    { view_offset(offset,input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input)
    { view_offset(offset,input,input.rows()*input.row_size()-offset); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols, UCL_Device &dev) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=dev.cq();

    #ifdef _OCL_MAT
    _array=input;
    _offset=offset;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #else
    #ifdef _UCL_DEVICE_PTR_MAT
    // Driver-API pointers are untyped; offset is applied in bytes
    _array=input+offset*sizeof(numtyp);
    #else
    _array=input+offset;
    #endif
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *               (ignored for a 1-row vector view) **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols,const size_t stride,UCL_Device &dev)
    // BUGFIX: previously forwarded the stride (dropping dev), which bound the
    // raw pointer to the Geryon-container overload and broke on input.cq().
    { view_offset(offset,input,rows,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,1,cols,dev); }

  /// Free memory and set size to 0
  inline void clear()
    // Reset _row_bytes too so row_bytes() agrees with numel()==0 after clear
    { _device_free(*this); _cols=0; _row_bytes=0; _kind=UCL_VIEW; }

  /// Resize the allocation to contain cols elements
  /** \note Cannot be used on views **/
  inline int resize(const int cols) {
    assert(_kind!=UCL_VIEW);

    _row_bytes=cols*sizeof(numtyp);
    int err=_device_resize(*this,_row_bytes);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      UCL_GERYON_EXIT;
      #endif
      _row_bytes=0;
      return err;
    }

    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }

  /// Resize (only if bigger) the allocation to contain cols elements
  /** \note Cannot be used on views **/
  inline int resize_ib(const int cols)
    { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }

  /// Set each element to zero asynchronously in the default command_queue
  inline void zero() { zero(_cq); }
  /// Set first n elements to zero asynchronously in the default command_queue
  inline void zero(const int n) { zero(n,_cq); }
  /// Set each element to zero asynchronously
  inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
  /// Set first n elements to zero asynchronously
  inline void zero(const int n, command_queue &cq)
    { _device_zero(*this,n*sizeof(numtyp),cq); }

  #ifdef _UCL_DEVICE_PTR_MAT
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline device_ptr & begin() { return _array; }
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline const device_ptr & begin() const { return _array; }
  #else
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * & begin() { return _array; }
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * const & begin() const { return _array; }
  /// For CUDA-RT, get device pointer to one past last element
  inline numtyp * end() { return _end; }
  /// For CUDA-RT, get device pointer to one past last element
  inline numtyp * end() const { return _end; }
  #endif

  #ifdef _UCL_DEVICE_PTR_MAT
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline device_ptr & cbegin() { return _array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline const device_ptr & cbegin() const { return _array; }
  #else
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline numtyp ** cbegin() { return &_array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline const numtyp ** cbegin() const { return &_array; }
  /// For CUDA-RT, allocate row vector and bind texture
  inline void safe_alloc(const size_t cols, UCL_Device &dev,
                         textureReference *t)
    { alloc(cols,dev); assign_texture(t); bind(); }
  /// For CUDA-RT, assign a texture to matrix
  inline void assign_texture(textureReference *t) { _tex_ptr=t; }
  /// For CUDA-RT, bind to texture
  inline void bind() {
    cuda_gb_get_channel<numtyp>(_channel);
    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
    (*_tex_ptr).filterMode = cudaFilterModePoint;
    (*_tex_ptr).normalized = false;
    CUDA_SAFE_CALL(cudaBindTexture(nullptr,_tex_ptr,_array,&_channel));
  }
  /// For CUDA-RT, unbind texture
  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
  #endif

  /// Get the number of elements
  inline size_t numel() const { return _cols; }
  /// Get the number of rows
  inline size_t rows() const { return 1; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _cols; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _row_bytes; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

  #ifdef _OCL_MAT
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return _offset; }
  #else
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
  #endif

  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return offset()*sizeof(numtyp); }

 private:
  size_t _row_bytes, _row_size, _rows, _cols;

  #ifdef _UCL_DEVICE_PTR_MAT
  device_ptr _array;
  #else
  numtyp *_array,*_end;
  cudaChannelFormatDesc _channel;
  textureReference *_tex_ptr;
  #endif

  #ifdef _OCL_MAT
  size_t _offset;
  #endif
};

#endif