1 /***************************************************************************
2                                  ucl_d_vec.h
3                              -------------------
4                                W. Michael Brown
5 
6   Vector Container on Device
7 
8  __________________________________________________________________________
9     This file is part of the Geryon Unified Coprocessor Library (UCL)
10  __________________________________________________________________________
11 
12     begin                : Thu Jun 25 2009
13     copyright            : (C) 2009 by W. Michael Brown
14     email                : brownw@ornl.gov
15  ***************************************************************************/
16 
17 /* -----------------------------------------------------------------------
18    Copyright (2009) Sandia Corporation.  Under the terms of Contract
19    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
20    certain rights in this software.  This software is distributed under
21    the Simplified BSD License.
22    ----------------------------------------------------------------------- */
23 
24 // Only allow this file to be included by CUDA and OpenCL specific headers
25 #ifdef _UCL_MAT_ALLOW
26 
27 /// Row vector on device
28 template <class numtyp>
29 class UCL_D_Vec : public UCL_BaseMat {
30  public:
31   // Traits for copying data
32   // MEM_TYPE is 0 for device, 1 for host, and 2 for image
33   enum traits {
34     DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
35     MEM_TYPE = 0,
36     PADDED = 0,
37     ROW_MAJOR = 1,
38     VECTOR = 1
39   };
40   typedef numtyp data_type;
41 
UCL_D_Vec()42  UCL_D_Vec() : _row_bytes(0), _cols(0) {}
~UCL_D_Vec()43   ~UCL_D_Vec() { _device_free(*this); }
44 
45   /// Construct with n columns
46   /** \sa alloc() **/
47   UCL_D_Vec(const size_t n, UCL_Device &device,
48             const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
49     _cols(0) { alloc(n,device,kind); }
50 
51   /// Set up host vector with 'cols' columns and reserve memory
52   /** The kind parameter controls memory optimizations as follows:
53     * - UCL_READ_WRITE - Specify that you will read and write in kernels
54     * - UCL_WRITE_ONLY - Specify that you will only write in kernels
55     * - UCL_READ_ONLY  - Specify that you will only read in kernels
56     * \param cq Default command queue for operations copied from another mat
57     * \return UCL_SUCCESS if the memory allocation is successful **/
58   template <class mat_type>
59   inline int alloc(const size_t cols, mat_type &cq,
60                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
61 
62     clear();
63 
64     _row_bytes=cols*sizeof(numtyp);
65     int err=_device_alloc(*this,cq,_row_bytes,kind);
66     if (err!=UCL_SUCCESS) {
67       #ifndef UCL_NO_EXIT
68       std::cerr << "UCL Error: Could not allocate " << _row_bytes
69                 << " bytes on device.\n";
70       _row_bytes=0;
71       UCL_GERYON_EXIT;
72       #endif
73       _row_bytes=0;
74       return err;
75     }
76 
77     _kind=kind;
78     _cols=cols;
79     #ifndef _UCL_DEVICE_PTR_MAT
80     _end=_array+cols;
81     #endif
82     #ifdef _OCL_MAT
83     _offset=0;
84     #endif
85     return err;
86   }
87 
88   /// Set up host vector with 'cols' columns and reserve memory
89   /** The kind parameter controls memory optimizations as follows:
90     * - UCL_READ_WRITE - Specify that you will read and write in kernels
91     * - UCL_WRITE_ONLY - Specify that you will only write in kernels
92     * - UCL_READ_ONLY  - Specify that you will only read in kernels
93     * \param device Used to get the default command queue for operations
94     * \return UCL_SUCCESS if the memory allocation is successful **/
95   inline int alloc(const size_t cols, UCL_Device &device,
96                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
97     clear();
98     _row_bytes=cols*sizeof(numtyp);
99     int err=_device_alloc(*this,device,_row_bytes,kind);
100     if (err!=UCL_SUCCESS) {
101       #ifndef UCL_NO_EXIT
102       std::cerr << "UCL Error: Could not allocate " << _row_bytes
103                 << " bytes on device.\n";
104       _row_bytes=0;
105       UCL_GERYON_EXIT;
106       #endif
107       _row_bytes=0;
108       return err;
109     }
110 
111     _kind=kind;
112     _cols=cols;
113     #ifndef _UCL_DEVICE_PTR_MAT
114     _end=_array+cols;
115     #endif
116     #ifdef _OCL_MAT
117     _offset=0;
118     #endif
119     return err;
120   }
121 
122   /// Do not allocate memory, instead use an existing allocation from Geryon
123   /** This function must be passed a Geryon vector or matrix container.
124     * No memory is freed when the object is destructed.
125     * - The view does not prevent the memory from being freed by the
126     *   allocating container when using CUDA APIs **/
127   template <class ucl_type>
view(ucl_type & input,const size_t rows,const size_t cols)128   inline void view(ucl_type &input, const size_t rows, const size_t cols) {
129     #ifdef UCL_DEBUG
130     assert(rows==1);
131     #endif
132     clear();
133     _kind=UCL_VIEW;
134     _cols=cols;
135     _row_bytes=_cols*sizeof(numtyp);
136     this->_cq=input.cq();
137     #ifdef _OCL_MAT
138     _offset=input.offset();
139     _array=input.cbegin();
140     CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
141     CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
142     #else
143     _device_view(&_array,input.begin());
144     #endif
145 
146     #ifndef _UCL_DEVICE_PTR_MAT
147     _end=_array+_cols;
148     #endif
149   }
150 
151   /// Do not allocate memory, instead use an existing allocation from Geryon
152   /** This function must be passed a Geryon vector or matrix container.
153     * No memory is freed when the object is destructed.
154     * - The view does not prevent the memory from being freed by the
155     *   allocating container when using CUDA APIs
156     * \param stride Number of _elements_ between the start of each row **/
157   template <class ucl_type>
view(ucl_type & input,const size_t rows,const size_t cols,const size_t stride)158   inline void view(ucl_type &input, const size_t rows, const size_t cols,
159                    const size_t stride) { view(input,rows,cols); }
160 
161   /// Do not allocate memory, instead use an existing allocation from Geryon
162   /** This function must be passed a Geryon vector or matrix container.
163     * No memory is freed when the object is destructed.
164     * - The view does not prevent the memory from being freed by the
165     *   allocating container when using CUDA APIs
166     * - If a matrix is used a input, all elements (including padding)
167     *   will be used for view **/
168   template <class ucl_type>
view(ucl_type & input,const size_t cols)169   inline void view(ucl_type &input, const size_t cols)
170     { view(input,1,cols); }
171 
172   /// Do not allocate memory, instead use an existing allocation from Geryon
173   /** This function must be passed a Geryon vector or matrix container.
174     * No memory is freed when the object is destructed.
175     * - The view does not prevent the memory from being freed by the
176     *   allocating container when using CUDA APIs
177     * - If a matrix is used a input, all elements (including padding)
178     *   will be used for view **/
179   template <class ucl_type>
view(ucl_type & input)180   inline void view(ucl_type &input)
181     { view(input,input.rows()*input.row_size()); }
182 
183   /// Do not allocate memory, instead use an existing allocation
184   /** - No memory is freed when the object is destructed.
185     * - The view does not prevent the memory from being freed by the
186     *   allocating container when using CUDA APIs **/
187   template <class ptr_type>
view(ptr_type input,const size_t rows,const size_t cols,UCL_Device & dev)188   inline void view(ptr_type input, const size_t rows, const size_t cols,
189                    UCL_Device &dev) {
190     #ifdef UCL_DEBUG
191     assert(rows==1);
192     #endif
193     clear();
194     _kind=UCL_VIEW;
195     _cols=cols;
196     _row_bytes=_cols*sizeof(numtyp);
197     this->_cq=dev.cq();
198     _array=input;
199     #ifndef _UCL_DEVICE_PTR_MAT
200     _end=_array+_cols;
201     #endif
202     #ifdef _OCL_MAT
203     _offset=0;
204     CL_SAFE_CALL(clRetainMemObject(input));
205     CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
206     #endif
207   }
208 
209   /// Do not allocate memory, instead use an existing allocation
210   /** - No memory is freed when the object is destructed.
211     * - The view does not prevent the memory from being freed by the
212     *   allocating container when using CUDA APIs
213     * \param stride Number of _elements_ between the start of each row **/
214   template <class ptr_type>
view(ptr_type input,const size_t rows,const size_t cols,const size_t stride,UCL_Device & dev)215   inline void view(ptr_type input, const size_t rows, const size_t cols,
216                    const size_t stride, UCL_Device &dev)
217     { view(input,rows,cols,stride); }
218 
219   /// Do not allocate memory, instead use an existing allocation
220   /** - No memory is freed when the object is destructed.
221     * - The view does not prevent the memory from being freed by the
222     *   allocating container when using CUDA APIs **/
223   template <class ptr_type>
view(ptr_type input,const size_t cols,UCL_Device & dev)224   inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
225     { view(input,1,cols,dev); }
226 
227   /// Do not allocate memory, instead use an existing allocation from Geryon
228   /** This function must be passed a Geryon vector or matrix container.
229     * No memory is freed when the object is destructed.
230     * - The view does not prevent the memory from being freed by the
231     *   allocating container when using CUDA APIs **/
232   template <class ucl_type>
view_offset(const size_t offset,ucl_type & input,const size_t rows,const size_t cols)233   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
234                           const size_t cols) {
235     #ifdef UCL_DEBUG
236     assert(rows==1);
237     #endif
238     clear();
239     _kind=UCL_VIEW;
240     _cols=cols;
241     _row_bytes=_cols*sizeof(numtyp);
242     this->_cq=input.cq();
243     #ifdef _OCL_MAT
244     _array=input.begin();
245     _offset=offset+input.offset();
246     CL_SAFE_CALL(clRetainMemObject(input.begin()));
247     CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
248     #else
249     _device_view(&_array,input.begin(),offset,sizeof(numtyp));
250     #endif
251 
252     #ifndef _UCL_DEVICE_PTR_MAT
253     _end=_array+_cols;
254     #endif
255   }
256 
257   /// Do not allocate memory, instead use an existing allocation from Geryon
258   /** This function must be passed a Geryon vector or matrix container.
259     * No memory is freed when the object is destructed.
260     * - The view does not prevent the memory from being freed by the
261     *   allocating container when using CUDA APIs
262     * \param stride Number of _elements_ between the start of each row **/
263   template <class ucl_type>
view_offset(const size_t offset,ucl_type & input,const size_t rows,const size_t cols,const size_t stride)264   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
265                           const size_t cols, const size_t stride)
266     { view_offset(offset,input,rows,cols); }
267 
268   /// Do not allocate memory, instead use an existing allocation from Geryon
269   /** This function must be passed a Geryon vector or matrix container.
270     * No memory is freed when the object is destructed.
271     * - The view does not prevent the memory from being freed by the
272     *   allocating container when using CUDA APIs
273     * - If a matrix is used a input, all elements (including padding)
274     *   will be used for view **/
275   template <class ucl_type>
view_offset(const size_t offset,ucl_type & input,const size_t cols)276   inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
277     { view_offset(offset,input,1,cols); }
278 
279   /// Do not allocate memory, instead use an existing allocation from Geryon
280   /** This function must be passed a Geryon vector or matrix container.
281     * No memory is freed when the object is destructed.
282     * - The view does not prevent the memory from being freed by the
283     *   allocating container when using CUDA APIs
284     * - If a matrix is used a input, all elements (including padding)
285     *   will be used for view **/
286   template <class ucl_type>
view_offset(const size_t offset,ucl_type & input)287   inline void view_offset(const size_t offset, ucl_type &input)
288     { view_offset(offset,input,input.rows()*input.row_size()-offset); }
289 
290   /// Do not allocate memory, instead use an existing allocation
291   /** - No memory is freed when the object is destructed.
292     * - The view does not prevent the memory from being freed by the
293     *   allocating container when using CUDA APIs **/
294   template <class ptr_type>
view_offset(const size_t offset,ptr_type input,const size_t rows,const size_t cols,UCL_Device & dev)295   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
296                           const size_t cols, UCL_Device &dev) {
297     #ifdef UCL_DEBUG
298     assert(rows==1);
299     #endif
300     clear();
301     _kind=UCL_VIEW;
302     _cols=cols;
303     _row_bytes=_cols*sizeof(numtyp);
304     this->_cq=dev.cq();
305 
306     #ifdef _OCL_MAT
307     _array=input;
308     _offset=offset;
309     CL_SAFE_CALL(clRetainMemObject(input));
310     CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
311     #else
312     #ifdef _UCL_DEVICE_PTR_MAT
313     _array=input+offset*sizeof(numtyp);
314     #else
315     _array=input+offset;
316     #endif
317     #endif
318 
319     #ifndef _UCL_DEVICE_PTR_MAT
320     _end=_array+_cols;
321     #endif
322   }
323 
324   /// Do not allocate memory, instead use an existing allocation
325   /** - No memory is freed when the object is destructed.
326     * - The view does not prevent the memory from being freed by the
327     *   allocating container when using CUDA APIs
328     * \param stride Number of _elements_ between the start of each row **/
329   template <class ptr_type>
view_offset(const size_t offset,ptr_type input,const size_t rows,const size_t cols,const size_t stride,UCL_Device & dev)330   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
331                           const size_t cols,const size_t stride,UCL_Device &dev)
332     { view_offset(offset,input,rows,cols,stride); }
333 
334   /// Do not allocate memory, instead use an existing allocation
335   /** - No memory is freed when the object is destructed.
336     * - The view does not prevent the memory from being freed by the
337     *   allocating container when using CUDA APIs **/
338   template <class ptr_type>
view_offset(const size_t offset,ptr_type input,const size_t cols,UCL_Device & dev)339   inline void view_offset(const size_t offset, ptr_type input,
340                           const size_t cols, UCL_Device &dev)
341     { view_offset(offset,input,1,cols,dev); }
342 
343   /// Free memory and set size to 0
clear()344   inline void clear()
345     { _device_free(*this); _cols=0; _kind=UCL_VIEW;  }
346 
347   /// Resize the allocation to contain cols elements
348   /** \note Cannot be used on views **/
resize(const int cols)349   inline int resize(const int cols) {
350     assert(_kind!=UCL_VIEW);
351 
352     _row_bytes=cols*sizeof(numtyp);
353     int err=_device_resize(*this,_row_bytes);
354     if (err!=UCL_SUCCESS) {
355       #ifndef UCL_NO_EXIT
356       std::cerr << "UCL Error: Could not allocate " << _row_bytes
357                 << " bytes on device.\n";
358       _row_bytes=0;
359       UCL_GERYON_EXIT;
360       #endif
361       _row_bytes=0;
362       return err;
363     }
364 
365     _cols=cols;
366     #ifndef _UCL_DEVICE_PTR_MAT
367     _end=_array+cols;
368     #endif
369     #ifdef _OCL_MAT
370     _offset=0;
371     #endif
372     return err;
373   }
374 
375   /// Resize (only if bigger) the allocation to contain cols elements
376   /** \note Cannot be used on views **/
resize_ib(const int cols)377   inline int resize_ib(const int cols)
378     { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
379 
380   /// Set each element to zero asynchronously in the default command_queue
zero()381   inline void zero() { zero(_cq); }
382   /// Set first n elements to zero asynchronously in the default command_queue
zero(const int n)383   inline void zero(const int n) { zero(n,_cq); }
384   /// Set each element to zero asynchronously
zero(command_queue & cq)385   inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
386   /// Set first n elements to zero asynchronously
zero(const int n,command_queue & cq)387   inline void zero(const int n, command_queue &cq)
388     { _device_zero(*this,n*sizeof(numtyp),cq); }
389 
390   #ifdef _UCL_DEVICE_PTR_MAT
391   /// For OpenCL, returns a (void *) device pointer to memory allocation
begin()392   inline device_ptr & begin() { return _array; }
393   /// For OpenCL, returns a (void *) device pointer to memory allocation
begin()394   inline const device_ptr & begin() const { return _array; }
395   #else
396   /// For CUDA-RT, get device pointer to first element
begin()397   inline numtyp * & begin() { return _array; }
398   /// For CUDA-RT, get device pointer to first element
begin()399   inline numtyp * const & begin() const { return _array; }
400   /// For CUDA-RT, get device pointer to one past last element
end()401   inline numtyp * end() { return _end; }
402   /// For CUDA-RT, get device pointer to one past last element
end()403   inline numtyp * end() const { return _end; }
404   #endif
405 
406   #ifdef _UCL_DEVICE_PTR_MAT
407   /// Returns an API specific device pointer
408   /** - For OpenCL, returns a &cl_mem object
409     * - For CUDA Driver, returns a &CUdeviceptr
410     * - For CUDA-RT, returns void** **/
cbegin()411   inline device_ptr & cbegin() { return _array; }
412   /// Returns an API specific device pointer
413   /** - For OpenCL, returns a &cl_mem object
414     * - For CUDA Driver, returns a &CUdeviceptr
415     * - For CUDA-RT, returns void** **/
cbegin()416   inline const device_ptr & cbegin() const { return _array; }
417   #else
418   /// Returns an API specific device pointer
419   /** - For OpenCL, returns a &cl_mem object
420     * - For CUDA Driver, returns a &CUdeviceptr
421     * - For CUDA-RT, returns numtyp** **/
cbegin()422   inline numtyp ** cbegin() { return &_array; }
423   /// Returns an API specific device pointer
424   /** - For OpenCL, returns a &cl_mem object
425     * - For CUDA Driver, returns a &CUdeviceptr
426     * - For CUDA-RT, returns numtyp** **/
cbegin()427   inline const numtyp ** cbegin() const { return &_array; }
428   /// For CUDA-RT, allocate row vector and bind texture
safe_alloc(const size_t cols,UCL_Device & dev,textureReference * t)429   inline void safe_alloc(const size_t cols, UCL_Device &dev,
430                          textureReference *t)
431     { alloc(cols,dev); assign_texture(t); bind(); }
432   /// For CUDA-RT, assign a texture to matrix
assign_texture(textureReference * t)433   inline void assign_texture(textureReference *t) { _tex_ptr=t; }
434   /// For CUDA-RT, bind to texture
bind()435   inline void bind() {
436     cuda_gb_get_channel<numtyp>(_channel);
437     (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
438     (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
439     (*_tex_ptr).filterMode = cudaFilterModePoint;
440     (*_tex_ptr).normalized = false;
441     CUDA_SAFE_CALL(cudaBindTexture(nullptr,_tex_ptr,_array,&_channel));
442   }
443   /// For CUDA-RT, unbind texture
unbind()444   inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
445   #endif
446 
447   /// Get the number of elements
numel()448   inline size_t numel() const { return _cols; }
449   /// Get the number of rows
rows()450   inline size_t rows() const { return 1; }
451   /// Get the number of columns
cols()452   inline size_t cols() const { return _cols; }
453   ///Get the size of a row (including any padding) in elements
row_size()454   inline size_t row_size() const { return _cols; }
455   /// Get the size of a row (including any padding) in bytes
row_bytes()456   inline size_t row_bytes() const { return _row_bytes; }
457   /// Get the size in bytes of 1 element
element_size()458   inline int element_size() const { return sizeof(numtyp); }
459 
460   #ifdef _OCL_MAT
461   /// Return the offset (in elements) from begin() pointer where data starts
462   /** \note Always 0 for host matrices and CUDA APIs **/
offset()463   inline size_t offset() const { return _offset; }
464   #else
465   /// Return the offset (in elements) from begin() pointer where data starts
466   /** \note Always 0 for host matrices and CUDA APIs **/
offset()467   inline size_t offset() const { return 0; }
468   #endif
469 
470   /// Return the offset (in bytes) from begin() pointer where data starts
471   /** \note Always 0 for host matrices and CUDA APIs **/
byteoff()472   inline size_t byteoff() const { return offset()*sizeof(numtyp); }
473 
474  private:
475   size_t _row_bytes, _row_size, _rows, _cols;
476 
477   #ifdef _UCL_DEVICE_PTR_MAT
478   device_ptr _array;
479   #else
480   numtyp *_array,*_end;
481   cudaChannelFormatDesc _channel;
482   textureReference *_tex_ptr;
483   #endif
484 
485   #ifdef _OCL_MAT
486   size_t _offset;
487   #endif
488 };
489 
490 #endif
491 
492