1 #ifndef VEXCL_VECTOR_HPP
2 #define VEXCL_VECTOR_HPP
3 
4 /*
5 The MIT License
6 
7 Copyright (c) 2012-2018 Denis Demidov <dennis.demidov@gmail.com>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 */
27 
28 /**
29  * \file   vexcl/vector.hpp
30  * \author Denis Demidov <dennis.demidov@gmail.com>
31  * \brief  OpenCL device vector.
32  */
33 
34 #include <vector>
35 #include <map>
36 #include <iostream>
37 #include <iomanip>
38 #include <sstream>
39 #include <stdexcept>
40 #include <string>
41 #include <type_traits>
42 #include <functional>
43 
44 #include <boost/proto/proto.hpp>
45 #include <boost/io/ios_state.hpp>
46 #include <boost/iterator/iterator_facade.hpp>
47 #include <boost/thread.hpp>
48 
49 #include <vexcl/backend.hpp>
50 #include <vexcl/util.hpp>
51 #include <vexcl/operations.hpp>
52 #include <vexcl/profiler.hpp>
53 #include <vexcl/devlist.hpp>
54 
55 #ifdef BOOST_NO_NOEXCEPT
56 #  define noexcept throw()
57 #endif
58 
59 /// Vector expression template library for OpenCL.
60 namespace vex {
61 
62 //--- Partitioning ----------------------------------------------------------
63 
/// Weights a device with respect to vector performance.
65 /**
66  * Launches the following kernel on each device:
67  \code
68  a = b + c;
69  \endcode
70  * where a, b and c are device vectors. Each device gets portion of the vector
71  * proportional to the performance of this operation.
72  */
73 inline double device_vector_perf(const backend::command_queue&);
74 
75 /// Assigns equal weight to each device.
76 /**
77  * This results in equal partitioning.
78  */
equal_weights(const backend::command_queue &)79 inline double equal_weights(const backend::command_queue&) {
80     return 1;
81 }
82 
83 template <bool dummy = true>
84 struct partitioning_scheme {
85     static_assert(dummy, "dummy parameter should be true");
86 
87     typedef std::function< double(const backend::command_queue&) > weight_function;
88 
setvex::partitioning_scheme89     static void set(weight_function f) {
90         boost::lock_guard<boost::mutex> lock(mx);
91 
92         if (!is_set) {
93             weight = f;
94             is_set = true;
95         } else {
96             std::cerr <<
97                 "Warning: "
98                 "device weighting function is already set and will be left as is."
99                 << std::endl;
100         }
101     }
102 
103     static std::vector<size_t> get(size_t n, const std::vector<backend::command_queue> &queue);
104 
105     private:
106         static bool is_set;
107         static weight_function weight;
108         static std::map<backend::device_id, double> device_weight;
109         static boost::mutex mx;
110 
init_weight_functionvex::partitioning_scheme111         static bool init_weight_function() {
112             boost::lock_guard<boost::mutex> lock(mx);
113             if (!is_set) {
114                 weight = device_vector_perf;
115                 is_set = true;
116             }
117             return true;
118         }
119 };
120 
// Out-of-class definitions of partitioning_scheme's static data members.

// False until a weighting function has been installed by set() or by
// init_weight_function().
template <bool dummy>
bool partitioning_scheme<dummy>::is_set = false;

// Weights already computed, keyed by device id; filled in by get().
template <bool dummy>
std::map<backend::device_id, double> partitioning_scheme<dummy>::device_weight;

// Mutex locked by set() and init_weight_function().
template <bool dummy>
boost::mutex partitioning_scheme<dummy>::mx;
129 
130 template <bool dummy>
get(size_t n,const std::vector<backend::command_queue> & queue)131 std::vector<size_t> partitioning_scheme<dummy>::get(size_t n,
132         const std::vector<backend::command_queue> &queue)
133 {
134     static const bool once = init_weight_function();
135     (void)once; // do not warn about unused variable
136 
137     std::vector<size_t> part;
138     part.reserve(queue.size() + 1);
139     part.push_back(0);
140 
141     if (queue.size() > 1) {
142         std::vector<double> cumsum;
143         cumsum.reserve(queue.size() + 1);
144         cumsum.push_back(0);
145 
146         for(auto q = queue.begin(); q != queue.end(); q++) {
147             auto dev_id = backend::get_device_id(*q);
148             auto dw = device_weight.find(dev_id);
149 
150             double w = (dw == device_weight.end()) ?
151                 (device_weight[dev_id] = weight(*q)) :
152                 dw->second;
153 
154             cumsum.push_back(cumsum.back() + w);
155         }
156 
157         for(unsigned d = 1; d < queue.size(); d++)
158             part.push_back(
159                     std::min(n,
160                         alignup(static_cast<size_t>(n * cumsum[d] / cumsum.back()))
161                         )
162                     );
163     }
164 
165     part.push_back(n);
166     return part;
167 }
168 
// Out-of-class definition of the installed weighting function. It starts out
// default-constructed; set() or init_weight_function() assigns it.
template <bool dummy>
typename partitioning_scheme<dummy>::weight_function partitioning_scheme<dummy>::weight;
171 
172 /// Partitioning scheme for vectors and matrices.
173 /**
174  * Should be set once before any object of vector or matrix type is declared.
 * Otherwise the default partitioning function (device_vector_perf) is
 * selected.
177  */
set_partitioning(std::function<double (const backend::command_queue &)> f)178 inline void set_partitioning(
179         std::function< double(const backend::command_queue&) > f
180         )
181 {
182     partitioning_scheme<>::set(f);
183 }
184 
185 /// Returns partitioning for the specified vector size on a given set of queues.
partition(size_t n,const std::vector<backend::command_queue> & queue)186 inline std::vector<size_t> partition(size_t n,
187             const std::vector<backend::command_queue> &queue)
188 {
189     return partitioning_scheme<>::get(n, queue);
190 }
191 
192 
193 //--- Vector Type -----------------------------------------------------------
// Empty tag type stored in the proto terminal below.
struct vector_terminal {};

// vector_expression wrapped around a proto terminal of vector_terminal;
// vex::vector derives from this type.
typedef vector_expression<
    typename boost::proto::terminal< vector_terminal >::type
    > vector_terminal_expression;
199 
namespace traits {

// Hold vector terminals by reference:
// this partial specialization is enabled (through std::enable_if) for every
// T whose proto as_expr type matches a terminal of vector_terminal, and it
// derives from std::true_type.
template <class T>
struct hold_terminal_by_reference< T,
        typename std::enable_if<
            boost::proto::matches<
                typename boost::proto::result_of::as_expr< T >::type,
                boost::proto::terminal< vector_terminal >
            >::value
        >::type
    >
    : std::true_type
{ };

} // namespace traits
216 
217 /// \defgroup containers Container classes
218 
219 /// Device vector.
220 template <typename T>
221 class vector : public vector_terminal_expression {
222     public:
// Element type stored in the vector.
typedef T      value_type;
// Type used for sizes and indices.
typedef size_t size_type;
225 
226         // Proxy class.
227         //
228         // Instances of this class are returned from vector::operator[]. These
// may be used to read or write a single element of a vector, although
// these operations are too expensive to be used extensively and should
// be reserved for debugging purposes.
232         class element {
233             public:
234                 // Reads the associated element of a vector.
operator T() const235                 operator T() const {
236                     T val = T();
237                     buf.read(queue, index, 1, &val, true);
238                     return val;
239                 }
240 
241                 // Writes the associated element of a vector.
operator =(T val)242                 T operator=(T val) {
243                     buf.write(queue, index, 1, &val, true);
244                     return val;
245                 }
246 
operator =(const element & other)247                 T operator=(const element &other) {
248                     return (*this) = static_cast<T>(other);
249                 }
250 
swap(element && a,element && b)251                 friend void swap(element &&a, element &&b) {
252                     T tmp = static_cast<T>(a);
253                     a     = static_cast<T>(b);
254                     b     = tmp;
255                 }
256 
257             private:
element(const backend::command_queue & q,const backend::device_vector<T> & b,size_t i)258                 element(const backend::command_queue    &q,
259                         const backend::device_vector<T> &b,
260                         size_t i
261                         ) : queue(q), buf(b), index(i)
262                 {}
263 
264                 const backend::command_queue    &queue;
265                 const backend::device_vector<T> &buf;
266 
267                 size_t index;
268 
269                 friend class vector;
270         };
271 
272         //  Iterator class.
273         //
274         // This class may in principle be used with standard template library,
275         // although its main purpose is range specification for vector copy
276         // operations.
277         template <class vector_type, class element_type>
278         class iterator_type
279             : public boost::iterator_facade<
280                         iterator_type<vector_type, element_type>,
281                         T,
282                         std::random_access_iterator_tag,
283                         element_type
284                      >
285         {
286             public:
287                 typedef boost::iterator_facade<
288                             iterator_type<vector_type, element_type>,
289                             T,
290                             std::random_access_iterator_tag,
291                             element_type
292                          > super_type;
293                 typedef typename super_type::reference       reference;
294                 typedef typename super_type::difference_type difference_type;
295 
296                 static const bool device_iterator = true;
297 
298                 vector_type *vec;
299                 size_t  pos;
300                 size_t  part;
301 
302             private:
303                 friend class ::boost::iterator_core_access;
304                 friend class vector;
305 
iterator_type(vector_type & vec,size_t pos)306                 iterator_type(vector_type &vec, size_t pos)
307                     : vec(&vec), pos(pos), part(0)
308                 {
309                     if (!vec.part.empty()) {
310                         part = std::upper_bound(
311                                 vec.part.begin(), vec.part.end(), pos
312                                 ) - vec.part.begin() - 1;
313                     }
314                 }
315 
dereference() const316                 reference dereference() const {
317                     return element_type(
318                             vec->queue[part], vec->buf[part],
319                             pos - vec->part[part]
320                             );
321                 }
322 
equal(const iterator_type & it) const323                 bool equal(const iterator_type &it) const {
324                     return pos == it.pos;
325                 }
326 
increment()327                 void increment() {
328                     ++pos;
329                     while (part < vec->nparts() && pos >= vec->part[part + 1])
330                         ++part;
331                 }
332 
decrement()333                 void decrement() {
334                     --pos;
335                     while (part > 0 && pos < vec->part[part])
336                         --part;
337                 }
338 
advance(difference_type n)339                 void advance(difference_type n) {
340                     pos += n;
341                     if (n > 0) {
342                         while (part < vec->nparts() && pos >= vec->part[part + 1])
343                             ++part;
344                     } else if (n < 0) {
345                         while (part > 0 && pos < vec->part[part])
346                             --part;
347                     }
348                 }
349 
distance_to(const iterator_type & it) const350                 difference_type distance_to(const iterator_type &it) const {
351                     return static_cast<difference_type>(it.pos - pos);
352                 }
353         };
354 
// Iterator over a non-const vector; dereferences to element proxies.
typedef iterator_type<vector, element> iterator;
// Iterator over a const vector; dereferences to const element proxies.
typedef iterator_type<const vector, const element> const_iterator;
357 
358         /// Empty constructor.
// Initializes no members explicitly; size() reports 0 while the partition
// table is empty.
vector() {}
360 
361 #ifdef VEXCL_NO_COPY_CONSTRUCTORS
362     private:
363 #endif
364         /// Copy constructor.
vector(const vector & v)365         vector(const vector &v) : queue(v.queue), part(v.part)
366         {
367 #ifdef VEXCL_SHOW_COPIES
368             std::cout << "Copying vex::vector<" << type_name<T>()
369                       << "> of size " << size() << std::endl;
370 #endif
371             if (size()) allocate_buffers(backend::MEM_READ_WRITE, 0);
372             *this = v;
373         }
374 #ifdef VEXCL_NO_COPY_CONSTRUCTORS
375     public:
376 #endif
377 
378         /// Move constructor
vector(vector &&v) noexcept {
    // Start from default-constructed members and take over v's state by
    // swapping; v is left holding those default-constructed members.
    this->swap(v);
}
382 
383         /// Wraps a native buffer without owning it.
384         /**
385          * May be used to apply VexCL functions to buffers allocated and
386          * managed outside of VexCL.
387          */
vector(const backend::command_queue &q,
       const backend::device_vector<T> &buffer,
       size_t size = 0
       ) : queue(1, q), part(2), buf(1, buffer)
{
    // Single partition covering [0, size). A size of zero means "use the
    // size reported by the buffer".
    part.front() = 0;

    if (size != 0)
        part.back() = size;
    else
        part.back() = buffer.size();
}
396 
397         /// Creates vector of the given size and optionally copies host data.
vector(const std::vector<backend::command_queue> &queue,
        size_t size, const T *host = 0,
        backend::mem_flags flags = backend::MEM_READ_WRITE
      ) : queue(queue), part(vex::partition(size, queue))
{
    // Nothing to allocate for an empty vector.
    if (size == 0) return;

    allocate_buffers(flags, host);
}
405 
406 #ifndef VEXCL_NO_STATIC_CONTEXT_CONSTRUCTORS
407         /// Creates vector of the given size and optionally copies host data.
408         /** This version uses the most recently created VexCL context.  */
vector(size_t size,const T * host=0,backend::mem_flags flags=backend::MEM_READ_WRITE)409         vector(size_t size, const T *host = 0,
410                 backend::mem_flags flags = backend::MEM_READ_WRITE
411               ) : queue(current_context().queue()), part(vex::partition(size, queue))
412         {
413             if (size) allocate_buffers(flags, host);
414         }
415 #endif
416 
417         /// Creates new device vector and copies the host vector.
vector(const std::vector<backend::command_queue> &queue,
        const std::vector<T> &host,
        backend::mem_flags flags = backend::MEM_READ_WRITE
      ) : queue(queue), part(vex::partition(host.size(), queue))
{
    // An empty host vector yields a vector without buffers.
    if (host.empty()) return;

    allocate_buffers(flags, host.data());
}
425 
426 #ifndef VEXCL_NO_STATIC_CONTEXT_CONSTRUCTORS
427         /// Creates new device vector and copies the host vector.
428         /** This version uses the most recently created VexCL context.  */
vector(const std::vector<T> & host,backend::mem_flags flags=backend::MEM_READ_WRITE)429         vector(const std::vector<T> &host,
430                 backend::mem_flags flags = backend::MEM_READ_WRITE
431               ) : queue(current_context().queue()), part(vex::partition(host.size(), queue))
432         {
433             if (!host.empty()) allocate_buffers(flags, host.data());
434         }
435 #endif
436 
437         /// Constructs new vector from vector expression.
438         /**
439          * This will fail if VexCL is unable to automatically determine the
440          * expression size and the compute devices to use.
441          */
// When default template arguments for function templates are available
// (and DOXYGEN is not defined), the Enable parameter removes this
// constructor from overload resolution for integral types and for types
// that do not match vector_expr_grammar. Otherwise the static_assert in the
// body performs the grammar check.
template <class Expr
#if !defined(BOOST_NO_CXX11_FUNCTION_TEMPLATE_DEFAULT_ARGS) && !defined(DOXYGEN)
    , class Enable = typename std::enable_if<
    !std::is_integral<Expr>::value &&
        boost::proto::matches<
            typename boost::proto::result_of::as_expr<Expr>::type,
            vector_expr_grammar
        >::value
    >::type
#endif
>
vector(const Expr &expr) {
#ifdef BOOST_NO_CXX11_FUNCTION_TEMPLATE_DEFAULT_ARGS
    static_assert(
        boost::proto::matches<
            typename boost::proto::result_of::as_expr<Expr>::type,
            vector_expr_grammar
        >::value,
        "Only vector expressions can be used to initialize a vector"
        );
#endif
    // Walk the expression's terminals to collect a queue list and a
    // partitioning to adopt.
    detail::get_expression_properties prop;
    detail::extract_terminals()(boost::proto::as_child(expr), prop);

    // Both must have been found, otherwise the vector cannot be sized.
    precondition(!prop.queue.empty() && !prop.part.empty(),
            "Can not determine expression size and queue list"
            );

    queue = prop.queue;
    part  = prop.part;

    allocate_buffers(backend::MEM_READ_WRITE, 0);

    // Evaluate the expression into the freshly allocated buffers.
    *this = expr;
}
477 
478         template <typename U>
reinterpret() const479         vector<U> reinterpret() const {
480             vector<U> r;
481             r.queue = queue;
482             r.part  = part;
483             r.buf.reserve(buf.size());
484             for(size_t i = 0; i < buf.size(); ++i) {
485                 r.buf.push_back(buf[i].template reinterpret<U>());
486                 r.part[i+1] = r.part[i+1] * sizeof(T) / sizeof(U);
487             }
488             return r;
489         }
490 
491         /// Swap function.
swap(vector & v)492         void swap(vector &v) {
493             std::swap(queue,   v.queue);
494             std::swap(part,    v.part);
495             std::swap(buf,     v.buf);
496         }
497 
498         /// Resizes the vector.
499         /**
500          * Borrows devices, size, and data from the given vector.
501          * Any data contained in the resized vector will be lost as a result.
502          */
resize(const vector & v,backend::mem_flags flags=backend::MEM_READ_WRITE)503         void resize(const vector &v, backend::mem_flags flags = backend::MEM_READ_WRITE)
504         {
505             // Reallocate bufers
506             *this = std::move(vector(v.queue, v.size(), 0, flags));
507 
508             // Copy data
509             *this = v;
510         }
511 
512         /// Resizes the vector with the given parameters.
513         /**
514          * This is equivalent to reconstructing the vector with the given
515          * parameters.
516          * Any data contained in the resized vector will be lost as a result.
517          */
resize(const std::vector<backend::command_queue> & queue,size_t size,const T * host=0,backend::mem_flags flags=backend::MEM_READ_WRITE)518         void resize(const std::vector<backend::command_queue> &queue,
519                 size_t size, const T *host = 0,
520                 backend::mem_flags flags = backend::MEM_READ_WRITE
521                 )
522         {
523             *this = std::move(vector(queue, size, host, flags));
524         }
525 
526         /// Resizes the vector.
527         /**
528          * This is equivalent to reconstructing the vector with the given
529          * parameters.
530          * Any data contained in the resized vector will be lost as a result.
531          */
resize(const std::vector<backend::command_queue> & queue,const std::vector<T> & host,backend::mem_flags flags=backend::MEM_READ_WRITE)532         void resize(const std::vector<backend::command_queue> &queue,
533                 const std::vector<T> &host,
534                 backend::mem_flags flags = backend::MEM_READ_WRITE
535               )
536         {
537             *this = std::move(vector(queue, host, flags));
538         }
539 
540         /// Resizes the vector.
/**
542          * This is equivalent to reconstructing the vector with the given
543          * parameters.
544          * This version uses the most recently created VexCL context.
545          */
resize(size_t size,const T * host=0,backend::mem_flags flags=backend::MEM_READ_WRITE)546         void resize(size_t size, const T *host = 0, backend::mem_flags flags = backend::MEM_READ_WRITE)
547         {
548             vector(size, host, flags).swap(*this);
549         }
550 
551         /// Fills vector with zeros.
552         /** This does not change the vector size! */
clear()553         void clear() {
554             *this = static_cast<T>(0);
555         }
556 
557         /// Returns memory buffer located on the given device.
operator ()(unsigned d=0) const558         const backend::device_vector<T>& operator()(unsigned d = 0) const {
559             return buf[d];
560         }
561 
562         /// Returns memory buffer located on the given device.
operator ()(unsigned d=0)563         backend::device_vector<T>& operator()(unsigned d = 0) {
564             return buf[d];
565         }
566 
567         /// Returns const iterator to the first element of the vector.
begin() const568         const_iterator begin() const {
569             return const_iterator(*this, 0);
570         }
571 
572         /// Returns const iterator referring to the past-the-end element in the vector.
end() const573         const_iterator end() const {
574             return const_iterator(*this, size());
575         }
576 
577         /// Returns iterator to the first element of the vector.
begin()578         iterator begin() {
579             return iterator(*this, 0);
580         }
581 
582         /// Returns iterator referring to the past-the-end element in the vector.
end()583         iterator end() {
584             return iterator(*this, size());
585         }
586 
587         /// Access vector element.
operator [](size_t index) const588         const element operator[](size_t index) const {
589             size_t d = std::upper_bound(
590                     part.begin(), part.end(), index) - part.begin() - 1;
591             return element(queue[d], buf[d], index - part[d]);
592         }
593 
594         /// Access vector element.
operator [](size_t index)595         element operator[](size_t index) {
596             unsigned d = static_cast<unsigned>(
597                 std::upper_bound(part.begin(), part.end(), index) - part.begin() - 1
598                 );
599             return element(queue[d], buf[d], index - part[d]);
600         }
601 
/// Bounds-checked element access: throws std::out_of_range if index >= size().
at(size_t index) const603         const element at(size_t index) const {
604             if(index >= size())
605                 throw std::out_of_range("vexcl::vector");
606             return operator[](index);
607         }
608 
/// Bounds-checked element access: throws std::out_of_range if index >= size().
at(size_t index)610         element at(size_t index) {
611             if(index >= size())
612                 throw std::out_of_range("vexcl::vector");
613             return operator[](index);
614         }
615 
616         /// Returns vector size.
size() const617         size_t size() const {
618             return part.empty() ? 0 : part.back();
619         }
620 
621         /// Returns number of vector parts.
622         /** Each partition is located on single device.
623          */
nparts() const624         size_t nparts() const {
625             return queue.size();
626         }
627 
628         /// Returns vector part size on the given device.
part_size(unsigned d) const629         size_t part_size(unsigned d) const {
630             return part[d + 1] - part[d];
631         }
632 
633         /// Returns index of the first element located on the given device.
part_start(unsigned d) const634         size_t part_start(unsigned d) const {
635             return part[d];
636         }
637 
638         /// Returns reference to the vector of command queues used to construct the vector.
queue_list() const639         const std::vector<backend::command_queue>& queue_list() const {
640             return queue;
641         }
642 
643         // Returns reference to vector's partition.
partition() const644         const std::vector<size_t>& partition() const {
645             return part;
646         }
647 
648         /// Maps vector part located on the given device to a host array.
649         /**
650          * This returns a smart pointer that will be unmapped automatically
651          * upon destruction */
652         typename backend::device_vector<T>::mapped_array
map(unsigned d=0)653         map(unsigned d = 0) {
654             return buf[d].map(queue[d]);
655         }
656 
657         /// Maps vector part located on the given device to a host array.
658         /**
659          * This returns a smart pointer that will be unmapped automatically
660          * upon destruction */
661         typename backend::device_vector<T>::mapped_array
map(unsigned d=0) const662         map(unsigned d = 0) const {
663             return buf[d].map(queue[d]);
664         }
665 
666         /// Copy assignment
const vector& operator=(const vector &x) {
    // Self-assignment is a no-op.
    if (&x == this) return *this;

    // Evaluate x into this vector using this vector's own queues and
    // partitioning.
    detail::assign_expression<assign::SET>(*this, x, queue, part);
    return *this;
}
672 
673         /// Move assignment.
operator =(vector && v)674         const vector& operator=(vector &&v) {
675             swap(v);
676             return *this;
677         }
678 
// Generates one assignment operator template. `op` is the operator token and
// `op_type` the tag passed to detail::assign_expression. The generated
// operator participates in overload resolution only when Expr matches
// vector_expr_grammar, and evaluates the expression using this vector's
// queues and partitioning.
// VEXCL_ASSIGNMENTS is defined outside this view; presumably it applies the
// macro once per supported operator -- confirm at its definition.
#define VEXCL_ASSIGNMENT(op, op_type)                                          \
  /** Expression assignment operator. */                                       \
  template <class Expr>                                                        \
  auto operator op(const Expr & expr) ->                                       \
      typename std::enable_if<                                                 \
          boost::proto::matches<                                               \
              typename boost::proto::result_of::as_expr<Expr>::type,           \
              vector_expr_grammar>::value,                                     \
          const vector &>::type                                                \
  {                                                                            \
    detail::assign_expression<op_type>(*this, expr, queue, part);              \
    return *this;                                                              \
  }

VEXCL_ASSIGNMENTS(VEXCL_ASSIGNMENT)

#undef VEXCL_ASSIGNMENT
696 
697 #ifndef DOXYGEN
// Assignment from an expression matching additive_vector_transform_grammar:
// the expression is simplified and applied with append = false.
template <class Expr>
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        additive_vector_transform_grammar
    >::value,
    const vector&
>::type
operator=(const Expr &expr) {
    detail::apply_additive_transform</*append=*/false>(
            *this, detail::simplify_additive_transform()( expr )
            );

    return *this;
}
713 
// Compound addition of an expression matching
// additive_vector_transform_grammar: the expression is simplified and
// applied with append = true.
template <class Expr>
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        additive_vector_transform_grammar
    >::value,
    const vector&
>::type
operator+=(const Expr &expr) {
    detail::apply_additive_transform</*append=*/true>(
            *this, detail::simplify_additive_transform()( expr )
            );

    return *this;
}
729 
// Compound subtraction of an expression matching
// additive_vector_transform_grammar: the negated expression is simplified
// and applied with append = true.
template <class Expr>
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        additive_vector_transform_grammar
    >::value,
    const vector&
>::type
operator-=(const Expr &expr) {
    detail::apply_additive_transform</*append=*/true>(
            *this, detail::simplify_additive_transform()( -expr )
            );

    return *this;
}
745 
// Assignment from an expression that matches neither vector_expr_grammar
// nor additive_vector_transform_grammar. The part extracted by
// extract_vector_expressions is assigned first; the part extracted by
// extract_additive_vector_transforms is then added on top.
template <class Expr>
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        vector_expr_grammar
    >::value &&
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        additive_vector_transform_grammar
    >::value,
    const vector&
>::type
operator=(const Expr &expr) {
    *this  = detail::extract_vector_expressions()( expr );
    *this += detail::extract_additive_vector_transforms()( expr );

    return *this;
}
764 
// Compound addition of an expression that matches neither
// vector_expr_grammar nor additive_vector_transform_grammar. The part
// extracted by extract_vector_expressions and the part extracted by
// extract_additive_vector_transforms are added one after the other.
template <class Expr>
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        vector_expr_grammar
    >::value &&
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        additive_vector_transform_grammar
    >::value,
    const vector&
>::type
operator+=(const Expr &expr) {
    *this += detail::extract_vector_expressions()( expr );
    *this += detail::extract_additive_vector_transforms()( expr );

    return *this;
}
783 
// Compound subtraction of an expression that matches neither
// vector_expr_grammar nor additive_vector_transform_grammar. The part
// extracted by extract_vector_expressions and the part extracted by
// extract_additive_vector_transforms are subtracted one after the other.
template <class Expr>
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        vector_expr_grammar
    >::value &&
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        additive_vector_transform_grammar
    >::value,
    const vector&
>::type
operator-=(const Expr &expr) {
    *this -= detail::extract_vector_expressions()( expr );
    *this -= detail::extract_additive_vector_transforms()( expr );

    return *this;
}
802 #endif
803 
804         // Copy data from host buffer to device(s).
write_data(size_t offset,size_t size,const T * hostptr,bool blocking)805         void write_data(size_t offset, size_t size, const T *hostptr, bool blocking)
806         {
807             if (!size) return;
808 
809             for(unsigned d = 0; d < queue.size(); d++) {
810                 size_t start = std::max(offset,        part[d]);
811                 size_t stop  = std::min(offset + size, part[d + 1]);
812 
813                 if (stop <= start) continue;
814 
815                 buf[d].write(queue[d], start - part[d], stop - start, hostptr + start - offset);
816             }
817 
818             if (blocking)
819                 for(size_t d = 0; d < queue.size(); d++) {
820                     size_t start = std::max(offset,        part[d]);
821                     size_t stop  = std::min(offset + size, part[d + 1]);
822 
823                     if (start < stop) queue[d].finish();
824                 }
825         }
826 
827         // Copy data from host buffer to device(s).
write_data(size_t offset,size_t size,const T * hostptr,bool blocking,std::vector<backend::command_queue> & q)828         void write_data(size_t offset, size_t size, const T *hostptr,
829                 bool blocking, std::vector<backend::command_queue> &q)
830         {
831             precondition(q.size() == queue.size(), "The queue list has wrong size");
832 
833             if (!size) return;
834 
835             for(unsigned d = 0; d < q.size(); d++) {
836                 precondition(
837                         backend::get_context_id(q[d]) == backend::get_context_id(queue[d]),
838                         "Wrong context!"
839                         );
840 
841                 size_t start = std::max(offset,        part[d]);
842                 size_t stop  = std::min(offset + size, part[d + 1]);
843 
844                 if (stop <= start) continue;
845 
846                 buf[d].write(q[d], start - part[d], stop - start, hostptr + start - offset);
847             }
848 
849             if (blocking)
850                 for(size_t d = 0; d < q.size(); d++) {
851                     size_t start = std::max(offset,        part[d]);
852                     size_t stop  = std::min(offset + size, part[d + 1]);
853 
854                     if (start < stop) q[d].finish();
855                 }
856         }
857 
858         // Copy data from device(s) to host buffer .
read_data(size_t offset,size_t size,T * hostptr,bool blocking) const859         void read_data(size_t offset, size_t size, T *hostptr, bool blocking) const
860         {
861             if (!size) return;
862 
863             for(unsigned d = 0; d < queue.size(); d++) {
864                 size_t start = std::max(offset,        part[d]);
865                 size_t stop  = std::min(offset + size, part[d + 1]);
866 
867                 if (stop <= start) continue;
868 
869                 buf[d].read(queue[d], start - part[d], stop - start, hostptr + start - offset);
870             }
871 
872             if (blocking)
873                 for(unsigned d = 0; d < queue.size(); d++) {
874                     size_t start = std::max(offset,        part[d]);
875                     size_t stop  = std::min(offset + size, part[d + 1]);
876 
877                     if (start < stop) queue[d].finish();
878                 }
879         }
880 
881         // Copy data from device(s) to host buffer .
read_data(size_t offset,size_t size,T * hostptr,bool blocking,std::vector<backend::command_queue> & q) const882         void read_data(size_t offset, size_t size, T *hostptr,
883                 bool blocking, std::vector<backend::command_queue> &q
884                 ) const
885         {
886             precondition(q.size() == queue.size(), "The queue list has wrong size");
887 
888             if (!size) return;
889 
890             for(unsigned d = 0; d < q.size(); d++) {
891                 precondition(
892                         backend::get_context_id(q[d]) == backend::get_context_id(queue[d]),
893                         "Wrong context!"
894                         );
895 
896                 size_t start = std::max(offset,        part[d]);
897                 size_t stop  = std::min(offset + size, part[d + 1]);
898 
899                 if (stop <= start) continue;
900 
901                 buf[d].read(q[d], start - part[d], stop - start, hostptr + start - offset);
902             }
903 
904             if (blocking)
905                 for(unsigned d = 0; d < q.size(); d++) {
906                     size_t start = std::max(offset,        part[d]);
907                     size_t stop  = std::min(offset + size, part[d + 1]);
908 
909                     if (start < stop) q[d].finish();
910                 }
911         }
912 
    private:
        // Command queues, one per device partition. Declared mutable so that
        // const members such as read_data() can still issue work on them
        // (presumably because queue operations require non-const access).
        mutable std::vector<backend::command_queue> queue;
        // Partition boundaries: device d holds the global index range
        // [part[d], part[d + 1]), so part has one more entry than queue.
        std::vector<size_t>                      part;
        // Per-device storage; buf[d] is created on queue[d] and holds
        // part[d + 1] - part[d] elements (see allocate_buffers()).
        std::vector< backend::device_vector<T> > buf;
917 
allocate_buffers(backend::mem_flags flags,const T * hostptr)918         void allocate_buffers(backend::mem_flags flags, const T *hostptr) {
919             buf.clear();
920             buf.reserve(queue.size());
921 
922             for(unsigned d = 0; d < queue.size(); d++)
923                 buf.push_back(
924                         backend::device_vector<T>(
925                             queue[d], part[d + 1] - part[d],
926                             hostptr ? hostptr + part[d] : 0, flags)
927                         );
928         }
929 
930         template <typename U>
931         friend class vector;
932 
933         template <typename S, size_t N>
934         friend class multivector;
935 };
936 
937 //---------------------------------------------------------------------------
938 // Support for vector expressions
939 //---------------------------------------------------------------------------
940 namespace traits {
941 
// Flags the vector_terminal tag as a vector expression terminal.
template <>
struct is_vector_expr_terminal< vector_terminal > : std::true_type {};

// Sets the proto_terminal_is_value trait for the vector_terminal tag.
// NOTE(review): presumably this tells the expression machinery that the value
// stored in the proto terminal is the vector itself -- confirm in
// operations.hpp.
template <>
struct proto_terminal_is_value< vector_terminal > : std::true_type {};
947 
948 template <typename T>
949 struct kernel_param_declaration< vector<T> > {
getvex::traits::kernel_param_declaration950     static void get(backend::source_generator &src,
951             const vector<T>&,
952             const backend::command_queue&, const std::string &prm_name,
953             detail::kernel_generator_state_ptr)
954     {
955         src.parameter< global_ptr<T> >(prm_name);
956     }
957 };
958 
/// Kernel source fragment for a vex::vector<T> term.
template <typename T>
struct partial_vector_expr< vector<T> > {
    /// Streams `<prm_name>[idx]` into \p src, i.e. an indexed access to the
    /// parameter named \p prm_name. The vector, the queue and the generator
    /// state are not used.
    /// NOTE(review): `idx` is presumably the element index variable defined
    /// by the generated kernel -- confirm in the kernel generator.
    static void get(backend::source_generator &src,
            const vector<T>&,
            const backend::command_queue&, const std::string &prm_name,
            detail::kernel_generator_state_ptr)
    {
        src << prm_name << "[idx]";
    }
};
969 
/// Kernel argument binding for a vex::vector<T> term.
template <typename T>
struct kernel_arg_setter< vector<T> > {
    /// Pushes `term(device)` as the next argument of \p kernel. The index
    /// offset and the generator state are not used.
    /// NOTE(review): term(device) presumably yields the buffer that lives on
    /// the given device -- confirm against vector::operator().
    static void set(const vector<T> &term,
            backend::kernel &kernel, unsigned device, size_t/*index_offset*/,
            detail::kernel_generator_state_ptr)
    {
        kernel.push_arg(term(device));
    }
};
979 
980 template <class T>
981 struct expression_properties< vector<T> > {
getvex::traits::expression_properties982     static void get(const vector<T> &term,
983             std::vector<backend::command_queue> &queue_list,
984             std::vector<size_t> &partition,
985             size_t &size
986             )
987     {
988         queue_list = term.queue_list();
989         partition  = term.partition();
990         size       = term.size();
991     }
992 };
993 
994 } // namespace traits
995 
996 //---------------------------------------------------------------------------
997 /// Copy device vector to host vector.
998 template <class Td, class Th>
999 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,std::vector<Th> & hv,bool blocking=true)1000 copy(const vex::vector<Td> &dv, std::vector<Th> &hv, bool blocking = true) {
1001     dv.read_data(0, dv.size(), hv.data(), blocking);
1002 }
1003 
1004 template <class Td, class Th>
1005 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,std::vector<Th> & hv,bool blocking=true)1006 copy(const vex::vector<Td> &dv, std::vector<Th> &hv, bool blocking = true) {
1007     std::vector<Td> tmp(dv.size());
1008     dv.read_data(0, dv.size(), tmp.data(), true);
1009     std::copy(tmp.begin(), tmp.end(), hv.begin());
1010 }
1011 
1012 /// Copy device vector to host pointer.
1013 template <class Td, class Th>
1014 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,Th * hv,bool blocking=true)1015 copy(const vex::vector<Td> &dv, Th *hv, bool blocking = true) {
1016     dv.read_data(0, dv.size(), hv, blocking);
1017 }
1018 
1019 template <class Td, class Th>
1020 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,Th * hv,bool blocking=true)1021 copy(const vex::vector<Td> &dv, Th *hv, bool blocking = true) {
1022     std::vector<Td> tmp(dv.size());
1023     dv.read_data(0, dv.size(), tmp.data(), true);
1024     std::copy(tmp.begin(), tmp.end(), hv);
1025 }
1026 
1027 /// Copy host vector to device vector.
1028 template <class Th, class Td>
1029 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1030 copy(const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true) {
1031     dv.write_data(0, dv.size(), hv.data(), blocking);
1032 }
1033 
1034 template <class Th, class Td>
1035 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1036 copy(const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true) {
1037     std::vector<Td> tmp(hv.size());
1038     for (size_t i = 0; i < hv.size(); ++i)
1039         tmp[i] = hv[i];
1040     dv.write_data(0, dv.size(), tmp.data(), true);
1041 }
1042 
1043 /// Copy host pointer to device vector.
1044 template <class Th, class Td>
1045 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const Th * hv,vex::vector<Td> & dv,bool blocking=true)1046 copy(const Th *hv, vex::vector<Td> &dv, bool blocking = true) {
1047     dv.write_data(0, dv.size(), hv, blocking);
1048 }
1049 
1050 template <class Th, class Td>
1051 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const Th * hv,vex::vector<Td> & dv,bool blocking=true)1052 copy(const Th *hv, vex::vector<Td> &dv, bool blocking = true) {
1053     std::vector<Td> tmp(hv, hv + dv.size());
1054     dv.write_data(0, dv.size(), tmp.data(), true);
1055 }
1056 
1057 /// Copy device vector to host vector.
1058 template <class Td, class Th>
copy(std::vector<backend::command_queue> & q,const vex::vector<Td> & dv,std::vector<Th> & hv,bool blocking=true)1059 void copy(std::vector<backend::command_queue> &q,
1060         const vex::vector<Td> &dv, std::vector<Th> &hv, bool blocking = true)
1061 {
1062     if (std::is_same<Td, Th>::value) {
1063         dv.read_data(0, dv.size(), hv.data(), blocking, q);
1064     } else {
1065         std::vector<Td> tmp(dv.size());
1066         dv.read_data(0, dv.size(), tmp.data(), true, q);
1067         std::copy(tmp.begin(), tmp.end(), hv.begin());
1068     }
1069 }
1070 
1071 /// Copy device vector to host pointer.
1072 template <class Td, class Th>
1073 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const vex::vector<Td> & dv,Th * hv,bool blocking=true)1074 copy(std::vector<backend::command_queue> &q,
1075         const vex::vector<Td> &dv, Th *hv, bool blocking = true)
1076 {
1077     dv.read_data(0, dv.size(), hv, blocking, q);
1078 }
1079 
1080 template <class Td, class Th>
1081 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const vex::vector<Td> & dv,Th * hv,bool blocking=true)1082 copy(std::vector<backend::command_queue> &q,
1083         const vex::vector<Td> &dv, Th *hv, bool blocking = true)
1084 {
1085     std::vector<Td> tmp(dv.size());
1086     dv.read_data(0, dv.size(), tmp.data(), true, q);
1087     std::copy(tmp.begin(), tmp.end(), hv);
1088 }
1089 
1090 /// Copy host vector to device vector.
1091 template <class Th, class Td>
1092 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1093 copy(std::vector<backend::command_queue> &q,
1094         const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true)
1095 {
1096     dv.write_data(0, dv.size(), hv.data(), blocking, q);
1097 }
1098 
1099 template <class Th, class Td>
1100 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1101 copy(std::vector<backend::command_queue> &q,
1102         const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true)
1103 {
1104     std::vector<Td> tmp(hv.begin(), hv.end());
1105     dv.write_data(0, dv.size(), tmp.data(), true, q);
1106 }
1107 
1108 /// Copy host pointer to device vector.
1109 template <class Th, class Td>
1110 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const Th * hv,vex::vector<Td> & dv,bool blocking=true)1111 copy(std::vector<backend::command_queue> &q,
1112         const Th *hv, vex::vector<Td> &dv, bool blocking = true)
1113 {
1114     dv.write_data(0, dv.size(), hv, blocking, q);
1115 }
1116 
1117 template <class Th, class Td>
1118 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const Th * hv,vex::vector<Td> & dv,bool blocking=true)1119 copy(std::vector<backend::command_queue> &q,
1120         const Th *hv, vex::vector<Td> &dv, bool blocking = true)
1121 {
1122     std::vector<Td> tmp(hv, hv + dv.size());
1123     dv.write_data(0, dv.size(), tmp.data(), true, q);
1124 }
1125 
/// Copy device vector to device vector.
/// Implemented as a plain assignment `dst = src`; the element types of the
/// two vectors may differ (T1 and T2 are independent template parameters).
template <class T1, class T2>
void copy(const vex::vector<T1> &src, vex::vector<T2> &dst) {
    dst = src;
}
1131 
/// Compile-time test telling whether an iterator type refers to device data.
/// The general case answers "no".
template <class Iterator, class Enable = void>
struct stored_on_device : std::integral_constant<bool, false> {};

/// Answers "yes" for iterator types that expose a nested compile-time constant
/// `device_iterator` which evaluates to true. Types without such a member, or
/// with a false one, fall back to the general case.
template <class Iterator>
struct stored_on_device<
    Iterator,
    typename std::enable_if<Iterator::device_iterator>::type
    > : std::integral_constant<bool, true> {};
1139 
1140 /// Copy range from device vector to host vector.
1141 template<class InputIterator, class OutputIterator>
1142 #ifdef DOXYGEN
1143 OutputIterator
1144 #else
1145 typename std::enable_if<
1146     std::is_same<
1147         typename std::iterator_traits<InputIterator>::value_type,
1148         typename std::iterator_traits<OutputIterator>::value_type
1149         >::value &&
1150     stored_on_device<InputIterator>::value &&
1151     !stored_on_device<OutputIterator>::value,
1152     OutputIterator
1153     >::type
1154 #endif
copy(InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1155 copy(InputIterator first, InputIterator last,
1156         OutputIterator result, bool blocking = true)
1157 {
1158     first.vec->read_data(first.pos, last - first, &result[0], blocking);
1159     return result + (last - first);
1160 }
1161 
1162 /// Copy range from host vector to device vector.
1163 template <class InputIterator, class OutputIterator>
1164 #ifdef DOXYGEN
1165 OutputIterator
1166 #else
1167 typename std::enable_if<
1168     std::is_same<
1169         typename std::iterator_traits<InputIterator>::value_type,
1170         typename std::iterator_traits<OutputIterator>::value_type
1171         >::value &&
1172     !stored_on_device<InputIterator>::value &&
1173     stored_on_device<OutputIterator>::value,
1174     OutputIterator
1175     >::type
1176 #endif
copy(InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1177 copy(InputIterator first, InputIterator last,
1178         OutputIterator result, bool blocking = true)
1179 {
1180     result.vec->write_data(result.pos, last - first, &first[0], blocking);
1181     return result + (last - first);
1182 }
1183 
1184 /// Copy range from device vector to host vector.
1185 template<class InputIterator, class OutputIterator>
1186 #ifdef DOXYGEN
1187 OutputIterator
1188 #else
1189 typename std::enable_if<
1190     std::is_same<
1191         typename std::iterator_traits<InputIterator>::value_type,
1192         typename std::iterator_traits<OutputIterator>::value_type
1193         >::value &&
1194     stored_on_device<InputIterator>::value &&
1195     !stored_on_device<OutputIterator>::value,
1196     OutputIterator
1197     >::type
1198 #endif
copy(std::vector<backend::command_queue> & q,InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1199 copy(std::vector<backend::command_queue> &q,
1200         InputIterator first, InputIterator last,
1201         OutputIterator result, bool blocking = true)
1202 {
1203     first.vec->read_data(first.pos, last - first, &result[0], blocking, q);
1204     return result + (last - first);
1205 }
1206 
1207 /// Copy range from host vector to device vector.
1208 template <class InputIterator, class OutputIterator>
1209 #ifdef DOXYGEN
1210 OutputIterator
1211 #else
1212 typename std::enable_if<
1213     std::is_same<
1214         typename std::iterator_traits<InputIterator>::value_type,
1215         typename std::iterator_traits<OutputIterator>::value_type
1216         >::value &&
1217     !stored_on_device<InputIterator>::value &&
1218     stored_on_device<OutputIterator>::value,
1219     OutputIterator
1220     >::type
1221 #endif
copy(std::vector<backend::command_queue> & q,InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1222 copy(std::vector<backend::command_queue> &q,
1223         InputIterator first, InputIterator last,
1224         OutputIterator result, bool blocking = true)
1225 {
1226     result.vec->write_data(result.pos, last - first, &first[0], blocking, q);
1227     return result + (last - first);
1228 }
1229 
/// Swap two vectors.
/// Free-function form that forwards to the member `x.swap(y)`, so that an
/// unqualified `swap(a, b)` call can find it for vex::vector arguments.
template <typename T>
void swap(vector<T> &x, vector<T> &y) {
    x.swap(y);
}
1235 
1236 /// Returns device weight after simple bandwidth test
device_vector_perf(const backend::command_queue & q)1237 inline double device_vector_perf(const backend::command_queue &q) {
1238     static const size_t test_size = 1024U * 1024U;
1239     std::vector<backend::command_queue> queue(1, q);
1240 
1241     // Allocate test vectors on current device and measure execution
1242     // time of a simple kernel.
1243     vex::vector<float> a(queue, test_size);
1244     vex::vector<float> b(queue, test_size);
1245     vex::vector<float> c(queue, test_size);
1246 
1247     // Skip the first run.
1248     a = b + c;
1249 
1250     // Measure the second run.
1251     profiler<> prof(queue);
1252     prof.tic_cl("");
1253     a = b + c;
1254     return 1.0 / prof.toc("");
1255 }
1256 
1257 
1258 /// Download and print the vector elements.
1259 template<class T>
operator <<(std::ostream & o,const vex::vector<T> & t)1260 std::ostream &operator<<(std::ostream &o, const vex::vector<T> &t) {
1261     boost::io::ios_all_saver stream_state(o);
1262     const size_t chunk = std::is_integral<T>::value ? 10 : 5;
1263 
1264     o << "{" << std::setprecision(6);
1265     for(unsigned p = 0; p < t.nparts(); ++p) {
1266         if (size_t ps = t.part_size(p)) {
1267             auto ptr = t.map(p);
1268 
1269             for(size_t i = t.part_start(p), j = 0; j < ps; ++j, ++i) {
1270                 if (i % chunk == 0) o << "\n" << std::setw(6) << i << ":";
1271 
1272                 if (std::is_integral<T>::value)
1273                     o << " " << std::setw(6) << ptr[j];
1274                 else if (std::is_arithmetic<T>::value)
1275                     o << std::scientific << std::setw(14) << ptr[j];
1276                 else
1277                     o << " " << ptr[j];
1278             }
1279         }
1280     }
1281     return o << "\n}\n";
1282 }
1283 
1284 } // namespace vex
1285 
namespace boost { namespace fusion { namespace traits {

// Specializes Boost.Fusion's is_sequence trait so that it is false for every
// vex::vector<T>.
// NOTE(review): presumably needed so that Fusion/Proto never treats a
// vex::vector as a Fusion sequence -- confirm the original motivation.
template <class T>
struct is_sequence< vex::vector<T> > : std::false_type
{};

} } }
1293 
1294 
1295 #endif
1296