1 #ifndef VEXCL_VECTOR_HPP
2 #define VEXCL_VECTOR_HPP
3
4 /*
5 The MIT License
6
7 Copyright (c) 2012-2018 Denis Demidov <dennis.demidov@gmail.com>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 */
27
28 /**
29 * \file vexcl/vector.hpp
30 * \author Denis Demidov <dennis.demidov@gmail.com>
31 * \brief OpenCL device vector.
32 */
33
34 #include <vector>
35 #include <map>
36 #include <iostream>
37 #include <iomanip>
38 #include <sstream>
39 #include <stdexcept>
40 #include <string>
41 #include <type_traits>
42 #include <functional>
43
44 #include <boost/proto/proto.hpp>
45 #include <boost/io/ios_state.hpp>
46 #include <boost/iterator/iterator_facade.hpp>
47 #include <boost/thread.hpp>
48
49 #include <vexcl/backend.hpp>
50 #include <vexcl/util.hpp>
51 #include <vexcl/operations.hpp>
52 #include <vexcl/profiler.hpp>
53 #include <vexcl/devlist.hpp>
54
55 #ifdef BOOST_NO_NOEXCEPT
56 # define noexcept throw()
57 #endif
58
59 /// Vector expression template library for OpenCL.
60 namespace vex {
61
62 //--- Partitioning ----------------------------------------------------------
63
/// Weights a device with respect to vector performance.
65 /**
66 * Launches the following kernel on each device:
67 \code
68 a = b + c;
69 \endcode
70 * where a, b and c are device vectors. Each device gets portion of the vector
71 * proportional to the performance of this operation.
72 */
73 inline double device_vector_perf(const backend::command_queue&);
74
75 /// Assigns equal weight to each device.
76 /**
77 * This results in equal partitioning.
78 */
equal_weights(const backend::command_queue &)79 inline double equal_weights(const backend::command_queue&) {
80 return 1;
81 }
82
83 template <bool dummy = true>
84 struct partitioning_scheme {
85 static_assert(dummy, "dummy parameter should be true");
86
87 typedef std::function< double(const backend::command_queue&) > weight_function;
88
setvex::partitioning_scheme89 static void set(weight_function f) {
90 boost::lock_guard<boost::mutex> lock(mx);
91
92 if (!is_set) {
93 weight = f;
94 is_set = true;
95 } else {
96 std::cerr <<
97 "Warning: "
98 "device weighting function is already set and will be left as is."
99 << std::endl;
100 }
101 }
102
103 static std::vector<size_t> get(size_t n, const std::vector<backend::command_queue> &queue);
104
105 private:
106 static bool is_set;
107 static weight_function weight;
108 static std::map<backend::device_id, double> device_weight;
109 static boost::mutex mx;
110
init_weight_functionvex::partitioning_scheme111 static bool init_weight_function() {
112 boost::lock_guard<boost::mutex> lock(mx);
113 if (!is_set) {
114 weight = device_vector_perf;
115 is_set = true;
116 }
117 return true;
118 }
119 };
120
121 template <bool dummy>
122 bool partitioning_scheme<dummy>::is_set = false;
123
124 template <bool dummy>
125 std::map<backend::device_id, double> partitioning_scheme<dummy>::device_weight;
126
127 template <bool dummy>
128 boost::mutex partitioning_scheme<dummy>::mx;
129
130 template <bool dummy>
get(size_t n,const std::vector<backend::command_queue> & queue)131 std::vector<size_t> partitioning_scheme<dummy>::get(size_t n,
132 const std::vector<backend::command_queue> &queue)
133 {
134 static const bool once = init_weight_function();
135 (void)once; // do not warn about unused variable
136
137 std::vector<size_t> part;
138 part.reserve(queue.size() + 1);
139 part.push_back(0);
140
141 if (queue.size() > 1) {
142 std::vector<double> cumsum;
143 cumsum.reserve(queue.size() + 1);
144 cumsum.push_back(0);
145
146 for(auto q = queue.begin(); q != queue.end(); q++) {
147 auto dev_id = backend::get_device_id(*q);
148 auto dw = device_weight.find(dev_id);
149
150 double w = (dw == device_weight.end()) ?
151 (device_weight[dev_id] = weight(*q)) :
152 dw->second;
153
154 cumsum.push_back(cumsum.back() + w);
155 }
156
157 for(unsigned d = 1; d < queue.size(); d++)
158 part.push_back(
159 std::min(n,
160 alignup(static_cast<size_t>(n * cumsum[d] / cumsum.back()))
161 )
162 );
163 }
164
165 part.push_back(n);
166 return part;
167 }
168
169 template <bool dummy>
170 typename partitioning_scheme<dummy>::weight_function partitioning_scheme<dummy>::weight;
171
172 /// Partitioning scheme for vectors and matrices.
173 /**
174 * Should be set once before any object of vector or matrix type is declared.
 * Otherwise the default partitioning function (device_vector_perf) is
 * selected.
177 */
set_partitioning(std::function<double (const backend::command_queue &)> f)178 inline void set_partitioning(
179 std::function< double(const backend::command_queue&) > f
180 )
181 {
182 partitioning_scheme<>::set(f);
183 }
184
185 /// Returns partitioning for the specified vector size on a given set of queues.
partition(size_t n,const std::vector<backend::command_queue> & queue)186 inline std::vector<size_t> partition(size_t n,
187 const std::vector<backend::command_queue> &queue)
188 {
189 return partitioning_scheme<>::get(n, queue);
190 }
191
192
193 //--- Vector Type -----------------------------------------------------------
194 struct vector_terminal {};
195
196 typedef vector_expression<
197 typename boost::proto::terminal< vector_terminal >::type
198 > vector_terminal_expression;
199
200 namespace traits {
201
202 // Hold vector terminals by reference:
203 template <class T>
204 struct hold_terminal_by_reference< T,
205 typename std::enable_if<
206 boost::proto::matches<
207 typename boost::proto::result_of::as_expr< T >::type,
208 boost::proto::terminal< vector_terminal >
209 >::value
210 >::type
211 >
212 : std::true_type
213 { };
214
215 } // namespace traits
216
217 /// \defgroup containers Container classes
218
219 /// Device vector.
220 template <typename T>
221 class vector : public vector_terminal_expression {
222 public:
223 typedef T value_type;
224 typedef size_t size_type;
225
226 // Proxy class.
227 //
228 // Instances of this class are returned from vector::operator[]. These
// may be used to read or write a single element of a vector, although
// these operations are too expensive to be used extensively and should
// be reserved for debugging purposes.
232 class element {
233 public:
234 // Reads the associated element of a vector.
operator T() const235 operator T() const {
236 T val = T();
237 buf.read(queue, index, 1, &val, true);
238 return val;
239 }
240
241 // Writes the associated element of a vector.
operator =(T val)242 T operator=(T val) {
243 buf.write(queue, index, 1, &val, true);
244 return val;
245 }
246
operator =(const element & other)247 T operator=(const element &other) {
248 return (*this) = static_cast<T>(other);
249 }
250
swap(element && a,element && b)251 friend void swap(element &&a, element &&b) {
252 T tmp = static_cast<T>(a);
253 a = static_cast<T>(b);
254 b = tmp;
255 }
256
257 private:
element(const backend::command_queue & q,const backend::device_vector<T> & b,size_t i)258 element(const backend::command_queue &q,
259 const backend::device_vector<T> &b,
260 size_t i
261 ) : queue(q), buf(b), index(i)
262 {}
263
264 const backend::command_queue &queue;
265 const backend::device_vector<T> &buf;
266
267 size_t index;
268
269 friend class vector;
270 };
271
272 // Iterator class.
273 //
274 // This class may in principle be used with standard template library,
275 // although its main purpose is range specification for vector copy
276 // operations.
277 template <class vector_type, class element_type>
278 class iterator_type
279 : public boost::iterator_facade<
280 iterator_type<vector_type, element_type>,
281 T,
282 std::random_access_iterator_tag,
283 element_type
284 >
285 {
286 public:
287 typedef boost::iterator_facade<
288 iterator_type<vector_type, element_type>,
289 T,
290 std::random_access_iterator_tag,
291 element_type
292 > super_type;
293 typedef typename super_type::reference reference;
294 typedef typename super_type::difference_type difference_type;
295
296 static const bool device_iterator = true;
297
298 vector_type *vec;
299 size_t pos;
300 size_t part;
301
302 private:
303 friend class ::boost::iterator_core_access;
304 friend class vector;
305
iterator_type(vector_type & vec,size_t pos)306 iterator_type(vector_type &vec, size_t pos)
307 : vec(&vec), pos(pos), part(0)
308 {
309 if (!vec.part.empty()) {
310 part = std::upper_bound(
311 vec.part.begin(), vec.part.end(), pos
312 ) - vec.part.begin() - 1;
313 }
314 }
315
dereference() const316 reference dereference() const {
317 return element_type(
318 vec->queue[part], vec->buf[part],
319 pos - vec->part[part]
320 );
321 }
322
equal(const iterator_type & it) const323 bool equal(const iterator_type &it) const {
324 return pos == it.pos;
325 }
326
increment()327 void increment() {
328 ++pos;
329 while (part < vec->nparts() && pos >= vec->part[part + 1])
330 ++part;
331 }
332
decrement()333 void decrement() {
334 --pos;
335 while (part > 0 && pos < vec->part[part])
336 --part;
337 }
338
advance(difference_type n)339 void advance(difference_type n) {
340 pos += n;
341 if (n > 0) {
342 while (part < vec->nparts() && pos >= vec->part[part + 1])
343 ++part;
344 } else if (n < 0) {
345 while (part > 0 && pos < vec->part[part])
346 --part;
347 }
348 }
349
distance_to(const iterator_type & it) const350 difference_type distance_to(const iterator_type &it) const {
351 return static_cast<difference_type>(it.pos - pos);
352 }
353 };
354
355 typedef iterator_type<vector, element> iterator;
356 typedef iterator_type<const vector, const element> const_iterator;
357
358 /// Empty constructor.
vector()359 vector() {}
360
361 #ifdef VEXCL_NO_COPY_CONSTRUCTORS
362 private:
363 #endif
364 /// Copy constructor.
vector(const vector & v)365 vector(const vector &v) : queue(v.queue), part(v.part)
366 {
367 #ifdef VEXCL_SHOW_COPIES
368 std::cout << "Copying vex::vector<" << type_name<T>()
369 << "> of size " << size() << std::endl;
370 #endif
371 if (size()) allocate_buffers(backend::MEM_READ_WRITE, 0);
372 *this = v;
373 }
374 #ifdef VEXCL_NO_COPY_CONSTRUCTORS
375 public:
376 #endif
377
378 /// Move constructor
vector(vector &&v) noexcept {
    // This vector starts out empty; exchange contents with the source.
    this->swap(v);
}
382
383 /// Wraps a native buffer without owning it.
384 /**
385 * May be used to apply VexCL functions to buffers allocated and
386 * managed outside of VexCL.
387 */
vector(const backend::command_queue &q,
       const backend::device_vector<T> &buffer,
       size_t size = 0
      ) : queue(1, q), part(2), buf(1, buffer)
{
    // Single partition [0, size); a zero size means "use the buffer's size".
    part[0] = 0;
    if (size)
        part[1] = size;
    else
        part[1] = buffer.size();
}
396
397 /// Creates vector of the given size and optionally copies host data.
vector(const std::vector<backend::command_queue> &queue,
       size_t size, const T *host = 0,
       backend::mem_flags flags = backend::MEM_READ_WRITE
      ) : queue(queue), part(vex::partition(size, queue))
{
    // No buffers are allocated for an empty vector.
    if (size == 0) return;

    allocate_buffers(flags, host);
}
405
406 #ifndef VEXCL_NO_STATIC_CONTEXT_CONSTRUCTORS
407 /// Creates vector of the given size and optionally copies host data.
408 /** This version uses the most recently created VexCL context. */
vector(size_t size,const T * host=0,backend::mem_flags flags=backend::MEM_READ_WRITE)409 vector(size_t size, const T *host = 0,
410 backend::mem_flags flags = backend::MEM_READ_WRITE
411 ) : queue(current_context().queue()), part(vex::partition(size, queue))
412 {
413 if (size) allocate_buffers(flags, host);
414 }
415 #endif
416
417 /// Creates new device vector and copies the host vector.
vector(const std::vector<backend::command_queue> &queue,
       const std::vector<T> &host,
       backend::mem_flags flags = backend::MEM_READ_WRITE
      ) : queue(queue), part(vex::partition(host.size(), queue))
{
    // No buffers are allocated for an empty host vector.
    if (host.empty()) return;

    allocate_buffers(flags, host.data());
}
425
426 #ifndef VEXCL_NO_STATIC_CONTEXT_CONSTRUCTORS
427 /// Creates new device vector and copies the host vector.
428 /** This version uses the most recently created VexCL context. */
vector(const std::vector<T> & host,backend::mem_flags flags=backend::MEM_READ_WRITE)429 vector(const std::vector<T> &host,
430 backend::mem_flags flags = backend::MEM_READ_WRITE
431 ) : queue(current_context().queue()), part(vex::partition(host.size(), queue))
432 {
433 if (!host.empty()) allocate_buffers(flags, host.data());
434 }
435 #endif
436
437 /// Constructs new vector from vector expression.
438 /**
439 * This will fail if VexCL is unable to automatically determine the
440 * expression size and the compute devices to use.
441 */
442 template <class Expr
443 #if !defined(BOOST_NO_CXX11_FUNCTION_TEMPLATE_DEFAULT_ARGS) && !defined(DOXYGEN)
444 , class Enable = typename std::enable_if<
445 !std::is_integral<Expr>::value &&
446 boost::proto::matches<
447 typename boost::proto::result_of::as_expr<Expr>::type,
448 vector_expr_grammar
449 >::value
450 >::type
451 #endif
452 >
vector(const Expr & expr)453 vector(const Expr &expr) {
454 #ifdef BOOST_NO_CXX11_FUNCTION_TEMPLATE_DEFAULT_ARGS
455 static_assert(
456 boost::proto::matches<
457 typename boost::proto::result_of::as_expr<Expr>::type,
458 vector_expr_grammar
459 >::value,
460 "Only vector expressions can be used to initialize a vector"
461 );
462 #endif
463 detail::get_expression_properties prop;
464 detail::extract_terminals()(boost::proto::as_child(expr), prop);
465
466 precondition(!prop.queue.empty() && !prop.part.empty(),
467 "Can not determine expression size and queue list"
468 );
469
470 queue = prop.queue;
471 part = prop.part;
472
473 allocate_buffers(backend::MEM_READ_WRITE, 0);
474
475 *this = expr;
476 }
477
478 template <typename U>
reinterpret() const479 vector<U> reinterpret() const {
480 vector<U> r;
481 r.queue = queue;
482 r.part = part;
483 r.buf.reserve(buf.size());
484 for(size_t i = 0; i < buf.size(); ++i) {
485 r.buf.push_back(buf[i].template reinterpret<U>());
486 r.part[i+1] = r.part[i+1] * sizeof(T) / sizeof(U);
487 }
488 return r;
489 }
490
491 /// Swap function.
swap(vector & v)492 void swap(vector &v) {
493 std::swap(queue, v.queue);
494 std::swap(part, v.part);
495 std::swap(buf, v.buf);
496 }
497
498 /// Resizes the vector.
499 /**
500 * Borrows devices, size, and data from the given vector.
501 * Any data contained in the resized vector will be lost as a result.
502 */
resize(const vector & v,backend::mem_flags flags=backend::MEM_READ_WRITE)503 void resize(const vector &v, backend::mem_flags flags = backend::MEM_READ_WRITE)
504 {
505 // Reallocate bufers
506 *this = std::move(vector(v.queue, v.size(), 0, flags));
507
508 // Copy data
509 *this = v;
510 }
511
512 /// Resizes the vector with the given parameters.
513 /**
514 * This is equivalent to reconstructing the vector with the given
515 * parameters.
516 * Any data contained in the resized vector will be lost as a result.
517 */
resize(const std::vector<backend::command_queue> & queue,size_t size,const T * host=0,backend::mem_flags flags=backend::MEM_READ_WRITE)518 void resize(const std::vector<backend::command_queue> &queue,
519 size_t size, const T *host = 0,
520 backend::mem_flags flags = backend::MEM_READ_WRITE
521 )
522 {
523 *this = std::move(vector(queue, size, host, flags));
524 }
525
526 /// Resizes the vector.
527 /**
528 * This is equivalent to reconstructing the vector with the given
529 * parameters.
530 * Any data contained in the resized vector will be lost as a result.
531 */
resize(const std::vector<backend::command_queue> & queue,const std::vector<T> & host,backend::mem_flags flags=backend::MEM_READ_WRITE)532 void resize(const std::vector<backend::command_queue> &queue,
533 const std::vector<T> &host,
534 backend::mem_flags flags = backend::MEM_READ_WRITE
535 )
536 {
537 *this = std::move(vector(queue, host, flags));
538 }
539
540 /// Resizes the vector.
/**
542 * This is equivalent to reconstructing the vector with the given
543 * parameters.
544 * This version uses the most recently created VexCL context.
545 */
resize(size_t size,const T * host=0,backend::mem_flags flags=backend::MEM_READ_WRITE)546 void resize(size_t size, const T *host = 0, backend::mem_flags flags = backend::MEM_READ_WRITE)
547 {
548 vector(size, host, flags).swap(*this);
549 }
550
551 /// Fills vector with zeros.
552 /** This does not change the vector size! */
clear()553 void clear() {
554 *this = static_cast<T>(0);
555 }
556
557 /// Returns memory buffer located on the given device.
operator ()(unsigned d=0) const558 const backend::device_vector<T>& operator()(unsigned d = 0) const {
559 return buf[d];
560 }
561
562 /// Returns memory buffer located on the given device.
operator ()(unsigned d=0)563 backend::device_vector<T>& operator()(unsigned d = 0) {
564 return buf[d];
565 }
566
567 /// Returns const iterator to the first element of the vector.
begin() const568 const_iterator begin() const {
569 return const_iterator(*this, 0);
570 }
571
572 /// Returns const iterator referring to the past-the-end element in the vector.
end() const573 const_iterator end() const {
574 return const_iterator(*this, size());
575 }
576
577 /// Returns iterator to the first element of the vector.
begin()578 iterator begin() {
579 return iterator(*this, 0);
580 }
581
582 /// Returns iterator referring to the past-the-end element in the vector.
end()583 iterator end() {
584 return iterator(*this, size());
585 }
586
587 /// Access vector element.
operator [](size_t index) const588 const element operator[](size_t index) const {
589 size_t d = std::upper_bound(
590 part.begin(), part.end(), index) - part.begin() - 1;
591 return element(queue[d], buf[d], index - part[d]);
592 }
593
594 /// Access vector element.
operator [](size_t index)595 element operator[](size_t index) {
596 unsigned d = static_cast<unsigned>(
597 std::upper_bound(part.begin(), part.end(), index) - part.begin() - 1
598 );
599 return element(queue[d], buf[d], index - part[d]);
600 }
601
/// Bounds-checked element access: like operator[], but throws std::out_of_range when index >= size().
at(size_t index) const603 const element at(size_t index) const {
604 if(index >= size())
605 throw std::out_of_range("vexcl::vector");
606 return operator[](index);
607 }
608
/// Bounds-checked element access: like operator[], but throws std::out_of_range when index >= size().
at(size_t index)610 element at(size_t index) {
611 if(index >= size())
612 throw std::out_of_range("vexcl::vector");
613 return operator[](index);
614 }
615
616 /// Returns vector size.
size() const617 size_t size() const {
618 return part.empty() ? 0 : part.back();
619 }
620
621 /// Returns number of vector parts.
622 /** Each partition is located on single device.
623 */
nparts() const624 size_t nparts() const {
625 return queue.size();
626 }
627
628 /// Returns vector part size on the given device.
part_size(unsigned d) const629 size_t part_size(unsigned d) const {
630 return part[d + 1] - part[d];
631 }
632
633 /// Returns index of the first element located on the given device.
part_start(unsigned d) const634 size_t part_start(unsigned d) const {
635 return part[d];
636 }
637
638 /// Returns reference to the vector of command queues used to construct the vector.
queue_list() const639 const std::vector<backend::command_queue>& queue_list() const {
640 return queue;
641 }
642
643 // Returns reference to vector's partition.
partition() const644 const std::vector<size_t>& partition() const {
645 return part;
646 }
647
648 /// Maps vector part located on the given device to a host array.
649 /**
650 * This returns a smart pointer that will be unmapped automatically
651 * upon destruction */
652 typename backend::device_vector<T>::mapped_array
map(unsigned d=0)653 map(unsigned d = 0) {
654 return buf[d].map(queue[d]);
655 }
656
657 /// Maps vector part located on the given device to a host array.
658 /**
659 * This returns a smart pointer that will be unmapped automatically
660 * upon destruction */
661 typename backend::device_vector<T>::mapped_array
map(unsigned d=0) const662 map(unsigned d = 0) const {
663 return buf[d].map(queue[d]);
664 }
665
666 /// Copy assignment
const vector& operator=(const vector &x) {
    // Self-assignment is a no-op.
    if (&x == this) return *this;

    detail::assign_expression<assign::SET>(*this, x, queue, part);
    return *this;
}
672
673 /// Move assignment.
operator =(vector && v)674 const vector& operator=(vector &&v) {
675 swap(v);
676 return *this;
677 }
678
679 #define VEXCL_ASSIGNMENT(op, op_type) \
680 /** Expression assignment operator. */ \
681 template <class Expr> \
682 auto operator op(const Expr & expr) -> \
683 typename std::enable_if< \
684 boost::proto::matches< \
685 typename boost::proto::result_of::as_expr<Expr>::type, \
686 vector_expr_grammar>::value, \
687 const vector &>::type \
688 { \
689 detail::assign_expression<op_type>(*this, expr, queue, part); \
690 return *this; \
691 }
692
VEXCL_ASSIGNMENTS(VEXCL_ASSIGNMENT)693 VEXCL_ASSIGNMENTS(VEXCL_ASSIGNMENT)
694
695 #undef VEXCL_ASSIGNMENT
696
697 #ifndef DOXYGEN
698 template <class Expr>
699 typename std::enable_if<
700 boost::proto::matches<
701 typename boost::proto::result_of::as_expr<Expr>::type,
702 additive_vector_transform_grammar
703 >::value,
704 const vector&
705 >::type
706 operator=(const Expr &expr) {
707 detail::apply_additive_transform</*append=*/false>(
708 *this, detail::simplify_additive_transform()( expr )
709 );
710
711 return *this;
712 }
713
714 template <class Expr>
715 typename std::enable_if<
716 boost::proto::matches<
717 typename boost::proto::result_of::as_expr<Expr>::type,
718 additive_vector_transform_grammar
719 >::value,
720 const vector&
721 >::type
operator +=(const Expr & expr)722 operator+=(const Expr &expr) {
723 detail::apply_additive_transform</*append=*/true>(
724 *this, detail::simplify_additive_transform()( expr )
725 );
726
727 return *this;
728 }
729
730 template <class Expr>
731 typename std::enable_if<
732 boost::proto::matches<
733 typename boost::proto::result_of::as_expr<Expr>::type,
734 additive_vector_transform_grammar
735 >::value,
736 const vector&
737 >::type
operator -=(const Expr & expr)738 operator-=(const Expr &expr) {
739 detail::apply_additive_transform</*append=*/true>(
740 *this, detail::simplify_additive_transform()( -expr )
741 );
742
743 return *this;
744 }
745
746 template <class Expr>
747 typename std::enable_if<
748 !boost::proto::matches<
749 typename boost::proto::result_of::as_expr<Expr>::type,
750 vector_expr_grammar
751 >::value &&
752 !boost::proto::matches<
753 typename boost::proto::result_of::as_expr<Expr>::type,
754 additive_vector_transform_grammar
755 >::value,
756 const vector&
757 >::type
operator =(const Expr & expr)758 operator=(const Expr &expr) {
759 *this = detail::extract_vector_expressions()( expr );
760 *this += detail::extract_additive_vector_transforms()( expr );
761
762 return *this;
763 }
764
765 template <class Expr>
766 typename std::enable_if<
767 !boost::proto::matches<
768 typename boost::proto::result_of::as_expr<Expr>::type,
769 vector_expr_grammar
770 >::value &&
771 !boost::proto::matches<
772 typename boost::proto::result_of::as_expr<Expr>::type,
773 additive_vector_transform_grammar
774 >::value,
775 const vector&
776 >::type
operator +=(const Expr & expr)777 operator+=(const Expr &expr) {
778 *this += detail::extract_vector_expressions()( expr );
779 *this += detail::extract_additive_vector_transforms()( expr );
780
781 return *this;
782 }
783
784 template <class Expr>
785 typename std::enable_if<
786 !boost::proto::matches<
787 typename boost::proto::result_of::as_expr<Expr>::type,
788 vector_expr_grammar
789 >::value &&
790 !boost::proto::matches<
791 typename boost::proto::result_of::as_expr<Expr>::type,
792 additive_vector_transform_grammar
793 >::value,
794 const vector&
795 >::type
operator -=(const Expr & expr)796 operator-=(const Expr &expr) {
797 *this -= detail::extract_vector_expressions()( expr );
798 *this -= detail::extract_additive_vector_transforms()( expr );
799
800 return *this;
801 }
802 #endif
803
804 // Copy data from host buffer to device(s).
write_data(size_t offset,size_t size,const T * hostptr,bool blocking)805 void write_data(size_t offset, size_t size, const T *hostptr, bool blocking)
806 {
807 if (!size) return;
808
809 for(unsigned d = 0; d < queue.size(); d++) {
810 size_t start = std::max(offset, part[d]);
811 size_t stop = std::min(offset + size, part[d + 1]);
812
813 if (stop <= start) continue;
814
815 buf[d].write(queue[d], start - part[d], stop - start, hostptr + start - offset);
816 }
817
818 if (blocking)
819 for(size_t d = 0; d < queue.size(); d++) {
820 size_t start = std::max(offset, part[d]);
821 size_t stop = std::min(offset + size, part[d + 1]);
822
823 if (start < stop) queue[d].finish();
824 }
825 }
826
827 // Copy data from host buffer to device(s).
write_data(size_t offset,size_t size,const T * hostptr,bool blocking,std::vector<backend::command_queue> & q)828 void write_data(size_t offset, size_t size, const T *hostptr,
829 bool blocking, std::vector<backend::command_queue> &q)
830 {
831 precondition(q.size() == queue.size(), "The queue list has wrong size");
832
833 if (!size) return;
834
835 for(unsigned d = 0; d < q.size(); d++) {
836 precondition(
837 backend::get_context_id(q[d]) == backend::get_context_id(queue[d]),
838 "Wrong context!"
839 );
840
841 size_t start = std::max(offset, part[d]);
842 size_t stop = std::min(offset + size, part[d + 1]);
843
844 if (stop <= start) continue;
845
846 buf[d].write(q[d], start - part[d], stop - start, hostptr + start - offset);
847 }
848
849 if (blocking)
850 for(size_t d = 0; d < q.size(); d++) {
851 size_t start = std::max(offset, part[d]);
852 size_t stop = std::min(offset + size, part[d + 1]);
853
854 if (start < stop) q[d].finish();
855 }
856 }
857
858 // Copy data from device(s) to host buffer .
read_data(size_t offset,size_t size,T * hostptr,bool blocking) const859 void read_data(size_t offset, size_t size, T *hostptr, bool blocking) const
860 {
861 if (!size) return;
862
863 for(unsigned d = 0; d < queue.size(); d++) {
864 size_t start = std::max(offset, part[d]);
865 size_t stop = std::min(offset + size, part[d + 1]);
866
867 if (stop <= start) continue;
868
869 buf[d].read(queue[d], start - part[d], stop - start, hostptr + start - offset);
870 }
871
872 if (blocking)
873 for(unsigned d = 0; d < queue.size(); d++) {
874 size_t start = std::max(offset, part[d]);
875 size_t stop = std::min(offset + size, part[d + 1]);
876
877 if (start < stop) queue[d].finish();
878 }
879 }
880
881 // Copy data from device(s) to host buffer .
read_data(size_t offset,size_t size,T * hostptr,bool blocking,std::vector<backend::command_queue> & q) const882 void read_data(size_t offset, size_t size, T *hostptr,
883 bool blocking, std::vector<backend::command_queue> &q
884 ) const
885 {
886 precondition(q.size() == queue.size(), "The queue list has wrong size");
887
888 if (!size) return;
889
890 for(unsigned d = 0; d < q.size(); d++) {
891 precondition(
892 backend::get_context_id(q[d]) == backend::get_context_id(queue[d]),
893 "Wrong context!"
894 );
895
896 size_t start = std::max(offset, part[d]);
897 size_t stop = std::min(offset + size, part[d + 1]);
898
899 if (stop <= start) continue;
900
901 buf[d].read(q[d], start - part[d], stop - start, hostptr + start - offset);
902 }
903
904 if (blocking)
905 for(unsigned d = 0; d < q.size(); d++) {
906 size_t start = std::max(offset, part[d]);
907 size_t stop = std::min(offset + size, part[d + 1]);
908
909 if (start < stop) q[d].finish();
910 }
911 }
912
913 private:
914 mutable std::vector<backend::command_queue> queue;
915 std::vector<size_t> part;
916 std::vector< backend::device_vector<T> > buf;
917
allocate_buffers(backend::mem_flags flags,const T * hostptr)918 void allocate_buffers(backend::mem_flags flags, const T *hostptr) {
919 buf.clear();
920 buf.reserve(queue.size());
921
922 for(unsigned d = 0; d < queue.size(); d++)
923 buf.push_back(
924 backend::device_vector<T>(
925 queue[d], part[d + 1] - part[d],
926 hostptr ? hostptr + part[d] : 0, flags)
927 );
928 }
929
930 template <typename U>
931 friend class vector;
932
933 template <typename S, size_t N>
934 friend class multivector;
935 };
936
937 //---------------------------------------------------------------------------
938 // Support for vector expressions
939 //---------------------------------------------------------------------------
940 namespace traits {
941
942 template <>
943 struct is_vector_expr_terminal< vector_terminal > : std::true_type {};
944
945 template <>
946 struct proto_terminal_is_value< vector_terminal > : std::true_type {};
947
948 template <typename T>
949 struct kernel_param_declaration< vector<T> > {
getvex::traits::kernel_param_declaration950 static void get(backend::source_generator &src,
951 const vector<T>&,
952 const backend::command_queue&, const std::string &prm_name,
953 detail::kernel_generator_state_ptr)
954 {
955 src.parameter< global_ptr<T> >(prm_name);
956 }
957 };
958
959 template <typename T>
960 struct partial_vector_expr< vector<T> > {
getvex::traits::partial_vector_expr961 static void get(backend::source_generator &src,
962 const vector<T>&,
963 const backend::command_queue&, const std::string &prm_name,
964 detail::kernel_generator_state_ptr)
965 {
966 src << prm_name << "[idx]";
967 }
968 };
969
970 template <typename T>
971 struct kernel_arg_setter< vector<T> > {
setvex::traits::kernel_arg_setter972 static void set(const vector<T> &term,
973 backend::kernel &kernel, unsigned device, size_t/*index_offset*/,
974 detail::kernel_generator_state_ptr)
975 {
976 kernel.push_arg(term(device));
977 }
978 };
979
980 template <class T>
981 struct expression_properties< vector<T> > {
getvex::traits::expression_properties982 static void get(const vector<T> &term,
983 std::vector<backend::command_queue> &queue_list,
984 std::vector<size_t> &partition,
985 size_t &size
986 )
987 {
988 queue_list = term.queue_list();
989 partition = term.partition();
990 size = term.size();
991 }
992 };
993
994 } // namespace traits
995
996 //---------------------------------------------------------------------------
997 /// Copy device vector to host vector.
998 template <class Td, class Th>
999 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,std::vector<Th> & hv,bool blocking=true)1000 copy(const vex::vector<Td> &dv, std::vector<Th> &hv, bool blocking = true) {
1001 dv.read_data(0, dv.size(), hv.data(), blocking);
1002 }
1003
1004 template <class Td, class Th>
1005 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,std::vector<Th> & hv,bool blocking=true)1006 copy(const vex::vector<Td> &dv, std::vector<Th> &hv, bool blocking = true) {
1007 std::vector<Td> tmp(dv.size());
1008 dv.read_data(0, dv.size(), tmp.data(), true);
1009 std::copy(tmp.begin(), tmp.end(), hv.begin());
1010 }
1011
1012 /// Copy device vector to host pointer.
1013 template <class Td, class Th>
1014 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,Th * hv,bool blocking=true)1015 copy(const vex::vector<Td> &dv, Th *hv, bool blocking = true) {
1016 dv.read_data(0, dv.size(), hv, blocking);
1017 }
1018
1019 template <class Td, class Th>
1020 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const vex::vector<Td> & dv,Th * hv,bool blocking=true)1021 copy(const vex::vector<Td> &dv, Th *hv, bool blocking = true) {
1022 std::vector<Td> tmp(dv.size());
1023 dv.read_data(0, dv.size(), tmp.data(), true);
1024 std::copy(tmp.begin(), tmp.end(), hv);
1025 }
1026
1027 /// Copy host vector to device vector.
1028 template <class Th, class Td>
1029 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1030 copy(const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true) {
1031 dv.write_data(0, dv.size(), hv.data(), blocking);
1032 }
1033
1034 template <class Th, class Td>
1035 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1036 copy(const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true) {
1037 std::vector<Td> tmp(hv.size());
1038 for (size_t i = 0; i < hv.size(); ++i)
1039 tmp[i] = hv[i];
1040 dv.write_data(0, dv.size(), tmp.data(), true);
1041 }
1042
1043 /// Copy host pointer to device vector.
1044 template <class Th, class Td>
1045 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(const Th * hv,vex::vector<Td> & dv,bool blocking=true)1046 copy(const Th *hv, vex::vector<Td> &dv, bool blocking = true) {
1047 dv.write_data(0, dv.size(), hv, blocking);
1048 }
1049
1050 template <class Th, class Td>
1051 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(const Th * hv,vex::vector<Td> & dv,bool blocking=true)1052 copy(const Th *hv, vex::vector<Td> &dv, bool blocking = true) {
1053 std::vector<Td> tmp(hv, hv + dv.size());
1054 dv.write_data(0, dv.size(), tmp.data(), true);
1055 }
1056
1057 /// Copy device vector to host vector.
1058 template <class Td, class Th>
copy(std::vector<backend::command_queue> & q,const vex::vector<Td> & dv,std::vector<Th> & hv,bool blocking=true)1059 void copy(std::vector<backend::command_queue> &q,
1060 const vex::vector<Td> &dv, std::vector<Th> &hv, bool blocking = true)
1061 {
1062 if (std::is_same<Td, Th>::value) {
1063 dv.read_data(0, dv.size(), hv.data(), blocking, q);
1064 } else {
1065 std::vector<Td> tmp(dv.size());
1066 dv.read_data(0, dv.size(), tmp.data(), true, q);
1067 std::copy(tmp.begin(), tmp.end(), hv.begin());
1068 }
1069 }
1070
1071 /// Copy device vector to host pointer.
1072 template <class Td, class Th>
1073 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const vex::vector<Td> & dv,Th * hv,bool blocking=true)1074 copy(std::vector<backend::command_queue> &q,
1075 const vex::vector<Td> &dv, Th *hv, bool blocking = true)
1076 {
1077 dv.read_data(0, dv.size(), hv, blocking, q);
1078 }
1079
1080 template <class Td, class Th>
1081 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const vex::vector<Td> & dv,Th * hv,bool blocking=true)1082 copy(std::vector<backend::command_queue> &q,
1083 const vex::vector<Td> &dv, Th *hv, bool blocking = true)
1084 {
1085 std::vector<Td> tmp(dv.size());
1086 dv.read_data(0, dv.size(), tmp.data(), true, q);
1087 std::copy(tmp.begin(), tmp.end(), hv);
1088 }
1089
1090 /// Copy host vector to device vector.
1091 template <class Th, class Td>
1092 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1093 copy(std::vector<backend::command_queue> &q,
1094 const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true)
1095 {
1096 dv.write_data(0, dv.size(), hv.data(), blocking, q);
1097 }
1098
1099 template <class Th, class Td>
1100 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const std::vector<Th> & hv,vex::vector<Td> & dv,bool blocking=true)1101 copy(std::vector<backend::command_queue> &q,
1102 const std::vector<Th> &hv, vex::vector<Td> &dv, bool blocking = true)
1103 {
1104 std::vector<Td> tmp(hv.begin(), hv.end());
1105 dv.write_data(0, dv.size(), tmp.data(), true, q);
1106 }
1107
1108 /// Copy host pointer to device vector.
1109 template <class Th, class Td>
1110 typename std::enable_if<std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const Th * hv,vex::vector<Td> & dv,bool blocking=true)1111 copy(std::vector<backend::command_queue> &q,
1112 const Th *hv, vex::vector<Td> &dv, bool blocking = true)
1113 {
1114 dv.write_data(0, dv.size(), hv, blocking, q);
1115 }
1116
1117 template <class Th, class Td>
1118 typename std::enable_if<!std::is_same<Td, Th>::value, void>::type
copy(std::vector<backend::command_queue> & q,const Th * hv,vex::vector<Td> & dv,bool blocking=true)1119 copy(std::vector<backend::command_queue> &q,
1120 const Th *hv, vex::vector<Td> &dv, bool blocking = true)
1121 {
1122 std::vector<Td> tmp(hv, hv + dv.size());
1123 dv.write_data(0, dv.size(), tmp.data(), true, q);
1124 }
1125
1126 /// Copy device vector to device vector.
1127 template <class T1, class T2>
copy(const vex::vector<T1> & src,vex::vector<T2> & dst)1128 void copy(const vex::vector<T1> &src, vex::vector<T2> &dst) {
1129 dst = src;
1130 }
1131
/// Compile-time check: does an iterator type refer to device memory?
/**
 * The primary template answers "no" for every type.
 */
template <class It, class Sfinae = void>
struct stored_on_device : std::false_type {};

/// Answers "yes" for iterator types that expose a static constant
/// `device_iterator` evaluating to true. Types without such a member, or
/// with the member set to false, fall back to the primary template.
template <class It>
struct stored_on_device<
    It, typename std::enable_if<It::device_iterator>::type
    > : std::true_type {};
1139
1140 /// Copy range from device vector to host vector.
1141 template<class InputIterator, class OutputIterator>
1142 #ifdef DOXYGEN
1143 OutputIterator
1144 #else
1145 typename std::enable_if<
1146 std::is_same<
1147 typename std::iterator_traits<InputIterator>::value_type,
1148 typename std::iterator_traits<OutputIterator>::value_type
1149 >::value &&
1150 stored_on_device<InputIterator>::value &&
1151 !stored_on_device<OutputIterator>::value,
1152 OutputIterator
1153 >::type
1154 #endif
copy(InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1155 copy(InputIterator first, InputIterator last,
1156 OutputIterator result, bool blocking = true)
1157 {
1158 first.vec->read_data(first.pos, last - first, &result[0], blocking);
1159 return result + (last - first);
1160 }
1161
1162 /// Copy range from host vector to device vector.
1163 template <class InputIterator, class OutputIterator>
1164 #ifdef DOXYGEN
1165 OutputIterator
1166 #else
1167 typename std::enable_if<
1168 std::is_same<
1169 typename std::iterator_traits<InputIterator>::value_type,
1170 typename std::iterator_traits<OutputIterator>::value_type
1171 >::value &&
1172 !stored_on_device<InputIterator>::value &&
1173 stored_on_device<OutputIterator>::value,
1174 OutputIterator
1175 >::type
1176 #endif
copy(InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1177 copy(InputIterator first, InputIterator last,
1178 OutputIterator result, bool blocking = true)
1179 {
1180 result.vec->write_data(result.pos, last - first, &first[0], blocking);
1181 return result + (last - first);
1182 }
1183
1184 /// Copy range from device vector to host vector.
1185 template<class InputIterator, class OutputIterator>
1186 #ifdef DOXYGEN
1187 OutputIterator
1188 #else
1189 typename std::enable_if<
1190 std::is_same<
1191 typename std::iterator_traits<InputIterator>::value_type,
1192 typename std::iterator_traits<OutputIterator>::value_type
1193 >::value &&
1194 stored_on_device<InputIterator>::value &&
1195 !stored_on_device<OutputIterator>::value,
1196 OutputIterator
1197 >::type
1198 #endif
copy(std::vector<backend::command_queue> & q,InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1199 copy(std::vector<backend::command_queue> &q,
1200 InputIterator first, InputIterator last,
1201 OutputIterator result, bool blocking = true)
1202 {
1203 first.vec->read_data(first.pos, last - first, &result[0], blocking, q);
1204 return result + (last - first);
1205 }
1206
1207 /// Copy range from host vector to device vector.
1208 template <class InputIterator, class OutputIterator>
1209 #ifdef DOXYGEN
1210 OutputIterator
1211 #else
1212 typename std::enable_if<
1213 std::is_same<
1214 typename std::iterator_traits<InputIterator>::value_type,
1215 typename std::iterator_traits<OutputIterator>::value_type
1216 >::value &&
1217 !stored_on_device<InputIterator>::value &&
1218 stored_on_device<OutputIterator>::value,
1219 OutputIterator
1220 >::type
1221 #endif
copy(std::vector<backend::command_queue> & q,InputIterator first,InputIterator last,OutputIterator result,bool blocking=true)1222 copy(std::vector<backend::command_queue> &q,
1223 InputIterator first, InputIterator last,
1224 OutputIterator result, bool blocking = true)
1225 {
1226 result.vec->write_data(result.pos, last - first, &first[0], blocking, q);
1227 return result + (last - first);
1228 }
1229
1230 /// Swap two vectors.
1231 template <typename T>
swap(vector<T> & x,vector<T> & y)1232 void swap(vector<T> &x, vector<T> &y) {
1233 x.swap(y);
1234 }
1235
1236 /// Returns device weight after simple bandwidth test
device_vector_perf(const backend::command_queue & q)1237 inline double device_vector_perf(const backend::command_queue &q) {
1238 static const size_t test_size = 1024U * 1024U;
1239 std::vector<backend::command_queue> queue(1, q);
1240
1241 // Allocate test vectors on current device and measure execution
1242 // time of a simple kernel.
1243 vex::vector<float> a(queue, test_size);
1244 vex::vector<float> b(queue, test_size);
1245 vex::vector<float> c(queue, test_size);
1246
1247 // Skip the first run.
1248 a = b + c;
1249
1250 // Measure the second run.
1251 profiler<> prof(queue);
1252 prof.tic_cl("");
1253 a = b + c;
1254 return 1.0 / prof.toc("");
1255 }
1256
1257
1258 /// Download and print the vector elements.
1259 template<class T>
operator <<(std::ostream & o,const vex::vector<T> & t)1260 std::ostream &operator<<(std::ostream &o, const vex::vector<T> &t) {
1261 boost::io::ios_all_saver stream_state(o);
1262 const size_t chunk = std::is_integral<T>::value ? 10 : 5;
1263
1264 o << "{" << std::setprecision(6);
1265 for(unsigned p = 0; p < t.nparts(); ++p) {
1266 if (size_t ps = t.part_size(p)) {
1267 auto ptr = t.map(p);
1268
1269 for(size_t i = t.part_start(p), j = 0; j < ps; ++j, ++i) {
1270 if (i % chunk == 0) o << "\n" << std::setw(6) << i << ":";
1271
1272 if (std::is_integral<T>::value)
1273 o << " " << std::setw(6) << ptr[j];
1274 else if (std::is_arithmetic<T>::value)
1275 o << std::scientific << std::setw(14) << ptr[j];
1276 else
1277 o << " " << ptr[j];
1278 }
1279 }
1280 }
1281 return o << "\n}\n";
1282 }
1283
1284 } // namespace vex
1285
1286 namespace boost { namespace fusion { namespace traits {
1287
1288 template <class T>
1289 struct is_sequence< vex::vector<T> > : std::false_type
1290 {};
1291
1292 } } }
1293
1294
1295 #endif
1296