1 #pragma once
2
3 #include "../cuda_flow.hpp"
4 #include "../cuda_capturer.hpp"
5 #include "../cuda_meta.hpp"
6
7 /**
8 @file cuda_find.hpp
9 @brief cuda find algorithms include file
10 */
11
12 namespace tf::detail {
13
14 /** @private */
15 template <typename T>
16 struct cudaFindPair {
17
18 T key;
19 unsigned index;
20
operator unsignedtf::detail::cudaFindPair21 __device__ operator unsigned () const { return index; }
22 };
23
24 /** @private */
25 template <typename P, typename I, typename U>
cuda_find_if_loop(P && p,I input,unsigned count,unsigned * idx,U pred)26 void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {
27
28 if(count == 0) {
29 cuda_single_task(p, [=] __device__ () { *idx = 0; });
30 return;
31 }
32
33 using E = std::decay_t<P>;
34
35 auto B = (count + E::nv - 1) / E::nv;
36
37 // set the index to the maximum
38 cuda_single_task(p, [=] __device__ () { *idx = count; });
39
40 // launch the kernel to atomic-find the minimum
41 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
42
43 __shared__ unsigned shm_id;
44
45 if(!tid) {
46 shm_id = count;
47 }
48
49 __syncthreads();
50
51 auto tile = cuda_get_tile(bid, E::nv, count);
52
53 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
54 input + tile.begin, tid, tile.count()
55 );
56
57 auto id = count;
58
59 for(unsigned i=0; i<E::vt; i++) {
60 auto j = E::nt*i + tid;
61 if(j < tile.count() && pred(x[i])) {
62 id = j + tile.begin;
63 break;
64 }
65 }
66
67 // Note: the reduce version is not faster though
68 // reduce to a scalar per block.
69 //__shared__ typename cudaBlockReduce<E::nt, unsigned>::Storage shm;
70
71 //id = cudaBlockReduce<E::nt, unsigned>()(
72 // tid,
73 // id,
74 // shm,
75 // (tile.count() < E::nt ? tile.count() : E::nt),
76 // cuda_minimum<unsigned>{},
77 // false
78 //);
79
80 // only need the minimum id
81 atomicMin(&shm_id, id);
82 __syncthreads();
83
84 // reduce all to the global memory
85 if(!tid) {
86 atomicMin(idx, shm_id);
87 //atomicMin(idx, id);
88 }
89 });
90 }
91
92 /** @private */
93 template <typename P, typename I, typename O>
cuda_min_element_loop(P && p,I input,unsigned count,unsigned * idx,O op,void * ptr)94 void cuda_min_element_loop(
95 P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
96 ) {
97
98 if(count == 0) {
99 cuda_single_task(p, [=] __device__ () { *idx = 0; });
100 return;
101 }
102
103 using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
104
105 cuda_uninitialized_reduce_loop(p,
106 cuda_make_load_iterator<T>([=]__device__(auto i){
107 return T{*(input+i), i};
108 }),
109 count,
110 idx,
111 [=] __device__ (const auto& a, const auto& b) {
112 return op(a.key, b.key) ? a : b;
113 },
114 ptr
115 );
116 }
117
118 /** @private */
119 template <typename P, typename I, typename O>
cuda_max_element_loop(P && p,I input,unsigned count,unsigned * idx,O op,void * ptr)120 void cuda_max_element_loop(
121 P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
122 ) {
123
124 if(count == 0) {
125 cuda_single_task(p, [=] __device__ () { *idx = 0; });
126 return;
127 }
128
129 using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
130
131 cuda_uninitialized_reduce_loop(p,
132 cuda_make_load_iterator<T>([=]__device__(auto i){
133 return T{*(input+i), i};
134 }),
135 count,
136 idx,
137 [=] __device__ (const auto& a, const auto& b) {
138 return op(a.key, b.key) ? b : a;
139 },
140 ptr
141 );
142 }
143
144 } // end of namespace tf::detail ---------------------------------------------
145
146 namespace tf {
147
148
149 // ----------------------------------------------------------------------------
150 // cuda_find_if
151 // ----------------------------------------------------------------------------
152
153 /**
154 @brief finds the index of the first element that satisfies the given criteria
155
156 @tparam P execution policy type
157 @tparam I input iterator type
158 @tparam U unary operator type
159
160 @param p execution policy
161 @param first iterator to the beginning of the range
162 @param last iterator to the end of the range
163 @param idx pointer to the index of the found element
164 @param op unary operator which returns @c true for the required element
165
166 The function launches kernels asynchronously to find the index @c idx of the
167 first element in the range <tt>[first, last)</tt>
168 such that <tt>op(*(first+idx))</tt> is true.
169 This is equivalent to the parallel execution of the following loop:
170
171 @code{.cpp}
172 unsigned idx = 0;
173 for(; first != last; ++first, ++idx) {
174 if (p(*first)) {
175 return idx;
176 }
177 }
178 return idx;
179 @endcode
180 */
181 template <typename P, typename I, typename U>
cuda_find_if(P && p,I first,I last,unsigned * idx,U op)182 void cuda_find_if(
183 P&& p, I first, I last, unsigned* idx, U op
184 ) {
185 detail::cuda_find_if_loop(p, first, std::distance(first, last), idx, op);
186 }
187
188 // ----------------------------------------------------------------------------
189 // cudaFlow
190 // ----------------------------------------------------------------------------
191
192 // Function: find_if
193 template <typename I, typename U>
find_if(I first,I last,unsigned * idx,U op)194 cudaTask cudaFlow::find_if(I first, I last, unsigned* idx, U op) {
195 return capture([=](cudaFlowCapturer& cap){
196 cap.make_optimizer<cudaLinearCapturing>();
197 cap.find_if(first, last, idx, op);
198 });
199 }
200
201 // Function: find_if
202 template <typename I, typename U>
find_if(cudaTask task,I first,I last,unsigned * idx,U op)203 void cudaFlow::find_if(cudaTask task, I first, I last, unsigned* idx, U op) {
204 capture(task, [=](cudaFlowCapturer& cap){
205 cap.make_optimizer<cudaLinearCapturing>();
206 cap.find_if(first, last, idx, op);
207 });
208 }
209
210 // ----------------------------------------------------------------------------
211 // cudaFlowCapturer
212 // ----------------------------------------------------------------------------
213
214 // Function: find_if
215 template <typename I, typename U>
find_if(I first,I last,unsigned * idx,U op)216 cudaTask cudaFlowCapturer::find_if(I first, I last, unsigned* idx, U op) {
217 return on([=](cudaStream_t stream) mutable {
218 cudaDefaultExecutionPolicy p(stream);
219 cuda_find_if(p, first, last, idx, op);
220 });
221 }
222
223 // Function: find_if
224 template <typename I, typename U>
find_if(cudaTask task,I first,I last,unsigned * idx,U op)225 void cudaFlowCapturer::find_if(
226 cudaTask task, I first, I last, unsigned* idx, U op
227 ) {
228 on(task, [=](cudaStream_t stream) mutable {
229 cudaDefaultExecutionPolicy p(stream);
230 cuda_find_if(p, first, last, idx, op);
231 });
232 }
233
234 // ----------------------------------------------------------------------------
235 // cuda_min_element
236 // ----------------------------------------------------------------------------
237
238 /**
239 @brief queries the buffer size in bytes needed to call tf::cuda_min_element
240
241 @tparam P execution policy type
242 @tparam T value type
243
244 @param count number of elements to search
245
246 The function is used to decide the buffer size in bytes for calling
247 tf::cuda_min_element.
248 */
249 template <typename P, typename T>
cuda_min_element_buffer_size(unsigned count)250 unsigned cuda_min_element_buffer_size(unsigned count) {
251 return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
252 }
253
254 /**
255 @brief finds the index of the minimum element in a range
256
257 @tparam P execution policy type
258 @tparam I input iterator type
259 @tparam O comparator type
260
261 @param p execution policy object
262 @param first iterator to the beginning of the range
263 @param last iterator to the end of the range
264 @param idx solution index of the minimum element
265 @param op comparison function object
266 @param buf pointer to the buffer
267
268 The function launches kernels asynchronously to find
269 the smallest element in the range <tt>[first, last)</tt>
270 using the given comparator @c op.
271 You need to provide a buffer that holds at least
272 tf::cuda_min_element_buffer_size bytes for internal use.
273 The function is equivalent to a parallel execution of the following loop:
274
275 @code{.cpp}
276 if(first == last) {
277 return 0;
278 }
279 auto smallest = first;
280 for (++first; first != last; ++first) {
281 if (op(*first, *smallest)) {
282 smallest = first;
283 }
284 }
285 return std::distance(first, smallest);
286 @endcode
287 */
288 template <typename P, typename I, typename O>
cuda_min_element(P && p,I first,I last,unsigned * idx,O op,void * buf)289 void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
290 detail::cuda_min_element_loop(
291 p, first, std::distance(first, last), idx, op, buf
292 );
293 }
294
295 // ----------------------------------------------------------------------------
296 // cudaFlowCapturer::min_element
297 // ----------------------------------------------------------------------------
298
299 // Function: min_element
300 template <typename I, typename O>
min_element(I first,I last,unsigned * idx,O op)301 cudaTask cudaFlowCapturer::min_element(I first, I last, unsigned* idx, O op) {
302
303 using T = typename std::iterator_traits<I>::value_type;
304
305 auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
306 std::distance(first, last)
307 );
308
309 return on([=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
310 (cudaStream_t stream) mutable {
311 cudaDefaultExecutionPolicy p(stream);
312 cuda_min_element(p, first, last, idx, op, buf.get().data());
313 });
314 }
315
316 // Function: min_element
317 template <typename I, typename O>
min_element(cudaTask task,I first,I last,unsigned * idx,O op)318 void cudaFlowCapturer::min_element(
319 cudaTask task, I first, I last, unsigned* idx, O op
320 ) {
321
322 using T = typename std::iterator_traits<I>::value_type;
323
324 auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
325 std::distance(first, last)
326 );
327
328 on(task, [=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
329 (cudaStream_t stream) mutable {
330 cudaDefaultExecutionPolicy p(stream);
331 cuda_min_element(p, first, last, idx, op, buf.get().data());
332 });
333 }
334
335 // ----------------------------------------------------------------------------
336 // cudaFlow::min_element
337 // ----------------------------------------------------------------------------
338
339 // Function: min_element
340 template <typename I, typename O>
min_element(I first,I last,unsigned * idx,O op)341 cudaTask cudaFlow::min_element(I first, I last, unsigned* idx, O op) {
342 return capture([=](cudaFlowCapturer& cap){
343 cap.make_optimizer<cudaLinearCapturing>();
344 cap.min_element(first, last, idx, op);
345 });
346 }
347
348 // Function: min_element
349 template <typename I, typename O>
min_element(cudaTask task,I first,I last,unsigned * idx,O op)350 void cudaFlow::min_element(
351 cudaTask task, I first, I last, unsigned* idx, O op
352 ) {
353 capture(task, [=](cudaFlowCapturer& cap){
354 cap.make_optimizer<cudaLinearCapturing>();
355 cap.min_element(first, last, idx, op);
356 });
357 }
358
359 // ----------------------------------------------------------------------------
360 // cuda_max_element
361 // ----------------------------------------------------------------------------
362
363 /**
364 @brief queries the buffer size in bytes needed to call tf::cuda_max_element
365
366 @tparam P execution policy type
367 @tparam T value type
368
369 @param count number of elements to search
370
371 The function is used to decide the buffer size in bytes for calling
372 tf::cuda_max_element.
373 */
374 template <typename P, typename T>
cuda_max_element_buffer_size(unsigned count)375 unsigned cuda_max_element_buffer_size(unsigned count) {
376 return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
377 }
378
379 /**
380 @brief finds the index of the maximum element in a range
381
382 @tparam P execution policy type
383 @tparam I input iterator type
384 @tparam O comparator type
385
386 @param p execution policy object
387 @param first iterator to the beginning of the range
388 @param last iterator to the end of the range
389 @param idx solution index of the maximum element
390 @param op comparison function object
391 @param buf pointer to the buffer
392
393 The function launches kernels asynchronously to find
394 the largest element in the range <tt>[first, last)</tt>
395 using the given comparator @c op.
396 You need to provide a buffer that holds at least
397 tf::cuda_max_element_buffer_size bytes for internal use.
398 The function is equivalent to a parallel execution of the following loop:
399
400 @code{.cpp}
401 if(first == last) {
402 return 0;
403 }
404 auto largest = first;
405 for (++first; first != last; ++first) {
406 if (op(*largest, *first)) {
407 largest = first;
408 }
409 }
410 return std::distance(first, largest);
411 @endcode
412 */
413 template <typename P, typename I, typename O>
cuda_max_element(P && p,I first,I last,unsigned * idx,O op,void * buf)414 void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
415 detail::cuda_max_element_loop(
416 p, first, std::distance(first, last), idx, op, buf
417 );
418 }
419
420 // ----------------------------------------------------------------------------
421 // cudaFlowCapturer::max_element
422 // ----------------------------------------------------------------------------
423
424 // Function: max_element
425 template <typename I, typename O>
max_element(I first,I last,unsigned * idx,O op)426 cudaTask cudaFlowCapturer::max_element(I first, I last, unsigned* idx, O op) {
427
428 using T = typename std::iterator_traits<I>::value_type;
429
430 auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
431 std::distance(first, last)
432 );
433
434 return on([=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
435 (cudaStream_t stream) mutable {
436 cudaDefaultExecutionPolicy p(stream);
437 cuda_max_element(p, first, last, idx, op, buf.get().data());
438 });
439 }
440
441 // Function: max_element
442 template <typename I, typename O>
max_element(cudaTask task,I first,I last,unsigned * idx,O op)443 void cudaFlowCapturer::max_element(
444 cudaTask task, I first, I last, unsigned* idx, O op
445 ) {
446
447 using T = typename std::iterator_traits<I>::value_type;
448
449 auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
450 std::distance(first, last)
451 );
452
453 on(task, [=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
454 (cudaStream_t stream) mutable {
455 cudaDefaultExecutionPolicy p(stream);
456 cuda_max_element(p, first, last, idx, op, buf.get().data());
457 });
458 }
459
460 // ----------------------------------------------------------------------------
461 // cudaFlow::max_element
462 // ----------------------------------------------------------------------------
463
464 // Function: max_element
465 template <typename I, typename O>
max_element(I first,I last,unsigned * idx,O op)466 cudaTask cudaFlow::max_element(I first, I last, unsigned* idx, O op) {
467 return capture([=](cudaFlowCapturer& cap){
468 cap.make_optimizer<cudaLinearCapturing>();
469 cap.max_element(first, last, idx, op);
470 });
471 }
472
473 // Function: max_element
474 template <typename I, typename O>
max_element(cudaTask task,I first,I last,unsigned * idx,O op)475 void cudaFlow::max_element(
476 cudaTask task, I first, I last, unsigned* idx, O op
477 ) {
478 capture(task, [=](cudaFlowCapturer& cap){
479 cap.make_optimizer<cudaLinearCapturing>();
480 cap.max_element(first, last, idx, op);
481 });
482 }
483
484 } // end of namespace tf -----------------------------------------------------
485
486
487