1 #pragma once
2 
3 #include "../cuda_flow.hpp"
4 #include "../cuda_capturer.hpp"
5 #include "../cuda_meta.hpp"
6 
7 /**
8 @file cuda_find.hpp
9 @brief cuda find algorithms include file
10 */
11 
12 namespace tf::detail {
13 
14 /** @private */
15 template <typename T>
16 struct cudaFindPair {
17 
18   T key;
19   unsigned index;
20 
operator unsignedtf::detail::cudaFindPair21   __device__ operator unsigned () const { return index; }
22 };
23 
24 /** @private */
25 template <typename P, typename I, typename U>
cuda_find_if_loop(P && p,I input,unsigned count,unsigned * idx,U pred)26 void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {
27 
28   if(count == 0) {
29     cuda_single_task(p, [=] __device__ () { *idx = 0; });
30     return;
31   }
32 
33   using E = std::decay_t<P>;
34 
35   auto B = (count + E::nv - 1) / E::nv;
36 
37   // set the index to the maximum
38   cuda_single_task(p, [=] __device__ () { *idx = count; });
39 
40   // launch the kernel to atomic-find the minimum
41   cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
42 
43     __shared__ unsigned shm_id;
44 
45     if(!tid) {
46       shm_id = count;
47     }
48 
49     __syncthreads();
50 
51     auto tile = cuda_get_tile(bid, E::nv, count);
52 
53     auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
54       input + tile.begin, tid, tile.count()
55     );
56 
57     auto id = count;
58 
59     for(unsigned i=0; i<E::vt; i++) {
60       auto j = E::nt*i + tid;
61       if(j < tile.count() && pred(x[i])) {
62         id = j + tile.begin;
63         break;
64       }
65     }
66 
67     // Note: the reduce version is not faster though
68     // reduce to a scalar per block.
69     //__shared__ typename cudaBlockReduce<E::nt, unsigned>::Storage shm;
70 
71     //id = cudaBlockReduce<E::nt, unsigned>()(
72     //  tid,
73     //  id,
74     //  shm,
75     //  (tile.count() < E::nt ? tile.count() : E::nt),
76     //  cuda_minimum<unsigned>{},
77     //  false
78     //);
79 
80     // only need the minimum id
81     atomicMin(&shm_id, id);
82     __syncthreads();
83 
84     // reduce all to the global memory
85     if(!tid) {
86       atomicMin(idx, shm_id);
87       //atomicMin(idx, id);
88     }
89   });
90 }
91 
92 /** @private */
93 template <typename P, typename I, typename O>
cuda_min_element_loop(P && p,I input,unsigned count,unsigned * idx,O op,void * ptr)94 void cuda_min_element_loop(
95   P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
96 ) {
97 
98   if(count == 0) {
99     cuda_single_task(p, [=] __device__ () { *idx = 0; });
100     return;
101   }
102 
103   using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
104 
105   cuda_uninitialized_reduce_loop(p,
106     cuda_make_load_iterator<T>([=]__device__(auto i){
107       return T{*(input+i), i};
108     }),
109     count,
110     idx,
111     [=] __device__ (const auto& a, const auto& b) {
112       return op(a.key, b.key) ? a : b;
113     },
114     ptr
115   );
116 }
117 
118 /** @private */
119 template <typename P, typename I, typename O>
cuda_max_element_loop(P && p,I input,unsigned count,unsigned * idx,O op,void * ptr)120 void cuda_max_element_loop(
121   P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
122 ) {
123 
124   if(count == 0) {
125     cuda_single_task(p, [=] __device__ () { *idx = 0; });
126     return;
127   }
128 
129   using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
130 
131   cuda_uninitialized_reduce_loop(p,
132     cuda_make_load_iterator<T>([=]__device__(auto i){
133       return T{*(input+i), i};
134     }),
135     count,
136     idx,
137     [=] __device__ (const auto& a, const auto& b) {
138       return op(a.key, b.key) ? b : a;
139     },
140     ptr
141   );
142 }
143 
144 }  // end of namespace tf::detail ---------------------------------------------
145 
146 namespace tf {
147 
148 
149 // ----------------------------------------------------------------------------
150 // cuda_find_if
151 // ----------------------------------------------------------------------------
152 
153 /**
154 @brief finds the index of the first element that satisfies the given criteria
155 
156 @tparam P execution policy type
157 @tparam I input iterator type
158 @tparam U unary operator type
159 
160 @param p execution policy
161 @param first iterator to the beginning of the range
162 @param last iterator to the end of the range
163 @param idx pointer to the index of the found element
164 @param op unary operator which returns @c true for the required element
165 
166 The function launches kernels asynchronously to find the index @c idx of the
167 first element in the range <tt>[first, last)</tt>
168 such that <tt>op(*(first+idx))</tt> is true.
169 This is equivalent to the parallel execution of the following loop:
170 
171 @code{.cpp}
172 unsigned idx = 0;
173 for(; first != last; ++first, ++idx) {
174   if (p(*first)) {
175     return idx;
176   }
177 }
178 return idx;
179 @endcode
180 */
181 template <typename P, typename I, typename U>
cuda_find_if(P && p,I first,I last,unsigned * idx,U op)182 void cuda_find_if(
183   P&& p, I first, I last, unsigned* idx, U op
184 ) {
185   detail::cuda_find_if_loop(p, first, std::distance(first, last), idx, op);
186 }
187 
188 // ----------------------------------------------------------------------------
189 // cudaFlow
190 // ----------------------------------------------------------------------------
191 
192 // Function: find_if
193 template <typename I, typename U>
find_if(I first,I last,unsigned * idx,U op)194 cudaTask cudaFlow::find_if(I first, I last, unsigned* idx, U op) {
195   return capture([=](cudaFlowCapturer& cap){
196     cap.make_optimizer<cudaLinearCapturing>();
197     cap.find_if(first, last, idx, op);
198   });
199 }
200 
201 // Function: find_if
202 template <typename I, typename U>
find_if(cudaTask task,I first,I last,unsigned * idx,U op)203 void cudaFlow::find_if(cudaTask task, I first, I last, unsigned* idx, U op) {
204   capture(task, [=](cudaFlowCapturer& cap){
205     cap.make_optimizer<cudaLinearCapturing>();
206     cap.find_if(first, last, idx, op);
207   });
208 }
209 
210 // ----------------------------------------------------------------------------
211 // cudaFlowCapturer
212 // ----------------------------------------------------------------------------
213 
214 // Function: find_if
215 template <typename I, typename U>
find_if(I first,I last,unsigned * idx,U op)216 cudaTask cudaFlowCapturer::find_if(I first, I last, unsigned* idx, U op) {
217   return on([=](cudaStream_t stream) mutable {
218     cudaDefaultExecutionPolicy p(stream);
219     cuda_find_if(p, first, last, idx, op);
220   });
221 }
222 
223 // Function: find_if
224 template <typename I, typename U>
find_if(cudaTask task,I first,I last,unsigned * idx,U op)225 void cudaFlowCapturer::find_if(
226   cudaTask task, I first, I last, unsigned* idx, U op
227 ) {
228   on(task, [=](cudaStream_t stream) mutable {
229     cudaDefaultExecutionPolicy p(stream);
230     cuda_find_if(p, first, last, idx, op);
231   });
232 }
233 
234 // ----------------------------------------------------------------------------
235 // cuda_min_element
236 // ----------------------------------------------------------------------------
237 
238 /**
239 @brief queries the buffer size in bytes needed to call tf::cuda_min_element
240 
241 @tparam P execution policy type
242 @tparam T value type
243 
244 @param count number of elements to search
245 
246 The function is used to decide the buffer size in bytes for calling
247 tf::cuda_min_element.
248 */
249 template <typename P, typename T>
cuda_min_element_buffer_size(unsigned count)250 unsigned cuda_min_element_buffer_size(unsigned count) {
251   return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
252 }
253 
254 /**
255 @brief finds the index of the minimum element in a range
256 
257 @tparam P execution policy type
258 @tparam I input iterator type
259 @tparam O comparator type
260 
261 @param p execution policy object
262 @param first iterator to the beginning of the range
263 @param last iterator to the end of the range
264 @param idx solution index of the minimum element
265 @param op comparison function object
266 @param buf pointer to the buffer
267 
268 The function launches kernels asynchronously to find
269 the smallest element in the range <tt>[first, last)</tt>
270 using the given comparator @c op.
271 You need to provide a buffer that holds at least
272 tf::cuda_min_element_buffer_size bytes for internal use.
273 The function is equivalent to a parallel execution of the following loop:
274 
275 @code{.cpp}
276 if(first == last) {
277   return 0;
278 }
279 auto smallest = first;
280 for (++first; first != last; ++first) {
281   if (op(*first, *smallest)) {
282     smallest = first;
283   }
284 }
285 return std::distance(first, smallest);
286 @endcode
287 */
288 template <typename P, typename I, typename O>
cuda_min_element(P && p,I first,I last,unsigned * idx,O op,void * buf)289 void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
290   detail::cuda_min_element_loop(
291     p, first, std::distance(first, last), idx, op, buf
292   );
293 }
294 
295 // ----------------------------------------------------------------------------
296 // cudaFlowCapturer::min_element
297 // ----------------------------------------------------------------------------
298 
299 // Function: min_element
300 template <typename I, typename O>
min_element(I first,I last,unsigned * idx,O op)301 cudaTask cudaFlowCapturer::min_element(I first, I last, unsigned* idx, O op) {
302 
303   using T = typename std::iterator_traits<I>::value_type;
304 
305   auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
306     std::distance(first, last)
307   );
308 
309   return on([=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
310   (cudaStream_t stream) mutable {
311     cudaDefaultExecutionPolicy p(stream);
312     cuda_min_element(p, first, last, idx, op, buf.get().data());
313   });
314 }
315 
316 // Function: min_element
317 template <typename I, typename O>
min_element(cudaTask task,I first,I last,unsigned * idx,O op)318 void cudaFlowCapturer::min_element(
319   cudaTask task, I first, I last, unsigned* idx, O op
320 ) {
321 
322   using T = typename std::iterator_traits<I>::value_type;
323 
324   auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
325     std::distance(first, last)
326   );
327 
328   on(task, [=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
329   (cudaStream_t stream) mutable {
330     cudaDefaultExecutionPolicy p(stream);
331     cuda_min_element(p, first, last, idx, op, buf.get().data());
332   });
333 }
334 
335 // ----------------------------------------------------------------------------
336 // cudaFlow::min_element
337 // ----------------------------------------------------------------------------
338 
339 // Function: min_element
340 template <typename I, typename O>
min_element(I first,I last,unsigned * idx,O op)341 cudaTask cudaFlow::min_element(I first, I last, unsigned* idx, O op) {
342   return capture([=](cudaFlowCapturer& cap){
343     cap.make_optimizer<cudaLinearCapturing>();
344     cap.min_element(first, last, idx, op);
345   });
346 }
347 
348 // Function: min_element
349 template <typename I, typename O>
min_element(cudaTask task,I first,I last,unsigned * idx,O op)350 void cudaFlow::min_element(
351   cudaTask task, I first, I last, unsigned* idx, O op
352 ) {
353   capture(task, [=](cudaFlowCapturer& cap){
354     cap.make_optimizer<cudaLinearCapturing>();
355     cap.min_element(first, last, idx, op);
356   });
357 }
358 
359 // ----------------------------------------------------------------------------
360 // cuda_max_element
361 // ----------------------------------------------------------------------------
362 
363 /**
364 @brief queries the buffer size in bytes needed to call tf::cuda_max_element
365 
366 @tparam P execution policy type
367 @tparam T value type
368 
369 @param count number of elements to search
370 
371 The function is used to decide the buffer size in bytes for calling
372 tf::cuda_max_element.
373 */
374 template <typename P, typename T>
cuda_max_element_buffer_size(unsigned count)375 unsigned cuda_max_element_buffer_size(unsigned count) {
376   return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
377 }
378 
379 /**
380 @brief finds the index of the maximum element in a range
381 
382 @tparam P execution policy type
383 @tparam I input iterator type
384 @tparam O comparator type
385 
386 @param p execution policy object
387 @param first iterator to the beginning of the range
388 @param last iterator to the end of the range
389 @param idx solution index of the maximum element
390 @param op comparison function object
391 @param buf pointer to the buffer
392 
393 The function launches kernels asynchronously to find
394 the largest element in the range <tt>[first, last)</tt>
395 using the given comparator @c op.
396 You need to provide a buffer that holds at least
397 tf::cuda_max_element_buffer_size bytes for internal use.
398 The function is equivalent to a parallel execution of the following loop:
399 
400 @code{.cpp}
401 if(first == last) {
402   return 0;
403 }
404 auto largest = first;
405 for (++first; first != last; ++first) {
406   if (op(*largest, *first)) {
407     largest = first;
408   }
409 }
410 return std::distance(first, largest);
411 @endcode
412 */
413 template <typename P, typename I, typename O>
cuda_max_element(P && p,I first,I last,unsigned * idx,O op,void * buf)414 void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
415   detail::cuda_max_element_loop(
416     p, first, std::distance(first, last), idx, op, buf
417   );
418 }
419 
420 // ----------------------------------------------------------------------------
421 // cudaFlowCapturer::max_element
422 // ----------------------------------------------------------------------------
423 
424 // Function: max_element
425 template <typename I, typename O>
max_element(I first,I last,unsigned * idx,O op)426 cudaTask cudaFlowCapturer::max_element(I first, I last, unsigned* idx, O op) {
427 
428   using T = typename std::iterator_traits<I>::value_type;
429 
430   auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
431     std::distance(first, last)
432   );
433 
434   return on([=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
435   (cudaStream_t stream) mutable {
436     cudaDefaultExecutionPolicy p(stream);
437     cuda_max_element(p, first, last, idx, op, buf.get().data());
438   });
439 }
440 
441 // Function: max_element
442 template <typename I, typename O>
max_element(cudaTask task,I first,I last,unsigned * idx,O op)443 void cudaFlowCapturer::max_element(
444   cudaTask task, I first, I last, unsigned* idx, O op
445 ) {
446 
447   using T = typename std::iterator_traits<I>::value_type;
448 
449   auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
450     std::distance(first, last)
451   );
452 
453   on(task, [=, buf=MoC{cudaScopedDeviceMemory<std::byte>(bufsz)}]
454   (cudaStream_t stream) mutable {
455     cudaDefaultExecutionPolicy p(stream);
456     cuda_max_element(p, first, last, idx, op, buf.get().data());
457   });
458 }
459 
460 // ----------------------------------------------------------------------------
461 // cudaFlow::max_element
462 // ----------------------------------------------------------------------------
463 
464 // Function: max_element
465 template <typename I, typename O>
max_element(I first,I last,unsigned * idx,O op)466 cudaTask cudaFlow::max_element(I first, I last, unsigned* idx, O op) {
467   return capture([=](cudaFlowCapturer& cap){
468     cap.make_optimizer<cudaLinearCapturing>();
469     cap.max_element(first, last, idx, op);
470   });
471 }
472 
473 // Function: max_element
474 template <typename I, typename O>
max_element(cudaTask task,I first,I last,unsigned * idx,O op)475 void cudaFlow::max_element(
476   cudaTask task, I first, I last, unsigned* idx, O op
477 ) {
478   capture(task, [=](cudaFlowCapturer& cap){
479     cap.make_optimizer<cudaLinearCapturing>();
480     cap.max_element(first, last, idx, op);
481   });
482 }
483 
484 }  // end of namespace tf -----------------------------------------------------
485 
486 
487