1
2// =================================================================================================
3// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5// width of 100 characters per line.
6//
7// Author(s):
8//   Cedric Nugteren <www.cedricnugteren.nl>
9//
10// This file contains the Xamax kernel. It implements index of (absolute) min/max computation using
11// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is
12// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
13// is executed with a single workgroup only, computing the final result.
14//
15// =================================================================================================
16
17// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
18// literal). Comment-out this line for syntax-highlighting when developing.
19R"(
20
21// Parameters set by the tuner or by the database. Here they are given a basic default value in case
22// this kernel file is used outside of the CLBlast library.
23#ifndef WGS1
24  #define WGS1 64     // The local work-group size of the main kernel
25#endif
26#ifndef WGS2
27  #define WGS2 64     // The local work-group size of the epilogue kernel
28#endif
29
30// =================================================================================================
31
32// The main reduction kernel, performing the loading and the majority of the operation
33__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
34void Xamax(const int n,
35           const __global real* restrict xgm, const int x_offset, const int x_inc,
36           __global singlereal* maxgm, __global unsigned int* imaxgm) {
37  __local singlereal maxlm[WGS1];
38  __local unsigned int imaxlm[WGS1];
39  const int lid = get_local_id(0);
40  const int wgid = get_group_id(0);
41  const int num_groups = get_num_groups(0);
42
43  // Performs loading and the first steps of the reduction
44  #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version
45    singlereal max = SMALLEST;
46  #else
47    singlereal max = ZERO;
48  #endif
49  unsigned int imax = 0;
50  int id = wgid*WGS1 + lid;
51  while (id < n) {
52    const int x_index = id*x_inc + x_offset;
53    #if PRECISION == 3232 || PRECISION == 6464
54      singlereal x = xgm[x_index].x;
55    #else
56      singlereal x = xgm[x_index];
57    #endif
58    #if defined(ROUTINE_MAX) // non-absolute maximum version
59      // nothing special here
60    #elif defined(ROUTINE_MIN) // non-absolute minimum version
61      x = -x;
62    #elif defined(ROUTINE_AMIN) // absolute minimum version
63      x = -fabs(x);
64    #else
65      x = fabs(x);
66    #endif
67    if (x >= max) {
68      max = x;
69      imax = id*x_inc + x_offset;
70    }
71    id += WGS1*num_groups;
72  }
73  maxlm[lid] = max;
74  imaxlm[lid] = imax;
75  barrier(CLK_LOCAL_MEM_FENCE);
76
77  // Performs reduction in local memory
78  #pragma unroll
79  for (int s=WGS1/2; s>0; s=s>>1) {
80    if (lid < s) {
81      if (maxlm[lid + s] >= maxlm[lid]) {
82        maxlm[lid] = maxlm[lid + s];
83        imaxlm[lid] = imaxlm[lid + s];
84      }
85    }
86    barrier(CLK_LOCAL_MEM_FENCE);
87  }
88
89  // Stores the per-workgroup result
90  if (lid == 0) {
91    maxgm[wgid] = maxlm[0];
92    imaxgm[wgid] = imaxlm[0];
93  }
94}
95
96// =================================================================================================
97
98// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
99// be launched with a single workgroup only.
100__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
101void XamaxEpilogue(const __global singlereal* restrict maxgm,
102                   const __global unsigned int* restrict imaxgm,
103                   __global unsigned int* imax, const int imax_offset) {
104  __local singlereal maxlm[WGS2];
105  __local unsigned int imaxlm[WGS2];
106  const int lid = get_local_id(0);
107
108  // Performs the first step of the reduction while loading the data
109  if (maxgm[lid + WGS2] >= maxgm[lid]) {
110    maxlm[lid] = maxgm[lid + WGS2];
111    imaxlm[lid] = imaxgm[lid + WGS2];
112  }
113  else {
114    maxlm[lid] = maxgm[lid];
115    imaxlm[lid] = imaxgm[lid];
116  }
117  barrier(CLK_LOCAL_MEM_FENCE);
118
119  // Performs reduction in local memory
120  #pragma unroll
121  for (int s=WGS2/2; s>0; s=s>>1) {
122    if (lid < s) {
123      if (maxlm[lid + s] >= maxlm[lid]) {
124        maxlm[lid] = maxlm[lid + s];
125        imaxlm[lid] = imaxlm[lid + s];
126      }
127    }
128    barrier(CLK_LOCAL_MEM_FENCE);
129  }
130
131  // Stores the final result
132  if (lid == 0) {
133    imax[imax_offset] = imaxlm[0];
134  }
135}
136
137// =================================================================================================
138
139// End of the C++11 raw string literal
140)"
141
142// =================================================================================================
143