1 2// ================================================================================================= 3// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This 4// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5// width of 100 characters per line. 6// 7// Author(s): 8// Cedric Nugteren <www.cedricnugteren.nl> 9// 10// This file contains the Xamax kernel. It implements index of (absolute) min/max computation using 11// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is 12// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel 13// is executed with a single workgroup only, computing the final result. 14// 15// ================================================================================================= 16 17// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string 18// literal). Comment-out this line for syntax-highlighting when developing. 19R"( 20 21// Parameters set by the tuner or by the database. Here they are given a basic default value in case 22// this kernel file is used outside of the CLBlast library. 23#ifndef WGS1 24 #define WGS1 64 // The local work-group size of the main kernel 25#endif 26#ifndef WGS2 27 #define WGS2 64 // The local work-group size of the epilogue kernel 28#endif 29 30// ================================================================================================= 31 32// The main reduction kernel, performing the loading and the majority of the operation 33__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) 34void Xamax(const int n, 35 const __global real* restrict xgm, const int x_offset, const int x_inc, 36 __global singlereal* maxgm, __global unsigned int* imaxgm) { 37 __local singlereal maxlm[WGS1]; 38 __local unsigned int imaxlm[WGS1]; 39 const int lid = get_local_id(0); 40 const int wgid = get_group_id(0); 41 const int num_groups = get_num_groups(0); 42 43 // Performs loading and the first steps of the reduction 44 #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version 45 singlereal max = SMALLEST; 46 #else 47 singlereal max = ZERO; 48 #endif 49 unsigned int imax = 0; 50 int id = wgid*WGS1 + lid; 51 while (id < n) { 52 const int x_index = id*x_inc + x_offset; 53 #if PRECISION == 3232 || PRECISION == 6464 54 singlereal x = xgm[x_index].x; 55 #else 56 singlereal x = xgm[x_index]; 57 #endif 58 #if defined(ROUTINE_MAX) // non-absolute maximum version 59 // nothing special here 60 #elif defined(ROUTINE_MIN) // non-absolute minimum version 61 x = -x; 62 #elif defined(ROUTINE_AMIN) // absolute minimum version 63 x = -fabs(x); 64 #else 65 x = fabs(x); 66 #endif 67 if (x >= max) { 68 max = x; 69 imax = id*x_inc + x_offset; 70 } 71 id += WGS1*num_groups; 72 } 73 maxlm[lid] = max; 74 imaxlm[lid] = imax; 75 barrier(CLK_LOCAL_MEM_FENCE); 76 77 // Performs reduction in local memory 78 #pragma unroll 79 for (int s=WGS1/2; s>0; s=s>>1) { 80 if (lid < s) { 81 if (maxlm[lid + s] >= maxlm[lid]) { 82 maxlm[lid] = maxlm[lid + s]; 83 imaxlm[lid] = imaxlm[lid + s]; 84 } 85 } 86 barrier(CLK_LOCAL_MEM_FENCE); 87 } 88 89 // Stores the per-workgroup result 90 if (lid == 0) { 91 maxgm[wgid] = maxlm[0]; 92 imaxgm[wgid] = imaxlm[0]; 93 } 94} 95 96// ================================================================================================= 97 98// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to 99// be launched with a single workgroup only. 100__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) 101void XamaxEpilogue(const __global singlereal* restrict maxgm, 102 const __global unsigned int* restrict imaxgm, 103 __global unsigned int* imax, const int imax_offset) { 104 __local singlereal maxlm[WGS2]; 105 __local unsigned int imaxlm[WGS2]; 106 const int lid = get_local_id(0); 107 108 // Performs the first step of the reduction while loading the data 109 if (maxgm[lid + WGS2] >= maxgm[lid]) { 110 maxlm[lid] = maxgm[lid + WGS2]; 111 imaxlm[lid] = imaxgm[lid + WGS2]; 112 } 113 else { 114 maxlm[lid] = maxgm[lid]; 115 imaxlm[lid] = imaxgm[lid]; 116 } 117 barrier(CLK_LOCAL_MEM_FENCE); 118 119 // Performs reduction in local memory 120 #pragma unroll 121 for (int s=WGS2/2; s>0; s=s>>1) { 122 if (lid < s) { 123 if (maxlm[lid + s] >= maxlm[lid]) { 124 maxlm[lid] = maxlm[lid + s]; 125 imaxlm[lid] = imaxlm[lid + s]; 126 } 127 } 128 barrier(CLK_LOCAL_MEM_FENCE); 129 } 130 131 // Stores the final result 132 if (lid == 0) { 133 imax[imax_offset] = imaxlm[0]; 134 } 135} 136 137// ================================================================================================= 138 139// End of the C++11 raw string literal 140)" 141 142// ================================================================================================= 143