1 /*
2  * Copyright (C) 2018-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #include "shared/source/helpers/local_work_size.h"
9 
10 #include "shared/source/debug_settings/debug_settings_manager.h"
11 #include "shared/source/helpers/array_count.h"
12 #include "shared/source/helpers/basic_math.h"
13 #include "shared/source/helpers/debug_helpers.h"
14 #include "shared/source/helpers/hw_helper.h"
15 #include "shared/source/program/kernel_info.h"
16 
17 #include <cmath>
18 #include <cstdint>
19 #include <ctime>
20 
21 namespace NEO {
22 
23 //threshold used to determine what kind of device is underneath
24 //big cores like SKL have 8EU * 7 HW threads per subslice and are considered as highThreadCount devices
25 constexpr uint32_t highThreadCountThreshold = 56u;
26 
27 static const uint32_t optimalHardwareThreadCountGeneric[] = {32, 16, 8, 4, 2, 1};
28 
29 static const uint32_t primeNumbers[] = {
30     251,
31     241,
32     239, 233,
33     229, 227, 223,
34     211,
35     199, 197, 193, 191,
36     181,
37     179, 173,
38     167, 163,
39     157, 151,
40     149,
41     139, 137, 131,
42     127,
43     113,
44     109, 107, 103, 101,
45     97,
46     89, 83,
47     79, 73, 71,
48     67, 61,
49     59, 53,
50     47, 43, 41,
51     37, 31,
52     29, 23,
53     19, 17, 13, 11,
54     7, 5, 3, 2};
55 
56 static const size_t MAX_PRIMES = sizeof(primeNumbers) / sizeof(primeNumbers[0]);
57 
58 // Recursive template function to test prime factors
59 template <uint32_t primeIndex>
factor(size_t workItems,uint32_t workSize,uint32_t maxWorkGroupSize)60 static inline uint32_t factor(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
61     auto primeNumber = primeNumbers[primeIndex];
62 
63     auto newWorkSize = workSize * primeNumber;
64     if (newWorkSize <= workItems) {
65         while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
66             workSize = newWorkSize;
67             newWorkSize = workSize * primeNumber;
68         }
69 
70         workSize = factor<primeIndex - 1>(workItems, workSize, maxWorkGroupSize);
71     }
72 
73     return workSize;
74 }
75 
76 // Terminator of recursive factoring logic
77 template <>
factor(size_t workItems,uint32_t workSize,uint32_t maxWorkGroupSize)78 inline uint32_t factor<0>(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
79     uint32_t primeIndex = 0;
80     auto primeNumber = primeNumbers[primeIndex];
81 
82     auto newWorkSize = workSize * primeNumber;
83     if (newWorkSize <= workItems) {
84         while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
85             workSize = newWorkSize;
86             newWorkSize = workSize * primeNumber;
87         }
88     }
89 
90     return workSize;
91 }
92 
computePowerOfTwoLWS(const size_t workItems[3],WorkSizeInfo & workGroupInfo,size_t workGroupSize[3],const uint32_t workDim,bool canUseNx4)93 void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo, size_t workGroupSize[3], const uint32_t workDim, bool canUseNx4) {
94     uint32_t targetIndex = (canUseNx4 || workGroupInfo.numThreadsPerSubSlice < highThreadCountThreshold) ? 2 : 0;
95     auto arraySize = arrayCount(optimalHardwareThreadCountGeneric);
96     auto simdSize = workGroupInfo.simdSize;
97 
98     while (targetIndex < arraySize &&
99            optimalHardwareThreadCountGeneric[targetIndex] > 1 &&
100            workGroupInfo.maxWorkGroupSize < optimalHardwareThreadCountGeneric[targetIndex] * simdSize) {
101         targetIndex++;
102     }
103     uint32_t optimalLocalThreads = optimalHardwareThreadCountGeneric[targetIndex];
104 
105     if (workDim == 2) {
106         uint32_t xDim, yDim;
107         xDim = uint32_t(optimalLocalThreads * simdSize) / (canUseNx4 ? 4 : 1);
108         while (xDim > workItems[0])
109             xDim = xDim >> 1;
110         yDim = canUseNx4 ? 4 : (uint32_t(optimalLocalThreads * simdSize) / xDim);
111         workGroupSize[0] = xDim;
112         workGroupSize[1] = yDim;
113     } else {
114         uint32_t xDim, yDim, zDim;
115         xDim = uint32_t(optimalLocalThreads * simdSize);
116         while (xDim > workItems[0])
117             xDim = xDim >> 1;
118         yDim = uint32_t(optimalLocalThreads * simdSize) / xDim;
119         while (yDim > workItems[1])
120             yDim = yDim >> 1;
121         UNRECOVERABLE_IF((xDim * yDim) == 0);
122         zDim = uint32_t(optimalLocalThreads * simdSize) / (xDim * yDim);
123         workGroupSize[0] = xDim;
124         workGroupSize[1] = yDim;
125         workGroupSize[2] = zDim;
126     }
127 }
128 
choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024],uint32_t xyzFactorsLen[3],size_t workGroupSize[3],const size_t workItems[3],WorkSizeInfo & wsInfo)129 void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo) {
130     float ratioDiff = 0;
131     float localRatio = float(0xffffffff);
132     uint64_t localWkgs = 0xffffffff;
133     uint64_t workGroups;
134     for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) {
135         for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) {
136 
137             uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx];
138             uint32_t Ydim = xyzFactors[1][YFactorsIdx];
139 
140             if ((Xdim * Ydim) > wsInfo.maxWorkGroupSize) {
141                 break;
142             }
143             if ((Xdim * Ydim) < wsInfo.minWorkGroupSize) {
144                 continue;
145             }
146 
147             workGroups = Math::divideAndRoundUp(workItems[0], Xdim);
148             workGroups *= Math::divideAndRoundUp(workItems[1], Ydim);
149 
150             ratioDiff = log((float)Xdim) - log((float)Ydim);
151             ratioDiff = fabs(wsInfo.targetRatio - ratioDiff);
152 
153             if (wsInfo.useStrictRatio == true) {
154                 if (ratioDiff < localRatio) {
155                     workGroupSize[0] = Xdim;
156                     workGroupSize[1] = Ydim;
157                     localRatio = ratioDiff;
158                     localWkgs = workGroups;
159                 }
160             } else {
161                 if ((workGroups < localWkgs) ||
162                     ((workGroups == localWkgs) && (ratioDiff < localRatio))) {
163                     workGroupSize[0] = Xdim;
164                     workGroupSize[1] = Ydim;
165                     localRatio = ratioDiff;
166                     localWkgs = workGroups;
167                 }
168             }
169         }
170     }
171 }
choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024],uint32_t xyzFactorsLen[3],size_t workGroupSize[3],const size_t workItems[3],WorkSizeInfo & wsInfo,uint32_t workdim)172 void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, uint32_t workdim) {
173     uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
174     uint64_t workGroups;
175     for (uint32_t ZFactorsIdx = 0; ZFactorsIdx < xyzFactorsLen[2]; ++ZFactorsIdx) {
176         for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) {
177             for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) {
178 
179                 uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx];
180                 uint32_t Ydim = xyzFactors[1][YFactorsIdx];
181                 uint32_t Zdim = xyzFactors[2][ZFactorsIdx];
182 
183                 if ((Xdim * Ydim * Zdim) > wsInfo.maxWorkGroupSize) {
184                     break;
185                 }
186                 if ((Xdim * Ydim * Zdim) < wsInfo.minWorkGroupSize) {
187                     continue;
188                 }
189 
190                 workGroups = Math::divideAndRoundUp(workItems[0], Xdim);
191                 workGroups *= Math::divideAndRoundUp(workItems[1], Ydim);
192                 workGroups *= Math::divideAndRoundUp(workItems[2], Zdim);
193                 uint64_t euThrdsDispatched;
194 
195                 euThrdsDispatched = Math::divideAndRoundUp(Xdim * Ydim * Zdim, wsInfo.simdSize);
196                 euThrdsDispatched *= workGroups;
197 
198                 if (euThrdsDispatched < localEuThrdsDispatched) {
199                     localEuThrdsDispatched = euThrdsDispatched;
200                     workGroupSize[0] = Xdim;
201                     workGroupSize[1] = Ydim;
202                     workGroupSize[2] = Zdim;
203                 }
204             }
205         }
206     }
207 }
208 
setSpecialWorkgroupSize(size_t workgroupSize[3])209 void setSpecialWorkgroupSize(size_t workgroupSize[3]) {
210     workgroupSize[0] = 1;
211     workgroupSize[1] = 1;
212     workgroupSize[2] = 1;
213 }
214 
computeWorkgroupSize1D(uint32_t maxWorkGroupSize,size_t workGroupSize[3],const size_t workItems[3],size_t simdSize)215 void computeWorkgroupSize1D(uint32_t maxWorkGroupSize,
216                             size_t workGroupSize[3],
217                             const size_t workItems[3],
218                             size_t simdSize) {
219     auto items = workItems[0];
220 
221     // Determine the LSB set to quickly handle factors of 2
222     auto numBits = Math::getMinLsbSet(static_cast<uint32_t>(items));
223 
224     // Clamp power of 2 result to maxWorkGroupSize
225     uint32_t workSize = 1u << numBits;
226 
227     //Assumes maxWorkGroupSize is a power of two.
228     DEBUG_BREAK_IF((maxWorkGroupSize & (maxWorkGroupSize - 1)) != 0);
229     workSize = std::min(workSize, maxWorkGroupSize);
230 
231     // Try all primes as potential factors
232     workSize = factor<MAX_PRIMES - 1>(items, workSize, maxWorkGroupSize);
233 
234     workGroupSize[0] = workSize;
235     workGroupSize[1] = 1;
236     workGroupSize[2] = 1;
237 }
238 
computeWorkgroupSize2D(uint32_t maxWorkGroupSize,size_t workGroupSize[3],const size_t workItems[3],size_t simdSize)239 void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) {
240     uint32_t xFactors[1024];
241     uint32_t yFactors[1024];
242     uint32_t xFactorsLen = 0;
243     uint32_t yFactorsLen = 0;
244     uint64_t waste;
245     uint64_t localWSWaste = 0xffffffffffffffff;
246     uint64_t euThrdsDispatched;
247     uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
248     uint64_t workGroups;
249     uint32_t xDim;
250     uint32_t yDim;
251 
252     for (int i = 0; i < 3; i++)
253         workGroupSize[i] = 1;
254 
255     for (uint32_t i = 2; i <= maxWorkGroupSize; i++) {
256         if ((workItems[0] % i) == 0) {
257             xFactors[xFactorsLen++] = i;
258         }
259         if (((workItems[1] % i) == 0)) {
260             yFactors[yFactorsLen++] = i;
261         }
262     }
263 
264     for (uint32_t xFactorsIdx = 0; xFactorsIdx < xFactorsLen; ++xFactorsIdx) {
265         for (uint32_t yFactorsIdx = 0; yFactorsIdx < yFactorsLen; ++yFactorsIdx) {
266             // Pick a LocalWorkSize that is a multiple as well as appropriate:
267             // 1 <= workGroupSize[ 0 ] <= workItems[ 0 ]
268             // 1 <= workGroupSize[ 1 ] <= workItems[ 1 ]
269             xDim = xFactors[xFactorsLen - 1 - xFactorsIdx];
270             yDim = yFactors[yFactorsIdx];
271 
272             if ((xDim * yDim) > maxWorkGroupSize) {
273                 // The yDim value is too big, so break out of this loop.
274                 // No other entries will work.
275                 break;
276             }
277 
278             // Find the wasted channels.
279             workGroups = Math::divideAndRoundUp(workItems[0], xDim);
280             workGroups *= Math::divideAndRoundUp(workItems[1], yDim);
281 
282             // Compaction Mode!
283             euThrdsDispatched = Math::divideAndRoundUp(xDim * yDim, simdSize);
284             euThrdsDispatched *= workGroups;
285 
286             waste = simdSize - ((xDim * yDim - 1) & (simdSize - 1));
287             waste *= workGroups;
288 
289             if (((euThrdsDispatched < localEuThrdsDispatched) ||
290                  ((euThrdsDispatched == localEuThrdsDispatched) && (waste < localWSWaste)))) {
291                 localWSWaste = waste;
292                 localEuThrdsDispatched = euThrdsDispatched;
293                 workGroupSize[0] = xDim;
294                 workGroupSize[1] = yDim;
295             }
296         }
297     }
298 }
299 
computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize,size_t workGroupSize[3],const size_t workItems[3],size_t simdSize,const uint32_t workDim)300 void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
301     for (int i = 0; i < 3; i++)
302         workGroupSize[i] = 1;
303     size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
304     for (auto i = 0u; i < workDim; i++) {
305         uint32_t requiredWorkItemsCount = maxWorkGroupSize;
306         while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
307             requiredWorkItemsCount >>= 1;
308         itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
309     }
310     if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) {
311         while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) {
312             if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1])
313                 itemsPowerOfTwoDivisors[0] >>= 1;
314             else
315                 itemsPowerOfTwoDivisors[1] >>= 1;
316         }
317         for (auto i = 0u; i < 3; i++)
318             workGroupSize[i] = itemsPowerOfTwoDivisors[i];
319         return;
320 
321     } else if (workItems[0] * workItems[1] > maxWorkGroupSize) {
322         computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
323         return;
324     } else {
325         for (auto i = 0u; i < workDim; i++)
326             workGroupSize[i] = workItems[i];
327         return;
328     }
329 }
330 
computeWorkgroupSizeND(WorkSizeInfo & wsInfo,size_t workGroupSize[3],const size_t workItems[3],const uint32_t workDim)331 void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
332     for (int i = 0; i < 3; i++)
333         workGroupSize[i] = 1;
334 
335     uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2];
336 
337     UNRECOVERABLE_IF(wsInfo.simdSize == 0);
338 
339     //Find biggest power of two which devide each dimension size
340     if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
341         if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) {
342             computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
343             return;
344         }
345 
346         size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
347         for (auto i = 0u; i < workDim; i++) {
348             uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
349             while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
350                 requiredWorkItemsCount >>= 1;
351             itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
352         }
353 
354         bool canUseNx4 = (wsInfo.imgUsed &&
355                           (itemsPowerOfTwoDivisors[0] >= 4 || (itemsPowerOfTwoDivisors[0] >= 2 && wsInfo.simdSize == 8)) &&
356                           itemsPowerOfTwoDivisors[1] >= 4);
357 
358         //If computed dimension sizes which are powers of two are creating group which is
359         //bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it
360         uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
361         if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) {
362             computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
363             return;
364         }
365         //If coputed workgroup is at this point in correct size
366         else if (allItems >= wsInfo.simdSize) {
367             itemsPowerOfTwoDivisors[1] = canUseNx4 ? 4 : itemsPowerOfTwoDivisors[1];
368             for (auto i = 0u; i < workDim; i++)
369                 workGroupSize[i] = itemsPowerOfTwoDivisors[i];
370             return;
371         }
372     }
373     //If dimensions are not powers of two but total number of items is less than max work group size
374     if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) {
375         for (auto i = 0u; i < workDim; i++)
376             workGroupSize[i] = workItems[i];
377         return;
378     } else {
379         if (workDim == 1)
380             computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize);
381         else {
382             uint32_t xyzFactors[3][1024];
383             uint32_t xyzFactorsLen[3] = {};
384 
385             //check if algorithm should use ratio
386             wsInfo.checkRatio(workItems);
387 
388             //find all divisors for all dimensions
389             for (int i = 0; i < 3; i++)
390                 xyzFactors[i][xyzFactorsLen[i]++] = 1;
391             for (auto i = 0u; i < workDim; i++) {
392                 for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) {
393                     if ((workItems[i] % j) == 0) {
394                         xyzFactors[i][xyzFactorsLen[i]++] = j;
395                     }
396                 }
397             }
398             if (wsInfo.useRatio) {
399                 choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo);
400                 if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) {
401                     wsInfo.useStrictRatio = false;
402                     choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo);
403                 }
404             } else
405                 choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, workDim);
406         }
407     }
408 }
409 
computeWorkgroupsNumber(const Vec3<size_t> & gws,const Vec3<size_t> & lws)410 Vec3<size_t> computeWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
411     return (Vec3<size_t>(gws.x / lws.x + ((gws.x % lws.x) ? 1 : 0),
412                          gws.y / lws.y + ((gws.y % lws.y) ? 1 : 0),
413                          gws.z / lws.z + ((gws.z % lws.z) ? 1 : 0)));
414 }
415 
generateWorkgroupsNumber(const Vec3<size_t> & gws,const Vec3<size_t> & lws)416 Vec3<size_t> generateWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
417     return (lws.x > 0) ? computeWorkgroupsNumber(gws, lws) : Vec3<size_t>(0, 0, 0);
418 }
419 
canonizeWorkgroup(const Vec3<size_t> & workgroup)420 Vec3<size_t> canonizeWorkgroup(const Vec3<size_t> &workgroup) {
421     return ((workgroup.x > 0) ? Vec3<size_t>({workgroup.x, std::max(workgroup.y, static_cast<size_t>(1)), std::max(workgroup.z, static_cast<size_t>(1))})
422                               : Vec3<size_t>(0, 0, 0));
423 }
424 
425 } // namespace NEO
426