1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 #include <stdlib.h>
19 #include <string.h>
20 #include <clblas_stddef.h>
21
22 #include "matrix_dims.h"
23 #include "problem_iter.h"
24 #include "solution_assert.h"
25 #include "solution_seq.h"
26
27 bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole);
28 void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step);
29
30 static cl_int
31 enqueueKernel(
32 SolutionStep *step,
33 const Kernel *kernel,
34 cl_uint numEventsInWaitList,
35 const cl_event *eventWaitList,
36 cl_event *event);
37
38 static void
39 splitSolutionStep(
40 SolutionStep *rem,
41 SolutionStep *cut,
42 SDimComponent component,
43 size_t chunk,
44 bool backward);
45
46 static cl_int
47 executeImageStep(
48 SolutionStep *step,
49 cl_uint numEventsInWaitList,
50 const cl_event *eventWaitList,
51 cl_event *event);
52
53 void
freeSolutionSeq(ListHead * seq)54 freeSolutionSeq(ListHead *seq)
55 {
56 listDoForEachSafe(seq, freeSolutionStep);
57 listInitHead(seq);
58 }
59
60 cl_int
executeSolutionSeq(const ListHead * seq)61 executeSolutionSeq(const ListHead *seq)
62 {
63 cl_int err = CL_SUCCESS;
64 ListNode *i;
65 SolutionStep *step;
66
67
68 /* Enqueue computing kernels */
69 for (i = listNodeFirst(seq); (i != seq) && (err == CL_SUCCESS);
70 i = i->next) {
71
72 step = container_of(i, node, SolutionStep);
73 if (step->cmdQueue == NULL) {
74 continue;
75 }
76
77 if (step->args.scimage[0]) {
78 err = executeImageStep(step, step->numEventsInWaitList,
79 step->eventWaitList, step->event);
80 }
81 else {
82 #ifdef DEBUG_2
83 printf("enqueueKernel from executreSolutionSeq...\n");
84 #endif
85
86 err = enqueueKernel(step,
87 step->kernels[CLBLAS_COMPUTING_KERNEL],
88 step->numEventsInWaitList, step->eventWaitList,
89 step->event);
90 }
91 }
92
93 return err;
94 }
95
96 /* private functions */
97
98 void VISIBILITY_HIDDEN
freeSolutionStep(ListNode * node)99 freeSolutionStep(ListNode *node)
100 {
101 SolutionStep *step = container_of(node, node, SolutionStep);
102 int i;
103
104 for (i = 0; i < MAX_CLBLAS_KERNELS_PER_STEP; i++) {
105 if (step->kernels[i] != NULL) {
106 putKernel(clblasKernelCache, step->kernels[i]);
107 }
108 }
109 releaseStepImgs(step);
110 free(step);
111 }
112
113 static cl_int
enqueueKernel(SolutionStep * step,const Kernel * kernel,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)114 enqueueKernel(
115 SolutionStep *step,
116 const Kernel *kernel,
117 cl_uint numEventsInWaitList,
118 const cl_event *eventWaitList,
119 cl_event *event)
120 {
121 cl_int err;
122 KernelDesc kernelDesc;
123 KernelErrorInfo errInfo;
124 MemoryPattern *pattern;
125 const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)kernel->extra;
126 SubproblemDim subdims[MAX_SUBDIMS];
127
128 step->args.kernType = kextra->kernType;
129 pattern = &clblasSolvers[step->funcID].memPatterns[step->patternID];
130 kernelDesc.workDim = step->pgran.wgDim;
131
132 memcpy(subdims, step->subdims, sizeof(step->subdims));
133
134 if(NULL==pattern->sops->calcThreads)
135 {
136 SubproblemDim globDim;
137 const PGranularity *pgran;
138
139 pgran = (pattern->nrLevels == 1) ? NULL : &step->pgran;
140 kargsToProbDims(&globDim, step->funcID, &step->args, false);
141
142 // fixup dimensions in respect with desired work dispatch order
143 if ((step->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) {
144 if (pattern->sops->innerDecompositionAxis(&step->args) ==
145 DECOMP_AXIS_X) {
146
147 /*
148 * these dimensions will not used more anywhere, so we can
149 * just swap them
150 */
151 swapDimXY(&subdims[0]);
152 swapDimXY(&subdims[1]);
153 swapDimXY(&globDim);
154 }
155 }
156
157 calcGlobalThreads(kernelDesc.globalThreads, subdims,
158 pgran, globDim.y, globDim.x);
159 }
160 else
161 {
162 #ifdef DEBUG_2
163 printf("calcThreads is defined\n");
164 #endif
165
166 pattern->sops->calcThreads( kernelDesc.globalThreads,
167 subdims,
168 &step->pgran,
169 &step->args,
170 kextra);
171 }
172
173 //
174 // Store the numWGSpawned for this kernel
175 // This size can be used by sequence-steps down the line
176 // e.g. Reduction of intermediate results of each work group
177 //
178 step->pgran.numWGSpawned[0] = kernelDesc.globalThreads[0] / step->pgran.wgSize[0];
179 step->pgran.numWGSpawned[1] = kernelDesc.globalThreads[1] / step->pgran.wgSize[1];
180
181 kernelDesc.localThreads[0] = step->pgran.wgSize[0];
182 kernelDesc.localThreads[1] = step->pgran.wgSize[1];
183 kernelDesc.workDim = step->pgran.wgDim;
184 kernelDesc.waitListSize = numEventsInWaitList;
185 kernelDesc.eventWaitList = eventWaitList;
186 kernelDesc.nowait = 1;
187 kernelDesc.event = event;
188 kernelDesc.needExecTime = 0;
189
190 memset(kernelDesc.args, 0, sizeof(KernelArg) * MAX_KERNEL_ARGS);
191 pattern->sops->assignKargs(kernelDesc.args, (const void*)&(step->args),
192 kextra);
193
194 errInfo.wrongArg = 0;
195 errInfo.phase = 0;
196
197 /*
198 * TODO: log launchClKernel errors
199 */
200 dumpKernel(step, kextra->kernType);
201
202 err = clCreateKernelsInProgram(kernel->program, 1, &kernelDesc.kernel,
203 NULL);
204 if (err == CL_SUCCESS) {
205 err = launchClKernel(&kernelDesc, step->cmdQueue, &errInfo);
206 clReleaseKernel(kernelDesc.kernel);
207 }
208
209 return err;
210 }
211
212 bool VISIBILITY_HIDDEN
isMatrixInImage(MemoryPattern * pattern,MatrixRole mrole)213 isMatrixInImage(
214 MemoryPattern *pattern,
215 MatrixRole mrole)
216 {
217 const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra;
218 bool ret = false;
219
220 if (extra != NULL) {
221 switch (mrole) {
222 case MATRIX_A:
223 ret = (extra->mobjA == CLMEM_IMAGE);
224 break;
225 case MATRIX_B:
226 ret = (extra->mobjB == CLMEM_IMAGE);
227 break;
228 default:
229 break;
230 }
231 }
232
233 return ret;
234 }
235
236 void VISIBILITY_HIDDEN
releaseStepImgs(SolutionStep * step)237 releaseStepImgs(SolutionStep *step)
238 {
239 int i;
240 cl_mem *imgs = step->args.scimage;
241 cl_device_id devID = NULL;;
242
243 for (i = 0; (i < 2) && (imgs[i] != NULL); i++) {
244 if (devID == NULL) {
245 getQueueDevice(step->cmdQueue, &devID);
246 }
247 putSCImage(devID, imgs[i]);
248 imgs[i] = NULL; //to avoid double release
249 }
250 }
251
252 static cl_int
executeImageStep(SolutionStep * step,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)253 executeImageStep(
254 SolutionStep *step,
255 cl_uint numEventsInWaitList,
256 const cl_event *eventWaitList,
257 cl_event *event)
258 {
259 SolutionStep outerStep, innerStep, execStep;
260 cl_int err = CL_SUCCESS;
261 int currImg = 0;
262 size_t imgWidth, imgHeight;
263 size_t ha, hb;
264 size_t maxPanels[MATRIX_ROLES_NUMBER], maxBlocks[MATRIX_ROLES_NUMBER];
265 size_t off;
266 SubproblemDim wholeDim;
267 MatrixRole mrole;
268 CLBlasKargs *kargs = &step->args;
269 cl_mem *imgs = kargs->scimage;
270 MemoryPattern *mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID];
271 ProblemIterator innerIter, outerIter;
272 int oend = 0, iend;
273 SDimComponent comp[2];
274 bool backward;
275 ListHead doneSteps;
276 CLBlasKernelType ktype;
277
278 kargsToProbDims(&wholeDim, step->funcID, kargs, false);
279 memset(maxPanels, 0, sizeof(maxPanels));
280 memset(maxBlocks, 0, sizeof(maxPanels));
281
282 memcpy(&outerStep, step, sizeof(SolutionStep));
283 memcpy(&execStep, step, sizeof(SolutionStep));
284 listInitHead(&doneSteps);
285
286 /*
287 * Cover the whole problem with dimension which matrix blocks are
288 * fitted to images at.
289 */
290
291 for (mrole = MATRIX_A; mrole < MATRIX_C; mrole++) {
292 if (!isMatrixInImage(mempat, mrole)) {
293 continue;
294 }
295
296 clGetImageInfo(imgs[currImg], CL_IMAGE_WIDTH, sizeof(imgWidth),
297 &imgWidth, NULL);
298 clGetImageInfo(imgs[currImg], CL_IMAGE_HEIGHT, sizeof(imgHeight),
299 &imgHeight, NULL);
300
301 if (step->funcID == CLBLAS_TRSM) {
302 maxPanels[mrole] = 0;
303 maxBlocks[mrole] = 0;
304 } else {
305 maxPanels[mrole] = imgHeight / matrBlockHeight(step->subdims, mrole,
306 clblasLeft);
307 }
308 currImg++;
309 }
310
311 /*
312 * for GEMM function we can take both the matrices as outer, it depends on
313 * their sizes and image sizes
314 */
315 if (step->funcID == CLBLAS_GEMM) {
316 size_t dx, dy;
317
318 // FIXME: check which of them use really an image
319
320 ha = matrBlockHeight(&wholeDim, MATRIX_A, clblasLeft);
321 hb = matrBlockHeight(&wholeDim, MATRIX_B, clblasLeft);
322
323 dx = maxPanels[MATRIX_B] * matrBlockHeight(step->subdims, MATRIX_B,
324 clblasLeft);
325 dy = maxPanels[MATRIX_A] * matrBlockHeight(step->subdims, MATRIX_A,
326 clblasLeft);
327
328 // hb + (hb*ha)/dx < ha + (ha*hb)/dy
329 if ((hb / ha) < (1 + hb / dy) / (1 + ha / dx)) {
330 mrole = MATRIX_B;
331 }
332 else {
333 mrole = MATRIX_A;
334 }
335 }
336 else {
337 mrole = MATRIX_B;
338 }
339 /*
340 * Let's cover the whole image based step.
341 * Pattern iterator is used for traversing
342 */
343 initProblemIterator(&outerIter, step->funcID, mrole, kargs,
344 maxPanels[mrole], maxBlocks[mrole], step->subdims);
345 if (mrole == MATRIX_B) {
346 comp[0] = SDIM_X;
347 comp[1] = SDIM_Y;
348 mrole = MATRIX_A;
349 }
350 else {
351 comp[0] = SDIM_Y;
352 comp[1] = SDIM_X;
353 mrole = MATRIX_B;
354 }
355 initProblemIterator(&innerIter, step->funcID, mrole,
356 kargs, maxPanels[mrole], maxBlocks[mrole],
357 step->subdims);
358 backward = isIterBackward(&innerIter);
359
360 /*
361 * Difference in overflowing checking in the outer and inner loops
362 * is due to
363 */
364 do {
365 iteratorReset(&innerIter);
366 iend = 0;
367 oend = iterateProblem(&outerIter);
368 off = iterLastOffset(&outerIter);
369
370 splitSolutionStep(&outerStep, &execStep, comp[0],
371 off, false);
372 if (execStep.funcID == CLBLAS_GEMM) {
373 fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0);
374 }
375
376 memcpy(&innerStep, &execStep, sizeof(SolutionStep));
377
378 ktype = (comp[0] == SDIM_Y) ? CLBLAS_PREP_A_KERNEL :
379 CLBLAS_PREP_B_KERNEL;
380
381 if (execStep.kernels[ktype] != NULL) {
382 err = enqueueKernel(&execStep, execStep.kernels[ktype],
383 numEventsInWaitList, eventWaitList, event);
384 if (err != CL_SUCCESS) {
385 break;
386 }
387 }
388
389 do {
390 iend = iterateProblem(&innerIter);
391 off = iterLastOffset(&innerIter);
392 splitSolutionStep(&innerStep, &execStep,
393 comp[1], off, backward);
394 if (execStep.funcID == CLBLAS_GEMM) {
395 fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0);
396 }
397
398 assertImageSubstep(step, &execStep, &doneSteps);
399
400 ktype = (comp[1] == SDIM_Y) ? CLBLAS_PREP_A_KERNEL :
401 CLBLAS_PREP_B_KERNEL;
402 if (execStep.kernels[ktype] != NULL) {
403 err = enqueueKernel(&execStep, execStep.kernels[ktype],
404 numEventsInWaitList, eventWaitList, event);
405 }
406 if (err == CL_SUCCESS) {
407 err = enqueueKernel(&execStep,
408 execStep.kernels[CLBLAS_COMPUTING_KERNEL],
409 numEventsInWaitList, eventWaitList,
410 event);
411 }
412 } while (!iend && (err == CL_SUCCESS));
413 } while (!oend && (err == CL_SUCCESS));
414
415 if (err == CL_SUCCESS) {
416 assertImageStep(step, &doneSteps);
417 }
418 releaseImageAssertion(&doneSteps);
419
420 return err;
421 }
422
423 static void
splitSolutionStep(SolutionStep * rem,SolutionStep * cut,SDimComponent component,size_t chunk,bool backward)424 splitSolutionStep(
425 SolutionStep *rem,
426 SolutionStep *cut,
427 SDimComponent component,
428 size_t chunk,
429 bool backward)
430 {
431 SubproblemDim remDim, cutDim;
432 SubproblemDim remDimOff, cutDimOff;
433
434 kargsToProbDims(&remDimOff, rem->funcID, &rem->args, true);
435 kargsToProbDims(&remDim, rem->funcID, &rem->args, false);
436 memcpy(&cutDim, &remDim, sizeof(SubproblemDim));
437 memcpy(&cutDimOff, &remDimOff, sizeof(SubproblemDim));
438
439 memcpy(cut, rem, sizeof(SolutionStep));
440 if (component == SDIM_Y) {
441 if (backward) {
442 cutDimOff.y += remDim.y - chunk;
443 }
444 else {
445 remDimOff.y += chunk;
446 }
447 cutDim.y = chunk;
448 remDim.y -= chunk;
449 }
450 else {
451 if (backward) {
452 cutDimOff.x += remDim.x - chunk;
453 }
454 else {
455 remDimOff.x += chunk;
456 }
457 cutDim.x = chunk;
458 remDim.x -= chunk;
459 }
460
461 probDimsToKargs(&rem->args, rem->funcID, &remDimOff, true);
462 probDimsToKargs(&rem->args, rem->funcID, &remDim, false);
463 probDimsToKargs(&cut->args, cut->funcID, &cutDimOff, true);
464 probDimsToKargs(&cut->args, cut->funcID, &cutDim, false);
465 }
466