/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/
16 
17 
18 #include <stdlib.h>
19 #include <string.h>
20 #include <clblas_stddef.h>
21 
22 #include "matrix_dims.h"
23 #include "problem_iter.h"
24 #include "solution_assert.h"
25 #include "solution_seq.h"
26 
27 bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole);
28 void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step);
29 
30 static cl_int
31 enqueueKernel(
32     SolutionStep *step,
33     const Kernel *kernel,
34     cl_uint numEventsInWaitList,
35     const cl_event *eventWaitList,
36     cl_event *event);
37 
38 static void
39 splitSolutionStep(
40     SolutionStep *rem,
41     SolutionStep *cut,
42     SDimComponent component,
43     size_t chunk,
44     bool backward);
45 
46 static cl_int
47 executeImageStep(
48     SolutionStep *step,
49     cl_uint numEventsInWaitList,
50     const cl_event *eventWaitList,
51     cl_event *event);
52 
53 void
freeSolutionSeq(ListHead * seq)54 freeSolutionSeq(ListHead *seq)
55 {
56     listDoForEachSafe(seq, freeSolutionStep);
57     listInitHead(seq);
58 }
59 
60 cl_int
executeSolutionSeq(const ListHead * seq)61 executeSolutionSeq(const ListHead *seq)
62 {
63     cl_int err = CL_SUCCESS;
64     ListNode *i;
65     SolutionStep *step;
66 
67 
68     /* Enqueue computing kernels */
69     for (i = listNodeFirst(seq); (i != seq) && (err == CL_SUCCESS);
70          i = i->next) {
71 
72         step = container_of(i, node, SolutionStep);
73         if (step->cmdQueue == NULL) {
74             continue;
75         }
76 
77         if (step->args.scimage[0]) {
78             err = executeImageStep(step, step->numEventsInWaitList,
79                                    step->eventWaitList, step->event);
80         }
81         else {
82 			#ifdef DEBUG_2
83 			printf("enqueueKernel from executreSolutionSeq...\n");
84 			#endif
85 
86             err = enqueueKernel(step,
87                                 step->kernels[CLBLAS_COMPUTING_KERNEL],
88                                 step->numEventsInWaitList, step->eventWaitList,
89                                 step->event);
90         }
91     }
92 
93     return err;
94 }
95 
96 /* private functions */
97 
98 void VISIBILITY_HIDDEN
freeSolutionStep(ListNode * node)99 freeSolutionStep(ListNode *node)
100 {
101     SolutionStep *step = container_of(node, node, SolutionStep);
102     int i;
103 
104     for (i = 0; i < MAX_CLBLAS_KERNELS_PER_STEP; i++) {
105         if (step->kernels[i] != NULL) {
106             putKernel(clblasKernelCache, step->kernels[i]);
107         }
108     }
109     releaseStepImgs(step);
110     free(step);
111 }
112 
113 static cl_int
enqueueKernel(SolutionStep * step,const Kernel * kernel,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)114 enqueueKernel(
115     SolutionStep *step,
116     const Kernel *kernel,
117     cl_uint numEventsInWaitList,
118     const cl_event *eventWaitList,
119     cl_event *event)
120 {
121     cl_int err;
122     KernelDesc kernelDesc;
123     KernelErrorInfo errInfo;
124     MemoryPattern *pattern;
125     const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)kernel->extra;
126     SubproblemDim subdims[MAX_SUBDIMS];
127 
128     step->args.kernType = kextra->kernType;
129     pattern = &clblasSolvers[step->funcID].memPatterns[step->patternID];
130     kernelDesc.workDim = step->pgran.wgDim;
131 
132     memcpy(subdims, step->subdims, sizeof(step->subdims));
133 
134     if(NULL==pattern->sops->calcThreads)
135     {
136         SubproblemDim globDim;
137         const PGranularity *pgran;
138 
139         pgran = (pattern->nrLevels == 1) ? NULL : &step->pgran;
140         kargsToProbDims(&globDim, step->funcID, &step->args, false);
141 
142         // fixup dimensions in respect with desired work dispatch order
143         if ((step->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) {
144             if (pattern->sops->innerDecompositionAxis(&step->args) ==
145                 DECOMP_AXIS_X) {
146 
147                 /*
148                  * these dimensions will not used more anywhere, so we can
149                  * just swap them
150                  */
151                 swapDimXY(&subdims[0]);
152                 swapDimXY(&subdims[1]);
153                 swapDimXY(&globDim);
154             }
155         }
156 
157         calcGlobalThreads(kernelDesc.globalThreads, subdims,
158                           pgran, globDim.y, globDim.x);
159     }
160     else
161     {
162 		#ifdef DEBUG_2
163 		printf("calcThreads is defined\n");
164 		#endif
165 
166 		pattern->sops->calcThreads(	kernelDesc.globalThreads,
167 									subdims,
168 									&step->pgran,
169 									&step->args,
170 									kextra);
171     }
172 
173     //
174     // Store the numWGSpawned for this kernel
175     // This size can be used by sequence-steps down the line
176     // e.g. Reduction of intermediate results of each work group
177     //
178     step->pgran.numWGSpawned[0] = kernelDesc.globalThreads[0] / step->pgran.wgSize[0];
179     step->pgran.numWGSpawned[1] = kernelDesc.globalThreads[1] / step->pgran.wgSize[1];
180 
181     kernelDesc.localThreads[0] = step->pgran.wgSize[0];
182     kernelDesc.localThreads[1] = step->pgran.wgSize[1];
183     kernelDesc.workDim = step->pgran.wgDim;
184     kernelDesc.waitListSize = numEventsInWaitList;
185     kernelDesc.eventWaitList = eventWaitList;
186     kernelDesc.nowait = 1;
187     kernelDesc.event = event;
188     kernelDesc.needExecTime = 0;
189 
190     memset(kernelDesc.args, 0, sizeof(KernelArg) * MAX_KERNEL_ARGS);
191     pattern->sops->assignKargs(kernelDesc.args, (const void*)&(step->args),
192                                kextra);
193 
194     errInfo.wrongArg = 0;
195     errInfo.phase = 0;
196 
197     /*
198      * TODO: log launchClKernel errors
199      */
200     dumpKernel(step, kextra->kernType);
201 
202     err = clCreateKernelsInProgram(kernel->program, 1, &kernelDesc.kernel,
203                                    NULL);
204     if (err == CL_SUCCESS) {
205         err = launchClKernel(&kernelDesc, step->cmdQueue, &errInfo);
206         clReleaseKernel(kernelDesc.kernel);
207     }
208 
209     return err;
210 }
211 
212 bool VISIBILITY_HIDDEN
isMatrixInImage(MemoryPattern * pattern,MatrixRole mrole)213 isMatrixInImage(
214     MemoryPattern *pattern,
215     MatrixRole mrole)
216 {
217     const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra;
218     bool ret = false;
219 
220     if (extra != NULL) {
221         switch (mrole) {
222         case MATRIX_A:
223             ret = (extra->mobjA == CLMEM_IMAGE);
224             break;
225         case MATRIX_B:
226             ret = (extra->mobjB == CLMEM_IMAGE);
227             break;
228         default:
229             break;
230         }
231     }
232 
233     return ret;
234 }
235 
236 void VISIBILITY_HIDDEN
releaseStepImgs(SolutionStep * step)237 releaseStepImgs(SolutionStep *step)
238 {
239     int i;
240     cl_mem *imgs = step->args.scimage;
241     cl_device_id devID = NULL;;
242 
243     for (i = 0; (i < 2) && (imgs[i] != NULL); i++) {
244         if (devID == NULL) {
245             getQueueDevice(step->cmdQueue, &devID);
246         }
247         putSCImage(devID, imgs[i]);
248         imgs[i] = NULL; //to avoid double release
249     }
250 }
251 
252 static cl_int
executeImageStep(SolutionStep * step,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)253 executeImageStep(
254     SolutionStep *step,
255     cl_uint numEventsInWaitList,
256     const cl_event *eventWaitList,
257     cl_event *event)
258 {
259     SolutionStep outerStep, innerStep, execStep;
260     cl_int err = CL_SUCCESS;
261     int currImg = 0;
262     size_t imgWidth, imgHeight;
263     size_t ha, hb;
264     size_t maxPanels[MATRIX_ROLES_NUMBER], maxBlocks[MATRIX_ROLES_NUMBER];
265     size_t off;
266     SubproblemDim wholeDim;
267     MatrixRole mrole;
268     CLBlasKargs *kargs = &step->args;
269     cl_mem *imgs = kargs->scimage;
270     MemoryPattern *mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID];
271     ProblemIterator innerIter, outerIter;
272     int oend = 0, iend;
273     SDimComponent comp[2];
274     bool backward;
275     ListHead doneSteps;
276     CLBlasKernelType ktype;
277 
278     kargsToProbDims(&wholeDim, step->funcID, kargs, false);
279     memset(maxPanels, 0, sizeof(maxPanels));
280     memset(maxBlocks, 0, sizeof(maxPanels));
281 
282     memcpy(&outerStep, step, sizeof(SolutionStep));
283     memcpy(&execStep, step, sizeof(SolutionStep));
284     listInitHead(&doneSteps);
285 
286     /*
287      * Cover the whole problem with dimension which matrix blocks are
288      * fitted to images at.
289      */
290 
291     for (mrole = MATRIX_A; mrole < MATRIX_C; mrole++) {
292         if (!isMatrixInImage(mempat, mrole)) {
293             continue;
294         }
295 
296         clGetImageInfo(imgs[currImg], CL_IMAGE_WIDTH, sizeof(imgWidth),
297                        &imgWidth, NULL);
298         clGetImageInfo(imgs[currImg], CL_IMAGE_HEIGHT, sizeof(imgHeight),
299                        &imgHeight, NULL);
300 
301         if (step->funcID == CLBLAS_TRSM) {
302             maxPanels[mrole] = 0;
303             maxBlocks[mrole] = 0;
304         } else {
305             maxPanels[mrole] = imgHeight / matrBlockHeight(step->subdims, mrole,
306                                                            clblasLeft);
307         }
308         currImg++;
309     }
310 
311     /*
312      * for GEMM function we can take both the matrices as outer, it depends on
313      * their sizes and image sizes
314      */
315     if (step->funcID == CLBLAS_GEMM) {
316         size_t dx, dy;
317 
318         // FIXME: check which of them use really an image
319 
320         ha = matrBlockHeight(&wholeDim, MATRIX_A, clblasLeft);
321         hb = matrBlockHeight(&wholeDim, MATRIX_B, clblasLeft);
322 
323         dx = maxPanels[MATRIX_B] * matrBlockHeight(step->subdims, MATRIX_B,
324                                                    clblasLeft);
325         dy = maxPanels[MATRIX_A] * matrBlockHeight(step->subdims, MATRIX_A,
326                                                    clblasLeft);
327 
328         // hb + (hb*ha)/dx < ha + (ha*hb)/dy
329         if ((hb / ha) < (1 + hb / dy) / (1 + ha / dx)) {
330             mrole = MATRIX_B;
331         }
332         else {
333             mrole = MATRIX_A;
334         }
335     }
336     else {
337         mrole = MATRIX_B;
338     }
339     /*
340      * Let's cover the whole image based step.
341      * Pattern iterator is used for traversing
342      */
343     initProblemIterator(&outerIter, step->funcID, mrole, kargs,
344                         maxPanels[mrole], maxBlocks[mrole], step->subdims);
345     if (mrole == MATRIX_B) {
346         comp[0] = SDIM_X;
347         comp[1] = SDIM_Y;
348         mrole = MATRIX_A;
349     }
350     else {
351         comp[0] = SDIM_Y;
352         comp[1] = SDIM_X;
353         mrole = MATRIX_B;
354     }
355     initProblemIterator(&innerIter, step->funcID, mrole,
356                         kargs, maxPanels[mrole], maxBlocks[mrole],
357                         step->subdims);
358     backward = isIterBackward(&innerIter);
359 
360     /*
361      * Difference in overflowing checking in the outer and inner loops
362      * is due to
363      */
364     do {
365         iteratorReset(&innerIter);
366         iend = 0;
367         oend = iterateProblem(&outerIter);
368         off = iterLastOffset(&outerIter);
369 
370         splitSolutionStep(&outerStep, &execStep, comp[0],
371                                   off, false);
372         if (execStep.funcID == CLBLAS_GEMM) {
373             fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0);
374         }
375 
376         memcpy(&innerStep, &execStep, sizeof(SolutionStep));
377 
378         ktype = (comp[0] == SDIM_Y) ? CLBLAS_PREP_A_KERNEL :
379                                       CLBLAS_PREP_B_KERNEL;
380 
381         if (execStep.kernels[ktype] != NULL) {
382             err = enqueueKernel(&execStep, execStep.kernels[ktype],
383                                 numEventsInWaitList, eventWaitList, event);
384             if (err != CL_SUCCESS) {
385                  break;
386             }
387         }
388 
389         do {
390             iend = iterateProblem(&innerIter);
391             off = iterLastOffset(&innerIter);
392             splitSolutionStep(&innerStep, &execStep,
393                               comp[1], off, backward);
394             if (execStep.funcID == CLBLAS_GEMM) {
395                 fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0);
396             }
397 
398             assertImageSubstep(step, &execStep, &doneSteps);
399 
400             ktype = (comp[1] == SDIM_Y) ? CLBLAS_PREP_A_KERNEL :
401                                           CLBLAS_PREP_B_KERNEL;
402             if (execStep.kernels[ktype] != NULL) {
403                 err = enqueueKernel(&execStep, execStep.kernels[ktype],
404                                     numEventsInWaitList, eventWaitList, event);
405             }
406             if (err == CL_SUCCESS) {
407                 err = enqueueKernel(&execStep,
408                                     execStep.kernels[CLBLAS_COMPUTING_KERNEL],
409                                     numEventsInWaitList, eventWaitList,
410                                     event);
411             }
412         } while (!iend && (err == CL_SUCCESS));
413     } while (!oend && (err == CL_SUCCESS));
414 
415     if (err == CL_SUCCESS) {
416         assertImageStep(step, &doneSteps);
417     }
418     releaseImageAssertion(&doneSteps);
419 
420     return err;
421 }
422 
423 static void
splitSolutionStep(SolutionStep * rem,SolutionStep * cut,SDimComponent component,size_t chunk,bool backward)424 splitSolutionStep(
425     SolutionStep *rem,
426     SolutionStep *cut,
427     SDimComponent component,
428     size_t chunk,
429     bool backward)
430 {
431     SubproblemDim remDim, cutDim;
432     SubproblemDim remDimOff, cutDimOff;
433 
434     kargsToProbDims(&remDimOff, rem->funcID, &rem->args, true);
435     kargsToProbDims(&remDim, rem->funcID, &rem->args, false);
436     memcpy(&cutDim, &remDim, sizeof(SubproblemDim));
437     memcpy(&cutDimOff, &remDimOff, sizeof(SubproblemDim));
438 
439     memcpy(cut, rem, sizeof(SolutionStep));
440     if (component == SDIM_Y) {
441         if (backward) {
442             cutDimOff.y += remDim.y - chunk;
443         }
444         else {
445             remDimOff.y += chunk;
446         }
447         cutDim.y = chunk;
448         remDim.y -= chunk;
449     }
450     else {
451         if (backward) {
452             cutDimOff.x += remDim.x - chunk;
453         }
454         else {
455             remDimOff.x += chunk;
456         }
457         cutDim.x = chunk;
458         remDim.x -= chunk;
459     }
460 
461     probDimsToKargs(&rem->args, rem->funcID, &remDimOff, true);
462     probDimsToKargs(&rem->args, rem->funcID, &remDim, false);
463     probDimsToKargs(&cut->args, cut->funcID, &cutDimOff, true);
464     probDimsToKargs(&cut->args, cut->funcID, &cutDim, false);
465 }
466