1 /* ************************************************************************
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 #include<stdio.h>
18 #include<stdlib.h>
19 #include<limits.h>
20 #include<clBLAS.h>
21 
/*
 * Exchange the values of the two lvalues a and b, both of type TYPE.
 * Each argument appears twice in the expansion, so it must be free of
 * side effects. Arguments are parenthesized so that expressions such as
 * *p or arr[i] expand safely.
 */
#define SWAP(TYPE,a,b)  do { TYPE swap_tmp_ = (a) ; (a) = (b) ; (b) = swap_tmp_ ; } while(0)
23 
/*
 * Tell whether the rectangle starting at point (x,y) and of size (w,h)
 * lies entirely within a 2D array of size d1 x d2.
 *
 * Returns 1 when the rectangle fits, 0 otherwise. The function also
 * returns 0 when
 *   - any of d1, d2, w or h is INT_MAX or larger (treated as a likely bug),
 *   - x or y is negative,
 *   - x >= d1 or y >= d2 (so an origin on the far edge is rejected even
 *     for an empty rectangle, and any array with a zero dimension fails).
 */
static int inside2d( size_t d1, size_t d2, int x, int y, size_t w, size_t h )
{
  const size_t limit = (size_t)INT_MAX ;

  // Absurdly large dimensions are rejected up front
  if ( d1 >= limit || d2 >= limit || w >= limit || h >= limit )
    return 0 ;

  // The origin must be a valid position inside the array
  if ( x < 0 || y < 0 )
    return 0 ;
  if ( x >= (int)d1 || y >= (int)d2 )
    return 0 ;

  // The extent must not run past the end of either dimension
  if ( w > d1 - (size_t)x )
    return 0 ;
  if ( h > d2 - (size_t)y )
    return 0 ;

  return 1 ;
}
46 
clblasMatrixSizeInfo(clblasOrder order,size_t rows,size_t columns,size_t elemsize,size_t padding,size_t * ld,size_t * fullsize)47 clblasStatus clblasMatrixSizeInfo(clblasOrder order,
48                                   size_t rows,
49                                   size_t columns,
50                                   size_t elemsize,
51                                   size_t padding,
52                                   size_t * ld,
53                                   size_t * fullsize)
54 {
55   size_t x;
56   size_t y;
57 
58   if( order == clblasRowMajor )
59   {
60     x = columns;
61     y = rows;
62   }
63   else
64   {
65     x = rows;
66     y = columns;
67   }
68 
69   // set if not NULL
70   if( ld ) *ld = x + padding;
71   if( fullsize ) *fullsize = (size_t) ( (x + padding) * y * elemsize );
72 
73   return clblasSuccess;
74 }
75 
76 
clblasCreateMatrix(cl_context context,clblasOrder order,size_t rows,size_t columns,size_t elemsize,size_t padding,size_t * ld,size_t * fullsize,cl_int * err)77 cl_mem clblasCreateMatrix(
78     cl_context context,
79     clblasOrder order,
80     size_t rows,
81     size_t columns,
82     size_t elemsize,
83     size_t padding,
84     size_t * ld,
85     size_t * fullsize,
86     cl_int * err)
87 {
88   size_t tmp_fullsize;
89   cl_mem_flags flags = CL_MEM_READ_WRITE;
90 
91   clblasMatrixSizeInfo(
92       order,
93       rows,
94       columns,
95       elemsize,
96       padding,
97       ld,
98       &tmp_fullsize);
99 
100   // set if not NULL
101   if(fullsize != NULL) *fullsize = tmp_fullsize;
102 
103   return clCreateBuffer(
104       context,
105       flags,
106       tmp_fullsize,
107       NULL,
108       err);
109 }
110 
111 /*
112  * Internal function:
113  *  see clblasCreateMatrix()
114  */
clblasCreateMatrixWithLd(cl_context context,clblasOrder order,size_t rows,size_t columns,size_t elemsize,size_t ld,size_t * fullsize,cl_int * err)115 cl_mem clblasCreateMatrixWithLd(
116     cl_context context,
117     clblasOrder order,
118     size_t rows,
119     size_t columns,
120     size_t elemsize,
121     size_t ld,
122     size_t * fullsize,
123     cl_int * err)
124 {
125   int nbelem;
126   cl_mem_flags flags = CL_MEM_READ_WRITE;
127 
128   // compute number of elements
129   if( order == clblasRowMajor  )
130   {
131     // check ld
132     if( ld < columns )
133     {
134       *err = clblasInvalidValue;
135       return 0;
136     }
137 
138     nbelem = rows * ld;
139   }
140   else if( order == clblasColumnMajor )
141   {
142     // check ld
143     if( ld < rows )
144     {
145       *err = clblasInvalidValue;
146       return 0;
147     }
148 
149     nbelem = ld * columns;
150   }
151 
152   // set if not NULL
153   if( fullsize ) *fullsize = (size_t) (nbelem * elemsize );
154 
155   // allocate
156   return clCreateBuffer(
157       context,
158       flags,
159       *fullsize,
160       NULL,
161       err);
162 }
163 
164 
clblasCreateMatrixFromHost(cl_context context,clblasOrder order,size_t rows,size_t columns,size_t elemsize,size_t ld,void * host,size_t off_host,size_t ld_host,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_int * err)165 cl_mem clblasCreateMatrixFromHost(
166     cl_context context,
167     clblasOrder order,
168     size_t rows,
169     size_t columns,
170     size_t elemsize,
171     size_t ld,
172     void * host,
173     size_t off_host,
174     size_t ld_host,
175     cl_command_queue command_queue,
176     cl_uint numEventsInWaitList,
177     const cl_event *eventWaitList,
178     cl_int * err)
179 {
180   size_t fullsize;
181   cl_mem out;
182   size_t i;
183 
184   out = clblasCreateMatrixWithLd(
185       context,
186       order,
187       rows,
188       columns,
189       elemsize,
190       ld,
191       &fullsize,
192       err);
193 
194   if( ! *err )
195   {
196     printf("ok\n");
197     // TODO use ReadMatrix instead ?
198     if( order == clblasRowMajor )
199     {
200       for( i = 0; i < rows; i++ )
201       {
202         const size_t host_orig[3] = {off_host, off_host, 0};
203         const size_t buff_orig[3] = {0, 0, 0};
204         const size_t region[3] = {columns*elemsize, rows, 1};
205         *err = clEnqueueWriteBufferRect(
206             command_queue,
207             out,
208             CL_TRUE,
209             buff_orig,
210             host_orig,
211             region,
212             columns * elemsize,
213             0,
214             ld_host * elemsize,
215             0,
216             host,
217             numEventsInWaitList,
218             eventWaitList,
219             NULL);
220       }
221     }
222   }
223 
224   return out;
225 }
226 
227 /*
228  * Internal function:
229  *  enqueue event in list and wait for it if blocking
230  */
emptyAction(cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,cl_bool blocking)231 static clblasStatus emptyAction(
232     cl_command_queue command_queue,
233     cl_uint numEventsInWaitList,
234     const cl_event *eventWaitList,
235     cl_event *event,
236     cl_bool blocking)
237 {
238   cl_int err ;
239 
240   err = clEnqueueBarrierWithWaitList(
241       command_queue,
242       numEventsInWaitList,
243       eventWaitList,
244       event);
245 
246   if (err != clblasSuccess)
247     return  (clblasStatus)err;
248 
249   if(blocking)
250     return  (clblasStatus)clWaitForEvents(1, event);
251   else
252     return (clblasStatus)err;
253 }
254 
255 /*
256  * Internal function:
257  *  Generic version of clblasWriteSubMatrix with blocking arg
258  *  event must be non-NULL if blocking is set to CL_TRUE
259  */
_clblasWriteSubMatrix(clblasOrder order,size_t element_size,const void * A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,cl_mem B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,cl_bool blocking)260 static clblasStatus _clblasWriteSubMatrix(
261     clblasOrder order,
262     size_t element_size,
263     const void *A, size_t offA, size_t ldA,
264     size_t nrA, size_t ncA,
265     size_t xA, size_t yA,
266     cl_mem B, size_t offB, size_t ldB,
267     size_t nrB, size_t ncB,
268     size_t xB, size_t yB,
269     size_t nx, size_t ny,
270     cl_command_queue command_queue,
271     cl_uint numEventsInWaitList,
272     const cl_event *eventWaitList,
273     cl_event *event,
274     cl_bool blocking)
275 {
276 
277   if( order == clblasRowMajor )
278   {
279     SWAP(size_t, xA, yA);
280     SWAP(size_t, nrA, ncA);
281     SWAP(size_t, xB, yB);
282     SWAP(size_t, nrB, ncB);
283     SWAP(size_t, nx, ny);
284   }
285 
286   // Check that the specified area is within the array A
287   if ( !inside2d( nrA,ncA, xA,yA , nx,ny ) ) {
288     return clblasInvalidValue ;
289   }
290 
291   // Check that the specified area is within the array B
292   if ( !inside2d( nrB,ncB, xB,yB , nx,ny ) ) {
293     return clblasInvalidValue ;
294   }
295 
296 
297   if( nx == 0 || ny == 0 )
298   {
299     return emptyAction(
300         command_queue,
301         numEventsInWaitList,
302         eventWaitList,
303         event,
304         blocking);
305   }
306 
307   {
308     const size_t origA[3] = { (xA+offA)*element_size, yA, 0 };
309     const size_t origB[3] = { (xB+offB)*element_size, yB, 0 };
310     const size_t region[3] = { nx * element_size, ny, 1 };
311 
312     return (clblasStatus) clEnqueueWriteBufferRect(
313         command_queue,
314         B,
315         blocking,
316         origB,
317         origA,
318         region,
319         ldB * element_size,
320         0,
321         ldA * element_size,
322         0,
323         A,
324         numEventsInWaitList,
325         eventWaitList,
326         event);
327   }
328 }
329 
clblasWriteSubMatrix(clblasOrder order,size_t element_size,const void * A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,cl_mem B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)330 clblasStatus clblasWriteSubMatrix(
331     clblasOrder order,
332     size_t element_size,
333     const void *A, size_t offA, size_t ldA,
334     size_t nrA, size_t ncA,
335     size_t xA, size_t yA,
336     cl_mem B, size_t offB, size_t ldB,
337     size_t nrB, size_t ncB,
338     size_t xB, size_t yB,
339     size_t nx, size_t ny,
340     cl_command_queue command_queue,
341     cl_uint numEventsInWaitList,
342     const cl_event *eventWaitList)
343 {
344   cl_event evt;
345 
346   return _clblasWriteSubMatrix(
347       order,
348       element_size,
349       A, offA, ldA,
350       nrA, ncA,
351       xA, yA,
352       B, offB, ldB,
353       nrB, ncB,
354       xB, yB,
355       nx, ny,
356       command_queue,
357       numEventsInWaitList,
358       eventWaitList,
359       &evt,
360       CL_TRUE);
361 }
362 
clblasWriteSubMatrixAsync(clblasOrder order,size_t element_size,const void * A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,cl_mem B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)363 clblasStatus clblasWriteSubMatrixAsync(
364     clblasOrder order,
365     size_t element_size,
366     const void *A, size_t offA, size_t ldA,
367     size_t nrA, size_t ncA,
368     size_t xA, size_t yA,
369     cl_mem B, size_t offB, size_t ldB,
370     size_t nrB, size_t ncB,
371     size_t xB, size_t yB,
372     size_t nx, size_t ny,
373     cl_command_queue command_queue,
374     cl_uint numEventsInWaitList,
375     const cl_event *eventWaitList,
376     cl_event *event)
377 {
378   return _clblasWriteSubMatrix(
379       order,
380       element_size,
381       A, offA, ldA,
382       nrA, ncA,
383       xA, yA,
384       B, offB, ldB,
385       nrB, ncB,
386       xB, yB,
387       nx, ny,
388       command_queue,
389       numEventsInWaitList,
390       eventWaitList,
391       event,
392       CL_FALSE);
393 }
394 
395 
396 /*
397  * Internal function:
398  *  Generic version of clblasReadSubMatrix with blocking arg
399  *  event must be non-NULL if blocking is set to CL_TRUE
400  */
_clblasReadSubMatrix(clblasOrder order,size_t element_size,const cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,void * B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,cl_bool blocking)401 static clblasStatus _clblasReadSubMatrix(
402     clblasOrder order,
403     size_t element_size,
404     const cl_mem A, size_t offA, size_t ldA,
405     size_t nrA, size_t ncA,
406     size_t xA, size_t yA,
407     void *B, size_t offB, size_t ldB,
408     size_t nrB, size_t ncB,
409     size_t xB, size_t yB,
410     size_t nx, size_t ny,
411     cl_command_queue command_queue,
412     cl_uint numEventsInWaitList,
413     const cl_event *eventWaitList,
414     cl_event *event,
415     cl_bool blocking)
416 {
417 
418   if( order == clblasRowMajor )
419   {
420     SWAP(size_t, xA, yA);
421     SWAP(size_t, nrA, ncA);
422     SWAP(size_t, xB, yB);
423     SWAP(size_t, nrB, ncB);
424     SWAP(size_t, nx, ny);
425   }
426 
427   if( nx == 0 || ny == 0 )
428   {
429     return emptyAction(
430         command_queue,
431         numEventsInWaitList,
432         eventWaitList,
433         event,
434         blocking);
435   }
436 
437   // Check that the specified area is within the array A
438   if ( !inside2d( nrA,ncA, xA,yA , nx,ny ) ) {
439     return clblasInvalidValue ;
440   }
441 
442   // Check that the specified area is within the array B
443   if ( !inside2d( nrB,ncB, xB,yB , nx,ny ) ) {
444     return clblasInvalidValue ;
445   }
446 
447   {
448     const size_t origA[3] = { (xA+offA)*element_size, yA, 0 };
449     const size_t origB[3] = { (xB+offB)*element_size, yB, 0 };
450     const size_t region[3] = { nx * element_size, ny, 1 };
451 
452     return (clblasStatus) clEnqueueReadBufferRect(
453         command_queue,
454         A,
455         blocking,
456         origA,
457         origB,
458         region,
459         ldA * element_size,
460         0,
461         ldB * element_size,
462         0,
463         B,
464         numEventsInWaitList,
465         eventWaitList,
466         event);
467   }
468 }
469 
470 
clblasReadSubMatrix(clblasOrder order,size_t element_size,const cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,void * B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)471 clblasStatus clblasReadSubMatrix(
472     clblasOrder order,
473     size_t element_size,
474     const cl_mem A, size_t offA, size_t ldA,
475     size_t nrA, size_t ncA,
476     size_t xA, size_t yA,
477     void *B, size_t offB, size_t ldB,
478     size_t nrB, size_t ncB,
479     size_t xB, size_t yB,
480     size_t nx, size_t ny,
481     cl_command_queue command_queue,
482     cl_uint numEventsInWaitList,
483     const cl_event *eventWaitList)
484 {
485   cl_event evt;
486 
487   return _clblasReadSubMatrix(
488       order,
489       element_size,
490       A, offA, ldA,
491       nrA, ncA,
492       xA, yA,
493       B, offB, ldB,
494       nrB, ncB,
495       xB, yB,
496       nx, ny,
497       command_queue,
498       numEventsInWaitList,
499       eventWaitList,
500       &evt,
501       CL_TRUE);
502 }
503 
504 
clblasReadSubMatrixAsync(clblasOrder order,size_t element_size,const cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,void * B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)505 clblasStatus clblasReadSubMatrixAsync(
506     clblasOrder order,
507     size_t element_size,
508     const cl_mem A, size_t offA, size_t ldA,
509     size_t nrA, size_t ncA,
510     size_t xA, size_t yA,
511     void *B, size_t offB, size_t ldB,
512     size_t nrB, size_t ncB,
513     size_t xB, size_t yB,
514     size_t nx, size_t ny,
515     cl_command_queue command_queue,
516     cl_uint numEventsInWaitList,
517     const cl_event *eventWaitList,
518     cl_event *event)
519 {
520   return _clblasReadSubMatrix(
521       order,
522       element_size,
523       A, offA, ldA,
524       nrA, ncA,
525       xA, yA,
526       B, offB, ldB,
527       nrB, ncB,
528       xB, yB,
529       nx, ny,
530       command_queue,
531       numEventsInWaitList,
532       eventWaitList,
533       event,
534       CL_TRUE);
535 }
536 
537 
538 /*
539  * Internal function:
540  *  Generic version of clblasCopySubMatrix with blocking arg
541  *  event must be non-NULL if blocking is set to CL_TRUE
542  */
_clblasCopySubMatrix(clblasOrder order,size_t element_size,const cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,cl_mem B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,cl_bool blocking)543 static clblasStatus _clblasCopySubMatrix(
544     clblasOrder order,
545     size_t element_size,
546     const cl_mem A, size_t offA, size_t ldA,
547     size_t nrA, size_t ncA,
548     size_t xA, size_t yA,
549     cl_mem B, size_t offB, size_t ldB,
550     size_t nrB, size_t ncB,
551     size_t xB, size_t yB,
552     size_t nx, size_t ny,
553     cl_command_queue command_queue,
554     cl_uint numEventsInWaitList,
555     const cl_event *eventWaitList,
556     cl_event *event,
557     cl_bool blocking)
558 {
559   cl_int err;
560   if( order == clblasRowMajor )
561   {
562     SWAP(size_t, xA, yA);
563     SWAP(size_t, nrA, ncA);
564     SWAP(size_t, xB, yB);
565     SWAP(size_t, nrB, ncB);
566     SWAP(size_t, nx, ny);
567   }
568 
569   if( nx == 0 || ny == 0 )
570   {
571     return emptyAction(
572         command_queue,
573         numEventsInWaitList,
574         eventWaitList,
575         event,
576         CL_FALSE);
577   }
578 
579   // Check that the specified area is within the array A
580   if ( !inside2d( nrA,ncA, xA,yA , nx,ny ) ) {
581     return clblasInvalidValue ;
582   }
583 
584   // Check that the specified area is within the array B
585   if ( !inside2d( nrB,ncB, xB,yB , nx,ny ) ) {
586     return clblasInvalidValue ;
587   }
588 
589   {
590     const size_t origA[3] = { (xA+offA)*element_size, yA, 0 };
591     const size_t origB[3] = { (xB+offB)*element_size, yB, 0 };
592     const size_t region[3] = { nx * element_size, ny, 1 };
593 
594     err = clEnqueueCopyBufferRect(
595                                   command_queue,
596                                   A,
597                                   B,
598                                   origA,
599                                   origB,
600                                   region,
601                                   ldA * element_size,
602                                   0,
603                                   ldB * element_size,
604                                   0,
605                                   numEventsInWaitList,
606                                   eventWaitList,
607                                   event);
608   }
609 
610   if (err != clblasSuccess)
611     return  (clblasStatus)err;
612 
613   if(blocking)
614     return  (clblasStatus)clWaitForEvents(1, event);
615   else
616     return (clblasStatus)err;
617 }
618 
619 
clblasCopySubMatrix(clblasOrder order,size_t element_size,const cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,cl_mem B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)620 clblasStatus clblasCopySubMatrix(
621     clblasOrder order,
622     size_t element_size,
623     const cl_mem A, size_t offA, size_t ldA,
624     size_t nrA, size_t ncA,
625     size_t xA, size_t yA,
626     cl_mem B, size_t offB, size_t ldB,
627     size_t nrB, size_t ncB,
628     size_t xB, size_t yB,
629     size_t nx, size_t ny,
630     cl_command_queue command_queue,
631     cl_uint numEventsInWaitList,
632     const cl_event *eventWaitList)
633 {
634   cl_event evt;
635 
636   return (clblasStatus) _clblasCopySubMatrix(
637       order,
638       element_size,
639       A, offA, ldA,
640       nrA, ncA,
641       xA, yA,
642       B, offB, ldB,
643       nrB, ncB,
644       xB, yB,
645       nx, ny,
646       command_queue,
647       numEventsInWaitList,
648       eventWaitList,
649       &evt,
650       CL_TRUE);
651 }
652 
653 
clblasCopySubMatrixAsync(clblasOrder order,size_t element_size,const cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,cl_mem B,size_t offB,size_t ldB,size_t nrB,size_t ncB,size_t xB,size_t yB,size_t nx,size_t ny,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)654 clblasStatus clblasCopySubMatrixAsync(
655     clblasOrder order,
656     size_t element_size,
657     const cl_mem A, size_t offA, size_t ldA,
658     size_t nrA, size_t ncA,
659     size_t xA, size_t yA,
660     cl_mem B, size_t offB, size_t ldB,
661     size_t nrB, size_t ncB,
662     size_t xB, size_t yB,
663     size_t nx, size_t ny,
664     cl_command_queue command_queue,
665     cl_uint numEventsInWaitList,
666     const cl_event *eventWaitList,
667     cl_event *event)
668 {
669   return (clblasStatus) _clblasCopySubMatrix(
670       order,
671       element_size,
672       A, offA, ldA,
673       nrA, ncA,
674       xA, yA,
675       B, offB, ldB,
676       nrB, ncB,
677       xB, yB,
678       nx, ny,
679       command_queue,
680       numEventsInWaitList,
681       eventWaitList,
682       event,
683       CL_FALSE);
684 }
685 
686 
clblasWriteVector(size_t nb_elem,size_t element_size,const void * A,size_t offA,cl_mem B,size_t offB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)687 clblasStatus clblasWriteVector(
688     size_t nb_elem,
689     size_t element_size,
690     const void *A, size_t offA,
691     cl_mem B, size_t offB,
692     cl_command_queue command_queue,
693     cl_uint numEventsInWaitList,
694     const cl_event *eventWaitList)
695 {
696   return clblasWriteMatrix(
697       clblasColumnMajor,
698       nb_elem, 1,
699       element_size,
700       A, offA, nb_elem,
701       B, offB, nb_elem,
702       command_queue,
703       numEventsInWaitList,
704       eventWaitList);
705 }
706 
707 
clblasWriteVectorAsync(size_t nb_elem,size_t element_size,const void * A,size_t offA,cl_mem B,size_t offB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * events)708 clblasStatus clblasWriteVectorAsync(
709     size_t nb_elem,
710     size_t element_size,
711     const void *A, size_t offA,
712     cl_mem B, size_t offB,
713     cl_command_queue command_queue,
714     cl_uint numEventsInWaitList,
715     const cl_event *eventWaitList,
716     cl_event *events)
717 {
718   return clblasWriteMatrixAsync(
719       clblasColumnMajor,
720       nb_elem, 1,
721       element_size,
722       A, offA, nb_elem,
723       B, offB, nb_elem,
724       command_queue,
725       numEventsInWaitList,
726       eventWaitList,
727       events);
728 }
729 
730 
clblasReadVector(size_t nb_elem,size_t element_size,const cl_mem A,size_t offA,void * B,size_t offB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)731 clblasStatus clblasReadVector(
732     size_t nb_elem,
733     size_t element_size,
734     const cl_mem A, size_t offA,
735     void * B, size_t offB,
736     cl_command_queue command_queue,
737     cl_uint numEventsInWaitList,
738     const cl_event *eventWaitList)
739 {
740   return clblasReadMatrix(
741       clblasColumnMajor,
742       nb_elem, 1,
743       element_size,
744       A, offA, nb_elem,
745       B, offB, nb_elem,
746       command_queue,
747       numEventsInWaitList,
748       eventWaitList);
749 }
750 
751 
clblasReadVectorAsync(size_t nb_elem,size_t element_size,const cl_mem A,size_t offA,void * B,size_t offB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * events)752 clblasStatus clblasReadVectorAsync(
753     size_t nb_elem,
754     size_t element_size,
755     const cl_mem A, size_t offA,
756     void * B, size_t offB,
757     cl_command_queue command_queue,
758     cl_uint numEventsInWaitList,
759     const cl_event *eventWaitList,
760     cl_event *events)
761 {
762   return clblasReadMatrixAsync(
763       clblasColumnMajor,
764       nb_elem, 1,
765       element_size,
766       A, offA, nb_elem,
767       B, offB, nb_elem,
768       command_queue,
769       numEventsInWaitList,
770       eventWaitList,
771       events);
772 }
773 
774 
clblasCopyVector(size_t nb_elem,size_t element_size,const cl_mem A,size_t offA,cl_mem B,size_t offB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)775 clblasStatus clblasCopyVector(
776     size_t nb_elem,
777     size_t element_size,
778     const cl_mem A, size_t offA,
779     cl_mem B, size_t offB,
780     cl_command_queue command_queue,
781     cl_uint numEventsInWaitList,
782     const cl_event *eventWaitList)
783 {
784   return clblasCopyMatrix(
785       clblasColumnMajor,
786       nb_elem, 1,
787       element_size,
788       A, offA, nb_elem,
789       B, offB, nb_elem,
790       command_queue,
791       numEventsInWaitList,
792       eventWaitList);
793 }
794 
795 
clblasCopyVectorAsync(size_t nb_elem,size_t element_size,const cl_mem A,size_t offA,cl_mem B,size_t offB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * events)796 clblasStatus clblasCopyVectorAsync(
797     size_t nb_elem,
798     size_t element_size,
799     const cl_mem A, size_t offA,
800     cl_mem B, size_t offB,
801     cl_command_queue command_queue,
802     cl_uint numEventsInWaitList,
803     const cl_event *eventWaitList,
804     cl_event *events)
805 {
806   return clblasCopyMatrixAsync(
807       clblasColumnMajor,
808       nb_elem, 1,
809       element_size,
810       A, offA, nb_elem,
811       B, offB, nb_elem,
812       command_queue,
813       numEventsInWaitList,
814       eventWaitList,
815       events);
816 }
817 
818 
clblasWriteMatrix(clblasOrder order,size_t sx,size_t sy,size_t element_size,const void * A,size_t offA,size_t ldA,cl_mem B,size_t offB,size_t ldB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)819 clblasStatus clblasWriteMatrix(
820     clblasOrder order,
821     size_t sx, size_t sy,
822     size_t element_size,
823     const void *A, size_t offA, size_t ldA,
824     cl_mem B, size_t offB, size_t ldB,
825     cl_command_queue command_queue,
826     cl_uint numEventsInWaitList,
827     const cl_event *eventWaitList)
828 {
829   return clblasWriteSubMatrix(
830       order,
831       element_size,
832       A, offA, ldA,
833       sx, sy,
834       0, 0,
835       B, offB, ldB,
836       sx, sy,
837       0, 0,
838       sx, sy,
839       command_queue,
840       numEventsInWaitList,
841       eventWaitList);
842 }
843 
844 
clblasWriteMatrixAsync(clblasOrder order,size_t sx,size_t sy,size_t element_size,const void * A,size_t offA,size_t ldA,cl_mem B,size_t offB,size_t ldB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * events)845 clblasStatus clblasWriteMatrixAsync(
846     clblasOrder order,
847     size_t sx, size_t sy,
848     size_t element_size,
849     const void *A, size_t offA, size_t ldA,
850     cl_mem B, size_t offB, size_t ldB,
851     cl_command_queue command_queue,
852     cl_uint numEventsInWaitList,
853     const cl_event *eventWaitList,
854     cl_event *events)
855 {
856   return clblasWriteSubMatrixAsync(
857       order,
858       element_size,
859       A, offA, ldA,
860       sx, sy,
861       0, 0,
862       B, offB, ldB,
863       sx, sy,
864       0, 0,
865       sx, sy,
866       command_queue,
867       numEventsInWaitList,
868       eventWaitList,
869       events);
870 }
871 
872 
clblasReadMatrix(clblasOrder order,size_t sx,size_t sy,size_t element_size,const cl_mem A,size_t offA,size_t ldA,void * B,size_t offB,size_t ldB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)873 clblasStatus clblasReadMatrix(
874     clblasOrder order,
875     size_t sx, size_t sy,
876     size_t element_size,
877     const cl_mem A, size_t offA, size_t ldA,
878     void * B, size_t offB, size_t ldB,
879     cl_command_queue command_queue,
880     cl_uint numEventsInWaitList,
881     const cl_event *eventWaitList)
882 {
883   return clblasReadSubMatrix(
884       order,
885       element_size,
886       A, offA, ldA,
887       sx, sy,
888       0, 0,
889       B, offB, ldB,
890       sx, sy,
891       0, 0,
892       sx, sy,
893       command_queue,
894       numEventsInWaitList,
895       eventWaitList);
896 }
897 
898 
clblasReadMatrixAsync(clblasOrder order,size_t sx,size_t sy,size_t element_size,const cl_mem A,size_t offA,size_t ldA,void * B,size_t offB,size_t ldB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * events)899 clblasStatus clblasReadMatrixAsync(
900     clblasOrder order,
901     size_t sx, size_t sy,
902     size_t element_size,
903     const cl_mem A, size_t offA, size_t ldA,
904     void * B, size_t offB, size_t ldB,
905     cl_command_queue command_queue,
906     cl_uint numEventsInWaitList,
907     const cl_event *eventWaitList,
908     cl_event *events)
909 {
910   return clblasReadSubMatrixAsync(
911       order,
912       element_size,
913       A, offA, ldA,
914       sx, sy,
915       0, 0,
916       B, offB, ldB,
917       sx, sy,
918       0, 0,
919       sx, sy,
920       command_queue,
921       numEventsInWaitList,
922       eventWaitList,
923       events);
924 }
925 
926 
clblasCopyMatrix(clblasOrder order,size_t sx,size_t sy,size_t element_size,const cl_mem A,size_t offA,size_t ldA,cl_mem B,size_t offB,size_t ldB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)927 clblasStatus clblasCopyMatrix(
928     clblasOrder order,
929     size_t sx, size_t sy,
930     size_t element_size,
931     const cl_mem A, size_t offA, size_t ldA,
932     cl_mem B, size_t offB, size_t ldB,
933     cl_command_queue command_queue,
934     cl_uint numEventsInWaitList,
935     const cl_event *eventWaitList)
936 {
937   return clblasCopySubMatrix(
938       order,
939       element_size,
940       A, offA, ldA,
941       sx, sy,
942       0, 0,
943       B, offB, ldB,
944       sx, sy,
945       0, 0,
946       sx, sy,
947       command_queue,
948       numEventsInWaitList,
949       eventWaitList);
950 }
951 
952 
clblasCopyMatrixAsync(clblasOrder order,size_t sx,size_t sy,size_t element_size,const cl_mem A,size_t offA,size_t ldA,cl_mem B,size_t offB,size_t ldB,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * events)953 clblasStatus clblasCopyMatrixAsync(
954     clblasOrder order,
955     size_t sx, size_t sy,
956     size_t element_size,
957     const cl_mem A, size_t offA, size_t ldA,
958     cl_mem B, size_t offB, size_t ldB,
959     cl_command_queue command_queue,
960     cl_uint numEventsInWaitList,
961     const cl_event *eventWaitList,
962     cl_event *events)
963 {
964   return clblasCopySubMatrixAsync(
965       order,
966       element_size,
967       A, offA, ldA,
968       sx, sy,
969       0, 0,
970       B, offB, ldB,
971       sx, sy,
972       0, 0,
973       sx, sy,
974       command_queue,
975       numEventsInWaitList,
976       eventWaitList,
977       events);
978 }
979 
980