1 /************************************************************************
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 #include <string.h>
18 #include <clBLAS.h>
19 #include <limits.h>
20 
21 #include <functor.h>
22 #include <functor_selector.h>
23 
24 
// Exchange the values of the two lvalues a and b, both of type TYPE.
// The arguments are parenthesized in the expansion so that expression
// arguments (e.g. a dereference or an array element) expand safely.
// Each argument is evaluated twice: do not pass expressions with side effects.
#define SWAP(TYPE,a,b)  do { TYPE swap_tmp_ = (a) ; (a) = (b) ; (b) = swap_tmp_ ; } while(0)
26 
// Check that a rectangular area lies entirely inside a 2D array.
//
//   d1, d2 : extents of the array along its first and second dimension
//   x, y   : origin of the area; must be non-negative and strictly smaller
//            than d1 and d2 respectively
//   w, h   : extents of the area along the first and second dimension
//
// Returns 1 when the area fits in the array and 0 otherwise. Any extent
// equal to or larger than INT_MAX is rejected because such very large
// dimensions are likely a bug.
static int inside2d( size_t d1, size_t d2, int x, int y, size_t w, size_t h )
{
  const size_t limit = (size_t)INT_MAX ;

  // Reject implausibly large extents up front; this also guarantees that
  // d1 and d2 can be converted to int below without loss.
  if ( d1 >= limit || d2 >= limit || w >= limit || h >= limit )
    return 0 ;

  // The origin must designate an existing element of the array.
  if ( x < 0 || y < 0 )
    return 0 ;
  if ( x >= (int)d1 || y >= (int)d2 )
    return 0 ;

  // Room left between the origin and the end of each dimension.
  const size_t room1 = d1 - (size_t)x ;
  const size_t room2 = d2 - (size_t)y ;

  return ( w <= room1 && h <= room2 ) ? 1 : 0 ;
}
48 
49 extern "C"
clblasFillVectorAsync(size_t nb_elem,size_t element_size,cl_mem A,size_t offA,const void * host,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)50 clblasStatus clblasFillVectorAsync( size_t nb_elem,
51     size_t element_size,
52     cl_mem A, size_t offA,
53     const void * host,
54     cl_command_queue command_queue,
55     cl_uint numEventsInWaitList,
56     const cl_event *eventWaitList,
57     cl_event *event)
58 {
59 
60   return (clblasStatus) clEnqueueFillBuffer(command_queue,
61                                             A,
62                                             host,
63                                             element_size,
64                                             offA*element_size,
65                                             nb_elem*element_size,
66                                             numEventsInWaitList,
67                                             eventWaitList,
68                                             event);
69 }
70 
71 
72 
73 extern "C"
clblasFillVector(size_t nb_elem,size_t element_size,cl_mem A,size_t offA,const void * host,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)74 clblasStatus clblasFillVector(
75     size_t nb_elem,
76     size_t element_size,
77     cl_mem A, size_t offA,
78     const void * host,
79     cl_command_queue command_queue,
80     cl_uint numEventsInWaitList,
81     const cl_event *eventWaitList)
82 {
83   cl_event event ;
84   cl_int err = clblasFillVectorAsync(
85       nb_elem,
86       element_size,
87       A, offA,
88       host,
89       command_queue,
90       numEventsInWaitList, eventWaitList,
91       &event) ;
92 
93   if (err == clblasSuccess)  {
94     err = clWaitForEvents(1,&event) ;
95   }
96 
97   return (clblasStatus)  err ;
98 }
99 
100 extern "C"
clblasFillSubMatrixAsync(clblasOrder order,size_t element_size,cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,int xA,int yA,size_t nx,size_t ny,const void * host,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)101 clblasStatus clblasFillSubMatrixAsync(
102     clblasOrder order,
103     size_t element_size,
104     cl_mem A, size_t offA, size_t ldA,
105     size_t nrA, size_t ncA,
106     int xA, int yA,
107     size_t nx, size_t ny,
108     const void *host,
109     cl_command_queue command_queue,
110     cl_uint numEventsInWaitList,
111     const cl_event *eventWaitList,
112     cl_event *event)
113 {
114   // Transform Row-major into equivalent ColumnMajor so X becomes the contiguous dimension.
115   if( order == clblasRowMajor )
116   {
117     SWAP(size_t, nrA, ncA);
118     SWAP(int,    xA,  yA);
119     SWAP(size_t, nx,  ny);
120   }
121 
122   // Check that the specified area is within the array
123   if ( !inside2d( nrA,ncA, xA,yA , nx,  ny ) ) {
124     return clblasInvalidValue ;
125   }
126 
127   // If the area to fill is contiguous then use clblasFillVector
128   if ( nx==ldA || ny==1 )
129   {
130     return clblasFillVectorAsync( nx*ny,
131                                   element_size,
132                                   A,
133                                   offA + xA + yA*ldA,
134                                   host,
135                                   command_queue,
136                                   numEventsInWaitList,
137                                   eventWaitList,
138                                   event) ;
139   }
140   else if (1)
141   {
142 
143     clblasFill2DFunctor::Args args(A,
144                                    offA + xA + yA*ldA,
145                                    nx,ny,
146                                    ldA,
147                                    element_size,
148                                    host,
149                                    command_queue,
150                                    numEventsInWaitList,
151                                    eventWaitList,
152                                    event) ;
153 
154     clblasFunctorSelector  * fselector = clblasFunctorSelector::find(command_queue);
155 
156     clblasFill2DFunctor * functor = fselector->select_fill2d_specific(args);
157 
158     if (!functor)
159       return clblasInvalidValue ;
160 
161     cl_int err = functor->execute(args);
162 
163     functor->release();
164     return (clblasStatus) err ;
165   }
166   else
167   {
168     // Temporary: perform one fill per row
169     cl_int err ;
170     for( size_t i=0; i<ny ; i++ )
171       {
172           err =  clblasFillVectorAsync( nx ,
173                                        element_size,
174                                        A,
175                                        offA + xA + (yA+i)*ldA,
176                                        host,
177                                        command_queue,
178                                        numEventsInWaitList,
179                                        eventWaitList,
180                                        event) ;
181          if (err!=clblasSuccess)
182            return (clblasStatus) err ;
183       }
184     return clblasSuccess ;
185   }
186 }
187 
188 extern "C"
clblasFillSubMatrix(clblasOrder order,size_t element_size,cl_mem A,size_t offA,size_t ldA,size_t nrA,size_t ncA,size_t xA,size_t yA,size_t nx,size_t ny,const void * host,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)189 clblasStatus clblasFillSubMatrix(
190     clblasOrder order,
191     size_t element_size,
192     cl_mem A, size_t offA, size_t ldA,
193     size_t nrA, size_t ncA,
194     size_t xA, size_t yA,
195     size_t nx, size_t ny,
196     const void *host,
197     cl_command_queue command_queue,
198     cl_uint numEventsInWaitList,
199     const cl_event *eventWaitList)
200 {
201   cl_event event ;
202   cl_int err = clblasFillSubMatrixAsync(order,
203                                         element_size,
204                                         A, offA, ldA,
205                                         nrA, ncA,
206                                         xA, yA,
207                                         nx, ny,
208                                         host,
209                                         command_queue,
210                                         numEventsInWaitList,
211                                         eventWaitList,
212                                         &event
213                                    ) ;
214 
215   if (err == clblasSuccess)
216   {
217     err = clWaitForEvents(1,&event) ;
218   }
219 
220   return (clblasStatus)err ;
221 }
222 
223 
224 extern "C"
clblasFillMatrix(clblasOrder order,size_t element_size,cl_mem A,size_t offA,size_t ldA,size_t sxA,size_t syA,const void * host,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList)225 clblasStatus clblasFillMatrix( clblasOrder order,
226                                size_t element_size,
227                                cl_mem A, size_t offA, size_t ldA,
228                                size_t sxA, size_t syA,
229                                const void *host,
230                                cl_command_queue command_queue,
231                                cl_uint numEventsInWaitList,
232                                const cl_event *eventWaitList)
233 {
234   return  clblasFillSubMatrix( order,
235                                element_size,
236                                A, offA, ldA,
237                                sxA, syA,
238                                0, 0,
239                                sxA, syA,
240                                host,
241                                command_queue,
242                                numEventsInWaitList,
243                                eventWaitList) ;
244 }
245 
246 
247 extern "C"
clblasFillMatrixAsync(clblasOrder order,size_t element_size,cl_mem A,size_t offA,size_t ldA,size_t sxA,size_t syA,const void * host,cl_command_queue command_queue,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)248 clblasStatus clblasFillMatrixAsync( clblasOrder order,
249                                     size_t element_size,
250                                     cl_mem A, size_t offA, size_t ldA,
251                                     size_t sxA, size_t syA,
252                                     const void *host,
253                                     cl_command_queue command_queue,
254                                     cl_uint numEventsInWaitList,
255                                     const cl_event *eventWaitList,
256                                     cl_event *event)
257 {
258 
259   return clblasFillSubMatrixAsync( order,
260                                    element_size,
261                                    A, offA, ldA,
262                                    sxA, syA,
263                                    0, 0,
264                                    sxA, syA,
265                                    host,
266                                    command_queue,
267                                    numEventsInWaitList,
268                                    eventWaitList,
269                                    event) ;
270 
271 }
272 
273