1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 #include <clBLAS.h>
19 #include <toolslib.h>
20 #include <kern_cache.h>
21 #include <clBLAS.version.h>
22 #include <trace_malloc.h>
23 
24 #include "clblas-internal.h"
25 #include <events.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #ifdef BUILDING_CLBLAS
29 #include "AutoGemmTeardown.h"
30 #include "UserGemmClKernels.h"
31 #endif
32 
33 clblasStatus
clblasGetVersion(cl_uint * major,cl_uint * minor,cl_uint * patch)34 clblasGetVersion(cl_uint* major, cl_uint* minor, cl_uint* patch)
35 {
36     *major = clblasVersionMajor;
37     *minor = clblasVersionMinor;
38     *patch = clblasVersionPatch;
39 
40     return clblasSuccess;
41 }
42 
43 clblasStatus
clblasSetup(void)44 clblasSetup(void)
45 {
46     solver_id_t sidsNum;
47 	char* tmp			= NULL;
48 
49 	//	Made the cache unlimited by default
50 	size_t kCacheLimit = 0;
51 
52     if (clblasInitialized) {
53         return clblasSuccess;
54     }
55 
56     // printf("\n%s, line %d\n", __func__, __LINE__);
57     initMallocTrace();
58 
59 
60     clblasInitBinaryCache();
61 
62     clblasSolvers[CLBLAS_GEMM].nrPatterns =
63         initGemmMemPatterns(clblasSolvers[CLBLAS_GEMM].memPatterns);
64     clblasSolvers[CLBLAS_GEMM].defaultPattern = -1;
65 
66     clblasSolvers[CLBLAS_TRMM].nrPatterns =
67         initTrmmMemPatterns(clblasSolvers[CLBLAS_TRMM].memPatterns);
68     clblasSolvers[CLBLAS_TRMM].defaultPattern = -1;
69 
70     clblasSolvers[CLBLAS_TRSM].nrPatterns =
71         initTrsmMemPatterns(clblasSolvers[CLBLAS_TRSM].memPatterns);
72     clblasSolvers[CLBLAS_TRSM].defaultPattern = -1;
73 
74     clblasSolvers[CLBLAS_GEMV].nrPatterns =
75         initGemvMemPatterns(clblasSolvers[CLBLAS_GEMV].memPatterns);
76     clblasSolvers[CLBLAS_GEMV].defaultPattern = -1;
77 
78     clblasSolvers[CLBLAS_SYMV].nrPatterns =
79         initSymvMemPatterns(clblasSolvers[CLBLAS_SYMV].memPatterns);
80     clblasSolvers[CLBLAS_SYMV].defaultPattern = -1;
81 
82     clblasSolvers[CLBLAS_SYR2K].nrPatterns =
83         initSyr2kMemPatterns(clblasSolvers[CLBLAS_SYR2K].memPatterns);
84     clblasSolvers[CLBLAS_SYR2K].defaultPattern = -1;
85 
86     clblasSolvers[CLBLAS_SYRK].nrPatterns =
87         initSyrkMemPatterns(clblasSolvers[CLBLAS_SYRK].memPatterns);
88     clblasSolvers[CLBLAS_SYRK].defaultPattern = -1;
89 
90 	clblasSolvers[CLBLAS_TRMV].nrPatterns =
91 		initTrmvMemPatterns(clblasSolvers[CLBLAS_TRMV].memPatterns);
92 	clblasSolvers[CLBLAS_TRMV].defaultPattern = -1;
93 
94 	// HEMV uses the same memory pattern as TRMV.
95 	clblasSolvers[CLBLAS_HEMV].nrPatterns =
96 		initTrmvMemPatterns(clblasSolvers[CLBLAS_HEMV].memPatterns);
97 	clblasSolvers[CLBLAS_HEMV].defaultPattern = -1;
98 
99 	clblasSolvers[CLBLAS_TRSV].nrPatterns =
100 		initTrsvMemPatterns(clblasSolvers[CLBLAS_TRSV].memPatterns);
101 	clblasSolvers[CLBLAS_TRSV].defaultPattern = -1;
102 
103 	clblasSolvers[CLBLAS_TRSV_GEMV].nrPatterns =
104 		initTrsvGemvMemPatterns(clblasSolvers[CLBLAS_TRSV_GEMV].memPatterns);
105 	clblasSolvers[CLBLAS_TRSV_GEMV].defaultPattern = -1;
106 
107 	clblasSolvers[CLBLAS_SYMM].nrPatterns =
108 		initSymmMemPatterns(clblasSolvers[CLBLAS_SYMM].memPatterns);
109 	clblasSolvers[CLBLAS_SYMM].defaultPattern = -1;
110 
111 	clblasSolvers[CLBLAS_GEMM2].nrPatterns =
112 		initGemmV2MemPatterns(clblasSolvers[CLBLAS_GEMM2].memPatterns);
113 	clblasSolvers[CLBLAS_GEMM2].defaultPattern = -1;
114 
115 	clblasSolvers[CLBLAS_GEMM_TAIL].nrPatterns =
116 		initGemmV2TailMemPatterns(clblasSolvers[CLBLAS_GEMM_TAIL].memPatterns);
117 	clblasSolvers[CLBLAS_GEMM_TAIL].defaultPattern = -1;
118 
119 	clblasSolvers[CLBLAS_SYR].nrPatterns =
120         initSyrMemPatterns(clblasSolvers[CLBLAS_SYR].memPatterns);
121  	clblasSolvers[CLBLAS_SYR].defaultPattern = -1;
122 
123 	clblasSolvers[CLBLAS_SYR2].nrPatterns =
124         initSyr2MemPatterns(clblasSolvers[CLBLAS_SYR2].memPatterns);
125     clblasSolvers[CLBLAS_SYR2].defaultPattern = -1;
126 
127 	clblasSolvers[CLBLAS_GER].nrPatterns =
128 		initGerMemPatterns(clblasSolvers[CLBLAS_GER].memPatterns);
129 	clblasSolvers[CLBLAS_GER].defaultPattern = -1;
130 
131 	clblasSolvers[CLBLAS_HER].nrPatterns =
132         initHerMemPatterns(clblasSolvers[CLBLAS_HER].memPatterns);
133  	clblasSolvers[CLBLAS_HER].defaultPattern = -1;
134 
135 	clblasSolvers[CLBLAS_HER2].nrPatterns =
136         initHer2MemPatterns(clblasSolvers[CLBLAS_HER2].memPatterns);
137     clblasSolvers[CLBLAS_HER2].defaultPattern = -1;
138 
139     clblasSolvers[CLBLAS_GBMV].nrPatterns =
140 		initGbmvMemPatterns(clblasSolvers[CLBLAS_GBMV].memPatterns);
141 	clblasSolvers[CLBLAS_GBMV].defaultPattern = -1;
142 
143 	clblasSolvers[CLBLAS_SWAP].nrPatterns =
144         initSwapMemPatterns(clblasSolvers[CLBLAS_SWAP].memPatterns);
145     clblasSolvers[CLBLAS_SWAP].defaultPattern = -1;
146 
147     clblasSolvers[CLBLAS_SCAL].nrPatterns =
148         initScalMemPatterns(clblasSolvers[CLBLAS_SCAL].memPatterns);
149     clblasSolvers[CLBLAS_SCAL].defaultPattern = -1;
150 
151     clblasSolvers[CLBLAS_COPY].nrPatterns =
152         initCopyMemPatterns(clblasSolvers[CLBLAS_COPY].memPatterns);
153     clblasSolvers[CLBLAS_COPY].defaultPattern = -1;
154 
155      clblasSolvers[CLBLAS_AXPY].nrPatterns =
156         initAxpyMemPatterns(clblasSolvers[CLBLAS_AXPY].memPatterns);
157     clblasSolvers[CLBLAS_AXPY].defaultPattern = -1;
158 
159     clblasSolvers[CLBLAS_DOT].nrPatterns =
160        initDotMemPatterns(clblasSolvers[CLBLAS_DOT].memPatterns);
161     clblasSolvers[CLBLAS_DOT].defaultPattern = -1;
162 
163     clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].nrPatterns =
164        initReductionMemPatterns(clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].memPatterns);
165     clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].defaultPattern = -1;
166 
167     clblasSolvers[CLBLAS_ROTG].nrPatterns =
168        initRotgMemPatterns(clblasSolvers[CLBLAS_ROTG].memPatterns);
169     clblasSolvers[CLBLAS_ROTG].defaultPattern = -1;
170 
171     clblasSolvers[CLBLAS_ROTMG].nrPatterns =
172        initRotmgMemPatterns(clblasSolvers[CLBLAS_ROTMG].memPatterns);
173     clblasSolvers[CLBLAS_ROTMG].defaultPattern = -1;
174 
175     clblasSolvers[CLBLAS_ROTM].nrPatterns =
176        initRotmMemPatterns(clblasSolvers[CLBLAS_ROTM].memPatterns);
177     clblasSolvers[CLBLAS_ROTM].defaultPattern = -1;
178 
179     clblasSolvers[CLBLAS_iAMAX].nrPatterns =
180        initiAmaxMemPatterns(clblasSolvers[CLBLAS_iAMAX].memPatterns);
181     clblasSolvers[CLBLAS_iAMAX].defaultPattern = -1;
182 
183     clblasSolvers[CLBLAS_NRM2].nrPatterns =
184        initNrm2MemPatterns(clblasSolvers[CLBLAS_NRM2].memPatterns);
185     clblasSolvers[CLBLAS_NRM2].defaultPattern = -1;
186 
187     clblasSolvers[CLBLAS_ASUM].nrPatterns =
188        initAsumMemPatterns(clblasSolvers[CLBLAS_ASUM].memPatterns);
189     clblasSolvers[CLBLAS_ASUM].defaultPattern = -1;
190 
191     sidsNum = makeSolverID(BLAS_FUNCTIONS_NUMBER, 0);
192 
193 	//	Read environmental variable to limit or disable ( 0 ) the size of the kernel cache in memory
194 	tmp = getenv( "AMD_CLBLAS_KCACHE_LIMIT_MB" );
195 	if( tmp != NULL )
196 	{
197 		kCacheLimit = atol( tmp );
198 #if defined( _WIN32 )
199 		printf( "Kernel Cache limit: %Iu MB\n", kCacheLimit );
200 #else
201 		printf( "Kernel Cache limit: %zu MB\n", kCacheLimit );
202 #endif
203 		kCacheLimit *= (1024 * 1024);
204 	}
205 
206     if (kCacheLimit || (tmp == NULL)) {
207         clblasKernelCache = createKernelCache(sidsNum, kCacheLimit);
208     	if (clblasKernelCache == NULL) {
209         	return clblasOutOfHostMemory;
210         }
211     }
212     if (initSCImages()) {
213         destroyKernelCache(clblasKernelCache);
214         return clblasOutOfHostMemory;
215     }
216 
217     decomposeEventsSetup();
218 
219     initStorageCache();
220 
221     clblasInitialized = 1;
222     return clblasSuccess;
223 }
224 
// TO BE FIXED: this is a really ugly hack.
// The tune tool and some tests are linked with
// only a subset of clBLAS that does not contain
// the functor-related code.
229 //
230 //void (* _cleanFunctorCachesHook)(void) = 0 ;
231 
232 void
clblasTeardown(void)233 clblasTeardown(void)
234 {
235     if (!clblasInitialized) {
236         return;
237     }
238 
239     printMallocStatistics();
240 
241     if (clblasKernelCache != NULL) {
242         printKernelCacheSize(clblasKernelCache);
243         destroyKernelCache(clblasKernelCache);
244         clblasKernelCache = NULL;
245     }
246     releaseSCImages();
247     decomposeEventsTeardown();
248 
249     // win32 - crashes
250     destroyStorageCache();
251 
252     cleanFunctorCaches() ;
253 
254     printMemLeaksInfo();
255     releaseMallocTrace();
256 
257 #ifdef BUILDING_CLBLAS
258    initUserGemmClKernels();
259    initAutoGemmClKernels();
260 #endif
261 
262     clblasInitialized = 0;
263 }
264