1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <sys/mman.h>
42 #include "common.h"
43 
44 #ifndef USE_OPENMP
45 
46 #include "blas_server.c"
47 
48 #else
49 
/* Availability flag for the OpenMP build: blas_thread_init() stores 1 here
   and blas_thread_shutdown() stores 0.  Starts out as 0. */
int blas_server_avail = 0;
51 
blas_thread_init(void)52 int blas_thread_init(void){
53 
54   blas_get_cpu_number();
55 
56   blas_server_avail = 1;
57 
58   return 0;
59 }
60 
BLASFUNC(blas_thread_shutdown)61 int BLASFUNC(blas_thread_shutdown)(void){
62 
63   blas_server_avail = 0;
64 
65   return 0;
66 }
67 
/*
 * Call a routine that uses the "legacy" calling convention.
 *
 *   func  routine to call; it is converted to a function pointer whose
 *         signature is chosen from the precision / complex bits of mode
 *   mode  bit mask; BLAS_COMPLEX, BLAS_XDOUBLE (only when EXPRECISION is
 *         defined) and BLAS_DOUBLE are inspected, in that order of priority
 *   args  supplies m, n, k, alpha, a/lda, b/ldb and c/ldc
 *   sb    passed through unchanged as the last argument of the routine
 *
 * Real routines receive alpha[0]; complex routines receive alpha[0] and
 * alpha[1] as two separate scalar arguments.  When neither BLAS_XDOUBLE nor
 * BLAS_DOUBLE is set the single precision signature is used.
 */
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){

  if (mode & BLAS_COMPLEX) {

#ifdef EXPRECISION
    if (mode & BLAS_XDOUBLE) {
      /* COMPLEX / Extended Double */
      typedef void (*xcomplex_fn)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
                                  xdouble *, BLASLONG, xdouble *, BLASLONG,
                                  xdouble *, BLASLONG, void *);
      xcomplex_fn kernel = func;
      xdouble *alpha = (xdouble *)args -> alpha;

      kernel(args -> m, args -> n, args -> k, alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
      return;
    }
#endif

    if (mode & BLAS_DOUBLE) {
      /* COMPLEX / Double */
      typedef void (*zcomplex_fn)(BLASLONG, BLASLONG, BLASLONG, double, double,
                                  double *, BLASLONG, double *, BLASLONG,
                                  double *, BLASLONG, void *);
      zcomplex_fn kernel = func;
      double *alpha = (double *)args -> alpha;

      kernel(args -> m, args -> n, args -> k, alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
      return;
    }

    {
      /* COMPLEX / Single */
      typedef void (*ccomplex_fn)(BLASLONG, BLASLONG, BLASLONG, float, float,
                                  float *, BLASLONG, float *, BLASLONG,
                                  float *, BLASLONG, void *);
      ccomplex_fn kernel = func;
      float *alpha = (float *)args -> alpha;

      kernel(args -> m, args -> n, args -> k, alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
    }
    return;
  }

#ifdef EXPRECISION
  if (mode & BLAS_XDOUBLE) {
    /* REAL / Extended Double */
    typedef void (*xreal_fn)(BLASLONG, BLASLONG, BLASLONG, xdouble,
                             xdouble *, BLASLONG, xdouble *, BLASLONG,
                             xdouble *, BLASLONG, void *);
    xreal_fn kernel = func;
    xdouble *alpha = (xdouble *)args -> alpha;

    kernel(args -> m, args -> n, args -> k, alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }
#endif

  if (mode & BLAS_DOUBLE) {
    /* REAL / Double */
    typedef void (*dreal_fn)(BLASLONG, BLASLONG, BLASLONG, double,
                             double *, BLASLONG, double *, BLASLONG,
                             double *, BLASLONG, void *);
    dreal_fn kernel = func;
    double *alpha = (double *)args -> alpha;

    kernel(args -> m, args -> n, args -> k, alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }

  {
    /* REAL / Single */
    typedef void (*sreal_fn)(BLASLONG, BLASLONG, BLASLONG, float,
                             float *, BLASLONG, float *, BLASLONG,
                             float *, BLASLONG, void *);
    sreal_fn kernel = func;
    float *alpha = (float *)args -> alpha;

    kernel(args -> m, args -> n, args -> k, alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
  }
}
151 
/*
 * Run one queue entry on the current thread.
 *
 * Steps, in order:
 *   1. With CONSISTENT_FPCSR, load MXCSR and the x87 control word from the
 *      values stored in the entry (sse_mode / x87_mode).
 *   2. If the entry supplies neither sa nor sb and BLAS_PTHREAD is not set,
 *      obtain a buffer from blas_memory_alloc(2) and derive both pointers
 *      from it: sa = buffer + GEMM_OFFSET_A, and sb = sa + (P * Q * element
 *      size, rounded with GEMM_ALIGN) + GEMM_OFFSET_B, where P/Q and the
 *      element size are picked from the precision / complex bits of mode.
 *      If only one of sa / sb is NULL nothing is allocated and the NULL is
 *      passed on as is.
 *   3. Dispatch on mode: BLAS_LEGACY -> legacy_exec(); otherwise
 *      BLAS_PTHREAD -> routine(args); otherwise
 *      routine(args, range_m, range_n, sa, sb, position).  The return value
 *      of the routine is ignored.
 *   4. Release the buffer from step 2, if one was allocated.
 */
static void exec_threads(blas_queue_t *queue){

  void *scratch = NULL;          /* stays NULL unless allocated below */
  void *sa      = queue -> sa;
  void *sb      = queue -> sb;

#ifdef CONSISTENT_FPCSR
  __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
  __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
#endif

/* sb address for a given size of the area that follows sa.  Kept as a macro
   (not a helper function) so the integer types taking part in the address
   arithmetic are exactly those of the operands written at the use site. */
#define EXEC_THREADS_SB_AFTER(a_bytes) \
  ((void *)(((BLASLONG)sa + (((a_bytes) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B))

  if ((sa == NULL) && (sb == NULL) && !(queue -> mode & BLAS_PTHREAD)) {

    scratch = blas_memory_alloc(2);

    /* Both pointers are known to be NULL here, so both are always derived;
       sa must be set first because sb is computed relative to it. */
    sa = (void *)((BLASLONG)scratch + GEMM_OFFSET_A);

#ifdef EXPRECISION
    if (queue -> mode & BLAS_XDOUBLE) {
      sb = (queue -> mode & BLAS_COMPLEX)
        ? EXEC_THREADS_SB_AFTER(XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble))
        : EXEC_THREADS_SB_AFTER(QGEMM_P * QGEMM_Q * sizeof(xdouble));
    } else
#endif
    if (queue -> mode & BLAS_DOUBLE) {
      sb = (queue -> mode & BLAS_COMPLEX)
        ? EXEC_THREADS_SB_AFTER(ZGEMM_P * ZGEMM_Q * 2 * sizeof(double))
        : EXEC_THREADS_SB_AFTER(DGEMM_P * DGEMM_Q * sizeof(double));
    } else {
      sb = (queue -> mode & BLAS_COMPLEX)
        ? EXEC_THREADS_SB_AFTER(CGEMM_P * CGEMM_Q * 2 * sizeof(float))
        : EXEC_THREADS_SB_AFTER(SGEMM_P * SGEMM_Q * sizeof(float));
    }
  }

#undef EXEC_THREADS_SB_AFTER

  if (queue -> mode & BLAS_LEGACY) {

    legacy_exec(queue -> routine, queue -> mode, queue -> args, sb);

  } else if (queue -> mode & BLAS_PTHREAD) {

    void (*entry)(void *) = queue -> routine;

    entry(queue -> args);

  } else {

    int (*kernel)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;

    kernel(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
  }

  if (scratch != NULL) blas_memory_free(scratch);

}
222 
/*
 * Execute num queue entries, distributing them over OpenMP threads.
 *
 *   num    number of entries in queue; nothing is done when num <= 0
 *   queue  array of entries; nothing is done when it is NULL
 *
 * With CONSISTENT_FPCSR, the current x87 control word and MXCSR are first
 * stored into every entry (exec_threads() loads them back before running
 * the entry).  Unless USE_SIMPLE_THREADED_LEVEL3 is defined, each entry's
 * position field is set to its index in the array.  Every entry is then
 * handed to exec_threads() from a statically scheduled OpenMP loop.
 *
 * Always returns 0.
 */
int exec_blas(BLASLONG num, blas_queue_t *queue){

  BLASLONG i;

  if ((queue == NULL) || (num <= 0)) return 0;

#ifdef CONSISTENT_FPCSR
  for (i = 0; i < num; i ++) {
    blas_queue_t *entry = queue + i;

    __asm__ __volatile__ ("fnstcw %0"  : "=m" (entry -> x87_mode));
    __asm__ __volatile__ ("stmxcsr %0" : "=m" (entry -> sse_mode));
  }
#endif

#pragma omp parallel for schedule(static)
  for (i = 0; i < num; i ++) {
    blas_queue_t *job = queue + i;

#ifndef USE_SIMPLE_THREADED_LEVEL3
    job -> position = i;
#endif

    exec_threads(job);
  }

  return 0;
}
248 
249 #endif
250