1 /*********************************************************************/
2 /*                                                                   */
3 /*             Optimized BLAS libraries                              */
4 /*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5 /*                                                                   */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of     */
16 /* profits, interruption of business, or related expenses which may  */
17 /* arise from use of Software or Documentation, including but not    */
18 /* limited to those resulting from defects in Software and/or        */
19 /* Documentation, or loss or inaccuracy of data of any kind.         */
20 /*********************************************************************/
21 
22 #ifndef CACHE_LINE_SIZE
23 #define CACHE_LINE_SIZE 8
24 #endif
25 
26 #ifndef DIVIDE_RATE
27 #define DIVIDE_RATE 2
28 #endif
29 
30 #ifndef SWITCH_RATIO
31 #define SWITCH_RATIO 2
32 #endif
33 
34 #ifndef GEMM3M_LOCAL
35 #if   defined(NN)
36 #define GEMM3M_LOCAL    GEMM3M_NN
37 #elif defined(NT)
38 #define GEMM3M_LOCAL    GEMM3M_NT
39 #elif defined(NR)
40 #define GEMM3M_LOCAL    GEMM3M_NR
41 #elif defined(NC)
42 #define GEMM3M_LOCAL    GEMM3M_NC
43 #elif defined(TN)
44 #define GEMM3M_LOCAL    GEMM3M_TN
45 #elif defined(TT)
46 #define GEMM3M_LOCAL    GEMM3M_TT
47 #elif defined(TR)
48 #define GEMM3M_LOCAL    GEMM3M_TR
49 #elif defined(TC)
50 #define GEMM3M_LOCAL    GEMM3M_TC
51 #elif defined(RN)
52 #define GEMM3M_LOCAL    GEMM3M_RN
53 #elif defined(RT)
54 #define GEMM3M_LOCAL    GEMM3M_RT
55 #elif defined(RR)
56 #define GEMM3M_LOCAL    GEMM3M_RR
57 #elif defined(RC)
58 #define GEMM3M_LOCAL    GEMM3M_RC
59 #elif defined(CN)
60 #define GEMM3M_LOCAL    GEMM3M_CN
61 #elif defined(CT)
62 #define GEMM3M_LOCAL    GEMM3M_CT
63 #elif defined(CR)
64 #define GEMM3M_LOCAL    GEMM3M_CR
65 #elif defined(CC)
66 #define GEMM3M_LOCAL    GEMM3M_CC
67 #endif
68 #endif
69 
70 typedef struct {
71   volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
72 } job_t;
73 
74 
75 #ifndef BETA_OPERATION
76 #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
77 	GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
78 		  BETA[0], BETA[1], NULL, 0, NULL, 0, \
79 		  (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
80 #endif
81 
82 #ifndef ICOPYB_OPERATION
83 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
84     defined(RN) || defined(RT) || defined(RC) || defined(RR)
85 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
86 	GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
87 #else
88 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
89 	GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
90 #endif
91 #endif
92 
93 #ifndef ICOPYR_OPERATION
94 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
95     defined(RN) || defined(RT) || defined(RC) || defined(RR)
96 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
97 	GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
98 #else
99 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
100 	GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
101 #endif
102 #endif
103 
104 #ifndef ICOPYI_OPERATION
105 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
106     defined(RN) || defined(RT) || defined(RC) || defined(RR)
107 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
108 	GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
109 #else
110 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
111 	GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
112 #endif
113 #endif
114 
115 
116 #ifndef OCOPYB_OPERATION
117 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
118     defined(NR) || defined(TR) || defined(CR) || defined(RR)
119 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
120 	GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
121 #else
122 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
123 	GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
124 #endif
125 #endif
126 
127 #ifndef OCOPYR_OPERATION
128 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
129     defined(NR) || defined(TR) || defined(CR) || defined(RR)
130 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
131 	GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
132 #else
133 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
134 	GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
135 #endif
136 #endif
137 
138 
139 #ifndef OCOPYI_OPERATION
140 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
141     defined(NR) || defined(TR) || defined(CR) || defined(RR)
142 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
143 	GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
144 #else
145 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
146 	GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
147 #endif
148 #endif
149 
150 #ifndef KERNEL_FUNC
151 #define KERNEL_FUNC	GEMM3M_KERNEL
152 #endif
153 
154 #ifndef KERNEL_OPERATION
155 #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
156 	KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
157 #endif
158 
159 #ifndef A
160 #define A	args -> a
161 #endif
162 #ifndef LDA
163 #define LDA	args -> lda
164 #endif
165 #ifndef B
166 #define B	args -> b
167 #endif
168 #ifndef LDB
169 #define LDB	args -> ldb
170 #endif
171 #ifndef C
172 #define C	args -> c
173 #endif
174 #ifndef LDC
175 #define LDC	args -> ldc
176 #endif
177 #ifndef M
178 #define M	args -> m
179 #endif
180 #ifndef N
181 #define N	args -> n
182 #endif
183 #ifndef K
184 #define K	args -> k
185 #endif
186 
187 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
188 #define ALPHA1	ONE
189 #define ALPHA2	ONE
190 #define ALPHA5	ZERO
191 #define ALPHA6	ONE
192 
193 #define ALPHA7	ONE
194 #define ALPHA8	ZERO
195 #define ALPHA11	ONE
196 #define ALPHA12	-ONE
197 
198 #define ALPHA13	ZERO
199 #define ALPHA14	ONE
200 #define ALPHA17	-ONE
201 #define ALPHA18	-ONE
202 #endif
203 
204 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
205 #define ALPHA1	ONE
206 #define ALPHA2	ONE
207 #define ALPHA5	ONE
208 #define ALPHA6	ZERO
209 
210 #define ALPHA7	ZERO
211 #define ALPHA8	ONE
212 #define ALPHA11	-ONE
213 #define ALPHA12	-ONE
214 
215 #define ALPHA13	ONE
216 #define ALPHA14	ZERO
217 #define ALPHA17	-ONE
218 #define ALPHA18	ONE
219 #endif
220 
221 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
222 #define ALPHA1	ONE
223 #define ALPHA2	ONE
224 #define ALPHA5	ONE
225 #define ALPHA6	ZERO
226 
227 #define ALPHA7	ZERO
228 #define ALPHA8	ONE
229 #define ALPHA11	-ONE
230 #define ALPHA12	ONE
231 
232 #define ALPHA13	ONE
233 #define ALPHA14	ZERO
234 #define ALPHA17	-ONE
235 #define ALPHA18	-ONE
236 #endif
237 
238 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
239 #define ALPHA1	ONE
240 #define ALPHA2	ONE
241 #define ALPHA5	ZERO
242 #define ALPHA6	-ONE
243 
244 #define ALPHA7	ONE
245 #define ALPHA8	ZERO
246 #define ALPHA11	ONE
247 #define ALPHA12	ONE
248 
249 #define ALPHA13	ZERO
250 #define ALPHA14	ONE
251 #define ALPHA17	-ONE
252 #define ALPHA18	ONE
253 #endif
254 
255 #ifdef TIMING
256 #define START_RPCC()		rpcc_counter = rpcc()
257 #define STOP_RPCC(COUNTER)	COUNTER  += rpcc() - rpcc_counter
258 #else
259 #define START_RPCC()
260 #define STOP_RPCC(COUNTER)
261 #endif
262 
inner_thread(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)263 static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
264 
265   BLASLONG k, lda, ldb, ldc;
266   BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;
267 
268   FLOAT *alpha, *beta;
269   FLOAT *a, *b, *c;
270   job_t *job = (job_t *)args -> common;
271   BLASLONG xxx, bufferside;
272   FLOAT *buffer[DIVIDE_RATE];
273 
274   BLASLONG ls, min_l, jjs, min_jj;
275   BLASLONG is, min_i, div_n;
276   BLASLONG i, current;
277 
278 #ifdef TIMING
279   BLASLONG rpcc_counter;
280   BLASLONG copy_A = 0;
281   BLASLONG copy_B = 0;
282   BLASLONG kernel = 0;
283   BLASLONG waiting1 = 0;
284   BLASLONG waiting2 = 0;
285   BLASLONG waiting3 = 0;
286   BLASLONG waiting6[MAX_CPU_NUMBER];
287   BLASLONG ops    = 0;
288 
289   for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
290 #endif
291 
292   k = K;
293 
294   a = (FLOAT *)A;
295   b = (FLOAT *)B;
296   c = (FLOAT *)C;
297 
298   lda = LDA;
299   ldb = LDB;
300   ldc = LDC;
301 
302   alpha = (FLOAT *)args -> alpha;
303   beta  = (FLOAT *)args -> beta;
304 
305   m_from = 0;
306   m_to   = M;
307 
308   if (range_m) {
309     m_from = range_m[0];
310     m_to   = range_m[1];
311   }
312 
313   n_from = 0;
314   n_to   = N;
315 
316   N_from = 0;
317   N_to   = N;
318 
319   if (range_n) {
320     n_from = range_n[mypos + 0];
321     n_to   = range_n[mypos + 1];
322 
323     N_from = range_n[0];
324     N_to   = range_n[args -> nthreads];
325   }
326 
327   if (beta) {
328     if ((beta[0] != ONE) || (beta[1] != ZERO))
329       BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
330   }
331 
332   if ((k == 0) || (alpha == NULL)) return 0;
333 
334   if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0;
335 
336 #if 0
337   fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
338 	  mypos, m_from, m_to, n_from, n_to, N_from, N_to);
339 #endif
340 
341   div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
342 
343   buffer[0] = sb;
344   for (i = 1; i < DIVIDE_RATE; i++) {
345     buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
346   }
347 
348   for(ls = 0; ls < k; ls += min_l){
349     min_l = k - ls;
350     if (min_l >= GEMM3M_Q * 2) {
351       min_l = GEMM3M_Q;
352     } else {
353       if (min_l > GEMM3M_Q) {
354 	min_l = (min_l + 1) / 2;
355       }
356     }
357 
358     min_i = m_to - m_from;
359 
360     if (min_i >= GEMM3M_P * 2) {
361       min_i = GEMM3M_P;
362     } else {
363       if (min_i > GEMM3M_P) {
364 	min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
365       }
366     }
367 
368 
369     START_RPCC();
370 
371     ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
372 
373     STOP_RPCC(copy_A);
374 
375     div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
376 
377     for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
378 
379       START_RPCC();
380 
381       /* Make sure if no one is using another buffer */
382       for (i = 0; i < args -> nthreads; i++)
383 	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
384 
385       STOP_RPCC(waiting1);
386 
387       for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
388 	min_jj = MIN(n_to, xxx + div_n) - jjs;
389 	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
390 
391 	START_RPCC();
392 
393 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
394 	OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
395 #else
396 	OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
397 #endif
398 
399 	STOP_RPCC(copy_B);
400 
401 	START_RPCC();
402 
403 	  KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
404 			   sa, buffer[bufferside] + min_l * (jjs - xxx),
405 			   c, ldc, m_from, jjs);
406 
407 	STOP_RPCC(kernel);
408 #ifdef TIMING
409 	  ops += 2 * min_i * min_jj * min_l;
410 #endif
411 
412       }
413 
414       for (i = 0; i < args -> nthreads; i++)
415 	job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
416       }
417 
418     current = mypos;
419 
420     do {
421       current ++;
422       if (current >= args -> nthreads) current = 0;
423 
424       div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
425 
426       for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
427 
428 	if (current != mypos) {
429 
430 	  START_RPCC();
431 
432 	  /* thread has to wait */
433 	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
434 
435 	  STOP_RPCC(waiting2);
436 
437 	  START_RPCC();
438 
439 
440 	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, ALPHA5, ALPHA6,
441 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
442 			   c, ldc, m_from, xxx);
443 
444 	STOP_RPCC(kernel);
445 #ifdef TIMING
446 	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
447 #endif
448 	}
449 
450 	if (m_to - m_from == min_i) {
451 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
452 	}
453       }
454     } while (current != mypos);
455 
456     for(is = m_from + min_i; is < m_to; is += min_i){
457       min_i = m_to - is;
458       if (min_i >= GEMM3M_P * 2) {
459 	min_i = GEMM3M_P;
460       } else
461 	if (min_i > GEMM3M_P) {
462 	  min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
463 	}
464 
465       START_RPCC();
466 
467       ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
468 
469       STOP_RPCC(copy_A);
470 
471       current = mypos;
472       do {
473 
474 	div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
475 
476 	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
477 
478 	  START_RPCC();
479 
480 
481 	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
482 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
483 			   c, ldc, is, xxx);
484 
485 	STOP_RPCC(kernel);
486 #ifdef TIMING
487 	ops += 2 * min_i * (range_n[current + 1]  - range_n[current] - div_n) * min_l;
488 #endif
489 	if (is + min_i >= m_to) {
490 	  /* Thread doesn't need this buffer any more */
491 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
492 	}
493 	}
494 
495 	current ++;
496 	if (current >= args -> nthreads) current = 0;
497 
498       } while (current != mypos);
499 
500     } /* end of is */
501 
502     START_RPCC();
503 
504     ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
505 
506     STOP_RPCC(copy_A);
507 
508     div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
509 
510     for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
511 
512       START_RPCC();
513 
514       /* Make sure if no one is using another buffer */
515       for (i = 0; i < args -> nthreads; i++)
516 	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
517 
518       STOP_RPCC(waiting1);
519 
520       for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
521 	min_jj = MIN(n_to, xxx + div_n) - jjs;
522 	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
523 
524 	START_RPCC();
525 
526 #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
527 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
528 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
529 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
530 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
531 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
532 #else
533 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
534 #endif
535 
536 	STOP_RPCC(copy_B);
537 
538 	START_RPCC();
539 
540 	  KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
541 			   sa, buffer[bufferside] + min_l * (jjs - xxx),
542 			   c, ldc, m_from, jjs);
543 
544 	STOP_RPCC(kernel);
545 #ifdef TIMING
546 	  ops += 2 * min_i * min_jj * min_l;
547 #endif
548 
549       }
550 
551       for (i = 0; i < args -> nthreads; i++)
552 	job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
553       }
554 
555     current = mypos;
556 
557     do {
558       current ++;
559       if (current >= args -> nthreads) current = 0;
560 
561       div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
562 
563       for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
564 
565 	if (current != mypos) {
566 
567 	  START_RPCC();
568 
569 	  /* thread has to wait */
570 	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
571 
572 	  STOP_RPCC(waiting2);
573 
574 	  START_RPCC();
575 
576 	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, ALPHA11, ALPHA12,
577 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
578 			   c, ldc, m_from, xxx);
579 
580 	STOP_RPCC(kernel);
581 #ifdef TIMING
582 	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
583 #endif
584 	}
585 
586 	if (m_to - m_from == min_i) {
587 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
588 	}
589       }
590     } while (current != mypos);
591 
592     for(is = m_from + min_i; is < m_to; is += min_i){
593       min_i = m_to - is;
594       if (min_i >= GEMM3M_P * 2) {
595 	min_i = GEMM3M_P;
596       } else
597 	if (min_i > GEMM3M_P) {
598 	  min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
599 	}
600 
601       START_RPCC();
602 
603       ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
604 
605       STOP_RPCC(copy_A);
606 
607       current = mypos;
608       do {
609 
610 	div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
611 
612 	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
613 
614 	  START_RPCC();
615 
616 	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
617 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
618 			   c, ldc, is, xxx);
619 
620 	STOP_RPCC(kernel);
621 #ifdef TIMING
622 	ops += 2 * min_i * (range_n[current + 1]  - range_n[current] - div_n) * min_l;
623 #endif
624 	if (is + min_i >= m_to) {
625 	  /* Thread doesn't need this buffer any more */
626 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
627 	}
628 	}
629 
630 	current ++;
631 	if (current >= args -> nthreads) current = 0;
632 
633       } while (current != mypos);
634 
635     } /* end of is */
636 
637 
638     START_RPCC();
639 
640     ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
641 
642     STOP_RPCC(copy_A);
643 
644     div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
645 
646     for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
647 
648       START_RPCC();
649 
650       /* Make sure if no one is using another buffer */
651       for (i = 0; i < args -> nthreads; i++)
652 	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
653 
654       STOP_RPCC(waiting1);
655 
656       for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
657 	min_jj = MIN(n_to, xxx + div_n) - jjs;
658 	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
659 
660 	START_RPCC();
661 
662 #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
663 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
664 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
665 	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
666 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
667 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
668 #else
669 	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
670 #endif
671 
672 	STOP_RPCC(copy_B);
673 
674 	START_RPCC();
675 
676 	KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
677 			 sa, buffer[bufferside] + min_l * (jjs - xxx),
678 			 c, ldc, m_from, jjs);
679 
680 	STOP_RPCC(kernel);
681 #ifdef TIMING
682 	  ops += 2 * min_i * min_jj * min_l;
683 #endif
684 
685       }
686 
687       for (i = 0; i < args -> nthreads; i++)
688 	job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
689       }
690 
691     current = mypos;
692 
693     do {
694       current ++;
695       if (current >= args -> nthreads) current = 0;
696 
697       div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
698 
699       for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
700 
701 	if (current != mypos) {
702 
703 	  START_RPCC();
704 
705 	  /* thread has to wait */
706 	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
707 
708 	  STOP_RPCC(waiting2);
709 
710 	  START_RPCC();
711 
712 	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, ALPHA17, ALPHA18,
713 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
714 			   c, ldc, m_from, xxx);
715 
716 	STOP_RPCC(kernel);
717 #ifdef TIMING
718 	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
719 #endif
720 	}
721 
722 	if (m_to - m_from == min_i) {
723 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
724 	}
725       }
726     } while (current != mypos);
727 
728     for(is = m_from + min_i; is < m_to; is += min_i){
729       min_i = m_to - is;
730       if (min_i >= GEMM3M_P * 2) {
731 	min_i = GEMM3M_P;
732       } else
733 	if (min_i > GEMM3M_P) {
734 	  min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
735 	}
736 
737       START_RPCC();
738 
739       ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
740 
741       STOP_RPCC(copy_A);
742 
743       current = mypos;
744       do {
745 
746 	div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
747 
748 	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
749 
750 	  START_RPCC();
751 
752 	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
753 			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
754 			   c, ldc, is, xxx);
755 
756 	STOP_RPCC(kernel);
757 #ifdef TIMING
758 	ops += 2 * min_i * (range_n[current + 1]  - range_n[current] - div_n) * min_l;
759 #endif
760 	if (is + min_i >= m_to) {
761 	  /* Thread doesn't need this buffer any more */
762 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
763 	}
764 	}
765 
766 	current ++;
767 	if (current >= args -> nthreads) current = 0;
768 
769       } while (current != mypos);
770 
771     } /* end of is */
772 
773   }
774 
775   START_RPCC();
776 
777   for (i = 0; i < args -> nthreads; i++) {
778     for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
779       while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
780     }
781   }
782 
783   STOP_RPCC(waiting3);
784 
785 #ifdef TIMING
786   BLASLONG waiting = waiting1 + waiting2 + waiting3;
787   BLASLONG total = copy_A + copy_B + kernel + waiting;
788 
789   fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait : %6.2f Kernel : %6.2f\n",
790 	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
791 	  (double)waiting /(double)total * 100.,
792 	  (double)ops/(double)kernel / 2. * 100.);
793 
794   fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
795 	  mypos, copy_A, copy_B, waiting);
796 
797 #if 0
798   fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
799 	  mypos,
800 	  (double)waiting1/(double)waiting * 100.,
801 	  (double)waiting2/(double)waiting * 100.,
802 	  (double)waiting3/(double)waiting * 100.);
803 #endif
804   fprintf(stderr, "\n");
805 #endif
806 
807 
808 
809   return 0;
810 }
811 
gemm_driver(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)812 static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
813 		       *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
814 
815   blas_arg_t newarg;
816 
817   blas_queue_t queue[MAX_CPU_NUMBER];
818 
819   BLASLONG range_M[MAX_CPU_NUMBER + 1];
820   BLASLONG range_N[MAX_CPU_NUMBER + 1];
821 
822   job_t        job[MAX_CPU_NUMBER];
823 
824   BLASLONG num_cpu_m, num_cpu_n;
825 
826   BLASLONG nthreads = args -> nthreads;
827 
828   BLASLONG width, i, j, k, js;
829   BLASLONG m, n, n_from, n_to;
830   int  mode;
831 
832 #ifdef XDOUBLE
833   mode  =  BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
834 #elif defined(DOUBLE)
835   mode  =  BLAS_DOUBLE  | BLAS_REAL | BLAS_NODE;
836 #else
837   mode  =  BLAS_SINGLE  | BLAS_REAL | BLAS_NODE;
838 #endif
839 
840   newarg.m        = args -> m;
841   newarg.n        = args -> n;
842   newarg.k        = args -> k;
843   newarg.a        = args -> a;
844   newarg.b        = args -> b;
845   newarg.c        = args -> c;
846   newarg.lda      = args -> lda;
847   newarg.ldb      = args -> ldb;
848   newarg.ldc      = args -> ldc;
849   newarg.alpha    = args -> alpha;
850   newarg.beta     = args -> beta;
851   newarg.nthreads = args -> nthreads;
852   newarg.common   = (void *)job;
853 
854   if (!range_m) {
855     range_M[0] = 0;
856     m          = args -> m;
857   } else {
858     range_M[0] = range_m[0];
859     m          = range_m[1] - range_m[0];
860   }
861 
862   num_cpu_m  = 0;
863 
864   while (m > 0){
865 
866     width  = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
867 
868     m -= width;
869     if (m < 0) width = width + m;
870 
871     range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;
872 
873     num_cpu_m ++;
874   }
875 
876   for (i = 0; i < num_cpu_m; i++) {
877     queue[i].mode    = mode;
878     queue[i].routine = inner_thread;
879     queue[i].args    = &newarg;
880     queue[i].range_m = &range_M[i];
881     queue[i].range_n = &range_N[0];
882     queue[i].sa      = NULL;
883     queue[i].sb      = NULL;
884     queue[i].next    = &queue[i + 1];
885   }
886 
887   queue[0].sa = sa;
888   queue[0].sb = sb;
889 
890   if (!range_n) {
891     n_from = 0;
892     n_to   = args -> n;
893   } else {
894     n_from = range_n[0];
895     n_to   = range_n[1];
896   }
897 
898   for(js = n_from; js < n_to; js += GEMM_R * nthreads){
899     n = n_to - js;
900     if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
901 
902     range_N[0] = js;
903 
904     num_cpu_n  = 0;
905 
906     while (n > 0){
907 
908       width  = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
909 
910       n -= width;
911       if (n < 0) width = width + n;
912 
913       range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
914 
915       num_cpu_n ++;
916     }
917 
918     for (j = 0; j < num_cpu_m; j++) {
919       for (i = 0; i < num_cpu_m; i++) {
920 	for (k = 0; k < DIVIDE_RATE; k++) {
921 	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
922 	}
923       }
924     }
925 
926     queue[num_cpu_m - 1].next = NULL;
927 
928     exec_blas(num_cpu_m, queue);
929   }
930 
931   return 0;
932 }
933 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)934 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
935 
936   BLASLONG m = args -> m;
937   BLASLONG n = args -> n;
938   BLASLONG nthreads = args -> nthreads;
939   BLASLONG divN, divT;
940   int mode;
941 
942   if (range_m) {
943     BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
944     BLASLONG m_to   = *(((BLASLONG *)range_m) + 1);
945 
946     m = m_to - m_from;
947   }
948 
949   if (range_n) {
950     BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
951     BLASLONG n_to   = *(((BLASLONG *)range_n) + 1);
952 
953     n = n_to - n_from;
954   }
955 
956   if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
957     GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
958     return 0;
959   }
960 
961   divT = nthreads;
962   divN = 1;
963 
964   while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
965     do {
966       divT --;
967       divN = 1;
968       while (divT * divN < nthreads) divN ++;
969     } while ((divT * divN != nthreads) && (divT > 1));
970   }
971 
972   args -> nthreads = divT;
973 
974   if (divN == 1){
975     gemm_driver(args, range_m, range_n, sa, sb, 0);
976   } else {
977 #ifdef XDOUBLE
978     mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
979 #elif defined(DOUBLE)
980     mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
981 #else
982     mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
983 #endif
984 
985 #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
986     defined(CN) || defined(CT) || defined(CR) || defined(CC)
987     mode |= (BLAS_TRANSA_T);
988 #endif
989 #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \
990     defined(NC) || defined(TC) || defined(RC) || defined(CC)
991     mode |= (BLAS_TRANSB_T);
992 #endif
993 
994     gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
995   }
996 
997   return 0;
998 }
999