1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #ifndef CACHE_LINE_SIZE
23 #define CACHE_LINE_SIZE 8
24 #endif
25
26 #ifndef DIVIDE_RATE
27 #define DIVIDE_RATE 2
28 #endif
29
30 #ifndef SWITCH_RATIO
31 #define SWITCH_RATIO 2
32 #endif
33
34 #ifndef GEMM3M_LOCAL
35 #if defined(NN)
36 #define GEMM3M_LOCAL GEMM3M_NN
37 #elif defined(NT)
38 #define GEMM3M_LOCAL GEMM3M_NT
39 #elif defined(NR)
40 #define GEMM3M_LOCAL GEMM3M_NR
41 #elif defined(NC)
42 #define GEMM3M_LOCAL GEMM3M_NC
43 #elif defined(TN)
44 #define GEMM3M_LOCAL GEMM3M_TN
45 #elif defined(TT)
46 #define GEMM3M_LOCAL GEMM3M_TT
47 #elif defined(TR)
48 #define GEMM3M_LOCAL GEMM3M_TR
49 #elif defined(TC)
50 #define GEMM3M_LOCAL GEMM3M_TC
51 #elif defined(RN)
52 #define GEMM3M_LOCAL GEMM3M_RN
53 #elif defined(RT)
54 #define GEMM3M_LOCAL GEMM3M_RT
55 #elif defined(RR)
56 #define GEMM3M_LOCAL GEMM3M_RR
57 #elif defined(RC)
58 #define GEMM3M_LOCAL GEMM3M_RC
59 #elif defined(CN)
60 #define GEMM3M_LOCAL GEMM3M_CN
61 #elif defined(CT)
62 #define GEMM3M_LOCAL GEMM3M_CT
63 #elif defined(CR)
64 #define GEMM3M_LOCAL GEMM3M_CR
65 #elif defined(CC)
66 #define GEMM3M_LOCAL GEMM3M_CC
67 #endif
68 #endif
69
70 typedef struct {
71 volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
72 } job_t;
73
74
75 #ifndef BETA_OPERATION
76 #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
77 GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
78 BETA[0], BETA[1], NULL, 0, NULL, 0, \
79 (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
80 #endif
81
82 #ifndef ICOPYB_OPERATION
83 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
84 defined(RN) || defined(RT) || defined(RC) || defined(RR)
85 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
86 GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
87 #else
88 #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
89 GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
90 #endif
91 #endif
92
93 #ifndef ICOPYR_OPERATION
94 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
95 defined(RN) || defined(RT) || defined(RC) || defined(RR)
96 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
97 GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
98 #else
99 #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
100 GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
101 #endif
102 #endif
103
104 #ifndef ICOPYI_OPERATION
105 #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
106 defined(RN) || defined(RT) || defined(RC) || defined(RR)
107 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
108 GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
109 #else
110 #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
111 GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
112 #endif
113 #endif
114
115
116 #ifndef OCOPYB_OPERATION
117 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
118 defined(NR) || defined(TR) || defined(CR) || defined(RR)
119 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
120 GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
121 #else
122 #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
123 GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
124 #endif
125 #endif
126
127 #ifndef OCOPYR_OPERATION
128 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
129 defined(NR) || defined(TR) || defined(CR) || defined(RR)
130 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
131 GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
132 #else
133 #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
134 GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
135 #endif
136 #endif
137
138
139 #ifndef OCOPYI_OPERATION
140 #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
141 defined(NR) || defined(TR) || defined(CR) || defined(RR)
142 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
143 GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
144 #else
145 #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
146 GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
147 #endif
148 #endif
149
150 #ifndef KERNEL_FUNC
151 #define KERNEL_FUNC GEMM3M_KERNEL
152 #endif
153
154 #ifndef KERNEL_OPERATION
155 #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
156 KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
157 #endif
158
159 #ifndef A
160 #define A args -> a
161 #endif
162 #ifndef LDA
163 #define LDA args -> lda
164 #endif
165 #ifndef B
166 #define B args -> b
167 #endif
168 #ifndef LDB
169 #define LDB args -> ldb
170 #endif
171 #ifndef C
172 #define C args -> c
173 #endif
174 #ifndef LDC
175 #define LDC args -> ldc
176 #endif
177 #ifndef M
178 #define M args -> m
179 #endif
180 #ifndef N
181 #define N args -> n
182 #endif
183 #ifndef K
184 #define K args -> k
185 #endif
186
187 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
188 #define ALPHA1 ONE
189 #define ALPHA2 ONE
190 #define ALPHA5 ZERO
191 #define ALPHA6 ONE
192
193 #define ALPHA7 ONE
194 #define ALPHA8 ZERO
195 #define ALPHA11 ONE
196 #define ALPHA12 -ONE
197
198 #define ALPHA13 ZERO
199 #define ALPHA14 ONE
200 #define ALPHA17 -ONE
201 #define ALPHA18 -ONE
202 #endif
203
204 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
205 #define ALPHA1 ONE
206 #define ALPHA2 ONE
207 #define ALPHA5 ONE
208 #define ALPHA6 ZERO
209
210 #define ALPHA7 ZERO
211 #define ALPHA8 ONE
212 #define ALPHA11 -ONE
213 #define ALPHA12 -ONE
214
215 #define ALPHA13 ONE
216 #define ALPHA14 ZERO
217 #define ALPHA17 -ONE
218 #define ALPHA18 ONE
219 #endif
220
221 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
222 #define ALPHA1 ONE
223 #define ALPHA2 ONE
224 #define ALPHA5 ONE
225 #define ALPHA6 ZERO
226
227 #define ALPHA7 ZERO
228 #define ALPHA8 ONE
229 #define ALPHA11 -ONE
230 #define ALPHA12 ONE
231
232 #define ALPHA13 ONE
233 #define ALPHA14 ZERO
234 #define ALPHA17 -ONE
235 #define ALPHA18 -ONE
236 #endif
237
238 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
239 #define ALPHA1 ONE
240 #define ALPHA2 ONE
241 #define ALPHA5 ZERO
242 #define ALPHA6 -ONE
243
244 #define ALPHA7 ONE
245 #define ALPHA8 ZERO
246 #define ALPHA11 ONE
247 #define ALPHA12 ONE
248
249 #define ALPHA13 ZERO
250 #define ALPHA14 ONE
251 #define ALPHA17 -ONE
252 #define ALPHA18 ONE
253 #endif
254
255 #ifdef TIMING
256 #define START_RPCC() rpcc_counter = rpcc()
257 #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
258 #else
259 #define START_RPCC()
260 #define STOP_RPCC(COUNTER)
261 #endif
262
inner_thread(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)263 static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
264
265 BLASLONG k, lda, ldb, ldc;
266 BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;
267
268 FLOAT *alpha, *beta;
269 FLOAT *a, *b, *c;
270 job_t *job = (job_t *)args -> common;
271 BLASLONG xxx, bufferside;
272 FLOAT *buffer[DIVIDE_RATE];
273
274 BLASLONG ls, min_l, jjs, min_jj;
275 BLASLONG is, min_i, div_n;
276 BLASLONG i, current;
277
278 #ifdef TIMING
279 BLASLONG rpcc_counter;
280 BLASLONG copy_A = 0;
281 BLASLONG copy_B = 0;
282 BLASLONG kernel = 0;
283 BLASLONG waiting1 = 0;
284 BLASLONG waiting2 = 0;
285 BLASLONG waiting3 = 0;
286 BLASLONG waiting6[MAX_CPU_NUMBER];
287 BLASLONG ops = 0;
288
289 for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
290 #endif
291
292 k = K;
293
294 a = (FLOAT *)A;
295 b = (FLOAT *)B;
296 c = (FLOAT *)C;
297
298 lda = LDA;
299 ldb = LDB;
300 ldc = LDC;
301
302 alpha = (FLOAT *)args -> alpha;
303 beta = (FLOAT *)args -> beta;
304
305 m_from = 0;
306 m_to = M;
307
308 if (range_m) {
309 m_from = range_m[0];
310 m_to = range_m[1];
311 }
312
313 n_from = 0;
314 n_to = N;
315
316 N_from = 0;
317 N_to = N;
318
319 if (range_n) {
320 n_from = range_n[mypos + 0];
321 n_to = range_n[mypos + 1];
322
323 N_from = range_n[0];
324 N_to = range_n[args -> nthreads];
325 }
326
327 if (beta) {
328 if ((beta[0] != ONE) || (beta[1] != ZERO))
329 BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
330 }
331
332 if ((k == 0) || (alpha == NULL)) return 0;
333
334 if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0;
335
336 #if 0
337 fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
338 mypos, m_from, m_to, n_from, n_to, N_from, N_to);
339 #endif
340
341 div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
342
343 buffer[0] = sb;
344 for (i = 1; i < DIVIDE_RATE; i++) {
345 buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
346 }
347
348 for(ls = 0; ls < k; ls += min_l){
349 min_l = k - ls;
350 if (min_l >= GEMM3M_Q * 2) {
351 min_l = GEMM3M_Q;
352 } else {
353 if (min_l > GEMM3M_Q) {
354 min_l = (min_l + 1) / 2;
355 }
356 }
357
358 min_i = m_to - m_from;
359
360 if (min_i >= GEMM3M_P * 2) {
361 min_i = GEMM3M_P;
362 } else {
363 if (min_i > GEMM3M_P) {
364 min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
365 }
366 }
367
368
369 START_RPCC();
370
371 ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
372
373 STOP_RPCC(copy_A);
374
375 div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
376
377 for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
378
379 START_RPCC();
380
381 /* Make sure if no one is using another buffer */
382 for (i = 0; i < args -> nthreads; i++)
383 while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
384
385 STOP_RPCC(waiting1);
386
387 for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
388 min_jj = MIN(n_to, xxx + div_n) - jjs;
389 if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
390
391 START_RPCC();
392
393 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
394 OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
395 #else
396 OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
397 #endif
398
399 STOP_RPCC(copy_B);
400
401 START_RPCC();
402
403 KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
404 sa, buffer[bufferside] + min_l * (jjs - xxx),
405 c, ldc, m_from, jjs);
406
407 STOP_RPCC(kernel);
408 #ifdef TIMING
409 ops += 2 * min_i * min_jj * min_l;
410 #endif
411
412 }
413
414 for (i = 0; i < args -> nthreads; i++)
415 job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
416 }
417
418 current = mypos;
419
420 do {
421 current ++;
422 if (current >= args -> nthreads) current = 0;
423
424 div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
425
426 for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
427
428 if (current != mypos) {
429
430 START_RPCC();
431
432 /* thread has to wait */
433 while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
434
435 STOP_RPCC(waiting2);
436
437 START_RPCC();
438
439
440 KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
441 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
442 c, ldc, m_from, xxx);
443
444 STOP_RPCC(kernel);
445 #ifdef TIMING
446 ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
447 #endif
448 }
449
450 if (m_to - m_from == min_i) {
451 job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
452 }
453 }
454 } while (current != mypos);
455
456 for(is = m_from + min_i; is < m_to; is += min_i){
457 min_i = m_to - is;
458 if (min_i >= GEMM3M_P * 2) {
459 min_i = GEMM3M_P;
460 } else
461 if (min_i > GEMM3M_P) {
462 min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
463 }
464
465 START_RPCC();
466
467 ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
468
469 STOP_RPCC(copy_A);
470
471 current = mypos;
472 do {
473
474 div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
475
476 for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
477
478 START_RPCC();
479
480
481 KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
482 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
483 c, ldc, is, xxx);
484
485 STOP_RPCC(kernel);
486 #ifdef TIMING
487 ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
488 #endif
489 if (is + min_i >= m_to) {
490 /* Thread doesn't need this buffer any more */
491 job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
492 }
493 }
494
495 current ++;
496 if (current >= args -> nthreads) current = 0;
497
498 } while (current != mypos);
499
500 } /* end of is */
501
502 START_RPCC();
503
504 ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
505
506 STOP_RPCC(copy_A);
507
508 div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
509
510 for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
511
512 START_RPCC();
513
514 /* Make sure if no one is using another buffer */
515 for (i = 0; i < args -> nthreads; i++)
516 while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
517
518 STOP_RPCC(waiting1);
519
520 for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
521 min_jj = MIN(n_to, xxx + div_n) - jjs;
522 if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
523
524 START_RPCC();
525
526 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
527 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
528 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
529 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
530 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
531 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
532 #else
533 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
534 #endif
535
536 STOP_RPCC(copy_B);
537
538 START_RPCC();
539
540 KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
541 sa, buffer[bufferside] + min_l * (jjs - xxx),
542 c, ldc, m_from, jjs);
543
544 STOP_RPCC(kernel);
545 #ifdef TIMING
546 ops += 2 * min_i * min_jj * min_l;
547 #endif
548
549 }
550
551 for (i = 0; i < args -> nthreads; i++)
552 job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
553 }
554
555 current = mypos;
556
557 do {
558 current ++;
559 if (current >= args -> nthreads) current = 0;
560
561 div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
562
563 for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
564
565 if (current != mypos) {
566
567 START_RPCC();
568
569 /* thread has to wait */
570 while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
571
572 STOP_RPCC(waiting2);
573
574 START_RPCC();
575
576 KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
577 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
578 c, ldc, m_from, xxx);
579
580 STOP_RPCC(kernel);
581 #ifdef TIMING
582 ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
583 #endif
584 }
585
586 if (m_to - m_from == min_i) {
587 job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
588 }
589 }
590 } while (current != mypos);
591
592 for(is = m_from + min_i; is < m_to; is += min_i){
593 min_i = m_to - is;
594 if (min_i >= GEMM3M_P * 2) {
595 min_i = GEMM3M_P;
596 } else
597 if (min_i > GEMM3M_P) {
598 min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
599 }
600
601 START_RPCC();
602
603 ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
604
605 STOP_RPCC(copy_A);
606
607 current = mypos;
608 do {
609
610 div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
611
612 for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
613
614 START_RPCC();
615
616 KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
617 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
618 c, ldc, is, xxx);
619
620 STOP_RPCC(kernel);
621 #ifdef TIMING
622 ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
623 #endif
624 if (is + min_i >= m_to) {
625 /* Thread doesn't need this buffer any more */
626 job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
627 }
628 }
629
630 current ++;
631 if (current >= args -> nthreads) current = 0;
632
633 } while (current != mypos);
634
635 } /* end of is */
636
637
638 START_RPCC();
639
640 ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
641
642 STOP_RPCC(copy_A);
643
644 div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
645
646 for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
647
648 START_RPCC();
649
650 /* Make sure if no one is using another buffer */
651 for (i = 0; i < args -> nthreads; i++)
652 while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
653
654 STOP_RPCC(waiting1);
655
656 for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
657 min_jj = MIN(n_to, xxx + div_n) - jjs;
658 if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
659
660 START_RPCC();
661
662 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
663 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
664 #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
665 OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
666 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
667 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
668 #else
669 OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
670 #endif
671
672 STOP_RPCC(copy_B);
673
674 START_RPCC();
675
676 KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
677 sa, buffer[bufferside] + min_l * (jjs - xxx),
678 c, ldc, m_from, jjs);
679
680 STOP_RPCC(kernel);
681 #ifdef TIMING
682 ops += 2 * min_i * min_jj * min_l;
683 #endif
684
685 }
686
687 for (i = 0; i < args -> nthreads; i++)
688 job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
689 }
690
691 current = mypos;
692
693 do {
694 current ++;
695 if (current >= args -> nthreads) current = 0;
696
697 div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
698
699 for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
700
701 if (current != mypos) {
702
703 START_RPCC();
704
705 /* thread has to wait */
706 while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
707
708 STOP_RPCC(waiting2);
709
710 START_RPCC();
711
712 KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
713 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
714 c, ldc, m_from, xxx);
715
716 STOP_RPCC(kernel);
717 #ifdef TIMING
718 ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
719 #endif
720 }
721
722 if (m_to - m_from == min_i) {
723 job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
724 }
725 }
726 } while (current != mypos);
727
728 for(is = m_from + min_i; is < m_to; is += min_i){
729 min_i = m_to - is;
730 if (min_i >= GEMM3M_P * 2) {
731 min_i = GEMM3M_P;
732 } else
733 if (min_i > GEMM3M_P) {
734 min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
735 }
736
737 START_RPCC();
738
739 ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
740
741 STOP_RPCC(copy_A);
742
743 current = mypos;
744 do {
745
746 div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
747
748 for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
749
750 START_RPCC();
751
752 KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
753 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
754 c, ldc, is, xxx);
755
756 STOP_RPCC(kernel);
757 #ifdef TIMING
758 ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
759 #endif
760 if (is + min_i >= m_to) {
761 /* Thread doesn't need this buffer any more */
762 job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
763 }
764 }
765
766 current ++;
767 if (current >= args -> nthreads) current = 0;
768
769 } while (current != mypos);
770
771 } /* end of is */
772
773 }
774
775 START_RPCC();
776
777 for (i = 0; i < args -> nthreads; i++) {
778 for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
779 while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
780 }
781 }
782
783 STOP_RPCC(waiting3);
784
785 #ifdef TIMING
786 BLASLONG waiting = waiting1 + waiting2 + waiting3;
787 BLASLONG total = copy_A + copy_B + kernel + waiting;
788
789 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n",
790 mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
791 (double)waiting /(double)total * 100.,
792 (double)ops/(double)kernel / 2. * 100.);
793
794 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
795 mypos, copy_A, copy_B, waiting);
796
797 #if 0
798 fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
799 mypos,
800 (double)waiting1/(double)waiting * 100.,
801 (double)waiting2/(double)waiting * 100.,
802 (double)waiting3/(double)waiting * 100.);
803 #endif
804 fprintf(stderr, "\n");
805 #endif
806
807
808
809 return 0;
810 }
811
gemm_driver(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)812 static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
813 *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
814
815 blas_arg_t newarg;
816
817 blas_queue_t queue[MAX_CPU_NUMBER];
818
819 BLASLONG range_M[MAX_CPU_NUMBER + 1];
820 BLASLONG range_N[MAX_CPU_NUMBER + 1];
821
822 job_t job[MAX_CPU_NUMBER];
823
824 BLASLONG num_cpu_m, num_cpu_n;
825
826 BLASLONG nthreads = args -> nthreads;
827
828 BLASLONG width, i, j, k, js;
829 BLASLONG m, n, n_from, n_to;
830 int mode;
831
832 #ifdef XDOUBLE
833 mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
834 #elif defined(DOUBLE)
835 mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
836 #else
837 mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
838 #endif
839
840 newarg.m = args -> m;
841 newarg.n = args -> n;
842 newarg.k = args -> k;
843 newarg.a = args -> a;
844 newarg.b = args -> b;
845 newarg.c = args -> c;
846 newarg.lda = args -> lda;
847 newarg.ldb = args -> ldb;
848 newarg.ldc = args -> ldc;
849 newarg.alpha = args -> alpha;
850 newarg.beta = args -> beta;
851 newarg.nthreads = args -> nthreads;
852 newarg.common = (void *)job;
853
854 if (!range_m) {
855 range_M[0] = 0;
856 m = args -> m;
857 } else {
858 range_M[0] = range_m[0];
859 m = range_m[1] - range_m[0];
860 }
861
862 num_cpu_m = 0;
863
864 while (m > 0){
865
866 width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
867
868 m -= width;
869 if (m < 0) width = width + m;
870
871 range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;
872
873 num_cpu_m ++;
874 }
875
876 for (i = 0; i < num_cpu_m; i++) {
877 queue[i].mode = mode;
878 queue[i].routine = inner_thread;
879 queue[i].args = &newarg;
880 queue[i].range_m = &range_M[i];
881 queue[i].range_n = &range_N[0];
882 queue[i].sa = NULL;
883 queue[i].sb = NULL;
884 queue[i].next = &queue[i + 1];
885 }
886
887 queue[0].sa = sa;
888 queue[0].sb = sb;
889
890 if (!range_n) {
891 n_from = 0;
892 n_to = args -> n;
893 } else {
894 n_from = range_n[0];
895 n_to = range_n[1];
896 }
897
898 for(js = n_from; js < n_to; js += GEMM_R * nthreads){
899 n = n_to - js;
900 if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
901
902 range_N[0] = js;
903
904 num_cpu_n = 0;
905
906 while (n > 0){
907
908 width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
909
910 n -= width;
911 if (n < 0) width = width + n;
912
913 range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
914
915 num_cpu_n ++;
916 }
917
918 for (j = 0; j < num_cpu_m; j++) {
919 for (i = 0; i < num_cpu_m; i++) {
920 for (k = 0; k < DIVIDE_RATE; k++) {
921 job[j].working[i][CACHE_LINE_SIZE * k] = 0;
922 }
923 }
924 }
925
926 queue[num_cpu_m - 1].next = NULL;
927
928 exec_blas(num_cpu_m, queue);
929 }
930
931 return 0;
932 }
933
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)934 int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
935
936 BLASLONG m = args -> m;
937 BLASLONG n = args -> n;
938 BLASLONG nthreads = args -> nthreads;
939 BLASLONG divN, divT;
940 int mode;
941
942 if (range_m) {
943 BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
944 BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
945
946 m = m_to - m_from;
947 }
948
949 if (range_n) {
950 BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
951 BLASLONG n_to = *(((BLASLONG *)range_n) + 1);
952
953 n = n_to - n_from;
954 }
955
956 if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
957 GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
958 return 0;
959 }
960
961 divT = nthreads;
962 divN = 1;
963
964 while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
965 do {
966 divT --;
967 divN = 1;
968 while (divT * divN < nthreads) divN ++;
969 } while ((divT * divN != nthreads) && (divT > 1));
970 }
971
972 args -> nthreads = divT;
973
974 if (divN == 1){
975 gemm_driver(args, range_m, range_n, sa, sb, 0);
976 } else {
977 #ifdef XDOUBLE
978 mode = BLAS_XDOUBLE | BLAS_COMPLEX;
979 #elif defined(DOUBLE)
980 mode = BLAS_DOUBLE | BLAS_COMPLEX;
981 #else
982 mode = BLAS_SINGLE | BLAS_COMPLEX;
983 #endif
984
985 #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
986 defined(CN) || defined(CT) || defined(CR) || defined(CC)
987 mode |= (BLAS_TRANSA_T);
988 #endif
989 #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \
990 defined(NC) || defined(TC) || defined(RC) || defined(CC)
991 mode |= (BLAS_TRANSB_T);
992 #endif
993
994 gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
995 }
996
997 return 0;
998 }
999