/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif

#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

#ifndef SYRK_LOCAL
#if   !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL    SYRK_UN
#elif !defined(LOWER) &&  defined(TRANS)
#define SYRK_LOCAL    SYRK_UT
#elif  defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL    SYRK_LN
#else
#define SYRK_LOCAL    SYRK_LT
#endif
#endif

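/*
 * Shared synchronization table: one job_t per worker thread.
 * job[p].working[q][CACHE_LINE_SIZE * b] holds the address of producer
 * thread p's copy buffer b while consumer thread q still has to use it,
 * and is cleared once q is finished with it.  Entries are spaced
 * CACHE_LINE_SIZE apart so flags for different buffers do not share a
 * cache line.
 */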
typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;


#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#endif
#endif

#ifndef ICOPY_OPERATION
#ifndef TRANS
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef OCOPY_OPERATION
#ifdef TRANS
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef A
#define A	args -> a
#endif
#ifndef LDA
#define LDA	args -> lda
#endif
#ifndef C
#define C	args -> c
#endif
#ifndef LDC
#define LDC	args -> ldc
#endif
#ifndef M
#define M	args -> m
#endif
#ifndef N
#define N	args -> n
#endif
#ifndef K
#define K	args -> k
#endif

#undef TIMING

#ifdef TIMING
#define START_RPCC()		rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER)	COUNTER  += rpcc() - rpcc_counter
#else
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#endif

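/*
 * Body executed by each worker thread.  range_n[mypos] .. range_n[mypos + 1]
 * bounds the block of rows of C assigned to this thread.  The thread copies
 * its panels of A into shared buffers, updates its diagonal block, and
 * exchanges buffers with the other threads through the job table to build
 * the off-diagonal blocks.
 */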
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG ls, min_l, jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;
#ifdef LOWER
  BLASLONG start_i;
#endif

#ifdef TIMING
  BLASLONG rpcc_counter;
  BLASLONG copy_A = 0;
  BLASLONG copy_B = 0;
  BLASLONG kernel = 0;
  BLASLONG waiting1 = 0;
  BLASLONG waiting2 = 0;
  BLASLONG waiting3 = 0;
  BLASLONG waiting6[MAX_CPU_NUMBER];
  BLASLONG ops    = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = N;

  /* Global Range */
  n_from = 0;
  n_to   = N;

  if (range_n) {
    m_from = range_n[mypos + 0];
    m_to   = range_n[mypos + 1];

    n_from = range_n[0];
    n_to   = range_n[args -> nthreads];
  }

  if (beta) {
#if !defined(COMPLEX) || defined(HERK)
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
  }

  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#if defined(COMPLEX) && !defined(HERK)
      && (alpha[1] == ZERO)
#endif
      ) return 0;

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",  mypos, m_from, m_to, n_from, n_to);
#endif

  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
                                    + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }

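  /* Outer loop: block over the inner dimension K in chunks of at most GEMM_Q. */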
  for(ls = 0; ls < k; ls += min_l){

    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l  = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    min_i = m_to - m_from;

    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
        min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
      }
    }

#ifdef LOWER
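    /* Shrink the first panel so that the row count left after it is a multiple of GEMM_P. */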
    xxx = (m_to - m_from - min_i) % GEMM_P;

    if (xxx) min_i -= GEMM_P - xxx;
#endif

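    /* Copy one row panel of A for this thread's range into the private buffer sa. */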
    START_RPCC();

#ifndef LOWER
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
#else
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa);
#endif

    STOP_RPCC(copy_A);

    div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
                                      + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

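    /*
     * Copy this thread's column panels of A into the shared buffers, run the
     * kernels for the diagonal block, and publish each buffer through the job
     * table so the other threads can reuse it for their off-diagonal blocks.
     */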
    for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {

      START_RPCC();

      /* Make sure no other thread is still using this buffer */
#ifndef LOWER
      for (i = 0; i < mypos; i++)
#else
      for (i = mypos + 1; i < args -> nthreads; i++)
#endif
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};

      STOP_RPCC(waiting1);

#ifndef LOWER

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

        min_jj = MIN(m_to, xxx + div_n) - jjs;

        if (xxx == m_from) {
          if (min_jj > min_i) min_jj = min_i;
        } else {
          if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
        }

        START_RPCC();

        OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs,
                        buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);

        STOP_RPCC(copy_B);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
                         sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
                         c, ldc, m_from, jjs);

        STOP_RPCC(kernel);

#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif

      }

#else

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

        min_jj = MIN(m_to, xxx + div_n) - jjs;

        if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;

        START_RPCC();

        OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs,
                        buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);

        STOP_RPCC(copy_B);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
                         sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
                         c, ldc, m_to - min_i, jjs);

        STOP_RPCC(kernel);

#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif

      }

#endif

#ifndef LOWER
      for (i = 0; i <= mypos; i++)
#else
      for (i = mypos; i < args -> nthreads; i++)
#endif
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];

      WMB;
    }


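    /*
     * Consume the buffers published by the other threads: multiply the local
     * panel of A (sa) with their copied panels to update the off-diagonal
     * blocks of C, clearing a flag once this thread no longer needs the buffer.
     */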
#ifndef LOWER
    current = mypos + 1;
    while (current < args -> nthreads) {
#else
    current = mypos - 1;
    while (current >= 0) {
#endif

      div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
               + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

        START_RPCC();

        /* Wait until thread "current" has published this buffer */
        while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

        STOP_RPCC(waiting2);

        START_RPCC();

#ifndef LOWER
        KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
                         sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                         c, ldc,
                         m_from,
                         xxx);
#else
        KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
                         sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                         c, ldc,
                         m_to - min_i,
                         xxx);
#endif

        STOP_RPCC(kernel);
#ifdef TIMING
        ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif

        if (m_to - m_from == min_i) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
        }
      }

#ifndef LOWER
      current ++;
#else
      current --;
#endif
    }

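    /*
     * Handle the remaining row panels of this thread's range: copy the next
     * panel of A into sa and multiply it with every buffer this thread still
     * holds (its own and the other threads'), releasing each buffer after the
     * last panel that needs it.
     */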
#ifndef LOWER
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
#else
    start_i = min_i;

    for(is = m_from; is < m_to - start_i; is += min_i){
      min_i = m_to - start_i - is;
#endif

      if (min_i >= GEMM_P * 2) {
        min_i = GEMM_P;
      } else
        if (min_i > GEMM_P) {
          min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
        }

      START_RPCC();

      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);

      STOP_RPCC(copy_A);

      current = mypos;

      do {

        div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
                 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

          START_RPCC();

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, xxx);

          STOP_RPCC(kernel);

#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif

#ifndef LOWER
          if (is + min_i >= m_to) {
#else
          if (is + min_i >= m_to - start_i) {
#endif
            /* Thread doesn't need this buffer any more */
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
            WMB;
          }
        }

#ifndef LOWER
        current ++;
      } while (current != args -> nthreads);
#else
        current --;
      } while (current >= 0);
#endif


    }
  }

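  /* Wait until every other thread has finished with the buffers this thread published. */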
  START_RPCC();

  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
        while (job[mypos].working[i][CACHE_LINE_SIZE * xxx]) {YIELDING;};
      }
    }
  }

  STOP_RPCC(waiting3);

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	  (double)waiting1 /(double)total * 100.,
	  (double)waiting2 /(double)total * 100.,
	  (double)waiting3 /(double)total * 100.,
	  (double)ops/(double)kernel / 4. * 100.);

#if 0
  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	  mypos, copy_A, copy_B, waiting);

  fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
	  mypos,
	  (double)waiting1/(double)waiting * 100.,
	  (double)waiting2/(double)waiting * 100.,
	  (double)waiting3/(double)waiting * 100.);
#endif
  fprintf(stderr, "\n");
#endif

  return 0;
}

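/*
 * Entry point: partitions the n rows/columns of C across the worker threads
 * and dispatches inner_thread on each partition via exec_blas.  Small
 * problems fall back to the single-threaded SYRK_LOCAL kernel.
 */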
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  blas_arg_t newarg;

  job_t          job[MAX_CPU_NUMBER];
  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range[MAX_CPU_NUMBER + 100];

  BLASLONG num_cpu;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k;
  BLASLONG n, n_from, n_to;
  int  mode, mask;
  double dnum;

  if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
    SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
    return 0;
  }

#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
  mask  = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
  mask  = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
  mask  = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
  mask  = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
  mask  = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
  mask  = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif
#endif

  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;
  newarg.common   = (void *)job;

  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1] - range_n[0];
  }

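  /*
   * Partition the n columns among the threads so each gets roughly equal
   * work.  For a triangular update the cost of a panel of width w starting
   * at column i grows with (i + w)^2 - i^2, so the widths follow
   * sqrt(i^2 + n^2 / nthreads) - i, rounded to the unroll boundary.
   */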
#ifndef LOWER

  range[MAX_CPU_NUMBER] = n_to - n_from;
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){

    if (nthreads - num_cpu > 1) {

      double di   = (double)i;

      width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);

      if (num_cpu == 0) width = n - ((n - width) & ~mask);

      if ((width > n - i) || (width < mask)) width = n - i;

    } else {
      width = n - i;
    }

    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;

    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

  for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];

#else

  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){

    if (nthreads - num_cpu > 1) {

      double di   = (double)i;

      width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);

      if ((width > n - i) || (width < mask)) width = n - i;

    } else {
      width = n - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;
    queue[num_cpu].range_n = range;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#endif

  newarg.nthreads = num_cpu;

  if (num_cpu) {

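    /* Clear every job flag before handing the queue to the worker threads. */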
    for (j = 0; j < num_cpu; j++) {
      for (i = 0; i < num_cpu; i++) {
        for (k = 0; k < DIVIDE_RATE; k++) {
          job[j].working[i][CACHE_LINE_SIZE * k] = 0;
        }
      }
    }

    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }


  return 0;
}