1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
13 
14 #if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
15 #ifdef FLA_ENABLE_TIDSP
16 #include <ti/omp/omp.h>
17 #else
18 #include <omp.h>
19 #endif
20 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
21 #include <pthread.h>
22 #endif
23 
24 
#ifdef FLA_ENABLE_SUPERMATRIX

// The global task queue. FLASH_Queue_reset() clears its n_tasks, head, and
// tail fields, and FLASH_Queue_push() appends newly created tasks to it.
FLASH_Queue           _tq;

// Set by FLASH_Queue_init() and cleared by FLASH_Queue_finalize(); both use
// it to make repeated calls harmless.
static FLA_Bool       flash_queue_initialized     = FALSE;

// Counters advanced during dependence analysis (FLASH_Queue_push_input() and
// FLASH_Queue_push_output()) as blocks are first encountered; the current
// value is stamped into the block as its id. Zeroed by FLASH_Queue_reset().
static int            flash_queue_n_read_blocks   = 0;
static int            flash_queue_n_write_blocks  = 0;

// User-adjustable settings, each exposed through a set/get function pair
// in this file.
static FLASH_Verbose  flash_queue_verbose         = FLASH_QUEUE_VERBOSE_NONE;
static FLA_Bool       flash_queue_sorting         = FALSE;
static FLA_Bool       flash_queue_caching         = FALSE;
static FLA_Bool       flash_queue_work_stealing   = FALSE;
static FLASH_Data_aff flash_queue_data_affinity   = FLASH_QUEUE_AFFINITY_NONE;

// flash_queue_total_time holds the FLA_Clock() start stamp while a
// begin/end region is open and the elapsed time once the outermost region
// closes. flash_queue_parallel_time is stored verbatim by
// FLASH_Queue_set_parallel_time().
static double         flash_queue_total_time      = 0.0;
static double         flash_queue_parallel_time   = 0.0;

// Block and cache geometry. The block size only ever grows (see
// FLASH_Queue_set_block_size()).
// NOTE(review): the cache sizes are presumably in bytes (2 MiB and 64) --
// confirm against the code that consumes them.
static dim_t          flash_queue_block_size      = 0;
static dim_t          flash_queue_cache_size      = 2 * 1024 * 1024;
static dim_t          flash_queue_cache_line_size = 64;

// Topology hints, each exposed through a set/get function pair in this file.
static int            flash_queue_cores_per_cache = 1;
static int            flash_queue_cores_per_queue = 0;

#endif


// Nesting depth of FLASH_Queue_begin()/FLASH_Queue_end() regions.
static unsigned int   flash_queue_stack           = 0;
// Whether enqueuing is enabled; may only be changed while the depth is zero
// when SuperMatrix is configured.
static FLA_Bool       flash_queue_enabled         = TRUE;

// Thread count recorded by FLASH_Queue_set_num_threads().
static unsigned int   flash_queue_n_threads       = 1;
57 
58 
FLASH_Queue_begin(void)59 void FLASH_Queue_begin( void )
60 /*----------------------------------------------------------------------------
61 
62    FLASH_Queue_begin
63 
64 ----------------------------------------------------------------------------*/
65 {
66 #ifdef FLA_ENABLE_SUPERMATRIX
67    if ( flash_queue_stack == 0 )
68    {
69       // Save the starting time for the total execution time.
70       flash_queue_total_time = FLA_Clock();
71    }
72 #endif
73 
74    // Push onto the stack.
75    flash_queue_stack++;
76 
77    return;
78 }
79 
80 
FLASH_Queue_end(void)81 void FLASH_Queue_end( void )
82 /*----------------------------------------------------------------------------
83 
84    FLASH_Queue_end
85 
86 ----------------------------------------------------------------------------*/
87 {
88    // Pop off the stack.
89    flash_queue_stack--;
90 
91 #ifdef FLA_ENABLE_SUPERMATRIX
92    if ( flash_queue_stack == 0 )
93    {
94       // Execute tasks if encounter the outermost parallel region.
95       FLASH_Queue_exec();
96 
97       // Find the total execution time.
98       flash_queue_total_time = FLA_Clock() - flash_queue_total_time;
99    }
100 #endif
101 
102    return;
103 }
104 
105 
FLASH_Queue_stack_depth(void)106 unsigned int FLASH_Queue_stack_depth( void )
107 /*----------------------------------------------------------------------------
108 
109    FLASH_Queue_stack_depth
110 
111 ----------------------------------------------------------------------------*/
112 {
113    return flash_queue_stack;
114 }
115 
116 
FLASH_Queue_enable(void)117 FLA_Error FLASH_Queue_enable( void )
118 /*----------------------------------------------------------------------------
119 
120    FLASH_Queue_enable
121 
122 ----------------------------------------------------------------------------*/
123 {
124 #ifdef FLA_ENABLE_SUPERMATRIX
125    if ( flash_queue_stack == 0 )
126    {
127       // Enable if not begin parallel region yet.
128       flash_queue_enabled = TRUE;
129       return FLA_SUCCESS;
130    }
131    else
132    {
133       // Cannot change status during parallel region.
134       return FLA_FAILURE;
135    }
136 #else
137    // Raise an exception when SuperMatrix is not configured.
138    FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED );
139    return FLA_FAILURE;
140 #endif
141 }
142 
143 
FLASH_Queue_disable(void)144 FLA_Error FLASH_Queue_disable( void )
145 /*----------------------------------------------------------------------------
146 
147    FLASH_Queue_disable
148 
149 ----------------------------------------------------------------------------*/
150 {
151 #ifdef FLA_ENABLE_SUPERMATRIX
152    if ( flash_queue_stack == 0 )
153    {
154       // Disable if not begin parallel region yet.
155       flash_queue_enabled = FALSE;
156       return FLA_SUCCESS;
157    }
158    else
159    {
160       // Cannot change status during parallel region.
161       return FLA_FAILURE;
162    }
163 #else
164    // Allow disabling enqueuing even when SuperMatrix is not configured.
165    flash_queue_enabled = FALSE;
166    return FLA_SUCCESS;
167 #endif
168 }
169 
170 
FLASH_Queue_get_enabled(void)171 FLA_Bool FLASH_Queue_get_enabled( void )
172 /*----------------------------------------------------------------------------
173 
174    FLASH_Queue_get_enabled
175 
176 ----------------------------------------------------------------------------*/
177 {
178    // Return if enabled, but always false if SuperMatrix is not configured.
179 #ifdef FLA_ENABLE_SUPERMATRIX
180    return flash_queue_enabled;
181 #else
182    return FALSE;
183 #endif
184 }
185 
186 
FLASH_Queue_set_num_threads(unsigned int n_threads)187 void FLASH_Queue_set_num_threads( unsigned int n_threads )
188 /*----------------------------------------------------------------------------
189 
190    FLASH_Queue_set_num_threads
191 
192 ----------------------------------------------------------------------------*/
193 {
194    FLA_Error e_val;
195 
196    // Verify that the number of threads is positive.
197    e_val = FLA_Check_num_threads( n_threads );
198    FLA_Check_error_code( e_val );
199 
200    // Keep track of the number of threads internally.
201    flash_queue_n_threads = n_threads;
202 
203 #if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
204 
205    // No additional action is necessary to set the number of OpenMP threads
206    // since setting the number of threads is handled at the parallel for loop
207    // with a num_threads() clause. This gives the user more flexibility since
208    // he can use the OMP_NUM_THREADS environment variable or the
209    // omp_set_num_threads() function to set the global number of OpenMP threads
210    // independently of the number of SuperMatrix threads.
211 
212 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
213 
214    // No additional action is necessary to set the number of pthreads
215    // since setting the number of threads is handled entirely on our end.
216 
217 #endif
218 
219    return;
220 }
221 
222 
FLASH_Queue_get_num_threads(void)223 unsigned int FLASH_Queue_get_num_threads( void )
224 /*----------------------------------------------------------------------------
225 
226    FLASH_Queue_get_num_threads
227 
228 ----------------------------------------------------------------------------*/
229 {
230    return flash_queue_n_threads;
231 }
232 
233 
234 #ifdef FLA_ENABLE_SUPERMATRIX
235 
236 
FLASH_Queue_init(void)237 void FLASH_Queue_init( void )
238 /*----------------------------------------------------------------------------
239 
240    FLASH_Queue_init
241 
242 ----------------------------------------------------------------------------*/
243 {
244    // Exit early if we're already initialized.
245    if ( flash_queue_initialized == TRUE )
246       return;
247 
248    // Reset all the initial values.
249    FLASH_Queue_reset();
250 
251    // Set the initialized flag.
252    flash_queue_initialized = TRUE;
253 
254 #ifdef FLA_ENABLE_GPU
255    FLASH_Queue_init_gpu();
256 #endif
257 
258    return;
259 }
260 
261 
FLASH_Queue_finalize(void)262 void FLASH_Queue_finalize( void )
263 /*----------------------------------------------------------------------------
264 
265    FLASH_Queue_finalize
266 
267 ----------------------------------------------------------------------------*/
268 {
269    // Exit early if we're not already initialized.
270    if ( flash_queue_initialized == FALSE )
271       return;
272 
273    // Clear the initialized flag.
274    flash_queue_initialized = FALSE;
275 
276 #ifdef FLA_ENABLE_GPU
277    FLASH_Queue_finalize_gpu();
278 #endif
279 
280    return;
281 }
282 
283 
FLASH_Queue_get_num_tasks(void)284 unsigned int FLASH_Queue_get_num_tasks( void )
285 /*----------------------------------------------------------------------------
286 
287    FLASH_Queue_get_num_tasks
288 
289 ----------------------------------------------------------------------------*/
290 {
291    return _tq.n_tasks;
292 }
293 
294 
void FLASH_Queue_set_verbose_output( FLASH_Verbose verbose )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_verbose_output

   Stores the requested verbosity setting; it is handed back unchanged by
   FLASH_Queue_get_verbose_output().

----------------------------------------------------------------------------*/
{
   // Plain assignment: the value is not validated here.
   flash_queue_verbose = verbose;
}
306 
307 
FLASH_Queue_get_verbose_output(void)308 FLASH_Verbose FLASH_Queue_get_verbose_output( void )
309 /*----------------------------------------------------------------------------
310 
311    FLASH_Queue_get_verbose_output
312 
313 ----------------------------------------------------------------------------*/
314 {
315    return flash_queue_verbose;
316 }
317 
318 
void FLASH_Queue_set_sorting( FLA_Bool sorting )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_sorting

   Stores the sorting flag; it is handed back unchanged by
   FLASH_Queue_get_sorting().

----------------------------------------------------------------------------*/
{
   // Plain assignment: the value is not validated here.
   flash_queue_sorting = sorting;
}
330 
331 
FLASH_Queue_get_sorting(void)332 FLA_Bool FLASH_Queue_get_sorting( void )
333 /*----------------------------------------------------------------------------
334 
335    FLASH_Queue_get_sorting
336 
337 ----------------------------------------------------------------------------*/
338 {
339    return flash_queue_sorting;
340 }
341 
342 
void FLASH_Queue_set_caching( FLA_Bool caching )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_caching

   Stores the caching flag; it is handed back unchanged by
   FLASH_Queue_get_caching().

----------------------------------------------------------------------------*/
{
   // Plain assignment: the value is not validated here.
   flash_queue_caching = caching;
}
354 
355 
FLASH_Queue_get_caching(void)356 FLA_Bool FLASH_Queue_get_caching( void )
357 /*----------------------------------------------------------------------------
358 
359    FLASH_Queue_get_caching
360 
361 ----------------------------------------------------------------------------*/
362 {
363    return flash_queue_caching;
364 }
365 
366 
void FLASH_Queue_set_work_stealing( FLA_Bool work_stealing )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_work_stealing

   Stores the work-stealing flag; it is handed back unchanged by
   FLASH_Queue_get_work_stealing().

----------------------------------------------------------------------------*/
{
   // Plain assignment: the value is not validated here.
   flash_queue_work_stealing = work_stealing;
}
378 
379 
FLASH_Queue_get_work_stealing(void)380 FLA_Bool FLASH_Queue_get_work_stealing( void )
381 /*----------------------------------------------------------------------------
382 
383    FLASH_Queue_get_work_stealing
384 
385 ----------------------------------------------------------------------------*/
386 {
387    return flash_queue_work_stealing;
388 }
389 
390 
void FLASH_Queue_set_data_affinity( FLASH_Data_aff data_affinity )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_data_affinity

   Stores the data-affinity setting; it is handed back unchanged by
   FLASH_Queue_get_data_affinity().

----------------------------------------------------------------------------*/
{
   // Plain assignment: the value is not validated here.
   flash_queue_data_affinity = data_affinity;
}
402 
403 
FLASH_Queue_get_data_affinity(void)404 FLASH_Data_aff FLASH_Queue_get_data_affinity( void )
405 /*----------------------------------------------------------------------------
406 
407    FLASH_Queue_get_data_affinity
408 
409 ----------------------------------------------------------------------------*/
410 {
411    return flash_queue_data_affinity;
412 }
413 
414 
FLASH_Queue_get_total_time(void)415 double FLASH_Queue_get_total_time( void )
416 /*----------------------------------------------------------------------------
417 
418    FLASH_Queue_get_total_time
419 
420 ----------------------------------------------------------------------------*/
421 {
422    // Only return time if out of parallel region.
423    if ( flash_queue_stack == 0 )
424       return flash_queue_total_time;
425 
426    return 0.0;
427 }
428 
429 
FLASH_Queue_get_parallel_time(void)430 double FLASH_Queue_get_parallel_time( void )
431 /*----------------------------------------------------------------------------
432 
433    FLASH_Queue_get_parallel_time
434 
435 ----------------------------------------------------------------------------*/
436 {
437    // Only return time if out of parallel region.
438    if ( flash_queue_stack == 0 )
439       return flash_queue_parallel_time;
440 
441    return 0.0;
442 }
443 
444 
445 // --- helper functions --- ===================================================
446 
447 
FLASH_Queue_set_parallel_time(double dtime)448 void FLASH_Queue_set_parallel_time( double dtime )
449 /*----------------------------------------------------------------------------
450 
451    FLASH_Queue_set_parallel_time
452 
453 ----------------------------------------------------------------------------*/
454 {
455    flash_queue_parallel_time = dtime;
456 
457    return;
458 }
459 
460 
void FLASH_Queue_set_block_size( dim_t size )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_block_size

   Raises the recorded block size to size when size is strictly larger
   than the current value; smaller or equal values are ignored, so the
   recorded size acts as a running maximum.

----------------------------------------------------------------------------*/
{
   // Keep the largest block size seen so far.
   if ( size > flash_queue_block_size )
   {
      flash_queue_block_size = size;
   }
}
474 
475 
FLASH_Queue_get_block_size(void)476 dim_t FLASH_Queue_get_block_size( void )
477 /*----------------------------------------------------------------------------
478 
479    FLASH_Queue_get_block_size
480 
481 ----------------------------------------------------------------------------*/
482 {
483    return flash_queue_block_size;
484 }
485 
486 
void FLASH_Queue_set_cache_size( dim_t size )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_cache_size

   Stores the cache size setting; it is handed back unchanged by
   FLASH_Queue_get_cache_size().

----------------------------------------------------------------------------*/
{
   // Plain assignment: the value is not validated here.
   flash_queue_cache_size = size;
}
498 
499 
FLASH_Queue_get_cache_size(void)500 dim_t FLASH_Queue_get_cache_size( void )
501 /*----------------------------------------------------------------------------
502 
503    FLASH_Queue_get_cache_size
504 
505 ----------------------------------------------------------------------------*/
506 {
507    return flash_queue_cache_size;
508 }
509 
510 
void FLASH_Queue_set_cache_line_size( dim_t size )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_cache_line_size

   Stores the cache line size setting; it is handed back unchanged by
   FLASH_Queue_get_cache_line_size().

----------------------------------------------------------------------------*/
{
   // Plain assignment: the value is not validated here.
   flash_queue_cache_line_size = size;
}
522 
523 
FLASH_Queue_get_cache_line_size(void)524 dim_t FLASH_Queue_get_cache_line_size( void )
525 /*----------------------------------------------------------------------------
526 
527    FLASH_Queue_get_cache_line_size
528 
529 ----------------------------------------------------------------------------*/
530 {
531    return flash_queue_cache_line_size;
532 }
533 
534 
FLASH_Queue_set_cores_per_cache(int cores)535 void FLASH_Queue_set_cores_per_cache( int cores )
536 /*----------------------------------------------------------------------------
537 
538    FLASH_Queue_set_cores_per_cache
539 
540 ----------------------------------------------------------------------------*/
541 {
542    flash_queue_cores_per_cache = cores;
543 
544    return;
545 }
546 
547 
FLASH_Queue_get_cores_per_cache(void)548 int FLASH_Queue_get_cores_per_cache( void )
549 /*----------------------------------------------------------------------------
550 
551    FLASH_Queue_get_cores_per_cache
552 
553 ----------------------------------------------------------------------------*/
554 {
555    return flash_queue_cores_per_cache;
556 }
557 
558 
FLASH_Queue_set_cores_per_queue(int cores)559 void FLASH_Queue_set_cores_per_queue( int cores )
560 /*----------------------------------------------------------------------------
561 
562    FLASH_Queue_set_cores_per_queue
563 
564 ----------------------------------------------------------------------------*/
565 {
566    flash_queue_cores_per_queue = cores;
567 
568    return;
569 }
570 
571 
FLASH_Queue_get_cores_per_queue(void)572 int FLASH_Queue_get_cores_per_queue( void )
573 /*----------------------------------------------------------------------------
574 
575    FLASH_Queue_get_cores_per_queue
576 
577 ----------------------------------------------------------------------------*/
578 {
579    return flash_queue_cores_per_queue;
580 }
581 
582 
FLASH_Queue_reset(void)583 void FLASH_Queue_reset( void )
584 /*----------------------------------------------------------------------------
585 
586    FLASH_Queue_reset
587 
588 ----------------------------------------------------------------------------*/
589 {
590    // Clear the other fields of the FLASH_Queue structure.
591    _tq.n_tasks = 0;
592    _tq.head    = NULL;
593    _tq.tail    = NULL;
594 
595    // Reset the number of blocks.
596    flash_queue_n_read_blocks  = 0;
597    flash_queue_n_write_blocks = 0;
598 
599    return;
600 }
601 
602 
FLASH_Queue_get_head_task(void)603 FLASH_Task* FLASH_Queue_get_head_task( void )
604 /*----------------------------------------------------------------------------
605 
606    FLASH_Queue_get_head_task
607 
608 ----------------------------------------------------------------------------*/
609 {
610    return _tq.head;
611 }
612 
613 
FLASH_Queue_get_tail_task(void)614 FLASH_Task* FLASH_Queue_get_tail_task( void )
615 /*----------------------------------------------------------------------------
616 
617    FLASH_Queue_get_tail_task
618 
619 ----------------------------------------------------------------------------*/
620 {
621    return _tq.tail;
622 }
623 
624 
void FLASH_Queue_push( void* func,
                       void* cntl,
                       char* name,
                       FLA_Bool enabled_gpu,
                       int n_int_args,
                       int n_fla_args,
                       int n_input_args,
                       int n_output_args,
                       ... )
/*----------------------------------------------------------------------------

   FLASH_Queue_push

   Creates a task, fills in its arguments from the variable argument list,
   performs dependence analysis on its input and output operands, and
   appends it to the tail of the global task queue.

   func, cntl, name, enabled_gpu
                  Forwarded unchanged to FLASH_Task_alloc(), which stores
                  them in the task.
   n_int_args     Number of int values at the start of the variable list.
   n_fla_args     Number of FLA_Obj values that follow the ints; they are
                  stored but take no part in dependence analysis.
   n_input_args   Number of input FLA_Obj operands that follow.
   n_output_args  Number of output FLA_Obj operands that follow.
   ...            The arguments themselves, in exactly the order above.

   An operand whose element type is FLA_MATRIX is treated as a macroblock:
   each of its blocks is analyzed individually.

----------------------------------------------------------------------------*/
{
   int         i;
   va_list     var_arg_list;
   FLASH_Task* t;
   FLA_Obj     obj;

   // Allocate a new FLA_Task and populate its fields with appropriate values.
   t = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
                         n_int_args, n_fla_args,
                         n_input_args, n_output_args );

   // Initialize variable argument environment. In case you're wondering, the
   // second argument in this macro invocation of va_start() is supposed to be
   // the parameter that immediately precedes the variable argument list
   // (ie: the ... above ).
   va_start( var_arg_list, n_output_args );

   // Extract the integer arguments.
   for ( i = 0; i < n_int_args; i++ )
      t->int_arg[i] = va_arg( var_arg_list, int );

   // Extract the FLA_Obj arguments.
   for ( i = 0; i < n_fla_args; i++ )
      t->fla_arg[i] = va_arg( var_arg_list, FLA_Obj );

   // Extract the input FLA_Obj arguments.
   for ( i = 0; i < n_input_args; i++ )
   {
      obj = va_arg( var_arg_list, FLA_Obj );
      t->input_arg[i] = obj;

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

         // Dependence analysis for each input block in macroblock.
         // Blocks are visited column by column; block (kk, jj) lives at
         // offset jj * cs + kk from buf.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               FLASH_Queue_push_input( *( buf + jj * cs + kk ), t );

         // Set the number of blocks in the macroblock subtracted by one
         // since we do not want to recount an operand for each n_input_arg.
         t->n_macro_args += m * n - 1;
      }
      else // Regular block.
      {
         // Dependence analysis for input operand.
         FLASH_Queue_push_input( obj, t );
      }
   }

   // Extract the output FLA_Obj arguments.
   for ( i = 0; i < n_output_args; i++ )
   {
      obj = va_arg( var_arg_list, FLA_Obj );
      t->output_arg[i] = obj;

      // Only assign data affinity to the first output block.
      // This must happen before FLASH_Queue_push_output() below, which
      // overwrites the block's write_task with t.
      if ( i == 0 )
      {
         FLA_Obj buf = obj;

         // Use the top left block of the macroblock.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            buf = *FLASH_OBJ_PTR_AT( obj );

         // A block that has no writer yet takes the next write-block index
         // as its queue; otherwise the task inherits the queue of the task
         // that last wrote the block.
         if ( buf.base->write_task == NULL )
            t->queue = flash_queue_n_write_blocks;
         else
            t->queue = buf.base->write_task->queue;
      }

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

         // Dependence analysis for each output block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               FLASH_Queue_push_output( *( buf + jj * cs + kk ), t );

         // Set the number of blocks in the macroblock subtracted by one
         // since we do not want to recount an operand for each n_output_arg.
         t->n_macro_args += m * n - 1;
      }
      else // Regular block.
      {
         // Dependence analysis for output operand.
         FLASH_Queue_push_output( obj, t );
      }
   }

   // Finalize the variable argument environment.
   va_end( var_arg_list );

   // Add the task to the tail of the queue (and the head if queue is empty).
   if ( _tq.n_tasks == 0 )
   {
      // First task: prev_task and order keep the NULL / 0 values assigned
      // by FLASH_Task_alloc().
      _tq.head = t;
      _tq.tail = t;
   }
   else
   {
      t->prev_task = _tq.tail;
      _tq.tail->next_task = t;
      _tq.tail            = t;

      // Determine the index of the task in the task queue.
      t->order = t->prev_task->order + 1;
   }

   // Increment the number of tasks.
   _tq.n_tasks++;

   return;
}
765 
766 
void FLASH_Queue_push_input( FLA_Obj obj,
                             FLASH_Task* t )
/*----------------------------------------------------------------------------

   FLASH_Queue_push_input

   Dependence analysis for one input block obj of task t.

   If some task is recorded as the last writer of the block, t is appended
   to that writer's dependent list (flow dependence). Otherwise t->n_ready
   is decremented and, if the block has no recorded readers either, the
   block is stamped with the next read-block id.

   Finally t is appended to the block's reader list unless it already is
   the most recently added reader, so that a later writer can record the
   anti-dependence.

----------------------------------------------------------------------------*/
{
   FLASH_Task* writer = obj.base->write_task;
   FLASH_Dep*  link;

   if ( writer != NULL )
   {
      // Flow dependence: t must wait for the last writer of this block.
      link = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

      link->task     = t;
      link->next_dep = NULL;

      // Append to the writer's dependent list, creating it when empty.
      if ( writer->n_dep_args != 0 )
         writer->dep_arg_tail->next_dep = link;
      else
         writer->dep_arg_head = link;

      writer->dep_arg_tail = link;
      writer->n_dep_args++;
   }
   else
   {
      // Nobody has written this block, so this operand imposes no wait.
      t->n_ready--;

      // First time the block is seen at all: give it an id used for freeing.
      if ( obj.base->n_read_tasks == 0 )
      {
         obj.base->n_read_blocks = flash_queue_n_read_blocks;

         flash_queue_n_read_blocks++;
      }
   }

   // Nothing more to record when t is already the latest reader.
   if ( obj.base->n_read_tasks != 0 &&
        obj.base->read_task_tail->task == t )
      return;

   // Register t as a reader of the block (potential anti-dependence).
   link = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

   link->task     = t;
   link->next_dep = NULL;

   if ( obj.base->n_read_tasks != 0 )
      obj.base->read_task_tail->next_dep = link;
   else
      obj.base->read_task_head = link;

   obj.base->read_task_tail = link;
   obj.base->n_read_tasks++;
}
840 
841 
void FLASH_Queue_push_output( FLA_Obj obj,
                              FLASH_Task* t )
/*----------------------------------------------------------------------------

   FLASH_Queue_push_output

   Dependence analysis for one output block obj of task t.

   Records a dependence from the block's previous writer (if any, and if it
   is not t itself), turns every pending reader of the block into an
   anti-dependence on t, empties the block's reader list, and finally marks
   t as the block's last writer.

----------------------------------------------------------------------------*/
{
   int         i;
   FLASH_Task* task;
   FLASH_Dep*  d;
   FLASH_Dep*  next_dep;

   // Assign tasks to threads with data affinity.
   if ( obj.base->write_task == NULL )
   {
      // No earlier writer, so this operand imposes no wait on t.
      t->n_ready--;

      // Save index in which this output block is first encountered.
      obj.base->n_write_blocks = flash_queue_n_write_blocks;

      // Number of blocks written if not written before.
      flash_queue_n_write_blocks++;

      // Add to number of blocks read if not written or read before.
      if ( obj.base->n_read_tasks == 0 )
      {
         // Identify each read block with an id for freeing.
         obj.base->n_read_blocks = flash_queue_n_read_blocks;

         flash_queue_n_read_blocks++;
      }
   }
   else
   { // Flow dependence potentially.
      // The last task to overwrite this block is not itself.
      if ( obj.base->write_task != t )
      {
         // Create dependency from task that last wrote the block.
         task = obj.base->write_task;

         d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

         d->task     = t;
         d->next_dep = NULL;

         // Append d to the previous writer's list of dependent tasks.
         if ( task->n_dep_args == 0 )
         {
            task->dep_arg_head = d;
            task->dep_arg_tail = d;
         }
         else
         {
            task->dep_arg_tail->next_dep = d;
            task->dep_arg_tail           = d;
         }

         task->n_dep_args++;
      }
      else
      {
         // No need to notify task twice for output block already seen.
         t->n_ready--;
      }
   }

   // Clear read task for next set of reads and record the anti-dependence.
   d = obj.base->read_task_head;

   for ( i = 0; i < obj.base->n_read_tasks; i++ )
   {
      // Capture the reader and the successor first: the node d is either
      // relinked into another list or freed below.
      task     = d->task;
      next_dep = d->next_dep;

      // If the last task to read is not the current task, add dependence.
      if ( task != t )
      {
         // Reuse the reader-list node as the dependence node: it now names
         // t and is moved onto the reader task's list of dependent tasks.
         d->task     = t;
         d->next_dep = NULL;

         if ( task->n_dep_args == 0 )
         {
            task->dep_arg_head = d;
            task->dep_arg_tail = d;
         }
         else
         {
            task->dep_arg_tail->next_dep = d;
            task->dep_arg_tail           = d;
         }

         task->n_dep_args++;

         // Count the write-after-read dependence on the writing task.
         t->n_war_args++;
      }
      else
      {
         // t reads and writes the same block; the node is not needed.
         FLA_free( d );
      }

      d = next_dep;
   }

   // Every node was either moved or freed above, so the reader list is
   // now empty.
   obj.base->n_read_tasks   = 0;
   obj.base->read_task_head = NULL;
   obj.base->read_task_tail = NULL;

   // Record this task as the last to write to this block.
   obj.base->write_task = t;

   return;
}
954 
955 
FLASH_Task_alloc(void * func,void * cntl,char * name,FLA_Bool enabled_gpu,int n_int_args,int n_fla_args,int n_input_args,int n_output_args)956 FLASH_Task* FLASH_Task_alloc( void *func,
957                               void *cntl,
958                               char *name,
959                               FLA_Bool enabled_gpu,
960                               int n_int_args,
961                               int n_fla_args,
962                               int n_input_args,
963                               int n_output_args )
964 /*----------------------------------------------------------------------------
965 
966    FLASH_Task_alloc
967 
968 ----------------------------------------------------------------------------*/
969 {
970    FLASH_Task* t;
971 
972    // Allocate space for the task structure t.
973    t             = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );
974 
975    // Allocate space for the task's integer arguments.
976    t->int_arg    = (int *) FLA_malloc( n_int_args * sizeof(int) );
977 
978    // Allocate space for the task's FLA_Obj arguments.
979    t->fla_arg    = (FLA_Obj *) FLA_malloc( n_fla_args * sizeof(FLA_Obj) );
980 
981    // Allocate space for the task's input FLA_Obj arguments.
982    t->input_arg  = (FLA_Obj *) FLA_malloc( n_input_args * sizeof(FLA_Obj) );
983 
984    // Allocate space for the task's output FLA_Obj arguments.
985    t->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );
986 
987    // Initialize other fields of the structure.
988    t->n_ready       = 0;
989    t->order         = 0;
990    t->queue         = 0;
991    t->height        = 0;
992    t->thread        = 0;
993    t->cache         = 0;
994    t->hit           = FALSE;
995 
996    t->func          = func;
997    t->cntl          = cntl;
998    t->name          = name;
999    t->enabled_gpu   = enabled_gpu;
1000    t->n_int_args    = n_int_args;
1001    t->n_fla_args    = n_fla_args;
1002    t->n_input_args  = n_input_args;
1003    t->n_output_args = n_output_args;
1004 
1005    t->n_macro_args  = 0;
1006    t->n_war_args    = 0;
1007    t->n_dep_args    = 0;
1008    t->dep_arg_head  = NULL;
1009    t->dep_arg_tail  = NULL;
1010    t->prev_task     = NULL;
1011    t->next_task     = NULL;
1012    t->prev_wait     = NULL;
1013    t->next_wait     = NULL;
1014 
1015    // Return a pointer to the initialized structure.
1016    return t;
1017 }
1018 
1019 
FLASH_Task_free(FLASH_Task * t)1020 void FLASH_Task_free( FLASH_Task *t )
1021 /*----------------------------------------------------------------------------
1022 
1023    FLASH_Task_free
1024 
1025 ----------------------------------------------------------------------------*/
1026 {
1027    int        i, j, k;
1028    FLA_Obj    obj;
1029    FLASH_Dep* d;
1030    FLASH_Dep* next_dep;
1031 
1032    // Clearing the last write task in each output block.
1033    for ( i = 0; i < t->n_output_args; i++ )
1034    {
1035       obj = t->output_arg[i];
1036 
1037       // Macroblock is used.
1038       if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1039       {
1040          dim_t    jj, kk;
1041          dim_t    m    = FLA_Obj_length( obj );
1042          dim_t    n    = FLA_Obj_width( obj );
1043          dim_t    cs   = FLA_Obj_col_stride( obj );
1044          FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
1045 
1046          // Clear each block in macroblock.
1047          for ( jj = 0; jj < n; jj++ )
1048             for ( kk = 0; kk < m; kk++ )
1049                ( buf + jj * cs + kk )->base->write_task = NULL;
1050       }
1051       else // Clear regular block.
1052       {
1053          obj.base->write_task = NULL;
1054       }
1055    }
1056 
1057    // Cleaning the last read tasks in each input block.
1058    for ( i = 0; i < t->n_input_args; i++ )
1059    {
1060       obj = t->input_arg[i];
1061 
1062       // Macroblock is used.
1063       if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1064       {
1065          dim_t    jj, kk;
1066          dim_t    m    = FLA_Obj_length( obj );
1067          dim_t    n    = FLA_Obj_width( obj );
1068          dim_t    cs   = FLA_Obj_col_stride( obj );
1069          FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
1070 
1071          // Clear each block in macroblock.
1072          for ( jj = 0; jj < n; jj++ )
1073          {
1074             for ( kk = 0; kk < m; kk++ )
1075             {
1076                obj = *( buf + jj * cs + kk );
1077 
1078                k = obj.base->n_read_tasks;
1079                d = obj.base->read_task_head;
1080 
1081                obj.base->n_read_tasks   = 0;
1082                obj.base->read_task_head = NULL;
1083                obj.base->read_task_tail = NULL;
1084 
1085                for ( j = 0; j < k; j++ )
1086                {
1087                   next_dep = d->next_dep;
1088                   FLA_free( d );
1089                   d = next_dep;
1090                }
1091             }
1092          }
1093       }
1094       else // Regular block.
1095       {
1096          k = obj.base->n_read_tasks;
1097          d = obj.base->read_task_head;
1098 
1099          obj.base->n_read_tasks   = 0;
1100          obj.base->read_task_head = NULL;
1101          obj.base->read_task_tail = NULL;
1102 
1103          for ( j = 0; j < k; j++ )
1104          {
1105             next_dep = d->next_dep;
1106             FLA_free( d );
1107             d = next_dep;
1108          }
1109       }
1110    }
1111 
1112    // Free the dep_arg field of t.
1113    d = t->dep_arg_head;
1114 
1115    for ( i = 0; i < t->n_dep_args; i++ )
1116    {
1117       next_dep = d->next_dep;
1118       FLA_free( d );
1119       d = next_dep;
1120    }
1121 
1122    // Free the int_arg field of t.
1123    FLA_free( t->int_arg );
1124 
1125    // Free the fla_arg field of t.
1126    FLA_free( t->fla_arg );
1127 
1128    // Free the input_arg field of t.
1129    FLA_free( t->input_arg );
1130 
1131    // Free the output_arg field of t.
1132    FLA_free( t->output_arg );
1133 
1134    // Finally, free the struct itself.
1135    FLA_free( t );
1136 
1137    return;
1138 }
1139 
1140 
void FLASH_Queue_exec_task( FLASH_Task* t )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_task

   Run the operation held by task t.  The function pointer stored in t->func
   is compared against each known *_task routine; on a match it is called
   through the corresponding prototype, with its arguments taken from the
   task's int_arg, fla_arg, input_arg, and output_arg arrays and its control
   tree from t->cntl.  The value returned by the routine is discarded.

   A NULL task is ignored.  A task whose function matches none of the known
   routines is reported via FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ).

----------------------------------------------------------------------------*/
{
   // Prototypes through which the stored function pointer is invoked.

   // LAPACK-level
   typedef FLA_Error (*lu_piv_macro_task_fn)( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
   typedef FLA_Error (*apply_pivots_macro_task_fn)( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
   typedef FLA_Error (*lu_piv_task_fn)( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
   typedef FLA_Error (*lu_piv_copy_task_fn)( FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl );
   typedef FLA_Error (*trsm_piv_task_fn)( FLA_Obj A, FLA_Obj C, FLA_Obj p, fla_trsm_t* cntl );
   typedef FLA_Error (*sa_lu_task_fn)( FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, int nb_alg, fla_lu_t* cntl );
   typedef FLA_Error (*sa_fs_task_fn)( FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, int nb_alg, fla_gemm_t* cntl );
   typedef FLA_Error (*lu_nopiv_task_fn)( FLA_Obj A, fla_lu_t* cntl );
   typedef FLA_Error (*trinv_task_fn)( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl );
   typedef FLA_Error (*ttmm_task_fn)( FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl );
   typedef FLA_Error (*chol_task_fn)( FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl );
   typedef FLA_Error (*sylv_task_fn)( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
   typedef FLA_Error (*lyap_task_fn)( FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
   typedef FLA_Error (*qr_ut_macro_task_fn)( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
   typedef FLA_Error (*qr_ut_task_fn)( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
   typedef FLA_Error (*qr_ut_copy_task_fn)( FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl );
   typedef FLA_Error (*qr2_ut_task_fn)( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl );
   typedef FLA_Error (*lq_ut_macro_task_fn)( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );
   typedef FLA_Error (*caqr2_ut_task_fn)( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl );
   typedef FLA_Error (*uddate_ut_task_fn)( FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl );
   typedef FLA_Error (*apply_q_ut_task_fn)( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
   typedef FLA_Error (*apply_q2_ut_task_fn)( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl );
   typedef FLA_Error (*apply_caq2_ut_task_fn)( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl );
   typedef FLA_Error (*apply_qud_ut_task_fn)( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );
   typedef FLA_Error (*eig_gest_task_fn)( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

   // Level-3 BLAS
   typedef FLA_Error (*gemm_task_fn)( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
   typedef FLA_Error (*hemm_task_fn)( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
   typedef FLA_Error (*herk_task_fn)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
   typedef FLA_Error (*her2k_task_fn)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
   typedef FLA_Error (*symm_task_fn)( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
   typedef FLA_Error (*syrk_task_fn)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
   typedef FLA_Error (*syr2k_task_fn)( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
   typedef FLA_Error (*trmm_task_fn)( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trmm_t* cntl );
   typedef FLA_Error (*trsm_task_fn)( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trsm_t* cntl );

   // Level-2 BLAS
   typedef FLA_Error (*gemv_task_fn)( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
   typedef FLA_Error (*trsv_task_fn)( FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

   // Level-1 BLAS
   typedef FLA_Error (*axpy_task_fn)( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );
   typedef FLA_Error (*axpyt_task_fn)( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
   typedef FLA_Error (*copy_task_fn)( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );
   typedef FLA_Error (*copyt_task_fn)( FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
   typedef FLA_Error (*copyr_task_fn)( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
   typedef FLA_Error (*scal_task_fn)( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );
   typedef FLA_Error (*scalr_task_fn)( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

   // Base
   typedef FLA_Error (*obj_create_buffer_task_fn)( dim_t rs, dim_t cs, FLA_Obj A, void* cntl );
   typedef FLA_Error (*obj_free_buffer_task_fn)( FLA_Obj A, void* cntl );

   // Short aliases for the task's argument arrays.
   int*     iarg;
   FLA_Obj* farg;
   FLA_Obj* inp;
   FLA_Obj* outp;

   // A NULL task has nothing to execute.
   if ( t == NULL )
      return;

   iarg = t->int_arg;
   farg = t->fla_arg;
   inp  = t->input_arg;
   outp = t->output_arg;

   // Find the routine this task holds and invoke it; each match returns
   // as soon as its call completes.

   // FLA_LU_piv_macro
   if ( t->func == (void *) FLA_LU_piv_macro_task )
   {
      ( (lu_piv_macro_task_fn) t->func )( outp[0],
                                          outp[1],
                                          (fla_lu_t*) t->cntl );
      return;
   }

   // FLA_Apply_pivots_macro
   if ( t->func == (void *) FLA_Apply_pivots_macro_task )
   {
      ( (apply_pivots_macro_task_fn) t->func )( (FLA_Side)  iarg[0],
                                                (FLA_Trans) iarg[1],
                                                inp[0],
                                                outp[0],
                                                (fla_appiv_t*) t->cntl );
      return;
   }

   // FLA_LU_piv
   if ( t->func == (void *) FLA_LU_piv_task )
   {
      ( (lu_piv_task_fn) t->func )( outp[0],
                                    farg[0],
                                    (fla_lu_t*) t->cntl );
      return;
   }

   // FLA_LU_piv_copy
   if ( t->func == (void *) FLA_LU_piv_copy_task )
   {
      ( (lu_piv_copy_task_fn) t->func )( outp[0],
                                         farg[0],
                                         outp[1],
                                         (fla_lu_t*) t->cntl );
      return;
   }

   // FLA_Trsm_piv
   if ( t->func == (void *) FLA_Trsm_piv_task )
   {
      ( (trsm_piv_task_fn) t->func )( inp[0],
                                      outp[0],
                                      farg[0],
                                      (fla_trsm_t*) t->cntl );
      return;
   }

   // FLA_SA_LU
   if ( t->func == (void *) FLA_SA_LU_task )
   {
      ( (sa_lu_task_fn) t->func )( outp[1],
                                   outp[0],
                                   farg[0],
                                   farg[1],
                                   iarg[0],
                                   (fla_lu_t*) t->cntl );
      return;
   }

   // FLA_SA_FS
   if ( t->func == (void *) FLA_SA_FS_task )
   {
      ( (sa_fs_task_fn) t->func )( farg[0],
                                   inp[0],
                                   farg[1],
                                   outp[1],
                                   outp[0],
                                   iarg[0],
                                   (fla_gemm_t*) t->cntl );
      return;
   }

   // FLA_LU_nopiv
   if ( t->func == (void *) FLA_LU_nopiv_task )
   {
      ( (lu_nopiv_task_fn) t->func )( outp[0],
                                      (fla_lu_t*) t->cntl );
      return;
   }

   // FLA_Trinv
   if ( t->func == (void *) FLA_Trinv_task )
   {
      ( (trinv_task_fn) t->func )( (FLA_Uplo) iarg[0],
                                   (FLA_Diag) iarg[1],
                                   outp[0],
                                   (fla_trinv_t*) t->cntl );
      return;
   }

   // FLA_Ttmm
   if ( t->func == (void *) FLA_Ttmm_task )
   {
      ( (ttmm_task_fn) t->func )( (FLA_Uplo) iarg[0],
                                  outp[0],
                                  (fla_ttmm_t*) t->cntl );
      return;
   }

   // FLA_Chol
   if ( t->func == (void *) FLA_Chol_task )
   {
      ( (chol_task_fn) t->func )( (FLA_Uplo) iarg[0],
                                  outp[0],
                                  (fla_chol_t*) t->cntl );
      return;
   }

   // FLA_Sylv
   if ( t->func == (void *) FLA_Sylv_task )
   {
      ( (sylv_task_fn) t->func )( (FLA_Trans) iarg[0],
                                  (FLA_Trans) iarg[1],
                                  farg[0],
                                  inp[0],
                                  inp[1],
                                  outp[0],
                                  farg[1],
                                  (fla_sylv_t*) t->cntl );
      return;
   }

   // FLA_Lyap
   if ( t->func == (void *) FLA_Lyap_task )
   {
      ( (lyap_task_fn) t->func )( (FLA_Trans) iarg[0],
                                  farg[0],
                                  inp[0],
                                  outp[0],
                                  farg[1],
                                  (fla_lyap_t*) t->cntl );
      return;
   }

   // FLA_QR_UT_macro
   if ( t->func == (void *) FLA_QR_UT_macro_task )
   {
      ( (qr_ut_macro_task_fn) t->func )( outp[0],
                                         outp[1],
                                         (fla_qrut_t*) t->cntl );
      return;
   }

   // FLA_QR_UT
   if ( t->func == (void *) FLA_QR_UT_task )
   {
      ( (qr_ut_task_fn) t->func )( outp[0],
                                   farg[0],
                                   (fla_qrut_t*) t->cntl );
      return;
   }

   // FLA_QR_UT_copy
   if ( t->func == (void *) FLA_QR_UT_copy_task )
   {
      ( (qr_ut_copy_task_fn) t->func )( outp[0],
                                        farg[0],
                                        outp[1],
                                        (fla_qrut_t*) t->cntl );
      return;
   }

   // FLA_QR2_UT
   if ( t->func == (void *) FLA_QR2_UT_task )
   {
      ( (qr2_ut_task_fn) t->func )( outp[1],
                                    outp[0],
                                    farg[0],
                                    (fla_qr2ut_t*) t->cntl );
      return;
   }

   // FLA_LQ_UT_macro
   if ( t->func == (void *) FLA_LQ_UT_macro_task )
   {
      ( (lq_ut_macro_task_fn) t->func )( outp[0],
                                         outp[1],
                                         (fla_lqut_t*) t->cntl );
      return;
   }

   // FLA_CAQR2_UT
   if ( t->func == (void *) FLA_CAQR2_UT_task )
   {
      ( (caqr2_ut_task_fn) t->func )( outp[1],
                                      outp[0],
                                      farg[0],
                                      (fla_caqr2ut_t*) t->cntl );
      return;
   }

   // FLA_UDdate_UT
   if ( t->func == (void *) FLA_UDdate_UT_task )
   {
      ( (uddate_ut_task_fn) t->func )( outp[0],
                                       outp[1],
                                       outp[2],
                                       outp[3],
                                       (fla_uddateut_t*) t->cntl );
      return;
   }

   // FLA_Apply_Q_UT
   if ( t->func == (void *) FLA_Apply_Q_UT_task )
   {
      ( (apply_q_ut_task_fn) t->func )( (FLA_Side)   iarg[0],
                                        (FLA_Trans)  iarg[1],
                                        (FLA_Direct) iarg[2],
                                        (FLA_Store)  iarg[3],
                                        inp[0],
                                        farg[0],
                                        outp[1],
                                        outp[0],
                                        (fla_apqut_t*) t->cntl );
      return;
   }

   // FLA_Apply_Q2_UT
   if ( t->func == (void *) FLA_Apply_Q2_UT_task )
   {
      ( (apply_q2_ut_task_fn) t->func )( (FLA_Side)   iarg[0],
                                         (FLA_Trans)  iarg[1],
                                         (FLA_Direct) iarg[2],
                                         (FLA_Store)  iarg[3],
                                         inp[0],
                                         farg[0],
                                         outp[2],
                                         outp[1],
                                         outp[0],
                                         (fla_apq2ut_t*) t->cntl );
      return;
   }

   // FLA_Apply_CAQ2_UT
   if ( t->func == (void *) FLA_Apply_CAQ2_UT_task )
   {
      ( (apply_caq2_ut_task_fn) t->func )( (FLA_Side)   iarg[0],
                                           (FLA_Trans)  iarg[1],
                                           (FLA_Direct) iarg[2],
                                           (FLA_Store)  iarg[3],
                                           inp[0],
                                           farg[0],
                                           outp[2],
                                           outp[1],
                                           outp[0],
                                           (fla_apcaq2ut_t*) t->cntl );
      return;
   }

   // FLA_Apply_QUD_UT
   if ( t->func == (void *) FLA_Apply_QUD_UT_task )
   {
      ( (apply_qud_ut_task_fn) t->func )( (FLA_Side)   iarg[0],
                                          (FLA_Trans)  iarg[1],
                                          (FLA_Direct) iarg[2],
                                          (FLA_Store)  iarg[3],
                                          inp[0],
                                          outp[0],
                                          outp[1],
                                          inp[1],
                                          outp[2],
                                          inp[2],
                                          outp[3],
                                          (fla_apqudut_t*) t->cntl );
      return;
   }

   // FLA_Eig_gest
   if ( t->func == (void *) FLA_Eig_gest_task )
   {
      ( (eig_gest_task_fn) t->func )( (FLA_Inv)  iarg[0],
                                      (FLA_Uplo) iarg[1],
                                      outp[1],
                                      outp[0],
                                      inp[0],
                                      (fla_eig_gest_t*) t->cntl );
      return;
   }

   // FLA_Gemm
   if ( t->func == (void *) FLA_Gemm_task )
   {
      ( (gemm_task_fn) t->func )( (FLA_Trans) iarg[0],
                                  (FLA_Trans) iarg[1],
                                  farg[0],
                                  inp[0],
                                  inp[1],
                                  farg[1],
                                  outp[0],
                                  (fla_gemm_t*) t->cntl );
      return;
   }

   // FLA_Hemm
   if ( t->func == (void *) FLA_Hemm_task )
   {
      ( (hemm_task_fn) t->func )( (FLA_Side) iarg[0],
                                  (FLA_Uplo) iarg[1],
                                  farg[0],
                                  inp[0],
                                  inp[1],
                                  farg[1],
                                  outp[0],
                                  (fla_hemm_t*) t->cntl );
      return;
   }

   // FLA_Herk
   if ( t->func == (void *) FLA_Herk_task )
   {
      ( (herk_task_fn) t->func )( (FLA_Uplo)  iarg[0],
                                  (FLA_Trans) iarg[1],
                                  farg[0],
                                  inp[0],
                                  farg[1],
                                  outp[0],
                                  (fla_herk_t*) t->cntl );
      return;
   }

   // FLA_Her2k
   if ( t->func == (void *) FLA_Her2k_task )
   {
      ( (her2k_task_fn) t->func )( (FLA_Uplo)  iarg[0],
                                   (FLA_Trans) iarg[1],
                                   farg[0],
                                   inp[0],
                                   inp[1],
                                   farg[1],
                                   outp[0],
                                   (fla_her2k_t*) t->cntl );
      return;
   }

   // FLA_Symm
   if ( t->func == (void *) FLA_Symm_task )
   {
      ( (symm_task_fn) t->func )( (FLA_Side) iarg[0],
                                  (FLA_Uplo) iarg[1],
                                  farg[0],
                                  inp[0],
                                  inp[1],
                                  farg[1],
                                  outp[0],
                                  (fla_symm_t*) t->cntl );
      return;
   }

   // FLA_Syrk
   if ( t->func == (void *) FLA_Syrk_task )
   {
      ( (syrk_task_fn) t->func )( (FLA_Uplo)  iarg[0],
                                  (FLA_Trans) iarg[1],
                                  farg[0],
                                  inp[0],
                                  farg[1],
                                  outp[0],
                                  (fla_syrk_t*) t->cntl );
      return;
   }

   // FLA_Syr2k
   if ( t->func == (void *) FLA_Syr2k_task )
   {
      ( (syr2k_task_fn) t->func )( (FLA_Uplo)  iarg[0],
                                   (FLA_Trans) iarg[1],
                                   farg[0],
                                   inp[0],
                                   inp[1],
                                   farg[1],
                                   outp[0],
                                   (fla_syr2k_t*) t->cntl );
      return;
   }

   // FLA_Trmm
   if ( t->func == (void *) FLA_Trmm_task )
   {
      ( (trmm_task_fn) t->func )( (FLA_Side)  iarg[0],
                                  (FLA_Uplo)  iarg[1],
                                  (FLA_Trans) iarg[2],
                                  (FLA_Diag)  iarg[3],
                                  farg[0],
                                  inp[0],
                                  outp[0],
                                  (fla_trmm_t*) t->cntl );
      return;
   }

   // FLA_Trsm
   if ( t->func == (void *) FLA_Trsm_task )
   {
      ( (trsm_task_fn) t->func )( (FLA_Side)  iarg[0],
                                  (FLA_Uplo)  iarg[1],
                                  (FLA_Trans) iarg[2],
                                  (FLA_Diag)  iarg[3],
                                  farg[0],
                                  inp[0],
                                  outp[0],
                                  (fla_trsm_t*) t->cntl );
      return;
   }

   // FLA_Gemv
   if ( t->func == (void *) FLA_Gemv_task )
   {
      ( (gemv_task_fn) t->func )( (FLA_Trans) iarg[0],
                                  farg[0],
                                  inp[0],
                                  inp[1],
                                  farg[1],
                                  outp[0],
                                  (fla_gemv_t*) t->cntl );
      return;
   }

   // FLA_Trsv
   if ( t->func == (void *) FLA_Trsv_task )
   {
      ( (trsv_task_fn) t->func )( (FLA_Uplo)  iarg[0],
                                  (FLA_Trans) iarg[1],
                                  (FLA_Diag)  iarg[2],
                                  inp[0],
                                  outp[0],
                                  (fla_trsv_t*) t->cntl );
      return;
   }

   // FLA_Axpy
   if ( t->func == (void *) FLA_Axpy_task )
   {
      ( (axpy_task_fn) t->func )( farg[0],
                                  inp[0],
                                  outp[0],
                                  (fla_axpy_t*) t->cntl );
      return;
   }

   // FLA_Axpyt
   if ( t->func == (void *) FLA_Axpyt_task )
   {
      ( (axpyt_task_fn) t->func )( (FLA_Trans) iarg[0],
                                   farg[0],
                                   inp[0],
                                   outp[0],
                                   (fla_axpyt_t*) t->cntl );
      return;
   }

   // FLA_Copy
   if ( t->func == (void *) FLA_Copy_task )
   {
      ( (copy_task_fn) t->func )( inp[0],
                                  outp[0],
                                  (fla_copy_t*) t->cntl );
      return;
   }

   // FLA_Copyt
   if ( t->func == (void *) FLA_Copyt_task )
   {
      ( (copyt_task_fn) t->func )( (FLA_Trans) iarg[0],
                                   inp[0],
                                   outp[0],
                                   (fla_copyt_t*) t->cntl );
      return;
   }

   // FLA_Copyr
   if ( t->func == (void *) FLA_Copyr_task )
   {
      ( (copyr_task_fn) t->func )( (FLA_Uplo) iarg[0],
                                   inp[0],
                                   outp[0],
                                   (fla_copyr_t*) t->cntl );
      return;
   }

   // FLA_Scal
   if ( t->func == (void *) FLA_Scal_task )
   {
      ( (scal_task_fn) t->func )( farg[0],
                                  outp[0],
                                  (fla_scal_t*) t->cntl );
      return;
   }

   // FLA_Scalr
   if ( t->func == (void *) FLA_Scalr_task )
   {
      ( (scalr_task_fn) t->func )( (FLA_Uplo) iarg[0],
                                   farg[0],
                                   outp[0],
                                   (fla_scalr_t*) t->cntl );
      return;
   }

   // FLA_Obj_create_buffer
   if ( t->func == (void *) FLA_Obj_create_buffer_task )
   {
      ( (obj_create_buffer_task_fn) t->func )( (dim_t) iarg[0],
                                               (dim_t) iarg[1],
                                               outp[0],
                                               t->cntl );
      return;
   }

   // FLA_Obj_free_buffer
   if ( t->func == (void *) FLA_Obj_free_buffer_task )
   {
      ( (obj_free_buffer_task_fn) t->func )( outp[0],
                                             t->cntl );
      return;
   }

   // No known routine matched the task's function pointer.
   FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );

   return;
}
1780 
1781 
FLASH_Queue_verbose_output(void)1782 void FLASH_Queue_verbose_output( void )
1783 /*----------------------------------------------------------------------------
1784 
1785    FLASH_Queue_verbose_output
1786 
1787 ----------------------------------------------------------------------------*/
1788 {
1789    int           i, j, k;
1790    int           n_threads = FLASH_Queue_get_num_threads();
1791    int           n_tasks   = FLASH_Queue_get_num_tasks();
1792    FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
1793    FLASH_Task*   t;
1794    FLASH_Dep*    d;
1795 
1796    // Grab the head of the task queue.
1797    t = FLASH_Queue_get_head_task();
1798 
1799    if ( verbose == FLASH_QUEUE_VERBOSE_READABLE )
1800    {
1801       // Iterate over linked list of tasks.
1802       for ( i = 0; i < n_tasks; i++ )
1803       {
1804          printf( "%d\t%s\t", t->order, t->name );
1805 
1806          for ( j = 0; j < t->n_output_args; j++ )
1807             printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1808                     t->output_arg[j].base->m_index,
1809                     t->output_arg[j].base->n_index );
1810 
1811          printf( ":= " );
1812 
1813          for ( j = 0; j < t->n_output_args; j++ )
1814             printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1815                     t->output_arg[j].base->m_index,
1816                     t->output_arg[j].base->n_index );
1817 
1818          for ( j = 0; j < t->n_input_args; j++ )
1819             printf( "%lu[%lu,%lu] ", t->input_arg[j].base->id,
1820                     t->input_arg[j].base->m_index,
1821                     t->input_arg[j].base->n_index );
1822 
1823          printf( "\n" );
1824 
1825          // Go to the next task.
1826          t = t->next_task;
1827       }
1828 
1829       printf( "\n" );
1830    }
1831    else
1832    {
1833       printf( "digraph SuperMatrix {\n" );
1834 
1835       if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE )
1836       {
1837          // Iterate over linked list of tasks.
1838          for ( i = 0; i < n_tasks; i++ )
1839          {
1840             printf( "%d [label=\"%s\"]; %d -> {", t->order, t->name, t->order);
1841 
1842             d = t->dep_arg_head;
1843             for ( j = 0; j < t->n_dep_args; j++ )
1844             {
1845                printf( "%d;", d->task->order );
1846                d = d->next_dep;
1847             }
1848 
1849             printf( "};\n" );
1850 
1851             // Go to the next task.
1852             t = t->next_task;
1853          }
1854       }
1855       else
1856       {
1857          // Iterate over all the threads.
1858          for ( k = 0; k < n_threads; k++ )
1859          {
1860             printf( "subgraph cluster%d {\nlabel=\"%d\"\n", k, k );
1861 
1862             // Iterate over linked list of tasks.
1863             for ( i = 0; i < n_tasks; i++ )
1864             {
1865                if ( t->queue == k )
1866                   printf( "%d [label=\"%s\"];\n", t->order, t->name );
1867 
1868                // Go to the next task.
1869                t = t->next_task;
1870             }
1871 
1872             printf( "}\n" );
1873 
1874             // Grab the head of the task queue.
1875             t = FLASH_Queue_get_head_task();
1876          }
1877 
1878          // Iterate over linked list of tasks.
1879          for ( i = 0; i < n_tasks; i++ )
1880          {
1881             printf( "%d -> {", t->order );
1882 
1883             d = t->dep_arg_head;
1884             for ( j = 0; j < t->n_dep_args; j++ )
1885             {
1886                printf( "%d;", d->task->order );
1887                d = d->next_dep;
1888             }
1889 
1890             printf( "};\n" );
1891 
1892             // Go to the next task.
1893             t = t->next_task;
1894          }
1895       }
1896 
1897       printf( "}\n\n" );
1898    }
1899 
1900    return;
1901 }
1902 
1903 
1904 #endif // FLA_ENABLE_SUPERMATRIX
1905