1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
13
14 #if FLA_MULTITHREADING_MODEL == FLA_OPENMP
15 #ifdef FLA_ENABLE_TIDSP
16 #include <ti/omp/omp.h>
17 #else
18 #include <omp.h>
19 #endif
20 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
21 #include <pthread.h>
22 #endif
23
24
25 #ifdef FLA_ENABLE_SUPERMATRIX
26
27 FLASH_Queue _tq;
28
29 static FLA_Bool flash_queue_initialized = FALSE;
30
31 static int flash_queue_n_read_blocks = 0;
32 static int flash_queue_n_write_blocks = 0;
33
34 static FLASH_Verbose flash_queue_verbose = FLASH_QUEUE_VERBOSE_NONE;
35 static FLA_Bool flash_queue_sorting = FALSE;
36 static FLA_Bool flash_queue_caching = FALSE;
37 static FLA_Bool flash_queue_work_stealing = FALSE;
38 static FLASH_Data_aff flash_queue_data_affinity = FLASH_QUEUE_AFFINITY_NONE;
39
40 static double flash_queue_total_time = 0.0;
41 static double flash_queue_parallel_time = 0.0;
42
43 static dim_t flash_queue_block_size = 0;
44 static dim_t flash_queue_cache_size = 2 * 1024 * 1024;
45 static dim_t flash_queue_cache_line_size = 64;
46
47 static int flash_queue_cores_per_cache = 1;
48 static int flash_queue_cores_per_queue = 0;
49
50 #endif
51
52
53 static unsigned int flash_queue_stack = 0;
54 static FLA_Bool flash_queue_enabled = TRUE;
55
56 static unsigned int flash_queue_n_threads = 1;
57
58
FLASH_Queue_begin(void)59 void FLASH_Queue_begin( void )
60 /*----------------------------------------------------------------------------
61
62 FLASH_Queue_begin
63
64 ----------------------------------------------------------------------------*/
65 {
66 #ifdef FLA_ENABLE_SUPERMATRIX
67 if ( flash_queue_stack == 0 )
68 {
69 // Save the starting time for the total execution time.
70 flash_queue_total_time = FLA_Clock();
71 }
72 #endif
73
74 // Push onto the stack.
75 flash_queue_stack++;
76
77 return;
78 }
79
80
FLASH_Queue_end(void)81 void FLASH_Queue_end( void )
82 /*----------------------------------------------------------------------------
83
84 FLASH_Queue_end
85
86 ----------------------------------------------------------------------------*/
87 {
88 // Pop off the stack.
89 flash_queue_stack--;
90
91 #ifdef FLA_ENABLE_SUPERMATRIX
92 if ( flash_queue_stack == 0 )
93 {
94 // Execute tasks if encounter the outermost parallel region.
95 FLASH_Queue_exec();
96
97 // Find the total execution time.
98 flash_queue_total_time = FLA_Clock() - flash_queue_total_time;
99 }
100 #endif
101
102 return;
103 }
104
105
FLASH_Queue_stack_depth(void)106 unsigned int FLASH_Queue_stack_depth( void )
107 /*----------------------------------------------------------------------------
108
109 FLASH_Queue_stack_depth
110
111 ----------------------------------------------------------------------------*/
112 {
113 return flash_queue_stack;
114 }
115
116
FLASH_Queue_enable(void)117 FLA_Error FLASH_Queue_enable( void )
118 /*----------------------------------------------------------------------------
119
120 FLASH_Queue_enable
121
122 ----------------------------------------------------------------------------*/
123 {
124 #ifdef FLA_ENABLE_SUPERMATRIX
125 if ( flash_queue_stack == 0 )
126 {
127 // Enable if not begin parallel region yet.
128 flash_queue_enabled = TRUE;
129 return FLA_SUCCESS;
130 }
131 else
132 {
133 // Cannot change status during parallel region.
134 return FLA_FAILURE;
135 }
136 #else
137 // Raise an exception when SuperMatrix is not configured.
138 FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED );
139 return FLA_FAILURE;
140 #endif
141 }
142
143
FLASH_Queue_disable(void)144 FLA_Error FLASH_Queue_disable( void )
145 /*----------------------------------------------------------------------------
146
147 FLASH_Queue_disable
148
149 ----------------------------------------------------------------------------*/
150 {
151 #ifdef FLA_ENABLE_SUPERMATRIX
152 if ( flash_queue_stack == 0 )
153 {
154 // Disable if not begin parallel region yet.
155 flash_queue_enabled = FALSE;
156 return FLA_SUCCESS;
157 }
158 else
159 {
160 // Cannot change status during parallel region.
161 return FLA_FAILURE;
162 }
163 #else
164 // Allow disabling enqueuing even when SuperMatrix is not configured.
165 flash_queue_enabled = FALSE;
166 return FLA_SUCCESS;
167 #endif
168 }
169
170
FLASH_Queue_get_enabled(void)171 FLA_Bool FLASH_Queue_get_enabled( void )
172 /*----------------------------------------------------------------------------
173
174 FLASH_Queue_get_enabled
175
176 ----------------------------------------------------------------------------*/
177 {
178 // Return if enabled, but always false if SuperMatrix is not configured.
179 #ifdef FLA_ENABLE_SUPERMATRIX
180 return flash_queue_enabled;
181 #else
182 return FALSE;
183 #endif
184 }
185
186
FLASH_Queue_set_num_threads(unsigned int n_threads)187 void FLASH_Queue_set_num_threads( unsigned int n_threads )
188 /*----------------------------------------------------------------------------
189
190 FLASH_Queue_set_num_threads
191
192 ----------------------------------------------------------------------------*/
193 {
194 FLA_Error e_val;
195
196 // Verify that the number of threads is positive.
197 e_val = FLA_Check_num_threads( n_threads );
198 FLA_Check_error_code( e_val );
199
200 // Keep track of the number of threads internally.
201 flash_queue_n_threads = n_threads;
202
203 #if FLA_MULTITHREADING_MODEL == FLA_OPENMP
204
205 // No additional action is necessary to set the number of OpenMP threads
206 // since setting the number of threads is handled at the parallel for loop
207 // with a num_threads() clause. This gives the user more flexibility since
208 // he can use the OMP_NUM_THREADS environment variable or the
209 // omp_set_num_threads() function to set the global number of OpenMP threads
210 // independently of the number of SuperMatrix threads.
211
212 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
213
214 // No additional action is necessary to set the number of pthreads
215 // since setting the number of threads is handled entirely on our end.
216
217 #endif
218
219 return;
220 }
221
222
FLASH_Queue_get_num_threads(void)223 unsigned int FLASH_Queue_get_num_threads( void )
224 /*----------------------------------------------------------------------------
225
226 FLASH_Queue_get_num_threads
227
228 ----------------------------------------------------------------------------*/
229 {
230 return flash_queue_n_threads;
231 }
232
233
234 #ifdef FLA_ENABLE_SUPERMATRIX
235
236
FLASH_Queue_init(void)237 void FLASH_Queue_init( void )
238 /*----------------------------------------------------------------------------
239
240 FLASH_Queue_init
241
242 ----------------------------------------------------------------------------*/
243 {
244 // Exit early if we're already initialized.
245 if ( flash_queue_initialized == TRUE )
246 return;
247
248 // Reset all the initial values.
249 FLASH_Queue_reset();
250
251 // Set the initialized flag.
252 flash_queue_initialized = TRUE;
253
254 #ifdef FLA_ENABLE_GPU
255 FLASH_Queue_init_gpu();
256 #endif
257
258 return;
259 }
260
261
FLASH_Queue_finalize(void)262 void FLASH_Queue_finalize( void )
263 /*----------------------------------------------------------------------------
264
265 FLASH_Queue_finalize
266
267 ----------------------------------------------------------------------------*/
268 {
269 // Exit early if we're not already initialized.
270 if ( flash_queue_initialized == FALSE )
271 return;
272
273 // Clear the initialized flag.
274 flash_queue_initialized = FALSE;
275
276 #ifdef FLA_ENABLE_GPU
277 FLASH_Queue_finalize_gpu();
278 #endif
279
280 return;
281 }
282
283
FLASH_Queue_get_num_tasks(void)284 unsigned int FLASH_Queue_get_num_tasks( void )
285 /*----------------------------------------------------------------------------
286
287 FLASH_Queue_get_num_tasks
288
289 ----------------------------------------------------------------------------*/
290 {
291 return _tq.n_tasks;
292 }
293
294
void FLASH_Queue_set_verbose_output( FLASH_Verbose verbose )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_verbose_output

   Stores the requested verbosity setting; it is read back with
   FLASH_Queue_get_verbose_output().

----------------------------------------------------------------------------*/
{
  flash_queue_verbose = verbose;
}
306
307
FLASH_Queue_get_verbose_output(void)308 FLASH_Verbose FLASH_Queue_get_verbose_output( void )
309 /*----------------------------------------------------------------------------
310
311 FLASH_Queue_get_verbose_output
312
313 ----------------------------------------------------------------------------*/
314 {
315 return flash_queue_verbose;
316 }
317
318
void FLASH_Queue_set_sorting( FLA_Bool sorting )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_sorting

   Stores the sorting option; it is read back with
   FLASH_Queue_get_sorting().

----------------------------------------------------------------------------*/
{
  flash_queue_sorting = sorting;
}
330
331
FLASH_Queue_get_sorting(void)332 FLA_Bool FLASH_Queue_get_sorting( void )
333 /*----------------------------------------------------------------------------
334
335 FLASH_Queue_get_sorting
336
337 ----------------------------------------------------------------------------*/
338 {
339 return flash_queue_sorting;
340 }
341
342
void FLASH_Queue_set_caching( FLA_Bool caching )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_caching

   Stores the caching option; it is read back with
   FLASH_Queue_get_caching().

----------------------------------------------------------------------------*/
{
  flash_queue_caching = caching;
}
354
355
FLASH_Queue_get_caching(void)356 FLA_Bool FLASH_Queue_get_caching( void )
357 /*----------------------------------------------------------------------------
358
359 FLASH_Queue_get_caching
360
361 ----------------------------------------------------------------------------*/
362 {
363 return flash_queue_caching;
364 }
365
366
void FLASH_Queue_set_work_stealing( FLA_Bool work_stealing )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_work_stealing

   Stores the work-stealing option; it is read back with
   FLASH_Queue_get_work_stealing().

----------------------------------------------------------------------------*/
{
  flash_queue_work_stealing = work_stealing;
}
378
379
FLASH_Queue_get_work_stealing(void)380 FLA_Bool FLASH_Queue_get_work_stealing( void )
381 /*----------------------------------------------------------------------------
382
383 FLASH_Queue_get_work_stealing
384
385 ----------------------------------------------------------------------------*/
386 {
387 return flash_queue_work_stealing;
388 }
389
390
void FLASH_Queue_set_data_affinity( FLASH_Data_aff data_affinity )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_data_affinity

   Stores the data-affinity setting; it is read back with
   FLASH_Queue_get_data_affinity().

----------------------------------------------------------------------------*/
{
  flash_queue_data_affinity = data_affinity;
}
402
403
FLASH_Queue_get_data_affinity(void)404 FLASH_Data_aff FLASH_Queue_get_data_affinity( void )
405 /*----------------------------------------------------------------------------
406
407 FLASH_Queue_get_data_affinity
408
409 ----------------------------------------------------------------------------*/
410 {
411 return flash_queue_data_affinity;
412 }
413
414
FLASH_Queue_get_total_time(void)415 double FLASH_Queue_get_total_time( void )
416 /*----------------------------------------------------------------------------
417
418 FLASH_Queue_get_total_time
419
420 ----------------------------------------------------------------------------*/
421 {
422 // Only return time if out of parallel region.
423 if ( flash_queue_stack == 0 )
424 return flash_queue_total_time;
425
426 return 0.0;
427 }
428
429
FLASH_Queue_get_parallel_time(void)430 double FLASH_Queue_get_parallel_time( void )
431 /*----------------------------------------------------------------------------
432
433 FLASH_Queue_get_parallel_time
434
435 ----------------------------------------------------------------------------*/
436 {
437 // Only return time if out of parallel region.
438 if ( flash_queue_stack == 0 )
439 return flash_queue_parallel_time;
440
441 return 0.0;
442 }
443
444
445 // --- helper functions --- ===================================================
446
447
FLASH_Queue_set_parallel_time(double dtime)448 void FLASH_Queue_set_parallel_time( double dtime )
449 /*----------------------------------------------------------------------------
450
451 FLASH_Queue_set_parallel_time
452
453 ----------------------------------------------------------------------------*/
454 {
455 flash_queue_parallel_time = dtime;
456
457 return;
458 }
459
460
void FLASH_Queue_set_block_size( dim_t size )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_block_size

   Raises the stored block size to size. The stored value only grows: a
   size that is not larger than the current value is ignored.

----------------------------------------------------------------------------*/
{
  // Keep the largest block size seen so far.
  if ( size > flash_queue_block_size )
  {
    flash_queue_block_size = size;
  }
}
474
475
FLASH_Queue_get_block_size(void)476 dim_t FLASH_Queue_get_block_size( void )
477 /*----------------------------------------------------------------------------
478
479 FLASH_Queue_get_block_size
480
481 ----------------------------------------------------------------------------*/
482 {
483 return flash_queue_block_size;
484 }
485
486
void FLASH_Queue_set_cache_size( dim_t size )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_cache_size

   Stores size as the cache size setting; it is read back with
   FLASH_Queue_get_cache_size().

----------------------------------------------------------------------------*/
{
  flash_queue_cache_size = size;
}
498
499
FLASH_Queue_get_cache_size(void)500 dim_t FLASH_Queue_get_cache_size( void )
501 /*----------------------------------------------------------------------------
502
503 FLASH_Queue_get_cache_size
504
505 ----------------------------------------------------------------------------*/
506 {
507 return flash_queue_cache_size;
508 }
509
510
void FLASH_Queue_set_cache_line_size( dim_t size )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_cache_line_size

   Stores size as the cache line size setting; it is read back with
   FLASH_Queue_get_cache_line_size().

----------------------------------------------------------------------------*/
{
  flash_queue_cache_line_size = size;
}
522
523
FLASH_Queue_get_cache_line_size(void)524 dim_t FLASH_Queue_get_cache_line_size( void )
525 /*----------------------------------------------------------------------------
526
527 FLASH_Queue_get_cache_line_size
528
529 ----------------------------------------------------------------------------*/
530 {
531 return flash_queue_cache_line_size;
532 }
533
534
FLASH_Queue_set_cores_per_cache(int cores)535 void FLASH_Queue_set_cores_per_cache( int cores )
536 /*----------------------------------------------------------------------------
537
538 FLASH_Queue_set_cores_per_cache
539
540 ----------------------------------------------------------------------------*/
541 {
542 flash_queue_cores_per_cache = cores;
543
544 return;
545 }
546
547
FLASH_Queue_get_cores_per_cache(void)548 int FLASH_Queue_get_cores_per_cache( void )
549 /*----------------------------------------------------------------------------
550
551 FLASH_Queue_get_cores_per_cache
552
553 ----------------------------------------------------------------------------*/
554 {
555 return flash_queue_cores_per_cache;
556 }
557
558
FLASH_Queue_set_cores_per_queue(int cores)559 void FLASH_Queue_set_cores_per_queue( int cores )
560 /*----------------------------------------------------------------------------
561
562 FLASH_Queue_set_cores_per_queue
563
564 ----------------------------------------------------------------------------*/
565 {
566 flash_queue_cores_per_queue = cores;
567
568 return;
569 }
570
571
FLASH_Queue_get_cores_per_queue(void)572 int FLASH_Queue_get_cores_per_queue( void )
573 /*----------------------------------------------------------------------------
574
575 FLASH_Queue_get_cores_per_queue
576
577 ----------------------------------------------------------------------------*/
578 {
579 return flash_queue_cores_per_queue;
580 }
581
582
FLASH_Queue_reset(void)583 void FLASH_Queue_reset( void )
584 /*----------------------------------------------------------------------------
585
586 FLASH_Queue_reset
587
588 ----------------------------------------------------------------------------*/
589 {
590 // Clear the other fields of the FLASH_Queue structure.
591 _tq.n_tasks = 0;
592 _tq.head = NULL;
593 _tq.tail = NULL;
594
595 // Reset the number of blocks.
596 flash_queue_n_read_blocks = 0;
597 flash_queue_n_write_blocks = 0;
598
599 return;
600 }
601
602
FLASH_Queue_get_head_task(void)603 FLASH_Task* FLASH_Queue_get_head_task( void )
604 /*----------------------------------------------------------------------------
605
606 FLASH_Queue_get_head_task
607
608 ----------------------------------------------------------------------------*/
609 {
610 return _tq.head;
611 }
612
613
FLASH_Queue_get_tail_task(void)614 FLASH_Task* FLASH_Queue_get_tail_task( void )
615 /*----------------------------------------------------------------------------
616
617 FLASH_Queue_get_tail_task
618
619 ----------------------------------------------------------------------------*/
620 {
621 return _tq.tail;
622 }
623
624
void FLASH_Queue_push( void* func,
                       void* cntl,
                       char* name,
                       FLA_Bool enabled_gpu,
                       int n_int_args,
                       int n_fla_args,
                       int n_input_args,
                       int n_output_args,
                       ... )
/*----------------------------------------------------------------------------

   FLASH_Queue_push

   Creates a task for func and appends it to the tail of the queue.

   The variable arguments must be supplied in this order: n_int_args
   values of type int, then n_fla_args, n_input_args and n_output_args
   values of type FLA_Obj. Input and output operands are run through
   dependence analysis (FLASH_Queue_push_input / FLASH_Queue_push_output).
   An operand whose element type is FLA_MATRIX is treated as a macroblock
   and every block it contains is analysed individually.

----------------------------------------------------------------------------*/
{
  int         arg;
  va_list     args;
  FLASH_Task* task;
  FLA_Obj     operand;

  // Build the task and its argument arrays.
  task = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
                           n_int_args, n_fla_args,
                           n_input_args, n_output_args );

  // va_start() takes the last named parameter, i.e. the one immediately
  // preceding the "..." in the signature.
  va_start( args, n_output_args );

  // Integer arguments.
  for ( arg = 0; arg < n_int_args; arg++ )
  {
    task->int_arg[arg] = va_arg( args, int );
  }

  // FLA_Obj arguments that take no part in dependence analysis.
  for ( arg = 0; arg < n_fla_args; arg++ )
  {
    task->fla_arg[arg] = va_arg( args, FLA_Obj );
  }

  // Input operands.
  for ( arg = 0; arg < n_input_args; arg++ )
  {
    operand = va_arg( args, FLA_Obj );
    task->input_arg[arg] = operand;

    if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
    {
      // Regular block: a single dependence check.
      FLASH_Queue_push_input( operand, task );
    }
    else
    {
      // Macroblock: visit every contained block, column by column.
      dim_t    row, col;
      dim_t    n_rows     = FLA_Obj_length( operand );
      dim_t    n_cols     = FLA_Obj_width( operand );
      dim_t    col_stride = FLA_Obj_col_stride( operand );
      FLA_Obj* blocks     = FLASH_OBJ_PTR_AT( operand );

      for ( col = 0; col < n_cols; col++ )
      {
        for ( row = 0; row < n_rows; row++ )
        {
          FLASH_Queue_push_input( blocks[ col * col_stride + row ], task );
        }
      }

      // Count the contained blocks minus one, because the macroblock
      // itself is already counted once as an input argument.
      task->n_macro_args += n_rows * n_cols - 1;
    }
  }

  // Output operands.
  for ( arg = 0; arg < n_output_args; arg++ )
  {
    operand = va_arg( args, FLA_Obj );
    task->output_arg[arg] = operand;

    // The first output block alone decides the task's queue assignment.
    if ( arg == 0 )
    {
      FLA_Obj first = operand;

      // For a macroblock, look at its top-left block.
      if ( FLA_Obj_elemtype( operand ) == FLA_MATRIX )
        first = *FLASH_OBJ_PTR_AT( operand );

      // Reuse the queue of the task that last wrote this block; if there
      // is none, use the current write-block counter.
      if ( first.base->write_task != NULL )
        task->queue = first.base->write_task->queue;
      else
        task->queue = flash_queue_n_write_blocks;
    }

    if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
    {
      // Regular block: a single dependence check.
      FLASH_Queue_push_output( operand, task );
    }
    else
    {
      // Macroblock: visit every contained block, column by column.
      dim_t    row, col;
      dim_t    n_rows     = FLA_Obj_length( operand );
      dim_t    n_cols     = FLA_Obj_width( operand );
      dim_t    col_stride = FLA_Obj_col_stride( operand );
      FLA_Obj* blocks     = FLASH_OBJ_PTR_AT( operand );

      for ( col = 0; col < n_cols; col++ )
      {
        for ( row = 0; row < n_rows; row++ )
        {
          FLASH_Queue_push_output( blocks[ col * col_stride + row ], task );
        }
      }

      // Count the contained blocks minus one, because the macroblock
      // itself is already counted once as an output argument.
      task->n_macro_args += n_rows * n_cols - 1;
    }
  }

  // Done with the variable arguments.
  va_end( args );

  // Link the task in at the tail; it also becomes the head when the queue
  // is empty.
  if ( _tq.n_tasks != 0 )
  {
    task->prev_task     = _tq.tail;
    _tq.tail->next_task = task;
    _tq.tail            = task;

    // The task's position in the queue follows its predecessor's.
    task->order = task->prev_task->order + 1;
  }
  else
  {
    _tq.head = task;
    _tq.tail = task;
  }

  _tq.n_tasks++;
}
765
766
void FLASH_Queue_push_input( FLA_Obj obj,
                             FLASH_Task* t )
/*----------------------------------------------------------------------------

   FLASH_Queue_push_input

   Dependence analysis for one input block obj of task t.

   If a task is recorded as the last writer of the block, t is appended
   to that writer's dependent list (flow dependence). Otherwise
   t->n_ready is decremented and, if the block has no recorded readers
   either, the block is given the next read-block id.

   In both cases t is then appended to the block's list of readers,
   unless t is already the most recently added reader.

----------------------------------------------------------------------------*/
{
  FLASH_Dep* link;

  if ( obj.base->write_task != NULL )
  {
    // Flow dependence: t must wait for the last writer of this block.
    FLASH_Task* writer = obj.base->write_task;

    link = (FLASH_Dep *) FLA_malloc( sizeof( *link ) );
    link->task     = t;
    link->next_dep = NULL;

    // Append to the writer's dependent list.
    if ( writer->n_dep_args != 0 )
      writer->dep_arg_tail->next_dep = link;
    else
      writer->dep_arg_head = link;

    writer->dep_arg_tail = link;
    writer->n_dep_args++;
  }
  else
  {
    // No pending writer for this operand.
    t->n_ready--;

    // Block neither written nor read before: give it an id that
    // identifies the read block for freeing.
    if ( obj.base->n_read_tasks == 0 )
    {
      obj.base->n_read_blocks = flash_queue_n_read_blocks;
      flash_queue_n_read_blocks++;
    }
  }

  // Nothing more to do if t is already the latest recorded reader.
  if ( obj.base->n_read_tasks != 0 &&
       obj.base->read_task_tail->task == t )
    return;

  // Record t as a reader so that a later writer can add the
  // anti-dependence.
  link = (FLASH_Dep *) FLA_malloc( sizeof( *link ) );
  link->task     = t;
  link->next_dep = NULL;

  if ( obj.base->n_read_tasks != 0 )
    obj.base->read_task_tail->next_dep = link;
  else
    obj.base->read_task_head = link;

  obj.base->read_task_tail = link;
  obj.base->n_read_tasks++;
}
840
841
void FLASH_Queue_push_output( FLA_Obj obj,
                              FLASH_Task* t )
/*----------------------------------------------------------------------------

   FLASH_Queue_push_output

   Dependence analysis for one output block obj of task t.

   - No recorded writer: t->n_ready is decremented, the block is given
     the next write-block id and, if it has no readers either, the next
     read-block id.
   - Recorded writer is another task: t is appended to that task's
     dependent list.
   - Recorded writer is t itself: only t->n_ready is decremented.

   Every recorded reader other than t then receives t as a dependent
   (anti-dependence), the reader list is emptied, and t becomes the
   block's recorded writer.

----------------------------------------------------------------------------*/
{
  int         r;
  FLASH_Task* writer = obj.base->write_task;
  FLASH_Task* reader;
  FLASH_Dep*  link;
  FLASH_Dep*  following;

  if ( writer == NULL )
  {
    t->n_ready--;

    // Save the index at which this output block is first encountered,
    // then advance the counter of written blocks.
    obj.base->n_write_blocks = flash_queue_n_write_blocks;
    flash_queue_n_write_blocks++;

    // Not read before either: give the block an id that identifies it
    // for freeing.
    if ( obj.base->n_read_tasks == 0 )
    {
      obj.base->n_read_blocks = flash_queue_n_read_blocks;
      flash_queue_n_read_blocks++;
    }
  }
  else if ( writer == t )
  {
    // This output block was already seen for t; no need to notify the
    // task twice.
    t->n_ready--;
  }
  else
  {
    // Another task wrote the block last: t depends on it.
    link = (FLASH_Dep *) FLA_malloc( sizeof( *link ) );
    link->task     = t;
    link->next_dep = NULL;

    if ( writer->n_dep_args != 0 )
      writer->dep_arg_tail->next_dep = link;
    else
      writer->dep_arg_head = link;

    writer->dep_arg_tail = link;
    writer->n_dep_args++;
  }

  // Walk the recorded readers, turning each list node into an
  // anti-dependence edge from that reader to t.
  for ( r = 0, link = obj.base->read_task_head;
        r < obj.base->n_read_tasks;
        r++, link = following )
  {
    reader    = link->task;
    following = link->next_dep;

    if ( reader == t )
    {
      // t reading its own output operand creates no dependence; the
      // node is simply released.
      FLA_free( link );
      continue;
    }

    // Reuse the node itself as the entry in the reader's dependent list.
    link->task     = t;
    link->next_dep = NULL;

    if ( reader->n_dep_args != 0 )
      reader->dep_arg_tail->next_dep = link;
    else
      reader->dep_arg_head = link;

    reader->dep_arg_tail = link;
    reader->n_dep_args++;

    t->n_war_args++;
  }

  // The block starts with an empty reader list for the next set of reads.
  obj.base->n_read_tasks   = 0;
  obj.base->read_task_head = NULL;
  obj.base->read_task_tail = NULL;

  // t is now the last task to write this block.
  obj.base->write_task = t;
}
954
955
FLASH_Task_alloc(void * func,void * cntl,char * name,FLA_Bool enabled_gpu,int n_int_args,int n_fla_args,int n_input_args,int n_output_args)956 FLASH_Task* FLASH_Task_alloc( void *func,
957 void *cntl,
958 char *name,
959 FLA_Bool enabled_gpu,
960 int n_int_args,
961 int n_fla_args,
962 int n_input_args,
963 int n_output_args )
964 /*----------------------------------------------------------------------------
965
966 FLASH_Task_alloc
967
968 ----------------------------------------------------------------------------*/
969 {
970 FLASH_Task* t;
971
972 // Allocate space for the task structure t.
973 t = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );
974
975 // Allocate space for the task's integer arguments.
976 t->int_arg = (int *) FLA_malloc( n_int_args * sizeof(int) );
977
978 // Allocate space for the task's FLA_Obj arguments.
979 t->fla_arg = (FLA_Obj *) FLA_malloc( n_fla_args * sizeof(FLA_Obj) );
980
981 // Allocate space for the task's input FLA_Obj arguments.
982 t->input_arg = (FLA_Obj *) FLA_malloc( n_input_args * sizeof(FLA_Obj) );
983
984 // Allocate space for the task's output FLA_Obj arguments.
985 t->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );
986
987 // Initialize other fields of the structure.
988 t->n_ready = 0;
989 t->order = 0;
990 t->queue = 0;
991 t->height = 0;
992 t->thread = 0;
993 t->cache = 0;
994 t->hit = FALSE;
995
996 t->func = func;
997 t->cntl = cntl;
998 t->name = name;
999 t->enabled_gpu = enabled_gpu;
1000 t->n_int_args = n_int_args;
1001 t->n_fla_args = n_fla_args;
1002 t->n_input_args = n_input_args;
1003 t->n_output_args = n_output_args;
1004
1005 t->n_macro_args = 0;
1006 t->n_war_args = 0;
1007 t->n_dep_args = 0;
1008 t->dep_arg_head = NULL;
1009 t->dep_arg_tail = NULL;
1010 t->prev_task = NULL;
1011 t->next_task = NULL;
1012 t->prev_wait = NULL;
1013 t->next_wait = NULL;
1014
1015 // Return a pointer to the initialized structure.
1016 return t;
1017 }
1018
1019
FLASH_Task_free(FLASH_Task * t)1020 void FLASH_Task_free( FLASH_Task *t )
1021 /*----------------------------------------------------------------------------
1022
1023 FLASH_Task_free
1024
1025 ----------------------------------------------------------------------------*/
1026 {
1027 int i, j, k;
1028 FLA_Obj obj;
1029 FLASH_Dep* d;
1030 FLASH_Dep* next_dep;
1031
1032 // Clearing the last write task in each output block.
1033 for ( i = 0; i < t->n_output_args; i++ )
1034 {
1035 obj = t->output_arg[i];
1036
1037 // Macroblock is used.
1038 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1039 {
1040 dim_t jj, kk;
1041 dim_t m = FLA_Obj_length( obj );
1042 dim_t n = FLA_Obj_width( obj );
1043 dim_t cs = FLA_Obj_col_stride( obj );
1044 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1045
1046 // Clear each block in macroblock.
1047 for ( jj = 0; jj < n; jj++ )
1048 for ( kk = 0; kk < m; kk++ )
1049 ( buf + jj * cs + kk )->base->write_task = NULL;
1050 }
1051 else // Clear regular block.
1052 {
1053 obj.base->write_task = NULL;
1054 }
1055 }
1056
1057 // Cleaning the last read tasks in each input block.
1058 for ( i = 0; i < t->n_input_args; i++ )
1059 {
1060 obj = t->input_arg[i];
1061
1062 // Macroblock is used.
1063 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
1064 {
1065 dim_t jj, kk;
1066 dim_t m = FLA_Obj_length( obj );
1067 dim_t n = FLA_Obj_width( obj );
1068 dim_t cs = FLA_Obj_col_stride( obj );
1069 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
1070
1071 // Clear each block in macroblock.
1072 for ( jj = 0; jj < n; jj++ )
1073 {
1074 for ( kk = 0; kk < m; kk++ )
1075 {
1076 obj = *( buf + jj * cs + kk );
1077
1078 k = obj.base->n_read_tasks;
1079 d = obj.base->read_task_head;
1080
1081 obj.base->n_read_tasks = 0;
1082 obj.base->read_task_head = NULL;
1083 obj.base->read_task_tail = NULL;
1084
1085 for ( j = 0; j < k; j++ )
1086 {
1087 next_dep = d->next_dep;
1088 FLA_free( d );
1089 d = next_dep;
1090 }
1091 }
1092 }
1093 }
1094 else // Regular block.
1095 {
1096 k = obj.base->n_read_tasks;
1097 d = obj.base->read_task_head;
1098
1099 obj.base->n_read_tasks = 0;
1100 obj.base->read_task_head = NULL;
1101 obj.base->read_task_tail = NULL;
1102
1103 for ( j = 0; j < k; j++ )
1104 {
1105 next_dep = d->next_dep;
1106 FLA_free( d );
1107 d = next_dep;
1108 }
1109 }
1110 }
1111
1112 // Free the dep_arg field of t.
1113 d = t->dep_arg_head;
1114
1115 for ( i = 0; i < t->n_dep_args; i++ )
1116 {
1117 next_dep = d->next_dep;
1118 FLA_free( d );
1119 d = next_dep;
1120 }
1121
1122 // Free the int_arg field of t.
1123 FLA_free( t->int_arg );
1124
1125 // Free the fla_arg field of t.
1126 FLA_free( t->fla_arg );
1127
1128 // Free the input_arg field of t.
1129 FLA_free( t->input_arg );
1130
1131 // Free the output_arg field of t.
1132 FLA_free( t->output_arg );
1133
1134 // Finally, free the struct itself.
1135 FLA_free( t );
1136
1137 return;
1138 }
1139
1140
void FLASH_Queue_exec_task( FLASH_Task* t )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_task

   Runs a single task. The function pointer stored in t->func is compared
   against every task function known to this routine; on a match it is cast
   to the matching prototype and invoked with operands unpacked from the
   task's int_arg, fla_arg, input_arg and output_arg arrays, plus t->cntl.
   The FLA_Error returned by the task function is discarded.

   A NULL task is ignored. A task whose function matches none of the known
   entries is reported via FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ).

----------------------------------------------------------------------------*/
{
  // Prototypes of the task functions, used to cast t->func before the call.

  // LAPACK-level
  typedef FLA_Error ( *flash_lu_piv_macro_p )( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
  typedef FLA_Error ( *flash_apply_pivots_macro_p )( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
  typedef FLA_Error ( *flash_lu_piv_p )( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
  typedef FLA_Error ( *flash_lu_piv_copy_p )( FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl );
  typedef FLA_Error ( *flash_trsm_piv_p )( FLA_Obj A, FLA_Obj C, FLA_Obj p, fla_trsm_t* cntl );
  typedef FLA_Error ( *flash_sa_lu_p )( FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, int nb_alg, fla_lu_t* cntl );
  typedef FLA_Error ( *flash_sa_fs_p )( FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, int nb_alg, fla_gemm_t* cntl );
  typedef FLA_Error ( *flash_lu_nopiv_p )( FLA_Obj A, fla_lu_t* cntl );
  typedef FLA_Error ( *flash_trinv_p )( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl );
  typedef FLA_Error ( *flash_ttmm_p )( FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl );
  typedef FLA_Error ( *flash_chol_p )( FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl );
  typedef FLA_Error ( *flash_sylv_p )( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
  typedef FLA_Error ( *flash_lyap_p )( FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
  typedef FLA_Error ( *flash_qrut_macro_p )( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
  typedef FLA_Error ( *flash_qrut_p )( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
  typedef FLA_Error ( *flash_qrutc_p )( FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl );
  typedef FLA_Error ( *flash_qr2ut_p )( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl );
  typedef FLA_Error ( *flash_lqut_macro_p )( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );
  typedef FLA_Error ( *flash_caqr2ut_p )( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl );
  typedef FLA_Error ( *flash_uddateut_p )( FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl );
  typedef FLA_Error ( *flash_apqut_p )( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
  typedef FLA_Error ( *flash_apq2ut_p )( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl );
  typedef FLA_Error ( *flash_apcaq2ut_p )( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl );
  typedef FLA_Error ( *flash_apqudut_p )( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );
  typedef FLA_Error ( *flash_eig_gest_p )( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

  // Level-3 BLAS
  typedef FLA_Error ( *flash_gemm_p )( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
  typedef FLA_Error ( *flash_hemm_p )( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
  typedef FLA_Error ( *flash_herk_p )( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
  typedef FLA_Error ( *flash_her2k_p )( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
  typedef FLA_Error ( *flash_symm_p )( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
  typedef FLA_Error ( *flash_syrk_p )( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
  typedef FLA_Error ( *flash_syr2k_p )( FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
  typedef FLA_Error ( *flash_trmm_p )( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trmm_t* cntl );
  typedef FLA_Error ( *flash_trsm_p )( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trsm_t* cntl );

  // Level-2 BLAS
  typedef FLA_Error ( *flash_gemv_p )( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
  typedef FLA_Error ( *flash_trsv_p )( FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

  // Level-1 BLAS
  typedef FLA_Error ( *flash_axpy_p )( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );
  typedef FLA_Error ( *flash_axpyt_p )( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
  typedef FLA_Error ( *flash_copy_p )( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );
  typedef FLA_Error ( *flash_copyt_p )( FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
  typedef FLA_Error ( *flash_copyr_p )( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
  typedef FLA_Error ( *flash_scal_p )( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );
  typedef FLA_Error ( *flash_scalr_p )( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

  // Base
  typedef FLA_Error ( *flash_obj_create_buffer_p )( dim_t rs, dim_t cs, FLA_Obj A, void* cntl );
  typedef FLA_Error ( *flash_obj_free_buffer_p )( FLA_Obj A, void* cntl );

  // Nothing to run for a NULL task.
  if ( t == NULL )
    return;

  // Identify the task by its function pointer and call it through the
  // matching prototype. Operand positions differ per task, so each branch
  // spells out its own argument list.

  // ---- LAPACK-level tasks ------------------------------------------------

  if ( t->func == (void *) FLA_LU_piv_macro_task )
  {
    ( ( flash_lu_piv_macro_p ) t->func )(
        t->output_arg[0], t->output_arg[1],
        ( fla_lu_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Apply_pivots_macro_task )
  {
    ( ( flash_apply_pivots_macro_p ) t->func )(
        ( FLA_Side ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        t->input_arg[0], t->output_arg[0],
        ( fla_appiv_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_LU_piv_task )
  {
    ( ( flash_lu_piv_p ) t->func )(
        t->output_arg[0], t->fla_arg[0],
        ( fla_lu_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_LU_piv_copy_task )
  {
    ( ( flash_lu_piv_copy_p ) t->func )(
        t->output_arg[0], t->fla_arg[0], t->output_arg[1],
        ( fla_lu_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Trsm_piv_task )
  {
    ( ( flash_trsm_piv_p ) t->func )(
        t->input_arg[0], t->output_arg[0], t->fla_arg[0],
        ( fla_trsm_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_SA_LU_task )
  {
    ( ( flash_sa_lu_p ) t->func )(
        t->output_arg[1], t->output_arg[0],
        t->fla_arg[0], t->fla_arg[1],
        t->int_arg[0],
        ( fla_lu_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_SA_FS_task )
  {
    ( ( flash_sa_fs_p ) t->func )(
        t->fla_arg[0], t->input_arg[0], t->fla_arg[1],
        t->output_arg[1], t->output_arg[0],
        t->int_arg[0],
        ( fla_gemm_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_LU_nopiv_task )
  {
    ( ( flash_lu_nopiv_p ) t->func )(
        t->output_arg[0],
        ( fla_lu_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Trinv_task )
  {
    ( ( flash_trinv_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0], ( FLA_Diag ) t->int_arg[1],
        t->output_arg[0],
        ( fla_trinv_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Ttmm_task )
  {
    ( ( flash_ttmm_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0],
        t->output_arg[0],
        ( fla_ttmm_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Chol_task )
  {
    ( ( flash_chol_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0],
        t->output_arg[0],
        ( fla_chol_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Sylv_task )
  {
    ( ( flash_sylv_p ) t->func )(
        ( FLA_Trans ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0], t->input_arg[1],
        t->output_arg[0],
        t->fla_arg[1],
        ( fla_sylv_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Lyap_task )
  {
    ( ( flash_lyap_p ) t->func )(
        ( FLA_Trans ) t->int_arg[0],
        t->fla_arg[0],
        t->input_arg[0],
        t->output_arg[0],
        t->fla_arg[1],
        ( fla_lyap_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_QR_UT_macro_task )
  {
    ( ( flash_qrut_macro_p ) t->func )(
        t->output_arg[0], t->output_arg[1],
        ( fla_qrut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_QR_UT_task )
  {
    ( ( flash_qrut_p ) t->func )(
        t->output_arg[0], t->fla_arg[0],
        ( fla_qrut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_QR_UT_copy_task )
  {
    ( ( flash_qrutc_p ) t->func )(
        t->output_arg[0], t->fla_arg[0], t->output_arg[1],
        ( fla_qrut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_QR2_UT_task )
  {
    ( ( flash_qr2ut_p ) t->func )(
        t->output_arg[1], t->output_arg[0], t->fla_arg[0],
        ( fla_qr2ut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_LQ_UT_macro_task )
  {
    ( ( flash_lqut_macro_p ) t->func )(
        t->output_arg[0], t->output_arg[1],
        ( fla_lqut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_CAQR2_UT_task )
  {
    ( ( flash_caqr2ut_p ) t->func )(
        t->output_arg[1], t->output_arg[0], t->fla_arg[0],
        ( fla_caqr2ut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_UDdate_UT_task )
  {
    ( ( flash_uddateut_p ) t->func )(
        t->output_arg[0], t->output_arg[1],
        t->output_arg[2], t->output_arg[3],
        ( fla_uddateut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Apply_Q_UT_task )
  {
    ( ( flash_apqut_p ) t->func )(
        ( FLA_Side )   t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3],
        t->input_arg[0],
        t->fla_arg[0],
        t->output_arg[1], t->output_arg[0],
        ( fla_apqut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Apply_Q2_UT_task )
  {
    ( ( flash_apq2ut_p ) t->func )(
        ( FLA_Side )   t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3],
        t->input_arg[0],
        t->fla_arg[0],
        t->output_arg[2], t->output_arg[1], t->output_arg[0],
        ( fla_apq2ut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Apply_CAQ2_UT_task )
  {
    ( ( flash_apcaq2ut_p ) t->func )(
        ( FLA_Side )   t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3],
        t->input_arg[0],
        t->fla_arg[0],
        t->output_arg[2], t->output_arg[1], t->output_arg[0],
        ( fla_apcaq2ut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Apply_QUD_UT_task )
  {
    ( ( flash_apqudut_p ) t->func )(
        ( FLA_Side )   t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3],
        t->input_arg[0],
        t->output_arg[0], t->output_arg[1],
        t->input_arg[1],
        t->output_arg[2],
        t->input_arg[2],
        t->output_arg[3],
        ( fla_apqudut_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Eig_gest_task )
  {
    ( ( flash_eig_gest_p ) t->func )(
        ( FLA_Inv ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
        t->output_arg[1], t->output_arg[0],
        t->input_arg[0],
        ( fla_eig_gest_t* ) t->cntl );
  }

  // ---- Level-3 BLAS tasks ------------------------------------------------

  else if ( t->func == (void *) FLA_Gemm_task )
  {
    ( ( flash_gemm_p ) t->func )(
        ( FLA_Trans ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0], t->input_arg[1],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_gemm_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Hemm_task )
  {
    ( ( flash_hemm_p ) t->func )(
        ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0], t->input_arg[1],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_hemm_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Herk_task )
  {
    ( ( flash_herk_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_herk_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Her2k_task )
  {
    ( ( flash_her2k_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0], t->input_arg[1],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_her2k_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Symm_task )
  {
    ( ( flash_symm_p ) t->func )(
        ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0], t->input_arg[1],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_symm_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Syrk_task )
  {
    ( ( flash_syrk_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_syrk_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Syr2k_task )
  {
    ( ( flash_syr2k_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        t->fla_arg[0],
        t->input_arg[0], t->input_arg[1],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_syr2k_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Trmm_task )
  {
    ( ( flash_trmm_p ) t->func )(
        ( FLA_Side )  t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
        ( FLA_Trans ) t->int_arg[2], ( FLA_Diag ) t->int_arg[3],
        t->fla_arg[0],
        t->input_arg[0],
        t->output_arg[0],
        ( fla_trmm_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Trsm_task )
  {
    ( ( flash_trsm_p ) t->func )(
        ( FLA_Side )  t->int_arg[0], ( FLA_Uplo ) t->int_arg[1],
        ( FLA_Trans ) t->int_arg[2], ( FLA_Diag ) t->int_arg[3],
        t->fla_arg[0],
        t->input_arg[0],
        t->output_arg[0],
        ( fla_trsm_t* ) t->cntl );
  }

  // ---- Level-2 BLAS tasks ------------------------------------------------

  else if ( t->func == (void *) FLA_Gemv_task )
  {
    ( ( flash_gemv_p ) t->func )(
        ( FLA_Trans ) t->int_arg[0],
        t->fla_arg[0],
        t->input_arg[0], t->input_arg[1],
        t->fla_arg[1],
        t->output_arg[0],
        ( fla_gemv_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Trsv_task )
  {
    ( ( flash_trsv_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1],
        ( FLA_Diag ) t->int_arg[2],
        t->input_arg[0],
        t->output_arg[0],
        ( fla_trsv_t* ) t->cntl );
  }

  // ---- Level-1 BLAS tasks ------------------------------------------------

  else if ( t->func == (void *) FLA_Axpy_task )
  {
    ( ( flash_axpy_p ) t->func )(
        t->fla_arg[0], t->input_arg[0], t->output_arg[0],
        ( fla_axpy_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Axpyt_task )
  {
    ( ( flash_axpyt_p ) t->func )(
        ( FLA_Trans ) t->int_arg[0],
        t->fla_arg[0], t->input_arg[0], t->output_arg[0],
        ( fla_axpyt_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Copy_task )
  {
    ( ( flash_copy_p ) t->func )(
        t->input_arg[0], t->output_arg[0],
        ( fla_copy_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Copyt_task )
  {
    ( ( flash_copyt_p ) t->func )(
        ( FLA_Trans ) t->int_arg[0],
        t->input_arg[0], t->output_arg[0],
        ( fla_copyt_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Copyr_task )
  {
    ( ( flash_copyr_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0],
        t->input_arg[0], t->output_arg[0],
        ( fla_copyr_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Scal_task )
  {
    ( ( flash_scal_p ) t->func )(
        t->fla_arg[0], t->output_arg[0],
        ( fla_scal_t* ) t->cntl );
  }
  else if ( t->func == (void *) FLA_Scalr_task )
  {
    ( ( flash_scalr_p ) t->func )(
        ( FLA_Uplo ) t->int_arg[0],
        t->fla_arg[0], t->output_arg[0],
        ( fla_scalr_t* ) t->cntl );
  }

  // ---- Base tasks (cntl is forwarded without a cast) ----------------------

  else if ( t->func == (void *) FLA_Obj_create_buffer_task )
  {
    ( ( flash_obj_create_buffer_p ) t->func )(
        ( dim_t ) t->int_arg[0], ( dim_t ) t->int_arg[1],
        t->output_arg[0],
        t->cntl );
  }
  else if ( t->func == (void *) FLA_Obj_free_buffer_task )
  {
    ( ( flash_obj_free_buffer_p ) t->func )(
        t->output_arg[0],
        t->cntl );
  }

  // ---- Unknown task function ---------------------------------------------

  else
  {
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }

  return;
}
1780
1781
FLASH_Queue_verbose_output(void)1782 void FLASH_Queue_verbose_output( void )
1783 /*----------------------------------------------------------------------------
1784
1785 FLASH_Queue_verbose_output
1786
1787 ----------------------------------------------------------------------------*/
1788 {
1789 int i, j, k;
1790 int n_threads = FLASH_Queue_get_num_threads();
1791 int n_tasks = FLASH_Queue_get_num_tasks();
1792 FLASH_Verbose verbose = FLASH_Queue_get_verbose_output();
1793 FLASH_Task* t;
1794 FLASH_Dep* d;
1795
1796 // Grab the head of the task queue.
1797 t = FLASH_Queue_get_head_task();
1798
1799 if ( verbose == FLASH_QUEUE_VERBOSE_READABLE )
1800 {
1801 // Iterate over linked list of tasks.
1802 for ( i = 0; i < n_tasks; i++ )
1803 {
1804 printf( "%d\t%s\t", t->order, t->name );
1805
1806 for ( j = 0; j < t->n_output_args; j++ )
1807 printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1808 t->output_arg[j].base->m_index,
1809 t->output_arg[j].base->n_index );
1810
1811 printf( ":= " );
1812
1813 for ( j = 0; j < t->n_output_args; j++ )
1814 printf( "%lu[%lu,%lu] ", t->output_arg[j].base->id,
1815 t->output_arg[j].base->m_index,
1816 t->output_arg[j].base->n_index );
1817
1818 for ( j = 0; j < t->n_input_args; j++ )
1819 printf( "%lu[%lu,%lu] ", t->input_arg[j].base->id,
1820 t->input_arg[j].base->m_index,
1821 t->input_arg[j].base->n_index );
1822
1823 printf( "\n" );
1824
1825 // Go to the next task.
1826 t = t->next_task;
1827 }
1828
1829 printf( "\n" );
1830 }
1831 else
1832 {
1833 printf( "digraph SuperMatrix {\n" );
1834
1835 if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE )
1836 {
1837 // Iterate over linked list of tasks.
1838 for ( i = 0; i < n_tasks; i++ )
1839 {
1840 printf( "%d [label=\"%s\"]; %d -> {", t->order, t->name, t->order);
1841
1842 d = t->dep_arg_head;
1843 for ( j = 0; j < t->n_dep_args; j++ )
1844 {
1845 printf( "%d;", d->task->order );
1846 d = d->next_dep;
1847 }
1848
1849 printf( "};\n" );
1850
1851 // Go to the next task.
1852 t = t->next_task;
1853 }
1854 }
1855 else
1856 {
1857 // Iterate over all the threads.
1858 for ( k = 0; k < n_threads; k++ )
1859 {
1860 printf( "subgraph cluster%d {\nlabel=\"%d\"\n", k, k );
1861
1862 // Iterate over linked list of tasks.
1863 for ( i = 0; i < n_tasks; i++ )
1864 {
1865 if ( t->queue == k )
1866 printf( "%d [label=\"%s\"];\n", t->order, t->name );
1867
1868 // Go to the next task.
1869 t = t->next_task;
1870 }
1871
1872 printf( "}\n" );
1873
1874 // Grab the head of the task queue.
1875 t = FLASH_Queue_get_head_task();
1876 }
1877
1878 // Iterate over linked list of tasks.
1879 for ( i = 0; i < n_tasks; i++ )
1880 {
1881 printf( "%d -> {", t->order );
1882
1883 d = t->dep_arg_head;
1884 for ( j = 0; j < t->n_dep_args; j++ )
1885 {
1886 printf( "%d;", d->task->order );
1887 d = d->next_dep;
1888 }
1889
1890 printf( "};\n" );
1891
1892 // Go to the next task.
1893 t = t->next_task;
1894 }
1895 }
1896
1897 printf( "}\n\n" );
1898 }
1899
1900 return;
1901 }
1902
1903
1904 #endif // FLA_ENABLE_SUPERMATRIX
1905