1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
13 
14 #if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
15 #ifdef FLA_ENABLE_TIDSP
16 #include <ti/omp/omp.h>
17 #else
18 #include <omp.h>
19 #endif
20 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
21 #include <pthread.h>
22 #endif
23 
24 #ifdef FLA_ENABLE_WINDOWS_BUILD
25 #define _CRT_RAND_S
26 #include <stdlib.h>
27 #endif
28 
29 
30 #ifdef FLA_ENABLE_SUPERMATRIX
31 
32 #ifndef FLA_ENABLE_SCC
33 
34 #define MIN_CACHE_BLOCKS  3
35 
36 #ifdef FLA_ENABLE_GPU
typedef struct FLA_Obj_gpu_struct
{
   // The FLA_Obj whose block is mirrored on a GPU.
   FLA_Obj      obj;

   // Pointer to the block's data in GPU memory.
   void*        buffer_gpu;

   // Whether the block is clean or dirty on the GPU; TRUE (clean) means the
   // GPU copy matches main memory, FALSE (dirty) means the GPU holds updates
   // not yet flushed back.
   FLA_Bool     clean;

   // Whether the block has been requested by another GPU.
   FLA_Bool     request;

} FLA_Obj_gpu;
52 #endif
53 
typedef struct FLASH_Queue_variables
{
   // A lock on the global task counter (pc below).
   // Needed only when multithreading is enabled.
   FLA_Lock     all_lock;

   // One lock per waiting queue, protecting that queue's linked list.
   // Needed only when multithreading is enabled.
   FLA_Lock*    run_lock;

   // One lock per thread that allows threads to safely check for and place
   // ready dependent tasks on a waiting queue.  Needed only when
   // multithreading is enabled.
   FLA_Lock*    dep_lock;

   // One lock per thread that allows threads to safely free the
   // anti-dependency queue within each block.  Needed only when
   // multithreading is enabled.
   FLA_Lock*    war_lock;

   // One lock per cache that allows threads to safely access the cache
   // structures below.  Needed only when multithreading is enabled.
   FLA_Lock*    cac_lock;

   // Number of waiting queues.
   int          n_queues;

   // Number of simulated caches.
   int          n_caches;

   // The number of blocks that can be stored in the cache on each thread.
   int          size;

   // LRU cache simulation of blocks: n_caches contiguous segments of
   // 'size' entries each, most recently used first within a segment.
   FLA_Obj*     cache;

   // List of blocks accessed by the first tasks (prefetch candidates).
   FLA_Obj*     prefetch;

   // The waiting queue of tasks for each thread.
   FLASH_Queue* wait_queue;

   // A global task counter that keeps track of how many tasks on the waiting
   // queue have been processed.
   int          pc;

#ifdef FLA_ENABLE_GPU
   // One lock per thread that allows threads to safely access the GPU
   // cache structures below.  Needed only when multithreading is enabled.
   FLA_Lock*    gpu_lock;

   // LRU software cache of GPU memory.
   FLA_Obj_gpu* gpu;

   // Storing the block being evicted (one slot per thread).
   FLA_Obj_gpu* victim;

   // Temporary storage for logging blocks on GPU.
   FLA_Obj_gpu* gpu_log;

   // The size of each block to allocate on GPU.
   dim_t        block_size;

   // The datatype of each block to allocate on GPU.
   FLA_Datatype datatype;
#endif
} FLASH_Queue_vars;
119 
120 
void FLASH_Queue_exec( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec

   Execute every task currently on the global SuperMatrix task queue.
   Configures queue/affinity policy, allocates and initializes the scheduler
   state (locks, LRU cache model, prefetch list, waiting queues, optional GPU
   cache), runs the tasks either in parallel or in sequential simulation,
   times the run, and tears all of the state back down.  Returns immediately
   when the task queue is empty.

----------------------------------------------------------------------------*/
{
   int          n_tasks    = FLASH_Queue_get_num_tasks();
   int          n_threads  = FLASH_Queue_get_num_threads();
   int          n_queues;
   int          n_caches;
   int          size;
   int          i;
   dim_t        block_size = FLASH_Queue_get_block_size();
   double       dtime;

   FLA_Lock*    run_lock;
   FLA_Lock*    dep_lock;
   FLA_Lock*    war_lock;
   FLA_Lock*    cac_lock;

   FLA_Obj*     cache;
   FLA_Obj*     prefetch;
   FLASH_Queue* wait_queue;

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock*    gpu_lock;
#endif
   FLA_Obj_gpu* gpu;
   FLA_Obj_gpu* victim;
   FLA_Obj_gpu* gpu_log;
   dim_t        gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
#endif

   // All the necessary variables for the SuperMatrix mechanism.
   FLASH_Queue_vars args;

   // If the queue is empty, return early.
   if ( n_tasks == 0 )
      return;

#ifndef FLA_ENABLE_MULTITHREADING
   // Turn off work stealing in simulation mode.
   FLASH_Queue_set_work_stealing( FALSE );
#endif

   // Query the number of user set threads per queue.
   n_queues = FLASH_Queue_get_cores_per_queue();

   // Default user setting for number of threads.
   if ( n_queues <= 0 )
   {
      // Do not use data affinity or work stealing when caching is enabled.
      if ( FLASH_Queue_get_caching() )
      {
         FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
         FLASH_Queue_set_work_stealing( FALSE );
      }

      // Do not use work stealing when data affinity is enabled.
      if ( FLASH_Queue_get_data_affinity() != FLASH_QUEUE_AFFINITY_NONE )
      {
         FLASH_Queue_set_work_stealing( FALSE );
      }

      // Allocate different arrays if using data affinity: a single shared
      // queue when neither affinity nor work stealing is active, otherwise
      // one queue per thread.
      n_queues = ( FLASH_Queue_get_data_affinity() ==
                   FLASH_QUEUE_AFFINITY_NONE &&
                   !FLASH_Queue_get_work_stealing() ? 1 : n_threads );
   }
   else
   {
      // Set the number of queues from the threads-per-queue setting.
      n_queues = n_threads / n_queues;

      // Must use at least one queue.
      if ( n_queues == 0 )
         n_queues = 1;

      if ( n_queues == 1 )
      {
         // Turn off all multiple queue implementations.
         FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
         FLASH_Queue_set_work_stealing( FALSE );
      }
      else
      {
         // Use 2D data affinity for multiple queues if nothing is set.
         if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE &&
              !FLASH_Queue_get_work_stealing() )
         {
            FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC );
         }
      }
   }

   // Determine the number of caches (threads may share a cache).
   n_caches = n_threads / FLASH_Queue_get_cores_per_cache();

   args.n_queues = n_queues;
   args.n_caches = n_caches;

#ifdef FLA_ENABLE_MULTITHREADING
   // Allocate memory for arrays of locks: one run lock per queue, one dep
   // and war lock per thread, one cac lock per cache.
   run_lock = ( FLA_Lock* ) FLA_malloc( n_queues  * sizeof( FLA_Lock ) );
   dep_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   war_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   cac_lock = ( FLA_Lock* ) FLA_malloc( n_caches  * sizeof( FLA_Lock ) );

   args.run_lock = run_lock;
   args.dep_lock = dep_lock;
   args.war_lock = war_lock;
   args.cac_lock = cac_lock;

   // Initialize the all lock.
   FLA_Lock_init( &(args.all_lock) );

   // Initialize the run lock for each queue.
   for ( i = 0; i < n_queues; i++ )
   {
      FLA_Lock_init( &(args.run_lock[i]) );
   }

   // Initialize the dep and war locks for each thread.
   for ( i = 0; i < n_threads; i++ )
   {
      FLA_Lock_init( &(args.dep_lock[i]) );
      FLA_Lock_init( &(args.war_lock[i]) );
   }

   // Initialize the cac locks for each cache.
   for ( i = 0; i < n_caches; i++ )
   {
      FLA_Lock_init( &(args.cac_lock[i]) );
   }
#endif

   // The number of blocks that can fit into the cache on each thread.
   // When no block size is known, fall back to the minimum.
   if ( block_size == 0 )
      size = MIN_CACHE_BLOCKS;
   else
      size = max( FLASH_Queue_get_cache_size() / block_size, MIN_CACHE_BLOCKS);
   args.size = size;

   // Allocate memory for cache, prefetch buffer, and waiting queue.
   cache = ( FLA_Obj* ) FLA_malloc( size * n_caches * sizeof( FLA_Obj ) );
   prefetch = ( FLA_Obj* ) FLA_malloc( size * sizeof( FLA_Obj ) );
   wait_queue = ( FLASH_Queue* ) FLA_malloc( n_queues * sizeof( FLASH_Queue ));

   args.cache = cache;
   args.prefetch = prefetch;
   args.wait_queue = wait_queue;

   // Initialize cache, prefetch buffer, and waiting queue.  A NULL base
   // marks an empty slot.
   for ( i = 0; i < size * n_caches; i++ )
      args.cache[i].base = NULL;

   for ( i = 0; i < size; i++ )
      args.prefetch[i].base = NULL;

   for ( i = 0; i < n_queues; i++ )
   {
      args.wait_queue[i].n_tasks = 0;
      args.wait_queue[i].head = NULL;
      args.wait_queue[i].tail = NULL;
   }

   // Initialize the aggregate task counter.
   args.pc = 0;

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   // Allocate and initialize the gpu locks, one per thread.
   gpu_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   args.gpu_lock = gpu_lock;

   for ( i = 0; i < n_threads; i++ )
      FLA_Lock_init( &(args.gpu_lock[i]) );
#endif
   // Allocate and initialize GPU software cache: gpu_n_blocks slots per
   // thread, initially empty and clean.
   gpu = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
   args.gpu = gpu;

   for ( i = 0; i < gpu_n_blocks * n_threads; i++ )
   {
      args.gpu[i].obj.base   = NULL;
      args.gpu[i].buffer_gpu = NULL;
      args.gpu[i].clean      = TRUE;
      args.gpu[i].request    = FALSE;
   }

   // One eviction slot per thread.
   victim = ( FLA_Obj_gpu* ) FLA_malloc( n_threads * sizeof( FLA_Obj_gpu ) );
   args.victim = victim;

   for ( i = 0; i < n_threads; i++ )
      args.victim[i].obj.base = NULL;

   // Scratch space for logging blocks resident on each GPU.
   gpu_log = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
   args.gpu_log = gpu_log;
#endif

   // Initialize tasks with critical information (affinity, priority,
   // prefetch candidates, ready set).
   FLASH_Queue_init_tasks( ( void* ) &args );

   // Display verbose output before free all tasks.
   if ( FLASH_Queue_get_verbose_output() )
      FLASH_Queue_verbose_output();

   // Start timing the parallel execution.
   dtime = FLA_Clock();

#ifdef FLA_ENABLE_MULTITHREADING
   // Parallel Execution!
   FLASH_Queue_exec_parallel( ( void* ) &args );
#else
   // Simulation!
   FLASH_Queue_exec_simulation( ( void* ) &args );
#endif

   // End timing the parallel execution.
   dtime = FLA_Clock() - dtime;
   FLASH_Queue_set_parallel_time( dtime );

#ifdef FLA_ENABLE_MULTITHREADING
   // Destroy the locks in the same grouping they were initialized.
   FLA_Lock_destroy( &(args.all_lock) );

   for ( i = 0; i < n_queues; i++ )
   {
      FLA_Lock_destroy( &(args.run_lock[i]) );
   }

   for ( i = 0; i < n_threads; i++ )
   {
      FLA_Lock_destroy( &(args.dep_lock[i]) );
      FLA_Lock_destroy( &(args.war_lock[i]) );
   }

   for ( i = 0; i < n_caches; i++ )
   {
      FLA_Lock_destroy( &(args.cac_lock[i]) );
   }

   // Deallocate memory.
   FLA_free( run_lock );
   FLA_free( dep_lock );
   FLA_free( war_lock );
   FLA_free( cac_lock );
#endif

   FLA_free( cache );
   FLA_free( prefetch );
   FLA_free( wait_queue );

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   for ( i = 0; i < n_threads; i++ )
      FLA_Lock_destroy( &(args.gpu_lock[i]) );
   FLA_free( gpu_lock );
#endif
   FLA_free( gpu );
   FLA_free( victim );
   FLA_free( gpu_log );
#endif

   // Reset values for next call to FLASH_Queue_exec().
   FLASH_Queue_reset();

   return;
}
392 
393 
void FLASH_Queue_init_tasks( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_init_tasks

   Walk the task DAG once (tail to head) to assign each task a waiting
   queue according to the data-affinity policy, compute its scheduling
   priority (height in the DAG), record the first blocks written as
   prefetch candidates, and count its unsatisfied dependencies.  A second
   pass (head to tail) enqueues every task whose dependency count is zero.
   When GPUs are enabled, also determines the largest block size and widest
   datatype so GPU buffers can be sized.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int            i, j, k;
   int            n_tasks    = FLASH_Queue_get_num_tasks();
   int            n_queues   = args->n_queues;
   int            n_prefetch = 0;
   int            n_ready    = 0;
   int            length     = 0;
   int            width      = 0;
   int            height     = 0;
   int            size       = args->size;
   FLASH_Data_aff data_aff   = FLASH_Queue_get_data_affinity();
   FLASH_Task*    t;
   FLASH_Dep*     d;
   FLA_Obj        obj;

#ifdef FLA_ENABLE_GPU
   dim_t block_size      = 0;
   FLA_Datatype datatype = FLA_FLOAT;
   dim_t datatype_size   = FLA_Obj_datatype_size( datatype );
#endif

   // Find the 2D factorization of the number of queues: width is the
   // largest divisor of n_queues not exceeding sqrt(n_queues), and
   // length = n_queues / width.
   if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
   {
      int sq_rt = 0;
      while ( sq_rt * sq_rt <= n_queues ) sq_rt++;
      sq_rt--;
      while ( n_queues % sq_rt != 0 ) sq_rt--;
      length = n_queues / sq_rt;
      width  = sq_rt;
   }

   // Grab the tail of the task queue; walking backwards lets each task see
   // the final heights of its dependents.
   t = FLASH_Queue_get_tail_task();

   for ( i = n_tasks - 1; i >= 0; i-- )
   {
      // Determine data affinity.
      if ( data_aff == FLASH_QUEUE_AFFINITY_NONE )
      { // No data affinity
         t->queue = 0;
      }
      else
      {
         // Use the first output block to determine data affinity.
         obj = t->output_arg[0];

         // Use the top left block of the macroblock.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            obj = *FLASH_OBJ_PTR_AT( obj );

         if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
         { // Two-dimensional block cyclic
            t->queue = ( obj.base->m_index % length ) +
               ( obj.base->n_index % width  ) * length;
         }
         else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_ROW_BLOCK_CYCLIC )
         { // One-dimensional row block cyclic
            t->queue = obj.base->m_index % n_queues;
         }
         else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_COLUMN_BLOCK_CYCLIC )
         { // One-dimensional column block cyclic
            t->queue = obj.base->n_index % n_queues;
         }
         else
         { // Round-robin
            t->queue = t->queue % n_queues;
         }
      }

      // Determine the height of each task in the DAG; height is used as the
      // scheduling priority (taller tasks have longer critical paths).
      height = 0;
      d = t->dep_arg_head;

      // Take the maximum height of dependent tasks.
      for ( j = 0; j < t->n_dep_args; j++ )
      {
         height = max( height, d->task->height );
         d = d->next_dep;
      }

      t->height = height + 1;

      // Since freeing a task is always a leaf, we want to force it to execute
      // earlier by giving it a greater height in order to reclaim memory.
      if ( t->func == (void *) FLA_Obj_free_buffer_task )
         t->height += n_tasks;

#ifdef FLA_ENABLE_GPU
      // Scan every argument to find the largest block and the widest
      // datatype; both are needed to size GPU buffers.
      for ( j = 0; j < t->n_output_args + t->n_input_args; j++ )
      {
         // Find the correct input or output argument.
         if ( j < t->n_output_args )
            obj = t->output_arg[j];
         else
            obj = t->input_arg[j - t->n_output_args];

         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m   = FLA_Obj_length( obj );
            dim_t    n   = FLA_Obj_width( obj );
            dim_t    cs  = FLA_Obj_col_stride( obj );
            FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

            // Check each block in macroblock.
            for ( jj = 0; jj < n; jj++ )
            {
               for ( kk = 0; kk < m; kk++ )
               {
                  obj = *( buf + jj * cs + kk );

                  block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );

                  // NOTE(review): only the first column (jj == 0) is checked
                  // for a wider datatype here -- presumably all columns share
                  // a datatype; confirm against macroblock invariants.
                  if ( jj == 0 && FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
                  {
                     datatype      = FLA_Obj_datatype( obj );
                     datatype_size = FLA_Obj_datatype_size( datatype );
                  }
               }
            }
         }
         else // Regular block.
         {
            block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );

            if ( FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
            {
               datatype      = FLA_Obj_datatype( obj );
               datatype_size = FLA_Obj_datatype_size( datatype );
            }
         }
      }
#endif

      // Find the first blocks accessed by each task; up to 'size' of them
      // are recorded for prefetching before execution begins.
      if ( n_prefetch < size )
      {
         for ( j = 0; j < t->n_output_args; j++ )
         {
            obj = t->output_arg[j];

            // Macroblock is used.
            if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            {
               dim_t    jj, kk;
               dim_t    m   = FLA_Obj_length( obj );
               dim_t    n   = FLA_Obj_width( obj );
               dim_t    cs  = FLA_Obj_col_stride( obj );
               FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

               // Check each block in macroblock.
               for ( jj = 0; jj < n; jj++ )
               {
                  for ( kk = 0; kk < m; kk++ )
                  {
                     obj = *( buf + jj * cs + kk );

                     k = obj.base->n_write_blocks;

                     // This block is one of the first blocks to be accessed.
                     if ( k < size && k == n_prefetch )
                     {
                        args->prefetch[k] = obj;
                        n_prefetch++;
                     }
                  }
               }
            }
            else // Regular block.
            {
               k = obj.base->n_write_blocks;

               // This block is one of the first blocks to be accessed.
               if ( k < size && k == n_prefetch )
               {
                  args->prefetch[k] = obj;
                  n_prefetch++;
               }
            }
         }
      }

      // Find all ready tasks: the ready count starts at the total number of
      // argument dependencies and is decremented as they are satisfied.
      t->n_ready += t->n_input_args + t->n_output_args +
                    t->n_macro_args + t->n_war_args;

      if ( t->n_ready == 0 )
      {
         // Save the number of ready and available tasks.
         n_ready++;
      }

      // Go to the previous task.
      t = t->prev_task;
   }

   // Grab the head of the task queue.
   t = FLASH_Queue_get_head_task();

   // Second pass: enqueue every dependency-free task, stopping early once
   // all counted ready tasks have been placed.
   for ( i = 0; i < n_tasks && n_ready > 0; i++ )
   {
      if ( t->n_ready == 0 )
      {
         // Enqueue all the ready and available tasks.
         FLASH_Queue_wait_enqueue( t, arg );

         // Decrement the number of ready tasks left to be enqueued.
         n_ready--;
      }

      // Go to the next task.
      t = t->next_task;
   }

#ifdef FLA_ENABLE_GPU
   // Publish the GPU buffer sizing determined above.
   args->block_size = block_size;
   args->datatype   = datatype;
#endif

   return;
}
624 
625 
void FLASH_Queue_wait_enqueue( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_enqueue

   Place task t on its assigned waiting queue.  When sorting is enabled the
   task is inserted so the queue stays ordered by decreasing height
   (priority); otherwise it is simply appended at the tail.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   FLASH_Queue*      q    = &args->wait_queue[t->queue];

   if ( q->n_tasks == 0 )
   {
      // Empty queue: the task becomes both head and tail.
      q->head = t;
      q->tail = t;
   }
   else
   {
      // Start the insertion scan from the tail.
      t->prev_wait = q->tail;

      if ( FLASH_Queue_get_sorting() )
      {
         // Walk toward the head past any task of strictly smaller height,
         // so tasks remain sorted by non-increasing height.
         while ( t->prev_wait != NULL && t->prev_wait->height < t->height )
         {
            t->next_wait = t->prev_wait;
            t->prev_wait = t->prev_wait->prev_wait;
         }
      }

      // Splice t into the doubly linked list: fix the backward link
      // (or the queue head) ...
      if ( t->prev_wait == NULL )
         q->head = t;
      else
         t->prev_wait->next_wait = t;

      // ... and the forward link (or the queue tail).
      if ( t->next_wait == NULL )
         q->tail = t;
      else
         t->next_wait->prev_wait = t;
   }

   // Account for the newly enqueued task.
   q->n_tasks++;

   return;
}
676 
677 
FLASH_Task* FLASH_Queue_wait_dequeue( int queue, int cache, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_dequeue

   Remove and return a task from the given waiting queue, or NULL if the
   queue is empty.  Without cache affinity this is the head task; with
   caching enabled, a task whose output blocks are already resident in the
   given (software) cache -- or GPU, when GPUs are enabled -- is preferred.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   FLASH_Task* t = NULL;
   FLA_Bool enabled = FALSE;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();
#endif

   if ( args->wait_queue[queue].n_tasks > 0 )
   {
      // Dequeue the first task by default.
      t = args->wait_queue[queue].head;

      if ( args->wait_queue[queue].n_tasks == 1 )
      {
         // Clear the queue of its only task.
         args->wait_queue[queue].head = NULL;
         args->wait_queue[queue].tail = NULL;
      }
      else
      {
         // Grab a new task if using cache affinity.
         if ( FLASH_Queue_get_caching() )
         {
            // Determine if using GPU or not.
            if ( enabled )
            {
#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_acquire( &(args->gpu_lock[cache]) ); // G ***
#endif
               // Find a task where the task has blocks currently in GPU.
               t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );

#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_release( &(args->gpu_lock[cache]) ); // G ***
#endif
#endif
            }
            else
            {
#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif
               // Find a task where the task has blocks currently in cache.
               t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );

#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif
            }

            // Adjust pointers if the task is head of waiting queue.
            // (n_tasks > 1 here, so the neighboring task always exists.)
            if ( t->prev_wait == NULL )
            {
               args->wait_queue[queue].head = t->next_wait;
               args->wait_queue[queue].head->prev_wait = NULL;
            }
            else
            {
               t->prev_wait->next_wait = t->next_wait;
            }

            // Adjust pointers if the task is tail of waiting queue.
            if ( t->next_wait == NULL )
            {
               args->wait_queue[queue].tail = t->prev_wait;
               args->wait_queue[queue].tail->next_wait = NULL;
            }
            else
            {
               t->next_wait->prev_wait = t->prev_wait;
            }
         }
         else
         {
            // No caching: simply pop the head of the waiting queue.
            args->wait_queue[queue].head = t->next_wait;
            args->wait_queue[queue].head->prev_wait = NULL;
         }
      }

      // Clear the task's waiting linked list pointers.
      t->prev_wait = NULL;
      t->next_wait = NULL;

      // Decrement number of tasks on waiting queue.
      args->wait_queue[queue].n_tasks--;
   }

   return t;
}
777 
778 
FLASH_Task* FLASH_Queue_wait_dequeue_block( int queue, int cache, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_dequeue_block

   Scan the waiting queue for a task that has an output block currently
   resident in the given software cache (or GPU cache when GPUs are
   enabled).  The first such task is returned with its hit flag set; if no
   task hits, the head of the waiting queue is returned as the fallback.
   Caller must hold the corresponding cac/gpu lock.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         i, j, k;
   int         size    = args->size;
   int         n_tasks = args->wait_queue[queue].n_tasks;
   FLA_Bool    enabled = FALSE;
   FLASH_Task* t;
   FLA_Obj     obj;
   FLA_Obj     mem;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();

   // If using GPUs, then only check GPU and not the cache.
   if ( enabled )
      size = FLASH_Queue_get_gpu_num_blocks();
#endif

   t = args->wait_queue[queue].head;

   // Check if any of the output blocks are in the cache.
   for ( i = 0; i < n_tasks; i++ )
   {
      for ( j = 0; j < size; j++ )
      {
         // Initialize the memory just in case.
         mem.base = NULL;

         // Determine if using GPU or not.
         if ( enabled )
         {
#ifdef FLA_ENABLE_GPU
            mem = args->gpu[cache * size + j].obj;
#endif
         }
         else
         {
            mem = args->cache[cache * size + j];
         }

         for ( k = 0; k < t->n_output_args; k++ )
         {
            obj = t->output_arg[k];

            // For a macroblock, compare against its top-left block.
            if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               obj = *FLASH_OBJ_PTR_AT( obj );

            // Return the task if its output block is in cache.
            if ( mem.base == obj.base )
            {
               t->hit = TRUE;
               return t;
            }
         }
      }
      t = t->next_wait;
   }

   // No task hit in the cache; fall back to the head of the queue.
   return args->wait_queue[queue].head;
}
845 
846 
FLASH_Queue_update_cache(FLASH_Task * t,void * arg)847 void FLASH_Queue_update_cache( FLASH_Task* t, void* arg )
848 /*----------------------------------------------------------------------------
849 
850    FLASH_Queue_update_cache
851 
852 ----------------------------------------------------------------------------*/
853 {
854    int      i, j;
855    FLA_Bool duplicate;
856    FLA_Obj  obj;
857 
858    if ( t == NULL )
859       return;
860 
861    // Updating the input blocks.
862    for ( i = t->n_input_args - 1; i >= 0; i-- )
863    {
864       // Check for duplicate blocks.
865       duplicate = FALSE;
866 
867       for ( j = 0; j < t->n_output_args && !duplicate; j++ )
868       {
869 	 if ( t->input_arg[i].base == t->output_arg[j].base )
870 	    duplicate = TRUE;
871       }
872 
873       for ( j = 0; j < i && !duplicate; j++ )
874       {
875 	 if ( t->input_arg[i].base == t->input_arg[j].base )
876 	    duplicate = TRUE;
877       }
878 
879       // If the input block has not been processed before.
880       if ( !duplicate )
881       {
882          obj = t->input_arg[i];
883 
884          // Macroblock is used.
885          if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
886          {
887             dim_t    jj, kk;
888             dim_t    m    = FLA_Obj_length( obj );
889             dim_t    n    = FLA_Obj_width( obj );
890             dim_t    cs   = FLA_Obj_col_stride( obj );
891             FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
892 
893             // Dependence analysis for each input block in macroblock.
894             for ( jj = 0; jj < n; jj++ )
895                for ( kk = 0; kk < m; kk++ )
896                   FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
897                                                   t->cache, FALSE, arg );
898          }
899          else // Regular block.
900          {
901             FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
902          }
903       }
904    }
905 
906    // Updating the output blocks.
907    for ( i = t->n_output_args - 1; i >= 0; i-- )
908    {
909       // Check for duplicate blocks.
910       duplicate = FALSE;
911 
912       for ( j = 0; j < i && !duplicate; j++ )
913       {
914 	 if ( t->output_arg[i].base == t->output_arg[j].base )
915 	    duplicate = TRUE;
916       }
917 
918       // If the output block has not been processed before.
919       if ( !duplicate )
920       {
921          obj = t->output_arg[i];
922 
923          // Macroblock is used.
924          if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
925          {
926             dim_t    jj, kk;
927             dim_t    m    = FLA_Obj_length( obj );
928             dim_t    n    = FLA_Obj_width( obj );
929             dim_t    cs   = FLA_Obj_col_stride( obj );
930             FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
931 
932             // Dependence analysis for each input block in macroblock.
933             for ( jj = 0; jj < n; jj++ )
934                for ( kk = 0; kk < m; kk++ )
935                   FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
936                                                   t->cache, TRUE, arg );
937          }
938          else // Regular block.
939          {
940             FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
941          }
942       }
943    }
944 
945    return;
946 }
947 
948 
void FLASH_Queue_update_cache_block( FLA_Obj obj,
                                     int cache,
                                     FLA_Bool output,
                                     void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_update_cache_block

   Touch a single block in the LRU cache simulation: move (or insert) it to
   the most-recently-used slot of the given cache.  When the access is a
   write (output == TRUE), the block is also invalidated in every other
   cache to model write-invalidate coherence.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int n_caches = args->n_caches;
   int size     = args->size;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif

   // Locate the position of the block in the cache.  Only the first
   // size - 1 slots are searched: if the block is absent (or in the last
   // slot), k lands on size - 1, and the shift below drops the least
   // recently used entry, i.e. performs the LRU eviction.
   for ( k = 0; k < size - 1; k++ )
   {
      if ( obj.base == args->cache[cache * size + k].base )
         break;
   }

   // Shift all the previous entries down one slot for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->cache[cache * size + j] = args->cache[cache * size + j - 1];

   // Place the block on the cache as the most recently used.
   args->cache[cache * size] = obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif

   // Write invalidate if updating with output block.
   if ( output )
   {
      for ( i = 0; i < n_caches; i++ )
      {
         if ( i != cache )
         {
#ifdef FLA_ENABLE_MULTITHREADING
            FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
#endif
            // Locate the position of the block in the cache.
            for ( k = 0; k < size; k++ )
            {
               if ( obj.base == args->cache[i * size + k].base )
                  break;
            }

            // The block is owned by other thread.
            if ( k < size )
            {
               // Shift all the blocks for the invalidated block.
               for ( j = k; j < size - 1; j++ )
                  args->cache[i * size + j] = args->cache[i * size + j + 1];

               // Invalidate the block.
               args->cache[i * size + size - 1].base = NULL;
            }
#ifdef FLA_ENABLE_MULTITHREADING
            FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
#endif
         }
      }
   }

   return;
}
1022 
1023 
FLASH_Queue_prefetch(int cache,void * arg)1024 void FLASH_Queue_prefetch( int cache, void* arg )
1025 /*----------------------------------------------------------------------------
1026 
1027    FLASH_Queue_prefetch
1028 
1029 ----------------------------------------------------------------------------*/
1030 {
1031    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1032    int i;
1033    int size = args->size;
1034    FLA_Obj obj;
1035 
1036    // Prefetch blocks in opposite order to maintain LRU.
1037    for ( i = size - 1; i >= 0; i-- )
1038    {
1039       obj = args->prefetch[i];
1040 
1041       // Only prefetch if it is a valid block.
1042       if ( obj.base != NULL )
1043       {
1044          // Prefetch the block.
1045          FLASH_Queue_prefetch_block( obj );
1046 
1047          // Record the prefetched block in the cache.
1048          args->cache[cache * size + i] = obj;
1049       }
1050    }
1051 
1052    return;
1053 }
1054 
1055 
void FLASH_Queue_prefetch_block( FLA_Obj obj )
/*----------------------------------------------------------------------------

   FLASH_Queue_prefetch_block

   Warm the hardware cache by reading one element from each cache line of
   the block's buffer.  The reads are performed through volatile pointers
   so the compiler cannot eliminate them as dead code; the previous
   "access += 1.0" idiom did not reliably prevent elimination because the
   local was never observed.

----------------------------------------------------------------------------*/
{
   int          i, inc;
   int          line_size = FLASH_Queue_get_cache_line_size();
   int          elem_size = FLA_Obj_elem_size( obj );
   int          length    = FLA_Obj_length( obj );
   int          width     = FLA_Obj_width( obj );
   FLA_Datatype datatype  = FLA_Obj_datatype( obj );

   // Determine stride to prefetch block into cache: one element per line.
   inc = line_size / elem_size;

   // Guard against an element type larger than a cache line, which would
   // produce a zero stride and an infinite loop below.
   if ( inc < 1 )
      inc = 1;

   // Switch between the five supported datatypes.
   switch ( datatype )
   {
      case FLA_FLOAT:
      {
         // Volatile forces every read to be issued; casting the result to
         // void discards the value but not the access.
         volatile float *buffer = ( volatile float * ) FLA_FLOAT_PTR( obj );

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            ( void ) buffer[i];

         break;
      }
      case FLA_DOUBLE:
      {
         volatile double *buffer = ( volatile double * ) FLA_DOUBLE_PTR( obj );

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            ( void ) buffer[i];

         break;
      }
      case FLA_COMPLEX:
      {
         // Read only the real part; one scalar load per line suffices to
         // fault the line in.
         volatile float *buffer = ( volatile float * ) FLA_COMPLEX_PTR( obj );

         // Access each cache line of the block (two floats per element).
         for ( i = 0; i < length * width; i += inc )
            ( void ) buffer[2 * i];

         break;
      }
      case FLA_DOUBLE_COMPLEX:
      {
         volatile double *buffer = ( volatile double * ) FLA_DOUBLE_COMPLEX_PTR( obj );

         // Access each cache line of the block (two doubles per element).
         for ( i = 0; i < length * width; i += inc )
            ( void ) buffer[2 * i];

         break;
      }
      case FLA_INT:
      {
         volatile int *buffer = ( volatile int * ) FLA_INT_PTR( obj );

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            ( void ) buffer[i];

         break;
      }
      default:
         // This default case should never execute.
         FLA_Check_error_code( FLA_INVALID_DATATYPE );
   }

   return;
}
1153 
1154 
FLASH_Queue_work_stealing(int queue,void * arg)1155 FLASH_Task* FLASH_Queue_work_stealing( int queue, void *arg )
1156 /*----------------------------------------------------------------------------
1157 
1158    FLASH_Queue_work_stealing
1159 
1160 ----------------------------------------------------------------------------*/
1161 {
1162    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1163    int         q;
1164    int         n_queues = args->n_queues;
1165    FLASH_Task* t = NULL;
1166 
1167    // Do not perform work stealing if there is only one queue.
1168    if ( n_queues == 1 )
1169       return t;
1170 
1171    // Find a random queue not equal to the current queue.
1172    do
1173    {
1174 #ifdef FLA_ENABLE_WINDOWS_BUILD
1175       rand_s( &q );
1176       q = q % n_queues;
1177 #else
1178 #ifdef FLA_ENABLE_TIDSP
1179       q = rand() % n_queues;
1180 #else
1181       q = lrand48() % n_queues;
1182 #endif
1183 #endif
1184    }
1185    while ( q == queue );
1186 
1187 #ifdef FLA_ENABLE_MULTITHREADING
1188    FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
1189 #endif
1190 
1191    // If there are tasks that this thread can steal.
1192    if ( args->wait_queue[q].n_tasks > 0 )
1193    {
1194       // Dequeue the last task.
1195       t = args->wait_queue[q].tail;
1196 
1197       if ( args->wait_queue[q].n_tasks == 1 )
1198       {
1199          // Clear the queue of its only task.
1200          args->wait_queue[q].head = NULL;
1201          args->wait_queue[q].tail = NULL;
1202       }
1203       else
1204       {
1205          // Adjust pointers in waiting queue.
1206          args->wait_queue[q].tail = t->prev_wait;
1207          args->wait_queue[q].tail->next_wait = NULL;
1208       }
1209 
1210       // Reset waiting queue data about the stolen task.
1211       t->queue = queue;
1212       t->prev_wait = NULL;
1213       t->next_wait = NULL;
1214 
1215       args->wait_queue[q].n_tasks--;
1216    }
1217 
1218 #ifdef FLA_ENABLE_MULTITHREADING
1219    FLA_Lock_release( &(args->run_lock[q]) ); // R ***
1220 #endif
1221 
1222    return t;
1223 }
1224 
1225 #ifdef FLA_ENABLE_GPU
1226 
FLASH_Queue_create_gpu(int thread,void * arg)1227 void FLASH_Queue_create_gpu( int thread, void *arg )
1228 /*----------------------------------------------------------------------------
1229 
1230    FLASH_Queue_create_gpu
1231 
1232 ----------------------------------------------------------------------------*/
1233 {
1234    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1235    int i;
1236    dim_t gpu_n_blocks     = FLASH_Queue_get_gpu_num_blocks();
1237    dim_t block_size       = args->block_size;
1238    FLA_Datatype datatype  = args->datatype;
1239 
1240    // Exit if not using GPU.
1241    if ( !FLASH_Queue_get_enabled_gpu() )
1242       return;
1243 
1244    // Bind thread to GPU.
1245    FLASH_Queue_bind_gpu( thread );
1246 
1247    // Allocate the memory on the GPU for all the blocks a priori.
1248    for ( i = 0; i < gpu_n_blocks; i++ )
1249       FLASH_Queue_alloc_gpu( block_size, datatype, &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
1250 
1251    return;
1252 }
1253 
1254 
FLASH_Queue_destroy_gpu(int thread,void * arg)1255 void FLASH_Queue_destroy_gpu( int thread, void *arg )
1256 /*----------------------------------------------------------------------------
1257 
1258    FLASH_Queue_destroy_gpu
1259 
1260 ----------------------------------------------------------------------------*/
1261 {
1262    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1263    int i;
1264    dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1265    FLA_Obj_gpu gpu_obj;
1266 
1267    // Exit if not using GPU.
1268    if ( !FLASH_Queue_get_enabled_gpu() )
1269       return;
1270 
1271    // Examine every block left on the GPU.
1272    for ( i = 0; i < gpu_n_blocks; i++ )
1273    {
1274       gpu_obj = args->gpu[thread * gpu_n_blocks + i];
1275 
1276       // Flush the blocks that are dirty.
1277       if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1278          FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1279 
1280       // Free the memory on the GPU for all the blocks.
1281       FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
1282    }
1283 
1284    return;
1285 }
1286 
1287 
FLA_Bool FLASH_Queue_exec_gpu( FLASH_Task *t, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_gpu

   Execute task t, preferring the GPU.  Returns TRUE when the task has
   been executed (on GPU or CPU) and FALSE when it could not run yet and
   was placed back on its waiting queue to await flushes on other GPUs.

----------------------------------------------------------------------------*/
{
   void** input_arg;
   void** output_arg;

   // A null task is treated as trivially done.
   if ( t == NULL )
      return TRUE;

   // If not using the GPU, then execute on CPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
   {
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Check if all the operands are ready and up to date.
   if ( !FLASH_Queue_check_gpu( t, arg ) )
   {
      FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
      int queue = t->queue;
      // Record the cache miss on this task.
      t->hit = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
#endif
      // Reenqueue the task if the blocks are not all flushed.
      FLASH_Queue_wait_enqueue( t, arg );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
#endif

      return FALSE;
   }

   // If GPU is enabled, but the task is not supported for GPU execution.
   // Before running on the CPU, flush this task's blocks from this
   // thread's GPU and invalidate its output blocks on every GPU so that
   // main memory is the sole up-to-date copy.
   if ( !t->enabled_gpu )
   {
      int i, j, k;
      int thread        = t->thread;
      int n_input_args  = t->n_input_args;
      int n_output_args = t->n_output_args;
      int n_threads     = FLASH_Queue_get_num_threads();
      FLA_Bool duplicate;
      FLA_Obj  obj;

      // Check the blocks on each GPU.
      for ( k = 0; k < n_threads; k++ )
      {
         // Check the input and output arguments on the GPUs.
         for ( i = 0; i < n_input_args + n_output_args; i++ )
         {
            // Check for duplicate blocks.
            duplicate = FALSE;

            // Find the correct input or output argument.
            if ( i < n_input_args )
            {
               obj = t->input_arg[i];

               // An input that also appears as an output is handled once,
               // as an output.
               for ( j = 0; j < n_output_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }

               // Skip inputs already handled at a lower index.
               for ( j = 0; j < i && !duplicate; j++ )
               {
                  if ( obj.base == t->input_arg[j].base )
                     duplicate = TRUE;
               }
            }
            else
            {
               obj = t->output_arg[i - n_input_args];

               // Skip outputs already handled at a lower index.
               for ( j = 0; j < i - n_input_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }
            }

            // If the block has not been processed before.
            if ( !duplicate )
            {
               // Macroblock is used.
               if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               {
                  dim_t    jj, kk;
                  dim_t    m    = FLA_Obj_length( obj );
                  dim_t    n    = FLA_Obj_width( obj );
                  dim_t    cs   = FLA_Obj_col_stride( obj );
                  FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

                  // Clear each block in macroblock.
                  for ( jj = 0; jj < n; jj++ )
                  {
                     for ( kk = 0; kk < m; kk++ )
                     {
                        obj = *( buf + jj * cs + kk );

                        // Flush the block to main memory if it is on the GPU.
                        if ( k == thread )
                           FLASH_Queue_flush_block_gpu( obj, k, arg );

                        // Invalidate output block on all GPUs.
                        if ( i >= n_input_args )
                           FLASH_Queue_invalidate_block_gpu( obj, k, arg );
                     }
                  }
               }
               else
               {
                  // Flush the block to main memory if it is on the GPU.
                  if ( k == thread )
                     FLASH_Queue_flush_block_gpu( obj, k, arg );

                  // Invalidate output block on all GPUs.
                  if ( i >= n_input_args )
                     FLASH_Queue_invalidate_block_gpu( obj, k, arg );
               }
            }
         }
      }

      // Execute the task on CPU instead of GPU.
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Gather the pointers for the data on the GPU.
   input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
   output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );

   // Bring all the blocks to GPU.
   FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );

   // Execute the task on GPU.
   FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );

   // Mark all the output blocks as dirty.
   FLASH_Queue_mark_gpu( t, arg );

   // Free memory.
   FLA_free( input_arg );
   FLA_free( output_arg );

   return TRUE;
}
1445 
1446 
FLA_Bool FLASH_Queue_check_gpu( FLASH_Task *t, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_check_gpu

   Return TRUE when no GPU other than this task's own holds a dirty copy
   of any input or output block of t.  As a side effect, dirty remote
   copies are flagged for flushing (see FLASH_Queue_check_block_gpu).

----------------------------------------------------------------------------*/
{
   int i, j, k;
   int thread        = t->thread;
   int n_input_args  = t->n_input_args;
   int n_output_args = t->n_output_args;
   int n_threads     = FLASH_Queue_get_num_threads();
   FLA_Bool r_val    = TRUE;   // cumulative result over all operands
   FLA_Bool t_val;             // result for the block being checked
   FLA_Bool duplicate;
   FLA_Obj  obj;

   // Check the input and output arguments on the GPUs.
   for ( i = 0; i < n_input_args + n_output_args; i++ )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      // Find the correct input or output argument.
      if ( i < n_input_args )
      {
         obj = t->input_arg[i];

         // An input that also appears as an output is checked only once,
         // as an output.
         for ( j = 0; j < n_output_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }

         // Skip inputs already checked at a lower index.
         for ( j = 0; j < i && !duplicate; j++ )
         {
            if ( obj.base == t->input_arg[j].base )
               duplicate = TRUE;
         }
      }
      else
      {
         obj = t->output_arg[i - n_input_args];

         // Skip outputs already checked at a lower index.
         for ( j = 0; j < i - n_input_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }
      }

      // If the block has not been processed before.
      if ( !duplicate )
      {
         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m    = FLA_Obj_length( obj );
            dim_t    n    = FLA_Obj_width( obj );
            dim_t    cs   = FLA_Obj_col_stride( obj );
            FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

            // Clear each block in macroblock.
            for ( jj = 0; jj < n; jj++ )
            {
               for ( kk = 0; kk < m; kk++ )
               {
                  obj = *( buf + jj * cs + kk );

                  t_val = TRUE;

                  // Check to see if the block is dirty on another GPU.
                  // The loop stops early once any GPU reports it dirty.
                  for ( k = 0; k < n_threads && t_val; k++ )
                     if ( k != thread )
                        t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );

                  r_val = r_val && t_val;
               }
            }
         }
         else
         {
            t_val = TRUE;

            // Check to see if the block is dirty on another GPU.
            // The loop stops early once any GPU reports it dirty.
            for ( k = 0; k < n_threads && t_val; k++ )
               if ( k != thread )
                  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );

            r_val = r_val && t_val;
         }
      }
   }

   return r_val;
}
1544 
1545 
FLASH_Queue_check_block_gpu(FLA_Obj obj,int thread,void * arg)1546 FLA_Bool FLASH_Queue_check_block_gpu( FLA_Obj obj, int thread, void *arg )
1547 /*----------------------------------------------------------------------------
1548 
1549    FLASH_Queue_check_block_gpu
1550 
1551 ----------------------------------------------------------------------------*/
1552 {
1553    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1554    int k;
1555    dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1556    FLA_Bool r_val = TRUE;
1557 
1558 #ifdef FLA_ENABLE_MULTITHREADING
1559    FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1560 #endif
1561 
1562    // Locate the position of the block on the GPU.
1563    for ( k = 0; k < gpu_n_blocks; k++ )
1564       if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1565          break;
1566 
1567    if ( k < gpu_n_blocks )
1568    {
1569       // Request this block if it is dirty.
1570       if ( !args->gpu[thread * gpu_n_blocks + k].clean )
1571       {
1572          args->gpu[thread * gpu_n_blocks + k].request = TRUE;
1573 
1574          r_val = FALSE;
1575       }
1576    }
1577 
1578    // Check the victim block.
1579    if ( obj.base == args->victim[thread].obj.base )
1580       r_val = FALSE;
1581 
1582 #ifdef FLA_ENABLE_MULTITHREADING
1583    FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1584 #endif
1585 
1586    return r_val;
1587 }
1588 
1589 
void FLASH_Queue_update_gpu( FLASH_Task *t,
                             void **input_arg,
                             void **output_arg,
                             void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_update_gpu

   Stage every distinct operand block of task t onto its thread's GPU and
   fill input_arg[]/output_arg[] with the corresponding device pointers.
   Duplicate operands are staged once; their entries are back-filled from
   the first occurrence in the final two passes.

----------------------------------------------------------------------------*/
{
   int i, j, k;
   int thread    = t->thread;
   int n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool duplicate;

   // None of the arguments can be macroblocks yet.
   // Complicating factor is copying macroblock to contiguous memory on GPU.

   // Bring the input arguments to the GPU.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      // An input that also appears as an output is staged as an output.
      for ( j = 0; j < t->n_output_args && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      // Skip inputs already staged at a lower index.
      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->input_arg[j].base )
            duplicate = TRUE;
      }

      // If the input block has not been processed before.
      if ( !duplicate )
      {
         FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
      }
      else
      {
         // Leave a hole to be back-filled in the passes below.
         input_arg[i] = NULL;
      }
   }

   // Bring the output arguments to the GPU.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      // Skip outputs already staged at a lower index.
      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->output_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      // If the output block has not been processed before.
      if ( !duplicate )
      {
         FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );

         // Invalidate output blocks on all other GPUs.
         for ( k = 0; k < n_threads; k++ )
            if ( k != thread )
               FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
      }
      else
      {
         // Leave a hole to be back-filled in the pass below.
         output_arg[i] = NULL;
      }
   }

   // Back-fill input holes: first from matching outputs, then from the
   // earlier duplicate input occurrence.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
      {
         if ( t->input_arg[i].base == t->output_arg[j].base )
            input_arg[i] = output_arg[j];
      }

      for ( j = 0; j < i && input_arg[i] == NULL; j++ )
      {
         if ( t->input_arg[i].base == t->input_arg[j].base )
            input_arg[i] = input_arg[j];
      }
   }

   // Back-fill output holes from the earlier duplicate output occurrence.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      for ( j = 0; j < i && output_arg[i] == NULL; j++ )
      {
         if ( t->output_arg[i].base == t->output_arg[j].base )
            output_arg[i] = output_arg[j];
      }
   }

   return;
}
1693 
1694 
void FLASH_Queue_update_block_gpu( FLA_Obj obj,
                                   void **buffer_gpu,
                                   int thread,
                                   void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_update_block_gpu

   Ensure obj is resident on the given thread's GPU, returning its device
   pointer through buffer_gpu[0].  On a miss, the LRU slot is reused: a
   dirty evictee is recorded as the victim, flushed to main memory outside
   the critical section, and the new block is copied to the GPU.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int j, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool transfer = FALSE;   // host-to-device copy needed after unlock
   FLA_Bool evict = FALSE;      // dirty evictee must be flushed after unlock
   FLA_Obj_gpu evict_obj;
   FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Locate the position of the block on GPU.  The last slot is never
   // matched, so a miss leaves k at the LRU slot to be reused.
   for ( k = 0; k < gpu_n_blocks - 1; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;

   // Save the pointer to the data on the GPU.
   buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;

   // Save the victim block.
   evict_obj = args->gpu[thread * gpu_n_blocks + k];

   // The block is not already in the GPU.
   if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
   {
      // Save for data transfer outside of critical section.
      transfer = TRUE;

      // Save for eviction outside of critical section.  Publishing the
      // victim lets other threads see the block as not yet up to date.
      if ( evict_obj.obj.base != NULL && !evict_obj.clean )
      {
         evict = TRUE;
         args->victim[thread] = evict_obj;
      }

      // Save the block in the data structure.
      args->gpu[thread * gpu_n_blocks + k].obj = obj;

      // Make sure the new block is clean.
      args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request = FALSE;
   }

   // Use the block on the GPU that is a hit or LRU.
   gpu_obj = args->gpu[thread * gpu_n_blocks + k];

   // Shift all the previous tasks for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];

   // Place the block on the cache as the most recently used.
   args->gpu[thread * gpu_n_blocks] = gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Evict and flush the LRU dirty block.  The device-to-host copy runs
   // outside the lock; the victim entry is cleared under the lock after.
   if ( evict )
   {
      FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

      args->victim[thread].obj.base = NULL;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
   }

   // Move the block to the GPU.
   if ( transfer )
      FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

   return;
}
1785 
1786 
void FLASH_Queue_mark_gpu( FLASH_Task *t, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_mark_gpu

   After GPU execution of task t, flag each distinct output block that is
   resident on this thread's GPU as dirty.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j;
   dim_t slot;
   int thread = t->thread;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool seen;
   FLA_Obj  obj;

   // Visit the output arguments from last to first.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      obj = t->output_arg[i];

      // Skip blocks already handled at a lower output index.
      seen = FALSE;
      for ( j = 0; j < i && !seen; j++ )
      {
         if ( obj.base == t->output_arg[j].base )
            seen = TRUE;
      }

      if ( seen )
         continue;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

      // Search this GPU's cache for the block.
      slot = 0;
      while ( slot < gpu_n_blocks &&
              obj.base != args->gpu[thread * gpu_n_blocks + slot].obj.base )
         slot++;

      // If resident, the slot now holds freshly written data.
      if ( slot < gpu_n_blocks )
      {
         args->gpu[thread * gpu_n_blocks + slot].clean   = FALSE;
         args->gpu[thread * gpu_n_blocks + slot].request = FALSE;
      }

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
   }

   return;
}
1842 
1843 
void FLASH_Queue_invalidate_block_gpu( FLA_Obj obj, int thread, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_invalidate_block_gpu

   If obj is resident on the given thread's GPU, detach it from its slot
   and rotate the emptied slot to the LRU end so it is reused first.  The
   slot keeps its GPU buffer.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   dim_t slot, j;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu freed_slot;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Search this GPU's cache for the block.
   slot = 0;
   while ( slot < gpu_n_blocks &&
           obj.base != args->gpu[thread * gpu_n_blocks + slot].obj.base )
      slot++;

   // Only act when the block is actually resident here.
   if ( slot < gpu_n_blocks )
   {
      // Detach the block and reset the slot's coherency bits.
      args->gpu[thread * gpu_n_blocks + slot].obj.base = NULL;

      args->gpu[thread * gpu_n_blocks + slot].clean    = TRUE;
      args->gpu[thread * gpu_n_blocks + slot].request  = FALSE;

      // Remember the emptied slot (it still owns a GPU buffer).
      freed_slot = args->gpu[thread * gpu_n_blocks + slot];

      // Compact the remaining entries over the gap ...
      for ( j = slot; j < gpu_n_blocks - 1; j++ )
         args->gpu[thread * gpu_n_blocks + j] =
            args->gpu[thread * gpu_n_blocks + j + 1];

      // ... and park the emptied slot at the LRU position.
      args->gpu[thread * gpu_n_blocks + gpu_n_blocks - 1] = freed_slot;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
1891 
1892 
void FLASH_Queue_flush_block_gpu( FLA_Obj obj, int thread, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_flush_block_gpu

   If obj is resident and dirty on the given thread's GPU, copy it back to
   main memory and mark the resident copy clean.  The device-to-host copy
   happens outside the lock, so the block is re-searched afterwards before
   its coherency bits are updated.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool transfer = FALSE;   // set when a flush is actually needed
   FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Locate the position of the block on the GPU.
   for ( k = 0; k < gpu_n_blocks; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;

   // The block is owned by the GPU.
   if ( k < gpu_n_blocks )
   {
      // Save the block that will be flushed.
      gpu_obj = args->gpu[thread * gpu_n_blocks + k];

      // If the block is dirty, then flush it.
      if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
         transfer = TRUE;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Exit early if a flush is not required.
   if ( !transfer )
      return;

   // Flush the block outside the critical section.
   FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Locate the position of the block on the GPU.  The slot may have
   // moved while the lock was released, so search again.
   for ( k = 0; k < gpu_n_blocks; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;

   if ( k < gpu_n_blocks )
   {
      // Update the bits for the flushed block.
      args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request = FALSE;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
1959 
1960 
void FLASH_Queue_flush_gpu( int thread, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_flush_gpu

   Write back every block on this thread's GPU that is both dirty and
   requested by another GPU.  Candidate blocks are collected into gpu_log
   under the lock, transferred outside the critical section, and then
   re-located (their positions may have changed) to mark them clean.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   int n_transfer = 0;
   FLA_Obj_gpu gpu_obj;

   // Exit if not using GPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   for ( k = 0; k < gpu_n_blocks; k++ )
   {
      // Save the block that might be flushed.
      gpu_obj = args->gpu[thread * gpu_n_blocks + k];

      // Flush the block if it is dirty and requested.
      if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
      {
         // Save the block for data transfer outside the critical section.
         args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
         n_transfer++;
      }
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Exit early if a flush is not required.
   if ( n_transfer == 0 )
      return;

   // Flush each saved block outside the critical section.
   for ( i = 0; i < n_transfer; i++ )
   {
      gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
      FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Update the bits for each block that is flushed.
   for ( i = 0; i < n_transfer; i++ )
   {
      // Re-locate the block on the GPU; its slot may have changed while
      // the lock was released for the data transfers.
      for ( k = 0; k < gpu_n_blocks; k++ )
         if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base ==
              args->gpu[thread * gpu_n_blocks + k].obj.base )
            break;

      if ( k < gpu_n_blocks )
      {
         // The block is now clean.
         args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
         args->gpu[thread * gpu_n_blocks + k].request = FALSE;
      }
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
2038 
2039 #endif
2040 
2041 #ifdef FLA_ENABLE_MULTITHREADING
2042 
void FLASH_Queue_exec_parallel( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_parallel

   Spawn n_threads worker threads (via OpenMP or POSIX threads, depending
   on FLA_MULTITHREADING_MODEL), run each through the worker entry point
   FLASH_Queue_exec_parallel_function with a per-thread FLASH_Thread
   argument, and join them all before returning.

----------------------------------------------------------------------------*/
{
   int   i;
   int   n_threads = FLASH_Queue_get_num_threads();
   void* (*thread_entry_point)( void* );

   // Allocate the thread structures array. Here, an array of FLASH_Thread
   // structures of length n_threads is allocated and the fields of each
   // structure set to appropriate values.
   FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );

   // Initialize the thread structures array.
   for ( i = 0; i < n_threads; i++ )
   {
      // Save the thread's identifier.
      thread[i].id = i;

      // Save the pointer to the necessary variables with the thread.
      thread[i].args = arg;

      // The pthread object, if it was even compiled into the FLASH_Thread
      // structure, will be initialized by the pthread implementation when we
      // call pthread_create() and does not need to be touched at this time.
   }

   // Determine which function to send threads to.
   thread_entry_point = FLASH_Queue_exec_parallel_function;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP

   // An OpenMP parallel for region spawns n_threads threads. Each thread
   // executes the work function with a different FLASH_Thread argument.
   // An implicit synchronization point exists at the end of the curly
   // brace scope.
   #pragma omp parallel for \
           private( i ) \
           shared( thread, n_threads, thread_entry_point ) \
           schedule( static, 1 ) \
           num_threads( n_threads )
   for ( i = 0; i < n_threads; ++i )
   {
      thread_entry_point( ( void* ) &thread[i] );
   }

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

   // Create each POSIX thread needed in addition to the main thread.
   for ( i = 1; i < n_threads; i++ )
   {
      int pthread_e_val;

      // Create thread i with default attributes.
      pthread_e_val = pthread_create( &(thread[i].pthread_obj),
                                      NULL,
                                      thread_entry_point,
                                      ( void* ) &thread[i] );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

   // The main thread is assigned the role of thread 0. Here we manually
   // execute it as a worker thread.
   thread_entry_point( ( void* ) &thread[0] );

   // Wait for non-main threads to finish.
   for ( i = 1; i < n_threads; i++ )
   {
      // These two variables are declared local to this for loop since this
      // is the only place they are needed, and since they would show up as
      // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
      // Strangely, the Intel compiler produces code that results in an
      // "unaligned access" runtime message if thread_status is declared as
      // an int. Declaring it as a long or void* appears to force the
      // compiler (not surprisingly) into aligning it to an 8-byte boundary.
      int   pthread_e_val;
      void* thread_status;

      // Wait for thread i to invoke its respective pthread_exit().
      // The return value passed to pthread_exit() is provided to us
      // via status, if one was given.
      pthread_e_val = pthread_join( thread[i].pthread_obj,
                                    ( void** ) &thread_status );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

#endif

   FLA_free( thread );

   return;
}
2146 
2147 
2148 //#include <sched.h>
2149 //#include <sys/types.h>
2150 //#include <linux/unistd.h>
2151 //#include <errno.h>
2152 //#include <unistd.h>
2153 //#include <sys/syscall.h>
2154 
2155 
void* FLASH_Queue_exec_parallel_function( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_parallel_function

   Worker-thread main loop.  Each thread repeatedly dequeues a ready task
   from its waiting queue (or uses a task bound to it by the caching
   policy), executes it, propagates dependencies, and increments the
   global program counter until all tasks have committed.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args;
   int           i;
   int           queue;
   int           cache;
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_cores   = FLASH_Queue_get_cores_per_cache();
   FLA_Bool      caching   = FLASH_Queue_get_caching();
   FLA_Bool      stealing  = FLASH_Queue_get_work_stealing();
   FLA_Bool      committed = TRUE;
   FLA_Bool      condition = TRUE;
   FLA_Bool      enabled   = FALSE;
   FLA_Bool      available;
   FLASH_Task*   t = NULL;
   FLASH_Task*   r = NULL;
   FLASH_Thread* me;
   //cpu_set_t     cpu_set;

   // Interpret the thread argument as what it really is--a pointer to an
   // FLASH_Thread structure.
   me = ( FLASH_Thread* ) arg;

   // Extract the variables from the current thread.
   args = ( FLASH_Queue_vars* ) me->args;

   // Figure out the id of the current thread.
   i = me->id;

   // Set the CPU affinity; We want the current thread i to run only on CPU i.
   //CPU_ZERO( &cpu_set );
   //CPU_SET( i, &cpu_set );
   //sched_setaffinity( syscall( __NR_gettid ), sizeof(cpu_set_t), &cpu_set );

   // Determine to which queue this thread belongs.
   queue = i / ( n_threads / args->n_queues );

   // Determine to which cache this thread belongs.
   cache = i / n_cores;

#ifdef FLA_ENABLE_GPU
   // Create memory on GPU.
   FLASH_Queue_create_gpu( i, ( void* ) args );

   // Save whether GPUs are enabled.
   enabled = FLASH_Queue_get_enabled_gpu();

   // Only use each GPU as its own cache when GPUs are enabled.
   if ( enabled )
      cache = i;
#endif

   // Prefetch blocks into the cache before execution.  Only one thread
   // per cache (i % n_cores == 0) performs the prefetch.
   if ( caching && !enabled && i % n_cores == 0 )
      FLASH_Queue_prefetch( cache, ( void* ) args );

   // Loop until all the tasks have committed.
   while ( condition )
   {
#ifdef FLA_ENABLE_GPU
      // Check to see if any blocks on GPU need to be flushed.
      FLASH_Queue_flush_gpu( i, ( void* ) args );
#endif

      // Dequeue a task if there has not been one binded to thread.
      if ( r == NULL )
      {
         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

         // Obtain task to execute.
         t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );

         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
      }
      else
      {
         // Obtain the binded task.
         t = r;
         r = NULL;
      }

      // Dequeued a task from the waiting queue.
      available = ( t != NULL );

      if ( available )
      {
         // Save the thread and cache that executes the task.
         t->thread = i;
         t->cache = cache;

         if ( caching && !enabled )
         {
            // Update the current state of the cache.
            FLASH_Queue_update_cache( t, ( void* ) args );
         }

#ifdef FLA_ENABLE_GPU
         // Execute the task on GPU; committed is FALSE if the GPU could
         // not run it this iteration (it will be retried).
         committed = FLASH_Queue_exec_gpu( t, ( void* ) args );
#else
         // Execute the task.
         FLASH_Queue_exec_task( t );
#endif

         // If the task has executed or not.
         if ( committed )
         {
            // Update task dependencies; may return a task bound to this
            // thread for the next iteration.
            r = FLASH_Task_update_dependencies( t, ( void* ) args );

            // Free the task once it executes in parallel.
            FLASH_Task_free_parallel( t, ( void* ) args );
         }
      }
      else
      {
         if ( stealing )
         {
            // Perform work stealing if there are no tasks to dequeue.
            r = FLASH_Queue_work_stealing( queue, ( void* ) args );
         }
      }

      FLA_Lock_acquire( &(args->all_lock) ); // A ***

      // Increment program counter.
      if ( available && committed )
         args->pc++;

      // Terminate loop.
      if ( args->pc >= n_tasks )
         condition = FALSE;

      FLA_Lock_release( &(args->all_lock) ); // A ***
   }

#ifdef FLA_ENABLE_GPU
   // Destroy and flush contents of GPU back to main memory.
   FLASH_Queue_destroy_gpu( i, ( void* ) args );
#endif

#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
   // If this is a non-main thread, then exit with a zero (normal) error code.
   // The main thread cannot call pthread_exit() because this routine never
   // returns. The main thread must proceed so it can oversee the joining of
   // the exited non-main pthreads.
   if ( i != 0 )
      pthread_exit( ( void* ) NULL );
#endif

   return ( void* ) NULL;
}
2314 
2315 
FLASH_Task* FLASH_Task_update_dependencies( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Task_update_dependencies

   After task t commits, decrement the ready count of each dependent task
   (under the dep_lock that guards it) and enqueue any task that becomes
   ready.  Returns a task bound to the calling thread (for cache reuse)
   when caching is enabled, or NULL.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         i;
   int         q = t->queue;
   int         queue;
   int         thread;
   int         n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool    caching   = FLASH_Queue_get_caching();
   FLA_Bool    stealing  = FLASH_Queue_get_work_stealing();
   FLA_Bool    available;
   FLASH_Task* task;
   FLASH_Task* r = NULL;
   FLASH_Dep*  d = t->dep_arg_head;

   // Dequeue task to bind to thread if caching is enabled.
   if ( caching )
   {
      FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

      // Obtain task to execute.
      r = FLASH_Queue_wait_dequeue( q, t->cache, arg );

      FLA_Lock_release( &(args->run_lock[q]) ); // R ***
   }

   // Check each dependent task.
   for ( i = 0; i < t->n_dep_args; i++ )
   {
      if ( stealing )
      {
         // Place all dependent tasks onto same queue as predecessor task.
         d->task->queue = q;
      }

      task   = d->task;
      queue  = task->queue;
      // Each task hashes to a dep_lock by its creation order.
      thread = task->order % n_threads;

      FLA_Lock_acquire( &(args->dep_lock[thread]) ); // D ***

      task->n_ready--;
      available = ( task->n_ready == 0 );

      FLA_Lock_release( &(args->dep_lock[thread]) ); // D ***

      // Place newly ready tasks on waiting queue.
      if ( available )
      {
         // If caching is enabled and the task belongs to this thread's queue.
         if ( caching && q == queue )
         {
            // Determine if there is a new binded task.
            r = FLASH_Task_update_binding( task, r, arg );
         }
         else
         {
            FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

            FLASH_Queue_wait_enqueue( task, arg );

            FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
         }
      }

      // Go to the next dep.
      d = d->next_dep;
   }

   return r;
}
2392 
2393 
FLASH_Task* FLASH_Task_update_binding( FLASH_Task* t, FLASH_Task* r,
                                       void* arg )
/*----------------------------------------------------------------------------

   FLASH_Task_update_binding

   Decide whether the newly ready task t should replace the currently
   bound task r.  Returns the task that remains bound to the thread; the
   other task (if any) is placed back on its waiting queue.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int      q;
   FLA_Bool prefer_new;

   // Nothing bound yet: bind the newly ready task directly.
   if ( r == NULL )
   {
      t->hit = TRUE;
      return t;
   }

   // Prefer the new task when the bound one was not a cache hit, or when
   // sorting is enabled and the new task sits higher in the DAG.
   prefer_new = !r->hit ||
                ( FLASH_Queue_get_sorting() && r->height < t->height );

   if ( prefer_new )
   {
      // Return the previously bound task to its waiting queue.
      q = r->queue;
      r->hit = FALSE;

      FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

      FLASH_Queue_wait_enqueue( r, arg );

      FLA_Lock_release( &(args->run_lock[q]) ); // R ***

      // Bind the new ready task.
      t->hit = TRUE;
      return t;
   }

   // Keep the bound task; the new ready task goes onto its waiting queue.
   q = t->queue;

   FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

   FLASH_Queue_wait_enqueue( t, arg );

   FLA_Lock_release( &(args->run_lock[q]) ); // R ***

   return r;
}
2444 
2445 
FLASH_Task_free_parallel(FLASH_Task * t,void * arg)2446 void FLASH_Task_free_parallel( FLASH_Task* t, void* arg )
2447 /*----------------------------------------------------------------------------
2448 
2449    FLASH_Task_free_parallel
2450 
2451 ----------------------------------------------------------------------------*/
2452 {
2453    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2454    int        i, j, k;
2455    int        thread;
2456    int        n_threads = FLASH_Queue_get_num_threads();
2457    FLASH_Dep* d;
2458    FLASH_Dep* next_dep;
2459    FLA_Obj    obj;
2460 
2461    // Clearing the last write task in each output block.
2462    for ( i = 0; i < t->n_output_args; i++ )
2463    {
2464       obj = t->output_arg[i];
2465 
2466       // Macroblock is used.
2467       if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2468       {
2469          dim_t    jj, kk;
2470          dim_t    m    = FLA_Obj_length( obj );
2471          dim_t    n    = FLA_Obj_width( obj );
2472          dim_t    cs   = FLA_Obj_col_stride( obj );
2473          FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
2474 
2475          // Clear each block in macroblock.
2476          for ( jj = 0; jj < n; jj++ )
2477             for ( kk = 0; kk < m; kk++ )
2478                ( buf + jj * cs + kk )->base->write_task = NULL;
2479       }
2480       else // Clear regular block.
2481       {
2482          obj.base->write_task = NULL;
2483       }
2484    }
2485 
2486    // Cleaning the last read tasks in each input block.
2487    for ( i = 0; i < t->n_input_args; i++ )
2488    {
2489       obj = t->input_arg[i];
2490 
2491       // Macroblock is used.
2492       if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2493       {
2494          dim_t    jj, kk;
2495          dim_t    m    = FLA_Obj_length( obj );
2496          dim_t    n    = FLA_Obj_width( obj );
2497          dim_t    cs   = FLA_Obj_col_stride( obj );
2498          FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
2499 
2500          // Clear each block in macroblock.
2501          for ( jj = 0; jj < n; jj++ )
2502          {
2503             for ( kk = 0; kk < m; kk++ )
2504             {
2505                obj = *( buf + jj * cs + kk );
2506 
2507                thread = obj.base->n_read_blocks % n_threads;
2508 
2509                FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2510 
2511                k = obj.base->n_read_tasks;
2512                d = obj.base->read_task_head;
2513 
2514                obj.base->n_read_tasks   = 0;
2515                obj.base->read_task_head = NULL;
2516                obj.base->read_task_tail = NULL;
2517 
2518                FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2519 
2520                for ( j = 0; j < k; j++ )
2521                {
2522                   next_dep = d->next_dep;
2523                   FLA_free( d );
2524                   d = next_dep;
2525                }
2526             }
2527          }
2528       }
2529       else // Regular block.
2530       {
2531          thread = obj.base->n_read_blocks % n_threads;
2532 
2533          FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2534 
2535          k = obj.base->n_read_tasks;
2536          d = obj.base->read_task_head;
2537 
2538          obj.base->n_read_tasks   = 0;
2539          obj.base->read_task_head = NULL;
2540          obj.base->read_task_tail = NULL;
2541 
2542          FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2543 
2544          for ( j = 0; j < k; j++ )
2545          {
2546             next_dep = d->next_dep;
2547             FLA_free( d );
2548             d = next_dep;
2549          }
2550       }
2551    }
2552 
2553    // Free the dep_arg field of t.
2554    d = t->dep_arg_head;
2555 
2556    for ( i = 0; i < t->n_dep_args; i++ )
2557    {
2558       next_dep = d->next_dep;
2559       FLA_free( d );
2560       d = next_dep;
2561    }
2562 
2563    // Free the int_arg field of t.
2564    FLA_free( t->int_arg );
2565 
2566    // Free the fla_arg field of t.
2567    FLA_free( t->fla_arg );
2568 
2569    // Free the input_arg field of t.
2570    FLA_free( t->input_arg );
2571 
2572    // Free the output_arg field of t.
2573    FLA_free( t->output_arg );
2574 
2575    // Finally, free the struct itself.
2576    FLA_free( t );
2577 
2578    return;
2579 }
2580 
2581 #endif
2582 
2583 
2584 // ============================================================================
2585 
2586 
2587 #ifndef FLA_ENABLE_MULTITHREADING
2588 
void FLASH_Queue_exec_simulation( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_simulation

   Single-threaded simulation of the parallel runtime: tasks are executed
   in stages of up to n_threads independent tasks each, printing a table
   of task names per stage.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int           i, j;
   int           queue;
   int           cache;
   int           n_stages  = 0;
   int           n_queues  = args->n_queues;
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_cores   = FLASH_Queue_get_cores_per_cache();
   FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
   FLASH_Task*   task;
   FLASH_Task*   t;
   FLASH_Dep*    d;

   // An array to hold tasks to be executed during of simulation.
   // NOTE(review): the stage table below is printed when verbose is zero
   // (!verbose) -- presumably a nonzero FLASH_Verbose mode suppresses the
   // table in favor of other output; confirm against FLASH_Verbose enum.
#ifdef FLA_ENABLE_WINDOWS_BUILD
   FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
#else
   FLASH_Task* exec_array[n_threads];
#endif

   for ( i = 0; i < n_threads; i++ )
   {
      // Initialize all exec_array to NULL.
      exec_array[i] = NULL;

      // Prefetch blocks into the cache before execution.
      if ( i % n_cores == 0 )
         FLASH_Queue_prefetch( i, arg );
   }

   // Loop until all the tasks have committed.
   while ( args->pc < n_tasks )
   {
      for ( i = 0; i < n_threads; i++ )
      {
         // Update waiting queue with ready tasks.
         t = exec_array[i];

         if ( t != NULL )
         {
            // Check each dependent task.
            d = t->dep_arg_head;

            for ( j = 0; j < t->n_dep_args; j++ )
            {
               task = d->task;
               task->n_ready--;

               // Place newly ready tasks on waiting queue.
               if ( task->n_ready == 0 )
               {
                  FLASH_Queue_wait_enqueue( task, arg );
               }

               // Go to the next dep.
               d = d->next_dep;
            }

            // Free the task.
            FLASH_Task_free( t );
         }
      }

      n_stages++;
      if ( !verbose )
         printf( "%7d", n_stages );

      // Move ready tasks from the waiting queue to execution queue.
      for ( i = 0; i < n_threads; i++ )
      {
         // Determine to which queue this thread belongs.
         queue = i / ( n_threads / n_queues );

         // Determine to which cache this thread belongs.
         cache = i / n_cores;

         // Dequeue a task.
         t = FLASH_Queue_wait_dequeue( queue, cache, arg );

         // Save the task for execution.
         exec_array[i] = t;

         if ( t != NULL )
         {
            // Save the thread and cache that executes the task.
            t->thread = i;
            t->cache = cache;

            // Increment program counter.
            args->pc++;
         }
      }

      // Execute independent tasks.
      for ( i = 0; i < n_threads; i++ )
      {
         t = exec_array[i];
         FLASH_Queue_update_cache( t, arg );
         FLASH_Queue_exec_task( t );

         if ( !verbose )
            printf( "%7s", ( t == NULL ? "     " : t->name ) );

         // Free the task if this is the last stage.
         if ( args->pc == n_tasks && t != NULL )
            FLASH_Task_free( t );
      }

      if ( !verbose )
         printf( "\n" );
   }

   if ( !verbose )
      printf( "\n" );

#ifdef FLA_ENABLE_WINDOWS_BUILD
   FLA_free( exec_array );
#endif

   return;
}
2718 
2719 #endif
2720 
2721 #else // FLA_ENABLE_SCC
2722 
2723 int RCCE_acquire_lock(int);
2724 int RCCE_release_lock(int);
2725 double RCCE_wtime(void);
2726 int    RCCE_ue(void);
2727 
// This function needs to be defined in the driver
// or linked in somehow by the user.
// It is just RCCE_barrier( &RCCE_COMM_WORLD ),
// but we cannot implement it here because we are
// trying not to link in RCCE.h.
2733 void Synch_all();
2734 
2735 
typedef struct FLASH_Queue_variables
{
   // Queue of all the tasks, indexed by task order.
   FLASH_Task** task_queue;

   // Per-task count of unsatisfied dependencies
   // (recorded by FLASH_Queue_init_tasks).
   int*         n_ready;

   // The waiting queue of ready tasks, storing task order indices.
   int*         wait_queue;

   // The number of tasks on waiting queue.
   int*         n_wait;

   // A global task counter that keeps track of how many tasks on the waiting
   // queue have been processed.
   int*         pc;
} FLASH_Queue_vars;
2754 
2755 
void FLASH_Queue_exec( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec

   SCC variant of the SuperMatrix execution driver: allocates the shared
   queue structures, initializes the task DAG, runs the worker function,
   records the parallel execution time, and frees all resources.

----------------------------------------------------------------------------*/
{
   int         n_tasks    = FLASH_Queue_get_num_tasks();
   int         i;
   double      dtime;

   // All the necessary variables for the SuperMatrix mechanism.
   FLASH_Queue_vars args;

   // If the queue is empty, return early.
   if ( n_tasks == 0 )
      return;

   // Turn off all multiple queue implementations.
   FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
   FLASH_Queue_set_work_stealing( FALSE );
   // Do not use cache affinity yet.
   FLASH_Queue_set_caching( FALSE );

   // Allocate memory for task queues.  FLA_shmalloc presumably allocates
   // from SCC shared memory visible to all cores -- confirm.
   args.task_queue = ( FLASH_Task** ) FLA_malloc( n_tasks * sizeof( FLASH_Task* ) );
   args.n_ready = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   args.wait_queue = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   args.n_wait = ( int* ) FLA_shmalloc( sizeof( int ) );
   args.pc = ( int* ) FLA_shmalloc( sizeof( int ) );

   // Initialize data.  Only the owner core initializes the shared counters.
   if ( FLA_is_owner() )
   {
      args.n_wait[0] = 0;
      args.pc[0] = 0;
   }

   // Barrier so the shared counters are initialized before any core
   // proceeds (see Synch_all note above).
   Synch_all();

   // Initialize tasks with critical information.
   FLASH_Queue_init_tasks( ( void* ) &args );

   // Display verbose output before free all tasks.
   if ( FLASH_Queue_get_verbose_output() )
      FLASH_Queue_verbose_output();

   // Start timing the parallel execution.
   dtime = RCCE_wtime();

   FLASH_Queue_exec_parallel_function( ( void* ) &args );

   // End timing the parallel execution.
   dtime = RCCE_wtime() - dtime;
   FLASH_Queue_set_parallel_time( dtime );

   // Free all tasks sequentially.
   for ( i = 0; i < n_tasks; i++ )
      FLASH_Task_free( args.task_queue[i] );

   // Free data.
   FLA_free( args.task_queue );
   FLA_shfree( args.n_ready );
   FLA_shfree( args.wait_queue );
   FLA_shfree( args.n_wait );
   FLA_shfree( args.pc );

   // Reset values for next call to FLASH_Queue_exec().
   FLASH_Queue_reset();

   return;
}
2828 
2829 
void FLASH_Queue_init_tasks( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_init_tasks

   Walk the task queue to record task pointers, compute each task's height
   in the DAG (for priority sorting), and count its unsatisfied
   dependencies.  The owner core then enqueues every initially ready task
   onto the shared waiting queue.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int            i, j;
   int            n_tasks = FLASH_Queue_get_num_tasks();
   int            n_ready = 0;
   int            height;
   FLASH_Task*    t;
   FLASH_Dep*     d;

   // Grab the tail of the task queue; walking backwards guarantees that
   // every dependent task's height is computed before its predecessor's.
   t = FLASH_Queue_get_tail_task();

   for ( i = n_tasks - 1; i >= 0; i-- )
   {
      // Save all the task pointers.
      args->task_queue[i] = t;

      // Only use a single queue implementation.
      t->queue = 0;

      // Determine the height of each task in the DAG.
      height = 0;
      d = t->dep_arg_head;

      // Take the maximum height of dependent tasks.
      for ( j = 0; j < t->n_dep_args; j++ )
      {
         height = max( height, d->task->height );
         d = d->next_dep;
      }

      t->height = height + 1;

      // Since freeing a task is always a leaf, we want to force it to execute
      // earlier by giving it a greater height in order to reclaim memory.
      if ( t->func == (void *) FLA_Obj_free_buffer_task )
         t->height += n_tasks;

      // Find all ready tasks.
      t->n_ready += t->n_input_args + t->n_output_args +
                    t->n_macro_args + t->n_war_args;

      if ( t->n_ready == 0 )
      {
         // Save the number of ready and available tasks.
         n_ready++;
      }

      if ( FLA_is_owner() )
      {
         // Record all the ready values.
         args->n_ready[i] = t->n_ready;
      }

      // Go to the previous task.
      t = t->prev_task;
   }

   // Only allow the first core to enqueue the initial ready tasks.
   if ( !FLA_is_owner() )
      return;

   // Grab the head of the task queue.
   t = FLASH_Queue_get_head_task();

   // Stop early once every initially ready task has been enqueued.
   for ( i = 0; i < n_tasks && n_ready > 0; i++ )
   {
      if ( t->n_ready == 0 )
      {
         RCCE_acquire_lock( 0 );

         // Enqueue all the ready and available tasks.
         FLASH_Queue_wait_enqueue( t, arg );

         RCCE_release_lock( 0 );

         // Decrement the number of ready tasks left to be enqueued.
         n_ready--;
      }

      // Go to the next task.
      t = t->next_task;
   }

   return;
}
2922 
2923 
void FLASH_Queue_wait_enqueue( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_enqueue

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int slot = args->n_wait[0] + args->pc[0];

   // When sorting is enabled, perform one insertion-sort step: shift
   // lower-height entries toward the tail until we find a queued task
   // whose height exceeds this task's height.
   if ( FLASH_Queue_get_sorting() )
   {
      while ( slot > args->pc[0] )
      {
         int prev = args->wait_queue[slot-1];

         // Stop once an already-queued task outranks the new one.
         if ( args->task_queue[prev]->height >
              args->task_queue[t->order]->height )
            break;

         args->wait_queue[slot] = prev;
         slot--;
      }
   }

   // Record the task (by its order index) at its insertion position;
   // without sorting this is simply the tail of the waiting queue.
   args->wait_queue[slot] = t->order;

   // One more task is now waiting.
   args->n_wait[0]++;

   return;
}
2954 
2955 
FLASH_Queue_wait_dequeue(int queue,int cache,void * arg)2956 FLASH_Task* FLASH_Queue_wait_dequeue( int queue, int cache, void* arg )
2957 /*----------------------------------------------------------------------------
2958 
2959    FLASH_Queue_wait_dequeue
2960 
2961 ----------------------------------------------------------------------------*/
2962 {
2963    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2964    FLASH_Task* t = NULL;
2965 
2966    if ( args->n_wait[0] > 0 )
2967    {
2968       // Grab the head of the queue.
2969       t = args->task_queue[args->wait_queue[args->pc[0]]];
2970 
2971       // Decrement number of tasks on waiting queue.
2972       args->n_wait[0]--;
2973 
2974       // Increment the program counter.
2975       args->pc[0]++;
2976    }
2977 
2978    return t;
2979 }
2980 
2981 
FLASH_Queue_exec_parallel_function(void * arg)2982 void* FLASH_Queue_exec_parallel_function( void* arg )
2983 /*----------------------------------------------------------------------------
2984 
2985    FLASH_Queue_exec_parallel_function
2986 
2987 ----------------------------------------------------------------------------*/
2988 {
2989    FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2990    int           i         = RCCE_ue();
2991    int           queue     = 0;
2992    int           cache     = 0;
2993    int           n_tasks   = FLASH_Queue_get_num_tasks();
2994    int           n_threads = FLASH_Queue_get_num_threads();
2995    FLA_Bool      condition;
2996    FLA_Bool      available;
2997    FLASH_Task*   t = NULL;
2998 
2999    // Do not let extraneous cores execute.
3000    if ( i < n_threads )
3001       condition = TRUE;
3002    else
3003       condition = FALSE;
3004 
3005    // Loop until all the tasks have committed.
3006    while ( condition )
3007    {
3008       RCCE_acquire_lock( 0 );
3009 
3010       // Obtain task to execute.
3011       t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
3012 
3013       RCCE_release_lock( 0 );
3014 
3015       // Dequeued a task from the waiting queue.
3016       available = ( t != NULL );
3017 
3018       if ( available )
3019       {
3020          // Save the thread and cache that executes the task.
3021          t->thread = i;
3022          t->cache = cache;
3023 
3024          // Execute the task.
3025          FLASH_Queue_exec_task( t );
3026 
3027          // Update task dependencies.
3028          FLASH_Task_update_dependencies( t, ( void* ) args );
3029       }
3030 
3031       RCCE_acquire_lock( 0 );
3032 
3033       // Terminate loop.
3034       if ( args->pc[0] >= n_tasks )
3035          condition = FALSE;
3036 
3037       RCCE_release_lock( 0 );
3038    }
3039 
3040    return ( void* ) NULL;
3041 }
3042 
3043 
FLASH_Task* FLASH_Task_update_dependencies( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Task_update_dependencies

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args      = ( FLASH_Queue_vars* ) arg;
   int               n_threads = FLASH_Queue_get_num_threads();
   int               i;
   FLASH_Dep*        d         = t->dep_arg_head;

   // Walk the list of tasks that depend on t, decrementing each one's
   // outstanding-dependency count.
   for ( i = 0; i < t->n_dep_args; i++, d = d->next_dep )
   {
      FLASH_Task* dep_task = d->task;
      int         lock_id;
      FLA_Bool    now_ready;

      // Stripe the counter updates over locks 1..n_threads-1; lock 0 is
      // reserved for the waiting queue.
      lock_id = ( n_threads > 1 ? dep_task->order % ( n_threads - 1 ) + 1 : 0 );

      RCCE_acquire_lock( lock_id );

      args->n_ready[dep_task->order]--;
      now_ready = ( args->n_ready[dep_task->order] == 0 );

      RCCE_release_lock( lock_id );

      // A task whose last dependency just resolved joins the waiting queue.
      if ( now_ready )
      {
         RCCE_acquire_lock( 0 );

         FLASH_Queue_wait_enqueue( dep_task, arg );

         RCCE_release_lock( 0 );
      }
   }

   // This SCC variant does no cache-affinity chaining, so there is never a
   // preferred next task to return.
   return NULL;
}
3091 
3092 #endif // FLA_ENABLE_SCC
3093 
3094 #endif // FLA_ENABLE_SUPERMATRIX
3095