1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
13
14 #if FLA_MULTITHREADING_MODEL == FLA_OPENMP
15 #ifdef FLA_ENABLE_TIDSP
16 #include <ti/omp/omp.h>
17 #else
18 #include <omp.h>
19 #endif
20 #elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
21 #include <pthread.h>
22 #endif
23
24 #ifdef FLA_ENABLE_WINDOWS_BUILD
25 #define _CRT_RAND_S
26 #include <stdlib.h>
27 #endif
28
29
30 #ifdef FLA_ENABLE_SUPERMATRIX
31
32 #ifndef FLA_ENABLE_SCC
33
34 #define MIN_CACHE_BLOCKS 3
35
36 #ifdef FLA_ENABLE_GPU
// Bookkeeping record for one block resident in a GPU's software cache.
typedef struct FLA_Obj_gpu_struct
{
   // Block stored in a GPU (host-side object; base == NULL means slot empty).
   FLA_Obj obj;

   // Pointer to the data stored on the GPU.
   void* buffer_gpu;

   // Whether the block is clean or dirty on the GPU
   // (dirty = GPU copy newer than the host copy).
   FLA_Bool clean;

   // Whether the block has been requested by another GPU.
   FLA_Bool request;

} FLA_Obj_gpu;
52 #endif
53
// Shared runtime state for one invocation of FLASH_Queue_exec().
// A single instance lives on FLASH_Queue_exec()'s stack and is passed to all
// worker threads through a void* argument.
typedef struct FLASH_Queue_variables
{
   // A lock on the global task counter.
   // Needed only when multithreading is enabled.
   FLA_Lock all_lock;

   // A lock that protects the thread's waiting queue.
   // Needed only when multithreading is enabled.
   // Array of length n_queues.
   FLA_Lock* run_lock;

   // A lock that allows threads to safely check for and place ready dependent
   // tasks on waiting queue. Needed only when multithreading is enabled.
   // Array of length n_threads.
   FLA_Lock* dep_lock;

   // A lock that allows threads to safely free the anti-dependency queue
   // within each block. Needed only when multithreading is enabled.
   // Array of length n_threads.
   FLA_Lock* war_lock;

   // A lock that allows threads to safely access the cache structures.
   // Needed only when multithreading is enabled.
   // Array of length n_caches.
   FLA_Lock* cac_lock;

   // Number of queues.
   int n_queues;

   // Number of caches.
   int n_caches;

   // The number of blocks that can be stored in the cache on each thread.
   int size;

   // LRU cache simulation of blocks (size * n_caches entries; index 0 of
   // each cache's segment is the most recently used slot).
   FLA_Obj* cache;

   // List of blocks accessed by the first tasks (size entries).
   FLA_Obj* prefetch;

   // The waiting queue of tasks for each thread (n_queues entries).
   FLASH_Queue* wait_queue;

   // A global task counter that keeps track of how many tasks on the waiting
   // queue have been processed.
   int pc;

#ifdef FLA_ENABLE_GPU
   // A lock that allows threads to safely access the cache structures.
   // Needed only when multithreading is enabled.
   FLA_Lock* gpu_lock;

   // LRU software cache of GPU memory.
   FLA_Obj_gpu* gpu;

   // Storing the block being evicted.
   FLA_Obj_gpu* victim;

   // Temporary storage for logging blocks on GPU.
   FLA_Obj_gpu* gpu_log;

   // The size of each block to allocate on GPU.
   dim_t block_size;

   // The datatype of each block to allocate on GPU.
   FLA_Datatype datatype;
#endif
} FLASH_Queue_vars;
119
120
void FLASH_Queue_exec( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec

   Execute every task currently on the SuperMatrix queue.  Allocates and
   initializes all shared runtime state (locks, software caches, prefetch
   list, per-queue waiting queues, and GPU structures when enabled),
   initializes the tasks, runs the DAG either in parallel or in simulation
   mode, records the elapsed time, and then frees all state and resets the
   queue for the next invocation.  Returns immediately if the queue is empty.

----------------------------------------------------------------------------*/
{
   int          n_tasks    = FLASH_Queue_get_num_tasks();
   int          n_threads  = FLASH_Queue_get_num_threads();
   int          n_queues;
   int          n_caches;
   int          size;
   int          i;
   dim_t        block_size = FLASH_Queue_get_block_size();
   double       dtime;

   FLA_Lock*    run_lock;
   FLA_Lock*    dep_lock;
   FLA_Lock*    war_lock;
   FLA_Lock*    cac_lock;

   FLA_Obj*     cache;
   FLA_Obj*     prefetch;
   FLASH_Queue* wait_queue;

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock*    gpu_lock;
#endif
   FLA_Obj_gpu* gpu;
   FLA_Obj_gpu* victim;
   FLA_Obj_gpu* gpu_log;
   dim_t        gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
#endif

   // All the necessary variables for the SuperMatrix mechanism.
   FLASH_Queue_vars args;

   // If the queue is empty, return early.
   if ( n_tasks == 0 )
      return;

#ifndef FLA_ENABLE_MULTITHREADING
   // Turn off work stealing in simulation mode.
   FLASH_Queue_set_work_stealing( FALSE );
#endif

   // Query the number of user set threads per queue.
   n_queues = FLASH_Queue_get_cores_per_queue();

   // Default user setting for number of threads.
   if ( n_queues <= 0 )
   {
      // Do not use data affinity or work stealing when caching is enabled.
      if ( FLASH_Queue_get_caching() )
      {
         FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
         FLASH_Queue_set_work_stealing( FALSE );
      }

      // Do not use work stealing when data affinity is enabled.
      if ( FLASH_Queue_get_data_affinity() != FLASH_QUEUE_AFFINITY_NONE )
      {
         FLASH_Queue_set_work_stealing( FALSE );
      }

      // Allocate different arrays if using data affinity.
      // Single shared queue unless affinity or work stealing is in use.
      n_queues = ( FLASH_Queue_get_data_affinity() ==
                   FLASH_QUEUE_AFFINITY_NONE &&
                   !FLASH_Queue_get_work_stealing() ? 1 : n_threads );
   }
   else
   {
      // Set the number of queues.
      n_queues = n_threads / n_queues;

      // Must use at least one queue.
      if ( n_queues == 0 )
         n_queues = 1;

      if ( n_queues == 1 )
      {
         // Turn off all multiple queue implementations.
         FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
         FLASH_Queue_set_work_stealing( FALSE );
      }
      else
      {
         // Use 2D data affinity for multiple queues if nothing is set.
         if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE &&
              !FLASH_Queue_get_work_stealing() )
         {
            FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC );
         }
      }
   }

   // Determine the number of caches.
   n_caches = n_threads / FLASH_Queue_get_cores_per_cache();

   args.n_queues = n_queues;
   args.n_caches = n_caches;

#ifdef FLA_ENABLE_MULTITHREADING
   // Allocate memory for array of locks.
   run_lock = ( FLA_Lock* ) FLA_malloc( n_queues  * sizeof( FLA_Lock ) );
   dep_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   war_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   cac_lock = ( FLA_Lock* ) FLA_malloc( n_caches  * sizeof( FLA_Lock ) );

   args.run_lock = run_lock;
   args.dep_lock = dep_lock;
   args.war_lock = war_lock;
   args.cac_lock = cac_lock;

   // Initialize the all lock.
   FLA_Lock_init( &(args.all_lock) );

   // Initialize the run lock for thread i.
   for ( i = 0; i < n_queues; i++ )
   {
      FLA_Lock_init( &(args.run_lock[i]) );
   }

   // Initialize the dep and war locks for thread i.
   for ( i = 0; i < n_threads; i++ )
   {
      FLA_Lock_init( &(args.dep_lock[i]) );
      FLA_Lock_init( &(args.war_lock[i]) );
   }

   // Initialize the cac locks for each cache.
   for ( i = 0; i < n_caches; i++ )
   {
      FLA_Lock_init( &(args.cac_lock[i]) );
   }
#endif

   // The number of blocks that can fit into the cache on each thread.
   if ( block_size == 0 )
      size = MIN_CACHE_BLOCKS;
   else
      size = max( FLASH_Queue_get_cache_size() / block_size, MIN_CACHE_BLOCKS);
   args.size = size;

   // Allocate memory for cache, prefetch buffer, and waiting queue.
   cache      = ( FLA_Obj* ) FLA_malloc( size * n_caches * sizeof( FLA_Obj ) );
   prefetch   = ( FLA_Obj* ) FLA_malloc( size * sizeof( FLA_Obj ) );
   wait_queue = ( FLASH_Queue* ) FLA_malloc( n_queues * sizeof( FLASH_Queue ));

   args.cache      = cache;
   args.prefetch   = prefetch;
   args.wait_queue = wait_queue;

   // Initialize cache, prefetch buffer, and waiting queue.
   // base == NULL marks an empty slot.
   for ( i = 0; i < size * n_caches; i++ )
      args.cache[i].base = NULL;

   for ( i = 0; i < size; i++ )
      args.prefetch[i].base = NULL;

   for ( i = 0; i < n_queues; i++ )
   {
      args.wait_queue[i].n_tasks = 0;
      args.wait_queue[i].head    = NULL;
      args.wait_queue[i].tail    = NULL;
   }

   // Initialize the aggregate task counter.
   args.pc = 0;

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   // Allocate and initialize the gpu locks.
   gpu_lock = ( FLA_Lock* ) FLA_malloc( n_threads * sizeof( FLA_Lock ) );
   args.gpu_lock = gpu_lock;

   for ( i = 0; i < n_threads; i++ )
      FLA_Lock_init( &(args.gpu_lock[i]) );
#endif
   // Allocate and initialize GPU software cache (gpu_n_blocks slots per thread).
   gpu = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
   args.gpu = gpu;

   for ( i = 0; i < gpu_n_blocks * n_threads; i++ )
   {
      args.gpu[i].obj.base   = NULL;
      args.gpu[i].buffer_gpu = NULL;
      args.gpu[i].clean      = TRUE;
      args.gpu[i].request    = FALSE;
   }

   // One eviction-victim slot per thread.
   victim = ( FLA_Obj_gpu* ) FLA_malloc( n_threads * sizeof( FLA_Obj_gpu ) );
   args.victim = victim;

   for ( i = 0; i < n_threads; i++ )
      args.victim[i].obj.base = NULL;

   gpu_log = ( FLA_Obj_gpu* ) FLA_malloc( gpu_n_blocks * n_threads * sizeof( FLA_Obj_gpu ) );
   args.gpu_log = gpu_log;
#endif

   // Initialize tasks with critical information.
   FLASH_Queue_init_tasks( ( void* ) &args );

   // Display verbose output before free all tasks.
   if ( FLASH_Queue_get_verbose_output() )
      FLASH_Queue_verbose_output();

   // Start timing the parallel execution.
   dtime = FLA_Clock();

#ifdef FLA_ENABLE_MULTITHREADING
   // Parallel Execution!
   FLASH_Queue_exec_parallel( ( void* ) &args );
#else
   // Simulation!
   FLASH_Queue_exec_simulation( ( void* ) &args );
#endif

   // End timing the parallel execution.
   dtime = FLA_Clock() - dtime;
   FLASH_Queue_set_parallel_time( dtime );

#ifdef FLA_ENABLE_MULTITHREADING
   // Destroy the locks.
   FLA_Lock_destroy( &(args.all_lock) );

   for ( i = 0; i < n_queues; i++ )
   {
      FLA_Lock_destroy( &(args.run_lock[i]) );
   }

   for ( i = 0; i < n_threads; i++ )
   {
      FLA_Lock_destroy( &(args.dep_lock[i]) );
      FLA_Lock_destroy( &(args.war_lock[i]) );
   }

   for ( i = 0; i < n_caches; i++ )
   {
      FLA_Lock_destroy( &(args.cac_lock[i]) );
   }

   // Deallocate memory.
   FLA_free( run_lock );
   FLA_free( dep_lock );
   FLA_free( war_lock );
   FLA_free( cac_lock );
#endif

   FLA_free( cache );
   FLA_free( prefetch );
   FLA_free( wait_queue );

#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
   for ( i = 0; i < n_threads; i++ )
      FLA_Lock_destroy( &(args.gpu_lock[i]) );
   FLA_free( gpu_lock );
#endif
   FLA_free( gpu );
   FLA_free( victim );
   FLA_free( gpu_log );
#endif

   // Reset values for next call to FLASH_Queue_exec().
   FLASH_Queue_reset();

   return;
}
392
393
void FLASH_Queue_init_tasks( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_init_tasks

   Walk the task queue (tail to head) and prepare each task for execution:
   assign it to a waiting queue according to the data-affinity policy,
   compute its height in the DAG (used for priority sorting), record the
   first blocks written (for cache prefetching), and count its unresolved
   dependencies.  Tasks with no dependencies are then enqueued on their
   waiting queues.  When GPUs are enabled, also determines the largest
   block size and widest datatype so GPU buffers can be sized a priori.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int            i, j, k;
   int            n_tasks    = FLASH_Queue_get_num_tasks();
   int            n_queues   = args->n_queues;
   int            n_prefetch = 0;
   int            n_ready    = 0;
   int            length     = 0;
   int            width      = 0;
   int            height     = 0;
   int            size       = args->size;
   FLASH_Data_aff data_aff   = FLASH_Queue_get_data_affinity();
   FLASH_Task*    t;
   FLASH_Dep*     d;
   FLA_Obj        obj;

#ifdef FLA_ENABLE_GPU
   dim_t block_size = 0;
   FLA_Datatype datatype = FLA_FLOAT;
   dim_t datatype_size = FLA_Obj_datatype_size( datatype );
#endif

   // Find the 2D factorization of the number of threads.
   // sq_rt ends as the largest divisor of n_queues that is <= sqrt(n_queues),
   // giving a length x width grid for 2D block-cyclic assignment.
   if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
   {
      int sq_rt = 0;
      while ( sq_rt * sq_rt <= n_queues ) sq_rt++;
      sq_rt--;
      while ( n_queues % sq_rt != 0 ) sq_rt--;
      length = n_queues / sq_rt;
      width  = sq_rt;
   }

   // Grab the tail of the task queue.
   t = FLASH_Queue_get_tail_task();

   for ( i = n_tasks - 1; i >= 0; i-- )
   {
      // Determine data affinity.
      if ( data_aff == FLASH_QUEUE_AFFINITY_NONE )
      { // No data affinity
         t->queue = 0;
      }
      else
      {
         // Use the first output block to determine data affinity.
         obj = t->output_arg[0];

         // Use the top left block of the macroblock.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            obj = *FLASH_OBJ_PTR_AT( obj );

         if ( data_aff == FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC )
         { // Two-dimensional block cyclic
            t->queue = ( obj.base->m_index % length ) +
                       ( obj.base->n_index % width  ) * length;
         }
         else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_ROW_BLOCK_CYCLIC )
         { // One-dimensional row block cyclic
            t->queue = obj.base->m_index % n_queues;
         }
         else if ( data_aff == FLASH_QUEUE_AFFINITY_1D_COLUMN_BLOCK_CYCLIC )
         { // One-dimensional column block cyclic
            t->queue = obj.base->n_index % n_queues;
         }
         else
         { // Round-robin
            t->queue = t->queue % n_queues;
         }
      }

      // Determine the height of each task in the DAG.
      height = 0;
      d = t->dep_arg_head;

      // Take the maximum height of dependent tasks.
      for ( j = 0; j < t->n_dep_args; j++ )
      {
         height = max( height, d->task->height );
         d = d->next_dep;
      }

      t->height = height + 1;

      // Since freeing a task is always a leaf, we want to force it to execute
      // earlier by giving it a greater height in order to reclaim memory.
      if ( t->func == (void *) FLA_Obj_free_buffer_task )
         t->height += n_tasks;

#ifdef FLA_ENABLE_GPU
      // Scan all arguments to size GPU buffers: track the largest block and
      // the datatype with the greatest element size seen so far.
      for ( j = 0; j < t->n_output_args + t->n_input_args; j++ )
      {
         // Find the correct input or output argument.
         if ( j < t->n_output_args )
            obj = t->output_arg[j];
         else
            obj = t->input_arg[j - t->n_output_args];

         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m    = FLA_Obj_length( obj );
            dim_t    n    = FLA_Obj_width( obj );
            dim_t    cs   = FLA_Obj_col_stride( obj );
            FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

            // Check each block in macroblock.
            for ( jj = 0; jj < n; jj++ )
            {
               for ( kk = 0; kk < m; kk++ )
               {
                  obj = *( buf + jj * cs + kk );

                  block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );

                  // NOTE(review): only the first column (jj == 0) is checked
                  // for a wider datatype here — presumably all columns share
                  // one datatype; confirm against macroblock construction.
                  if ( jj == 0 && FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
                  {
                     datatype      = FLA_Obj_datatype( obj );
                     datatype_size = FLA_Obj_datatype_size( datatype );
                  }
               }
            }
         }
         else // Regular block.
         {
            block_size = max( FLA_Obj_length( obj ) * FLA_Obj_width( obj ), block_size );

            if ( FLA_Obj_datatype( obj ) != datatype && FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ) > datatype_size )
            {
               datatype      = FLA_Obj_datatype( obj );
               datatype_size = FLA_Obj_datatype_size( datatype );
            }
         }
      }
#endif

      // Find the first blocks accessed by each task (prefetch candidates).
      if ( n_prefetch < size )
      {
         for ( j = 0; j < t->n_output_args; j++ )
         {
            obj = t->output_arg[j];

            // Macroblock is used.
            if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            {
               dim_t    jj, kk;
               dim_t    m    = FLA_Obj_length( obj );
               dim_t    n    = FLA_Obj_width( obj );
               dim_t    cs   = FLA_Obj_col_stride( obj );
               FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

               // Check each block in macroblock.
               for ( jj = 0; jj < n; jj++ )
               {
                  for ( kk = 0; kk < m; kk++ )
                  {
                     obj = *( buf + jj * cs + kk );

                     k = obj.base->n_write_blocks;

                     // This block is one of the first blocks to be accessed.
                     if ( k < size && k == n_prefetch )
                     {
                        args->prefetch[k] = obj;
                        n_prefetch++;
                     }
                  }
               }
            }
            else // Regular block.
            {
               k = obj.base->n_write_blocks;

               // This block is one of the first blocks to be accessed.
               if ( k < size && k == n_prefetch )
               {
                  args->prefetch[k] = obj;
                  n_prefetch++;
               }
            }
         }
      }

      // Find all ready tasks.
      // n_ready counts unresolved dependencies; dependency analysis has
      // already decremented it, so a zero total means the task is ready.
      t->n_ready += t->n_input_args + t->n_output_args +
                    t->n_macro_args + t->n_war_args;

      if ( t->n_ready == 0 )
      {
         // Save the number of ready and available tasks.
         n_ready++;
      }

      // Go to the previous task.
      t = t->prev_task;
   }

   // Grab the head of the task queue.
   t = FLASH_Queue_get_head_task();

   for ( i = 0; i < n_tasks && n_ready > 0; i++ )
   {
      if ( t->n_ready == 0 )
      {
         // Enqueue all the ready and available tasks.
         FLASH_Queue_wait_enqueue( t, arg );

         // Decrement the number of ready tasks left to be enqueued.
         n_ready--;
      }

      // Go to the next task.
      t = t->next_task;
   }

#ifdef FLA_ENABLE_GPU
   args->block_size = block_size;
   args->datatype   = datatype;
#endif

   return;
}
624
625
void FLASH_Queue_wait_enqueue( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_enqueue

   Place task t on its assigned waiting queue.  When sorting is enabled the
   task is inserted (from the tail) before the first task whose height is
   greater than or equal to its own, keeping the queue ordered by height;
   otherwise it is simply appended at the tail.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int q = t->queue;

   if ( args->wait_queue[q].n_tasks == 0 )
   {
      // Empty queue: the task becomes both head and tail.
      args->wait_queue[q].head = t;
      args->wait_queue[q].tail = t;
   }
   else
   {
      // Begin scanning from the tail of the queue.
      t->prev_wait = args->wait_queue[q].tail;

      // Insertion sort of tasks in waiting queue (by descending height).
      if ( FLASH_Queue_get_sorting() )
      {
         while ( t->prev_wait != NULL &&
                 t->prev_wait->height < t->height )
         {
            t->next_wait = t->prev_wait;
            t->prev_wait = t->prev_wait->prev_wait;
         }
      }

      // Splice the task in; update head if it landed at the front.
      if ( t->prev_wait == NULL )
         args->wait_queue[q].head = t;
      else
         t->prev_wait->next_wait = t;

      // Update tail if it landed at the back.
      if ( t->next_wait == NULL )
         args->wait_queue[q].tail = t;
      else
         t->next_wait->prev_wait = t;
   }

   // Increment number of tasks on waiting queue.
   args->wait_queue[q].n_tasks++;

   return;
}
676
677
FLASH_Task* FLASH_Queue_wait_dequeue( int queue, int cache, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_dequeue

   Remove and return one task from the given waiting queue, or NULL if the
   queue is empty.  Without caching, the head task is taken.  With caching
   (and more than one task queued), a task whose output blocks are already
   resident in the given cache (or GPU) is preferred, and may be unlinked
   from the middle of the queue.  The caller is expected to hold the
   queue's run lock; the cac/gpu lock is acquired here around the cache
   search.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   FLASH_Task* t = NULL;
   FLA_Bool enabled = FALSE;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();
#endif

   if ( args->wait_queue[queue].n_tasks > 0 )
   {
      // Dequeue the first task.
      t = args->wait_queue[queue].head;

      if ( args->wait_queue[queue].n_tasks == 1 )
      {
         // Clear the queue of its only task.
         args->wait_queue[queue].head = NULL;
         args->wait_queue[queue].tail = NULL;
      }
      else
      {
         // Grab a new task if using cache affinity.
         if ( FLASH_Queue_get_caching() )
         {
            // Determine if using GPU or not.
            if ( enabled )
            {
#ifdef FLA_ENABLE_GPU
#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_acquire( &(args->gpu_lock[cache]) ); // G ***
#endif
               // Find a task where the task has blocks currently in GPU.
               t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );

#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_release( &(args->gpu_lock[cache]) ); // G ***
#endif
#endif
            }
            else
            {
#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif
               // Find a task where the task has blocks currently in cache.
               t = FLASH_Queue_wait_dequeue_block( queue, cache, arg );

#ifdef FLA_ENABLE_MULTITHREADING
               FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif
            }

            // The selected task may sit anywhere in the queue; unlink it.
            // Adjust pointers if the task is head of waiting queue.
            // (n_tasks >= 2 here, so head and tail updates below never
            // dereference a NULL neighbor.)
            if ( t->prev_wait == NULL )
            {
               args->wait_queue[queue].head = t->next_wait;
               args->wait_queue[queue].head->prev_wait = NULL;
            }
            else
            {
               t->prev_wait->next_wait = t->next_wait;
            }

            // Adjust pointers if the task is tail of waiting queue.
            if ( t->next_wait == NULL )
            {
               args->wait_queue[queue].tail = t->prev_wait;
               args->wait_queue[queue].tail->next_wait = NULL;
            }
            else
            {
               t->next_wait->prev_wait = t->prev_wait;
            }
         }
         else
         {
            // Adjust pointers in waiting queue.
            args->wait_queue[queue].head = t->next_wait;
            args->wait_queue[queue].head->prev_wait = NULL;
         }
      }

      // Clear the task's waiting linked list pointers.
      t->prev_wait = NULL;
      t->next_wait = NULL;

      // Decrement number of tasks on waiting queue.
      args->wait_queue[queue].n_tasks--;
   }

   return t;
}
777
778
FLASH_Task* FLASH_Queue_wait_dequeue_block( int queue, int cache, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_dequeue_block

   Search the waiting queue for the first task that has an output block
   currently resident in the given cache (or, when GPUs are enabled, in
   the given GPU's software cache).  On a hit the task is marked (hit =
   TRUE) and returned; otherwise the head of the queue is returned as the
   fallback.  The task is NOT unlinked here — the caller does that.  The
   caller must hold the corresponding cac/gpu lock.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int size = args->size;
   int n_tasks = args->wait_queue[queue].n_tasks;
   FLA_Bool enabled = FALSE;
   FLASH_Task* t;
   FLA_Obj obj;
   FLA_Obj mem;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();

   // If using GPUs, then only check GPU and not the cache.
   if ( enabled )
      size = FLASH_Queue_get_gpu_num_blocks();
#endif

   t = args->wait_queue[queue].head;

   // Check if any of the output blocks are in the cache.
   for ( i = 0; i < n_tasks; i++ )
   {
      for ( j = 0; j < size; j++ )
      {
         // Initialize the memory just in case.
         mem.base = NULL;

         // Determine if using GPU or not.
         if ( enabled )
         {
#ifdef FLA_ENABLE_GPU
            mem = args->gpu[cache * size + j].obj;
#endif
         }
         else
         {
            mem = args->cache[cache * size + j];
         }

         for ( k = 0; k < t->n_output_args; k++ )
         {
            obj = t->output_arg[k];

            // A macroblock is represented by its top-left block.
            if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               obj = *FLASH_OBJ_PTR_AT( obj );

            // Return the task if its output block is in cache.
            if ( mem.base == obj.base )
            {
               t->hit = TRUE;
               return t;
            }
         }
      }
      t = t->next_wait;
   }

   // No cached block found; fall back to the head task.
   return args->wait_queue[queue].head;
}
845
846
void FLASH_Queue_update_cache( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_update_cache

   Record all distinct blocks touched by task t in the simulated LRU cache
   of the thread that executed it (t->cache).  Input blocks are processed
   first, then output blocks, each skipping duplicates so every block is
   touched at most once; output blocks additionally invalidate copies in
   other caches.  No-op when t is NULL.

----------------------------------------------------------------------------*/
{
   int i, j;
   FLA_Bool duplicate;
   FLA_Obj obj;

   if ( t == NULL )
      return;

   // Updating the input blocks.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      // Skip if this block also appears among the output args ...
      for ( j = 0; j < t->n_output_args && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      // ... or earlier in the input args.
      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->input_arg[j].base )
            duplicate = TRUE;
      }

      // If the input block has not been processed before.
      if ( !duplicate )
      {
         obj = t->input_arg[i];

         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t jj, kk;
            dim_t m = FLA_Obj_length( obj );
            dim_t n = FLA_Obj_width( obj );
            dim_t cs = FLA_Obj_col_stride( obj );
            FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

            // Cache update for each input block in macroblock.
            for ( jj = 0; jj < n; jj++ )
               for ( kk = 0; kk < m; kk++ )
                  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
                                                  t->cache, FALSE, arg );
         }
         else // Regular block.
         {
            FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
         }
      }
   }

   // Updating the output blocks.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->output_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      // If the output block has not been processed before.
      if ( !duplicate )
      {
         obj = t->output_arg[i];

         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t jj, kk;
            dim_t m = FLA_Obj_length( obj );
            dim_t n = FLA_Obj_width( obj );
            dim_t cs = FLA_Obj_col_stride( obj );
            FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

            // Cache update for each output block in macroblock.
            for ( jj = 0; jj < n; jj++ )
               for ( kk = 0; kk < m; kk++ )
                  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
                                                  t->cache, TRUE, arg );
         }
         else // Regular block.
         {
            FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
         }
      }
   }

   return;
}
947
948
void FLASH_Queue_update_cache_block( FLA_Obj obj,
                                     int cache,
                                     FLA_Bool output,
                                     void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_update_cache_block

   Touch one block in the simulated LRU cache: move it to the
   most-recently-used slot of the given cache (evicting the LRU slot on a
   miss), and — when the access is a write (output == TRUE) — invalidate
   any copy of the block held by the other caches.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int n_caches = args->n_caches;
   int size = args->size;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif

   // Locate the position of the block in the cache.
   // Note the search stops at size - 1: on a miss k lands on the last
   // (LRU) slot, so the shift below evicts it.
   for ( k = 0; k < size - 1; k++ )
   {
      if ( obj.base == args->cache[cache * size + k].base )
         break;
   }

   // Shift all the previous tasks for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->cache[cache * size + j] = args->cache[cache * size + j - 1];

   // Place the block on the cache as the most recently used.
   args->cache[cache * size] = obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif

   // Write invalidate if updating with output block.
   if ( output )
   {
      for ( i = 0; i < n_caches; i++ )
      {
         if ( i != cache )
         {
#ifdef FLA_ENABLE_MULTITHREADING
            FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
#endif
            // Locate the position of the block in the cache.
            for ( k = 0; k < size; k++ )
            {
               if ( obj.base == args->cache[i * size + k].base )
                  break;
            }

            // The block is owned by other thread.
            if ( k < size )
            {
               // Shift all the blocks for the invalidated block.
               for ( j = k; j < size - 1; j++ )
                  args->cache[i * size + j] = args->cache[i * size + j + 1];

               // Invalidate the block.
               args->cache[i * size + size - 1].base = NULL;
            }
#ifdef FLA_ENABLE_MULTITHREADING
            FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
#endif
         }
      }
   }

   return;
}
1022
1023
FLASH_Queue_prefetch(int cache,void * arg)1024 void FLASH_Queue_prefetch( int cache, void* arg )
1025 /*----------------------------------------------------------------------------
1026
1027 FLASH_Queue_prefetch
1028
1029 ----------------------------------------------------------------------------*/
1030 {
1031 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1032 int i;
1033 int size = args->size;
1034 FLA_Obj obj;
1035
1036 // Prefetch blocks in opposite order to maintain LRU.
1037 for ( i = size - 1; i >= 0; i-- )
1038 {
1039 obj = args->prefetch[i];
1040
1041 // Only prefetch if it is a valid block.
1042 if ( obj.base != NULL )
1043 {
1044 // Prefetch the block.
1045 FLASH_Queue_prefetch_block( obj );
1046
1047 // Record the prefetched block in the cache.
1048 args->cache[cache * size + i] = obj;
1049 }
1050 }
1051
1052 return;
1053 }
1054
1055
void FLASH_Queue_prefetch_block( FLA_Obj obj )
/*----------------------------------------------------------------------------

   FLASH_Queue_prefetch_block

   Warm the hardware cache with the contents of one block by reading one
   element per cache line.  The dummy destination is declared volatile so
   the compiler cannot eliminate the reads as dead code — the previous
   non-volatile local did not actually guarantee this, since the compiler
   was free to delete the unused stores together with the loop.

----------------------------------------------------------------------------*/
{
   int i, inc;
   int line_size = FLASH_Queue_get_cache_line_size();
   int elem_size = FLA_Obj_elem_size( obj );
   int length    = FLA_Obj_length( obj );
   int width     = FLA_Obj_width( obj );
   FLA_Datatype datatype = FLA_Obj_datatype( obj );

   // Determine stride to prefetch block into cache: one touch per line.
   inc = line_size / elem_size;

   // Guard against elem_size > line_size, which would make inc zero and
   // turn the loops below into infinite loops.
   if ( inc < 1 )
      inc = 1;

   // Switch between the five supported datatypes.
   switch ( datatype )
   {
      case FLA_FLOAT:
      {
         float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
         // volatile: each store is an observable side effect the compiler
         // must keep, so the reads of buffer[i] cannot be optimized away.
         volatile float access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_DOUBLE:
      {
         double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
         volatile double access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_COMPLEX:
      {
         scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
         volatile scomplex access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_DOUBLE_COMPLEX:
      {
         dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
         volatile dcomplex access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      case FLA_INT:
      {
         int *buffer = ( int * ) FLA_INT_PTR( obj );
         volatile int access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         break;
      }
      default:
         // This default case should never execute.
         FLA_Check_error_code( FLA_INVALID_DATATYPE );
   }

   return;
}
1153
1154
FLASH_Queue_work_stealing(int queue,void * arg)1155 FLASH_Task* FLASH_Queue_work_stealing( int queue, void *arg )
1156 /*----------------------------------------------------------------------------
1157
1158 FLASH_Queue_work_stealing
1159
1160 ----------------------------------------------------------------------------*/
1161 {
1162 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1163 int q;
1164 int n_queues = args->n_queues;
1165 FLASH_Task* t = NULL;
1166
1167 // Do not perform work stealing if there is only one queue.
1168 if ( n_queues == 1 )
1169 return t;
1170
1171 // Find a random queue not equal to the current queue.
1172 do
1173 {
1174 #ifdef FLA_ENABLE_WINDOWS_BUILD
1175 rand_s( &q );
1176 q = q % n_queues;
1177 #else
1178 #ifdef FLA_ENABLE_TIDSP
1179 q = rand() % n_queues;
1180 #else
1181 q = lrand48() % n_queues;
1182 #endif
1183 #endif
1184 }
1185 while ( q == queue );
1186
1187 #ifdef FLA_ENABLE_MULTITHREADING
1188 FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
1189 #endif
1190
1191 // If there are tasks that this thread can steal.
1192 if ( args->wait_queue[q].n_tasks > 0 )
1193 {
1194 // Dequeue the last task.
1195 t = args->wait_queue[q].tail;
1196
1197 if ( args->wait_queue[q].n_tasks == 1 )
1198 {
1199 // Clear the queue of its only task.
1200 args->wait_queue[q].head = NULL;
1201 args->wait_queue[q].tail = NULL;
1202 }
1203 else
1204 {
1205 // Adjust pointers in waiting queue.
1206 args->wait_queue[q].tail = t->prev_wait;
1207 args->wait_queue[q].tail->next_wait = NULL;
1208 }
1209
1210 // Reset waiting queue data about the stolen task.
1211 t->queue = queue;
1212 t->prev_wait = NULL;
1213 t->next_wait = NULL;
1214
1215 args->wait_queue[q].n_tasks--;
1216 }
1217
1218 #ifdef FLA_ENABLE_MULTITHREADING
1219 FLA_Lock_release( &(args->run_lock[q]) ); // R ***
1220 #endif
1221
1222 return t;
1223 }
1224
1225 #ifdef FLA_ENABLE_GPU
1226
FLASH_Queue_create_gpu(int thread,void * arg)1227 void FLASH_Queue_create_gpu( int thread, void *arg )
1228 /*----------------------------------------------------------------------------
1229
1230 FLASH_Queue_create_gpu
1231
1232 ----------------------------------------------------------------------------*/
1233 {
1234 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1235 int i;
1236 dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1237 dim_t block_size = args->block_size;
1238 FLA_Datatype datatype = args->datatype;
1239
1240 // Exit if not using GPU.
1241 if ( !FLASH_Queue_get_enabled_gpu() )
1242 return;
1243
1244 // Bind thread to GPU.
1245 FLASH_Queue_bind_gpu( thread );
1246
1247 // Allocate the memory on the GPU for all the blocks a priori.
1248 for ( i = 0; i < gpu_n_blocks; i++ )
1249 FLASH_Queue_alloc_gpu( block_size, datatype, &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
1250
1251 return;
1252 }
1253
1254
FLASH_Queue_destroy_gpu(int thread,void * arg)1255 void FLASH_Queue_destroy_gpu( int thread, void *arg )
1256 /*----------------------------------------------------------------------------
1257
1258 FLASH_Queue_destroy_gpu
1259
1260 ----------------------------------------------------------------------------*/
1261 {
1262 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1263 int i;
1264 dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1265 FLA_Obj_gpu gpu_obj;
1266
1267 // Exit if not using GPU.
1268 if ( !FLASH_Queue_get_enabled_gpu() )
1269 return;
1270
1271 // Examine every block left on the GPU.
1272 for ( i = 0; i < gpu_n_blocks; i++ )
1273 {
1274 gpu_obj = args->gpu[thread * gpu_n_blocks + i];
1275
1276 // Flush the blocks that are dirty.
1277 if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
1278 FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
1279
1280 // Free the memory on the GPU for all the blocks.
1281 FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
1282 }
1283
1284 return;
1285 }
1286
1287
FLA_Bool FLASH_Queue_exec_gpu( FLASH_Task *t, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_gpu

   Execute task t, preferring the GPU.  Returns TRUE when the task was
   executed (either on a GPU or, as a fallback, on the CPU) and FALSE
   when it could not run yet because some operand is still dirty on
   another GPU; in that case the task is re-enqueued onto its waiting
   queue and the caller must not retire it.

----------------------------------------------------------------------------*/
{
   void** input_arg;
   void** output_arg;

   // A NULL task counts as trivially executed.
   if ( t == NULL )
      return TRUE;

   // If not using the GPU, then execute on CPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
   {
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Check if all the operands are ready and up to date.
   if ( !FLASH_Queue_check_gpu( t, arg ) )
   {
      FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
      int queue = t->queue;
      // Record the miss so the scheduler knows this dequeue did not run.
      t->hit = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
#endif
      // Reenqueue the task if the blocks are not all flushed.
      FLASH_Queue_wait_enqueue( t, arg );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
#endif

      return FALSE;
   }

   // If GPU is enabled, but the task is not supported for GPU execution.
   if ( !t->enabled_gpu )
   {
      int i, j, k;
      int thread = t->thread;
      int n_input_args = t->n_input_args;
      int n_output_args = t->n_output_args;
      int n_threads = FLASH_Queue_get_num_threads();
      FLA_Bool duplicate;
      FLA_Obj obj;

      // The task will run on the CPU, so every operand must be coherent
      // in main memory: flush copies cached on this thread's GPU, and
      // invalidate output blocks on every GPU.

      // Check the blocks on each GPU.
      for ( k = 0; k < n_threads; k++ )
      {
         // Check the input and output arguments on the GPUs.
         for ( i = 0; i < n_input_args + n_output_args; i++ )
         {
            // Check for duplicate blocks.
            duplicate = FALSE;

            // Find the correct input or output argument.
            if ( i < n_input_args )
            {
               obj = t->input_arg[i];

               // An input that is also an output is handled as an output.
               for ( j = 0; j < n_output_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }

               // Skip inputs already processed earlier in the list.
               for ( j = 0; j < i && !duplicate; j++ )
               {
                  if ( obj.base == t->input_arg[j].base )
                     duplicate = TRUE;
               }
            }
            else
            {
               obj = t->output_arg[i - n_input_args];

               // Skip outputs already processed earlier in the list.
               for ( j = 0; j < i - n_input_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }
            }

            // If the block has not been processed before.
            if ( !duplicate )
            {
               // Macroblock is used.
               if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               {
                  dim_t jj, kk;
                  dim_t m  = FLA_Obj_length( obj );
                  dim_t n  = FLA_Obj_width( obj );
                  dim_t cs = FLA_Obj_col_stride( obj );
                  FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

                  // Clear each block in macroblock.
                  for ( jj = 0; jj < n; jj++ )
                  {
                     for ( kk = 0; kk < m; kk++ )
                     {
                        obj = *( buf + jj * cs + kk );

                        // Flush the block to main memory if it is on the GPU.
                        if ( k == thread )
                           FLASH_Queue_flush_block_gpu( obj, k, arg );

                        // Invalidate output block on all GPUs.
                        if ( i >= n_input_args )
                           FLASH_Queue_invalidate_block_gpu( obj, k, arg );
                     }
                  }
               }
               else
               {
                  // Flush the block to main memory if it is on the GPU.
                  if ( k == thread )
                     FLASH_Queue_flush_block_gpu( obj, k, arg );

                  // Invalidate output block on all GPUs.
                  if ( i >= n_input_args )
                     FLASH_Queue_invalidate_block_gpu( obj, k, arg );
               }
            }
         }
      }

      // Execute the task on CPU instead of GPU.
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Gather the pointers for the data on the GPU.
   input_arg  = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
   output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );

   // Bring all the blocks to GPU.
   FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );

   // Execute the task on GPU.
   FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );

   // Mark all the output blocks as dirty.
   FLASH_Queue_mark_gpu( t, arg );

   // Free memory.
   FLA_free( input_arg );
   FLA_free( output_arg );

   return TRUE;
}
1445
1446
FLA_Bool FLASH_Queue_check_gpu( FLASH_Task *t, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_check_gpu

   Check whether every operand of task t is up to date with respect to
   all GPUs other than the task's own thread.  Returns TRUE when the
   task may execute now; FALSE when at least one block is dirty on
   another GPU (in which case FLASH_Queue_check_block_gpu has flagged
   that block with a flush request).

----------------------------------------------------------------------------*/
{
   int i, j, k;
   int thread = t->thread;
   int n_input_args = t->n_input_args;
   int n_output_args = t->n_output_args;
   int n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool r_val = TRUE;
   FLA_Bool t_val;
   FLA_Bool duplicate;
   FLA_Obj obj;

   // Check the input and output arguments on the GPUs.
   for ( i = 0; i < n_input_args + n_output_args; i++ )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      // Find the correct input or output argument.
      if ( i < n_input_args )
      {
         obj = t->input_arg[i];

         // An input that also appears as an output is checked as an output.
         for ( j = 0; j < n_output_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }

         // Skip inputs already checked earlier in the list.
         for ( j = 0; j < i && !duplicate; j++ )
         {
            if ( obj.base == t->input_arg[j].base )
               duplicate = TRUE;
         }
      }
      else
      {
         obj = t->output_arg[i - n_input_args];

         // Skip outputs already checked earlier in the list.
         for ( j = 0; j < i - n_input_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }
      }

      // If the block has not been processed before.
      if ( !duplicate )
      {
         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t jj, kk;
            dim_t m  = FLA_Obj_length( obj );
            dim_t n  = FLA_Obj_width( obj );
            dim_t cs = FLA_Obj_col_stride( obj );
            FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

            // Check each inner block of the macroblock individually.
            for ( jj = 0; jj < n; jj++ )
            {
               for ( kk = 0; kk < m; kk++ )
               {
                  obj = *( buf + jj * cs + kk );

                  t_val = TRUE;

                  // Check to see if the block is dirty on another GPU.
                  // Stop early once any GPU reports the block unusable.
                  for ( k = 0; k < n_threads && t_val; k++ )
                     if ( k != thread )
                        t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );

                  // Note: the full scan still continues so that every
                  // dirty block gets its flush request registered.
                  r_val = r_val && t_val;
               }
            }
         }
         else
         {
            t_val = TRUE;

            // Check to see if the block is dirty on another GPU.
            for ( k = 0; k < n_threads && t_val; k++ )
               if ( k != thread )
                  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );

            r_val = r_val && t_val;
         }
      }
   }

   return r_val;
}
1544
1545
FLASH_Queue_check_block_gpu(FLA_Obj obj,int thread,void * arg)1546 FLA_Bool FLASH_Queue_check_block_gpu( FLA_Obj obj, int thread, void *arg )
1547 /*----------------------------------------------------------------------------
1548
1549 FLASH_Queue_check_block_gpu
1550
1551 ----------------------------------------------------------------------------*/
1552 {
1553 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
1554 int k;
1555 dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
1556 FLA_Bool r_val = TRUE;
1557
1558 #ifdef FLA_ENABLE_MULTITHREADING
1559 FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
1560 #endif
1561
1562 // Locate the position of the block on the GPU.
1563 for ( k = 0; k < gpu_n_blocks; k++ )
1564 if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
1565 break;
1566
1567 if ( k < gpu_n_blocks )
1568 {
1569 // Request this block if it is dirty.
1570 if ( !args->gpu[thread * gpu_n_blocks + k].clean )
1571 {
1572 args->gpu[thread * gpu_n_blocks + k].request = TRUE;
1573
1574 r_val = FALSE;
1575 }
1576 }
1577
1578 // Check the victim block.
1579 if ( obj.base == args->victim[thread].obj.base )
1580 r_val = FALSE;
1581
1582 #ifdef FLA_ENABLE_MULTITHREADING
1583 FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
1584 #endif
1585
1586 return r_val;
1587 }
1588
1589
void FLASH_Queue_update_gpu( FLASH_Task *t,
                             void **input_arg,
                             void **output_arg,
                             void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_update_gpu

   Transfer every distinct operand of task t to the GPU of its executing
   thread, and fill input_arg/output_arg with the corresponding device
   pointers.  Duplicate operands are transferred only once; their device
   pointers are aliased afterwards so every entry is non-NULL on return.

----------------------------------------------------------------------------*/
{
   int i, j, k;
   int thread = t->thread;
   int n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool duplicate;

   // None of the arguments can be macroblocks yet.
   // Complicating factor is copying macroblock to contiguous memory on GPU.

   // Bring the input arguments to the GPU.
   // Iterate from last to first so that, for duplicates, the FIRST
   // occurrence is the one that performs the transfer.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      // An input that also appears as an output is deferred to the
      // output pass below.
      for ( j = 0; j < t->n_output_args && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->input_arg[j].base )
            duplicate = TRUE;
      }

      // If the input block has not been processed before.
      if ( !duplicate )
      {
         FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
      }
      else
      {
         // Mark for aliasing in the resolution passes below.
         input_arg[i] = NULL;
      }
   }

   // Bring the output arguments to the GPU.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->output_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      // If the output block has not been processed before.
      if ( !duplicate )
      {
         FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );

         // Invalidate output blocks on all other GPUs.
         for ( k = 0; k < n_threads; k++ )
            if ( k != thread )
               FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
      }
      else
      {
         output_arg[i] = NULL;
      }
   }

   // Check to see if there are any duplicates.
   // Resolve NULL input entries by aliasing the device pointer of the
   // matching output or earlier input.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
      {
         if ( t->input_arg[i].base == t->output_arg[j].base )
            input_arg[i] = output_arg[j];
      }

      for ( j = 0; j < i && input_arg[i] == NULL; j++ )
      {
         if ( t->input_arg[i].base == t->input_arg[j].base )
            input_arg[i] = input_arg[j];
      }
   }

   // Check to see if there are any duplicates.
   // Resolve NULL output entries by aliasing the earlier matching output.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      for ( j = 0; j < i && output_arg[i] == NULL; j++ )
      {
         if ( t->output_arg[i].base == t->output_arg[j].base )
            output_arg[i] = output_arg[j];
      }
   }

   return;
}
1693
1694
void FLASH_Queue_update_block_gpu( FLA_Obj obj,
                                   void **buffer_gpu,
                                   int thread,
                                   void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_update_block_gpu

   Ensure obj is resident on the GPU of the given thread and store its
   device pointer in buffer_gpu[0].  The per-thread GPU cache is kept in
   LRU order (index 0 = most recently used).  On a miss the LRU slot is
   reused; if its previous occupant was dirty it is recorded as the
   eviction victim and flushed to main memory outside the lock.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int j, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool transfer = FALSE;
   FLA_Bool evict = FALSE;
   FLA_Obj_gpu evict_obj;
   FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Locate the position of the block on GPU.
   // Only the first gpu_n_blocks - 1 slots are searched: on a miss the
   // loop leaves k at gpu_n_blocks - 1, the LRU slot, which is reused.
   // If the block happens to occupy that last slot, the equality test
   // below still treats it as a hit.
   for ( k = 0; k < gpu_n_blocks - 1; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;

   // Save the pointer to the data on the GPU.
   buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;

   // Save the victim block.
   evict_obj = args->gpu[thread * gpu_n_blocks + k];

   // The block is not already in the GPU.
   if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
   {
      // Save for data transfer outside of critical section.
      transfer = TRUE;

      // Save for eviction outside of critical section.
      // Publishing the victim lets FLASH_Queue_check_block_gpu see the
      // block as unusable until the flush completes.
      if ( evict_obj.obj.base != NULL && !evict_obj.clean )
      {
         evict = TRUE;
         args->victim[thread] = evict_obj;
      }

      // Save the block in the data structure.
      args->gpu[thread * gpu_n_blocks + k].obj = obj;

      // Make sure the new block is clean.
      args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request = FALSE;
   }

   // Use the block on the GPU that is a hit or LRU.
   gpu_obj = args->gpu[thread * gpu_n_blocks + k];

   // Shift all the previous tasks for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];

   // Place the block on the cache as the most recently used.
   args->gpu[thread * gpu_n_blocks] = gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Evict and flush the LRU dirty block.
   if ( evict )
   {
      FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

      // The victim is flushed; clear it so other threads may use the block.
      args->victim[thread].obj.base = NULL;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
   }

   // Move the block to the GPU.
   if ( transfer )
      FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

   return;
}
1785
1786
void FLASH_Queue_mark_gpu( FLASH_Task *t, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_mark_gpu

   Mark every distinct output block of task t as dirty in the GPU cache
   of the thread that executed the task.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   dim_t n_blocks = FLASH_Queue_get_gpu_num_blocks();
   int thread = t->thread;
   int i, j, slot;
   FLA_Bool seen;
   FLA_Obj blk;

   // Walk the output arguments from last to first.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      blk = t->output_arg[i];

      // Skip blocks that appear earlier in the output list; they have
      // already been marked.
      seen = FALSE;
      for ( j = 0; j < i && !seen; j++ )
         if ( blk.base == t->output_arg[j].base )
            seen = TRUE;

      if ( seen )
         continue;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

      // Find where the block lives in this thread's GPU cache.
      for ( slot = 0; slot < n_blocks; slot++ )
         if ( blk.base == args->gpu[thread * n_blocks + slot].obj.base )
            break;

      // The resident copy is now the only up-to-date version.
      if ( slot < n_blocks )
      {
         args->gpu[thread * n_blocks + slot].clean   = FALSE;
         args->gpu[thread * n_blocks + slot].request = FALSE;
      }

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
   }

   return;
}
1842
1843
void FLASH_Queue_invalidate_block_gpu( FLA_Obj obj, int thread, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_invalidate_block_gpu

   If obj is resident in the GPU cache of the given thread, drop the
   mapping and demote the vacated slot to the least-recently-used
   position so it is the first candidate for reuse.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   dim_t n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu freed;
   int j, slot;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Search this thread's GPU cache for the block.
   for ( slot = 0; slot < n_blocks; slot++ )
      if ( obj.base == args->gpu[thread * n_blocks + slot].obj.base )
         break;

   // The block is resident on this GPU.
   if ( slot < n_blocks )
   {
      // Drop the mapping and reset the slot's status bits.
      args->gpu[thread * n_blocks + slot].obj.base = NULL;

      args->gpu[thread * n_blocks + slot].clean   = TRUE;
      args->gpu[thread * n_blocks + slot].request = FALSE;

      // Keep a copy of the vacated slot (it still owns a device buffer).
      freed = args->gpu[thread * n_blocks + slot];

      // Close the gap so the LRU ordering of the remaining blocks holds.
      for ( j = slot; j < n_blocks - 1; j++ )
         args->gpu[thread * n_blocks + j] = args->gpu[thread * n_blocks + j + 1];

      // Recycle the empty slot as the least recently used entry.
      args->gpu[thread * n_blocks + n_blocks - 1] = freed;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
1891
1892
void FLASH_Queue_flush_block_gpu( FLA_Obj obj, int thread, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_flush_block_gpu

   If obj is resident and dirty on the GPU of the given thread, copy it
   back to main memory.  The data transfer happens outside the critical
   section; the cache is re-searched afterwards before the clean bit is
   set, since slots may have shifted while the lock was released.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   dim_t n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool needs_flush = FALSE;
   FLA_Obj_gpu blk;
   int slot;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Search this thread's GPU cache for the block.
   for ( slot = 0; slot < n_blocks; slot++ )
      if ( obj.base == args->gpu[thread * n_blocks + slot].obj.base )
         break;

   // The block is resident on this GPU.
   if ( slot < n_blocks )
   {
      // Remember the slot's contents for the transfer below.
      blk = args->gpu[thread * n_blocks + slot];

      // Only a dirty resident block needs to be written back.
      if ( blk.obj.base != NULL && !blk.clean )
         needs_flush = TRUE;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Exit early if a flush is not required.
   if ( !needs_flush )
      return;

   // Perform the copy back to main memory outside the critical section.
   FLASH_Queue_read_gpu( blk.obj, blk.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // The cache may have been reordered while unlocked; search again.
   for ( slot = 0; slot < n_blocks; slot++ )
      if ( obj.base == args->gpu[thread * n_blocks + slot].obj.base )
         break;

   if ( slot < n_blocks )
   {
      // The resident copy now matches main memory.
      args->gpu[thread * n_blocks + slot].clean   = TRUE;
      args->gpu[thread * n_blocks + slot].request = FALSE;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
1959
1960
void FLASH_Queue_flush_gpu( int thread, void *arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_flush_gpu

   Service flush requests on this thread's GPU: every block that is
   dirty AND has been requested by another thread (via
   FLASH_Queue_check_block_gpu) is copied back to main memory.  The
   requests are collected under the lock into gpu_log, the transfers run
   outside the critical section, and the status bits are updated under
   the lock afterwards.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   int n_transfer = 0;
   FLA_Obj_gpu gpu_obj;

   // Exit if not using GPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   for ( k = 0; k < gpu_n_blocks; k++ )
   {
      // Save the block that might be flushed.
      gpu_obj = args->gpu[thread * gpu_n_blocks + k];

      // Flush the block if it is dirty and requested.
      if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
      {
         // Save the block for data transfer outside the critical section.
         args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
         n_transfer++;
      }
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Exit early if a flush is not required.
   if ( n_transfer == 0 )
      return;

   // Flush the block outside the critical section.
   for ( i = 0; i < n_transfer; i++ )
   {
      gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
      FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Update the bits for each block that is flushed.
   for ( i = 0; i < n_transfer; i++ )
   {
      // Locate the position of the block on the GPU.
      // Re-search each time: the cache ordering may have changed while
      // the lock was released for the transfers.
      for ( k = 0; k < gpu_n_blocks; k++ )
         if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base ==
              args->gpu[thread * gpu_n_blocks + k].obj.base )
            break;

      if ( k < gpu_n_blocks )
      {
         // The block is now clean.
         args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
         args->gpu[thread * gpu_n_blocks + k].request = FALSE;
      }
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
2038
2039 #endif
2040
2041 #ifdef FLA_ENABLE_MULTITHREADING
2042
void FLASH_Queue_exec_parallel( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_parallel

   Spawn n_threads workers (via OpenMP or POSIX threads, depending on
   FLA_MULTITHREADING_MODEL), run FLASH_Queue_exec_parallel_function in
   each with a per-thread FLASH_Thread argument, and wait for all of
   them to complete before returning.

----------------------------------------------------------------------------*/
{
   int i;
   int n_threads = FLASH_Queue_get_num_threads();
   void* (*thread_entry_point)( void* );

   // Allocate the thread structures array. Here, an array of FLASH_Thread
   // structures of length n_threads is allocated and the fields of each
   // structure set to appropriate values.
   FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );

   // Initialize the thread structures array.
   for ( i = 0; i < n_threads; i++ )
   {
      // Save the thread's identifier.
      thread[i].id = i;

      // Save the pointer to the necessary variables with the thread.
      thread[i].args = arg;

      // The pthread object, if it was even compiled into the FLASH_Thread
      // structure, will be initialized by the pthread implementation when we
      // call pthread_create() and does not need to be touched at this time.
   }

   // Determine which function to send threads to.
   thread_entry_point = FLASH_Queue_exec_parallel_function;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP

   // An OpenMP parallel for region spawns n_threads threads. Each thread
   // executes the work function with a different FLASH_Thread argument.
   // An implicit synchronization point exists at the end of the curly
   // brace scope.
   #pragma omp parallel for \
           private( i ) \
           shared( thread, n_threads, thread_entry_point ) \
           schedule( static, 1 ) \
           num_threads( n_threads )
   for ( i = 0; i < n_threads; ++i )
   {
      thread_entry_point( ( void* ) &thread[i] );
   }

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

   // Create each POSIX thread needed in addition to the main thread.
   for ( i = 1; i < n_threads; i++ )
   {
      int pthread_e_val;

      // Create thread i with default attributes.
      pthread_e_val = pthread_create( &(thread[i].pthread_obj),
                                      NULL,
                                      thread_entry_point,
                                      ( void* ) &thread[i] );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

   // The main thread is assigned the role of thread 0. Here we manually
   // execute it as a worker thread.
   thread_entry_point( ( void* ) &thread[0] );

   // Wait for non-main threads to finish.
   for ( i = 1; i < n_threads; i++ )
   {
      // These two variables are declared local to this for loop since this
      // is the only place they are needed, and since they would show up as
      // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
      // Strangely, the Intel compiler produces code that results in an
      // "unaligned access" runtime message if thread_status is declared as
      // an int. Declaring it as a long or void* appears to force the
      // compiler (not surprisingly) into aligning it to an 8-byte boundary.
      int pthread_e_val;
      void* thread_status;

      // Wait for thread i to invoke its respective pthread_exit().
      // The return value passed to pthread_exit() is provided to us
      // via status, if one was given.
      pthread_e_val = pthread_join( thread[i].pthread_obj,
                                    ( void** ) &thread_status );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

#endif

   // All workers have finished; release the thread structures array.
   FLA_free( thread );

   return;
}
2146
2147
2148 //#include <sched.h>
2149 //#include <sys/types.h>
2150 //#include <linux/unistd.h>
2151 //#include <errno.h>
2152 //#include <unistd.h>
2153 //#include <sys/syscall.h>
2154
2155
void* FLASH_Queue_exec_parallel_function( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_parallel_function

   Worker main loop.  Each thread repeatedly dequeues a task from its
   waiting queue (or takes a task bound to it by a previous dependency
   update, or steals one when work stealing is enabled), executes it on
   the CPU or GPU, updates dependencies, and increments the global task
   counter until all tasks in the DAG have committed.  arg is really a
   FLASH_Thread*.

----------------------------------------------------------------------------*/
{
   FLASH_Queue_vars* args;
   int i;
   int queue;
   int cache;
   int n_tasks = FLASH_Queue_get_num_tasks();
   int n_threads = FLASH_Queue_get_num_threads();
   int n_cores = FLASH_Queue_get_cores_per_cache();
   FLA_Bool caching = FLASH_Queue_get_caching();
   FLA_Bool stealing = FLASH_Queue_get_work_stealing();
   FLA_Bool committed = TRUE;
   FLA_Bool condition = TRUE;
   FLA_Bool enabled = FALSE;
   FLA_Bool available;
   FLASH_Task* t = NULL;
   FLASH_Task* r = NULL;
   FLASH_Thread* me;
   //cpu_set_t cpu_set;

   // Interpret the thread argument as what it really is--a pointer to an
   // FLASH_Thread structure.
   me = ( FLASH_Thread* ) arg;

   // Extract the variables from the current thread.
   args = ( FLASH_Queue_vars* ) me->args;

   // Figure out the id of the current thread.
   i = me->id;

   // Set the CPU affinity; We want the current thread i to run only on CPU i.
   //CPU_ZERO( &cpu_set );
   //CPU_SET( i, &cpu_set );
   //sched_setaffinity( syscall( __NR_gettid ), sizeof(cpu_set_t), &cpu_set );

   // Determine to which queue this thread belongs.
   queue = i / ( n_threads / args->n_queues );

   // Determine to which cache this thread belongs.
   cache = i / n_cores;

#ifdef FLA_ENABLE_GPU
   // Create memory on GPU.
   FLASH_Queue_create_gpu( i, ( void* ) args );

   // Save whether GPUs are enabled.
   enabled = FLASH_Queue_get_enabled_gpu();

   // Only use each GPU as its own cache when GPUs are enabled.
   if ( enabled )
      cache = i;
#endif

   // Prefetch blocks into the cache before execution.
   // Only the first thread of each cache group prefetches.
   if ( caching && !enabled && i % n_cores == 0 )
      FLASH_Queue_prefetch( cache, ( void* ) args );

   // Loop until all the tasks have committed.
   while ( condition )
   {
#ifdef FLA_ENABLE_GPU
      // Check to see if any blocks on GPU need to be flushed.
      FLASH_Queue_flush_gpu( i, ( void* ) args );
#endif

      // Dequeue a task if there has not been one binded to thread.
      if ( r == NULL )
      {
         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

         // Obtain task to execute.
         t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );

         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
      }
      else
      {
         // Obtain the binded task.
         t = r;
         r = NULL;
      }

      // Dequeued a task from the waiting queue.
      available = ( t != NULL );

      if ( available )
      {
         // Save the thread and cache that executes the task.
         t->thread = i;
         t->cache = cache;

         if ( caching && !enabled )
         {
            // Update the current state of the cache.
            FLASH_Queue_update_cache( t, ( void* ) args );
         }

#ifdef FLA_ENABLE_GPU
         // Execute the task on GPU.
         // FALSE means the task was re-enqueued and must not commit yet.
         committed = FLASH_Queue_exec_gpu( t, ( void* ) args );
#else
         // Execute the task.
         FLASH_Queue_exec_task( t );
#endif

         // If the task has executed or not.
         if ( committed )
         {
            // Update task dependencies.
            // May return a successor task bound directly to this thread.
            r = FLASH_Task_update_dependencies( t, ( void* ) args );

            // Free the task once it executes in parallel.
            FLASH_Task_free_parallel( t, ( void* ) args );
         }
      }
      else
      {
         if ( stealing )
         {
            // Perform work stealing if there are no tasks to dequeue.
            r = FLASH_Queue_work_stealing( queue, ( void* ) args );
         }
      }

      FLA_Lock_acquire( &(args->all_lock) ); // A ***

      // Increment program counter.
      if ( available && committed )
         args->pc++;

      // Terminate loop.
      if ( args->pc >= n_tasks )
         condition = FALSE;

      FLA_Lock_release( &(args->all_lock) ); // A ***
   }

#ifdef FLA_ENABLE_GPU
   // Destroy and flush contents of GPU back to main memory.
   FLASH_Queue_destroy_gpu( i, ( void* ) args );
#endif

#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
   // If this is a non-main thread, then exit with a zero (normal) error code.
   // The main thread cannot call pthread_exit() because this routine never
   // returns. The main thread must proceed so it can oversee the joining of
   // the exited non-main pthreads.
   if ( i != 0 )
      pthread_exit( ( void* ) NULL );
#endif

   return ( void* ) NULL;
}
2314
2315
FLASH_Task* FLASH_Task_update_dependencies( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Task_update_dependencies

   Called after task t has executed.  Decrements the ready counter of every
   task that depends on t and places newly ready tasks on the appropriate
   waiting queue.  When caching is enabled, may instead return a task that is
   "bound" to the calling thread so it can execute next without re-visiting
   the waiting queue; returns NULL when no task is bound.

----------------------------------------------------------------------------*/
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i;
  int q = t->queue;               // Queue of the just-completed task.
  int queue;                      // Queue of each dependent task.
  int thread;                     // Index selecting which dep_lock to use.
  int n_threads = FLASH_Queue_get_num_threads();
  FLA_Bool caching = FLASH_Queue_get_caching();
  FLA_Bool stealing = FLASH_Queue_get_work_stealing();
  FLA_Bool available;
  FLASH_Task* task;
  FLASH_Task* r = NULL;           // Candidate task to bind to this thread.
  FLASH_Dep* d = t->dep_arg_head;

  // Dequeue task to bind to thread if caching is enabled.
  if ( caching )
  {
    FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

    // Obtain task to execute.
    r = FLASH_Queue_wait_dequeue( q, t->cache, arg );

    FLA_Lock_release( &(args->run_lock[q]) ); // R ***
  }

  // Check each dependent task.
  for ( i = 0; i < t->n_dep_args; i++ )
  {
    if ( stealing )
    {
      // Place all dependent tasks onto same queue as predecessor task.
      d->task->queue = q;
    }

    task = d->task;
    queue = task->queue;
    // Dependence counters are striped across dep_locks by task order.
    thread = task->order % n_threads;

    FLA_Lock_acquire( &(args->dep_lock[thread]) ); // D ***

    task->n_ready--;
    available = ( task->n_ready == 0 );

    FLA_Lock_release( &(args->dep_lock[thread]) ); // D ***

    // Place newly ready tasks on waiting queue.
    if ( available )
    {
      // If caching is enabled and the task belongs to this thread's queue.
      if ( caching && q == queue )
      {
        // Determine if there is a new binded task; the loser of the
        // comparison inside update_binding is enqueued there.
        r = FLASH_Task_update_binding( task, r, arg );
      }
      else
      {
        FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

        FLASH_Queue_wait_enqueue( task, arg );

        FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
      }
    }

    // Go to the next dep.
    d = d->next_dep;
  }

  return r;
}
2392
2393
FLASH_Task* FLASH_Task_update_binding( FLASH_Task* t, FLASH_Task* r,
                                       void* arg )
/*----------------------------------------------------------------------------

   FLASH_Task_update_binding

   Given a newly ready task t and the task r currently bound to this thread
   (or NULL), decides which of the two stays bound; the other is placed on
   its waiting queue.  Returns the task that remains bound.

----------------------------------------------------------------------------*/
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int q;

  // Nothing is bound yet, so the new ready task becomes the binding.
  if ( r == NULL )
  {
    t->hit = TRUE;
    return t;
  }

  // The new task displaces the bound one when the bound task was not a
  // cache hit, or when sorting is on and the new task sits higher in the
  // DAG.
  if ( !r->hit || ( FLASH_Queue_get_sorting() && r->height < t->height ) )
  {
    q = r->queue;
    r->hit = FALSE;

    FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

    // Return the displaced task to its waiting queue.
    FLASH_Queue_wait_enqueue( r, arg );

    FLA_Lock_release( &(args->run_lock[q]) ); // R ***

    t->hit = TRUE;
    return t;
  }

  // Keep the existing binding; the new ready task joins its waiting queue.
  q = t->queue;

  FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***

  FLASH_Queue_wait_enqueue( t, arg );

  FLA_Lock_release( &(args->run_lock[q]) ); // R ***

  return r;
}
2444
2445
FLASH_Task_free_parallel(FLASH_Task * t,void * arg)2446 void FLASH_Task_free_parallel( FLASH_Task* t, void* arg )
2447 /*----------------------------------------------------------------------------
2448
2449 FLASH_Task_free_parallel
2450
2451 ----------------------------------------------------------------------------*/
2452 {
2453 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2454 int i, j, k;
2455 int thread;
2456 int n_threads = FLASH_Queue_get_num_threads();
2457 FLASH_Dep* d;
2458 FLASH_Dep* next_dep;
2459 FLA_Obj obj;
2460
2461 // Clearing the last write task in each output block.
2462 for ( i = 0; i < t->n_output_args; i++ )
2463 {
2464 obj = t->output_arg[i];
2465
2466 // Macroblock is used.
2467 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2468 {
2469 dim_t jj, kk;
2470 dim_t m = FLA_Obj_length( obj );
2471 dim_t n = FLA_Obj_width( obj );
2472 dim_t cs = FLA_Obj_col_stride( obj );
2473 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2474
2475 // Clear each block in macroblock.
2476 for ( jj = 0; jj < n; jj++ )
2477 for ( kk = 0; kk < m; kk++ )
2478 ( buf + jj * cs + kk )->base->write_task = NULL;
2479 }
2480 else // Clear regular block.
2481 {
2482 obj.base->write_task = NULL;
2483 }
2484 }
2485
2486 // Cleaning the last read tasks in each input block.
2487 for ( i = 0; i < t->n_input_args; i++ )
2488 {
2489 obj = t->input_arg[i];
2490
2491 // Macroblock is used.
2492 if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
2493 {
2494 dim_t jj, kk;
2495 dim_t m = FLA_Obj_length( obj );
2496 dim_t n = FLA_Obj_width( obj );
2497 dim_t cs = FLA_Obj_col_stride( obj );
2498 FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
2499
2500 // Clear each block in macroblock.
2501 for ( jj = 0; jj < n; jj++ )
2502 {
2503 for ( kk = 0; kk < m; kk++ )
2504 {
2505 obj = *( buf + jj * cs + kk );
2506
2507 thread = obj.base->n_read_blocks % n_threads;
2508
2509 FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2510
2511 k = obj.base->n_read_tasks;
2512 d = obj.base->read_task_head;
2513
2514 obj.base->n_read_tasks = 0;
2515 obj.base->read_task_head = NULL;
2516 obj.base->read_task_tail = NULL;
2517
2518 FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2519
2520 for ( j = 0; j < k; j++ )
2521 {
2522 next_dep = d->next_dep;
2523 FLA_free( d );
2524 d = next_dep;
2525 }
2526 }
2527 }
2528 }
2529 else // Regular block.
2530 {
2531 thread = obj.base->n_read_blocks % n_threads;
2532
2533 FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***
2534
2535 k = obj.base->n_read_tasks;
2536 d = obj.base->read_task_head;
2537
2538 obj.base->n_read_tasks = 0;
2539 obj.base->read_task_head = NULL;
2540 obj.base->read_task_tail = NULL;
2541
2542 FLA_Lock_release( &(args->war_lock[thread]) ); // W ***
2543
2544 for ( j = 0; j < k; j++ )
2545 {
2546 next_dep = d->next_dep;
2547 FLA_free( d );
2548 d = next_dep;
2549 }
2550 }
2551 }
2552
2553 // Free the dep_arg field of t.
2554 d = t->dep_arg_head;
2555
2556 for ( i = 0; i < t->n_dep_args; i++ )
2557 {
2558 next_dep = d->next_dep;
2559 FLA_free( d );
2560 d = next_dep;
2561 }
2562
2563 // Free the int_arg field of t.
2564 FLA_free( t->int_arg );
2565
2566 // Free the fla_arg field of t.
2567 FLA_free( t->fla_arg );
2568
2569 // Free the input_arg field of t.
2570 FLA_free( t->input_arg );
2571
2572 // Free the output_arg field of t.
2573 FLA_free( t->output_arg );
2574
2575 // Finally, free the struct itself.
2576 FLA_free( t );
2577
2578 return;
2579 }
2580
2581 #endif
2582
2583
2584 // ============================================================================
2585
2586
2587 #ifndef FLA_ENABLE_MULTITHREADING
2588
void FLASH_Queue_exec_simulation( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_simulation

   Single-threaded simulation of the parallel scheduler (built only when
   multithreading is disabled).  Executes the DAG in "stages": in each stage
   every simulated thread dequeues at most one ready task; the stage's tasks
   are then executed and their dependents released for the next stage.
   When verbose output is off, prints a stage-by-task table of task names.

----------------------------------------------------------------------------*/
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i, j;
  int queue;
  int cache;
  int n_stages = 0;                  // Count of simulated execution stages.
  int n_queues = args->n_queues;
  int n_tasks = FLASH_Queue_get_num_tasks();
  int n_threads = FLASH_Queue_get_num_threads();
  int n_cores = FLASH_Queue_get_cores_per_cache();
  FLASH_Verbose verbose = FLASH_Queue_get_verbose_output();
  FLASH_Task* task;
  FLASH_Task* t;
  FLASH_Dep* d;

  // An array to hold tasks to be executed during of simulation.
  // MSVC has no C99 VLAs, hence the heap allocation on Windows.
#ifdef FLA_ENABLE_WINDOWS_BUILD
  FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
#else
  FLASH_Task* exec_array[n_threads];
#endif

  for ( i = 0; i < n_threads; i++ )
  {
    // Initialize all exec_array to NULL.
    exec_array[i] = NULL;

    // Prefetch blocks into the cache before execution.
    // Only the first simulated thread of each cache group prefetches.
    if ( i % n_cores == 0 )
      FLASH_Queue_prefetch( i, arg );
  }

  // Loop until all the tasks have committed.
  while ( args->pc < n_tasks )
  {
    for ( i = 0; i < n_threads; i++ )
    {
      // Update waiting queue with ready tasks.
      t = exec_array[i];

      if ( t != NULL )
      {
        // Check each dependent task.
        d = t->dep_arg_head;

        for ( j = 0; j < t->n_dep_args; j++ )
        {
          task = d->task;
          task->n_ready--;

          // Place newly ready tasks on waiting queue.
          if ( task->n_ready == 0 )
          {
            FLASH_Queue_wait_enqueue( task, arg );
          }

          // Go to the next dep.
          d = d->next_dep;
        }

        // Free the task.
        FLASH_Task_free( t );
      }
    }

    n_stages++;
    // NOTE(review): the table is printed when verbose is FALSE; this
    // inverted-looking test is used consistently throughout this function.
    if ( !verbose )
      printf( "%7d", n_stages );

    // Move ready tasks from the waiting queue to execution queue.
    for ( i = 0; i < n_threads; i++ )
    {
      // Determine to which queue this thread belongs.
      queue = i / ( n_threads / n_queues );

      // Determine to which cache this thread belongs.
      cache = i / n_cores;

      // Dequeue a task.
      t = FLASH_Queue_wait_dequeue( queue, cache, arg );

      // Save the task for execution.
      exec_array[i] = t;

      if ( t != NULL )
      {
        // Save the thread and cache that executes the task.
        t->thread = i;
        t->cache = cache;

        // Increment program counter.
        args->pc++;
      }
    }

    // Execute independent tasks.
    for ( i = 0; i < n_threads; i++ )
    {
      t = exec_array[i];
      // NOTE(review): t may be NULL here; update_cache/exec_task presumably
      // tolerate NULL -- verify against their definitions.
      FLASH_Queue_update_cache( t, arg );
      FLASH_Queue_exec_task( t );

      if ( !verbose )
        printf( "%7s", ( t == NULL ? "     " : t->name ) );

      // Free the task if this is the last stage.
      if ( args->pc == n_tasks && t != NULL )
        FLASH_Task_free( t );
    }

    if ( !verbose )
      printf( "\n" );
  }

  if ( !verbose )
    printf( "\n" );

#ifdef FLA_ENABLE_WINDOWS_BUILD
  FLA_free( exec_array );
#endif

  return;
}
2718
2719 #endif
2720
2721 #else // FLA_ENABLE_SCC
2722
2723 int RCCE_acquire_lock(int);
2724 int RCCE_release_lock(int);
2725 double RCCE_wtime(void);
2726 int RCCE_ue(void);
2727
// This function needs to be defined in the driver
// or linked in somehow by the user.
// It is just RCCE_barrier( &RCCE_COMM_WORLD ),
// but we can't implement it here because we're
// trying not to link in RCCE.h.
2733 void Synch_all();
2734
2735
typedef struct FLASH_Queue_variables
{
  // Queue of all the tasks, indexed by task order.
  FLASH_Task** task_queue;

  // Remaining unsatisfied-dependence count for each task, indexed by task
  // order; allocated in shared memory via FLA_shmalloc.
  // NOTE(review): the original comment here duplicated the wait_queue
  // comment; corrected based on how n_ready is read and written in
  // FLASH_Queue_init_tasks and FLASH_Task_update_dependencies.
  int* n_ready;

  // The waiting queue of ready tasks (stores task orders); a single global
  // queue shared by all cores.
  int* wait_queue;

  // The number of tasks on waiting queue.
  int* n_wait;

  // A global task counter that keeps track of how many tasks on the waiting
  // queue have been processed.
  int* pc;
} FLASH_Queue_vars;
2754
2755
void FLASH_Queue_exec( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec

   SCC entry point: sets up the shared-memory scheduler state, runs all
   enqueued tasks via FLASH_Queue_exec_parallel_function, records the
   elapsed wall-clock time, then frees all tasks and scheduler state and
   resets the queue.  Returns immediately if no tasks are enqueued.

----------------------------------------------------------------------------*/
{
  int n_tasks = FLASH_Queue_get_num_tasks();
  int i;
  double dtime;

  // All the necessary variables for the SuperMatrix mechanism.
  FLASH_Queue_vars args;

  // If the queue is empty, return early.
  if ( n_tasks == 0 )
    return;

  // Turn off all multiple queue implementations.
  FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
  FLASH_Queue_set_work_stealing( FALSE );
  // Do not use cache affinity yet.
  FLASH_Queue_set_caching( FALSE );

  // Allocate memory for task queues.  The task_queue itself is private to
  // this core; the counters and wait queue live in shared memory so all
  // cores see them.
  args.task_queue = ( FLASH_Task** ) FLA_malloc( n_tasks * sizeof( FLASH_Task* ) );
  args.n_ready = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
  args.wait_queue = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
  args.n_wait = ( int* ) FLA_shmalloc( sizeof( int ) );
  args.pc = ( int* ) FLA_shmalloc( sizeof( int ) );

  // Initialize data.  Only the owner core initializes the shared counters.
  if ( FLA_is_owner() )
  {
    args.n_wait[0] = 0;
    args.pc[0] = 0;
  }

  // Barrier so no core proceeds before the shared state is initialized.
  Synch_all();

  // Initialize tasks with critical information.
  FLASH_Queue_init_tasks( ( void* ) &args );

  // Display verbose output before free all tasks.
  if ( FLASH_Queue_get_verbose_output() )
    FLASH_Queue_verbose_output();

  // Start timing the parallel execution.
  dtime = RCCE_wtime();

  FLASH_Queue_exec_parallel_function( ( void* ) &args );

  // End timing the parallel execution.
  dtime = RCCE_wtime() - dtime;
  FLASH_Queue_set_parallel_time( dtime );

  // Free all tasks sequentially.
  for ( i = 0; i < n_tasks; i++ )
    FLASH_Task_free( args.task_queue[i] );

  // Free data.
  FLA_free( args.task_queue );
  FLA_shfree( args.n_ready );
  FLA_shfree( args.wait_queue );
  FLA_shfree( args.n_wait );
  FLA_shfree( args.pc );

  // Reset values for next call to FLASH_Queue_exec().
  FLASH_Queue_reset();

  return;
}
2828
2829
void FLASH_Queue_init_tasks( void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_init_tasks

   Walks the task list (tail to head) to: record every task pointer in
   args->task_queue, force the single-queue implementation, compute each
   task's DAG height, and tally its dependence count.  The owner core then
   mirrors the dependence counts into shared memory and enqueues the
   initially ready tasks.

----------------------------------------------------------------------------*/
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int i, j;
  int n_tasks = FLASH_Queue_get_num_tasks();
  int n_ready = 0;                 // Count of tasks ready at startup.
  int height;
  FLASH_Task* t;
  FLASH_Dep* d;

  // Grab the tail of the task queue.  Walking backwards guarantees all
  // dependents (which appear later in program order) already have heights.
  t = FLASH_Queue_get_tail_task();

  for ( i = n_tasks - 1; i >= 0; i-- )
  {
    // Save all the task pointers.
    args->task_queue[i] = t;

    // Only use a single queue implementation.
    t->queue = 0;

    // Determine the height of each task in the DAG.
    height = 0;
    d = t->dep_arg_head;

    // Take the maximum height of dependent tasks.
    for ( j = 0; j < t->n_dep_args; j++ )
    {
      height = max( height, d->task->height );
      d = d->next_dep;
    }

    t->height = height + 1;

    // Since freeing a task is always a leaf, we want to force it to execute
    // earlier by giving it a greater height in order to reclaim memory.
    if ( t->func == (void *) FLA_Obj_free_buffer_task )
      t->height += n_tasks;

    // Find all ready tasks.  The total dependence count is the sum over all
    // argument categories.
    t->n_ready += t->n_input_args + t->n_output_args +
                  t->n_macro_args + t->n_war_args;

    if ( t->n_ready == 0 )
    {
      // Save the number of ready and available tasks.
      n_ready++;
    }

    if ( FLA_is_owner() )
    {
      // Record all the ready values.  Shared memory holds the live counts
      // that all cores decrement during execution.
      args->n_ready[i] = t->n_ready;
    }

    // Go to the previous task.
    t = t->prev_task;
  }

  // Only allow the first core to enqueue the initial ready tasks.
  if ( !FLA_is_owner() )
    return;

  // Grab the head of the task queue.
  t = FLASH_Queue_get_head_task();

  // Stop early once every initially ready task has been enqueued.
  for ( i = 0; i < n_tasks && n_ready > 0; i++ )
  {
    if ( t->n_ready == 0 )
    {
      // Lock 0 guards the shared waiting queue.
      RCCE_acquire_lock( 0 );

      // Enqueue all the ready and available tasks.
      FLASH_Queue_wait_enqueue( t, arg );

      RCCE_release_lock( 0 );

      // Decrement the number of ready tasks left to be enqueued.
      n_ready--;
    }

    // Go to the next task.
    t = t->next_task;
  }

  return;
}
2922
2923
void FLASH_Queue_wait_enqueue( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_wait_enqueue

   Appends task t (by its order index) to the shared waiting queue.  When
   sorting is enabled, inserts it so that taller tasks sit nearer the head
   (insertion sort); the caller must hold lock 0.

----------------------------------------------------------------------------*/
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int head = args->pc[0];
  int slot = args->n_wait[0] + head;

  // Insertion sort: shift entries whose height does not exceed the new
  // task's height one position toward the tail, opening a slot for t.
  if ( FLASH_Queue_get_sorting() )
  {
    int t_height = args->task_queue[t->order]->height;

    while ( slot > head &&
            args->task_queue[args->wait_queue[slot-1]]->height <= t_height )
    {
      args->wait_queue[slot] = args->wait_queue[slot-1];
      slot--;
    }
  }

  args->wait_queue[slot] = t->order;

  // Increment number of tasks on waiting queue.
  args->n_wait[0]++;

  return;
}
2954
2955
FLASH_Queue_wait_dequeue(int queue,int cache,void * arg)2956 FLASH_Task* FLASH_Queue_wait_dequeue( int queue, int cache, void* arg )
2957 /*----------------------------------------------------------------------------
2958
2959 FLASH_Queue_wait_dequeue
2960
2961 ----------------------------------------------------------------------------*/
2962 {
2963 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2964 FLASH_Task* t = NULL;
2965
2966 if ( args->n_wait[0] > 0 )
2967 {
2968 // Grab the head of the queue.
2969 t = args->task_queue[args->wait_queue[args->pc[0]]];
2970
2971 // Decrement number of tasks on waiting queue.
2972 args->n_wait[0]--;
2973
2974 // Increment the program counter.
2975 args->pc[0]++;
2976 }
2977
2978 return t;
2979 }
2980
2981
FLASH_Queue_exec_parallel_function(void * arg)2982 void* FLASH_Queue_exec_parallel_function( void* arg )
2983 /*----------------------------------------------------------------------------
2984
2985 FLASH_Queue_exec_parallel_function
2986
2987 ----------------------------------------------------------------------------*/
2988 {
2989 FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
2990 int i = RCCE_ue();
2991 int queue = 0;
2992 int cache = 0;
2993 int n_tasks = FLASH_Queue_get_num_tasks();
2994 int n_threads = FLASH_Queue_get_num_threads();
2995 FLA_Bool condition;
2996 FLA_Bool available;
2997 FLASH_Task* t = NULL;
2998
2999 // Do not let extraneous cores execute.
3000 if ( i < n_threads )
3001 condition = TRUE;
3002 else
3003 condition = FALSE;
3004
3005 // Loop until all the tasks have committed.
3006 while ( condition )
3007 {
3008 RCCE_acquire_lock( 0 );
3009
3010 // Obtain task to execute.
3011 t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
3012
3013 RCCE_release_lock( 0 );
3014
3015 // Dequeued a task from the waiting queue.
3016 available = ( t != NULL );
3017
3018 if ( available )
3019 {
3020 // Save the thread and cache that executes the task.
3021 t->thread = i;
3022 t->cache = cache;
3023
3024 // Execute the task.
3025 FLASH_Queue_exec_task( t );
3026
3027 // Update task dependencies.
3028 FLASH_Task_update_dependencies( t, ( void* ) args );
3029 }
3030
3031 RCCE_acquire_lock( 0 );
3032
3033 // Terminate loop.
3034 if ( args->pc[0] >= n_tasks )
3035 condition = FALSE;
3036
3037 RCCE_release_lock( 0 );
3038 }
3039
3040 return ( void* ) NULL;
3041 }
3042
3043
FLASH_Task* FLASH_Task_update_dependencies( FLASH_Task* t, void* arg )
/*----------------------------------------------------------------------------

   FLASH_Task_update_dependencies

   SCC variant: decrements the shared-memory ready counter of every task
   that depends on t and enqueues the ones that become ready.  This
   implementation never binds a follow-on task, so it always returns NULL.

----------------------------------------------------------------------------*/
{
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  int        n_threads = FLASH_Queue_get_num_threads();
  int        i;
  FLASH_Dep* d = t->dep_arg_head;

  // Check each dependent task.
  for ( i = 0; i < t->n_dep_args; i++, d = d->next_dep )
  {
    FLASH_Task* succ = d->task;
    int         lock;
    FLA_Bool    ready;

    // Stripe the dependence counters over locks 1..n_threads-1; lock 0 is
    // reserved for the waiting queue.
    lock = ( n_threads > 1 ? succ->order % ( n_threads - 1 ) + 1 : 0 );

    RCCE_acquire_lock( lock );

    args->n_ready[succ->order]--;
    ready = ( args->n_ready[succ->order] == 0 );

    RCCE_release_lock( lock );

    // Place newly ready tasks on waiting queue.
    if ( ready )
    {
      RCCE_acquire_lock( 0 );

      FLASH_Queue_wait_enqueue( succ, arg );

      RCCE_release_lock( 0 );
    }
  }

  return NULL;
}
3091
3092 #endif // FLA_ENABLE_SCC
3093
3094 #endif // FLA_ENABLE_SUPERMATRIX
3095