/*

    Copyright (C) 2014, The University of Texas at Austin

    This file is part of libflame and is available under the 3-Clause
    BSD license, which can be found in the LICENSE file at the top-level
    directory, or at http://opensource.org/licenses/BSD-3-Clause

*/

#include "FLAME.h"


#ifdef FLA_ENABLE_GPU

#include "cublas.h"


static FLA_Bool flash_queue_enabled_gpu  = FALSE;
static dim_t    flash_queue_gpu_n_blocks = 128;


void FLASH_Queue_init_gpu( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_init_gpu

----------------------------------------------------------------------------*/
{
   cublasInit();

   return;
}


void FLASH_Queue_finalize_gpu( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_finalize_gpu

----------------------------------------------------------------------------*/
{
   cublasShutdown();

   return;
}


FLA_Error FLASH_Queue_enable_gpu( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_enable_gpu

----------------------------------------------------------------------------*/
{
   if ( FLASH_Queue_stack_depth() == 0 && FLASH_Queue_get_enabled() )
   {
      // Enable if not begin parallel region yet and SuperMatrix is enabled.
      flash_queue_enabled_gpu = TRUE;
      return FLA_SUCCESS;
   }
   else
   {
      // Cannot change status during parallel region.
      return FLA_FAILURE;
   }
}


FLA_Error FLASH_Queue_disable_gpu( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_disable_gpu

----------------------------------------------------------------------------*/
{
   if ( FLASH_Queue_stack_depth() == 0 )
   {
      // Disable if not begin parallel region yet.
      flash_queue_enabled_gpu = FALSE;
      return FLA_SUCCESS;
   }
   else
   {
      // Cannot change status during parallel region.
      return FLA_FAILURE;
   }
}


FLA_Bool FLASH_Queue_get_enabled_gpu( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_get_enabled_gpu

----------------------------------------------------------------------------*/
{
   // Return if SuperMatrix is enabled, but always false if not.
   if ( FLASH_Queue_get_enabled() )
      return flash_queue_enabled_gpu;
   else
      return FALSE;
}


void FLASH_Queue_set_gpu_num_blocks( dim_t n_blocks )
/*----------------------------------------------------------------------------

   FLASH_Queue_set_gpu_num_blocks

----------------------------------------------------------------------------*/
{
   flash_queue_gpu_n_blocks = n_blocks;

   return;
}


dim_t FLASH_Queue_get_gpu_num_blocks( void )
/*----------------------------------------------------------------------------

   FLASH_Queue_get_gpu_num_blocks

----------------------------------------------------------------------------*/
{
   return flash_queue_gpu_n_blocks;
}


// --- helper functions --- ===================================================


FLA_Error FLASH_Queue_bind_gpu( int thread )
/*----------------------------------------------------------------------------

   FLASH_Queue_bind_gpu

----------------------------------------------------------------------------*/
{
   // Bind a GPU to this thread.
   cudaSetDevice( thread );

   return FLA_SUCCESS;
}


FLA_Error FLASH_Queue_alloc_gpu( dim_t size,
                                 FLA_Datatype datatype,
                                 void** buffer_gpu )
/*----------------------------------------------------------------------------

   FLASH_Queue_alloc_gpu

----------------------------------------------------------------------------*/
{
   cublasStatus status;

   // Allocate memory for a block on GPU.
   status = cublasAlloc( size,
                         FLA_Obj_datatype_size( datatype ),
                         buffer_gpu );

   // Check to see if the allocation was successful.
   if ( status != CUBLAS_STATUS_SUCCESS )
      FLA_Check_error_code( FLA_MALLOC_GPU_RETURNED_NULL_POINTER );

   return FLA_SUCCESS;
}


FLA_Error FLASH_Queue_free_gpu( void* buffer_gpu )
/*----------------------------------------------------------------------------

   FLASH_Queue_free_gpu

----------------------------------------------------------------------------*/
{
   // Free memory for a block on GPU.
   cublasFree( buffer_gpu );

   return FLA_SUCCESS;
}


FLA_Error FLASH_Queue_write_gpu( FLA_Obj obj, void* buffer_gpu )
/*----------------------------------------------------------------------------

   FLASH_Queue_write_gpu

----------------------------------------------------------------------------*/
{
   // Write the contents of a block in main memory to GPU.
   cublasSetMatrix( FLA_Obj_length( obj ),
                    FLA_Obj_width( obj ),
                    FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ),
                    FLA_Obj_buffer_at_view( obj ), 
                    FLA_Obj_col_stride( obj ),
                    buffer_gpu,
                    FLA_Obj_length( obj ) );

   return FLA_SUCCESS;
}


FLA_Error FLASH_Queue_read_gpu( FLA_Obj obj, void* buffer_gpu )
/*----------------------------------------------------------------------------

   FLASH_Queue_read_gpu

----------------------------------------------------------------------------*/
{
   // Read the memory of a block on GPU to main memory.
   cublasGetMatrix( FLA_Obj_length( obj ),
                    FLA_Obj_width( obj ),
                    FLA_Obj_datatype_size( FLA_Obj_datatype( obj ) ),
                    buffer_gpu,
                    FLA_Obj_length( obj ),
                    FLA_Obj_buffer_at_view( obj ),
                    FLA_Obj_col_stride( obj ) );

   return FLA_SUCCESS;
}


void FLASH_Queue_exec_task_gpu( FLASH_Task* t, 
                                void** input_arg, 
                                void** output_arg )
/*----------------------------------------------------------------------------

   FLASH_Queue_exec_task_gpu

----------------------------------------------------------------------------*/
{
   // Define local function pointer types.

   // Level-3 BLAS
   typedef FLA_Error(*flash_gemm_gpu_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_hemm_gpu_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_herk_gpu_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_her2k_gpu_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_symm_gpu_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_syrk_gpu_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_syr2k_gpu_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_trmm_gpu_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj C, void* C_gpu);
   typedef FLA_Error(*flash_trsm_gpu_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj C, void* C_gpu);

   // Level-2 BLAS
   typedef FLA_Error(*flash_gemv_gpu_p)(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu, FLA_Obj beta, FLA_Obj y, void* y_gpu);
   typedef FLA_Error(*flash_trsv_gpu_p)(FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu);

   // Level-1 BLAS
   typedef FLA_Error(*flash_axpy_gpu_p)(FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu);
   typedef FLA_Error(*flash_copy_gpu_p)(FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu);
   typedef FLA_Error(*flash_scal_gpu_p)(FLA_Obj alpha, FLA_Obj A, void* A_gpu);
   typedef FLA_Error(*flash_scalr_gpu_p)(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu);

   // Only execute task if it is not NULL.
   if ( t == NULL )
      return;

   // Now "switch" between the various possible task functions.

   // FLA_Gemm
   if ( t->func == (void *) FLA_Gemm_task )
   {
      flash_gemm_gpu_p func;
      func = (flash_gemm_gpu_p) FLA_Gemm_external_gpu;
      
      func( ( FLA_Trans   ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->input_arg[1],
                            input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Hemm
   else if ( t->func == (void *) FLA_Hemm_task )
   {
      flash_hemm_gpu_p func;
      func = (flash_hemm_gpu_p) FLA_Hemm_external_gpu;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->input_arg[1],
                            input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Herk
   else if ( t->func == (void *) FLA_Herk_task )
   {
      flash_herk_gpu_p func;
      func = (flash_herk_gpu_p) FLA_Herk_external_gpu;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->fla_arg[1],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Her2k
   else if ( t->func == (void *) FLA_Her2k_task )
   {
      flash_her2k_gpu_p func;
      func = (flash_her2k_gpu_p) FLA_Her2k_external_gpu;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
                             t->fla_arg[0],
                             t->input_arg[0],
                             input_arg[0],
                             t->input_arg[1],
                             input_arg[1],
                             t->fla_arg[1],
                             t->output_arg[0],
                             output_arg[0] );
   }
   // FLA_Symm
   else if ( t->func == (void *) FLA_Symm_task )
   {
      flash_symm_gpu_p func;
      func = (flash_symm_gpu_p) FLA_Symm_external_gpu;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->input_arg[1],
                            input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Syrk
   else if ( t->func == (void *) FLA_Syrk_task )
   {
      flash_syrk_gpu_p func;
      func = (flash_syrk_gpu_p) FLA_Syrk_external_gpu;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->fla_arg[1],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Syr2k
   else if ( t->func == (void *) FLA_Syr2k_task )
   {
      flash_syr2k_gpu_p func;
      func = (flash_syr2k_gpu_p) FLA_Syr2k_external_gpu;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
                             t->fla_arg[0],
                             t->input_arg[0],
                             input_arg[0],
                             t->input_arg[1],
                             input_arg[1],
                             t->fla_arg[1],
                             t->output_arg[0],
                             output_arg[0] );
   }
   // FLA_Trmm
   else if ( t->func == (void *) FLA_Trmm_task )
   {
      flash_trmm_gpu_p func;
      func = (flash_trmm_gpu_p) FLA_Trmm_external_gpu;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
            ( FLA_Trans   ) t->int_arg[2],
            ( FLA_Diag    ) t->int_arg[3],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Trsm
   else if ( t->func == (void *) FLA_Trsm_task )
   {
      flash_trsm_gpu_p func;
      func = (flash_trsm_gpu_p) FLA_Trsm_external_gpu;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
            ( FLA_Trans   ) t->int_arg[2],
            ( FLA_Diag    ) t->int_arg[3],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Gemv
   else if ( t->func == (void *) FLA_Gemv_task )
   {
      flash_gemv_gpu_p func;
      func = (flash_gemv_gpu_p) FLA_Gemv_external_gpu;
      
      func( ( FLA_Trans   ) t->int_arg[0],
                            t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->input_arg[1],
                            input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Trsv
   else if ( t->func == (void *) FLA_Trsv_task )
   {
      flash_trsv_gpu_p func;
      func = (flash_trsv_gpu_p) FLA_Trsv_external_gpu;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
            ( FLA_Diag    ) t->int_arg[2],
                            t->input_arg[0],
                            input_arg[0],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Axpy
   else if ( t->func == (void *) FLA_Axpy_task )
   {
      flash_axpy_gpu_p func;
      func = (flash_axpy_gpu_p) FLA_Axpy_external_gpu;
         
      func(                 t->fla_arg[0],
                            t->input_arg[0],
                            input_arg[0],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Copy
   else if ( t->func == (void *) FLA_Copy_task )
   {
      flash_copy_gpu_p func;
      func = (flash_copy_gpu_p) FLA_Copy_external_gpu;
         
      func(                 t->input_arg[0],
                            input_arg[0],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Scal
   else if ( t->func == (void *) FLA_Scal_task )
   {
      flash_scal_gpu_p func;
      func = (flash_scal_gpu_p) FLA_Scal_external_gpu;

      func(                 t->fla_arg[0],
                            t->output_arg[0],
                            output_arg[0] );
   }
   // FLA_Scalr
   else if ( t->func == (void *) FLA_Scalr_task )
   {
      flash_scalr_gpu_p func;
      func = (flash_scalr_gpu_p) FLA_Scalr_external_gpu;

      func( ( FLA_Uplo    ) t->int_arg[0],
                            t->fla_arg[0],
                            t->output_arg[0],
                            output_arg[0] );
   }
   else
   {
      FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
   }

   return;
}


#endif // FLA_ENABLE_GPU