#ifndef __CS_BLAS_H__
#define __CS_BLAS_H__

/*============================================================================
 * BLAS (Basic Linear Algebra Subroutine) functions
 *============================================================================*/

/*
  This file is part of Code_Saturne, a general-purpose CFD tool.

  Copyright (C) 1998-2021 EDF S.A.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later
  version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
  details.

  You should have received a copy of the GNU General Public License along with
  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
  Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/*----------------------------------------------------------------------------*/

#include "cs_defs.h"

/*----------------------------------------------------------------------------
 * External library headers
 *----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------
 *  Local headers
 *----------------------------------------------------------------------------*/

#include "cs_base.h"

/*----------------------------------------------------------------------------*/

BEGIN_C_DECLS

/*============================================================================
 * Macro definitions
 *============================================================================*/

/*============================================================================
 * Type definitions
 *============================================================================*/

/* BLAS reduction algorithm families */

typedef enum {

  CS_BLAS_REDUCE_SUPERBLOCK,
  CS_BLAS_REDUCE_KAHAN

} cs_blas_reduce_t;

/*============================================================================
 *  Public function prototypes
 *============================================================================*/

/*----------------------------------------------------------------------------*/
/*!
 * \brief Set the preferred BLAS reduction algorithm family.
 *
 * This may not be enforced for all algorithms, though it should at least
 * be enforced for the most general functions such as \ref cs_dot.
 *
 * \param[in]  mode   BLAS mode to use
 */
/*----------------------------------------------------------------------------*/

void
cs_blas_set_reduce_algorithm(cs_blas_reduce_t  mode);

/*----------------------------------------------------------------------------
 * Constant times a vector plus a vector: y <-- ax + y
 *
 * parameters:
 *   n <-- size of arrays x and y
 *   a <-- multiplier for x
 *   x <-- array of floating-point values
 *   y <-- array of floating-point values
 *----------------------------------------------------------------------------*/

void
cs_axpy(cs_lnum_t         n,
        double            a,
        const cs_real_t  *x,
        cs_real_t        *restrict y);

/*----------------------------------------------------------------------------
 * Return the sum of a vector. For better precision, a superblock algorithm
 * is used.
 *
 * parameters:
 *   n <-- size of array x
 *   x <-- array of floating-point values
 *
 * returns:
 *   the resulting sum
 *----------------------------------------------------------------------------*/

double
cs_sum(cs_lnum_t         n,
       const cs_real_t  *x);

/*----------------------------------------------------------------------------
 * Return the weighted sum of a vector. For better precision, a superblock
 * algorithm is used.
 *
 * \param[in]  n  size of array x
 * \param[in]  w  array of floating-point weights
 * \param[in]  x  array of floating-point values
 *
 * \return the resulting weighted sum
 *----------------------------------------------------------------------------*/

double
cs_weighted_sum(cs_lnum_t         n,
                const cs_real_t  *w,
                const cs_real_t  *x);

/*----------------------------------------------------------------------------
 * Return the dot product of 2 vectors: x.y
 *
 * parameters:
 *   n <-- size of arrays x and y
 *   x <-- array of floating-point values
 *   y <-- array of floating-point values
 *
 * returns:
 *   dot product
 *----------------------------------------------------------------------------*/

double
cs_dot(cs_lnum_t         n,
       const cs_real_t  *x,
       const cs_real_t  *y);

/*----------------------------------------------------------------------------
 * Return dot products of a vector with itself: x.x
 *
 * parameters:
 *   n  <-- size of arrays x
 *   x  <-- array of floating-point values
 *
 * returns:
 *   dot product
 *----------------------------------------------------------------------------*/

double
cs_dot_xx(cs_lnum_t         n,
          const cs_real_t  *x);

/*----------------------------------------------------------------------------
 * Return weighted dot products of a vector with itself: x.x
 *
 * For better precision, a superblock algorithm is used.
 *
 * parameters:
 *   n  <-- size of arrays x
 *   w  <-- array of weights
 *   x  <-- array of floating-point values
 *
 * returns:
 *   dot product
 *----------------------------------------------------------------------------*/

double
cs_dot_wxx(cs_lnum_t         n,
           const cs_real_t  *w,
           const cs_real_t  *x);

/*----------------------------------------------------------------------------
 * Return the double dot product of 2 vectors: x.x, and x.y
 *
 * The products could be computed separately, but computing them
 * simultaneously adds more optimization opportunities and possibly better
 * cache behavior.
 *
 * parameters:
 *   n  <-- size of arrays x and y
 *   x  <-- array of floating-point values
 *   y  <-- array of floating-point values
 *   xx --> x.x dot product
 *   xy --> x.y dot product
 *----------------------------------------------------------------------------*/

void
cs_dot_xx_xy(cs_lnum_t                    n,
             const cs_real_t  *restrict   x,
             const cs_real_t  *restrict   y,
             double                      *xx,
             double                      *xy);

/*----------------------------------------------------------------------------
 * Return the double dot product of 3 vectors: x.y, and y.z
 *
 * The products could be computed separately, but computing them
 * simultaneously adds more optimization opportunities and possibly better
 * cache behavior.
 *
 * parameters:
 *   n  <-- size of arrays x and y
 *   x  <-- array of floating-point values
 *   y  <-- array of floating-point values
 *   z  <-- array of floating-point values
 *   xy --> x.y dot product
 *   yz --> x.z dot product
 *----------------------------------------------------------------------------*/

void
cs_dot_xy_yz(cs_lnum_t                    n,
             const cs_real_t  *restrict   x,
             const cs_real_t  *restrict   y,
             const cs_real_t  *restrict   z,
             double                      *xx,
             double                      *xy);

/*----------------------------------------------------------------------------
 * Return 3 dot products of 3 vectors: x.x, x.y, and y.z
 *
 * The products could be computed separately, but computing them
 * simultaneously adds more optimization opportunities and possibly better
 * cache behavior.
 *
 * parameters:
 *   n  <-- size of arrays x and y
 *   x  <-- array of floating-point values
 *   y  <-- array of floating-point values
 *   z  <-- array of floating-point values
 *   xx --> x.y dot product
 *   xy --> x.y dot product
 *   yz --> y.z dot product
 *----------------------------------------------------------------------------*/

void
cs_dot_xx_xy_yz(cs_lnum_t                    n,
                const cs_real_t  *restrict   x,
                const cs_real_t  *restrict   y,
                const cs_real_t  *restrict   z,
                double                      *xx,
                double                      *xy,
                double                      *yz);

/*----------------------------------------------------------------------------
 * Return 5 dot products of 3 vectors: x.x, y.y, x.y, x.z, and y.z
 *
 * The products could be computed separately, but computing them
 * simultaneously adds more optimization opportunities and possibly better
 * cache behavior.
 *
 * parameters:
 *   n  <-- size of arrays x and y
 *   x  <-- array of floating-point values
 *   y  <-- array of floating-point values
 *   z  <-- array of floating-point values
 *   xx --> x.y dot product
 *   yy --> y.y dot product
 *   xy --> x.y dot product
 *   xz --> x.z dot product
 *   yz --> y.z dot product
 *----------------------------------------------------------------------------*/

void
cs_dot_xx_yy_xy_xz_yz(cs_lnum_t                    n,
                      const cs_real_t  *restrict   x,
                      const cs_real_t  *restrict   y,
                      const cs_real_t  *restrict   z,
                      double                      *xx,
                      double                      *yy,
                      double                      *xy,
                      double                      *xz,
                      double                      *yz);

/*----------------------------------------------------------------------------
 * Return the global dot product of 2 vectors: x.y
 *
 * In parallel mode, the local results are summed on the default
 * global communicator.
 *
 * parameters:
 *   n <-- size of arrays x and y
 *   x <-- array of floating-point values
 *   y <-- array of floating-point values
 *
 * returns:
 *   dot product
 *----------------------------------------------------------------------------*/

double
cs_gdot(cs_lnum_t         n,
        const cs_real_t  *x,
        const cs_real_t  *y);

/*----------------------------------------------------------------------------
 * Return the global residual of 2 intensive vectors:
 *  1/sum(vol) . sum(vol.x.y)
 *
 * parameters:
 *   n   <-- size of arrays x and y
 *   vol <-- array of floating-point values
 *   x   <-- array of floating-point values
 *   y   <-- array of floating-point values
 *
 * returns:
 *   global residual
 *----------------------------------------------------------------------------*/

double
cs_gres(cs_lnum_t         n,
        const cs_real_t  *vol,
        const cs_real_t  *x,
        const cs_real_t  *y);

/*----------------------------------------------------------------------------
 * Return the global volume weighted mean
 *  1/sum(vol) . sum(vol.x)
 *
 * parameters:
 *   n   <-- size of arrays x
 *   vol <-- array of floating-point values
 *   x   <-- array of floating-point values
 *
 * returns:
 *   global residual
 *----------------------------------------------------------------------------*/

double
cs_gmean(cs_lnum_t         n,
         const cs_real_t  *vol,
         const cs_real_t  *x);

/*----------------------------------------------------------------------------*/

END_C_DECLS

#endif /* __CS_BLAS_H__ */