/*
 * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/* clang-format off */

/* mmreal4.c -- F90 fast-/dgemm-like MATMUL intrinsics for real*4 type */

#include "stdioInterf.h"
#include "fioMacros.h"

#define SMALL_ROWSA 10
#define SMALL_ROWSB 10
#define SMALL_COLSB 10

void ENTF90(MMUL_REAL4, mmul_real4)(int ta, int tb, __POINT_T mra,
                                    __POINT_T ncb, __POINT_T kab, float *alpha,
                                    float a[], __POINT_T lda, float b[],
                                    __POINT_T ldb, float *beta, float c[],
                                    __POINT_T ldc)

{
  /*
  *   Notes on parameters
  *   ta = 0 -> no transpose of matrix a
  *   tb = 0 -> no transpose of matrix b

  *   mra = number of rows in matrices a and c ( = m )
  *   ncb = number of columns in matrices b and c ( = n )
  *   kab = shared dimension of matrices a and b ( = k, but need k elsewhere )
  *   a = starting address of matrix a
  *   b = starting address of matrix b
  *   c = starting address of matric c
  *   lda = leading dimension of matrix a
  *   ldb = leading dimension of matrix b
  *   ldc = leading dimension of matrix c
  *   alpha = 1.0
  *   beta = 0.0
  *   Note that these last two conditions are inconsitent with the general
  *   case for dgemm.
  *   Taken together we have
  *   c = beta * c + alpha * ( (ta)a * (tb)*b )
  *   where the meaning of (ta) and (tb) is that if ta = 0 a is not transposed
  *   and transposed otherwise and if tb = 0, b is not transpose and transposed
  *   otherwise.
  */

  // Local variables

  int colsa, rowsa, rowsb, colsb;
  int ar, ac;
  int ndx, ndxsav, colchunk, colchunks, rowchunk, rowchunks;
  int colsb_chunks, colsb_end, colsb_strt;
  int bufr, bufc, loc, lor;
  int small_size = SMALL_ROWSA * SMALL_ROWSB * SMALL_COLSB;
  int tindex = 0;
  float buffera[SMALL_ROWSA * SMALL_ROWSB];
  float bufferb[SMALL_COLSB * SMALL_ROWSB];
  float temp;
  void ftn_mvmul_real4_(), ftn_vmmul_real4_();
  void ftn_mnaxnb_real4_(), ftn_mnaxtb_real4_();
  void ftn_mtaxnb_real4_(), ftn_mtaxtb_real4_();
  float calpha, cbeta;
  /*
   * Small matrix multiply variables
   */
  int i, ia, ja, j, k, bk;
  int astrt, bstrt, cstrt, andx, bndx, cndx, indx, indx_strt;
  /*
   * tindex has the following meaning:
   * ta == 0, tb == 0: tindex = 0
   * ta == 1, tb == 0: tindex = 1
   * ta == 0, tb == 1; tindex = 2
   * ta == 1, tb == 1; tindex = 3
   */

  /*  if( ( tb == 0 ) && ( ncb == 1 ) && ( ldc == 1 ) ){ */
  if ((tb == 0) && (ncb == 1)) {
    /* matrix vector multiply */
    ftn_mvmul_real4_(&ta, &mra, &kab, alpha, a, &lda, b, beta, c);
    return;
  }
  if ((ta == 0) && (mra == 1) && (ldc == 1)) {
    /* vector matrix multiply */
    ftn_vmmul_real4_(&tb, &ncb, &kab, alpha, a, b, &ldb, beta, c);
    return;
  }
  calpha = *alpha;
  cbeta = *beta;
  rowsa = mra;
  colsa = kab;
  rowsb = kab;
  colsb = ncb;
  if (ta == 1)
    tindex = 1;

  if (tb == 1)
    tindex += 2;

  // Check for really small matrix sizes

  // Check for really small matrix sizes

  if ((colsb <= SMALL_COLSB) && (rowsa <= SMALL_ROWSA) &&
      (rowsb <= SMALL_ROWSB)) {
    switch (tindex) {
    case 0: /* matrix a and matrix b normally oriented
             *
             * The notation here refers to the Fortran orientation since
             * that is the origination of these matrices
             */
      astrt = 0;
      bstrt = 0;
      cstrt = 0;
      if (cbeta == (float)0.0) {
        for (i = 0; i < rowsa; i++) {
          /* Transpose the a row of the a matrix */
          andx = astrt;
          indx = 0;
          for (ja = 0; ja < colsa; ja++) {
            buffera[indx++] = calpha * a[andx];
            andx += lda;
          }
          astrt++;
          cndx = cstrt;
          for (j = 0; j < colsb; j++) {
            temp = 0.0;
            bndx = bstrt;
            for (k = 0; k < rowsb; k++)
              temp += buffera[k] * b[bndx++];
            bstrt += ldb;
            c[cndx] = temp;
            cndx += ldc;
          }
          cstrt++; /* set index for next row of c */
          bstrt = 0;
        }
      } else {
        for (i = 0; i < rowsa; i++) {
          /* Transpose the a row of the a matrix */
          andx = astrt;
          indx = 0;
          for (ja = 0; ja < colsa; ja++) {
            buffera[indx++] = calpha * a[andx];
            andx += lda;
          }
          astrt++;
          cndx = cstrt;
          for (j = 0; j < colsb; j++) {
            temp = 0.0;
            bndx = bstrt;
            for (k = 0; k < rowsb; k++)
              temp += buffera[k] * b[bndx++];
            bstrt += ldb;
            c[cndx] = temp + cbeta * c[cndx];
            cndx += ldc;
          }
          cstrt++; /* set index for next row of c */
          bstrt = 0;
        }
      }

      break;
    case 1: /* matrix a transpose, matrix b normally oriented */
      bndx = 0;
      cstrt = 0;
      andx = 0;
      if (cbeta == (float)0.0) {
        for (j = 0; j < colsb; j++) {
          cndx = cstrt;
          for (i = 0; i < rowsa; i++) {
            /* Matrix a need not be transposed */
            temp = 0.0;
            for (k = 0; k < rowsb; k++)
              temp += a[andx + k] * b[bndx + k];
            c[cndx] = calpha * temp;
            andx += lda;
            cndx++;
          }
          cstrt += ldc; /* set index for next column of c */
          astrt++;      /* set index for next column of a */
          b += ldb;
          andx = 0;
        }
      } else {
        for (j = 0; j < colsb; j++) {
          cndx = cstrt;
          for (i = 0; i < rowsa; i++) {
            /* Matrix a need not be transposed */
            temp = 0.0;
            for (k = 0; k < rowsb; k++)
              temp += a[andx + k] * b[bndx + k];
            c[cndx] = calpha * temp + cbeta * c[cndx];
            andx += lda;
            cndx++;
          }
          cstrt += ldc; /* set index for next column of c */
          astrt++;      /* set index for next column of a */
          b += ldb;
          andx = 0;
        }
      }

      break;
    case 2: /* Matrix a normal, b transposed */
      /* We will transpose b and work with transposed rows of a */
      /* Transpose matrix b */
      indx_strt = 0;
      bstrt = 0;
      for (j = 0; j < rowsb; j++) {
        indx = indx_strt;
        bndx = bstrt;
        for (i = 0; i < colsb; i++) {
          bufferb[indx] = calpha * b[bndx++];
          indx += rowsb;
        }
        indx_strt++;
        bstrt += ldb;
      }
      /* All of b is now transposed */

      astrt = 0;
      cstrt = 0;
      if (cbeta == (float)0.0) {
        for (i = 0; i < rowsa; i++) {
          /* Transpose the a row of the a matrix */
          andx = astrt;
          indx = 0;
          for (ja = 0; ja < colsa; ja++) {
            buffera[indx++] = a[andx];
            andx += lda;
          }
          cndx = cstrt;
          bndx = 0;
          for (j = 0; j < colsb; j++) {
            temp = 0.0;
            for (k = 0; k < rowsb; k++)
              temp += buffera[k] * bufferb[bndx++];
            c[cndx] = temp;
            cndx += ldc;
          }
          cstrt++; /* set index for next row of c */
          astrt++;
        }
      } else {
        for (i = 0; i < rowsa; i++) {
          /* Transpose the a row of the a matrix */
          andx = astrt;
          indx = 0;
          for (ja = 0; ja < colsa; ja++) {
            buffera[indx++] = a[andx];
            andx += lda;
          }
          cndx = cstrt;
          bndx = 0;
          for (j = 0; j < colsb; j++) {
            temp = 0.0;
            for (k = 0; k < rowsb; k++)
              temp += buffera[k] * bufferb[bndx++];
            c[cndx] = temp + cbeta * c[cndx];
            cndx += ldc;
          }
          cstrt++; /* set index for next row of c */
          astrt++;
        }
      }
      break;
    case 3: /* both matrices tranposed. Combination of cases 1 and 2 */
      /* Transpose matrix b */

      indx_strt = 0;
      bstrt = 0;
      for (j = 0; j < rowsb; j++) {
        indx = indx_strt;
        bndx = bstrt;
        for (i = 0; i < colsb; i++) {
          bufferb[indx] = calpha * b[bndx++];
          indx += rowsb;
        }
        indx_strt++;
        bstrt += ldb;
      }

      /* All of b is now transposed */
      andx = 0;
      cstrt = 0;
      bndx = 0;
      if (cbeta == (float)0.0) {
        for (i = 0; i < colsb; i++) {
          /* Matrix a need not be transposed */
          cndx = cstrt;
          for (j = 0; j < rowsa; j++) {
            temp = 0.0;
            for (k = 0; k < rowsb; k++)
              temp += a[andx + k] * bufferb[bndx + k];
            c[cndx] = temp;
            cndx++;
            andx += lda;
          }
          bndx += rowsb; /* index for next transposed column of b */
          andx = 0;      /* set index for next column of a */
          cstrt += ldc;  /* set index for next row of c */
        }
      } else {
        for (i = 0; i < colsb; i++) {
          /* Matrix a need not be transposed */
          cndx = cstrt;
          for (j = 0; j < rowsa; j++) {
            temp = 0.0;
            for (k = 0; k < rowsb; k++)
              temp += a[andx + k] * bufferb[bndx + k];
            c[cndx] = temp + cbeta * c[cndx];
            cndx++;
            andx += lda;
          }
          bndx += rowsb; /* index for next transposed column of b */
          andx = 0;      /* set index for next column of a */
        }
      }
    }
  } else {
    switch (tindex) {
    case 0:
      ftn_mnaxnb_real4_(&mra, &ncb, &kab, alpha, a, &lda, b, &ldb, beta, c,
                          &ldc);
      break;
    case 1:
      ftn_mtaxnb_real4_(&mra, &ncb, &kab, alpha, a, &lda, b, &ldb, beta, c,
                          &ldc);
      break;
    case 2:
      ftn_mnaxtb_real4_(&mra, &ncb, &kab, alpha, a, &lda, b, &ldb, beta, c,
                          &ldc);
      break;
    case 3:
      ftn_mtaxtb_real4_(&mra, &ncb, &kab, alpha, a, &lda, b, &ldb, beta, c,
                          &ldc);
    }
  }

}