/*
 * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/* clang-format off */

/* mmulLOG4.c -- F90 MATMUL intrinsic for LOGICAL*4 operands.
 *
 * Implements the Fortran MATMUL semantics for logical arrays:
 *   dest(n,k) = ANY(s1(n,:) .AND. s2(:,k))
 * i.e. "multiply" is .AND. and "sum" is .OR. over the contracted
 * dimension.  Truth of an element is tested by masking with
 * GET_DIST_MASK_LOG4; a true result is stored as GET_DIST_TRUE_LOG4.
 */

#include "stdioInterf.h"
#include "fioMacros.h"

/*
 * matmul_log4 - runtime entry point for MATMUL on LOGICAL*4 arrays.
 *
 * dest_addr/s1_addr/s2_addr: raw base addresses of the result and the
 *   two operands.  dest_desc/s1_desc/s2_desc: the corresponding F90
 *   array descriptors, from which rank, extents, bounds, strides and
 *   scalar-subscript offsets are read via the F90_*_G accessor macros.
 *
 * Supported shape combinations (checked below; any other combination
 * aborts):
 *   matrix x matrix: s1(n,m) x s2(m,k) -> dest(n,k)
 *   matrix x vector: s1(n,m) x s2(m)   -> dest(n)
 *   vector x matrix: s1(m)   x s2(m,k) -> dest(k)
 */
void ENTF90(MATMUL_LOG4, matmul_log4)(char *dest_addr, char *s1_addr,
                                      char *s2_addr, F90_Desc *dest_desc,
                                      F90_Desc *s1_desc, F90_Desc *s2_desc)
{

  /* Typed element pointers: fixed bases plus roving cursors used in the
   * loop nests below. */
  __LOG4_T *s1_base;
  __LOG4_T *s2_base;
  __LOG4_T *dest_base;
  __LOG4_T *d_elem_p;
  __LOG4_T *s1_elem_p;
  __LOG4_T *s2_elem_p;

  /* Per-dimension descriptor data for each array: linear stride
   * (lstride), section stride (sstride), lower bound (lb), and section
   * offset (soffset).  Dimension-2 fields default to "absent dimension"
   * values (stride 1, bound/offset 0) so the same address arithmetic
   * works for rank-1 operands. */
  __INT_T s1_d1_lstride;
  __INT_T s1_d1_sstride;
  __INT_T s1_d1_lb;
  __INT_T s1_d1_soffset = 0;

  __INT_T s1_d2_lstride = 1;
  __INT_T s1_d2_sstride = 1;
  __INT_T s1_d2_lb = 0;
  __INT_T s1_d2_soffset = 0;

  __INT_T s2_d1_lstride;
  __INT_T s2_d1_sstride;
  __INT_T s2_d1_lb;
  __INT_T s2_d1_soffset = 0;

  __INT_T s2_d2_lstride = 1;
  __INT_T s2_d2_sstride = 1;
  __INT_T s2_d2_lb = 0;
  __INT_T s2_d2_soffset = 0;

  __INT_T d_d1_lstride;
  __INT_T d_d1_sstride;
  __INT_T d_d1_lb;
  __INT_T d_d1_soffset = 0;

  __INT_T d_d2_lstride = 1;
  __INT_T d_d2_sstride = 1;
  __INT_T d_d2_lb = 0;
  __INT_T d_d2_soffset = 0;

  __INT_T d_rank = F90_RANK_G(dest_desc);
  __INT_T s1_rank = F90_RANK_G(s1_desc);
  __INT_T s2_rank = F90_RANK_G(s2_desc);

  /* Problem dimensions, derived from the operand descriptors:
   *   n = rows of s1 (1 when s1 is a vector),
   *   m = contracted dimension,
   *   k = columns of s2 (1 when s2 is a vector). */
  __INT_T k_extent = s2_rank == 2 ? F90_DIM_EXTENT_G(s2_desc, 1) : 1;
  __INT_T m_extent = s1_rank == 2 ? F90_DIM_EXTENT_G(s1_desc, 1)
                                  : F90_DIM_EXTENT_G(s1_desc, 0);
  __INT_T n_extent = s1_rank == 2 ? F90_DIM_EXTENT_G(s1_desc, 0) : 1;

  __INT_T dest_offset;

  /* For each dimension of each array: base = starting element offset,
   * delta = element-offset step per loop iteration, offset = running
   * cursor. */
  __INT_T s1_d1_base, s1_d1_delta, s1_d1_offset, s1_d2_base, s1_d2_delta,
      s1_d2_offset, s2_d1_base, s2_d1_delta, s2_d1_offset, s2_d2_base,
      s2_d2_delta, s2_d2_offset, d_d1_base, d_d1_delta, d_d1_offset,
      d_d2_base, d_d2_delta, d_d2_offset;

  __INT_T k;
  __INT_T m;
  __INT_T n;

  /* mxm
   * s1(n,m) x s2(m,k) -> dest(n,k)
   * Check
   *  dest_d1 extent== n_extnet
   *  dest_d2 extent == k_extent
   *  s2_d1 extent = m_extent
   *
   * mxv
   * s1(n,m) x s2(m) -> dest(n)
   * Check
   *  dest_d1 extent== n_extent
   *  s2_d1 extent == m_extent
   *
   * vxm
   * s1(m) x s2(m,k) -> dest(k)
   * check
   *  s2_d1 extent == m_extent
   *  dest_d1 extent == k_extent
   */

  /* Conformance checks for the three legal rank combinations; any
   * mismatch (or an unsupported rank combination) is fatal. */
  if (d_rank == 2 && s1_rank == 2 && s2_rank == 2) {
    if (F90_DIM_EXTENT_G(dest_desc, 0) != n_extent ||
        F90_DIM_EXTENT_G(dest_desc, 1) != k_extent ||
        F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
      __fort_abort("MATMUL: nonconforming array shapes");
    }
  } else if (d_rank == 1 && s1_rank == 2 && s2_rank == 1) {
    if (F90_DIM_EXTENT_G(dest_desc, 0) != n_extent ||
        F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
      __fort_abort("MATMUL: nonconforming array shapes");
    }
  } else if (d_rank == 1 && s1_rank == 1 && s2_rank == 2) {
    if (F90_DIM_EXTENT_G(dest_desc, 0) != k_extent ||
        F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
      __fort_abort("MATMUL: nonconforming array shapes");
    }
  } else {
    __fort_abort("MATMUL: non-conforming array shapes");
  }

  /* Pull stride/bound/offset data out of each descriptor.  The section
   * offset is only recomputed when the section stride is not 1 or a
   * nonzero section offset is present; otherwise the zero default set
   * above stands.  Dimension 2 is read only for rank-2 operands. */
  s1_d1_lstride = F90_DIM_LSTRIDE_G(s1_desc, 0);
  s1_d1_sstride = F90_DIM_SSTRIDE_G(s1_desc, 0);
  s1_d1_lb = F90_DIM_LBOUND_G(s1_desc, 0);
  if (s1_d1_sstride != 1 || F90_DIM_SOFFSET_G(s1_desc, 0))
    s1_d1_soffset = F90_DIM_SOFFSET_G(s1_desc, 0) + s1_d1_sstride - s1_d1_lb;

  if (s1_rank == 2) {
    s1_d2_lstride = F90_DIM_LSTRIDE_G(s1_desc, 1);
    s1_d2_lb = F90_DIM_LBOUND_G(s1_desc, 1);
    s1_d2_sstride = F90_DIM_SSTRIDE_G(s1_desc, 1);
    if (s1_d2_sstride != 1 || F90_DIM_SOFFSET_G(s1_desc, 1))
      s1_d2_soffset = F90_DIM_SOFFSET_G(s1_desc, 1) + s1_d2_sstride - s1_d2_lb;
  }

  s2_d1_lstride = F90_DIM_LSTRIDE_G(s2_desc, 0);
  s2_d1_lb = F90_DIM_LBOUND_G(s2_desc, 0);
  s2_d1_sstride = F90_DIM_SSTRIDE_G(s2_desc, 0);
  if (s2_d1_sstride != 1 || F90_DIM_SOFFSET_G(s2_desc, 0))
    s2_d1_soffset = F90_DIM_SOFFSET_G(s2_desc, 0) + s2_d1_sstride - s2_d1_lb;

  if (s2_rank == 2) {
    s2_d2_lstride = F90_DIM_LSTRIDE_G(s2_desc, 1);
    s2_d2_lb = F90_DIM_LBOUND_G(s2_desc, 1);
    s2_d2_sstride = F90_DIM_SSTRIDE_G(s2_desc, 1);
    if (s2_d2_sstride != 1 || F90_DIM_SOFFSET_G(s2_desc, 1))
      s2_d2_soffset = F90_DIM_SOFFSET_G(s2_desc, 1) + s2_d2_sstride - s2_d2_lb;
  }

  d_d1_lstride = F90_DIM_LSTRIDE_G(dest_desc, 0);
  d_d1_lb = F90_DIM_LBOUND_G(dest_desc, 0);
  d_d1_sstride = F90_DIM_SSTRIDE_G(dest_desc, 0);
  if (d_d1_sstride != 1 || F90_DIM_SOFFSET_G(dest_desc, 0))
    d_d1_soffset = F90_DIM_SOFFSET_G(dest_desc, 0) + d_d1_sstride - d_d1_lb;

  if (d_rank == 2) {
    d_d2_lstride = F90_DIM_LSTRIDE_G(dest_desc, 1);
    d_d2_lb = F90_DIM_LBOUND_G(dest_desc, 1);
    d_d2_sstride = F90_DIM_SSTRIDE_G(dest_desc, 1);
    if (d_d2_sstride != 1 || F90_DIM_SOFFSET_G(dest_desc, 1))
      d_d2_soffset = F90_DIM_SOFFSET_G(dest_desc, 1) + d_d2_sstride - d_d2_lb;
  }

  /* Element-typed base pointers: raw address plus the descriptor's
   * linear base, shifted by each dimension's lower bound times its
   * linear stride, minus 1 (descriptor offsets are 1-based). */
  s1_base = (__LOG4_T *)s1_addr + F90_LBASE_G(s1_desc) +
            s1_d1_lb * s1_d1_lstride + s1_d2_lb * s1_d2_lstride - 1;
  s2_base = (__LOG4_T *)s2_addr + F90_LBASE_G(s2_desc) +
            s2_d1_lb * s2_d1_lstride + s2_d2_lb * s2_d2_lstride - 1;
  dest_base = (__LOG4_T *)dest_addr + F90_LBASE_G(dest_desc) +
              d_d1_lb * d_d1_lstride + d_d2_lb * d_d2_lstride - 1;

  /* Flatten section offsets/strides into element-offset bases and
   * per-iteration deltas.  When s1 is a vector the "dimension 2" deltas
   * of dest and s1 fall back to their dimension-1 deltas. */
  d_d1_offset = d_d1_base = d_d1_soffset * d_d1_lstride;
  d_d1_delta = d_d1_sstride * d_d1_lstride;
  d_d2_offset = d_d2_base = d_d2_soffset * d_d2_lstride;
  d_d2_delta = s1_rank == 2 ? d_d2_sstride * d_d2_lstride : d_d1_delta;

  s1_d1_offset = s1_d1_base = s1_d1_soffset * s1_d1_lstride;
  s1_d1_delta = s1_d1_sstride * s1_d1_lstride;
  s1_d2_offset = s1_d2_base = s1_d2_soffset * s1_d2_lstride;
  s1_d2_delta = s1_rank == 2 ? s1_d2_sstride * s1_d2_lstride : s1_d1_delta;

  s2_d1_offset = s2_d1_base = s2_d1_soffset * s2_d1_lstride;
  s2_d1_delta = s2_d1_sstride * s2_d1_lstride;
  s2_d2_offset = s2_d2_base = s2_d2_soffset * s2_d2_lstride;
  s2_d2_delta = s2_d2_sstride * s2_d2_lstride;

  if (s1_rank == 2) {
    /* mxm and mxv (k_extent == 1 for mxv).
     * First pass: clear the whole destination to false (0).  Note
     * d_d2_offset is advanced here and reset to d_d2_base afterwards. */
    for (k = 0; k < k_extent; k++) {
      d_elem_p = dest_base + d_d1_base + d_d2_offset;
      d_d2_offset += d_d2_delta;
      for (n = 0; n < n_extent; n++) {
        *d_elem_p = 0;
        d_elem_p += d_d1_delta;
      }
    }
    d_d2_offset = d_d2_base;

    /* Accumulation pass, loop order k (dest/s2 column), m (contracted),
     * n (dest/s1 row): whenever both operand elements test true, set
     * the destination element true.  Once-true elements stay true,
     * giving the OR-reduction over m. */
    for (k = 0; k < k_extent; k++) {
      s2_elem_p = s2_base + s2_d1_base + s2_d2_offset;
      s2_d2_offset += s2_d2_delta;
      s1_d2_offset = s1_d2_base;
      for (m = 0; m < m_extent; m++) {
        s1_elem_p = s1_base + s1_d1_base + s1_d2_offset;
        s1_d2_offset += s1_d2_delta;
        d_elem_p = dest_base + d_d1_base + d_d2_offset;
        for (n = 0; n < n_extent; n++) {
          if ((*s1_elem_p & GET_DIST_MASK_LOG4) &&
              (*s2_elem_p & GET_DIST_MASK_LOG4)) {
            *d_elem_p = GET_DIST_TRUE_LOG4;
          }
          d_elem_p += d_d1_delta;
          s1_elem_p += s1_d1_delta;
        }
        /* Next element of s2's contracted dimension. */
        s2_elem_p += s2_d1_delta;
      }
      /* Next destination column. */
      d_d2_offset += d_d2_delta;
    }
  } else {
    /* vxm: s1 is a vector; dest(k) = ANY(s1(:) .AND. s2(:,k)).
     * Each result element is reduced into rslt_tmp before a single
     * store.  s2_d2_base walks s2's columns across the k loop. */
    __LOG4_T rslt_tmp;

    s1_base += s1_d1_base;
    s2_base += s2_d1_soffset * s2_d1_lstride;
    dest_offset = d_d1_base;
    for (k = 0; k < k_extent; k++) {
      s1_elem_p = s1_base;
      s2_elem_p = s2_base + s2_d2_base;
      rslt_tmp = 0;
      for (m = 0; m < m_extent; m++) {
        if ((*s1_elem_p & GET_DIST_MASK_LOG4) &&
            (*s2_elem_p & GET_DIST_MASK_LOG4)) {
          rslt_tmp = GET_DIST_TRUE_LOG4;
        }
        s1_elem_p += s1_d1_delta;
        s2_elem_p += s2_d1_delta;
      }
      *(dest_base + dest_offset) = rslt_tmp;
      dest_offset += d_d1_delta;
      s2_d2_base += s2_d2_delta;
    }
  }
}