/*
 * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/* clang-format off */

/* mmulLOG4.c -- F90 MATMUL intrinsics for logical*4 type */

#include "stdioInterf.h"
#include "fioMacros.h"
24 
ENTF90(MATMUL_LOG4,matmul_log4)25 void ENTF90(MATMUL_LOG4, matmul_log4)(char *dest_addr, char *s1_addr,
26                                       char *s2_addr, F90_Desc *dest_desc,
27                                       F90_Desc *s1_desc, F90_Desc *s2_desc)
28 {
29 
30   __LOG4_T *s1_base;
31   __LOG4_T *s2_base;
32   __LOG4_T *dest_base;
33   __LOG4_T *d_elem_p;
34   __LOG4_T *s1_elem_p;
35   __LOG4_T *s2_elem_p;
36 
37   __INT_T s1_d1_lstride;
38   __INT_T s1_d1_sstride;
39   __INT_T s1_d1_lb;
40   __INT_T s1_d1_soffset = 0;
41 
42   __INT_T s1_d2_lstride = 1;
43   __INT_T s1_d2_sstride = 1;
44   __INT_T s1_d2_lb = 0;
45   __INT_T s1_d2_soffset = 0;
46 
47   __INT_T s2_d1_lstride;
48   __INT_T s2_d1_sstride;
49   __INT_T s2_d1_lb;
50   __INT_T s2_d1_soffset = 0;
51 
52   __INT_T s2_d2_lstride = 1;
53   __INT_T s2_d2_sstride = 1;
54   __INT_T s2_d2_lb = 0;
55   __INT_T s2_d2_soffset = 0;
56 
57   __INT_T d_d1_lstride;
58   __INT_T d_d1_sstride;
59   __INT_T d_d1_lb;
60   __INT_T d_d1_soffset = 0;
61 
62   __INT_T d_d2_lstride = 1;
63   __INT_T d_d2_sstride = 1;
64   __INT_T d_d2_lb = 0;
65   __INT_T d_d2_soffset = 0;
66 
67   __INT_T d_rank = F90_RANK_G(dest_desc);
68   __INT_T s1_rank = F90_RANK_G(s1_desc);
69   __INT_T s2_rank = F90_RANK_G(s2_desc);
70 
71   __INT_T k_extent = s2_rank == 2 ? F90_DIM_EXTENT_G(s2_desc, 1) : 1;
72   __INT_T m_extent = s1_rank == 2 ? F90_DIM_EXTENT_G(s1_desc, 1)
73                                   : F90_DIM_EXTENT_G(s1_desc, 0);
74   __INT_T n_extent = s1_rank == 2 ? F90_DIM_EXTENT_G(s1_desc, 0) : 1;
75 
76   __INT_T dest_offset;
77 
78   __INT_T s1_d1_base, s1_d1_delta, s1_d1_offset, s1_d2_base, s1_d2_delta,
79       s1_d2_offset, s2_d1_base, s2_d1_delta, s2_d1_offset, s2_d2_base,
80       s2_d2_delta, s2_d2_offset, d_d1_base, d_d1_delta, d_d1_offset, d_d2_base,
81       d_d2_delta, d_d2_offset;
82 
83   __INT_T k;
84   __INT_T m;
85   __INT_T n;
86 
87   /* mxm
88    *  s1(n,m) x s2(m,k) -> dest(n,k)
89    *  Check
90    *   dest_d1 extent== n_extnet
91    *   dest_d2 extent == k_extent
92    *   s2_d1 extent = m_extent
93    *
94    * mxv
95    *  s1(n,m) x s2(m) -> dest(n)
96    *  Check
97    *   dest_d1 extent== n_extent
98    *   s2_d1 extent == m_extent
99    *
100    * vxm
101    *  s1(m) x s2(m,k) -> dest(k)
102    *  check
103    *   s2_d1 extent == m_extent
104    *   dest_d1 extent == k_extent
105    */
106 
107   if (d_rank == 2 && s1_rank == 2 && s2_rank == 2) {
108     if (F90_DIM_EXTENT_G(dest_desc, 0) != n_extent ||
109         F90_DIM_EXTENT_G(dest_desc, 1) != k_extent ||
110         F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
111       __fort_abort("MATMUL: nonconforming array shapes");
112     }
113   } else if (d_rank == 1 && s1_rank == 2 && s2_rank == 1) {
114     if (F90_DIM_EXTENT_G(dest_desc, 0) != n_extent ||
115         F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
116       __fort_abort("MATMUL: nonconforming array shapes");
117     }
118   } else if (d_rank == 1 && s1_rank == 1 && s2_rank == 2) {
119     if (F90_DIM_EXTENT_G(dest_desc, 0) != k_extent ||
120         F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
121       __fort_abort("MATMUL: nonconforming array shapes");
122     }
123   } else {
124     __fort_abort("MATMUL: non-conforming array shapes");
125   }
126 
127   s1_d1_lstride = F90_DIM_LSTRIDE_G(s1_desc, 0);
128   s1_d1_sstride = F90_DIM_SSTRIDE_G(s1_desc, 0);
129   s1_d1_lb = F90_DIM_LBOUND_G(s1_desc, 0);
130   if (s1_d1_sstride != 1 || F90_DIM_SOFFSET_G(s1_desc, 0))
131     s1_d1_soffset = F90_DIM_SOFFSET_G(s1_desc, 0) + s1_d1_sstride - s1_d1_lb;
132 
133   if (s1_rank == 2) {
134     s1_d2_lstride = F90_DIM_LSTRIDE_G(s1_desc, 1);
135     s1_d2_lb = F90_DIM_LBOUND_G(s1_desc, 1);
136     s1_d2_sstride = F90_DIM_SSTRIDE_G(s1_desc, 1);
137     if (s1_d2_sstride != 1 || F90_DIM_SOFFSET_G(s1_desc, 1))
138       s1_d2_soffset = F90_DIM_SOFFSET_G(s1_desc, 1) + s1_d2_sstride - s1_d2_lb;
139   }
140 
141   s2_d1_lstride = F90_DIM_LSTRIDE_G(s2_desc, 0);
142   s2_d1_lb = F90_DIM_LBOUND_G(s2_desc, 0);
143   s2_d1_sstride = F90_DIM_SSTRIDE_G(s2_desc, 0);
144   if (s2_d1_sstride != 1 || F90_DIM_SOFFSET_G(s2_desc, 0))
145     s2_d1_soffset = F90_DIM_SOFFSET_G(s2_desc, 0) + s2_d1_sstride - s2_d1_lb;
146 
147   if (s2_rank == 2) {
148     s2_d2_lstride = F90_DIM_LSTRIDE_G(s2_desc, 1);
149     s2_d2_lb = F90_DIM_LBOUND_G(s2_desc, 1);
150     s2_d2_sstride = F90_DIM_SSTRIDE_G(s2_desc, 1);
151     if (s2_d2_sstride != 1 || F90_DIM_SOFFSET_G(s2_desc, 1))
152       s2_d2_soffset = F90_DIM_SOFFSET_G(s2_desc, 1) + s2_d2_sstride - s2_d2_lb;
153   }
154 
155   d_d1_lstride = F90_DIM_LSTRIDE_G(dest_desc, 0);
156   d_d1_lb = F90_DIM_LBOUND_G(dest_desc, 0);
157   d_d1_sstride = F90_DIM_SSTRIDE_G(dest_desc, 0);
158   if (d_d1_sstride != 1 || F90_DIM_SOFFSET_G(dest_desc, 0))
159     d_d1_soffset = F90_DIM_SOFFSET_G(dest_desc, 0) + d_d1_sstride - d_d1_lb;
160 
161   if (d_rank == 2) {
162     d_d2_lstride = F90_DIM_LSTRIDE_G(dest_desc, 1);
163     d_d2_lb = F90_DIM_LBOUND_G(dest_desc, 1);
164     d_d2_sstride = F90_DIM_SSTRIDE_G(dest_desc, 1);
165     if (d_d2_sstride != 1 || F90_DIM_SOFFSET_G(dest_desc, 1))
166       d_d2_soffset = F90_DIM_SOFFSET_G(dest_desc, 1) + d_d2_sstride - d_d2_lb;
167   }
168 
169   s1_base = (__LOG4_T *)s1_addr + F90_LBASE_G(s1_desc) +
170             s1_d1_lb * s1_d1_lstride + s1_d2_lb * s1_d2_lstride - 1;
171   s2_base = (__LOG4_T *)s2_addr + F90_LBASE_G(s2_desc) +
172             s2_d1_lb * s2_d1_lstride + s2_d2_lb * s2_d2_lstride - 1;
173   dest_base = (__LOG4_T *)dest_addr + F90_LBASE_G(dest_desc) +
174               d_d1_lb * d_d1_lstride + d_d2_lb * d_d2_lstride - 1;
175 
176   d_d1_offset = d_d1_base = d_d1_soffset * d_d1_lstride;
177   d_d1_delta = d_d1_sstride * d_d1_lstride;
178 
179   d_d2_offset = d_d2_base = d_d2_soffset * d_d2_lstride;
180   d_d2_delta = s1_rank == 2 ? d_d2_sstride * d_d2_lstride : d_d1_delta;
181 
182   s1_d1_offset = s1_d1_base = s1_d1_soffset * s1_d1_lstride;
183   s1_d1_delta = s1_d1_sstride * s1_d1_lstride;
184 
185   s1_d2_offset = s1_d2_base = s1_d2_soffset * s1_d2_lstride;
186   s1_d2_delta = s1_rank == 2 ? s1_d2_sstride * s1_d2_lstride : s1_d1_delta;
187 
188   s2_d1_offset = s2_d1_base = s2_d1_soffset * s2_d1_lstride;
189   s2_d1_delta = s2_d1_sstride * s2_d1_lstride;
190 
191   s2_d2_offset = s2_d2_base = s2_d2_soffset * s2_d2_lstride;
192   s2_d2_delta = s2_d2_sstride * s2_d2_lstride;
193 
194   if (s1_rank == 2) {
195     for (k = 0; k < k_extent; k++) {
196       d_elem_p = dest_base + d_d1_base + d_d2_offset;
197       d_d2_offset += d_d2_delta;
198       for (n = 0; n < n_extent; n++) {
199         *d_elem_p = 0;
200         d_elem_p += d_d1_delta;
201       }
202     }
203 
204     d_d2_offset = d_d2_base;
205     for (k = 0; k < k_extent; k++) {
206       s2_elem_p = s2_base + s2_d1_base + s2_d2_offset;
207       s2_d2_offset += s2_d2_delta;
208       s1_d2_offset = s1_d2_base;
209       for (m = 0; m < m_extent; m++) {
210         s1_elem_p = s1_base + s1_d1_base + s1_d2_offset;
211         s1_d2_offset += s1_d2_delta;
212         d_elem_p = dest_base + d_d1_base + d_d2_offset;
213         for (n = 0; n < n_extent; n++) {
214           if ((*s1_elem_p & GET_DIST_MASK_LOG4) &&
215               (*s2_elem_p & GET_DIST_MASK_LOG4)) {
216             *d_elem_p = GET_DIST_TRUE_LOG4;
217           }
218 
219           d_elem_p += d_d1_delta;
220           s1_elem_p += s1_d1_delta;
221         }
222         s2_elem_p += s2_d1_delta;
223       }
224       d_d2_offset += d_d2_delta;
225     }
226   } else {
227     __LOG4_T rslt_tmp;
228 
229     s1_base += s1_d1_base;
230     s2_base += s2_d1_soffset * s2_d1_lstride;
231     dest_offset = d_d1_base;
232     for (k = 0; k < k_extent; k++) {
233       s1_elem_p = s1_base;
234       s2_elem_p = s2_base + s2_d2_base;
235       rslt_tmp = 0;
236       for (m = 0; m < m_extent; m++) {
237         if ((*s1_elem_p & GET_DIST_MASK_LOG4) &&
238             (*s2_elem_p & GET_DIST_MASK_LOG4)) {
239           rslt_tmp = GET_DIST_TRUE_LOG4;
240         }
241 
242         s1_elem_p += s1_d1_delta;
243         s2_elem_p += s2_d1_delta;
244       }
245       *(dest_base + dest_offset) = rslt_tmp;
246       dest_offset += d_d1_delta;
247       s2_d2_base += s2_d2_delta;
248     }
249   }
250 }
251