1 /*
2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
18 /* clang-format off */
19
20 /* mmulLOG4.c --F90 MATMUL intrinsics for logical*4 type */
21
22 #include "stdioInterf.h"
23 #include "fioMacros.h"
24
/* MATMUL intrinsic for logical*4 (__LOG4_T) operands.
 *
 * Computes dest = MATMUL(s1, s2) for logical data: a destination element
 * becomes GET_DIST_TRUE_LOG4 if, for any index m along the contracted
 * dimension, the corresponding s1 and s2 elements are both "true" (tested
 * by masking with GET_DIST_MASK_LOG4); otherwise it stays 0.  This is the
 * logical analogue of the sum-of-products in numeric MATMUL.
 *
 * Accepted shape combinations (anything else aborts):
 *   matrix x matrix: s1(n,m) x s2(m,k) -> dest(n,k)
 *   matrix x vector: s1(n,m) x s2(m)   -> dest(n)     (k_extent == 1)
 *   vector x matrix: s1(m)   x s2(m,k) -> dest(k)     (n_extent == 1)
 *
 * dest_addr/s1_addr/s2_addr are the raw array base addresses;
 * dest_desc/s1_desc/s2_desc are the corresponding F90 descriptors from
 * which per-dimension strides, bounds and section offsets are read.
 */
void ENTF90(MATMUL_LOG4, matmul_log4)(char *dest_addr, char *s1_addr,
                                      char *s2_addr, F90_Desc *dest_desc,
                                      F90_Desc *s1_desc, F90_Desc *s2_desc)
{

  /* Typed base pointers (set after descriptor decoding) and the walking
   * element pointers used inside the compute loops. */
  __LOG4_T *s1_base;
  __LOG4_T *s2_base;
  __LOG4_T *dest_base;
  __LOG4_T *d_elem_p;
  __LOG4_T *s1_elem_p;
  __LOG4_T *s2_elem_p;

  /* Per-dimension descriptor fields, one group per (array, dimension):
   *   lstride  - linear stride in elements (F90_DIM_LSTRIDE_G)
   *   sstride  - section stride (F90_DIM_SSTRIDE_G)
   *   lb       - lower bound (F90_DIM_LBOUND_G)
   *   soffset  - section offset; stays 0 for a non-sectioned dimension
   * Dimension-2 groups default to identity values (lstride=1, lb=0) so
   * rank-1 operands can share the rank-2 address arithmetic below. */
  __INT_T s1_d1_lstride;
  __INT_T s1_d1_sstride;
  __INT_T s1_d1_lb;
  __INT_T s1_d1_soffset = 0;

  __INT_T s1_d2_lstride = 1;
  __INT_T s1_d2_sstride = 1;
  __INT_T s1_d2_lb = 0;
  __INT_T s1_d2_soffset = 0;

  __INT_T s2_d1_lstride;
  __INT_T s2_d1_sstride;
  __INT_T s2_d1_lb;
  __INT_T s2_d1_soffset = 0;

  __INT_T s2_d2_lstride = 1;
  __INT_T s2_d2_sstride = 1;
  __INT_T s2_d2_lb = 0;
  __INT_T s2_d2_soffset = 0;

  __INT_T d_d1_lstride;
  __INT_T d_d1_sstride;
  __INT_T d_d1_lb;
  __INT_T d_d1_soffset = 0;

  __INT_T d_d2_lstride = 1;
  __INT_T d_d2_sstride = 1;
  __INT_T d_d2_lb = 0;
  __INT_T d_d2_soffset = 0;

  __INT_T d_rank = F90_RANK_G(dest_desc);
  __INT_T s1_rank = F90_RANK_G(s1_desc);
  __INT_T s2_rank = F90_RANK_G(s2_desc);

  /* Loop extents, derived so the three shape cases share one naming:
   *   n = rows of the result (1 for vector x matrix)
   *   m = contracted dimension
   *   k = columns of the result (1 for matrix x vector) */
  __INT_T k_extent = s2_rank == 2 ? F90_DIM_EXTENT_G(s2_desc, 1) : 1;
  __INT_T m_extent = s1_rank == 2 ? F90_DIM_EXTENT_G(s1_desc, 1)
                                  : F90_DIM_EXTENT_G(s1_desc, 0);
  __INT_T n_extent = s1_rank == 2 ? F90_DIM_EXTENT_G(s1_desc, 0) : 1;

  __INT_T dest_offset;

  /* base   - starting element offset for a dimension (soffset * lstride)
   * delta  - element-offset step per loop iteration (sstride * lstride)
   * offset - running offset, advanced by delta inside the loops */
  __INT_T s1_d1_base, s1_d1_delta, s1_d1_offset, s1_d2_base, s1_d2_delta,
      s1_d2_offset, s2_d1_base, s2_d1_delta, s2_d1_offset, s2_d2_base,
      s2_d2_delta, s2_d2_offset, d_d1_base, d_d1_delta, d_d1_offset, d_d2_base,
      d_d2_delta, d_d2_offset;

  __INT_T k;
  __INT_T m;
  __INT_T n;

  /* mxm
   * s1(n,m) x s2(m,k) -> dest(n,k)
   * Check
   *    dest_d1 extent == n_extent
   *    dest_d2 extent == k_extent
   *    s2_d1 extent == m_extent
   *
   * mxv
   * s1(n,m) x s2(m) -> dest(n)
   * Check
   *    dest_d1 extent == n_extent
   *    s2_d1 extent == m_extent
   *
   * vxm
   * s1(m) x s2(m,k) -> dest(k)
   * check
   *    s2_d1 extent == m_extent
   *    dest_d1 extent == k_extent
   */

  /* Shape conformance checks for the three supported rank combinations;
   * any other rank combination or extent mismatch is a fatal error. */
  if (d_rank == 2 && s1_rank == 2 && s2_rank == 2) {
    if (F90_DIM_EXTENT_G(dest_desc, 0) != n_extent ||
        F90_DIM_EXTENT_G(dest_desc, 1) != k_extent ||
        F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
      __fort_abort("MATMUL: nonconforming array shapes");
    }
  } else if (d_rank == 1 && s1_rank == 2 && s2_rank == 1) {
    if (F90_DIM_EXTENT_G(dest_desc, 0) != n_extent ||
        F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
      __fort_abort("MATMUL: nonconforming array shapes");
    }
  } else if (d_rank == 1 && s1_rank == 1 && s2_rank == 2) {
    if (F90_DIM_EXTENT_G(dest_desc, 0) != k_extent ||
        F90_DIM_EXTENT_G(s2_desc, 0) != m_extent) {
      __fort_abort("MATMUL: nonconforming array shapes");
    }
  } else {
    __fort_abort("MATMUL: non-conforming array shapes");
  }

  /* Decode the descriptors.  The soffset adjustment is applied only when
   * the dimension is a nontrivial section (section stride != 1 or a
   * nonzero section offset); the formula folds the lower bound in so the
   * base-pointer computation below can subtract lb uniformly.
   * NOTE(review): formula taken on faith from the descriptor layout —
   * same pattern repeats for every dimension of all three arrays. */
  s1_d1_lstride = F90_DIM_LSTRIDE_G(s1_desc, 0);
  s1_d1_sstride = F90_DIM_SSTRIDE_G(s1_desc, 0);
  s1_d1_lb = F90_DIM_LBOUND_G(s1_desc, 0);
  if (s1_d1_sstride != 1 || F90_DIM_SOFFSET_G(s1_desc, 0))
    s1_d1_soffset = F90_DIM_SOFFSET_G(s1_desc, 0) + s1_d1_sstride - s1_d1_lb;

  if (s1_rank == 2) {
    s1_d2_lstride = F90_DIM_LSTRIDE_G(s1_desc, 1);
    s1_d2_lb = F90_DIM_LBOUND_G(s1_desc, 1);
    s1_d2_sstride = F90_DIM_SSTRIDE_G(s1_desc, 1);
    if (s1_d2_sstride != 1 || F90_DIM_SOFFSET_G(s1_desc, 1))
      s1_d2_soffset = F90_DIM_SOFFSET_G(s1_desc, 1) + s1_d2_sstride - s1_d2_lb;
  }

  s2_d1_lstride = F90_DIM_LSTRIDE_G(s2_desc, 0);
  s2_d1_lb = F90_DIM_LBOUND_G(s2_desc, 0);
  s2_d1_sstride = F90_DIM_SSTRIDE_G(s2_desc, 0);
  if (s2_d1_sstride != 1 || F90_DIM_SOFFSET_G(s2_desc, 0))
    s2_d1_soffset = F90_DIM_SOFFSET_G(s2_desc, 0) + s2_d1_sstride - s2_d1_lb;

  if (s2_rank == 2) {
    s2_d2_lstride = F90_DIM_LSTRIDE_G(s2_desc, 1);
    s2_d2_lb = F90_DIM_LBOUND_G(s2_desc, 1);
    s2_d2_sstride = F90_DIM_SSTRIDE_G(s2_desc, 1);
    if (s2_d2_sstride != 1 || F90_DIM_SOFFSET_G(s2_desc, 1))
      s2_d2_soffset = F90_DIM_SOFFSET_G(s2_desc, 1) + s2_d2_sstride - s2_d2_lb;
  }

  d_d1_lstride = F90_DIM_LSTRIDE_G(dest_desc, 0);
  d_d1_lb = F90_DIM_LBOUND_G(dest_desc, 0);
  d_d1_sstride = F90_DIM_SSTRIDE_G(dest_desc, 0);
  if (d_d1_sstride != 1 || F90_DIM_SOFFSET_G(dest_desc, 0))
    d_d1_soffset = F90_DIM_SOFFSET_G(dest_desc, 0) + d_d1_sstride - d_d1_lb;

  if (d_rank == 2) {
    d_d2_lstride = F90_DIM_LSTRIDE_G(dest_desc, 1);
    d_d2_lb = F90_DIM_LBOUND_G(dest_desc, 1);
    d_d2_sstride = F90_DIM_SSTRIDE_G(dest_desc, 1);
    if (d_d2_sstride != 1 || F90_DIM_SOFFSET_G(dest_desc, 1))
      d_d2_soffset = F90_DIM_SOFFSET_G(dest_desc, 1) + d_d2_sstride - d_d2_lb;
  }

  /* Base pointers: raw address + descriptor linear base + lower-bound
   * correction per dimension, minus 1 for Fortran's 1-based indexing.
   * For rank-1 operands the d2 terms are 0*1 and contribute nothing. */
  s1_base = (__LOG4_T *)s1_addr + F90_LBASE_G(s1_desc) +
            s1_d1_lb * s1_d1_lstride + s1_d2_lb * s1_d2_lstride - 1;
  s2_base = (__LOG4_T *)s2_addr + F90_LBASE_G(s2_desc) +
            s2_d1_lb * s2_d1_lstride + s2_d2_lb * s2_d2_lstride - 1;
  dest_base = (__LOG4_T *)dest_addr + F90_LBASE_G(dest_desc) +
              d_d1_lb * d_d1_lstride + d_d2_lb * d_d2_lstride - 1;

  /* Precompute starting offsets and per-iteration steps.  Where a rank-1
   * operand has no real second dimension, the d2 delta falls back to the
   * d1 delta so the shared loop structure still advances correctly. */
  d_d1_offset = d_d1_base = d_d1_soffset * d_d1_lstride;
  d_d1_delta = d_d1_sstride * d_d1_lstride;

  d_d2_offset = d_d2_base = d_d2_soffset * d_d2_lstride;
  d_d2_delta = s1_rank == 2 ? d_d2_sstride * d_d2_lstride : d_d1_delta;

  s1_d1_offset = s1_d1_base = s1_d1_soffset * s1_d1_lstride;
  s1_d1_delta = s1_d1_sstride * s1_d1_lstride;

  s1_d2_offset = s1_d2_base = s1_d2_soffset * s1_d2_lstride;
  s1_d2_delta = s1_rank == 2 ? s1_d2_sstride * s1_d2_lstride : s1_d1_delta;

  s2_d1_offset = s2_d1_base = s2_d1_soffset * s2_d1_lstride;
  s2_d1_delta = s2_d1_sstride * s2_d1_lstride;

  s2_d2_offset = s2_d2_base = s2_d2_soffset * s2_d2_lstride;
  s2_d2_delta = s2_d2_sstride * s2_d2_lstride;

  if (s1_rank == 2) {
    /* mxm and mxv cases (for mxv, k_extent == 1).
     * Pass 1: zero the destination.  This advances d_d2_offset past the
     * last column, so it is reset to d_d2_base before the compute pass. */
    for (k = 0; k < k_extent; k++) {
      d_elem_p = dest_base + d_d1_base + d_d2_offset;
      d_d2_offset += d_d2_delta;
      for (n = 0; n < n_extent; n++) {
        *d_elem_p = 0;
        d_elem_p += d_d1_delta;
      }
    }

    /* Pass 2: accumulate.  Loop order k (dest column) / m (contracted) /
     * n (dest row) lets each inner n-loop stride down one column of s1
     * and one column of dest while s2(m,k) is held in s2_elem_p.  A dest
     * element is OR-accumulated: once set true it stays true. */
    d_d2_offset = d_d2_base;
    for (k = 0; k < k_extent; k++) {
      s2_elem_p = s2_base + s2_d1_base + s2_d2_offset;
      s2_d2_offset += s2_d2_delta;
      s1_d2_offset = s1_d2_base;
      for (m = 0; m < m_extent; m++) {
        s1_elem_p = s1_base + s1_d1_base + s1_d2_offset;
        s1_d2_offset += s1_d2_delta;
        d_elem_p = dest_base + d_d1_base + d_d2_offset;
        for (n = 0; n < n_extent; n++) {
          if ((*s1_elem_p & GET_DIST_MASK_LOG4) &&
              (*s2_elem_p & GET_DIST_MASK_LOG4)) {
            *d_elem_p = GET_DIST_TRUE_LOG4;
          }

          d_elem_p += d_d1_delta;
          s1_elem_p += s1_d1_delta;
        }
        s2_elem_p += s2_d1_delta;
      }
      d_d2_offset += d_d2_delta;
    }
  } else {
    /* vxm case: s1(m) x s2(m,k) -> dest(k).  Each result element is the
     * OR over m of (s1(m) AND s2(m,k)), accumulated in a scalar so dest
     * is written exactly once per k. */
    __LOG4_T rslt_tmp;

    s1_base += s1_d1_base;
    s2_base += s2_d1_soffset * s2_d1_lstride;
    dest_offset = d_d1_base;
    for (k = 0; k < k_extent; k++) {
      s1_elem_p = s1_base;
      s2_elem_p = s2_base + s2_d2_base;
      rslt_tmp = 0;
      for (m = 0; m < m_extent; m++) {
        if ((*s1_elem_p & GET_DIST_MASK_LOG4) &&
            (*s2_elem_p & GET_DIST_MASK_LOG4)) {
          rslt_tmp = GET_DIST_TRUE_LOG4;
        }

        s1_elem_p += s1_d1_delta;
        s2_elem_p += s2_d1_delta;
      }
      *(dest_base + dest_offset) = rslt_tmp;
      dest_offset += d_d1_delta;
      s2_d2_base += s2_d2_delta;
    }
  }
}
251