xref: /qemu/target/arm/tcg/translate-sme.c (revision 6b40847a)
1 /*
2  * AArch64 SME translation
3  *
4  * Copyright (c) 2022 Linaro, Ltd
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "translate.h"
22 #include "translate-a64.h"
23 
24 /*
25  * Include the generated decoder.
26  */
27 
28 #include "decode-sme.c.inc"
29 
30 
31 /*
32  * Resolve tile.size[index] to a host pointer, where tile and index
33  * are always decoded together, dependent on the element size.
34  */
35 static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
36                                 int tile_index, bool vertical)
37 {
38     int tile = tile_index >> (4 - esz);
39     int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
40     int pos, len, offset;
41     TCGv_i32 tmp;
42     TCGv_ptr addr;
43 
44     /* Compute the final index, which is Rs+imm. */
45     tmp = tcg_temp_new_i32();
46     tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
47     tcg_gen_addi_i32(tmp, tmp, index);
48 
49     /* Prepare a power-of-two modulo via extraction of @len bits. */
50     len = ctz32(streaming_vec_reg_size(s)) - esz;
51 
52     if (vertical) {
53         /*
54          * Compute the byte offset of the index within the tile:
55          *     (index % (svl / size)) * size
56          *   = (index % (svl >> esz)) << esz
57          * Perform the power-of-two modulo via extraction of the low @len bits.
58          * Perform the multiply by shifting left by @pos bits.
59          * Perform these operations simultaneously via deposit into zero.
60          */
61         pos = esz;
62         tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
63 
64         /*
65          * For big-endian, adjust the indexed column byte offset within
66          * the uint64_t host words that make up env->zarray[].
67          */
68         if (HOST_BIG_ENDIAN && esz < MO_64) {
69             tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
70         }
71     } else {
72         /*
73          * Compute the byte offset of the index within the tile:
74          *     (index % (svl / size)) * (size * sizeof(row))
75          *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
76          */
77         pos = esz + ctz32(sizeof(ARMVectorReg));
78         tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
79 
80         /* Row slices are always aligned and need no endian adjustment. */
81     }
82 
83     /* The tile byte offset within env->zarray is the row. */
84     offset = tile * sizeof(ARMVectorReg);
85 
86     /* Include the byte offset of zarray to make this relative to env. */
87     offset += offsetof(CPUARMState, zarray);
88     tcg_gen_addi_i32(tmp, tmp, offset);
89 
90     /* Add the byte offset to env to produce the final pointer. */
91     addr = tcg_temp_new_ptr();
92     tcg_gen_ext_i32_ptr(addr, tmp);
93     tcg_gen_add_ptr(addr, addr, cpu_env);
94 
95     return addr;
96 }
97 
98 static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
99 {
100     if (!dc_isar_feature(aa64_sme, s)) {
101         return false;
102     }
103     if (sme_za_enabled_check(s)) {
104         gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm),
105                             tcg_constant_i32(streaming_vec_reg_size(s)));
106     }
107     return true;
108 }
109 
110 static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
111 {
112     static gen_helper_gvec_4 * const h_fns[5] = {
113         gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
114         gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
115         gen_helper_sve_sel_zpzz_q
116     };
117     static gen_helper_gvec_3 * const cz_fns[5] = {
118         gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
119         gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
120         gen_helper_sme_mova_cz_q,
121     };
122     static gen_helper_gvec_3 * const zc_fns[5] = {
123         gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
124         gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
125         gen_helper_sme_mova_zc_q,
126     };
127 
128     TCGv_ptr t_za, t_zr, t_pg;
129     TCGv_i32 t_desc;
130     int svl;
131 
132     if (!dc_isar_feature(aa64_sme, s)) {
133         return false;
134     }
135     if (!sme_smza_enabled_check(s)) {
136         return true;
137     }
138 
139     t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
140     t_zr = vec_full_reg_ptr(s, a->zr);
141     t_pg = pred_full_reg_ptr(s, a->pg);
142 
143     svl = streaming_vec_reg_size(s);
144     t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));
145 
146     if (a->v) {
147         /* Vertical slice -- use sme mova helpers. */
148         if (a->to_vec) {
149             zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
150         } else {
151             cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
152         }
153     } else {
154         /* Horizontal slice -- reuse sve sel helpers. */
155         if (a->to_vec) {
156             h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
157         } else {
158             h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
159         }
160     }
161     return true;
162 }
163 
164 static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
165 {
166     typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);
167 
168     /*
169      * Indexed by [esz][be][v][mte][st], which is (except for load/store)
170      * also the order in which the elements appear in the function names,
171      * and so how we must concatenate the pieces.
172      */
173 
174 #define FN_LS(F)     { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
175 #define FN_MTE(F)    { FN_LS(F), FN_LS(F##_mte) }
176 #define FN_HV(F)     { FN_MTE(F##_h), FN_MTE(F##_v) }
177 #define FN_END(L, B) { FN_HV(L), FN_HV(B) }
178 
179     static GenLdSt1 * const fns[5][2][2][2][2] = {
180         FN_END(b, b),
181         FN_END(h_le, h_be),
182         FN_END(s_le, s_be),
183         FN_END(d_le, d_be),
184         FN_END(q_le, q_be),
185     };
186 
187 #undef FN_LS
188 #undef FN_MTE
189 #undef FN_HV
190 #undef FN_END
191 
192     TCGv_ptr t_za, t_pg;
193     TCGv_i64 addr;
194     int svl, desc = 0;
195     bool be = s->be_data == MO_BE;
196     bool mte = s->mte_active[0];
197 
198     if (!dc_isar_feature(aa64_sme, s)) {
199         return false;
200     }
201     if (!sme_smza_enabled_check(s)) {
202         return true;
203     }
204 
205     t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
206     t_pg = pred_full_reg_ptr(s, a->pg);
207     addr = tcg_temp_new_i64();
208 
209     tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
210     tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
211 
212     if (mte) {
213         desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
214         desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
215         desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
216         desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
217         desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
218         desc <<= SVE_MTEDESC_SHIFT;
219     } else {
220         addr = clean_data_tbi(s, addr);
221     }
222     svl = streaming_vec_reg_size(s);
223     desc = simd_desc(svl, svl, desc);
224 
225     fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr,
226                                       tcg_constant_i32(desc));
227     return true;
228 }
229 
230 typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);
231 
232 static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
233 {
234     int svl = streaming_vec_reg_size(s);
235     int imm = a->imm;
236     TCGv_ptr base;
237 
238     if (!sme_za_enabled_check(s)) {
239         return true;
240     }
241 
242     /* ZA[n] equates to ZA0H.B[n]. */
243     base = get_tile_rowcol(s, MO_8, a->rv, imm, false);
244 
245     fn(s, base, 0, svl, a->rn, imm * svl);
246     return true;
247 }
248 
249 TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
250 TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)
251 
252 static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
253                     gen_helper_gvec_4 *fn)
254 {
255     int svl = streaming_vec_reg_size(s);
256     uint32_t desc = simd_desc(svl, svl, 0);
257     TCGv_ptr za, zn, pn, pm;
258 
259     if (!sme_smza_enabled_check(s)) {
260         return true;
261     }
262 
263     /* Sum XZR+zad to find ZAd. */
264     za = get_tile_rowcol(s, esz, 31, a->zad, false);
265     zn = vec_full_reg_ptr(s, a->zn);
266     pn = pred_full_reg_ptr(s, a->pn);
267     pm = pred_full_reg_ptr(s, a->pm);
268 
269     fn(za, zn, pn, pm, tcg_constant_i32(desc));
270     return true;
271 }
272 
273 TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
274 TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
275 TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
276 TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)
277 
278 static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
279                        gen_helper_gvec_5 *fn)
280 {
281     int svl = streaming_vec_reg_size(s);
282     uint32_t desc = simd_desc(svl, svl, a->sub);
283     TCGv_ptr za, zn, zm, pn, pm;
284 
285     if (!sme_smza_enabled_check(s)) {
286         return true;
287     }
288 
289     /* Sum XZR+zad to find ZAd. */
290     za = get_tile_rowcol(s, esz, 31, a->zad, false);
291     zn = vec_full_reg_ptr(s, a->zn);
292     zm = vec_full_reg_ptr(s, a->zm);
293     pn = pred_full_reg_ptr(s, a->pn);
294     pm = pred_full_reg_ptr(s, a->pm);
295 
296     fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
297     return true;
298 }
299 
300 static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
301                             gen_helper_gvec_5_ptr *fn)
302 {
303     int svl = streaming_vec_reg_size(s);
304     uint32_t desc = simd_desc(svl, svl, a->sub);
305     TCGv_ptr za, zn, zm, pn, pm, fpst;
306 
307     if (!sme_smza_enabled_check(s)) {
308         return true;
309     }
310 
311     /* Sum XZR+zad to find ZAd. */
312     za = get_tile_rowcol(s, esz, 31, a->zad, false);
313     zn = vec_full_reg_ptr(s, a->zn);
314     zm = vec_full_reg_ptr(s, a->zm);
315     pn = pred_full_reg_ptr(s, a->pn);
316     pm = pred_full_reg_ptr(s, a->pm);
317     fpst = fpstatus_ptr(FPST_FPCR);
318 
319     fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));
320     return true;
321 }
322 
323 TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h)
324 TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s)
325 TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d)
326 
327 /* TODO: FEAT_EBF16 */
328 TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)
329 
330 TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
331 TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
332 TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
333 TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)
334 
335 TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
336 TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
337 TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
338 TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)
339