xref: /qemu/target/arm/tcg/translate-neon.c (revision cb4c33f0)
1 /*
2  *  ARM translation: AArch32 Neon instructions
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *  Copyright (c) 2005-2007 CodeSourcery
6  *  Copyright (c) 2007 OpenedHand, Ltd.
7  *  Copyright (c) 2020 Linaro, Ltd.
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 #include "qemu/osdep.h"
24 #include "tcg/tcg-op.h"
25 #include "tcg/tcg-op-gvec.h"
26 #include "exec/exec-all.h"
27 #include "exec/gen-icount.h"
28 #include "translate.h"
29 #include "translate-a32.h"
30 
31 /* Include the Neon decoders generated by decodetree from the neon-*.decode files */
32 #include "decode-neon-dp.c.inc"
33 #include "decode-neon-ls.c.inc"
34 #include "decode-neon-shared.c.inc"
35 
36 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
37 {
38     TCGv_ptr ret = tcg_temp_new_ptr();
39     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
40     return ret;
41 }
42 
43 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
44 {
45     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
46 
47     switch (mop) {
48     case MO_UB:
49         tcg_gen_ld8u_i32(var, cpu_env, offset);
50         break;
51     case MO_UW:
52         tcg_gen_ld16u_i32(var, cpu_env, offset);
53         break;
54     case MO_UL:
55         tcg_gen_ld_i32(var, cpu_env, offset);
56         break;
57     default:
58         g_assert_not_reached();
59     }
60 }
61 
62 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
63 {
64     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
65 
66     switch (mop) {
67     case MO_UB:
68         tcg_gen_ld8u_i64(var, cpu_env, offset);
69         break;
70     case MO_UW:
71         tcg_gen_ld16u_i64(var, cpu_env, offset);
72         break;
73     case MO_UL:
74         tcg_gen_ld32u_i64(var, cpu_env, offset);
75         break;
76     case MO_UQ:
77         tcg_gen_ld_i64(var, cpu_env, offset);
78         break;
79     default:
80         g_assert_not_reached();
81     }
82 }
83 
84 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
85 {
86     long offset = neon_element_offset(reg, ele, size);
87 
88     switch (size) {
89     case MO_8:
90         tcg_gen_st8_i32(var, cpu_env, offset);
91         break;
92     case MO_16:
93         tcg_gen_st16_i32(var, cpu_env, offset);
94         break;
95     case MO_32:
96         tcg_gen_st_i32(var, cpu_env, offset);
97         break;
98     default:
99         g_assert_not_reached();
100     }
101 }
102 
103 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
104 {
105     long offset = neon_element_offset(reg, ele, size);
106 
107     switch (size) {
108     case MO_8:
109         tcg_gen_st8_i64(var, cpu_env, offset);
110         break;
111     case MO_16:
112         tcg_gen_st16_i64(var, cpu_env, offset);
113         break;
114     case MO_32:
115         tcg_gen_st32_i64(var, cpu_env, offset);
116         break;
117     case MO_64:
118         tcg_gen_st_i64(var, cpu_env, offset);
119         break;
120     default:
121         g_assert_not_reached();
122     }
123 }
124 
125 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
126                          int data, gen_helper_gvec_4 *fn_gvec)
127 {
128     /* UNDEF accesses to D16-D31 if they don't exist. */
129     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
130         return false;
131     }
132 
133     /*
134      * UNDEF accesses to odd registers for each bit of Q.
135      * Q is 0b111 for all Q-reg instructions; for insns with mixed
136      * Q- and D-reg inputs, only the Q-reg operands' bits are set.
137      */
138     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
139         return false;
140     }
141 
142     if (!vfp_access_check(s)) {
143         return true;
144     }
145 
146     int opr_sz = q ? 16 : 8;
147     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
148                        vfp_reg_offset(1, vn),
149                        vfp_reg_offset(1, vm),
150                        vfp_reg_offset(1, vd),
151                        opr_sz, opr_sz, data, fn_gvec);
152     return true;
153 }
154 
155 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
156                               int data, ARMFPStatusFlavour fp_flavour,
157                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
158 {
159     /* UNDEF accesses to D16-D31 if they don't exist. */
160     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
161         return false;
162     }
163 
164     /*
165      * UNDEF accesses to odd registers for each bit of Q.
166      * Q is 0b111 for all Q-reg instructions; for insns with mixed
167      * Q- and D-reg inputs, only the Q-reg operands' bits are set.
168      */
169     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
170         return false;
171     }
172 
173     if (!vfp_access_check(s)) {
174         return true;
175     }
176 
177     int opr_sz = q ? 16 : 8;
178     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
179 
180     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
181                        vfp_reg_offset(1, vn),
182                        vfp_reg_offset(1, vm),
183                        vfp_reg_offset(1, vd),
184                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
185     tcg_temp_free_ptr(fpst);
186     return true;
187 }
188 
189 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
190 {
191     if (!dc_isar_feature(aa32_vcma, s)) {
192         return false;
193     }
194     if (a->size == MO_16) {
195         if (!dc_isar_feature(aa32_fp16_arith, s)) {
196             return false;
197         }
198         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
199                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
200     }
201     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
202                              FPST_STD, gen_helper_gvec_fcmlas);
203 }
204 
205 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
206 {
207     int opr_sz;
208     TCGv_ptr fpst;
209     gen_helper_gvec_3_ptr *fn_gvec_ptr;
210 
211     if (!dc_isar_feature(aa32_vcma, s)
212         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
213         return false;
214     }
215 
216     /* UNDEF accesses to D16-D31 if they don't exist. */
217     if (!dc_isar_feature(aa32_simd_r32, s) &&
218         ((a->vd | a->vn | a->vm) & 0x10)) {
219         return false;
220     }
221 
222     if ((a->vn | a->vm | a->vd) & a->q) {
223         return false;
224     }
225 
226     if (!vfp_access_check(s)) {
227         return true;
228     }
229 
230     opr_sz = (1 + a->q) * 8;
231     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
232     fn_gvec_ptr = (a->size == MO_16) ?
233         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
234     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
235                        vfp_reg_offset(1, a->vn),
236                        vfp_reg_offset(1, a->vm),
237                        fpst, opr_sz, opr_sz, a->rot,
238                        fn_gvec_ptr);
239     tcg_temp_free_ptr(fpst);
240     return true;
241 }
242 
243 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
244 {
245     if (!dc_isar_feature(aa32_dp, s)) {
246         return false;
247     }
248     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
249                         gen_helper_gvec_sdot_b);
250 }
251 
252 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
253 {
254     if (!dc_isar_feature(aa32_dp, s)) {
255         return false;
256     }
257     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
258                         gen_helper_gvec_udot_b);
259 }
260 
261 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
262 {
263     if (!dc_isar_feature(aa32_i8mm, s)) {
264         return false;
265     }
266     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
267                         gen_helper_gvec_usdot_b);
268 }
269 
270 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
271 {
272     if (!dc_isar_feature(aa32_bf16, s)) {
273         return false;
274     }
275     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
276                         gen_helper_gvec_bfdot);
277 }
278 
279 static bool trans_VFML(DisasContext *s, arg_VFML *a)
280 {
281     int opr_sz;
282 
283     if (!dc_isar_feature(aa32_fhm, s)) {
284         return false;
285     }
286 
287     /* UNDEF accesses to D16-D31 if they don't exist. */
288     if (!dc_isar_feature(aa32_simd_r32, s) &&
289         (a->vd & 0x10)) {
290         return false;
291     }
292 
293     if (a->vd & a->q) {
294         return false;
295     }
296 
297     if (!vfp_access_check(s)) {
298         return true;
299     }
300 
301     opr_sz = (1 + a->q) * 8;
302     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
303                        vfp_reg_offset(a->q, a->vn),
304                        vfp_reg_offset(a->q, a->vm),
305                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
306                        gen_helper_gvec_fmlal_a32);
307     return true;
308 }
309 
310 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
311 {
312     int data = (a->index << 2) | a->rot;
313 
314     if (!dc_isar_feature(aa32_vcma, s)) {
315         return false;
316     }
317     if (a->size == MO_16) {
318         if (!dc_isar_feature(aa32_fp16_arith, s)) {
319             return false;
320         }
321         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
322                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
323     }
324     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
325                              FPST_STD, gen_helper_gvec_fcmlas_idx);
326 }
327 
328 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
329 {
330     if (!dc_isar_feature(aa32_dp, s)) {
331         return false;
332     }
333     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
334                         gen_helper_gvec_sdot_idx_b);
335 }
336 
337 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
338 {
339     if (!dc_isar_feature(aa32_dp, s)) {
340         return false;
341     }
342     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
343                         gen_helper_gvec_udot_idx_b);
344 }
345 
346 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
347 {
348     if (!dc_isar_feature(aa32_i8mm, s)) {
349         return false;
350     }
351     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
352                         gen_helper_gvec_usdot_idx_b);
353 }
354 
355 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
356 {
357     if (!dc_isar_feature(aa32_i8mm, s)) {
358         return false;
359     }
360     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
361                         gen_helper_gvec_sudot_idx_b);
362 }
363 
364 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
365 {
366     if (!dc_isar_feature(aa32_bf16, s)) {
367         return false;
368     }
369     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
370                         gen_helper_gvec_bfdot_idx);
371 }
372 
373 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
374 {
375     int opr_sz;
376 
377     if (!dc_isar_feature(aa32_fhm, s)) {
378         return false;
379     }
380 
381     /* UNDEF accesses to D16-D31 if they don't exist. */
382     if (!dc_isar_feature(aa32_simd_r32, s) &&
383         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
384         return false;
385     }
386 
387     if (a->vd & a->q) {
388         return false;
389     }
390 
391     if (!vfp_access_check(s)) {
392         return true;
393     }
394 
395     opr_sz = (1 + a->q) * 8;
396     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
397                        vfp_reg_offset(a->q, a->vn),
398                        vfp_reg_offset(a->q, a->rm),
399                        cpu_env, opr_sz, opr_sz,
400                        (a->index << 2) | a->s, /* is_2 == 0 */
401                        gen_helper_gvec_fmlal_idx_a32);
402     return true;
403 }
404 
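/*
 * nregs/interleave/spacing, indexed by the 'itype' field of the
 * load/store-multiple insn: e.g. itype 7 ({1, 1, 1}) is VLD1/VST1
 * with one register, itype 2 ({4, 1, 1}) is VLD1/VST1 with four
 * registers, and itype 0 ({1, 4, 1}) is VLD4/VST4 with the four
 * structure elements interleaved across adjacent registers.
 */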
405 static struct {
406     int nregs;
407     int interleave;
408     int spacing;
409 } const neon_ls_element_type[11] = {
410     {1, 4, 1},
411     {1, 4, 2},
412     {4, 1, 1},
413     {2, 2, 2},
414     {1, 3, 1},
415     {1, 3, 2},
416     {3, 1, 1},
417     {1, 1, 1},
418     {1, 2, 1},
419     {1, 2, 2},
420     {2, 1, 1}
421 };
422 
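/*
 * Base-register writeback, shared by the Neon load/store insns:
 * rm == 15 (PC) means no writeback; rm == 13 (SP) means post-increment
 * by the immediate stride (the total bytes transferred); any other rm
 * means post-increment by that register.
 */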
423 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
424                                       int stride)
425 {
426     if (rm != 15) {
427         TCGv_i32 base;
428 
429         base = load_reg(s, rn);
430         if (rm == 13) {
431             tcg_gen_addi_i32(base, base, stride);
432         } else {
433             TCGv_i32 index;
434             index = load_reg(s, rm);
435             tcg_gen_add_i32(base, base, index);
436             tcg_temp_free_i32(index);
437         }
438         store_reg(s, rn, base);
439     }
440 }
441 
442 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
443 {
444     /* Neon load/store multiple structures */
445     int nregs, interleave, spacing, reg, n;
446     MemOp mop, align, endian;
447     int mmu_idx = get_mem_index(s);
448     int size = a->size;
449     TCGv_i64 tmp64;
450     TCGv_i32 addr;
451 
452     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
453         return false;
454     }
455 
456     /* UNDEF accesses to D16-D31 if they don't exist */
457     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
458         return false;
459     }
460     if (a->itype > 10) {
461         return false;
462     }
463     /* Catch UNDEF cases for bad values of align field */
464     switch (a->itype & 0xc) {
465     case 4:
466         if (a->align >= 2) {
467             return false;
468         }
469         break;
470     case 8:
471         if (a->align == 3) {
472             return false;
473         }
474         break;
475     default:
476         break;
477     }
478     nregs = neon_ls_element_type[a->itype].nregs;
479     interleave = neon_ls_element_type[a->itype].interleave;
480     spacing = neon_ls_element_type[a->itype].spacing;
481     if (size == 3 && (interleave | spacing) != 1) {
482         return false;
483     }
484 
485     if (!vfp_access_check(s)) {
486         return true;
487     }
488 
489     /* For our purposes, bytes are always little-endian.  */
490     endian = s->be_data;
491     if (size == 0) {
492         endian = MO_LE;
493     }
494 
495     /* Enforce alignment requested by the instruction */
496     if (a->align) {
497         align = pow2_align(a->align + 2); /* 8, 16 or 32 bytes: 4 << a->align */
498     } else {
499         align = s->align_mem ? MO_ALIGN : 0;
500     }
501 
502     /*
503      * Consecutive little-endian elements from a single register
504      * can be promoted to a larger little-endian operation.
505      */
506     if (interleave == 1 && endian == MO_LE) {
507         /* Retain any natural alignment. */
508         if (align == MO_ALIGN) {
509             align = pow2_align(size);
510         }
511         size = 3;
512     }
513 
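    /*
     * With size promoted to 3, the copy loop below makes a single
     * 64-bit memory access per register instead of one per element.
     */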
514     tmp64 = tcg_temp_new_i64();
515     addr = tcg_temp_new_i32();
516     load_reg_var(s, addr, a->rn);
517 
518     mop = endian | size | align;
519     for (reg = 0; reg < nregs; reg++) {
520         for (n = 0; n < 8 >> size; n++) {
521             int xs;
522             for (xs = 0; xs < interleave; xs++) {
523                 int tt = a->vd + reg + spacing * xs;
524 
525                 if (a->l) {
526                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
527                     neon_store_element64(tt, n, size, tmp64);
528                 } else {
529                     neon_load_element64(tmp64, tt, n, size);
530                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
531                 }
532                 tcg_gen_addi_i32(addr, addr, 1 << size);
533 
534                 /* Subsequent memory operations inherit alignment */
535                 mop &= ~MO_AMASK;
536             }
537         }
538     }
539     tcg_temp_free_i32(addr);
540     tcg_temp_free_i64(tmp64);
541 
542     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
543     return true;
544 }
545 
546 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
547 {
548     /* Neon load single structure to all lanes */
549     int reg, stride, vec_size;
550     int vd = a->vd;
551     int size = a->size;
552     int nregs = a->n + 1;
553     TCGv_i32 addr, tmp;
554     MemOp mop, align;
555 
556     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
557         return false;
558     }
559 
560     /* UNDEF accesses to D16-D31 if they don't exist */
561     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
562         return false;
563     }
564 
565     align = 0;
566     if (size == 3) {
567         if (nregs != 4 || a->a == 0) {
568             return false;
569         }
570         /* For VLD4, size == 3 with a == 1 means 32 bits at 16 byte alignment */
571         size = MO_32;
572         align = MO_ALIGN_16;
573     } else if (a->a) {
574         switch (nregs) {
575         case 1:
576             if (size == 0) {
577                 return false;
578             }
579             align = MO_ALIGN;
580             break;
581         case 2:
582             align = pow2_align(size + 1);
583             break;
584         case 3:
585             return false;
586         case 4:
587             if (size == 2) {
588                 align = pow2_align(3);
589             } else {
590                 align = pow2_align(size + 2);
591             }
592             break;
593         default:
594             g_assert_not_reached();
595         }
596     }
597 
598     if (!vfp_access_check(s)) {
599         return true;
600     }
601 
602     /*
603      * VLD1 to all lanes: T bit indicates how many Dregs to write.
604      * VLD2/3/4 to all lanes: T bit indicates register stride.
605      */
606     stride = a->t ? 2 : 1;
607     vec_size = nregs == 1 ? stride * 8 : 8;
608     mop = size | align;
609     tmp = tcg_temp_new_i32();
610     addr = tcg_temp_new_i32();
611     load_reg_var(s, addr, a->rn);
612     for (reg = 0; reg < nregs; reg++) {
613         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
614         if ((vd & 1) && vec_size == 16) {
615             /*
616              * We cannot write 16 bytes at once because the
617              * destination is unaligned.
618              */
619             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
620                                  8, 8, tmp);
621             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
622                              neon_full_reg_offset(vd), 8, 8);
623         } else {
624             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
625                                  vec_size, vec_size, tmp);
626         }
627         tcg_gen_addi_i32(addr, addr, 1 << size);
628         vd += stride;
629 
630         /* Subsequent memory operations inherit alignment */
631         mop &= ~MO_AMASK;
632     }
633     tcg_temp_free_i32(tmp);
634     tcg_temp_free_i32(addr);
635 
636     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
637 
638     return true;
639 }
640 
641 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
642 {
643     /* Neon load/store single structure to one lane */
644     int reg;
645     int nregs = a->n + 1;
646     int vd = a->vd;
647     TCGv_i32 addr, tmp;
648     MemOp mop;
649 
650     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
651         return false;
652     }
653 
654     /* UNDEF accesses to D16-D31 if they don't exist */
655     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
656         return false;
657     }
658 
659     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
660     switch (nregs) {
661     case 1:
662         if (a->stride != 1) {
663             return false;
664         }
665         if (((a->align & (1 << a->size)) != 0) ||
666             (a->size == 2 && (a->align == 1 || a->align == 2))) {
667             return false;
668         }
669         break;
670     case 2:
671         if (a->size == 2 && (a->align & 2) != 0) {
672             return false;
673         }
674         break;
675     case 3:
676         if (a->align != 0) {
677             return false;
678         }
679         break;
680     case 4:
681         if (a->size == 2 && a->align == 3) {
682             return false;
683         }
684         break;
685     default:
686         g_assert_not_reached();
687     }
688     if ((vd + a->stride * (nregs - 1)) > 31) {
689         /*
690          * Attempts to write off the end of the register file are
691          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
692          * access off the end of the array that holds the register data.
693          */
694         return false;
695     }
696 
697     if (!vfp_access_check(s)) {
698         return true;
699     }
700 
701     /* Pick up SCTLR settings */
702     mop = finalize_memop(s, a->size);
703 
704     if (a->align) {
705         MemOp align_op;
706 
707         switch (nregs) {
708         case 1:
709             /* For VLD1, use natural alignment. */
710             align_op = MO_ALIGN;
711             break;
712         case 2:
713             /* For VLD2, use double alignment. */
714             align_op = pow2_align(a->size + 1);
715             break;
716         case 4:
717             if (a->size == MO_32) {
718                 /*
719                  * For VLD4.32, align = 1 is double alignment, align = 2 is
720                  * quad alignment; align = 3 is rejected above.
721                  */
722                 align_op = pow2_align(a->size + a->align);
723             } else {
724                 /* For VLD4.8 and VLD4.16, we want quad alignment. */
725                 align_op = pow2_align(a->size + 2);
726             }
727             break;
728         default:
729             /* For VLD3 the alignment field must be zero; nonzero was rejected above. */
730             g_assert_not_reached();
731         }
732 
733         mop = (mop & ~MO_AMASK) | align_op;
734     }
735 
736     tmp = tcg_temp_new_i32();
737     addr = tcg_temp_new_i32();
738     load_reg_var(s, addr, a->rn);
739 
740     for (reg = 0; reg < nregs; reg++) {
741         if (a->l) {
742             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
743             neon_store_element(vd, a->reg_idx, a->size, tmp);
744         } else { /* Store */
745             neon_load_element(tmp, vd, a->reg_idx, a->size);
746             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
747         }
748         vd += a->stride;
749         tcg_gen_addi_i32(addr, addr, 1 << a->size);
750 
751         /* Subsequent memory operations inherit alignment */
752         mop &= ~MO_AMASK;
753     }
754     tcg_temp_free_i32(addr);
755     tcg_temp_free_i32(tmp);
756 
757     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
758 
759     return true;
760 }
761 
762 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
763 {
764     int vec_size = a->q ? 16 : 8;
765     int rd_ofs = neon_full_reg_offset(a->vd);
766     int rn_ofs = neon_full_reg_offset(a->vn);
767     int rm_ofs = neon_full_reg_offset(a->vm);
768 
769     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
770         return false;
771     }
772 
773     /* UNDEF accesses to D16-D31 if they don't exist. */
774     if (!dc_isar_feature(aa32_simd_r32, s) &&
775         ((a->vd | a->vn | a->vm) & 0x10)) {
776         return false;
777     }
778 
779     if ((a->vn | a->vm | a->vd) & a->q) {
780         return false;
781     }
782 
783     if (!vfp_access_check(s)) {
784         return true;
785     }
786 
787     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
788     return true;
789 }
790 
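/*
 * Expand a trans function for a plain 3-same insn from a gvec
 * expander with the GVecGen3Fn signature, e.g.
 * DO_3SAME(VADD, tcg_gen_gvec_add) emits trans_VADD_3s().
 */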
791 #define DO_3SAME(INSN, FUNC)                                            \
792     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
793     {                                                                   \
794         return do_3same(s, a, FUNC);                                    \
795     }
796 
797 DO_3SAME(VADD, tcg_gen_gvec_add)
798 DO_3SAME(VSUB, tcg_gen_gvec_sub)
799 DO_3SAME(VAND, tcg_gen_gvec_and)
800 DO_3SAME(VBIC, tcg_gen_gvec_andc)
801 DO_3SAME(VORR, tcg_gen_gvec_or)
802 DO_3SAME(VORN, tcg_gen_gvec_orc)
803 DO_3SAME(VEOR, tcg_gen_gvec_xor)
804 DO_3SAME(VSHL_S, gen_gvec_sshl)
805 DO_3SAME(VSHL_U, gen_gvec_ushl)
806 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
807 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
808 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
809 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
810 
811 /* These insns are all gvec_bitsel but with the inputs in various orders. */
812 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
813     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
814                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
815                                 uint32_t oprsz, uint32_t maxsz)         \
816     {                                                                   \
817         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
818     }                                                                   \
819     DO_3SAME(INSN, gen_##INSN##_3s)
820 
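/*
 * tcg_gen_gvec_bitsel(vece, d, a, b, c) computes d = (b & a) | (c & ~a),
 * i.e. the first input is the select mask: VBSL selects with the
 * destination register, VBIT and VBIF select with the rm operand.
 */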
821 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
822 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
823 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
824 
825 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
826     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
827     {                                                                   \
828         if (a->size == 3) {                                             \
829             return false;                                               \
830         }                                                               \
831         return do_3same(s, a, FUNC);                                    \
832     }
833 
834 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
835 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
836 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
837 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
838 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
839 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
840 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
841 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
842 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
843 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
844 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
845 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
846 
847 #define DO_3SAME_CMP(INSN, COND)                                        \
848     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
849                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
850                                 uint32_t oprsz, uint32_t maxsz)         \
851     {                                                                   \
852         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
853     }                                                                   \
854     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
855 
856 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
857 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
858 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
859 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
860 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
861 
862 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
863     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
864                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
865     {                                                                      \
866         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
867     }
868 
869 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
870 
871 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
872 {
873     if (a->size != 0) {
874         return false;
875     }
876     return do_3same(s, a, gen_VMUL_p_3s);
877 }
878 
879 #define DO_VQRDMLAH(INSN, FUNC)                                         \
880     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
881     {                                                                   \
882         if (!dc_isar_feature(aa32_rdm, s)) {                            \
883             return false;                                               \
884         }                                                               \
885         if (a->size != 1 && a->size != 2) {                             \
886             return false;                                               \
887         }                                                               \
888         return do_3same(s, a, FUNC);                                    \
889     }
890 
891 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
892 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
893 
894 #define DO_SHA1(NAME, FUNC)                                             \
895     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
896     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
897     {                                                                   \
898         if (!dc_isar_feature(aa32_sha1, s)) {                           \
899             return false;                                               \
900         }                                                               \
901         return do_3same(s, a, gen_##NAME##_3s);                         \
902     }
903 
904 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
905 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
906 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
907 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
908 
909 #define DO_SHA2(NAME, FUNC)                                             \
910     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
911     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
912     {                                                                   \
913         if (!dc_isar_feature(aa32_sha2, s)) {                           \
914             return false;                                               \
915         }                                                               \
916         return do_3same(s, a, gen_##NAME##_3s);                         \
917     }
918 
919 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
920 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
921 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
922 
923 #define DO_3SAME_64(INSN, FUNC)                                         \
924     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
925                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
926                                 uint32_t oprsz, uint32_t maxsz)         \
927     {                                                                   \
928         static const GVecGen3 op = { .fni8 = FUNC };                    \
929         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
930     }                                                                   \
931     DO_3SAME(INSN, gen_##INSN##_3s)
932 
933 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
934     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
935     {                                                                   \
936         FUNC(d, cpu_env, n, m);                                         \
937     }                                                                   \
938     DO_3SAME_64(INSN, gen_##INSN##_elt)
939 
940 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
941 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
942 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
943 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
944 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
945 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
946 
947 #define DO_3SAME_32(INSN, FUNC)                                         \
948     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
949                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
950                                 uint32_t oprsz, uint32_t maxsz)         \
951     {                                                                   \
952         static const GVecGen3 ops[4] = {                                \
953             { .fni4 = gen_helper_neon_##FUNC##8 },                      \
954             { .fni4 = gen_helper_neon_##FUNC##16 },                     \
955             { .fni4 = gen_helper_neon_##FUNC##32 },                     \
956             { 0 },                                                      \
957         };                                                              \
958         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
959     }                                                                   \
960     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
961     {                                                                   \
962         if (a->size > 2) {                                              \
963             return false;                                               \
964         }                                                               \
965         return do_3same(s, a, gen_##INSN##_3s);                         \
966     }
967 
968 /*
969  * Some helper functions need to be passed the cpu_env. In order
970  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
971  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
972  * and which call a NeonGenTwoOpEnvFn().
973  */
974 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
975     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
976     {                                                                   \
977         FUNC(d, cpu_env, n, m);                                         \
978     }
979 
980 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
981     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
982     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
983     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
984     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
985                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
986                                 uint32_t oprsz, uint32_t maxsz)         \
987     {                                                                   \
988         static const GVecGen3 ops[4] = {                                \
989             { .fni4 = gen_##INSN##_tramp8 },                            \
990             { .fni4 = gen_##INSN##_tramp16 },                           \
991             { .fni4 = gen_##INSN##_tramp32 },                           \
992             { 0 },                                                      \
993         };                                                              \
994         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
995     }                                                                   \
996     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
997     {                                                                   \
998         if (a->size > 2) {                                              \
999             return false;                                               \
1000         }                                                               \
1001         return do_3same(s, a, gen_##INSN##_3s);                         \
1002     }
1003 
1004 DO_3SAME_32(VHADD_S, hadd_s)
1005 DO_3SAME_32(VHADD_U, hadd_u)
1006 DO_3SAME_32(VHSUB_S, hsub_s)
1007 DO_3SAME_32(VHSUB_U, hsub_u)
1008 DO_3SAME_32(VRHADD_S, rhadd_s)
1009 DO_3SAME_32(VRHADD_U, rhadd_u)
1010 DO_3SAME_32(VRSHL_S, rshl_s)
1011 DO_3SAME_32(VRSHL_U, rshl_u)
1012 
1013 DO_3SAME_32_ENV(VQSHL_S, qshl_s)
1014 DO_3SAME_32_ENV(VQSHL_U, qshl_u)
1015 DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
1016 DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1017 
1018 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1019 {
1020     /* Operations handled pairwise 32 bits at a time */
1021     TCGv_i32 tmp, tmp2, tmp3;
1022 
1023     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1024         return false;
1025     }
1026 
1027     /* UNDEF accesses to D16-D31 if they don't exist. */
1028     if (!dc_isar_feature(aa32_simd_r32, s) &&
1029         ((a->vd | a->vn | a->vm) & 0x10)) {
1030         return false;
1031     }
1032 
1033     if (a->size == 3) {
1034         return false;
1035     }
1036 
1037     if (!vfp_access_check(s)) {
1038         return true;
1039     }
1040 
1041     assert(a->q == 0); /* enforced by decode patterns */
1042 
1043     /*
1044      * Note that we have to be careful not to clobber the source operands
1045      * in the "vm == vd" case by storing the result of the first pass too
1046      * early. Since Q is 0 there are always just two passes, so instead
1047      * of a complicated loop over each pass we just unroll.
1048      */
1049     tmp = tcg_temp_new_i32();
1050     tmp2 = tcg_temp_new_i32();
1051     tmp3 = tcg_temp_new_i32();
1052 
1053     read_neon_element32(tmp, a->vn, 0, MO_32);
1054     read_neon_element32(tmp2, a->vn, 1, MO_32);
1055     fn(tmp, tmp, tmp2);
1056 
1057     read_neon_element32(tmp3, a->vm, 0, MO_32);
1058     read_neon_element32(tmp2, a->vm, 1, MO_32);
1059     fn(tmp3, tmp3, tmp2);
1060 
1061     write_neon_element32(tmp, a->vd, 0, MO_32);
1062     write_neon_element32(tmp3, a->vd, 1, MO_32);
1063 
1064     tcg_temp_free_i32(tmp);
1065     tcg_temp_free_i32(tmp2);
1066     tcg_temp_free_i32(tmp3);
1067     return true;
1068 }
1069 
1070 #define DO_3SAME_PAIR(INSN, func)                                       \
1071     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1072     {                                                                   \
1073         static NeonGenTwoOpFn * const fns[] = {                         \
1074             gen_helper_neon_##func##8,                                  \
1075             gen_helper_neon_##func##16,                                 \
1076             gen_helper_neon_##func##32,                                 \
1077         };                                                              \
1078         if (a->size > 2) {                                              \
1079             return false;                                               \
1080         }                                                               \
1081         return do_3same_pair(s, a, fns[a->size]);                       \
1082     }
1083 
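/*
 * Pairwise ops combine adjacent lanes of each input, e.g. for
 * VPADD.I32 d0, d1, d2: d0[0] = d1[0] + d1[1], d0[1] = d2[0] + d2[1].
 */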
1084 /* 32-bit pairwise ops end up the same as the elementwise versions.  */
1085 #define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
1086 #define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
1087 #define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
1088 #define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
1089 #define gen_helper_neon_padd_u32  tcg_gen_add_i32
1090 
1091 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1092 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1093 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1094 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1095 DO_3SAME_PAIR(VPADD, padd_u)
1096 
1097 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1098     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1099     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1100     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1101                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1102                                 uint32_t oprsz, uint32_t maxsz)         \
1103     {                                                                   \
1104         static const GVecGen3 ops[2] = {                                \
1105             { .fni4 = gen_##INSN##_tramp16 },                           \
1106             { .fni4 = gen_##INSN##_tramp32 },                           \
1107         };                                                              \
1108         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1109     }                                                                   \
1110     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1111     {                                                                   \
1112         if (a->size != 1 && a->size != 2) {                             \
1113             return false;                                               \
1114         }                                                               \
1115         return do_3same(s, a, gen_##INSN##_3s);                         \
1116     }
1117 
1118 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1119 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1120 
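/*
 * Wrap an fp gvec helper as a GVecGen3Fn, creating and freeing the
 * fp status pointer for the requested flavour around the call.
 */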
1121 #define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
1122     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
1123                          uint32_t rn_ofs, uint32_t rm_ofs,              \
1124                          uint32_t oprsz, uint32_t maxsz)                \
1125     {                                                                   \
1126         TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
1127         tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
1128                            oprsz, maxsz, 0, FUNC);                      \
1129         tcg_temp_free_ptr(fpst);                                        \
1130     }
1131 
1132 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1133     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1134     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1135     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1136     {                                                                   \
1137         if (a->size == MO_16) {                                         \
1138             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1139                 return false;                                           \
1140             }                                                           \
1141             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1142         }                                                               \
1143         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1144     }
1145 
1147 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1148 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1149 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1150 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1151 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1152 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1153 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1154 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1155 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1156 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1157 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1158 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1159 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1160 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1161 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1162 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1163 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1164 
1165 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
1166 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
1167 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
1168 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1169 
1170 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1171 {
1172     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1173         return false;
1174     }
1175 
1176     if (a->size == MO_16) {
1177         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1178             return false;
1179         }
1180         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1181     }
1182     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1183 }
1184 
1185 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1186 {
1187     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1188         return false;
1189     }
1190 
1191     if (a->size == MO_16) {
1192         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1193             return false;
1194         }
1195         return do_3same(s, a, gen_VMINNM_fp16_3s);
1196     }
1197     return do_3same(s, a, gen_VMINNM_fp32_3s);
1198 }
1199 
1200 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1201                              gen_helper_gvec_3_ptr *fn)
1202 {
1203     /* FP pairwise operations */
1204     TCGv_ptr fpstatus;
1205 
1206     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1207         return false;
1208     }
1209 
1210     /* UNDEF accesses to D16-D31 if they don't exist. */
1211     if (!dc_isar_feature(aa32_simd_r32, s) &&
1212         ((a->vd | a->vn | a->vm) & 0x10)) {
1213         return false;
1214     }
1215 
1216     if (!vfp_access_check(s)) {
1217         return true;
1218     }
1219 
1220     assert(a->q == 0); /* enforced by decode patterns */
1221 
1223     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1224     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1225                        vfp_reg_offset(1, a->vn),
1226                        vfp_reg_offset(1, a->vm),
1227                        fpstatus, 8, 8, 0, fn);
1228     tcg_temp_free_ptr(fpstatus);
1229 
1230     return true;
1231 }
1232 
1233 /*
1234  * For all the functions using this macro, size == MO_16 means fp16,
1235  * which requires the fp16 arithmetic extension (checked below).
1236  */
1237 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1238     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1239     {                                                               \
1240         if (a->size == MO_16) {                                     \
1241             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1242                 return false;                                       \
1243             }                                                       \
1244             return do_3same_fp_pair(s, a, FUNC##h);                 \
1245         }                                                           \
1246         return do_3same_fp_pair(s, a, FUNC##s);                     \
1247     }
1248 
1249 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1250 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1251 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1252 
1253 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1254 {
1255     /* Handle a 2-reg-shift insn which can be vectorized. */
1256     int vec_size = a->q ? 16 : 8;
1257     int rd_ofs = neon_full_reg_offset(a->vd);
1258     int rm_ofs = neon_full_reg_offset(a->vm);
1259 
1260     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1261         return false;
1262     }
1263 
1264     /* UNDEF accesses to D16-D31 if they don't exist. */
1265     if (!dc_isar_feature(aa32_simd_r32, s) &&
1266         ((a->vd | a->vm) & 0x10)) {
1267         return false;
1268     }
1269 
1270     if ((a->vm | a->vd) & a->q) {
1271         return false;
1272     }
1273 
1274     if (!vfp_access_check(s)) {
1275         return true;
1276     }
1277 
1278     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1279     return true;
1280 }
1281 
1282 #define DO_2SH(INSN, FUNC)                                              \
1283     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1284     {                                                                   \
1285         return do_vector_2sh(s, a, FUNC);                               \
1286     }                                                                   \
1287 
1288 DO_2SH(VSHL, tcg_gen_gvec_shli)
1289 DO_2SH(VSLI, gen_gvec_sli)
1290 DO_2SH(VSRI, gen_gvec_sri)
1291 DO_2SH(VSRA_S, gen_gvec_ssra)
1292 DO_2SH(VSRA_U, gen_gvec_usra)
1293 DO_2SH(VRSHR_S, gen_gvec_srshr)
1294 DO_2SH(VRSHR_U, gen_gvec_urshr)
1295 DO_2SH(VRSRA_S, gen_gvec_srsra)
1296 DO_2SH(VRSRA_U, gen_gvec_ursra)
1297 
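/*
 * tcg_gen_gvec_sari requires shift < esize, so the architecturally
 * valid shift == esize case (e.g. VSHR.S8 #8) is clamped to esize - 1,
 * which for an arithmetic shift gives the same all-sign-bits result.
 */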
1298 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1299 {
1300     /* Signed shift out of range results in all-sign-bits */
1301     a->shift = MIN(a->shift, (8 << a->size) - 1);
1302     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1303 }
1304 
1305 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1306                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1307 {
1308     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1309 }
1310 
1311 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1312 {
1313     /* Shift out of range is architecturally valid and results in zero. */
1314     if (a->shift >= (8 << a->size)) {
1315         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1316     } else {
1317         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1318     }
1319 }
1320 
1321 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1322                              NeonGenTwo64OpEnvFn *fn)
1323 {
1324     /*
1325      * 2-reg-and-shift operations, size == 3 case, where the
1326      * function needs to be passed cpu_env.
1327      */
1328     TCGv_i64 constimm;
1329     int pass;
1330 
1331     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1332         return false;
1333     }
1334 
1335     /* UNDEF accesses to D16-D31 if they don't exist. */
1336     if (!dc_isar_feature(aa32_simd_r32, s) &&
1337         ((a->vd | a->vm) & 0x10)) {
1338         return false;
1339     }
1340 
1341     if ((a->vm | a->vd) & a->q) {
1342         return false;
1343     }
1344 
1345     if (!vfp_access_check(s)) {
1346         return true;
1347     }
1348 
1349     /*
1350      * To avoid excessive duplication of ops we implement shift
1351      * by immediate using the variable shift operations.
1352      */
1353     constimm = tcg_constant_i64(dup_const(a->size, a->shift));
1354 
1355     for (pass = 0; pass < a->q + 1; pass++) {
1356         TCGv_i64 tmp = tcg_temp_new_i64();
1357 
1358         read_neon_element64(tmp, a->vm, pass, MO_64);
1359         fn(tmp, cpu_env, tmp, constimm);
1360         write_neon_element64(tmp, a->vd, pass, MO_64);
1361         tcg_temp_free_i64(tmp);
1362     }
1363     return true;
1364 }
1365 
1366 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1367                              NeonGenTwoOpEnvFn *fn)
1368 {
1369     /*
1370      * 2-reg-and-shift operations, size < 3 case, where the
1371      * helper needs to be passed cpu_env.
1372      */
1373     TCGv_i32 constimm, tmp;
1374     int pass;
1375 
1376     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1377         return false;
1378     }
1379 
1380     /* UNDEF accesses to D16-D31 if they don't exist. */
1381     if (!dc_isar_feature(aa32_simd_r32, s) &&
1382         ((a->vd | a->vm) & 0x10)) {
1383         return false;
1384     }
1385 
1386     if ((a->vm | a->vd) & a->q) {
1387         return false;
1388     }
1389 
1390     if (!vfp_access_check(s)) {
1391         return true;
1392     }
1393 
1394     /*
1395      * To avoid excessive duplication of ops we implement shift
1396      * by immediate using the variable shift operations.
1397      */
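    /* e.g. for MO_8 a shift of 2 becomes 0x02020202: one count per byte lane */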
1398     constimm = tcg_constant_i32(dup_const(a->size, a->shift));
1399     tmp = tcg_temp_new_i32();
1400 
1401     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1402         read_neon_element32(tmp, a->vm, pass, MO_32);
1403         fn(tmp, cpu_env, tmp, constimm);
1404         write_neon_element32(tmp, a->vd, pass, MO_32);
1405     }
1406     tcg_temp_free_i32(tmp);
1407     return true;
1408 }
1409 
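/*
 * Expand trans functions for the saturating-shift helpers that take
 * cpu_env: trans_<INSN>_64_2sh handles size == 3 via the 64-bit
 * helper, trans_<INSN>_2sh handles sizes 0..2.
 */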
1410 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1411     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1412     {                                                                   \
1413         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1414     }                                                                   \
1415     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1416     {                                                                   \
1417         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1418             gen_helper_neon_##FUNC##8,                                  \
1419             gen_helper_neon_##FUNC##16,                                 \
1420             gen_helper_neon_##FUNC##32,                                 \
1421         };                                                              \
1422         assert(a->size < ARRAY_SIZE(fns));                              \
1423         return do_2shift_env_32(s, a, fns[a->size]);                    \
1424     }
1425 
1426 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1427 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1428 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1429 
1430 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1431                                 NeonGenTwo64OpFn *shiftfn,
1432                                 NeonGenNarrowEnvFn *narrowfn)
1433 {
1434     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1435     TCGv_i64 constimm, rm1, rm2;
1436     TCGv_i32 rd;
1437 
1438     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1439         return false;
1440     }
1441 
1442     /* UNDEF accesses to D16-D31 if they don't exist. */
1443     if (!dc_isar_feature(aa32_simd_r32, s) &&
1444         ((a->vd | a->vm) & 0x10)) {
1445         return false;
1446     }
1447 
1448     if (a->vm & 1) {
1449         return false;
1450     }
1451 
1452     if (!vfp_access_check(s)) {
1453         return true;
1454     }
1455 
1456     /*
1457      * This is always a right shift, and the shiftfn is always a
1458      * left-shift helper, which thus needs the negated shift count.
1459      */
1460     constimm = tcg_constant_i64(-a->shift);
1461     rm1 = tcg_temp_new_i64();
1462     rm2 = tcg_temp_new_i64();
1463     rd = tcg_temp_new_i32();
1464 
1465     /* Load both inputs first to avoid potential overwrite if rm == rd */
1466     read_neon_element64(rm1, a->vm, 0, MO_64);
1467     read_neon_element64(rm2, a->vm, 1, MO_64);
1468 
1469     shiftfn(rm1, rm1, constimm);
1470     narrowfn(rd, cpu_env, rm1);
1471     write_neon_element32(rd, a->vd, 0, MO_32);
1472 
1473     shiftfn(rm2, rm2, constimm);
1474     narrowfn(rd, cpu_env, rm2);
1475     write_neon_element32(rd, a->vd, 1, MO_32);
1476 
1477     tcg_temp_free_i32(rd);
1478     tcg_temp_free_i64(rm1);
1479     tcg_temp_free_i64(rm2);
1480 
1481     return true;
1482 }
1483 
1484 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1485                                 NeonGenTwoOpFn *shiftfn,
1486                                 NeonGenNarrowEnvFn *narrowfn)
1487 {
1488     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1489     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1490     TCGv_i64 rtmp;
1491     uint32_t imm;
1492 
1493     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1494         return false;
1495     }
1496 
1497     /* UNDEF accesses to D16-D31 if they don't exist. */
1498     if (!dc_isar_feature(aa32_simd_r32, s) &&
1499         ((a->vd | a->vm) & 0x10)) {
1500         return false;
1501     }
1502 
1503     if (a->vm & 1) {
1504         return false;
1505     }
1506 
1507     if (!vfp_access_check(s)) {
1508         return true;
1509     }
1510 
1511     /*
1512      * This is always a right shift, and the shiftfn is always a
1513      * left-shift helper, which thus needs the negated shift count
1514      * duplicated into each lane of the immediate value.
1515      */
1516     if (a->size == 1) {
1517         imm = (uint16_t)(-a->shift);
1518         imm |= imm << 16;
1519     } else {
1520         /* size == 2 */
1521         imm = -a->shift;
1522     }
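         /*
          * For example, a 16-bit-element narrowing shift by 5 (size == 1)
          * yields imm == 0xfffbfffb: -5 as a uint16_t is 0xfffb,
          * replicated into both halves of the 32-bit constant.
          */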
1523     constimm = tcg_constant_i32(imm);
1524 
1525     /* Load all inputs first to avoid potential overwrite */
1526     rm1 = tcg_temp_new_i32();
1527     rm2 = tcg_temp_new_i32();
1528     rm3 = tcg_temp_new_i32();
1529     rm4 = tcg_temp_new_i32();
1530     read_neon_element32(rm1, a->vm, 0, MO_32);
1531     read_neon_element32(rm2, a->vm, 1, MO_32);
1532     read_neon_element32(rm3, a->vm, 2, MO_32);
1533     read_neon_element32(rm4, a->vm, 3, MO_32);
1534     rtmp = tcg_temp_new_i64();
1535 
1536     shiftfn(rm1, rm1, constimm);
1537     shiftfn(rm2, rm2, constimm);
1538 
1539     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1540     tcg_temp_free_i32(rm2);
1541 
1542     narrowfn(rm1, cpu_env, rtmp);
1543     write_neon_element32(rm1, a->vd, 0, MO_32);
1544     tcg_temp_free_i32(rm1);
1545 
1546     shiftfn(rm3, rm3, constimm);
1547     shiftfn(rm4, rm4, constimm);
1548 
1549     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1550     tcg_temp_free_i32(rm4);
1551 
1552     narrowfn(rm3, cpu_env, rtmp);
1553     tcg_temp_free_i64(rtmp);
1554     write_neon_element32(rm3, a->vd, 1, MO_32);
1555     tcg_temp_free_i32(rm3);
1556     return true;
1557 }
1558 
1559 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1560     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1561     {                                                                   \
1562         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1563     }
1564 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1565     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1566     {                                                                   \
1567         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1568     }
1569 
1570 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1571 {
1572     tcg_gen_extrl_i64_i32(dest, src);
1573 }
1574 
1575 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1576 {
1577     gen_helper_neon_narrow_u16(dest, src);
1578 }
1579 
1580 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1581 {
1582     gen_helper_neon_narrow_u8(dest, src);
1583 }
1584 
1585 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1586 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1587 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1588 
1589 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1590 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1591 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1592 
1593 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1594 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1595 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1596 
1597 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1598 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1599 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1600 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1601 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1602 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1603 
1604 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1605 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1606 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1607 
1608 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1609 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1610 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1611 
1612 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1613 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1614 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1615 
1616 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1617                          NeonGenWidenFn *widenfn, bool u)
1618 {
1619     TCGv_i64 tmp;
1620     TCGv_i32 rm0, rm1;
1621     uint64_t widen_mask = 0;
1622 
1623     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1624         return false;
1625     }
1626 
1627     /* UNDEF accesses to D16-D31 if they don't exist. */
1628     if (!dc_isar_feature(aa32_simd_r32, s) &&
1629         ((a->vd | a->vm) & 0x10)) {
1630         return false;
1631     }
1632 
1633     if (a->vd & 1) {
1634         return false;
1635     }
1636 
1637     if (!vfp_access_check(s)) {
1638         return true;
1639     }
1640 
1641     /*
1642      * This is a widen-and-shift operation. The shift is always less
1643      * than the width of the source type, so after widening the input
1644      * vector we can simply shift the whole 64-bit widened register,
1645      * and then clear the potential overflow bits resulting from left
1646      * bits of the narrow input appearing as right bits of the left
1647      * neighbour narrow input. Calculate a mask of bits to clear.
1648      */
1649     if ((a->shift != 0) && (a->size < 2 || u)) {
1650         int esize = 8 << a->size;
1651         widen_mask = MAKE_64BIT_MASK(0, esize);
1652         widen_mask >>= esize - a->shift;
1653         widen_mask = dup_const(a->size + 1, widen_mask);
1654     }
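         /*
          * Worked example: VSHLL.S8 with shift == 3 has esize == 8, so
          * widen_mask == 0xff >> 5 == 0x07, replicated across the 16-bit
          * lanes as 0x0007000700070007. Clearing those bits after the left
          * shift removes the sign-extension bits of each element that
          * spilled into the bottom of its left neighbour.
          */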
1655 
1656     rm0 = tcg_temp_new_i32();
1657     rm1 = tcg_temp_new_i32();
1658     read_neon_element32(rm0, a->vm, 0, MO_32);
1659     read_neon_element32(rm1, a->vm, 1, MO_32);
1660     tmp = tcg_temp_new_i64();
1661 
1662     widenfn(tmp, rm0);
1663     tcg_temp_free_i32(rm0);
1664     if (a->shift != 0) {
1665         tcg_gen_shli_i64(tmp, tmp, a->shift);
1666         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1667     }
1668     write_neon_element64(tmp, a->vd, 0, MO_64);
1669 
1670     widenfn(tmp, rm1);
1671     tcg_temp_free_i32(rm1);
1672     if (a->shift != 0) {
1673         tcg_gen_shli_i64(tmp, tmp, a->shift);
1674         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1675     }
1676     write_neon_element64(tmp, a->vd, 1, MO_64);
1677     tcg_temp_free_i64(tmp);
1678     return true;
1679 }
1680 
1681 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1682 {
1683     static NeonGenWidenFn * const widenfn[] = {
1684         gen_helper_neon_widen_s8,
1685         gen_helper_neon_widen_s16,
1686         tcg_gen_ext_i32_i64,
1687     };
1688     return do_vshll_2sh(s, a, widenfn[a->size], false);
1689 }
1690 
1691 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1692 {
1693     static NeonGenWidenFn * const widenfn[] = {
1694         gen_helper_neon_widen_u8,
1695         gen_helper_neon_widen_u16,
1696         tcg_gen_extu_i32_i64,
1697     };
1698     return do_vshll_2sh(s, a, widenfn[a->size], true);
1699 }
1700 
1701 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1702                       gen_helper_gvec_2_ptr *fn)
1703 {
1704     /* FP operations in 2-reg-and-shift group */
1705     int vec_size = a->q ? 16 : 8;
1706     int rd_ofs = neon_full_reg_offset(a->vd);
1707     int rm_ofs = neon_full_reg_offset(a->vm);
1708     TCGv_ptr fpst;
1709 
1710     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1711         return false;
1712     }
1713 
1714     if (a->size == MO_16) {
1715         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1716             return false;
1717         }
1718     }
1719 
1720     /* UNDEF accesses to D16-D31 if they don't exist. */
1721     if (!dc_isar_feature(aa32_simd_r32, s) &&
1722         ((a->vd | a->vm) & 0x10)) {
1723         return false;
1724     }
1725 
1726     if ((a->vm | a->vd) & a->q) {
1727         return false;
1728     }
1729 
1730     if (!vfp_access_check(s)) {
1731         return true;
1732     }
1733 
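         /*
          * The shift immediate here is the number of fraction bits of the
          * fixed-point operand: e.g. VCVT.S32.F32 with #16 converts to and
          * from a 16.16 fixed-point representation.
          */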
1734     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1735     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1736     tcg_temp_free_ptr(fpst);
1737     return true;
1738 }
1739 
1740 #define DO_FP_2SH(INSN, FUNC)                                           \
1741     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1742     {                                                                   \
1743         return do_fp_2sh(s, a, FUNC);                                   \
1744     }
1745 
1746 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1747 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1748 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1749 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1750 
1751 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1752 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1753 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1754 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1755 
1756 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1757                         GVecGen2iFn *fn)
1758 {
1759     uint64_t imm;
1760     int reg_ofs, vec_size;
1761 
1762     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1763         return false;
1764     }
1765 
1766     /* UNDEF accesses to D16-D31 if they don't exist. */
1767     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1768         return false;
1769     }
1770 
1771     if (a->vd & a->q) {
1772         return false;
1773     }
1774 
1775     if (!vfp_access_check(s)) {
1776         return true;
1777     }
1778 
1779     reg_ofs = neon_full_reg_offset(a->vd);
1780     vec_size = a->q ? 16 : 8;
1781     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1782 
1783     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1784     return true;
1785 }
1786 
1787 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1788                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1789 {
1790     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1791 }
1792 
1793 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1794 {
1795     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1796     GVecGen2iFn *fn;
1797 
1798     if ((a->cmode & 1) && a->cmode < 12) {
1799         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1800         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1801     } else {
1802         /* There is one unallocated cmode/op combination in this space */
1803         if (a->cmode == 15 && a->op == 1) {
1804             return false;
1805         }
1806         fn = gen_VMOV_1r;
1807     }
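         /*
          * e.g. VBIC.I32 (odd cmode < 12, op == 1): asimd_imm_const
          * returns the replicated immediate already inverted, so the
          * operation becomes a plain vector AND with that constant.
          */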
1808     return do_1reg_imm(s, a, fn);
1809 }
1810 
1811 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1812                            NeonGenWidenFn *widenfn,
1813                            NeonGenTwo64OpFn *opfn,
1814                            int src1_mop, int src2_mop)
1815 {
1816     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1817     TCGv_i64 rn0_64, rn1_64, rm_64;
1818 
1819     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1820         return false;
1821     }
1822 
1823     /* UNDEF accesses to D16-D31 if they don't exist. */
1824     if (!dc_isar_feature(aa32_simd_r32, s) &&
1825         ((a->vd | a->vn | a->vm) & 0x10)) {
1826         return false;
1827     }
1828 
1829     if (!opfn) {
1830         /* size == 3 case, which is an entirely different insn group */
1831         return false;
1832     }
1833 
1834     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1835         return false;
1836     }
1837 
1838     if (!vfp_access_check(s)) {
1839         return true;
1840     }
1841 
1842     rn0_64 = tcg_temp_new_i64();
1843     rn1_64 = tcg_temp_new_i64();
1844     rm_64 = tcg_temp_new_i64();
1845 
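         /*
          * A non-negative src1_mop/src2_mop means the operand is read
          * directly at 64 bits (either already wide, or widened for free
          * by a sign/zero-extending element load); -1 means read it as
          * 32 bits and call widenfn explicitly.
          */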
1846     if (src1_mop >= 0) {
1847         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1848     } else {
1849         TCGv_i32 tmp = tcg_temp_new_i32();
1850         read_neon_element32(tmp, a->vn, 0, MO_32);
1851         widenfn(rn0_64, tmp);
1852         tcg_temp_free_i32(tmp);
1853     }
1854     if (src2_mop >= 0) {
1855         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1856     } else {
1857         TCGv_i32 tmp = tcg_temp_new_i32();
1858         read_neon_element32(tmp, a->vm, 0, MO_32);
1859         widenfn(rm_64, tmp);
1860         tcg_temp_free_i32(tmp);
1861     }
1862 
1863     opfn(rn0_64, rn0_64, rm_64);
1864 
1865     /*
1866      * Load second pass inputs before storing the first pass result, to
1867      * avoid incorrect results if a narrow input overlaps with the result.
1868      */
1869     if (src1_mop >= 0) {
1870         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1871     } else {
1872         TCGv_i32 tmp = tcg_temp_new_i32();
1873         read_neon_element32(tmp, a->vn, 1, MO_32);
1874         widenfn(rn1_64, tmp);
1875         tcg_temp_free_i32(tmp);
1876     }
1877     if (src2_mop >= 0) {
1878         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1879     } else {
1880         TCGv_i32 tmp = tcg_temp_new_i32();
1881         read_neon_element32(tmp, a->vm, 1, MO_32);
1882         widenfn(rm_64, tmp);
1883         tcg_temp_free_i32(tmp);
1884     }
1885 
1886     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1887 
1888     opfn(rn1_64, rn1_64, rm_64);
1889     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1890 
1891     tcg_temp_free_i64(rn0_64);
1892     tcg_temp_free_i64(rn1_64);
1893     tcg_temp_free_i64(rm_64);
1894 
1895     return true;
1896 }
1897 
1898 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1899     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1900     {                                                                   \
1901         static NeonGenWidenFn * const widenfn[] = {                     \
1902             gen_helper_neon_widen_##S##8,                               \
1903             gen_helper_neon_widen_##S##16,                              \
1904             NULL, NULL,                                                 \
1905         };                                                              \
1906         static NeonGenTwo64OpFn * const addfn[] = {                     \
1907             gen_helper_neon_##OP##l_u16,                                \
1908             gen_helper_neon_##OP##l_u32,                                \
1909             tcg_gen_##OP##_i64,                                         \
1910             NULL,                                                       \
1911         };                                                              \
1912         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1913         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1914                               SRC1WIDE ? MO_UQ : narrow_mop,             \
1915                               narrow_mop);                              \
1916     }
1917 
1918 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1919 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1920 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1921 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1922 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1923 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1924 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1925 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1926 
1927 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1928                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1929 {
1930     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1931     TCGv_i64 rn_64, rm_64;
1932     TCGv_i32 rd0, rd1;
1933 
1934     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1935         return false;
1936     }
1937 
1938     /* UNDEF accesses to D16-D31 if they don't exist. */
1939     if (!dc_isar_feature(aa32_simd_r32, s) &&
1940         ((a->vd | a->vn | a->vm) & 0x10)) {
1941         return false;
1942     }
1943 
1944     if (!opfn || !narrowfn) {
1945         /* size == 3 case, which is an entirely different insn group */
1946         return false;
1947     }
1948 
1949     if ((a->vn | a->vm) & 1) {
1950         return false;
1951     }
1952 
1953     if (!vfp_access_check(s)) {
1954         return true;
1955     }
1956 
1957     rn_64 = tcg_temp_new_i64();
1958     rm_64 = tcg_temp_new_i64();
1959     rd0 = tcg_temp_new_i32();
1960     rd1 = tcg_temp_new_i32();
1961 
1962     read_neon_element64(rn_64, a->vn, 0, MO_64);
1963     read_neon_element64(rm_64, a->vm, 0, MO_64);
1964 
1965     opfn(rn_64, rn_64, rm_64);
1966 
1967     narrowfn(rd0, rn_64);
1968 
1969     read_neon_element64(rn_64, a->vn, 1, MO_64);
1970     read_neon_element64(rm_64, a->vm, 1, MO_64);
1971 
1972     opfn(rn_64, rn_64, rm_64);
1973 
1974     narrowfn(rd1, rn_64);
1975 
1976     write_neon_element32(rd0, a->vd, 0, MO_32);
1977     write_neon_element32(rd1, a->vd, 1, MO_32);
1978 
1979     tcg_temp_free_i32(rd0);
1980     tcg_temp_free_i32(rd1);
1981     tcg_temp_free_i64(rn_64);
1982     tcg_temp_free_i64(rm_64);
1983 
1984     return true;
1985 }
1986 
1987 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1988     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1989     {                                                                   \
1990         static NeonGenTwo64OpFn * const addfn[] = {                     \
1991             gen_helper_neon_##OP##l_u16,                                \
1992             gen_helper_neon_##OP##l_u32,                                \
1993             tcg_gen_##OP##_i64,                                         \
1994             NULL,                                                       \
1995         };                                                              \
1996         static NeonGenNarrowFn * const narrowfn[] = {                   \
1997             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1998             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1999             EXTOP,                                                      \
2000             NULL,                                                       \
2001         };                                                              \
2002         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2003     }
2004 
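     /*
      * Rounded "take the high half" narrowing: adding 2^31, half the
      * weight of bit 32, before extracting the top 32 bits implements
      * round-half-up of the discarded low half.
      */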
2005 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2006 {
2007     tcg_gen_addi_i64(rn, rn, 1u << 31);
2008     tcg_gen_extrh_i64_i32(rd, rn);
2009 }
2010 
2011 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2012 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2013 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2014 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2015 
2016 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2017                        NeonGenTwoOpWidenFn *opfn,
2018                        NeonGenTwo64OpFn *accfn)
2019 {
2020     /*
2021      * 3-regs different lengths, long operations.
2022      * These perform an operation on two inputs that returns a double-width
2023      * result, and then possibly perform an accumulation operation of
2024      * that result into the double-width destination.
2025      */
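         /*
          * For example, VMLAL.S16 multiplies pairs of 16-bit elements into
          * 32-bit products (the opfn) and adds each product to the
          * corresponding 32-bit destination element (the accfn).
          */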
2026     TCGv_i64 rd0, rd1, tmp;
2027     TCGv_i32 rn, rm;
2028 
2029     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2030         return false;
2031     }
2032 
2033     /* UNDEF accesses to D16-D31 if they don't exist. */
2034     if (!dc_isar_feature(aa32_simd_r32, s) &&
2035         ((a->vd | a->vn | a->vm) & 0x10)) {
2036         return false;
2037     }
2038 
2039     if (!opfn) {
2040         /* size == 3 case, which is an entirely different insn group */
2041         return false;
2042     }
2043 
2044     if (a->vd & 1) {
2045         return false;
2046     }
2047 
2048     if (!vfp_access_check(s)) {
2049         return true;
2050     }
2051 
2052     rd0 = tcg_temp_new_i64();
2053     rd1 = tcg_temp_new_i64();
2054 
2055     rn = tcg_temp_new_i32();
2056     rm = tcg_temp_new_i32();
2057     read_neon_element32(rn, a->vn, 0, MO_32);
2058     read_neon_element32(rm, a->vm, 0, MO_32);
2059     opfn(rd0, rn, rm);
2060 
2061     read_neon_element32(rn, a->vn, 1, MO_32);
2062     read_neon_element32(rm, a->vm, 1, MO_32);
2063     opfn(rd1, rn, rm);
2064     tcg_temp_free_i32(rn);
2065     tcg_temp_free_i32(rm);
2066 
2067     /* Don't store results until after all loads: they might overlap */
2068     if (accfn) {
2069         tmp = tcg_temp_new_i64();
2070         read_neon_element64(tmp, a->vd, 0, MO_64);
2071         accfn(rd0, tmp, rd0);
2072         read_neon_element64(tmp, a->vd, 1, MO_64);
2073         accfn(rd1, tmp, rd1);
2074         tcg_temp_free_i64(tmp);
2075     }
2076 
2077     write_neon_element64(rd0, a->vd, 0, MO_64);
2078     write_neon_element64(rd1, a->vd, 1, MO_64);
2079     tcg_temp_free_i64(rd0);
2080     tcg_temp_free_i64(rd1);
2081 
2082     return true;
2083 }
2084 
2085 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2086 {
2087     static NeonGenTwoOpWidenFn * const opfn[] = {
2088         gen_helper_neon_abdl_s16,
2089         gen_helper_neon_abdl_s32,
2090         gen_helper_neon_abdl_s64,
2091         NULL,
2092     };
2093 
2094     return do_long_3d(s, a, opfn[a->size], NULL);
2095 }
2096 
2097 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2098 {
2099     static NeonGenTwoOpWidenFn * const opfn[] = {
2100         gen_helper_neon_abdl_u16,
2101         gen_helper_neon_abdl_u32,
2102         gen_helper_neon_abdl_u64,
2103         NULL,
2104     };
2105 
2106     return do_long_3d(s, a, opfn[a->size], NULL);
2107 }
2108 
2109 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2110 {
2111     static NeonGenTwoOpWidenFn * const opfn[] = {
2112         gen_helper_neon_abdl_s16,
2113         gen_helper_neon_abdl_s32,
2114         gen_helper_neon_abdl_s64,
2115         NULL,
2116     };
2117     static NeonGenTwo64OpFn * const addfn[] = {
2118         gen_helper_neon_addl_u16,
2119         gen_helper_neon_addl_u32,
2120         tcg_gen_add_i64,
2121         NULL,
2122     };
2123 
2124     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2125 }
2126 
2127 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2128 {
2129     static NeonGenTwoOpWidenFn * const opfn[] = {
2130         gen_helper_neon_abdl_u16,
2131         gen_helper_neon_abdl_u32,
2132         gen_helper_neon_abdl_u64,
2133         NULL,
2134     };
2135     static NeonGenTwo64OpFn * const addfn[] = {
2136         gen_helper_neon_addl_u16,
2137         gen_helper_neon_addl_u32,
2138         tcg_gen_add_i64,
2139         NULL,
2140     };
2141 
2142     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2143 }
2144 
2145 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2146 {
2147     TCGv_i32 lo = tcg_temp_new_i32();
2148     TCGv_i32 hi = tcg_temp_new_i32();
2149 
2150     tcg_gen_muls2_i32(lo, hi, rn, rm);
2151     tcg_gen_concat_i32_i64(rd, lo, hi);
2152 
2153     tcg_temp_free_i32(lo);
2154     tcg_temp_free_i32(hi);
2155 }
2156 
2157 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2158 {
2159     TCGv_i32 lo = tcg_temp_new_i32();
2160     TCGv_i32 hi = tcg_temp_new_i32();
2161 
2162     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2163     tcg_gen_concat_i32_i64(rd, lo, hi);
2164 
2165     tcg_temp_free_i32(lo);
2166     tcg_temp_free_i32(hi);
2167 }
2168 
2169 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2170 {
2171     static NeonGenTwoOpWidenFn * const opfn[] = {
2172         gen_helper_neon_mull_s8,
2173         gen_helper_neon_mull_s16,
2174         gen_mull_s32,
2175         NULL,
2176     };
2177 
2178     return do_long_3d(s, a, opfn[a->size], NULL);
2179 }
2180 
2181 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2182 {
2183     static NeonGenTwoOpWidenFn * const opfn[] = {
2184         gen_helper_neon_mull_u8,
2185         gen_helper_neon_mull_u16,
2186         gen_mull_u32,
2187         NULL,
2188     };
2189 
2190     return do_long_3d(s, a, opfn[a->size], NULL);
2191 }
2192 
2193 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2194     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2195     {                                                                   \
2196         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2197             gen_helper_neon_##MULL##8,                                  \
2198             gen_helper_neon_##MULL##16,                                 \
2199             gen_##MULL##32,                                             \
2200             NULL,                                                       \
2201         };                                                              \
2202         static NeonGenTwo64OpFn * const accfn[] = {                     \
2203             gen_helper_neon_##ACC##l_u16,                               \
2204             gen_helper_neon_##ACC##l_u32,                               \
2205             tcg_gen_##ACC##_i64,                                        \
2206             NULL,                                                       \
2207         };                                                              \
2208         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2209     }
2210 
2211 DO_VMLAL(VMLAL_S,mull_s,add)
2212 DO_VMLAL(VMLAL_U,mull_u,add)
2213 DO_VMLAL(VMLSL_S,mull_s,sub)
2214 DO_VMLAL(VMLSL_U,mull_u,sub)
2215 
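     /*
      * VQDMULL is a widening multiply whose result is then doubled with
      * saturation. Doing the doubling as a saturating add of the product
      * to itself catches the one overflow case, INT16_MIN * INT16_MIN
      * (and likewise INT32_MIN for the 32-bit variant).
      */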
2216 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2217 {
2218     gen_helper_neon_mull_s16(rd, rn, rm);
2219     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2220 }
2221 
2222 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2223 {
2224     gen_mull_s32(rd, rn, rm);
2225     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2226 }
2227 
2228 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2229 {
2230     static NeonGenTwoOpWidenFn * const opfn[] = {
2231         NULL,
2232         gen_VQDMULL_16,
2233         gen_VQDMULL_32,
2234         NULL,
2235     };
2236 
2237     return do_long_3d(s, a, opfn[a->size], NULL);
2238 }
2239 
2240 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2241 {
2242     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2243 }
2244 
2245 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2246 {
2247     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2248 }
2249 
2250 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2251 {
2252     static NeonGenTwoOpWidenFn * const opfn[] = {
2253         NULL,
2254         gen_VQDMULL_16,
2255         gen_VQDMULL_32,
2256         NULL,
2257     };
2258     static NeonGenTwo64OpFn * const accfn[] = {
2259         NULL,
2260         gen_VQDMLAL_acc_16,
2261         gen_VQDMLAL_acc_32,
2262         NULL,
2263     };
2264 
2265     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2266 }
2267 
2268 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2269 {
2270     gen_helper_neon_negl_u32(rm, rm);
2271     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2272 }
2273 
2274 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2275 {
2276     tcg_gen_neg_i64(rm, rm);
2277     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2278 }
2279 
2280 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2281 {
2282     static NeonGenTwoOpWidenFn * const opfn[] = {
2283         NULL,
2284         gen_VQDMULL_16,
2285         gen_VQDMULL_32,
2286         NULL,
2287     };
2288     static NeonGenTwo64OpFn * const accfn[] = {
2289         NULL,
2290         gen_VQDMLSL_acc_16,
2291         gen_VQDMLSL_acc_32,
2292         NULL,
2293     };
2294 
2295     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2296 }
2297 
2298 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2299 {
2300     gen_helper_gvec_3 *fn_gvec;
2301 
2302     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2303         return false;
2304     }
2305 
2306     /* UNDEF accesses to D16-D31 if they don't exist. */
2307     if (!dc_isar_feature(aa32_simd_r32, s) &&
2308         ((a->vd | a->vn | a->vm) & 0x10)) {
2309         return false;
2310     }
2311 
2312     if (a->vd & 1) {
2313         return false;
2314     }
2315 
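         /*
          * size 0 is the baseline VMULL.P8; size 2 is VMULL.P64, which
          * requires the PMULL extension; sizes 1 and 3 have no polynomial
          * multiply and UNDEF.
          */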
2316     switch (a->size) {
2317     case 0:
2318         fn_gvec = gen_helper_neon_pmull_h;
2319         break;
2320     case 2:
2321         if (!dc_isar_feature(aa32_pmull, s)) {
2322             return false;
2323         }
2324         fn_gvec = gen_helper_gvec_pmull_q;
2325         break;
2326     default:
2327         return false;
2328     }
2329 
2330     if (!vfp_access_check(s)) {
2331         return true;
2332     }
2333 
2334     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2335                        neon_full_reg_offset(a->vn),
2336                        neon_full_reg_offset(a->vm),
2337                        16, 16, 0, fn_gvec);
2338     return true;
2339 }
2340 
2341 static void gen_neon_dup_low16(TCGv_i32 var)
2342 {
2343     TCGv_i32 tmp = tcg_temp_new_i32();
2344     tcg_gen_ext16u_i32(var, var);
2345     tcg_gen_shli_i32(tmp, var, 16);
2346     tcg_gen_or_i32(var, var, tmp);
2347     tcg_temp_free_i32(tmp);
2348 }
2349 
2350 static void gen_neon_dup_high16(TCGv_i32 var)
2351 {
2352     TCGv_i32 tmp = tcg_temp_new_i32();
2353     tcg_gen_andi_i32(var, var, 0xffff0000);
2354     tcg_gen_shri_i32(tmp, var, 16);
2355     tcg_gen_or_i32(var, var, tmp);
2356     tcg_temp_free_i32(tmp);
2357 }
2358 
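     /*
      * The reg argument is the raw M:Vm field, encoding both the scalar's
      * register and its index: for 32-bit scalars Dm is Vm[3:0] and the
      * index is M; for 16-bit scalars Dm is Vm[2:0] and the index is
      * M:Vm[3], with the selected half then duplicated into both halves
      * of the returned 32-bit value.
      */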
2359 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2360 {
2361     TCGv_i32 tmp = tcg_temp_new_i32();
2362     if (size == MO_16) {
2363         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2364         if (reg & 8) {
2365             gen_neon_dup_high16(tmp);
2366         } else {
2367             gen_neon_dup_low16(tmp);
2368         }
2369     } else {
2370         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2371     }
2372     return tmp;
2373 }
2374 
2375 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2376                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2377 {
2378     /*
2379      * Two registers and a scalar: perform an operation between
2380      * the input elements and the scalar, and then possibly
2381      * perform an accumulation operation of that result into the
2382      * destination.
2383      */
2384     TCGv_i32 scalar, tmp;
2385     int pass;
2386 
2387     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2388         return false;
2389     }
2390 
2391     /* UNDEF accesses to D16-D31 if they don't exist. */
2392     if (!dc_isar_feature(aa32_simd_r32, s) &&
2393         ((a->vd | a->vn | a->vm) & 0x10)) {
2394         return false;
2395     }
2396 
2397     if (!opfn) {
2398         /* Bad size (including size == 3, which is a different insn group) */
2399         return false;
2400     }
2401 
2402     if (a->q && ((a->vd | a->vn) & 1)) {
2403         return false;
2404     }
2405 
2406     if (!vfp_access_check(s)) {
2407         return true;
2408     }
2409 
2410     scalar = neon_get_scalar(a->size, a->vm);
2411     tmp = tcg_temp_new_i32();
2412 
2413     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2414         read_neon_element32(tmp, a->vn, pass, MO_32);
2415         opfn(tmp, tmp, scalar);
2416         if (accfn) {
2417             TCGv_i32 rd = tcg_temp_new_i32();
2418             read_neon_element32(rd, a->vd, pass, MO_32);
2419             accfn(tmp, rd, tmp);
2420             tcg_temp_free_i32(rd);
2421         }
2422         write_neon_element32(tmp, a->vd, pass, MO_32);
2423     }
2424     tcg_temp_free_i32(tmp);
2425     tcg_temp_free_i32(scalar);
2426     return true;
2427 }
2428 
2429 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2430 {
2431     static NeonGenTwoOpFn * const opfn[] = {
2432         NULL,
2433         gen_helper_neon_mul_u16,
2434         tcg_gen_mul_i32,
2435         NULL,
2436     };
2437 
2438     return do_2scalar(s, a, opfn[a->size], NULL);
2439 }
2440 
2441 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2442 {
2443     static NeonGenTwoOpFn * const opfn[] = {
2444         NULL,
2445         gen_helper_neon_mul_u16,
2446         tcg_gen_mul_i32,
2447         NULL,
2448     };
2449     static NeonGenTwoOpFn * const accfn[] = {
2450         NULL,
2451         gen_helper_neon_add_u16,
2452         tcg_gen_add_i32,
2453         NULL,
2454     };
2455 
2456     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2457 }
2458 
2459 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2460 {
2461     static NeonGenTwoOpFn * const opfn[] = {
2462         NULL,
2463         gen_helper_neon_mul_u16,
2464         tcg_gen_mul_i32,
2465         NULL,
2466     };
2467     static NeonGenTwoOpFn * const accfn[] = {
2468         NULL,
2469         gen_helper_neon_sub_u16,
2470         tcg_gen_sub_i32,
2471         NULL,
2472     };
2473 
2474     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2475 }
2476 
2477 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2478                               gen_helper_gvec_3_ptr *fn)
2479 {
2480     /* Two registers and a scalar, using gvec */
2481     int vec_size = a->q ? 16 : 8;
2482     int rd_ofs = neon_full_reg_offset(a->vd);
2483     int rn_ofs = neon_full_reg_offset(a->vn);
2484     int rm_ofs;
2485     int idx;
2486     TCGv_ptr fpstatus;
2487 
2488     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2489         return false;
2490     }
2491 
2492     /* UNDEF accesses to D16-D31 if they don't exist. */
2493     if (!dc_isar_feature(aa32_simd_r32, s) &&
2494         ((a->vd | a->vn | a->vm) & 0x10)) {
2495         return false;
2496     }
2497 
2498     if (!fn) {
2499         /* Bad size (including size == 3, which is a different insn group) */
2500         return false;
2501     }
2502 
2503     if (a->q && ((a->vd | a->vn) & 1)) {
2504         return false;
2505     }
2506 
2507     if (!vfp_access_check(s)) {
2508         return true;
2509     }
2510 
2511     /* a->vm is M:Vm, which encodes both register and index */
2512     idx = extract32(a->vm, a->size + 2, 2);
2513     a->vm = extract32(a->vm, 0, a->size + 2);
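         /*
          * e.g. for size == MO_32 this splits the field into idx == vm[4]
          * and Dm == vm[3:0]; for MO_16, idx == vm[4:3] and Dm == vm[2:0].
          */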
2514     rm_ofs = neon_full_reg_offset(a->vm);
2515 
2516     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2517     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2518                        vec_size, vec_size, idx, fn);
2519     tcg_temp_free_ptr(fpstatus);
2520     return true;
2521 }
2522 
2523 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2524     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2525     {                                                                   \
2526         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2527             NULL,                                                       \
2528             gen_helper_##FUNC##_h,                                      \
2529             gen_helper_##FUNC##_s,                                      \
2530             NULL,                                                       \
2531         };                                                              \
2532         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2533             return false;                                               \
2534         }                                                               \
2535         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2536     }
2537 
2538 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2539 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2540 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2541 
2542 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2543 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2544 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2545 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2546 
2547 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2548 {
2549     static NeonGenTwoOpFn * const opfn[] = {
2550         NULL,
2551         gen_VQDMULH_16,
2552         gen_VQDMULH_32,
2553         NULL,
2554     };
2555 
2556     return do_2scalar(s, a, opfn[a->size], NULL);
2557 }
2558 
2559 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2560 {
2561     static NeonGenTwoOpFn * const opfn[] = {
2562         NULL,
2563         gen_VQRDMULH_16,
2564         gen_VQRDMULH_32,
2565         NULL,
2566     };
2567 
2568     return do_2scalar(s, a, opfn[a->size], NULL);
2569 }
2570 
2571 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2572                             NeonGenThreeOpEnvFn *opfn)
2573 {
2574     /*
2575      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2576      * performs a kind of fused op-then-accumulate using a helper
2577      * function that takes all of rd, rn and the scalar at once.
2578      */
2579     TCGv_i32 scalar, rn, rd;
2580     int pass;
2581 
2582     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2583         return false;
2584     }
2585 
2586     if (!dc_isar_feature(aa32_rdm, s)) {
2587         return false;
2588     }
2589 
2590     /* UNDEF accesses to D16-D31 if they don't exist. */
2591     if (!dc_isar_feature(aa32_simd_r32, s) &&
2592         ((a->vd | a->vn | a->vm) & 0x10)) {
2593         return false;
2594     }
2595 
2596     if (!opfn) {
2597         /* Bad size (including size == 3, which is a different insn group) */
2598         return false;
2599     }
2600 
2601     if (a->q && ((a->vd | a->vn) & 1)) {
2602         return false;
2603     }
2604 
2605     if (!vfp_access_check(s)) {
2606         return true;
2607     }
2608 
2609     scalar = neon_get_scalar(a->size, a->vm);
2610     rn = tcg_temp_new_i32();
2611     rd = tcg_temp_new_i32();
2612 
2613     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2614         read_neon_element32(rn, a->vn, pass, MO_32);
2615         read_neon_element32(rd, a->vd, pass, MO_32);
2616         opfn(rd, cpu_env, rn, scalar, rd);
2617         write_neon_element32(rd, a->vd, pass, MO_32);
2618     }
2619     tcg_temp_free_i32(rn);
2620     tcg_temp_free_i32(rd);
2621     tcg_temp_free_i32(scalar);
2622 
2623     return true;
2624 }
2625 
2626 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2627 {
2628     static NeonGenThreeOpEnvFn * const opfn[] = {
2629         NULL,
2630         gen_helper_neon_qrdmlah_s16,
2631         gen_helper_neon_qrdmlah_s32,
2632         NULL,
2633     };
2634     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2635 }
2636 
2637 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2638 {
2639     static NeonGenThreeOpEnvFn * const opfn[] = {
2640         NULL,
2641         gen_helper_neon_qrdmlsh_s16,
2642         gen_helper_neon_qrdmlsh_s32,
2643         NULL,
2644     };
2645     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2646 }
2647 
2648 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2649                             NeonGenTwoOpWidenFn *opfn,
2650                             NeonGenTwo64OpFn *accfn)
2651 {
2652     /*
2653      * Two registers and a scalar, long operations: perform an
2654      * operation on the input elements and the scalar which produces
2655      * a double-width result, and then possibly perform an accumulation
2656      * operation of that result into the destination.
2657      */
2658     TCGv_i32 scalar, rn;
2659     TCGv_i64 rn0_64, rn1_64;
2660 
2661     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2662         return false;
2663     }
2664 
2665     /* UNDEF accesses to D16-D31 if they don't exist. */
2666     if (!dc_isar_feature(aa32_simd_r32, s) &&
2667         ((a->vd | a->vn | a->vm) & 0x10)) {
2668         return false;
2669     }
2670 
2671     if (!opfn) {
2672         /* Bad size (including size == 3, which is a different insn group) */
2673         return false;
2674     }
2675 
2676     if (a->vd & 1) {
2677         return false;
2678     }
2679 
2680     if (!vfp_access_check(s)) {
2681         return true;
2682     }
2683 
2684     scalar = neon_get_scalar(a->size, a->vm);
2685 
2686     /* Load all inputs before writing any outputs, in case of overlap */
2687     rn = tcg_temp_new_i32();
2688     read_neon_element32(rn, a->vn, 0, MO_32);
2689     rn0_64 = tcg_temp_new_i64();
2690     opfn(rn0_64, rn, scalar);
2691 
2692     read_neon_element32(rn, a->vn, 1, MO_32);
2693     rn1_64 = tcg_temp_new_i64();
2694     opfn(rn1_64, rn, scalar);
2695     tcg_temp_free_i32(rn);
2696     tcg_temp_free_i32(scalar);
2697 
2698     if (accfn) {
2699         TCGv_i64 t64 = tcg_temp_new_i64();
2700         read_neon_element64(t64, a->vd, 0, MO_64);
2701         accfn(rn0_64, t64, rn0_64);
2702         read_neon_element64(t64, a->vd, 1, MO_64);
2703         accfn(rn1_64, t64, rn1_64);
2704         tcg_temp_free_i64(t64);
2705     }
2706 
2707     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2708     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2709     tcg_temp_free_i64(rn0_64);
2710     tcg_temp_free_i64(rn1_64);
2711     return true;
2712 }
2713 
2714 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2715 {
2716     static NeonGenTwoOpWidenFn * const opfn[] = {
2717         NULL,
2718         gen_helper_neon_mull_s16,
2719         gen_mull_s32,
2720         NULL,
2721     };
2722 
2723     return do_2scalar_long(s, a, opfn[a->size], NULL);
2724 }
2725 
2726 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2727 {
2728     static NeonGenTwoOpWidenFn * const opfn[] = {
2729         NULL,
2730         gen_helper_neon_mull_u16,
2731         gen_mull_u32,
2732         NULL,
2733     };
2734 
2735     return do_2scalar_long(s, a, opfn[a->size], NULL);
2736 }
2737 
2738 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2739     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2740     {                                                                   \
2741         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2742             NULL,                                                       \
2743             gen_helper_neon_##MULL##16,                                 \
2744             gen_##MULL##32,                                             \
2745             NULL,                                                       \
2746         };                                                              \
2747         static NeonGenTwo64OpFn * const accfn[] = {                     \
2748             NULL,                                                       \
2749             gen_helper_neon_##ACC##l_u32,                               \
2750             tcg_gen_##ACC##_i64,                                        \
2751             NULL,                                                       \
2752         };                                                              \
2753         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2754     }
2755 
2756 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2757 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2758 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2759 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2760 
2761 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2762 {
2763     static NeonGenTwoOpWidenFn * const opfn[] = {
2764         NULL,
2765         gen_VQDMULL_16,
2766         gen_VQDMULL_32,
2767         NULL,
2768     };
2769 
2770     return do_2scalar_long(s, a, opfn[a->size], NULL);
2771 }
2772 
2773 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2774 {
2775     static NeonGenTwoOpWidenFn * const opfn[] = {
2776         NULL,
2777         gen_VQDMULL_16,
2778         gen_VQDMULL_32,
2779         NULL,
2780     };
2781     static NeonGenTwo64OpFn * const accfn[] = {
2782         NULL,
2783         gen_VQDMLAL_acc_16,
2784         gen_VQDMLAL_acc_32,
2785         NULL,
2786     };
2787 
2788     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2789 }
2790 
2791 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2792 {
2793     static NeonGenTwoOpWidenFn * const opfn[] = {
2794         NULL,
2795         gen_VQDMULL_16,
2796         gen_VQDMULL_32,
2797         NULL,
2798     };
2799     static NeonGenTwo64OpFn * const accfn[] = {
2800         NULL,
2801         gen_VQDMLSL_acc_16,
2802         gen_VQDMLSL_acc_32,
2803         NULL,
2804     };
2805 
2806     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2807 }
2808 
2809 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2810 {
2811     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2812         return false;
2813     }
2814 
2815     /* UNDEF accesses to D16-D31 if they don't exist. */
2816     if (!dc_isar_feature(aa32_simd_r32, s) &&
2817         ((a->vd | a->vn | a->vm) & 0x10)) {
2818         return false;
2819     }
2820 
2821     if ((a->vn | a->vm | a->vd) & a->q) {
2822         return false;
2823     }
2824 
2825     if (a->imm > 7 && !a->q) {
2826         return false;
2827     }
2828 
2829     if (!vfp_access_check(s)) {
2830         return true;
2831     }
2832 
2833     if (!a->q) {
2834         /* Extract 64 bits from <Vm:Vn> */
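             /*
              * tcg_gen_extract2_i64(dest, lo, hi, ofs) returns bits
              * [ofs+63:ofs] of the 128-bit value hi:lo, so ofs == imm * 8
              * selects bytes imm..imm+7 of Vm:Vn, which is exactly the
              * VEXT byte extraction.
              */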
2835         TCGv_i64 left, right, dest;
2836 
2837         left = tcg_temp_new_i64();
2838         right = tcg_temp_new_i64();
2839         dest = tcg_temp_new_i64();
2840 
2841         read_neon_element64(right, a->vn, 0, MO_64);
2842         read_neon_element64(left, a->vm, 0, MO_64);
2843         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2844         write_neon_element64(dest, a->vd, 0, MO_64);
2845 
2846         tcg_temp_free_i64(left);
2847         tcg_temp_free_i64(right);
2848         tcg_temp_free_i64(dest);
2849     } else {
2850         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2851         TCGv_i64 left, middle, right, destleft, destright;
2852 
2853         left = tcg_temp_new_i64();
2854         middle = tcg_temp_new_i64();
2855         right = tcg_temp_new_i64();
2856         destleft = tcg_temp_new_i64();
2857         destright = tcg_temp_new_i64();
2858 
2859         if (a->imm < 8) {
2860             read_neon_element64(right, a->vn, 0, MO_64);
2861             read_neon_element64(middle, a->vn, 1, MO_64);
2862             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2863             read_neon_element64(left, a->vm, 0, MO_64);
2864             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2865         } else {
2866             read_neon_element64(right, a->vn, 1, MO_64);
2867             read_neon_element64(middle, a->vm, 0, MO_64);
2868             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2869             read_neon_element64(left, a->vm, 1, MO_64);
2870             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2871         }
2872 
2873         write_neon_element64(destright, a->vd, 0, MO_64);
2874         write_neon_element64(destleft, a->vd, 1, MO_64);
2875 
2876         tcg_temp_free_i64(destright);
2877         tcg_temp_free_i64(destleft);
2878         tcg_temp_free_i64(right);
2879         tcg_temp_free_i64(middle);
2880         tcg_temp_free_i64(left);
2881     }
2882     return true;
2883 }
2884 
2885 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2886 {
2887     TCGv_i64 val, def;
2888     TCGv_i32 desc;
2889 
2890     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2891         return false;
2892     }
2893 
2894     /* UNDEF accesses to D16-D31 if they don't exist. */
2895     if (!dc_isar_feature(aa32_simd_r32, s) &&
2896         ((a->vd | a->vn | a->vm) & 0x10)) {
2897         return false;
2898     }
2899 
2900     if ((a->vn + a->len + 1) > 32) {
2901         /*
2902          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2903          * helper function running off the end of the register file.
2904          */
2905         return false;
2906     }
2907 
2908     if (!vfp_access_check(s)) {
2909         return true;
2910     }
2911 
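         /*
          * Pack the immediate descriptor for the helper: the table base
          * register in bits [6:2] and the table length (number of
          * registers minus one) in bits [1:0].
          */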
2912     desc = tcg_constant_i32((a->vn << 2) | a->len);
2913     def = tcg_temp_new_i64();
2914     if (a->op) {
2915         read_neon_element64(def, a->vd, 0, MO_64);
2916     } else {
2917         tcg_gen_movi_i64(def, 0);
2918     }
2919     val = tcg_temp_new_i64();
2920     read_neon_element64(val, a->vm, 0, MO_64);
2921 
2922     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2923     write_neon_element64(val, a->vd, 0, MO_64);
2924 
2925     tcg_temp_free_i64(def);
2926     tcg_temp_free_i64(val);
2927     return true;
2928 }
2929 
2930 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2931 {
2932     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2933         return false;
2934     }
2935 
2936     /* UNDEF accesses to D16-D31 if they don't exist. */
2937     if (!dc_isar_feature(aa32_simd_r32, s) &&
2938         ((a->vd | a->vm) & 0x10)) {
2939         return false;
2940     }
2941 
2942     if (a->vd & a->q) {
2943         return false;
2944     }
2945 
2946     if (!vfp_access_check(s)) {
2947         return true;
2948     }
2949 
2950     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2951                          neon_element_offset(a->vm, a->index, a->size),
2952                          a->q ? 16 : 8, a->q ? 16 : 8);
2953     return true;
2954 }
2955 
2956 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2957 {
2958     int pass, half;
2959     TCGv_i32 tmp[2];
2960 
2961     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2962         return false;
2963     }
2964 
2965     /* UNDEF accesses to D16-D31 if they don't exist. */
2966     if (!dc_isar_feature(aa32_simd_r32, s) &&
2967         ((a->vd | a->vm) & 0x10)) {
2968         return false;
2969     }
2970 
2971     if ((a->vd | a->vm) & a->q) {
2972         return false;
2973     }
2974 
2975     if (a->size == 3) {
2976         return false;
2977     }
2978 
2979     if (!vfp_access_check(s)) {
2980         return true;
2981     }
2982 
2983     tmp[0] = tcg_temp_new_i32();
2984     tmp[1] = tcg_temp_new_i32();
2985 
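         /*
          * Reverse each 64-bit group: the two 32-bit words swap places via
          * the write order below, and for element sizes below 32 bits each
          * word is additionally byte- or halfword-swapped.
          */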
2986     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2987         for (half = 0; half < 2; half++) {
2988             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2989             switch (a->size) {
2990             case 0:
2991                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2992                 break;
2993             case 1:
2994                 gen_swap_half(tmp[half], tmp[half]);
2995                 break;
2996             case 2:
2997                 break;
2998             default:
2999                 g_assert_not_reached();
3000             }
3001         }
3002         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3003         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3004     }
3005 
3006     tcg_temp_free_i32(tmp[0]);
3007     tcg_temp_free_i32(tmp[1]);
3008     return true;
3009 }
3010 
3011 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3012                               NeonGenWidenFn *widenfn,
3013                               NeonGenTwo64OpFn *opfn,
3014                               NeonGenTwo64OpFn *accfn)
3015 {
3016     /*
3017      * Pairwise long operations: widen both halves of the pair,
3018      * combine the pairs with the opfn, and then possibly accumulate
3019      * into the destination with the accfn.
3020      */
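         /*
          * For example, VPADDL.S8 adds each adjacent pair of signed bytes
          * into half as many 16-bit sums, and VPADAL additionally
          * accumulates those sums into the existing destination elements.
          */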
3021     int pass;
3022 
3023     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3024         return false;
3025     }
3026 
3027     /* UNDEF accesses to D16-D31 if they don't exist. */
3028     if (!dc_isar_feature(aa32_simd_r32, s) &&
3029         ((a->vd | a->vm) & 0x10)) {
3030         return false;
3031     }
3032 
3033     if ((a->vd | a->vm) & a->q) {
3034         return false;
3035     }
3036 
3037     if (!widenfn) {
3038         return false;
3039     }
3040 
3041     if (!vfp_access_check(s)) {
3042         return true;
3043     }
3044 
3045     for (pass = 0; pass < a->q + 1; pass++) {
3046         TCGv_i32 tmp;
3047         TCGv_i64 rm0_64, rm1_64, rd_64;
3048 
3049         rm0_64 = tcg_temp_new_i64();
3050         rm1_64 = tcg_temp_new_i64();
3051         rd_64 = tcg_temp_new_i64();
3052 
3053         tmp = tcg_temp_new_i32();
3054         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3055         widenfn(rm0_64, tmp);
3056         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3057         widenfn(rm1_64, tmp);
3058         tcg_temp_free_i32(tmp);
3059 
3060         opfn(rd_64, rm0_64, rm1_64);
3061         tcg_temp_free_i64(rm0_64);
3062         tcg_temp_free_i64(rm1_64);
3063 
3064         if (accfn) {
3065             TCGv_i64 tmp64 = tcg_temp_new_i64();
3066             read_neon_element64(tmp64, a->vd, pass, MO_64);
3067             accfn(rd_64, tmp64, rd_64);
3068             tcg_temp_free_i64(tmp64);
3069         }
3070         write_neon_element64(rd_64, a->vd, pass, MO_64);
3071         tcg_temp_free_i64(rd_64);
3072     }
3073     return true;
3074 }
3075 
3076 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3077 {
3078     static NeonGenWidenFn * const widenfn[] = {
3079         gen_helper_neon_widen_s8,
3080         gen_helper_neon_widen_s16,
3081         tcg_gen_ext_i32_i64,
3082         NULL,
3083     };
3084     static NeonGenTwo64OpFn * const opfn[] = {
3085         gen_helper_neon_paddl_u16,
3086         gen_helper_neon_paddl_u32,
3087         tcg_gen_add_i64,
3088         NULL,
3089     };
3090 
3091     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3092 }
3093 
3094 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3095 {
3096     static NeonGenWidenFn * const widenfn[] = {
3097         gen_helper_neon_widen_u8,
3098         gen_helper_neon_widen_u16,
3099         tcg_gen_extu_i32_i64,
3100         NULL,
3101     };
3102     static NeonGenTwo64OpFn * const opfn[] = {
3103         gen_helper_neon_paddl_u16,
3104         gen_helper_neon_paddl_u32,
3105         tcg_gen_add_i64,
3106         NULL,
3107     };
3108 
3109     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3110 }
3111 
3112 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3113 {
3114     static NeonGenWidenFn * const widenfn[] = {
3115         gen_helper_neon_widen_s8,
3116         gen_helper_neon_widen_s16,
3117         tcg_gen_ext_i32_i64,
3118         NULL,
3119     };
3120     static NeonGenTwo64OpFn * const opfn[] = {
3121         gen_helper_neon_paddl_u16,
3122         gen_helper_neon_paddl_u32,
3123         tcg_gen_add_i64,
3124         NULL,
3125     };
3126     static NeonGenTwo64OpFn * const accfn[] = {
3127         gen_helper_neon_addl_u16,
3128         gen_helper_neon_addl_u32,
3129         tcg_gen_add_i64,
3130         NULL,
3131     };
3132 
3133     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3134                              accfn[a->size]);
3135 }
3136 
3137 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3138 {
3139     static NeonGenWidenFn * const widenfn[] = {
3140         gen_helper_neon_widen_u8,
3141         gen_helper_neon_widen_u16,
3142         tcg_gen_extu_i32_i64,
3143         NULL,
3144     };
3145     static NeonGenTwo64OpFn * const opfn[] = {
3146         gen_helper_neon_paddl_u16,
3147         gen_helper_neon_paddl_u32,
3148         tcg_gen_add_i64,
3149         NULL,
3150     };
3151     static NeonGenTwo64OpFn * const accfn[] = {
3152         gen_helper_neon_addl_u16,
3153         gen_helper_neon_addl_u32,
3154         tcg_gen_add_i64,
3155         NULL,
3156     };
3157 
3158     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3159                              accfn[a->size]);
3160 }
3161 
3162 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3163 
3164 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3165                        ZipFn *fn)
3166 {
3167     TCGv_ptr pd, pm;
3168 
3169     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3170         return false;
3171     }
3172 
3173     /* UNDEF accesses to D16-D31 if they don't exist. */
3174     if (!dc_isar_feature(aa32_simd_r32, s) &&
3175         ((a->vd | a->vm) & 0x10)) {
3176         return false;
3177     }
3178 
3179     if ((a->vd | a->vm) & a->q) {
3180         return false;
3181     }
3182 
3183     if (!fn) {
3184         /* Bad size or size/q combination */
3185         return false;
3186     }
3187 
3188     if (!vfp_access_check(s)) {
3189         return true;
3190     }
3191 
3192     pd = vfp_reg_ptr(true, a->vd);
3193     pm = vfp_reg_ptr(true, a->vm);
3194     fn(pd, pm);
3195     tcg_temp_free_ptr(pd);
3196     tcg_temp_free_ptr(pm);
3197     return true;
3198 }
3199 
3200 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3201 {
3202     static ZipFn * const fn[2][4] = {
3203         {
3204             gen_helper_neon_unzip8,
3205             gen_helper_neon_unzip16,
3206             NULL,
3207             NULL,
3208         }, {
3209             gen_helper_neon_qunzip8,
3210             gen_helper_neon_qunzip16,
3211             gen_helper_neon_qunzip32,
3212             NULL,
3213         }
3214     };
3215     return do_zip_uzp(s, a, fn[a->q][a->size]);
3216 }
3217 
3218 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3219 {
3220     static ZipFn * const fn[2][4] = {
3221         {
3222             gen_helper_neon_zip8,
3223             gen_helper_neon_zip16,
3224             NULL,
3225             NULL,
3226         }, {
3227             gen_helper_neon_qzip8,
3228             gen_helper_neon_qzip16,
3229             gen_helper_neon_qzip32,
3230             NULL,
3231         }
3232     };
3233     return do_zip_uzp(s, a, fn[a->q][a->size]);
3234 }
3235 
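/*
 * Narrowing moves: Vm is a Q register (hence the check that its
 * index is even) and Vd a D register; each 64-bit half of Vm is
 * narrowed to a 32-bit element of Vd.  All the narrow functions
 * take cpu_env; the saturating ones use it to update the QC flag.
 */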
3236 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3237                      NeonGenNarrowEnvFn *narrowfn)
3238 {
3239     TCGv_i64 rm;
3240     TCGv_i32 rd0, rd1;
3241 
3242     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3243         return false;
3244     }
3245 
3246     /* UNDEF accesses to D16-D31 if they don't exist. */
3247     if (!dc_isar_feature(aa32_simd_r32, s) &&
3248         ((a->vd | a->vm) & 0x10)) {
3249         return false;
3250     }
3251 
3252     if (a->vm & 1) {
3253         return false;
3254     }
3255 
3256     if (!narrowfn) {
3257         return false;
3258     }
3259 
3260     if (!vfp_access_check(s)) {
3261         return true;
3262     }
3263 
3264     rm = tcg_temp_new_i64();
3265     rd0 = tcg_temp_new_i32();
3266     rd1 = tcg_temp_new_i32();
3267 
3268     read_neon_element64(rm, a->vm, 0, MO_64);
3269     narrowfn(rd0, cpu_env, rm);
3270     read_neon_element64(rm, a->vm, 1, MO_64);
3271     narrowfn(rd1, cpu_env, rm);
3272     write_neon_element32(rd0, a->vd, 0, MO_32);
3273     write_neon_element32(rd1, a->vd, 1, MO_32);
3274     tcg_temp_free_i32(rd0);
3275     tcg_temp_free_i32(rd1);
3276     tcg_temp_free_i64(rm);
3277     return true;
3278 }
3279 
3280 #define DO_VMOVN(INSN, FUNC)                                    \
3281     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3282     {                                                           \
3283         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3284             FUNC##8,                                            \
3285             FUNC##16,                                           \
3286             FUNC##32,                                           \
3287             NULL,                                               \
3288         };                                                      \
3289         return do_vmovn(s, a, narrowfn[a->size]);               \
3290     }
3291 
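/*
 * For example, DO_VMOVN(VMOVN, gen_neon_narrow_u) expands to a
 * trans_VMOVN() that selects between gen_neon_narrow_u8,
 * gen_neon_narrow_u16 and gen_neon_narrow_u32 based on a->size.
 */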
3292 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3293 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3294 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3295 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3296 
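/*
 * VSHLL in its 2-reg-misc form, where the shift count equals the
 * element size.  The widened vector is shifted as a single 64-bit
 * value: because the widen functions zero-extend, the bits that
 * cross a lane boundary are always zero, and since the shift
 * overwrites all of the extension bits the signedness of the
 * input never matters.
 */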
3297 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3298 {
3299     TCGv_i32 rm0, rm1;
3300     TCGv_i64 rd;
3301     static NeonGenWidenFn * const widenfns[] = {
3302         gen_helper_neon_widen_u8,
3303         gen_helper_neon_widen_u16,
3304         tcg_gen_extu_i32_i64,
3305         NULL,
3306     };
3307     NeonGenWidenFn *widenfn = widenfns[a->size];
3308 
3309     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3310         return false;
3311     }
3312 
3313     /* UNDEF accesses to D16-D31 if they don't exist. */
3314     if (!dc_isar_feature(aa32_simd_r32, s) &&
3315         ((a->vd | a->vm) & 0x10)) {
3316         return false;
3317     }
3318 
3319     if (a->vd & 1) {
3320         return false;
3321     }
3322 
3323     if (!widenfn) {
3324         return false;
3325     }
3326 
3327     if (!vfp_access_check(s)) {
3328         return true;
3329     }
3330 
3331     rd = tcg_temp_new_i64();
3332     rm0 = tcg_temp_new_i32();
3333     rm1 = tcg_temp_new_i32();
3334 
3335     read_neon_element32(rm0, a->vm, 0, MO_32);
3336     read_neon_element32(rm1, a->vm, 1, MO_32);
3337 
3338     widenfn(rd, rm0);
3339     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3340     write_neon_element64(rd, a->vd, 0, MO_64);
3341     widenfn(rd, rm1);
3342     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3343     write_neon_element64(rd, a->vd, 1, MO_64);
3344 
3345     tcg_temp_free_i64(rd);
3346     tcg_temp_free_i32(rm0);
3347     tcg_temp_free_i32(rm1);
3348     return true;
3349 }
3350 
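/*
 * VCVT between bfloat16 and single precision: each call to
 * gen_helper_bfcvt_pair converts one 64-bit read from Qm (two f32
 * values) into two bf16 results packed into one 32-bit element
 * of Dd.
 */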
3351 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3352 {
3353     TCGv_ptr fpst;
3354     TCGv_i64 tmp;
3355     TCGv_i32 dst0, dst1;
3356 
3357     if (!dc_isar_feature(aa32_bf16, s)) {
3358         return false;
3359     }
3360 
3361     /* UNDEF accesses to D16-D31 if they don't exist. */
3362     if (!dc_isar_feature(aa32_simd_r32, s) &&
3363         ((a->vd | a->vm) & 0x10)) {
3364         return false;
3365     }
3366 
3367     if ((a->vm & 1) || (a->size != 1)) {
3368         return false;
3369     }
3370 
3371     if (!vfp_access_check(s)) {
3372         return true;
3373     }
3374 
3375     fpst = fpstatus_ptr(FPST_STD);
3376     tmp = tcg_temp_new_i64();
3377     dst0 = tcg_temp_new_i32();
3378     dst1 = tcg_temp_new_i32();
3379 
3380     read_neon_element64(tmp, a->vm, 0, MO_64);
3381     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3382 
3383     read_neon_element64(tmp, a->vm, 1, MO_64);
3384     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3385 
3386     write_neon_element32(dst0, a->vd, 0, MO_32);
3387     write_neon_element32(dst1, a->vd, 1, MO_32);
3388 
3389     tcg_temp_free_i64(tmp);
3390     tcg_temp_free_i32(dst0);
3391     tcg_temp_free_i32(dst1);
3392     tcg_temp_free_ptr(fpst);
3393     return true;
3394 }
3395 
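/*
 * VCVT f32->f16, and f16->f32 below: note the ordering of the
 * register accesses.  Every source element is read before the
 * first result is written back, so both sequences stay correct
 * when Vd and Vm overlap.
 */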
3396 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3397 {
3398     TCGv_ptr fpst;
3399     TCGv_i32 ahp, tmp, tmp2, tmp3;
3400 
3401     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3402         !dc_isar_feature(aa32_fp16_spconv, s)) {
3403         return false;
3404     }
3405 
3406     /* UNDEF accesses to D16-D31 if they don't exist. */
3407     if (!dc_isar_feature(aa32_simd_r32, s) &&
3408         ((a->vd | a->vm) & 0x10)) {
3409         return false;
3410     }
3411 
3412     if ((a->vm & 1) || (a->size != 1)) {
3413         return false;
3414     }
3415 
3416     if (!vfp_access_check(s)) {
3417         return true;
3418     }
3419 
3420     fpst = fpstatus_ptr(FPST_STD);
3421     ahp = get_ahp_flag();
3422     tmp = tcg_temp_new_i32();
3423     read_neon_element32(tmp, a->vm, 0, MO_32);
3424     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3425     tmp2 = tcg_temp_new_i32();
3426     read_neon_element32(tmp2, a->vm, 1, MO_32);
3427     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3428     tcg_gen_shli_i32(tmp2, tmp2, 16);
3429     tcg_gen_or_i32(tmp2, tmp2, tmp);
3430     read_neon_element32(tmp, a->vm, 2, MO_32);
3431     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3432     tmp3 = tcg_temp_new_i32();
3433     read_neon_element32(tmp3, a->vm, 3, MO_32);
3434     write_neon_element32(tmp2, a->vd, 0, MO_32);
3435     tcg_temp_free_i32(tmp2);
3436     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3437     tcg_gen_shli_i32(tmp3, tmp3, 16);
3438     tcg_gen_or_i32(tmp3, tmp3, tmp);
3439     write_neon_element32(tmp3, a->vd, 1, MO_32);
3440     tcg_temp_free_i32(tmp3);
3441     tcg_temp_free_i32(tmp);
3442     tcg_temp_free_i32(ahp);
3443     tcg_temp_free_ptr(fpst);
3444 
3445     return true;
3446 }
3447 
3448 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3449 {
3450     TCGv_ptr fpst;
3451     TCGv_i32 ahp, tmp, tmp2, tmp3;
3452 
3453     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3454         !dc_isar_feature(aa32_fp16_spconv, s)) {
3455         return false;
3456     }
3457 
3458     /* UNDEF accesses to D16-D31 if they don't exist. */
3459     if (!dc_isar_feature(aa32_simd_r32, s) &&
3460         ((a->vd | a->vm) & 0x10)) {
3461         return false;
3462     }
3463 
3464     if ((a->vd & 1) || (a->size != 1)) {
3465         return false;
3466     }
3467 
3468     if (!vfp_access_check(s)) {
3469         return true;
3470     }
3471 
3472     fpst = fpstatus_ptr(FPST_STD);
3473     ahp = get_ahp_flag();
3474     tmp3 = tcg_temp_new_i32();
3475     tmp2 = tcg_temp_new_i32();
3476     tmp = tcg_temp_new_i32();
3477     read_neon_element32(tmp, a->vm, 0, MO_32);
3478     read_neon_element32(tmp2, a->vm, 1, MO_32);
3479     tcg_gen_ext16u_i32(tmp3, tmp);
3480     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3481     write_neon_element32(tmp3, a->vd, 0, MO_32);
3482     tcg_gen_shri_i32(tmp, tmp, 16);
3483     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3484     write_neon_element32(tmp, a->vd, 1, MO_32);
3485     tcg_temp_free_i32(tmp);
3486     tcg_gen_ext16u_i32(tmp3, tmp2);
3487     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3488     write_neon_element32(tmp3, a->vd, 2, MO_32);
3489     tcg_temp_free_i32(tmp3);
3490     tcg_gen_shri_i32(tmp2, tmp2, 16);
3491     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3492     write_neon_element32(tmp2, a->vd, 3, MO_32);
3493     tcg_temp_free_i32(tmp2);
3494     tcg_temp_free_i32(ahp);
3495     tcg_temp_free_ptr(fpst);
3496 
3497     return true;
3498 }
3499 
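/*
 * Generic 2-reg-misc operations expressed as whole-vector gvec
 * expansions: a single call to fn covers the full 64-bit or
 * 128-bit register, so there is no per-element loop here.
 */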
3500 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3501 {
3502     int vec_size = a->q ? 16 : 8;
3503     int rd_ofs = neon_full_reg_offset(a->vd);
3504     int rm_ofs = neon_full_reg_offset(a->vm);
3505 
3506     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3507         return false;
3508     }
3509 
3510     /* UNDEF accesses to D16-D31 if they don't exist. */
3511     if (!dc_isar_feature(aa32_simd_r32, s) &&
3512         ((a->vd | a->vm) & 0x10)) {
3513         return false;
3514     }
3515 
3516     if (a->size == 3) {
3517         return false;
3518     }
3519 
3520     if ((a->vd | a->vm) & a->q) {
3521         return false;
3522     }
3523 
3524     if (!vfp_access_check(s)) {
3525         return true;
3526     }
3527 
3528     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3529 
3530     return true;
3531 }
3532 
3533 #define DO_2MISC_VEC(INSN, FN)                                  \
3534     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3535     {                                                           \
3536         return do_2misc_vec(s, a, FN);                          \
3537     }
3538 
3539 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3540 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3541 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3542 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3543 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3544 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3545 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3546 
3547 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3548 {
3549     if (a->size != 0) {
3550         return false;
3551     }
3552     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3553 }
3554 
3555 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3556     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3557                          uint32_t rm_ofs, uint32_t oprsz,               \
3558                          uint32_t maxsz)                                \
3559     {                                                                   \
3560         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3561                            DATA, FUNC);                                 \
3562     }
3563 
3564 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3565     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3566                          uint32_t rm_ofs, uint32_t oprsz,               \
3567                          uint32_t maxsz)                                \
3568     {                                                                   \
3569         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3570     }
3571 
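/*
 * AESE/AESD share one helper, as do AESMC/AESIMC: the DATA value
 * (0 or 1) tells gen_helper_crypto_aese and gen_helper_crypto_aesmc
 * whether to perform the decrypt variant.  The otherwise unused
 * vece parameter exists only so the wrappers match the GVecGen2Fn
 * signature that do_2misc_vec() expects.
 */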
3572 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3573 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3574 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3575 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3576 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3577 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3578 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3579 
3580 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3581     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3582     {                                                           \
3583         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3584             return false;                                       \
3585         }                                                       \
3586         return do_2misc_vec(s, a, gen_##INSN);                  \
3587     }
3588 
3589 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3590 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3591 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3592 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3593 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3594 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3595 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3596 
3597 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3598 {
3599     TCGv_i32 tmp;
3600     int pass;
3601 
3602     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3603     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3604         return false;
3605     }
3606 
3607     /* UNDEF accesses to D16-D31 if they don't exist. */
3608     if (!dc_isar_feature(aa32_simd_r32, s) &&
3609         ((a->vd | a->vm) & 0x10)) {
3610         return false;
3611     }
3612 
3613     if (!fn) {
3614         return false;
3615     }
3616 
3617     if ((a->vd | a->vm) & a->q) {
3618         return false;
3619     }
3620 
3621     if (!vfp_access_check(s)) {
3622         return true;
3623     }
3624 
3625     tmp = tcg_temp_new_i32();
3626     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3627         read_neon_element32(tmp, a->vm, pass, MO_32);
3628         fn(tmp, tmp);
3629         write_neon_element32(tmp, a->vd, pass, MO_32);
3630     }
3631     tcg_temp_free_i32(tmp);
3632 
3633     return true;
3634 }
3635 
3636 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3637 {
3638     static NeonGenOneOpFn * const fn[] = {
3639         tcg_gen_bswap32_i32,
3640         gen_swap_half,
3641         NULL,
3642         NULL,
3643     };
3644     return do_2misc(s, a, fn[a->size]);
3645 }
3646 
3647 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3648 {
3649     if (a->size != 0) {
3650         return false;
3651     }
3652     return do_2misc(s, a, gen_rev16);
3653 }
3654 
3655 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3656 {
3657     static NeonGenOneOpFn * const fn[] = {
3658         gen_helper_neon_cls_s8,
3659         gen_helper_neon_cls_s16,
3660         gen_helper_neon_cls_s32,
3661         NULL,
3662     };
3663     return do_2misc(s, a, fn[a->size]);
3664 }
3665 
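/*
 * tcg_gen_clzi_i32() takes an extra "result if the input is zero"
 * argument (32, matching CLZ semantics), so it needs this trivial
 * wrapper to fit the NeonGenOneOpFn signature.
 */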
3666 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3667 {
3668     tcg_gen_clzi_i32(rd, rm, 32);
3669 }
3670 
3671 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3672 {
3673     static NeonGenOneOpFn * const fn[] = {
3674         gen_helper_neon_clz_u8,
3675         gen_helper_neon_clz_u16,
3676         do_VCLZ_32,
3677         NULL,
3678     };
3679     return do_2misc(s, a, fn[a->size]);
3680 }
3681 
3682 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3683 {
3684     if (a->size != 0) {
3685         return false;
3686     }
3687     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3688 }
3689 
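/*
 * Float VABS and VNEG are pure bit operations on the sign bit, so
 * they can be implemented as a gvec AND/XOR with a constant and do
 * not need an fp_status pointer or raise any exceptions.
 */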
3690 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3691                        uint32_t oprsz, uint32_t maxsz)
3692 {
3693     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3694                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3695                       oprsz, maxsz);
3696 }
3697 
3698 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3699 {
3700     if (a->size == MO_16) {
3701         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3702             return false;
3703         }
3704     } else if (a->size != MO_32) {
3705         return false;
3706     }
3707     return do_2misc_vec(s, a, gen_VABS_F);
3708 }
3709 
3710 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3711                        uint32_t oprsz, uint32_t maxsz)
3712 {
3713     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3714                       vece == MO_16 ? 0x8000 : 0x80000000,
3715                       oprsz, maxsz);
3716 }
3717 
3718 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3719 {
3720     if (a->size == MO_16) {
3721         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3722             return false;
3723         }
3724     } else if (a->size != MO_32) {
3725         return false;
3726     }
3727     return do_2misc_vec(s, a, gen_VNEG_F);
3728 }
3729 
3730 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3731 {
3732     if (a->size != 2) {
3733         return false;
3734     }
3735     return do_2misc(s, a, gen_helper_recpe_u32);
3736 }
3737 
3738 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3739 {
3740     if (a->size != 2) {
3741         return false;
3742     }
3743     return do_2misc(s, a, gen_helper_rsqrte_u32);
3744 }
3745 
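/*
 * The saturating qabs/qneg helpers take cpu_env so they can update
 * the QC flag on saturation; bind it here so the result fits the
 * NeonGenOneOpFn signature used by do_2misc().
 */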
3746 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3747     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3748     {                                                   \
3749         FUNC(d, cpu_env, m);                            \
3750     }
3751 
3752 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3753 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3754 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3755 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3756 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3757 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3758 
3759 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3760 {
3761     static NeonGenOneOpFn * const fn[] = {
3762         gen_VQABS_s8,
3763         gen_VQABS_s16,
3764         gen_VQABS_s32,
3765         NULL,
3766     };
3767     return do_2misc(s, a, fn[a->size]);
3768 }
3769 
3770 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3771 {
3772     static NeonGenOneOpFn * const fn[] = {
3773         gen_VQNEG_s8,
3774         gen_VQNEG_s16,
3775         gen_VQNEG_s32,
3776         NULL,
3777     };
3778     return do_2misc(s, a, fn[a->size]);
3779 }
3780 
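/*
 * Templated FP 2-reg-misc operations.  Helpers exist only for
 * MO_16 and MO_32, and the generated trans function additionally
 * gates the MO_16 case on half-precision support (aa32_fp16_arith).
 * E.g. DO_2MISC_FP_VEC(VRECPE_F, ...) emits gen_VRECPE_F() and a
 * trans_VRECPE_F() wired up through do_2misc_vec().
 */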
3781 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3782     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3783                            uint32_t rm_ofs,                             \
3784                            uint32_t oprsz, uint32_t maxsz)              \
3785     {                                                                   \
3786         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3787             NULL, HFUNC, SFUNC, NULL,                                   \
3788         };                                                              \
3789         TCGv_ptr fpst;                                                  \
3790         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3791         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3792                            fns[vece]);                                  \
3793         tcg_temp_free_ptr(fpst);                                        \
3794     }                                                                   \
3795     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3796     {                                                                   \
3797         if (a->size == MO_16) {                                         \
3798             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3799                 return false;                                           \
3800             }                                                           \
3801         } else if (a->size != MO_32) {                                  \
3802             return false;                                               \
3803         }                                                               \
3804         return do_2misc_vec(s, a, gen_##INSN);                          \
3805     }
3806 
3807 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3808 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3809 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3810 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3811 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3812 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3813 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3814 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3815 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3816 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3817 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3818 
3819 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3820 
3821 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3822 {
3823     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3824         return false;
3825     }
3826     return trans_VRINTX_impl(s, a);
3827 }
3828 
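/*
 * Conversions and roundings that take an explicit rounding mode:
 * RMODE is translated with arm_rmode_to_sf() and handed to the
 * helper as the gvec "data" value.  The VRINT* uses below pass an
 * OP of "vrint_rm_", so token pasting still produces vrint_rm_h
 * and vrint_rm_s.
 */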
3829 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3830     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3831                            uint32_t rm_ofs,                             \
3832                            uint32_t oprsz, uint32_t maxsz)              \
3833     {                                                                   \
3834         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3835             NULL,                                                       \
3836             gen_helper_gvec_##OP##h,                                    \
3837             gen_helper_gvec_##OP##s,                                    \
3838             NULL,                                                       \
3839         };                                                              \
3840         TCGv_ptr fpst;                                                  \
3841         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3842         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3843                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3844         tcg_temp_free_ptr(fpst);                                        \
3845     }                                                                   \
3846     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3847     {                                                                   \
3848         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3849             return false;                                               \
3850         }                                                               \
3851         if (a->size == MO_16) {                                         \
3852             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3853                 return false;                                           \
3854             }                                                           \
3855         } else if (a->size != MO_32) {                                  \
3856             return false;                                               \
3857         }                                                               \
3858         return do_2misc_vec(s, a, gen_##INSN);                          \
3859     }
3860 
3861 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3862 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3863 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3864 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3865 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3866 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3867 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3868 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3869 
3870 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3871 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3872 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3873 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3874 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3875 
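/* VSWP exchanges the contents of Vd and Vm, 64 bits per pass. */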
3876 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3877 {
3878     TCGv_i64 rm, rd;
3879     int pass;
3880 
3881     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3882         return false;
3883     }
3884 
3885     /* UNDEF accesses to D16-D31 if they don't exist. */
3886     if (!dc_isar_feature(aa32_simd_r32, s) &&
3887         ((a->vd | a->vm) & 0x10)) {
3888         return false;
3889     }
3890 
3891     if (a->size != 0) {
3892         return false;
3893     }
3894 
3895     if ((a->vd | a->vm) & a->q) {
3896         return false;
3897     }
3898 
3899     if (!vfp_access_check(s)) {
3900         return true;
3901     }
3902 
3903     rm = tcg_temp_new_i64();
3904     rd = tcg_temp_new_i64();
3905     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3906         read_neon_element64(rm, a->vm, pass, MO_64);
3907         read_neon_element64(rd, a->vd, pass, MO_64);
3908         write_neon_element64(rm, a->vd, pass, MO_64);
3909         write_neon_element64(rd, a->vm, pass, MO_64);
3910     }
3911     tcg_temp_free_i64(rm);
3912     tcg_temp_free_i64(rd);
3913 
3914     return true;
3915 }
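
/*
 * In-register 2x2 transposes for VTRN on 8-bit and 16-bit
 * elements.  On entry t0 holds the Vm word and t1 the Vd word; on
 * return t0 holds the new Vd word and t1 the new Vm word (see
 * trans_VTRN below).
 */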
3916 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3917 {
3918     TCGv_i32 rd, tmp;
3919 
3920     rd = tcg_temp_new_i32();
3921     tmp = tcg_temp_new_i32();
3922 
3923     tcg_gen_shli_i32(rd, t0, 8);
3924     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3925     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3926     tcg_gen_or_i32(rd, rd, tmp);
3927 
3928     tcg_gen_shri_i32(t1, t1, 8);
3929     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3930     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3931     tcg_gen_or_i32(t1, t1, tmp);
3932     tcg_gen_mov_i32(t0, rd);
3933 
3934     tcg_temp_free_i32(tmp);
3935     tcg_temp_free_i32(rd);
3936 }
3937 
3938 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3939 {
3940     TCGv_i32 rd, tmp;
3941 
3942     rd = tcg_temp_new_i32();
3943     tmp = tcg_temp_new_i32();
3944 
3945     tcg_gen_shli_i32(rd, t0, 16);
3946     tcg_gen_andi_i32(tmp, t1, 0xffff);
3947     tcg_gen_or_i32(rd, rd, tmp);
3948     tcg_gen_shri_i32(t1, t1, 16);
3949     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3950     tcg_gen_or_i32(t1, t1, tmp);
3951     tcg_gen_mov_i32(t0, rd);
3952 
3953     tcg_temp_free_i32(tmp);
3954     tcg_temp_free_i32(rd);
3955 }
3956 
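/*
 * For 32-bit elements the transpose is a plain element swap done
 * directly on the register file, exchanging Vd[2i+1] with Vm[2i];
 * smaller elements use the in-register shuffles above.
 */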
3957 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3958 {
3959     TCGv_i32 tmp, tmp2;
3960     int pass;
3961 
3962     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3963         return false;
3964     }
3965 
3966     /* UNDEF accesses to D16-D31 if they don't exist. */
3967     if (!dc_isar_feature(aa32_simd_r32, s) &&
3968         ((a->vd | a->vm) & 0x10)) {
3969         return false;
3970     }
3971 
3972     if ((a->vd | a->vm) & a->q) {
3973         return false;
3974     }
3975 
3976     if (a->size == 3) {
3977         return false;
3978     }
3979 
3980     if (!vfp_access_check(s)) {
3981         return true;
3982     }
3983 
3984     tmp = tcg_temp_new_i32();
3985     tmp2 = tcg_temp_new_i32();
3986     if (a->size == MO_32) {
3987         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3988             read_neon_element32(tmp, a->vm, pass, MO_32);
3989             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3990             write_neon_element32(tmp2, a->vm, pass, MO_32);
3991             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3992         }
3993     } else {
3994         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3995             read_neon_element32(tmp, a->vm, pass, MO_32);
3996             read_neon_element32(tmp2, a->vd, pass, MO_32);
3997             if (a->size == MO_8) {
3998                 gen_neon_trn_u8(tmp, tmp2);
3999             } else {
4000                 gen_neon_trn_u16(tmp, tmp2);
4001             }
4002             write_neon_element32(tmp2, a->vm, pass, MO_32);
4003             write_neon_element32(tmp, a->vd, pass, MO_32);
4004         }
4005     }
4006     tcg_temp_free_i32(tmp);
4007     tcg_temp_free_i32(tmp2);
4008     return true;
4009 }
4010 
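/*
 * Matrix multiply-accumulate and bfloat16 multiply-add (aa32_i8mm
 * and aa32_bf16): whole-vector operations mapped directly onto
 * gvec helpers via do_neon_ddda() and do_neon_ddda_fpst(), with no
 * per-element translation.
 */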
4011 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4012 {
4013     if (!dc_isar_feature(aa32_i8mm, s)) {
4014         return false;
4015     }
4016     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4017                         gen_helper_gvec_smmla_b);
4018 }
4019 
4020 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4021 {
4022     if (!dc_isar_feature(aa32_i8mm, s)) {
4023         return false;
4024     }
4025     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4026                         gen_helper_gvec_ummla_b);
4027 }
4028 
4029 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4030 {
4031     if (!dc_isar_feature(aa32_i8mm, s)) {
4032         return false;
4033     }
4034     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4035                         gen_helper_gvec_usmmla_b);
4036 }
4037 
4038 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4039 {
4040     if (!dc_isar_feature(aa32_bf16, s)) {
4041         return false;
4042     }
4043     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4044                         gen_helper_gvec_bfmmla);
4045 }
4046 
4047 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4048 {
4049     if (!dc_isar_feature(aa32_bf16, s)) {
4050         return false;
4051     }
4052     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4053                              gen_helper_gvec_bfmlal);
4054 }
4055 
4056 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4057 {
4058     if (!dc_isar_feature(aa32_bf16, s)) {
4059         return false;
4060     }
4061     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4062                              (a->index << 1) | a->q, FPST_STD,
4063                              gen_helper_gvec_bfmlal_idx);
4064 }
4065