xref: /qemu/tcg/tcg-op-gvec.c (revision 92eecfff)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "qemu/main-loop.h"
25 #include "tcg/tcg-gvec-desc.h"
26 
27 #define MAX_UNROLL  4
28 
29 #ifdef CONFIG_DEBUG_TCG
30 static const TCGOpcode vecop_list_empty[1] = { 0 };
31 #else
32 #define vecop_list_empty NULL
33 #endif
34 
35 
36 /* Verify vector size and alignment rules.  OFS should be the OR of all
37    of the operand offsets so that we can check them all at once.  */
38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
39 {
40     uint32_t max_align;
41 
42     switch (oprsz) {
43     case 8:
44     case 16:
45     case 32:
46         tcg_debug_assert(oprsz <= maxsz);
47         break;
48     default:
49         tcg_debug_assert(oprsz == maxsz);
50         break;
51     }
52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
53 
54     max_align = maxsz >= 16 ? 15 : 7;
55     tcg_debug_assert((maxsz & max_align) == 0);
56     tcg_debug_assert((ofs & max_align) == 0);
57 }
58 
59 /* Verify vector overlap rules for two operands.  */
60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
61 {
62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
63 }
64 
65 /* Verify vector overlap rules for three operands.  */
66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
67 {
68     check_overlap_2(d, a, s);
69     check_overlap_2(d, b, s);
70     check_overlap_2(a, b, s);
71 }
72 
73 /* Verify vector overlap rules for four operands.  */
74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
75                             uint32_t c, uint32_t s)
76 {
77     check_overlap_2(d, a, s);
78     check_overlap_2(d, b, s);
79     check_overlap_2(d, c, s);
80     check_overlap_2(a, b, s);
81     check_overlap_2(a, c, s);
82     check_overlap_2(b, c, s);
83 }
84 
85 /* Create a descriptor from components.  */
86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
87 {
88     uint32_t desc = 0;
89 
90     check_size_align(oprsz, maxsz, 0);
91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
92 
93     oprsz = (oprsz / 8) - 1;
94     maxsz = (maxsz / 8) - 1;
95 
96     /*
97      * We have just asserted in check_size_align that either
98      * oprsz is {8,16,32} (encoded as 0/1/3) or matches maxsz.  Encode
99      * the latter case as '2', which would otherwise map to oprsz == 24.
100      */
101     if (oprsz == maxsz) {
102         oprsz = 2;
103     }
104 
105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
108 
109     return desc;
110 }
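
/*
 * Worked example: simd_desc(16, 64, 3) encodes the oprsz field as
 * (16 / 8) - 1 = 1, the maxsz field as (64 / 8) - 1 = 7 and the data
 * field as 3.  Had oprsz equalled maxsz, the oprsz field would instead
 * hold the sentinel value 2.  A helper recovers the values with the
 * accessors from "tcg/tcg-gvec-desc.h":
 *
 *     simd_oprsz(desc)  ->  16
 *     simd_maxsz(desc)  ->  64
 *     simd_data(desc)   ->   3
 */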
111 
112 /* Generate a call to a gvec-style helper with two vector operands.  */
113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
115                         gen_helper_gvec_2 *fn)
116 {
117     TCGv_ptr a0, a1;
118     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
119 
120     a0 = tcg_temp_new_ptr();
121     a1 = tcg_temp_new_ptr();
122 
123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
125 
126     fn(a0, a1, desc);
127 
128     tcg_temp_free_ptr(a0);
129     tcg_temp_free_ptr(a1);
130     tcg_temp_free_i32(desc);
131 }
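
/*
 * Typical use, sketched with hypothetical names: a target front end passes
 * env-relative offsets of its vector registers together with a helper
 * declared via DEF_HELPER_FLAGS_3(..., void, ptr, ptr, i32).  Both
 * gen_helper_gvec_frob and frob_reg_offset() are stand-ins here.
 *
 *     tcg_gen_gvec_2_ool(frob_reg_offset(rd), frob_reg_offset(rn),
 *                        16, 16, 0, gen_helper_gvec_frob);
 */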
132 
133 /* Generate a call to a gvec-style helper with two vector operands
134    and one scalar operand.  */
135 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
136                          uint32_t oprsz, uint32_t maxsz, int32_t data,
137                          gen_helper_gvec_2i *fn)
138 {
139     TCGv_ptr a0, a1;
140     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
141 
142     a0 = tcg_temp_new_ptr();
143     a1 = tcg_temp_new_ptr();
144 
145     tcg_gen_addi_ptr(a0, cpu_env, dofs);
146     tcg_gen_addi_ptr(a1, cpu_env, aofs);
147 
148     fn(a0, a1, c, desc);
149 
150     tcg_temp_free_ptr(a0);
151     tcg_temp_free_ptr(a1);
152     tcg_temp_free_i32(desc);
153 }
154 
155 /* Generate a call to a gvec-style helper with three vector operands.  */
156 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
157                         uint32_t oprsz, uint32_t maxsz, int32_t data,
158                         gen_helper_gvec_3 *fn)
159 {
160     TCGv_ptr a0, a1, a2;
161     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
162 
163     a0 = tcg_temp_new_ptr();
164     a1 = tcg_temp_new_ptr();
165     a2 = tcg_temp_new_ptr();
166 
167     tcg_gen_addi_ptr(a0, cpu_env, dofs);
168     tcg_gen_addi_ptr(a1, cpu_env, aofs);
169     tcg_gen_addi_ptr(a2, cpu_env, bofs);
170 
171     fn(a0, a1, a2, desc);
172 
173     tcg_temp_free_ptr(a0);
174     tcg_temp_free_ptr(a1);
175     tcg_temp_free_ptr(a2);
176     tcg_temp_free_i32(desc);
177 }
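
/*
 * For orientation, a sketch of the helper such a wrapper calls; the name is
 * hypothetical and the element operation (a 32-bit OR) is arbitrary.  Real
 * helpers follow the same shape: loop over simd_oprsz() bytes, then clear
 * the tail up to simd_maxsz().
 *
 *     void HELPER(gvec_frob)(void *vd, void *va, void *vb, uint32_t desc)
 *     {
 *         intptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc);
 *         uint32_t *d = vd, *a = va, *b = vb;
 *
 *         for (i = 0; i < oprsz / 4; i++) {
 *             d[i] = a[i] | b[i];
 *         }
 *         memset(vd + oprsz, 0, maxsz - oprsz);   // clear the tail
 *     }
 */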
178 
179 /* Generate a call to a gvec-style helper with four vector operands.  */
180 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
181                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
182                         int32_t data, gen_helper_gvec_4 *fn)
183 {
184     TCGv_ptr a0, a1, a2, a3;
185     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
186 
187     a0 = tcg_temp_new_ptr();
188     a1 = tcg_temp_new_ptr();
189     a2 = tcg_temp_new_ptr();
190     a3 = tcg_temp_new_ptr();
191 
192     tcg_gen_addi_ptr(a0, cpu_env, dofs);
193     tcg_gen_addi_ptr(a1, cpu_env, aofs);
194     tcg_gen_addi_ptr(a2, cpu_env, bofs);
195     tcg_gen_addi_ptr(a3, cpu_env, cofs);
196 
197     fn(a0, a1, a2, a3, desc);
198 
199     tcg_temp_free_ptr(a0);
200     tcg_temp_free_ptr(a1);
201     tcg_temp_free_ptr(a2);
202     tcg_temp_free_ptr(a3);
203     tcg_temp_free_i32(desc);
204 }
205 
206 /* Generate a call to a gvec-style helper with five vector operands.  */
207 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
208                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
209                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
210 {
211     TCGv_ptr a0, a1, a2, a3, a4;
212     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
213 
214     a0 = tcg_temp_new_ptr();
215     a1 = tcg_temp_new_ptr();
216     a2 = tcg_temp_new_ptr();
217     a3 = tcg_temp_new_ptr();
218     a4 = tcg_temp_new_ptr();
219 
220     tcg_gen_addi_ptr(a0, cpu_env, dofs);
221     tcg_gen_addi_ptr(a1, cpu_env, aofs);
222     tcg_gen_addi_ptr(a2, cpu_env, bofs);
223     tcg_gen_addi_ptr(a3, cpu_env, cofs);
224     tcg_gen_addi_ptr(a4, cpu_env, xofs);
225 
226     fn(a0, a1, a2, a3, a4, desc);
227 
228     tcg_temp_free_ptr(a0);
229     tcg_temp_free_ptr(a1);
230     tcg_temp_free_ptr(a2);
231     tcg_temp_free_ptr(a3);
232     tcg_temp_free_ptr(a4);
233     tcg_temp_free_i32(desc);
234 }
235 
236 /* Generate a call to a gvec-style helper with two vector operands
237    and an extra pointer operand.  */
238 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
239                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
240                         int32_t data, gen_helper_gvec_2_ptr *fn)
241 {
242     TCGv_ptr a0, a1;
243     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
244 
245     a0 = tcg_temp_new_ptr();
246     a1 = tcg_temp_new_ptr();
247 
248     tcg_gen_addi_ptr(a0, cpu_env, dofs);
249     tcg_gen_addi_ptr(a1, cpu_env, aofs);
250 
251     fn(a0, a1, ptr, desc);
252 
253     tcg_temp_free_ptr(a0);
254     tcg_temp_free_ptr(a1);
255     tcg_temp_free_i32(desc);
256 }
257 
258 /* Generate a call to a gvec-style helper with three vector operands
259    and an extra pointer operand.  */
260 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
261                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
262                         int32_t data, gen_helper_gvec_3_ptr *fn)
263 {
264     TCGv_ptr a0, a1, a2;
265     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
266 
267     a0 = tcg_temp_new_ptr();
268     a1 = tcg_temp_new_ptr();
269     a2 = tcg_temp_new_ptr();
270 
271     tcg_gen_addi_ptr(a0, cpu_env, dofs);
272     tcg_gen_addi_ptr(a1, cpu_env, aofs);
273     tcg_gen_addi_ptr(a2, cpu_env, bofs);
274 
275     fn(a0, a1, a2, ptr, desc);
276 
277     tcg_temp_free_ptr(a0);
278     tcg_temp_free_ptr(a1);
279     tcg_temp_free_ptr(a2);
280     tcg_temp_free_i32(desc);
281 }
282 
283 /* Generate a call to a gvec-style helper with four vector operands
284    and an extra pointer operand.  */
285 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
286                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
287                         uint32_t maxsz, int32_t data,
288                         gen_helper_gvec_4_ptr *fn)
289 {
290     TCGv_ptr a0, a1, a2, a3;
291     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
292 
293     a0 = tcg_temp_new_ptr();
294     a1 = tcg_temp_new_ptr();
295     a2 = tcg_temp_new_ptr();
296     a3 = tcg_temp_new_ptr();
297 
298     tcg_gen_addi_ptr(a0, cpu_env, dofs);
299     tcg_gen_addi_ptr(a1, cpu_env, aofs);
300     tcg_gen_addi_ptr(a2, cpu_env, bofs);
301     tcg_gen_addi_ptr(a3, cpu_env, cofs);
302 
303     fn(a0, a1, a2, a3, ptr, desc);
304 
305     tcg_temp_free_ptr(a0);
306     tcg_temp_free_ptr(a1);
307     tcg_temp_free_ptr(a2);
308     tcg_temp_free_ptr(a3);
309     tcg_temp_free_i32(desc);
310 }
311 
312 /* Generate a call to a gvec-style helper with five vector operands
313    and an extra pointer operand.  */
314 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
315                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
316                         uint32_t oprsz, uint32_t maxsz, int32_t data,
317                         gen_helper_gvec_5_ptr *fn)
318 {
319     TCGv_ptr a0, a1, a2, a3, a4;
320     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
321 
322     a0 = tcg_temp_new_ptr();
323     a1 = tcg_temp_new_ptr();
324     a2 = tcg_temp_new_ptr();
325     a3 = tcg_temp_new_ptr();
326     a4 = tcg_temp_new_ptr();
327 
328     tcg_gen_addi_ptr(a0, cpu_env, dofs);
329     tcg_gen_addi_ptr(a1, cpu_env, aofs);
330     tcg_gen_addi_ptr(a2, cpu_env, bofs);
331     tcg_gen_addi_ptr(a3, cpu_env, cofs);
332     tcg_gen_addi_ptr(a4, cpu_env, eofs);
333 
334     fn(a0, a1, a2, a3, a4, ptr, desc);
335 
336     tcg_temp_free_ptr(a0);
337     tcg_temp_free_ptr(a1);
338     tcg_temp_free_ptr(a2);
339     tcg_temp_free_ptr(a3);
340     tcg_temp_free_ptr(a4);
341     tcg_temp_free_i32(desc);
342 }
343 
344 /* Return true if we want to implement an operation of OPRSZ bytes
345    in units of LNSZ.  This limits the expansion of inline code.  */
346 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
347 {
348     uint32_t q, r;
349 
350     if (oprsz < lnsz) {
351         return false;
352     }
353 
354     q = oprsz / lnsz;
355     r = oprsz % lnsz;
356     tcg_debug_assert((r & 7) == 0);
357 
358     if (lnsz < 16) {
359         /* For sizes below 16, accept no remainder. */
360         if (r != 0) {
361             return false;
362         }
363     } else {
364         /*
365          * Recall that ARM SVE allows vector sizes that are not a
366          * power of 2, but always a multiple of 16.  The intent is
367          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
368          * In addition, expand_clr needs to handle a multiple of 8.
369          * Thus we can handle the tail with one more operation per
370          * diminishing power of 2.
371          */
372         q += ctpop32(r);
373     }
374 
375     return q <= MAX_UNROLL;
376 }
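
/*
 * For example, oprsz == 48 in units of lnsz == 16 gives q == 3, r == 0 and
 * is accepted, while the same 48 bytes in units of lnsz == 8 would need
 * q == 6 pieces, exceeding MAX_UNROLL, and is rejected.
 */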
377 
378 static void expand_clr(uint32_t dofs, uint32_t maxsz);
379 
380 /* Duplicate C as per VECE.  */
381 uint64_t (dup_const)(unsigned vece, uint64_t c)
382 {
383     switch (vece) {
384     case MO_8:
385         return 0x0101010101010101ull * (uint8_t)c;
386     case MO_16:
387         return 0x0001000100010001ull * (uint16_t)c;
388     case MO_32:
389         return 0x0000000100000001ull * (uint32_t)c;
390     case MO_64:
391         return c;
392     default:
393         g_assert_not_reached();
394     }
395 }
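
/*
 * For example:
 *     dup_const(MO_8,  0xab)       == 0xababababababababull
 *     dup_const(MO_16, 0x1234)     == 0x1234123412341234ull
 *     dup_const(MO_32, 0xdeadbeef) == 0xdeadbeefdeadbeefull
 */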
396 
397 /* Duplicate IN into OUT as per VECE.  */
398 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
399 {
400     switch (vece) {
401     case MO_8:
402         tcg_gen_ext8u_i32(out, in);
403         tcg_gen_muli_i32(out, out, 0x01010101);
404         break;
405     case MO_16:
406         tcg_gen_deposit_i32(out, in, in, 16, 16);
407         break;
408     case MO_32:
409         tcg_gen_mov_i32(out, in);
410         break;
411     default:
412         g_assert_not_reached();
413     }
414 }
415 
416 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
417 {
418     switch (vece) {
419     case MO_8:
420         tcg_gen_ext8u_i64(out, in);
421         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
422         break;
423     case MO_16:
424         tcg_gen_ext16u_i64(out, in);
425         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
426         break;
427     case MO_32:
428         tcg_gen_deposit_i64(out, in, in, 32, 32);
429         break;
430     case MO_64:
431         tcg_gen_mov_i64(out, in);
432         break;
433     default:
434         g_assert_not_reached();
435     }
436 }
437 
438 /* Select a supported vector type for implementing an operation on SIZE
439  * bytes.  If LIST is null, assume that the real operation to be performed
440  * is required by all backends; otherwise, make sure that the opcodes in
441  * LIST can be performed on elements of size VECE in the selected type.
442  * Do not select V64 if PREFER_I64 is true.  Return 0 if no type is selected.
443  */
444 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
445                                   uint32_t size, bool prefer_i64)
446 {
447     /*
448      * Recall that ARM SVE allows vector sizes that are not a
449      * power of 2, but always a multiple of 16.  The intent is
450      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
451      * It is hard to imagine a case in which v256 is supported
452      * but v128 is not, but check anyway.
453      * In addition, expand_clr needs to handle a multiple of 8.
454      */
455     if (TCG_TARGET_HAS_v256 &&
456         check_size_impl(size, 32) &&
457         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
458         (!(size & 16) ||
459          (TCG_TARGET_HAS_v128 &&
460           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
461         (!(size & 8) ||
462          (TCG_TARGET_HAS_v64 &&
463           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
464         return TCG_TYPE_V256;
465     }
466     if (TCG_TARGET_HAS_v128 &&
467         check_size_impl(size, 16) &&
468         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
469         (!(size & 8) ||
470          (TCG_TARGET_HAS_v64 &&
471           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
472         return TCG_TYPE_V128;
473     }
474     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
475         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
476         return TCG_TYPE_V64;
477     }
478     return 0;
479 }
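
/*
 * For instance, with oprsz == 80 on a host supporting both V256 and V128,
 * this returns TCG_TYPE_V256 and the expanders below emit two 32-byte
 * operations plus one 16-byte operation.  With only V128 available, the
 * same 80 bytes would take five pieces, check_size_impl fails, 0 is
 * returned, and the caller falls back to an out-of-line helper.
 */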
480 
481 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
482                          uint32_t maxsz, TCGv_vec t_vec)
483 {
484     uint32_t i = 0;
485 
486     tcg_debug_assert(oprsz >= 8);
487 
488     /*
489      * This may be expand_clr for the tail of an operation, e.g.
490      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
491      * are misaligned wrt the maximum vector size, so store them first.
492      */
493     if (dofs & 8) {
494         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
495         i += 8;
496     }
497 
498     switch (type) {
499     case TCG_TYPE_V256:
500         /*
501          * Recall that ARM SVE allows vector sizes that are not a
502          * power of 2, but always a multiple of 16.  The intent is
503          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
504          */
505         for (; i + 32 <= oprsz; i += 32) {
506             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
507         }
508         /* fallthru */
509     case TCG_TYPE_V128:
510         for (; i + 16 <= oprsz; i += 16) {
511             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
512         }
513         break;
514     case TCG_TYPE_V64:
515         for (; i < oprsz; i += 8) {
516             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
517         }
518         break;
519     default:
520         g_assert_not_reached();
521     }
522 
523     if (oprsz < maxsz) {
524         expand_clr(dofs + oprsz, maxsz - oprsz);
525     }
526 }
527 
528 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
529  * At most one of IN_32 or IN_64 may be set;
530  * IN_C is used if IN_32 and IN_64 are unset.
531  */
532 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
533                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
534                    uint64_t in_c)
535 {
536     TCGType type;
537     TCGv_i64 t_64;
538     TCGv_i32 t_32, t_desc;
539     TCGv_ptr t_ptr;
540     uint32_t i;
541 
542     assert(vece <= (in_32 ? MO_32 : MO_64));
543     assert(in_32 == NULL || in_64 == NULL);
544 
545     /* If we're storing 0, expand oprsz to maxsz.  */
546     if (in_32 == NULL && in_64 == NULL) {
547         in_c = dup_const(vece, in_c);
548         if (in_c == 0) {
549             oprsz = maxsz;
550         }
551     }
552 
553     /* Implement inline with a vector type, if possible.
554      * Prefer integer when 64-bit host and no variable dup.
555      */
556     type = choose_vector_type(NULL, vece, oprsz,
557                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
558                                && (in_64 == NULL || vece == MO_64)));
559     if (type != 0) {
560         TCGv_vec t_vec = tcg_temp_new_vec(type);
561 
562         if (in_32) {
563             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
564         } else if (in_64) {
565             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
566         } else {
567             tcg_gen_dupi_vec(vece, t_vec, in_c);
568         }
569         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
570         tcg_temp_free_vec(t_vec);
571         return;
572     }
573 
574     /* Otherwise, inline with an integer type, unless "large".  */
575     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
576         t_64 = NULL;
577         t_32 = NULL;
578 
579         if (in_32) {
580             /* We are given a 32-bit variable input.  For a 64-bit host,
581                use a 64-bit operation unless the 32-bit operation would
582                be simple enough.  */
583             if (TCG_TARGET_REG_BITS == 64
584                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
585                 t_64 = tcg_temp_new_i64();
586                 tcg_gen_extu_i32_i64(t_64, in_32);
587                 gen_dup_i64(vece, t_64, t_64);
588             } else {
589                 t_32 = tcg_temp_new_i32();
590                 gen_dup_i32(vece, t_32, in_32);
591             }
592         } else if (in_64) {
593             /* We are given a 64-bit variable input.  */
594             t_64 = tcg_temp_new_i64();
595             gen_dup_i64(vece, t_64, in_64);
596         } else {
597             /* We are given a constant input.  */
598             /* For 64-bit hosts, use 64-bit constants for "simple" constants
599                or when we'd need too many 32-bit stores, or when a 64-bit
600                constant is really required.  */
601             if (vece == MO_64
602                 || (TCG_TARGET_REG_BITS == 64
603                     && (in_c == 0 || in_c == -1
604                         || !check_size_impl(oprsz, 4)))) {
605                 t_64 = tcg_const_i64(in_c);
606             } else {
607                 t_32 = tcg_const_i32(in_c);
608             }
609         }
610 
611         /* Implement inline if we picked an implementation size above.  */
612         if (t_32) {
613             for (i = 0; i < oprsz; i += 4) {
614                 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
615             }
616             tcg_temp_free_i32(t_32);
617             goto done;
618         }
619         if (t_64) {
620             for (i = 0; i < oprsz; i += 8) {
621                 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
622             }
623             tcg_temp_free_i64(t_64);
624             goto done;
625         }
626     }
627 
628     /* Otherwise implement out of line.  */
629     t_ptr = tcg_temp_new_ptr();
630     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
631     t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
632 
633     if (vece == MO_64) {
634         if (in_64) {
635             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
636         } else {
637             t_64 = tcg_const_i64(in_c);
638             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
639             tcg_temp_free_i64(t_64);
640         }
641     } else {
642         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
643         static dup_fn * const fns[3] = {
644             gen_helper_gvec_dup8,
645             gen_helper_gvec_dup16,
646             gen_helper_gvec_dup32
647         };
648 
649         if (in_32) {
650             fns[vece](t_ptr, t_desc, in_32);
651         } else {
652             t_32 = tcg_temp_new_i32();
653             if (in_64) {
654                 tcg_gen_extrl_i64_i32(t_32, in_64);
655             } else if (vece == MO_8) {
656                 tcg_gen_movi_i32(t_32, in_c & 0xff);
657             } else if (vece == MO_16) {
658                 tcg_gen_movi_i32(t_32, in_c & 0xffff);
659             } else {
660                 tcg_gen_movi_i32(t_32, in_c);
661             }
662             fns[vece](t_ptr, t_desc, t_32);
663             tcg_temp_free_i32(t_32);
664         }
665     }
666 
667     tcg_temp_free_ptr(t_ptr);
668     tcg_temp_free_i32(t_desc);
669     return;
670 
671  done:
672     if (oprsz < maxsz) {
673         expand_clr(dofs + oprsz, maxsz - oprsz);
674     }
675 }
676 
677 /* Likewise, but store zero: clear MAXSZ bytes at DOFS.  */
678 static void expand_clr(uint32_t dofs, uint32_t maxsz)
679 {
680     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
681 }
682 
683 /* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
684 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
685                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
686 {
687     TCGv_i32 t0 = tcg_temp_new_i32();
688     TCGv_i32 t1 = tcg_temp_new_i32();
689     uint32_t i;
690 
691     for (i = 0; i < oprsz; i += 4) {
692         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
693         if (load_dest) {
694             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
695         }
696         fni(t1, t0);
697         tcg_gen_st_i32(t1, cpu_env, dofs + i);
698     }
699     tcg_temp_free_i32(t0);
700     tcg_temp_free_i32(t1);
701 }
702 
703 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
704                           int32_t c, bool load_dest,
705                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
706 {
707     TCGv_i32 t0 = tcg_temp_new_i32();
708     TCGv_i32 t1 = tcg_temp_new_i32();
709     uint32_t i;
710 
711     for (i = 0; i < oprsz; i += 4) {
712         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
713         if (load_dest) {
714             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
715         }
716         fni(t1, t0, c);
717         tcg_gen_st_i32(t1, cpu_env, dofs + i);
718     }
719     tcg_temp_free_i32(t0);
720     tcg_temp_free_i32(t1);
721 }
722 
723 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
724                           TCGv_i32 c, bool scalar_first,
725                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
726 {
727     TCGv_i32 t0 = tcg_temp_new_i32();
728     TCGv_i32 t1 = tcg_temp_new_i32();
729     uint32_t i;
730 
731     for (i = 0; i < oprsz; i += 4) {
732         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
733         if (scalar_first) {
734             fni(t1, c, t0);
735         } else {
736             fni(t1, t0, c);
737         }
738         tcg_gen_st_i32(t1, cpu_env, dofs + i);
739     }
740     tcg_temp_free_i32(t0);
741     tcg_temp_free_i32(t1);
742 }
743 
744 /* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
745 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
746                          uint32_t bofs, uint32_t oprsz, bool load_dest,
747                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
748 {
749     TCGv_i32 t0 = tcg_temp_new_i32();
750     TCGv_i32 t1 = tcg_temp_new_i32();
751     TCGv_i32 t2 = tcg_temp_new_i32();
752     uint32_t i;
753 
754     for (i = 0; i < oprsz; i += 4) {
755         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
756         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
757         if (load_dest) {
758             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
759         }
760         fni(t2, t0, t1);
761         tcg_gen_st_i32(t2, cpu_env, dofs + i);
762     }
763     tcg_temp_free_i32(t2);
764     tcg_temp_free_i32(t1);
765     tcg_temp_free_i32(t0);
766 }
767 
768 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
769                           uint32_t oprsz, int32_t c, bool load_dest,
770                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
771 {
772     TCGv_i32 t0 = tcg_temp_new_i32();
773     TCGv_i32 t1 = tcg_temp_new_i32();
774     TCGv_i32 t2 = tcg_temp_new_i32();
775     uint32_t i;
776 
777     for (i = 0; i < oprsz; i += 4) {
778         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
779         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
780         if (load_dest) {
781             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
782         }
783         fni(t2, t0, t1, c);
784         tcg_gen_st_i32(t2, cpu_env, dofs + i);
785     }
786     tcg_temp_free_i32(t0);
787     tcg_temp_free_i32(t1);
788     tcg_temp_free_i32(t2);
789 }
790 
791 /* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
792 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
793                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
794                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
795 {
796     TCGv_i32 t0 = tcg_temp_new_i32();
797     TCGv_i32 t1 = tcg_temp_new_i32();
798     TCGv_i32 t2 = tcg_temp_new_i32();
799     TCGv_i32 t3 = tcg_temp_new_i32();
800     uint32_t i;
801 
802     for (i = 0; i < oprsz; i += 4) {
803         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
804         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
805         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
806         fni(t0, t1, t2, t3);
807         tcg_gen_st_i32(t0, cpu_env, dofs + i);
808         if (write_aofs) {
809             tcg_gen_st_i32(t1, cpu_env, aofs + i);
810         }
811     }
812     tcg_temp_free_i32(t3);
813     tcg_temp_free_i32(t2);
814     tcg_temp_free_i32(t1);
815     tcg_temp_free_i32(t0);
816 }
817 
818 /* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
819 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
820                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
821 {
822     TCGv_i64 t0 = tcg_temp_new_i64();
823     TCGv_i64 t1 = tcg_temp_new_i64();
824     uint32_t i;
825 
826     for (i = 0; i < oprsz; i += 8) {
827         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
828         if (load_dest) {
829             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
830         }
831         fni(t1, t0);
832         tcg_gen_st_i64(t1, cpu_env, dofs + i);
833     }
834     tcg_temp_free_i64(t0);
835     tcg_temp_free_i64(t1);
836 }
837 
838 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
839                           int64_t c, bool load_dest,
840                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
841 {
842     TCGv_i64 t0 = tcg_temp_new_i64();
843     TCGv_i64 t1 = tcg_temp_new_i64();
844     uint32_t i;
845 
846     for (i = 0; i < oprsz; i += 8) {
847         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
848         if (load_dest) {
849             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
850         }
851         fni(t1, t0, c);
852         tcg_gen_st_i64(t1, cpu_env, dofs + i);
853     }
854     tcg_temp_free_i64(t0);
855     tcg_temp_free_i64(t1);
856 }
857 
858 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
859                           TCGv_i64 c, bool scalar_first,
860                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
861 {
862     TCGv_i64 t0 = tcg_temp_new_i64();
863     TCGv_i64 t1 = tcg_temp_new_i64();
864     uint32_t i;
865 
866     for (i = 0; i < oprsz; i += 8) {
867         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
868         if (scalar_first) {
869             fni(t1, c, t0);
870         } else {
871             fni(t1, t0, c);
872         }
873         tcg_gen_st_i64(t1, cpu_env, dofs + i);
874     }
875     tcg_temp_free_i64(t0);
876     tcg_temp_free_i64(t1);
877 }
878 
879 /* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
880 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
881                          uint32_t bofs, uint32_t oprsz, bool load_dest,
882                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
883 {
884     TCGv_i64 t0 = tcg_temp_new_i64();
885     TCGv_i64 t1 = tcg_temp_new_i64();
886     TCGv_i64 t2 = tcg_temp_new_i64();
887     uint32_t i;
888 
889     for (i = 0; i < oprsz; i += 8) {
890         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
891         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
892         if (load_dest) {
893             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
894         }
895         fni(t2, t0, t1);
896         tcg_gen_st_i64(t2, cpu_env, dofs + i);
897     }
898     tcg_temp_free_i64(t2);
899     tcg_temp_free_i64(t1);
900     tcg_temp_free_i64(t0);
901 }
902 
903 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
904                           uint32_t oprsz, int64_t c, bool load_dest,
905                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
906 {
907     TCGv_i64 t0 = tcg_temp_new_i64();
908     TCGv_i64 t1 = tcg_temp_new_i64();
909     TCGv_i64 t2 = tcg_temp_new_i64();
910     uint32_t i;
911 
912     for (i = 0; i < oprsz; i += 8) {
913         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
914         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
915         if (load_dest) {
916             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
917         }
918         fni(t2, t0, t1, c);
919         tcg_gen_st_i64(t2, cpu_env, dofs + i);
920     }
921     tcg_temp_free_i64(t0);
922     tcg_temp_free_i64(t1);
923     tcg_temp_free_i64(t2);
924 }
925 
926 /* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
927 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
928                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
929                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
930 {
931     TCGv_i64 t0 = tcg_temp_new_i64();
932     TCGv_i64 t1 = tcg_temp_new_i64();
933     TCGv_i64 t2 = tcg_temp_new_i64();
934     TCGv_i64 t3 = tcg_temp_new_i64();
935     uint32_t i;
936 
937     for (i = 0; i < oprsz; i += 8) {
938         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
939         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
940         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
941         fni(t0, t1, t2, t3);
942         tcg_gen_st_i64(t0, cpu_env, dofs + i);
943         if (write_aofs) {
944             tcg_gen_st_i64(t1, cpu_env, aofs + i);
945         }
946     }
947     tcg_temp_free_i64(t3);
948     tcg_temp_free_i64(t2);
949     tcg_temp_free_i64(t1);
950     tcg_temp_free_i64(t0);
951 }
952 
953 /* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
954 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
955                          uint32_t oprsz, uint32_t tysz, TCGType type,
956                          bool load_dest,
957                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
958 {
959     TCGv_vec t0 = tcg_temp_new_vec(type);
960     TCGv_vec t1 = tcg_temp_new_vec(type);
961     uint32_t i;
962 
963     for (i = 0; i < oprsz; i += tysz) {
964         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
965         if (load_dest) {
966             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
967         }
968         fni(vece, t1, t0);
969         tcg_gen_st_vec(t1, cpu_env, dofs + i);
970     }
971     tcg_temp_free_vec(t0);
972     tcg_temp_free_vec(t1);
973 }
974 
975 /* Expand OPRSZ bytes worth of two vector operands and an immediate operand
976    using host vectors.  */
977 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
978                           uint32_t oprsz, uint32_t tysz, TCGType type,
979                           int64_t c, bool load_dest,
980                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
981 {
982     TCGv_vec t0 = tcg_temp_new_vec(type);
983     TCGv_vec t1 = tcg_temp_new_vec(type);
984     uint32_t i;
985 
986     for (i = 0; i < oprsz; i += tysz) {
987         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
988         if (load_dest) {
989             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
990         }
991         fni(vece, t1, t0, c);
992         tcg_gen_st_vec(t1, cpu_env, dofs + i);
993     }
994     tcg_temp_free_vec(t0);
995     tcg_temp_free_vec(t1);
996 }
997 
998 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
999                           uint32_t oprsz, uint32_t tysz, TCGType type,
1000                           TCGv_vec c, bool scalar_first,
1001                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1002 {
1003     TCGv_vec t0 = tcg_temp_new_vec(type);
1004     TCGv_vec t1 = tcg_temp_new_vec(type);
1005     uint32_t i;
1006 
1007     for (i = 0; i < oprsz; i += tysz) {
1008         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009         if (scalar_first) {
1010             fni(vece, t1, c, t0);
1011         } else {
1012             fni(vece, t1, t0, c);
1013         }
1014         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1015     }
1016     tcg_temp_free_vec(t0);
1017     tcg_temp_free_vec(t1);
1018 }
1019 
1020 /* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
1021 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1022                          uint32_t bofs, uint32_t oprsz,
1023                          uint32_t tysz, TCGType type, bool load_dest,
1024                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1025 {
1026     TCGv_vec t0 = tcg_temp_new_vec(type);
1027     TCGv_vec t1 = tcg_temp_new_vec(type);
1028     TCGv_vec t2 = tcg_temp_new_vec(type);
1029     uint32_t i;
1030 
1031     for (i = 0; i < oprsz; i += tysz) {
1032         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1033         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1034         if (load_dest) {
1035             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1036         }
1037         fni(vece, t2, t0, t1);
1038         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1039     }
1040     tcg_temp_free_vec(t2);
1041     tcg_temp_free_vec(t1);
1042     tcg_temp_free_vec(t0);
1043 }
1044 
1045 /*
1046  * Expand OPRSZ bytes worth of three vector operands and an immediate operand
1047  * using host vectors.
1048  */
1049 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1050                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1051                           TCGType type, int64_t c, bool load_dest,
1052                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1053                                       int64_t))
1054 {
1055     TCGv_vec t0 = tcg_temp_new_vec(type);
1056     TCGv_vec t1 = tcg_temp_new_vec(type);
1057     TCGv_vec t2 = tcg_temp_new_vec(type);
1058     uint32_t i;
1059 
1060     for (i = 0; i < oprsz; i += tysz) {
1061         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1062         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1063         if (load_dest) {
1064             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1065         }
1066         fni(vece, t2, t0, t1, c);
1067         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1068     }
1069     tcg_temp_free_vec(t0);
1070     tcg_temp_free_vec(t1);
1071     tcg_temp_free_vec(t2);
1072 }
1073 
1074 /* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
1075 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1076                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1077                          uint32_t tysz, TCGType type, bool write_aofs,
1078                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1079                                      TCGv_vec, TCGv_vec))
1080 {
1081     TCGv_vec t0 = tcg_temp_new_vec(type);
1082     TCGv_vec t1 = tcg_temp_new_vec(type);
1083     TCGv_vec t2 = tcg_temp_new_vec(type);
1084     TCGv_vec t3 = tcg_temp_new_vec(type);
1085     uint32_t i;
1086 
1087     for (i = 0; i < oprsz; i += tysz) {
1088         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1089         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1090         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1091         fni(vece, t0, t1, t2, t3);
1092         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1093         if (write_aofs) {
1094             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1095         }
1096     }
1097     tcg_temp_free_vec(t3);
1098     tcg_temp_free_vec(t2);
1099     tcg_temp_free_vec(t1);
1100     tcg_temp_free_vec(t0);
1101 }
1102 
1103 /* Expand a vector two-operand operation.  */
1104 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1105                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1106 {
1107     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1108     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1109     TCGType type;
1110     uint32_t some;
1111 
1112     check_size_align(oprsz, maxsz, dofs | aofs);
1113     check_overlap_2(dofs, aofs, maxsz);
1114 
1115     type = 0;
1116     if (g->fniv) {
1117         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1118     }
1119     switch (type) {
1120     case TCG_TYPE_V256:
1121         /* Recall that ARM SVE allows vector sizes that are not a
1122          * power of 2, but always a multiple of 16.  The intent is
1123          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1124          */
1125         some = QEMU_ALIGN_DOWN(oprsz, 32);
1126         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1127                      g->load_dest, g->fniv);
1128         if (some == oprsz) {
1129             break;
1130         }
1131         dofs += some;
1132         aofs += some;
1133         oprsz -= some;
1134         maxsz -= some;
1135         /* fallthru */
1136     case TCG_TYPE_V128:
1137         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1138                      g->load_dest, g->fniv);
1139         break;
1140     case TCG_TYPE_V64:
1141         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1142                      g->load_dest, g->fniv);
1143         break;
1144 
1145     case 0:
1146         if (g->fni8 && check_size_impl(oprsz, 8)) {
1147             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1148         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1149             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1150         } else {
1151             assert(g->fno != NULL);
1152             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1153             oprsz = maxsz;
1154         }
1155         break;
1156 
1157     default:
1158         g_assert_not_reached();
1159     }
1160     tcg_swap_vecop_list(hold_list);
1161 
1162     if (oprsz < maxsz) {
1163         expand_clr(dofs + oprsz, maxsz - oprsz);
1164     }
1165 }
1166 
1167 /* Expand a vector operation with two vectors and an immediate.  */
1168 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1169                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1170 {
1171     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1172     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1173     TCGType type;
1174     uint32_t some;
1175 
1176     check_size_align(oprsz, maxsz, dofs | aofs);
1177     check_overlap_2(dofs, aofs, maxsz);
1178 
1179     type = 0;
1180     if (g->fniv) {
1181         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1182     }
1183     switch (type) {
1184     case TCG_TYPE_V256:
1185         /* Recall that ARM SVE allows vector sizes that are not a
1186          * power of 2, but always a multiple of 16.  The intent is
1187          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1188          */
1189         some = QEMU_ALIGN_DOWN(oprsz, 32);
1190         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1191                       c, g->load_dest, g->fniv);
1192         if (some == oprsz) {
1193             break;
1194         }
1195         dofs += some;
1196         aofs += some;
1197         oprsz -= some;
1198         maxsz -= some;
1199         /* fallthru */
1200     case TCG_TYPE_V128:
1201         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1202                       c, g->load_dest, g->fniv);
1203         break;
1204     case TCG_TYPE_V64:
1205         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1206                       c, g->load_dest, g->fniv);
1207         break;
1208 
1209     case 0:
1210         if (g->fni8 && check_size_impl(oprsz, 8)) {
1211             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1212         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1213             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1214         } else {
1215             if (g->fno) {
1216                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1217             } else {
1218                 TCGv_i64 tcg_c = tcg_const_i64(c);
1219                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1220                                     maxsz, c, g->fnoi);
1221                 tcg_temp_free_i64(tcg_c);
1222             }
1223             oprsz = maxsz;
1224         }
1225         break;
1226 
1227     default:
1228         g_assert_not_reached();
1229     }
1230     tcg_swap_vecop_list(hold_list);
1231 
1232     if (oprsz < maxsz) {
1233         expand_clr(dofs + oprsz, maxsz - oprsz);
1234     }
1235 }
1236 
1237 /* Expand a vector operation with two vectors and a scalar.  */
1238 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1239                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1240 {
1241     TCGType type;
1242 
1243     check_size_align(oprsz, maxsz, dofs | aofs);
1244     check_overlap_2(dofs, aofs, maxsz);
1245 
1246     type = 0;
1247     if (g->fniv) {
1248         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1249     }
1250     if (type != 0) {
1251         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1252         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1253         TCGv_vec t_vec = tcg_temp_new_vec(type);
1254         uint32_t some;
1255 
1256         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1257 
1258         switch (type) {
1259         case TCG_TYPE_V256:
1260             /* Recall that ARM SVE allows vector sizes that are not a
1261              * power of 2, but always a multiple of 16.  The intent is
1262              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1263              */
1264             some = QEMU_ALIGN_DOWN(oprsz, 32);
1265             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1266                           t_vec, g->scalar_first, g->fniv);
1267             if (some == oprsz) {
1268                 break;
1269             }
1270             dofs += some;
1271             aofs += some;
1272             oprsz -= some;
1273             maxsz -= some;
1274             /* fallthru */
1275 
1276         case TCG_TYPE_V128:
1277             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1278                           t_vec, g->scalar_first, g->fniv);
1279             break;
1280 
1281         case TCG_TYPE_V64:
1282             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1283                           t_vec, g->scalar_first, g->fniv);
1284             break;
1285 
1286         default:
1287             g_assert_not_reached();
1288         }
1289         tcg_temp_free_vec(t_vec);
1290         tcg_swap_vecop_list(hold_list);
1291     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1292         TCGv_i64 t64 = tcg_temp_new_i64();
1293 
1294         gen_dup_i64(g->vece, t64, c);
1295         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1296         tcg_temp_free_i64(t64);
1297     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1298         TCGv_i32 t32 = tcg_temp_new_i32();
1299 
1300         tcg_gen_extrl_i64_i32(t32, c);
1301         gen_dup_i32(g->vece, t32, t32);
1302         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1303         tcg_temp_free_i32(t32);
1304     } else {
1305         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1306         return;
1307     }
1308 
1309     if (oprsz < maxsz) {
1310         expand_clr(dofs + oprsz, maxsz - oprsz);
1311     }
1312 }
1313 
1314 /* Expand a vector three-operand operation.  */
1315 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1316                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1317 {
1318     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1319     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1320     TCGType type;
1321     uint32_t some;
1322 
1323     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1324     check_overlap_3(dofs, aofs, bofs, maxsz);
1325 
1326     type = 0;
1327     if (g->fniv) {
1328         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1329     }
1330     switch (type) {
1331     case TCG_TYPE_V256:
1332         /* Recall that ARM SVE allows vector sizes that are not a
1333          * power of 2, but always a multiple of 16.  The intent is
1334          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1335          */
1336         some = QEMU_ALIGN_DOWN(oprsz, 32);
1337         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1338                      g->load_dest, g->fniv);
1339         if (some == oprsz) {
1340             break;
1341         }
1342         dofs += some;
1343         aofs += some;
1344         bofs += some;
1345         oprsz -= some;
1346         maxsz -= some;
1347         /* fallthru */
1348     case TCG_TYPE_V128:
1349         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1350                      g->load_dest, g->fniv);
1351         break;
1352     case TCG_TYPE_V64:
1353         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1354                      g->load_dest, g->fniv);
1355         break;
1356 
1357     case 0:
1358         if (g->fni8 && check_size_impl(oprsz, 8)) {
1359             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1360         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1361             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1362         } else {
1363             assert(g->fno != NULL);
1364             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1365                                maxsz, g->data, g->fno);
1366             oprsz = maxsz;
1367         }
1368         break;
1369 
1370     default:
1371         g_assert_not_reached();
1372     }
1373     tcg_swap_vecop_list(hold_list);
1374 
1375     if (oprsz < maxsz) {
1376         expand_clr(dofs + oprsz, maxsz - oprsz);
1377     }
1378 }
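
/*
 * A sketch of how a front end typically drives this expander; the gen_frob_*
 * functions, the helper and the opcode list are hypothetical, only the
 * GVecGen3 fields and the call itself are real interfaces.
 *
 *     static const TCGOpcode frob_list[] = { INDEX_op_add_vec, 0 };
 *     static const GVecGen3 frob_op = {
 *         .fni8 = gen_frob_i64,            // 64-bit integer fallback
 *         .fniv = gen_frob_vec,            // host-vector expansion
 *         .fno = gen_helper_gvec_frob,     // out-of-line fallback
 *         .opt_opc = frob_list,            // vecops required by .fniv
 *         .vece = MO_32,
 *     };
 *
 *     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &frob_op);
 */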
1379 
1380 /* Expand a vector operation with three vectors and an immediate.  */
1381 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1382                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1383                      const GVecGen3i *g)
1384 {
1385     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1386     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1387     TCGType type;
1388     uint32_t some;
1389 
1390     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1391     check_overlap_3(dofs, aofs, bofs, maxsz);
1392 
1393     type = 0;
1394     if (g->fniv) {
1395         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1396     }
1397     switch (type) {
1398     case TCG_TYPE_V256:
1399         /*
1400          * Recall that ARM SVE allows vector sizes that are not a
1401          * power of 2, but always a multiple of 16.  The intent is
1402          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1403          */
1404         some = QEMU_ALIGN_DOWN(oprsz, 32);
1405         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1406                       c, g->load_dest, g->fniv);
1407         if (some == oprsz) {
1408             break;
1409         }
1410         dofs += some;
1411         aofs += some;
1412         bofs += some;
1413         oprsz -= some;
1414         maxsz -= some;
1415         /* fallthru */
1416     case TCG_TYPE_V128:
1417         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1418                       c, g->load_dest, g->fniv);
1419         break;
1420     case TCG_TYPE_V64:
1421         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1422                       c, g->load_dest, g->fniv);
1423         break;
1424 
1425     case 0:
1426         if (g->fni8 && check_size_impl(oprsz, 8)) {
1427             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1428         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1429             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1430         } else {
1431             assert(g->fno != NULL);
1432             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1433             oprsz = maxsz;
1434         }
1435         break;
1436 
1437     default:
1438         g_assert_not_reached();
1439     }
1440     tcg_swap_vecop_list(hold_list);
1441 
1442     if (oprsz < maxsz) {
1443         expand_clr(dofs + oprsz, maxsz - oprsz);
1444     }
1445 }
1446 
1447 /* Expand a vector four-operand operation.  */
1448 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1449                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1450 {
1451     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1452     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1453     TCGType type;
1454     uint32_t some;
1455 
1456     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1457     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1458 
1459     type = 0;
1460     if (g->fniv) {
1461         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1462     }
1463     switch (type) {
1464     case TCG_TYPE_V256:
1465         /* Recall that ARM SVE allows vector sizes that are not a
1466          * power of 2, but always a multiple of 16.  The intent is
1467          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1468          */
1469         some = QEMU_ALIGN_DOWN(oprsz, 32);
1470         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1471                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1472         if (some == oprsz) {
1473             break;
1474         }
1475         dofs += some;
1476         aofs += some;
1477         bofs += some;
1478         cofs += some;
1479         oprsz -= some;
1480         maxsz -= some;
1481         /* fallthru */
1482     case TCG_TYPE_V128:
1483         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1484                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1485         break;
1486     case TCG_TYPE_V64:
1487         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1488                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1489         break;
1490 
1491     case 0:
1492         if (g->fni8 && check_size_impl(oprsz, 8)) {
1493             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1494                          g->write_aofs, g->fni8);
1495         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1496             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1497                          g->write_aofs, g->fni4);
1498         } else {
1499             assert(g->fno != NULL);
1500             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1501                                oprsz, maxsz, g->data, g->fno);
1502             oprsz = maxsz;
1503         }
1504         break;
1505 
1506     default:
1507         g_assert_not_reached();
1508     }
1509     tcg_swap_vecop_list(hold_list);
1510 
1511     if (oprsz < maxsz) {
1512         expand_clr(dofs + oprsz, maxsz - oprsz);
1513     }
1514 }
1515 
1516 /*
1517  * Expand specific vector operations.
1518  */
1519 
1520 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1521 {
1522     tcg_gen_mov_vec(a, b);
1523 }
1524 
1525 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1526                       uint32_t oprsz, uint32_t maxsz)
1527 {
1528     static const GVecGen2 g = {
1529         .fni8 = tcg_gen_mov_i64,
1530         .fniv = vec_mov2,
1531         .fno = gen_helper_gvec_mov,
1532         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1533     };
1534     if (dofs != aofs) {
1535         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1536     } else {
1537         check_size_align(oprsz, maxsz, dofs);
1538         if (oprsz < maxsz) {
1539             expand_clr(dofs + oprsz, maxsz - oprsz);
1540         }
1541     }
1542 }
1543 
1544 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1545                           uint32_t maxsz, TCGv_i32 in)
1546 {
1547     check_size_align(oprsz, maxsz, dofs);
1548     tcg_debug_assert(vece <= MO_32);
1549     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1550 }
1551 
1552 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1553                           uint32_t maxsz, TCGv_i64 in)
1554 {
1555     check_size_align(oprsz, maxsz, dofs);
1556     tcg_debug_assert(vece <= MO_64);
1557     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1558 }
1559 
1560 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1561                           uint32_t oprsz, uint32_t maxsz)
1562 {
1563     check_size_align(oprsz, maxsz, dofs);
1564     if (vece <= MO_64) {
1565         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1566         if (type != 0) {
1567             TCGv_vec t_vec = tcg_temp_new_vec(type);
1568             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1569             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1570             tcg_temp_free_vec(t_vec);
1571         } else if (vece <= MO_32) {
1572             TCGv_i32 in = tcg_temp_new_i32();
1573             switch (vece) {
1574             case MO_8:
1575                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1576                 break;
1577             case MO_16:
1578                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1579                 break;
1580             default:
1581                 tcg_gen_ld_i32(in, cpu_env, aofs);
1582                 break;
1583             }
1584             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1585             tcg_temp_free_i32(in);
1586         } else {
1587             TCGv_i64 in = tcg_temp_new_i64();
1588             tcg_gen_ld_i64(in, cpu_env, aofs);
1589             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1590             tcg_temp_free_i64(in);
1591         }
1592     } else if (vece == 4) {
1593         /* 128-bit duplicate.  */
1594         int i;
1595 
1596         tcg_debug_assert(oprsz >= 16);
1597         if (TCG_TARGET_HAS_v128) {
1598             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1599 
1600             tcg_gen_ld_vec(in, cpu_env, aofs);
1601             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1602                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1603             }
1604             tcg_temp_free_vec(in);
1605         } else {
1606             TCGv_i64 in0 = tcg_temp_new_i64();
1607             TCGv_i64 in1 = tcg_temp_new_i64();
1608 
1609             tcg_gen_ld_i64(in0, cpu_env, aofs);
1610             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1611             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1612                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1613                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1614             }
1615             tcg_temp_free_i64(in0);
1616             tcg_temp_free_i64(in1);
1617         }
1618         if (oprsz < maxsz) {
1619             expand_clr(dofs + oprsz, maxsz - oprsz);
1620         }
1621     } else if (vece == 5) {
1622         /* 256-bit duplicate.  */
1623         int i;
1624 
1625         tcg_debug_assert(oprsz >= 32);
1626         tcg_debug_assert(oprsz % 32 == 0);
1627         if (TCG_TARGET_HAS_v256) {
1628             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1629 
1630             tcg_gen_ld_vec(in, cpu_env, aofs);
1631             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1632                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1633             }
1634             tcg_temp_free_vec(in);
1635         } else if (TCG_TARGET_HAS_v128) {
1636             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1637             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1638 
1639             tcg_gen_ld_vec(in0, cpu_env, aofs);
1640             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1641             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1642                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1643                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1644             }
1645             tcg_temp_free_vec(in0);
1646             tcg_temp_free_vec(in1);
1647         } else {
1648             TCGv_i64 in[4];
1649             int j;
1650 
1651             for (j = 0; j < 4; ++j) {
1652                 in[j] = tcg_temp_new_i64();
1653                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1654             }
1655             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1656                 for (j = 0; j < 4; ++j) {
1657                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1658                 }
1659             }
1660             for (j = 0; j < 4; ++j) {
1661                 tcg_temp_free_i64(in[j]);
1662             }
1663         }
1664         if (oprsz < maxsz) {
1665             expand_clr(dofs + oprsz, maxsz - oprsz);
1666         }
1667     } else {
1668         g_assert_not_reached();
1669     }
1670 }
1671 
1672 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1673                           uint32_t maxsz, uint64_t x)
1674 {
1675     check_size_align(oprsz, maxsz, dofs);
1676     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1677 }
1678 
1679 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1680                       uint32_t oprsz, uint32_t maxsz)
1681 {
1682     static const GVecGen2 g = {
1683         .fni8 = tcg_gen_not_i64,
1684         .fniv = tcg_gen_not_vec,
1685         .fno = gen_helper_gvec_not,
1686         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1687     };
1688     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1689 }
1690 
1691 /* Perform a vector addition using normal addition and a mask.  The mask
1692    should be the sign bit of each lane.  This 6-operation form is more
1693    efficient than separate additions when there are 4 or more lanes in
1694    the 64-bit operation.  */
1695 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1696 {
1697     TCGv_i64 t1 = tcg_temp_new_i64();
1698     TCGv_i64 t2 = tcg_temp_new_i64();
1699     TCGv_i64 t3 = tcg_temp_new_i64();
1700 
1701     tcg_gen_andc_i64(t1, a, m);
1702     tcg_gen_andc_i64(t2, b, m);
1703     tcg_gen_xor_i64(t3, a, b);
1704     tcg_gen_add_i64(d, t1, t2);
1705     tcg_gen_and_i64(t3, t3, m);
1706     tcg_gen_xor_i64(d, d, t3);
1707 
1708     tcg_temp_free_i64(t1);
1709     tcg_temp_free_i64(t2);
1710     tcg_temp_free_i64(t3);
1711 }
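/*
 * Worked illustration of the expansion above: for MO_8 the mask is
 * m = 0x8080808080808080ull and the result is
 *
 *     d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m)
 *
 * Clearing the per-lane msb before the 64-bit add keeps any carry
 * from crossing a lane boundary, and the final xor restores each
 * lane's msb to a_msb ^ b_msb ^ carry_in, the correct sum bit.
 * E.g. two lanes of 0x80 sum to 0x00 without disturbing the
 * neighbouring byte.
 */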
1712 
1713 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1714 {
1715     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1716     gen_addv_mask(d, a, b, m);
1717     tcg_temp_free_i64(m);
1718 }
1719 
1720 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1721 {
1722     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1723     gen_addv_mask(d, a, b, m);
1724     tcg_temp_free_i64(m);
1725 }
1726 
1727 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1728 {
1729     TCGv_i64 t1 = tcg_temp_new_i64();
1730     TCGv_i64 t2 = tcg_temp_new_i64();
1731 
1732     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1733     tcg_gen_add_i64(t2, a, b);
1734     tcg_gen_add_i64(t1, t1, b);
1735     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1736 
1737     tcg_temp_free_i64(t1);
1738     tcg_temp_free_i64(t2);
1739 }
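/*
 * Illustration: the two 32-bit lanes are summed independently.
 * t2 = a + b has the correct low lane (a carry out of bit 31 only
 * reaches bits the deposit discards), while t1 = (a & ~0xffffffff) + b
 * has the correct high lane because its zeroed low half cannot
 * generate a carry into bit 32.  The deposit stitches the halves
 * back together.
 */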
1740 
1741 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1742 
1743 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1744                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1745 {
1746     static const GVecGen3 g[4] = {
1747         { .fni8 = tcg_gen_vec_add8_i64,
1748           .fniv = tcg_gen_add_vec,
1749           .fno = gen_helper_gvec_add8,
1750           .opt_opc = vecop_list_add,
1751           .vece = MO_8 },
1752         { .fni8 = tcg_gen_vec_add16_i64,
1753           .fniv = tcg_gen_add_vec,
1754           .fno = gen_helper_gvec_add16,
1755           .opt_opc = vecop_list_add,
1756           .vece = MO_16 },
1757         { .fni4 = tcg_gen_add_i32,
1758           .fniv = tcg_gen_add_vec,
1759           .fno = gen_helper_gvec_add32,
1760           .opt_opc = vecop_list_add,
1761           .vece = MO_32 },
1762         { .fni8 = tcg_gen_add_i64,
1763           .fniv = tcg_gen_add_vec,
1764           .fno = gen_helper_gvec_add64,
1765           .opt_opc = vecop_list_add,
1766           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1767           .vece = MO_64 },
1768     };
1769 
1770     tcg_debug_assert(vece <= MO_64);
1771     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1772 }
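/*
 * Typical front-end usage (illustrative only; the offsets below are
 * hypothetical and not taken from any target):
 *
 *     tcg_gen_gvec_add(MO_32, dest_ofs, src1_ofs, src2_ofs, 16, 16);
 *
 * i.e. add four 32-bit lanes of two 16-byte vectors addressed as
 * offsets into cpu_env.
 */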
1773 
1774 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1775                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1776 {
1777     static const GVecGen2s g[4] = {
1778         { .fni8 = tcg_gen_vec_add8_i64,
1779           .fniv = tcg_gen_add_vec,
1780           .fno = gen_helper_gvec_adds8,
1781           .opt_opc = vecop_list_add,
1782           .vece = MO_8 },
1783         { .fni8 = tcg_gen_vec_add16_i64,
1784           .fniv = tcg_gen_add_vec,
1785           .fno = gen_helper_gvec_adds16,
1786           .opt_opc = vecop_list_add,
1787           .vece = MO_16 },
1788         { .fni4 = tcg_gen_add_i32,
1789           .fniv = tcg_gen_add_vec,
1790           .fno = gen_helper_gvec_adds32,
1791           .opt_opc = vecop_list_add,
1792           .vece = MO_32 },
1793         { .fni8 = tcg_gen_add_i64,
1794           .fniv = tcg_gen_add_vec,
1795           .fno = gen_helper_gvec_adds64,
1796           .opt_opc = vecop_list_add,
1797           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1798           .vece = MO_64 },
1799     };
1800 
1801     tcg_debug_assert(vece <= MO_64);
1802     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1803 }
1804 
1805 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1806                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1807 {
1808     TCGv_i64 tmp = tcg_const_i64(c);
1809     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1810     tcg_temp_free_i64(tmp);
1811 }
1812 
1813 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1814 
1815 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1816                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1817 {
1818     static const GVecGen2s g[4] = {
1819         { .fni8 = tcg_gen_vec_sub8_i64,
1820           .fniv = tcg_gen_sub_vec,
1821           .fno = gen_helper_gvec_subs8,
1822           .opt_opc = vecop_list_sub,
1823           .vece = MO_8 },
1824         { .fni8 = tcg_gen_vec_sub16_i64,
1825           .fniv = tcg_gen_sub_vec,
1826           .fno = gen_helper_gvec_subs16,
1827           .opt_opc = vecop_list_sub,
1828           .vece = MO_16 },
1829         { .fni4 = tcg_gen_sub_i32,
1830           .fniv = tcg_gen_sub_vec,
1831           .fno = gen_helper_gvec_subs32,
1832           .opt_opc = vecop_list_sub,
1833           .vece = MO_32 },
1834         { .fni8 = tcg_gen_sub_i64,
1835           .fniv = tcg_gen_sub_vec,
1836           .fno = gen_helper_gvec_subs64,
1837           .opt_opc = vecop_list_sub,
1838           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1839           .vece = MO_64 },
1840     };
1841 
1842     tcg_debug_assert(vece <= MO_64);
1843     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1844 }
1845 
1846 /* Perform a vector subtraction using normal subtraction and a mask.
1847    Compare gen_addv_mask above.  */
1848 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1849 {
1850     TCGv_i64 t1 = tcg_temp_new_i64();
1851     TCGv_i64 t2 = tcg_temp_new_i64();
1852     TCGv_i64 t3 = tcg_temp_new_i64();
1853 
1854     tcg_gen_or_i64(t1, a, m);
1855     tcg_gen_andc_i64(t2, b, m);
1856     tcg_gen_eqv_i64(t3, a, b);
1857     tcg_gen_sub_i64(d, t1, t2);
1858     tcg_gen_and_i64(t3, t3, m);
1859     tcg_gen_xor_i64(d, d, t3);
1860 
1861     tcg_temp_free_i64(t1);
1862     tcg_temp_free_i64(t2);
1863     tcg_temp_free_i64(t3);
1864 }
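/*
 * Illustration: as with gen_addv_mask, the per-lane msbs are handled
 * separately.  Forcing the msb of each A lane to 1 and clearing the
 * msb of each B lane guarantees that no borrow crosses a lane
 * boundary, and the final xor with eqv(a, b) & m corrects each msb
 * to a_msb ^ b_msb ^ borrow_in.
 */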
1865 
1866 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1867 {
1868     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1869     gen_subv_mask(d, a, b, m);
1870     tcg_temp_free_i64(m);
1871 }
1872 
1873 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1874 {
1875     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1876     gen_subv_mask(d, a, b, m);
1877     tcg_temp_free_i64(m);
1878 }
1879 
1880 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1881 {
1882     TCGv_i64 t1 = tcg_temp_new_i64();
1883     TCGv_i64 t2 = tcg_temp_new_i64();
1884 
1885     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1886     tcg_gen_sub_i64(t2, a, b);
1887     tcg_gen_sub_i64(t1, a, t1);
1888     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1889 
1890     tcg_temp_free_i64(t1);
1891     tcg_temp_free_i64(t2);
1892 }
1893 
1894 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1895                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1896 {
1897     static const GVecGen3 g[4] = {
1898         { .fni8 = tcg_gen_vec_sub8_i64,
1899           .fniv = tcg_gen_sub_vec,
1900           .fno = gen_helper_gvec_sub8,
1901           .opt_opc = vecop_list_sub,
1902           .vece = MO_8 },
1903         { .fni8 = tcg_gen_vec_sub16_i64,
1904           .fniv = tcg_gen_sub_vec,
1905           .fno = gen_helper_gvec_sub16,
1906           .opt_opc = vecop_list_sub,
1907           .vece = MO_16 },
1908         { .fni4 = tcg_gen_sub_i32,
1909           .fniv = tcg_gen_sub_vec,
1910           .fno = gen_helper_gvec_sub32,
1911           .opt_opc = vecop_list_sub,
1912           .vece = MO_32 },
1913         { .fni8 = tcg_gen_sub_i64,
1914           .fniv = tcg_gen_sub_vec,
1915           .fno = gen_helper_gvec_sub64,
1916           .opt_opc = vecop_list_sub,
1917           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1918           .vece = MO_64 },
1919     };
1920 
1921     tcg_debug_assert(vece <= MO_64);
1922     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1923 }
1924 
1925 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1926 
1927 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1928                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1929 {
1930     static const GVecGen3 g[4] = {
1931         { .fniv = tcg_gen_mul_vec,
1932           .fno = gen_helper_gvec_mul8,
1933           .opt_opc = vecop_list_mul,
1934           .vece = MO_8 },
1935         { .fniv = tcg_gen_mul_vec,
1936           .fno = gen_helper_gvec_mul16,
1937           .opt_opc = vecop_list_mul,
1938           .vece = MO_16 },
1939         { .fni4 = tcg_gen_mul_i32,
1940           .fniv = tcg_gen_mul_vec,
1941           .fno = gen_helper_gvec_mul32,
1942           .opt_opc = vecop_list_mul,
1943           .vece = MO_32 },
1944         { .fni8 = tcg_gen_mul_i64,
1945           .fniv = tcg_gen_mul_vec,
1946           .fno = gen_helper_gvec_mul64,
1947           .opt_opc = vecop_list_mul,
1948           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1949           .vece = MO_64 },
1950     };
1951 
1952     tcg_debug_assert(vece <= MO_64);
1953     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1954 }
1955 
1956 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1957                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1958 {
1959     static const GVecGen2s g[4] = {
1960         { .fniv = tcg_gen_mul_vec,
1961           .fno = gen_helper_gvec_muls8,
1962           .opt_opc = vecop_list_mul,
1963           .vece = MO_8 },
1964         { .fniv = tcg_gen_mul_vec,
1965           .fno = gen_helper_gvec_muls16,
1966           .opt_opc = vecop_list_mul,
1967           .vece = MO_16 },
1968         { .fni4 = tcg_gen_mul_i32,
1969           .fniv = tcg_gen_mul_vec,
1970           .fno = gen_helper_gvec_muls32,
1971           .opt_opc = vecop_list_mul,
1972           .vece = MO_32 },
1973         { .fni8 = tcg_gen_mul_i64,
1974           .fniv = tcg_gen_mul_vec,
1975           .fno = gen_helper_gvec_muls64,
1976           .opt_opc = vecop_list_mul,
1977           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1978           .vece = MO_64 },
1979     };
1980 
1981     tcg_debug_assert(vece <= MO_64);
1982     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1983 }
1984 
1985 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1986                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1987 {
1988     TCGv_i64 tmp = tcg_const_i64(c);
1989     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1990     tcg_temp_free_i64(tmp);
1991 }
1992 
1993 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1994                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1995 {
1996     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1997     static const GVecGen3 g[4] = {
1998         { .fniv = tcg_gen_ssadd_vec,
1999           .fno = gen_helper_gvec_ssadd8,
2000           .opt_opc = vecop_list,
2001           .vece = MO_8 },
2002         { .fniv = tcg_gen_ssadd_vec,
2003           .fno = gen_helper_gvec_ssadd16,
2004           .opt_opc = vecop_list,
2005           .vece = MO_16 },
2006         { .fniv = tcg_gen_ssadd_vec,
2007           .fno = gen_helper_gvec_ssadd32,
2008           .opt_opc = vecop_list,
2009           .vece = MO_32 },
2010         { .fniv = tcg_gen_ssadd_vec,
2011           .fno = gen_helper_gvec_ssadd64,
2012           .opt_opc = vecop_list,
2013           .vece = MO_64 },
2014     };
2015     tcg_debug_assert(vece <= MO_64);
2016     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2017 }
2018 
2019 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2020                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2021 {
2022     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2023     static const GVecGen3 g[4] = {
2024         { .fniv = tcg_gen_sssub_vec,
2025           .fno = gen_helper_gvec_sssub8,
2026           .opt_opc = vecop_list,
2027           .vece = MO_8 },
2028         { .fniv = tcg_gen_sssub_vec,
2029           .fno = gen_helper_gvec_sssub16,
2030           .opt_opc = vecop_list,
2031           .vece = MO_16 },
2032         { .fniv = tcg_gen_sssub_vec,
2033           .fno = gen_helper_gvec_sssub32,
2034           .opt_opc = vecop_list,
2035           .vece = MO_32 },
2036         { .fniv = tcg_gen_sssub_vec,
2037           .fno = gen_helper_gvec_sssub64,
2038           .opt_opc = vecop_list,
2039           .vece = MO_64 },
2040     };
2041     tcg_debug_assert(vece <= MO_64);
2042     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2043 }
2044 
2045 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2046 {
2047     TCGv_i32 max = tcg_const_i32(-1);
2048     tcg_gen_add_i32(d, a, b);
2049     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2050     tcg_temp_free_i32(max);
2051 }
2052 
2053 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2054 {
2055     TCGv_i64 max = tcg_const_i64(-1);
2056     tcg_gen_add_i64(d, a, b);
2057     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2058     tcg_temp_free_i64(max);
2059 }
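/*
 * Illustration: unsigned saturating add.  The addition overflowed
 * exactly when the result is (unsigned) less than one of the
 * operands, in which case the movcond replaces it with the all-ones
 * maximum.
 */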
2060 
2061 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2062                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2063 {
2064     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2065     static const GVecGen3 g[4] = {
2066         { .fniv = tcg_gen_usadd_vec,
2067           .fno = gen_helper_gvec_usadd8,
2068           .opt_opc = vecop_list,
2069           .vece = MO_8 },
2070         { .fniv = tcg_gen_usadd_vec,
2071           .fno = gen_helper_gvec_usadd16,
2072           .opt_opc = vecop_list,
2073           .vece = MO_16 },
2074         { .fni4 = tcg_gen_usadd_i32,
2075           .fniv = tcg_gen_usadd_vec,
2076           .fno = gen_helper_gvec_usadd32,
2077           .opt_opc = vecop_list,
2078           .vece = MO_32 },
2079         { .fni8 = tcg_gen_usadd_i64,
2080           .fniv = tcg_gen_usadd_vec,
2081           .fno = gen_helper_gvec_usadd64,
2082           .opt_opc = vecop_list,
2083           .vece = MO_64 }
2084     };
2085     tcg_debug_assert(vece <= MO_64);
2086     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2087 }
2088 
2089 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2090 {
2091     TCGv_i32 min = tcg_const_i32(0);
2092     tcg_gen_sub_i32(d, a, b);
2093     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2094     tcg_temp_free_i32(min);
2095 }
2096 
2097 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2098 {
2099     TCGv_i64 min = tcg_const_i64(0);
2100     tcg_gen_sub_i64(d, a, b);
2101     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2102     tcg_temp_free_i64(min);
2103 }
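/*
 * Illustration: unsigned saturating subtract.  If a < b the
 * difference would wrap, so the movcond substitutes the minimum
 * value 0.
 */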
2104 
2105 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2106                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2107 {
2108     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2109     static const GVecGen3 g[4] = {
2110         { .fniv = tcg_gen_ussub_vec,
2111           .fno = gen_helper_gvec_ussub8,
2112           .opt_opc = vecop_list,
2113           .vece = MO_8 },
2114         { .fniv = tcg_gen_ussub_vec,
2115           .fno = gen_helper_gvec_ussub16,
2116           .opt_opc = vecop_list,
2117           .vece = MO_16 },
2118         { .fni4 = tcg_gen_ussub_i32,
2119           .fniv = tcg_gen_ussub_vec,
2120           .fno = gen_helper_gvec_ussub32,
2121           .opt_opc = vecop_list,
2122           .vece = MO_32 },
2123         { .fni8 = tcg_gen_ussub_i64,
2124           .fniv = tcg_gen_ussub_vec,
2125           .fno = gen_helper_gvec_ussub64,
2126           .opt_opc = vecop_list,
2127           .vece = MO_64 }
2128     };
2129     tcg_debug_assert(vece <= MO_64);
2130     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2131 }
2132 
2133 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2134                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2135 {
2136     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2137     static const GVecGen3 g[4] = {
2138         { .fniv = tcg_gen_smin_vec,
2139           .fno = gen_helper_gvec_smin8,
2140           .opt_opc = vecop_list,
2141           .vece = MO_8 },
2142         { .fniv = tcg_gen_smin_vec,
2143           .fno = gen_helper_gvec_smin16,
2144           .opt_opc = vecop_list,
2145           .vece = MO_16 },
2146         { .fni4 = tcg_gen_smin_i32,
2147           .fniv = tcg_gen_smin_vec,
2148           .fno = gen_helper_gvec_smin32,
2149           .opt_opc = vecop_list,
2150           .vece = MO_32 },
2151         { .fni8 = tcg_gen_smin_i64,
2152           .fniv = tcg_gen_smin_vec,
2153           .fno = gen_helper_gvec_smin64,
2154           .opt_opc = vecop_list,
2155           .vece = MO_64 }
2156     };
2157     tcg_debug_assert(vece <= MO_64);
2158     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2159 }
2160 
2161 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2162                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2163 {
2164     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2165     static const GVecGen3 g[4] = {
2166         { .fniv = tcg_gen_umin_vec,
2167           .fno = gen_helper_gvec_umin8,
2168           .opt_opc = vecop_list,
2169           .vece = MO_8 },
2170         { .fniv = tcg_gen_umin_vec,
2171           .fno = gen_helper_gvec_umin16,
2172           .opt_opc = vecop_list,
2173           .vece = MO_16 },
2174         { .fni4 = tcg_gen_umin_i32,
2175           .fniv = tcg_gen_umin_vec,
2176           .fno = gen_helper_gvec_umin32,
2177           .opt_opc = vecop_list,
2178           .vece = MO_32 },
2179         { .fni8 = tcg_gen_umin_i64,
2180           .fniv = tcg_gen_umin_vec,
2181           .fno = gen_helper_gvec_umin64,
2182           .opt_opc = vecop_list,
2183           .vece = MO_64 }
2184     };
2185     tcg_debug_assert(vece <= MO_64);
2186     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2187 }
2188 
2189 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2190                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2191 {
2192     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2193     static const GVecGen3 g[4] = {
2194         { .fniv = tcg_gen_smax_vec,
2195           .fno = gen_helper_gvec_smax8,
2196           .opt_opc = vecop_list,
2197           .vece = MO_8 },
2198         { .fniv = tcg_gen_smax_vec,
2199           .fno = gen_helper_gvec_smax16,
2200           .opt_opc = vecop_list,
2201           .vece = MO_16 },
2202         { .fni4 = tcg_gen_smax_i32,
2203           .fniv = tcg_gen_smax_vec,
2204           .fno = gen_helper_gvec_smax32,
2205           .opt_opc = vecop_list,
2206           .vece = MO_32 },
2207         { .fni8 = tcg_gen_smax_i64,
2208           .fniv = tcg_gen_smax_vec,
2209           .fno = gen_helper_gvec_smax64,
2210           .opt_opc = vecop_list,
2211           .vece = MO_64 }
2212     };
2213     tcg_debug_assert(vece <= MO_64);
2214     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2215 }
2216 
2217 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2218                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2219 {
2220     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2221     static const GVecGen3 g[4] = {
2222         { .fniv = tcg_gen_umax_vec,
2223           .fno = gen_helper_gvec_umax8,
2224           .opt_opc = vecop_list,
2225           .vece = MO_8 },
2226         { .fniv = tcg_gen_umax_vec,
2227           .fno = gen_helper_gvec_umax16,
2228           .opt_opc = vecop_list,
2229           .vece = MO_16 },
2230         { .fni4 = tcg_gen_umax_i32,
2231           .fniv = tcg_gen_umax_vec,
2232           .fno = gen_helper_gvec_umax32,
2233           .opt_opc = vecop_list,
2234           .vece = MO_32 },
2235         { .fni8 = tcg_gen_umax_i64,
2236           .fniv = tcg_gen_umax_vec,
2237           .fno = gen_helper_gvec_umax64,
2238           .opt_opc = vecop_list,
2239           .vece = MO_64 }
2240     };
2241     tcg_debug_assert(vece <= MO_64);
2242     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2243 }
2244 
2245 /* Perform a vector negation using normal negation and a mask.
2246    Compare gen_subv_mask above.  */
2247 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2248 {
2249     TCGv_i64 t2 = tcg_temp_new_i64();
2250     TCGv_i64 t3 = tcg_temp_new_i64();
2251 
2252     tcg_gen_andc_i64(t3, m, b);
2253     tcg_gen_andc_i64(t2, b, m);
2254     tcg_gen_sub_i64(d, m, t2);
2255     tcg_gen_xor_i64(d, d, t3);
2256 
2257     tcg_temp_free_i64(t2);
2258     tcg_temp_free_i64(t3);
2259 }
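/*
 * Worked illustration for MO_8: a lane of b = 0x01 gives
 * 0x80 - 0x01 = 0x7f, and xoring with (m & ~b) = 0x80 yields
 * 0xff == -1; the subtraction m - (b & ~m) can never borrow across
 * a lane boundary because every minuend lane has its msb set.
 */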
2260 
2261 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2262 {
2263     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2264     gen_negv_mask(d, b, m);
2265     tcg_temp_free_i64(m);
2266 }
2267 
2268 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2269 {
2270     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2271     gen_negv_mask(d, b, m);
2272     tcg_temp_free_i64(m);
2273 }
2274 
2275 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2276 {
2277     TCGv_i64 t1 = tcg_temp_new_i64();
2278     TCGv_i64 t2 = tcg_temp_new_i64();
2279 
2280     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2281     tcg_gen_neg_i64(t2, b);
2282     tcg_gen_neg_i64(t1, t1);
2283     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2284 
2285     tcg_temp_free_i64(t1);
2286     tcg_temp_free_i64(t2);
2287 }
2288 
2289 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2290                       uint32_t oprsz, uint32_t maxsz)
2291 {
2292     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2293     static const GVecGen2 g[4] = {
2294         { .fni8 = tcg_gen_vec_neg8_i64,
2295           .fniv = tcg_gen_neg_vec,
2296           .fno = gen_helper_gvec_neg8,
2297           .opt_opc = vecop_list,
2298           .vece = MO_8 },
2299         { .fni8 = tcg_gen_vec_neg16_i64,
2300           .fniv = tcg_gen_neg_vec,
2301           .fno = gen_helper_gvec_neg16,
2302           .opt_opc = vecop_list,
2303           .vece = MO_16 },
2304         { .fni4 = tcg_gen_neg_i32,
2305           .fniv = tcg_gen_neg_vec,
2306           .fno = gen_helper_gvec_neg32,
2307           .opt_opc = vecop_list,
2308           .vece = MO_32 },
2309         { .fni8 = tcg_gen_neg_i64,
2310           .fniv = tcg_gen_neg_vec,
2311           .fno = gen_helper_gvec_neg64,
2312           .opt_opc = vecop_list,
2313           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2314           .vece = MO_64 },
2315     };
2316 
2317     tcg_debug_assert(vece <= MO_64);
2318     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2319 }
2320 
2321 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2322 {
2323     TCGv_i64 t = tcg_temp_new_i64();
2324     int nbit = 8 << vece;
2325 
2326     /* Create -1 for each negative element.  */
2327     tcg_gen_shri_i64(t, b, nbit - 1);
2328     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2329     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2330 
2331     /*
2332      * Invert (via xor -1) and add one.
2333      * Because of the ordering the msb is cleared,
2334      * so we never have carry into the next element.
2335      */
2336     tcg_gen_xor_i64(d, b, t);
2337     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2338     tcg_gen_add_i64(d, d, t);
2339 
2340     tcg_temp_free_i64(t);
2341 }
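/*
 * Worked illustration for MO_8: a lane holding 0xfe (-2) produces
 * t = 0xff, so d = 0xfe ^ 0xff = 0x01 and the final add of the
 * isolated low bit gives 0x02.  A non-negative lane produces t = 0
 * and passes through unchanged.
 */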
2342 
2343 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2344 {
2345     gen_absv_mask(d, b, MO_8);
2346 }
2347 
2348 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2349 {
2350     gen_absv_mask(d, b, MO_16);
2351 }
2352 
2353 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2354                       uint32_t oprsz, uint32_t maxsz)
2355 {
2356     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2357     static const GVecGen2 g[4] = {
2358         { .fni8 = tcg_gen_vec_abs8_i64,
2359           .fniv = tcg_gen_abs_vec,
2360           .fno = gen_helper_gvec_abs8,
2361           .opt_opc = vecop_list,
2362           .vece = MO_8 },
2363         { .fni8 = tcg_gen_vec_abs16_i64,
2364           .fniv = tcg_gen_abs_vec,
2365           .fno = gen_helper_gvec_abs16,
2366           .opt_opc = vecop_list,
2367           .vece = MO_16 },
2368         { .fni4 = tcg_gen_abs_i32,
2369           .fniv = tcg_gen_abs_vec,
2370           .fno = gen_helper_gvec_abs32,
2371           .opt_opc = vecop_list,
2372           .vece = MO_32 },
2373         { .fni8 = tcg_gen_abs_i64,
2374           .fniv = tcg_gen_abs_vec,
2375           .fno = gen_helper_gvec_abs64,
2376           .opt_opc = vecop_list,
2377           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2378           .vece = MO_64 },
2379     };
2380 
2381     tcg_debug_assert(vece <= MO_64);
2382     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2383 }
2384 
2385 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2386                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2387 {
2388     static const GVecGen3 g = {
2389         .fni8 = tcg_gen_and_i64,
2390         .fniv = tcg_gen_and_vec,
2391         .fno = gen_helper_gvec_and,
2392         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2393     };
2394 
2395     if (aofs == bofs) {
2396         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2397     } else {
2398         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2399     }
2400 }
2401 
2402 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2403                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2404 {
2405     static const GVecGen3 g = {
2406         .fni8 = tcg_gen_or_i64,
2407         .fniv = tcg_gen_or_vec,
2408         .fno = gen_helper_gvec_or,
2409         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2410     };
2411 
2412     if (aofs == bofs) {
2413         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2414     } else {
2415         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2416     }
2417 }
2418 
2419 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2420                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2421 {
2422     static const GVecGen3 g = {
2423         .fni8 = tcg_gen_xor_i64,
2424         .fniv = tcg_gen_xor_vec,
2425         .fno = gen_helper_gvec_xor,
2426         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2427     };
2428 
2429     if (aofs == bofs) {
2430         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2431     } else {
2432         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2433     }
2434 }
2435 
2436 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2437                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2438 {
2439     static const GVecGen3 g = {
2440         .fni8 = tcg_gen_andc_i64,
2441         .fniv = tcg_gen_andc_vec,
2442         .fno = gen_helper_gvec_andc,
2443         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2444     };
2445 
2446     if (aofs == bofs) {
2447         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2448     } else {
2449         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2450     }
2451 }
2452 
2453 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2454                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2455 {
2456     static const GVecGen3 g = {
2457         .fni8 = tcg_gen_orc_i64,
2458         .fniv = tcg_gen_orc_vec,
2459         .fno = gen_helper_gvec_orc,
2460         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2461     };
2462 
2463     if (aofs == bofs) {
2464         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2465     } else {
2466         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2467     }
2468 }
2469 
2470 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2471                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2472 {
2473     static const GVecGen3 g = {
2474         .fni8 = tcg_gen_nand_i64,
2475         .fniv = tcg_gen_nand_vec,
2476         .fno = gen_helper_gvec_nand,
2477         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2478     };
2479 
2480     if (aofs == bofs) {
2481         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2482     } else {
2483         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2484     }
2485 }
2486 
2487 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2488                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2489 {
2490     static const GVecGen3 g = {
2491         .fni8 = tcg_gen_nor_i64,
2492         .fniv = tcg_gen_nor_vec,
2493         .fno = gen_helper_gvec_nor,
2494         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2495     };
2496 
2497     if (aofs == bofs) {
2498         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2499     } else {
2500         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2501     }
2502 }
2503 
2504 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2505                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2506 {
2507     static const GVecGen3 g = {
2508         .fni8 = tcg_gen_eqv_i64,
2509         .fniv = tcg_gen_eqv_vec,
2510         .fno = gen_helper_gvec_eqv,
2511         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2512     };
2513 
2514     if (aofs == bofs) {
2515         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2516     } else {
2517         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2518     }
2519 }
2520 
2521 static const GVecGen2s gop_ands = {
2522     .fni8 = tcg_gen_and_i64,
2523     .fniv = tcg_gen_and_vec,
2524     .fno = gen_helper_gvec_ands,
2525     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2526     .vece = MO_64
2527 };
2528 
2529 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2530                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2531 {
2532     TCGv_i64 tmp = tcg_temp_new_i64();
2533     gen_dup_i64(vece, tmp, c);
2534     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2535     tcg_temp_free_i64(tmp);
2536 }
2537 
2538 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2539                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2540 {
2541     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2542     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2543     tcg_temp_free_i64(tmp);
2544 }
2545 
2546 static const GVecGen2s gop_xors = {
2547     .fni8 = tcg_gen_xor_i64,
2548     .fniv = tcg_gen_xor_vec,
2549     .fno = gen_helper_gvec_xors,
2550     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2551     .vece = MO_64
2552 };
2553 
2554 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2555                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2556 {
2557     TCGv_i64 tmp = tcg_temp_new_i64();
2558     gen_dup_i64(vece, tmp, c);
2559     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2560     tcg_temp_free_i64(tmp);
2561 }
2562 
2563 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2564                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2565 {
2566     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2567     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2568     tcg_temp_free_i64(tmp);
2569 }
2570 
2571 static const GVecGen2s gop_ors = {
2572     .fni8 = tcg_gen_or_i64,
2573     .fniv = tcg_gen_or_vec,
2574     .fno = gen_helper_gvec_ors,
2575     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2576     .vece = MO_64
2577 };
2578 
2579 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2580                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2581 {
2582     TCGv_i64 tmp = tcg_temp_new_i64();
2583     gen_dup_i64(vece, tmp, c);
2584     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2585     tcg_temp_free_i64(tmp);
2586 }
2587 
2588 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2589                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2590 {
2591     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2592     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2593     tcg_temp_free_i64(tmp);
2594 }
2595 
2596 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2597 {
2598     uint64_t mask = dup_const(MO_8, 0xff << c);
2599     tcg_gen_shli_i64(d, a, c);
2600     tcg_gen_andi_i64(d, d, mask);
2601 }
2602 
2603 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2604 {
2605     uint64_t mask = dup_const(MO_16, 0xffff << c);
2606     tcg_gen_shli_i64(d, a, c);
2607     tcg_gen_andi_i64(d, d, mask);
2608 }
2609 
2610 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2611                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2612 {
2613     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2614     static const GVecGen2i g[4] = {
2615         { .fni8 = tcg_gen_vec_shl8i_i64,
2616           .fniv = tcg_gen_shli_vec,
2617           .fno = gen_helper_gvec_shl8i,
2618           .opt_opc = vecop_list,
2619           .vece = MO_8 },
2620         { .fni8 = tcg_gen_vec_shl16i_i64,
2621           .fniv = tcg_gen_shli_vec,
2622           .fno = gen_helper_gvec_shl16i,
2623           .opt_opc = vecop_list,
2624           .vece = MO_16 },
2625         { .fni4 = tcg_gen_shli_i32,
2626           .fniv = tcg_gen_shli_vec,
2627           .fno = gen_helper_gvec_shl32i,
2628           .opt_opc = vecop_list,
2629           .vece = MO_32 },
2630         { .fni8 = tcg_gen_shli_i64,
2631           .fniv = tcg_gen_shli_vec,
2632           .fno = gen_helper_gvec_shl64i,
2633           .opt_opc = vecop_list,
2634           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2635           .vece = MO_64 },
2636     };
2637 
2638     tcg_debug_assert(vece <= MO_64);
2639     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2640     if (shift == 0) {
2641         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2642     } else {
2643         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2644     }
2645 }
2646 
2647 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2648 {
2649     uint64_t mask = dup_const(MO_8, 0xff >> c);
2650     tcg_gen_shri_i64(d, a, c);
2651     tcg_gen_andi_i64(d, d, mask);
2652 }
2653 
2654 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2655 {
2656     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2657     tcg_gen_shri_i64(d, a, c);
2658     tcg_gen_andi_i64(d, d, mask);
2659 }
2660 
2661 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2662                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2663 {
2664     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2665     static const GVecGen2i g[4] = {
2666         { .fni8 = tcg_gen_vec_shr8i_i64,
2667           .fniv = tcg_gen_shri_vec,
2668           .fno = gen_helper_gvec_shr8i,
2669           .opt_opc = vecop_list,
2670           .vece = MO_8 },
2671         { .fni8 = tcg_gen_vec_shr16i_i64,
2672           .fniv = tcg_gen_shri_vec,
2673           .fno = gen_helper_gvec_shr16i,
2674           .opt_opc = vecop_list,
2675           .vece = MO_16 },
2676         { .fni4 = tcg_gen_shri_i32,
2677           .fniv = tcg_gen_shri_vec,
2678           .fno = gen_helper_gvec_shr32i,
2679           .opt_opc = vecop_list,
2680           .vece = MO_32 },
2681         { .fni8 = tcg_gen_shri_i64,
2682           .fniv = tcg_gen_shri_vec,
2683           .fno = gen_helper_gvec_shr64i,
2684           .opt_opc = vecop_list,
2685           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2686           .vece = MO_64 },
2687     };
2688 
2689     tcg_debug_assert(vece <= MO_64);
2690     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2691     if (shift == 0) {
2692         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2693     } else {
2694         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2695     }
2696 }
2697 
2698 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2699 {
2700     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2701     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2702     TCGv_i64 s = tcg_temp_new_i64();
2703 
2704     tcg_gen_shri_i64(d, a, c);
2705     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2706     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2707     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2708     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2709     tcg_temp_free_i64(s);
2710 }
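/*
 * Illustration: after the logical shift each lane's sign bit sits at
 * bit (7 - c).  Multiplying the isolated bit by (2 << c) - 2, i.e.
 * 2^(c+1) - 2, copies it into bits (8 - c)..7, exactly the positions
 * that need sign extension, and the product never overflows into the
 * next lane.  The MO_16 expansion below uses the same construction.
 */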
2711 
2712 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2713 {
2714     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2715     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2716     TCGv_i64 s = tcg_temp_new_i64();
2717 
2718     tcg_gen_shri_i64(d, a, c);
2719     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2720     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2721     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2722     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2723     tcg_temp_free_i64(s);
2724 }
2725 
2726 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2727                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2728 {
2729     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2730     static const GVecGen2i g[4] = {
2731         { .fni8 = tcg_gen_vec_sar8i_i64,
2732           .fniv = tcg_gen_sari_vec,
2733           .fno = gen_helper_gvec_sar8i,
2734           .opt_opc = vecop_list,
2735           .vece = MO_8 },
2736         { .fni8 = tcg_gen_vec_sar16i_i64,
2737           .fniv = tcg_gen_sari_vec,
2738           .fno = gen_helper_gvec_sar16i,
2739           .opt_opc = vecop_list,
2740           .vece = MO_16 },
2741         { .fni4 = tcg_gen_sari_i32,
2742           .fniv = tcg_gen_sari_vec,
2743           .fno = gen_helper_gvec_sar32i,
2744           .opt_opc = vecop_list,
2745           .vece = MO_32 },
2746         { .fni8 = tcg_gen_sari_i64,
2747           .fniv = tcg_gen_sari_vec,
2748           .fno = gen_helper_gvec_sar64i,
2749           .opt_opc = vecop_list,
2750           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2751           .vece = MO_64 },
2752     };
2753 
2754     tcg_debug_assert(vece <= MO_64);
2755     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2756     if (shift == 0) {
2757         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2758     } else {
2759         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2760     }
2761 }
2762 
2763 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2764 {
2765     uint64_t mask = dup_const(MO_8, 0xff << c);
2766 
2767     tcg_gen_shli_i64(d, a, c);
2768     tcg_gen_shri_i64(a, a, 8 - c);
2769     tcg_gen_andi_i64(d, d, mask);
2770     tcg_gen_andi_i64(a, a, ~mask);
2771     tcg_gen_or_i64(d, d, a);
2772 }
2773 
2774 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2775 {
2776     uint64_t mask = dup_const(MO_16, 0xffff << c);
2777 
2778     tcg_gen_shli_i64(d, a, c);
2779     tcg_gen_shri_i64(a, a, 16 - c);
2780     tcg_gen_andi_i64(d, d, mask);
2781     tcg_gen_andi_i64(a, a, ~mask);
2782     tcg_gen_or_i64(d, d, a);
2783 }
2784 
2785 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2786                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2787 {
2788     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2789     static const GVecGen2i g[4] = {
2790         { .fni8 = tcg_gen_vec_rotl8i_i64,
2791           .fniv = tcg_gen_rotli_vec,
2792           .fno = gen_helper_gvec_rotl8i,
2793           .opt_opc = vecop_list,
2794           .vece = MO_8 },
2795         { .fni8 = tcg_gen_vec_rotl16i_i64,
2796           .fniv = tcg_gen_rotli_vec,
2797           .fno = gen_helper_gvec_rotl16i,
2798           .opt_opc = vecop_list,
2799           .vece = MO_16 },
2800         { .fni4 = tcg_gen_rotli_i32,
2801           .fniv = tcg_gen_rotli_vec,
2802           .fno = gen_helper_gvec_rotl32i,
2803           .opt_opc = vecop_list,
2804           .vece = MO_32 },
2805         { .fni8 = tcg_gen_rotli_i64,
2806           .fniv = tcg_gen_rotli_vec,
2807           .fno = gen_helper_gvec_rotl64i,
2808           .opt_opc = vecop_list,
2809           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2810           .vece = MO_64 },
2811     };
2812 
2813     tcg_debug_assert(vece <= MO_64);
2814     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2815     if (shift == 0) {
2816         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2817     } else {
2818         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2819     }
2820 }
2821 
2822 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2823                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2824 {
2825     tcg_debug_assert(vece <= MO_64);
2826     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2827     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2828                        oprsz, maxsz);
2829 }
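/*
 * Illustration: a rotate right by SHIFT is a rotate left by
 * (element_bits - SHIFT); the mask folds SHIFT == 0 back to 0 so the
 * rotli expander's range assertion still holds.
 */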
2830 
2831 /*
2832  * Specialized generation of vector shifts by a non-constant scalar.
2833  */
2834 
2835 typedef struct {
2836     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2837     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2838     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2839     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2840     gen_helper_gvec_2 *fno[4];
2841     TCGOpcode s_list[2];
2842     TCGOpcode v_list[2];
2843 } GVecGen2sh;
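/*
 * Descriptive note on the fields above, as used by do_gvec_shifts:
 * fniv_s expands with a vector-by-scalar shift opcode (s_list),
 * fniv_v with a vector-by-vector shift (v_list) after broadcasting
 * the scalar, fni4/fni8 are the integral fallbacks, and fno[vece] is
 * the out-of-line helper, which receives the shift count in the
 * descriptor's data field.
 */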
2844 
2845 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2846                            uint32_t oprsz, uint32_t tysz, TCGType type,
2847                            TCGv_i32 shift,
2848                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2849 {
2850     TCGv_vec t0 = tcg_temp_new_vec(type);
2851     uint32_t i;
2852 
2853     for (i = 0; i < oprsz; i += tysz) {
2854         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2855         fni(vece, t0, t0, shift);
2856         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2857     }
2858     tcg_temp_free_vec(t0);
2859 }
2860 
2861 static void
2862 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2863                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2864 {
2865     TCGType type;
2866     uint32_t some;
2867 
2868     check_size_align(oprsz, maxsz, dofs | aofs);
2869     check_overlap_2(dofs, aofs, maxsz);
2870 
2871     /* If the backend has a scalar expansion, great.  */
2872     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2873     if (type) {
2874         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2875         switch (type) {
2876         case TCG_TYPE_V256:
2877             some = QEMU_ALIGN_DOWN(oprsz, 32);
2878             expand_2sh_vec(vece, dofs, aofs, some, 32,
2879                            TCG_TYPE_V256, shift, g->fniv_s);
2880             if (some == oprsz) {
2881                 break;
2882             }
2883             dofs += some;
2884             aofs += some;
2885             oprsz -= some;
2886             maxsz -= some;
2887             /* fallthru */
2888         case TCG_TYPE_V128:
2889             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2890                            TCG_TYPE_V128, shift, g->fniv_s);
2891             break;
2892         case TCG_TYPE_V64:
2893             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2894                            TCG_TYPE_V64, shift, g->fniv_s);
2895             break;
2896         default:
2897             g_assert_not_reached();
2898         }
2899         tcg_swap_vecop_list(hold_list);
2900         goto clear_tail;
2901     }
2902 
2903     /* If the backend supports variable vector shifts, also cool.  */
2904     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2905     if (type) {
2906         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2907         TCGv_vec v_shift = tcg_temp_new_vec(type);
2908 
2909         if (vece == MO_64) {
2910             TCGv_i64 sh64 = tcg_temp_new_i64();
2911             tcg_gen_extu_i32_i64(sh64, shift);
2912             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2913             tcg_temp_free_i64(sh64);
2914         } else {
2915             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2916         }
2917 
2918         switch (type) {
2919         case TCG_TYPE_V256:
2920             some = QEMU_ALIGN_DOWN(oprsz, 32);
2921             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2922                           v_shift, false, g->fniv_v);
2923             if (some == oprsz) {
2924                 break;
2925             }
2926             dofs += some;
2927             aofs += some;
2928             oprsz -= some;
2929             maxsz -= some;
2930             /* fallthru */
2931         case TCG_TYPE_V128:
2932             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2933                           v_shift, false, g->fniv_v);
2934             break;
2935         case TCG_TYPE_V64:
2936             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2937                           v_shift, false, g->fniv_v);
2938             break;
2939         default:
2940             g_assert_not_reached();
2941         }
2942         tcg_temp_free_vec(v_shift);
2943         tcg_swap_vecop_list(hold_list);
2944         goto clear_tail;
2945     }
2946 
2947     /* Otherwise fall back to integral... */
2948     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2949         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2950     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2951         TCGv_i64 sh64 = tcg_temp_new_i64();
2952         tcg_gen_extu_i32_i64(sh64, shift);
2953         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2954         tcg_temp_free_i64(sh64);
2955     } else {
2956         TCGv_ptr a0 = tcg_temp_new_ptr();
2957         TCGv_ptr a1 = tcg_temp_new_ptr();
2958         TCGv_i32 desc = tcg_temp_new_i32();
2959 
2960         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2961         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2962         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2963         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2964 
2965         g->fno[vece](a0, a1, desc);
2966 
2967         tcg_temp_free_ptr(a0);
2968         tcg_temp_free_ptr(a1);
2969         tcg_temp_free_i32(desc);
2970         return;
2971     }
2972 
2973  clear_tail:
2974     if (oprsz < maxsz) {
2975         expand_clr(dofs + oprsz, maxsz - oprsz);
2976     }
2977 }
2978 
2979 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2980                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2981 {
2982     static const GVecGen2sh g = {
2983         .fni4 = tcg_gen_shl_i32,
2984         .fni8 = tcg_gen_shl_i64,
2985         .fniv_s = tcg_gen_shls_vec,
2986         .fniv_v = tcg_gen_shlv_vec,
2987         .fno = {
2988             gen_helper_gvec_shl8i,
2989             gen_helper_gvec_shl16i,
2990             gen_helper_gvec_shl32i,
2991             gen_helper_gvec_shl64i,
2992         },
2993         .s_list = { INDEX_op_shls_vec, 0 },
2994         .v_list = { INDEX_op_shlv_vec, 0 },
2995     };
2996 
2997     tcg_debug_assert(vece <= MO_64);
2998     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2999 }
3000 
3001 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3002                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3003 {
3004     static const GVecGen2sh g = {
3005         .fni4 = tcg_gen_shr_i32,
3006         .fni8 = tcg_gen_shr_i64,
3007         .fniv_s = tcg_gen_shrs_vec,
3008         .fniv_v = tcg_gen_shrv_vec,
3009         .fno = {
3010             gen_helper_gvec_shr8i,
3011             gen_helper_gvec_shr16i,
3012             gen_helper_gvec_shr32i,
3013             gen_helper_gvec_shr64i,
3014         },
3015         .s_list = { INDEX_op_shrs_vec, 0 },
3016         .v_list = { INDEX_op_shrv_vec, 0 },
3017     };
3018 
3019     tcg_debug_assert(vece <= MO_64);
3020     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3021 }
3022 
3023 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3024                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3025 {
3026     static const GVecGen2sh g = {
3027         .fni4 = tcg_gen_sar_i32,
3028         .fni8 = tcg_gen_sar_i64,
3029         .fniv_s = tcg_gen_sars_vec,
3030         .fniv_v = tcg_gen_sarv_vec,
3031         .fno = {
3032             gen_helper_gvec_sar8i,
3033             gen_helper_gvec_sar16i,
3034             gen_helper_gvec_sar32i,
3035             gen_helper_gvec_sar64i,
3036         },
3037         .s_list = { INDEX_op_sars_vec, 0 },
3038         .v_list = { INDEX_op_sarv_vec, 0 },
3039     };
3040 
3041     tcg_debug_assert(vece <= MO_64);
3042     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3043 }
3044 
3045 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3046                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3047 {
3048     static const GVecGen2sh g = {
3049         .fni4 = tcg_gen_rotl_i32,
3050         .fni8 = tcg_gen_rotl_i64,
3051         .fniv_s = tcg_gen_rotls_vec,
3052         .fniv_v = tcg_gen_rotlv_vec,
3053         .fno = {
3054             gen_helper_gvec_rotl8i,
3055             gen_helper_gvec_rotl16i,
3056             gen_helper_gvec_rotl32i,
3057             gen_helper_gvec_rotl64i,
3058         },
3059         .s_list = { INDEX_op_rotls_vec, 0 },
3060         .v_list = { INDEX_op_rotlv_vec, 0 },
3061     };
3062 
3063     tcg_debug_assert(vece <= MO_64);
3064     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3065 }
3066 
3067 /*
3068  * Expand D = A << (B % element bits)
3069  *
3070  * Unlike scalar shifts, it is not easy for the target front end
3071  * to fold the modulo into its own expansion.  If the target
3072  * naturally includes the modulo as part of the operation, great!
3073  * If the target has some other behaviour for out-of-range shifts,
3074  * then it could not use this function anyway, and would need to
3075  * do its own expansion with custom functions.
3076  */
3077 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3078                                  TCGv_vec a, TCGv_vec b)
3079 {
3080     TCGv_vec t = tcg_temp_new_vec_matching(d);
3081 
3082     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3083     tcg_gen_and_vec(vece, t, t, b);
3084     tcg_gen_shlv_vec(vece, d, a, t);
3085     tcg_temp_free_vec(t);
3086 }
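/*
 * Note: since the element width is a power of two, "b % element bits"
 * is just "b & (element bits - 1)", which the dupi/and pair above
 * computes for every lane at once.
 */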
3087 
3088 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3089 {
3090     TCGv_i32 t = tcg_temp_new_i32();
3091 
3092     tcg_gen_andi_i32(t, b, 31);
3093     tcg_gen_shl_i32(d, a, t);
3094     tcg_temp_free_i32(t);
3095 }
3096 
3097 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3098 {
3099     TCGv_i64 t = tcg_temp_new_i64();
3100 
3101     tcg_gen_andi_i64(t, b, 63);
3102     tcg_gen_shl_i64(d, a, t);
3103     tcg_temp_free_i64(t);
3104 }
3105 
3106 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3107                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3108 {
3109     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3110     static const GVecGen3 g[4] = {
3111         { .fniv = tcg_gen_shlv_mod_vec,
3112           .fno = gen_helper_gvec_shl8v,
3113           .opt_opc = vecop_list,
3114           .vece = MO_8 },
3115         { .fniv = tcg_gen_shlv_mod_vec,
3116           .fno = gen_helper_gvec_shl16v,
3117           .opt_opc = vecop_list,
3118           .vece = MO_16 },
3119         { .fni4 = tcg_gen_shl_mod_i32,
3120           .fniv = tcg_gen_shlv_mod_vec,
3121           .fno = gen_helper_gvec_shl32v,
3122           .opt_opc = vecop_list,
3123           .vece = MO_32 },
3124         { .fni8 = tcg_gen_shl_mod_i64,
3125           .fniv = tcg_gen_shlv_mod_vec,
3126           .fno = gen_helper_gvec_shl64v,
3127           .opt_opc = vecop_list,
3128           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3129           .vece = MO_64 },
3130     };
3131 
3132     tcg_debug_assert(vece <= MO_64);
3133     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3134 }
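
/*
 * Editor's note: the sketch below is not part of the original file.  It
 * is a minimal, hypothetical illustration of how a target front end
 * might emit the modulo-shift expansion described above; the same shape
 * applies to tcg_gen_gvec_shrv, _sarv, _rotlv and _rotrv.  The helper
 * example_vreg_offset() and the 16-byte vector size are assumptions for
 * illustration only; real targets pass env offsets of their own guest
 * vector registers.
 */
#if 0
static void gen_example_shlv(int rd, int rn, int rm)
{
    /* For each 32-bit element: D[i] = N[i] << (M[i] % 32).  */
    tcg_gen_gvec_shlv(MO_32,
                      example_vreg_offset(rd),   /* dofs: destination    */
                      example_vreg_offset(rn),   /* aofs: shifted values */
                      example_vreg_offset(rm),   /* bofs: shift counts   */
                      16 /* oprsz */, 16 /* maxsz */);
}
#endif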
3135 
3136 /*
3137  * Similarly for logical right shifts.
3138  */
3139 
3140 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3141                                  TCGv_vec a, TCGv_vec b)
3142 {
3143     TCGv_vec t = tcg_temp_new_vec_matching(d);
3144 
3145     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3146     tcg_gen_and_vec(vece, t, t, b);
3147     tcg_gen_shrv_vec(vece, d, a, t);
3148     tcg_temp_free_vec(t);
3149 }
3150 
3151 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3152 {
3153     TCGv_i32 t = tcg_temp_new_i32();
3154 
3155     tcg_gen_andi_i32(t, b, 31);
3156     tcg_gen_shr_i32(d, a, t);
3157     tcg_temp_free_i32(t);
3158 }
3159 
3160 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3161 {
3162     TCGv_i64 t = tcg_temp_new_i64();
3163 
3164     tcg_gen_andi_i64(t, b, 63);
3165     tcg_gen_shr_i64(d, a, t);
3166     tcg_temp_free_i64(t);
3167 }
3168 
3169 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3170                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3171 {
3172     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3173     static const GVecGen3 g[4] = {
3174         { .fniv = tcg_gen_shrv_mod_vec,
3175           .fno = gen_helper_gvec_shr8v,
3176           .opt_opc = vecop_list,
3177           .vece = MO_8 },
3178         { .fniv = tcg_gen_shrv_mod_vec,
3179           .fno = gen_helper_gvec_shr16v,
3180           .opt_opc = vecop_list,
3181           .vece = MO_16 },
3182         { .fni4 = tcg_gen_shr_mod_i32,
3183           .fniv = tcg_gen_shrv_mod_vec,
3184           .fno = gen_helper_gvec_shr32v,
3185           .opt_opc = vecop_list,
3186           .vece = MO_32 },
3187         { .fni8 = tcg_gen_shr_mod_i64,
3188           .fniv = tcg_gen_shrv_mod_vec,
3189           .fno = gen_helper_gvec_shr64v,
3190           .opt_opc = vecop_list,
3191           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3192           .vece = MO_64 },
3193     };
3194 
3195     tcg_debug_assert(vece <= MO_64);
3196     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3197 }
3198 
3199 /*
3200  * Similarly for arithmetic right shifts.
3201  */
3202 
3203 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3204                                  TCGv_vec a, TCGv_vec b)
3205 {
3206     TCGv_vec t = tcg_temp_new_vec_matching(d);
3207 
3208     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3209     tcg_gen_and_vec(vece, t, t, b);
3210     tcg_gen_sarv_vec(vece, d, a, t);
3211     tcg_temp_free_vec(t);
3212 }
3213 
3214 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3215 {
3216     TCGv_i32 t = tcg_temp_new_i32();
3217 
3218     tcg_gen_andi_i32(t, b, 31);
3219     tcg_gen_sar_i32(d, a, t);
3220     tcg_temp_free_i32(t);
3221 }
3222 
3223 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3224 {
3225     TCGv_i64 t = tcg_temp_new_i64();
3226 
3227     tcg_gen_andi_i64(t, b, 63);
3228     tcg_gen_sar_i64(d, a, t);
3229     tcg_temp_free_i64(t);
3230 }
3231 
3232 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3233                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3234 {
3235     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3236     static const GVecGen3 g[4] = {
3237         { .fniv = tcg_gen_sarv_mod_vec,
3238           .fno = gen_helper_gvec_sar8v,
3239           .opt_opc = vecop_list,
3240           .vece = MO_8 },
3241         { .fniv = tcg_gen_sarv_mod_vec,
3242           .fno = gen_helper_gvec_sar16v,
3243           .opt_opc = vecop_list,
3244           .vece = MO_16 },
3245         { .fni4 = tcg_gen_sar_mod_i32,
3246           .fniv = tcg_gen_sarv_mod_vec,
3247           .fno = gen_helper_gvec_sar32v,
3248           .opt_opc = vecop_list,
3249           .vece = MO_32 },
3250         { .fni8 = tcg_gen_sar_mod_i64,
3251           .fniv = tcg_gen_sarv_mod_vec,
3252           .fno = gen_helper_gvec_sar64v,
3253           .opt_opc = vecop_list,
3254           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3255           .vece = MO_64 },
3256     };
3257 
3258     tcg_debug_assert(vece <= MO_64);
3259     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3260 }
3261 
3262 /*
3263  * Similarly for rotates.
3264  */
3265 
3266 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3267                                   TCGv_vec a, TCGv_vec b)
3268 {
3269     TCGv_vec t = tcg_temp_new_vec_matching(d);
3270 
3271     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3272     tcg_gen_and_vec(vece, t, t, b);
3273     tcg_gen_rotlv_vec(vece, d, a, t);
3274     tcg_temp_free_vec(t);
3275 }
3276 
3277 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3278 {
3279     TCGv_i32 t = tcg_temp_new_i32();
3280 
3281     tcg_gen_andi_i32(t, b, 31);
3282     tcg_gen_rotl_i32(d, a, t);
3283     tcg_temp_free_i32(t);
3284 }
3285 
3286 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3287 {
3288     TCGv_i64 t = tcg_temp_new_i64();
3289 
3290     tcg_gen_andi_i64(t, b, 63);
3291     tcg_gen_rotl_i64(d, a, t);
3292     tcg_temp_free_i64(t);
3293 }
3294 
3295 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3296                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3297 {
3298     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3299     static const GVecGen3 g[4] = {
3300         { .fniv = tcg_gen_rotlv_mod_vec,
3301           .fno = gen_helper_gvec_rotl8v,
3302           .opt_opc = vecop_list,
3303           .vece = MO_8 },
3304         { .fniv = tcg_gen_rotlv_mod_vec,
3305           .fno = gen_helper_gvec_rotl16v,
3306           .opt_opc = vecop_list,
3307           .vece = MO_16 },
3308         { .fni4 = tcg_gen_rotl_mod_i32,
3309           .fniv = tcg_gen_rotlv_mod_vec,
3310           .fno = gen_helper_gvec_rotl32v,
3311           .opt_opc = vecop_list,
3312           .vece = MO_32 },
3313         { .fni8 = tcg_gen_rotl_mod_i64,
3314           .fniv = tcg_gen_rotlv_mod_vec,
3315           .fno = gen_helper_gvec_rotl64v,
3316           .opt_opc = vecop_list,
3317           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3318           .vece = MO_64 },
3319     };
3320 
3321     tcg_debug_assert(vece <= MO_64);
3322     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3323 }
3324 
3325 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3326                                   TCGv_vec a, TCGv_vec b)
3327 {
3328     TCGv_vec t = tcg_temp_new_vec_matching(d);
3329 
3330     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3331     tcg_gen_and_vec(vece, t, t, b);
3332     tcg_gen_rotrv_vec(vece, d, a, t);
3333     tcg_temp_free_vec(t);
3334 }
3335 
3336 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3337 {
3338     TCGv_i32 t = tcg_temp_new_i32();
3339 
3340     tcg_gen_andi_i32(t, b, 31);
3341     tcg_gen_rotr_i32(d, a, t);
3342     tcg_temp_free_i32(t);
3343 }
3344 
3345 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3346 {
3347     TCGv_i64 t = tcg_temp_new_i64();
3348 
3349     tcg_gen_andi_i64(t, b, 63);
3350     tcg_gen_rotr_i64(d, a, t);
3351     tcg_temp_free_i64(t);
3352 }
3353 
3354 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3355                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3356 {
3357     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3358     static const GVecGen3 g[4] = {
3359         { .fniv = tcg_gen_rotrv_mod_vec,
3360           .fno = gen_helper_gvec_rotr8v,
3361           .opt_opc = vecop_list,
3362           .vece = MO_8 },
3363         { .fniv = tcg_gen_rotrv_mod_vec,
3364           .fno = gen_helper_gvec_rotr16v,
3365           .opt_opc = vecop_list,
3366           .vece = MO_16 },
3367         { .fni4 = tcg_gen_rotr_mod_i32,
3368           .fniv = tcg_gen_rotrv_mod_vec,
3369           .fno = gen_helper_gvec_rotr32v,
3370           .opt_opc = vecop_list,
3371           .vece = MO_32 },
3372         { .fni8 = tcg_gen_rotr_mod_i64,
3373           .fniv = tcg_gen_rotrv_mod_vec,
3374           .fno = gen_helper_gvec_rotr64v,
3375           .opt_opc = vecop_list,
3376           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3377           .vece = MO_64 },
3378     };
3379 
3380     tcg_debug_assert(vece <= MO_64);
3381     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3382 }
3383 
3384 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3385 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3386                            uint32_t oprsz, TCGCond cond)
3387 {
3388     TCGv_i32 t0 = tcg_temp_new_i32();
3389     TCGv_i32 t1 = tcg_temp_new_i32();
3390     uint32_t i;
3391 
3392     for (i = 0; i < oprsz; i += 4) {
3393         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3394         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3395         tcg_gen_setcond_i32(cond, t0, t0, t1);
3396         tcg_gen_neg_i32(t0, t0);    /* 0/1 -> all-zeros/all-ones mask */
3397         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3398     }
3399     tcg_temp_free_i32(t1);
3400     tcg_temp_free_i32(t0);
3401 }
3402 
3403 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3404                            uint32_t oprsz, TCGCond cond)
3405 {
3406     TCGv_i64 t0 = tcg_temp_new_i64();
3407     TCGv_i64 t1 = tcg_temp_new_i64();
3408     uint32_t i;
3409 
3410     for (i = 0; i < oprsz; i += 8) {
3411         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3412         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3413         tcg_gen_setcond_i64(cond, t0, t0, t1);
3414         tcg_gen_neg_i64(t0, t0);    /* 0/1 -> all-zeros/all-ones mask */
3415         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3416     }
3417     tcg_temp_free_i64(t1);
3418     tcg_temp_free_i64(t0);
3419 }
3420 
3421 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3422                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3423                            TCGType type, TCGCond cond)
3424 {
3425     TCGv_vec t0 = tcg_temp_new_vec(type);
3426     TCGv_vec t1 = tcg_temp_new_vec(type);
3427     uint32_t i;
3428 
3429     for (i = 0; i < oprsz; i += tysz) {
3430         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3431         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3432         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3433         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3434     }
3435     tcg_temp_free_vec(t1);
3436     tcg_temp_free_vec(t0);
3437 }
3438 
3439 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3440                       uint32_t aofs, uint32_t bofs,
3441                       uint32_t oprsz, uint32_t maxsz)
3442 {
3443     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3444     static gen_helper_gvec_3 * const eq_fn[4] = {
3445         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3446         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3447     };
3448     static gen_helper_gvec_3 * const ne_fn[4] = {
3449         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3450         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3451     };
3452     static gen_helper_gvec_3 * const lt_fn[4] = {
3453         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3454         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3455     };
3456     static gen_helper_gvec_3 * const le_fn[4] = {
3457         gen_helper_gvec_le8, gen_helper_gvec_le16,
3458         gen_helper_gvec_le32, gen_helper_gvec_le64
3459     };
3460     static gen_helper_gvec_3 * const ltu_fn[4] = {
3461         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3462         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3463     };
3464     static gen_helper_gvec_3 * const leu_fn[4] = {
3465         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3466         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3467     };
3468     static gen_helper_gvec_3 * const * const fns[16] = {
3469         [TCG_COND_EQ] = eq_fn,
3470         [TCG_COND_NE] = ne_fn,
3471         [TCG_COND_LT] = lt_fn,
3472         [TCG_COND_LE] = le_fn,
3473         [TCG_COND_LTU] = ltu_fn,
3474         [TCG_COND_LEU] = leu_fn,
3475     };
3476 
3477     const TCGOpcode *hold_list;
3478     TCGType type;
3479     uint32_t some;
3480 
3481     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3482     check_overlap_3(dofs, aofs, bofs, maxsz);
3483 
3484     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3485         do_dup(MO_8, dofs, oprsz, maxsz,
3486                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3487         return;
3488     }
3489 
3490     /*
3491      * Implement inline with a vector type, if possible.  Prefer integer
3492      * when 64-bit host and 64-bit comparison.  See the sketch below.
3493      */
3494     hold_list = tcg_swap_vecop_list(cmp_list);
3495     type = choose_vector_type(cmp_list, vece, oprsz,
3496                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3497     switch (type) {
3498     case TCG_TYPE_V256:
3499         /* Recall that ARM SVE allows vector sizes that are not a
3500          * power of 2, but always a multiple of 16.  The intent is
3501          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3502          */
3503         some = QEMU_ALIGN_DOWN(oprsz, 32);
3504         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3505         if (some == oprsz) {
3506             break;
3507         }
3508         dofs += some;
3509         aofs += some;
3510         bofs += some;
3511         oprsz -= some;
3512         maxsz -= some;
3513         /* fallthru */
3514     case TCG_TYPE_V128:
3515         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3516         break;
3517     case TCG_TYPE_V64:
3518         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3519         break;
3520 
3521     case 0:
3522         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3523             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3524         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3525             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3526         } else {
3527             gen_helper_gvec_3 * const *fn = fns[cond];
3528 
3529             if (fn == NULL) {
3530                 uint32_t tmp;
3531                 tmp = aofs, aofs = bofs, bofs = tmp;
3532                 cond = tcg_swap_cond(cond);
3533                 fn = fns[cond];
3534                 assert(fn != NULL);
3535             }
3536             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3537             oprsz = maxsz;
3538         }
3539         break;
3540 
3541     default:
3542         g_assert_not_reached();
3543     }
3544     tcg_swap_vecop_list(hold_list);
3545 
3546     if (oprsz < maxsz) {
3547         expand_clr(dofs + oprsz, maxsz - oprsz);
3548     }
3549 }
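
/*
 * Editor's note: the sketch below is not part of the original file.  It
 * is a hypothetical illustration of tcg_gen_gvec_cmp as expanded above:
 * each destination element becomes all-ones when the comparison holds
 * and all-zeros otherwise.  The offsets are placeholders; real targets
 * pass env offsets of their own guest vector registers.
 */
#if 0
static void gen_example_cmpeq(uint32_t dofs, uint32_t aofs, uint32_t bofs)
{
    /* For each 8-bit element: D[i] = (A[i] == B[i]) ? -1 : 0.  */
    tcg_gen_gvec_cmp(TCG_COND_EQ, MO_8, dofs, aofs, bofs, 16, 16);
}
#endif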
3550 
3551 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3552 {
3553     TCGv_i64 t = tcg_temp_new_i64();
3554 
3555     tcg_gen_and_i64(t, b, a);    /* t = b & a */
3556     tcg_gen_andc_i64(d, c, a);   /* d = c & ~a */
3557     tcg_gen_or_i64(d, d, t);     /* d = (b & a) | (c & ~a) */
3558     tcg_temp_free_i64(t);
3559 }
3560 
3561 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3562                          uint32_t bofs, uint32_t cofs,
3563                          uint32_t oprsz, uint32_t maxsz)
3564 {
3565     static const GVecGen4 g = {
3566         .fni8 = tcg_gen_bitsel_i64,
3567         .fniv = tcg_gen_bitsel_vec,
3568         .fno = gen_helper_gvec_bitsel,
3569     };
3570 
3571     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3572 }
3573