1 /* Copyright (C) 1988-2021 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9 
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3.  If not see
17 <http://www.gnu.org/licenses/>.  */
18 
19 #define IN_TARGET_CODE 1
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95 
96 /* Split one or more double-mode RTL references into pairs of half-mode
97    references.  The RTL can be REG, offsettable MEM, integer constant, or
98    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
99    split and "num" is its length.  lo_half and hi_half are output arrays
100    that parallel "operands".  */
101 
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 		   int num, rtx lo_half[], rtx hi_half[])
105 {
106   machine_mode half_mode;
107   unsigned int byte;
108   rtx mem_op = NULL_RTX;
109   int mem_num = 0;
110 
111   switch (mode)
112     {
113     case E_TImode:
114       half_mode = DImode;
115       break;
116     case E_DImode:
117       half_mode = SImode;
118       break;
119     case E_P2HImode:
120       half_mode = HImode;
121       break;
122     case E_P2QImode:
123       half_mode = QImode;
124       break;
125     default:
126       gcc_unreachable ();
127     }
128 
129   byte = GET_MODE_SIZE (half_mode);
130 
131   while (num--)
132     {
133       rtx op = operands[num];
134 
135       /* simplify_subreg refuses to split volatile memory addresses,
136          but we still have to handle them.  */
137       if (MEM_P (op))
138 	{
139 	  if (mem_op && rtx_equal_p (op, mem_op))
140 	    {
141 	      lo_half[num] = lo_half[mem_num];
142 	      hi_half[num] = hi_half[mem_num];
143 	    }
144 	  else
145 	    {
146 	      mem_op = op;
147 	      mem_num = num;
148 	      lo_half[num] = adjust_address (op, half_mode, 0);
149 	      hi_half[num] = adjust_address (op, half_mode, byte);
150 	    }
151 	}
152       else
153 	{
154 	  lo_half[num] = simplify_gen_subreg (half_mode, op,
155 					      GET_MODE (op) == VOIDmode
156 					      ? mode : GET_MODE (op), 0);
157 
158 	  rtx tmp = simplify_gen_subreg (half_mode, op,
159 					 GET_MODE (op) == VOIDmode
160 					 ? mode : GET_MODE (op), byte);
161 	  /* simplify_gen_subreg will return NULL RTX for the
162 	     high half of a paradoxical subreg.  */
163 	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 	}
165     }
166 }
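
/* A minimal usage sketch (hypothetical caller, not from this file):
   splitting two DImode operands into their SImode halves:

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);

   afterwards lo[i] and hi[i] refer to the low and high SImode half of
   operands[i]; MEM operands become offsettable half-word MEMs, while
   REGs and constants become subregs or split constants.  */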
167 
168 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
169    for the target.  */
170 
171 void
172 ix86_expand_clear (rtx dest)
173 {
174   rtx tmp;
175 
176   /* We play register width games, which are only valid after reload.  */
177   gcc_assert (reload_completed);
178 
179   /* Avoid HImode and its attendant prefix byte.  */
180   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
181     dest = gen_rtx_REG (SImode, REGNO (dest));
182   tmp = gen_rtx_SET (dest, const0_rtx);
183 
184   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
185     {
186       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
187       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
188     }
189 
190   emit_insn (tmp);
191 }
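
/* For instance (an illustrative sketch, not from the original sources):
   after reload, clearing %eax usually becomes "xorl %eax, %eax" wrapped
   in a PARALLEL with a FLAGS_REG clobber, since XOR rewrites the flags;
   only when TARGET_USE_MOV0 is set and we are not optimizing for size
   does it stay a flag-preserving "movl $0, %eax".  */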
192 
193 /* Return true if V can be broadcast from an integer of WIDTH bits,
194    which is returned in VAL_BROADCAST.  Otherwise, return false.  */
195 
196 static bool
197 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
198 		HOST_WIDE_INT &val_broadcast)
199 {
200   wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
201   val_broadcast = wi::extract_uhwi (val, 0, width);
202   for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
203     {
204       HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
205       if (val_broadcast != each)
206 	return false;
207     }
208   val_broadcast = sext_hwi (val_broadcast, width);
209   return true;
210 }
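
/* Worked example (illustrative only): for V = 0x4242424242424242 and
   WIDTH = 8 every byte equals 0x42, so 0x42 is stored in VAL_BROADCAST
   and the function returns true; for V = 0x0000004200000042 and
   WIDTH = 8 the bytes differ, so it returns false (the same V does
   broadcast from WIDTH = 32).  Note the result is sign-extended, e.g.
   V = 0xabababababababab with WIDTH = 8 yields VAL_BROADCAST = -0x55.  */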
211 
212 /* Convert the CONST_WIDE_INT operand OP to a vector broadcast in MODE, returning NULL if that is not possible.  */
213 
214 static rtx
215 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
216 {
217   /* Don't use integer vector broadcast if we can't move from GPR to SSE
218      register directly.  */
219   if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
220     return nullptr;
221 
222   /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
223      broadcast only if vector broadcast is available.  */
224   if (!TARGET_AVX
225       || !CONST_WIDE_INT_P (op)
226       || standard_sse_constant_p (op, mode))
227     return nullptr;
228 
229   HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
230   HOST_WIDE_INT val_broadcast;
231   scalar_int_mode broadcast_mode;
232   if (TARGET_AVX2
233       && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
234 			 val_broadcast))
235     broadcast_mode = QImode;
236   else if (TARGET_AVX2
237 	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
238 			      val_broadcast))
239     broadcast_mode = HImode;
240   else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
241 			   val_broadcast))
242     broadcast_mode = SImode;
243   else if (TARGET_64BIT
244 	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
245 			      val_broadcast))
246     broadcast_mode = DImode;
247   else
248     return nullptr;
249 
250   /* Check if OP can be broadcast from VAL.  */
251   for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
252     if (val != CONST_WIDE_INT_ELT (op, i))
253       return nullptr;
254 
255   unsigned int nunits = (GET_MODE_SIZE (mode)
256 			 / GET_MODE_SIZE (broadcast_mode));
257   machine_mode vector_mode;
258   if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
259     gcc_unreachable ();
260   rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
261   bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
262 					       target,
263 					       GEN_INT (val_broadcast));
264   gcc_assert (ok);
265   target = lowpart_subreg (mode, target, vector_mode);
266   return target;
267 }
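
/* Illustrative example (an assumed case, not from this file): a 32-byte
   OImode CONST_WIDE_INT whose bytes are all 0x2a can, with AVX2, be
   materialized as a V32QImode duplicate of (const_int 0x2a), roughly a
   vpbroadcastb, and then viewed through lowpart_subreg in the original
   mode instead of being spilled to the constant pool.  */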
268 
269 void
270 ix86_expand_move (machine_mode mode, rtx operands[])
271 {
272   rtx op0, op1;
273   rtx tmp, addend = NULL_RTX;
274   enum tls_model model;
275 
276   op0 = operands[0];
277   op1 = operands[1];
278 
279   /* Avoid complex sets of likely spilled hard registers before reload.  */
280   if (!ix86_hardreg_mov_ok (op0, op1))
281     {
282       tmp = gen_reg_rtx (mode);
283       operands[0] = tmp;
284       ix86_expand_move (mode, operands);
285       operands[0] = op0;
286       operands[1] = tmp;
287       op1 = tmp;
288     }
289 
290   switch (GET_CODE (op1))
291     {
292     case CONST:
293       tmp = XEXP (op1, 0);
294 
295       if (GET_CODE (tmp) != PLUS
296 	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
297 	break;
298 
299       op1 = XEXP (tmp, 0);
300       addend = XEXP (tmp, 1);
301       /* FALLTHRU */
302 
303     case SYMBOL_REF:
304       model = SYMBOL_REF_TLS_MODEL (op1);
305 
306       if (model)
307 	op1 = legitimize_tls_address (op1, model, true);
308       else if (ix86_force_load_from_GOT_p (op1))
309 	{
310 	  /* Load the external function address via GOT slot to avoid PLT.  */
311 	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
312 				(TARGET_64BIT
313 				 ? UNSPEC_GOTPCREL
314 				 : UNSPEC_GOT));
315 	  op1 = gen_rtx_CONST (Pmode, op1);
316 	  op1 = gen_const_mem (Pmode, op1);
317 	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
318 	}
319       else
320 	{
321 	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
322 	  if (tmp)
323 	    {
324 	      op1 = tmp;
325 	      if (!addend)
326 		break;
327 	    }
328 	  else
329 	    {
330 	      op1 = operands[1];
331 	      break;
332 	    }
333 	}
334 
335       if (addend)
336 	{
337 	  op1 = force_operand (op1, NULL_RTX);
338 	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
339 				     op0, 1, OPTAB_DIRECT);
340 	}
341       else
342 	op1 = force_operand (op1, op0);
343 
344       if (op1 == op0)
345 	return;
346 
347       op1 = convert_to_mode (mode, op1, 1);
348 
349     default:
350       break;
351     }
352 
353   if ((flag_pic || MACHOPIC_INDIRECT)
354       && symbolic_operand (op1, mode))
355     {
356       if (TARGET_MACHO && !TARGET_64BIT)
357 	{
358 #if TARGET_MACHO
359 	  /* dynamic-no-pic */
360 	  if (MACHOPIC_INDIRECT)
361 	    {
362 	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
363 			 ? op0 : gen_reg_rtx (Pmode);
364 	      op1 = machopic_indirect_data_reference (op1, temp);
365 	      if (MACHOPIC_PURE)
366 		op1 = machopic_legitimize_pic_address (op1, mode,
367 						       temp == op1 ? 0 : temp);
368 	    }
369 	  if (op0 != op1 && GET_CODE (op0) != MEM)
370 	    {
371 	      rtx insn = gen_rtx_SET (op0, op1);
372 	      emit_insn (insn);
373 	      return;
374 	    }
375 	  if (GET_CODE (op0) == MEM)
376 	    op1 = force_reg (Pmode, op1);
377 	  else
378 	    {
379 	      rtx temp = op0;
380 	      if (GET_CODE (temp) != REG)
381 		temp = gen_reg_rtx (Pmode);
382 	      temp = legitimize_pic_address (op1, temp);
383 	      if (temp == op0)
384 	    return;
385 	      op1 = temp;
386 	    }
387       /* dynamic-no-pic */
388 #endif
389 	}
390       else
391 	{
392 	  if (MEM_P (op0))
393 	    op1 = force_reg (mode, op1);
394 	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
395 	    {
396 	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
397 	      op1 = legitimize_pic_address (op1, reg);
398 	      if (op0 == op1)
399 		return;
400 	      op1 = convert_to_mode (mode, op1, 1);
401 	    }
402 	}
403     }
404   else
405     {
406       if (MEM_P (op0)
407 	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
408 	      || !push_operand (op0, mode))
409 	  && MEM_P (op1))
410 	op1 = force_reg (mode, op1);
411 
412       if (push_operand (op0, mode)
413 	  && ! general_no_elim_operand (op1, mode))
414 	op1 = copy_to_mode_reg (mode, op1);
415 
416       /* Force large constants in 64-bit compilation into a register
417 	 to get them CSEed.  */
418       if (can_create_pseudo_p ()
419 	  && (mode == DImode) && TARGET_64BIT
420 	  && immediate_operand (op1, mode)
421 	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
422 	  && !register_operand (op0, mode)
423 	  && optimize)
424 	op1 = copy_to_mode_reg (mode, op1);
425 
426       if (can_create_pseudo_p ())
427 	{
428 	  if (CONST_DOUBLE_P (op1))
429 	    {
430 	      /* If we are loading a floating point constant to a
431 		 register, force the value to memory now, since we'll
432 		 get better code out the back end.  */
433 
434 	      op1 = validize_mem (force_const_mem (mode, op1));
435 	      if (!register_operand (op0, mode))
436 		{
437 		  rtx temp = gen_reg_rtx (mode);
438 		  emit_insn (gen_rtx_SET (temp, op1));
439 		  emit_move_insn (op0, temp);
440 		  return;
441 		}
442 	    }
443 	  else if (GET_MODE_SIZE (mode) >= 16)
444 	    {
445 	      rtx tmp = ix86_convert_const_wide_int_to_broadcast
446 		(GET_MODE (op0), op1);
447 	      if (tmp != nullptr)
448 		op1 = tmp;
449 	    }
450 	}
451     }
452 
453   emit_insn (gen_rtx_SET (op0, op1));
454 }
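
/* For example (a sketch of one path above, assuming x86-64): when
   ix86_force_load_from_GOT_p holds for an external function foo, the
   UNSPEC_GOTPCREL constant wrapped in a MEM typically ends up as

       movq  foo@GOTPCREL(%rip), %rax

   so the address is loaded from the GOT slot and the PLT is avoided.  */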
455 
456 /* OP is a memref of a CONST_VECTOR.  Return the scalar constant
457    if the CONST_VECTOR is a vec_duplicate, else return NULL.  */
458 static rtx
459 ix86_broadcast_from_constant (machine_mode mode, rtx op)
460 {
461   int nunits = GET_MODE_NUNITS (mode);
462   if (nunits < 2)
463     return nullptr;
464 
465   /* Don't use integer vector broadcast if we can't move from GPR to SSE
466      register directly.  */
467   if (!TARGET_INTER_UNIT_MOVES_TO_VEC
468       && INTEGRAL_MODE_P (mode))
469     return nullptr;
470 
471   /* Convert CONST_VECTOR to a non-standard SSE constant integer
472      broadcast only if vector broadcast is available.  */
473   if (!(TARGET_AVX2
474 	|| (TARGET_AVX
475 	    && (GET_MODE_INNER (mode) == SImode
476 		|| GET_MODE_INNER (mode) == DImode))
477 	|| FLOAT_MODE_P (mode))
478       || standard_sse_constant_p (op, mode))
479     return nullptr;
480 
481   /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
482      We can still put a 64-bit integer constant in memory when
483      AVX512 embedded broadcast is available.  */
484   if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
485       && (!TARGET_AVX512F
486 	  || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
487     return nullptr;
488 
489   if (GET_MODE_INNER (mode) == TImode)
490     return nullptr;
491 
492   rtx constant = get_pool_constant (XEXP (op, 0));
493   if (GET_CODE (constant) != CONST_VECTOR)
494     return nullptr;
495 
496   /* There could be some rtx like
497      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
498      but with "*.LC1" referring to a V2DI constant vector.  */
499   if (GET_MODE (constant) != mode)
500     {
501       constant = simplify_subreg (mode, constant, GET_MODE (constant),
502 				  0);
503       if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
504 	return nullptr;
505     }
506 
507   rtx first = XVECEXP (constant, 0, 0);
508 
509   for (int i = 1; i < nunits; ++i)
510     {
511       rtx tmp = XVECEXP (constant, 0, i);
512       /* Vector duplicate value.  */
513       if (!rtx_equal_p (tmp, first))
514 	return nullptr;
515     }
516 
517   return first;
518 }
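
/* Example (illustrative): for a constant-pool reference to
   { 5, 5, 5, 5 } in V4SImode this returns (const_int 5), letting the
   caller emit a broadcast of the scalar instead of loading the full
   16-byte constant from memory.  */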
519 
520 void
521 ix86_expand_vector_move (machine_mode mode, rtx operands[])
522 {
523   rtx op0 = operands[0], op1 = operands[1];
524   /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
525      psABI, since its biggest alignment is only 4 bytes.  */
526   unsigned int align = (TARGET_IAMCU
527 			? GET_MODE_BITSIZE (mode)
528 			: GET_MODE_ALIGNMENT (mode));
529 
530   if (push_operand (op0, VOIDmode))
531     op0 = emit_move_resolve_push (mode, op0);
532 
533   /* Force constants other than zero into memory.  We do not know how
534      the instructions used to build constants modify the upper 64 bits
535      of the register; once we have that information we may be able
536      to handle some of them more efficiently.  */
537   if (can_create_pseudo_p ()
538       && (CONSTANT_P (op1)
539 	  || (SUBREG_P (op1)
540 	      && CONSTANT_P (SUBREG_REG (op1))))
541       && ((register_operand (op0, mode)
542 	   && !standard_sse_constant_p (op1, mode))
543 	  /* ix86_expand_vector_move_misalign() does not like constants.  */
544 	  || (SSE_REG_MODE_P (mode)
545 	      && MEM_P (op0)
546 	      && MEM_ALIGN (op0) < align)))
547     {
548       if (SUBREG_P (op1))
549 	{
550 	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
551 	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
552 	  if (r)
553 	    r = validize_mem (r);
554 	  else
555 	    r = force_reg (imode, SUBREG_REG (op1));
556 	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
557 	}
558       else
559 	{
560 	  machine_mode mode = GET_MODE (op0);
561 	  rtx tmp = ix86_convert_const_wide_int_to_broadcast
562 	    (mode, op1);
563 	  if (tmp == nullptr)
564 	    op1 = validize_mem (force_const_mem (mode, op1));
565 	  else
566 	    op1 = tmp;
567 	}
568     }
569 
570   if (can_create_pseudo_p ()
571       && GET_MODE_SIZE (mode) >= 16
572       && VECTOR_MODE_P (mode)
573       && (MEM_P (op1)
574 	  && SYMBOL_REF_P (XEXP (op1, 0))
575 	  && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
576     {
577       rtx first = ix86_broadcast_from_constant (mode, op1);
578       if (first != nullptr)
579 	{
580 	  /* Broadcast to XMM/YMM/ZMM register from an integer
581 	     constant or scalar mem.  */
582 	  op1 = gen_reg_rtx (mode);
583 	  if (FLOAT_MODE_P (mode)
584 	      || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
585 	    first = force_const_mem (GET_MODE_INNER (mode), first);
586 	  bool ok = ix86_expand_vector_init_duplicate (false, mode,
587 						       op1, first);
588 	  gcc_assert (ok);
589 	  emit_move_insn (op0, op1);
590 	  return;
591 	}
592     }
593 
594   /* We need to check memory alignment for SSE mode since an attribute
595      can make operands unaligned.  */
596   if (can_create_pseudo_p ()
597       && SSE_REG_MODE_P (mode)
598       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
599 	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
600     {
601       rtx tmp[2];
602 
603       /* ix86_expand_vector_move_misalign() does not like both
604 	 arguments in memory.  */
605       if (!register_operand (op0, mode)
606 	  && !register_operand (op1, mode))
607 	{
608 	  rtx scratch = ix86_gen_scratch_sse_rtx (mode);
609 	  emit_move_insn (scratch, op1);
610 	  op1 = scratch;
611 	}
612 
613       tmp[0] = op0; tmp[1] = op1;
614       ix86_expand_vector_move_misalign (mode, tmp);
615       return;
616     }
617 
618   /* If operand0 is a hard register, make operand1 a pseudo.  */
619   if (can_create_pseudo_p ()
620       && !ix86_hardreg_mov_ok (op0, op1))
621     {
622       rtx tmp = gen_reg_rtx (GET_MODE (op0));
623       emit_move_insn (tmp, op1);
624       emit_move_insn (op0, tmp);
625       return;
626     }
627 
628   /* Make operand1 a register if it isn't already.  */
629   if (can_create_pseudo_p ()
630       && !register_operand (op0, mode)
631       && !register_operand (op1, mode))
632     {
633       rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
634       emit_move_insn (tmp, op1);
635       emit_move_insn (op0, tmp);
636       return;
637     }
638 
639   emit_insn (gen_rtx_SET (op0, op1));
640 }
641 
642 /* Split 32-byte AVX unaligned load and store if needed.  */
643 
644 static void
645 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
646 {
647   rtx m;
648   rtx (*extract) (rtx, rtx, rtx);
649   machine_mode mode;
650 
651   if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
652       || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
653     {
654       emit_insn (gen_rtx_SET (op0, op1));
655       return;
656     }
657 
658   rtx orig_op0 = NULL_RTX;
659   mode = GET_MODE (op0);
660   switch (GET_MODE_CLASS (mode))
661     {
662     case MODE_VECTOR_INT:
663     case MODE_INT:
664       if (mode != V32QImode)
665 	{
666 	  if (!MEM_P (op0))
667 	    {
668 	      orig_op0 = op0;
669 	      op0 = gen_reg_rtx (V32QImode);
670 	    }
671 	  else
672 	    op0 = gen_lowpart (V32QImode, op0);
673 	  op1 = gen_lowpart (V32QImode, op1);
674 	  mode = V32QImode;
675 	}
676       break;
677     case MODE_VECTOR_FLOAT:
678       break;
679     default:
680       gcc_unreachable ();
681     }
682 
683   switch (mode)
684     {
685     default:
686       gcc_unreachable ();
687     case E_V32QImode:
688       extract = gen_avx_vextractf128v32qi;
689       mode = V16QImode;
690       break;
691     case E_V16HFmode:
692       extract = gen_avx_vextractf128v16hf;
693       mode = V8HFmode;
694       break;
695     case E_V8SFmode:
696       extract = gen_avx_vextractf128v8sf;
697       mode = V4SFmode;
698       break;
699     case E_V4DFmode:
700       extract = gen_avx_vextractf128v4df;
701       mode = V2DFmode;
702       break;
703     }
704 
705   if (MEM_P (op1))
706     {
707       rtx r = gen_reg_rtx (mode);
708       m = adjust_address (op1, mode, 0);
709       emit_move_insn (r, m);
710       m = adjust_address (op1, mode, 16);
711       r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
712       emit_move_insn (op0, r);
713     }
714   else if (MEM_P (op0))
715     {
716       m = adjust_address (op0, mode, 0);
717       emit_insn (extract (m, op1, const0_rtx));
718       m = adjust_address (op0, mode, 16);
719       emit_insn (extract (m, copy_rtx (op1), const1_rtx));
720     }
721   else
722     gcc_unreachable ();
723 
724   if (orig_op0)
725     emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
726 }
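
/* As a sketch (not from this file), an unaligned 32-byte load split
   here becomes roughly

       vmovups      mem, %xmm0
       vinsertf128  $1, mem+16, %ymm0, %ymm0

   while an unaligned 32-byte store becomes two vextractf128 stores to
   mem and mem+16, matching the extract patterns selected above.  */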
727 
728 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
729    straight to ix86_expand_vector_move.  */
730 /* Code generation for scalar reg-reg moves of single and double precision data:
731      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
732        movaps reg, reg
733      else
734        movss reg, reg
735      if (x86_sse_partial_reg_dependency == true)
736        movapd reg, reg
737      else
738        movsd reg, reg
739 
740    Code generation for scalar loads of double precision data:
741      if (x86_sse_split_regs == true)
742        movlpd mem, reg      (gas syntax)
743      else
744        movsd mem, reg
745 
746    Code generation for unaligned packed loads of single precision data
747    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
748      if (x86_sse_unaligned_move_optimal)
749        movups mem, reg
750 
751      if (x86_sse_partial_reg_dependency == true)
752        {
753          xorps  reg, reg
754          movlps mem, reg
755          movhps mem+8, reg
756        }
757      else
758        {
759          movlps mem, reg
760          movhps mem+8, reg
761        }
762 
763    Code generation for unaligned packed loads of double precision data
764    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
765      if (x86_sse_unaligned_move_optimal)
766        movupd mem, reg
767 
768      if (x86_sse_split_regs == true)
769        {
770          movlpd mem, reg
771          movhpd mem+8, reg
772        }
773      else
774        {
775          movsd  mem, reg
776          movhpd mem+8, reg
777        }
778  */
779 
780 void
781 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
782 {
783   rtx op0, op1, m;
784 
785   op0 = operands[0];
786   op1 = operands[1];
787 
788   /* Use unaligned load/store for AVX512 or when optimizing for size.  */
789   if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
790     {
791       emit_insn (gen_rtx_SET (op0, op1));
792       return;
793     }
794 
795   if (TARGET_AVX)
796     {
797       if (GET_MODE_SIZE (mode) == 32)
798 	ix86_avx256_split_vector_move_misalign (op0, op1);
799       else
800 	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
801 	emit_insn (gen_rtx_SET (op0, op1));
802       return;
803     }
804 
805   if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
806       || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
807     {
808       emit_insn (gen_rtx_SET (op0, op1));
809       return;
810     }
811 
812   /* ??? If we have typed data, then it would appear that using
813      movdqu is the only way to get unaligned data loaded with
814      integer type.  */
815   if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
816     {
817       emit_insn (gen_rtx_SET (op0, op1));
818       return;
819     }
820 
821   if (MEM_P (op1))
822     {
823       if (TARGET_SSE2 && mode == V2DFmode)
824         {
825           rtx zero;
826 
827 	  /* When SSE registers are split into halves, we can avoid
828 	     writing to the top half twice.  */
829 	  if (TARGET_SSE_SPLIT_REGS)
830 	    {
831 	      emit_clobber (op0);
832 	      zero = op0;
833 	    }
834 	  else
835 	    {
836 	      /* ??? Not sure about the best option for the Intel chips.
837 		 The following would seem to satisfy; the register is
838 		 entirely cleared, breaking the dependency chain.  We
839 		 then store to the upper half, with a dependency depth
840 		 of one.  A rumor has it that Intel recommends two movsd
841 		 followed by an unpacklpd, but this is unconfirmed.  And
842 		 given that the dependency depth of the unpacklpd would
843 		 still be one, I'm not sure why this would be better.  */
844 	      zero = CONST0_RTX (V2DFmode);
845 	    }
846 
847 	  m = adjust_address (op1, DFmode, 0);
848 	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
849 	  m = adjust_address (op1, DFmode, 8);
850 	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
851 	}
852       else
853         {
854 	  rtx t;
855 
856 	  if (mode != V4SFmode)
857 	    t = gen_reg_rtx (V4SFmode);
858 	  else
859 	    t = op0;
860 
861 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
862 	    emit_move_insn (t, CONST0_RTX (V4SFmode));
863 	  else
864 	    emit_clobber (t);
865 
866 	  m = adjust_address (op1, V2SFmode, 0);
867 	  emit_insn (gen_sse_loadlps (t, t, m));
868 	  m = adjust_address (op1, V2SFmode, 8);
869 	  emit_insn (gen_sse_loadhps (t, t, m));
870 	  if (mode != V4SFmode)
871 	    emit_move_insn (op0, gen_lowpart (mode, t));
872 	}
873     }
874   else if (MEM_P (op0))
875     {
876       if (TARGET_SSE2 && mode == V2DFmode)
877 	{
878 	  m = adjust_address (op0, DFmode, 0);
879 	  emit_insn (gen_sse2_storelpd (m, op1));
880 	  m = adjust_address (op0, DFmode, 8);
881 	  emit_insn (gen_sse2_storehpd (m, op1));
882 	}
883       else
884 	{
885 	  if (mode != V4SFmode)
886 	    op1 = gen_lowpart (V4SFmode, op1);
887 
888 	  m = adjust_address (op0, V2SFmode, 0);
889 	  emit_insn (gen_sse_storelps (m, op1));
890 	  m = adjust_address (op0, V2SFmode, 8);
891 	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
892 	}
893     }
894   else
895     gcc_unreachable ();
896 }
897 
898 /* Move bits 64:95 to bits 32:63.  */
899 
900 void
901 ix86_move_vector_high_sse_to_mmx (rtx op)
902 {
903   rtx mask = gen_rtx_PARALLEL (VOIDmode,
904 			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
905 					  GEN_INT (0), GEN_INT (0)));
906   rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
907   op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
908   rtx insn = gen_rtx_SET (dest, op);
909   emit_insn (insn);
910 }
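
/* In other words (an illustrative restatement of the mask built above):
   OP viewed as V4SImode is shuffled with selector { 0, 2, 0, 0 }, so
   element 2 (bits 64:95) lands in element 1 (bits 32:63), element 0
   stays in place, and the upper two elements receive copies of element
   0, which are don't-cares for the 64-bit MMX-style result.  */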
911 
912 /* Split MMX pack with signed/unsigned saturation using SSE/SSE2.  */
913 
914 void
915 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
916 {
917   rtx op0 = operands[0];
918   rtx op1 = operands[1];
919   rtx op2 = operands[2];
920 
921   machine_mode dmode = GET_MODE (op0);
922   machine_mode smode = GET_MODE (op1);
923   machine_mode inner_dmode = GET_MODE_INNER (dmode);
924   machine_mode inner_smode = GET_MODE_INNER (smode);
925 
926   /* Get the corresponding SSE mode for destination.  */
927   int nunits = 16 / GET_MODE_SIZE (inner_dmode);
928   machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
929 					    nunits).require ();
930   machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
931 						 nunits / 2).require ();
932 
933   /* Get the corresponding SSE mode for source.  */
934   nunits = 16 / GET_MODE_SIZE (inner_smode);
935   machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
936 					    nunits).require ();
937 
938   /* Generate SSE pack with signed/unsigned saturation.  */
939   rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
940   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
941   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
942 
943   op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
944   op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
945   rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
946 						    op1, op2));
947   emit_insn (insn);
948 
949   ix86_move_vector_high_sse_to_mmx (op0);
950 }
951 
952 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
953 
954 void
955 ix86_split_mmx_punpck (rtx operands[], bool high_p)
956 {
957   rtx op0 = operands[0];
958   rtx op1 = operands[1];
959   rtx op2 = operands[2];
960   machine_mode mode = GET_MODE (op0);
961   rtx mask;
962   /* The corresponding SSE mode.  */
963   machine_mode sse_mode, double_sse_mode;
964 
965   switch (mode)
966     {
967     case E_V4QImode:
968     case E_V8QImode:
969       sse_mode = V16QImode;
970       double_sse_mode = V32QImode;
971       mask = gen_rtx_PARALLEL (VOIDmode,
972 			       gen_rtvec (16,
973 					  GEN_INT (0), GEN_INT (16),
974 					  GEN_INT (1), GEN_INT (17),
975 					  GEN_INT (2), GEN_INT (18),
976 					  GEN_INT (3), GEN_INT (19),
977 					  GEN_INT (4), GEN_INT (20),
978 					  GEN_INT (5), GEN_INT (21),
979 					  GEN_INT (6), GEN_INT (22),
980 					  GEN_INT (7), GEN_INT (23)));
981       break;
982 
983     case E_V4HImode:
984     case E_V2HImode:
985       sse_mode = V8HImode;
986       double_sse_mode = V16HImode;
987       mask = gen_rtx_PARALLEL (VOIDmode,
988 			       gen_rtvec (8,
989 					  GEN_INT (0), GEN_INT (8),
990 					  GEN_INT (1), GEN_INT (9),
991 					  GEN_INT (2), GEN_INT (10),
992 					  GEN_INT (3), GEN_INT (11)));
993       break;
994 
995     case E_V2SImode:
996       sse_mode = V4SImode;
997       double_sse_mode = V8SImode;
998       mask = gen_rtx_PARALLEL (VOIDmode,
999 			       gen_rtvec (4,
1000 					  GEN_INT (0), GEN_INT (4),
1001 					  GEN_INT (1), GEN_INT (5)));
1002       break;
1003 
1004     case E_V2SFmode:
1005       sse_mode = V4SFmode;
1006       double_sse_mode = V8SFmode;
1007       mask = gen_rtx_PARALLEL (VOIDmode,
1008 			       gen_rtvec (4,
1009 					  GEN_INT (0), GEN_INT (4),
1010 					  GEN_INT (1), GEN_INT (5)));
1011       break;
1012 
1013     default:
1014       gcc_unreachable ();
1015     }
1016 
1017   /* Generate SSE punpcklXX.  */
1018   rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1019   op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1020   op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1021 
1022   op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1023   op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1024   rtx insn = gen_rtx_SET (dest, op2);
1025   emit_insn (insn);
1026 
1027   /* Move high bits to low bits.  */
1028   if (high_p)
1029     {
1030       if (sse_mode == V4SFmode)
1031 	{
1032 	  mask = gen_rtx_PARALLEL (VOIDmode,
1033 				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1034 					      GEN_INT (4), GEN_INT (5)));
1035 	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1036 	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1037 	}
1038       else
1039 	{
1040 	  int sz = GET_MODE_SIZE (mode);
1041 
1042 	  if (sz == 4)
1043 	    mask = gen_rtx_PARALLEL (VOIDmode,
1044 				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1045 						GEN_INT (0), GEN_INT (1)));
1046 	  else if (sz == 8)
1047 	    mask = gen_rtx_PARALLEL (VOIDmode,
1048 				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1049 						GEN_INT (0), GEN_INT (1)));
1050 	  else
1051 	    gcc_unreachable ();
1052 
1053 	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1054 	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1055 	}
1056 
1057       insn = gen_rtx_SET (dest, op1);
1058       emit_insn (insn);
1059     }
1060 }
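
/* Sketch of the effect (not from this file): an MMX-style punpcklbw of
   two V8QImode values is carried out as the corresponding SSE punpcklbw
   on the V16QImode views of the registers; for the HIGH_P variants the
   extra shuffle above then moves the relevant high half of the 128-bit
   result down into the low 64 bits.  */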
1061 
1062 /* Helper function of ix86_fixup_binary_operands to canonicalize
1063    operand order.  Returns true if the operands should be swapped.  */
1064 
1065 static bool
1066 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1067 			     rtx operands[])
1068 {
1069   rtx dst = operands[0];
1070   rtx src1 = operands[1];
1071   rtx src2 = operands[2];
1072 
1073   /* If the operation is not commutative, we can't do anything.  */
1074   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1075       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1076     return false;
1077 
1078   /* Highest priority is that src1 should match dst.  */
1079   if (rtx_equal_p (dst, src1))
1080     return false;
1081   if (rtx_equal_p (dst, src2))
1082     return true;
1083 
1084   /* Next highest priority is that immediate constants come second.  */
1085   if (immediate_operand (src2, mode))
1086     return false;
1087   if (immediate_operand (src1, mode))
1088     return true;
1089 
1090   /* Lowest priority is that memory references should come second.  */
1091   if (MEM_P (src2))
1092     return false;
1093   if (MEM_P (src1))
1094     return true;
1095 
1096   return false;
1097 }
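
/* Example of the priorities above (illustrative): for the commutative
   expansion r1 = const + r1 the operands are swapped so that src1
   matches the destination and the constant comes second, fitting the
   two-address "add $const, r1" form; for r1 = r2 + r3 no swap is
   needed.  */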
1098 
1099 
1100 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
1101    destination to use for the operation.  If different from the true
1102    destination in operands[0], a copy operation will be required.  */
1103 
1104 rtx
1105 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1106 			    rtx operands[])
1107 {
1108   rtx dst = operands[0];
1109   rtx src1 = operands[1];
1110   rtx src2 = operands[2];
1111 
1112   /* Canonicalize operand order.  */
1113   if (ix86_swap_binary_operands_p (code, mode, operands))
1114     {
1115       /* It is invalid to swap operands of different modes.  */
1116       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1117 
1118       std::swap (src1, src2);
1119     }
1120 
1121   /* Both source operands cannot be in memory.  */
1122   if (MEM_P (src1) && MEM_P (src2))
1123     {
1124       /* Optimization: Only read from memory once.  */
1125       if (rtx_equal_p (src1, src2))
1126 	{
1127 	  src2 = force_reg (mode, src2);
1128 	  src1 = src2;
1129 	}
1130       else if (rtx_equal_p (dst, src1))
1131 	src2 = force_reg (mode, src2);
1132       else
1133 	src1 = force_reg (mode, src1);
1134     }
1135 
1136   /* If the destination is memory, and we do not have matching source
1137      operands, do things in registers.  */
1138   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1139     dst = gen_reg_rtx (mode);
1140 
1141   /* Source 1 cannot be a constant.  */
1142   if (CONSTANT_P (src1))
1143     src1 = force_reg (mode, src1);
1144 
1145   /* Source 1 cannot be a non-matching memory.  */
1146   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1147     src1 = force_reg (mode, src1);
1148 
1149   /* Improve address combine.  */
1150   if (code == PLUS
1151       && GET_MODE_CLASS (mode) == MODE_INT
1152       && MEM_P (src2))
1153     src2 = force_reg (mode, src2);
1154 
1155   operands[1] = src1;
1156   operands[2] = src2;
1157   return dst;
1158 }
1159 
1160 /* Similarly, but assume that the destination has already been
1161    set up properly.  */
1162 
1163 void
1164 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1165 				    machine_mode mode, rtx operands[])
1166 {
1167   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1168   gcc_assert (dst == operands[0]);
1169 }
1170 
1171 /* Attempt to expand a binary operator.  Make the expansion closer to the
1172    actual machine than just general_operand, which would allow 3 separate
1173    memory references (one output, two input) in a single insn.  */
1174 
1175 void
1176 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1177 			     rtx operands[])
1178 {
1179   rtx src1, src2, dst, op, clob;
1180 
1181   dst = ix86_fixup_binary_operands (code, mode, operands);
1182   src1 = operands[1];
1183   src2 = operands[2];
1184 
1185  /* Emit the instruction.  */
1186 
1187   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1188 
1189   if (reload_completed
1190       && code == PLUS
1191       && !rtx_equal_p (dst, src1))
1192     {
1193       /* This is going to be an LEA; avoid splitting it later.  */
1194       emit_insn (op);
1195     }
1196   else
1197     {
1198       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1199       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1200     }
1201 
1202   /* Fix up the destination if needed.  */
1203   if (dst != operands[0])
1204     emit_move_insn (operands[0], dst);
1205 }
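
/* For instance (sketch): after reload, expanding r1 = r2 + r3 emits the
   bare SET with no flags clobber so that it can match the lea pattern,
   whereas r1 = r1 + r3 gets the PARALLEL with the FLAGS_REG clobber and
   becomes an ordinary add.  */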
1206 
1207 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1208    the given OPERANDS.  */
1209 
1210 void
1211 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1212 				     rtx operands[])
1213 {
1214   rtx op1 = NULL_RTX, op2 = NULL_RTX;
1215   if (SUBREG_P (operands[1]))
1216     {
1217       op1 = operands[1];
1218       op2 = operands[2];
1219     }
1220   else if (SUBREG_P (operands[2]))
1221     {
1222       op1 = operands[2];
1223       op2 = operands[1];
1224     }
1225   /* Optimize (__m128i) d | (__m128i) e and similar code
1226      when d and e are float vectors into float vector logical
1227      insn.  In C/C++ without using intrinsics there is no other way
1228      to express vector logical operation on float vectors than
1229      to cast them temporarily to integer vectors.  */
1230   if (op1
1231       && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1232       && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1233       && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1234       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1235       && SUBREG_BYTE (op1) == 0
1236       && (GET_CODE (op2) == CONST_VECTOR
1237 	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1238 	      && SUBREG_BYTE (op2) == 0))
1239       && can_create_pseudo_p ())
1240     {
1241       rtx dst;
1242       switch (GET_MODE (SUBREG_REG (op1)))
1243 	{
1244 	case E_V4SFmode:
1245 	case E_V8SFmode:
1246 	case E_V16SFmode:
1247 	case E_V2DFmode:
1248 	case E_V4DFmode:
1249 	case E_V8DFmode:
1250 	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1251 	  if (GET_CODE (op2) == CONST_VECTOR)
1252 	    {
1253 	      op2 = gen_lowpart (GET_MODE (dst), op2);
1254 	      op2 = force_reg (GET_MODE (dst), op2);
1255 	    }
1256 	  else
1257 	    {
1258 	      op1 = operands[1];
1259 	      op2 = SUBREG_REG (operands[2]);
1260 	      if (!vector_operand (op2, GET_MODE (dst)))
1261 		op2 = force_reg (GET_MODE (dst), op2);
1262 	    }
1263 	  op1 = SUBREG_REG (op1);
1264 	  if (!vector_operand (op1, GET_MODE (dst)))
1265 	    op1 = force_reg (GET_MODE (dst), op1);
1266 	  emit_insn (gen_rtx_SET (dst,
1267 				  gen_rtx_fmt_ee (code, GET_MODE (dst),
1268 						  op1, op2)));
1269 	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
1270 	  return;
1271 	default:
1272 	  break;
1273 	}
1274     }
1275   if (!vector_operand (operands[1], mode))
1276     operands[1] = force_reg (mode, operands[1]);
1277   if (!vector_operand (operands[2], mode))
1278     operands[2] = force_reg (mode, operands[2]);
1279   ix86_fixup_binary_operands_no_copy (code, mode, operands);
1280   emit_insn (gen_rtx_SET (operands[0],
1281 			  gen_rtx_fmt_ee (code, mode, operands[1],
1282 					  operands[2])));
1283 }
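
/* Example of the optimization above (illustrative): for
   (__m128i) a | (__m128i) b where a and b are really V4SFmode values,
   the IOR is emitted on the float vectors themselves, so it can
   typically come out as orps rather than por and avoid crossing between
   the integer and floating-point SSE domains.  */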
1284 
1285 /* Return TRUE or FALSE depending on whether the binary operator meets the
1286    appropriate constraints.  */
1287 
1288 bool
1289 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1290 			 rtx operands[3])
1291 {
1292   rtx dst = operands[0];
1293   rtx src1 = operands[1];
1294   rtx src2 = operands[2];
1295 
1296   /* Both source operands cannot be in memory.  */
1297   if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1298       && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1299     return false;
1300 
1301   /* Canonicalize operand order for commutative operators.  */
1302   if (ix86_swap_binary_operands_p (code, mode, operands))
1303     std::swap (src1, src2);
1304 
1305   /* If the destination is memory, we must have a matching source operand.  */
1306   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1307     return false;
1308 
1309   /* Source 1 cannot be a constant.  */
1310   if (CONSTANT_P (src1))
1311     return false;
1312 
1313   /* Source 1 cannot be a non-matching memory.  */
1314   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1315     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
1316     return (code == AND
1317 	    && (mode == HImode
1318 		|| mode == SImode
1319 		|| (TARGET_64BIT && mode == DImode))
1320 	    && satisfies_constraint_L (src2));
1321 
1322   return true;
1323 }
1324 
1325 /* Attempt to expand a unary operator.  Make the expansion closer to the
1326    actual machine than just general_operand, which would allow 2 separate
1327    memory references (one output, one input) in a single insn.  */
1328 
1329 void
1330 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1331 			    rtx operands[])
1332 {
1333   bool matching_memory = false;
1334   rtx src, dst, op, clob;
1335 
1336   dst = operands[0];
1337   src = operands[1];
1338 
1339   /* If the destination is memory, and we do not have matching source
1340      operands, do things in registers.  */
1341   if (MEM_P (dst))
1342     {
1343       if (rtx_equal_p (dst, src))
1344 	matching_memory = true;
1345       else
1346 	dst = gen_reg_rtx (mode);
1347     }
1348 
1349   /* When source operand is memory, destination must match.  */
1350   if (MEM_P (src) && !matching_memory)
1351     src = force_reg (mode, src);
1352 
1353   /* Emit the instruction.  */
1354 
1355   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1356 
1357   if (code == NOT)
1358     emit_insn (op);
1359   else
1360     {
1361       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1362       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1363     }
1364 
1365   /* Fix up the destination if needed.  */
1366   if (dst != operands[0])
1367     emit_move_insn (operands[0], dst);
1368 }
1369 
1370 /* Predict just emitted jump instruction to be taken with probability PROB.  */
1371 
1372 static void
1373 predict_jump (int prob)
1374 {
1375   rtx_insn *insn = get_last_insn ();
1376   gcc_assert (JUMP_P (insn));
1377   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1378 }
1379 
1380 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1381    divisor are within the range [0-255].  */
1382 
1383 void
1384 ix86_split_idivmod (machine_mode mode, rtx operands[],
1385 		    bool unsigned_p)
1386 {
1387   rtx_code_label *end_label, *qimode_label;
1388   rtx div, mod;
1389   rtx_insn *insn;
1390   rtx scratch, tmp0, tmp1, tmp2;
1391   rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1392 
1393   switch (mode)
1394     {
1395     case E_SImode:
1396       if (GET_MODE (operands[0]) == SImode)
1397 	{
1398 	  if (GET_MODE (operands[1]) == SImode)
1399 	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1400 	  else
1401 	    gen_divmod4_1
1402 	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1403 	}
1404       else
1405 	gen_divmod4_1
1406 	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1407       break;
1408 
1409     case E_DImode:
1410       gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1411       break;
1412 
1413     default:
1414       gcc_unreachable ();
1415     }
1416 
1417   end_label = gen_label_rtx ();
1418   qimode_label = gen_label_rtx ();
1419 
1420   scratch = gen_reg_rtx (mode);
1421 
1422   /* Use 8bit unsigned divmod if dividend and divisor are within
1423      the range [0-255].  */
1424   emit_move_insn (scratch, operands[2]);
1425   scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1426 				 scratch, 1, OPTAB_DIRECT);
1427   emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1428   tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1429   tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1430   tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1431 			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1432 			       pc_rtx);
1433   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1434   predict_jump (REG_BR_PROB_BASE * 50 / 100);
1435   JUMP_LABEL (insn) = qimode_label;
1436 
1437   /* Generate the original signed/unsigned divmod.  */
1438   emit_insn (gen_divmod4_1 (operands[0], operands[1],
1439 			    operands[2], operands[3]));
1440 
1441   /* Branch to the end.  */
1442   emit_jump_insn (gen_jump (end_label));
1443   emit_barrier ();
1444 
1445   /* Generate 8bit unsigned divide.  */
1446   emit_label (qimode_label);
1447   /* Don't use operands[0] for result of 8bit divide since not all
1448      registers support QImode ZERO_EXTRACT.  */
1449   tmp0 = lowpart_subreg (HImode, scratch, mode);
1450   tmp1 = lowpart_subreg (HImode, operands[2], mode);
1451   tmp2 = lowpart_subreg (QImode, operands[3], mode);
1452   emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1453 
1454   if (unsigned_p)
1455     {
1456       div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1457       mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1458     }
1459   else
1460     {
1461       div = gen_rtx_DIV (mode, operands[2], operands[3]);
1462       mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1463     }
1464   if (mode == SImode)
1465     {
1466       if (GET_MODE (operands[0]) != SImode)
1467 	div = gen_rtx_ZERO_EXTEND (DImode, div);
1468       if (GET_MODE (operands[1]) != SImode)
1469 	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1470     }
1471 
1472   /* Extract remainder from AH.  */
1473   scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1474   tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1475 			       GEN_INT (8), GEN_INT (8));
1476   insn = emit_move_insn (operands[1], tmp1);
1477   set_unique_reg_note (insn, REG_EQUAL, mod);
1478 
1479   /* Zero extend quotient from AL.  */
1480   tmp1 = gen_lowpart (QImode, tmp0);
1481   insn = emit_insn (gen_extend_insn
1482 		    (operands[0], tmp1,
1483 		     GET_MODE (operands[0]), QImode, 1));
1484   set_unique_reg_note (insn, REG_EQUAL, div);
1485 
1486   emit_label (end_label);
1487 }
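
/* Runtime shape of the split (a sketch; mnemonics approximate):

       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch
       je     .Lqimode            # both operands fit in 8 bits
       div/idiv ...               # full-width divide
       jmp    .Lend
   .Lqimode:
       divb   ...                 # quotient in AL, remainder in AH
   .Lend:

   which is why the remainder is extracted from bits 8..15 of the
   scratch register above.  */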
1488 
1489 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1490    matches the destination.  The RTX includes a clobber of FLAGS_REG.  */
1491 
1492 void
1493 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1494 		 rtx dst, rtx src)
1495 {
1496   rtx op, clob;
1497 
1498   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1499   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1500 
1501   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1502 }
1503 
1504 /* Return true if the definition of REGNO1 is nearest to INSN.  */
1505 
1506 static bool
1507 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1508 {
1509   rtx_insn *prev = insn;
1510   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1511 
1512   if (insn == start)
1513     return false;
1514   while (prev && prev != start)
1515     {
1516       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1517 	{
1518 	  prev = PREV_INSN (prev);
1519 	  continue;
1520 	}
1521       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1522 	return true;
1523       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1524 	return false;
1525       prev = PREV_INSN (prev);
1526     }
1527 
1528   /* None of the regs is defined in the bb.  */
1529   return false;
1530 }
1531 
1532 /* INSN_UID of the last insn emitted by zero store peephole2s.  */
1533 int ix86_last_zero_store_uid;
1534 
1535 /* Split lea instructions into a sequence of instructions
1536    which are executed on the ALU to avoid AGU stalls.
1537    It is assumed that the flags register may be clobbered
1538    at the lea position.  */
1539 
1540 void
1541 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1542 {
1543   unsigned int regno0, regno1, regno2;
1544   struct ix86_address parts;
1545   rtx target, tmp;
1546   int ok, adds;
1547 
1548   ok = ix86_decompose_address (operands[1], &parts);
1549   gcc_assert (ok);
1550 
1551   target = gen_lowpart (mode, operands[0]);
1552 
1553   regno0 = true_regnum (target);
1554   regno1 = INVALID_REGNUM;
1555   regno2 = INVALID_REGNUM;
1556 
1557   if (parts.base)
1558     {
1559       parts.base = gen_lowpart (mode, parts.base);
1560       regno1 = true_regnum (parts.base);
1561     }
1562 
1563   if (parts.index)
1564     {
1565       parts.index = gen_lowpart (mode, parts.index);
1566       regno2 = true_regnum (parts.index);
1567     }
1568 
1569   if (parts.disp)
1570     parts.disp = gen_lowpart (mode, parts.disp);
1571 
1572   if (parts.scale > 1)
1573     {
1574       /* Case r1 = r1 + ...  */
1575       if (regno1 == regno0)
1576 	{
1577 	  /* If we have a case r1 = r1 + C * r2 then we
1578 	     would have to use multiplication, which is very
1579 	     expensive.  Assume the cost model is wrong if we
1580 	     have such a case here.  */
1581 	  gcc_assert (regno2 != regno0);
1582 
1583 	  for (adds = parts.scale; adds > 0; adds--)
1584 	    ix86_emit_binop (PLUS, mode, target, parts.index);
1585 	}
1586       else
1587 	{
1588 	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
1589 	  if (regno0 != regno2)
1590 	    emit_insn (gen_rtx_SET (target, parts.index));
1591 
1592 	  /* Use shift for scaling, but emit it as MULT instead
1593 	     to avoid it being immediately peephole2 optimized back
1594 	     into lea.  */
1595 	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1596 
1597 	  if (parts.base)
1598 	    ix86_emit_binop (PLUS, mode, target, parts.base);
1599 
1600 	  if (parts.disp && parts.disp != const0_rtx)
1601 	    ix86_emit_binop (PLUS, mode, target, parts.disp);
1602 	}
1603     }
1604   else if (!parts.base && !parts.index)
1605     {
1606       gcc_assert(parts.disp);
1607       emit_insn (gen_rtx_SET (target, parts.disp));
1608     }
1609   else
1610     {
1611       if (!parts.base)
1612 	{
1613 	  if (regno0 != regno2)
1614 	    emit_insn (gen_rtx_SET (target, parts.index));
1615 	}
1616       else if (!parts.index)
1617 	{
1618 	  if (regno0 != regno1)
1619 	    emit_insn (gen_rtx_SET (target, parts.base));
1620 	}
1621       else
1622 	{
1623 	  if (regno0 == regno1)
1624 	    tmp = parts.index;
1625 	  else if (regno0 == regno2)
1626 	    tmp = parts.base;
1627 	  else
1628 	    {
1629 	      rtx tmp1;
1630 
1631 	      /* Find better operand for SET instruction, depending
1632 		 on which definition is farther from the insn.  */
1633 	      if (find_nearest_reg_def (insn, regno1, regno2))
1634 		tmp = parts.index, tmp1 = parts.base;
1635 	      else
1636 		tmp = parts.base, tmp1 = parts.index;
1637 
1638 	      emit_insn (gen_rtx_SET (target, tmp));
1639 
1640 	      if (parts.disp && parts.disp != const0_rtx)
1641 		ix86_emit_binop (PLUS, mode, target, parts.disp);
1642 
1643 	      ix86_emit_binop (PLUS, mode, target, tmp1);
1644 	      return;
1645 	    }
1646 
1647 	  ix86_emit_binop (PLUS, mode, target, tmp);
1648 	}
1649 
1650       if (parts.disp && parts.disp != const0_rtx)
1651 	ix86_emit_binop (PLUS, mode, target, parts.disp);
1652     }
1653 }
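
/* Worked example (illustrative): splitting

       lea  4(%rax,%rbx,2), %rcx

   with this function yields roughly

       mov  %rbx, %rcx
       shl  $1, %rcx         (emitted as a MULT by 2; split to a shift later)
       add  %rax, %rcx
       add  $4, %rcx

   keeping the work on the ALU ports instead of the AGU.  */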
1654 
1655 /* Post-reload splitter for converting an SF or DFmode value in an
1656    SSE register into an unsigned SImode.  */
1657 
1658 void
1659 ix86_split_convert_uns_si_sse (rtx operands[])
1660 {
1661   machine_mode vecmode;
1662   rtx value, large, zero_or_two31, input, two31, x;
1663 
1664   large = operands[1];
1665   zero_or_two31 = operands[2];
1666   input = operands[3];
1667   two31 = operands[4];
1668   vecmode = GET_MODE (large);
1669   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1670 
1671   /* Load up the value into the low element.  We must ensure that the other
1672      elements are valid floats -- zero is the easiest such value.  */
1673   if (MEM_P (input))
1674     {
1675       if (vecmode == V4SFmode)
1676 	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1677       else
1678 	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1679     }
1680   else
1681     {
1682       input = gen_rtx_REG (vecmode, REGNO (input));
1683       emit_move_insn (value, CONST0_RTX (vecmode));
1684       if (vecmode == V4SFmode)
1685 	emit_insn (gen_sse_movss (value, value, input));
1686       else
1687 	emit_insn (gen_sse2_movsd (value, value, input));
1688     }
1689 
1690   emit_move_insn (large, two31);
1691   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1692 
1693   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1694   emit_insn (gen_rtx_SET (large, x));
1695 
1696   x = gen_rtx_AND (vecmode, zero_or_two31, large);
1697   emit_insn (gen_rtx_SET (zero_or_two31, x));
1698 
1699   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1700   emit_insn (gen_rtx_SET (value, x));
1701 
1702   large = gen_rtx_REG (V4SImode, REGNO (large));
1703   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1704 
1705   x = gen_rtx_REG (V4SImode, REGNO (value));
1706   if (vecmode == V4SFmode)
1707     emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1708   else
1709     emit_insn (gen_sse2_cvttpd2dq (x, value));
1710   value = x;
1711 
1712   emit_insn (gen_xorv4si3 (value, value, large));
1713 }
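
/* Numeric sketch of the algorithm above: for an input of 3000000000
   (greater than 2^31) the comparison selects TWO31, 2147483648.0 is
   subtracted to give 852516352.0, the signed conversion produces
   852516352, and the final XOR with the shifted mask (0x80000000) adds
   2^31 back, yielding 3000000000 as an unsigned SImode value.  Inputs
   below 2^31 subtract zero and convert directly.  */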
1714 
1715 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1716 						 machine_mode mode, rtx target,
1717 						 rtx var, int one_var);
1718 
1719 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1720    Expects the 64-bit DImode to be supplied in a pair of integral
1721    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
1722    -mfpmath=sse, !optimize_size only.  */
1723 
1724 void
1725 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1726 {
1727   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1728   rtx int_xmm, fp_xmm;
1729   rtx biases, exponents;
1730   rtx x;
1731 
1732   int_xmm = gen_reg_rtx (V4SImode);
1733   if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1734     emit_insn (gen_movdi_to_sse (int_xmm, input));
1735   else if (TARGET_SSE_SPLIT_REGS)
1736     {
1737       emit_clobber (int_xmm);
1738       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1739     }
1740   else
1741     {
1742       x = gen_reg_rtx (V2DImode);
1743       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1744       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1745     }
1746 
1747   x = gen_rtx_CONST_VECTOR (V4SImode,
1748 			    gen_rtvec (4, GEN_INT (0x43300000UL),
1749 				       GEN_INT (0x45300000UL),
1750 				       const0_rtx, const0_rtx));
1751   exponents = validize_mem (force_const_mem (V4SImode, x));
1752 
1753   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1754   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1755 
1756   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1757      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1758      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1759      (0x1.0p84 + double(fp_value_hi_xmm)).
1760      Note these exponents differ by 32.  */
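  /* Worked example (illustration only): for input 0x0000000100000002 the
     low half yields 0x1.0p52 + 2.0 and the high half 0x1.0p84 + 1.0 * 0x1.0p32;
     subtracting the two biases and adding the halves gives exactly
     2^32 + 2, the original value as a double.  */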
1761 
1762   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1763 
1764   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1765      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
1766   real_ldexp (&bias_lo_rvt, &dconst1, 52);
1767   real_ldexp (&bias_hi_rvt, &dconst1, 84);
1768   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1769   x = const_double_from_real_value (bias_hi_rvt, DFmode);
1770   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1771   biases = validize_mem (force_const_mem (V2DFmode, biases));
1772   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1773 
1774   /* Add the upper and lower DFmode values together.  */
1775   if (TARGET_SSE3)
1776     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1777   else
1778     {
1779       x = copy_to_mode_reg (V2DFmode, fp_xmm);
1780       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1781       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1782     }
1783 
1784   ix86_expand_vector_extract (false, target, fp_xmm, 0);
1785 }
1786 
1787 /* Not used, but eases macroization of patterns.  */
1788 void
1789 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1790 {
1791   gcc_unreachable ();
1792 }
1793 
1794 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1795 
1796 /* Convert an unsigned SImode value into a DFmode.  Only currently used
1797    for SSE, but applicable anywhere.  */
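/* A minimal scalar sketch of the expansion below (illustration only;
   U is the 32-bit unsigned input):

     (double) (int) (U - 0x80000000u) + 0x1.0p31

   Both the signed conversion and the final addition are exact in DFmode,
   so the result equals (double) U.  */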
1798 
1799 void
1800 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1801 {
1802   REAL_VALUE_TYPE TWO31r;
1803   rtx x, fp;
1804 
1805   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1806 			   NULL, 1, OPTAB_DIRECT);
1807 
1808   fp = gen_reg_rtx (DFmode);
1809   emit_insn (gen_floatsidf2 (fp, x));
1810 
1811   real_ldexp (&TWO31r, &dconst1, 31);
1812   x = const_double_from_real_value (TWO31r, DFmode);
1813 
1814   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1815 
1816   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
1817   if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1818     x = ix86_expand_sse_fabs (x, NULL);
1819 
1820   if (x != target)
1821     emit_move_insn (target, x);
1822 }
1823 
1824 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
1825    32-bit mode; otherwise we have a direct convert instruction.  */
1826 
1827 void
1828 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1829 {
1830   REAL_VALUE_TYPE TWO32r;
1831   rtx fp_lo, fp_hi, x;
1832 
1833   fp_lo = gen_reg_rtx (DFmode);
1834   fp_hi = gen_reg_rtx (DFmode);
1835 
1836   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1837 
1838   real_ldexp (&TWO32r, &dconst1, 32);
1839   x = const_double_from_real_value (TWO32r, DFmode);
1840   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1841 
1842   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1843 
1844   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1845 			   0, OPTAB_DIRECT);
1846   if (x != target)
1847     emit_move_insn (target, x);
1848 }
1849 
1850 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1851    For x86_32, -mfpmath=sse, !optimize_size only.  */
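/* Sketch of the 16/16 split used below (illustration only; U is the
   unsigned 32-bit input):

     (float) (U >> 16) * 0x1p16f + (float) (U & 0xffff)

   Each 16-bit half converts to SFmode exactly and the multiply by 2^16
   is exact, so only the final addition rounds.  */
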
1852 void
1853 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1854 {
1855   REAL_VALUE_TYPE ONE16r;
1856   rtx fp_hi, fp_lo, int_hi, int_lo, x;
1857 
1858   real_ldexp (&ONE16r, &dconst1, 16);
1859   x = const_double_from_real_value (ONE16r, SFmode);
1860   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1861 				      NULL, 0, OPTAB_DIRECT);
1862   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1863 				      NULL, 0, OPTAB_DIRECT);
1864   fp_hi = gen_reg_rtx (SFmode);
1865   fp_lo = gen_reg_rtx (SFmode);
1866   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1867   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1868   if (TARGET_FMA)
1869     {
1870       x = validize_mem (force_const_mem (SFmode, x));
1871       fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1872       emit_move_insn (target, fp_hi);
1873     }
1874   else
1875     {
1876       fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1877 				   0, OPTAB_DIRECT);
1878       fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1879 				   0, OPTAB_DIRECT);
1880       if (!rtx_equal_p (target, fp_hi))
1881 	emit_move_insn (target, fp_hi);
1882     }
1883 }
1884 
1885 /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
1886    a vector of unsigned ints VAL to vector of floats TARGET.  */
1887 
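/* Per element this computes, as in the scalar helper above,
   (float) (u >> 16) * 0x1p16f + (float) (u & 0xffff); the TARGET_FMA
   path below fuses the multiply and add (an informal summary).  */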
1888 void
1889 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1890 {
1891   rtx tmp[8];
1892   REAL_VALUE_TYPE TWO16r;
1893   machine_mode intmode = GET_MODE (val);
1894   machine_mode fltmode = GET_MODE (target);
1895   rtx (*cvt) (rtx, rtx);
1896 
1897   if (intmode == V4SImode)
1898     cvt = gen_floatv4siv4sf2;
1899   else
1900     cvt = gen_floatv8siv8sf2;
1901   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1902   tmp[0] = force_reg (intmode, tmp[0]);
1903   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1904 				OPTAB_DIRECT);
1905   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1906 				NULL_RTX, 1, OPTAB_DIRECT);
1907   tmp[3] = gen_reg_rtx (fltmode);
1908   emit_insn (cvt (tmp[3], tmp[1]));
1909   tmp[4] = gen_reg_rtx (fltmode);
1910   emit_insn (cvt (tmp[4], tmp[2]));
1911   real_ldexp (&TWO16r, &dconst1, 16);
1912   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1913   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1914   if (TARGET_FMA)
1915     {
1916       tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
1917       emit_move_insn (target, tmp[6]);
1918     }
1919   else
1920     {
1921       tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
1922 				    NULL_RTX, 1, OPTAB_DIRECT);
1923       tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
1924 				    target, 1, OPTAB_DIRECT);
1925       if (tmp[7] != target)
1926 	emit_move_insn (target, tmp[7]);
1927     }
1928 }
1929 
1930 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1931    pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1932    This is done by doing just signed conversion if < 0x1p31, and otherwise by
1933    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
1934 
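/* Per element the adjustment is, roughly (illustration only):

     if (v < 0x1.0p31)
       result = (int) v;
     else
       result = (int) (v - 0x1.0p31) ^ 0x80000000;

   The subtrahend is built from a compare mask, and the per-lane
   0x80000000 correction is handed back through *XORP so the caller can
   apply the xor after the signed conversion.  */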
1935 rtx
1936 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1937 {
1938   REAL_VALUE_TYPE TWO31r;
1939   rtx two31r, tmp[4];
1940   machine_mode mode = GET_MODE (val);
1941   machine_mode scalarmode = GET_MODE_INNER (mode);
1942   machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1943   rtx (*cmp) (rtx, rtx, rtx, rtx);
1944   int i;
1945 
1946   for (i = 0; i < 3; i++)
1947     tmp[i] = gen_reg_rtx (mode);
1948   real_ldexp (&TWO31r, &dconst1, 31);
1949   two31r = const_double_from_real_value (TWO31r, scalarmode);
1950   two31r = ix86_build_const_vector (mode, 1, two31r);
1951   two31r = force_reg (mode, two31r);
1952   switch (mode)
1953     {
1954     case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1955     case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1956     case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1957     case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1958     default: gcc_unreachable ();
1959     }
1960   tmp[3] = gen_rtx_LE (mode, two31r, val);
1961   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1962   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1963 				0, OPTAB_DIRECT);
1964   if (intmode == V4SImode || TARGET_AVX2)
1965     *xorp = expand_simple_binop (intmode, ASHIFT,
1966 				 gen_lowpart (intmode, tmp[0]),
1967 				 GEN_INT (31), NULL_RTX, 0,
1968 				 OPTAB_DIRECT);
1969   else
1970     {
1971       rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1972       two31 = ix86_build_const_vector (intmode, 1, two31);
1973       *xorp = expand_simple_binop (intmode, AND,
1974 				   gen_lowpart (intmode, tmp[0]),
1975 				   two31, NULL_RTX, 0,
1976 				   OPTAB_DIRECT);
1977     }
1978   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1979 			      0, OPTAB_DIRECT);
1980 }
1981 
1982 /* Generate code for floating point ABS or NEG.  */
1983 
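/* With SSE both operations are a bitwise op against a sign-bit mask M
   (one copy of e.g. 0x80000000 per SFmode element), roughly:

     neg (x) = x ^ M          abs (x) = x & ~M

   The expansion below wraps the SET in a PARALLEL together with a USE of
   the mask constant and, for the scalar cases, a flags clobber (a sketch;
   the real patterns live in the machine description).  */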
1984 void
1985 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1986 				rtx operands[])
1987 {
1988   rtx set, dst, src;
1989   bool use_sse = false;
1990   bool vector_mode = VECTOR_MODE_P (mode);
1991   machine_mode vmode = mode;
1992   rtvec par;
1993 
1994   if (vector_mode || mode == TFmode || mode == HFmode)
1995     {
1996       use_sse = true;
1997       if (mode == HFmode)
1998 	vmode = V8HFmode;
1999     }
2000   else if (TARGET_SSE_MATH)
2001     {
2002       use_sse = SSE_FLOAT_MODE_P (mode);
2003       if (mode == SFmode)
2004 	vmode = V4SFmode;
2005       else if (mode == DFmode)
2006 	vmode = V2DFmode;
2007     }
2008 
2009   dst = operands[0];
2010   src = operands[1];
2011 
2012   set = gen_rtx_fmt_e (code, mode, src);
2013   set = gen_rtx_SET (dst, set);
2014 
2015   if (use_sse)
2016     {
2017       rtx mask, use, clob;
2018 
2019       /* NEG and ABS performed with SSE use bitwise mask operations.
2020 	 Create the appropriate mask now.  */
2021       mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2022       use = gen_rtx_USE (VOIDmode, mask);
2023       if (vector_mode || mode == TFmode)
2024 	par = gen_rtvec (2, set, use);
2025       else
2026 	{
2027           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2028 	  par = gen_rtvec (3, set, use, clob);
2029         }
2030     }
2031   else
2032     {
2033       rtx clob;
2034 
2035       /* The sign of FP values can also be changed using the integer unit.  */
2036       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2037       par = gen_rtvec (2, set, clob);
2038     }
2039 
2040   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2041 }
2042 
2043 /* Deconstruct a floating point ABS or NEG operation
2044    with integer registers into integer operations.  */
2045 
2046 void
2047 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2048 			       rtx operands[])
2049 {
2050   enum rtx_code absneg_op;
2051   rtx dst, set;
2052 
2053   gcc_assert (operands_match_p (operands[0], operands[1]));
2054 
2055   switch (mode)
2056     {
2057     case E_SFmode:
2058       dst = gen_lowpart (SImode, operands[0]);
2059 
2060       if (code == ABS)
2061 	{
2062 	  set = gen_int_mode (0x7fffffff, SImode);
2063 	  absneg_op = AND;
2064 	}
2065       else
2066 	{
2067 	  set = gen_int_mode (0x80000000, SImode);
2068 	  absneg_op = XOR;
2069 	}
2070       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2071       break;
2072 
2073     case E_DFmode:
2074       if (TARGET_64BIT)
2075 	{
2076 	  dst = gen_lowpart (DImode, operands[0]);
2077 	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2078 
2079 	  if (code == ABS)
2080 	    set = const0_rtx;
2081 	  else
2082 	    set = gen_rtx_NOT (DImode, dst);
2083 	}
2084       else
2085 	{
2086 	  dst = gen_highpart (SImode, operands[0]);
2087 
2088 	  if (code == ABS)
2089 	    {
2090 	      set = gen_int_mode (0x7fffffff, SImode);
2091 	      absneg_op = AND;
2092 	    }
2093 	  else
2094 	    {
2095 	      set = gen_int_mode (0x80000000, SImode);
2096 	      absneg_op = XOR;
2097 	    }
2098 	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2099 	}
2100       break;
2101 
2102     case E_XFmode:
2103       dst = gen_rtx_REG (SImode,
2104 			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2105       if (code == ABS)
2106 	{
2107 	  set = GEN_INT (0x7fff);
2108 	  absneg_op = AND;
2109 	}
2110       else
2111 	{
2112 	  set = GEN_INT (0x8000);
2113 	  absneg_op = XOR;
2114 	}
2115       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2116       break;
2117 
2118     default:
2119       gcc_unreachable ();
2120     }
2121 
2122   set = gen_rtx_SET (dst, set);
2123 
2124   rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2125   rtvec par = gen_rtvec (2, set, clob);
2126 
2127   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2128 }
2129 
2130 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
2131 
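/* The expansion is the usual bit-select (illustrative summary):

     result = (MASK & SIGN_SRC) | (~MASK & MAGNITUDE_SRC)

   where MASK has only the sign bit set in each element; a nonzero
   constant magnitude is replaced by its absolute value in a register,
   and a zero magnitude degenerates to a single AND.  */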
2132 void
2133 ix86_expand_copysign (rtx operands[])
2134 {
2135   machine_mode mode, vmode;
2136   rtx dest, op0, op1, mask, op2, op3;
2137 
2138   mode = GET_MODE (operands[0]);
2139 
2140   if (mode == HFmode)
2141     vmode = V8HFmode;
2142   else if (mode == SFmode)
2143     vmode = V4SFmode;
2144   else if (mode == DFmode)
2145     vmode = V2DFmode;
2146   else if (mode == TFmode)
2147     vmode = mode;
2148   else
2149     gcc_unreachable ();
2150 
2151   if (rtx_equal_p (operands[1], operands[2]))
2152     {
2153       emit_move_insn (operands[0], operands[1]);
2154       return;
2155     }
2156 
2157   dest = lowpart_subreg (vmode, operands[0], mode);
2158   op1 = lowpart_subreg (vmode, operands[2], mode);
2159   mask = ix86_build_signbit_mask (vmode, 0, 0);
2160 
2161   if (CONST_DOUBLE_P (operands[1]))
2162     {
2163       op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2164       /* Optimize for 0: simplify b = copysignf (0.0f, a) to b = mask & a.  */
2165       if (op0 == CONST0_RTX (mode))
2166 	{
2167 	  emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
2168 	  return;
2169 	}
2170 
2171       if (GET_MODE_SIZE (mode) < 16)
2172 	op0 = ix86_build_const_vector (vmode, false, op0);
2173       op0 = force_reg (vmode, op0);
2174     }
2175   else
2176     op0 = lowpart_subreg (vmode, operands[1], mode);
2177 
2178   op2 = gen_reg_rtx (vmode);
2179   op3 = gen_reg_rtx (vmode);
2180   emit_move_insn (op2, gen_rtx_AND (vmode,
2181 				    gen_rtx_NOT (vmode, mask),
2182 				    op0));
2183   emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2184   emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
2185 }
2186 
2187 /* Expand an xorsign operation.  */
2188 
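/* xorsign (x, y) is expanded as x ^ (y & SIGNMASK), i.e. the result is
   x with its sign flipped when y is negative (a one-line summary of the
   sequence below).  */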
2189 void
2190 ix86_expand_xorsign (rtx operands[])
2191 {
2192   machine_mode mode, vmode;
2193   rtx dest, op0, op1, mask, x, temp;
2194 
2195   dest = operands[0];
2196   op0 = operands[1];
2197   op1 = operands[2];
2198 
2199   mode = GET_MODE (dest);
2200 
2201   if (mode == HFmode)
2202     vmode = V8HFmode;
2203   else if (mode == SFmode)
2204     vmode = V4SFmode;
2205   else if (mode == DFmode)
2206     vmode = V2DFmode;
2207   else
2208     gcc_unreachable ();
2209 
2210   temp = gen_reg_rtx (vmode);
2211   mask = ix86_build_signbit_mask (vmode, 0, 0);
2212 
2213   op1 = lowpart_subreg (vmode, op1, mode);
2214   x = gen_rtx_AND (vmode, op1, mask);
2215   emit_insn (gen_rtx_SET (temp, x));
2216 
2217   op0 = lowpart_subreg (vmode, op0, mode);
2218   x = gen_rtx_XOR (vmode, temp, op0);
2219 
2220   dest = lowpart_subreg (vmode, dest, mode);
2221   emit_insn (gen_rtx_SET (dest, x));
2222 }
2223 
2224 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2225 
2226 void
2227 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2228 {
2229   machine_mode mode = GET_MODE (op0);
2230   rtx tmp;
2231 
2232   /* Handle the special case of a vector comparison with a boolean result;
2233      transform it using the ptest instruction.  */
2234   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2235     {
2236       rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2237       machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2238 
2239       gcc_assert (code == EQ || code == NE);
2240       /* Generate XOR since we can't check that one operand is zero vector.  */
2241       tmp = gen_reg_rtx (mode);
2242       emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2243       tmp = gen_lowpart (p_mode, tmp);
2244       emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2245 			      gen_rtx_UNSPEC (CCmode,
2246 					      gen_rtvec (2, tmp, tmp),
2247 					      UNSPEC_PTEST)));
2248       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2249       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2250 				  gen_rtx_LABEL_REF (VOIDmode, label),
2251 				  pc_rtx);
2252       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2253       return;
2254     }
2255 
2256   switch (mode)
2257     {
2258     case E_HFmode:
2259     case E_SFmode:
2260     case E_DFmode:
2261     case E_XFmode:
2262     case E_QImode:
2263     case E_HImode:
2264     case E_SImode:
2265       simple:
2266       tmp = ix86_expand_compare (code, op0, op1);
2267       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2268 				  gen_rtx_LABEL_REF (VOIDmode, label),
2269 				  pc_rtx);
2270       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2271       return;
2272 
2273     case E_DImode:
2274       if (TARGET_64BIT)
2275 	goto simple;
2276       /* For a 32-bit target, a DImode comparison may be performed in
2277 	 SSE registers.  To allow this we should avoid splitting it
2278 	 into SImode, which is achieved by doing the xor in DImode
2279 	 and then comparing with zero (a form recognized by the
2280 	 STV pass).  We don't compare using xor when optimizing
2281 	 for size.  */
2282       if (!optimize_insn_for_size_p ()
2283 	  && TARGET_STV
2284 	  && (code == EQ || code == NE))
2285 	{
2286 	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2287 	  op1 = const0_rtx;
2288 	}
2289       /* FALLTHRU */
2290     case E_TImode:
2291       /* Expand DImode branch into multiple compare+branch.  */
2292       {
2293 	rtx lo[2], hi[2];
2294 	rtx_code_label *label2;
2295 	enum rtx_code code1, code2, code3;
2296 	machine_mode submode;
2297 
2298 	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2299 	  {
2300 	    std::swap (op0, op1);
2301 	    code = swap_condition (code);
2302 	  }
2303 
2304 	split_double_mode (mode, &op0, 1, lo+0, hi+0);
2305 	split_double_mode (mode, &op1, 1, lo+1, hi+1);
2306 
2307 	submode = mode == DImode ? SImode : DImode;
2308 
2309 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2310 	   avoid two branches.  This costs one extra insn, so disable when
2311 	   optimizing for size.  */
2312 
2313 	if ((code == EQ || code == NE)
2314 	    && (!optimize_insn_for_size_p ()
2315 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
2316 	  {
2317 	    rtx xor0, xor1;
2318 
2319 	    xor1 = hi[0];
2320 	    if (hi[1] != const0_rtx)
2321 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2322 				   NULL_RTX, 0, OPTAB_WIDEN);
2323 
2324 	    xor0 = lo[0];
2325 	    if (lo[1] != const0_rtx)
2326 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2327 				   NULL_RTX, 0, OPTAB_WIDEN);
2328 
2329 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
2330 				NULL_RTX, 0, OPTAB_WIDEN);
2331 
2332 	    ix86_expand_branch (code, tmp, const0_rtx, label);
2333 	    return;
2334 	  }
2335 
2336 	/* Otherwise, if we are doing less-than or greater-or-equal-than,
2337 	   op1 is a constant and the low word is zero, then we can just
2338 	   examine the high word.  Similarly for low word -1 and
2339 	   less-or-equal-than or greater-than.  */
2340 
2341 	if (CONST_INT_P (hi[1]))
2342 	  switch (code)
2343 	    {
2344 	    case LT: case LTU: case GE: case GEU:
2345 	      if (lo[1] == const0_rtx)
2346 		{
2347 		  ix86_expand_branch (code, hi[0], hi[1], label);
2348 		  return;
2349 		}
2350 	      break;
2351 	    case LE: case LEU: case GT: case GTU:
2352 	      if (lo[1] == constm1_rtx)
2353 		{
2354 		  ix86_expand_branch (code, hi[0], hi[1], label);
2355 		  return;
2356 		}
2357 	      break;
2358 	    default:
2359 	      break;
2360 	    }
2361 
2362 	/* Emulate comparisons that do not depend on Zero flag with
2363 	   double-word subtraction.  Note that only Overflow, Sign
2364 	   and Carry flags are valid, so swap arguments and condition
2365 	   of comparisons that would otherwise test Zero flag.  */
2366 
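	/* For example, a signed DImode "a < b" on a 32-bit target becomes,
	   roughly:

	     compare lo(a) with lo(b)             -- only the borrow is kept
	     sbb of hi(a) - hi(b) into a scratch  -- flags as for a 64-bit subtract
	     branch on LT (LTU for the unsigned variants)

	   which is what the cmp/sbb sequence below emits.  */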
2367 	switch (code)
2368 	  {
2369 	  case LE: case LEU: case GT: case GTU:
2370 	    std::swap (lo[0], lo[1]);
2371 	    std::swap (hi[0], hi[1]);
2372 	    code = swap_condition (code);
2373 	    /* FALLTHRU */
2374 
2375 	  case LT: case LTU: case GE: case GEU:
2376 	    {
2377 	      bool uns = (code == LTU || code == GEU);
2378 	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2379 		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2380 
2381 	      if (!nonimmediate_operand (lo[0], submode))
2382 		lo[0] = force_reg (submode, lo[0]);
2383 	      if (!x86_64_general_operand (lo[1], submode))
2384 		lo[1] = force_reg (submode, lo[1]);
2385 
2386 	      if (!register_operand (hi[0], submode))
2387 		hi[0] = force_reg (submode, hi[0]);
2388 	      if ((uns && !nonimmediate_operand (hi[1], submode))
2389 		  || (!uns && !x86_64_general_operand (hi[1], submode)))
2390 		hi[1] = force_reg (submode, hi[1]);
2391 
2392 	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2393 
2394 	      tmp = gen_rtx_SCRATCH (submode);
2395 	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2396 
2397 	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2398 	      ix86_expand_branch (code, tmp, const0_rtx, label);
2399 	      return;
2400 	    }
2401 
2402 	  default:
2403 	    break;
2404 	  }
2405 
2406 	/* Otherwise, we need two or three jumps.  */
2407 
2408 	label2 = gen_label_rtx ();
2409 
2410 	code1 = code;
2411 	code2 = swap_condition (code);
2412 	code3 = unsigned_condition (code);
2413 
2414 	switch (code)
2415 	  {
2416 	  case LT: case GT: case LTU: case GTU:
2417 	    break;
2418 
2419 	  case LE:   code1 = LT;  code2 = GT;  break;
2420 	  case GE:   code1 = GT;  code2 = LT;  break;
2421 	  case LEU:  code1 = LTU; code2 = GTU; break;
2422 	  case GEU:  code1 = GTU; code2 = LTU; break;
2423 
2424 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
2425 	  case NE:   code2 = UNKNOWN; break;
2426 
2427 	  default:
2428 	    gcc_unreachable ();
2429 	  }
2430 
2431 	/*
2432 	 * a < b =>
2433 	 *    if (hi(a) < hi(b)) goto true;
2434 	 *    if (hi(a) > hi(b)) goto false;
2435 	 *    if (lo(a) < lo(b)) goto true;
2436 	 *  false:
2437 	 */
2438 
2439 	if (code1 != UNKNOWN)
2440 	  ix86_expand_branch (code1, hi[0], hi[1], label);
2441 	if (code2 != UNKNOWN)
2442 	  ix86_expand_branch (code2, hi[0], hi[1], label2);
2443 
2444 	ix86_expand_branch (code3, lo[0], lo[1], label);
2445 
2446 	if (code2 != UNKNOWN)
2447 	  emit_label (label2);
2448 	return;
2449       }
2450 
2451     default:
2452       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2453       goto simple;
2454     }
2455 }
2456 
2457 /* Figure out whether to use unordered fp comparisons.  */
2458 
2459 static bool
2460 ix86_unordered_fp_compare (enum rtx_code code)
2461 {
2462   if (!TARGET_IEEE_FP)
2463     return false;
2464 
2465   switch (code)
2466     {
2467     case LT:
2468     case LE:
2469     case GT:
2470     case GE:
2471     case LTGT:
2472       return false;
2473 
2474     case EQ:
2475     case NE:
2476 
2477     case UNORDERED:
2478     case ORDERED:
2479     case UNLT:
2480     case UNLE:
2481     case UNGT:
2482     case UNGE:
2483     case UNEQ:
2484       return true;
2485 
2486     default:
2487       gcc_unreachable ();
2488     }
2489 }
2490 
2491 /* Return a comparison we can do that is equivalent to
2492    swap_condition (code), except possibly for orderedness.
2493    Never change orderedness if TARGET_IEEE_FP, returning
2494    UNKNOWN in that case if necessary.  */
2495 
2496 static enum rtx_code
2497 ix86_fp_swap_condition (enum rtx_code code)
2498 {
2499   switch (code)
2500     {
2501     case GT:                   /* GTU - CF=0 & ZF=0 */
2502       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2503     case GE:                   /* GEU - CF=0 */
2504       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2505     case UNLT:                 /* LTU - CF=1 */
2506       return TARGET_IEEE_FP ? UNKNOWN : GT;
2507     case UNLE:                 /* LEU - CF=1 | ZF=1 */
2508       return TARGET_IEEE_FP ? UNKNOWN : GE;
2509     default:
2510       return swap_condition (code);
2511     }
2512 }
2513 
2514 /* Return the cost of comparison CODE using the best strategy for performance.
2515    All of the following functions use the number of instructions as the cost
2516    metric.  In the future this should be tweaked to compute bytes for
2517    optimize_size and account for instruction performance on various CPUs.  */
2518 
2519 static int
2520 ix86_fp_comparison_cost (enum rtx_code code)
2521 {
2522   int arith_cost;
2523 
2524   /* The cost of code using bit-twiddling on %ah.  */
2525   switch (code)
2526     {
2527     case UNLE:
2528     case UNLT:
2529     case LTGT:
2530     case GT:
2531     case GE:
2532     case UNORDERED:
2533     case ORDERED:
2534     case UNEQ:
2535       arith_cost = 4;
2536       break;
2537     case LT:
2538     case NE:
2539     case EQ:
2540     case UNGE:
2541       arith_cost = TARGET_IEEE_FP ? 5 : 4;
2542       break;
2543     case LE:
2544     case UNGT:
2545       arith_cost = TARGET_IEEE_FP ? 6 : 4;
2546       break;
2547     default:
2548       gcc_unreachable ();
2549     }
2550 
2551   switch (ix86_fp_comparison_strategy (code))
2552     {
2553     case IX86_FPCMP_COMI:
2554       return arith_cost > 4 ? 3 : 2;
2555     case IX86_FPCMP_SAHF:
2556       return arith_cost > 4 ? 4 : 3;
2557     default:
2558       return arith_cost;
2559     }
2560 }
2561 
2562 /* Swap, force into registers, or otherwise massage the two operands
2563    to a fp comparison.  The operands are updated in place; the new
2564    comparison code is returned.  */
2565 
2566 static enum rtx_code
2567 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2568 {
2569   bool unordered_compare = ix86_unordered_fp_compare (code);
2570   rtx op0 = *pop0, op1 = *pop1;
2571   machine_mode op_mode = GET_MODE (op0);
2572   bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2573 
2574   /* All of the unordered compare instructions only work on registers.
2575      The same is true of the fcomi compare instructions.  The XFmode
2576      compare instructions require registers except when comparing
2577      against zero or when converting operand 1 from fixed point to
2578      floating point.  */
2579 
2580   if (!is_sse
2581       && (unordered_compare
2582 	  || (op_mode == XFmode
2583 	      && ! (standard_80387_constant_p (op0) == 1
2584 		    || standard_80387_constant_p (op1) == 1)
2585 	      && GET_CODE (op1) != FLOAT)
2586 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2587     {
2588       op0 = force_reg (op_mode, op0);
2589       op1 = force_reg (op_mode, op1);
2590     }
2591   else
2592     {
2593       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
2594 	 things around if they appear profitable, otherwise force op0
2595 	 into a register.  */
2596 
2597       if (standard_80387_constant_p (op0) == 0
2598 	  || (MEM_P (op0)
2599 	      && ! (standard_80387_constant_p (op1) == 0
2600 		    || MEM_P (op1))))
2601 	{
2602 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
2603 	  if (new_code != UNKNOWN)
2604 	    {
2605 	      std::swap (op0, op1);
2606 	      code = new_code;
2607 	    }
2608 	}
2609 
2610       if (!REG_P (op0))
2611 	op0 = force_reg (op_mode, op0);
2612 
2613       if (CONSTANT_P (op1))
2614 	{
2615 	  int tmp = standard_80387_constant_p (op1);
2616 	  if (tmp == 0)
2617 	    op1 = validize_mem (force_const_mem (op_mode, op1));
2618 	  else if (tmp == 1)
2619 	    {
2620 	      if (TARGET_CMOVE)
2621 		op1 = force_reg (op_mode, op1);
2622 	    }
2623 	  else
2624 	    op1 = force_reg (op_mode, op1);
2625 	}
2626     }
2627 
2628   /* Try to rearrange the comparison to make it cheaper.  */
2629   if (ix86_fp_comparison_cost (code)
2630       > ix86_fp_comparison_cost (swap_condition (code))
2631       && (REG_P (op1) || can_create_pseudo_p ()))
2632     {
2633       std::swap (op0, op1);
2634       code = swap_condition (code);
2635       if (!REG_P (op0))
2636 	op0 = force_reg (op_mode, op0);
2637     }
2638 
2639   *pop0 = op0;
2640   *pop1 = op1;
2641   return code;
2642 }
2643 
2644 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
2645 
2646 static rtx
2647 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2648 {
2649   bool unordered_compare = ix86_unordered_fp_compare (code);
2650   machine_mode cmp_mode;
2651   rtx tmp, scratch;
2652 
2653   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2654 
2655   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2656   if (unordered_compare)
2657     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2658 
2659   /* Do fcomi/sahf based test when profitable.  */
2660   switch (ix86_fp_comparison_strategy (code))
2661     {
2662     case IX86_FPCMP_COMI:
2663       cmp_mode = CCFPmode;
2664       emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2665       break;
2666 
2667     case IX86_FPCMP_SAHF:
2668       cmp_mode = CCFPmode;
2669       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2670       scratch = gen_reg_rtx (HImode);
2671       emit_insn (gen_rtx_SET (scratch, tmp));
2672       emit_insn (gen_x86_sahf_1 (scratch));
2673       break;
2674 
2675     case IX86_FPCMP_ARITH:
2676       cmp_mode = CCNOmode;
2677       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2678       scratch = gen_reg_rtx (HImode);
2679       emit_insn (gen_rtx_SET (scratch, tmp));
2680 
2681       /* In the unordered case, we have to check C2 for NaN's, which
2682 	 doesn't happen to work out to anything nice combination-wise.
2683 	 So do some bit twiddling on the value we've got in AH to come
2684 	 up with an appropriate set of condition codes.  */
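      /* For reference: in the status-word high byte held in the scratch,
	 C0 is 0x01, C2 is 0x04 and C3 is 0x40, so e.g. the 0x45 masks
	 below test C0|C2|C3 and 0x44 tests C2|C3.  */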
2685 
2686       switch (code)
2687 	{
2688 	case GT:
2689 	case UNGT:
2690 	  if (code == GT || !TARGET_IEEE_FP)
2691 	    {
2692 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2693 	      code = EQ;
2694 	    }
2695 	  else
2696 	    {
2697 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2698 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2699 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2700 	      cmp_mode = CCmode;
2701 	      code = GEU;
2702 	    }
2703 	  break;
2704 	case LT:
2705 	case UNLT:
2706 	  if (code == LT && TARGET_IEEE_FP)
2707 	    {
2708 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2709 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2710 	      cmp_mode = CCmode;
2711 	      code = EQ;
2712 	    }
2713 	  else
2714 	    {
2715 	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2716 	      code = NE;
2717 	    }
2718 	  break;
2719 	case GE:
2720 	case UNGE:
2721 	  if (code == GE || !TARGET_IEEE_FP)
2722 	    {
2723 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2724 	      code = EQ;
2725 	    }
2726 	  else
2727 	    {
2728 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2729 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2730 	      code = NE;
2731 	    }
2732 	  break;
2733 	case LE:
2734 	case UNLE:
2735 	  if (code == LE && TARGET_IEEE_FP)
2736 	    {
2737 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2738 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2739 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2740 	      cmp_mode = CCmode;
2741 	      code = LTU;
2742 	    }
2743 	  else
2744 	    {
2745 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2746 	      code = NE;
2747 	    }
2748 	  break;
2749 	case EQ:
2750 	case UNEQ:
2751 	  if (code == EQ && TARGET_IEEE_FP)
2752 	    {
2753 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2754 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2755 	      cmp_mode = CCmode;
2756 	      code = EQ;
2757 	    }
2758 	  else
2759 	    {
2760 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2761 	      code = NE;
2762 	    }
2763 	  break;
2764 	case NE:
2765 	case LTGT:
2766 	  if (code == NE && TARGET_IEEE_FP)
2767 	    {
2768 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2769 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2770 					     GEN_INT (0x40)));
2771 	      code = NE;
2772 	    }
2773 	  else
2774 	    {
2775 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2776 	      code = EQ;
2777 	    }
2778 	  break;
2779 
2780 	case UNORDERED:
2781 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2782 	  code = NE;
2783 	  break;
2784 	case ORDERED:
2785 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2786 	  code = EQ;
2787 	  break;
2788 
2789 	default:
2790 	  gcc_unreachable ();
2791 	}
2792 	break;
2793 
2794     default:
2795       gcc_unreachable ();
2796     }
2797 
2798   /* Return the test that should be put into the flags user, i.e.
2799      the bcc, scc, or cmov instruction.  */
2800   return gen_rtx_fmt_ee (code, VOIDmode,
2801 			 gen_rtx_REG (cmp_mode, FLAGS_REG),
2802 			 const0_rtx);
2803 }
2804 
2805 /* Generate insn patterns to do an integer compare of OPERANDS.  */
2806 
2807 static rtx
2808 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2809 {
2810   machine_mode cmpmode;
2811   rtx tmp, flags;
2812 
2813   /* Swap operands to emit carry flag comparison.  */
2814   if ((code == GTU || code == LEU)
2815       && nonimmediate_operand (op1, VOIDmode))
2816     {
2817       std::swap (op0, op1);
2818       code = swap_condition (code);
2819     }
2820 
2821   cmpmode = SELECT_CC_MODE (code, op0, op1);
2822   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2823 
2824   /* This is very simple, but making the interface the same as in the
2825      FP case makes the rest of the code easier.  */
2826   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2827   emit_insn (gen_rtx_SET (flags, tmp));
2828 
2829   /* Return the test that should be put into the flags user, i.e.
2830      the bcc, scc, or cmov instruction.  */
2831   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2832 }
2833 
2834 static rtx
2835 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2836 {
2837   rtx ret;
2838 
2839   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2840     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2841 
2842   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2843     {
2844       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2845       ret = ix86_expand_fp_compare (code, op0, op1);
2846     }
2847   else
2848     ret = ix86_expand_int_compare (code, op0, op1);
2849 
2850   return ret;
2851 }
2852 
2853 void
2854 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2855 {
2856   rtx ret;
2857 
2858   gcc_assert (GET_MODE (dest) == QImode);
2859 
2860   ret = ix86_expand_compare (code, op0, op1);
2861   PUT_MODE (ret, QImode);
2862   emit_insn (gen_rtx_SET (dest, ret));
2863 }
2864 
2865 /* Expand a comparison setting or clearing the carry flag.  Return true
2866    when successful and set *POP to the comparison for the operation.  */
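/* For example, an unsigned "x <= 42" is rewritten as "x < 43" so that a
   single cmp leaves the answer in the carry flag, and "x == 0" becomes
   "(unsigned) x < 1" (an informal summary of the conversions below).  */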
2867 static bool
2868 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2869 {
2870   machine_mode mode
2871     = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2872 
2873   /* Do not handle double-mode compares that go through special path.  */
2874   if (mode == (TARGET_64BIT ? TImode : DImode))
2875     return false;
2876 
2877   if (SCALAR_FLOAT_MODE_P (mode))
2878     {
2879       rtx compare_op;
2880       rtx_insn *compare_seq;
2881 
2882       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2883 
2884       /* Shortcut:  following common codes never translate
2885 	 into carry flag compares.  */
2886       if (code == EQ || code == NE || code == UNEQ || code == LTGT
2887 	  || code == ORDERED || code == UNORDERED)
2888 	return false;
2889 
2890       /* These comparisons require the zero flag; swap the operands so they won't.  */
2891       if ((code == GT || code == UNLE || code == LE || code == UNGT)
2892 	  && !TARGET_IEEE_FP)
2893 	{
2894 	  std::swap (op0, op1);
2895 	  code = swap_condition (code);
2896 	}
2897 
2898       /* Try to expand the comparison and verify that we end up with
2899 	 a carry flag based comparison.  This fails to be true only when
2900 	 we decide to expand the comparison using arithmetic, which is
2901 	 not a common scenario.  */
2902       start_sequence ();
2903       compare_op = ix86_expand_fp_compare (code, op0, op1);
2904       compare_seq = get_insns ();
2905       end_sequence ();
2906 
2907       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2908         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2909       else
2910 	code = GET_CODE (compare_op);
2911 
2912       if (code != LTU && code != GEU)
2913 	return false;
2914 
2915       emit_insn (compare_seq);
2916       *pop = compare_op;
2917       return true;
2918     }
2919 
2920   if (!INTEGRAL_MODE_P (mode))
2921     return false;
2922 
2923   switch (code)
2924     {
2925     case LTU:
2926     case GEU:
2927       break;
2928 
2929     /* Convert a==0 into (unsigned)a<1.  */
2930     case EQ:
2931     case NE:
2932       if (op1 != const0_rtx)
2933 	return false;
2934       op1 = const1_rtx;
2935       code = (code == EQ ? LTU : GEU);
2936       break;
2937 
2938     /* Convert a>b into b<a or a>=b-1.  */
2939     case GTU:
2940     case LEU:
2941       if (CONST_INT_P (op1))
2942 	{
2943 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2944 	  /* Bail out on overflow.  We could still swap the operands, but that
2945 	     would force loading the constant into a register.  */
2946 	  if (op1 == const0_rtx
2947 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2948 	    return false;
2949 	  code = (code == GTU ? GEU : LTU);
2950 	}
2951       else
2952 	{
2953 	  std::swap (op0, op1);
2954 	  code = (code == GTU ? LTU : GEU);
2955 	}
2956       break;
2957 
2958     /* Convert a>=0 into (unsigned)a<0x80000000.  */
2959     case LT:
2960     case GE:
2961       if (mode == DImode || op1 != const0_rtx)
2962 	return false;
2963       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2964       code = (code == LT ? GEU : LTU);
2965       break;
2966     case LE:
2967     case GT:
2968       if (mode == DImode || op1 != constm1_rtx)
2969 	return false;
2970       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2971       code = (code == LE ? GEU : LTU);
2972       break;
2973 
2974     default:
2975       return false;
2976     }
2977   /* Swapping operands may cause constant to appear as first operand.  */
2978   if (!nonimmediate_operand (op0, VOIDmode))
2979     {
2980       if (!can_create_pseudo_p ())
2981 	return false;
2982       op0 = force_reg (mode, op0);
2983     }
2984   *pop = ix86_expand_compare (code, op0, op1);
2985   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2986   return true;
2987 }
2988 
2989 /* Expand a conditional increment or decrement using adc/sbb instructions.
2990    The default case, using setcc followed by a conditional move, can be
2991    done by generic code.  */
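/* For instance, "x = y + (a < b)" with an unsigned comparison can be
   emitted as a cmp that sets the carry flag followed by an
   add-with-carry of zero (adc), and the decrement case uses sbb; this
   routine recognizes such forms and emits the adc/sbb directly.  */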
2992 bool
2993 ix86_expand_int_addcc (rtx operands[])
2994 {
2995   enum rtx_code code = GET_CODE (operands[1]);
2996   rtx flags;
2997   rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2998   rtx compare_op;
2999   rtx val = const0_rtx;
3000   bool fpcmp = false;
3001   machine_mode mode;
3002   rtx op0 = XEXP (operands[1], 0);
3003   rtx op1 = XEXP (operands[1], 1);
3004 
3005   if (operands[3] != const1_rtx
3006       && operands[3] != constm1_rtx)
3007     return false;
3008   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3009      return false;
3010   code = GET_CODE (compare_op);
3011 
3012   flags = XEXP (compare_op, 0);
3013 
3014   if (GET_MODE (flags) == CCFPmode)
3015     {
3016       fpcmp = true;
3017       code = ix86_fp_compare_code_to_integer (code);
3018     }
3019 
3020   if (code != LTU)
3021     {
3022       val = constm1_rtx;
3023       if (fpcmp)
3024 	PUT_CODE (compare_op,
3025 		  reverse_condition_maybe_unordered
3026 		    (GET_CODE (compare_op)));
3027       else
3028 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3029     }
3030 
3031   mode = GET_MODE (operands[0]);
3032 
3033   /* Construct either adc or sbb insn.  */
3034   if ((code == LTU) == (operands[3] == constm1_rtx))
3035     insn = gen_sub3_carry;
3036   else
3037     insn = gen_add3_carry;
3038 
3039   emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3040 
3041   return true;
3042 }
3043 
3044 bool
3045 ix86_expand_int_movcc (rtx operands[])
3046 {
3047   enum rtx_code code = GET_CODE (operands[1]), compare_code;
3048   rtx_insn *compare_seq;
3049   rtx compare_op;
3050   machine_mode mode = GET_MODE (operands[0]);
3051   bool sign_bit_compare_p = false;
3052   rtx op0 = XEXP (operands[1], 0);
3053   rtx op1 = XEXP (operands[1], 1);
3054 
3055   if (GET_MODE (op0) == TImode
3056       || (GET_MODE (op0) == DImode
3057 	  && !TARGET_64BIT))
3058     return false;
3059 
3060   start_sequence ();
3061   compare_op = ix86_expand_compare (code, op0, op1);
3062   compare_seq = get_insns ();
3063   end_sequence ();
3064 
3065   compare_code = GET_CODE (compare_op);
3066 
3067   if ((op1 == const0_rtx && (code == GE || code == LT))
3068       || (op1 == constm1_rtx && (code == GT || code == LE)))
3069     sign_bit_compare_p = true;
3070 
3071   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3072      HImode insns, we'd be swallowed in word prefix ops.  */
3073 
3074   if ((mode != HImode || TARGET_FAST_PREFIX)
3075       && (mode != (TARGET_64BIT ? TImode : DImode))
3076       && CONST_INT_P (operands[2])
3077       && CONST_INT_P (operands[3]))
3078     {
3079       rtx out = operands[0];
3080       HOST_WIDE_INT ct = INTVAL (operands[2]);
3081       HOST_WIDE_INT cf = INTVAL (operands[3]);
3082       HOST_WIDE_INT diff;
3083 
3084       diff = ct - cf;
3085       /* Sign bit compares are better done using shifts than by using
3086 	  sbb.  */
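      /* E.g. "x < 0 ? -1 : 0" is just an arithmetic right shift by 31
	 (63 for DImode), with no flags involved at all.  */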
3087       if (sign_bit_compare_p
3088 	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3089 	{
3090 	  /* Detect overlap between destination and compare sources.  */
3091 	  rtx tmp = out;
3092 
3093           if (!sign_bit_compare_p)
3094 	    {
3095 	      rtx flags;
3096 	      bool fpcmp = false;
3097 
3098 	      compare_code = GET_CODE (compare_op);
3099 
3100 	      flags = XEXP (compare_op, 0);
3101 
3102 	      if (GET_MODE (flags) == CCFPmode)
3103 		{
3104 		  fpcmp = true;
3105 		  compare_code
3106 		    = ix86_fp_compare_code_to_integer (compare_code);
3107 		}
3108 
3109 	      /* To simplify rest of code, restrict to the GEU case.  */
3110 	      if (compare_code == LTU)
3111 		{
3112 		  std::swap (ct, cf);
3113 		  compare_code = reverse_condition (compare_code);
3114 		  code = reverse_condition (code);
3115 		}
3116 	      else
3117 		{
3118 		  if (fpcmp)
3119 		    PUT_CODE (compare_op,
3120 			      reverse_condition_maybe_unordered
3121 			        (GET_CODE (compare_op)));
3122 		  else
3123 		    PUT_CODE (compare_op,
3124 			      reverse_condition (GET_CODE (compare_op)));
3125 		}
3126 	      diff = ct - cf;
3127 
3128 	      if (reg_overlap_mentioned_p (out, op0)
3129 		  || reg_overlap_mentioned_p (out, op1))
3130 		tmp = gen_reg_rtx (mode);
3131 
3132 	      if (mode == DImode)
3133 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3134 	      else
3135 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
3136 						 flags, compare_op));
3137 	    }
3138 	  else
3139 	    {
3140 	      if (code == GT || code == GE)
3141 		code = reverse_condition (code);
3142 	      else
3143 		{
3144 		  std::swap (ct, cf);
3145 		  diff = ct - cf;
3146 		}
3147 	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3148 	    }
3149 
3150 	  if (diff == 1)
3151 	    {
3152 	      /*
3153 	       * cmpl op0,op1
3154 	       * sbbl dest,dest
3155 	       * [addl dest, ct]
3156 	       *
3157 	       * Size 5 - 8.
3158 	       */
3159 	      if (ct)
3160 		tmp = expand_simple_binop (mode, PLUS,
3161 					   tmp, GEN_INT (ct),
3162 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3163 	    }
3164 	  else if (cf == -1)
3165 	    {
3166 	      /*
3167 	       * cmpl op0,op1
3168 	       * sbbl dest,dest
3169 	       * orl $ct, dest
3170 	       *
3171 	       * Size 8.
3172 	       */
3173 	      tmp = expand_simple_binop (mode, IOR,
3174 					 tmp, GEN_INT (ct),
3175 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3176 	    }
3177 	  else if (diff == -1 && ct)
3178 	    {
3179 	      /*
3180 	       * cmpl op0,op1
3181 	       * sbbl dest,dest
3182 	       * notl dest
3183 	       * [addl dest, cf]
3184 	       *
3185 	       * Size 8 - 11.
3186 	       */
3187 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3188 	      if (cf)
3189 		tmp = expand_simple_binop (mode, PLUS,
3190 					   copy_rtx (tmp), GEN_INT (cf),
3191 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3192 	    }
3193 	  else
3194 	    {
3195 	      /*
3196 	       * cmpl op0,op1
3197 	       * sbbl dest,dest
3198 	       * [notl dest]
3199 	       * andl cf - ct, dest
3200 	       * [addl dest, ct]
3201 	       *
3202 	       * Size 8 - 11.
3203 	       */
3204 
3205 	      if (cf == 0)
3206 		{
3207 		  cf = ct;
3208 		  ct = 0;
3209 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3210 		}
3211 
3212 	      tmp = expand_simple_binop (mode, AND,
3213 					 copy_rtx (tmp),
3214 					 gen_int_mode (cf - ct, mode),
3215 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3216 	      if (ct)
3217 		tmp = expand_simple_binop (mode, PLUS,
3218 					   copy_rtx (tmp), GEN_INT (ct),
3219 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3220 	    }
3221 
3222 	  if (!rtx_equal_p (tmp, out))
3223 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3224 
3225 	  return true;
3226 	}
3227 
3228       if (diff < 0)
3229 	{
3230 	  machine_mode cmp_mode = GET_MODE (op0);
3231 	  enum rtx_code new_code;
3232 
3233 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
3234 	    {
3235 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3236 
3237 	      /* We may be reversing a non-trapping
3238 		 comparison to a trapping comparison.  */
3239 	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
3240 		  && code != EQ && code != NE
3241 		  && code != ORDERED && code != UNORDERED)
3242 		new_code = UNKNOWN;
3243 	      else
3244 		new_code = reverse_condition_maybe_unordered (code);
3245 	    }
3246 	  else
3247 	    new_code = ix86_reverse_condition (code, cmp_mode);
3248 	  if (new_code != UNKNOWN)
3249 	    {
3250 	      std::swap (ct, cf);
3251 	      diff = -diff;
3252 	      code = new_code;
3253 	    }
3254 	}
3255 
3256       compare_code = UNKNOWN;
3257       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3258 	  && CONST_INT_P (op1))
3259 	{
3260 	  if (op1 == const0_rtx
3261 	      && (code == LT || code == GE))
3262 	    compare_code = code;
3263 	  else if (op1 == constm1_rtx)
3264 	    {
3265 	      if (code == LE)
3266 		compare_code = LT;
3267 	      else if (code == GT)
3268 		compare_code = GE;
3269 	    }
3270 	}
3271 
3272       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
3273       if (compare_code != UNKNOWN
3274 	  && GET_MODE (op0) == GET_MODE (out)
3275 	  && (cf == -1 || ct == -1))
3276 	{
3277 	  /* If lea code below could be used, only optimize
3278 	     if it results in a 2 insn sequence.  */
3279 
3280 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3281 		 || diff == 3 || diff == 5 || diff == 9)
3282 	      || (compare_code == LT && ct == -1)
3283 	      || (compare_code == GE && cf == -1))
3284 	    {
3285 	      /*
3286 	       * notl op1	(if necessary)
3287 	       * sarl $31, op1
3288 	       * orl cf, op1
3289 	       */
3290 	      if (ct != -1)
3291 		{
3292 		  cf = ct;
3293 		  ct = -1;
3294 		  code = reverse_condition (code);
3295 		}
3296 
3297 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3298 
3299 	      out = expand_simple_binop (mode, IOR,
3300 					 out, GEN_INT (cf),
3301 					 out, 1, OPTAB_DIRECT);
3302 	      if (out != operands[0])
3303 		emit_move_insn (operands[0], out);
3304 
3305 	      return true;
3306 	    }
3307 	}
3308 
3309 
3310       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3311 	   || diff == 3 || diff == 5 || diff == 9)
3312 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3313 	  && (mode != DImode
3314 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3315 	{
3316 	  /*
3317 	   * xorl dest,dest
3318 	   * cmpl op1,op2
3319 	   * setcc dest
3320 	   * lea cf(dest*(ct-cf)),dest
3321 	   *
3322 	   * Size 14.
3323 	   *
3324 	   * This also catches the degenerate setcc-only case.
3325 	   */
3326 
3327 	  rtx tmp;
3328 	  int nops;
3329 
3330 	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3331 
3332 	  nops = 0;
3333 	  /* On x86_64 the lea instruction operates on Pmode, so we need
3334 	     to do the arithmetic in the proper mode to match.  */
3335 	  if (diff == 1)
3336 	    tmp = copy_rtx (out);
3337 	  else
3338 	    {
3339 	      rtx out1;
3340 	      out1 = copy_rtx (out);
3341 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3342 	      nops++;
3343 	      if (diff & 1)
3344 		{
3345 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
3346 		  nops++;
3347 		}
3348 	    }
3349 	  if (cf != 0)
3350 	    {
3351 	      tmp = plus_constant (mode, tmp, cf);
3352 	      nops++;
3353 	    }
3354 	  if (!rtx_equal_p (tmp, out))
3355 	    {
3356 	      if (nops == 1)
3357 		out = force_operand (tmp, copy_rtx (out));
3358 	      else
3359 		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3360 	    }
3361 	  if (!rtx_equal_p (out, operands[0]))
3362 	    emit_move_insn (operands[0], copy_rtx (out));
3363 
3364 	  return true;
3365 	}
3366 
3367       /*
3368        * General case:			Jumpful:
3369        *   xorl dest,dest		cmpl op1, op2
3370        *   cmpl op1, op2		movl ct, dest
3371        *   setcc dest			jcc 1f
3372        *   decl dest			movl cf, dest
3373        *   andl (cf-ct),dest		1:
3374        *   addl ct,dest
3375        *
3376        * Size 20.			Size 14.
3377        *
3378        * This is reasonably steep, but branch mispredict costs are
3379        * high on modern cpus, so consider failing only if optimizing
3380        * for space.
3381        */
3382 
3383       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3384 	  && BRANCH_COST (optimize_insn_for_speed_p (),
3385 		  	  false) >= 2)
3386 	{
3387 	  if (cf == 0)
3388 	    {
3389 	      machine_mode cmp_mode = GET_MODE (op0);
3390 	      enum rtx_code new_code;
3391 
3392 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
3393 		{
3394 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3395 
3396 		  /* We may be reversing a non-trapping
3397 		     comparison to a trapping comparison.  */
3398 		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
3399 		      && code != EQ && code != NE
3400 		      && code != ORDERED && code != UNORDERED)
3401 		    new_code = UNKNOWN;
3402 		  else
3403 		    new_code = reverse_condition_maybe_unordered (code);
3404 
3405 		}
3406 	      else
3407 		{
3408 		  new_code = ix86_reverse_condition (code, cmp_mode);
3409 		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
3410 		    compare_code = reverse_condition (compare_code);
3411 		}
3412 
3413 	      if (new_code != UNKNOWN)
3414 		{
3415 		  cf = ct;
3416 		  ct = 0;
3417 		  code = new_code;
3418 		}
3419 	    }
3420 
3421 	  if (compare_code != UNKNOWN)
3422 	    {
3423 	      /* notl op1	(if needed)
3424 		 sarl $31, op1
3425 		 andl (cf-ct), op1
3426 		 addl ct, op1
3427 
3428 		 For x < 0 (resp. x <= -1) there will be no notl,
3429 		 so if possible swap the constants to get rid of the
3430 		 complement.
3431 		 True/false will be -1/0 while code below (store flag
3432 		 followed by decrement) is 0/-1, so the constants need
3433 		 to be exchanged once more.  */
3434 
3435 	      if (compare_code == GE || !cf)
3436 		{
3437 		  code = reverse_condition (code);
3438 		  compare_code = LT;
3439 		}
3440 	      else
3441 		std::swap (ct, cf);
3442 
3443 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3444 	    }
3445 	  else
3446 	    {
3447 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3448 
3449 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3450 					 constm1_rtx,
3451 					 copy_rtx (out), 1, OPTAB_DIRECT);
3452 	    }
3453 
3454 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
3455 				     gen_int_mode (cf - ct, mode),
3456 				     copy_rtx (out), 1, OPTAB_DIRECT);
3457 	  if (ct)
3458 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3459 				       copy_rtx (out), 1, OPTAB_DIRECT);
3460 	  if (!rtx_equal_p (out, operands[0]))
3461 	    emit_move_insn (operands[0], copy_rtx (out));
3462 
3463 	  return true;
3464 	}
3465     }
3466 
3467   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3468     {
3469       /* Try a few things more with specific constants and a variable.  */
3470 
3471       optab op;
3472       rtx var, orig_out, out, tmp;
3473 
3474       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3475 	return false;
3476 
3477       /* If one of the two operands is an interesting constant, load a
3478 	 constant with the above and mask it in with a logical operation.  */
3479 
3480       if (CONST_INT_P (operands[2]))
3481 	{
3482 	  var = operands[3];
3483 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3484 	    operands[3] = constm1_rtx, op = and_optab;
3485 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3486 	    operands[3] = const0_rtx, op = ior_optab;
3487 	  else
3488 	    return false;
3489 	}
3490       else if (CONST_INT_P (operands[3]))
3491 	{
3492 	  var = operands[2];
3493 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3494 	    {
3495 	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3496 		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
3497 	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3498 		operands[1] = simplify_gen_relational (LT, VOIDmode,
3499 						       GET_MODE (op0),
3500 						       op0, const0_rtx);
3501 
3502 	      operands[2] = constm1_rtx;
3503 	      op = and_optab;
3504 	    }
3505 	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3506 	    operands[2] = const0_rtx, op = ior_optab;
3507 	  else
3508 	    return false;
3509 	}
3510       else
3511         return false;
3512 
3513       orig_out = operands[0];
3514       tmp = gen_reg_rtx (mode);
3515       operands[0] = tmp;
3516 
3517       /* Recurse to get the constant loaded.  */
3518       if (!ix86_expand_int_movcc (operands))
3519         return false;
3520 
3521       /* Mask in the interesting variable.  */
3522       out = expand_binop (mode, op, var, tmp, orig_out, 0,
3523 			  OPTAB_WIDEN);
3524       if (!rtx_equal_p (out, orig_out))
3525 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3526 
3527       return true;
3528     }
3529 
3530   /*
3531    * For comparison with above,
3532    *
3533    * movl cf,dest
3534    * movl ct,tmp
3535    * cmpl op1,op2
3536    * cmovcc tmp,dest
3537    *
3538    * Size 15.
3539    */
3540 
3541   if (! nonimmediate_operand (operands[2], mode))
3542     operands[2] = force_reg (mode, operands[2]);
3543   if (! nonimmediate_operand (operands[3], mode))
3544     operands[3] = force_reg (mode, operands[3]);
3545 
3546   if (! register_operand (operands[2], VOIDmode)
3547       && (mode == QImode
3548           || ! register_operand (operands[3], VOIDmode)))
3549     operands[2] = force_reg (mode, operands[2]);
3550 
3551   if (mode == QImode
3552       && ! register_operand (operands[3], VOIDmode))
3553     operands[3] = force_reg (mode, operands[3]);
3554 
3555   emit_insn (compare_seq);
3556   emit_insn (gen_rtx_SET (operands[0],
3557 			  gen_rtx_IF_THEN_ELSE (mode,
3558 						compare_op, operands[2],
3559 						operands[3])));
3560   return true;
3561 }
3562 
3563 /* Detect conditional moves that exactly match min/max operational
3564    semantics.  Note that this is IEEE safe, as long as we don't
3565    interchange the operands.
3566 
3567    Returns FALSE if this conditional move doesn't match a MIN/MAX,
3568    and TRUE if the operation is successful and instructions are emitted.  */
3569 
3570 static bool
3571 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3572 			   rtx cmp_op1, rtx if_true, rtx if_false)
3573 {
3574   machine_mode mode;
3575   bool is_min;
3576   rtx tmp;
3577 
3578   if (code == LT)
3579     ;
3580   else if (code == UNGE)
3581     std::swap (if_true, if_false);
3582   else
3583     return false;
3584 
3585   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3586     is_min = true;
3587   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3588     is_min = false;
3589   else
3590     return false;
3591 
3592   mode = GET_MODE (dest);
3593 
3594   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3595      but MODE may be a vector mode and thus not appropriate.  */
3596   if (!flag_finite_math_only || flag_signed_zeros)
3597     {
3598       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3599       rtvec v;
3600 
3601       if_true = force_reg (mode, if_true);
3602       v = gen_rtvec (2, if_true, if_false);
3603       tmp = gen_rtx_UNSPEC (mode, v, u);
3604     }
3605   else
3606     {
3607       code = is_min ? SMIN : SMAX;
3608       if (MEM_P (if_true) && MEM_P (if_false))
3609 	if_true = force_reg (mode, if_true);
3610       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3611     }
3612 
3613   emit_insn (gen_rtx_SET (dest, tmp));
3614   return true;
3615 }
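
/* An illustrative sketch of the two cases above, assuming SFmode with SSE
   math: for "d = a < b ? a : b" the -ffinite-math-only/-fno-signed-zeros
   path emits a plain (smin:SF a b), i.e. a single minss, while the default
   path wraps the operands in UNSPEC_IEEE_MIN so the IEEE-sensitive operand
   order of minss/minsd is kept intact.  */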
3616 
3617 /* Return true if MODE is valid for a vector compare to a mask register;
3618    the same holds for a conditional vector move with a mask register.  */
3619 static bool
3620 ix86_valid_mask_cmp_mode (machine_mode mode)
3621 {
3622   /* XOP has its own vector conditional movement.  */
3623   if (TARGET_XOP && !TARGET_AVX512F)
3624     return false;
3625 
3626   /* HFmode only supports vcmpsh whose dest is mask register.  */
3627   if (TARGET_AVX512FP16 && mode == HFmode)
3628     return true;
3629 
3630   /* AVX512F is needed for mask operation.  */
3631   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3632     return false;
3633 
3634   /* AVX512BW is needed for vector QI/HImode,
3635      AVX512VL is needed for 128/256-bit vector.  */
3636   machine_mode inner_mode = GET_MODE_INNER (mode);
3637   int vector_size = GET_MODE_SIZE (mode);
3638   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3639     return false;
3640 
3641   return vector_size == 64 || TARGET_AVX512VL;
3642 }
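
/* A few examples of the checks above: V16SImode qualifies with plain
   AVX512F (64-byte vector), V8SImode additionally needs AVX512VL (32-byte
   vector), and V32QImode needs both AVX512BW (QImode elements) and
   AVX512VL.  */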
3643 
3644 /* Return true if integer mask comparison should be used.  */
3645 static bool
3646 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3647 		     rtx op_true, rtx op_false)
3648 {
3649   int vector_size = GET_MODE_SIZE (mode);
3650 
3651   if (cmp_mode == HFmode)
3652     return true;
3653   else if (vector_size < 16)
3654     return false;
3655   else if (vector_size == 64)
3656     return true;
3657   else if (GET_MODE_INNER (cmp_mode) == HFmode)
3658     return true;
3659 
3660   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
3661   gcc_assert (!op_true == !op_false);
3662 
3663   /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3664      vector dest is required.  */
3665   if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3666     return false;
3667 
3668   /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
3669   if (op_false == CONST0_RTX (mode)
3670       || op_true == CONST0_RTX (mode)
3671       || (INTEGRAL_MODE_P (mode)
3672 	  && (op_true == CONSTM1_RTX (mode)
3673 	      || op_false == CONSTM1_RTX (mode))))
3674     return false;
3675 
3676   return true;
3677 }
3678 
3679 /* Expand an SSE comparison.  Return the register with the result.  */
3680 
3681 static rtx
3682 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3683 		     rtx op_true, rtx op_false)
3684 {
3685   machine_mode mode = GET_MODE (dest);
3686   machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3687 
3688   /* In the general case the comparison result can differ from the operands' type.  */
3689   machine_mode cmp_mode;
3690 
3691   /* In AVX512F the result of comparison is an integer mask.  */
3692   bool maskcmp = false;
3693   rtx x;
3694 
3695   if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3696     {
3697       unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3698       maskcmp = true;
3699       cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3700     }
3701   else
3702     cmp_mode = cmp_ops_mode;
3703 
3704   cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3705 
3706   bool (*op1_predicate)(rtx, machine_mode)
3707     = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3708 
3709   if (!op1_predicate (cmp_op1, cmp_ops_mode))
3710     cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3711 
3712   if (optimize
3713       || (maskcmp && cmp_mode != mode)
3714       || (op_true && reg_overlap_mentioned_p (dest, op_true))
3715       || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3716     dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3717 
3718   if (maskcmp)
3719     {
3720       bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3721       gcc_assert (ok);
3722       return dest;
3723     }
3724 
3725   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3726 
3727   if (cmp_mode != mode)
3728     {
3729       x = force_reg (cmp_ops_mode, x);
3730       convert_move (dest, x, false);
3731     }
3732   else
3733     emit_insn (gen_rtx_SET (dest, x));
3734 
3735   return dest;
3736 }
3737 
3738 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3739    operations.  This is used for both scalar and vector conditional moves.  */
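
/* When no blend or mask instruction applies, the fallback at the end of
   this function relies on the identity (a minimal sketch, assuming CMP is
   a full -1/0 element mask):

     DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE)

   which is exactly the pand/pandn/por (or andps/andnps/orps) triple emitted
   below.  */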
3740 
3741 void
3742 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3743 {
3744   machine_mode mode = GET_MODE (dest);
3745   machine_mode cmpmode = GET_MODE (cmp);
3746 
3747   /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
3748   if (rtx_equal_p (op_true, op_false))
3749     {
3750       emit_move_insn (dest, op_true);
3751       return;
3752     }
3753 
3754   rtx t2, t3, x;
3755 
3756   /* If we have an integer mask and FP value then we need
3757      to cast mask to FP mode.  */
3758   if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3759     {
3760       cmp = force_reg (cmpmode, cmp);
3761       cmp = gen_rtx_SUBREG (mode, cmp, 0);
3762     }
3763 
3764   /* In AVX512F the result of comparison is an integer mask.  */
3765   if (mode != cmpmode
3766       && GET_MODE_CLASS (cmpmode) == MODE_INT)
3767     {
3768       gcc_assert (ix86_valid_mask_cmp_mode (mode));
3769       /* Using scalar/vector move with mask register.  */
3770       cmp = force_reg (cmpmode, cmp);
3771       /* Optimize for mask zero.  */
3772       op_true = (op_true != CONST0_RTX (mode)
3773 		 ? force_reg (mode, op_true) : op_true);
3774       op_false = (op_false != CONST0_RTX (mode)
3775 		  ? force_reg (mode, op_false) : op_false);
3776       if (op_true == CONST0_RTX (mode))
3777 	{
3778 	  rtx n = gen_reg_rtx (cmpmode);
3779 	  if (cmpmode == E_DImode && !TARGET_64BIT)
3780 	    emit_insn (gen_knotdi (n, cmp));
3781 	  else
3782 	    emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
3783 	  cmp = n;
3784 	  /* Reverse op_true and op_false.  */
3785 	  std::swap (op_true, op_false);
3786 	}
3787 
3788       if (mode == HFmode)
3789 	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
3790       else
3791 	{
3792 	  rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3793 	  emit_insn (gen_rtx_SET (dest, vec_merge));
3794 	}
3795       return;
3796     }
3797   else if (vector_all_ones_operand (op_true, mode)
3798 	   && op_false == CONST0_RTX (mode))
3799     {
3800       emit_insn (gen_rtx_SET (dest, cmp));
3801       return;
3802     }
3803   else if (op_false == CONST0_RTX (mode))
3804     {
3805       op_true = force_reg (mode, op_true);
3806       x = gen_rtx_AND (mode, cmp, op_true);
3807       emit_insn (gen_rtx_SET (dest, x));
3808       return;
3809     }
3810   else if (op_true == CONST0_RTX (mode))
3811     {
3812       op_false = force_reg (mode, op_false);
3813       x = gen_rtx_NOT (mode, cmp);
3814       x = gen_rtx_AND (mode, x, op_false);
3815       emit_insn (gen_rtx_SET (dest, x));
3816       return;
3817     }
3818   else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3819     {
3820       op_false = force_reg (mode, op_false);
3821       x = gen_rtx_IOR (mode, cmp, op_false);
3822       emit_insn (gen_rtx_SET (dest, x));
3823       return;
3824     }
3825   else if (TARGET_XOP)
3826     {
3827       op_true = force_reg (mode, op_true);
3828 
3829       if (GET_MODE_SIZE (mode) < 16
3830 	  || !nonimmediate_operand (op_false, mode))
3831 	op_false = force_reg (mode, op_false);
3832 
3833       emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3834 							  op_true,
3835 							  op_false)));
3836       return;
3837     }
3838 
3839   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3840   rtx d = dest;
3841 
3842   if (!vector_operand (op_true, mode))
3843     op_true = force_reg (mode, op_true);
3844 
3845   op_false = force_reg (mode, op_false);
3846 
3847   switch (mode)
3848     {
3849     case E_V2SFmode:
3850       if (TARGET_SSE4_1)
3851 	{
3852 	  gen = gen_mmx_blendvps;
3853 	  op_true = force_reg (mode, op_true);
3854 	}
3855       break;
3856     case E_V4SFmode:
3857       if (TARGET_SSE4_1)
3858 	gen = gen_sse4_1_blendvps;
3859       break;
3860     case E_V2DFmode:
3861       if (TARGET_SSE4_1)
3862 	gen = gen_sse4_1_blendvpd;
3863       break;
3864     case E_SFmode:
3865       if (TARGET_SSE4_1)
3866 	{
3867 	  gen = gen_sse4_1_blendvss;
3868 	  op_true = force_reg (mode, op_true);
3869 	}
3870       break;
3871     case E_DFmode:
3872       if (TARGET_SSE4_1)
3873 	{
3874 	  gen = gen_sse4_1_blendvsd;
3875 	  op_true = force_reg (mode, op_true);
3876 	}
3877       break;
3878     case E_V8QImode:
3879     case E_V4HImode:
3880     case E_V2SImode:
3881       if (TARGET_SSE4_1)
3882 	{
3883 	  op_true = force_reg (mode, op_true);
3884 
3885 	  gen = gen_mmx_pblendvb64;
3886 	  if (mode != V8QImode)
3887 	    d = gen_reg_rtx (V8QImode);
3888 	  op_false = gen_lowpart (V8QImode, op_false);
3889 	  op_true = gen_lowpart (V8QImode, op_true);
3890 	  cmp = gen_lowpart (V8QImode, cmp);
3891 	}
3892       break;
3893     case E_V4QImode:
3894     case E_V2HImode:
3895       if (TARGET_SSE4_1)
3896 	{
3897 	  op_true = force_reg (mode, op_true);
3898 
3899 	  gen = gen_mmx_pblendvb32;
3900 	  if (mode != V4QImode)
3901 	    d = gen_reg_rtx (V4QImode);
3902 	  op_false = gen_lowpart (V4QImode, op_false);
3903 	  op_true = gen_lowpart (V4QImode, op_true);
3904 	  cmp = gen_lowpart (V4QImode, cmp);
3905 	}
3906       break;
3907     case E_V16QImode:
3908     case E_V8HImode:
3909     case E_V8HFmode:
3910     case E_V4SImode:
3911     case E_V2DImode:
3912       if (TARGET_SSE4_1)
3913 	{
3914 	  gen = gen_sse4_1_pblendvb;
3915 	  if (mode != V16QImode)
3916 	    d = gen_reg_rtx (V16QImode);
3917 	  op_false = gen_lowpart (V16QImode, op_false);
3918 	  op_true = gen_lowpart (V16QImode, op_true);
3919 	  cmp = gen_lowpart (V16QImode, cmp);
3920 	}
3921       break;
3922     case E_V8SFmode:
3923       if (TARGET_AVX)
3924 	gen = gen_avx_blendvps256;
3925       break;
3926     case E_V4DFmode:
3927       if (TARGET_AVX)
3928 	gen = gen_avx_blendvpd256;
3929       break;
3930     case E_V32QImode:
3931     case E_V16HImode:
3932     case E_V16HFmode:
3933     case E_V8SImode:
3934     case E_V4DImode:
3935       if (TARGET_AVX2)
3936 	{
3937 	  gen = gen_avx2_pblendvb;
3938 	  if (mode != V32QImode)
3939 	    d = gen_reg_rtx (V32QImode);
3940 	  op_false = gen_lowpart (V32QImode, op_false);
3941 	  op_true = gen_lowpart (V32QImode, op_true);
3942 	  cmp = gen_lowpart (V32QImode, cmp);
3943 	}
3944       break;
3945 
3946     case E_V64QImode:
3947       gen = gen_avx512bw_blendmv64qi;
3948       break;
3949     case E_V32HImode:
3950       gen = gen_avx512bw_blendmv32hi;
3951       break;
3952     case E_V32HFmode:
3953       gen = gen_avx512bw_blendmv32hf;
3954       break;
3955     case E_V16SImode:
3956       gen = gen_avx512f_blendmv16si;
3957       break;
3958     case E_V8DImode:
3959       gen = gen_avx512f_blendmv8di;
3960       break;
3961     case E_V8DFmode:
3962       gen = gen_avx512f_blendmv8df;
3963       break;
3964     case E_V16SFmode:
3965       gen = gen_avx512f_blendmv16sf;
3966       break;
3967 
3968     default:
3969       break;
3970     }
3971 
3972   if (gen != NULL)
3973     {
3974       emit_insn (gen (d, op_false, op_true, cmp));
3975       if (d != dest)
3976 	emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3977     }
3978   else
3979     {
3980       op_true = force_reg (mode, op_true);
3981 
3982       t2 = gen_reg_rtx (mode);
3983       if (optimize)
3984 	t3 = gen_reg_rtx (mode);
3985       else
3986 	t3 = dest;
3987 
3988       x = gen_rtx_AND (mode, op_true, cmp);
3989       emit_insn (gen_rtx_SET (t2, x));
3990 
3991       x = gen_rtx_NOT (mode, cmp);
3992       x = gen_rtx_AND (mode, x, op_false);
3993       emit_insn (gen_rtx_SET (t3, x));
3994 
3995       x = gen_rtx_IOR (mode, t3, t2);
3996       emit_insn (gen_rtx_SET (dest, x));
3997     }
3998 }
3999 
4000 /* Swap, force into registers, or otherwise massage the two operands
4001    to an sse comparison with a mask result.  Thus we differ a bit from
4002    ix86_prepare_fp_compare_args which expects to produce a flags result.
4003 
4004    The DEST operand exists to help determine whether to commute commutative
4005    operators.  The POP0/POP1 operands are updated in place.  The new
4006    comparison code is returned, or UNKNOWN if not implementable.  */
4007 
4008 static enum rtx_code
4009 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4010 				  rtx *pop0, rtx *pop1)
4011 {
4012   switch (code)
4013     {
4014     case LTGT:
4015     case UNEQ:
4016       /* AVX supports all the needed comparisons.  */
4017       if (TARGET_AVX)
4018 	break;
4019       /* We have no LTGT as an operator.  We could implement it with
4020 	 NE & ORDERED, but this requires an extra temporary.  It's
4021 	 not clear that it's worth it.  */
4022       return UNKNOWN;
4023 
4024     case LT:
4025     case LE:
4026     case UNGT:
4027     case UNGE:
4028       /* These are supported directly.  */
4029       break;
4030 
4031     case EQ:
4032     case NE:
4033     case UNORDERED:
4034     case ORDERED:
4035       /* AVX has 3-operand comparisons; no need to swap anything.  */
4036       if (TARGET_AVX)
4037 	break;
4038       /* For commutative operators, try to canonicalize the destination
4039 	 operand to be first in the comparison - this helps reload to
4040 	 avoid extra moves.  */
4041       if (!dest || !rtx_equal_p (dest, *pop1))
4042 	break;
4043       /* FALLTHRU */
4044 
4045     case GE:
4046     case GT:
4047     case UNLE:
4048     case UNLT:
4049       /* These are not supported directly before AVX, and furthermore
4050 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
4051 	 comparison operands to transform into something that is
4052 	 supported.  */
4053       std::swap (*pop0, *pop1);
4054       code = swap_condition (code);
4055       break;
4056 
4057     default:
4058       gcc_unreachable ();
4059     }
4060 
4061   return code;
4062 }
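
/* For example (a sketch of the pre-AVX case): "a > b" has no direct
   cmpps/cmpss predicate, so the operands are swapped and the code becomes
   "b < a", which maps onto the supported LT predicate; likewise GE becomes
   LE, and UNLE/UNLT become UNGE/UNGT with swapped operands.  */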
4063 
4064 /* Expand a floating-point conditional move.  Return true if successful.  */
4065 
4066 bool
4067 ix86_expand_fp_movcc (rtx operands[])
4068 {
4069   machine_mode mode = GET_MODE (operands[0]);
4070   enum rtx_code code = GET_CODE (operands[1]);
4071   rtx tmp, compare_op;
4072   rtx op0 = XEXP (operands[1], 0);
4073   rtx op1 = XEXP (operands[1], 1);
4074 
4075   if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4076     {
4077       machine_mode cmode;
4078 
4079       /* Since we've no cmove for sse registers, don't force bad register
4080 	 allocation just to gain access to it.  Deny movcc when the
4081 	 comparison mode doesn't match the move mode.  */
4082       cmode = GET_MODE (op0);
4083       if (cmode == VOIDmode)
4084 	cmode = GET_MODE (op1);
4085       if (cmode != mode)
4086 	return false;
4087 
4088       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4089       if (code == UNKNOWN)
4090 	return false;
4091 
4092       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4093 				     operands[2], operands[3]))
4094 	return true;
4095 
4096       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4097 				 operands[2], operands[3]);
4098       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4099       return true;
4100     }
4101 
4102   if (GET_MODE (op0) == TImode
4103       || (GET_MODE (op0) == DImode
4104 	  && !TARGET_64BIT))
4105     return false;
4106 
4107   /* The floating point conditional move instructions don't directly
4108      support conditions resulting from a signed integer comparison.  */
4109 
4110   compare_op = ix86_expand_compare (code, op0, op1);
4111   if (!fcmov_comparison_operator (compare_op, VOIDmode))
4112     {
4113       tmp = gen_reg_rtx (QImode);
4114       ix86_expand_setcc (tmp, code, op0, op1);
4115 
4116       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4117     }
4118 
4119   emit_insn (gen_rtx_SET (operands[0],
4120 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
4121 						operands[2], operands[3])));
4122 
4123   return true;
4124 }
4125 
4126 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
4127 
4128 static int
4129 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4130 {
4131   switch (code)
4132     {
4133     case EQ:
4134       return 0;
4135     case LT:
4136     case LTU:
4137       return 1;
4138     case LE:
4139     case LEU:
4140       return 2;
4141     case NE:
4142       return 4;
4143     case GE:
4144     case GEU:
4145       return 5;
4146     case GT:
4147     case GTU:
4148       return 6;
4149     default:
4150       gcc_unreachable ();
4151     }
4152 }
4153 
4154 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
4155 
4156 static int
4157 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4158 {
4159   switch (code)
4160     {
4161     case EQ:
4162       return 0x00;
4163     case NE:
4164       return 0x04;
4165     case GT:
4166       return 0x0e;
4167     case LE:
4168       return 0x02;
4169     case GE:
4170       return 0x0d;
4171     case LT:
4172       return 0x01;
4173     case UNLE:
4174       return 0x0a;
4175     case UNLT:
4176       return 0x09;
4177     case UNGE:
4178       return 0x05;
4179     case UNGT:
4180       return 0x06;
4181     case UNEQ:
4182       return 0x18;
4183     case LTGT:
4184       return 0x0c;
4185     case ORDERED:
4186       return 0x07;
4187     case UNORDERED:
4188       return 0x03;
4189     default:
4190       gcc_unreachable ();
4191     }
4192 }
4193 
4194 /* Return immediate value to be used in UNSPEC_PCMP
4195    for comparison CODE in MODE.  */
4196 
4197 static int
4198 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4199 {
4200   if (FLOAT_MODE_P (mode))
4201     return ix86_fp_cmp_code_to_pcmp_immediate (code);
4202   return ix86_int_cmp_code_to_pcmp_immediate (code);
4203 }
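
/* A sketch of how the encodings above are used: for a float vector mode LT
   maps to 0x01 and UNGT to 0x06, matching the VCMPPS/VCMPPD predicates
   _CMP_LT_OS and _CMP_NLE_US, while for an integer vector mode GTU maps to
   6, the VPCMPU "not less or equal" (i.e. greater-than) predicate.  */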
4204 
4205 /* Expand AVX-512 vector comparison.  */
4206 
4207 bool
4208 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4209 {
4210   machine_mode mask_mode = GET_MODE (dest);
4211   machine_mode cmp_mode = GET_MODE (cmp_op0);
4212   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4213   int unspec_code;
4214   rtx unspec;
4215 
4216   switch (code)
4217     {
4218     case LEU:
4219     case GTU:
4220     case GEU:
4221     case LTU:
4222       unspec_code = UNSPEC_UNSIGNED_PCMP;
4223       break;
4224 
4225     default:
4226       unspec_code = UNSPEC_PCMP;
4227     }
4228 
4229   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4230 			   unspec_code);
4231   emit_insn (gen_rtx_SET (dest, unspec));
4232 
4233   return true;
4234 }
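
/* The result is a single UNSPEC whose mask-mode value then feeds a blend or
   mask move; e.g. for a V16SImode GTU comparison this emits roughly
     (set (reg:HI mask)
	  (unspec:HI [(reg:V16SI op0) (reg:V16SI op1) (const_int 6)]
		     UNSPEC_UNSIGNED_PCMP))
   which the sse.md patterns match as a vpcmpud-style instruction.  */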
4235 
4236 /* Expand fp vector comparison.  */
4237 
4238 bool
4239 ix86_expand_fp_vec_cmp (rtx operands[])
4240 {
4241   enum rtx_code code = GET_CODE (operands[1]);
4242   rtx cmp;
4243 
4244   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4245 					   &operands[2], &operands[3]);
4246   if (code == UNKNOWN)
4247     {
4248       rtx temp;
4249       switch (GET_CODE (operands[1]))
4250 	{
4251 	case LTGT:
4252 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4253 				      operands[3], NULL, NULL);
4254 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4255 				     operands[3], NULL, NULL);
4256 	  code = AND;
4257 	  break;
4258 	case UNEQ:
4259 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4260 				      operands[3], NULL, NULL);
4261 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4262 				     operands[3], NULL, NULL);
4263 	  code = IOR;
4264 	  break;
4265 	default:
4266 	  gcc_unreachable ();
4267 	}
4268       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4269 				 OPTAB_DIRECT);
4270     }
4271   else
4272     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4273 			       NULL, NULL);
4274 
4275   if (operands[0] != cmp)
4276     emit_move_insn (operands[0], cmp);
4277 
4278   return true;
4279 }
4280 
4281 static rtx
4282 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4283 			 rtx op_true, rtx op_false, bool *negate)
4284 {
4285   machine_mode data_mode = GET_MODE (dest);
4286   machine_mode mode = GET_MODE (cop0);
4287   rtx x;
4288 
4289   *negate = false;
4290 
4291   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
4292   if (TARGET_XOP
4293       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4294       && GET_MODE_SIZE (mode) <= 16)
4295     ;
4296   /* AVX512F supports all of the comparisons
4297      on all 128/256/512-bit vector int types.  */
4298   else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4299     ;
4300   else
4301     {
4302       /* Canonicalize the comparison to EQ, GT, GTU.  */
4303       switch (code)
4304 	{
4305 	case EQ:
4306 	case GT:
4307 	case GTU:
4308 	  break;
4309 
4310 	case NE:
4311 	case LE:
4312 	case LEU:
4313 	  code = reverse_condition (code);
4314 	  *negate = true;
4315 	  break;
4316 
4317 	case GE:
4318 	case GEU:
4319 	  code = reverse_condition (code);
4320 	  *negate = true;
4321 	  /* FALLTHRU */
4322 
4323 	case LT:
4324 	case LTU:
4325 	  std::swap (cop0, cop1);
4326 	  code = swap_condition (code);
4327 	  break;
4328 
4329 	default:
4330 	  gcc_unreachable ();
4331 	}
4332 
4333       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
4334       if (mode == V2DImode)
4335 	{
4336 	  switch (code)
4337 	    {
4338 	    case EQ:
4339 	      /* SSE4.1 supports EQ.  */
4340 	      if (!TARGET_SSE4_1)
4341 		return NULL;
4342 	      break;
4343 
4344 	    case GT:
4345 	    case GTU:
4346 	      /* SSE4.2 supports GT/GTU.  */
4347 	      if (!TARGET_SSE4_2)
4348 		return NULL;
4349 	      break;
4350 
4351 	    default:
4352 	      gcc_unreachable ();
4353 	    }
4354 	}
4355 
4356       rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4357       rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4358       if (*negate)
4359 	std::swap (optrue, opfalse);
4360 
4361       /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4362 	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4363 	 min (x, y) == x).  While we add one instruction (the minimum),
4364 	 we remove the two instructions that the negation would otherwise
4365 	 need, since the result already has the right sense.
4366 	 When using masks, do it for SI/DImode element types, as it is shorter
4367 	 than the two subtractions.  */
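      /* A concrete sketch: for an unsigned V8HImode "x <= y" mask (LEU),
	 the code below emits
	   pminuw  t, x, y
	   pcmpeqw mask, t, x
	 so mask is -1 exactly where min (x, y) == x, i.e. x <= y, with no
	 extra negation (pminuw requires SSE4.1).  */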
4368       if ((code != EQ
4369 	   && GET_MODE_SIZE (mode) != 64
4370 	   && vector_all_ones_operand (opfalse, data_mode)
4371 	   && optrue == CONST0_RTX (data_mode))
4372 	  || (code == GTU
4373 	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4374 	      /* Don't do it when not using integer masks if we'd already
4375 		 end up with the right values in the registers anyway.  */
4376 	      && (GET_MODE_SIZE (mode) == 64
4377 		  || !vector_all_ones_operand (optrue, data_mode)
4378 		  || opfalse != CONST0_RTX (data_mode))))
4379 	{
4380 	  rtx (*gen) (rtx, rtx, rtx) = NULL;
4381 
4382 	  switch (mode)
4383 	    {
4384 	    case E_V16SImode:
4385 	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4386 	      break;
4387 	    case E_V8DImode:
4388 	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4389 	      cop0 = force_reg (mode, cop0);
4390 	      cop1 = force_reg (mode, cop1);
4391 	      break;
4392 	    case E_V32QImode:
4393 	      if (TARGET_AVX2)
4394 		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4395 	      break;
4396 	    case E_V16HImode:
4397 	      if (TARGET_AVX2)
4398 		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4399 	      break;
4400 	    case E_V8SImode:
4401 	      if (TARGET_AVX2)
4402 		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4403 	      break;
4404 	    case E_V4DImode:
4405 	      if (TARGET_AVX512VL)
4406 		{
4407 		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4408 		  cop0 = force_reg (mode, cop0);
4409 		  cop1 = force_reg (mode, cop1);
4410 		}
4411 	      break;
4412 	    case E_V16QImode:
4413 	      if (code == GTU && TARGET_SSE2)
4414 		gen = gen_uminv16qi3;
4415 	      else if (code == GT && TARGET_SSE4_1)
4416 		gen = gen_sminv16qi3;
4417 	      break;
4418 	    case E_V8QImode:
4419 	      if (code == GTU && TARGET_SSE2)
4420 		gen = gen_uminv8qi3;
4421 	      else if (code == GT && TARGET_SSE4_1)
4422 		gen = gen_sminv8qi3;
4423 	      break;
4424 	    case E_V4QImode:
4425 	      if (code == GTU && TARGET_SSE2)
4426 		gen = gen_uminv4qi3;
4427 	      else if (code == GT && TARGET_SSE4_1)
4428 		gen = gen_sminv4qi3;
4429 	      break;
4430 	    case E_V8HImode:
4431 	      if (code == GTU && TARGET_SSE4_1)
4432 		gen = gen_uminv8hi3;
4433 	      else if (code == GT && TARGET_SSE2)
4434 		gen = gen_sminv8hi3;
4435 	      break;
4436 	    case E_V4HImode:
4437 	      if (code == GTU && TARGET_SSE4_1)
4438 		gen = gen_uminv4hi3;
4439 	      else if (code == GT && TARGET_SSE2)
4440 		gen = gen_sminv4hi3;
4441 	      break;
4442 	    case E_V2HImode:
4443 	      if (code == GTU && TARGET_SSE4_1)
4444 		gen = gen_uminv2hi3;
4445 	      else if (code == GT && TARGET_SSE2)
4446 		gen = gen_sminv2hi3;
4447 	      break;
4448 	    case E_V4SImode:
4449 	      if (TARGET_SSE4_1)
4450 		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4451 	      break;
4452 	    case E_V2SImode:
4453 	      if (TARGET_SSE4_1)
4454 		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4455 	      break;
4456 	    case E_V2DImode:
4457 	      if (TARGET_AVX512VL)
4458 		{
4459 		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4460 		  cop0 = force_reg (mode, cop0);
4461 		  cop1 = force_reg (mode, cop1);
4462 		}
4463 	      break;
4464 	    default:
4465 	      break;
4466 	    }
4467 
4468 	  if (gen)
4469 	    {
4470 	      rtx tem = gen_reg_rtx (mode);
4471 	      if (!vector_operand (cop0, mode))
4472 		cop0 = force_reg (mode, cop0);
4473 	      if (!vector_operand (cop1, mode))
4474 		cop1 = force_reg (mode, cop1);
4475 	      *negate = !*negate;
4476 	      emit_insn (gen (tem, cop0, cop1));
4477 	      cop1 = tem;
4478 	      code = EQ;
4479 	    }
4480 	}
4481 
4482       /* Unsigned parallel compare is not supported by the hardware.
4483 	 Play some tricks to turn this into a signed comparison
4484 	 against 0.  */
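      /* Sketched for 32-bit elements: x >u y iff
	 (x - 0x80000000) >s (y - 0x80000000), so both operands are biased
	 by the sign-bit mask and an ordinary signed pcmpgtd is used.  For
	 the narrow element types below, a saturating psubus followed by a
	 compare against zero is used instead: the subtraction is nonzero
	 exactly when x >u y, which the EQ-plus-negate path picks up.  */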
4485       if (code == GTU)
4486 	{
4487 	  cop0 = force_reg (mode, cop0);
4488 
4489 	  switch (mode)
4490 	    {
4491 	    case E_V16SImode:
4492 	    case E_V8DImode:
4493 	    case E_V8SImode:
4494 	    case E_V4DImode:
4495 	    case E_V4SImode:
4496 	    case E_V2SImode:
4497 	    case E_V2DImode:
4498 		{
4499 		  rtx t1, t2, mask;
4500 
4501 		  /* Subtract (-(INT MAX) - 1) from both operands to make
4502 		     them signed.  */
4503 		  mask = ix86_build_signbit_mask (mode, true, false);
4504 		  t1 = gen_reg_rtx (mode);
4505 		  emit_insn (gen_sub3_insn (t1, cop0, mask));
4506 
4507 		  t2 = gen_reg_rtx (mode);
4508 		  emit_insn (gen_sub3_insn (t2, cop1, mask));
4509 
4510 		  cop0 = t1;
4511 		  cop1 = t2;
4512 		  code = GT;
4513 		}
4514 	      break;
4515 
4516 	    case E_V64QImode:
4517 	    case E_V32HImode:
4518 	    case E_V32QImode:
4519 	    case E_V16HImode:
4520 	    case E_V16QImode:
4521 	    case E_V8QImode:
4522 	    case E_V4QImode:
4523 	    case E_V8HImode:
4524 	    case E_V4HImode:
4525 	    case E_V2HImode:
4526 	      /* Perform a parallel unsigned saturating subtraction.  */
4527 	      x = gen_reg_rtx (mode);
4528 	      emit_insn (gen_rtx_SET
4529 			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4530 	      cop0 = x;
4531 	      cop1 = CONST0_RTX (mode);
4532 	      code = EQ;
4533 	      *negate = !*negate;
4534 	      break;
4535 
4536 	    default:
4537 	      gcc_unreachable ();
4538 	    }
4539 	}
4540     }
4541 
4542   if (*negate)
4543     std::swap (op_true, op_false);
4544 
4545   /* Allow the comparison to be done in one mode, but the movcc to
4546      happen in another mode.  */
4547   if (data_mode == mode)
4548     {
4549       x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4550 			       op_true, op_false);
4551     }
4552   else
4553     {
4554       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4555       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4556 			       op_true, op_false);
4557       if (GET_MODE (x) == mode)
4558 	x = gen_lowpart (data_mode, x);
4559     }
4560 
4561   return x;
4562 }
4563 
4564 /* Expand integer vector comparison.  */
4565 
4566 bool
4567 ix86_expand_int_vec_cmp (rtx operands[])
4568 {
4569   rtx_code code = GET_CODE (operands[1]);
4570   bool negate = false;
4571   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4572 				     operands[3], NULL, NULL, &negate);
4573 
4574   if (!cmp)
4575     return false;
4576 
4577   if (negate)
4578     cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4579 				   CONST0_RTX (GET_MODE (cmp)),
4580 				   NULL, NULL, &negate);
4581 
4582   gcc_assert (!negate);
4583 
4584   if (operands[0] != cmp)
4585     emit_move_insn (operands[0], cmp);
4586 
4587   return true;
4588 }
4589 
4590 /* Expand a floating-point vector conditional move; a vcond operation
4591    rather than a movcc operation.  */
4592 
4593 bool
4594 ix86_expand_fp_vcond (rtx operands[])
4595 {
4596   enum rtx_code code = GET_CODE (operands[3]);
4597   rtx cmp;
4598 
4599   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4600 					   &operands[4], &operands[5]);
4601   if (code == UNKNOWN)
4602     {
4603       rtx temp;
4604       switch (GET_CODE (operands[3]))
4605 	{
4606 	case LTGT:
4607 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4608 				      operands[5], operands[0], operands[0]);
4609 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4610 				     operands[5], operands[1], operands[2]);
4611 	  code = AND;
4612 	  break;
4613 	case UNEQ:
4614 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4615 				      operands[5], operands[0], operands[0]);
4616 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4617 				     operands[5], operands[1], operands[2]);
4618 	  code = IOR;
4619 	  break;
4620 	default:
4621 	  gcc_unreachable ();
4622 	}
4623       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4624 				 OPTAB_DIRECT);
4625       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4626       return true;
4627     }
4628 
4629   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4630 				 operands[5], operands[1], operands[2]))
4631     return true;
4632 
4633   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4634 			     operands[1], operands[2]);
4635   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4636   return true;
4637 }
4638 
4639 /* Expand a signed/unsigned integral vector conditional move.  */
4640 
4641 bool
4642 ix86_expand_int_vcond (rtx operands[])
4643 {
4644   machine_mode data_mode = GET_MODE (operands[0]);
4645   machine_mode mode = GET_MODE (operands[4]);
4646   enum rtx_code code = GET_CODE (operands[3]);
4647   bool negate = false;
4648   rtx x, cop0, cop1;
4649 
4650   cop0 = operands[4];
4651   cop1 = operands[5];
4652 
4653   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4654      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
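  /* E.g. for V4SImode (a sketch): "x < 0 ? -1 : 0" becomes an arithmetic
     shift, psrad $31, and "x < 0 ? 1 : 0" a logical shift, psrld $31,
     avoiding the compare and blend entirely.  */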
4655   if ((code == LT || code == GE)
4656       && data_mode == mode
4657       && cop1 == CONST0_RTX (mode)
4658       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4659       && GET_MODE_UNIT_SIZE (data_mode) > 1
4660       && GET_MODE_UNIT_SIZE (data_mode) <= 8
4661       && (GET_MODE_SIZE (data_mode) == 16
4662 	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4663     {
4664       rtx negop = operands[2 - (code == LT)];
4665       int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4666       if (negop == CONST1_RTX (data_mode))
4667 	{
4668 	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4669 					 operands[0], 1, OPTAB_DIRECT);
4670 	  if (res != operands[0])
4671 	    emit_move_insn (operands[0], res);
4672 	  return true;
4673 	}
4674       else if (GET_MODE_INNER (data_mode) != DImode
4675 	       && vector_all_ones_operand (negop, data_mode))
4676 	{
4677 	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4678 					 operands[0], 0, OPTAB_DIRECT);
4679 	  if (res != operands[0])
4680 	    emit_move_insn (operands[0], res);
4681 	  return true;
4682 	}
4683     }
4684 
4685   if (!nonimmediate_operand (cop1, mode))
4686     cop1 = force_reg (mode, cop1);
4687   if (!general_operand (operands[1], data_mode))
4688     operands[1] = force_reg (data_mode, operands[1]);
4689   if (!general_operand (operands[2], data_mode))
4690     operands[2] = force_reg (data_mode, operands[2]);
4691 
4692   x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4693 			       operands[1], operands[2], &negate);
4694 
4695   if (!x)
4696     return false;
4697 
4698   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4699 			 operands[2-negate]);
4700   return true;
4701 }
4702 
4703 static bool
4704 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4705 			      struct expand_vec_perm_d *d)
4706 {
4707   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4708      expander, so args are either in d, or in op0, op1 etc.  */
4709   machine_mode mode = GET_MODE (d ? d->op0 : op0);
4710   machine_mode maskmode = mode;
4711   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4712 
4713   switch (mode)
4714     {
4715     case E_V16QImode:
4716       if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4717 	gen = gen_avx512vl_vpermt2varv16qi3;
4718       break;
4719     case E_V32QImode:
4720       if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4721 	gen = gen_avx512vl_vpermt2varv32qi3;
4722       break;
4723     case E_V64QImode:
4724       if (TARGET_AVX512VBMI)
4725 	gen = gen_avx512bw_vpermt2varv64qi3;
4726       break;
4727     case E_V8HImode:
4728       if (TARGET_AVX512VL && TARGET_AVX512BW)
4729 	gen = gen_avx512vl_vpermt2varv8hi3;
4730       break;
4731     case E_V16HImode:
4732       if (TARGET_AVX512VL && TARGET_AVX512BW)
4733 	gen = gen_avx512vl_vpermt2varv16hi3;
4734       break;
4735     case E_V32HImode:
4736       if (TARGET_AVX512BW)
4737 	gen = gen_avx512bw_vpermt2varv32hi3;
4738       break;
4739     case E_V4SImode:
4740       if (TARGET_AVX512VL)
4741 	gen = gen_avx512vl_vpermt2varv4si3;
4742       break;
4743     case E_V8SImode:
4744       if (TARGET_AVX512VL)
4745 	gen = gen_avx512vl_vpermt2varv8si3;
4746       break;
4747     case E_V16SImode:
4748       if (TARGET_AVX512F)
4749 	gen = gen_avx512f_vpermt2varv16si3;
4750       break;
4751     case E_V4SFmode:
4752       if (TARGET_AVX512VL)
4753 	{
4754 	  gen = gen_avx512vl_vpermt2varv4sf3;
4755 	  maskmode = V4SImode;
4756 	}
4757       break;
4758     case E_V8SFmode:
4759       if (TARGET_AVX512VL)
4760 	{
4761 	  gen = gen_avx512vl_vpermt2varv8sf3;
4762 	  maskmode = V8SImode;
4763 	}
4764       break;
4765     case E_V16SFmode:
4766       if (TARGET_AVX512F)
4767 	{
4768 	  gen = gen_avx512f_vpermt2varv16sf3;
4769 	  maskmode = V16SImode;
4770 	}
4771       break;
4772     case E_V2DImode:
4773       if (TARGET_AVX512VL)
4774 	gen = gen_avx512vl_vpermt2varv2di3;
4775       break;
4776     case E_V4DImode:
4777       if (TARGET_AVX512VL)
4778 	gen = gen_avx512vl_vpermt2varv4di3;
4779       break;
4780     case E_V8DImode:
4781       if (TARGET_AVX512F)
4782 	gen = gen_avx512f_vpermt2varv8di3;
4783       break;
4784     case E_V2DFmode:
4785       if (TARGET_AVX512VL)
4786 	{
4787 	  gen = gen_avx512vl_vpermt2varv2df3;
4788 	  maskmode = V2DImode;
4789 	}
4790       break;
4791     case E_V4DFmode:
4792       if (TARGET_AVX512VL)
4793 	{
4794 	  gen = gen_avx512vl_vpermt2varv4df3;
4795 	  maskmode = V4DImode;
4796 	}
4797       break;
4798     case E_V8DFmode:
4799       if (TARGET_AVX512F)
4800 	{
4801 	  gen = gen_avx512f_vpermt2varv8df3;
4802 	  maskmode = V8DImode;
4803 	}
4804       break;
4805     default:
4806       break;
4807     }
4808 
4809   if (gen == NULL)
4810     return false;
4811 
4812   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4813      expander, so args are either in d, or in op0, op1 etc.  */
4814   if (d)
4815     {
4816       rtx vec[64];
4817       target = d->target;
4818       op0 = d->op0;
4819       op1 = d->op1;
4820       for (int i = 0; i < d->nelt; ++i)
4821 	vec[i] = GEN_INT (d->perm[i]);
4822       mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4823     }
4824 
4825   emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4826   return true;
4827 }
4828 
4829 /* Expand a variable vector permutation.  */
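
/* This is typically reached (one possible route, sketched) from the
   vec_perm standard pattern for GNU C such as

     typedef int v8si __attribute__ ((vector_size (32)));
     v8si f (v8si a, v8si b, v8si sel)
     { return __builtin_shuffle (a, b, sel); }

   where SEL is not a compile-time constant, so one of the variable-mask
   sequences below (vpermd, vpshufb, vpperm, ...) must be used.  */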
4830 
4831 void
4832 ix86_expand_vec_perm (rtx operands[])
4833 {
4834   rtx target = operands[0];
4835   rtx op0 = operands[1];
4836   rtx op1 = operands[2];
4837   rtx mask = operands[3];
4838   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4839   machine_mode mode = GET_MODE (op0);
4840   machine_mode maskmode = GET_MODE (mask);
4841   int w, e, i;
4842   bool one_operand_shuffle = rtx_equal_p (op0, op1);
4843 
4844   /* Number of elements in the vector.  */
4845   w = GET_MODE_NUNITS (mode);
4846   e = GET_MODE_UNIT_SIZE (mode);
4847   gcc_assert (w <= 64);
4848 
4849   /* For HF mode vector, convert it to HI using subreg.  */
4850   if (GET_MODE_INNER (mode) == HFmode)
4851     {
4852       machine_mode orig_mode = mode;
4853       mode = mode_for_vector (HImode, w).require ();
4854       target = lowpart_subreg (mode, target, orig_mode);
4855       op0 = lowpart_subreg (mode, op0, orig_mode);
4856       op1 = lowpart_subreg (mode, op1, orig_mode);
4857     }
4858 
4859   if (TARGET_AVX512F && one_operand_shuffle)
4860     {
4861       rtx (*gen) (rtx, rtx, rtx) = NULL;
4862       switch (mode)
4863 	{
4864 	case E_V16SImode:
4865 	  gen = gen_avx512f_permvarv16si;
4866 	  break;
4867 	case E_V16SFmode:
4868 	  gen = gen_avx512f_permvarv16sf;
4869 	  break;
4870 	case E_V8DImode:
4871 	  gen = gen_avx512f_permvarv8di;
4872 	  break;
4873 	case E_V8DFmode:
4874 	  gen = gen_avx512f_permvarv8df;
4875 	  break;
4876 	default:
4877 	  break;
4878 	}
4879       if (gen != NULL)
4880 	{
4881 	  emit_insn (gen (target, op0, mask));
4882 	  return;
4883 	}
4884     }
4885 
4886   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4887     return;
4888 
4889   if (TARGET_AVX2)
4890     {
4891       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4892 	{
4893 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4894 	     a constant shuffle operand.  With a tiny bit of effort we can
4895 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
4896 	     unfortunate but there's no avoiding it.
4897 	     Similarly for V16HImode we don't have instructions for variable
4898 	     shuffling, while for V32QImode we can, after preparing suitable
4899 	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
4900 
4901 	  if (mode == V16HImode)
4902 	    {
4903 	      maskmode = mode = V32QImode;
4904 	      w = 32;
4905 	      e = 1;
4906 	    }
4907 	  else
4908 	    {
4909 	      maskmode = mode = V8SImode;
4910 	      w = 8;
4911 	      e = 4;
4912 	    }
4913 	  t1 = gen_reg_rtx (maskmode);
4914 
4915 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
4916 	       mask = { A B C D }
4917 	       t1 = { A A B B C C D D }.  */
4918 	  for (i = 0; i < w / 2; ++i)
4919 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4920 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4921 	  vt = force_reg (maskmode, vt);
4922 	  mask = gen_lowpart (maskmode, mask);
4923 	  if (maskmode == V8SImode)
4924 	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4925 	  else
4926 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4927 
4928 	  /* Multiply the shuffle indices by two.  */
4929 	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4930 				    OPTAB_DIRECT);
4931 
4932 	  /* Add one to the odd shuffle indices:
4933 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
4934 	  for (i = 0; i < w / 2; ++i)
4935 	    {
4936 	      vec[i * 2] = const0_rtx;
4937 	      vec[i * 2 + 1] = const1_rtx;
4938 	    }
4939 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4940 	  vt = validize_mem (force_const_mem (maskmode, vt));
4941 	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4942 				    OPTAB_DIRECT);
4943 
4944 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
4945 	  operands[3] = mask = t1;
4946 	  target = gen_reg_rtx (mode);
4947 	  op0 = gen_lowpart (mode, op0);
4948 	  op1 = gen_lowpart (mode, op1);
4949 	}
4950 
4951       switch (mode)
4952 	{
4953 	case E_V8SImode:
4954 	  /* The VPERMD and VPERMPS instructions already properly ignore
4955 	     the high bits of the shuffle elements.  No need for us to
4956 	     perform an AND ourselves.  */
4957 	  if (one_operand_shuffle)
4958 	    {
4959 	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4960 	      if (target != operands[0])
4961 		emit_move_insn (operands[0],
4962 				gen_lowpart (GET_MODE (operands[0]), target));
4963 	    }
4964 	  else
4965 	    {
4966 	      t1 = gen_reg_rtx (V8SImode);
4967 	      t2 = gen_reg_rtx (V8SImode);
4968 	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4969 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4970 	      goto merge_two;
4971 	    }
4972 	  return;
4973 
4974 	case E_V8SFmode:
4975 	  mask = gen_lowpart (V8SImode, mask);
4976 	  if (one_operand_shuffle)
4977 	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4978 	  else
4979 	    {
4980 	      t1 = gen_reg_rtx (V8SFmode);
4981 	      t2 = gen_reg_rtx (V8SFmode);
4982 	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4983 	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4984 	      goto merge_two;
4985 	    }
4986 	  return;
4987 
4988         case E_V4SImode:
4989 	  /* By combining the two 128-bit input vectors into one 256-bit
4990 	     input vector, we can use VPERMD and VPERMPS for the full
4991 	     two-operand shuffle.  */
4992 	  t1 = gen_reg_rtx (V8SImode);
4993 	  t2 = gen_reg_rtx (V8SImode);
4994 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4995 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4996 	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4997 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4998 	  return;
4999 
5000         case E_V4SFmode:
5001 	  t1 = gen_reg_rtx (V8SFmode);
5002 	  t2 = gen_reg_rtx (V8SImode);
5003 	  mask = gen_lowpart (V4SImode, mask);
5004 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5005 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5006 	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5007 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5008 	  return;
5009 
5010 	case E_V32QImode:
5011 	  t1 = gen_reg_rtx (V32QImode);
5012 	  t2 = gen_reg_rtx (V32QImode);
5013 	  t3 = gen_reg_rtx (V32QImode);
5014 	  vt2 = GEN_INT (-128);
5015 	  vt = gen_const_vec_duplicate (V32QImode, vt2);
5016 	  vt = force_reg (V32QImode, vt);
5017 	  for (i = 0; i < 32; i++)
5018 	    vec[i] = i < 16 ? vt2 : const0_rtx;
5019 	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5020 	  vt2 = force_reg (V32QImode, vt2);
5021 	  /* From mask create two adjusted masks, which contain the same
5022 	     bits as mask in the low 7 bits of each vector element.
5023 	     The first mask will have the most significant bit clear
5024 	     if it requests element from the same 128-bit lane
5025 	     and MSB set if it requests element from the other 128-bit lane.
5026 	     The second mask will have the opposite values of the MSB,
5027 	     and additionally will have its 128-bit lanes swapped.
5028 	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5029 	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
5030 	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5031 	     stands for the other 12 bytes.  */
5032 	  /* The bit whether element is from the same lane or the other
5033 	     lane is bit 4, so shift it up by 3 to the MSB position.  */
5034 	  t5 = gen_reg_rtx (V4DImode);
5035 	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5036 				    GEN_INT (3)));
5037 	  /* Clear MSB bits from the mask just in case it had them set.  */
5038 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5039 	  /* After this t1 will have MSB set for elements from other lane.  */
5040 	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5041 	  /* Clear bits other than MSB.  */
5042 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
5043 	  /* Or in the lower bits from mask into t3.  */
5044 	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
5045 	  /* And invert MSB bits in t1, so MSB is set for elements from the same
5046 	     lane.  */
5047 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
5048 	  /* Swap 128-bit lanes in t3.  */
5049 	  t6 = gen_reg_rtx (V4DImode);
5050 	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5051 					  const2_rtx, GEN_INT (3),
5052 					  const0_rtx, const1_rtx));
5053 	  /* And or in the lower bits from mask into t1.  */
5054 	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
5055 	  if (one_operand_shuffle)
5056 	    {
5057 	      /* Each of these shuffles will put 0s in places where
5058 		 element from the other 128-bit lane is needed, otherwise
5059 		 will shuffle in the requested value.  */
5060 	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5061 						gen_lowpart (V32QImode, t6)));
5062 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5063 	      /* For t3 the 128-bit lanes are swapped again.  */
5064 	      t7 = gen_reg_rtx (V4DImode);
5065 	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5066 					      const2_rtx, GEN_INT (3),
5067 					      const0_rtx, const1_rtx));
5068 	      /* And oring both together leads to the result.  */
5069 	      emit_insn (gen_iorv32qi3 (target, t1,
5070 					gen_lowpart (V32QImode, t7)));
5071 	      if (target != operands[0])
5072 		emit_move_insn (operands[0],
5073 				gen_lowpart (GET_MODE (operands[0]), target));
5074 	      return;
5075 	    }
5076 
5077 	  t4 = gen_reg_rtx (V32QImode);
5078 	  /* Similar to the above one_operand_shuffle code,
5079 	     just repeated twice, once for each operand.  The merge_two:
5080 	     code below will merge the two results together.  */
5081 	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5082 					    gen_lowpart (V32QImode, t6)));
5083 	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5084 					    gen_lowpart (V32QImode, t6)));
5085 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5086 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5087 	  t7 = gen_reg_rtx (V4DImode);
5088 	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5089 					  const2_rtx, GEN_INT (3),
5090 					  const0_rtx, const1_rtx));
5091 	  t8 = gen_reg_rtx (V4DImode);
5092 	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5093 					  const2_rtx, GEN_INT (3),
5094 					  const0_rtx, const1_rtx));
5095 	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5096 	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5097 	  t1 = t4;
5098 	  t2 = t3;
5099 	  goto merge_two;
5100 
5101 	default:
5102 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
5103 	  break;
5104 	}
5105     }
5106 
5107   if (TARGET_XOP)
5108     {
5109       /* The XOP VPPERM insn supports three inputs.  By ignoring the
5110 	 one_operand_shuffle special case, we avoid creating another
5111 	 set of constant vectors in memory.  */
5112       one_operand_shuffle = false;
5113 
5114       /* mask = mask & {2*w-1, ...} */
5115       vt = GEN_INT (2*w - 1);
5116     }
5117   else
5118     {
5119       /* mask = mask & {w-1, ...} */
5120       vt = GEN_INT (w - 1);
5121     }
5122 
5123   vt = gen_const_vec_duplicate (maskmode, vt);
5124   mask = expand_simple_binop (maskmode, AND, mask, vt,
5125 			      NULL_RTX, 0, OPTAB_DIRECT);
5126 
5127   /* For non-QImode operations, convert the word permutation control
5128      into a byte permutation control.  */
5129   if (mode != V16QImode)
5130     {
5131       mask = expand_simple_binop (maskmode, ASHIFT, mask,
5132 				  GEN_INT (exact_log2 (e)),
5133 				  NULL_RTX, 0, OPTAB_DIRECT);
5134 
5135       /* Convert mask to vector of chars.  */
5136       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5137 
5138       /* Replicate each of the input bytes into byte positions:
5139 	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5140 	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5141 	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
5142       for (i = 0; i < 16; ++i)
5143 	vec[i] = GEN_INT (i/e * e);
5144       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5145       vt = validize_mem (force_const_mem (V16QImode, vt));
5146       if (TARGET_XOP)
5147 	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5148       else
5149 	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5150 
5151       /* Convert it into the byte positions by doing
5152 	 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...}.  */
5153       for (i = 0; i < 16; ++i)
5154 	vec[i] = GEN_INT (i % e);
5155       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5156       vt = validize_mem (force_const_mem (V16QImode, vt));
5157       emit_insn (gen_addv16qi3 (mask, mask, vt));
5158     }
5159 
5160   /* The actual shuffle operations all operate on V16QImode.  */
5161   op0 = gen_lowpart (V16QImode, op0);
5162   op1 = gen_lowpart (V16QImode, op1);
5163 
5164   if (TARGET_XOP)
5165     {
5166       if (GET_MODE (target) != V16QImode)
5167 	target = gen_reg_rtx (V16QImode);
5168       emit_insn (gen_xop_pperm (target, op0, op1, mask));
5169       if (target != operands[0])
5170 	emit_move_insn (operands[0],
5171 			gen_lowpart (GET_MODE (operands[0]), target));
5172     }
5173   else if (one_operand_shuffle)
5174     {
5175       if (GET_MODE (target) != V16QImode)
5176 	target = gen_reg_rtx (V16QImode);
5177       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5178       if (target != operands[0])
5179 	emit_move_insn (operands[0],
5180 			gen_lowpart (GET_MODE (operands[0]), target));
5181     }
5182   else
5183     {
5184       rtx xops[6];
5185       bool ok;
5186 
5187       /* Shuffle the two input vectors independently.  */
5188       t1 = gen_reg_rtx (V16QImode);
5189       t2 = gen_reg_rtx (V16QImode);
5190       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5191       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5192 
5193  merge_two:
5194       /* Then merge them together.  The key is whether any given control
5195          element contained a bit set that indicates the second word.  */
5196       mask = operands[3];
5197       vt = GEN_INT (w);
5198       if (maskmode == V2DImode && !TARGET_SSE4_1)
5199 	{
5200 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
5201 	     more shuffle to convert the V2DI input mask into a V4SI
5202 	     input mask.  At that point the masking done by
5203 	     ix86_expand_int_vcond will work as desired.  */
5204 	  rtx t3 = gen_reg_rtx (V4SImode);
5205 	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5206 				        const0_rtx, const0_rtx,
5207 				        const2_rtx, const2_rtx));
5208 	  mask = t3;
5209 	  maskmode = V4SImode;
5210 	  e = w = 4;
5211 	}
5212 
5213       vt = gen_const_vec_duplicate (maskmode, vt);
5214       vt = force_reg (maskmode, vt);
5215       mask = expand_simple_binop (maskmode, AND, mask, vt,
5216 				  NULL_RTX, 0, OPTAB_DIRECT);
5217 
5218       if (GET_MODE (target) != mode)
5219 	target = gen_reg_rtx (mode);
5220       xops[0] = target;
5221       xops[1] = gen_lowpart (mode, t2);
5222       xops[2] = gen_lowpart (mode, t1);
5223       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5224       xops[4] = mask;
5225       xops[5] = vt;
5226       ok = ix86_expand_int_vcond (xops);
5227       gcc_assert (ok);
5228       if (target != operands[0])
5229 	emit_move_insn (operands[0],
5230 			gen_lowpart (GET_MODE (operands[0]), target));
5231     }
5232 }
5233 
5234 /* Unpack SRC into the next wider integer vector type, storing the result
5235    in DEST.  UNSIGNED_P is true for zero extension, else sign extension.  HIGH_P is
5236    true if we want the N/2 high elements, else the low elements.  */
5237 
5238 void
5239 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5240 {
5241   machine_mode imode = GET_MODE (src);
5242   rtx tmp;
5243 
5244   if (TARGET_SSE4_1)
5245     {
5246       rtx (*unpack)(rtx, rtx);
5247       rtx (*extract)(rtx, rtx) = NULL;
5248       machine_mode halfmode = BLKmode;
5249 
5250       switch (imode)
5251 	{
5252 	case E_V64QImode:
5253 	  if (unsigned_p)
5254 	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5255 	  else
5256 	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5257 	  halfmode = V32QImode;
5258 	  extract
5259 	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5260 	  break;
5261 	case E_V32QImode:
5262 	  if (unsigned_p)
5263 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
5264 	  else
5265 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
5266 	  halfmode = V16QImode;
5267 	  extract
5268 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5269 	  break;
5270 	case E_V32HImode:
5271 	  if (unsigned_p)
5272 	    unpack = gen_avx512f_zero_extendv16hiv16si2;
5273 	  else
5274 	    unpack = gen_avx512f_sign_extendv16hiv16si2;
5275 	  halfmode = V16HImode;
5276 	  extract
5277 	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5278 	  break;
5279 	case E_V16HImode:
5280 	  if (unsigned_p)
5281 	    unpack = gen_avx2_zero_extendv8hiv8si2;
5282 	  else
5283 	    unpack = gen_avx2_sign_extendv8hiv8si2;
5284 	  halfmode = V8HImode;
5285 	  extract
5286 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5287 	  break;
5288 	case E_V16SImode:
5289 	  if (unsigned_p)
5290 	    unpack = gen_avx512f_zero_extendv8siv8di2;
5291 	  else
5292 	    unpack = gen_avx512f_sign_extendv8siv8di2;
5293 	  halfmode = V8SImode;
5294 	  extract
5295 	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5296 	  break;
5297 	case E_V8SImode:
5298 	  if (unsigned_p)
5299 	    unpack = gen_avx2_zero_extendv4siv4di2;
5300 	  else
5301 	    unpack = gen_avx2_sign_extendv4siv4di2;
5302 	  halfmode = V4SImode;
5303 	  extract
5304 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5305 	  break;
5306 	case E_V16QImode:
5307 	  if (unsigned_p)
5308 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5309 	  else
5310 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5311 	  break;
5312 	case E_V8HImode:
5313 	  if (unsigned_p)
5314 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
5315 	  else
5316 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
5317 	  break;
5318 	case E_V4SImode:
5319 	  if (unsigned_p)
5320 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
5321 	  else
5322 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
5323 	  break;
5324 	case E_V8QImode:
5325 	  if (unsigned_p)
5326 	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5327 	  else
5328 	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5329 	  break;
5330 	case E_V4HImode:
5331 	  if (unsigned_p)
5332 	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
5333 	  else
5334 	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
5335 	  break;
5336 	case E_V4QImode:
5337 	  if (unsigned_p)
5338 	    unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5339 	  else
5340 	    unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5341 	  break;
5342 	default:
5343 	  gcc_unreachable ();
5344 	}
5345 
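      /* For 256-bit and 512-bit inputs, first extract the requested half
	 into a register of half the width; the extension insn below then
	 widens that half into DEST.  */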
5346       if (GET_MODE_SIZE (imode) >= 32)
5347 	{
5348 	  tmp = gen_reg_rtx (halfmode);
5349 	  emit_insn (extract (tmp, src));
5350 	}
5351       else if (high_p)
5352 	{
5353 	  switch (GET_MODE_SIZE (imode))
5354 	    {
5355 	    case 16:
5356 	      /* Shift higher 8 bytes to lower 8 bytes.  */
5357 	      tmp = gen_reg_rtx (V1TImode);
5358 	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5359 					     GEN_INT (64)));
5360 	      break;
5361 	    case 8:
5362 	      /* Shift higher 4 bytes to lower 4 bytes.  */
5363 	      tmp = gen_reg_rtx (V1DImode);
5364 	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5365 					    GEN_INT (32)));
5366 	      break;
5367 	    case 4:
5368 	      /* Shift higher 2 bytes to lower 2 bytes.  */
5369 	      tmp = gen_reg_rtx (V1SImode);
5370 	      emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5371 					    GEN_INT (16)));
5372 	      break;
5373 	    default:
5374 	      gcc_unreachable ();
5375 	    }
5376 
5377 	  tmp = gen_lowpart (imode, tmp);
5378 	}
5379       else
5380 	tmp = src;
5381 
5382       emit_insn (unpack (dest, tmp));
5383     }
5384   else
5385     {
5386       rtx (*unpack)(rtx, rtx, rtx);
5387 
5388       switch (imode)
5389 	{
5390 	case E_V16QImode:
5391 	  if (high_p)
5392 	    unpack = gen_vec_interleave_highv16qi;
5393 	  else
5394 	    unpack = gen_vec_interleave_lowv16qi;
5395 	  break;
5396 	case E_V8HImode:
5397 	  if (high_p)
5398 	    unpack = gen_vec_interleave_highv8hi;
5399 	  else
5400 	    unpack = gen_vec_interleave_lowv8hi;
5401 	  break;
5402 	case E_V4SImode:
5403 	  if (high_p)
5404 	    unpack = gen_vec_interleave_highv4si;
5405 	  else
5406 	    unpack = gen_vec_interleave_lowv4si;
5407 	  break;
5408 	case E_V8QImode:
5409 	  if (high_p)
5410 	    unpack = gen_mmx_punpckhbw;
5411 	  else
5412 	    unpack = gen_mmx_punpcklbw;
5413 	  break;
5414 	case E_V4HImode:
5415 	  if (high_p)
5416 	    unpack = gen_mmx_punpckhwd;
5417 	  else
5418 	    unpack = gen_mmx_punpcklwd;
5419 	  break;
5420 	case E_V4QImode:
5421 	  if (high_p)
5422 	    unpack = gen_mmx_punpckhbw_low;
5423 	  else
5424 	    unpack = gen_mmx_punpcklbw_low;
5425 	  break;
5426 	default:
5427 	  gcc_unreachable ();
5428 	}
5429 
5430       if (unsigned_p)
5431 	tmp = force_reg (imode, CONST0_RTX (imode));
5432       else
5433 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5434 				   src, pc_rtx, pc_rtx);
5435 
5436       rtx tmp2 = gen_reg_rtx (imode);
5437       emit_insn (unpack (tmp2, src, tmp));
5438       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5439     }
5440 }
5441 
5442 /* Return true if MEM is a constant pool reference whose constant is a
5443    CONST_VECTOR of permutation indices; if so, store the indices in PERM.  */
5444 bool
5445 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5446 {
5447   machine_mode mode = GET_MODE (mem);
5448   int nelt = GET_MODE_NUNITS (mode);
5449 
5450   if (!INTEGRAL_MODE_P (mode))
5451     return false;
5452 
5453   /* It needs to be a constant pool reference.  */
5454   if (!MEM_P (mem)
5455       || !SYMBOL_REF_P (XEXP (mem, 0))
5456       || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5457     return false;
5458 
5459   rtx constant = get_pool_constant (XEXP (mem, 0));
5460 
5461   if (GET_CODE (constant) != CONST_VECTOR)
5462     return false;
5463 
5464   /* There could be some rtx like
5465      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5466      but with "*.LC1" referring to a V2DI constant vector.  */
5467   if (GET_MODE (constant) != mode)
5468     {
5469       constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5470 
5471       if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5472 	return false;
5473     }
5474 
5475   for (int i = 0; i != nelt; i++)
5476     perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5477 
5478   return true;
5479 }
5480 
5481 /* Split OPERAND into half-mode parts stored in PARTS.  Similar to
5482    split_double_mode, but works for floating point parameters and
5483    non-offsettable memories.  For pushes, it returns just stack offsets;
5484    the values will be saved in the right order.  At most four parts are generated.  */
5485 
5486 static int
5487 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5488 {
5489   int size;
5490 
5491   if (!TARGET_64BIT)
5492     size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5493   else
5494     size = (GET_MODE_SIZE (mode) + 4) / 8;
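  /* E.g. on a 32-bit target DImode and DFmode split into 2 parts, XFmode
     into 3 and TFmode into 4; on a 64-bit target the 16-byte modes split
     into 2.  */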
5495 
5496   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5497   gcc_assert (size >= 2 && size <= 4);
5498 
5499   /* Optimize constant pool reference to immediates.  This is used by fp
5500      moves, which force all constants to memory to allow combining.  */
5501   if (MEM_P (operand) && MEM_READONLY_P (operand))
5502     operand = avoid_constant_pool_reference (operand);
5503 
5504   if (MEM_P (operand) && !offsettable_memref_p (operand))
5505     {
5506       /* The only non-offsettable memories we handle are pushes.  */
5507       int ok = push_operand (operand, VOIDmode);
5508 
5509       gcc_assert (ok);
5510 
5511       operand = copy_rtx (operand);
5512       PUT_MODE (operand, word_mode);
5513       parts[0] = parts[1] = parts[2] = parts[3] = operand;
5514       return size;
5515     }
5516 
5517   if (GET_CODE (operand) == CONST_VECTOR)
5518     {
5519       scalar_int_mode imode = int_mode_for_mode (mode).require ();
5520       /* Caution: if we looked through a constant pool memory above,
5521 	 the operand may actually have a different mode now.  That's
5522 	 ok, since we want to pun this all the way back to an integer.  */
5523       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5524       gcc_assert (operand != NULL);
5525       mode = imode;
5526     }
5527 
5528   if (!TARGET_64BIT)
5529     {
5530       if (mode == DImode)
5531 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5532       else
5533 	{
5534 	  int i;
5535 
5536 	  if (REG_P (operand))
5537 	    {
5538 	      gcc_assert (reload_completed);
5539 	      for (i = 0; i < size; i++)
5540 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5541 	    }
5542 	  else if (offsettable_memref_p (operand))
5543 	    {
5544 	      operand = adjust_address (operand, SImode, 0);
5545 	      parts[0] = operand;
5546 	      for (i = 1; i < size; i++)
5547 		parts[i] = adjust_address (operand, SImode, 4 * i);
5548 	    }
5549 	  else if (CONST_DOUBLE_P (operand))
5550 	    {
5551 	      const REAL_VALUE_TYPE *r;
5552 	      long l[4];
5553 
5554 	      r = CONST_DOUBLE_REAL_VALUE (operand);
5555 	      switch (mode)
5556 		{
5557 		case E_TFmode:
5558 		  real_to_target (l, r, mode);
5559 		  parts[3] = gen_int_mode (l[3], SImode);
5560 		  parts[2] = gen_int_mode (l[2], SImode);
5561 		  break;
5562 		case E_XFmode:
5563 		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5564 		     long double may not be 80-bit.  */
5565 		  real_to_target (l, r, mode);
5566 		  parts[2] = gen_int_mode (l[2], SImode);
5567 		  break;
5568 		case E_DFmode:
5569 		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5570 		  break;
5571 		default:
5572 		  gcc_unreachable ();
5573 		}
5574 	      parts[1] = gen_int_mode (l[1], SImode);
5575 	      parts[0] = gen_int_mode (l[0], SImode);
5576 	    }
5577 	  else
5578 	    gcc_unreachable ();
5579 	}
5580     }
5581   else
5582     {
5583       if (mode == TImode)
5584 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5585       if (mode == XFmode || mode == TFmode)
5586 	{
5587 	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5588 	  if (REG_P (operand))
5589 	    {
5590 	      gcc_assert (reload_completed);
5591 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5592 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5593 	    }
5594 	  else if (offsettable_memref_p (operand))
5595 	    {
5596 	      operand = adjust_address (operand, DImode, 0);
5597 	      parts[0] = operand;
5598 	      parts[1] = adjust_address (operand, upper_mode, 8);
5599 	    }
5600 	  else if (CONST_DOUBLE_P (operand))
5601 	    {
5602 	      long l[4];
5603 
5604 	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5605 
5606 	      /* real_to_target puts 32-bit pieces in each long.  */
5607 	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5608 				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5609 					  << 32), DImode);
5610 
5611 	      if (upper_mode == SImode)
5612 	        parts[1] = gen_int_mode (l[2], SImode);
5613 	      else
5614 	        parts[1]
5615 		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5616 				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5617 				     << 32), DImode);
5618 	    }
5619 	  else
5620 	    gcc_unreachable ();
5621 	}
5622     }
5623 
5624   return size;
5625 }
5626 
5627 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5628    Operands 2-5 are filled with the destination half-mode parts and
5629    operands 6-9 with the corresponding source parts, in the order in
5630    which the moves should be (and are) emitted.  */
5631 
5632 void
5633 ix86_split_long_move (rtx operands[])
5634 {
5635   rtx part[2][4];
5636   int nparts, i, j;
5637   int push = 0;
5638   int collisions = 0;
5639   machine_mode mode = GET_MODE (operands[0]);
5640   bool collisionparts[4];
5641 
5642   /* The DFmode expanders may ask us to move a double.
5643      For a 64-bit target this is a single move.  By hiding that fact
5644      here we simplify the i386.md splitters.  */
5645   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5646     {
5647       /* Optimize constant pool reference to immediates.  This is used by
5648 	 fp moves, which force all constants to memory to allow combining.  */
5649 
5650       if (MEM_P (operands[1])
5651 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5652 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5653 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
5654       if (push_operand (operands[0], VOIDmode))
5655 	{
5656 	  operands[0] = copy_rtx (operands[0]);
5657 	  PUT_MODE (operands[0], word_mode);
5658 	}
5659       else
5660         operands[0] = gen_lowpart (DImode, operands[0]);
5661       operands[1] = gen_lowpart (DImode, operands[1]);
5662       emit_move_insn (operands[0], operands[1]);
5663       return;
5664     }
5665 
5666   /* The only non-offsettable memory we handle is a push.  */
5667   if (push_operand (operands[0], VOIDmode))
5668     push = 1;
5669   else
5670     gcc_assert (!MEM_P (operands[0])
5671 		|| offsettable_memref_p (operands[0]));
5672 
5673   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5674   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5675 
5676   /* When emitting a push, take care of source operands on the stack.  */
5677   if (push && MEM_P (operands[1])
5678       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5679     {
5680       rtx src_base = XEXP (part[1][nparts - 1], 0);
5681 
5682       /* Compensate for the stack decrement by 4.  */
5683       if (!TARGET_64BIT && nparts == 3
5684 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5685 	src_base = plus_constant (Pmode, src_base, 4);
5686 
5687       /* src_base refers to the stack pointer and is
5688 	 automatically decreased by the emitted pushes.  */
5689       for (i = 0; i < nparts; i++)
5690 	part[1][i] = change_address (part[1][i],
5691 				     GET_MODE (part[1][i]), src_base);
5692     }
5693 
5694   /* We need to do the copy in the right order in case an address register
5695      of the source overlaps the destination.  */
5696   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5697     {
5698       rtx tmp;
5699 
5700       for (i = 0; i < nparts; i++)
5701 	{
5702 	  collisionparts[i]
5703 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5704 	  if (collisionparts[i])
5705 	    collisions++;
5706 	}
5707 
5708       /* A collision in the middle part can be handled by reordering.  */
5709       if (collisions == 1 && nparts == 3 && collisionparts [1])
5710 	{
5711 	  std::swap (part[0][1], part[0][2]);
5712 	  std::swap (part[1][1], part[1][2]);
5713 	}
5714       else if (collisions == 1
5715 	       && nparts == 4
5716 	       && (collisionparts [1] || collisionparts [2]))
5717 	{
5718 	  if (collisionparts [1])
5719 	    {
5720 	      std::swap (part[0][1], part[0][2]);
5721 	      std::swap (part[1][1], part[1][2]);
5722 	    }
5723 	  else
5724 	    {
5725 	      std::swap (part[0][2], part[0][3]);
5726 	      std::swap (part[1][2], part[1][3]);
5727 	    }
5728 	}
5729 
5730       /* If there are more collisions, we can't handle them by reordering.
5731 	 Do an lea into the last part and use only one colliding move.  */
5732       else if (collisions > 1)
5733 	{
5734 	  rtx base, addr;
5735 
5736 	  collisions = 1;
5737 
5738 	  base = part[0][nparts - 1];
5739 
5740 	  /* Handle the case when the last part isn't valid for lea.
5741 	     Happens in 64-bit mode storing the 12-byte XFmode.  */
5742 	  if (GET_MODE (base) != Pmode)
5743 	    base = gen_rtx_REG (Pmode, REGNO (base));
5744 
5745 	  addr = XEXP (part[1][0], 0);
5746 	  if (TARGET_TLS_DIRECT_SEG_REFS)
5747 	    {
5748 	      struct ix86_address parts;
5749 	      int ok = ix86_decompose_address (addr, &parts);
5750 	      gcc_assert (ok);
5751 	      /* It is not valid to use %gs: or %fs: in lea.  */
5752 	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5753 	    }
5754 	  emit_insn (gen_rtx_SET (base, addr));
5755 	  part[1][0] = replace_equiv_address (part[1][0], base);
5756 	  for (i = 1; i < nparts; i++)
5757 	    {
5758 	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5759 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
5760 	    }
5761 	}
5762     }
5763 
5764   if (push)
5765     {
5766       if (!TARGET_64BIT)
5767 	{
5768 	  if (nparts == 3)
5769 	    {
5770 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5771 		emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5772 	      emit_move_insn (part[0][2], part[1][2]);
5773 	    }
5774 	  else if (nparts == 4)
5775 	    {
5776 	      emit_move_insn (part[0][3], part[1][3]);
5777 	      emit_move_insn (part[0][2], part[1][2]);
5778 	    }
5779 	}
5780       else
5781 	{
5782 	  /* In 64-bit mode we don't have a 32-bit push available.  In case this
5783 	     is a register, that is OK - we will just use the larger counterpart.
5784 	     We also retype the memory - these come from an attempt to avoid the
5785 	     REX prefix on moving the second half of a TFmode value.  */
5786 	  if (GET_MODE (part[1][1]) == SImode)
5787 	    {
5788 	      switch (GET_CODE (part[1][1]))
5789 		{
5790 		case MEM:
5791 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
5792 		  break;
5793 
5794 		case REG:
5795 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5796 		  break;
5797 
5798 		default:
5799 		  gcc_unreachable ();
5800 		}
5801 
5802 	      if (GET_MODE (part[1][0]) == SImode)
5803 		part[1][0] = part[1][1];
5804 	    }
5805 	}
5806       emit_move_insn (part[0][1], part[1][1]);
5807       emit_move_insn (part[0][0], part[1][0]);
5808       return;
5809     }
5810 
5811   /* Choose the correct order so we do not overwrite the source before it is copied.  */
5812   if ((REG_P (part[0][0])
5813        && REG_P (part[1][1])
5814        && (REGNO (part[0][0]) == REGNO (part[1][1])
5815 	   || (nparts == 3
5816 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
5817 	   || (nparts == 4
5818 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
5819       || (collisions > 0
5820 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5821     {
5822       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5823 	{
5824 	  operands[2 + i] = part[0][j];
5825 	  operands[6 + i] = part[1][j];
5826 	}
5827     }
5828   else
5829     {
5830       for (i = 0; i < nparts; i++)
5831 	{
5832 	  operands[2 + i] = part[0][i];
5833 	  operands[6 + i] = part[1][i];
5834 	}
5835     }
5836 
5837   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
5838   if (optimize_insn_for_size_p ())
5839     {
5840       for (j = 0; j < nparts - 1; j++)
5841 	if (CONST_INT_P (operands[6 + j])
5842 	    && operands[6 + j] != const0_rtx
5843 	    && REG_P (operands[2 + j]))
5844 	  for (i = j; i < nparts - 1; i++)
5845 	    if (CONST_INT_P (operands[7 + i])
5846 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5847 	      operands[7 + i] = operands[2 + j];
5848     }
5849 
5850   for (i = 0; i < nparts; i++)
5851     emit_move_insn (operands[2 + i], operands[6 + i]);
5852 
5853   return;
5854 }
5855 
5856 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5857    left shift by a constant, either using a single shift or
5858    a sequence of add instructions.  */
5859 
5860 static void
5861 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5862 {
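  /* Shifting left by COUNT is the same as doubling the operand COUNT times
     (adding it to itself); use the add sequence when a single add suffices,
     or when COUNT adds are no more costly than one shift by a constant and
     we are not optimizing for size.  */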
5863   if (count == 1
5864       || (count * ix86_cost->add <= ix86_cost->shift_const
5865 	  && !optimize_insn_for_size_p ()))
5866     {
5867       while (count-- > 0)
5868 	emit_insn (gen_add2_insn (operand, operand));
5869     }
5870   else
5871     {
5872       rtx (*insn)(rtx, rtx, rtx);
5873 
5874       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5875       emit_insn (insn (operand, operand, GEN_INT (count)));
5876     }
5877 }
5878 
5879 void
5880 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5881 {
5882   rtx (*gen_ashl3)(rtx, rtx, rtx);
5883   rtx (*gen_shld)(rtx, rtx, rtx);
5884   int half_width = GET_MODE_BITSIZE (mode) >> 1;
5885   machine_mode half_mode;
5886 
5887   rtx low[2], high[2];
5888   int count;
5889 
5890   if (CONST_INT_P (operands[2]))
5891     {
5892       split_double_mode (mode, operands, 2, low, high);
5893       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5894 
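      /* A shift by at least half the width moves the low input half into
	 the high output half (further shifted left by the remainder) and
	 clears the low output half.  */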
5895       if (count >= half_width)
5896 	{
5897 	  emit_move_insn (high[0], low[1]);
5898 	  emit_move_insn (low[0], const0_rtx);
5899 
5900 	  if (count > half_width)
5901 	    ix86_expand_ashl_const (high[0], count - half_width, mode);
5902 	}
5903       else
5904 	{
5905 	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5906 
5907 	  if (!rtx_equal_p (operands[0], operands[1]))
5908 	    emit_move_insn (operands[0], operands[1]);
5909 
5910 	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5911 	  ix86_expand_ashl_const (low[0], count, mode);
5912 	}
5913       return;
5914     }
5915 
5916   split_double_mode (mode, operands, 1, low, high);
5917   half_mode = mode == DImode ? SImode : DImode;
5918 
5919   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5920 
5921   if (operands[1] == const1_rtx)
5922     {
5923       /* Assuming we've chosen QImode-capable registers, 1 << N
5924 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
5925       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5926 	{
5927 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5928 
5929 	  ix86_expand_clear (low[0]);
5930 	  ix86_expand_clear (high[0]);
5931 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
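	  /* ZF is now set when the bit selecting the upper half of the
	     shift count (bit 5 for DImode, bit 6 for TImode) is clear.
	     The SETcc insns below drop the single 1 bit into the low or
	     high half, and the final shifts move it into position.  */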
5932 
5933 	  d = gen_lowpart (QImode, low[0]);
5934 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5935 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
5936 	  emit_insn (gen_rtx_SET (d, s));
5937 
5938 	  d = gen_lowpart (QImode, high[0]);
5939 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5940 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
5941 	  emit_insn (gen_rtx_SET (d, s));
5942 	}
5943 
5944       /* Otherwise, we can get the same results by manually performing
5945 	 a bit extract operation on bit 5/6, and then performing the two
5946 	 shifts.  The two methods of getting 0/1 into low/high are exactly
5947 	 the same size.  Avoiding the shift in the bit extract case helps
5948 	 pentium4 a bit; no one else seems to care much either way.  */
5949       else
5950 	{
5951 	  rtx (*gen_lshr3)(rtx, rtx, rtx);
5952 	  rtx (*gen_and3)(rtx, rtx, rtx);
5953 	  rtx (*gen_xor3)(rtx, rtx, rtx);
5954 	  HOST_WIDE_INT bits;
5955 	  rtx x;
5956 
5957 	  if (mode == DImode)
5958 	    {
5959 	      gen_lshr3 = gen_lshrsi3;
5960 	      gen_and3 = gen_andsi3;
5961 	      gen_xor3 = gen_xorsi3;
5962 	      bits = 5;
5963 	    }
5964 	  else
5965 	    {
5966 	      gen_lshr3 = gen_lshrdi3;
5967 	      gen_and3 = gen_anddi3;
5968 	      gen_xor3 = gen_xordi3;
5969 	      bits = 6;
5970 	    }
5971 
5972 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5973 	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5974 	  else
5975 	    x = gen_lowpart (half_mode, operands[2]);
5976 	  emit_insn (gen_rtx_SET (high[0], x));
5977 
5978 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5979 	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5980 	  emit_move_insn (low[0], high[0]);
5981 	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5982 	}
5983 
5984       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5985       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5986       return;
5987     }
5988 
5989   if (operands[1] == constm1_rtx)
5990     {
5991       /* For -1 << N, we can avoid the shld instruction, because we
5992 	 know that we're shifting 0...31/63 ones into a -1.  */
5993       emit_move_insn (low[0], constm1_rtx);
5994       if (optimize_insn_for_size_p ())
5995 	emit_move_insn (high[0], low[0]);
5996       else
5997 	emit_move_insn (high[0], constm1_rtx);
5998     }
5999   else
6000     {
6001       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6002 
6003       if (!rtx_equal_p (operands[0], operands[1]))
6004 	emit_move_insn (operands[0], operands[1]);
6005 
6006       split_double_mode (mode, operands, 1, low, high);
6007       emit_insn (gen_shld (high[0], low[0], operands[2]));
6008     }
6009 
6010   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6011 
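  /* If the variable count turns out to be at least half the width, the
     result still needs fixing up: with CMOV, x86_shift_adj_1 conditionally
     swaps in LOW and the zeroed scratch; otherwise x86_shift_adj_2 emits a
     branchy adjustment sequence.  */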
6012   if (TARGET_CMOVE && scratch)
6013     {
6014       ix86_expand_clear (scratch);
6015       emit_insn (gen_x86_shift_adj_1
6016 		 (half_mode, high[0], low[0], operands[2], scratch));
6017     }
6018   else
6019     emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6020 }
6021 
6022 void
6023 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6024 {
6025   rtx (*gen_ashr3)(rtx, rtx, rtx)
6026     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6027   rtx (*gen_shrd)(rtx, rtx, rtx);
6028   int half_width = GET_MODE_BITSIZE (mode) >> 1;
6029 
6030   rtx low[2], high[2];
6031   int count;
6032 
6033   if (CONST_INT_P (operands[2]))
6034     {
6035       split_double_mode (mode, operands, 2, low, high);
6036       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6037 
6038       if (count == GET_MODE_BITSIZE (mode) - 1)
6039 	{
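	  /* An arithmetic shift by the full width minus one just replicates
	     the sign bit: shift the high half right by half_width - 1 and
	     copy the result into the low half as well.  */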
6040 	  emit_move_insn (high[0], high[1]);
6041 	  emit_insn (gen_ashr3 (high[0], high[0],
6042 				GEN_INT (half_width - 1)));
6043 	  emit_move_insn (low[0], high[0]);
6044 
6045 	}
6046       else if (count >= half_width)
6047 	{
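	  /* The low result is the high input shifted right by the remainder;
	     the high result is the sign extension of the high input.  */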
6048 	  emit_move_insn (low[0], high[1]);
6049 	  emit_move_insn (high[0], low[0]);
6050 	  emit_insn (gen_ashr3 (high[0], high[0],
6051 				GEN_INT (half_width - 1)));
6052 
6053 	  if (count > half_width)
6054 	    emit_insn (gen_ashr3 (low[0], low[0],
6055 				  GEN_INT (count - half_width)));
6056 	}
6057       else
6058 	{
6059 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6060 
6061 	  if (!rtx_equal_p (operands[0], operands[1]))
6062 	    emit_move_insn (operands[0], operands[1]);
6063 
6064 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6065 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6066 	}
6067     }
6068   else
6069     {
6070       machine_mode half_mode;
6071 
6072       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6073 
6074       if (!rtx_equal_p (operands[0], operands[1]))
6075 	emit_move_insn (operands[0], operands[1]);
6076 
6077       split_double_mode (mode, operands, 1, low, high);
6078       half_mode = mode == DImode ? SImode : DImode;
6079 
6080       emit_insn (gen_shrd (low[0], high[0], operands[2]));
6081       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6082 
6083       if (TARGET_CMOVE && scratch)
6084 	{
6085 	  emit_move_insn (scratch, high[0]);
6086 	  emit_insn (gen_ashr3 (scratch, scratch,
6087 				GEN_INT (half_width - 1)));
6088 	  emit_insn (gen_x86_shift_adj_1
6089 		     (half_mode, low[0], high[0], operands[2], scratch));
6090 	}
6091       else
6092 	emit_insn (gen_x86_shift_adj_3
6093 		   (half_mode, low[0], high[0], operands[2]));
6094     }
6095 }
6096 
6097 void
6098 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6099 {
6100   rtx (*gen_lshr3)(rtx, rtx, rtx)
6101     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6102   rtx (*gen_shrd)(rtx, rtx, rtx);
6103   int half_width = GET_MODE_BITSIZE (mode) >> 1;
6104 
6105   rtx low[2], high[2];
6106   int count;
6107 
6108   if (CONST_INT_P (operands[2]))
6109     {
6110       split_double_mode (mode, operands, 2, low, high);
6111       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6112 
6113       if (count >= half_width)
6114 	{
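	  /* The low result is the high input shifted right by the remainder;
	     the high result is zero.  */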
6115 	  emit_move_insn (low[0], high[1]);
6116 	  ix86_expand_clear (high[0]);
6117 
6118 	  if (count > half_width)
6119 	    emit_insn (gen_lshr3 (low[0], low[0],
6120 				  GEN_INT (count - half_width)));
6121 	}
6122       else
6123 	{
6124 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6125 
6126 	  if (!rtx_equal_p (operands[0], operands[1]))
6127 	    emit_move_insn (operands[0], operands[1]);
6128 
6129 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6130 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6131 	}
6132     }
6133   else
6134     {
6135       machine_mode half_mode;
6136 
6137       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6138 
6139       if (!rtx_equal_p (operands[0], operands[1]))
6140 	emit_move_insn (operands[0], operands[1]);
6141 
6142       split_double_mode (mode, operands, 1, low, high);
6143       half_mode = mode == DImode ? SImode : DImode;
6144 
6145       emit_insn (gen_shrd (low[0], high[0], operands[2]));
6146       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6147 
6148       if (TARGET_CMOVE && scratch)
6149 	{
6150 	  ix86_expand_clear (scratch);
6151 	  emit_insn (gen_x86_shift_adj_1
6152 		     (half_mode, low[0], high[0], operands[2], scratch));
6153 	}
6154       else
6155 	emit_insn (gen_x86_shift_adj_2
6156 		   (half_mode, low[0], high[0], operands[2]));
6157     }
6158 }
6159 
6160 /* Expand move of V1TI mode register X to a new TI mode register.  */
6161 static rtx
6162 ix86_expand_v1ti_to_ti (rtx x)
6163 {
6164   rtx result = gen_reg_rtx (TImode);
6165   if (TARGET_SSE2)
6166     {
6167       rtx temp = gen_reg_rtx (V2DImode);
6168       emit_move_insn (temp, gen_lowpart (V2DImode, x));
6169       rtx lo = gen_lowpart (DImode, result);
6170       emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6171       rtx hi = gen_highpart (DImode, result);
6172       emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6173     }
6174   else
6175     emit_move_insn (result, gen_lowpart (TImode, x));
6176   return result;
6177 }
6178 
6179 /* Expand move of TI mode register X to a new V1TI mode register.  */
6180 static rtx
6181 ix86_expand_ti_to_v1ti (rtx x)
6182 {
6183   rtx result = gen_reg_rtx (V1TImode);
6184   if (TARGET_SSE2)
6185     {
6186       rtx lo = gen_lowpart (DImode, x);
6187       rtx hi = gen_highpart (DImode, x);
6188       rtx tmp = gen_reg_rtx (V2DImode);
6189       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6190       emit_move_insn (result, gen_lowpart (V1TImode, tmp));
6191     }
6192   else
6193     emit_move_insn (result, gen_lowpart (V1TImode, x));
6194   return result;
6195 }
6196 
6197 /* Expand V1TI mode shift (of rtx_code CODE) by constant.  */
6198 void
6199 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6200 {
6201   rtx op1 = force_reg (V1TImode, operands[1]);
6202 
6203   if (!CONST_INT_P (operands[2]))
6204     {
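      /* For a variable shift count, go through TImode so the generic
	 ashlti3/lshrti3 expanders can do the work in integer registers.  */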
6205       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6206       rtx tmp2 = gen_reg_rtx (TImode);
6207       rtx (*shift) (rtx, rtx, rtx)
6208 	    = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6209       emit_insn (shift (tmp2, tmp1, operands[2]));
6210       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6211       emit_move_insn (operands[0], tmp3);
6212       return;
6213     }
6214 
6215   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6216 
6217   if (bits == 0)
6218     {
6219       emit_move_insn (operands[0], op1);
6220       return;
6221     }
6222 
6223   if ((bits & 7) == 0)
6224     {
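      /* A shift by a multiple of 8 bits maps directly onto the
	 whole-register byte shifts (pslldq/psrldq).  */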
6225       rtx tmp = gen_reg_rtx (V1TImode);
6226       if (code == ASHIFT)
6227 	emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6228       else
6229 	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6230       emit_move_insn (operands[0], tmp);
6231       return;
6232     }
6233 
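  /* General case: first shift the whole register by 64 bits with a byte
     shift.  For counts above 64 a single per-lane 64-bit shift of that copy
     finishes the job; otherwise per-lane shifts of the original value and
     of the copy are ORed together.  */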
6234   rtx tmp1 = gen_reg_rtx (V1TImode);
6235   if (code == ASHIFT)
6236     emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6237   else
6238     emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6239 
6240   /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
6241   rtx tmp2 = gen_reg_rtx (V2DImode);
6242   emit_move_insn (tmp2, gen_lowpart (V2DImode, tmp1));
6243 
6244   /* tmp3 will be the V2DImode result.  */
6245   rtx tmp3 = gen_reg_rtx (V2DImode);
6246 
6247   if (bits > 64)
6248     {
6249       if (code == ASHIFT)
6250 	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6251       else
6252 	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6253     }
6254   else
6255     {
6256       /* tmp4 is operands[1], in V2DImode.  */
6257       rtx tmp4 = gen_reg_rtx (V2DImode);
6258       emit_move_insn (tmp4, gen_lowpart (V2DImode, op1));
6259 
6260       rtx tmp5 = gen_reg_rtx (V2DImode);
6261       if (code == ASHIFT)
6262 	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6263       else
6264 	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6265 
6266       rtx tmp6 = gen_reg_rtx (V2DImode);
6267       if (code == ASHIFT)
6268 	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6269       else
6270 	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6271 
6272       emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6273     }
6274 
6275   /* Convert the result back to V1TImode and store in operands[0].  */
6276   rtx tmp7 = gen_reg_rtx (V1TImode);
6277   emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp3));
6278   emit_move_insn (operands[0], tmp7);
6279 }
6280 
6281 /* Expand V1TI mode rotate (of rtx_code CODE) by constant.  */
6282 void
6283 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6284 {
6285   rtx op1 = force_reg (V1TImode, operands[1]);
6286 
6287   if (!CONST_INT_P (operands[2]))
6288     {
6289       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6290       rtx tmp2 = gen_reg_rtx (TImode);
6291       rtx (*rotate) (rtx, rtx, rtx)
6292 	    = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6293       emit_insn (rotate (tmp2, tmp1, operands[2]));
6294       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6295       emit_move_insn (operands[0], tmp3);
6296       return;
6297     }
6298 
6299   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6300 
6301   if (bits == 0)
6302     {
6303       emit_move_insn (operands[0], op1);
6304       return;
6305     }
6306 
6307   if (code == ROTATERT)
6308     bits = 128 - bits;
6309 
6310   if ((bits & 31) == 0)
6311     {
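      /* A rotate by 32, 64 or 96 bits is just a permutation of the 32-bit
	 lanes: pshufd with control 0x93, 0x4e or 0x39 rotates the four
	 lanes left by one, two or three positions respectively.  */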
6312       rtx tmp1 = gen_reg_rtx (V4SImode);
6313       rtx tmp2 = gen_reg_rtx (V4SImode);
6314       rtx tmp3 = gen_reg_rtx (V1TImode);
6315 
6316       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6317       if (bits == 32)
6318 	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6319       else if (bits == 64)
6320 	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6321       else
6322 	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6323       emit_move_insn (tmp3, gen_lowpart (V1TImode, tmp2));
6324       emit_move_insn (operands[0], tmp3);
6325       return;
6326     }
6327 
6328   if ((bits & 7) == 0)
6329     {
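      /* A rotate by a multiple of 8 bits is a pair of whole-register byte
	 shifts ORed together.  */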
6330       rtx tmp1 = gen_reg_rtx (V1TImode);
6331       rtx tmp2 = gen_reg_rtx (V1TImode);
6332       rtx tmp3 = gen_reg_rtx (V1TImode);
6333 
6334       emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6335       emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6336       emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6337       emit_move_insn (operands[0], tmp3);
6338       return;
6339     }
6340 
6341   rtx op1_v4si = gen_reg_rtx (V4SImode);
6342   emit_move_insn (op1_v4si, gen_lowpart (V4SImode, op1));
6343 
6344   rtx lobits;
6345   rtx hibits;
6346 
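  /* Pre-rotate the input to the 32-bit lane boundary at or below BITS
     (LOBITS) and to the boundary above it (HIBITS); the per-lane shifts
     below then supply the remaining BITS & 31 bits of the rotate.  */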
6347   switch (bits >> 5)
6348     {
6349     case 0:
6350       lobits = op1_v4si;
6351       hibits = gen_reg_rtx (V4SImode);
6352       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6353       break;
6354 
6355     case 1:
6356       lobits = gen_reg_rtx (V4SImode);
6357       hibits = gen_reg_rtx (V4SImode);
6358       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6359       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6360       break;
6361 
6362     case 2:
6363       lobits = gen_reg_rtx (V4SImode);
6364       hibits = gen_reg_rtx (V4SImode);
6365       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6366       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6367       break;
6368 
6369     default:
6370       lobits = gen_reg_rtx (V4SImode);
6371       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6372       hibits = op1_v4si;
6373       break;
6374     }
6375 
6376   rtx tmp1 = gen_reg_rtx (V4SImode);
6377   rtx tmp2 = gen_reg_rtx (V4SImode);
6378   rtx tmp3 = gen_reg_rtx (V4SImode);
6379   rtx tmp4 = gen_reg_rtx (V1TImode);
6380 
6381   emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6382   emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6383   emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6384   emit_move_insn (tmp4, gen_lowpart (V1TImode, tmp3));
6385   emit_move_insn (operands[0], tmp4);
6386 }
6387 
6388 /* Expand V1TI mode ashiftrt by constant.  */
6389 void
6390 ix86_expand_v1ti_ashiftrt (rtx operands[])
6391 {
6392   rtx op1 = force_reg (V1TImode, operands[1]);
6393 
6394   if (!CONST_INT_P (operands[2]))
6395     {
6396       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6397       rtx tmp2 = gen_reg_rtx (TImode);
6398       emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6399       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6400       emit_move_insn (operands[0], tmp3);
6401       return;
6402     }
6403 
6404   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6405 
6406   if (bits == 0)
6407     {
6408       emit_move_insn (operands[0], op1);
6409       return;
6410     }
6411 
6412   if (bits == 127)
6413     {
6414       /* Two operations.  */
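      /* The result is just 128 copies of the sign bit: broadcast the top
	 32-bit lane with pshufd and arithmetic-shift every lane right
	 by 31.  */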
6415       rtx tmp1 = gen_reg_rtx (V4SImode);
6416       rtx tmp2 = gen_reg_rtx (V4SImode);
6417       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6418       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6419 
6420       rtx tmp3 = gen_reg_rtx (V4SImode);
6421       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6422 
6423       rtx tmp4 = gen_reg_rtx (V1TImode);
6424       emit_move_insn (tmp4, gen_lowpart (V1TImode, tmp3));
6425       emit_move_insn (operands[0], tmp4);
6426       return;
6427     }
6428 
6429   if (bits == 64)
6430     {
6431       /* Three operations.  */
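      /* The high half becomes the sign extension and the low half the
	 original high half: broadcast the sign (pshufd 0xff, psrad 31)
	 and interleave it above the input's high quadword.  */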
6432       rtx tmp1 = gen_reg_rtx (V4SImode);
6433       rtx tmp2 = gen_reg_rtx (V4SImode);
6434       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6435       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6436 
6437       rtx tmp3 = gen_reg_rtx (V4SImode);
6438       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6439 
6440       rtx tmp4 = gen_reg_rtx (V2DImode);
6441       rtx tmp5 = gen_reg_rtx (V2DImode);
6442       rtx tmp6 = gen_reg_rtx (V2DImode);
6443       emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp1));
6444       emit_move_insn (tmp5, gen_lowpart (V2DImode, tmp3));
6445       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6446 
6447       rtx tmp7 = gen_reg_rtx (V1TImode);
6448       emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp6));
6449       emit_move_insn (operands[0], tmp7);
6450       return;
6451     }
6452 
6453   if (bits == 96)
6454     {
6455       /* Three operations.  */
6456       rtx tmp3 = gen_reg_rtx (V2DImode);
6457       rtx tmp1 = gen_reg_rtx (V4SImode);
6458       rtx tmp2 = gen_reg_rtx (V4SImode);
6459       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6460       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6461 
6462       rtx tmp4 = gen_reg_rtx (V2DImode);
6463       rtx tmp5 = gen_reg_rtx (V2DImode);
6464       emit_move_insn (tmp3, gen_lowpart (V2DImode, tmp1));
6465       emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp2));
6466       emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6467 
6468       rtx tmp6 = gen_reg_rtx (V4SImode);
6469       rtx tmp7 = gen_reg_rtx (V4SImode);
6470       emit_move_insn (tmp6, gen_lowpart (V4SImode, tmp5));
6471       emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6472 
6473       rtx tmp8 = gen_reg_rtx (V1TImode);
6474       emit_move_insn (tmp8, gen_lowpart (V1TImode, tmp7));
6475       emit_move_insn (operands[0], tmp8);
6476       return;
6477     }
6478 
6479   if (TARGET_AVX2 || TARGET_SSE4_1)
6480     {
6481       /* Three operations.  */
6482       if (bits == 32)
6483 	{
6484 	  rtx tmp1 = gen_reg_rtx (V4SImode);
6485 	  rtx tmp2 = gen_reg_rtx (V4SImode);
6486 	  emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6487 	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6488 
6489 	  rtx tmp3 = gen_reg_rtx (V1TImode);
6490 	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6491 
6492 	  if (TARGET_AVX2)
6493 	    {
6494 	      rtx tmp4 = gen_reg_rtx (V4SImode);
6495 	      rtx tmp5 = gen_reg_rtx (V4SImode);
6496 	      emit_move_insn (tmp4, gen_lowpart (V4SImode, tmp3));
6497 	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6498 					       GEN_INT (7)));
6499 
6500 	      rtx tmp6 = gen_reg_rtx (V1TImode);
6501 	      emit_move_insn (tmp6, gen_lowpart (V1TImode, tmp5));
6502 	      emit_move_insn (operands[0], tmp6);
6503 	    }
6504 	  else
6505 	    {
6506 	      rtx tmp4 = gen_reg_rtx (V8HImode);
6507 	      rtx tmp5 = gen_reg_rtx (V8HImode);
6508 	      rtx tmp6 = gen_reg_rtx (V8HImode);
6509 	      emit_move_insn (tmp4, gen_lowpart (V8HImode, tmp2));
6510 	      emit_move_insn (tmp5, gen_lowpart (V8HImode, tmp3));
6511 	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6512 					     GEN_INT (0x3f)));
6513 
6514 	      rtx tmp7 = gen_reg_rtx (V1TImode);
6515 	      emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp6));
6516 	      emit_move_insn (operands[0], tmp7);
6517 	    }
6518 	  return;
6519 	}
6520 
6521       /* Three operations.  */
6522       if (bits == 8 || bits == 16 || bits == 24)
6523 	{
6524 	  rtx tmp1 = gen_reg_rtx (V4SImode);
6525 	  rtx tmp2 = gen_reg_rtx (V4SImode);
6526 	  emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6527 	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6528 
6529 	  rtx tmp3 = gen_reg_rtx (V1TImode);
6530 	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6531 
6532 	  if (TARGET_AVX2)
6533 	    {
6534 	      rtx tmp4 = gen_reg_rtx (V4SImode);
6535 	      rtx tmp5 = gen_reg_rtx (V4SImode);
6536 	      emit_move_insn (tmp4, gen_lowpart (V4SImode, tmp3));
6537 	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6538 					       GEN_INT (7)));
6539 
6540 	      rtx tmp6 = gen_reg_rtx (V1TImode);
6541 	      emit_move_insn (tmp6, gen_lowpart (V1TImode, tmp5));
6542 	      emit_move_insn (operands[0], tmp6);
6543 	    }
6544 	  else
6545 	    {
6546 	      rtx tmp4 = gen_reg_rtx (V8HImode);
6547 	      rtx tmp5 = gen_reg_rtx (V8HImode);
6548 	      rtx tmp6 = gen_reg_rtx (V8HImode);
6549 	      emit_move_insn (tmp4, gen_lowpart (V8HImode, tmp2));
6550 	      emit_move_insn (tmp5, gen_lowpart (V8HImode, tmp3));
6551 	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6552 					     GEN_INT (0x3f)));
6553 
6554 	      rtx tmp7 = gen_reg_rtx (V1TImode);
6555 	      emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp6));
6556 	      emit_move_insn (operands[0], tmp7);
6557 	    }
6558 	  return;
6559 	}
6560     }
6561 
6562   if (bits > 96)
6563     {
6564       /* Four operations.  */
6565       rtx tmp1 = gen_reg_rtx (V4SImode);
6566       rtx tmp2 = gen_reg_rtx (V4SImode);
6567       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6568       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6569 
6570       rtx tmp3 = gen_reg_rtx (V4SImode);
6571       emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6572 
6573       rtx tmp4 = gen_reg_rtx (V2DImode);
6574       rtx tmp5 = gen_reg_rtx (V2DImode);
6575       rtx tmp6 = gen_reg_rtx (V2DImode);
6576       emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp2));
6577       emit_move_insn (tmp5, gen_lowpart (V2DImode, tmp3));
6578       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6579 
6580       rtx tmp7 = gen_reg_rtx (V4SImode);
6581       rtx tmp8 = gen_reg_rtx (V4SImode);
6582       emit_move_insn (tmp7, gen_lowpart (V4SImode, tmp6));
6583       emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6584 
6585       rtx tmp9 = gen_reg_rtx (V1TImode);
6586       emit_move_insn (tmp9, gen_lowpart (V1TImode, tmp8));
6587       emit_move_insn (operands[0], tmp9);
6588       return;
6589     }
6590 
6591   if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6592     {
6593       /* Four operations.  */
6594       rtx tmp1 = gen_reg_rtx (V4SImode);
6595       rtx tmp2 = gen_reg_rtx (V4SImode);
6596       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6597       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6598 
6599       rtx tmp3 = gen_reg_rtx (V4SImode);
6600       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6601 
6602       rtx tmp4 = gen_reg_rtx (V1TImode);
6603       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6604 
6605       rtx tmp5 = gen_reg_rtx (V8HImode);
6606       rtx tmp6 = gen_reg_rtx (V8HImode);
6607       rtx tmp7 = gen_reg_rtx (V8HImode);
6608       emit_move_insn (tmp5, gen_lowpart (V8HImode, tmp3));
6609       emit_move_insn (tmp6, gen_lowpart (V8HImode, tmp4));
6610       emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6611 				     GEN_INT (bits == 48 ? 0x1f : 0x07)));
6612 
6613       rtx tmp8 = gen_reg_rtx (V1TImode);
6614       emit_move_insn (tmp8, gen_lowpart (V1TImode, tmp7));
6615       emit_move_insn (operands[0], tmp8);
6616       return;
6617     }
6618 
6619   if ((bits & 7) == 0)
6620     {
6621       /* Five operations.  */
6622       rtx tmp1 = gen_reg_rtx (V4SImode);
6623       rtx tmp2 = gen_reg_rtx (V4SImode);
6624       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6625       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6626 
6627       rtx tmp3 = gen_reg_rtx (V4SImode);
6628       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6629 
6630       rtx tmp4 = gen_reg_rtx (V1TImode);
6631       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6632 
6633       rtx tmp5 = gen_reg_rtx (V1TImode);
6634       rtx tmp6 = gen_reg_rtx (V1TImode);
6635       emit_move_insn (tmp5, gen_lowpart (V1TImode, tmp3));
6636       emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6637 
6638       rtx tmp7 = gen_reg_rtx (V2DImode);
6639       rtx tmp8 = gen_reg_rtx (V2DImode);
6640       rtx tmp9 = gen_reg_rtx (V2DImode);
6641       emit_move_insn (tmp7, gen_lowpart (V2DImode, tmp4));
6642       emit_move_insn (tmp8, gen_lowpart (V2DImode, tmp6));
6643       emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6644 
6645       rtx tmp10 = gen_reg_rtx (V1TImode);
6646       emit_move_insn (tmp10, gen_lowpart (V1TImode, tmp9));
6647       emit_move_insn (operands[0], tmp10);
6648       return;
6649     }
6650 
6651   if (TARGET_AVX2 && bits < 32)
6652     {
6653       /* Six operations.  */
6654       rtx tmp1 = gen_reg_rtx (V4SImode);
6655       rtx tmp2 = gen_reg_rtx (V4SImode);
6656       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6657       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6658 
6659       rtx tmp3 = gen_reg_rtx (V1TImode);
6660       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6661 
6662       rtx tmp4 = gen_reg_rtx (V2DImode);
6663       rtx tmp5 = gen_reg_rtx (V2DImode);
6664       emit_move_insn (tmp4, gen_lowpart (V2DImode, op1));
6665       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6666 
6667       rtx tmp6 = gen_reg_rtx (V2DImode);
6668       rtx tmp7 = gen_reg_rtx (V2DImode);
6669       emit_move_insn (tmp6, gen_lowpart (V2DImode, tmp3));
6670       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6671 
6672       rtx tmp8 = gen_reg_rtx (V2DImode);
6673       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6674 
6675       rtx tmp9 = gen_reg_rtx (V4SImode);
6676       rtx tmp10 = gen_reg_rtx (V4SImode);
6677       emit_move_insn (tmp9, gen_lowpart (V4SImode, tmp8));
6678       emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6679 
6680       rtx tmp11 = gen_reg_rtx (V1TImode);
6681       emit_move_insn (tmp11, gen_lowpart (V1TImode, tmp10));
6682       emit_move_insn (operands[0], tmp11);
6683       return;
6684     }
6685 
6686   if (TARGET_SSE4_1 && bits < 15)
6687     {
6688       /* Six operations.  */
6689       rtx tmp1 = gen_reg_rtx (V4SImode);
6690       rtx tmp2 = gen_reg_rtx (V4SImode);
6691       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6692       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6693 
6694       rtx tmp3 = gen_reg_rtx (V1TImode);
6695       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6696 
6697       rtx tmp4 = gen_reg_rtx (V2DImode);
6698       rtx tmp5 = gen_reg_rtx (V2DImode);
6699       emit_move_insn (tmp4, gen_lowpart (V2DImode, op1));
6700       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6701 
6702       rtx tmp6 = gen_reg_rtx (V2DImode);
6703       rtx tmp7 = gen_reg_rtx (V2DImode);
6704       emit_move_insn (tmp6, gen_lowpart (V2DImode, tmp3));
6705       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6706 
6707       rtx tmp8 = gen_reg_rtx (V2DImode);
6708       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6709 
6710       rtx tmp9 = gen_reg_rtx (V8HImode);
6711       rtx tmp10 = gen_reg_rtx (V8HImode);
6712       rtx tmp11 = gen_reg_rtx (V8HImode);
6713       emit_move_insn (tmp9, gen_lowpart (V8HImode, tmp2));
6714       emit_move_insn (tmp10, gen_lowpart (V8HImode, tmp8));
6715       emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6716 
6717       rtx tmp12 = gen_reg_rtx (V1TImode);
6718       emit_move_insn (tmp12, gen_lowpart (V1TImode, tmp11));
6719       emit_move_insn (operands[0], tmp12);
6720       return;
6721     }
6722 
6723   if (bits == 1)
6724     {
6725       /* Eight operations.  */
6726       rtx tmp1 = gen_reg_rtx (V1TImode);
6727       emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6728 
6729       rtx tmp2 = gen_reg_rtx (V2DImode);
6730       rtx tmp3 = gen_reg_rtx (V2DImode);
6731       emit_move_insn (tmp2, gen_lowpart (V2DImode, op1));
6732       emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6733 
6734       rtx tmp4 = gen_reg_rtx (V2DImode);
6735       rtx tmp5 = gen_reg_rtx (V2DImode);
6736       emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp1));
6737       emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6738 
6739       rtx tmp6 = gen_reg_rtx (V2DImode);
6740       emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
6741 
6742       rtx tmp7 = gen_reg_rtx (V2DImode);
6743       emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
6744 
6745       rtx tmp8 = gen_reg_rtx (V4SImode);
6746       rtx tmp9 = gen_reg_rtx (V4SImode);
6747       emit_move_insn (tmp8, gen_lowpart (V4SImode, tmp7));
6748       emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
6749 
6750       rtx tmp10 = gen_reg_rtx (V2DImode);
6751       rtx tmp11 = gen_reg_rtx (V2DImode);
6752       emit_move_insn (tmp10, gen_lowpart (V2DImode, tmp9));
6753       emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
6754 
6755       rtx tmp12 = gen_reg_rtx (V2DImode);
6756       emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
6757 
6758       rtx tmp13 = gen_reg_rtx (V1TImode);
6759       emit_move_insn (tmp13, gen_lowpart (V1TImode, tmp12));
6760       emit_move_insn (operands[0], tmp13);
6761       return;
6762     }
6763 
6764   if (bits > 64)
6765     {
6766       /* Eight operations.  */
6767       rtx tmp1 = gen_reg_rtx (V4SImode);
6768       rtx tmp2 = gen_reg_rtx (V4SImode);
6769       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6770       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6771 
6772       rtx tmp3 = gen_reg_rtx (V4SImode);
6773       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6774 
6775       rtx tmp4 = gen_reg_rtx (V1TImode);
6776       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6777 
6778       rtx tmp5 = gen_reg_rtx (V2DImode);
6779       rtx tmp6 = gen_reg_rtx (V2DImode);
6780       emit_move_insn (tmp5, gen_lowpart (V2DImode, tmp4));
6781       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
6782 
6783       rtx tmp7 = gen_reg_rtx (V1TImode);
6784       rtx tmp8 = gen_reg_rtx (V1TImode);
6785       emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp3));
6786       emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
6787 
6788       rtx tmp9 = gen_reg_rtx (V2DImode);
6789       rtx tmp10 = gen_reg_rtx (V2DImode);
6790       emit_move_insn (tmp9, gen_lowpart (V2DImode, tmp3));
6791       emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
6792 
6793       rtx tmp11 = gen_reg_rtx (V2DImode);
6794       rtx tmp12 = gen_reg_rtx (V2DImode);
6795       emit_move_insn (tmp11, gen_lowpart (V2DImode, tmp8));
6796       emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
6797 
6798       rtx tmp13 = gen_reg_rtx (V2DImode);
6799       emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
6800 
6801       rtx tmp14 = gen_reg_rtx (V1TImode);
6802       emit_move_insn (tmp14, gen_lowpart (V1TImode, tmp13));
6803       emit_move_insn (operands[0], tmp14);
6804     }
6805   else
6806     {
6807       /* Nine operations.  */
6808       rtx tmp1 = gen_reg_rtx (V4SImode);
6809       rtx tmp2 = gen_reg_rtx (V4SImode);
6810       emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
6811       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6812 
6813       rtx tmp3 = gen_reg_rtx (V4SImode);
6814       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6815 
6816       rtx tmp4 = gen_reg_rtx (V1TImode);
6817       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6818 
6819       rtx tmp5 = gen_reg_rtx (V2DImode);
6820       rtx tmp6 = gen_reg_rtx (V2DImode);
6821       emit_move_insn (tmp5, gen_lowpart (V2DImode, op1));
6822       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
6823 
6824       rtx tmp7 = gen_reg_rtx (V2DImode);
6825       rtx tmp8 = gen_reg_rtx (V2DImode);
6826       emit_move_insn (tmp7, gen_lowpart (V2DImode, tmp4));
6827       emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
6828 
6829       rtx tmp9 = gen_reg_rtx (V2DImode);
6830       emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
6831 
6832       rtx tmp10 = gen_reg_rtx (V1TImode);
6833       rtx tmp11 = gen_reg_rtx (V1TImode);
6834       emit_move_insn (tmp10, gen_lowpart (V1TImode, tmp3));
6835       emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
6836 
6837       rtx tmp12 = gen_reg_rtx (V2DImode);
6838       rtx tmp13 = gen_reg_rtx (V2DImode);
6839       emit_move_insn (tmp12, gen_lowpart (V2DImode, tmp11));
6840       emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
6841 
6842       rtx tmp14 = gen_reg_rtx (V2DImode);
6843       emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
6844 
6845       rtx tmp15 = gen_reg_rtx (V1TImode);
6846       emit_move_insn (tmp15, gen_lowpart (V1TImode, tmp14));
6847       emit_move_insn (operands[0], tmp15);
6848     }
6849 }
6850 
6851 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
6852    DImode for constant loop counts.  */
6853 
6854 static machine_mode
6855 counter_mode (rtx count_exp)
6856 {
6857   if (GET_MODE (count_exp) != VOIDmode)
6858     return GET_MODE (count_exp);
6859   if (!CONST_INT_P (count_exp))
6860     return Pmode;
6861   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
6862     return DImode;
6863   return SImode;
6864 }
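
/* For instance, a constant count of 200 gives SImode, a constant of
   0x100000000 on a 64-bit target gives DImode, and a count already held in a
   register simply keeps that register's mode.  */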
6865 
6866 /* When ISSETMEM is FALSE, output a simple loop that copies memory from SRCPTR
6867    to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
6868    COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
6869    loop that sets memory to VALUE (supposed to be in MODE).
6870 
6871    The size is rounded down to a whole number of chunks moved at once.
6872    SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info.  */
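
/* Illustrative sketch of the generated loop for MODE == SImode, UNROLL == 4
   and ISSETMEM false; the remaining count & 15 bytes are left to the epilogue
   code emitted by the callers:

	size = count & ~15;
	for (iter = 0; iter < size; iter += 16)
	  copy 16 bytes from srcptr + iter to destptr + iter;
	destptr += size;  srcptr += size;  */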
6873 
6874 
6875 static void
6876 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
6877 			       rtx destptr, rtx srcptr, rtx value,
6878 			       rtx count, machine_mode mode, int unroll,
6879 			       int expected_size, bool issetmem)
6880 {
6881   rtx_code_label *out_label, *top_label;
6882   rtx iter, tmp;
6883   machine_mode iter_mode = counter_mode (count);
6884   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
6885   rtx piece_size = GEN_INT (piece_size_n);
6886   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
6887   rtx size;
6888   int i;
6889 
6890   top_label = gen_label_rtx ();
6891   out_label = gen_label_rtx ();
6892   iter = gen_reg_rtx (iter_mode);
6893 
6894   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
6895 			      NULL, 1, OPTAB_DIRECT);
6896   /* Those two should combine.  */
6897   if (piece_size == const1_rtx)
6898     {
6899       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
6900 			       true, out_label);
6901       predict_jump (REG_BR_PROB_BASE * 10 / 100);
6902     }
6903   emit_move_insn (iter, const0_rtx);
6904 
6905   emit_label (top_label);
6906 
6907   tmp = convert_modes (Pmode, iter_mode, iter, true);
6908 
6909   /* This assert could be relaxed - in this case we'll need to compute
6910      the smallest power of two containing PIECE_SIZE_N and pass it to
6911      offset_address.  */
6912   gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
6913   destmem = offset_address (destmem, tmp, piece_size_n);
6914   destmem = adjust_address (destmem, mode, 0);
6915 
6916   if (!issetmem)
6917     {
6918       srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
6919       srcmem = adjust_address (srcmem, mode, 0);
6920 
6921       /* When unrolling for chips that reorder memory reads and writes,
6922 	 we can save registers by using a single temporary.
6923 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
6924       if (!TARGET_64BIT && 0)
6925 	{
6926 	  for (i = 0; i < unroll; i++)
6927 	    {
6928 	      if (i)
6929 		{
6930 		  destmem = adjust_address (copy_rtx (destmem), mode,
6931 					    GET_MODE_SIZE (mode));
6932 		  srcmem = adjust_address (copy_rtx (srcmem), mode,
6933 					   GET_MODE_SIZE (mode));
6934 		}
6935 	      emit_move_insn (destmem, srcmem);
6936 	    }
6937 	}
6938       else
6939 	{
6940 	  rtx tmpreg[4];
6941 	  gcc_assert (unroll <= 4);
6942 	  for (i = 0; i < unroll; i++)
6943 	    {
6944 	      tmpreg[i] = gen_reg_rtx (mode);
6945 	      if (i)
6946 		srcmem = adjust_address (copy_rtx (srcmem), mode,
6947 					 GET_MODE_SIZE (mode));
6948 	      emit_move_insn (tmpreg[i], srcmem);
6949 	    }
6950 	  for (i = 0; i < unroll; i++)
6951 	    {
6952 	      if (i)
6953 		destmem = adjust_address (copy_rtx (destmem), mode,
6954 					  GET_MODE_SIZE (mode));
6955 	      emit_move_insn (destmem, tmpreg[i]);
6956 	    }
6957 	}
6958     }
6959   else
6960     for (i = 0; i < unroll; i++)
6961       {
6962 	if (i)
6963 	  destmem = adjust_address (copy_rtx (destmem), mode,
6964 				    GET_MODE_SIZE (mode));
6965 	emit_move_insn (destmem, value);
6966       }
6967 
6968   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
6969 			     true, OPTAB_LIB_WIDEN);
6970   if (tmp != iter)
6971     emit_move_insn (iter, tmp);
6972 
6973   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
6974 			   true, top_label);
6975   if (expected_size != -1)
6976     {
6977       expected_size /= GET_MODE_SIZE (mode) * unroll;
6978       if (expected_size == 0)
6979 	predict_jump (0);
6980       else if (expected_size > REG_BR_PROB_BASE)
6981 	predict_jump (REG_BR_PROB_BASE - 1);
6982       else
6983         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
6984 		      / expected_size);
6985     }
6986   else
6987     predict_jump (REG_BR_PROB_BASE * 80 / 100);
6988   iter = ix86_zero_extend_to_Pmode (iter);
6989   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
6990 			     true, OPTAB_LIB_WIDEN);
6991   if (tmp != destptr)
6992     emit_move_insn (destptr, tmp);
6993   if (!issetmem)
6994     {
6995       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
6996 				 true, OPTAB_LIB_WIDEN);
6997       if (tmp != srcptr)
6998 	emit_move_insn (srcptr, tmp);
6999     }
7000   emit_label (out_label);
7001 }
7002 
7003 /* Divide COUNTREG by SCALE.  */
7004 static rtx
7005 scale_counter (rtx countreg, int scale)
7006 {
7007   rtx sc;
7008 
7009   if (scale == 1)
7010     return countreg;
7011   if (CONST_INT_P (countreg))
7012     return GEN_INT (INTVAL (countreg) / scale);
7013   gcc_assert (REG_P (countreg));
7014 
7015   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7016 			    GEN_INT (exact_log2 (scale)),
7017 			    NULL, 1, OPTAB_DIRECT);
7018   return sc;
7019 }
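
/* E.g. a constant count of 37 scaled by 4 yields 9 (the leftover bytes are
   handled by the epilogue code), while a count held in a register is scaled
   by a logical shift right of exact_log2 (4) == 2.  */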
7020 
7021 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7022    When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7023    When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7024    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7025    ORIG_VALUE is the original value passed to memset to fill the memory with.
7026    Other arguments have the same meaning as for the previous function.  */
7027 
7028 static void
7029 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7030 			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7031 			   rtx count,
7032 			   machine_mode mode, bool issetmem)
7033 {
7034   rtx destexp;
7035   rtx srcexp;
7036   rtx countreg;
7037   HOST_WIDE_INT rounded_count;
7038 
7039   /* If possible, it is shorter to use rep movs.
7040      TODO: Maybe it is better to move this logic to decide_alg.  */
7041   if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7042       && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7043       && (!issetmem || orig_value == const0_rtx))
7044     mode = SImode;
7045 
7046   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7047     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7048 
7049   countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7050 						       GET_MODE_SIZE (mode)));
7051   if (mode != QImode)
7052     {
7053       destexp = gen_rtx_ASHIFT (Pmode, countreg,
7054 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7055       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7056     }
7057   else
7058     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7059   if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7060     {
7061       rounded_count
7062 	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7063       destmem = shallow_copy_rtx (destmem);
7064       set_mem_size (destmem, rounded_count);
7065     }
7066   else if (MEM_SIZE_KNOWN_P (destmem))
7067     clear_mem_size (destmem);
7068 
7069   if (issetmem)
7070     {
7071       value = force_reg (mode, gen_lowpart (mode, value));
7072       emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7073     }
7074   else
7075     {
7076       if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7077 	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7078       if (mode != QImode)
7079 	{
7080 	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7081 				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7082 	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7083 	}
7084       else
7085 	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7086       if (CONST_INT_P (count))
7087 	{
7088 	  rounded_count
7089 	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7090 	  srcmem = shallow_copy_rtx (srcmem);
7091 	  set_mem_size (srcmem, rounded_count);
7092 	}
7093       else
7094 	{
7095 	  if (MEM_SIZE_KNOWN_P (srcmem))
7096 	    clear_mem_size (srcmem);
7097 	}
7098       emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7099 			      destexp, srcexp));
7100     }
7101 }
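
/* Worked example: clearing 32 bytes with ORIG_VALUE == 0 and a QImode request
   is widened to SImode above (unless the target prefers known-size rep
   movsb/stosb), so COUNTREG holds 32 / 4 == 8 and a single "rep stos" storing
   SImode chunks is emitted.  */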
7102 
7103 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7104    DESTMEM.
7105    SRCMEM is passed by pointer so it can be updated on return.
7106    The return value is the updated DESTMEM.  */
7107 static rtx
7108 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7109 	     HOST_WIDE_INT size_to_move)
7110 {
7111   rtx dst = destmem, src = *srcmem, tempreg;
7112   enum insn_code code;
7113   machine_mode move_mode;
7114   int piece_size, i;
7115 
7116   /* Find the widest mode in which we could perform moves.
7117      Start with the biggest power of 2 not larger than SIZE_TO_MOVE and
7118      halve it until a move of that size is supported.  */
7119   piece_size = 1 << floor_log2 (size_to_move);
7120   while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7121 	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7122     {
7123       gcc_assert (piece_size > 1);
7124       piece_size >>= 1;
7125     }
7126 
7127   /* Find the corresponding vector mode with the same size as MOVE_MODE.
7128      MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
7129   if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7130     {
7131       int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7132       if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7133 	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7134 	{
7135 	  move_mode = word_mode;
7136 	  piece_size = GET_MODE_SIZE (move_mode);
7137 	  code = optab_handler (mov_optab, move_mode);
7138 	}
7139     }
7140   gcc_assert (code != CODE_FOR_nothing);
7141 
7142   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7143   src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7144 
7145   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
7146   gcc_assert (size_to_move % piece_size == 0);
7147 
7148   for (i = 0; i < size_to_move; i += piece_size)
7149     {
7150       /* We move from memory to memory, so we'll need to do it via
7151 	 a temporary register.  */
7152       tempreg = gen_reg_rtx (move_mode);
7153       emit_insn (GEN_FCN (code) (tempreg, src));
7154       emit_insn (GEN_FCN (code) (dst, tempreg));
7155 
7156       emit_move_insn (destptr,
7157 		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
7158       emit_move_insn (srcptr,
7159 		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7160 
7161       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7162 					  piece_size);
7163       src = adjust_automodify_address_nv (src, move_mode, srcptr,
7164 					  piece_size);
7165     }
7166 
7167   /* Update DST and SRC rtx.  */
7168   *srcmem = src;
7169   return dst;
7170 }
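
/* For example, SIZE_TO_MOVE == 16 on a 64-bit target with SSE2 first picks
   TImode and then its vector counterpart V2DImode, emitting one 16-byte load
   into a temporary register and one 16-byte store, with both pointers
   advanced by 16.  */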
7171 
7172 /* Helper function for the string operations below.  Test VARIABLE & VALUE
7173    and branch to the returned label when the result is zero.  */
7174 
7175 static rtx_code_label *
7176 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7177 {
7178   rtx_code_label *label = gen_label_rtx ();
7179   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7180   if (GET_MODE (variable) == DImode)
7181     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7182   else
7183     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7184   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7185 			   1, label);
7186   if (epilogue)
7187     predict_jump (REG_BR_PROB_BASE * 50 / 100);
7188   else
7189     predict_jump (REG_BR_PROB_BASE * 90 / 100);
7190   return label;
7191 }
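
/* E.g. ix86_expand_aligntest (destptr, 4, false) tests DESTPTR & 4: the code
   emitted between this call and the returned label runs only when that bit is
   set, otherwise the branch skips straight to the label.  */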
7192 
7193 
7194 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
7195 
7196 static void
7197 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7198 			rtx destptr, rtx srcptr, rtx count, int max_size)
7199 {
7200   rtx src, dest;
7201   if (CONST_INT_P (count))
7202     {
7203       HOST_WIDE_INT countval = INTVAL (count);
7204       HOST_WIDE_INT epilogue_size = countval % max_size;
7205       int i;
7206 
7207       /* For now MAX_SIZE should be a power of 2.  This assert could be
7208 	 relaxed, but it'll require a bit more complicated epilogue
7209 	 expanding.  */
7210       gcc_assert ((max_size & (max_size - 1)) == 0);
7211       for (i = max_size; i >= 1; i >>= 1)
7212 	{
7213 	  if (epilogue_size & i)
7214 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7215 	}
7216       return;
7217     }
7218   if (max_size > 8)
7219     {
7220       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7221 				    count, 1, OPTAB_DIRECT);
7222       expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7223 				     count, QImode, 1, 4, false);
7224       return;
7225     }
7226 
7227   /* When single string operations are available (TARGET_SINGLE_STRINGOP),
7228      we can cheaply increase dest and src pointers.  Otherwise we save code
7229      size by maintaining an offset (zero is readily available from the
7230      preceding rep operation) and using x86 addressing modes.  */
7231   if (TARGET_SINGLE_STRINGOP)
7232     {
7233       if (max_size > 4)
7234 	{
7235 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7236 	  src = change_address (srcmem, SImode, srcptr);
7237 	  dest = change_address (destmem, SImode, destptr);
7238 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
7239 	  emit_label (label);
7240 	  LABEL_NUSES (label) = 1;
7241 	}
7242       if (max_size > 2)
7243 	{
7244 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7245 	  src = change_address (srcmem, HImode, srcptr);
7246 	  dest = change_address (destmem, HImode, destptr);
7247 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
7248 	  emit_label (label);
7249 	  LABEL_NUSES (label) = 1;
7250 	}
7251       if (max_size > 1)
7252 	{
7253 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7254 	  src = change_address (srcmem, QImode, srcptr);
7255 	  dest = change_address (destmem, QImode, destptr);
7256 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
7257 	  emit_label (label);
7258 	  LABEL_NUSES (label) = 1;
7259 	}
7260     }
7261   else
7262     {
7263       rtx offset = force_reg (Pmode, const0_rtx);
7264       rtx tmp;
7265 
7266       if (max_size > 4)
7267 	{
7268 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7269 	  src = change_address (srcmem, SImode, srcptr);
7270 	  dest = change_address (destmem, SImode, destptr);
7271 	  emit_move_insn (dest, src);
7272 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7273 				     true, OPTAB_LIB_WIDEN);
7274 	  if (tmp != offset)
7275 	    emit_move_insn (offset, tmp);
7276 	  emit_label (label);
7277 	  LABEL_NUSES (label) = 1;
7278 	}
7279       if (max_size > 2)
7280 	{
7281 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7282 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7283 	  src = change_address (srcmem, HImode, tmp);
7284 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7285 	  dest = change_address (destmem, HImode, tmp);
7286 	  emit_move_insn (dest, src);
7287 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7288 				     true, OPTAB_LIB_WIDEN);
7289 	  if (tmp != offset)
7290 	    emit_move_insn (offset, tmp);
7291 	  emit_label (label);
7292 	  LABEL_NUSES (label) = 1;
7293 	}
7294       if (max_size > 1)
7295 	{
7296 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7297 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7298 	  src = change_address (srcmem, QImode, tmp);
7299 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7300 	  dest = change_address (destmem, QImode, tmp);
7301 	  emit_move_insn (dest, src);
7302 	  emit_label (label);
7303 	  LABEL_NUSES (label) = 1;
7304 	}
7305     }
7306 }
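
/* For a constant COUNT of 23 and MAX_SIZE of 16 the epilogue size is
   23 % 16 == 7, so the code above emits one 4-byte, one 2-byte and one
   1-byte copy via emit_memmov.  */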
7307 
7308 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7309    with value PROMOTED_VAL.
7310    DESTPTR is advanced past the stored bytes.
7311    The return value is the updated DESTMEM.  */
7312 static rtx
7313 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7314 	     HOST_WIDE_INT size_to_move)
7315 {
7316   rtx dst = destmem;
7317   enum insn_code code;
7318   machine_mode move_mode;
7319   int piece_size, i;
7320 
7321   /* Find the widest mode in which we could perform moves.
7322      Start with the biggest power of 2 less than SIZE_TO_MOVE and half
7323      it until move of such size is supported.  */
7324   move_mode = GET_MODE (promoted_val);
7325   if (move_mode == VOIDmode)
7326     move_mode = QImode;
7327   if (size_to_move < GET_MODE_SIZE (move_mode))
7328     {
7329       unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7330       move_mode = int_mode_for_size (move_bits, 0).require ();
7331       promoted_val = gen_lowpart (move_mode, promoted_val);
7332     }
7333   piece_size = GET_MODE_SIZE (move_mode);
7334   code = optab_handler (mov_optab, move_mode);
7335   gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7336 
7337   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7338 
7339   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
7340   gcc_assert (size_to_move % piece_size == 0);
7341 
7342   for (i = 0; i < size_to_move; i += piece_size)
7343     {
7344       if (piece_size <= GET_MODE_SIZE (word_mode))
7345 	{
7346 	  emit_insn (gen_strset (destptr, dst, promoted_val));
7347 	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7348 					      piece_size);
7349 	  continue;
7350 	}
7351 
7352       emit_insn (GEN_FCN (code) (dst, promoted_val));
7353 
7354       emit_move_insn (destptr,
7355 		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
7356 
7357       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7358 					  piece_size);
7359     }
7360 
7361   /* Update DST rtx.  */
7362   return dst;
7363 }
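
/* E.g. a DImode PROMOTED_VAL with SIZE_TO_MOVE == 16 on a 64-bit target
   emits two 8-byte gen_strset stores, each advancing DESTPTR by 8.  */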
7364 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7365 static void
7366 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7367 				 rtx count, int max_size)
7368 {
7369   count = expand_simple_binop (counter_mode (count), AND, count,
7370 			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7371   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7372 				 gen_lowpart (QImode, value), count, QImode,
7373 				 1, max_size / 2, true);
7374 }
7375 
7376 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7377 static void
7378 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7379 			rtx count, int max_size)
7380 {
7381   rtx dest;
7382 
7383   if (CONST_INT_P (count))
7384     {
7385       HOST_WIDE_INT countval = INTVAL (count);
7386       HOST_WIDE_INT epilogue_size = countval % max_size;
7387       int i;
7388 
7389       /* For now MAX_SIZE should be a power of 2.  This assert could be
7390 	 relaxed, but it'll require a bit more complicated epilogue
7391 	 expanding.  */
7392       gcc_assert ((max_size & (max_size - 1)) == 0);
7393       for (i = max_size; i >= 1; i >>= 1)
7394 	{
7395 	  if (epilogue_size & i)
7396 	    {
7397 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7398 		destmem = emit_memset (destmem, destptr, vec_value, i);
7399 	      else
7400 		destmem = emit_memset (destmem, destptr, value, i);
7401 	    }
7402 	}
7403       return;
7404     }
7405   if (max_size > 32)
7406     {
7407       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7408       return;
7409     }
7410   if (max_size > 16)
7411     {
7412       rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7413       if (TARGET_64BIT)
7414 	{
7415 	  dest = change_address (destmem, DImode, destptr);
7416 	  emit_insn (gen_strset (destptr, dest, value));
7417 	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7418 	  emit_insn (gen_strset (destptr, dest, value));
7419 	}
7420       else
7421 	{
7422 	  dest = change_address (destmem, SImode, destptr);
7423 	  emit_insn (gen_strset (destptr, dest, value));
7424 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7425 	  emit_insn (gen_strset (destptr, dest, value));
7426 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7427 	  emit_insn (gen_strset (destptr, dest, value));
7428 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7429 	  emit_insn (gen_strset (destptr, dest, value));
7430 	}
7431       emit_label (label);
7432       LABEL_NUSES (label) = 1;
7433     }
7434   if (max_size > 8)
7435     {
7436       rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7437       if (TARGET_64BIT)
7438 	{
7439 	  dest = change_address (destmem, DImode, destptr);
7440 	  emit_insn (gen_strset (destptr, dest, value));
7441 	}
7442       else
7443 	{
7444 	  dest = change_address (destmem, SImode, destptr);
7445 	  emit_insn (gen_strset (destptr, dest, value));
7446 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7447 	  emit_insn (gen_strset (destptr, dest, value));
7448 	}
7449       emit_label (label);
7450       LABEL_NUSES (label) = 1;
7451     }
7452   if (max_size > 4)
7453     {
7454       rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7455       dest = change_address (destmem, SImode, destptr);
7456       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7457       emit_label (label);
7458       LABEL_NUSES (label) = 1;
7459     }
7460   if (max_size > 2)
7461     {
7462       rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7463       dest = change_address (destmem, HImode, destptr);
7464       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7465       emit_label (label);
7466       LABEL_NUSES (label) = 1;
7467     }
7468   if (max_size > 1)
7469     {
7470       rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7471       dest = change_address (destmem, QImode, destptr);
7472       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7473       emit_label (label);
7474       LABEL_NUSES (label) = 1;
7475     }
7476 }
7477 
7478 /* Adjust COUNTER by the VALUE.  */
7479 static void
7480 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7481 {
7482   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7483 }
7484 
7485 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM, or store enough
7486    into DESTMEM, to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
7487    Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7488    ignored.
7489    Return value is updated DESTMEM.  */
7490 
7491 static rtx
7492 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7493 				  rtx destptr, rtx srcptr, rtx value,
7494 				  rtx vec_value, rtx count, int align,
7495 				  int desired_alignment, bool issetmem)
7496 {
7497   int i;
7498   for (i = 1; i < desired_alignment; i <<= 1)
7499     {
7500       if (align <= i)
7501 	{
7502 	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7503 	  if (issetmem)
7504 	    {
7505 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7506 		destmem = emit_memset (destmem, destptr, vec_value, i);
7507 	      else
7508 		destmem = emit_memset (destmem, destptr, value, i);
7509 	    }
7510 	  else
7511 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7512 	  ix86_adjust_counter (count, i);
7513 	  emit_label (label);
7514 	  LABEL_NUSES (label) = 1;
7515 	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7516 	}
7517     }
7518   return destmem;
7519 }
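
/* With ALIGN == 1 and DESIRED_ALIGN == 8 this emits three conditional fixups
   of 1, 2 and 4 bytes, each guarded by a test of the corresponding bit of
   DESTPTR, and COUNT is decreased by the bytes actually handled.  */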
7520 
7521 /* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
7522    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7523    and jump to DONE_LABEL.  */
7524 static void
7525 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7526 			       rtx destptr, rtx srcptr,
7527 			       rtx value, rtx vec_value,
7528 			       rtx count, int size,
7529 			       rtx done_label, bool issetmem)
7530 {
7531   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7532   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7533   rtx modesize;
7534   int n;
7535 
7536   /* If we do not have a vector value to copy, we must reduce the size.  */
7537   if (issetmem)
7538     {
7539       if (!vec_value)
7540 	{
7541 	  if (GET_MODE (value) == VOIDmode && size > 8)
7542 	    mode = Pmode;
7543 	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7544 	    mode = GET_MODE (value);
7545 	}
7546       else
7547 	mode = GET_MODE (vec_value), value = vec_value;
7548     }
7549   else
7550     {
7551       /* Choose appropriate vector mode.  */
7552       if (size >= 32)
7553 	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7554       else if (size >= 16)
7555 	mode = TARGET_SSE ? V16QImode : DImode;
7556       srcmem = change_address (srcmem, mode, srcptr);
7557     }
7558   destmem = change_address (destmem, mode, destptr);
7559   modesize = GEN_INT (GET_MODE_SIZE (mode));
7560   gcc_assert (GET_MODE_SIZE (mode) <= size);
7561   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7562     {
7563       if (issetmem)
7564 	emit_move_insn (destmem, gen_lowpart (mode, value));
7565       else
7566 	{
7567           emit_move_insn (destmem, srcmem);
7568           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7569 	}
7570       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7571     }
7572 
7573   destmem = offset_address (destmem, count, 1);
7574   destmem = offset_address (destmem, GEN_INT (-2 * size),
7575 			    GET_MODE_SIZE (mode));
7576   if (!issetmem)
7577     {
7578       srcmem = offset_address (srcmem, count, 1);
7579       srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7580 			       GET_MODE_SIZE (mode));
7581     }
7582   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7583     {
7584       if (issetmem)
7585 	emit_move_insn (destmem, gen_lowpart (mode, value));
7586       else
7587 	{
7588 	  emit_move_insn (destmem, srcmem);
7589 	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7590 	}
7591       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7592     }
7593   emit_jump_insn (gen_jump (done_label));
7594   emit_barrier ();
7595 
7596   emit_label (label);
7597   LABEL_NUSES (label) = 1;
7598 }
7599 
7600 /* Handle a small memcpy (up to SIZE, assumed to be a small power of 2)
7601    and get ready for the main memcpy loop by copying the initial
7602    DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
7603    DESTPTR/SRCPTR/COUNT so we can proceed with a loop copying SIZE bytes
7604    at once.  Do moves in MODE.  DONE_LABEL is a label after the whole
7605    copying sequence; it is created on demand if *DONE_LABEL is NULL.
7606    MIN_SIZE is the minimal size of the copied block.  This value gets
7607    adjusted for the new bounds after the initial copies.
7608 
7609    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7610    DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates
7611    whether we will dispatch to a library call for large blocks.
7612 
7613    In pseudocode we do:
7614 
7615    if (COUNT < SIZE)
7616      {
7617        Assume that SIZE is 4. Bigger sizes are handled analogously
7618        if (COUNT & 4)
7619 	 {
7620 	    copy 4 bytes from SRCPTR to DESTPTR
7621 	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7622 	    goto done_label
7623 	 }
7624        if (!COUNT)
7625 	 goto done_label;
7626        copy 1 byte from SRCPTR to DESTPTR
7627        if (COUNT & 2)
7628 	 {
7629 	    copy 2 bytes from SRCPTR to DESTPTR
7630 	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7631 	 }
7632      }
7633    else
7634      {
7635        copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7636        copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7637 
7638        OLD_DESPTR = DESTPTR;
7639        Align DESTPTR up to DESIRED_ALIGN
7640        SRCPTR += DESTPTR - OLD_DESTPTR
7641        COUNT -= DEST_PTR - OLD_DESTPTR
7642        if (DYNAMIC_CHECK)
7643 	 Round COUNT down to multiple of SIZE
7644        << optional caller supplied zero size guard is here >>
7645        << optional caller supplied dynamic check is here >>
7646        << caller supplied main copy loop is here >>
7647      }
7648    done_label:
7649   */
7650 static void
7651 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7652 							    rtx *destptr, rtx *srcptr,
7653 							    machine_mode mode,
7654 							    rtx value, rtx vec_value,
7655 							    rtx *count,
7656 							    rtx_code_label **done_label,
7657 							    int size,
7658 							    int desired_align,
7659 							    int align,
7660 							    unsigned HOST_WIDE_INT *min_size,
7661 							    bool dynamic_check,
7662 							    bool issetmem)
7663 {
7664   rtx_code_label *loop_label = NULL, *label;
7665   int n;
7666   rtx modesize;
7667   int prolog_size = 0;
7668   rtx mode_value;
7669 
7670   /* Choose the proper value to copy.  */
7671   if (issetmem && VECTOR_MODE_P (mode))
7672     mode_value = vec_value;
7673   else
7674     mode_value = value;
7675   gcc_assert (GET_MODE_SIZE (mode) <= size);
7676 
7677   /* See if block is big or small, handle small blocks.  */
7678   if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7679     {
7680       int size2 = size;
7681       loop_label = gen_label_rtx ();
7682 
7683       if (!*done_label)
7684 	*done_label = gen_label_rtx ();
7685 
7686       emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7687 			       1, loop_label);
7688       size2 >>= 1;
7689 
7690       /* Handle sizes > 3.  */
7691       for (;size2 > 2; size2 >>= 1)
7692 	expand_small_cpymem_or_setmem (destmem, srcmem,
7693 				       *destptr, *srcptr,
7694 				       value, vec_value,
7695 				       *count,
7696 				       size2, *done_label, issetmem);
7697       /* Nothing to copy?  Jump to DONE_LABEL if so */
7698       emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7699 			       1, *done_label);
7700 
7701       /* Do a byte copy.  */
7702       destmem = change_address (destmem, QImode, *destptr);
7703       if (issetmem)
7704 	emit_move_insn (destmem, gen_lowpart (QImode, value));
7705       else
7706 	{
7707           srcmem = change_address (srcmem, QImode, *srcptr);
7708           emit_move_insn (destmem, srcmem);
7709 	}
7710 
7711       /* Handle sizes 2 and 3.  */
7712       label = ix86_expand_aligntest (*count, 2, false);
7713       destmem = change_address (destmem, HImode, *destptr);
7714       destmem = offset_address (destmem, *count, 1);
7715       destmem = offset_address (destmem, GEN_INT (-2), 2);
7716       if (issetmem)
7717         emit_move_insn (destmem, gen_lowpart (HImode, value));
7718       else
7719 	{
7720 	  srcmem = change_address (srcmem, HImode, *srcptr);
7721 	  srcmem = offset_address (srcmem, *count, 1);
7722 	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
7723 	  emit_move_insn (destmem, srcmem);
7724 	}
7725 
7726       emit_label (label);
7727       LABEL_NUSES (label) = 1;
7728       emit_jump_insn (gen_jump (*done_label));
7729       emit_barrier ();
7730     }
7731   else
7732     gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7733 		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7734 
7735   /* Start memcpy for COUNT >= SIZE.  */
7736   if (loop_label)
7737     {
7738        emit_label (loop_label);
7739        LABEL_NUSES (loop_label) = 1;
7740     }
7741 
7742   /* Copy first desired_align bytes.  */
7743   if (!issetmem)
7744     srcmem = change_address (srcmem, mode, *srcptr);
7745   destmem = change_address (destmem, mode, *destptr);
7746   modesize = GEN_INT (GET_MODE_SIZE (mode));
7747   for (n = 0; prolog_size < desired_align - align; n++)
7748     {
7749       if (issetmem)
7750         emit_move_insn (destmem, mode_value);
7751       else
7752 	{
7753           emit_move_insn (destmem, srcmem);
7754           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7755 	}
7756       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7757       prolog_size += GET_MODE_SIZE (mode);
7758     }
7759 
7760 
7761   /* Copy last SIZE bytes.  */
7762   destmem = offset_address (destmem, *count, 1);
7763   destmem = offset_address (destmem,
7764 			    GEN_INT (-size - prolog_size),
7765 			    1);
7766   if (issetmem)
7767     emit_move_insn (destmem, mode_value);
7768   else
7769     {
7770       srcmem = offset_address (srcmem, *count, 1);
7771       srcmem = offset_address (srcmem,
7772 			       GEN_INT (-size - prolog_size),
7773 			       1);
7774       emit_move_insn (destmem, srcmem);
7775     }
7776   for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
7777     {
7778       destmem = offset_address (destmem, modesize, 1);
7779       if (issetmem)
7780 	emit_move_insn (destmem, mode_value);
7781       else
7782 	{
7783           srcmem = offset_address (srcmem, modesize, 1);
7784           emit_move_insn (destmem, srcmem);
7785 	}
7786     }
7787 
7788   /* Align destination.  */
7789   if (desired_align > 1 && desired_align > align)
7790     {
7791       rtx saveddest = *destptr;
7792 
7793       gcc_assert (desired_align <= size);
7794       /* Align destptr up, place it to new register.  */
7795       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
7796 				      GEN_INT (prolog_size),
7797 				      NULL_RTX, 1, OPTAB_DIRECT);
7798       if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
7799 	REG_POINTER (*destptr) = 1;
7800       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
7801 				      GEN_INT (-desired_align),
7802 				      *destptr, 1, OPTAB_DIRECT);
7803       /* See how many bytes we skipped.  */
7804       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
7805 				       *destptr,
7806 				       saveddest, 1, OPTAB_DIRECT);
7807       /* Adjust srcptr and count.  */
7808       if (!issetmem)
7809 	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
7810 				       saveddest, *srcptr, 1, OPTAB_DIRECT);
7811       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7812 				    saveddest, *count, 1, OPTAB_DIRECT);
7813       /* We copied at most size + prolog_size.  */
7814       if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
7815 	*min_size
7816 	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
7817       else
7818 	*min_size = 0;
7819 
7820       /* Our loops always round down the block size, but for dispatch to the
7821          library we need the precise value.  */
7822       if (dynamic_check)
7823 	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
7824 				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
7825     }
7826   else
7827     {
7828       gcc_assert (prolog_size == 0);
7829       /* Decrease count, so we won't end up copying last word twice.  */
7830       if (!CONST_INT_P (*count))
7831 	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7832 				      constm1_rtx, *count, 1, OPTAB_DIRECT);
7833       else
7834 	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
7835 				      (unsigned HOST_WIDE_INT)size));
7836       if (*min_size)
7837 	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
7838     }
7839 }
7840 
7841 
7842 /* This function is like the previous one, except here we know how many bytes
7843    need to be copied.  That allows us to update alignment not only of DST, which
7844    is returned, but also of SRC, which is passed as a pointer for that
7845    reason.  */
7846 static rtx
7847 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
7848 					   rtx srcreg, rtx value, rtx vec_value,
7849 					   int desired_align, int align_bytes,
7850 					   bool issetmem)
7851 {
7852   rtx src = NULL;
7853   rtx orig_dst = dst;
7854   rtx orig_src = NULL;
7855   int piece_size = 1;
7856   int copied_bytes = 0;
7857 
7858   if (!issetmem)
7859     {
7860       gcc_assert (srcp != NULL);
7861       src = *srcp;
7862       orig_src = src;
7863     }
7864 
7865   for (piece_size = 1;
7866        piece_size <= desired_align && copied_bytes < align_bytes;
7867        piece_size <<= 1)
7868     {
7869       if (align_bytes & piece_size)
7870 	{
7871 	  if (issetmem)
7872 	    {
7873 	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
7874 		dst = emit_memset (dst, destreg, vec_value, piece_size);
7875 	      else
7876 		dst = emit_memset (dst, destreg, value, piece_size);
7877 	    }
7878 	  else
7879 	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
7880 	  copied_bytes += piece_size;
7881 	}
7882     }
7883   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
7884     set_mem_align (dst, desired_align * BITS_PER_UNIT);
7885   if (MEM_SIZE_KNOWN_P (orig_dst))
7886     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
7887 
7888   if (!issetmem)
7889     {
7890       int src_align_bytes = get_mem_align_offset (src, desired_align
7891 						       * BITS_PER_UNIT);
7892       if (src_align_bytes >= 0)
7893 	src_align_bytes = desired_align - src_align_bytes;
7894       if (src_align_bytes >= 0)
7895 	{
7896 	  unsigned int src_align;
7897 	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
7898 	    {
7899 	      if ((src_align_bytes & (src_align - 1))
7900 		   == (align_bytes & (src_align - 1)))
7901 		break;
7902 	    }
7903 	  if (src_align > (unsigned int) desired_align)
7904 	    src_align = desired_align;
7905 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
7906 	    set_mem_align (src, src_align * BITS_PER_UNIT);
7907 	}
7908       if (MEM_SIZE_KNOWN_P (orig_src))
7909 	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
7910       *srcp = src;
7911     }
7912 
7913   return dst;
7914 }
7915 
7916 /* Return true if ALG can be used in current context.
7917    Assume we expand memset if MEMSET is true.  */
7918 static bool
7919 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
7920 {
7921   if (alg == no_stringop)
7922     return false;
7923   if (alg == vector_loop)
7924     return TARGET_SSE || TARGET_AVX;
7925   /* Algorithms using the rep prefix want at least edi and ecx;
7926      additionally, memset wants eax and memcpy wants esi.  Don't
7927      consider such algorithms if the user has appropriated those
7928      registers for their own purposes, or if we have a non-default
7929      address space, since some string insns cannot override the segment.  */
7930   if (alg == rep_prefix_1_byte
7931       || alg == rep_prefix_4_byte
7932       || alg == rep_prefix_8_byte)
7933     {
7934       if (have_as)
7935 	return false;
7936       if (fixed_regs[CX_REG]
7937 	  || fixed_regs[DI_REG]
7938 	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
7939 	return false;
7940     }
7941   return true;
7942 }
7943 
7944 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
7945 static enum stringop_alg
7946 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
7947 	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
7948 	    bool memset, bool zero_memset, bool have_as,
7949 	    int *dynamic_check, bool *noalign, bool recur)
7950 {
7951   const struct stringop_algs *algs;
7952   bool optimize_for_speed;
7953   int max = 0;
7954   const struct processor_costs *cost;
7955   int i;
7956   bool any_alg_usable_p = false;
7957 
7958   *noalign = false;
7959   *dynamic_check = -1;
7960 
7961   /* Even if the string operation call is cold, we still might spend a lot
7962      of time processing large blocks.  */
7963   if (optimize_function_for_size_p (cfun)
7964       || (optimize_insn_for_size_p ()
7965  	  && (max_size < 256
7966               || (expected_size != -1 && expected_size < 256))))
7967     optimize_for_speed = false;
7968   else
7969     optimize_for_speed = true;
7970 
7971   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
7972   if (memset)
7973     algs = &cost->memset[TARGET_64BIT != 0];
7974   else
7975     algs = &cost->memcpy[TARGET_64BIT != 0];
7976 
7977   /* See maximal size for user defined algorithm.  */
7978   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
7979     {
7980       enum stringop_alg candidate = algs->size[i].alg;
7981       bool usable = alg_usable_p (candidate, memset, have_as);
7982       any_alg_usable_p |= usable;
7983 
7984       if (candidate != libcall && candidate && usable)
7985 	max = algs->size[i].max;
7986     }
7987 
7988   /* If the expected size is not known but the max size is small enough
7989      that the inline version is a win, set the expected size into
7990      the range.  */
7991   if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
7992       && expected_size == -1)
7993     expected_size = min_size / 2 + max_size / 2;
7994 
7995   /* If user specified the algorithm, honor it if possible.  */
7996   if (ix86_stringop_alg != no_stringop
7997       && alg_usable_p (ix86_stringop_alg, memset, have_as))
7998     return ix86_stringop_alg;
7999   /* rep; movq or rep; movl is the smallest variant.  */
8000   else if (!optimize_for_speed)
8001     {
8002       *noalign = true;
8003       if (!count || (count & 3) || (memset && !zero_memset))
8004 	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8005 	       ? rep_prefix_1_byte : loop_1_byte;
8006       else
8007 	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8008 	       ? rep_prefix_4_byte : loop;
8009     }
8010   /* Very tiny blocks are best handled via the loop; REP is expensive to
8011      set up.  */
8012   else if (expected_size != -1 && expected_size < 4)
8013     return loop_1_byte;
8014   else if (expected_size != -1)
8015     {
8016       enum stringop_alg alg = libcall;
8017       bool alg_noalign = false;
8018       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8019 	{
8020 	  /* We get here if the algorithms that were not libcall-based
8021 	     were rep-prefix based and we are unable to use rep prefixes
8022 	     based on global register usage.  Break out of the loop and
8023 	     use the heuristic below.  */
8024 	  if (algs->size[i].max == 0)
8025 	    break;
8026 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8027 	    {
8028 	      enum stringop_alg candidate = algs->size[i].alg;
8029 
8030 	      if (candidate != libcall
8031 		  && alg_usable_p (candidate, memset, have_as))
8032 		{
8033 		  alg = candidate;
8034 		  alg_noalign = algs->size[i].noalign;
8035 		}
8036 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8037 		 last non-libcall inline algorithm.  */
8038 	      if (TARGET_INLINE_ALL_STRINGOPS)
8039 		{
8040 		  /* When the current size is best to be copied by a libcall,
8041 		     but we are still forced to inline, run the heuristic below
8042 		     that will pick code for medium sized blocks.  */
8043 		  if (alg != libcall)
8044 		    {
8045 		      *noalign = alg_noalign;
8046 		      return alg;
8047 		    }
8048 		  else if (!any_alg_usable_p)
8049 		    break;
8050 		}
8051 	      else if (alg_usable_p (candidate, memset, have_as)
8052 		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8053 			    && candidate == rep_prefix_1_byte
8054 			    /* NB: If min_size != max_size, size is
8055 			       unknown.  */
8056 			    && min_size != max_size))
8057 		{
8058 		  *noalign = algs->size[i].noalign;
8059 		  return candidate;
8060 		}
8061 	    }
8062 	}
8063     }
8064   /* When asked to inline the call anyway, try to pick a meaningful choice.
8065      We look for the maximal size of block that is faster to copy by hand
8066      and take blocks of at most that size, guessing that the average size
8067      will be roughly half of the maximum.
8068 
8069      If this turns out to be bad, we might simply specify the preferred
8070      choice in ix86_costs.  */
8071   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8072       && (algs->unknown_size == libcall
8073 	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
8074     {
8075       enum stringop_alg alg;
8076       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8077 
8078       /* If there aren't any usable algorithms or if recursing already,
8079 	 then recursing on smaller sizes or same size isn't going to
8080 	 find anything.  Just return the simple byte-at-a-time copy loop.  */
8081       if (!any_alg_usable_p || recur)
8082 	{
8083 	  /* Pick something reasonable.  */
8084 	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8085 	    *dynamic_check = 128;
8086 	  return loop_1_byte;
8087 	}
8088       alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8089 			zero_memset, have_as, dynamic_check, noalign, true);
8090       gcc_assert (*dynamic_check == -1);
8091       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8092 	*dynamic_check = max;
8093       else
8094 	gcc_assert (alg != libcall);
8095       return alg;
8096     }
8097   return (alg_usable_p (algs->unknown_size, memset, have_as)
8098 	  ? algs->unknown_size : libcall);
8099 }
8100 
8101 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
8102    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
8103 static int
8104 decide_alignment (int align,
8105 		  enum stringop_alg alg,
8106 		  int expected_size,
8107 		  machine_mode move_mode)
8108 {
8109   int desired_align = 0;
8110 
8111   gcc_assert (alg != no_stringop);
8112 
8113   if (alg == libcall)
8114     return 0;
8115   if (move_mode == VOIDmode)
8116     return 0;
8117 
8118   desired_align = GET_MODE_SIZE (move_mode);
8119   /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
8120      copying a whole cache line at once.  */
8121   if (TARGET_CPU_P (PENTIUMPRO)
8122       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8123     desired_align = 8;
8124 
8125   if (optimize_size)
8126     desired_align = 1;
8127   if (desired_align < align)
8128     desired_align = align;
8129   if (expected_size != -1 && expected_size < 4)
8130     desired_align = align;
8131 
8132   return desired_align;
8133 }
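
/* For instance, a vector_loop moving V16QImode chunks requests 16-byte
   destination alignment here, while under optimize_size the request drops
   to 1 (but never below the already-known ALIGN).  */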
8134 
8135 
8136 /* Helper function for memset.  For a QImode value 0xXY produce
8137    0xXYXYXYXY of the width specified by MODE.  This is essentially
8138    a * 0x01010101, but we can do slightly better than
8139    synth_mult by unwinding the sequence by hand on CPUs with
8140    slow multiply.  */
8141 static rtx
8142 promote_duplicated_reg (machine_mode mode, rtx val)
8143 {
8144   machine_mode valmode = GET_MODE (val);
8145   rtx tmp;
8146   int nops = mode == DImode ? 3 : 2;
8147 
8148   gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8149   if (val == const0_rtx)
8150     return copy_to_mode_reg (mode, CONST0_RTX (mode));
8151   if (CONST_INT_P (val))
8152     {
8153       HOST_WIDE_INT v = INTVAL (val) & 255;
8154 
8155       v |= v << 8;
8156       v |= v << 16;
8157       if (mode == DImode)
8158         v |= (v << 16) << 16;
8159       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8160     }
8161 
8162   if (valmode == VOIDmode)
8163     valmode = QImode;
8164   if (valmode != QImode)
8165     val = gen_lowpart (QImode, val);
8166   if (mode == QImode)
8167     return val;
8168   if (!TARGET_PARTIAL_REG_STALL)
8169     nops--;
8170   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8171       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8172       <= (ix86_cost->shift_const + ix86_cost->add) * nops
8173           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8174     {
8175       rtx reg = convert_modes (mode, QImode, val, true);
8176       tmp = promote_duplicated_reg (mode, const1_rtx);
8177       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8178 				  OPTAB_DIRECT);
8179     }
8180   else
8181     {
8182       rtx reg = convert_modes (mode, QImode, val, true);
8183 
8184       if (!TARGET_PARTIAL_REG_STALL)
8185 	emit_insn (gen_insv_1 (mode, reg, reg));
8186       else
8187 	{
8188 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8189 				     NULL, 1, OPTAB_DIRECT);
8190 	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8191 				     OPTAB_DIRECT);
8192 	}
8193       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8194 			         NULL, 1, OPTAB_DIRECT);
8195       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8196       if (mode == SImode)
8197 	return reg;
8198       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8199 				 NULL, 1, OPTAB_DIRECT);
8200       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8201       return reg;
8202     }
8203 }
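
/* Worked example: VAL == 0xAB promoted to SImode gives
     0xAB   | 0xAB << 8    == 0xABAB
     0xABAB | 0xABAB << 16 == 0xABABABAB
   either directly on the constant path above or, for a non-constant VAL,
   via the shift-and-IOR (or multiply) sequence.  */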
8204 
8205 /* Duplicate value VAL using promote_duplicated_reg into the maximal size
8206    needed by the main loop copying SIZE_NEEDED chunks and by the prologue
8207    raising alignment from ALIGN to DESIRED_ALIGN.  */
8208 static rtx
8209 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8210 				int align)
8211 {
8212   rtx promoted_val;
8213 
8214   if (TARGET_64BIT
8215       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8216     promoted_val = promote_duplicated_reg (DImode, val);
8217   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8218     promoted_val = promote_duplicated_reg (SImode, val);
8219   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8220     promoted_val = promote_duplicated_reg (HImode, val);
8221   else
8222     promoted_val = val;
8223 
8224   return promoted_val;
8225 }
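
/* E.g. a main loop copying 8-byte chunks on a 64-bit target gets VAL
   duplicated into a full DImode pattern, whereas a 2-byte loop that needs
   no extra alignment only needs an HImode duplicate.  */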
8226 
8227 /* Copy the address to a Pmode register.  This is used for x32 to
8228    truncate DImode TLS address to a SImode register. */
8229 
8230 static rtx
8231 ix86_copy_addr_to_reg (rtx addr)
8232 {
8233   rtx reg;
8234   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8235     {
8236       reg = copy_addr_to_reg (addr);
8237       REG_POINTER (reg) = 1;
8238       return reg;
8239     }
8240   else
8241     {
8242       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8243       reg = copy_to_mode_reg (DImode, addr);
8244       REG_POINTER (reg) = 1;
8245       return gen_rtx_SUBREG (SImode, reg, 0);
8246     }
8247 }
8248 
8249 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
8250    operations when profitable.  The code depends upon architecture, block size
8251    and alignment, but always has one of the following overall structures:
8252 
8253    Aligned move sequence:
8254 
8255      1) Prologue guard: Conditional that jumps up to epilogues for small
8256 	blocks that can be handled by the epilogue alone.  This is faster
8257 	but also needed for correctness, since the prologue assumes the block
8258 	is larger than the desired alignment.
8259 
8260 	Optional dynamic check for size and libcall for large
8261 	blocks is emitted here too, with -minline-stringops-dynamically.
8262 
8263      2) Prologue: copy first few bytes in order to get destination
8264 	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
8265 	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8266 	copied.  We emit either a jump tree on power of two sized
8267 	blocks, or a byte loop.
8268 
8269      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8270 	with specified algorithm.
8271 
8272      4) Epilogue: code copying tail of the block that is too small to be
8273 	handled by main body (or up to size guarded by prologue guard).
8274 
8275   Misaligned move sequence
8276 
8277      1) misaligned move prologue/epilogue containing:
8278         a) Prologue handling small memory blocks and jumping to done_label
8279 	   (skipped if blocks are known to be large enough)
8280 	b) Single possibly misaligned move copying the first DESIRED_ALIGN-ALIGN
8281            bytes if extra alignment is needed
8282 	   (skipped if alignment is not needed)
8283         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8284 
8285      2) Zero size guard dispatching to done_label, if needed
8286 
8287      3) Dispatch to a library call, if needed.
8288 
8289      4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8290 	with specified algorithm.  */
8291 bool
8292 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8293 			   rtx align_exp, rtx expected_align_exp,
8294 			   rtx expected_size_exp, rtx min_size_exp,
8295 			   rtx max_size_exp, rtx probable_max_size_exp,
8296 			   bool issetmem)
8297 {
8298   rtx destreg;
8299   rtx srcreg = NULL;
8300   rtx_code_label *label = NULL;
8301   rtx tmp;
8302   rtx_code_label *jump_around_label = NULL;
8303   HOST_WIDE_INT align = 1;
8304   unsigned HOST_WIDE_INT count = 0;
8305   HOST_WIDE_INT expected_size = -1;
8306   int size_needed = 0, epilogue_size_needed;
8307   int desired_align = 0, align_bytes = 0;
8308   enum stringop_alg alg;
8309   rtx promoted_val = NULL;
8310   rtx vec_promoted_val = NULL;
8311   bool force_loopy_epilogue = false;
8312   int dynamic_check;
8313   bool need_zero_guard = false;
8314   bool noalign;
8315   machine_mode move_mode = VOIDmode;
8316   machine_mode wider_mode;
8317   int unroll_factor = 1;
8318   /* TODO: Once value ranges are available, fill in proper data.  */
8319   unsigned HOST_WIDE_INT min_size = 0;
8320   unsigned HOST_WIDE_INT max_size = -1;
8321   unsigned HOST_WIDE_INT probable_max_size = -1;
8322   bool misaligned_prologue_used = false;
8323   bool have_as;
8324 
8325   if (CONST_INT_P (align_exp))
8326     align = INTVAL (align_exp);
8327   /* i386 can do misaligned accesses at a reasonably increased cost.  */
8328   if (CONST_INT_P (expected_align_exp)
8329       && INTVAL (expected_align_exp) > align)
8330     align = INTVAL (expected_align_exp);
8331   /* ALIGN is the minimum of destination and source alignment, but we care here
8332      just about destination alignment.  */
8333   else if (!issetmem
8334 	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8335     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8336 
8337   if (CONST_INT_P (count_exp))
8338     {
8339       min_size = max_size = probable_max_size = count = expected_size
8340 	= INTVAL (count_exp);
8341       /* When COUNT is 0, there is nothing to do.  */
8342       if (!count)
8343 	return true;
8344     }
8345   else
8346     {
8347       if (min_size_exp)
8348 	min_size = INTVAL (min_size_exp);
8349       if (max_size_exp)
8350 	max_size = INTVAL (max_size_exp);
8351       if (probable_max_size_exp)
8352 	probable_max_size = INTVAL (probable_max_size_exp);
8353       if (CONST_INT_P (expected_size_exp))
8354 	expected_size = INTVAL (expected_size_exp);
8355      }
8356 
8357   /* Make sure we don't need to care about overflow later on.  */
8358   if (count > (HOST_WIDE_INT_1U << 30))
8359     return false;
8360 
8361   have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8362   if (!issetmem)
8363     have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8364 
8365   /* Step 0: Decide on preferred algorithm, desired alignment and
8366      size of chunks to be copied by main loop.  */
8367   alg = decide_alg (count, expected_size, min_size, probable_max_size,
8368 		    issetmem,
8369 		    issetmem && val_exp == const0_rtx, have_as,
8370 		    &dynamic_check, &noalign, false);
8371 
8372   if (dump_file)
8373     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8374 	     stringop_alg_names[alg]);
8375 
8376   if (alg == libcall)
8377     return false;
8378   gcc_assert (alg != no_stringop);
8379 
8380   /* For now the vector version of memset is generated only for memory zeroing,
8381      as creating the promoted vector value is very cheap in this case.  */
8382   if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8383     alg = unrolled_loop;
8384 
8385   if (!count)
8386     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8387   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8388   if (!issetmem)
8389     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8390 
8391   unroll_factor = 1;
8392   move_mode = word_mode;
8393   switch (alg)
8394     {
8395     case libcall:
8396     case no_stringop:
8397     case last_alg:
8398       gcc_unreachable ();
8399     case loop_1_byte:
8400       need_zero_guard = true;
8401       move_mode = QImode;
8402       break;
8403     case loop:
8404       need_zero_guard = true;
8405       break;
8406     case unrolled_loop:
8407       need_zero_guard = true;
8408       unroll_factor = (TARGET_64BIT ? 4 : 2);
8409       break;
8410     case vector_loop:
8411       need_zero_guard = true;
8412       unroll_factor = 4;
8413       /* Find the widest supported mode.  */
8414       move_mode = word_mode;
8415       while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8416 	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8417 	move_mode = wider_mode;
8418 
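      /* On targets where 256-bit operations are split into two 128-bit
	 halves internally, prefer copying in 128-bit (TImode) chunks.  */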
8419       if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8420 	move_mode = TImode;
8421 
8422       /* Find the corresponding vector mode with the same size as MOVE_MODE.
8423 	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
8424       if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8425 	{
8426 	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8427 	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8428 	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8429 	    move_mode = word_mode;
8430 	}
8431       gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8432       break;
8433     case rep_prefix_8_byte:
8434       move_mode = DImode;
8435       break;
8436     case rep_prefix_4_byte:
8437       move_mode = SImode;
8438       break;
8439     case rep_prefix_1_byte:
8440       move_mode = QImode;
8441       break;
8442     }
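  /* SIZE_NEEDED is the number of bytes copied per iteration of the main
     loop; the epilogue must be prepared to handle any smaller residue.  */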
8443   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8444   epilogue_size_needed = size_needed;
8445 
8446   /* If we are going to emit any library calls conditionally, make sure any
8447      pending stack adjustments happen before the first conditional branch;
8448      otherwise they will be emitted only before the library call and won't
8449      happen on the other branches.  */
8450   if (dynamic_check != -1)
8451     do_pending_stack_adjust ();
8452 
8453   desired_align = decide_alignment (align, alg, expected_size, move_mode);
8454   if (!TARGET_ALIGN_STRINGOPS || noalign)
8455     align = desired_align;
8456 
8457   /* Step 1: Prologue guard.  */
8458 
8459   /* Alignment code needs count to be in a register.  */
8460   if (CONST_INT_P (count_exp) && desired_align > align)
8461     {
8462       if (INTVAL (count_exp) > desired_align
8463 	  && INTVAL (count_exp) > size_needed)
8464 	{
8465 	  align_bytes
8466 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8467 	  if (align_bytes <= 0)
8468 	    align_bytes = 0;
8469 	  else
8470 	    align_bytes = desired_align - align_bytes;
8471 	}
8472       if (align_bytes == 0)
8473 	count_exp = force_reg (counter_mode (count_exp), count_exp);
8474     }
8475   gcc_assert (desired_align >= 1 && align >= 1);
8476 
8477   /* Misaligned move sequences handle both prologue and epilogue at once.
8478      Default code generation results in smaller code for large alignments
8479      and also avoids redundant work when sizes are known precisely.  */
8480   misaligned_prologue_used
8481     = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8482        && MAX (desired_align, epilogue_size_needed) <= 32
8483        && desired_align <= epilogue_size_needed
8484        && ((desired_align > align && !align_bytes)
8485 	   || (!count && epilogue_size_needed > 1)));
8486 
8487   /* Do the cheap promotion to allow better CSE across the
8488      main loop and epilogue (i.e. one load of the big constant in
8489      front of all the code).
8490      For now the misaligned move sequences do not have a fast path
8491      without broadcasting.  */
8492   if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8493     {
8494       if (alg == vector_loop)
8495 	{
8496 	  gcc_assert (val_exp == const0_rtx);
8497 	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8498 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
8499 							 GET_MODE_SIZE (word_mode),
8500 							 desired_align, align);
8501 	}
8502       else
8503 	{
8504 	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8505 							 desired_align, align);
8506 	}
8507     }
8508   /* Misaligned move sequences handle both prologues and epilogues at once.
8509      Default code generation results in smaller code for large alignments and
8510      also avoids redundant work when sizes are known precisely.  */
8511   if (misaligned_prologue_used)
8512     {
8513       /* The misaligned move prologue handles small blocks by itself.  */
8514       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8515 	   (dst, src, &destreg, &srcreg,
8516 	    move_mode, promoted_val, vec_promoted_val,
8517 	    &count_exp,
8518 	    &jump_around_label,
8519             desired_align < align
8520 	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8521 	    desired_align, align, &min_size, dynamic_check, issetmem);
8522       if (!issetmem)
8523         src = change_address (src, BLKmode, srcreg);
8524       dst = change_address (dst, BLKmode, destreg);
8525       set_mem_align (dst, desired_align * BITS_PER_UNIT);
8526       epilogue_size_needed = 0;
8527       if (need_zero_guard
8528 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
8529 	{
8530 	  /* It is possible that we copied enough so the main loop will not
8531 	     execute.  */
8532 	  gcc_assert (size_needed > 1);
8533 	  if (jump_around_label == NULL_RTX)
8534 	    jump_around_label = gen_label_rtx ();
8535 	  emit_cmp_and_jump_insns (count_exp,
8536 				   GEN_INT (size_needed),
8537 				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8538 	  if (expected_size == -1
8539 	      || expected_size < (desired_align - align) / 2 + size_needed)
8540 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
8541 	  else
8542 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
8543 	}
8544     }
8545   /* Ensure that alignment prologue won't copy past end of block.  */
8546   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8547     {
8548       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8549       /* The epilogue always copies COUNT_EXP % EPILOGUE_SIZE_NEEDED bytes.
8550 	 Make sure it is a power of 2.  */
8551       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
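      /* For instance, 24 is rounded up to 32 and 15 to 16 here; a value
	 that is already a power of two is doubled.  */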
8552 
8553       /* To improve performance for small blocks, we jump around the VAL
8554 	 promoting code.  This means that if the promoted VAL is not constant,
8555 	 we might not use it in the epilogue and have to use the byte
8556 	 loop variant.  */
8557       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8558 	force_loopy_epilogue = true;
8559       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8560 	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8561 	{
8562 	  /* If the main algorithm works on QImode, no epilogue is needed.
8563 	     For small sizes just don't align anything.  */
8564 	  if (size_needed == 1)
8565 	    desired_align = align;
8566 	  else
8567 	    goto epilogue;
8568 	}
8569       else if (!count
8570 	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8571 	{
8572 	  label = gen_label_rtx ();
8573 	  emit_cmp_and_jump_insns (count_exp,
8574 				   GEN_INT (epilogue_size_needed),
8575 				   LTU, 0, counter_mode (count_exp), 1, label);
8576 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
8577 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
8578 	  else
8579 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
8580 	}
8581     }
8582 
8583   /* Emit code to decide at runtime whether a library call or inline code
8584      should be used.  */
8585   if (dynamic_check != -1)
8586     {
8587       if (!issetmem && CONST_INT_P (count_exp))
8588 	{
8589 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8590 	    {
8591 	      emit_block_copy_via_libcall (dst, src, count_exp);
8592 	      count_exp = const0_rtx;
8593 	      goto epilogue;
8594 	    }
8595 	}
8596       else
8597 	{
8598 	  rtx_code_label *hot_label = gen_label_rtx ();
8599 	  if (jump_around_label == NULL_RTX)
8600 	    jump_around_label = gen_label_rtx ();
8601 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8602 				   LEU, 0, counter_mode (count_exp),
8603 				   1, hot_label);
8604 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
8605 	  if (issetmem)
8606 	    set_storage_via_libcall (dst, count_exp, val_exp);
8607 	  else
8608 	    emit_block_copy_via_libcall (dst, src, count_exp);
8609 	  emit_jump (jump_around_label);
8610 	  emit_label (hot_label);
8611 	}
8612     }
8613 
8614   /* Step 2: Alignment prologue.  */
8615   /* Do the expensive promotion once we branched off the small blocks.  */
8616   if (issetmem && !promoted_val)
8617     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8618 						   desired_align, align);
8619 
8620   if (desired_align > align && !misaligned_prologue_used)
8621     {
8622       if (align_bytes == 0)
8623 	{
8624 	  /* Except for the first move in the prologue, we no longer know
8625 	     the constant offset in aliasing info.  It doesn't seem worth
8626 	     the pain to maintain it for the first move, so throw away
8627 	     the info early.  */
8628 	  dst = change_address (dst, BLKmode, destreg);
8629 	  if (!issetmem)
8630 	    src = change_address (src, BLKmode, srcreg);
8631 	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8632 					    promoted_val, vec_promoted_val,
8633 					    count_exp, align, desired_align,
8634 					    issetmem);
8635 	  /* At most desired_align - align bytes are copied.  */
8636 	  if (min_size < (unsigned)(desired_align - align))
8637 	    min_size = 0;
8638 	  else
8639 	    min_size -= desired_align - align;
8640 	}
8641       else
8642 	{
8643 	  /* If we know how many bytes need to be stored before dst is
8644 	     sufficiently aligned, maintain aliasing info accurately.  */
8645 	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8646 							   srcreg,
8647 							   promoted_val,
8648 							   vec_promoted_val,
8649 							   desired_align,
8650 							   align_bytes,
8651 							   issetmem);
8652 
8653 	  count_exp = plus_constant (counter_mode (count_exp),
8654 				     count_exp, -align_bytes);
8655 	  count -= align_bytes;
8656 	  min_size -= align_bytes;
8657 	  max_size -= align_bytes;
8658 	}
8659       if (need_zero_guard
8660 	  && min_size < (unsigned HOST_WIDE_INT) size_needed
8661 	  && (count < (unsigned HOST_WIDE_INT) size_needed
8662 	      || (align_bytes == 0
8663 		  && count < ((unsigned HOST_WIDE_INT) size_needed
8664 			      + desired_align - align))))
8665 	{
8666 	  /* It is possible that we copied enough so the main loop will not
8667 	     execute.  */
8668 	  gcc_assert (size_needed > 1);
8669 	  if (label == NULL_RTX)
8670 	    label = gen_label_rtx ();
8671 	  emit_cmp_and_jump_insns (count_exp,
8672 				   GEN_INT (size_needed),
8673 				   LTU, 0, counter_mode (count_exp), 1, label);
8674 	  if (expected_size == -1
8675 	      || expected_size < (desired_align - align) / 2 + size_needed)
8676 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
8677 	  else
8678 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
8679 	}
8680     }
8681   if (label && size_needed == 1)
8682     {
8683       emit_label (label);
8684       LABEL_NUSES (label) = 1;
8685       label = NULL;
8686       epilogue_size_needed = 1;
8687       if (issetmem)
8688 	promoted_val = val_exp;
8689     }
8690   else if (label == NULL_RTX && !misaligned_prologue_used)
8691     epilogue_size_needed = size_needed;
8692 
8693   /* Step 3: Main loop.  */
8694 
8695   switch (alg)
8696     {
8697     case libcall:
8698     case no_stringop:
8699     case last_alg:
8700       gcc_unreachable ();
8701     case loop_1_byte:
8702     case loop:
8703     case unrolled_loop:
8704       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8705 				     count_exp, move_mode, unroll_factor,
8706 				     expected_size, issetmem);
8707       break;
8708     case vector_loop:
8709       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8710 				     vec_promoted_val, count_exp, move_mode,
8711 				     unroll_factor, expected_size, issetmem);
8712       break;
8713     case rep_prefix_8_byte:
8714     case rep_prefix_4_byte:
8715     case rep_prefix_1_byte:
8716       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
8717 				       val_exp, count_exp, move_mode, issetmem);
8718       break;
8719     }
8720   /* Properly adjust the offsets of the src and dest memory for aliasing.  */
8721   if (CONST_INT_P (count_exp))
8722     {
8723       if (!issetmem)
8724 	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8725 					    (count / size_needed) * size_needed);
8726       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8727 					  (count / size_needed) * size_needed);
8728     }
8729   else
8730     {
8731       if (!issetmem)
8732 	src = change_address (src, BLKmode, srcreg);
8733       dst = change_address (dst, BLKmode, destreg);
8734     }
8735 
8736   /* Step 4: Epilogue to copy the remaining bytes.  */
8737  epilogue:
8738   if (label)
8739     {
8740       /* When the main loop is done, COUNT_EXP might hold the original count,
8741 	 while we want to copy only COUNT_EXP % SIZE_NEEDED bytes.
8742 	 Epilogue code will actually copy COUNT_EXP % EPILOGUE_SIZE_NEEDED
8743 	 bytes.  Compensate if needed.  */
8744 
8745       if (size_needed < epilogue_size_needed)
8746 	{
8747 	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8748 				     GEN_INT (size_needed - 1), count_exp, 1,
8749 				     OPTAB_DIRECT);
8750 	  if (tmp != count_exp)
8751 	    emit_move_insn (count_exp, tmp);
8752 	}
8753       emit_label (label);
8754       LABEL_NUSES (label) = 1;
8755     }
8756 
8757   if (count_exp != const0_rtx && epilogue_size_needed > 1)
8758     {
8759       if (force_loopy_epilogue)
8760 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
8761 					 epilogue_size_needed);
8762       else
8763 	{
8764 	  if (issetmem)
8765 	    expand_setmem_epilogue (dst, destreg, promoted_val,
8766 				    vec_promoted_val, count_exp,
8767 				    epilogue_size_needed);
8768 	  else
8769 	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
8770 				    epilogue_size_needed);
8771 	}
8772     }
8773   if (jump_around_label)
8774     emit_label (jump_around_label);
8775   return true;
8776 }
8777 
8778 /* Expand cmpstrn or memcmp.  */
8779 
8780 bool
8781 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
8782 			       rtx length, rtx align, bool is_cmpstrn)
8783 {
8784   /* Expand strncmp and memcmp only with -minline-all-stringops since
8785      "repz cmpsb" can be much slower than strncmp and memcmp functions
8786      implemented with vector instructions, see
8787 
8788      https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
8789    */
8790   if (!TARGET_INLINE_ALL_STRINGOPS)
8791     return false;
8792 
8793   /* Can't use this if the user has appropriated ecx, esi or edi.  */
8794   if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
8795     return false;
8796 
8797   if (is_cmpstrn)
8798     {
8799       /* For strncmp, length is the maximum length, which can be larger
8800 	 than actual string lengths.  We can expand the cmpstrn pattern
8801 	 to "repz cmpsb" only if one of the strings is a constant so
8802 	 that expand_builtin_strncmp() can write the length argument to
8803 	 be the minimum of the const string length and the actual length
8804 	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
8805       tree t1 = MEM_EXPR (src1);
8806       tree t2 = MEM_EXPR (src2);
8807       if (!((t1 && TREE_CODE (t1) == MEM_REF
8808 	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
8809 	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
8810 		 == STRING_CST))
8811 	    || (t2 && TREE_CODE (t2) == MEM_REF
8812 		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
8813 		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
8814 		    == STRING_CST))))
8815 	return false;
8816     }
8817 
8818   rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
8819   rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
8820   if (addr1 != XEXP (src1, 0))
8821     src1 = replace_equiv_address_nv (src1, addr1);
8822   if (addr2 != XEXP (src2, 0))
8823     src2 = replace_equiv_address_nv (src2, addr2);
8824 
8825   /* NB: Make a copy of the data length so that the cmpstrnqi patterns
8826      don't change the original data length.  */
8827   length = ix86_zero_extend_to_Pmode (length);
8828   rtx lengthreg = gen_reg_rtx (Pmode);
8829   emit_move_insn (lengthreg, length);
8830 
8831   /* If we are testing strict equality, we can use known alignment to
8832      good advantage.  This may be possible with combine, particularly
8833      once cc0 is dead.  */
8834   if (CONST_INT_P (length))
8835     {
8836       if (length == const0_rtx)
8837 	{
8838 	  emit_move_insn (result, const0_rtx);
8839 	  return true;
8840 	}
8841       emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
8842 				     src1, src2));
8843     }
8844   else
8845     {
8846       emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
8847       emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
8848 				  src1, src2));
8849     }
8850 
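  /* Convert the flags left by the repz cmpsb pattern into the usual
     negative/zero/positive value in the low byte and sign-extend it
     into RESULT.  */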
8851   rtx out = gen_lowpart (QImode, result);
8852   emit_insn (gen_cmpintqi (out));
8853   emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
8854 
8855   return true;
8856 }
8857 
8858 /* Expand the appropriate insns for doing strlen if not just doing
8859    repnz; scasb
8860 
8861    out = result, initialized with the start address
8862    align_rtx = alignment of the address.
8863    scratch = scratch register, initialized with the start address when
8864 	not aligned, otherwise undefined
8865 
8866    This is just the body. It needs the initializations mentioned above and
8867    some address computing at the end.  These things are done in i386.md.  */
8868 
8869 static void
8870 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
8871 {
8872   int align;
8873   rtx tmp;
8874   rtx_code_label *align_2_label = NULL;
8875   rtx_code_label *align_3_label = NULL;
8876   rtx_code_label *align_4_label = gen_label_rtx ();
8877   rtx_code_label *end_0_label = gen_label_rtx ();
8878   rtx mem;
8879   rtx tmpreg = gen_reg_rtx (SImode);
8880   rtx scratch = gen_reg_rtx (SImode);
8881   rtx cmp;
8882 
8883   align = 0;
8884   if (CONST_INT_P (align_rtx))
8885     align = INTVAL (align_rtx);
8886 
8887   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
8888 
8889   /* Is there a known alignment and is it less than 4?  */
8890   if (align < 4)
8891     {
8892       rtx scratch1 = gen_reg_rtx (Pmode);
8893       emit_move_insn (scratch1, out);
8894       /* Is there a known alignment and is it not 2? */
8895       if (align != 2)
8896 	{
8897 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
8898 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
8899 
8900 	  /* Leave just the 3 lower bits.  */
8901 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
8902 				    NULL_RTX, 0, OPTAB_WIDEN);
8903 
8904 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8905 				   Pmode, 1, align_4_label);
8906 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
8907 				   Pmode, 1, align_2_label);
8908 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
8909 				   Pmode, 1, align_3_label);
8910 	}
8911       else
8912         {
8913 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
8914 	     check whether it is 4-byte aligned.  */
8915 
8916 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
8917 				    NULL_RTX, 0, OPTAB_WIDEN);
8918 
8919 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8920 				   Pmode, 1, align_4_label);
8921         }
8922 
8923       mem = change_address (src, QImode, out);
8924 
8925       /* Now compare the bytes.  */
8926 
8927       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
8928       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
8929 			       QImode, 1, end_0_label);
8930 
8931       /* Increment the address.  */
8932       emit_insn (gen_add2_insn (out, const1_rtx));
8933 
8934       /* Not needed with an alignment of 2 */
8935       if (align != 2)
8936 	{
8937 	  emit_label (align_2_label);
8938 
8939 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
8940 				   end_0_label);
8941 
8942 	  emit_insn (gen_add2_insn (out, const1_rtx));
8943 
8944 	  emit_label (align_3_label);
8945 	}
8946 
8947       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
8948 			       end_0_label);
8949 
8950       emit_insn (gen_add2_insn (out, const1_rtx));
8951     }
8952 
8953   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
8954      align this loop: it only makes the program larger and does not help
8955      to speed it up.  */
8956   emit_label (align_4_label);
8957 
8958   mem = change_address (src, SImode, out);
8959   emit_move_insn (scratch, mem);
8960   emit_insn (gen_add2_insn (out, GEN_INT (4)));
8961 
8962   /* This formula yields a nonzero result iff one of the bytes is zero.
8963      This saves three branches inside the loop and many cycles.  */
8964 
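  /* For example, with SCRATCH = 0x12003456 (which contains a zero byte)
     the sequence below computes
	(0x12003456 - 0x01010101) & ~0x12003456 & 0x80808080
	  = 0x10ff3355 & 0xedffcba9 & 0x80808080 = 0x00800000 != 0,
     while 0x11223344, which has no zero byte, yields 0.  */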
8965   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
8966   emit_insn (gen_one_cmplsi2 (scratch, scratch));
8967   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
8968   emit_insn (gen_andsi3 (tmpreg, tmpreg,
8969 			 gen_int_mode (0x80808080, SImode)));
8970   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
8971 			   align_4_label);
8972 
8973   if (TARGET_CMOVE)
8974     {
8975        rtx reg = gen_reg_rtx (SImode);
8976        rtx reg2 = gen_reg_rtx (Pmode);
8977        emit_move_insn (reg, tmpreg);
8978        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
8979 
8980        /* If zero is not in the first two bytes, move two bytes forward.  */
8981        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
8982        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
8983        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
8984        emit_insn (gen_rtx_SET (tmpreg,
8985 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
8986 						     reg,
8987 						     tmpreg)));
8988        /* Emit the lea manually to avoid clobbering the flags.  */
8989        emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
8990 
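       /* Likewise use a conditional move to advance OUT by those two bytes
	  only when the zero byte was not in the first two.  */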
8991        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
8992        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
8993        emit_insn (gen_rtx_SET (out,
8994 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
8995 						     reg2,
8996 						     out)));
8997     }
8998   else
8999     {
9000        rtx_code_label *end_2_label = gen_label_rtx ();
9001        /* Is zero in the first two bytes? */
9002 
9003        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9004        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9005        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9006        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9007                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9008                             pc_rtx);
9009        tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9010        JUMP_LABEL (tmp) = end_2_label;
9011 
9012        /* Not in the first two.  Move two bytes forward.  */
9013        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9014        emit_insn (gen_add2_insn (out, const2_rtx));
9015 
9016        emit_label (end_2_label);
9017 
9018     }
9019 
9020   /* Avoid branch in fixing the byte.  */
9021   tmpreg = gen_lowpart (QImode, tmpreg);
9022   emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9023   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9024   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9025   emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9026 
9027   emit_label (end_0_label);
9028 }
9029 
9030 /* Expand strlen.  */
9031 
9032 bool
9033 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9034 {
9035   if (TARGET_UNROLL_STRLEN
9036       && TARGET_INLINE_ALL_STRINGOPS
9037       && eoschar == const0_rtx
9038       && optimize > 1)
9039     {
9040       /* The generic case of the strlen expander is long.  Avoid
9041 	 expanding it unless TARGET_INLINE_ALL_STRINGOPS.  */
9042       rtx addr = force_reg (Pmode, XEXP (src, 0));
9043       /* Well it seems that some optimizer does not combine a call like
9044 	 foo(strlen(bar), strlen(bar));
9045 	 when the move and the subtraction are done here.  It does calculate
9046 	 the length just once when these instructions are done inside of
9047 	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
9048 	 often used and I use one fewer register for the lifetime of
9049 	 output_strlen_unroll() this is better.  */
9050 
9051       emit_move_insn (out, addr);
9052 
9053       ix86_expand_strlensi_unroll_1 (out, src, align);
9054 
9055       /* strlensi_unroll_1 returns the address of the zero at the end of
9056 	 the string, like memchr(), so compute the length by subtracting
9057 	 the start address.  */
9058       emit_insn (gen_sub2_insn (out, addr));
9059       return true;
9060     }
9061   else
9062     return false;
9063 }
9064 
9065 /* For a given symbol (function), construct code to compute the address of
9066    its PLT entry in the large x86-64 PIC model.  */
9067 
9068 static rtx
9069 construct_plt_address (rtx symbol)
9070 {
9071   rtx tmp, unspec;
9072 
9073   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9074   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9075   gcc_assert (Pmode == DImode);
9076 
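  /* Load SYMBOL@PLTOFF into a register and add the PIC register to it,
     yielding the address of the PLT entry.  */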
9077   tmp = gen_reg_rtx (Pmode);
9078   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9079 
9080   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9081   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9082   return tmp;
9083 }
9084 
9085 /* Additional registers that are clobbered by SYSV calls.  */
9086 
9087 static int const x86_64_ms_sysv_extra_clobbered_registers
9088 		 [NUM_X86_64_MS_CLOBBERED_REGS] =
9089 {
9090   SI_REG, DI_REG,
9091   XMM6_REG, XMM7_REG,
9092   XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9093   XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9094 };
9095 
9096 rtx_insn *
9097 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9098 		  rtx callarg2,
9099 		  rtx pop, bool sibcall)
9100 {
9101   rtx vec[3];
9102   rtx use = NULL, call;
9103   unsigned int vec_len = 0;
9104   tree fndecl;
9105 
9106   if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9107     {
9108       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9109       if (fndecl
9110 	  && (lookup_attribute ("interrupt",
9111 				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9112 	error ("interrupt service routine cannot be called directly");
9113     }
9114   else
9115     fndecl = NULL_TREE;
9116 
9117   if (pop == const0_rtx)
9118     pop = NULL;
9119   gcc_assert (!TARGET_64BIT || !pop);
9120 
9121   rtx addr = XEXP (fnaddr, 0);
9122   if (TARGET_MACHO && !TARGET_64BIT)
9123     {
9124 #if TARGET_MACHO
9125       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9126 	fnaddr = machopic_indirect_call_target (fnaddr);
9127 #endif
9128     }
9129   else
9130     {
9131       /* Static functions and indirect calls don't need the pic register.  Also,
9132 	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9133 	 it an indirect call.  */
9134       if (flag_pic
9135 	  && GET_CODE (addr) == SYMBOL_REF
9136 	  && ix86_call_use_plt_p (addr))
9137 	{
9138 	  if (flag_plt
9139 	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
9140 		  || !lookup_attribute ("noplt",
9141 					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9142 	    {
9143 	      if (!TARGET_64BIT
9144 		  || (ix86_cmodel == CM_LARGE_PIC
9145 		      && DEFAULT_ABI != MS_ABI))
9146 		{
9147 		  use_reg (&use, gen_rtx_REG (Pmode,
9148 					      REAL_PIC_OFFSET_TABLE_REGNUM));
9149 		  if (ix86_use_pseudo_pic_reg ())
9150 		    emit_move_insn (gen_rtx_REG (Pmode,
9151 						 REAL_PIC_OFFSET_TABLE_REGNUM),
9152 				    pic_offset_table_rtx);
9153 		}
9154 	    }
9155 	  else if (!TARGET_PECOFF && !TARGET_MACHO)
9156 	    {
9157 	      if (TARGET_64BIT
9158 		  && ix86_cmodel == CM_LARGE_PIC
9159 		  && DEFAULT_ABI != MS_ABI)
9160 		{
9161 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9162 					   UNSPEC_GOT);
9163 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9164 		  fnaddr = force_reg (Pmode, fnaddr);
9165 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9166 		}
9167 	      else if (TARGET_64BIT)
9168 		{
9169 		  fnaddr = gen_rtx_UNSPEC (Pmode,
9170 					   gen_rtvec (1, addr),
9171 					   UNSPEC_GOTPCREL);
9172 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9173 		}
9174 	      else
9175 		{
9176 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9177 					   UNSPEC_GOT);
9178 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9179 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9180 					 fnaddr);
9181 		}
9182 	      fnaddr = gen_const_mem (Pmode, fnaddr);
9183 	      /* Pmode may not be the same as word_mode for x32, which
9184 		 doesn't support indirect branch via 32-bit memory slot.
9185 		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9186 		 indirect branch via x32 GOT slot is OK.  */
9187 	      if (GET_MODE (fnaddr) != word_mode)
9188 		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9189 	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
9190 	    }
9191 	}
9192     }
9193 
9194   /* Skip setting up RAX register for -mskip-rax-setup when there are no
9195      parameters passed in vector registers.  */
9196   if (TARGET_64BIT
9197       && (INTVAL (callarg2) > 0
9198 	  || (INTVAL (callarg2) == 0
9199 	      && (TARGET_SSE || !flag_skip_rax_setup))))
9200     {
9201       rtx al = gen_rtx_REG (QImode, AX_REG);
9202       emit_move_insn (al, callarg2);
9203       use_reg (&use, al);
9204     }
9205 
9206   if (ix86_cmodel == CM_LARGE_PIC
9207       && !TARGET_PECOFF
9208       && MEM_P (fnaddr)
9209       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9210       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9211     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9212   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9213      branch via x32 GOT slot is OK.  */
9214   else if (!(TARGET_X32
9215 	     && MEM_P (fnaddr)
9216 	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9217 	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9218 	   && (sibcall
9219 	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9220 	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9221     {
9222       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9223       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9224     }
9225 
9226   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9227 
9228   if (retval)
9229     call = gen_rtx_SET (retval, call);
9230   vec[vec_len++] = call;
9231 
9232   if (pop)
9233     {
9234       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9235       pop = gen_rtx_SET (stack_pointer_rtx, pop);
9236       vec[vec_len++] = pop;
9237     }
9238 
9239   if (cfun->machine->no_caller_saved_registers
9240       && (!fndecl
9241 	  || (!TREE_THIS_VOLATILE (fndecl)
9242 	      && !lookup_attribute ("no_caller_saved_registers",
9243 				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9244     {
9245       static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9246       bool is_64bit_ms_abi = (TARGET_64BIT
9247 			      && ix86_function_abi (fndecl) == MS_ABI);
9248       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9249 
9250       /* If there are no caller-saved registers, add all registers
9251 	 that are clobbered by the call which returns.  */
9252       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9253 	if (!fixed_regs[i]
9254 	    && (ix86_call_used_regs[i] == 1
9255 		|| (ix86_call_used_regs[i] & c_mask))
9256 	    && !STACK_REGNO_P (i)
9257 	    && !MMX_REGNO_P (i))
9258 	  clobber_reg (&use,
9259 		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9260     }
9261   else if (TARGET_64BIT_MS_ABI
9262 	   && (!callarg2 || INTVAL (callarg2) != -2))
9263     {
9264       unsigned i;
9265 
9266       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9267 	{
9268 	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9269 	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9270 
9271 	  clobber_reg (&use, gen_rtx_REG (mode, regno));
9272 	}
9273 
9274       /* Set here, but it may get cleared later.  */
9275       if (TARGET_CALL_MS2SYSV_XLOGUES)
9276 	{
9277 	  if (!TARGET_SSE)
9278 	    ;
9279 
9280 	  /* Don't break hot-patched functions.  */
9281 	  else if (ix86_function_ms_hook_prologue (current_function_decl))
9282 	    ;
9283 
9284 	  /* TODO: Cases not yet examined.  */
9285 	  else if (flag_split_stack)
9286 	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9287 
9288 	  else
9289 	    {
9290 	      gcc_assert (!reload_completed);
9291 	      cfun->machine->call_ms2sysv = true;
9292 	    }
9293 	}
9294     }
9295 
9296   if (TARGET_MACHO && TARGET_64BIT && !sibcall
9297       && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9298 	  || !fndecl || TREE_PUBLIC (fndecl)))
9299     {
9300       /* We allow public functions defined in a TU to bind locally for PIC
9301 	 code (the default) on 64bit Mach-O.
9302 	 If such functions are not inlined, we cannot tell at compile-time if
9303 	 they will be called via the lazy symbol resolver (this can depend on
9304 	 options given at link-time).  Therefore, we must assume that the lazy
9305 	 resolver could be used which clobbers R11 and R10.  */
9306       clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9307       clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9308     }
9309 
9310   if (vec_len > 1)
9311     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9312   rtx_insn *call_insn = emit_call_insn (call);
9313   if (use)
9314     CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9315 
9316   return call_insn;
9317 }
9318 
9319 /* Split a simple return popping POPC bytes from the stack into an
9320    indirect branch with stack adjustment.  */
9321 
9322 void
9323 ix86_split_simple_return_pop_internal (rtx popc)
9324 {
9325   struct machine_function *m = cfun->machine;
9326   rtx ecx = gen_rtx_REG (SImode, CX_REG);
9327   rtx_insn *insn;
9328 
9329   /* There is no "pascal" calling convention in any 64bit ABI.  */
9330   gcc_assert (!TARGET_64BIT);
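  /* The sequence emitted below is effectively
	pop	%ecx
	add	$POPC, %esp
	jmp	*%ecx
     with unwind notes keeping the CFA information consistent.  */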
9331 
9332   insn = emit_insn (gen_pop (ecx));
9333   m->fs.cfa_offset -= UNITS_PER_WORD;
9334   m->fs.sp_offset -= UNITS_PER_WORD;
9335 
9336   rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9337   x = gen_rtx_SET (stack_pointer_rtx, x);
9338   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9339   add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9340   RTX_FRAME_RELATED_P (insn) = 1;
9341 
9342   x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9343   x = gen_rtx_SET (stack_pointer_rtx, x);
9344   insn = emit_insn (x);
9345   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9346   RTX_FRAME_RELATED_P (insn) = 1;
9347 
9348   /* Now the return address is in ECX.  */
9349   emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9350 }
9351 
9352 /* Errors in the source file can cause expand_expr to return const0_rtx
9353    where we expect a vector.  To avoid crashing, use one of the vector
9354    clear instructions.  */
9355 
9356 static rtx
9357 safe_vector_operand (rtx x, machine_mode mode)
9358 {
9359   if (x == const0_rtx)
9360     x = CONST0_RTX (mode);
9361   return x;
9362 }
9363 
9364 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
9365 
9366 static rtx
9367 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9368 {
9369   rtx pat;
9370   tree arg0 = CALL_EXPR_ARG (exp, 0);
9371   tree arg1 = CALL_EXPR_ARG (exp, 1);
9372   rtx op0 = expand_normal (arg0);
9373   rtx op1 = expand_normal (arg1);
9374   machine_mode tmode = insn_data[icode].operand[0].mode;
9375   machine_mode mode0 = insn_data[icode].operand[1].mode;
9376   machine_mode mode1 = insn_data[icode].operand[2].mode;
9377 
9378   if (VECTOR_MODE_P (mode0))
9379     op0 = safe_vector_operand (op0, mode0);
9380   if (VECTOR_MODE_P (mode1))
9381     op1 = safe_vector_operand (op1, mode1);
9382 
9383   if (optimize || !target
9384       || GET_MODE (target) != tmode
9385       || !insn_data[icode].operand[0].predicate (target, tmode))
9386     target = gen_reg_rtx (tmode);
9387 
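  /* If the pattern expects a TImode operand but the argument was expanded
     in SImode, load it into the low element of a V4SI register and use
     that vector reinterpreted as TImode.  */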
9388   if (GET_MODE (op1) == SImode && mode1 == TImode)
9389     {
9390       rtx x = gen_reg_rtx (V4SImode);
9391       emit_insn (gen_sse2_loadd (x, op1));
9392       op1 = gen_lowpart (TImode, x);
9393     }
9394 
9395   if (!insn_data[icode].operand[1].predicate (op0, mode0))
9396     op0 = copy_to_mode_reg (mode0, op0);
9397   if (!insn_data[icode].operand[2].predicate (op1, mode1))
9398     op1 = copy_to_mode_reg (mode1, op1);
9399 
9400   pat = GEN_FCN (icode) (target, op0, op1);
9401   if (! pat)
9402     return 0;
9403 
9404   emit_insn (pat);
9405 
9406   return target;
9407 }
9408 
9409 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
9410 
9411 static rtx
9412 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9413 			       enum ix86_builtin_func_type m_type,
9414 			       enum rtx_code sub_code)
9415 {
9416   rtx pat;
9417   unsigned int i, nargs;
9418   bool comparison_p = false;
9419   bool tf_p = false;
9420   bool last_arg_constant = false;
9421   int num_memory = 0;
9422   rtx xops[4];
9423 
9424   machine_mode tmode = insn_data[icode].operand[0].mode;
9425 
9426   switch (m_type)
9427     {
9428     case MULTI_ARG_4_DF2_DI_I:
9429     case MULTI_ARG_4_DF2_DI_I1:
9430     case MULTI_ARG_4_SF2_SI_I:
9431     case MULTI_ARG_4_SF2_SI_I1:
9432       nargs = 4;
9433       last_arg_constant = true;
9434       break;
9435 
9436     case MULTI_ARG_3_SF:
9437     case MULTI_ARG_3_DF:
9438     case MULTI_ARG_3_SF2:
9439     case MULTI_ARG_3_DF2:
9440     case MULTI_ARG_3_DI:
9441     case MULTI_ARG_3_SI:
9442     case MULTI_ARG_3_SI_DI:
9443     case MULTI_ARG_3_HI:
9444     case MULTI_ARG_3_HI_SI:
9445     case MULTI_ARG_3_QI:
9446     case MULTI_ARG_3_DI2:
9447     case MULTI_ARG_3_SI2:
9448     case MULTI_ARG_3_HI2:
9449     case MULTI_ARG_3_QI2:
9450       nargs = 3;
9451       break;
9452 
9453     case MULTI_ARG_2_SF:
9454     case MULTI_ARG_2_DF:
9455     case MULTI_ARG_2_DI:
9456     case MULTI_ARG_2_SI:
9457     case MULTI_ARG_2_HI:
9458     case MULTI_ARG_2_QI:
9459       nargs = 2;
9460       break;
9461 
9462     case MULTI_ARG_2_DI_IMM:
9463     case MULTI_ARG_2_SI_IMM:
9464     case MULTI_ARG_2_HI_IMM:
9465     case MULTI_ARG_2_QI_IMM:
9466       nargs = 2;
9467       last_arg_constant = true;
9468       break;
9469 
9470     case MULTI_ARG_1_SF:
9471     case MULTI_ARG_1_DF:
9472     case MULTI_ARG_1_SF2:
9473     case MULTI_ARG_1_DF2:
9474     case MULTI_ARG_1_DI:
9475     case MULTI_ARG_1_SI:
9476     case MULTI_ARG_1_HI:
9477     case MULTI_ARG_1_QI:
9478     case MULTI_ARG_1_SI_DI:
9479     case MULTI_ARG_1_HI_DI:
9480     case MULTI_ARG_1_HI_SI:
9481     case MULTI_ARG_1_QI_DI:
9482     case MULTI_ARG_1_QI_SI:
9483     case MULTI_ARG_1_QI_HI:
9484       nargs = 1;
9485       break;
9486 
9487     case MULTI_ARG_2_DI_CMP:
9488     case MULTI_ARG_2_SI_CMP:
9489     case MULTI_ARG_2_HI_CMP:
9490     case MULTI_ARG_2_QI_CMP:
9491       nargs = 2;
9492       comparison_p = true;
9493       break;
9494 
9495     case MULTI_ARG_2_SF_TF:
9496     case MULTI_ARG_2_DF_TF:
9497     case MULTI_ARG_2_DI_TF:
9498     case MULTI_ARG_2_SI_TF:
9499     case MULTI_ARG_2_HI_TF:
9500     case MULTI_ARG_2_QI_TF:
9501       nargs = 2;
9502       tf_p = true;
9503       break;
9504 
9505     default:
9506       gcc_unreachable ();
9507     }
9508 
9509   if (optimize || !target
9510       || GET_MODE (target) != tmode
9511       || !insn_data[icode].operand[0].predicate (target, tmode))
9512     target = gen_reg_rtx (tmode);
9513   else if (memory_operand (target, tmode))
9514     num_memory++;
9515 
9516   gcc_assert (nargs <= ARRAY_SIZE (xops));
9517 
9518   for (i = 0; i < nargs; i++)
9519     {
9520       tree arg = CALL_EXPR_ARG (exp, i);
9521       rtx op = expand_normal (arg);
9522       int adjust = (comparison_p) ? 1 : 0;
9523       machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9524 
9525       if (last_arg_constant && i == nargs - 1)
9526 	{
9527 	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9528 	    {
9529 	      enum insn_code new_icode = icode;
9530 	      switch (icode)
9531 		{
9532 		case CODE_FOR_xop_vpermil2v2df3:
9533 		case CODE_FOR_xop_vpermil2v4sf3:
9534 		case CODE_FOR_xop_vpermil2v4df3:
9535 		case CODE_FOR_xop_vpermil2v8sf3:
9536 		  error ("the last argument must be a 2-bit immediate");
9537 		  return gen_reg_rtx (tmode);
9538 		case CODE_FOR_xop_rotlv2di3:
9539 		  new_icode = CODE_FOR_rotlv2di3;
9540 		  goto xop_rotl;
9541 		case CODE_FOR_xop_rotlv4si3:
9542 		  new_icode = CODE_FOR_rotlv4si3;
9543 		  goto xop_rotl;
9544 		case CODE_FOR_xop_rotlv8hi3:
9545 		  new_icode = CODE_FOR_rotlv8hi3;
9546 		  goto xop_rotl;
9547 		case CODE_FOR_xop_rotlv16qi3:
9548 		  new_icode = CODE_FOR_rotlv16qi3;
9549 		xop_rotl:
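		  /* A constant rotate count is reduced modulo the element
		     width by masking; a variable count is handed to the
		     equivalent generic rotate pattern instead.  */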
9550 		  if (CONST_INT_P (op))
9551 		    {
9552 		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9553 		      op = GEN_INT (INTVAL (op) & mask);
9554 		      gcc_checking_assert
9555 			(insn_data[icode].operand[i + 1].predicate (op, mode));
9556 		    }
9557 		  else
9558 		    {
9559 		      gcc_checking_assert
9560 			(nargs == 2
9561 			 && insn_data[new_icode].operand[0].mode == tmode
9562 			 && insn_data[new_icode].operand[1].mode == tmode
9563 			 && insn_data[new_icode].operand[2].mode == mode
9564 			 && insn_data[new_icode].operand[0].predicate
9565 			    == insn_data[icode].operand[0].predicate
9566 			 && insn_data[new_icode].operand[1].predicate
9567 			    == insn_data[icode].operand[1].predicate);
9568 		      icode = new_icode;
9569 		      goto non_constant;
9570 		    }
9571 		  break;
9572 		default:
9573 		  gcc_unreachable ();
9574 		}
9575 	    }
9576 	}
9577       else
9578 	{
9579 	non_constant:
9580 	  if (VECTOR_MODE_P (mode))
9581 	    op = safe_vector_operand (op, mode);
9582 
9583 	  /* If we aren't optimizing, only allow one memory operand to be
9584 	     generated.  */
9585 	  if (memory_operand (op, mode))
9586 	    num_memory++;
9587 
9588 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9589 
9590 	  if (optimize
9591 	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9592 	      || num_memory > 1)
9593 	    op = force_reg (mode, op);
9594 	}
9595 
9596       xops[i] = op;
9597     }
9598 
9599   switch (nargs)
9600     {
9601     case 1:
9602       pat = GEN_FCN (icode) (target, xops[0]);
9603       break;
9604 
9605     case 2:
9606       if (tf_p)
9607 	pat = GEN_FCN (icode) (target, xops[0], xops[1],
9608 			       GEN_INT ((int)sub_code));
9609       else if (! comparison_p)
9610 	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9611       else
9612 	{
9613 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
9614 				       xops[0], xops[1]);
9615 
9616 	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9617 	}
9618       break;
9619 
9620     case 3:
9621       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9622       break;
9623 
9624     case 4:
9625       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9626       break;
9627 
9628     default:
9629       gcc_unreachable ();
9630     }
9631 
9632   if (! pat)
9633     return 0;
9634 
9635   emit_insn (pat);
9636   return target;
9637 }
9638 
9639 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9640    insns with vec_merge.  */
9641 
9642 static rtx
9643 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9644 				    rtx target)
9645 {
9646   rtx pat;
9647   tree arg0 = CALL_EXPR_ARG (exp, 0);
9648   rtx op1, op0 = expand_normal (arg0);
9649   machine_mode tmode = insn_data[icode].operand[0].mode;
9650   machine_mode mode0 = insn_data[icode].operand[1].mode;
9651 
9652   if (optimize || !target
9653       || GET_MODE (target) != tmode
9654       || !insn_data[icode].operand[0].predicate (target, tmode))
9655     target = gen_reg_rtx (tmode);
9656 
9657   if (VECTOR_MODE_P (mode0))
9658     op0 = safe_vector_operand (op0, mode0);
9659 
9660   if ((optimize && !register_operand (op0, mode0))
9661       || !insn_data[icode].operand[1].predicate (op0, mode0))
9662     op0 = copy_to_mode_reg (mode0, op0);
9663 
9664   op1 = op0;
9665   if (!insn_data[icode].operand[2].predicate (op1, mode0))
9666     op1 = copy_to_mode_reg (mode0, op1);
9667 
9668   pat = GEN_FCN (icode) (target, op0, op1);
9669   if (! pat)
9670     return 0;
9671   emit_insn (pat);
9672   return target;
9673 }
9674 
9675 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
9676 
9677 static rtx
9678 ix86_expand_sse_compare (const struct builtin_description *d,
9679 			 tree exp, rtx target, bool swap)
9680 {
9681   rtx pat;
9682   tree arg0 = CALL_EXPR_ARG (exp, 0);
9683   tree arg1 = CALL_EXPR_ARG (exp, 1);
9684   rtx op0 = expand_normal (arg0);
9685   rtx op1 = expand_normal (arg1);
9686   rtx op2;
9687   machine_mode tmode = insn_data[d->icode].operand[0].mode;
9688   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9689   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9690   enum rtx_code comparison = d->comparison;
9691 
9692   if (VECTOR_MODE_P (mode0))
9693     op0 = safe_vector_operand (op0, mode0);
9694   if (VECTOR_MODE_P (mode1))
9695     op1 = safe_vector_operand (op1, mode1);
9696 
9697   /* Swap operands if we have a comparison that isn't available in
9698      hardware.  */
9699   if (swap)
9700     std::swap (op0, op1);
9701 
9702   if (optimize || !target
9703       || GET_MODE (target) != tmode
9704       || !insn_data[d->icode].operand[0].predicate (target, tmode))
9705     target = gen_reg_rtx (tmode);
9706 
9707   if ((optimize && !register_operand (op0, mode0))
9708       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9709     op0 = copy_to_mode_reg (mode0, op0);
9710   if ((optimize && !register_operand (op1, mode1))
9711       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9712     op1 = copy_to_mode_reg (mode1, op1);
9713 
9714   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9715   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9716   if (! pat)
9717     return 0;
9718   emit_insn (pat);
9719   return target;
9720 }
9721 
9722 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
9723 
9724 static rtx
9725 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
9726 		      rtx target)
9727 {
9728   rtx pat;
9729   tree arg0 = CALL_EXPR_ARG (exp, 0);
9730   tree arg1 = CALL_EXPR_ARG (exp, 1);
9731   rtx op0 = expand_normal (arg0);
9732   rtx op1 = expand_normal (arg1);
9733   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9734   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9735   enum rtx_code comparison = d->comparison;
9736 
9737   if (VECTOR_MODE_P (mode0))
9738     op0 = safe_vector_operand (op0, mode0);
9739   if (VECTOR_MODE_P (mode1))
9740     op1 = safe_vector_operand (op1, mode1);
9741 
9742   target = gen_reg_rtx (SImode);
9743   emit_move_insn (target, const0_rtx);
9744   target = gen_rtx_SUBREG (QImode, target, 0);
9745 
9746   if ((optimize && !register_operand (op0, mode0))
9747       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9748     op0 = copy_to_mode_reg (mode0, op0);
9749   if ((optimize && !register_operand (op1, mode1))
9750       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9751     op1 = copy_to_mode_reg (mode1, op1);
9752 
9753   pat = GEN_FCN (d->icode) (op0, op1);
9754   if (! pat)
9755     return 0;
9756   emit_insn (pat);
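  /* The comparison pattern above only sets the flags; store the requested
     comparison result into the low byte of TARGET and return the SImode
     register (zeroed above) that contains it.  */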
9757   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9758 			  gen_rtx_fmt_ee (comparison, QImode,
9759 					  SET_DEST (pat),
9760 					  const0_rtx)));
9761 
9762   return SUBREG_REG (target);
9763 }
9764 
9765 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
9766 
9767 static rtx
9768 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
9769 		       rtx target)
9770 {
9771   rtx pat;
9772   tree arg0 = CALL_EXPR_ARG (exp, 0);
9773   rtx op1, op0 = expand_normal (arg0);
9774   machine_mode tmode = insn_data[d->icode].operand[0].mode;
9775   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9776 
9777   if (optimize || target == 0
9778       || GET_MODE (target) != tmode
9779       || !insn_data[d->icode].operand[0].predicate (target, tmode))
9780     target = gen_reg_rtx (tmode);
9781 
9782   if (VECTOR_MODE_P (mode0))
9783     op0 = safe_vector_operand (op0, mode0);
9784 
9785   if ((optimize && !register_operand (op0, mode0))
9786       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9787     op0 = copy_to_mode_reg (mode0, op0);
9788 
9789   op1 = GEN_INT (d->comparison);
9790 
9791   pat = GEN_FCN (d->icode) (target, op0, op1);
9792   if (! pat)
9793     return 0;
9794   emit_insn (pat);
9795   return target;
9796 }
9797 
9798 static rtx
9799 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
9800 				     tree exp, rtx target)
9801 {
9802   rtx pat;
9803   tree arg0 = CALL_EXPR_ARG (exp, 0);
9804   tree arg1 = CALL_EXPR_ARG (exp, 1);
9805   rtx op0 = expand_normal (arg0);
9806   rtx op1 = expand_normal (arg1);
9807   rtx op2;
9808   machine_mode tmode = insn_data[d->icode].operand[0].mode;
9809   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9810   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9811 
9812   if (optimize || target == 0
9813       || GET_MODE (target) != tmode
9814       || !insn_data[d->icode].operand[0].predicate (target, tmode))
9815     target = gen_reg_rtx (tmode);
9816 
9817   op0 = safe_vector_operand (op0, mode0);
9818   op1 = safe_vector_operand (op1, mode1);
9819 
9820   if ((optimize && !register_operand (op0, mode0))
9821       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9822     op0 = copy_to_mode_reg (mode0, op0);
9823   if ((optimize && !register_operand (op1, mode1))
9824       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9825     op1 = copy_to_mode_reg (mode1, op1);
9826 
9827   op2 = GEN_INT (d->comparison);
9828 
9829   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9830   if (! pat)
9831     return 0;
9832   emit_insn (pat);
9833   return target;
9834 }
9835 
9836 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
9837 
9838 static rtx
9839 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
9840 		       rtx target)
9841 {
9842   rtx pat;
9843   tree arg0 = CALL_EXPR_ARG (exp, 0);
9844   tree arg1 = CALL_EXPR_ARG (exp, 1);
9845   rtx op0 = expand_normal (arg0);
9846   rtx op1 = expand_normal (arg1);
9847   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9848   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9849   enum rtx_code comparison = d->comparison;
9850 
9851   if (VECTOR_MODE_P (mode0))
9852     op0 = safe_vector_operand (op0, mode0);
9853   if (VECTOR_MODE_P (mode1))
9854     op1 = safe_vector_operand (op1, mode1);
9855 
9856   target = gen_reg_rtx (SImode);
9857   emit_move_insn (target, const0_rtx);
9858   target = gen_rtx_SUBREG (QImode, target, 0);
9859 
9860   if ((optimize && !register_operand (op0, mode0))
9861       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9862     op0 = copy_to_mode_reg (mode0, op0);
9863   if ((optimize && !register_operand (op1, mode1))
9864       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9865     op1 = copy_to_mode_reg (mode1, op1);
9866 
9867   pat = GEN_FCN (d->icode) (op0, op1);
9868   if (! pat)
9869     return 0;
9870   emit_insn (pat);
9871   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9872 			  gen_rtx_fmt_ee (comparison, QImode,
9873 					  SET_DEST (pat),
9874 					  const0_rtx)));
9875 
9876   return SUBREG_REG (target);
9877 }
9878 
9879 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
9880 
9881 static rtx
9882 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
9883 			  tree exp, rtx target)
9884 {
9885   rtx pat;
9886   tree arg0 = CALL_EXPR_ARG (exp, 0);
9887   tree arg1 = CALL_EXPR_ARG (exp, 1);
9888   tree arg2 = CALL_EXPR_ARG (exp, 2);
9889   tree arg3 = CALL_EXPR_ARG (exp, 3);
9890   tree arg4 = CALL_EXPR_ARG (exp, 4);
9891   rtx scratch0, scratch1;
9892   rtx op0 = expand_normal (arg0);
9893   rtx op1 = expand_normal (arg1);
9894   rtx op2 = expand_normal (arg2);
9895   rtx op3 = expand_normal (arg3);
9896   rtx op4 = expand_normal (arg4);
9897   machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
9898 
9899   tmode0 = insn_data[d->icode].operand[0].mode;
9900   tmode1 = insn_data[d->icode].operand[1].mode;
9901   modev2 = insn_data[d->icode].operand[2].mode;
9902   modei3 = insn_data[d->icode].operand[3].mode;
9903   modev4 = insn_data[d->icode].operand[4].mode;
9904   modei5 = insn_data[d->icode].operand[5].mode;
9905   modeimm = insn_data[d->icode].operand[6].mode;
9906 
9907   if (VECTOR_MODE_P (modev2))
9908     op0 = safe_vector_operand (op0, modev2);
9909   if (VECTOR_MODE_P (modev4))
9910     op2 = safe_vector_operand (op2, modev4);
9911 
9912   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
9913     op0 = copy_to_mode_reg (modev2, op0);
9914   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
9915     op1 = copy_to_mode_reg (modei3, op1);
9916   if ((optimize && !register_operand (op2, modev4))
9917       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
9918     op2 = copy_to_mode_reg (modev4, op2);
9919   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
9920     op3 = copy_to_mode_reg (modei5, op3);
9921 
9922   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
9923     {
9924       error ("the fifth argument must be an 8-bit immediate");
9925       return const0_rtx;
9926     }
9927 
9928   if (d->code == IX86_BUILTIN_PCMPESTRI128)
9929     {
9930       if (optimize || !target
9931 	  || GET_MODE (target) != tmode0
9932 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
9933 	target = gen_reg_rtx (tmode0);
9934 
9935       scratch1 = gen_reg_rtx (tmode1);
9936 
9937       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
9938     }
9939   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
9940     {
9941       if (optimize || !target
9942 	  || GET_MODE (target) != tmode1
9943 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
9944 	target = gen_reg_rtx (tmode1);
9945 
9946       scratch0 = gen_reg_rtx (tmode0);
9947 
9948       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
9949     }
9950   else
9951     {
9952       gcc_assert (d->flag);
9953 
9954       scratch0 = gen_reg_rtx (tmode0);
9955       scratch1 = gen_reg_rtx (tmode1);
9956 
9957       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
9958     }
9959 
9960   if (! pat)
9961     return 0;
9962 
9963   emit_insn (pat);
9964 
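  /* For the flag-returning variants, d->flag is the CC mode in which
     FLAGS_REG must be read; extract the requested flag into the low
     byte of an SImode pseudo, as in ix86_expand_sse_ptest.  */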
9965   if (d->flag)
9966     {
9967       target = gen_reg_rtx (SImode);
9968       emit_move_insn (target, const0_rtx);
9969       target = gen_rtx_SUBREG (QImode, target, 0);
9970 
9971       emit_insn
9972 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9973 		      gen_rtx_fmt_ee (EQ, QImode,
9974 				      gen_rtx_REG ((machine_mode) d->flag,
9975 						   FLAGS_REG),
9976 				      const0_rtx)));
9977       return SUBREG_REG (target);
9978     }
9979   else
9980     return target;
9981 }
9982 
9983 
9984 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
9985 
9986 static rtx
9987 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
9988 			  tree exp, rtx target)
9989 {
9990   rtx pat;
9991   tree arg0 = CALL_EXPR_ARG (exp, 0);
9992   tree arg1 = CALL_EXPR_ARG (exp, 1);
9993   tree arg2 = CALL_EXPR_ARG (exp, 2);
9994   rtx scratch0, scratch1;
9995   rtx op0 = expand_normal (arg0);
9996   rtx op1 = expand_normal (arg1);
9997   rtx op2 = expand_normal (arg2);
9998   machine_mode tmode0, tmode1, modev2, modev3, modeimm;
9999 
10000   tmode0 = insn_data[d->icode].operand[0].mode;
10001   tmode1 = insn_data[d->icode].operand[1].mode;
10002   modev2 = insn_data[d->icode].operand[2].mode;
10003   modev3 = insn_data[d->icode].operand[3].mode;
10004   modeimm = insn_data[d->icode].operand[4].mode;
10005 
10006   if (VECTOR_MODE_P (modev2))
10007     op0 = safe_vector_operand (op0, modev2);
10008   if (VECTOR_MODE_P (modev3))
10009     op1 = safe_vector_operand (op1, modev3);
10010 
10011   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10012     op0 = copy_to_mode_reg (modev2, op0);
10013   if ((optimize && !register_operand (op1, modev3))
10014       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10015     op1 = copy_to_mode_reg (modev3, op1);
10016 
10017   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10018     {
10019       error ("the third argument must be an 8-bit immediate");
10020       return const0_rtx;
10021     }
10022 
10023   if (d->code == IX86_BUILTIN_PCMPISTRI128)
10024     {
10025       if (optimize || !target
10026 	  || GET_MODE (target) != tmode0
10027 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10028 	target = gen_reg_rtx (tmode0);
10029 
10030       scratch1 = gen_reg_rtx (tmode1);
10031 
10032       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10033     }
10034   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10035     {
10036       if (optimize || !target
10037 	  || GET_MODE (target) != tmode1
10038 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10039 	target = gen_reg_rtx (tmode1);
10040 
10041       scratch0 = gen_reg_rtx (tmode0);
10042 
10043       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10044     }
10045   else
10046     {
10047       gcc_assert (d->flag);
10048 
10049       scratch0 = gen_reg_rtx (tmode0);
10050       scratch1 = gen_reg_rtx (tmode1);
10051 
10052       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10053     }
10054 
10055   if (! pat)
10056     return 0;
10057 
10058   emit_insn (pat);
10059 
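  /* Flag-returning variants are handled exactly as in
     ix86_expand_sse_pcmpestr above.  */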
10060   if (d->flag)
10061     {
10062       target = gen_reg_rtx (SImode);
10063       emit_move_insn (target, const0_rtx);
10064       target = gen_rtx_SUBREG (QImode, target, 0);
10065 
10066       emit_insn
10067 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10068 		      gen_rtx_fmt_ee (EQ, QImode,
10069 				      gen_rtx_REG ((machine_mode) d->flag,
10070 						   FLAGS_REG),
10071 				      const0_rtx)));
10072       return SUBREG_REG (target);
10073     }
10074   else
10075     return target;
10076 }
10077 
10078 /* Fixup modeless constants to fit required mode.  */
10079 
10080 static rtx
10081 fixup_modeless_constant (rtx x, machine_mode mode)
10082 {
10083   if (GET_MODE (x) == VOIDmode)
10084     x = convert_to_mode (mode, x, 1);
10085   return x;
10086 }
10087 
10088 /* Subroutine of ix86_expand_builtin to take care of insns with
10089    variable number of operands.  */
10090 
10091 static rtx
10092 ix86_expand_args_builtin (const struct builtin_description *d,
10093 			  tree exp, rtx target)
10094 {
10095   rtx pat, real_target;
10096   unsigned int i, nargs;
10097   unsigned int nargs_constant = 0;
10098   unsigned int mask_pos = 0;
10099   int num_memory = 0;
10100   rtx xops[6];
10101   bool second_arg_count = false;
10102   enum insn_code icode = d->icode;
10103   const struct insn_data_d *insn_p = &insn_data[icode];
10104   machine_mode tmode = insn_p->operand[0].mode;
10105   machine_mode rmode = VOIDmode;
10106   bool swap = false;
10107   enum rtx_code comparison = d->comparison;
10108 
10109   switch ((enum ix86_builtin_func_type) d->flag)
10110     {
10111     case V2DF_FTYPE_V2DF_ROUND:
10112     case V4DF_FTYPE_V4DF_ROUND:
10113     case V8DF_FTYPE_V8DF_ROUND:
10114     case V4SF_FTYPE_V4SF_ROUND:
10115     case V8SF_FTYPE_V8SF_ROUND:
10116     case V16SF_FTYPE_V16SF_ROUND:
10117     case V8HF_FTYPE_V8HF_ROUND:
10118     case V16HF_FTYPE_V16HF_ROUND:
10119     case V32HF_FTYPE_V32HF_ROUND:
10120     case V4SI_FTYPE_V4SF_ROUND:
10121     case V8SI_FTYPE_V8SF_ROUND:
10122     case V16SI_FTYPE_V16SF_ROUND:
10123       return ix86_expand_sse_round (d, exp, target);
10124     case V4SI_FTYPE_V2DF_V2DF_ROUND:
10125     case V8SI_FTYPE_V4DF_V4DF_ROUND:
10126     case V16SI_FTYPE_V8DF_V8DF_ROUND:
10127       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10128     case INT_FTYPE_V8SF_V8SF_PTEST:
10129     case INT_FTYPE_V4DI_V4DI_PTEST:
10130     case INT_FTYPE_V4DF_V4DF_PTEST:
10131     case INT_FTYPE_V4SF_V4SF_PTEST:
10132     case INT_FTYPE_V2DI_V2DI_PTEST:
10133     case INT_FTYPE_V2DF_V2DF_PTEST:
10134       return ix86_expand_sse_ptest (d, exp, target);
10135     case FLOAT128_FTYPE_FLOAT128:
10136     case FLOAT_FTYPE_FLOAT:
10137     case INT_FTYPE_INT:
10138     case UINT_FTYPE_UINT:
10139     case UINT16_FTYPE_UINT16:
10140     case UINT64_FTYPE_INT:
10141     case UINT64_FTYPE_UINT64:
10142     case INT64_FTYPE_INT64:
10143     case INT64_FTYPE_V4SF:
10144     case INT64_FTYPE_V2DF:
10145     case INT_FTYPE_V16QI:
10146     case INT_FTYPE_V8QI:
10147     case INT_FTYPE_V8SF:
10148     case INT_FTYPE_V4DF:
10149     case INT_FTYPE_V4SF:
10150     case INT_FTYPE_V2DF:
10151     case INT_FTYPE_V32QI:
10152     case V16QI_FTYPE_V16QI:
10153     case V8SI_FTYPE_V8SF:
10154     case V8SI_FTYPE_V4SI:
10155     case V8HI_FTYPE_V8HI:
10156     case V8HI_FTYPE_V16QI:
10157     case V8QI_FTYPE_V8QI:
10158     case V8SF_FTYPE_V8SF:
10159     case V8SF_FTYPE_V8SI:
10160     case V8SF_FTYPE_V4SF:
10161     case V8SF_FTYPE_V8HI:
10162     case V4SI_FTYPE_V4SI:
10163     case V4SI_FTYPE_V16QI:
10164     case V4SI_FTYPE_V4SF:
10165     case V4SI_FTYPE_V8SI:
10166     case V4SI_FTYPE_V8HI:
10167     case V4SI_FTYPE_V4DF:
10168     case V4SI_FTYPE_V2DF:
10169     case V4HI_FTYPE_V4HI:
10170     case V4DF_FTYPE_V4DF:
10171     case V4DF_FTYPE_V4SI:
10172     case V4DF_FTYPE_V4SF:
10173     case V4DF_FTYPE_V2DF:
10174     case V4SF_FTYPE_V4SF:
10175     case V4SF_FTYPE_V4SI:
10176     case V4SF_FTYPE_V8SF:
10177     case V4SF_FTYPE_V4DF:
10178     case V4SF_FTYPE_V8HI:
10179     case V4SF_FTYPE_V2DF:
10180     case V2DI_FTYPE_V2DI:
10181     case V2DI_FTYPE_V16QI:
10182     case V2DI_FTYPE_V8HI:
10183     case V2DI_FTYPE_V4SI:
10184     case V2DF_FTYPE_V2DF:
10185     case V2DF_FTYPE_V4SI:
10186     case V2DF_FTYPE_V4DF:
10187     case V2DF_FTYPE_V4SF:
10188     case V2DF_FTYPE_V2SI:
10189     case V2SI_FTYPE_V2SI:
10190     case V2SI_FTYPE_V4SF:
10191     case V2SI_FTYPE_V2SF:
10192     case V2SI_FTYPE_V2DF:
10193     case V2SF_FTYPE_V2SF:
10194     case V2SF_FTYPE_V2SI:
10195     case V32QI_FTYPE_V32QI:
10196     case V32QI_FTYPE_V16QI:
10197     case V16HI_FTYPE_V16HI:
10198     case V16HI_FTYPE_V8HI:
10199     case V8SI_FTYPE_V8SI:
10200     case V16HI_FTYPE_V16QI:
10201     case V8SI_FTYPE_V16QI:
10202     case V4DI_FTYPE_V16QI:
10203     case V8SI_FTYPE_V8HI:
10204     case V4DI_FTYPE_V8HI:
10205     case V4DI_FTYPE_V4SI:
10206     case V4DI_FTYPE_V2DI:
10207     case UQI_FTYPE_UQI:
10208     case UHI_FTYPE_UHI:
10209     case USI_FTYPE_USI:
10210     case USI_FTYPE_UQI:
10211     case USI_FTYPE_UHI:
10212     case UDI_FTYPE_UDI:
10213     case UHI_FTYPE_V16QI:
10214     case USI_FTYPE_V32QI:
10215     case UDI_FTYPE_V64QI:
10216     case V16QI_FTYPE_UHI:
10217     case V32QI_FTYPE_USI:
10218     case V64QI_FTYPE_UDI:
10219     case V8HI_FTYPE_UQI:
10220     case V16HI_FTYPE_UHI:
10221     case V32HI_FTYPE_USI:
10222     case V4SI_FTYPE_UQI:
10223     case V8SI_FTYPE_UQI:
10224     case V4SI_FTYPE_UHI:
10225     case V8SI_FTYPE_UHI:
10226     case UQI_FTYPE_V8HI:
10227     case UHI_FTYPE_V16HI:
10228     case USI_FTYPE_V32HI:
10229     case UQI_FTYPE_V4SI:
10230     case UQI_FTYPE_V8SI:
10231     case UHI_FTYPE_V16SI:
10232     case UQI_FTYPE_V2DI:
10233     case UQI_FTYPE_V4DI:
10234     case UQI_FTYPE_V8DI:
10235     case V16SI_FTYPE_UHI:
10236     case V2DI_FTYPE_UQI:
10237     case V4DI_FTYPE_UQI:
10238     case V16SI_FTYPE_INT:
10239     case V16SF_FTYPE_V8SF:
10240     case V16SI_FTYPE_V8SI:
10241     case V16SF_FTYPE_V4SF:
10242     case V16SI_FTYPE_V4SI:
10243     case V16SI_FTYPE_V16SF:
10244     case V16SI_FTYPE_V16SI:
10245     case V64QI_FTYPE_V64QI:
10246     case V32HI_FTYPE_V32HI:
10247     case V16SF_FTYPE_V16SF:
10248     case V8DI_FTYPE_UQI:
10249     case V8DI_FTYPE_V8DI:
10250     case V8DF_FTYPE_V4DF:
10251     case V8DF_FTYPE_V2DF:
10252     case V8DF_FTYPE_V8DF:
10253     case V4DI_FTYPE_V4DI:
10254     case V16HI_FTYPE_V16SF:
10255     case V8HI_FTYPE_V8SF:
10256     case V8HI_FTYPE_V4SF:
10257       nargs = 1;
10258       break;
10259     case V4SF_FTYPE_V4SF_VEC_MERGE:
10260     case V2DF_FTYPE_V2DF_VEC_MERGE:
10261       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10262     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10263     case V16QI_FTYPE_V16QI_V16QI:
10264     case V16QI_FTYPE_V8HI_V8HI:
10265     case V16HF_FTYPE_V16HF_V16HF:
10266     case V16SF_FTYPE_V16SF_V16SF:
10267     case V8QI_FTYPE_V8QI_V8QI:
10268     case V8QI_FTYPE_V4HI_V4HI:
10269     case V8HI_FTYPE_V8HI_V8HI:
10270     case V8HI_FTYPE_V16QI_V16QI:
10271     case V8HI_FTYPE_V4SI_V4SI:
10272     case V8HF_FTYPE_V8HF_V8HF:
10273     case V8SF_FTYPE_V8SF_V8SF:
10274     case V8SF_FTYPE_V8SF_V8SI:
10275     case V8DF_FTYPE_V8DF_V8DF:
10276     case V4SI_FTYPE_V4SI_V4SI:
10277     case V4SI_FTYPE_V8HI_V8HI:
10278     case V4SI_FTYPE_V2DF_V2DF:
10279     case V4HI_FTYPE_V4HI_V4HI:
10280     case V4HI_FTYPE_V8QI_V8QI:
10281     case V4HI_FTYPE_V2SI_V2SI:
10282     case V4DF_FTYPE_V4DF_V4DF:
10283     case V4DF_FTYPE_V4DF_V4DI:
10284     case V4SF_FTYPE_V4SF_V4SF:
10285     case V4SF_FTYPE_V4SF_V4SI:
10286     case V4SF_FTYPE_V4SF_V2SI:
10287     case V4SF_FTYPE_V4SF_V2DF:
10288     case V4SF_FTYPE_V4SF_UINT:
10289     case V4SF_FTYPE_V4SF_DI:
10290     case V4SF_FTYPE_V4SF_SI:
10291     case V2DI_FTYPE_V2DI_V2DI:
10292     case V2DI_FTYPE_V16QI_V16QI:
10293     case V2DI_FTYPE_V4SI_V4SI:
10294     case V2DI_FTYPE_V2DI_V16QI:
10295     case V2SI_FTYPE_V2SI_V2SI:
10296     case V2SI_FTYPE_V4HI_V4HI:
10297     case V2SI_FTYPE_V2SF_V2SF:
10298     case V2DF_FTYPE_V2DF_V2DF:
10299     case V2DF_FTYPE_V2DF_V4SF:
10300     case V2DF_FTYPE_V2DF_V2DI:
10301     case V2DF_FTYPE_V2DF_DI:
10302     case V2DF_FTYPE_V2DF_SI:
10303     case V2DF_FTYPE_V2DF_UINT:
10304     case V2SF_FTYPE_V2SF_V2SF:
10305     case V1DI_FTYPE_V1DI_V1DI:
10306     case V1DI_FTYPE_V8QI_V8QI:
10307     case V1DI_FTYPE_V2SI_V2SI:
10308     case V32QI_FTYPE_V16HI_V16HI:
10309     case V16HI_FTYPE_V8SI_V8SI:
10310     case V64QI_FTYPE_V64QI_V64QI:
10311     case V32QI_FTYPE_V32QI_V32QI:
10312     case V16HI_FTYPE_V32QI_V32QI:
10313     case V16HI_FTYPE_V16HI_V16HI:
10314     case V8SI_FTYPE_V4DF_V4DF:
10315     case V8SI_FTYPE_V8SI_V8SI:
10316     case V8SI_FTYPE_V16HI_V16HI:
10317     case V4DI_FTYPE_V4DI_V4DI:
10318     case V4DI_FTYPE_V8SI_V8SI:
10319     case V8DI_FTYPE_V64QI_V64QI:
10320       if (comparison == UNKNOWN)
10321 	return ix86_expand_binop_builtin (icode, exp, target);
10322       nargs = 2;
10323       break;
10324     case V4SF_FTYPE_V4SF_V4SF_SWAP:
10325     case V2DF_FTYPE_V2DF_V2DF_SWAP:
10326       gcc_assert (comparison != UNKNOWN);
10327       nargs = 2;
10328       swap = true;
10329       break;
10330     case V16HI_FTYPE_V16HI_V8HI_COUNT:
10331     case V16HI_FTYPE_V16HI_SI_COUNT:
10332     case V8SI_FTYPE_V8SI_V4SI_COUNT:
10333     case V8SI_FTYPE_V8SI_SI_COUNT:
10334     case V4DI_FTYPE_V4DI_V2DI_COUNT:
10335     case V4DI_FTYPE_V4DI_INT_COUNT:
10336     case V8HI_FTYPE_V8HI_V8HI_COUNT:
10337     case V8HI_FTYPE_V8HI_SI_COUNT:
10338     case V4SI_FTYPE_V4SI_V4SI_COUNT:
10339     case V4SI_FTYPE_V4SI_SI_COUNT:
10340     case V4HI_FTYPE_V4HI_V4HI_COUNT:
10341     case V4HI_FTYPE_V4HI_SI_COUNT:
10342     case V2DI_FTYPE_V2DI_V2DI_COUNT:
10343     case V2DI_FTYPE_V2DI_SI_COUNT:
10344     case V2SI_FTYPE_V2SI_V2SI_COUNT:
10345     case V2SI_FTYPE_V2SI_SI_COUNT:
10346     case V1DI_FTYPE_V1DI_V1DI_COUNT:
10347     case V1DI_FTYPE_V1DI_SI_COUNT:
10348       nargs = 2;
10349       second_arg_count = true;
10350       break;
10351     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10352     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10353     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10354     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10355     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10356     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10357     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10358     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10359     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10360     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10361     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10362     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10363     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10364     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10365     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10366     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10367     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10368     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10369       nargs = 4;
10370       second_arg_count = true;
10371       break;
10372     case UINT64_FTYPE_UINT64_UINT64:
10373     case UINT_FTYPE_UINT_UINT:
10374     case UINT_FTYPE_UINT_USHORT:
10375     case UINT_FTYPE_UINT_UCHAR:
10376     case UINT16_FTYPE_UINT16_INT:
10377     case UINT8_FTYPE_UINT8_INT:
10378     case UQI_FTYPE_UQI_UQI:
10379     case UHI_FTYPE_UHI_UHI:
10380     case USI_FTYPE_USI_USI:
10381     case UDI_FTYPE_UDI_UDI:
10382     case V16SI_FTYPE_V8DF_V8DF:
10383     case V32HI_FTYPE_V16SF_V16SF:
10384     case V16HI_FTYPE_V8SF_V8SF:
10385     case V8HI_FTYPE_V4SF_V4SF:
10386     case V16HI_FTYPE_V16SF_UHI:
10387     case V8HI_FTYPE_V8SF_UQI:
10388     case V8HI_FTYPE_V4SF_UQI:
10389       nargs = 2;
10390       break;
10391     case V2DI_FTYPE_V2DI_INT_CONVERT:
10392       nargs = 2;
10393       rmode = V1TImode;
10394       nargs_constant = 1;
10395       break;
10396     case V4DI_FTYPE_V4DI_INT_CONVERT:
10397       nargs = 2;
10398       rmode = V2TImode;
10399       nargs_constant = 1;
10400       break;
10401     case V8DI_FTYPE_V8DI_INT_CONVERT:
10402       nargs = 2;
10403       rmode = V4TImode;
10404       nargs_constant = 1;
10405       break;
10406     case V8HI_FTYPE_V8HI_INT:
10407     case V8HI_FTYPE_V8SF_INT:
10408     case V16HI_FTYPE_V16SF_INT:
10409     case V8HI_FTYPE_V4SF_INT:
10410     case V8SF_FTYPE_V8SF_INT:
10411     case V4SF_FTYPE_V16SF_INT:
10412     case V16SF_FTYPE_V16SF_INT:
10413     case V4SI_FTYPE_V4SI_INT:
10414     case V4SI_FTYPE_V8SI_INT:
10415     case V4HI_FTYPE_V4HI_INT:
10416     case V4DF_FTYPE_V4DF_INT:
10417     case V4DF_FTYPE_V8DF_INT:
10418     case V4SF_FTYPE_V4SF_INT:
10419     case V4SF_FTYPE_V8SF_INT:
10420     case V2DI_FTYPE_V2DI_INT:
10421     case V2DF_FTYPE_V2DF_INT:
10422     case V2DF_FTYPE_V4DF_INT:
10423     case V16HI_FTYPE_V16HI_INT:
10424     case V8SI_FTYPE_V8SI_INT:
10425     case V16SI_FTYPE_V16SI_INT:
10426     case V4SI_FTYPE_V16SI_INT:
10427     case V4DI_FTYPE_V4DI_INT:
10428     case V2DI_FTYPE_V4DI_INT:
10429     case V4DI_FTYPE_V8DI_INT:
10430     case UQI_FTYPE_UQI_UQI_CONST:
10431     case UHI_FTYPE_UHI_UQI:
10432     case USI_FTYPE_USI_UQI:
10433     case UDI_FTYPE_UDI_UQI:
10434       nargs = 2;
10435       nargs_constant = 1;
10436       break;
10437     case V16QI_FTYPE_V16QI_V16QI_V16QI:
10438     case V8SF_FTYPE_V8SF_V8SF_V8SF:
10439     case V4DF_FTYPE_V4DF_V4DF_V4DF:
10440     case V4SF_FTYPE_V4SF_V4SF_V4SF:
10441     case V2DF_FTYPE_V2DF_V2DF_V2DF:
10442     case V32QI_FTYPE_V32QI_V32QI_V32QI:
10443     case UHI_FTYPE_V16SI_V16SI_UHI:
10444     case UQI_FTYPE_V8DI_V8DI_UQI:
10445     case V16HI_FTYPE_V16SI_V16HI_UHI:
10446     case V16QI_FTYPE_V16SI_V16QI_UHI:
10447     case V16QI_FTYPE_V8DI_V16QI_UQI:
10448     case V32HF_FTYPE_V32HF_V32HF_USI:
10449     case V16SF_FTYPE_V16SF_V16SF_UHI:
10450     case V16SF_FTYPE_V4SF_V16SF_UHI:
10451     case V16SI_FTYPE_SI_V16SI_UHI:
10452     case V16SI_FTYPE_V16HI_V16SI_UHI:
10453     case V16SI_FTYPE_V16QI_V16SI_UHI:
10454     case V8SF_FTYPE_V4SF_V8SF_UQI:
10455     case V4DF_FTYPE_V2DF_V4DF_UQI:
10456     case V8SI_FTYPE_V4SI_V8SI_UQI:
10457     case V8SI_FTYPE_SI_V8SI_UQI:
10458     case V4SI_FTYPE_V4SI_V4SI_UQI:
10459     case V4SI_FTYPE_SI_V4SI_UQI:
10460     case V4DI_FTYPE_V2DI_V4DI_UQI:
10461     case V4DI_FTYPE_DI_V4DI_UQI:
10462     case V2DI_FTYPE_V2DI_V2DI_UQI:
10463     case V2DI_FTYPE_DI_V2DI_UQI:
10464     case V64QI_FTYPE_V64QI_V64QI_UDI:
10465     case V64QI_FTYPE_V16QI_V64QI_UDI:
10466     case V64QI_FTYPE_QI_V64QI_UDI:
10467     case V32QI_FTYPE_V32QI_V32QI_USI:
10468     case V32QI_FTYPE_V16QI_V32QI_USI:
10469     case V32QI_FTYPE_QI_V32QI_USI:
10470     case V16QI_FTYPE_V16QI_V16QI_UHI:
10471     case V16QI_FTYPE_QI_V16QI_UHI:
10472     case V32HI_FTYPE_V8HI_V32HI_USI:
10473     case V32HI_FTYPE_HI_V32HI_USI:
10474     case V16HI_FTYPE_V8HI_V16HI_UHI:
10475     case V16HI_FTYPE_HI_V16HI_UHI:
10476     case V8HI_FTYPE_V8HI_V8HI_UQI:
10477     case V8HI_FTYPE_HI_V8HI_UQI:
10478     case V16HF_FTYPE_V16HF_V16HF_UHI:
10479     case V8SF_FTYPE_V8HI_V8SF_UQI:
10480     case V4SF_FTYPE_V8HI_V4SF_UQI:
10481     case V8SI_FTYPE_V8HF_V8SI_UQI:
10482     case V8SF_FTYPE_V8HF_V8SF_UQI:
10483     case V8SI_FTYPE_V8SF_V8SI_UQI:
10484     case V4SI_FTYPE_V4SF_V4SI_UQI:
10485     case V4SI_FTYPE_V8HF_V4SI_UQI:
10486     case V4SF_FTYPE_V8HF_V4SF_UQI:
10487     case V4DI_FTYPE_V8HF_V4DI_UQI:
10488     case V4DI_FTYPE_V4SF_V4DI_UQI:
10489     case V2DI_FTYPE_V8HF_V2DI_UQI:
10490     case V2DI_FTYPE_V4SF_V2DI_UQI:
10491     case V8HF_FTYPE_V8HF_V8HF_UQI:
10492     case V8HF_FTYPE_V8HF_V8HF_V8HF:
10493     case V8HF_FTYPE_V8HI_V8HF_UQI:
10494     case V8HF_FTYPE_V8SI_V8HF_UQI:
10495     case V8HF_FTYPE_V8SF_V8HF_UQI:
10496     case V8HF_FTYPE_V4SI_V8HF_UQI:
10497     case V8HF_FTYPE_V4SF_V8HF_UQI:
10498     case V8HF_FTYPE_V4DI_V8HF_UQI:
10499     case V8HF_FTYPE_V4DF_V8HF_UQI:
10500     case V8HF_FTYPE_V2DI_V8HF_UQI:
10501     case V8HF_FTYPE_V2DF_V8HF_UQI:
10502     case V4SF_FTYPE_V4DI_V4SF_UQI:
10503     case V4SF_FTYPE_V2DI_V4SF_UQI:
10504     case V4DF_FTYPE_V4DI_V4DF_UQI:
10505     case V4DF_FTYPE_V8HF_V4DF_UQI:
10506     case V2DF_FTYPE_V8HF_V2DF_UQI:
10507     case V2DF_FTYPE_V2DI_V2DF_UQI:
10508     case V16QI_FTYPE_V8HI_V16QI_UQI:
10509     case V16QI_FTYPE_V16HI_V16QI_UHI:
10510     case V16QI_FTYPE_V4SI_V16QI_UQI:
10511     case V16QI_FTYPE_V8SI_V16QI_UQI:
10512     case V8HI_FTYPE_V8HF_V8HI_UQI:
10513     case V8HI_FTYPE_V4SI_V8HI_UQI:
10514     case V8HI_FTYPE_V8SI_V8HI_UQI:
10515     case V16QI_FTYPE_V2DI_V16QI_UQI:
10516     case V16QI_FTYPE_V4DI_V16QI_UQI:
10517     case V8HI_FTYPE_V2DI_V8HI_UQI:
10518     case V8HI_FTYPE_V4DI_V8HI_UQI:
10519     case V4SI_FTYPE_V2DI_V4SI_UQI:
10520     case V4SI_FTYPE_V4DI_V4SI_UQI:
10521     case V32QI_FTYPE_V32HI_V32QI_USI:
10522     case UHI_FTYPE_V16QI_V16QI_UHI:
10523     case USI_FTYPE_V32QI_V32QI_USI:
10524     case UDI_FTYPE_V64QI_V64QI_UDI:
10525     case UQI_FTYPE_V8HI_V8HI_UQI:
10526     case UHI_FTYPE_V16HI_V16HI_UHI:
10527     case USI_FTYPE_V32HI_V32HI_USI:
10528     case UQI_FTYPE_V4SI_V4SI_UQI:
10529     case UQI_FTYPE_V8SI_V8SI_UQI:
10530     case UQI_FTYPE_V2DI_V2DI_UQI:
10531     case UQI_FTYPE_V4DI_V4DI_UQI:
10532     case V4SF_FTYPE_V2DF_V4SF_UQI:
10533     case V4SF_FTYPE_V4DF_V4SF_UQI:
10534     case V16SI_FTYPE_V16SI_V16SI_UHI:
10535     case V16SI_FTYPE_V4SI_V16SI_UHI:
10536     case V2DI_FTYPE_V4SI_V2DI_UQI:
10537     case V2DI_FTYPE_V8HI_V2DI_UQI:
10538     case V2DI_FTYPE_V16QI_V2DI_UQI:
10539     case V4DI_FTYPE_V4DI_V4DI_UQI:
10540     case V4DI_FTYPE_V4SI_V4DI_UQI:
10541     case V4DI_FTYPE_V8HI_V4DI_UQI:
10542     case V4DI_FTYPE_V16QI_V4DI_UQI:
10543     case V4DI_FTYPE_V4DF_V4DI_UQI:
10544     case V2DI_FTYPE_V2DF_V2DI_UQI:
10545     case V4SI_FTYPE_V4DF_V4SI_UQI:
10546     case V4SI_FTYPE_V2DF_V4SI_UQI:
10547     case V4SI_FTYPE_V8HI_V4SI_UQI:
10548     case V4SI_FTYPE_V16QI_V4SI_UQI:
10549     case V4DI_FTYPE_V4DI_V4DI_V4DI:
10550     case V8DF_FTYPE_V2DF_V8DF_UQI:
10551     case V8DF_FTYPE_V4DF_V8DF_UQI:
10552     case V8DF_FTYPE_V8DF_V8DF_UQI:
10553     case V8SF_FTYPE_V8SF_V8SF_UQI:
10554     case V8SF_FTYPE_V8SI_V8SF_UQI:
10555     case V4DF_FTYPE_V4DF_V4DF_UQI:
10556     case V4SF_FTYPE_V4SF_V4SF_UQI:
10557     case V2DF_FTYPE_V2DF_V2DF_UQI:
10558     case V2DF_FTYPE_V4SF_V2DF_UQI:
10559     case V2DF_FTYPE_V4SI_V2DF_UQI:
10560     case V4SF_FTYPE_V4SI_V4SF_UQI:
10561     case V4DF_FTYPE_V4SF_V4DF_UQI:
10562     case V4DF_FTYPE_V4SI_V4DF_UQI:
10563     case V8SI_FTYPE_V8SI_V8SI_UQI:
10564     case V8SI_FTYPE_V8HI_V8SI_UQI:
10565     case V8SI_FTYPE_V16QI_V8SI_UQI:
10566     case V8DF_FTYPE_V8SI_V8DF_UQI:
10567     case V8DI_FTYPE_DI_V8DI_UQI:
10568     case V16SF_FTYPE_V8SF_V16SF_UHI:
10569     case V16SI_FTYPE_V8SI_V16SI_UHI:
10570     case V16HF_FTYPE_V16HI_V16HF_UHI:
10571     case V16HF_FTYPE_V16HF_V16HF_V16HF:
10572     case V16HI_FTYPE_V16HF_V16HI_UHI:
10573     case V16HI_FTYPE_V16HI_V16HI_UHI:
10574     case V8HI_FTYPE_V16QI_V8HI_UQI:
10575     case V16HI_FTYPE_V16QI_V16HI_UHI:
10576     case V32HI_FTYPE_V32HI_V32HI_USI:
10577     case V32HI_FTYPE_V32QI_V32HI_USI:
10578     case V8DI_FTYPE_V16QI_V8DI_UQI:
10579     case V8DI_FTYPE_V2DI_V8DI_UQI:
10580     case V8DI_FTYPE_V4DI_V8DI_UQI:
10581     case V8DI_FTYPE_V8DI_V8DI_UQI:
10582     case V8DI_FTYPE_V8HI_V8DI_UQI:
10583     case V8DI_FTYPE_V8SI_V8DI_UQI:
10584     case V8HI_FTYPE_V8DI_V8HI_UQI:
10585     case V8SI_FTYPE_V8DI_V8SI_UQI:
10586     case V4SI_FTYPE_V4SI_V4SI_V4SI:
10587     case V16SI_FTYPE_V16SI_V16SI_V16SI:
10588     case V8DI_FTYPE_V8DI_V8DI_V8DI:
10589     case V32HI_FTYPE_V32HI_V32HI_V32HI:
10590     case V2DI_FTYPE_V2DI_V2DI_V2DI:
10591     case V16HI_FTYPE_V16HI_V16HI_V16HI:
10592     case V8SI_FTYPE_V8SI_V8SI_V8SI:
10593     case V8HI_FTYPE_V8HI_V8HI_V8HI:
10594     case V32HI_FTYPE_V16SF_V16SF_USI:
10595     case V16HI_FTYPE_V8SF_V8SF_UHI:
10596     case V8HI_FTYPE_V4SF_V4SF_UQI:
10597     case V16HI_FTYPE_V16SF_V16HI_UHI:
10598     case V8HI_FTYPE_V8SF_V8HI_UQI:
10599     case V8HI_FTYPE_V4SF_V8HI_UQI:
10600     case V16SF_FTYPE_V16SF_V32HI_V32HI:
10601     case V8SF_FTYPE_V8SF_V16HI_V16HI:
10602     case V4SF_FTYPE_V4SF_V8HI_V8HI:
10603       nargs = 3;
10604       break;
10605     case V32QI_FTYPE_V32QI_V32QI_INT:
10606     case V16HI_FTYPE_V16HI_V16HI_INT:
10607     case V16QI_FTYPE_V16QI_V16QI_INT:
10608     case V4DI_FTYPE_V4DI_V4DI_INT:
10609     case V8HI_FTYPE_V8HI_V8HI_INT:
10610     case V8SI_FTYPE_V8SI_V8SI_INT:
10611     case V8SI_FTYPE_V8SI_V4SI_INT:
10612     case V8SF_FTYPE_V8SF_V8SF_INT:
10613     case V8SF_FTYPE_V8SF_V4SF_INT:
10614     case V4SI_FTYPE_V4SI_V4SI_INT:
10615     case V4DF_FTYPE_V4DF_V4DF_INT:
10616     case V16SF_FTYPE_V16SF_V16SF_INT:
10617     case V16SF_FTYPE_V16SF_V4SF_INT:
10618     case V16SI_FTYPE_V16SI_V4SI_INT:
10619     case V4DF_FTYPE_V4DF_V2DF_INT:
10620     case V4SF_FTYPE_V4SF_V4SF_INT:
10621     case V2DI_FTYPE_V2DI_V2DI_INT:
10622     case V4DI_FTYPE_V4DI_V2DI_INT:
10623     case V2DF_FTYPE_V2DF_V2DF_INT:
10624     case UQI_FTYPE_V8DI_V8UDI_INT:
10625     case UQI_FTYPE_V8DF_V8DF_INT:
10626     case UQI_FTYPE_V2DF_V2DF_INT:
10627     case UQI_FTYPE_V4SF_V4SF_INT:
10628     case UHI_FTYPE_V16SI_V16SI_INT:
10629     case UHI_FTYPE_V16SF_V16SF_INT:
10630     case V64QI_FTYPE_V64QI_V64QI_INT:
10631     case V32HI_FTYPE_V32HI_V32HI_INT:
10632     case V16SI_FTYPE_V16SI_V16SI_INT:
10633     case V8DI_FTYPE_V8DI_V8DI_INT:
10634       nargs = 3;
10635       nargs_constant = 1;
10636       break;
10637     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10638       nargs = 3;
10639       rmode = V4DImode;
10640       nargs_constant = 1;
10641       break;
10642     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10643       nargs = 3;
10644       rmode = V2DImode;
10645       nargs_constant = 1;
10646       break;
10647     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10648       nargs = 3;
10649       rmode = DImode;
10650       nargs_constant = 1;
10651       break;
10652     case V2DI_FTYPE_V2DI_UINT_UINT:
10653       nargs = 3;
10654       nargs_constant = 2;
10655       break;
10656     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10657       nargs = 3;
10658       rmode = V8DImode;
10659       nargs_constant = 1;
10660       break;
10661     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10662       nargs = 5;
10663       rmode = V8DImode;
10664       mask_pos = 2;
10665       nargs_constant = 1;
10666       break;
10667     case QI_FTYPE_V8DF_INT_UQI:
10668     case QI_FTYPE_V4DF_INT_UQI:
10669     case QI_FTYPE_V2DF_INT_UQI:
10670     case HI_FTYPE_V16SF_INT_UHI:
10671     case QI_FTYPE_V8SF_INT_UQI:
10672     case QI_FTYPE_V4SF_INT_UQI:
10673     case QI_FTYPE_V8HF_INT_UQI:
10674     case HI_FTYPE_V16HF_INT_UHI:
10675     case SI_FTYPE_V32HF_INT_USI:
10676     case V4SI_FTYPE_V4SI_V4SI_UHI:
10677     case V8SI_FTYPE_V8SI_V8SI_UHI:
10678       nargs = 3;
10679       mask_pos = 1;
10680       nargs_constant = 1;
10681       break;
10682     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
10683       nargs = 5;
10684       rmode = V4DImode;
10685       mask_pos = 2;
10686       nargs_constant = 1;
10687       break;
10688     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
10689       nargs = 5;
10690       rmode = V2DImode;
10691       mask_pos = 2;
10692       nargs_constant = 1;
10693       break;
10694     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
10695     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
10696     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
10697     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
10698     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
10699     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
10700     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
10701     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
10702     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
10703     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
10704     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
10705     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
10706     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
10707     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
10708     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
10709     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
10710     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
10711     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
10712     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
10713     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
10714     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
10715     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
10716     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
10717     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
10718     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
10719     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
10720     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
10721     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
10722     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
10723     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
10724     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
10725     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
10726     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
10727     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
10728     case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
10729     case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
10730     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
10731     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
10732     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
10733     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
10734     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
10735     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
10736     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
10737     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
10738     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
10739     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
10740     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
10741     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
10742     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
10743     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
10744     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
10745     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
10746     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
10747     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
10748     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
10749     case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
10750     case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
10751     case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
10752       nargs = 4;
10753       break;
10754     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
10755     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
10756     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
10757     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
10758     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
10759       nargs = 4;
10760       nargs_constant = 1;
10761       break;
10762     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
10763     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
10764     case QI_FTYPE_V4DF_V4DF_INT_UQI:
10765     case QI_FTYPE_V8SF_V8SF_INT_UQI:
10766     case UHI_FTYPE_V16HF_V16HF_INT_UHI:
10767     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
10768     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
10769     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
10770     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
10771     case UQI_FTYPE_V8HF_V8HF_INT_UQI:
10772     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
10773     case USI_FTYPE_V32QI_V32QI_INT_USI:
10774     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
10775     case USI_FTYPE_V32HI_V32HI_INT_USI:
10776     case USI_FTYPE_V32HF_V32HF_INT_USI:
10777     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
10778     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
10779       nargs = 4;
10780       mask_pos = 1;
10781       nargs_constant = 1;
10782       break;
10783     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
10784       nargs = 4;
10785       nargs_constant = 2;
10786       break;
10787     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
10788     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
10789     case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
10790     case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
10791     case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
10792       nargs = 4;
10793       break;
10794     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
10795     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
10796       mask_pos = 1;
10797       nargs = 4;
10798       nargs_constant = 1;
10799       break;
10800     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
10801     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
10802     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
10803     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
10804     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
10805     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
10806     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
10807     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
10808     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
10809     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
10810     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
10811     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
10812     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
10813     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
10814     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
10815     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
10816     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
10817     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
10818     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
10819     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
10820     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
10821     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
10822     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
10823     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
10824     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
10825     case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
10826     case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
10827     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
10828     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
10829     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
10830     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
10831     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
10832       nargs = 4;
10833       mask_pos = 2;
10834       nargs_constant = 1;
10835       break;
10836     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
10837     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
10838     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
10839     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
10840     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
10841     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
10842     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
10843     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
10844     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
10845     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
10846     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
10847     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
10848     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
10849     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
10850     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
10851     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
10852     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
10853     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
10854     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
10855     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
10856     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
10857     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
10858     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
10859     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
10860     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
10861     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
10862     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
10863       nargs = 5;
10864       mask_pos = 2;
10865       nargs_constant = 1;
10866       break;
10867     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
10868     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
10869     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
10870     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
10871     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
10872     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
10873     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
10874     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
10875     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
10876     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
10877       nargs = 5;
10878       mask_pos = 1;
10879       nargs_constant = 1;
10880       break;
10881     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
10882     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
10883     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
10884     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
10885     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
10886     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
10887     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
10888     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
10889     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
10890     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
10891     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
10892     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
10893       nargs = 5;
10894       mask_pos = 1;
10895       nargs_constant = 2;
10896       break;
10897 
10898     default:
10899       gcc_unreachable ();
10900     }
10901 
10902   gcc_assert (nargs <= ARRAY_SIZE (xops));
10903 
10904   if (comparison != UNKNOWN)
10905     {
10906       gcc_assert (nargs == 2);
10907       return ix86_expand_sse_compare (d, exp, target, swap);
10908     }
10909 
10910   if (rmode == VOIDmode || rmode == tmode)
10911     {
10912       if (optimize
10913 	  || target == 0
10914 	  || GET_MODE (target) != tmode
10915 	  || !insn_p->operand[0].predicate (target, tmode))
10916 	target = gen_reg_rtx (tmode);
10917       else if (memory_operand (target, tmode))
10918 	num_memory++;
10919       real_target = target;
10920     }
10921   else
10922     {
10923       real_target = gen_reg_rtx (tmode);
10924       target = lowpart_subreg (rmode, real_target, tmode);
10925     }
10926 
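  /* Expand and legitimize the operands.  Shift counts
     (second_arg_count), immediate arguments (selected via
     nargs_constant/mask_pos) and ordinary vector/scalar operands each
     get their own handling below.  */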
10927   for (i = 0; i < nargs; i++)
10928     {
10929       tree arg = CALL_EXPR_ARG (exp, i);
10930       rtx op = expand_normal (arg);
10931       machine_mode mode = insn_p->operand[i + 1].mode;
10932       bool match = insn_p->operand[i + 1].predicate (op, mode);
10933 
10934       if (second_arg_count && i == 1)
10935 	{
10936 	  /* SIMD shift insns take either an 8-bit immediate or a
10937 	     register as the shift count, but the builtin functions
10938 	     take an int.  If the count does not match, put it in a
10939 	     register.  The instructions use a 64-bit count; if op is
10940 	     only 32 bits wide, zero-extend it, since negative shift
10941 	     counts are undefined behavior and zero extension is more
10942 	     efficient.  */
10943 	  if (!match)
10944 	    {
10945 	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
10946 		op = convert_modes (mode, GET_MODE (op), op, 1);
10947 	      else
10948 		op = lowpart_subreg (mode, op, GET_MODE (op));
10949 	      if (!insn_p->operand[i + 1].predicate (op, mode))
10950 		op = copy_to_reg (op);
10951 	    }
10952 	}
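      /* Operands that must be immediates: without mask_pos the last
	 nargs_constant arguments, with mask_pos the ones sitting just
	 in front of the trailing mask operands.  */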
10953       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
10954 	       || (!mask_pos && (nargs - i) <= nargs_constant))
10955 	{
10956 	  if (!match)
10957 	    switch (icode)
10958 	      {
10959 	      case CODE_FOR_avx_vinsertf128v4di:
10960 	      case CODE_FOR_avx_vextractf128v4di:
10961 		error ("the last argument must be a 1-bit immediate");
10962 		return const0_rtx;
10963 
10964 	      case CODE_FOR_avx512f_cmpv8di3_mask:
10965 	      case CODE_FOR_avx512f_cmpv16si3_mask:
10966 	      case CODE_FOR_avx512f_ucmpv8di3_mask:
10967 	      case CODE_FOR_avx512f_ucmpv16si3_mask:
10968 	      case CODE_FOR_avx512vl_cmpv4di3_mask:
10969 	      case CODE_FOR_avx512vl_cmpv8si3_mask:
10970 	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
10971 	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
10972 	      case CODE_FOR_avx512vl_cmpv2di3_mask:
10973 	      case CODE_FOR_avx512vl_cmpv4si3_mask:
10974 	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
10975 	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
10976 		error ("the last argument must be a 3-bit immediate");
10977 		return const0_rtx;
10978 
10979 	      case CODE_FOR_sse4_1_roundsd:
10980 	      case CODE_FOR_sse4_1_roundss:
10981 
10982 	      case CODE_FOR_sse4_1_roundpd:
10983 	      case CODE_FOR_sse4_1_roundps:
10984 	      case CODE_FOR_avx_roundpd256:
10985 	      case CODE_FOR_avx_roundps256:
10986 
10987 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
10988 	      case CODE_FOR_sse4_1_roundps_sfix:
10989 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
10990 	      case CODE_FOR_avx_roundps_sfix256:
10991 
10992 	      case CODE_FOR_sse4_1_blendps:
10993 	      case CODE_FOR_avx_blendpd256:
10994 	      case CODE_FOR_avx_vpermilv4df:
10995 	      case CODE_FOR_avx_vpermilv4df_mask:
10996 	      case CODE_FOR_avx512f_getmantv8df_mask:
10997 	      case CODE_FOR_avx512f_getmantv16sf_mask:
10998 	      case CODE_FOR_avx512vl_getmantv16hf_mask:
10999 	      case CODE_FOR_avx512vl_getmantv8sf_mask:
11000 	      case CODE_FOR_avx512vl_getmantv4df_mask:
11001 	      case CODE_FOR_avx512fp16_getmantv8hf_mask:
11002 	      case CODE_FOR_avx512vl_getmantv4sf_mask:
11003 	      case CODE_FOR_avx512vl_getmantv2df_mask:
11004 	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
11005 	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11006 	      case CODE_FOR_avx512dq_rangepv4df_mask:
11007 	      case CODE_FOR_avx512dq_rangepv8sf_mask:
11008 	      case CODE_FOR_avx512dq_rangepv2df_mask:
11009 	      case CODE_FOR_avx512dq_rangepv4sf_mask:
11010 	      case CODE_FOR_avx_shufpd256_mask:
11011 		error ("the last argument must be a 4-bit immediate");
11012 		return const0_rtx;
11013 
11014 	      case CODE_FOR_sha1rnds4:
11015 	      case CODE_FOR_sse4_1_blendpd:
11016 	      case CODE_FOR_avx_vpermilv2df:
11017 	      case CODE_FOR_avx_vpermilv2df_mask:
11018 	      case CODE_FOR_xop_vpermil2v2df3:
11019 	      case CODE_FOR_xop_vpermil2v4sf3:
11020 	      case CODE_FOR_xop_vpermil2v4df3:
11021 	      case CODE_FOR_xop_vpermil2v8sf3:
11022 	      case CODE_FOR_avx512f_vinsertf32x4_mask:
11023 	      case CODE_FOR_avx512f_vinserti32x4_mask:
11024 	      case CODE_FOR_avx512f_vextractf32x4_mask:
11025 	      case CODE_FOR_avx512f_vextracti32x4_mask:
11026 	      case CODE_FOR_sse2_shufpd:
11027 	      case CODE_FOR_sse2_shufpd_mask:
11028 	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
11029 	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
11030 	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
11031 	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
11032 		error ("the last argument must be a 2-bit immediate");
11033 		return const0_rtx;
11034 
11035 	      case CODE_FOR_avx_vextractf128v4df:
11036 	      case CODE_FOR_avx_vextractf128v8sf:
11037 	      case CODE_FOR_avx_vextractf128v8si:
11038 	      case CODE_FOR_avx_vinsertf128v4df:
11039 	      case CODE_FOR_avx_vinsertf128v8sf:
11040 	      case CODE_FOR_avx_vinsertf128v8si:
11041 	      case CODE_FOR_avx512f_vinsertf64x4_mask:
11042 	      case CODE_FOR_avx512f_vinserti64x4_mask:
11043 	      case CODE_FOR_avx512f_vextractf64x4_mask:
11044 	      case CODE_FOR_avx512f_vextracti64x4_mask:
11045 	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
11046 	      case CODE_FOR_avx512dq_vinserti32x8_mask:
11047 	      case CODE_FOR_avx512vl_vinsertv4df:
11048 	      case CODE_FOR_avx512vl_vinsertv4di:
11049 	      case CODE_FOR_avx512vl_vinsertv8sf:
11050 	      case CODE_FOR_avx512vl_vinsertv8si:
11051 		error ("the last argument must be a 1-bit immediate");
11052 		return const0_rtx;
11053 
11054 	      case CODE_FOR_avx_vmcmpv2df3:
11055 	      case CODE_FOR_avx_vmcmpv4sf3:
11056 	      case CODE_FOR_avx_cmpv2df3:
11057 	      case CODE_FOR_avx_cmpv4sf3:
11058 	      case CODE_FOR_avx_cmpv4df3:
11059 	      case CODE_FOR_avx_cmpv8sf3:
11060 	      case CODE_FOR_avx512f_cmpv8df3_mask:
11061 	      case CODE_FOR_avx512f_cmpv16sf3_mask:
11062 	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
11063 	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11064 	      case CODE_FOR_avx512bw_cmpv32hf3_mask:
11065 	      case CODE_FOR_avx512vl_cmpv16hf3_mask:
11066 	      case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11067 		error ("the last argument must be a 5-bit immediate");
11068 		return const0_rtx;
11069 
11070 	      default:
11071 		switch (nargs_constant)
11072 		  {
11073 		  case 2:
11074 		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
11075 			|| (!mask_pos && (nargs - i) == nargs_constant))
11076 		      {
11077 			error ("the next to last argument must be an 8-bit immediate");
11078 			break;
11079 		      }
11080 		    /* FALLTHRU */
11081 		  case 1:
11082 		    error ("the last argument must be an 8-bit immediate");
11083 		    break;
11084 		  default:
11085 		    gcc_unreachable ();
11086 		  }
11087 		return const0_rtx;
11088 	      }
11089 	}
11090       else
11091 	{
11092 	  if (VECTOR_MODE_P (mode))
11093 	    op = safe_vector_operand (op, mode);
11094 
11095 	  /* If we aren't optimizing, only allow one memory operand to
11096 	     be generated.  */
11097 	  if (memory_operand (op, mode))
11098 	    num_memory++;
11099 
11100 	  op = fixup_modeless_constant (op, mode);
11101 
11102 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11103 	    {
11104 	      if (optimize || !match || num_memory > 1)
11105 		op = copy_to_mode_reg (mode, op);
11106 	    }
11107 	  else
11108 	    {
11109 	      op = copy_to_reg (op);
11110 	      op = lowpart_subreg (mode, op, GET_MODE (op));
11111 	    }
11112 	}
11113 
11114       xops[i] = op;
11115     }
11116 
11117   switch (nargs)
11118     {
11119     case 1:
11120       pat = GEN_FCN (icode) (real_target, xops[0]);
11121       break;
11122     case 2:
11123       pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11124       break;
11125     case 3:
11126       pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11127       break;
11128     case 4:
11129       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11130 			     xops[2], xops[3]);
11131       break;
11132     case 5:
11133       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11134 			     xops[2], xops[3], xops[4]);
11135       break;
11136     case 6:
11137       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11138 			     xops[2], xops[3], xops[4], xops[5]);
11139       break;
11140     default:
11141       gcc_unreachable ();
11142     }
11143 
11144   if (! pat)
11145     return 0;
11146 
11147   emit_insn (pat);
11148   return target;
11149 }
11150 
11151 /* Transform a pattern of the following layout:
11152      (set A
11153        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11154    into:
11155      (set A B)
11156    i.e. drop the embedded rounding operand C.  */
11157 
11158 static rtx
11159 ix86_erase_embedded_rounding (rtx pat)
11160 {
11161   if (GET_CODE (pat) == INSN)
11162     pat = PATTERN (pat);
11163 
11164   gcc_assert (GET_CODE (pat) == SET);
11165   rtx src = SET_SRC (pat);
11166   gcc_assert (XVECLEN (src, 0) == 2);
11167   rtx p0 = XVECEXP (src, 0, 0);
11168   gcc_assert (GET_CODE (src) == UNSPEC
11169 	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11170   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11171   return res;
11172 }
11173 
11174 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11175    with rounding.  */
11176 static rtx
11177 ix86_expand_sse_comi_round (const struct builtin_description *d,
11178 			    tree exp, rtx target)
11179 {
11180   rtx pat, set_dst;
11181   tree arg0 = CALL_EXPR_ARG (exp, 0);
11182   tree arg1 = CALL_EXPR_ARG (exp, 1);
11183   tree arg2 = CALL_EXPR_ARG (exp, 2);
11184   tree arg3 = CALL_EXPR_ARG (exp, 3);
11185   rtx op0 = expand_normal (arg0);
11186   rtx op1 = expand_normal (arg1);
11187   rtx op2 = expand_normal (arg2);
11188   rtx op3 = expand_normal (arg3);
11189   enum insn_code icode = d->icode;
11190   const struct insn_data_d *insn_p = &insn_data[icode];
11191   machine_mode mode0 = insn_p->operand[0].mode;
11192   machine_mode mode1 = insn_p->operand[1].mode;
11193 
11194   /* See avxintrin.h for values.  */
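  /* comparisons[] maps each _CMP_* predicate (0..31) to the rtx code
     used for the flags test; ordereds[] records whether the predicate
     is an ordered comparison and non_signalings[] whether it is a
     quiet (non-signaling) one.  */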
11195   static const enum rtx_code comparisons[32] =
11196     {
11197       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11198       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11199       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11200       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11201     };
11202   static const bool ordereds[32] =
11203     {
11204       true,  true,  true,  false, false, false, false, true,
11205       false, false, false, true,  true,  true,  true,  false,
11206       true,  true,  true,  false, false, false, false, true,
11207       false, false, false, true,  true,  true,  true,  false
11208     };
11209   static const bool non_signalings[32] =
11210     {
11211       true,  false, false, true,  true,  false, false, true,
11212       true,  false, false, true,  true,  false, false, true,
11213       false, true,  true,  false, false, true,  true,  false,
11214       false, true,  true,  false, false, true,  true,  false
11215     };
11216 
11217   if (!CONST_INT_P (op2))
11218     {
11219       error ("the third argument must be a comparison constant");
11220       return const0_rtx;
11221     }
11222   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11223     {
11224       error ("incorrect comparison mode");
11225       return const0_rtx;
11226     }
11227 
11228   if (!insn_p->operand[2].predicate (op3, SImode))
11229     {
11230       error ("incorrect rounding operand");
11231       return const0_rtx;
11232     }
11233 
11234   if (VECTOR_MODE_P (mode0))
11235     op0 = safe_vector_operand (op0, mode0);
11236   if (VECTOR_MODE_P (mode1))
11237     op1 = safe_vector_operand (op1, mode1);
11238 
11239   enum rtx_code comparison = comparisons[INTVAL (op2)];
11240   bool ordered = ordereds[INTVAL (op2)];
11241   bool non_signaling = non_signalings[INTVAL (op2)];
11242   rtx const_val = const0_rtx;
11243 
11244   bool check_unordered = false;
11245   machine_mode mode = CCFPmode;
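  /* Pick the CC mode and rtx code used to test the flags set by
     COMI/UCOMI, and note whether an extra unordered check is needed
     for the EQ/NE predicates.  */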
11246   switch (comparison)
11247     {
11248     case ORDERED:
11249       if (!ordered)
11250 	{
11251 	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
11252 	  if (!non_signaling)
11253 	    ordered = true;
11254 	  mode = CCSmode;
11255 	}
11256       else
11257 	{
11258 	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
11259 	  if (non_signaling)
11260 	    ordered = false;
11261 	  mode = CCPmode;
11262 	}
11263       comparison = NE;
11264       break;
11265     case UNORDERED:
11266       if (ordered)
11267 	{
11268 	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
11269 	  if (non_signaling)
11270 	    ordered = false;
11271 	  mode = CCSmode;
11272 	}
11273       else
11274 	{
11275 	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
11276 	  if (!non_signaling)
11277 	    ordered = true;
11278 	  mode = CCPmode;
11279 	}
11280       comparison = EQ;
11281       break;
11282 
11283     case LE:	/* -> GE  */
11284     case LT:	/* -> GT  */
11285     case UNGE:	/* -> UNLE  */
11286     case UNGT:	/* -> UNLT  */
11287       std::swap (op0, op1);
11288       comparison = swap_condition (comparison);
11289       /* FALLTHRU */
11290     case GT:
11291     case GE:
11292     case UNEQ:
11293     case UNLT:
11294     case UNLE:
11295     case LTGT:
11296       /* These are supported by CCFPmode.  NB: Use ordered/signaling
11297 	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
11298 	 with NAN operands.  */
11299       if (ordered == non_signaling)
11300 	ordered = !ordered;
11301       break;
11302     case EQ:
11303       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
11304 	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
11305       check_unordered = true;
11306       mode = CCZmode;
11307       break;
11308     case NE:
11309       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
11310 	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
11311       gcc_assert (!ordered);
11312       check_unordered = true;
11313       mode = CCZmode;
11314       const_val = const1_rtx;
11315       break;
11316     default:
11317       gcc_unreachable ();
11318     }
11319 
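  /* Preload the result with const_val (1 for the NE predicates); if
     the unordered check emitted below branches over the flags test,
     this preloaded value is the value returned.  */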
11320   target = gen_reg_rtx (SImode);
11321   emit_move_insn (target, const_val);
11322   target = gen_rtx_SUBREG (QImode, target, 0);
11323 
11324   if ((optimize && !register_operand (op0, mode0))
11325       || !insn_p->operand[0].predicate (op0, mode0))
11326     op0 = copy_to_mode_reg (mode0, op0);
11327   if ((optimize && !register_operand (op1, mode1))
11328       || !insn_p->operand[1].predicate (op1, mode1))
11329     op1 = copy_to_mode_reg (mode1, op1);
11330 
11331   /*
11332      1. COMI: ordered and signaling.
11333      2. UCOMI: unordered and non-signaling.
11334    */
11335   if (non_signaling)
11336     icode = (icode == CODE_FOR_sse_comi_round
11337 	     ? CODE_FOR_sse_ucomi_round
11338 	     : CODE_FOR_sse2_ucomi_round);
11339 
11340   pat = GEN_FCN (icode) (op0, op1, op3);
11341   if (! pat)
11342     return 0;
11343 
11344   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
11345   if (INTVAL (op3) == NO_ROUND)
11346     {
11347       pat = ix86_erase_embedded_rounding (pat);
11348       if (! pat)
11349 	return 0;
11350 
11351       set_dst = SET_DEST (pat);
11352     }
11353   else
11354     {
11355       gcc_assert (GET_CODE (pat) == SET);
11356       set_dst = SET_DEST (pat);
11357     }
11358 
11359   emit_insn (pat);
11360 
11361   rtx_code_label *label = NULL;
11362 
11363   /* NB: For ordered EQ or unordered NE, checking ZF alone is not
11364      sufficient with NAN operands.  */
11365   if (check_unordered)
11366     {
11367       gcc_assert (comparison == EQ || comparison == NE);
11368 
11369       rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
11370       label = gen_label_rtx ();
11371       rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
11372       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
11373 				  gen_rtx_LABEL_REF (VOIDmode, label),
11374 				  pc_rtx);
11375       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
11376     }
11377 
11378   /* NB: The compare sets CCFPmode; here we test the flags through a
11379      different CC mode that is a subset of CCFPmode.  */
11380   if (GET_MODE (set_dst) != mode)
11381     {
11382       gcc_assert (mode == CCAmode || mode == CCCmode
11383 		  || mode == CCOmode || mode == CCPmode
11384 		  || mode == CCSmode || mode == CCZmode);
11385       set_dst = gen_rtx_REG (mode, FLAGS_REG);
11386     }
11387 
11388   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11389 			  gen_rtx_fmt_ee (comparison, QImode,
11390 					  set_dst,
11391 					  const0_rtx)));
11392 
11393   if (label)
11394     emit_label (label);
11395 
11396   return SUBREG_REG (target);
11397 }
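
/* Illustrative example, not part of the original source: given __m128d
   values A and B, a call such as

     #include <immintrin.h>
     int r = _mm_comi_round_sd (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);

   reaches ix86_expand_sse_comi_round via the INT_FTYPE_V2DF_V2DF_INT_INT
   builtin type and is lowered to a COMI/UCOMI compare followed by a
   setcc on the CC mode chosen above.  */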
11398 
11399 static rtx
11400 ix86_expand_round_builtin (const struct builtin_description *d,
11401 			   tree exp, rtx target)
11402 {
11403   rtx pat;
11404   unsigned int i, nargs;
11405   rtx xops[6];
11406   enum insn_code icode = d->icode;
11407   const struct insn_data_d *insn_p = &insn_data[icode];
11408   machine_mode tmode = insn_p->operand[0].mode;
11409   unsigned int nargs_constant = 0;
11410   unsigned int redundant_embed_rnd = 0;
11411 
11412   switch ((enum ix86_builtin_func_type) d->flag)
11413     {
11414     case UINT64_FTYPE_V2DF_INT:
11415     case UINT64_FTYPE_V4SF_INT:
11416     case UINT64_FTYPE_V8HF_INT:
11417     case UINT_FTYPE_V2DF_INT:
11418     case UINT_FTYPE_V4SF_INT:
11419     case UINT_FTYPE_V8HF_INT:
11420     case INT64_FTYPE_V2DF_INT:
11421     case INT64_FTYPE_V4SF_INT:
11422     case INT64_FTYPE_V8HF_INT:
11423     case INT_FTYPE_V2DF_INT:
11424     case INT_FTYPE_V4SF_INT:
11425     case INT_FTYPE_V8HF_INT:
11426       nargs = 2;
11427       break;
11428     case V32HF_FTYPE_V32HF_V32HF_INT:
11429     case V8HF_FTYPE_V8HF_V8HF_INT:
11430     case V8HF_FTYPE_V8HF_INT_INT:
11431     case V8HF_FTYPE_V8HF_UINT_INT:
11432     case V8HF_FTYPE_V8HF_INT64_INT:
11433     case V8HF_FTYPE_V8HF_UINT64_INT:
11434     case V4SF_FTYPE_V4SF_UINT_INT:
11435     case V4SF_FTYPE_V4SF_UINT64_INT:
11436     case V2DF_FTYPE_V2DF_UINT64_INT:
11437     case V4SF_FTYPE_V4SF_INT_INT:
11438     case V4SF_FTYPE_V4SF_INT64_INT:
11439     case V2DF_FTYPE_V2DF_INT64_INT:
11440     case V4SF_FTYPE_V4SF_V4SF_INT:
11441     case V2DF_FTYPE_V2DF_V2DF_INT:
11442     case V4SF_FTYPE_V4SF_V2DF_INT:
11443     case V2DF_FTYPE_V2DF_V4SF_INT:
11444       nargs = 3;
11445       break;
11446     case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11447     case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11448     case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11449     case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11450     case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11451     case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11452     case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11453     case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11454     case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11455     case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11456     case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11457     case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11458     case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11459     case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11460     case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11461     case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11462     case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11463     case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11464     case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11465     case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11466     case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11467     case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11468     case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11469     case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11470     case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11471     case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11472     case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11473       nargs = 4;
11474       break;
11475     case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11476     case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11477       nargs_constant = 2;
11478       nargs = 4;
11479       break;
11480     case INT_FTYPE_V4SF_V4SF_INT_INT:
11481     case INT_FTYPE_V2DF_V2DF_INT_INT:
11482       return ix86_expand_sse_comi_round (d, exp, target);
11483     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11484     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11485     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11486     case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11487     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11488     case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11489     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11490     case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11491     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11492     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11493     case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11494     case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11495     case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11496     case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11497     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11498     case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11499     case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11500       nargs = 5;
11501       break;
11502     case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11503     case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11504     case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11505     case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11506     case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11507       nargs_constant = 4;
11508       nargs = 5;
11509       break;
11510     case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11511     case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11512     case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11513     case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11514     case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11515     case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11516       nargs_constant = 3;
11517       nargs = 5;
11518       break;
11519     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11520     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11521     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11522     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11523     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11524     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11525     case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11526       nargs = 6;
11527       nargs_constant = 4;
11528       break;
11529     case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11530     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11531     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11532     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11533       nargs = 6;
11534       nargs_constant = 3;
11535       break;
11536     default:
11537       gcc_unreachable ();
11538     }
11539   gcc_assert (nargs <= ARRAY_SIZE (xops));
11540 
11541   if (optimize
11542       || target == 0
11543       || GET_MODE (target) != tmode
11544       || !insn_p->operand[0].predicate (target, tmode))
11545     target = gen_reg_rtx (tmode);
11546 
11547   for (i = 0; i < nargs; i++)
11548     {
11549       tree arg = CALL_EXPR_ARG (exp, i);
11550       rtx op = expand_normal (arg);
11551       machine_mode mode = insn_p->operand[i + 1].mode;
11552       bool match = insn_p->operand[i + 1].predicate (op, mode);
11553 
11554       if (i == nargs - nargs_constant)
11555 	{
11556 	  if (!match)
11557 	    {
11558 	      switch (icode)
11559 		{
11560 		case CODE_FOR_avx512f_getmantv8df_mask_round:
11561 		case CODE_FOR_avx512f_getmantv16sf_mask_round:
11562 		case CODE_FOR_avx512bw_getmantv32hf_mask_round:
11563 		case CODE_FOR_avx512f_vgetmantv2df_round:
11564 		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11565 		case CODE_FOR_avx512f_vgetmantv4sf_round:
11566 		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
11567 		case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
11568 		  error ("the immediate argument must be a 4-bit immediate");
11569 		  return const0_rtx;
11570 		case CODE_FOR_avx512f_cmpv8df3_mask_round:
11571 		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11572 		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11573 		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
11574 		case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11575 		case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
11576 		  error ("the immediate argument must be a 5-bit immediate");
11577 		  return const0_rtx;
11578 		default:
11579 		  error ("the immediate argument must be an 8-bit immediate");
11580 		  return const0_rtx;
11581 		}
11582 	    }
11583 	}
11584       else if (i == nargs-1)
11585 	{
11586 	  if (!insn_p->operand[nargs].predicate (op, SImode))
11587 	    {
11588 	      error ("incorrect rounding operand");
11589 	      return const0_rtx;
11590 	    }
11591 
11592 	  /* If there is no rounding, use the normal version of the pattern.  */
11593 	  if (INTVAL (op) == NO_ROUND)
11594 	    {
11595 	      /* Skip erasing embedded rounding for the expanders below,
11596 		 which generate multiple insns.  In ix86_erase_embedded_rounding
11597 		 the pattern is transformed to a single set, and emit_insn
11598 		 appends that set instead of inserting it into the chain, so
11599 		 the insns emitted inside the define_expand would be lost.  */
11600 	      switch (icode)
11601 		{
11602 		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11603 		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11604 		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11605 		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11606 		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11607 		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11608 		  redundant_embed_rnd = 0;
11609 		  break;
11610 		default:
11611 		  redundant_embed_rnd = 1;
11612 		  break;
11613 		}
11614 	    }
11615 	}
11616       else
11617 	{
11618 	  if (VECTOR_MODE_P (mode))
11619 	    op = safe_vector_operand (op, mode);
11620 
11621 	  op = fixup_modeless_constant (op, mode);
11622 
11623 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11624 	    {
11625 	      if (optimize || !match)
11626 		op = copy_to_mode_reg (mode, op);
11627 	    }
11628 	  else
11629 	    {
11630 	      op = copy_to_reg (op);
11631 	      op = lowpart_subreg (mode, op, GET_MODE (op));
11632 	    }
11633 	}
11634 
11635       xops[i] = op;
11636     }
11637 
11638   switch (nargs)
11639     {
11640     case 1:
11641       pat = GEN_FCN (icode) (target, xops[0]);
11642       break;
11643     case 2:
11644       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
11645       break;
11646     case 3:
11647       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
11648       break;
11649     case 4:
11650       pat = GEN_FCN (icode) (target, xops[0], xops[1],
11651 			     xops[2], xops[3]);
11652       break;
11653     case 5:
11654       pat = GEN_FCN (icode) (target, xops[0], xops[1],
11655 			     xops[2], xops[3], xops[4]);
11656       break;
11657     case 6:
11658       pat = GEN_FCN (icode) (target, xops[0], xops[1],
11659 			     xops[2], xops[3], xops[4], xops[5]);
11660       break;
11661     default:
11662       gcc_unreachable ();
11663     }
11664 
11665   if (!pat)
11666     return 0;
11667 
11668   if (redundant_embed_rnd)
11669     pat = ix86_erase_embedded_rounding (pat);
11670 
11671   emit_insn (pat);
11672   return target;
11673 }
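
/* Illustrative example, not part of the original source: given __m512d
   values A and B, an intrinsic with explicit rounding control such as

     #include <immintrin.h>
     __m512d r = _mm512_add_round_pd (a, b, _MM_FROUND_TO_NEAREST_INT
					     | _MM_FROUND_NO_EXC);

   passes the rounding mode as the trailing immediate handled above; when
   that operand is NO_ROUND (_MM_FROUND_CUR_DIRECTION), the embedded
   rounding is stripped again by ix86_erase_embedded_rounding.  */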
11674 
11675 /* Subroutine of ix86_expand_builtin to take care of special insns
11676    with variable number of operands.  */
11677 
11678 static rtx
11679 ix86_expand_special_args_builtin (const struct builtin_description *d,
11680 				  tree exp, rtx target)
11681 {
11682   tree arg;
11683   rtx pat, op;
11684   unsigned int i, nargs, arg_adjust, memory;
11685   bool aligned_mem = false;
11686   rtx xops[3];
11687   enum insn_code icode = d->icode;
11688   const struct insn_data_d *insn_p = &insn_data[icode];
11689   machine_mode tmode = insn_p->operand[0].mode;
11690   enum { load, store } klass;
11691 
11692   switch ((enum ix86_builtin_func_type) d->flag)
11693     {
11694     case VOID_FTYPE_VOID:
11695       emit_insn (GEN_FCN (icode) (target));
11696       return 0;
11697     case VOID_FTYPE_UINT64:
11698     case VOID_FTYPE_UNSIGNED:
11699       nargs = 0;
11700       klass = store;
11701       memory = 0;
11702       break;
11703 
11704     case INT_FTYPE_VOID:
11705     case USHORT_FTYPE_VOID:
11706     case UINT64_FTYPE_VOID:
11707     case UINT_FTYPE_VOID:
11708     case UINT8_FTYPE_VOID:
11709     case UNSIGNED_FTYPE_VOID:
11710       nargs = 0;
11711       klass = load;
11712       memory = 0;
11713       break;
11714     case UINT64_FTYPE_PUNSIGNED:
11715     case V2DI_FTYPE_PV2DI:
11716     case V4DI_FTYPE_PV4DI:
11717     case V32QI_FTYPE_PCCHAR:
11718     case V16QI_FTYPE_PCCHAR:
11719     case V8SF_FTYPE_PCV4SF:
11720     case V8SF_FTYPE_PCFLOAT:
11721     case V4SF_FTYPE_PCFLOAT:
11722     case V4DF_FTYPE_PCV2DF:
11723     case V4DF_FTYPE_PCDOUBLE:
11724     case V2DF_FTYPE_PCDOUBLE:
11725     case VOID_FTYPE_PVOID:
11726     case V8DI_FTYPE_PV8DI:
11727       nargs = 1;
11728       klass = load;
11729       memory = 0;
11730       switch (icode)
11731 	{
11732 	case CODE_FOR_sse4_1_movntdqa:
11733 	case CODE_FOR_avx2_movntdqa:
11734 	case CODE_FOR_avx512f_movntdqa:
11735 	  aligned_mem = true;
11736 	  break;
11737 	default:
11738 	  break;
11739 	}
11740       break;
11741     case VOID_FTYPE_PV2SF_V4SF:
11742     case VOID_FTYPE_PV8DI_V8DI:
11743     case VOID_FTYPE_PV4DI_V4DI:
11744     case VOID_FTYPE_PV2DI_V2DI:
11745     case VOID_FTYPE_PCHAR_V32QI:
11746     case VOID_FTYPE_PCHAR_V16QI:
11747     case VOID_FTYPE_PFLOAT_V16SF:
11748     case VOID_FTYPE_PFLOAT_V8SF:
11749     case VOID_FTYPE_PFLOAT_V4SF:
11750     case VOID_FTYPE_PDOUBLE_V8DF:
11751     case VOID_FTYPE_PDOUBLE_V4DF:
11752     case VOID_FTYPE_PDOUBLE_V2DF:
11753     case VOID_FTYPE_PLONGLONG_LONGLONG:
11754     case VOID_FTYPE_PULONGLONG_ULONGLONG:
11755     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
11756     case VOID_FTYPE_PINT_INT:
11757       nargs = 1;
11758       klass = store;
11759       /* Reserve memory operand for target.  */
11760       memory = ARRAY_SIZE (xops);
11761       switch (icode)
11762 	{
11763 	/* These builtins and instructions require the memory
11764 	   to be properly aligned.  */
11765 	case CODE_FOR_avx_movntv4di:
11766 	case CODE_FOR_sse2_movntv2di:
11767 	case CODE_FOR_avx_movntv8sf:
11768 	case CODE_FOR_sse_movntv4sf:
11769 	case CODE_FOR_sse4a_vmmovntv4sf:
11770 	case CODE_FOR_avx_movntv4df:
11771 	case CODE_FOR_sse2_movntv2df:
11772 	case CODE_FOR_sse4a_vmmovntv2df:
11773 	case CODE_FOR_sse2_movntidi:
11774 	case CODE_FOR_sse_movntq:
11775 	case CODE_FOR_sse2_movntisi:
11776 	case CODE_FOR_avx512f_movntv16sf:
11777 	case CODE_FOR_avx512f_movntv8df:
11778 	case CODE_FOR_avx512f_movntv8di:
11779 	  aligned_mem = true;
11780 	  break;
11781 	default:
11782 	  break;
11783 	}
11784       break;
11785     case VOID_FTYPE_PVOID_PCVOID:
11786       nargs = 1;
11787       klass = store;
11788       memory = 0;
11789       break;
11790 
11791     case V4SF_FTYPE_V4SF_PCV2SF:
11792     case V2DF_FTYPE_V2DF_PCDOUBLE:
11793       nargs = 2;
11794       klass = load;
11795       memory = 1;
11796       break;
11797     case V8SF_FTYPE_PCV8SF_V8SI:
11798     case V4DF_FTYPE_PCV4DF_V4DI:
11799     case V4SF_FTYPE_PCV4SF_V4SI:
11800     case V2DF_FTYPE_PCV2DF_V2DI:
11801     case V8SI_FTYPE_PCV8SI_V8SI:
11802     case V4DI_FTYPE_PCV4DI_V4DI:
11803     case V4SI_FTYPE_PCV4SI_V4SI:
11804     case V2DI_FTYPE_PCV2DI_V2DI:
11805     case VOID_FTYPE_INT_INT64:
11806       nargs = 2;
11807       klass = load;
11808       memory = 0;
11809       break;
11810     case VOID_FTYPE_PV8DF_V8DF_UQI:
11811     case VOID_FTYPE_PV4DF_V4DF_UQI:
11812     case VOID_FTYPE_PV2DF_V2DF_UQI:
11813     case VOID_FTYPE_PV16SF_V16SF_UHI:
11814     case VOID_FTYPE_PV8SF_V8SF_UQI:
11815     case VOID_FTYPE_PV4SF_V4SF_UQI:
11816     case VOID_FTYPE_PV8DI_V8DI_UQI:
11817     case VOID_FTYPE_PV4DI_V4DI_UQI:
11818     case VOID_FTYPE_PV2DI_V2DI_UQI:
11819     case VOID_FTYPE_PV16SI_V16SI_UHI:
11820     case VOID_FTYPE_PV8SI_V8SI_UQI:
11821     case VOID_FTYPE_PV4SI_V4SI_UQI:
11822     case VOID_FTYPE_PV64QI_V64QI_UDI:
11823     case VOID_FTYPE_PV32HI_V32HI_USI:
11824     case VOID_FTYPE_PV32QI_V32QI_USI:
11825     case VOID_FTYPE_PV16QI_V16QI_UHI:
11826     case VOID_FTYPE_PV16HI_V16HI_UHI:
11827     case VOID_FTYPE_PV8HI_V8HI_UQI:
11828       switch (icode)
11829 	{
11830 	/* These builtins and instructions require the memory
11831 	   to be properly aligned.  */
11832 	case CODE_FOR_avx512f_storev16sf_mask:
11833 	case CODE_FOR_avx512f_storev16si_mask:
11834 	case CODE_FOR_avx512f_storev8df_mask:
11835 	case CODE_FOR_avx512f_storev8di_mask:
11836 	case CODE_FOR_avx512vl_storev8sf_mask:
11837 	case CODE_FOR_avx512vl_storev8si_mask:
11838 	case CODE_FOR_avx512vl_storev4df_mask:
11839 	case CODE_FOR_avx512vl_storev4di_mask:
11840 	case CODE_FOR_avx512vl_storev4sf_mask:
11841 	case CODE_FOR_avx512vl_storev4si_mask:
11842 	case CODE_FOR_avx512vl_storev2df_mask:
11843 	case CODE_FOR_avx512vl_storev2di_mask:
11844 	  aligned_mem = true;
11845 	  break;
11846 	default:
11847 	  break;
11848 	}
11849       /* FALLTHRU */
11850     case VOID_FTYPE_PV8SF_V8SI_V8SF:
11851     case VOID_FTYPE_PV4DF_V4DI_V4DF:
11852     case VOID_FTYPE_PV4SF_V4SI_V4SF:
11853     case VOID_FTYPE_PV2DF_V2DI_V2DF:
11854     case VOID_FTYPE_PV8SI_V8SI_V8SI:
11855     case VOID_FTYPE_PV4DI_V4DI_V4DI:
11856     case VOID_FTYPE_PV4SI_V4SI_V4SI:
11857     case VOID_FTYPE_PV2DI_V2DI_V2DI:
11858     case VOID_FTYPE_PV8SI_V8DI_UQI:
11859     case VOID_FTYPE_PV8HI_V8DI_UQI:
11860     case VOID_FTYPE_PV16HI_V16SI_UHI:
11861     case VOID_FTYPE_PUDI_V8DI_UQI:
11862     case VOID_FTYPE_PV16QI_V16SI_UHI:
11863     case VOID_FTYPE_PV4SI_V4DI_UQI:
11864     case VOID_FTYPE_PUDI_V2DI_UQI:
11865     case VOID_FTYPE_PUDI_V4DI_UQI:
11866     case VOID_FTYPE_PUSI_V2DI_UQI:
11867     case VOID_FTYPE_PV8HI_V8SI_UQI:
11868     case VOID_FTYPE_PUDI_V4SI_UQI:
11869     case VOID_FTYPE_PUSI_V4DI_UQI:
11870     case VOID_FTYPE_PUHI_V2DI_UQI:
11871     case VOID_FTYPE_PUDI_V8SI_UQI:
11872     case VOID_FTYPE_PUSI_V4SI_UQI:
11873     case VOID_FTYPE_PCHAR_V64QI_UDI:
11874     case VOID_FTYPE_PCHAR_V32QI_USI:
11875     case VOID_FTYPE_PCHAR_V16QI_UHI:
11876     case VOID_FTYPE_PSHORT_V32HI_USI:
11877     case VOID_FTYPE_PSHORT_V16HI_UHI:
11878     case VOID_FTYPE_PSHORT_V8HI_UQI:
11879     case VOID_FTYPE_PINT_V16SI_UHI:
11880     case VOID_FTYPE_PINT_V8SI_UQI:
11881     case VOID_FTYPE_PINT_V4SI_UQI:
11882     case VOID_FTYPE_PINT64_V8DI_UQI:
11883     case VOID_FTYPE_PINT64_V4DI_UQI:
11884     case VOID_FTYPE_PINT64_V2DI_UQI:
11885     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
11886     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
11887     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
11888     case VOID_FTYPE_PFLOAT_V16SF_UHI:
11889     case VOID_FTYPE_PFLOAT_V8SF_UQI:
11890     case VOID_FTYPE_PFLOAT_V4SF_UQI:
11891     case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
11892     case VOID_FTYPE_PV32QI_V32HI_USI:
11893     case VOID_FTYPE_PV16QI_V16HI_UHI:
11894     case VOID_FTYPE_PUDI_V8HI_UQI:
11895       nargs = 2;
11896       klass = store;
11897       /* Reserve memory operand for target.  */
11898       memory = ARRAY_SIZE (xops);
11899       break;
11900     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
11901     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
11902     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
11903     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
11904     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
11905     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
11906     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
11907     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
11908     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
11909     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
11910     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
11911     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
11912     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
11913     case V32HI_FTYPE_PCV32HI_V32HI_USI:
11914     case V32QI_FTYPE_PCV32QI_V32QI_USI:
11915     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
11916     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
11917     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
11918       switch (icode)
11919 	{
11920 	/* These builtins and instructions require the memory
11921 	   to be properly aligned.  */
11922 	case CODE_FOR_avx512f_loadv16sf_mask:
11923 	case CODE_FOR_avx512f_loadv16si_mask:
11924 	case CODE_FOR_avx512f_loadv8df_mask:
11925 	case CODE_FOR_avx512f_loadv8di_mask:
11926 	case CODE_FOR_avx512vl_loadv8sf_mask:
11927 	case CODE_FOR_avx512vl_loadv8si_mask:
11928 	case CODE_FOR_avx512vl_loadv4df_mask:
11929 	case CODE_FOR_avx512vl_loadv4di_mask:
11930 	case CODE_FOR_avx512vl_loadv4sf_mask:
11931 	case CODE_FOR_avx512vl_loadv4si_mask:
11932 	case CODE_FOR_avx512vl_loadv2df_mask:
11933 	case CODE_FOR_avx512vl_loadv2di_mask:
11934 	case CODE_FOR_avx512bw_loadv64qi_mask:
11935 	case CODE_FOR_avx512vl_loadv32qi_mask:
11936 	case CODE_FOR_avx512vl_loadv16qi_mask:
11937 	case CODE_FOR_avx512bw_loadv32hi_mask:
11938 	case CODE_FOR_avx512vl_loadv16hi_mask:
11939 	case CODE_FOR_avx512vl_loadv8hi_mask:
11940 	  aligned_mem = true;
11941 	  break;
11942 	default:
11943 	  break;
11944 	}
11945       /* FALLTHRU */
11946     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
11947     case V32QI_FTYPE_PCCHAR_V32QI_USI:
11948     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
11949     case V32HI_FTYPE_PCSHORT_V32HI_USI:
11950     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
11951     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
11952     case V16SI_FTYPE_PCINT_V16SI_UHI:
11953     case V8SI_FTYPE_PCINT_V8SI_UQI:
11954     case V4SI_FTYPE_PCINT_V4SI_UQI:
11955     case V8DI_FTYPE_PCINT64_V8DI_UQI:
11956     case V4DI_FTYPE_PCINT64_V4DI_UQI:
11957     case V2DI_FTYPE_PCINT64_V2DI_UQI:
11958     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
11959     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
11960     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
11961     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
11962     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
11963     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
11964     case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
11965       nargs = 3;
11966       klass = load;
11967       memory = 0;
11968       break;
11969     default:
11970       gcc_unreachable ();
11971     }
11972 
11973   gcc_assert (nargs <= ARRAY_SIZE (xops));
11974 
11975   if (klass == store)
11976     {
11977       arg = CALL_EXPR_ARG (exp, 0);
11978       op = expand_normal (arg);
11979       gcc_assert (target == 0);
11980       if (memory)
11981 	{
11982 	  op = ix86_zero_extend_to_Pmode (op);
11983 	  target = gen_rtx_MEM (tmode, op);
11984 	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
11985 	     on it.  Try to improve it using get_pointer_alignment,
11986 	     and if the special builtin is one that requires strict
11987 	     mode alignment, also from its GET_MODE_ALIGNMENT.
11988 	     Failure to do so could lead to ix86_legitimate_combined_insn
11989 	     rejecting all changes to such insns.  */
11990 	  unsigned int align = get_pointer_alignment (arg);
11991 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
11992 	    align = GET_MODE_ALIGNMENT (tmode);
11993 	  if (MEM_ALIGN (target) < align)
11994 	    set_mem_align (target, align);
11995 	}
11996       else
11997 	target = force_reg (tmode, op);
11998       arg_adjust = 1;
11999     }
12000   else
12001     {
12002       arg_adjust = 0;
12003       if (optimize
12004 	  || target == 0
12005 	  || !register_operand (target, tmode)
12006 	  || GET_MODE (target) != tmode)
12007 	target = gen_reg_rtx (tmode);
12008     }
12009 
12010   for (i = 0; i < nargs; i++)
12011     {
12012       machine_mode mode = insn_p->operand[i + 1].mode;
12013 
12014       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12015       op = expand_normal (arg);
12016 
12017       if (i == memory)
12018 	{
12019 	  /* This must be the memory operand.  */
12020 	  op = ix86_zero_extend_to_Pmode (op);
12021 	  op = gen_rtx_MEM (mode, op);
12022 	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12023 	     on it.  Try to improve it using get_pointer_alignment,
12024 	     and if the special builtin is one that requires strict
12025 	     mode alignment, also from its GET_MODE_ALIGNMENT.
12026 	     Failure to do so could lead to ix86_legitimate_combined_insn
12027 	     rejecting all changes to such insns.  */
12028 	  unsigned int align = get_pointer_alignment (arg);
12029 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12030 	    align = GET_MODE_ALIGNMENT (mode);
12031 	  if (MEM_ALIGN (op) < align)
12032 	    set_mem_align (op, align);
12033 	}
12034       else
12035 	{
12036 	  /* This must be a register.  */
12037 	  if (VECTOR_MODE_P (mode))
12038 	    op = safe_vector_operand (op, mode);
12039 
12040 	  op = fixup_modeless_constant (op, mode);
12041 
12042 	  /* NB: A 3-operand load implies a mask load or v{p}expand*, and
12043 	     the mask operand should be at the end.  Keep an all-ones mask,
12044 	     which will be simplified by the expander.  */
12045 	  if (nargs == 3 && i == 2 && klass == load
12046 	      && constm1_operand (op, mode)
12047 	      && insn_p->operand[i].predicate (op, mode))
12048 	    ;
12049 	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12050 	    op = copy_to_mode_reg (mode, op);
12051 	  else
12052 	    {
12053 	      op = copy_to_reg (op);
12054 	      op = lowpart_subreg (mode, op, GET_MODE (op));
12055 	    }
12056 	}
12057 
12058       xops[i] = op;
12059     }
12060 
12061   switch (nargs)
12062     {
12063     case 0:
12064       pat = GEN_FCN (icode) (target);
12065       break;
12066     case 1:
12067       pat = GEN_FCN (icode) (target, xops[0]);
12068       break;
12069     case 2:
12070       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12071       break;
12072     case 3:
12073       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12074       break;
12075     default:
12076       gcc_unreachable ();
12077     }
12078 
12079   if (! pat)
12080     return 0;
12081 
12082   emit_insn (pat);
12083   return klass == store ? 0 : target;
12084 }
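
/* Illustrative example, not part of the original source: a nontemporal
   store such as

     #include <emmintrin.h>
     _mm_stream_si128 ((__m128i *) p, v);

   goes through the VOID_FTYPE_PV2DI_V2DI store case above; since
   CODE_FOR_sse2_movntv2di is in the aligned_mem list, the MEM built for
   the destination is marked with full V2DImode alignment.  */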
12085 
12086 /* Return the integer constant in ARG.  Constrain it to be in the range
12087    of the subparts of VEC_TYPE; issue an error if not.  */
12088 
12089 static int
12090 get_element_number (tree vec_type, tree arg)
12091 {
12092   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12093 
12094   if (!tree_fits_uhwi_p (arg)
12095       || (elt = tree_to_uhwi (arg), elt > max))
12096     {
12097       error ("selector must be an integer constant in the range "
12098 	     "[0, %wi]", max);
12099       return 0;
12100     }
12101 
12102   return elt;
12103 }
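
/* For example, for a vector type with four subparts the valid selectors
   are 0..3, so _mm_extract_pi16 (v, 5), which expands through
   __builtin_ia32_vec_ext_v4hi, is rejected here with
   "selector must be an integer constant in the range [0, 3]".  */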
12104 
12105 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
12106    ix86_expand_vector_init.  We DO have language-level syntax for this, in
12107    the form of  (type){ init-list }.  Except that since we can't place emms
12108    instructions from inside the compiler, we can't allow the use of MMX
12109    registers unless the user explicitly asks for it.  So we do *not* define
12110    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
12111    we have builtins invoked by mmintrin.h that give us license to emit
12112    these sorts of instructions.  */
12113 
12114 static rtx
12115 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12116 {
12117   machine_mode tmode = TYPE_MODE (type);
12118   machine_mode inner_mode = GET_MODE_INNER (tmode);
12119   int i, n_elt = GET_MODE_NUNITS (tmode);
12120   rtvec v = rtvec_alloc (n_elt);
12121 
12122   gcc_assert (VECTOR_MODE_P (tmode));
12123   gcc_assert (call_expr_nargs (exp) == n_elt);
12124 
12125   for (i = 0; i < n_elt; ++i)
12126     {
12127       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12128       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12129     }
12130 
12131   if (!target || !register_operand (target, tmode))
12132     target = gen_reg_rtx (tmode);
12133 
12134   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12135   return target;
12136 }
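
/* Illustrative example, not part of the original source:

     #include <mmintrin.h>
     __m64 v = _mm_set_pi16 (4, 3, 2, 1);

   expands through __builtin_ia32_vec_init_v4hi and hence through this
   function, which hands the four elements to ix86_expand_vector_init
   as a PARALLEL.  */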
12137 
12138 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
12139    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
12140    had a language-level syntax for referencing vector elements.  */
12141 
12142 static rtx
12143 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12144 {
12145   machine_mode tmode, mode0;
12146   tree arg0, arg1;
12147   int elt;
12148   rtx op0;
12149 
12150   arg0 = CALL_EXPR_ARG (exp, 0);
12151   arg1 = CALL_EXPR_ARG (exp, 1);
12152 
12153   op0 = expand_normal (arg0);
12154   elt = get_element_number (TREE_TYPE (arg0), arg1);
12155 
12156   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12157   mode0 = TYPE_MODE (TREE_TYPE (arg0));
12158   gcc_assert (VECTOR_MODE_P (mode0));
12159 
12160   op0 = force_reg (mode0, op0);
12161 
12162   if (optimize || !target || !register_operand (target, tmode))
12163     target = gen_reg_rtx (tmode);
12164 
12165   ix86_expand_vector_extract (true, target, op0, elt);
12166 
12167   return target;
12168 }
12169 
12170 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
12171    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
12172    a language-level syntax for referencing vector elements.  */
12173 
12174 static rtx
12175 ix86_expand_vec_set_builtin (tree exp)
12176 {
12177   machine_mode tmode, mode1;
12178   tree arg0, arg1, arg2;
12179   int elt;
12180   rtx op0, op1, target;
12181 
12182   arg0 = CALL_EXPR_ARG (exp, 0);
12183   arg1 = CALL_EXPR_ARG (exp, 1);
12184   arg2 = CALL_EXPR_ARG (exp, 2);
12185 
12186   tmode = TYPE_MODE (TREE_TYPE (arg0));
12187   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12188   gcc_assert (VECTOR_MODE_P (tmode));
12189 
12190   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12191   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12192   elt = get_element_number (TREE_TYPE (arg0), arg2);
12193 
12194   if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
12195     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12196 
12197   op0 = force_reg (tmode, op0);
12198   op1 = force_reg (mode1, op1);
12199 
12200   /* OP0 is the source of these builtin functions and shouldn't be
12201      modified.  Create a copy, use it and return it as target.  */
12202   target = gen_reg_rtx (tmode);
12203   emit_move_insn (target, op0);
12204   ix86_expand_vector_set (true, target, op1, elt);
12205 
12206   return target;
12207 }
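
/* Illustrative example, not part of the original source:

     #include <xmmintrin.h>
     __m64 r = _mm_insert_pi16 (v, 99, 2);

   expands through __builtin_ia32_vec_set_v4hi and lands here: the source
   vector is copied into a fresh register, element 2 is replaced via
   ix86_expand_vector_set, and the copy is returned.  */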
12208 
12209 /* Expand an expression EXP that calls a built-in function,
12210    with result going to TARGET if that's convenient
12211    (and in mode MODE if that's convenient).
12212    SUBTARGET may be used as the target for computing one of EXP's operands.
12213    IGNORE is nonzero if the value is to be ignored.  */
12214 
12215 rtx
12216 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12217 		     machine_mode mode, int ignore)
12218 {
12219   size_t i;
12220   enum insn_code icode, icode2;
12221   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12222   tree arg0, arg1, arg2, arg3, arg4;
12223   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12224   machine_mode mode0, mode1, mode2, mode3, mode4;
12225   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12226 
12227   /* For CPU builtins that can be folded, fold first and expand the fold.  */
12228   switch (fcode)
12229     {
12230     case IX86_BUILTIN_CPU_INIT:
12231       {
12232 	/* Make it call __cpu_indicator_init in libgcc. */
12233 	tree call_expr, fndecl, type;
12234         type = build_function_type_list (integer_type_node, NULL_TREE);
12235 	fndecl = build_fn_decl ("__cpu_indicator_init", type);
12236 	call_expr = build_call_expr (fndecl, 0);
12237 	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12238       }
12239     case IX86_BUILTIN_CPU_IS:
12240     case IX86_BUILTIN_CPU_SUPPORTS:
12241       {
12242 	tree arg0 = CALL_EXPR_ARG (exp, 0);
12243 	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12244 	gcc_assert (fold_expr != NULL_TREE);
12245 	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12246       }
12247     }
12248 
12249   HOST_WIDE_INT isa = ix86_isa_flags;
12250   HOST_WIDE_INT isa2 = ix86_isa_flags2;
12251   HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12252   HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12253   /* The general case is we require all the ISAs specified in bisa{,2}
12254      to be enabled.
12255      The exceptions are:
12256      OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12257      OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12258      OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12259      (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12260        OPTION_MASK_ISA2_AVXVNNI
12261      where for each such pair it is sufficient if either of the ISAs is
12262      enabled; any other options ORed with the pair must also be enabled.
12263      OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
12264   if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12265        == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12266       && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12267     isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
12268 
12269   if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12270        == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12271       && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12272     isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
12273 
12274   if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12275        == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12276       && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12277     isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
12278 
12279   if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12280 	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12281        || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12282       && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12283 	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12284 	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12285     {
12286       isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12287       isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12288     }
12289 
12290   if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12291       /* __builtin_ia32_maskmovq requires MMX registers.  */
12292       && fcode != IX86_BUILTIN_MASKMOVQ)
12293     {
12294       bisa &= ~OPTION_MASK_ISA_MMX;
12295       bisa |= OPTION_MASK_ISA_SSE2;
12296     }
12297 
12298   if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
12299     {
12300       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12301       if (TARGET_ABI_X32)
12302 	bisa |= OPTION_MASK_ABI_X32;
12303       else
12304 	bisa |= OPTION_MASK_ABI_64;
12305       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12306 				       (enum fpmath_unit) 0,
12307 				       (enum prefer_vector_width) 0,
12308 				       PVW_NONE, PVW_NONE,
12309 				       false, add_abi_p);
12310       if (!opts)
12311 	error ("%qE needs unknown isa option", fndecl);
12312       else
12313 	{
12314 	  gcc_assert (opts != NULL);
12315 	  error ("%qE needs isa option %s", fndecl, opts);
12316 	  free (opts);
12317 	}
12318       return expand_call (exp, target, ignore);
12319     }
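
  /* Illustrative note, not part of the original source: a VNNI builtin
     whose requirement is AVX512VNNI | AVX512VL is accepted here either
     under -mavx512vnni -mavx512vl or under -mavxvnni alone, per the
     pairing rules above; otherwise the "needs isa option" error is
     emitted and the call is expanded as an ordinary external call.  */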
12320 
12321   switch (fcode)
12322     {
12323     case IX86_BUILTIN_MASKMOVQ:
12324     case IX86_BUILTIN_MASKMOVDQU:
12325       icode = (fcode == IX86_BUILTIN_MASKMOVQ
12326 	       ? CODE_FOR_mmx_maskmovq
12327 	       : CODE_FOR_sse2_maskmovdqu);
12328       /* Note the arg order is different from the operand order.  */
12329       arg1 = CALL_EXPR_ARG (exp, 0);
12330       arg2 = CALL_EXPR_ARG (exp, 1);
12331       arg0 = CALL_EXPR_ARG (exp, 2);
12332       op0 = expand_normal (arg0);
12333       op1 = expand_normal (arg1);
12334       op2 = expand_normal (arg2);
12335       mode0 = insn_data[icode].operand[0].mode;
12336       mode1 = insn_data[icode].operand[1].mode;
12337       mode2 = insn_data[icode].operand[2].mode;
12338 
12339       op0 = ix86_zero_extend_to_Pmode (op0);
12340       op0 = gen_rtx_MEM (mode1, op0);
12341 
12342       if (!insn_data[icode].operand[0].predicate (op0, mode0))
12343 	op0 = copy_to_mode_reg (mode0, op0);
12344       if (!insn_data[icode].operand[1].predicate (op1, mode1))
12345 	op1 = copy_to_mode_reg (mode1, op1);
12346       if (!insn_data[icode].operand[2].predicate (op2, mode2))
12347 	op2 = copy_to_mode_reg (mode2, op2);
12348       pat = GEN_FCN (icode) (op0, op1, op2);
12349       if (! pat)
12350 	return 0;
12351       emit_insn (pat);
12352       return 0;
12353 
12354     case IX86_BUILTIN_LDMXCSR:
12355       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12356       target = assign_386_stack_local (SImode, SLOT_TEMP);
12357       emit_move_insn (target, op0);
12358       emit_insn (gen_sse_ldmxcsr (target));
12359       return 0;
12360 
12361     case IX86_BUILTIN_STMXCSR:
12362       target = assign_386_stack_local (SImode, SLOT_TEMP);
12363       emit_insn (gen_sse_stmxcsr (target));
12364       return copy_to_mode_reg (SImode, target);
12365 
12366     case IX86_BUILTIN_CLFLUSH:
12367 	arg0 = CALL_EXPR_ARG (exp, 0);
12368 	op0 = expand_normal (arg0);
12369 	icode = CODE_FOR_sse2_clflush;
12370 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12371 	  op0 = ix86_zero_extend_to_Pmode (op0);
12372 
12373 	emit_insn (gen_sse2_clflush (op0));
12374 	return 0;
12375 
12376     case IX86_BUILTIN_CLWB:
12377 	arg0 = CALL_EXPR_ARG (exp, 0);
12378 	op0 = expand_normal (arg0);
12379 	icode = CODE_FOR_clwb;
12380 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12381 	  op0 = ix86_zero_extend_to_Pmode (op0);
12382 
12383 	emit_insn (gen_clwb (op0));
12384 	return 0;
12385 
12386     case IX86_BUILTIN_CLFLUSHOPT:
12387 	arg0 = CALL_EXPR_ARG (exp, 0);
12388 	op0 = expand_normal (arg0);
12389 	icode = CODE_FOR_clflushopt;
12390 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12391 	  op0 = ix86_zero_extend_to_Pmode (op0);
12392 
12393 	emit_insn (gen_clflushopt (op0));
12394 	return 0;
12395 
12396     case IX86_BUILTIN_MONITOR:
12397     case IX86_BUILTIN_MONITORX:
12398       arg0 = CALL_EXPR_ARG (exp, 0);
12399       arg1 = CALL_EXPR_ARG (exp, 1);
12400       arg2 = CALL_EXPR_ARG (exp, 2);
12401       op0 = expand_normal (arg0);
12402       op1 = expand_normal (arg1);
12403       op2 = expand_normal (arg2);
12404       if (!REG_P (op0))
12405 	op0 = ix86_zero_extend_to_Pmode (op0);
12406       if (!REG_P (op1))
12407 	op1 = copy_to_mode_reg (SImode, op1);
12408       if (!REG_P (op2))
12409 	op2 = copy_to_mode_reg (SImode, op2);
12410 
12411       emit_insn (fcode == IX86_BUILTIN_MONITOR
12412 		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12413 		 : gen_monitorx (Pmode, op0, op1, op2));
12414       return 0;
12415 
12416     case IX86_BUILTIN_MWAIT:
12417       arg0 = CALL_EXPR_ARG (exp, 0);
12418       arg1 = CALL_EXPR_ARG (exp, 1);
12419       op0 = expand_normal (arg0);
12420       op1 = expand_normal (arg1);
12421       if (!REG_P (op0))
12422 	op0 = copy_to_mode_reg (SImode, op0);
12423       if (!REG_P (op1))
12424 	op1 = copy_to_mode_reg (SImode, op1);
12425       emit_insn (gen_sse3_mwait (op0, op1));
12426       return 0;
12427 
12428     case IX86_BUILTIN_MWAITX:
12429       arg0 = CALL_EXPR_ARG (exp, 0);
12430       arg1 = CALL_EXPR_ARG (exp, 1);
12431       arg2 = CALL_EXPR_ARG (exp, 2);
12432       op0 = expand_normal (arg0);
12433       op1 = expand_normal (arg1);
12434       op2 = expand_normal (arg2);
12435       if (!REG_P (op0))
12436 	op0 = copy_to_mode_reg (SImode, op0);
12437       if (!REG_P (op1))
12438 	op1 = copy_to_mode_reg (SImode, op1);
12439       if (!REG_P (op2))
12440 	op2 = copy_to_mode_reg (SImode, op2);
12441       emit_insn (gen_mwaitx (op0, op1, op2));
12442       return 0;
12443 
12444     case IX86_BUILTIN_UMONITOR:
12445       arg0 = CALL_EXPR_ARG (exp, 0);
12446       op0 = expand_normal (arg0);
12447 
12448       op0 = ix86_zero_extend_to_Pmode (op0);
12449       emit_insn (gen_umonitor (Pmode, op0));
12450       return 0;
12451 
12452     case IX86_BUILTIN_UMWAIT:
12453     case IX86_BUILTIN_TPAUSE:
12454       arg0 = CALL_EXPR_ARG (exp, 0);
12455       arg1 = CALL_EXPR_ARG (exp, 1);
12456       op0 = expand_normal (arg0);
12457       op1 = expand_normal (arg1);
12458 
12459       if (!REG_P (op0))
12460 	op0 = copy_to_mode_reg (SImode, op0);
12461 
12462       op1 = force_reg (DImode, op1);
12463 
12464       if (TARGET_64BIT)
12465 	{
12466 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12467 				     NULL, 1, OPTAB_DIRECT);
12468 	  switch (fcode)
12469 	    {
12470 	    case IX86_BUILTIN_UMWAIT:
12471 	      icode = CODE_FOR_umwait_rex64;
12472 	      break;
12473 	    case IX86_BUILTIN_TPAUSE:
12474 	      icode = CODE_FOR_tpause_rex64;
12475 	      break;
12476 	    default:
12477 	      gcc_unreachable ();
12478 	    }
12479 
12480 	  op2 = gen_lowpart (SImode, op2);
12481 	  op1 = gen_lowpart (SImode, op1);
12482 	  pat = GEN_FCN (icode) (op0, op1, op2);
12483 	}
12484       else
12485 	{
12486 	  switch (fcode)
12487 	    {
12488 	    case IX86_BUILTIN_UMWAIT:
12489 	      icode = CODE_FOR_umwait;
12490 	      break;
12491 	    case IX86_BUILTIN_TPAUSE:
12492 	      icode = CODE_FOR_tpause;
12493 	      break;
12494 	    default:
12495 	      gcc_unreachable ();
12496 	    }
12497 	  pat = GEN_FCN (icode) (op0, op1);
12498 	}
12499 
12500       if (!pat)
12501 	return 0;
12502 
12503       emit_insn (pat);
12504 
12505       if (target == 0
12506 	  || !register_operand (target, QImode))
12507 	target = gen_reg_rtx (QImode);
12508 
12509       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12510 			const0_rtx);
12511       emit_insn (gen_rtx_SET (target, pat));
12512 
12513       return target;
12514 
12515     case IX86_BUILTIN_TESTUI:
12516       emit_insn (gen_testui ());
12517 
12518       if (target == 0
12519 	  || !register_operand (target, QImode))
12520 	target = gen_reg_rtx (QImode);
12521 
12522       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12523 			 const0_rtx);
12524       emit_insn (gen_rtx_SET (target, pat));
12525 
12526       return target;
12527 
12528     case IX86_BUILTIN_CLZERO:
12529       arg0 = CALL_EXPR_ARG (exp, 0);
12530       op0 = expand_normal (arg0);
12531       if (!REG_P (op0))
12532 	op0 = ix86_zero_extend_to_Pmode (op0);
12533       emit_insn (gen_clzero (Pmode, op0));
12534       return 0;
12535 
12536     case IX86_BUILTIN_CLDEMOTE:
12537       arg0 = CALL_EXPR_ARG (exp, 0);
12538       op0 = expand_normal (arg0);
12539       icode = CODE_FOR_cldemote;
12540       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12541 	op0 = ix86_zero_extend_to_Pmode (op0);
12542 
12543       emit_insn (gen_cldemote (op0));
12544       return 0;
12545 
12546     case IX86_BUILTIN_LOADIWKEY:
12547       {
12548 	arg0 = CALL_EXPR_ARG (exp, 0);
12549 	arg1 = CALL_EXPR_ARG (exp, 1);
12550 	arg2 = CALL_EXPR_ARG (exp, 2);
12551 	arg3 = CALL_EXPR_ARG (exp, 3);
12552 
12553 	op0 = expand_normal (arg0);
12554 	op1 = expand_normal (arg1);
12555 	op2 = expand_normal (arg2);
12556 	op3 = expand_normal (arg3);
12557 
12558 	if (!REG_P (op0))
12559 	  op0 = copy_to_mode_reg (V2DImode, op0);
12560 	if (!REG_P (op1))
12561 	  op1 = copy_to_mode_reg (V2DImode, op1);
12562 	if (!REG_P (op2))
12563 	  op2 = copy_to_mode_reg (V2DImode, op2);
12564 	if (!REG_P (op3))
12565 	  op3 = copy_to_mode_reg (SImode, op3);
12566 
12567 	emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12568 
12569 	return 0;
12570       }
12571 
12572     case IX86_BUILTIN_AESDEC128KLU8:
12573       icode = CODE_FOR_aesdec128klu8;
12574       goto aesdecenc_expand;
12575 
12576     case IX86_BUILTIN_AESDEC256KLU8:
12577       icode = CODE_FOR_aesdec256klu8;
12578       goto aesdecenc_expand;
12579 
12580     case IX86_BUILTIN_AESENC128KLU8:
12581       icode = CODE_FOR_aesenc128klu8;
12582       goto aesdecenc_expand;
12583 
12584     case IX86_BUILTIN_AESENC256KLU8:
12585       icode = CODE_FOR_aesenc256klu8;
12586 
12587     aesdecenc_expand:
12588 
12589       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12590       arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12591       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12592 
12593       op0 = expand_normal (arg0);
12594       op1 = expand_normal (arg1);
12595       op2 = expand_normal (arg2);
12596 
12597       if (!address_operand (op0, V2DImode))
12598 	{
12599 	  op0 = convert_memory_address (Pmode, op0);
12600 	  op0 = copy_addr_to_reg (op0);
12601 	}
12602       op0 = gen_rtx_MEM (V2DImode, op0);
12603 
12604       if (!REG_P (op1))
12605 	op1 = copy_to_mode_reg (V2DImode, op1);
12606 
12607       if (!address_operand (op2, VOIDmode))
12608 	{
12609 	  op2 = convert_memory_address (Pmode, op2);
12610 	  op2 = copy_addr_to_reg (op2);
12611 	}
12612       op2 = gen_rtx_MEM (BLKmode, op2);
12613 
12614       emit_insn (GEN_FCN (icode) (op1, op1, op2));
12615 
12616       if (target == 0)
12617 	target = gen_reg_rtx (QImode);
12618 
12619       /* NB: For the aesenc/aesdec Key Locker insns, ZF is set when a
12620 	 runtime error occurs; the output should then be cleared for safety.  */
12621       rtx_code_label *ok_label;
12622       rtx tmp;
12623 
12624       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12625       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12626       ok_label = gen_label_rtx ();
12627       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12628 			       true, ok_label);
12629       /* The runtime error seldom occurs, so predict the OK path as hot
12630 	 so that it is laid out as the fallthrough block.  */
12631       predict_jump (REG_BR_PROB_BASE * 90 / 100);
12632 
12633       emit_insn (gen_rtx_SET (op1, const0_rtx));
12634 
12635       emit_label (ok_label);
12636       emit_insn (gen_rtx_SET (target, pat));
12637       emit_insn (gen_rtx_SET (op0, op1));
12638 
12639       return target;
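
      /* Illustrative example, not part of the original source:

	   #include <immintrin.h>
	   unsigned char ok = _mm_aesdec128kl_u8 (&out, data, handle);

	 maps to IX86_BUILTIN_AESDEC128KLU8; the QImode return value is
	 derived from ZF, and on a runtime error (ZF set) the output is
	 cleared before being stored through the first argument.  */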
12640 
12641     case IX86_BUILTIN_AESDECWIDE128KLU8:
12642       icode = CODE_FOR_aesdecwide128klu8;
12643       goto wideaesdecenc_expand;
12644 
12645     case IX86_BUILTIN_AESDECWIDE256KLU8:
12646       icode = CODE_FOR_aesdecwide256klu8;
12647       goto wideaesdecenc_expand;
12648 
12649     case IX86_BUILTIN_AESENCWIDE128KLU8:
12650       icode = CODE_FOR_aesencwide128klu8;
12651       goto wideaesdecenc_expand;
12652 
12653     case IX86_BUILTIN_AESENCWIDE256KLU8:
12654       icode = CODE_FOR_aesencwide256klu8;
12655 
12656     wideaesdecenc_expand:
12657 
12658       rtx xmm_regs[8];
12659       rtx op;
12660 
12661       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
12662       arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
12663       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12664 
12665       op0 = expand_normal (arg0);
12666       op1 = expand_normal (arg1);
12667       op2 = expand_normal (arg2);
12668 
12669       if (!address_operand (op2, VOIDmode))
12670 	{
12671 	  op2 = convert_memory_address (Pmode, op2);
12672 	  op2 = copy_addr_to_reg (op2);
12673 	}
12674       op2 = gen_rtx_MEM (BLKmode, op2);
12675 
12676       for (i = 0; i < 8; i++)
12677 	{
12678 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12679 
12680 	  op = gen_rtx_MEM (V2DImode,
12681 			    plus_constant (Pmode, op1, (i * 16)));
12682 
12683 	  emit_move_insn (xmm_regs[i], op);
12684 	}
12685 
12686       emit_insn (GEN_FCN (icode) (op2));
12687 
12688       if (target == 0)
12689 	target = gen_reg_rtx (QImode);
12690 
12691       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12692       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12693       ok_label = gen_label_rtx ();
12694       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12695 			       true, ok_label);
12696       predict_jump (REG_BR_PROB_BASE * 90 / 100);
12697 
12698       for (i = 0; i < 8; i++)
12699 	emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
12700 
12701       emit_label (ok_label);
12702       emit_insn (gen_rtx_SET (target, pat));
12703 
12704       for (i = 0; i < 8; i++)
12705 	{
12706 	  op = gen_rtx_MEM (V2DImode,
12707 			    plus_constant (Pmode, op0, (i * 16)));
12708 	  emit_move_insn (op, xmm_regs[i]);
12709 	}
12710 
12711       return target;
12712 
12713     case IX86_BUILTIN_ENCODEKEY128U32:
12714       {
12715 	rtx op, xmm_regs[7];
12716 
12717 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12718 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
12719 	arg2 = CALL_EXPR_ARG (exp, 2); // void *h
12720 
12721 	op0 = expand_normal (arg0);
12722 	op1 = expand_normal (arg1);
12723 	op2 = expand_normal (arg2);
12724 
12725 	if (!REG_P (op0))
12726 	  op0 = copy_to_mode_reg (SImode, op0);
12727 
12728 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12729 	emit_move_insn (op, op1);
12730 
12731 	for (i = 0; i < 3; i++)
12732 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12733 
12734 	if (target == 0)
12735 	  target = gen_reg_rtx (SImode);
12736 
12737 	emit_insn (gen_encodekey128u32 (target, op0));
12738 
12739 	for (i = 0; i < 3; i++)
12740 	  {
12741 	    op = gen_rtx_MEM (V2DImode,
12742 			      plus_constant (Pmode, op2, (i * 16)));
12743 	    emit_move_insn (op, xmm_regs[i]);
12744 	  }
12745 
12746 	return target;
12747       }
12748     case IX86_BUILTIN_ENCODEKEY256U32:
12749       {
12750 	rtx op, xmm_regs[7];
12751 
12752 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12753 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
12754 	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
12755 	arg3 = CALL_EXPR_ARG (exp, 3); // void *h
12756 
12757 	op0 = expand_normal (arg0);
12758 	op1 = expand_normal (arg1);
12759 	op2 = expand_normal (arg2);
12760 	op3 = expand_normal (arg3);
12761 
12762 	if (!REG_P (op0))
12763 	  op0 = copy_to_mode_reg (SImode, op0);
12764 
12765 	/* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
12766 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12767 	emit_move_insn (op, op1);
12768 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
12769 	emit_move_insn (op, op2);
12770 
12771 	for (i = 0; i < 4; i++)
12772 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12773 
12774 	if (target == 0)
12775 	  target = gen_reg_rtx (SImode);
12776 
12777 	emit_insn (gen_encodekey256u32 (target, op0));
12778 
12779 	for (i = 0; i < 4; i++)
12780 	  {
12781 	    op = gen_rtx_MEM (V2DImode,
12782 			      plus_constant (Pmode, op3, (i * 16)));
12783 	    emit_move_insn (op, xmm_regs[i]);
12784 	  }
12785 
12786 	return target;
12787       }
12788 
12789     case IX86_BUILTIN_VEC_INIT_V2SI:
12790     case IX86_BUILTIN_VEC_INIT_V4HI:
12791     case IX86_BUILTIN_VEC_INIT_V8QI:
12792       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
12793 
12794     case IX86_BUILTIN_VEC_EXT_V2DF:
12795     case IX86_BUILTIN_VEC_EXT_V2DI:
12796     case IX86_BUILTIN_VEC_EXT_V4SF:
12797     case IX86_BUILTIN_VEC_EXT_V4SI:
12798     case IX86_BUILTIN_VEC_EXT_V8HI:
12799     case IX86_BUILTIN_VEC_EXT_V2SI:
12800     case IX86_BUILTIN_VEC_EXT_V4HI:
12801     case IX86_BUILTIN_VEC_EXT_V16QI:
12802       return ix86_expand_vec_ext_builtin (exp, target);
12803 
12804     case IX86_BUILTIN_VEC_SET_V2DI:
12805     case IX86_BUILTIN_VEC_SET_V4SF:
12806     case IX86_BUILTIN_VEC_SET_V4SI:
12807     case IX86_BUILTIN_VEC_SET_V8HI:
12808     case IX86_BUILTIN_VEC_SET_V4HI:
12809     case IX86_BUILTIN_VEC_SET_V16QI:
12810       return ix86_expand_vec_set_builtin (exp);
12811 
12812     case IX86_BUILTIN_NANQ:
12813     case IX86_BUILTIN_NANSQ:
12814       return expand_call (exp, target, ignore);
12815 
12816     case IX86_BUILTIN_RDPID:
12817 
12818       op0 = gen_reg_rtx (word_mode);
12819 
12820       if (TARGET_64BIT)
12821 	{
12822 	  insn = gen_rdpid_rex64 (op0);
12823 	  op0 = convert_to_mode (SImode, op0, 1);
12824 	}
12825       else
12826 	insn = gen_rdpid (op0);
12827 
12828       emit_insn (insn);
12829 
12830       if (target == 0
12831 	  || !register_operand (target, SImode))
12832 	target = gen_reg_rtx (SImode);
12833 
12834       emit_move_insn (target, op0);
12835       return target;
12836 
12837     case IX86_BUILTIN_2INTERSECTD512:
12838     case IX86_BUILTIN_2INTERSECTQ512:
12839     case IX86_BUILTIN_2INTERSECTD256:
12840     case IX86_BUILTIN_2INTERSECTQ256:
12841     case IX86_BUILTIN_2INTERSECTD128:
12842     case IX86_BUILTIN_2INTERSECTQ128:
12843       arg0 = CALL_EXPR_ARG (exp, 0);
12844       arg1 = CALL_EXPR_ARG (exp, 1);
12845       arg2 = CALL_EXPR_ARG (exp, 2);
12846       arg3 = CALL_EXPR_ARG (exp, 3);
12847       op0 = expand_normal (arg0);
12848       op1 = expand_normal (arg1);
12849       op2 = expand_normal (arg2);
12850       op3 = expand_normal (arg3);
12851 
12852       if (!address_operand (op0, VOIDmode))
12853 	{
12854 	  op0 = convert_memory_address (Pmode, op0);
12855 	  op0 = copy_addr_to_reg (op0);
12856 	}
12857       if (!address_operand (op1, VOIDmode))
12858 	{
12859 	  op1 = convert_memory_address (Pmode, op1);
12860 	  op1 = copy_addr_to_reg (op1);
12861 	}
12862 
12863       switch (fcode)
12864 	{
12865 	case IX86_BUILTIN_2INTERSECTD512:
12866 	  mode4 = P2HImode;
12867 	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
12868 	  break;
12869 	case IX86_BUILTIN_2INTERSECTQ512:
12870 	  mode4 = P2QImode;
12871 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
12872 	  break;
12873 	case IX86_BUILTIN_2INTERSECTD256:
12874 	  mode4 = P2QImode;
12875 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
12876 	  break;
12877 	case IX86_BUILTIN_2INTERSECTQ256:
12878 	  mode4 = P2QImode;
12879 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
12880 	  break;
12881 	case IX86_BUILTIN_2INTERSECTD128:
12882 	  mode4 = P2QImode;
12883 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
12884 	  break;
12885 	case IX86_BUILTIN_2INTERSECTQ128:
12886 	  mode4 = P2QImode;
12887 	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
12888 	  break;
12889 	default:
12890 	  gcc_unreachable ();
12891 	}
12892 
12893       mode2 = insn_data[icode].operand[1].mode;
12894       mode3 = insn_data[icode].operand[2].mode;
12895       if (!insn_data[icode].operand[1].predicate (op2, mode2))
12896 	op2 = copy_to_mode_reg (mode2, op2);
12897       if (!insn_data[icode].operand[2].predicate (op3, mode3))
12898 	op3 = copy_to_mode_reg (mode3, op3);
12899 
12900       op4 = gen_reg_rtx (mode4);
12901       emit_insn (GEN_FCN (icode) (op4, op2, op3));
12902       mode0 = mode4 == P2HImode ? HImode : QImode;
12903       emit_move_insn (gen_rtx_MEM (mode0, op0),
12904 		      gen_lowpart (mode0, op4));
12905       emit_move_insn (gen_rtx_MEM (mode0, op1),
12906 		      gen_highpart (mode0, op4));
12907 
12908       return 0;
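
      /* Illustrative example, not part of the original source:

	   #include <immintrin.h>
	   __mmask16 k1, k2;
	   _mm512_2intersect_epi32 (a, b, &k1, &k2);

	 maps to IX86_BUILTIN_2INTERSECTD512; the P2HImode result pair is
	 split with gen_lowpart/gen_highpart and stored through the two
	 mask pointers.  */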
12909 
12910     case IX86_BUILTIN_RDPMC:
12911     case IX86_BUILTIN_RDTSC:
12912     case IX86_BUILTIN_RDTSCP:
12913     case IX86_BUILTIN_XGETBV:
12914 
12915       op0 = gen_reg_rtx (DImode);
12916       op1 = gen_reg_rtx (DImode);
12917 
12918       if (fcode == IX86_BUILTIN_RDPMC)
12919 	{
12920 	  arg0 = CALL_EXPR_ARG (exp, 0);
12921 	  op2 = expand_normal (arg0);
12922 	  if (!register_operand (op2, SImode))
12923 	    op2 = copy_to_mode_reg (SImode, op2);
12924 
12925 	  insn = (TARGET_64BIT
12926 		  ? gen_rdpmc_rex64 (op0, op1, op2)
12927 		  : gen_rdpmc (op0, op2));
12928 	  emit_insn (insn);
12929 	}
12930       else if (fcode == IX86_BUILTIN_XGETBV)
12931 	{
12932 	  arg0 = CALL_EXPR_ARG (exp, 0);
12933 	  op2 = expand_normal (arg0);
12934 	  if (!register_operand (op2, SImode))
12935 	    op2 = copy_to_mode_reg (SImode, op2);
12936 
12937 	  insn = (TARGET_64BIT
12938 		  ? gen_xgetbv_rex64 (op0, op1, op2)
12939 		  : gen_xgetbv (op0, op2));
12940 	  emit_insn (insn);
12941 	}
12942       else if (fcode == IX86_BUILTIN_RDTSC)
12943 	{
12944 	  insn = (TARGET_64BIT
12945 		  ? gen_rdtsc_rex64 (op0, op1)
12946 		  : gen_rdtsc (op0));
12947 	  emit_insn (insn);
12948 	}
12949       else
12950 	{
12951 	  op2 = gen_reg_rtx (SImode);
12952 
12953 	  insn = (TARGET_64BIT
12954 		  ? gen_rdtscp_rex64 (op0, op1, op2)
12955 		  : gen_rdtscp (op0, op2));
12956 	  emit_insn (insn);
12957 
12958 	  arg0 = CALL_EXPR_ARG (exp, 0);
12959 	  op4 = expand_normal (arg0);
12960 	  if (!address_operand (op4, VOIDmode))
12961 	    {
12962 	      op4 = convert_memory_address (Pmode, op4);
12963 	      op4 = copy_addr_to_reg (op4);
12964 	    }
12965 	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
12966 	}
12967 
12968       if (target == 0
12969 	  || !register_operand (target, DImode))
12970         target = gen_reg_rtx (DImode);
12971 
12972       if (TARGET_64BIT)
12973 	{
12974 	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
12975 				     op1, 1, OPTAB_DIRECT);
12976 	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
12977 				     op0, 1, OPTAB_DIRECT);
12978 	}
12979 
12980       emit_move_insn (target, op0);
12981       return target;
12982 
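    /* ENQCMD, ENQCMDS and MOVDIR64B copy a 64-byte command, represented
       as an XImode memory operand, to the destination address.
       MOVDIR64B has no return value; for ENQCMD/ENQCMDS the status is
       read back from the zero flag below.  */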
12983     case IX86_BUILTIN_ENQCMD:
12984     case IX86_BUILTIN_ENQCMDS:
12985     case IX86_BUILTIN_MOVDIR64B:
12986 
12987       arg0 = CALL_EXPR_ARG (exp, 0);
12988       arg1 = CALL_EXPR_ARG (exp, 1);
12989       op0 = expand_normal (arg0);
12990       op1 = expand_normal (arg1);
12991 
12992       op0 = ix86_zero_extend_to_Pmode (op0);
12993       if (!address_operand (op1, VOIDmode))
12994 	{
12995 	  op1 = convert_memory_address (Pmode, op1);
12996 	  op1 = copy_addr_to_reg (op1);
12997 	}
12998       op1 = gen_rtx_MEM (XImode, op1);
12999 
13000       if (fcode == IX86_BUILTIN_MOVDIR64B)
13001 	{
13002 	  emit_insn (gen_movdir64b (Pmode, op0, op1));
13003 	  return 0;
13004 	}
13005       else
13006 	{
13007 	  if (target == 0
13008 	      || !register_operand (target, SImode))
13009 	    target = gen_reg_rtx (SImode);
13010 
13011 	  emit_move_insn (target, const0_rtx);
13012 	  target = gen_rtx_SUBREG (QImode, target, 0);
13013 
13014 	  int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13015 			 ? UNSPECV_ENQCMD
13016 			 : UNSPECV_ENQCMDS);
13017 	  icode = code_for_enqcmd (unspecv, Pmode);
13018 	  emit_insn (GEN_FCN (icode) (op0, op1));
13019 
13020 	  emit_insn
13021 	    (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13022 			  gen_rtx_fmt_ee (EQ, QImode,
13023 					  gen_rtx_REG (CCZmode, FLAGS_REG),
13024 					  const0_rtx)));
13025 	  return SUBREG_REG (target);
13026 	}
13027 
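    /* FXSAVE/FXRSTOR and FNSTENV/FLDENV take a single pointer argument;
       the save area is represented as a BLKmode memory operand.  */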
13028     case IX86_BUILTIN_FXSAVE:
13029     case IX86_BUILTIN_FXRSTOR:
13030     case IX86_BUILTIN_FXSAVE64:
13031     case IX86_BUILTIN_FXRSTOR64:
13032     case IX86_BUILTIN_FNSTENV:
13033     case IX86_BUILTIN_FLDENV:
13034       mode0 = BLKmode;
13035       switch (fcode)
13036 	{
13037 	case IX86_BUILTIN_FXSAVE:
13038 	  icode = CODE_FOR_fxsave;
13039 	  break;
13040 	case IX86_BUILTIN_FXRSTOR:
13041 	  icode = CODE_FOR_fxrstor;
13042 	  break;
13043 	case IX86_BUILTIN_FXSAVE64:
13044 	  icode = CODE_FOR_fxsave64;
13045 	  break;
13046 	case IX86_BUILTIN_FXRSTOR64:
13047 	  icode = CODE_FOR_fxrstor64;
13048 	  break;
13049 	case IX86_BUILTIN_FNSTENV:
13050 	  icode = CODE_FOR_fnstenv;
13051 	  break;
13052 	case IX86_BUILTIN_FLDENV:
13053 	  icode = CODE_FOR_fldenv;
13054 	  break;
13055 	default:
13056 	  gcc_unreachable ();
13057 	}
13058 
13059       arg0 = CALL_EXPR_ARG (exp, 0);
13060       op0 = expand_normal (arg0);
13061 
13062       if (!address_operand (op0, VOIDmode))
13063 	{
13064 	  op0 = convert_memory_address (Pmode, op0);
13065 	  op0 = copy_addr_to_reg (op0);
13066 	}
13067       op0 = gen_rtx_MEM (mode0, op0);
13068 
13069       pat = GEN_FCN (icode) (op0);
13070       if (pat)
13071 	emit_insn (pat);
13072       return 0;
13073 
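    /* XSETBV takes a 32-bit XCR selector and a 64-bit value.  In 64-bit
       mode the value is split into two SImode halves for the
       xsetbv_rex64 pattern.  */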
13074     case IX86_BUILTIN_XSETBV:
13075       arg0 = CALL_EXPR_ARG (exp, 0);
13076       arg1 = CALL_EXPR_ARG (exp, 1);
13077       op0 = expand_normal (arg0);
13078       op1 = expand_normal (arg1);
13079 
13080       if (!REG_P (op0))
13081 	op0 = copy_to_mode_reg (SImode, op0);
13082 
13083       op1 = force_reg (DImode, op1);
13084 
13085       if (TARGET_64BIT)
13086 	{
13087 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13088 				     NULL, 1, OPTAB_DIRECT);
13089 
13090 	  icode = CODE_FOR_xsetbv_rex64;
13091 
13092 	  op2 = gen_lowpart (SImode, op2);
13093 	  op1 = gen_lowpart (SImode, op1);
13094 	  pat = GEN_FCN (icode) (op0, op1, op2);
13095 	}
13096       else
13097 	{
13098 	  icode = CODE_FOR_xsetbv;
13099 
13100 	  pat = GEN_FCN (icode) (op0, op1);
13101 	}
13102       if (pat)
13103 	emit_insn (pat);
13104       return 0;
13105 
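    /* All XSAVE-family builtins take a pointer to the save area and a
       64-bit feature mask.  In 64-bit mode the mask is split into two
       SImode halves for the *_rex64 patterns.  */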
13106     case IX86_BUILTIN_XSAVE:
13107     case IX86_BUILTIN_XRSTOR:
13108     case IX86_BUILTIN_XSAVE64:
13109     case IX86_BUILTIN_XRSTOR64:
13110     case IX86_BUILTIN_XSAVEOPT:
13111     case IX86_BUILTIN_XSAVEOPT64:
13112     case IX86_BUILTIN_XSAVES:
13113     case IX86_BUILTIN_XRSTORS:
13114     case IX86_BUILTIN_XSAVES64:
13115     case IX86_BUILTIN_XRSTORS64:
13116     case IX86_BUILTIN_XSAVEC:
13117     case IX86_BUILTIN_XSAVEC64:
13118       arg0 = CALL_EXPR_ARG (exp, 0);
13119       arg1 = CALL_EXPR_ARG (exp, 1);
13120       op0 = expand_normal (arg0);
13121       op1 = expand_normal (arg1);
13122 
13123       if (!address_operand (op0, VOIDmode))
13124 	{
13125 	  op0 = convert_memory_address (Pmode, op0);
13126 	  op0 = copy_addr_to_reg (op0);
13127 	}
13128       op0 = gen_rtx_MEM (BLKmode, op0);
13129 
13130       op1 = force_reg (DImode, op1);
13131 
13132       if (TARGET_64BIT)
13133 	{
13134 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13135 				     NULL, 1, OPTAB_DIRECT);
13136 	  switch (fcode)
13137 	    {
13138 	    case IX86_BUILTIN_XSAVE:
13139 	      icode = CODE_FOR_xsave_rex64;
13140 	      break;
13141 	    case IX86_BUILTIN_XRSTOR:
13142 	      icode = CODE_FOR_xrstor_rex64;
13143 	      break;
13144 	    case IX86_BUILTIN_XSAVE64:
13145 	      icode = CODE_FOR_xsave64;
13146 	      break;
13147 	    case IX86_BUILTIN_XRSTOR64:
13148 	      icode = CODE_FOR_xrstor64;
13149 	      break;
13150 	    case IX86_BUILTIN_XSAVEOPT:
13151 	      icode = CODE_FOR_xsaveopt_rex64;
13152 	      break;
13153 	    case IX86_BUILTIN_XSAVEOPT64:
13154 	      icode = CODE_FOR_xsaveopt64;
13155 	      break;
13156 	    case IX86_BUILTIN_XSAVES:
13157 	      icode = CODE_FOR_xsaves_rex64;
13158 	      break;
13159 	    case IX86_BUILTIN_XRSTORS:
13160 	      icode = CODE_FOR_xrstors_rex64;
13161 	      break;
13162 	    case IX86_BUILTIN_XSAVES64:
13163 	      icode = CODE_FOR_xsaves64;
13164 	      break;
13165 	    case IX86_BUILTIN_XRSTORS64:
13166 	      icode = CODE_FOR_xrstors64;
13167 	      break;
13168 	    case IX86_BUILTIN_XSAVEC:
13169 	      icode = CODE_FOR_xsavec_rex64;
13170 	      break;
13171 	    case IX86_BUILTIN_XSAVEC64:
13172 	      icode = CODE_FOR_xsavec64;
13173 	      break;
13174 	    default:
13175 	      gcc_unreachable ();
13176 	    }
13177 
13178 	  op2 = gen_lowpart (SImode, op2);
13179 	  op1 = gen_lowpart (SImode, op1);
13180 	  pat = GEN_FCN (icode) (op0, op1, op2);
13181 	}
13182       else
13183 	{
13184 	  switch (fcode)
13185 	    {
13186 	    case IX86_BUILTIN_XSAVE:
13187 	      icode = CODE_FOR_xsave;
13188 	      break;
13189 	    case IX86_BUILTIN_XRSTOR:
13190 	      icode = CODE_FOR_xrstor;
13191 	      break;
13192 	    case IX86_BUILTIN_XSAVEOPT:
13193 	      icode = CODE_FOR_xsaveopt;
13194 	      break;
13195 	    case IX86_BUILTIN_XSAVES:
13196 	      icode = CODE_FOR_xsaves;
13197 	      break;
13198 	    case IX86_BUILTIN_XRSTORS:
13199 	      icode = CODE_FOR_xrstors;
13200 	      break;
13201 	    case IX86_BUILTIN_XSAVEC:
13202 	      icode = CODE_FOR_xsavec;
13203 	      break;
13204 	    default:
13205 	      gcc_unreachable ();
13206 	    }
13207 	  pat = GEN_FCN (icode) (op0, op1);
13208 	}
13209 
13210       if (pat)
13211 	emit_insn (pat);
13212       return 0;
13213 
13214     case IX86_BUILTIN_LLWPCB:
13215       arg0 = CALL_EXPR_ARG (exp, 0);
13216       op0 = expand_normal (arg0);
13217 
13218       if (!register_operand (op0, Pmode))
13219 	op0 = ix86_zero_extend_to_Pmode (op0);
13220       emit_insn (gen_lwp_llwpcb (Pmode, op0));
13221       return 0;
13222 
13223     case IX86_BUILTIN_SLWPCB:
13224       if (!target
13225 	  || !register_operand (target, Pmode))
13226 	target = gen_reg_rtx (Pmode);
13227       emit_insn (gen_lwp_slwpcb (Pmode, target));
13228       return target;
13229 
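    /* LWPVAL and LWPINS take two integer operands plus a 32-bit
       immediate.  LWPINS additionally returns a status bit read back
       from the carry flag.  */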
13230     case IX86_BUILTIN_LWPVAL32:
13231     case IX86_BUILTIN_LWPVAL64:
13232     case IX86_BUILTIN_LWPINS32:
13233     case IX86_BUILTIN_LWPINS64:
13234       mode = ((fcode == IX86_BUILTIN_LWPVAL32
13235 	       || fcode == IX86_BUILTIN_LWPINS32)
13236 	      ? SImode : DImode);
13237 
13238       if (fcode == IX86_BUILTIN_LWPVAL32
13239 	  || fcode == IX86_BUILTIN_LWPVAL64)
13240 	icode = code_for_lwp_lwpval (mode);
13241       else
13242 	icode = code_for_lwp_lwpins (mode);
13243 
13244       arg0 = CALL_EXPR_ARG (exp, 0);
13245       arg1 = CALL_EXPR_ARG (exp, 1);
13246       arg2 = CALL_EXPR_ARG (exp, 2);
13247       op0 = expand_normal (arg0);
13248       op1 = expand_normal (arg1);
13249       op2 = expand_normal (arg2);
13250       mode0 = insn_data[icode].operand[0].mode;
13251 
13252       if (!insn_data[icode].operand[0].predicate (op0, mode0))
13253 	op0 = copy_to_mode_reg (mode0, op0);
13254       if (!insn_data[icode].operand[1].predicate (op1, SImode))
13255 	op1 = copy_to_mode_reg (SImode, op1);
13256 
13257       if (!CONST_INT_P (op2))
13258 	{
13259 	  error ("the last argument must be a 32-bit immediate");
13260 	  return const0_rtx;
13261 	}
13262 
13263       emit_insn (GEN_FCN (icode) (op0, op1, op2));
13264 
13265       if (fcode == IX86_BUILTIN_LWPINS32
13266 	  || fcode == IX86_BUILTIN_LWPINS64)
13267 	{
13268 	  if (target == 0
13269 	      || !nonimmediate_operand (target, QImode))
13270 	    target = gen_reg_rtx (QImode);
13271 
13272 	  pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13273 			    const0_rtx);
13274 	  emit_insn (gen_rtx_SET (target, pat));
13275 
13276 	  return target;
13277 	}
13278       else
13279 	return 0;
13280 
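    /* BEXTRI encodes the starting bit position in bits 0..7 of the
       immediate and the field length in bits 8..15.  Degenerate
       selections are resolved at expand time: a zero length or a start
       beyond the operand width yields zero, and an over-long field is
       clamped to the operand width.  */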
13281     case IX86_BUILTIN_BEXTRI32:
13282     case IX86_BUILTIN_BEXTRI64:
13283       mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13284 
13285       arg0 = CALL_EXPR_ARG (exp, 0);
13286       arg1 = CALL_EXPR_ARG (exp, 1);
13287       op0 = expand_normal (arg0);
13288       op1 = expand_normal (arg1);
13289 
13290       if (!CONST_INT_P (op1))
13291 	{
13292 	  error ("last argument must be an immediate");
13293 	  return const0_rtx;
13294 	}
13295       else
13296 	{
13297 	  unsigned char lsb_index = UINTVAL (op1);
13298 	  unsigned char length = UINTVAL (op1) >> 8;
13299 
13300 	  unsigned char bitsize = GET_MODE_BITSIZE (mode);
13301 
13302 	  icode = code_for_tbm_bextri (mode);
13303 
13304 	  mode1 = insn_data[icode].operand[1].mode;
13305 	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
13306 	    op0 = copy_to_mode_reg (mode1, op0);
13307 
13308 	  mode0 = insn_data[icode].operand[0].mode;
13309 	  if (target == 0
13310 	      || !register_operand (target, mode0))
13311 	    target = gen_reg_rtx (mode0);
13312 
13313 	  if (length == 0 || lsb_index >= bitsize)
13314 	    {
13315 	      emit_move_insn (target, const0_rtx);
13316 	      return target;
13317 	    }
13318 
13319 	  if (length + lsb_index > bitsize)
13320 	    length = bitsize - lsb_index;
13321 
13322 	  op1 = GEN_INT (length);
13323 	  op2 = GEN_INT (lsb_index);
13324 
13325 	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13326 	  return target;
13327 	}
13328 
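    /* The rdrand*_step builtins store the random value through their
       pointer argument and return 1 on success or 0 on failure.  On
       failure the hardware zeroes the destination and clears CF, which
       the conditional move emitted below relies on.  */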
13329     case IX86_BUILTIN_RDRAND16_STEP:
13330       mode = HImode;
13331       goto rdrand_step;
13332 
13333     case IX86_BUILTIN_RDRAND32_STEP:
13334       mode = SImode;
13335       goto rdrand_step;
13336 
13337     case IX86_BUILTIN_RDRAND64_STEP:
13338       mode = DImode;
13339 
13340 rdrand_step:
13341       arg0 = CALL_EXPR_ARG (exp, 0);
13342       op1 = expand_normal (arg0);
13343       if (!address_operand (op1, VOIDmode))
13344 	{
13345 	  op1 = convert_memory_address (Pmode, op1);
13346 	  op1 = copy_addr_to_reg (op1);
13347 	}
13348 
13349       op0 = gen_reg_rtx (mode);
13350       emit_insn (gen_rdrand (mode, op0));
13351 
13352       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13353 
13354       op1 = force_reg (SImode, const1_rtx);
13355 
13356       /* Emit SImode conditional move.  */
13357       if (mode == HImode)
13358 	{
13359 	  if (TARGET_ZERO_EXTEND_WITH_AND
13360 	      && optimize_function_for_speed_p (cfun))
13361 	    {
13362 	      op2 = force_reg (SImode, const0_rtx);
13363 
13364 	      emit_insn (gen_movstricthi
13365 			 (gen_lowpart (HImode, op2), op0));
13366 	    }
13367 	  else
13368 	    {
13369 	      op2 = gen_reg_rtx (SImode);
13370 
13371 	      emit_insn (gen_zero_extendhisi2 (op2, op0));
13372 	    }
13373 	}
13374       else if (mode == SImode)
13375 	op2 = op0;
13376       else
13377 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
13378 
13379       if (target == 0
13380 	  || !register_operand (target, SImode))
13381 	target = gen_reg_rtx (SImode);
13382 
13383       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13384 			 const0_rtx);
13385       emit_insn (gen_rtx_SET (target,
13386 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13387       return target;
13388 
13389     case IX86_BUILTIN_RDSEED16_STEP:
13390       mode = HImode;
13391       goto rdseed_step;
13392 
13393     case IX86_BUILTIN_RDSEED32_STEP:
13394       mode = SImode;
13395       goto rdseed_step;
13396 
13397     case IX86_BUILTIN_RDSEED64_STEP:
13398       mode = DImode;
13399 
13400 rdseed_step:
13401       arg0 = CALL_EXPR_ARG (exp, 0);
13402       op1 = expand_normal (arg0);
13403       if (!address_operand (op1, VOIDmode))
13404 	{
13405 	  op1 = convert_memory_address (Pmode, op1);
13406 	  op1 = copy_addr_to_reg (op1);
13407 	}
13408 
13409       op0 = gen_reg_rtx (mode);
13410       emit_insn (gen_rdseed (mode, op0));
13411 
13412       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13413 
13414       op2 = gen_reg_rtx (QImode);
13415 
13416       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13417                          const0_rtx);
13418       emit_insn (gen_rtx_SET (op2, pat));
13419 
13420       if (target == 0
13421 	  || !register_operand (target, SImode))
13422         target = gen_reg_rtx (SImode);
13423 
13424       emit_insn (gen_zero_extendqisi2 (target, op2));
13425       return target;
13426 
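    /* The SBB and ADDCARRYX builtins take a carry-in, two operands and
       a pointer for the result.  A constant zero carry-in is expanded
       directly into a plain add/sub that sets the flags; otherwise the
       carry-in is first converted into CF and a flag-consuming add/sub
       with carry is emitted.  The carry-out is returned.  */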
13427     case IX86_BUILTIN_SBB32:
13428       icode = CODE_FOR_subborrowsi;
13429       icode2 = CODE_FOR_subborrowsi_0;
13430       mode0 = SImode;
13431       mode1 = DImode;
13432       mode2 = CCmode;
13433       goto handlecarry;
13434 
13435     case IX86_BUILTIN_SBB64:
13436       icode = CODE_FOR_subborrowdi;
13437       icode2 = CODE_FOR_subborrowdi_0;
13438       mode0 = DImode;
13439       mode1 = TImode;
13440       mode2 = CCmode;
13441       goto handlecarry;
13442 
13443     case IX86_BUILTIN_ADDCARRYX32:
13444       icode = CODE_FOR_addcarrysi;
13445       icode2 = CODE_FOR_addcarrysi_0;
13446       mode0 = SImode;
13447       mode1 = DImode;
13448       mode2 = CCCmode;
13449       goto handlecarry;
13450 
13451     case IX86_BUILTIN_ADDCARRYX64:
13452       icode = CODE_FOR_addcarrydi;
13453       icode2 = CODE_FOR_addcarrydi_0;
13454       mode0 = DImode;
13455       mode1 = TImode;
13456       mode2 = CCCmode;
13457 
13458     handlecarry:
13459       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
13460       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
13461       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
13462       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
13463 
13464       op1 = expand_normal (arg0);
13465       if (!integer_zerop (arg0))
13466 	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13467 
13468       op2 = expand_normal (arg1);
13469       if (!register_operand (op2, mode0))
13470 	op2 = copy_to_mode_reg (mode0, op2);
13471 
13472       op3 = expand_normal (arg2);
13473       if (!register_operand (op3, mode0))
13474 	op3 = copy_to_mode_reg (mode0, op3);
13475 
13476       op4 = expand_normal (arg3);
13477       if (!address_operand (op4, VOIDmode))
13478 	{
13479 	  op4 = convert_memory_address (Pmode, op4);
13480 	  op4 = copy_addr_to_reg (op4);
13481 	}
13482 
13483       op0 = gen_reg_rtx (mode0);
13484       if (integer_zerop (arg0))
13485 	{
13486 	  /* If arg0 is 0, optimize right away into add or sub
13487 	     instruction that sets CCCmode flags.  */
13488 	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
13489 	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13490 	}
13491       else
13492 	{
13493 	  /* Generate CF from input operand.  */
13494 	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
13495 
13496 	  /* Generate instruction that consumes CF.  */
13497 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13498 	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13499 	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13500 	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13501 	}
13502 
13503       /* Return current CF value.  */
13504       if (target == 0)
13505         target = gen_reg_rtx (QImode);
13506 
13507       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13508       emit_insn (gen_rtx_SET (target, pat));
13509 
13510       /* Store the result.  */
13511       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13512 
13513       return target;
13514 
13515     case IX86_BUILTIN_READ_FLAGS:
13516       emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
13517 
13518       if (optimize
13519 	  || target == NULL_RTX
13520 	  || !nonimmediate_operand (target, word_mode)
13521 	  || GET_MODE (target) != word_mode)
13522 	target = gen_reg_rtx (word_mode);
13523 
13524       emit_insn (gen_pop (target));
13525       return target;
13526 
13527     case IX86_BUILTIN_WRITE_FLAGS:
13528 
13529       arg0 = CALL_EXPR_ARG (exp, 0);
13530       op0 = expand_normal (arg0);
13531       if (!general_no_elim_operand (op0, word_mode))
13532 	op0 = copy_to_mode_reg (word_mode, op0);
13533 
13534       emit_insn (gen_push (op0));
13535       emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
13536       return 0;
13537 
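    /* The KTEST and KORTEST builtins combine two mask registers and set
       the flags.  The *C variants return the carry flag (CCCmode) and
       the *Z variants the zero flag (CCZmode), read back with a SETcc
       on the selected flags mode.  */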
13538     case IX86_BUILTIN_KTESTC8:
13539       icode = CODE_FOR_ktestqi;
13540       mode3 = CCCmode;
13541       goto kortest;
13542 
13543     case IX86_BUILTIN_KTESTZ8:
13544       icode = CODE_FOR_ktestqi;
13545       mode3 = CCZmode;
13546       goto kortest;
13547 
13548     case IX86_BUILTIN_KTESTC16:
13549       icode = CODE_FOR_ktesthi;
13550       mode3 = CCCmode;
13551       goto kortest;
13552 
13553     case IX86_BUILTIN_KTESTZ16:
13554       icode = CODE_FOR_ktesthi;
13555       mode3 = CCZmode;
13556       goto kortest;
13557 
13558     case IX86_BUILTIN_KTESTC32:
13559       icode = CODE_FOR_ktestsi;
13560       mode3 = CCCmode;
13561       goto kortest;
13562 
13563     case IX86_BUILTIN_KTESTZ32:
13564       icode = CODE_FOR_ktestsi;
13565       mode3 = CCZmode;
13566       goto kortest;
13567 
13568     case IX86_BUILTIN_KTESTC64:
13569       icode = CODE_FOR_ktestdi;
13570       mode3 = CCCmode;
13571       goto kortest;
13572 
13573     case IX86_BUILTIN_KTESTZ64:
13574       icode = CODE_FOR_ktestdi;
13575       mode3 = CCZmode;
13576       goto kortest;
13577 
13578     case IX86_BUILTIN_KORTESTC8:
13579       icode = CODE_FOR_kortestqi;
13580       mode3 = CCCmode;
13581       goto kortest;
13582 
13583     case IX86_BUILTIN_KORTESTZ8:
13584       icode = CODE_FOR_kortestqi;
13585       mode3 = CCZmode;
13586       goto kortest;
13587 
13588     case IX86_BUILTIN_KORTESTC16:
13589       icode = CODE_FOR_kortesthi;
13590       mode3 = CCCmode;
13591       goto kortest;
13592 
13593     case IX86_BUILTIN_KORTESTZ16:
13594       icode = CODE_FOR_kortesthi;
13595       mode3 = CCZmode;
13596       goto kortest;
13597 
13598     case IX86_BUILTIN_KORTESTC32:
13599       icode = CODE_FOR_kortestsi;
13600       mode3 = CCCmode;
13601       goto kortest;
13602 
13603     case IX86_BUILTIN_KORTESTZ32:
13604       icode = CODE_FOR_kortestsi;
13605       mode3 = CCZmode;
13606       goto kortest;
13607 
13608     case IX86_BUILTIN_KORTESTC64:
13609       icode = CODE_FOR_kortestdi;
13610       mode3 = CCCmode;
13611       goto kortest;
13612 
13613     case IX86_BUILTIN_KORTESTZ64:
13614       icode = CODE_FOR_kortestdi;
13615       mode3 = CCZmode;
13616 
13617     kortest:
13618       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
13619       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
13620       op0 = expand_normal (arg0);
13621       op1 = expand_normal (arg1);
13622 
13623       mode0 = insn_data[icode].operand[0].mode;
13624       mode1 = insn_data[icode].operand[1].mode;
13625 
13626       if (GET_MODE (op0) != VOIDmode)
13627 	op0 = force_reg (GET_MODE (op0), op0);
13628 
13629       op0 = gen_lowpart (mode0, op0);
13630 
13631       if (!insn_data[icode].operand[0].predicate (op0, mode0))
13632 	op0 = copy_to_mode_reg (mode0, op0);
13633 
13634       if (GET_MODE (op1) != VOIDmode)
13635 	op1 = force_reg (GET_MODE (op1), op1);
13636 
13637       op1 = gen_lowpart (mode1, op1);
13638 
13639       if (!insn_data[icode].operand[1].predicate (op1, mode1))
13640 	op1 = copy_to_mode_reg (mode1, op1);
13641 
13642       target = gen_reg_rtx (QImode);
13643 
13644       /* Emit kortest.  */
13645       emit_insn (GEN_FCN (icode) (op0, op1));
13646       /* And use setcc to return result from flags.  */
13647       ix86_expand_setcc (target, EQ,
13648 			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
13649       return target;
13650 
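    /* The gather, scatter and gather/scatter prefetch builtins differ
       only in the instruction pattern used: each case below selects an
       icode and jumps to the shared gather_gen, scatter_gen or
       vec_prefetch_gen expansion.  The *ALT* variants handle index and
       data vectors with different element counts, for which the low
       half of the wider operand has to be extracted first.  */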
13651     case IX86_BUILTIN_GATHERSIV2DF:
13652       icode = CODE_FOR_avx2_gathersiv2df;
13653       goto gather_gen;
13654     case IX86_BUILTIN_GATHERSIV4DF:
13655       icode = CODE_FOR_avx2_gathersiv4df;
13656       goto gather_gen;
13657     case IX86_BUILTIN_GATHERDIV2DF:
13658       icode = CODE_FOR_avx2_gatherdiv2df;
13659       goto gather_gen;
13660     case IX86_BUILTIN_GATHERDIV4DF:
13661       icode = CODE_FOR_avx2_gatherdiv4df;
13662       goto gather_gen;
13663     case IX86_BUILTIN_GATHERSIV4SF:
13664       icode = CODE_FOR_avx2_gathersiv4sf;
13665       goto gather_gen;
13666     case IX86_BUILTIN_GATHERSIV8SF:
13667       icode = CODE_FOR_avx2_gathersiv8sf;
13668       goto gather_gen;
13669     case IX86_BUILTIN_GATHERDIV4SF:
13670       icode = CODE_FOR_avx2_gatherdiv4sf;
13671       goto gather_gen;
13672     case IX86_BUILTIN_GATHERDIV8SF:
13673       icode = CODE_FOR_avx2_gatherdiv8sf;
13674       goto gather_gen;
13675     case IX86_BUILTIN_GATHERSIV2DI:
13676       icode = CODE_FOR_avx2_gathersiv2di;
13677       goto gather_gen;
13678     case IX86_BUILTIN_GATHERSIV4DI:
13679       icode = CODE_FOR_avx2_gathersiv4di;
13680       goto gather_gen;
13681     case IX86_BUILTIN_GATHERDIV2DI:
13682       icode = CODE_FOR_avx2_gatherdiv2di;
13683       goto gather_gen;
13684     case IX86_BUILTIN_GATHERDIV4DI:
13685       icode = CODE_FOR_avx2_gatherdiv4di;
13686       goto gather_gen;
13687     case IX86_BUILTIN_GATHERSIV4SI:
13688       icode = CODE_FOR_avx2_gathersiv4si;
13689       goto gather_gen;
13690     case IX86_BUILTIN_GATHERSIV8SI:
13691       icode = CODE_FOR_avx2_gathersiv8si;
13692       goto gather_gen;
13693     case IX86_BUILTIN_GATHERDIV4SI:
13694       icode = CODE_FOR_avx2_gatherdiv4si;
13695       goto gather_gen;
13696     case IX86_BUILTIN_GATHERDIV8SI:
13697       icode = CODE_FOR_avx2_gatherdiv8si;
13698       goto gather_gen;
13699     case IX86_BUILTIN_GATHERALTSIV4DF:
13700       icode = CODE_FOR_avx2_gathersiv4df;
13701       goto gather_gen;
13702     case IX86_BUILTIN_GATHERALTDIV8SF:
13703       icode = CODE_FOR_avx2_gatherdiv8sf;
13704       goto gather_gen;
13705     case IX86_BUILTIN_GATHERALTSIV4DI:
13706       icode = CODE_FOR_avx2_gathersiv4di;
13707       goto gather_gen;
13708     case IX86_BUILTIN_GATHERALTDIV8SI:
13709       icode = CODE_FOR_avx2_gatherdiv8si;
13710       goto gather_gen;
13711     case IX86_BUILTIN_GATHER3SIV16SF:
13712       icode = CODE_FOR_avx512f_gathersiv16sf;
13713       goto gather_gen;
13714     case IX86_BUILTIN_GATHER3SIV8DF:
13715       icode = CODE_FOR_avx512f_gathersiv8df;
13716       goto gather_gen;
13717     case IX86_BUILTIN_GATHER3DIV16SF:
13718       icode = CODE_FOR_avx512f_gatherdiv16sf;
13719       goto gather_gen;
13720     case IX86_BUILTIN_GATHER3DIV8DF:
13721       icode = CODE_FOR_avx512f_gatherdiv8df;
13722       goto gather_gen;
13723     case IX86_BUILTIN_GATHER3SIV16SI:
13724       icode = CODE_FOR_avx512f_gathersiv16si;
13725       goto gather_gen;
13726     case IX86_BUILTIN_GATHER3SIV8DI:
13727       icode = CODE_FOR_avx512f_gathersiv8di;
13728       goto gather_gen;
13729     case IX86_BUILTIN_GATHER3DIV16SI:
13730       icode = CODE_FOR_avx512f_gatherdiv16si;
13731       goto gather_gen;
13732     case IX86_BUILTIN_GATHER3DIV8DI:
13733       icode = CODE_FOR_avx512f_gatherdiv8di;
13734       goto gather_gen;
13735     case IX86_BUILTIN_GATHER3ALTSIV8DF:
13736       icode = CODE_FOR_avx512f_gathersiv8df;
13737       goto gather_gen;
13738     case IX86_BUILTIN_GATHER3ALTDIV16SF:
13739       icode = CODE_FOR_avx512f_gatherdiv16sf;
13740       goto gather_gen;
13741     case IX86_BUILTIN_GATHER3ALTSIV8DI:
13742       icode = CODE_FOR_avx512f_gathersiv8di;
13743       goto gather_gen;
13744     case IX86_BUILTIN_GATHER3ALTDIV16SI:
13745       icode = CODE_FOR_avx512f_gatherdiv16si;
13746       goto gather_gen;
13747     case IX86_BUILTIN_GATHER3SIV2DF:
13748       icode = CODE_FOR_avx512vl_gathersiv2df;
13749       goto gather_gen;
13750     case IX86_BUILTIN_GATHER3SIV4DF:
13751       icode = CODE_FOR_avx512vl_gathersiv4df;
13752       goto gather_gen;
13753     case IX86_BUILTIN_GATHER3DIV2DF:
13754       icode = CODE_FOR_avx512vl_gatherdiv2df;
13755       goto gather_gen;
13756     case IX86_BUILTIN_GATHER3DIV4DF:
13757       icode = CODE_FOR_avx512vl_gatherdiv4df;
13758       goto gather_gen;
13759     case IX86_BUILTIN_GATHER3SIV4SF:
13760       icode = CODE_FOR_avx512vl_gathersiv4sf;
13761       goto gather_gen;
13762     case IX86_BUILTIN_GATHER3SIV8SF:
13763       icode = CODE_FOR_avx512vl_gathersiv8sf;
13764       goto gather_gen;
13765     case IX86_BUILTIN_GATHER3DIV4SF:
13766       icode = CODE_FOR_avx512vl_gatherdiv4sf;
13767       goto gather_gen;
13768     case IX86_BUILTIN_GATHER3DIV8SF:
13769       icode = CODE_FOR_avx512vl_gatherdiv8sf;
13770       goto gather_gen;
13771     case IX86_BUILTIN_GATHER3SIV2DI:
13772       icode = CODE_FOR_avx512vl_gathersiv2di;
13773       goto gather_gen;
13774     case IX86_BUILTIN_GATHER3SIV4DI:
13775       icode = CODE_FOR_avx512vl_gathersiv4di;
13776       goto gather_gen;
13777     case IX86_BUILTIN_GATHER3DIV2DI:
13778       icode = CODE_FOR_avx512vl_gatherdiv2di;
13779       goto gather_gen;
13780     case IX86_BUILTIN_GATHER3DIV4DI:
13781       icode = CODE_FOR_avx512vl_gatherdiv4di;
13782       goto gather_gen;
13783     case IX86_BUILTIN_GATHER3SIV4SI:
13784       icode = CODE_FOR_avx512vl_gathersiv4si;
13785       goto gather_gen;
13786     case IX86_BUILTIN_GATHER3SIV8SI:
13787       icode = CODE_FOR_avx512vl_gathersiv8si;
13788       goto gather_gen;
13789     case IX86_BUILTIN_GATHER3DIV4SI:
13790       icode = CODE_FOR_avx512vl_gatherdiv4si;
13791       goto gather_gen;
13792     case IX86_BUILTIN_GATHER3DIV8SI:
13793       icode = CODE_FOR_avx512vl_gatherdiv8si;
13794       goto gather_gen;
13795     case IX86_BUILTIN_GATHER3ALTSIV4DF:
13796       icode = CODE_FOR_avx512vl_gathersiv4df;
13797       goto gather_gen;
13798     case IX86_BUILTIN_GATHER3ALTDIV8SF:
13799       icode = CODE_FOR_avx512vl_gatherdiv8sf;
13800       goto gather_gen;
13801     case IX86_BUILTIN_GATHER3ALTSIV4DI:
13802       icode = CODE_FOR_avx512vl_gathersiv4di;
13803       goto gather_gen;
13804     case IX86_BUILTIN_GATHER3ALTDIV8SI:
13805       icode = CODE_FOR_avx512vl_gatherdiv8si;
13806       goto gather_gen;
13807     case IX86_BUILTIN_SCATTERSIV16SF:
13808       icode = CODE_FOR_avx512f_scattersiv16sf;
13809       goto scatter_gen;
13810     case IX86_BUILTIN_SCATTERSIV8DF:
13811       icode = CODE_FOR_avx512f_scattersiv8df;
13812       goto scatter_gen;
13813     case IX86_BUILTIN_SCATTERDIV16SF:
13814       icode = CODE_FOR_avx512f_scatterdiv16sf;
13815       goto scatter_gen;
13816     case IX86_BUILTIN_SCATTERDIV8DF:
13817       icode = CODE_FOR_avx512f_scatterdiv8df;
13818       goto scatter_gen;
13819     case IX86_BUILTIN_SCATTERSIV16SI:
13820       icode = CODE_FOR_avx512f_scattersiv16si;
13821       goto scatter_gen;
13822     case IX86_BUILTIN_SCATTERSIV8DI:
13823       icode = CODE_FOR_avx512f_scattersiv8di;
13824       goto scatter_gen;
13825     case IX86_BUILTIN_SCATTERDIV16SI:
13826       icode = CODE_FOR_avx512f_scatterdiv16si;
13827       goto scatter_gen;
13828     case IX86_BUILTIN_SCATTERDIV8DI:
13829       icode = CODE_FOR_avx512f_scatterdiv8di;
13830       goto scatter_gen;
13831     case IX86_BUILTIN_SCATTERSIV8SF:
13832       icode = CODE_FOR_avx512vl_scattersiv8sf;
13833       goto scatter_gen;
13834     case IX86_BUILTIN_SCATTERSIV4SF:
13835       icode = CODE_FOR_avx512vl_scattersiv4sf;
13836       goto scatter_gen;
13837     case IX86_BUILTIN_SCATTERSIV4DF:
13838       icode = CODE_FOR_avx512vl_scattersiv4df;
13839       goto scatter_gen;
13840     case IX86_BUILTIN_SCATTERSIV2DF:
13841       icode = CODE_FOR_avx512vl_scattersiv2df;
13842       goto scatter_gen;
13843     case IX86_BUILTIN_SCATTERDIV8SF:
13844       icode = CODE_FOR_avx512vl_scatterdiv8sf;
13845       goto scatter_gen;
13846     case IX86_BUILTIN_SCATTERDIV4SF:
13847       icode = CODE_FOR_avx512vl_scatterdiv4sf;
13848       goto scatter_gen;
13849     case IX86_BUILTIN_SCATTERDIV4DF:
13850       icode = CODE_FOR_avx512vl_scatterdiv4df;
13851       goto scatter_gen;
13852     case IX86_BUILTIN_SCATTERDIV2DF:
13853       icode = CODE_FOR_avx512vl_scatterdiv2df;
13854       goto scatter_gen;
13855     case IX86_BUILTIN_SCATTERSIV8SI:
13856       icode = CODE_FOR_avx512vl_scattersiv8si;
13857       goto scatter_gen;
13858     case IX86_BUILTIN_SCATTERSIV4SI:
13859       icode = CODE_FOR_avx512vl_scattersiv4si;
13860       goto scatter_gen;
13861     case IX86_BUILTIN_SCATTERSIV4DI:
13862       icode = CODE_FOR_avx512vl_scattersiv4di;
13863       goto scatter_gen;
13864     case IX86_BUILTIN_SCATTERSIV2DI:
13865       icode = CODE_FOR_avx512vl_scattersiv2di;
13866       goto scatter_gen;
13867     case IX86_BUILTIN_SCATTERDIV8SI:
13868       icode = CODE_FOR_avx512vl_scatterdiv8si;
13869       goto scatter_gen;
13870     case IX86_BUILTIN_SCATTERDIV4SI:
13871       icode = CODE_FOR_avx512vl_scatterdiv4si;
13872       goto scatter_gen;
13873     case IX86_BUILTIN_SCATTERDIV4DI:
13874       icode = CODE_FOR_avx512vl_scatterdiv4di;
13875       goto scatter_gen;
13876     case IX86_BUILTIN_SCATTERDIV2DI:
13877       icode = CODE_FOR_avx512vl_scatterdiv2di;
13878       goto scatter_gen;
13879     case IX86_BUILTIN_GATHERPFDPD:
13880       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
13881       goto vec_prefetch_gen;
13882     case IX86_BUILTIN_SCATTERALTSIV8DF:
13883       icode = CODE_FOR_avx512f_scattersiv8df;
13884       goto scatter_gen;
13885     case IX86_BUILTIN_SCATTERALTDIV16SF:
13886       icode = CODE_FOR_avx512f_scatterdiv16sf;
13887       goto scatter_gen;
13888     case IX86_BUILTIN_SCATTERALTSIV8DI:
13889       icode = CODE_FOR_avx512f_scattersiv8di;
13890       goto scatter_gen;
13891     case IX86_BUILTIN_SCATTERALTDIV16SI:
13892       icode = CODE_FOR_avx512f_scatterdiv16si;
13893       goto scatter_gen;
13894     case IX86_BUILTIN_SCATTERALTSIV4DF:
13895       icode = CODE_FOR_avx512vl_scattersiv4df;
13896       goto scatter_gen;
13897     case IX86_BUILTIN_SCATTERALTDIV8SF:
13898       icode = CODE_FOR_avx512vl_scatterdiv8sf;
13899       goto scatter_gen;
13900     case IX86_BUILTIN_SCATTERALTSIV4DI:
13901       icode = CODE_FOR_avx512vl_scattersiv4di;
13902       goto scatter_gen;
13903     case IX86_BUILTIN_SCATTERALTDIV8SI:
13904       icode = CODE_FOR_avx512vl_scatterdiv8si;
13905       goto scatter_gen;
13906     case IX86_BUILTIN_SCATTERALTSIV2DF:
13907       icode = CODE_FOR_avx512vl_scattersiv2df;
13908       goto scatter_gen;
13909     case IX86_BUILTIN_SCATTERALTDIV4SF:
13910       icode = CODE_FOR_avx512vl_scatterdiv4sf;
13911       goto scatter_gen;
13912     case IX86_BUILTIN_SCATTERALTSIV2DI:
13913       icode = CODE_FOR_avx512vl_scattersiv2di;
13914       goto scatter_gen;
13915     case IX86_BUILTIN_SCATTERALTDIV4SI:
13916       icode = CODE_FOR_avx512vl_scatterdiv4si;
13917       goto scatter_gen;
13918     case IX86_BUILTIN_GATHERPFDPS:
13919       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
13920       goto vec_prefetch_gen;
13921     case IX86_BUILTIN_GATHERPFQPD:
13922       icode = CODE_FOR_avx512pf_gatherpfv8didf;
13923       goto vec_prefetch_gen;
13924     case IX86_BUILTIN_GATHERPFQPS:
13925       icode = CODE_FOR_avx512pf_gatherpfv8disf;
13926       goto vec_prefetch_gen;
13927     case IX86_BUILTIN_SCATTERPFDPD:
13928       icode = CODE_FOR_avx512pf_scatterpfv8sidf;
13929       goto vec_prefetch_gen;
13930     case IX86_BUILTIN_SCATTERPFDPS:
13931       icode = CODE_FOR_avx512pf_scatterpfv16sisf;
13932       goto vec_prefetch_gen;
13933     case IX86_BUILTIN_SCATTERPFQPD:
13934       icode = CODE_FOR_avx512pf_scatterpfv8didf;
13935       goto vec_prefetch_gen;
13936     case IX86_BUILTIN_SCATTERPFQPS:
13937       icode = CODE_FOR_avx512pf_scatterpfv8disf;
13938       goto vec_prefetch_gen;
13939 
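    /* Shared expansion for the gather builtins.  The five arguments are
       the merge source, the base pointer, the index vector, the mask
       and the scale; the gathered vector is returned.  */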
13940     gather_gen:
13941       rtx half;
13942       rtx (*gen) (rtx, rtx);
13943 
13944       arg0 = CALL_EXPR_ARG (exp, 0);
13945       arg1 = CALL_EXPR_ARG (exp, 1);
13946       arg2 = CALL_EXPR_ARG (exp, 2);
13947       arg3 = CALL_EXPR_ARG (exp, 3);
13948       arg4 = CALL_EXPR_ARG (exp, 4);
13949       op0 = expand_normal (arg0);
13950       op1 = expand_normal (arg1);
13951       op2 = expand_normal (arg2);
13952       op3 = expand_normal (arg3);
13953       op4 = expand_normal (arg4);
13954       /* Note the arg order is different from the operand order.  */
13955       mode0 = insn_data[icode].operand[1].mode;
13956       mode2 = insn_data[icode].operand[3].mode;
13957       mode3 = insn_data[icode].operand[4].mode;
13958       mode4 = insn_data[icode].operand[5].mode;
13959 
13960       if (target == NULL_RTX
13961 	  || GET_MODE (target) != insn_data[icode].operand[0].mode
13962 	  || !insn_data[icode].operand[0].predicate (target,
13963 						     GET_MODE (target)))
13964 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
13965       else
13966 	subtarget = target;
13967 
13968       switch (fcode)
13969 	{
13970 	case IX86_BUILTIN_GATHER3ALTSIV8DF:
13971 	case IX86_BUILTIN_GATHER3ALTSIV8DI:
13972 	  half = gen_reg_rtx (V8SImode);
13973 	  if (!nonimmediate_operand (op2, V16SImode))
13974 	    op2 = copy_to_mode_reg (V16SImode, op2);
13975 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
13976 	  op2 = half;
13977 	  break;
13978 	case IX86_BUILTIN_GATHER3ALTSIV4DF:
13979 	case IX86_BUILTIN_GATHER3ALTSIV4DI:
13980 	case IX86_BUILTIN_GATHERALTSIV4DF:
13981 	case IX86_BUILTIN_GATHERALTSIV4DI:
13982 	  half = gen_reg_rtx (V4SImode);
13983 	  if (!nonimmediate_operand (op2, V8SImode))
13984 	    op2 = copy_to_mode_reg (V8SImode, op2);
13985 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
13986 	  op2 = half;
13987 	  break;
13988 	case IX86_BUILTIN_GATHER3ALTDIV16SF:
13989 	case IX86_BUILTIN_GATHER3ALTDIV16SI:
13990 	  half = gen_reg_rtx (mode0);
13991 	  if (mode0 == V8SFmode)
13992 	    gen = gen_vec_extract_lo_v16sf;
13993 	  else
13994 	    gen = gen_vec_extract_lo_v16si;
13995 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
13996 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
13997 	  emit_insn (gen (half, op0));
13998 	  op0 = half;
13999 	  op3 = lowpart_subreg (QImode, op3, HImode);
14000 	  break;
14001 	case IX86_BUILTIN_GATHER3ALTDIV8SF:
14002 	case IX86_BUILTIN_GATHER3ALTDIV8SI:
14003 	case IX86_BUILTIN_GATHERALTDIV8SF:
14004 	case IX86_BUILTIN_GATHERALTDIV8SI:
14005 	  half = gen_reg_rtx (mode0);
14006 	  if (mode0 == V4SFmode)
14007 	    gen = gen_vec_extract_lo_v8sf;
14008 	  else
14009 	    gen = gen_vec_extract_lo_v8si;
14010 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
14011 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14012 	  emit_insn (gen (half, op0));
14013 	  op0 = half;
14014 	  if (VECTOR_MODE_P (GET_MODE (op3)))
14015 	    {
14016 	      half = gen_reg_rtx (mode0);
14017 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
14018 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14019 	      emit_insn (gen (half, op3));
14020 	      op3 = half;
14021 	    }
14022 	  break;
14023 	default:
14024 	  break;
14025 	}
14026 
14027       /* Force the memory operand to use only a base register here.  We
14028 	 do not want to do this to the memory operands of other builtin
14029 	 functions.  */
14030       op1 = ix86_zero_extend_to_Pmode (op1);
14031 
14032       if (!insn_data[icode].operand[1].predicate (op0, mode0))
14033 	op0 = copy_to_mode_reg (mode0, op0);
14034       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14035 	op1 = copy_to_mode_reg (Pmode, op1);
14036       if (!insn_data[icode].operand[3].predicate (op2, mode2))
14037 	op2 = copy_to_mode_reg (mode2, op2);
14038 
14039       op3 = fixup_modeless_constant (op3, mode3);
14040 
14041       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14042 	{
14043 	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
14044 	    op3 = copy_to_mode_reg (mode3, op3);
14045 	}
14046       else
14047 	{
14048 	  op3 = copy_to_reg (op3);
14049 	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14050 	}
14051       if (!insn_data[icode].operand[5].predicate (op4, mode4))
14052 	{
14053           error ("the last argument must be scale 1, 2, 4, 8");
14054           return const0_rtx;
14055 	}
14056 
14057       /* Optimize.  If mask is known to have all high bits set,
14058 	 replace op0 with pc_rtx to signal that the instruction
14059 	 overwrites the whole destination and doesn't use its
14060 	 previous contents.  */
14061       if (optimize)
14062 	{
14063 	  if (TREE_CODE (arg3) == INTEGER_CST)
14064 	    {
14065 	      if (integer_all_onesp (arg3))
14066 		op0 = pc_rtx;
14067 	    }
14068 	  else if (TREE_CODE (arg3) == VECTOR_CST)
14069 	    {
14070 	      unsigned int negative = 0;
14071 	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14072 		{
14073 		  tree cst = VECTOR_CST_ELT (arg3, i);
14074 		  if (TREE_CODE (cst) == INTEGER_CST
14075 		      && tree_int_cst_sign_bit (cst))
14076 		    negative++;
14077 		  else if (TREE_CODE (cst) == REAL_CST
14078 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14079 		    negative++;
14080 		}
14081 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14082 		op0 = pc_rtx;
14083 	    }
14084 	  else if (TREE_CODE (arg3) == SSA_NAME
14085 		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14086 	    {
14087 	      /* Recognize also when mask is like:
14088 		 __v2df src = _mm_setzero_pd ();
14089 		 __v2df mask = _mm_cmpeq_pd (src, src);
14090 		 or
14091 		 __v8sf src = _mm256_setzero_ps ();
14092 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14093 		 as that is a cheaper way to load all ones into
14094 		 a register than having to load a constant from
14095 		 memory.  */
14096 	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14097 	      if (is_gimple_call (def_stmt))
14098 		{
14099 		  tree fndecl = gimple_call_fndecl (def_stmt);
14100 		  if (fndecl
14101 		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14102 		    switch (DECL_MD_FUNCTION_CODE (fndecl))
14103 		      {
14104 		      case IX86_BUILTIN_CMPPD:
14105 		      case IX86_BUILTIN_CMPPS:
14106 		      case IX86_BUILTIN_CMPPD256:
14107 		      case IX86_BUILTIN_CMPPS256:
14108 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14109 			  break;
14110 			/* FALLTHRU */
14111 		      case IX86_BUILTIN_CMPEQPD:
14112 		      case IX86_BUILTIN_CMPEQPS:
14113 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14114 			    && initializer_zerop (gimple_call_arg (def_stmt,
14115 								   1)))
14116 			  op0 = pc_rtx;
14117 			break;
14118 		      default:
14119 			break;
14120 		      }
14121 		}
14122 	    }
14123 	}
14124 
14125       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14126       if (! pat)
14127 	return const0_rtx;
14128       emit_insn (pat);
14129 
14130       switch (fcode)
14131 	{
14132 	case IX86_BUILTIN_GATHER3DIV16SF:
14133 	  if (target == NULL_RTX)
14134 	    target = gen_reg_rtx (V8SFmode);
14135 	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14136 	  break;
14137 	case IX86_BUILTIN_GATHER3DIV16SI:
14138 	  if (target == NULL_RTX)
14139 	    target = gen_reg_rtx (V8SImode);
14140 	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14141 	  break;
14142 	case IX86_BUILTIN_GATHER3DIV8SF:
14143 	case IX86_BUILTIN_GATHERDIV8SF:
14144 	  if (target == NULL_RTX)
14145 	    target = gen_reg_rtx (V4SFmode);
14146 	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14147 	  break;
14148 	case IX86_BUILTIN_GATHER3DIV8SI:
14149 	case IX86_BUILTIN_GATHERDIV8SI:
14150 	  if (target == NULL_RTX)
14151 	    target = gen_reg_rtx (V4SImode);
14152 	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14153 	  break;
14154 	default:
14155 	  target = subtarget;
14156 	  break;
14157 	}
14158       return target;
14159 
14160     scatter_gen:
14161       arg0 = CALL_EXPR_ARG (exp, 0);
14162       arg1 = CALL_EXPR_ARG (exp, 1);
14163       arg2 = CALL_EXPR_ARG (exp, 2);
14164       arg3 = CALL_EXPR_ARG (exp, 3);
14165       arg4 = CALL_EXPR_ARG (exp, 4);
14166       op0 = expand_normal (arg0);
14167       op1 = expand_normal (arg1);
14168       op2 = expand_normal (arg2);
14169       op3 = expand_normal (arg3);
14170       op4 = expand_normal (arg4);
14171       mode1 = insn_data[icode].operand[1].mode;
14172       mode2 = insn_data[icode].operand[2].mode;
14173       mode3 = insn_data[icode].operand[3].mode;
14174       mode4 = insn_data[icode].operand[4].mode;
14175 
14176       /* The scatter instruction stores operand op3 to memory with
14177 	 indices from op2 and scale from op4 under writemask op1.
14178 	 If index operand op2 has more elements than source operand
14179 	 op3, only its low half is used, and vice versa.  */
14180       switch (fcode)
14181 	{
14182 	case IX86_BUILTIN_SCATTERALTSIV8DF:
14183 	case IX86_BUILTIN_SCATTERALTSIV8DI:
14184 	  half = gen_reg_rtx (V8SImode);
14185 	  if (!nonimmediate_operand (op2, V16SImode))
14186 	    op2 = copy_to_mode_reg (V16SImode, op2);
14187 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
14188 	  op2 = half;
14189 	  break;
14190 	case IX86_BUILTIN_SCATTERALTDIV16SF:
14191 	case IX86_BUILTIN_SCATTERALTDIV16SI:
14192 	  half = gen_reg_rtx (mode3);
14193 	  if (mode3 == V8SFmode)
14194 	    gen = gen_vec_extract_lo_v16sf;
14195 	  else
14196 	    gen = gen_vec_extract_lo_v16si;
14197 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
14198 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14199 	  emit_insn (gen (half, op3));
14200 	  op3 = half;
14201 	  break;
14202 	case IX86_BUILTIN_SCATTERALTSIV4DF:
14203 	case IX86_BUILTIN_SCATTERALTSIV4DI:
14204 	  half = gen_reg_rtx (V4SImode);
14205 	  if (!nonimmediate_operand (op2, V8SImode))
14206 	    op2 = copy_to_mode_reg (V8SImode, op2);
14207 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
14208 	  op2 = half;
14209 	  break;
14210 	case IX86_BUILTIN_SCATTERALTDIV8SF:
14211 	case IX86_BUILTIN_SCATTERALTDIV8SI:
14212 	  half = gen_reg_rtx (mode3);
14213 	  if (mode3 == V4SFmode)
14214 	    gen = gen_vec_extract_lo_v8sf;
14215 	  else
14216 	    gen = gen_vec_extract_lo_v8si;
14217 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
14218 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14219 	  emit_insn (gen (half, op3));
14220 	  op3 = half;
14221 	  break;
14222 	case IX86_BUILTIN_SCATTERALTSIV2DF:
14223 	case IX86_BUILTIN_SCATTERALTSIV2DI:
14224 	  if (!nonimmediate_operand (op2, V4SImode))
14225 	    op2 = copy_to_mode_reg (V4SImode, op2);
14226 	  break;
14227 	case IX86_BUILTIN_SCATTERALTDIV4SF:
14228 	case IX86_BUILTIN_SCATTERALTDIV4SI:
14229 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
14230 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14231 	  break;
14232 	default:
14233 	  break;
14234 	}
14235 
14236       /* Force the memory operand to use only a base register here.  We
14237 	 do not want to do this to the memory operands of other builtin
14238 	 functions.  */
14239       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14240 
14241       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14242 	op0 = copy_to_mode_reg (Pmode, op0);
14243 
14244       op1 = fixup_modeless_constant (op1, mode1);
14245 
14246       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14247 	{
14248 	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
14249 	    op1 = copy_to_mode_reg (mode1, op1);
14250 	}
14251       else
14252 	{
14253 	  op1 = copy_to_reg (op1);
14254 	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14255 	}
14256 
14257       if (!insn_data[icode].operand[2].predicate (op2, mode2))
14258 	op2 = copy_to_mode_reg (mode2, op2);
14259 
14260       if (!insn_data[icode].operand[3].predicate (op3, mode3))
14261 	op3 = copy_to_mode_reg (mode3, op3);
14262 
14263       if (!insn_data[icode].operand[4].predicate (op4, mode4))
14264 	{
14265 	  error ("the last argument must be scale 1, 2, 4, 8");
14266 	  return const0_rtx;
14267 	}
14268 
14269       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14270       if (! pat)
14271 	return const0_rtx;
14272 
14273       emit_insn (pat);
14274       return 0;
14275 
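    /* Shared expansion for the gather/scatter prefetch builtins: mask,
       index vector, base pointer, scale and locality hint.  The scale
       and hint must be immediates acceptable to the pattern's
       predicates.  */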
14276     vec_prefetch_gen:
14277       arg0 = CALL_EXPR_ARG (exp, 0);
14278       arg1 = CALL_EXPR_ARG (exp, 1);
14279       arg2 = CALL_EXPR_ARG (exp, 2);
14280       arg3 = CALL_EXPR_ARG (exp, 3);
14281       arg4 = CALL_EXPR_ARG (exp, 4);
14282       op0 = expand_normal (arg0);
14283       op1 = expand_normal (arg1);
14284       op2 = expand_normal (arg2);
14285       op3 = expand_normal (arg3);
14286       op4 = expand_normal (arg4);
14287       mode0 = insn_data[icode].operand[0].mode;
14288       mode1 = insn_data[icode].operand[1].mode;
14289       mode3 = insn_data[icode].operand[3].mode;
14290       mode4 = insn_data[icode].operand[4].mode;
14291 
14292       op0 = fixup_modeless_constant (op0, mode0);
14293 
14294       if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14295 	{
14296 	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
14297 	    op0 = copy_to_mode_reg (mode0, op0);
14298 	}
14299       else
14300 	{
14301 	  op0 = copy_to_reg (op0);
14302 	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14303 	}
14304 
14305       if (!insn_data[icode].operand[1].predicate (op1, mode1))
14306 	op1 = copy_to_mode_reg (mode1, op1);
14307 
14308       /* Force the memory operand to use only a base register here.  We
14309 	 do not want to do this to the memory operands of other builtin
14310 	 functions.  */
14311       op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14312 
14313       if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14314 	op2 = copy_to_mode_reg (Pmode, op2);
14315 
14316       if (!insn_data[icode].operand[3].predicate (op3, mode3))
14317 	{
14318 	  error ("the fourth argument must be scale 1, 2, 4, 8");
14319 	  return const0_rtx;
14320 	}
14321 
14322       if (!insn_data[icode].operand[4].predicate (op4, mode4))
14323 	{
14324 	  error ("incorrect hint operand");
14325 	  return const0_rtx;
14326 	}
14327 
14328       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14329       if (! pat)
14330 	return const0_rtx;
14331 
14332       emit_insn (pat);
14333 
14334       return 0;
14335 
14336     case IX86_BUILTIN_XABORT:
14337       icode = CODE_FOR_xabort;
14338       arg0 = CALL_EXPR_ARG (exp, 0);
14339       op0 = expand_normal (arg0);
14340       mode0 = insn_data[icode].operand[0].mode;
14341       if (!insn_data[icode].operand[0].predicate (op0, mode0))
14342 	{
14343 	  error ("the argument to %<xabort%> intrinsic must "
14344 		 "be an 8-bit immediate");
14345 	  return const0_rtx;
14346 	}
14347       emit_insn (gen_xabort (op0));
14348       return 0;
14349 
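    /* CET shadow-stack builtins.  For RDSSP the source operand is
       cleared first: the instruction reads as a NOP when shadow stacks
       are not enabled, so the builtin returns zero in that case.  */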
14350     case IX86_BUILTIN_RDSSPD:
14351     case IX86_BUILTIN_RDSSPQ:
14352       mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14353 
14354       if (target == 0
14355 	  || !register_operand (target, mode))
14356 	target = gen_reg_rtx (mode);
14357 
14358       op0 = force_reg (mode, const0_rtx);
14359 
14360       emit_insn (gen_rdssp (mode, target, op0));
14361       return target;
14362 
14363     case IX86_BUILTIN_INCSSPD:
14364     case IX86_BUILTIN_INCSSPQ:
14365       mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14366 
14367       arg0 = CALL_EXPR_ARG (exp, 0);
14368       op0 = expand_normal (arg0);
14369 
14370       op0 = force_reg (mode, op0);
14371 
14372       emit_insn (gen_incssp (mode, op0));
14373       return 0;
14374 
14375     case IX86_BUILTIN_HRESET:
14376       icode = CODE_FOR_hreset;
14377       arg0 = CALL_EXPR_ARG (exp, 0);
14378       op0 = expand_normal (arg0);
14379       op0 = force_reg (SImode, op0);
14380       emit_insn (gen_hreset (op0));
14381       return 0;
14382 
14383     case IX86_BUILTIN_RSTORSSP:
14384     case IX86_BUILTIN_CLRSSBSY:
14385       arg0 = CALL_EXPR_ARG (exp, 0);
14386       op0 = expand_normal (arg0);
14387       icode = (fcode == IX86_BUILTIN_RSTORSSP
14388 	       ? CODE_FOR_rstorssp
14389 	       : CODE_FOR_clrssbsy);
14390 
14391       if (!address_operand (op0, VOIDmode))
14392 	{
14393 	  op0 = convert_memory_address (Pmode, op0);
14394 	  op0 = copy_addr_to_reg (op0);
14395 	}
14396       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
14397       return 0;
14398 
14399     case IX86_BUILTIN_WRSSD:
14400     case IX86_BUILTIN_WRSSQ:
14401     case IX86_BUILTIN_WRUSSD:
14402     case IX86_BUILTIN_WRUSSQ:
14403       mode = ((fcode == IX86_BUILTIN_WRSSD
14404 	       || fcode == IX86_BUILTIN_WRUSSD)
14405 	      ? SImode : DImode);
14406 
14407       arg0 = CALL_EXPR_ARG (exp, 0);
14408       op0 = expand_normal (arg0);
14409       arg1 = CALL_EXPR_ARG (exp, 1);
14410       op1 = expand_normal (arg1);
14411 
14412       op0 = force_reg (mode, op0);
14413 
14414       if (!address_operand (op1, VOIDmode))
14415 	{
14416 	  op1 = convert_memory_address (Pmode, op1);
14417 	  op1 = copy_addr_to_reg (op1);
14418 	}
14419       op1 = gen_rtx_MEM (mode, op1);
14420 
14421       icode = ((fcode == IX86_BUILTIN_WRSSD
14422 		|| fcode == IX86_BUILTIN_WRSSQ)
14423 	       ? code_for_wrss (mode)
14424 	       : code_for_wruss (mode));
14425       emit_insn (GEN_FCN (icode) (op0, op1));
14426 
14427       return 0;
14428 
14429     default:
14430       break;
14431     }
14432 
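  /* The remaining builtins are table driven: the builtin code selects a
     descriptor table and the matching expander handles the rest.  */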
14433   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14434       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14435     {
14436       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14437       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14438 					       target);
14439     }
14440 
14441   if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14442       && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14443     {
14444       i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14445       return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14446 					       target);
14447     }
14448 
14449   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14450       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14451     {
14452       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14453       rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14454       rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14455       rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14456       int masked = 1;
14457       machine_mode mode, wide_mode, nar_mode;
14458 
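      /* The AVX512_4FMAPS and AVX512_4VNNIW builtins consume four
	 vector operands from consecutive arguments.  Default to the
	 V4FMA shapes here; the cases below override the modes and
	 generator functions as needed before jumping to the shared
	 expansion.  */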
14459       nar_mode  = V4SFmode;
14460       mode      = V16SFmode;
14461       wide_mode = V64SFmode;
14462       fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
14463       fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14464 
14465       switch (fcode)
14466 	{
14467 	case IX86_BUILTIN_4FMAPS:
14468 	  fcn = gen_avx5124fmaddps_4fmaddps;
14469 	  masked = 0;
14470 	  goto v4fma_expand;
14471 
14472 	case IX86_BUILTIN_4DPWSSD:
14473 	  nar_mode  = V4SImode;
14474 	  mode      = V16SImode;
14475 	  wide_mode = V64SImode;
14476 	  fcn = gen_avx5124vnniw_vp4dpwssd;
14477 	  masked = 0;
14478 	  goto v4fma_expand;
14479 
14480 	case IX86_BUILTIN_4DPWSSDS:
14481 	  nar_mode  = V4SImode;
14482 	  mode      = V16SImode;
14483 	  wide_mode = V64SImode;
14484 	  fcn = gen_avx5124vnniw_vp4dpwssds;
14485 	  masked = 0;
14486 	  goto v4fma_expand;
14487 
14488 	case IX86_BUILTIN_4FNMAPS:
14489 	  fcn = gen_avx5124fmaddps_4fnmaddps;
14490 	  masked = 0;
14491 	  goto v4fma_expand;
14492 
14493 	case IX86_BUILTIN_4FNMAPS_MASK:
14494 	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
14495 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14496 	  goto v4fma_expand;
14497 
14498 	case IX86_BUILTIN_4DPWSSD_MASK:
14499 	  nar_mode  = V4SImode;
14500 	  mode      = V16SImode;
14501 	  wide_mode = V64SImode;
14502 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
14503 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14504 	  goto v4fma_expand;
14505 
14506 	case IX86_BUILTIN_4DPWSSDS_MASK:
14507 	  nar_mode  = V4SImode;
14508 	  mode      = V16SImode;
14509 	  wide_mode = V64SImode;
14510 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
14511 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14512 	  goto v4fma_expand;
14513 
14514 	case IX86_BUILTIN_4FMAPS_MASK:
14515 	  {
14516 	    tree args[4];
14517 	    rtx ops[4];
14518 	    rtx wide_reg;
14519 	    rtx accum;
14520 	    rtx addr;
14521 	    rtx mem;
14522 
14523 v4fma_expand:
14524 	    wide_reg = gen_reg_rtx (wide_mode);
14525 	    for (i = 0; i < 4; i++)
14526 	      {
14527 		args[i] = CALL_EXPR_ARG (exp, i);
14528 		ops[i] = expand_normal (args[i]);
14529 
14530 		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14531 				ops[i]);
14532 	      }
14533 
14534 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14535 	    accum = force_reg (mode, accum);
14536 
14537 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14538 	    addr = force_reg (Pmode, addr);
14539 
14540 	    mem = gen_rtx_MEM (nar_mode, addr);
14541 
14542 	    target = gen_reg_rtx (mode);
14543 
14544 	    emit_move_insn (target, accum);
14545 
14546 	    if (! masked)
14547 	      emit_insn (fcn (target, accum, wide_reg, mem));
14548 	    else
14549 	      {
14550 		rtx merge, mask;
14551 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14552 
14553 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14554 
14555 		if (CONST_INT_P (mask))
14556 		  mask = fixup_modeless_constant (mask, HImode);
14557 
14558 		mask = force_reg (HImode, mask);
14559 
14560 		if (GET_MODE (mask) != HImode)
14561 		  mask = gen_rtx_SUBREG (HImode, mask, 0);
14562 
14563 		/* If merge is 0 then we're about to emit z-masked variant.  */
14564 		if (const0_operand (merge, mode))
14565 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14566 		/* If merge is the same as accum then emit merge-masked variant.  */
14567 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14568 		  {
14569 		    merge = force_reg (mode, merge);
14570 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14571 		  }
14572 		/* Merging with an unknown value can happen if we z-mask w/ -O0.  */
14573 		else
14574 		  {
14575 		    target = gen_reg_rtx (mode);
14576 		    emit_move_insn (target, merge);
14577 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14578 		  }
14579 	      }
14580 	    return target;
14581 	  }
14582 
14583 	case IX86_BUILTIN_4FNMASS:
14584 	  fcn = gen_avx5124fmaddps_4fnmaddss;
14585 	  masked = 0;
14586 	  goto s4fma_expand;
14587 
14588 	case IX86_BUILTIN_4FMASS:
14589 	  fcn = gen_avx5124fmaddps_4fmaddss;
14590 	  masked = 0;
14591 	  goto s4fma_expand;
14592 
14593 	case IX86_BUILTIN_4FNMASS_MASK:
14594 	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
14595 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
14596 	  goto s4fma_expand;
14597 
14598 	case IX86_BUILTIN_4FMASS_MASK:
14599 	  {
14600 	    tree args[4];
14601 	    rtx ops[4];
14602 	    rtx wide_reg;
14603 	    rtx accum;
14604 	    rtx addr;
14605 	    rtx mem;
14606 
14607 	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
14608 	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
14609 
14610 s4fma_expand:
14611 	    mode = V4SFmode;
14612 	    wide_reg = gen_reg_rtx (V64SFmode);
14613 	    for (i = 0; i < 4; i++)
14614 	      {
14615 		rtx tmp;
14616 		args[i] = CALL_EXPR_ARG (exp, i);
14617 		ops[i] = expand_normal (args[i]);
14618 
14619 		tmp = gen_reg_rtx (SFmode);
14620 		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
14621 
14622 		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
14623 				gen_rtx_SUBREG (V16SFmode, tmp, 0));
14624 	      }
14625 
14626 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14627 	    accum = force_reg (V4SFmode, accum);
14628 
14629 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14630 	    addr = force_reg (Pmode, addr);
14631 
14632 	    mem = gen_rtx_MEM (V4SFmode, addr);
14633 
14634 	    target = gen_reg_rtx (V4SFmode);
14635 
14636 	    emit_move_insn (target, accum);
14637 
14638 	    if (! masked)
14639 	      emit_insn (fcn (target, accum, wide_reg, mem));
14640 	    else
14641 	      {
14642 		rtx merge, mask;
14643 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14644 
14645 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14646 
14647 		if (CONST_INT_P (mask))
14648 		  mask = fixup_modeless_constant (mask, QImode);
14649 
14650 		mask = force_reg (QImode, mask);
14651 
14652 		if (GET_MODE (mask) != QImode)
14653 		  mask = gen_rtx_SUBREG (QImode, mask, 0);
14654 
14655 		/* If merge is 0 then we're about to emit z-masked variant.  */
14656 		if (const0_operand (merge, mode))
14657 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14658 		/* If merge is the same as accum then emit merge-masked
14659 		   variant.  */
14660 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14661 		  {
14662 		    merge = force_reg (mode, merge);
14663 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14664 		  }
14665 		/* Merge with something unknown might happen if we z-mask
14666 		   w/ -O0.  */
14667 		else
14668 		  {
14669 		    target = gen_reg_rtx (mode);
14670 		    emit_move_insn (target, merge);
14671 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14672 		  }
14673 	      }
14674 	    return target;
14675 	  }
14676 	case IX86_BUILTIN_RDPID:
14677 	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
14678 						   target);
14679 	case IX86_BUILTIN_FABSQ:
14680 	case IX86_BUILTIN_COPYSIGNQ:
14681 	  if (!TARGET_SSE)
14682 	    /* Emit a normal call if SSE isn't available.  */
14683 	    return expand_call (exp, target, ignore);
14684 	  /* FALLTHRU */
14685 	default:
14686 	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
14687 	}
14688     }
14689 
14690   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
14691       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
14692     {
14693       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
14694       return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
14695     }
14696 
14697   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
14698       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
14699     {
14700       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
14701       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
14702     }
14703 
14704   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
14705       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
14706     {
14707       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
14708       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
14709     }
14710 
14711   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
14712       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
14713     {
14714       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
14715       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
14716     }
14717 
14718   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
14719       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
14720     {
14721       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
14722       const struct builtin_description *d = bdesc_multi_arg + i;
14723       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
14724 					    (enum ix86_builtin_func_type)
14725 					    d->flag, d->comparison);
14726     }
14727 
14728   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
14729       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
14730     {
14731       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
14732       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
14733 					       target);
14734     }
14735 
14736   gcc_unreachable ();
14737 }
14738 
14739 /* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
14740    fill target with val via vec_duplicate.  */
14741 
14742 static bool
14743 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
14744 {
14745   bool ok;
14746   rtx_insn *insn;
14747   rtx dup;
14748 
14749   /* First attempt to recognize VAL as-is.  */
14750   dup = gen_vec_duplicate (mode, val);
14751   insn = emit_insn (gen_rtx_SET (target, dup));
14752   if (recog_memoized (insn) < 0)
14753     {
14754       rtx_insn *seq;
14755       machine_mode innermode = GET_MODE_INNER (mode);
14756       rtx reg;
14757 
14758       /* If that fails, force VAL into a register.  */
14759 
14760       start_sequence ();
14761       reg = force_reg (innermode, val);
14762       if (GET_MODE (reg) != innermode)
14763 	reg = gen_lowpart (innermode, reg);
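      /* Replace the source of the SET emitted above with a duplicate of
	 the register, emit the copy sequence in front of it, and try to
	 recognize the patched insn again.  */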
14764       SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
14765       seq = get_insns ();
14766       end_sequence ();
14767       if (seq)
14768 	emit_insn_before (seq, insn);
14769 
14770       ok = recog_memoized (insn) >= 0;
14771       gcc_assert (ok);
14772     }
14773   return true;
14774 }
14775 
14776 /* Get a vector mode of the same size as the original but with elements
14777    twice as wide.  This is only guaranteed to apply to integral vectors.  */
14778 
14779 static machine_mode
14780 get_mode_wider_vector (machine_mode o)
14781 {
14782   /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
14783   machine_mode n = GET_MODE_WIDER_MODE (o).require ();
14784   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
14785   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
14786   return n;
14787 }
14788 
14789 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
14790 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
14791 
14792 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
14793    with all elements equal to VAR.  Return true if successful.  */
14794 
14795 bool
14796 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
14797 				   rtx target, rtx val)
14798 {
14799   bool ok;
14800 
14801   switch (mode)
14802     {
14803     case E_V2SImode:
14804     case E_V2SFmode:
14805       if (!mmx_ok)
14806 	return false;
14807       /* FALLTHRU */
14808 
14809     case E_V4DFmode:
14810     case E_V4DImode:
14811     case E_V8SFmode:
14812     case E_V8SImode:
14813     case E_V2DFmode:
14814     case E_V2DImode:
14815     case E_V4SFmode:
14816     case E_V4SImode:
14817     case E_V16SImode:
14818     case E_V8DImode:
14819     case E_V16SFmode:
14820     case E_V8DFmode:
14821       return ix86_vector_duplicate_value (mode, target, val);
14822 
14823     case E_V4HImode:
14824       if (!mmx_ok)
14825 	return false;
14826       if (TARGET_SSE || TARGET_3DNOW_A)
14827 	{
14828 	  rtx x;
14829 
14830 	  val = gen_lowpart (SImode, val);
14831 	  x = gen_rtx_TRUNCATE (HImode, val);
14832 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
14833 	  emit_insn (gen_rtx_SET (target, x));
14834 	  return true;
14835 	}
14836       goto widen;
14837 
14838     case E_V2HImode:
14839       if (TARGET_SSE2)
14840 	{
14841 	  rtx x;
14842 
14843 	  val = gen_lowpart (SImode, val);
14844 	  x = gen_rtx_TRUNCATE (HImode, val);
14845 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
14846 	  emit_insn (gen_rtx_SET (target, x));
14847 	  return true;
14848 	}
14849       return false;
14850 
14851     case E_V8QImode:
14852     case E_V4QImode:
14853       if (!mmx_ok)
14854 	return false;
14855       goto widen;
14856 
14857     case E_V8HImode:
14858       if (TARGET_AVX2)
14859 	return ix86_vector_duplicate_value (mode, target, val);
14860 
14861       if (TARGET_SSE2)
14862 	{
14863 	  struct expand_vec_perm_d dperm;
14864 	  rtx tmp1, tmp2;
14865 
14866 	permute:
14867 	  memset (&dperm, 0, sizeof (dperm));
14868 	  dperm.target = target;
14869 	  dperm.vmode = mode;
14870 	  dperm.nelt = GET_MODE_NUNITS (mode);
14871 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
14872 	  dperm.one_operand_p = true;
14873 
14874 	  /* Extend to SImode using a paradoxical SUBREG.  */
14875 	  tmp1 = gen_reg_rtx (SImode);
14876 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
14877 
14878 	  /* Insert the SImode value as low element of a V4SImode vector.  */
14879 	  tmp2 = gen_reg_rtx (V4SImode);
14880 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
14881 	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
14882 
14883 	  ok = (expand_vec_perm_1 (&dperm)
14884 		|| expand_vec_perm_broadcast_1 (&dperm));
14885 	  gcc_assert (ok);
14886 	  return ok;
14887 	}
14888       goto widen;
14889 
14890     case E_V16QImode:
14891       if (TARGET_AVX2)
14892 	return ix86_vector_duplicate_value (mode, target, val);
14893 
14894       if (TARGET_SSE2)
14895 	goto permute;
14896       goto widen;
14897 
14898     widen:
14899       /* Replicate the value once into the next wider mode and recurse.  */
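      /* E.g. a V8QImode broadcast of X becomes a V4HImode broadcast of
	 (X << 8) | X.  */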
14900       {
14901 	machine_mode smode, wsmode, wvmode;
14902 	rtx x;
14903 
14904 	smode = GET_MODE_INNER (mode);
14905 	wvmode = get_mode_wider_vector (mode);
14906 	wsmode = GET_MODE_INNER (wvmode);
14907 
14908 	val = convert_modes (wsmode, smode, val, true);
14909 
14910 	if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
14911 	  emit_insn (gen_insv_1 (wsmode, val, val));
14912 	else
14913 	  {
14914 	    x = expand_simple_binop (wsmode, ASHIFT, val,
14915 				     GEN_INT (GET_MODE_BITSIZE (smode)),
14916 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
14917 	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
14918 				       OPTAB_LIB_WIDEN);
14919 	  }
14920 
14921 	x = gen_reg_rtx (wvmode);
14922 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
14923 	gcc_assert (ok);
14924 	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
14925 	return ok;
14926       }
14927 
14928     case E_V16HImode:
14929     case E_V32QImode:
14930       if (TARGET_AVX2)
14931 	return ix86_vector_duplicate_value (mode, target, val);
14932       else
14933 	{
14934 	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
14935 	  rtx x = gen_reg_rtx (hvmode);
14936 
14937 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
14938 	  gcc_assert (ok);
14939 
14940 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
14941 	  emit_insn (gen_rtx_SET (target, x));
14942 	}
14943       return true;
14944 
14945     case E_V64QImode:
14946     case E_V32HImode:
14947       if (TARGET_AVX512BW)
14948 	return ix86_vector_duplicate_value (mode, target, val);
14949       else
14950 	{
14951 	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
14952 	  rtx x = gen_reg_rtx (hvmode);
14953 
14954 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
14955 	  gcc_assert (ok);
14956 
14957 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
14958 	  emit_insn (gen_rtx_SET (target, x));
14959 	}
14960       return true;
14961 
14962     case E_V8HFmode:
14963     case E_V16HFmode:
14964     case E_V32HFmode:
14965       return ix86_vector_duplicate_value (mode, target, val);
14966 
14967     default:
14968       return false;
14969     }
14970 }
14971 
14972 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
14973    whose ONE_VAR element is VAR, and other elements are zero.  Return true
14974    if successful.  */
14975 
14976 static bool
14977 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
14978 				     rtx target, rtx var, int one_var)
14979 {
14980   machine_mode vsimode;
14981   rtx new_target;
14982   rtx x, tmp;
14983   bool use_vector_set = false;
14984   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
14985 
14986   switch (mode)
14987     {
14988     case E_V2DImode:
14989       /* For SSE4.1, we normally use vector set.  But if the second
14990 	 element is zero and inter-unit moves are OK, we use movq
14991 	 instead.  */
14992       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
14993 			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
14994 			     && one_var == 0));
14995       break;
14996     case E_V16QImode:
14997     case E_V4SImode:
14998     case E_V4SFmode:
14999       use_vector_set = TARGET_SSE4_1;
15000       break;
15001     case E_V8HImode:
15002       use_vector_set = TARGET_SSE2;
15003       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15004 	? gen_vec_setv8hi_0 : NULL;
15005       break;
15006     case E_V8QImode:
15007       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15008       break;
15009     case E_V4HImode:
15010       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15011       break;
15012     case E_V4QImode:
15013       use_vector_set = TARGET_SSE4_1;
15014       break;
15015     case E_V32QImode:
15016       use_vector_set = TARGET_AVX;
15017       break;
15018     case E_V16HImode:
15019       use_vector_set = TARGET_AVX;
15020       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15021 	? gen_vec_setv16hi_0 : NULL;
15022       break;
15023     case E_V8SImode:
15024       use_vector_set = TARGET_AVX;
15025       gen_vec_set_0 = gen_vec_setv8si_0;
15026       break;
15027     case E_V8SFmode:
15028       use_vector_set = TARGET_AVX;
15029       gen_vec_set_0 = gen_vec_setv8sf_0;
15030       break;
15031     case E_V4DFmode:
15032       use_vector_set = TARGET_AVX;
15033       gen_vec_set_0 = gen_vec_setv4df_0;
15034       break;
15035     case E_V4DImode:
15036       /* Use ix86_expand_vector_set in 64bit mode only.  */
15037       use_vector_set = TARGET_AVX && TARGET_64BIT;
15038       gen_vec_set_0 = gen_vec_setv4di_0;
15039       break;
15040     case E_V16SImode:
15041       use_vector_set = TARGET_AVX512F && one_var == 0;
15042       gen_vec_set_0 = gen_vec_setv16si_0;
15043       break;
15044     case E_V16SFmode:
15045       use_vector_set = TARGET_AVX512F && one_var == 0;
15046       gen_vec_set_0 = gen_vec_setv16sf_0;
15047       break;
15048     case E_V8DFmode:
15049       use_vector_set = TARGET_AVX512F && one_var == 0;
15050       gen_vec_set_0 = gen_vec_setv8df_0;
15051       break;
15052     case E_V8DImode:
15053       /* Use ix86_expand_vector_set in 64bit mode only.  */
15054       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15055       gen_vec_set_0 = gen_vec_setv8di_0;
15056       break;
15057     case E_V8HFmode:
15058       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15059       gen_vec_set_0 = gen_vec_setv8hf_0;
15060       break;
15061     case E_V16HFmode:
15062       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15063       gen_vec_set_0 = gen_vec_setv16hf_0;
15064       break;
15065     case E_V32HFmode:
15066       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15067       gen_vec_set_0 = gen_vec_setv32hf_0;
15068       break;
15069     case E_V32HImode:
15070       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15071       gen_vec_set_0 = gen_vec_setv32hi_0;
      break;
15072     default:
15073       break;
15074     }
15075 
15076   if (use_vector_set)
15077     {
15078       if (gen_vec_set_0 && one_var == 0)
15079 	{
15080 	  var = force_reg (GET_MODE_INNER (mode), var);
15081 	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15082 	  return true;
15083 	}
15084       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15085       var = force_reg (GET_MODE_INNER (mode), var);
15086       ix86_expand_vector_set (mmx_ok, target, var, one_var);
15087       return true;
15088     }
15089 
15090   switch (mode)
15091     {
15092     case E_V2SFmode:
15093     case E_V2SImode:
15094       if (!mmx_ok)
15095 	return false;
15096       /* FALLTHRU */
15097 
15098     case E_V2DFmode:
15099     case E_V2DImode:
15100       if (one_var != 0)
15101 	return false;
15102       var = force_reg (GET_MODE_INNER (mode), var);
15103       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15104       emit_insn (gen_rtx_SET (target, x));
15105       return true;
15106 
15107     case E_V4SFmode:
15108     case E_V4SImode:
15109       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15110 	new_target = gen_reg_rtx (mode);
15111       else
15112 	new_target = target;
15113       var = force_reg (GET_MODE_INNER (mode), var);
15114       x = gen_rtx_VEC_DUPLICATE (mode, var);
15115       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15116       emit_insn (gen_rtx_SET (new_target, x));
15117       if (one_var != 0)
15118 	{
15119 	  /* We need to shuffle the value to the correct position, so
15120 	     create a new pseudo to store the intermediate result.  */
15121 
15122 	  /* With SSE2, we can use the integer shuffle insns.  */
15123 	  if (mode != V4SFmode && TARGET_SSE2)
15124 	    {
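	      /* NEW_TARGET holds VAL in element 0 and zeros elsewhere; the
		 pshufd selects element 0 for position ONE_VAR and element 1
		 (a zero) for every other position.  */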
15125 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15126 					    const1_rtx,
15127 					    GEN_INT (one_var == 1 ? 0 : 1),
15128 					    GEN_INT (one_var == 2 ? 0 : 1),
15129 					    GEN_INT (one_var == 3 ? 0 : 1)));
15130 	      if (target != new_target)
15131 		emit_move_insn (target, new_target);
15132 	      return true;
15133 	    }
15134 
15135 	  /* Otherwise convert the intermediate result to V4SFmode and
15136 	     use the SSE1 shuffle instructions.  */
15137 	  if (mode != V4SFmode)
15138 	    {
15139 	      tmp = gen_reg_rtx (V4SFmode);
15140 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15141 	    }
15142 	  else
15143 	    tmp = new_target;
15144 
15145 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15146 				       const1_rtx,
15147 				       GEN_INT (one_var == 1 ? 0 : 1),
15148 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
15149 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15150 
15151 	  if (mode != V4SFmode)
15152 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15153 	  else if (tmp != target)
15154 	    emit_move_insn (target, tmp);
15155 	}
15156       else if (target != new_target)
15157 	emit_move_insn (target, new_target);
15158       return true;
15159 
15160     case E_V8HImode:
15161     case E_V16QImode:
15162       vsimode = V4SImode;
15163       goto widen;
15164     case E_V4HImode:
15165     case E_V8QImode:
15166       if (!mmx_ok)
15167 	return false;
15168       vsimode = V2SImode;
15169       goto widen;
15170     widen:
15171       if (one_var != 0)
15172 	return false;
15173 
15174       /* Zero extend the variable element to SImode and recurse.  */
15175       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15176 
15177       x = gen_reg_rtx (vsimode);
15178       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15179 						var, one_var))
15180 	gcc_unreachable ();
15181 
15182       emit_move_insn (target, gen_lowpart (mode, x));
15183       return true;
15184 
15185     default:
15186       return false;
15187     }
15188 }
15189 
15190 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
15191    consisting of the values in VALS.  It is known that all elements
15192    except ONE_VAR are constants.  Return true if successful.  */
15193 
15194 static bool
15195 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15196 				 rtx target, rtx vals, int one_var)
15197 {
15198   rtx var = XVECEXP (vals, 0, one_var);
15199   machine_mode wmode;
15200   rtx const_vec, x;
15201 
15202   const_vec = copy_rtx (vals);
15203   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15204   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15205 
15206   switch (mode)
15207     {
15208     case E_V2DFmode:
15209     case E_V2DImode:
15210     case E_V2SFmode:
15211     case E_V2SImode:
15212       /* For the two element vectors, it's just as easy to use
15213 	 the general case.  */
15214       return false;
15215 
15216     case E_V4DImode:
15217       /* Use ix86_expand_vector_set in 64bit mode only.  */
15218       if (!TARGET_64BIT)
15219 	return false;
15220       /* FALLTHRU */
15221     case E_V8HFmode:
15222     case E_V16HFmode:
15223     case E_V4DFmode:
15224     case E_V8SFmode:
15225     case E_V8SImode:
15226     case E_V16HImode:
15227     case E_V32QImode:
15228     case E_V4SFmode:
15229     case E_V4SImode:
15230     case E_V8HImode:
15231     case E_V4HImode:
15232       break;
15233 
15234     case E_V16QImode:
15235       if (TARGET_SSE4_1)
15236 	break;
15237       wmode = V8HImode;
15238       goto widen;
15239     case E_V8QImode:
15240       if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15241 	break;
15242       wmode = V4HImode;
15243       goto widen;
15244     case E_V4QImode:
15245       if (TARGET_SSE4_1)
15246 	break;
15247       wmode = V2HImode;
15248     widen:
15249       /* There's no way to set one QImode entry easily.  Combine
15250 	 the variable value with its adjacent constant value, and
15251 	 promote to an HImode set.  */
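      /* E.g. for V16QImode with ONE_VAR == 5, the variable byte is combined
	 with constant byte 4 into HImode element 2 as
	 (VAR << 8) | (CONST & 0xff).  */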
15252       x = XVECEXP (vals, 0, one_var ^ 1);
15253       if (one_var & 1)
15254 	{
15255 	  var = convert_modes (HImode, QImode, var, true);
15256 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15257 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
15258 	  x = GEN_INT (INTVAL (x) & 0xff);
15259 	}
15260       else
15261 	{
15262 	  var = convert_modes (HImode, QImode, var, true);
15263 	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
15264 	}
15265       if (x != const0_rtx)
15266 	var = expand_simple_binop (HImode, IOR, var, x, var,
15267 				   1, OPTAB_LIB_WIDEN);
15268 
15269       x = gen_reg_rtx (wmode);
15270       emit_move_insn (x, gen_lowpart (wmode, const_vec));
15271       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15272 
15273       emit_move_insn (target, gen_lowpart (mode, x));
15274       return true;
15275 
15276     default:
15277       return false;
15278     }
15279 
15280   emit_move_insn (target, const_vec);
15281   ix86_expand_vector_set (mmx_ok, target, var, one_var);
15282   return true;
15283 }
15284 
15285 /* A subroutine of ix86_expand_vector_init_general.  Use vector
15286    concatenate to handle the most general case: all values variable,
15287    and none identical.  */
15288 
15289 static void
15290 ix86_expand_vector_init_concat (machine_mode mode,
15291 				rtx target, rtx *ops, int n)
15292 {
15293   machine_mode half_mode = VOIDmode;
15294   rtx half[2];
15295   rtvec v;
15296   int i, j;
15297 
15298   switch (n)
15299     {
15300     case 2:
15301       switch (mode)
15302 	{
15303 	case E_V32HFmode:
15304 	  half_mode = V16HFmode;
15305 	  break;
15306 	case E_V16SImode:
15307 	  half_mode = V8SImode;
15308 	  break;
15309 	case E_V16SFmode:
15310 	  half_mode = V8SFmode;
15311 	  break;
15312 	case E_V8DImode:
15313 	  half_mode = V4DImode;
15314 	  break;
15315 	case E_V8DFmode:
15316 	  half_mode = V4DFmode;
15317 	  break;
15318 	case E_V16HFmode:
15319 	  half_mode = V8HFmode;
15320 	  break;
15321 	case E_V8SImode:
15322 	  half_mode = V4SImode;
15323 	  break;
15324 	case E_V8SFmode:
15325 	  half_mode = V4SFmode;
15326 	  break;
15327 	case E_V4DImode:
15328 	  half_mode = V2DImode;
15329 	  break;
15330 	case E_V4DFmode:
15331 	  half_mode = V2DFmode;
15332 	  break;
15333 	case E_V4SImode:
15334 	  half_mode = V2SImode;
15335 	  break;
15336 	case E_V4SFmode:
15337 	  half_mode = V2SFmode;
15338 	  break;
15339 	case E_V2DImode:
15340 	  half_mode = DImode;
15341 	  break;
15342 	case E_V2SImode:
15343 	  half_mode = SImode;
15344 	  break;
15345 	case E_V2DFmode:
15346 	  half_mode = DFmode;
15347 	  break;
15348 	case E_V2SFmode:
15349 	  half_mode = SFmode;
15350 	  break;
15351 	default:
15352 	  gcc_unreachable ();
15353 	}
15354 
15355       if (!register_operand (ops[1], half_mode))
15356 	ops[1] = force_reg (half_mode, ops[1]);
15357       if (!register_operand (ops[0], half_mode))
15358 	ops[0] = force_reg (half_mode, ops[0]);
15359       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15360 							  ops[1])));
15361       break;
15362 
15363     case 4:
15364       switch (mode)
15365 	{
15366 	case E_V4DImode:
15367 	  half_mode = V2DImode;
15368 	  break;
15369 	case E_V4DFmode:
15370 	  half_mode = V2DFmode;
15371 	  break;
15372 	case E_V4SImode:
15373 	  half_mode = V2SImode;
15374 	  break;
15375 	case E_V4SFmode:
15376 	  half_mode = V2SFmode;
15377 	  break;
15378 	default:
15379 	  gcc_unreachable ();
15380 	}
15381       goto half;
15382 
15383     case 8:
15384       switch (mode)
15385 	{
15386 	case E_V8DImode:
15387 	  half_mode = V4DImode;
15388 	  break;
15389 	case E_V8DFmode:
15390 	  half_mode = V4DFmode;
15391 	  break;
15392 	case E_V8SImode:
15393 	  half_mode = V4SImode;
15394 	  break;
15395 	case E_V8SFmode:
15396 	  half_mode = V4SFmode;
15397 	  break;
15398 	default:
15399 	  gcc_unreachable ();
15400 	}
15401       goto half;
15402 
15403     case 16:
15404       switch (mode)
15405 	{
15406 	case E_V16SImode:
15407 	  half_mode = V8SImode;
15408 	  break;
15409 	case E_V16SFmode:
15410 	  half_mode = V8SFmode;
15411 	  break;
15412 	default:
15413 	  gcc_unreachable ();
15414 	}
15415       goto half;
15416 
15417 half:
15418       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
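      /* E.g. for N == 8, OPS[4..7] are combined into HALF[1] first and
	 OPS[0..3] into HALF[0], then the two halves are concatenated.  */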
15419       i = n - 1;
15420       for (j = 1; j != -1; j--)
15421 	{
15422 	  half[j] = gen_reg_rtx (half_mode);
15423 	  switch (n >> 1)
15424 	    {
15425 	    case 2:
15426 	      v = gen_rtvec (2, ops[i-1], ops[i]);
15427 	      i -= 2;
15428 	      break;
15429 	    case 4:
15430 	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15431 	      i -= 4;
15432 	      break;
15433 	    case 8:
15434 	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15435 			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
15436 	      i -= 8;
15437 	      break;
15438 	    default:
15439 	      gcc_unreachable ();
15440 	    }
15441 	  ix86_expand_vector_init (false, half[j],
15442 				   gen_rtx_PARALLEL (half_mode, v));
15443 	}
15444 
15445       ix86_expand_vector_init_concat (mode, target, half, 2);
15446       break;
15447 
15448     default:
15449       gcc_unreachable ();
15450     }
15451 }
15452 
15453 /* A subroutine of ix86_expand_vector_init_general.  Use vector
15454    interleave to handle the most general case: all values variable,
15455    and none identical.  */
15456 
15457 static void
15458 ix86_expand_vector_init_interleave (machine_mode mode,
15459 				    rtx target, rtx *ops, int n)
15460 {
15461   machine_mode first_imode, second_imode, third_imode, inner_mode;
15462   int i, j;
15463   rtx op, op0, op1;
15464   rtx (*gen_load_even) (rtx, rtx, rtx);
15465   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15466   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15467 
15468   switch (mode)
15469     {
15470     case E_V8HFmode:
15471       gen_load_even = gen_vec_interleave_lowv8hf;
15472       gen_interleave_first_low = gen_vec_interleave_lowv4si;
15473       gen_interleave_second_low = gen_vec_interleave_lowv2di;
15474       inner_mode = HFmode;
15475       first_imode = V4SImode;
15476       second_imode = V2DImode;
15477       third_imode = VOIDmode;
15478       break;
15479     case E_V8HImode:
15480       gen_load_even = gen_vec_setv8hi;
15481       gen_interleave_first_low = gen_vec_interleave_lowv4si;
15482       gen_interleave_second_low = gen_vec_interleave_lowv2di;
15483       inner_mode = HImode;
15484       first_imode = V4SImode;
15485       second_imode = V2DImode;
15486       third_imode = VOIDmode;
15487       break;
15488     case E_V16QImode:
15489       gen_load_even = gen_vec_setv16qi;
15490       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
15491       gen_interleave_second_low = gen_vec_interleave_lowv4si;
15492       inner_mode = QImode;
15493       first_imode = V8HImode;
15494       second_imode = V4SImode;
15495       third_imode = V2DImode;
15496       break;
15497     default:
15498       gcc_unreachable ();
15499     }
15500 
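  /* Pack the scalar elements pairwise: element 2*I ends up in lane 0 and
     element 2*I + 1 in lane 1 of a fresh vector, which is then viewed as a
     FIRST_IMODE vector.  */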
15501   for (i = 0; i < n; i++)
15502     {
15503       op = ops [i + i];
15504       if (inner_mode == HFmode)
15505 	{
15506 	  rtx even, odd;
15507 	  /* Use vpuncklwd to pack 2 HFmode.  */
15508 	  op0 = gen_reg_rtx (V8HFmode);
15509 	  even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
15510 	  odd = lowpart_subreg (V8HFmode,
15511 				force_reg (HFmode, ops[i + i + 1]),
15512 				HFmode);
15513 	  emit_insn (gen_load_even (op0, even, odd));
15514 	}
15515       else
15516 	{
15517 		  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
15518 	  op0 = gen_reg_rtx (SImode);
15519 	  emit_move_insn (op0, gen_lowpart (SImode, op));
15520 
15521 	  /* Insert the SImode value as low element of V4SImode vector.  */
15522 	  op1 = gen_reg_rtx (V4SImode);
15523 	  op0 = gen_rtx_VEC_MERGE (V4SImode,
15524 				   gen_rtx_VEC_DUPLICATE (V4SImode,
15525 							  op0),
15526 				   CONST0_RTX (V4SImode),
15527 				   const1_rtx);
15528 	  emit_insn (gen_rtx_SET (op1, op0));
15529 
15530 	  /* Cast the V4SImode vector back to a vector in the original mode.  */
15531 	  op0 = gen_reg_rtx (mode);
15532 	  emit_move_insn (op0, gen_lowpart (mode, op1));
15533 
15534 	  /* Load even elements into the second position.  */
15535 	  emit_insn (gen_load_even (op0,
15536 				    force_reg (inner_mode,
15537 					       ops[i + i + 1]),
15538 				    const1_rtx));
15539 	}
15540 
15541       /* Cast vector to FIRST_IMODE vector.  */
15542       ops[i] = gen_reg_rtx (first_imode);
15543       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
15544     }
15545 
15546   /* Interleave low FIRST_IMODE vectors.  */
15547   for (i = j = 0; i < n; i += 2, j++)
15548     {
15549       op0 = gen_reg_rtx (first_imode);
15550       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
15551 
15552       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
15553       ops[j] = gen_reg_rtx (second_imode);
15554       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
15555     }
15556 
15557   /* Interleave low SECOND_IMODE vectors.  */
15558   switch (second_imode)
15559     {
15560     case E_V4SImode:
15561       for (i = j = 0; i < n / 2; i += 2, j++)
15562 	{
15563 	  op0 = gen_reg_rtx (second_imode);
15564 	  emit_insn (gen_interleave_second_low (op0, ops[i],
15565 						ops[i + 1]));
15566 
15567 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
15568 	     vector.  */
15569 	  ops[j] = gen_reg_rtx (third_imode);
15570 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
15571 	}
15572       second_imode = V2DImode;
15573       gen_interleave_second_low = gen_vec_interleave_lowv2di;
15574       /* FALLTHRU */
15575 
15576     case E_V2DImode:
15577       op0 = gen_reg_rtx (second_imode);
15578       emit_insn (gen_interleave_second_low (op0, ops[0],
15579 					    ops[1]));
15580 
15581       /* Cast the SECOND_IMODE vector back to a vector in the original
15582 	 mode.  */
15583       emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
15584       break;
15585 
15586     default:
15587       gcc_unreachable ();
15588     }
15589 }
15590 
15591 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
15592    all values variable, and none identical.  */
15593 
15594 static void
15595 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
15596 				 rtx target, rtx vals)
15597 {
15598   rtx ops[64], op0, op1, op2, op3, op4, op5;
15599   machine_mode half_mode = VOIDmode;
15600   machine_mode quarter_mode = VOIDmode;
15601   int n, i;
15602 
15603   switch (mode)
15604     {
15605     case E_V2SFmode:
15606     case E_V2SImode:
15607       if (!mmx_ok && !TARGET_SSE)
15608 	break;
15609       /* FALLTHRU */
15610 
15611     case E_V16SImode:
15612     case E_V16SFmode:
15613     case E_V8DFmode:
15614     case E_V8DImode:
15615     case E_V8SFmode:
15616     case E_V8SImode:
15617     case E_V4DFmode:
15618     case E_V4DImode:
15619     case E_V4SFmode:
15620     case E_V4SImode:
15621     case E_V2DFmode:
15622     case E_V2DImode:
15623       n = GET_MODE_NUNITS (mode);
15624       for (i = 0; i < n; i++)
15625 	ops[i] = XVECEXP (vals, 0, i);
15626       ix86_expand_vector_init_concat (mode, target, ops, n);
15627       return;
15628 
15629     case E_V2TImode:
15630       for (i = 0; i < 2; i++)
15631 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15632       op0 = gen_reg_rtx (V4DImode);
15633       ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
15634       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15635       return;
15636 
15637     case E_V4TImode:
15638       for (i = 0; i < 4; i++)
15639 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15640       ops[4] = gen_reg_rtx (V4DImode);
15641       ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
15642       ops[5] = gen_reg_rtx (V4DImode);
15643       ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
15644       op0 = gen_reg_rtx (V8DImode);
15645       ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
15646       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15647       return;
15648 
15649     case E_V32QImode:
15650       half_mode = V16QImode;
15651       goto half;
15652 
15653     case E_V16HImode:
15654       half_mode = V8HImode;
15655       goto half;
15656 
15657     case E_V16HFmode:
15658       half_mode = V8HFmode;
15659       goto half;
15660 
15661 half:
15662       n = GET_MODE_NUNITS (mode);
15663       for (i = 0; i < n; i++)
15664 	ops[i] = XVECEXP (vals, 0, i);
15665       op0 = gen_reg_rtx (half_mode);
15666       op1 = gen_reg_rtx (half_mode);
15667       ix86_expand_vector_init_interleave (half_mode, op0, ops,
15668 					  n >> 2);
15669       ix86_expand_vector_init_interleave (half_mode, op1,
15670 					  &ops [n >> 1], n >> 2);
15671       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
15672       return;
15673 
15674     case E_V64QImode:
15675       quarter_mode = V16QImode;
15676       half_mode = V32QImode;
15677       goto quarter;
15678 
15679     case E_V32HImode:
15680       quarter_mode = V8HImode;
15681       half_mode = V16HImode;
15682       goto quarter;
15683 
15684     case E_V32HFmode:
15685       quarter_mode = V8HFmode;
15686       half_mode = V16HFmode;
15687       goto quarter;
15688 
15689 quarter:
15690       n = GET_MODE_NUNITS (mode);
15691       for (i = 0; i < n; i++)
15692 	ops[i] = XVECEXP (vals, 0, i);
15693       op0 = gen_reg_rtx (quarter_mode);
15694       op1 = gen_reg_rtx (quarter_mode);
15695       op2 = gen_reg_rtx (quarter_mode);
15696       op3 = gen_reg_rtx (quarter_mode);
15697       op4 = gen_reg_rtx (half_mode);
15698       op5 = gen_reg_rtx (half_mode);
15699       ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
15700 					  n >> 3);
15701       ix86_expand_vector_init_interleave (quarter_mode, op1,
15702 					  &ops [n >> 2], n >> 3);
15703       ix86_expand_vector_init_interleave (quarter_mode, op2,
15704 					  &ops [n >> 1], n >> 3);
15705       ix86_expand_vector_init_interleave (quarter_mode, op3,
15706 					  &ops [(n >> 1) | (n >> 2)], n >> 3);
15707       emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
15708       emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
15709       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
15710       return;
15711 
15712     case E_V16QImode:
15713       if (!TARGET_SSE4_1)
15714 	break;
15715       /* FALLTHRU */
15716 
15717     case E_V8HImode:
15718       if (!TARGET_SSE2)
15719 	break;
15720 
15721       /* Don't use ix86_expand_vector_init_interleave if we can't
15722 	 move from GPR to SSE register directly.  */
15723       if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
15724 	break;
15725       /* FALLTHRU */
15726 
15727     case E_V8HFmode:
15728 
15729       n = GET_MODE_NUNITS (mode);
15730       for (i = 0; i < n; i++)
15731 	ops[i] = XVECEXP (vals, 0, i);
15732       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
15733       return;
15734 
15735     case E_V4HImode:
15736     case E_V8QImode:
15737 
15738     case E_V2HImode:
15739     case E_V4QImode:
15740       break;
15741 
15742     default:
15743       gcc_unreachable ();
15744     }
15745 
15746     {
15747       int i, j, n_elts, n_words, n_elt_per_word;
15748       machine_mode tmp_mode, inner_mode;
15749       rtx words[4], shift;
15750 
15751       tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
15752 
15753       inner_mode = GET_MODE_INNER (mode);
15754       n_elts = GET_MODE_NUNITS (mode);
15755       n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
15756       n_elt_per_word = n_elts / n_words;
15757       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
15758 
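      /* Build each word by ORing in its elements from the highest index
	 down, so the lowest-indexed element lands in the least significant
	 bits.  */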
15759       for (i = 0; i < n_words; ++i)
15760 	{
15761 	  rtx word = NULL_RTX;
15762 
15763 	  for (j = 0; j < n_elt_per_word; ++j)
15764 	    {
15765 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
15766 	      elt = convert_modes (tmp_mode, inner_mode, elt, true);
15767 
15768 	      if (j == 0)
15769 		word = elt;
15770 	      else
15771 		{
15772 		  word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
15773 					      word, 1, OPTAB_LIB_WIDEN);
15774 		  word = expand_simple_binop (tmp_mode, IOR, word, elt,
15775 					      word, 1, OPTAB_LIB_WIDEN);
15776 		}
15777 	    }
15778 
15779 	  words[i] = word;
15780 	}
15781 
15782       if (n_words == 1)
15783 	emit_move_insn (target, gen_lowpart (mode, words[0]));
15784       else if (n_words == 2)
15785 	{
15786 	  rtx tmp = gen_reg_rtx (mode);
15787 	  emit_clobber (tmp);
15788 	  emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
15789 	  emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
15790 	  emit_move_insn (target, tmp);
15791 	}
15792       else if (n_words == 4)
15793 	{
15794 	  rtx tmp = gen_reg_rtx (V4SImode);
15795 	  gcc_assert (tmp_mode == SImode);
15796 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
15797 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
15798 	  emit_move_insn (target, gen_lowpart (mode, tmp));
15799 	}
15800       else
15801 	gcc_unreachable ();
15802     }
15803 }
15804 
15805 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
15806    instructions unless MMX_OK is true.  */
15807 
15808 void
15809 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
15810 {
15811   machine_mode mode = GET_MODE (target);
15812   machine_mode inner_mode = GET_MODE_INNER (mode);
15813   int n_elts = GET_MODE_NUNITS (mode);
15814   int n_var = 0, one_var = -1;
15815   bool all_same = true, all_const_zero = true;
15816   int i;
15817   rtx x;
15818 
15819   /* Handle first initialization from vector elts.  */
15820   if (n_elts != XVECLEN (vals, 0))
15821     {
15822       rtx subtarget = target;
15823       x = XVECEXP (vals, 0, 0);
15824       gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
15825       if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
15826 	{
15827 	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
15828 	  if (inner_mode == QImode
15829 	      || inner_mode == HImode
15830 	      || inner_mode == TImode
15831 	      || inner_mode == HFmode)
15832 	    {
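	      /* For QI/HI/HF/TI element vectors, view the two halves as
		 same-sized vectors of SImode (DImode for TImode) elements
		 and concatenate those instead.  */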
15833 	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
15834 	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
15835 	      n_bits /= GET_MODE_SIZE (elt_mode);
15836 	      mode = mode_for_vector (elt_mode, n_bits).require ();
15837 	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
15838 	      ops[0] = gen_lowpart (inner_mode, ops[0]);
15839 	      ops[1] = gen_lowpart (inner_mode, ops[1]);
15840 	      subtarget = gen_reg_rtx (mode);
15841 	    }
15842 	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
15843 	  if (subtarget != target)
15844 	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
15845 	  return;
15846 	}
15847       gcc_unreachable ();
15848     }
15849 
15850   for (i = 0; i < n_elts; ++i)
15851     {
15852       x = XVECEXP (vals, 0, i);
15853       if (!(CONST_SCALAR_INT_P (x)
15854 	    || CONST_DOUBLE_P (x)
15855 	    || CONST_FIXED_P (x)))
15856 	n_var++, one_var = i;
15857       else if (x != CONST0_RTX (inner_mode))
15858 	all_const_zero = false;
15859       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
15860 	all_same = false;
15861     }
15862 
15863   /* Constants are best loaded from the constant pool.  */
15864   if (n_var == 0)
15865     {
15866       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
15867       return;
15868     }
15869 
15870   /* If all values are identical, broadcast the value.  */
15871   if (all_same
15872       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
15873 					    XVECEXP (vals, 0, 0)))
15874     return;
15875 
15876   /* Values where only one field is non-constant are best loaded from
15877      the pool and overwritten via move later.  */
15878   if (n_var == 1)
15879     {
15880       if (all_const_zero
15881 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
15882 						  XVECEXP (vals, 0, one_var),
15883 						  one_var))
15884 	return;
15885 
15886       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
15887 	return;
15888     }
15889 
15890   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
15891 }
15892 
15893 /* Implemented as
15894    V setg (V v, int idx, T val)
15895    {
15896      V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
15897      V valv = (V){val, val, val, val, val, val, val, val};
15898      V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
15899      v = (v & ~mask) | (valv & mask);
15900      return v;
15901    }.  */
15902 void
15903 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
15904 {
15905   rtx vec[64];
15906   machine_mode mode = GET_MODE (target);
15907   machine_mode cmp_mode = mode;
15908   int n_elts = GET_MODE_NUNITS (mode);
15909   rtx valv, idxv, constv, idx_tmp;
15910   bool ok = false;
15911 
15912   /* 512-bit vector byte/word broadcast and comparison are only available
15913      under TARGET_AVX512BW; without it, break the 512-bit vector into two
15914      256-bit vectors.  */
15915   if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
15916     {
15917       gcc_assert (TARGET_AVX512F);
15918       rtx vhi, vlo, idx_hi;
15919       machine_mode half_mode;
15920       rtx (*extract_hi)(rtx, rtx);
15921       rtx (*extract_lo)(rtx, rtx);
15922 
15923       if (mode == V32HImode)
15924 	{
15925 	  half_mode = V16HImode;
15926 	  extract_hi = gen_vec_extract_hi_v32hi;
15927 	  extract_lo = gen_vec_extract_lo_v32hi;
15928 	}
15929       else
15930 	{
15931 	  half_mode = V32QImode;
15932 	  extract_hi = gen_vec_extract_hi_v64qi;
15933 	  extract_lo = gen_vec_extract_lo_v64qi;
15934 	}
15935 
15936       vhi = gen_reg_rtx (half_mode);
15937       vlo = gen_reg_rtx (half_mode);
15938       idx_hi = gen_reg_rtx (GET_MODE (idx));
15939       emit_insn (extract_hi (vhi, target));
15940       emit_insn (extract_lo (vlo, target));
15941       vec[0] = idx_hi;
15942       vec[1] = idx;
15943       vec[2] = GEN_INT (n_elts/2);
15944       ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
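      /* Set the element in both halves; in the half whose lane range does
	 not contain IDX the adjusted index is out of range, so the
	 comparison in the recursive call matches no element and that half
	 is left unchanged.  */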
15945       ix86_expand_vector_set_var (vhi, val, idx_hi);
15946       ix86_expand_vector_set_var (vlo, val, idx);
15947       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
15948       return;
15949     }
15950 
15951   if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
15952     {
15953       switch (mode)
15954 	{
15955 	case E_V2DFmode:
15956 	  cmp_mode = V2DImode;
15957 	  break;
15958 	case E_V4DFmode:
15959 	  cmp_mode = V4DImode;
15960 	  break;
15961 	case E_V8DFmode:
15962 	  cmp_mode = V8DImode;
15963 	  break;
15964 	case E_V2SFmode:
15965 	  cmp_mode = V2SImode;
15966 	  break;
15967 	case E_V4SFmode:
15968 	  cmp_mode = V4SImode;
15969 	  break;
15970 	case E_V8SFmode:
15971 	  cmp_mode = V8SImode;
15972 	  break;
15973 	case E_V16SFmode:
15974 	  cmp_mode = V16SImode;
15975 	  break;
15976 	/* TARGET_AVX512FP16 implies TARGET_AVX512BW.  */
15977 	case E_V8HFmode:
15978 	  cmp_mode = V8HImode;
15979 	  break;
15980 	case E_V16HFmode:
15981 	  cmp_mode = V16HImode;
15982 	  break;
15983 	case E_V32HFmode:
15984 	  cmp_mode = V32HImode;
15985 	  break;
15986 	default:
15987 	  gcc_unreachable ();
15988 	}
15989     }
15990 
15991   for (int i = 0; i != n_elts; i++)
15992     vec[i] = GEN_INT (i);
15993   constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
15994   valv = gen_reg_rtx (mode);
15995   idxv = gen_reg_rtx (cmp_mode);
15996   idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
15997 
15998   ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
15999 					  mode, valv, val);
16000   gcc_assert (ok);
16001   ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16002 					  cmp_mode, idxv, idx_tmp);
16003   gcc_assert (ok);
16004   vec[0] = target;
16005   vec[1] = valv;
16006   vec[2] = target;
16007   vec[3] = gen_rtx_EQ (mode, idxv, constv);
16008   vec[4] = idxv;
16009   vec[5] = constv;
16010   ok = ix86_expand_int_vcond (vec);
16011   gcc_assert (ok);
16012 }
16013 
16014 void
16015 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16016 {
16017   machine_mode mode = GET_MODE (target);
16018   machine_mode inner_mode = GET_MODE_INNER (mode);
16019   machine_mode half_mode;
16020   bool use_vec_merge = false;
16021   bool blendm_const = false;
16022   rtx tmp;
16023   static rtx (*gen_extract[7][2]) (rtx, rtx)
16024     = {
16025 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16026 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16027 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16028 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16029 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16030 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16031 	{ gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
16032       };
16033   static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
16034     = {
16035 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16036 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16037 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16038 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16039 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16040 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16041 	{ gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16042       };
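  /* Row J of the two tables above selects the vector mode (0: V32QI,
     1: V16HI, 2: V8SI, 3: V4DI, 4: V8SF, 5: V4DF, 6: V16HF); column I
     selects the low (0) or high (1) half.  */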
16043   int i, j, n;
16044   machine_mode mmode = VOIDmode;
16045   rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16046 
16047   switch (mode)
16048     {
16049     case E_V2SImode:
16050       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16051       if (use_vec_merge)
16052 	break;
16053       /* FALLTHRU */
16054 
16055     case E_V2SFmode:
16056       if (mmx_ok)
16057 	{
16058 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16059 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16060 	  if (elt == 0)
16061 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16062 	  else
16063 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16064 	  emit_insn (gen_rtx_SET (target, tmp));
16065 	  return;
16066 	}
16067       break;
16068 
16069     case E_V2DImode:
16070       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16071       if (use_vec_merge)
16072 	break;
16073 
16074       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16075       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16076       if (elt == 0)
16077 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16078       else
16079 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16080       emit_insn (gen_rtx_SET (target, tmp));
16081       return;
16082 
16083     case E_V2DFmode:
16084       /* NB: For ELT == 0, use standard scalar operation patterns which
16085 	 preserve the rest of the vector for combiner:
16086 
16087 	 (vec_merge:V2DF
16088 	   (vec_duplicate:V2DF (reg:DF))
16089 	   (reg:V2DF)
16090 	   (const_int 1))
16091        */
16092       if (elt == 0)
16093 	goto do_vec_merge;
16094 
16095       {
16096 	rtx op0, op1;
16097 
16098 	/* For the two element vectors, we implement a VEC_CONCAT with
16099 	   the extraction of the other element.  */
16100 
16101 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16102 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16103 
16104 	if (elt == 0)
16105 	  op0 = val, op1 = tmp;
16106 	else
16107 	  op0 = tmp, op1 = val;
16108 
16109 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16110 	emit_insn (gen_rtx_SET (target, tmp));
16111       }
16112       return;
16113 
16114     case E_V4SFmode:
16115       use_vec_merge = TARGET_SSE4_1;
16116       if (use_vec_merge)
16117 	break;
16118 
16119       switch (elt)
16120 	{
16121 	case 0:
16122 	  use_vec_merge = true;
16123 	  break;
16124 
16125 	case 1:
16126 	  /* tmp = target = A B C D */
16127 	  tmp = copy_to_reg (target);
16128 	  /* target = A A B B */
16129 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16130 	  /* target = X A B B */
16131 	  ix86_expand_vector_set (false, target, val, 0);
16132 	  /* target = A X C D  */
16133 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16134 					  const1_rtx, const0_rtx,
16135 					  GEN_INT (2+4), GEN_INT (3+4)));
16136 	  return;
16137 
16138 	case 2:
16139 	  /* tmp = target = A B C D */
16140 	  tmp = copy_to_reg (target);
16141 	  /* tmp = X B C D */
16142 	  ix86_expand_vector_set (false, tmp, val, 0);
16143 	  /* target = A B X D */
16144 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16145 					  const0_rtx, const1_rtx,
16146 					  GEN_INT (0+4), GEN_INT (3+4)));
16147 	  return;
16148 
16149 	case 3:
16150 	  /* tmp = target = A B C D */
16151 	  tmp = copy_to_reg (target);
16152 	  /* tmp = X B C D */
16153 	  ix86_expand_vector_set (false, tmp, val, 0);
16154 	  /* target = A B C X */
16155 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16156 					  const0_rtx, const1_rtx,
16157 					  GEN_INT (2+4), GEN_INT (0+4)));
16158 	  return;
16159 
16160 	default:
16161 	  gcc_unreachable ();
16162 	}
16163       break;
16164 
16165     case E_V4SImode:
16166       use_vec_merge = TARGET_SSE4_1;
16167       if (use_vec_merge)
16168 	break;
16169 
16170       /* Element 0 handled by vec_merge below.  */
16171       if (elt == 0)
16172 	{
16173 	  use_vec_merge = true;
16174 	  break;
16175 	}
16176 
16177       if (TARGET_SSE2)
16178 	{
16179 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
16180 	     store into element 0, then shuffle them back.  */
16181 
16182 	  rtx order[4];
16183 
16184 	  order[0] = GEN_INT (elt);
16185 	  order[1] = const1_rtx;
16186 	  order[2] = const2_rtx;
16187 	  order[3] = GEN_INT (3);
16188 	  order[elt] = const0_rtx;
16189 
16190 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16191 					order[1], order[2], order[3]));
16192 
16193 	  ix86_expand_vector_set (false, target, val, 0);
16194 
16195 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16196 					order[1], order[2], order[3]));
16197 	}
16198       else
16199 	{
16200 	  /* For SSE1, we have to reuse the V4SF code.  */
16201 	  rtx t = gen_reg_rtx (V4SFmode);
16202 	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
16203 	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16204 	  emit_move_insn (target, gen_lowpart (mode, t));
16205 	}
16206       return;
16207 
16208     case E_V8HImode:
16209     case E_V8HFmode:
16210     case E_V2HImode:
16211       use_vec_merge = TARGET_SSE2;
16212       break;
16213     case E_V4HImode:
16214       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16215       break;
16216 
16217     case E_V16QImode:
16218     case E_V4QImode:
16219       use_vec_merge = TARGET_SSE4_1;
16220       break;
16221 
16222     case E_V8QImode:
16223       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16224       break;
16225 
16226     case E_V32QImode:
16227       half_mode = V16QImode;
16228       j = 0;
16229       n = 16;
16230       goto half;
16231 
16232     case E_V16HFmode:
16233       if (TARGET_AVX2)
16234 	{
16235 	  mmode = SImode;
16236 	  gen_blendm = gen_avx2_pblendph;
16237 	  blendm_const = true;
16238 	  break;
16239 	}
16240       else
16241 	{
16242 	  half_mode = V8HFmode;
16243 	  j = 6;
16244 	  n = 8;
16245 	  goto half;
16246 	}
16247 
16248     case E_V16HImode:
16249       half_mode = V8HImode;
16250       j = 1;
16251       n = 8;
16252       goto half;
16253 
16254     case E_V8SImode:
16255       half_mode = V4SImode;
16256       j = 2;
16257       n = 4;
16258       goto half;
16259 
16260     case E_V4DImode:
16261       half_mode = V2DImode;
16262       j = 3;
16263       n = 2;
16264       goto half;
16265 
16266     case E_V8SFmode:
16267       half_mode = V4SFmode;
16268       j = 4;
16269       n = 4;
16270       goto half;
16271 
16272     case E_V4DFmode:
16273       half_mode = V2DFmode;
16274       j = 5;
16275       n = 2;
16276       goto half;
16277 
16278 half:
16279       /* Compute offset.  */
16280       i = elt / n;
16281       elt %= n;
16282 
16283       gcc_assert (i <= 1);
16284 
16285       /* Extract the half.  */
16286       tmp = gen_reg_rtx (half_mode);
16287       emit_insn (gen_extract[j][i] (tmp, target));
16288 
16289       /* Put val in tmp at elt.  */
16290       ix86_expand_vector_set (false, tmp, val, elt);
16291 
16292       /* Put it back.  */
16293       emit_insn (gen_insert[j][i] (target, target, tmp));
16294       return;
16295 
16296     case E_V8DFmode:
16297       if (TARGET_AVX512F)
16298 	{
16299 	  mmode = QImode;
16300 	  gen_blendm = gen_avx512f_blendmv8df;
16301 	}
16302       break;
16303 
16304     case E_V8DImode:
16305       if (TARGET_AVX512F)
16306 	{
16307 	  mmode = QImode;
16308 	  gen_blendm = gen_avx512f_blendmv8di;
16309 	}
16310       break;
16311 
16312     case E_V16SFmode:
16313       if (TARGET_AVX512F)
16314 	{
16315 	  mmode = HImode;
16316 	  gen_blendm = gen_avx512f_blendmv16sf;
16317 	}
16318       break;
16319 
16320     case E_V16SImode:
16321       if (TARGET_AVX512F)
16322 	{
16323 	  mmode = HImode;
16324 	  gen_blendm = gen_avx512f_blendmv16si;
16325 	}
16326       break;
16327 
16328     case E_V32HFmode:
16329       if (TARGET_AVX512BW)
16330 	{
16331 	  mmode = SImode;
16332 	  gen_blendm = gen_avx512bw_blendmv32hf;
16333 	}
16334       break;
16335     case E_V32HImode:
16336       if (TARGET_AVX512BW)
16337 	{
16338 	  mmode = SImode;
16339 	  gen_blendm = gen_avx512bw_blendmv32hi;
16340 	}
16341       else if (TARGET_AVX512F)
16342 	{
16343 	  half_mode = E_V8HImode;
16344 	  n = 8;
16345 	  goto quarter;
16346 	}
16347       break;
16348 
16349     case E_V64QImode:
16350       if (TARGET_AVX512BW)
16351 	{
16352 	  mmode = DImode;
16353 	  gen_blendm = gen_avx512bw_blendmv64qi;
16354 	}
16355       else if (TARGET_AVX512F)
16356 	{
16357 	  half_mode = E_V16QImode;
16358 	  n = 16;
16359 	  goto quarter;
16360 	}
16361       break;
16362 
16363 quarter:
16364       /* Compute offset.  */
16365       i = elt / n;
16366       elt %= n;
16367 
16368       gcc_assert (i <= 3);
16369 
16370       {
16371 	/* Extract the quarter.  */
16372 	tmp = gen_reg_rtx (V4SImode);
16373 	rtx tmp2 = gen_lowpart (V16SImode, target);
16374 	rtx mask = gen_reg_rtx (QImode);
16375 
16376 	emit_move_insn (mask, constm1_rtx);
16377 	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16378 						   tmp, mask));
16379 
16380 	tmp2 = gen_reg_rtx (half_mode);
16381 	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
16382 	tmp = tmp2;
16383 
16384 	/* Put val in tmp at elt.  */
16385 	ix86_expand_vector_set (false, tmp, val, elt);
16386 
16387 	/* Put it back.  */
16388 	tmp2 = gen_reg_rtx (V16SImode);
16389 	rtx tmp3 = gen_lowpart (V16SImode, target);
16390 	mask = gen_reg_rtx (HImode);
16391 	emit_move_insn (mask, constm1_rtx);
16392 	tmp = gen_lowpart (V4SImode, tmp);
16393 	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
16394 						  tmp3, mask));
16395 	emit_move_insn (target, gen_lowpart (mode, tmp2));
16396       }
16397       return;
16398 
16399     default:
16400       break;
16401     }
16402 
16403   if (mmode != VOIDmode)
16404     {
16405       tmp = gen_reg_rtx (mode);
16406       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
16407       rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
16408       /* The avx512*_blendm<mode> expanders have a different operand order
16409 	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
16410 	 elements where the mask is set and the second input operand otherwise;
16411 	 in {sse,avx}*_*blend* the first input operand is used for elements
16412 	 where the mask is clear and the second input operand otherwise.  */
16413       if (!blendm_const)
16414 	merge_mask = force_reg (mmode, merge_mask);
16415       emit_insn (gen_blendm (target, target, tmp, merge_mask));
16416     }
16417   else if (use_vec_merge)
16418     {
16419 do_vec_merge:
16420       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
16421       tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
16422 			       GEN_INT (HOST_WIDE_INT_1U << elt));
16423       emit_insn (gen_rtx_SET (target, tmp));
16424     }
16425   else
16426     {
16427       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16428 
16429       emit_move_insn (mem, target);
16430 
16431       tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
16432       emit_move_insn (tmp, val);
16433 
16434       emit_move_insn (target, mem);
16435     }
16436 }
16437 
16438 void
16439 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
16440 {
16441   machine_mode mode = GET_MODE (vec);
16442   machine_mode inner_mode = GET_MODE_INNER (mode);
16443   bool use_vec_extr = false;
16444   rtx tmp;
16445 
16446   switch (mode)
16447     {
16448     case E_V2SImode:
16449       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16450       if (use_vec_extr)
16451 	break;
16452       /* FALLTHRU */
16453 
16454     case E_V2SFmode:
16455       if (!mmx_ok)
16456 	break;
16457       /* FALLTHRU */
16458 
16459     case E_V2DFmode:
16460     case E_V2DImode:
16461     case E_V2TImode:
16462     case E_V4TImode:
16463       use_vec_extr = true;
16464       break;
16465 
16466     case E_V4SFmode:
16467       use_vec_extr = TARGET_SSE4_1;
16468       if (use_vec_extr)
16469 	break;
16470 
16471       switch (elt)
16472 	{
16473 	case 0:
16474 	  tmp = vec;
16475 	  break;
16476 
16477 	case 1:
16478 	case 3:
16479 	  tmp = gen_reg_rtx (mode);
16480 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
16481 				       GEN_INT (elt), GEN_INT (elt),
16482 				       GEN_INT (elt+4), GEN_INT (elt+4)));
16483 	  break;
16484 
16485 	case 2:
16486 	  tmp = gen_reg_rtx (mode);
16487 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
16488 	  break;
16489 
16490 	default:
16491 	  gcc_unreachable ();
16492 	}
16493       vec = tmp;
16494       use_vec_extr = true;
16495       elt = 0;
16496       break;
16497 
16498     case E_V4SImode:
16499       use_vec_extr = TARGET_SSE4_1;
16500       if (use_vec_extr)
16501 	break;
16502 
16503       if (TARGET_SSE2)
16504 	{
16505 	  switch (elt)
16506 	    {
16507 	    case 0:
16508 	      tmp = vec;
16509 	      break;
16510 
16511 	    case 1:
16512 	    case 3:
16513 	      tmp = gen_reg_rtx (mode);
16514 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
16515 					    GEN_INT (elt), GEN_INT (elt),
16516 					    GEN_INT (elt), GEN_INT (elt)));
16517 	      break;
16518 
16519 	    case 2:
16520 	      tmp = gen_reg_rtx (mode);
16521 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
16522 	      break;
16523 
16524 	    default:
16525 	      gcc_unreachable ();
16526 	    }
16527 	  vec = tmp;
16528 	  use_vec_extr = true;
16529 	  elt = 0;
16530 	}
16531       else
16532 	{
16533 	  /* For SSE1, we have to reuse the V4SF code.  */
16534 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
16535 				      gen_lowpart (V4SFmode, vec), elt);
16536 	  return;
16537 	}
16538       break;
16539 
16540     case E_V8HImode:
16541     case E_V2HImode:
16542       use_vec_extr = TARGET_SSE2;
16543       break;
16544     case E_V4HImode:
16545       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16546       break;
16547 
16548     case E_V16QImode:
16549       use_vec_extr = TARGET_SSE4_1;
16550       if (!use_vec_extr
16551 	  && TARGET_SSE2
16552 	  && elt == 0
16553 	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
16554 	{
16555 	  tmp = gen_reg_rtx (SImode);
16556 	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
16557 				      0);
16558 	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
16559 	  return;
16560 	}
16561       break;
16562     case E_V4QImode:
16563       use_vec_extr = TARGET_SSE4_1;
16564       break;
16565 
16566     case E_V8SFmode:
16567       if (TARGET_AVX)
16568 	{
16569 	  tmp = gen_reg_rtx (V4SFmode);
16570 	  if (elt < 4)
16571 	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
16572 	  else
16573 	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
16574 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
16575 	  return;
16576 	}
16577       break;
16578 
16579     case E_V4DFmode:
16580       if (TARGET_AVX)
16581 	{
16582 	  tmp = gen_reg_rtx (V2DFmode);
16583 	  if (elt < 2)
16584 	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
16585 	  else
16586 	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
16587 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
16588 	  return;
16589 	}
16590       break;
16591 
16592     case E_V32QImode:
16593       if (TARGET_AVX)
16594 	{
16595 	  tmp = gen_reg_rtx (V16QImode);
16596 	  if (elt < 16)
16597 	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
16598 	  else
16599 	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
16600 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
16601 	  return;
16602 	}
16603       break;
16604 
16605     case E_V16HImode:
16606       if (TARGET_AVX)
16607 	{
16608 	  tmp = gen_reg_rtx (V8HImode);
16609 	  if (elt < 8)
16610 	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
16611 	  else
16612 	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
16613 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
16614 	  return;
16615 	}
16616       break;
16617 
16618     case E_V8SImode:
16619       if (TARGET_AVX)
16620 	{
16621 	  tmp = gen_reg_rtx (V4SImode);
16622 	  if (elt < 4)
16623 	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
16624 	  else
16625 	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
16626 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
16627 	  return;
16628 	}
16629       break;
16630 
16631     case E_V4DImode:
16632       if (TARGET_AVX)
16633 	{
16634 	  tmp = gen_reg_rtx (V2DImode);
16635 	  if (elt < 2)
16636 	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
16637 	  else
16638 	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
16639 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
16640 	  return;
16641 	}
16642       break;
16643 
16644     case E_V32HImode:
16645       if (TARGET_AVX512BW)
16646 	{
16647 	  tmp = gen_reg_rtx (V16HImode);
16648 	  if (elt < 16)
16649 	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
16650 	  else
16651 	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
16652 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
16653 	  return;
16654 	}
16655       break;
16656 
16657     case E_V64QImode:
16658       if (TARGET_AVX512BW)
16659 	{
16660 	  tmp = gen_reg_rtx (V32QImode);
16661 	  if (elt < 32)
16662 	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
16663 	  else
16664 	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
16665 	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
16666 	  return;
16667 	}
16668       break;
16669 
16670     case E_V16SFmode:
16671       tmp = gen_reg_rtx (V8SFmode);
16672       if (elt < 8)
16673 	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
16674       else
16675 	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
16676       ix86_expand_vector_extract (false, target, tmp, elt & 7);
16677       return;
16678 
16679     case E_V8DFmode:
16680       tmp = gen_reg_rtx (V4DFmode);
16681       if (elt < 4)
16682 	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
16683       else
16684 	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
16685       ix86_expand_vector_extract (false, target, tmp, elt & 3);
16686       return;
16687 
16688     case E_V16SImode:
16689       tmp = gen_reg_rtx (V8SImode);
16690       if (elt < 8)
16691 	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
16692       else
16693 	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
16694       ix86_expand_vector_extract (false, target, tmp, elt & 7);
16695       return;
16696 
16697     case E_V8DImode:
16698       tmp = gen_reg_rtx (V4DImode);
16699       if (elt < 4)
16700 	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
16701       else
16702 	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
16703       ix86_expand_vector_extract (false, target, tmp, elt & 3);
16704       return;
16705 
16706     case E_V32HFmode:
16707       tmp = gen_reg_rtx (V16HFmode);
16708       if (elt < 16)
16709 	emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
16710       else
16711 	emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
16712       ix86_expand_vector_extract (false, target, tmp, elt & 15);
16713       return;
16714 
16715     case E_V16HFmode:
16716       tmp = gen_reg_rtx (V8HFmode);
16717       if (elt < 8)
16718 	emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
16719       else
16720 	emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
16721       ix86_expand_vector_extract (false, target, tmp, elt & 7);
16722       return;
16723 
16724     case E_V8HFmode:
16725       use_vec_extr = true;
16726       break;
16727 
16728     case E_V8QImode:
16729       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16730       /* ??? Could extract the appropriate HImode element and shift.  */
16731       break;
16732 
16733     default:
16734       break;
16735     }
16736 
16737   if (use_vec_extr)
16738     {
16739       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
16740       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
16741 
16742       /* Let the rtl optimizers know about the zero extension performed.  */
16743       if (inner_mode == QImode || inner_mode == HImode)
16744 	{
16745 	  rtx reg = gen_reg_rtx (SImode);
16746 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
16747 	  emit_move_insn (reg, tmp);
16748 	  tmp = gen_lowpart (inner_mode, reg);
16749 	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
16750 	  SUBREG_PROMOTED_SET (tmp, 1);
16751 	}
16752 
16753       emit_move_insn (target, tmp);
16754     }
16755   else
16756     {
16757       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16758 
16759       emit_move_insn (mem, vec);
16760 
16761       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
16762       emit_move_insn (target, tmp);
16763     }
16764 }
16765 
16766 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
16767    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
16768    The upper bits of DEST are undefined, though they shouldn't cause
16769    exceptions (some bits from src or all zeros are ok).  */
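/* For example (illustrative only): for V4SFmode and i == 128 this uses
   movhlps, which copies elements 2 and 3 of SRC into elements 0 and 1 of
   DEST; for i == 64 it instead uses shufps to copy element 1 of SRC into
   element 0 of DEST.  */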
16770 
16771 static void
16772 emit_reduc_half (rtx dest, rtx src, int i)
16773 {
16774   rtx tem, d = dest;
16775   switch (GET_MODE (src))
16776     {
16777     case E_V4SFmode:
16778       if (i == 128)
16779 	tem = gen_sse_movhlps (dest, src, src);
16780       else
16781 	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
16782 				   GEN_INT (1 + 4), GEN_INT (1 + 4));
16783       break;
16784     case E_V2DFmode:
16785       tem = gen_vec_interleave_highv2df (dest, src, src);
16786       break;
16787     case E_V4QImode:
16788       d = gen_reg_rtx (V1SImode);
16789       tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
16790 			       GEN_INT (i / 2));
16791       break;
16792     case E_V4HImode:
16793       d = gen_reg_rtx (V1DImode);
16794       tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
16795 			       GEN_INT (i / 2));
16796       break;
16797     case E_V16QImode:
16798     case E_V8HImode:
16799     case E_V8HFmode:
16800     case E_V4SImode:
16801     case E_V2DImode:
16802       d = gen_reg_rtx (V1TImode);
16803       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
16804 				GEN_INT (i / 2));
16805       break;
16806     case E_V8SFmode:
16807       if (i == 256)
16808 	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
16809       else
16810 	tem = gen_avx_shufps256 (dest, src, src,
16811 				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
16812       break;
16813     case E_V4DFmode:
16814       if (i == 256)
16815 	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
16816       else
16817 	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
16818       break;
16819     case E_V32QImode:
16820     case E_V16HImode:
16821     case E_V16HFmode:
16822     case E_V8SImode:
16823     case E_V4DImode:
16824       if (i == 256)
16825 	{
16826 	  if (GET_MODE (dest) != V4DImode)
16827 	    d = gen_reg_rtx (V4DImode);
16828 	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
16829 				   gen_lowpart (V4DImode, src),
16830 				   const1_rtx);
16831 	}
16832       else
16833 	{
16834 	  d = gen_reg_rtx (V2TImode);
16835 	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
16836 				    GEN_INT (i / 2));
16837 	}
16838       break;
16839     case E_V64QImode:
16840     case E_V32HImode:
16841     case E_V32HFmode:
16842       if (i < 64)
16843 	{
16844 	  d = gen_reg_rtx (V4TImode);
16845 	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
16846 					GEN_INT (i / 2));
16847 	  break;
16848 	}
16849       /* FALLTHRU */
16850     case E_V16SImode:
16851     case E_V16SFmode:
16852     case E_V8DImode:
16853     case E_V8DFmode:
16854       if (i > 128)
16855 	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
16856 					gen_lowpart (V16SImode, src),
16857 					gen_lowpart (V16SImode, src),
16858 					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
16859 					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
16860 					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
16861 					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
16862 					GEN_INT (0xC), GEN_INT (0xD),
16863 					GEN_INT (0xE), GEN_INT (0xF),
16864 					GEN_INT (0x10), GEN_INT (0x11),
16865 					GEN_INT (0x12), GEN_INT (0x13),
16866 					GEN_INT (0x14), GEN_INT (0x15),
16867 					GEN_INT (0x16), GEN_INT (0x17));
16868       else
16869 	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
16870 				    gen_lowpart (V16SImode, src),
16871 				    GEN_INT (i == 128 ? 0x2 : 0x1),
16872 				    GEN_INT (0x3),
16873 				    GEN_INT (0x3),
16874 				    GEN_INT (0x3),
16875 				    GEN_INT (i == 128 ? 0x6 : 0x5),
16876 				    GEN_INT (0x7),
16877 				    GEN_INT (0x7),
16878 				    GEN_INT (0x7),
16879 				    GEN_INT (i == 128 ? 0xA : 0x9),
16880 				    GEN_INT (0xB),
16881 				    GEN_INT (0xB),
16882 				    GEN_INT (0xB),
16883 				    GEN_INT (i == 128 ? 0xE : 0xD),
16884 				    GEN_INT (0xF),
16885 				    GEN_INT (0xF),
16886 				    GEN_INT (0xF));
16887       break;
16888     default:
16889       gcc_unreachable ();
16890     }
16891   emit_insn (tem);
16892   if (d != dest)
16893     emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
16894 }
16895 
16896 /* Expand a vector reduction.  FN is the binary pattern to reduce;
16897    DEST is the destination; IN is the input vector.  */
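/* Illustrative sketch only (the shuffling itself is done by
   emit_reduc_half above); the loop below performs a log2 reduction:

     for (width = bitsize of IN; width > element bitsize; width /= 2)
       IN = FN (IN, upper half of IN moved down to the lower half);

   so that after the last step element 0 of DEST holds the reduction of
   all elements of the original input (the other elements are
   unspecified).  */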
16898 
16899 void
16900 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
16901 {
16902   rtx half, dst, vec = in;
16903   machine_mode mode = GET_MODE (in);
16904   int i;
16905 
16906   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
16907   if (TARGET_SSE4_1
16908       && mode == V8HImode
16909       && fn == gen_uminv8hi3)
16910     {
16911       emit_insn (gen_sse4_1_phminposuw (dest, in));
16912       return;
16913     }
16914 
16915   for (i = GET_MODE_BITSIZE (mode);
16916        i > GET_MODE_UNIT_BITSIZE (mode);
16917        i >>= 1)
16918     {
16919       half = gen_reg_rtx (mode);
16920       emit_reduc_half (half, vec, i);
16921       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
16922 	dst = dest;
16923       else
16924 	dst = gen_reg_rtx (mode);
16925       emit_insn (fn (dst, half, vec));
16926       vec = dst;
16927     }
16928 }
16929 
16930 /* Output code to perform a conditional jump to LABEL if the C2 flag in
16931    the FP status register is set.  */
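/* Illustrative note (exposition only, based on the usual i387 encoding):
   fnstsw stores the 16-bit FPU status word into %ax, so the C2 flag
   (bit 10 of the status word) is bit 2 of %ah, hence the 0x04 test mask
   used on the non-SAHF path below.  On the SAHF path %ah is copied into
   EFLAGS instead, where C2 lands in PF and is tested via the UNORDERED
   condition.  */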
16932 
16933 void
16934 ix86_emit_fp_unordered_jump (rtx label)
16935 {
16936   rtx reg = gen_reg_rtx (HImode);
16937   rtx_insn *insn;
16938   rtx temp;
16939 
16940   emit_insn (gen_x86_fnstsw_1 (reg));
16941 
16942   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
16943     {
16944       emit_insn (gen_x86_sahf_1 (reg));
16945 
16946       temp = gen_rtx_REG (CCmode, FLAGS_REG);
16947       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
16948     }
16949   else
16950     {
16951       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
16952 
16953       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
16954       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
16955     }
16956 
16957   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
16958 			      gen_rtx_LABEL_REF (VOIDmode, label),
16959 			      pc_rtx);
16960   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
16961   predict_jump (REG_BR_PROB_BASE * 10 / 100);
16962   JUMP_LABEL (insn) = label;
16963 }
16964 
16965 /* Output code to perform a sinh XFmode calculation.  */
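/* Illustrative C sketch of the sequence emitted below (exposition only,
   assuming long double behaves like XFmode):

     long double e = expm1l (fabsl (x));
     long double r = 0.5L * (e / (e + 1.0L) + e);
     return signbit (x) ? -r : r;

   since with e = expm1 (|x|) we have exp (-|x|) = 1 / (e + 1), so
   sinh (|x|) = (exp (|x|) - exp (-|x|)) / 2 = (e + e / (e + 1)) / 2,
   and sinh is odd.  */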
16966 
16967 void ix86_emit_i387_sinh (rtx op0, rtx op1)
16968 {
16969   rtx e1 = gen_reg_rtx (XFmode);
16970   rtx e2 = gen_reg_rtx (XFmode);
16971   rtx scratch = gen_reg_rtx (HImode);
16972   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
16973   rtx half = const_double_from_real_value (dconsthalf, XFmode);
16974   rtx cst1, tmp;
16975   rtx_code_label *jump_label = gen_label_rtx ();
16976   rtx_insn *insn;
16977 
16978   /* scratch = fxam (op1) */
16979   emit_insn (gen_fxamxf2_i387 (scratch, op1));
16980 
16981   /* e1 = expm1 (|op1|) */
16982   emit_insn (gen_absxf2 (e2, op1));
16983   emit_insn (gen_expm1xf2 (e1, e2));
16984 
16985   /* e2 = e1 / (e1 + 1.0) + e1 */
16986   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
16987   emit_insn (gen_addxf3 (e2, e1, cst1));
16988   emit_insn (gen_divxf3 (e2, e1, e2));
16989   emit_insn (gen_addxf3 (e2, e2, e1));
16990 
16991   /* flags = signbit (op1) */
16992   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
16993 
16994   /* if (flags) then e2 = -e2 */
16995   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
16996 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
16997 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
16998 			      pc_rtx);
16999   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17000   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17001   JUMP_LABEL (insn) = jump_label;
17002 
17003   emit_insn (gen_negxf2 (e2, e2));
17004 
17005   emit_label (jump_label);
17006   LABEL_NUSES (jump_label) = 1;
17007 
17008   /* op0 = 0.5 * e2 */
17009   half = force_reg (XFmode, half);
17010   emit_insn (gen_mulxf3 (op0, e2, half));
17011 }
17012 
17013 /* Output code to perform a cosh XFmode calculation.  */
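/* Illustrative C sketch of the sequence emitted below (exposition only,
   assuming long double behaves like XFmode):

     long double e = expl (x);
     return 0.5L * (e + 1.0L / e);

   i.e. cosh (x) = (exp (x) + exp (-x)) / 2.  */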
17014 
17015 void ix86_emit_i387_cosh (rtx op0, rtx op1)
17016 {
17017   rtx e1 = gen_reg_rtx (XFmode);
17018   rtx e2 = gen_reg_rtx (XFmode);
17019   rtx half = const_double_from_real_value (dconsthalf, XFmode);
17020   rtx cst1;
17021 
17022   /* e1 = exp (op1) */
17023   emit_insn (gen_expxf2 (e1, op1));
17024 
17025   /* e2 = e1 + 1.0 / e1 */
17026   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17027   emit_insn (gen_divxf3 (e2, cst1, e1));
17028   emit_insn (gen_addxf3 (e2, e1, e2));
17029 
17030   /* op0 = 0.5 * e2 */
17031   half = force_reg (XFmode, half);
17032   emit_insn (gen_mulxf3 (op0, e2, half));
17033 }
17034 
17035 /* Output code to perform a tanh XFmode calculation.  */
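/* Illustrative C sketch of the sequence emitted below (exposition only,
   assuming long double behaves like XFmode):

     long double e = expm1l (-fabsl (x + x));
     long double r = e / (e + 2.0L);
     return signbit (x) ? r : -r;

   since with e = expm1 (-2|x|) we have
   tanh (|x|) = (1 - exp (-2|x|)) / (1 + exp (-2|x|)) = -e / (e + 2),
   and tanh is odd.  */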
17036 
17037 void ix86_emit_i387_tanh (rtx op0, rtx op1)
17038 {
17039   rtx e1 = gen_reg_rtx (XFmode);
17040   rtx e2 = gen_reg_rtx (XFmode);
17041   rtx scratch = gen_reg_rtx (HImode);
17042   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17043   rtx cst2, tmp;
17044   rtx_code_label *jump_label = gen_label_rtx ();
17045   rtx_insn *insn;
17046 
17047   /* scratch = fxam (op1) */
17048   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17049 
17050   /* e1 = expm1 (-|2 * op1|) */
17051   emit_insn (gen_addxf3 (e2, op1, op1));
17052   emit_insn (gen_absxf2 (e2, e2));
17053   emit_insn (gen_negxf2 (e2, e2));
17054   emit_insn (gen_expm1xf2 (e1, e2));
17055 
17056   /* e2 = e1 / (e1 + 2.0) */
17057   cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17058   emit_insn (gen_addxf3 (e2, e1, cst2));
17059   emit_insn (gen_divxf3 (e2, e1, e2));
17060 
17061   /* flags = signbit (op1) */
17062   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17063 
17064   /* if (!flags) then e2 = -e2 */
17065   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17066 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
17067 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17068 			      pc_rtx);
17069   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17070   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17071   JUMP_LABEL (insn) = jump_label;
17072 
17073   emit_insn (gen_negxf2 (e2, e2));
17074 
17075   emit_label (jump_label);
17076   LABEL_NUSES (jump_label) = 1;
17077 
17078   emit_move_insn (op0, e2);
17079 }
17080 
17081 /* Output code to perform an asinh XFmode calculation.  */
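/* Illustrative C sketch of the sequence emitted below (exposition only,
   assuming long double behaves like XFmode):

     long double x2 = x * x;
     long double t = x2 / (sqrtl (x2 + 1.0L) + 1.0L) + fabsl (x);
     long double r = log1pl (t);
     return signbit (x) ? -r : r;

   using asinh (|x|) = log (|x| + sqrt (x*x + 1))
   = log1p (|x| + x*x / (sqrt (x*x + 1) + 1)), which stays accurate for
   small |x|, and the fact that asinh is odd.  */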
17082 
17083 void ix86_emit_i387_asinh (rtx op0, rtx op1)
17084 {
17085   rtx e1 = gen_reg_rtx (XFmode);
17086   rtx e2 = gen_reg_rtx (XFmode);
17087   rtx scratch = gen_reg_rtx (HImode);
17088   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17089   rtx cst1, tmp;
17090   rtx_code_label *jump_label = gen_label_rtx ();
17091   rtx_insn *insn;
17092 
17093   /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17094   emit_insn (gen_mulxf3 (e1, op1, op1));
17095   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17096   emit_insn (gen_addxf3 (e2, e1, cst1));
17097   emit_insn (gen_sqrtxf2 (e2, e2));
17098   emit_insn (gen_addxf3 (e2, e2, cst1));
17099 
17100   /* e1 = e1 / e2 */
17101   emit_insn (gen_divxf3 (e1, e1, e2));
17102 
17103   /* scratch = fxam (op1) */
17104   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17105 
17106   /* e1 = e1 + |op1| */
17107   emit_insn (gen_absxf2 (e2, op1));
17108   emit_insn (gen_addxf3 (e1, e1, e2));
17109 
17110   /* e2 = log1p (e1) */
17111   ix86_emit_i387_log1p (e2, e1);
17112 
17113   /* flags = signbit (op1) */
17114   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17115 
17116   /* if (flags) then e2 = -e2 */
17117   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17118 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17119 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17120 			      pc_rtx);
17121   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17122   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17123   JUMP_LABEL (insn) = jump_label;
17124 
17125   emit_insn (gen_negxf2 (e2, e2));
17126 
17127   emit_label (jump_label);
17128   LABEL_NUSES (jump_label) = 1;
17129 
17130   emit_move_insn (op0, e2);
17131 }
17132 
17133 /* Output code to perform an acosh XFmode calculation.  */
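/* Illustrative C sketch of the sequence emitted below (exposition only,
   assuming long double behaves like XFmode and x >= 1.0):

     return logl (x + sqrtl (x - 1.0L) * sqrtl (x + 1.0L));

   i.e. acosh (x) = log (x + sqrt (x*x - 1)), with the square root split
   in two so that x*x is never formed and cannot overflow.  */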
17134 
17135 void ix86_emit_i387_acosh (rtx op0, rtx op1)
17136 {
17137   rtx e1 = gen_reg_rtx (XFmode);
17138   rtx e2 = gen_reg_rtx (XFmode);
17139   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17140 
17141   /* e2 = sqrt (op1 + 1.0) */
17142   emit_insn (gen_addxf3 (e2, op1, cst1));
17143   emit_insn (gen_sqrtxf2 (e2, e2));
17144 
17145   /* e1 = sqrt (op1 - 1.0) */
17146   emit_insn (gen_subxf3 (e1, op1, cst1));
17147   emit_insn (gen_sqrtxf2 (e1, e1));
17148 
17149   /* e1 = e1 * e2 */
17150   emit_insn (gen_mulxf3 (e1, e1, e2));
17151 
17152   /* e1 = e1 + op1 */
17153   emit_insn (gen_addxf3 (e1, e1, op1));
17154 
17155   /* op0 = log (e1) */
17156   emit_insn (gen_logxf2 (op0, e1));
17157 }
17158 
17159 /* Output code to perform an atanh XFmode calculation.  */
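/* Illustrative C sketch of the sequence emitted below (exposition only,
   assuming long double behaves like XFmode):

     long double a = fabsl (x);
     long double r = -0.5L * log1pl (-(a + a) / (a + 1.0L));
     return signbit (x) ? -r : r;

   since atanh (|x|) = 0.5 * log ((1 + |x|) / (1 - |x|))
   = -0.5 * log1p (-2|x| / (1 + |x|)), and atanh is odd.  */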
17160 
17161 void ix86_emit_i387_atanh (rtx op0, rtx op1)
17162 {
17163   rtx e1 = gen_reg_rtx (XFmode);
17164   rtx e2 = gen_reg_rtx (XFmode);
17165   rtx scratch = gen_reg_rtx (HImode);
17166   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17167   rtx half = const_double_from_real_value (dconsthalf, XFmode);
17168   rtx cst1, tmp;
17169   rtx_code_label *jump_label = gen_label_rtx ();
17170   rtx_insn *insn;
17171 
17172   /* scratch = fxam (op1) */
17173   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17174 
17175   /* e2 = |op1| */
17176   emit_insn (gen_absxf2 (e2, op1));
17177 
17178   /* e1 = -(e2 + e2) / (e2 + 1.0) */
17179   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17180   emit_insn (gen_addxf3 (e1, e2, cst1));
17181   emit_insn (gen_addxf3 (e2, e2, e2));
17182   emit_insn (gen_negxf2 (e2, e2));
17183   emit_insn (gen_divxf3 (e1, e2, e1));
17184 
17185   /* e2 = log1p (e1) */
17186   ix86_emit_i387_log1p (e2, e1);
17187 
17188   /* flags = signbit (op1) */
17189   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17190 
17191   /* if (!flags) then e2 = -e2 */
17192   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17193 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
17194 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17195 			      pc_rtx);
17196   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17197   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17198   JUMP_LABEL (insn) = jump_label;
17199 
17200   emit_insn (gen_negxf2 (e2, e2));
17201 
17202   emit_label (jump_label);
17203   LABEL_NUSES (jump_label) = 1;
17204 
17205   /* op0 = 0.5 * e2 */
17206   half = force_reg (XFmode, half);
17207   emit_insn (gen_mulxf3 (op0, e2, half));
17208 }
17209 
17210 /* Output code to perform a log1p XFmode calculation.  */
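/* Illustrative note (exposition only): the i387 fyl2xp1 instruction
   computes y * log2 (x + 1) accurately, but only accepts
   |x| < 1 - sqrt(2)/2 ~= 0.2929; larger arguments fall back to fyl2x on
   1 + x.  With y = ln(2) (the fldln2 constant) both paths compute

     log1p (x) = ln(2) * log2 (1 + x).  */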
17211 
17212 void ix86_emit_i387_log1p (rtx op0, rtx op1)
17213 {
17214   rtx_code_label *label1 = gen_label_rtx ();
17215   rtx_code_label *label2 = gen_label_rtx ();
17216 
17217   rtx tmp = gen_reg_rtx (XFmode);
17218   rtx res = gen_reg_rtx (XFmode);
17219   rtx cst, cstln2, cst1;
17220   rtx_insn *insn;
17221 
17222   cst = const_double_from_real_value
17223     (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17224   cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17225 
17226   emit_insn (gen_absxf2 (tmp, op1));
17227 
17228   cst = force_reg (XFmode, cst);
17229   ix86_expand_branch (GE, tmp, cst, label1);
17230   predict_jump (REG_BR_PROB_BASE * 10 / 100);
17231   insn = get_last_insn ();
17232   JUMP_LABEL (insn) = label1;
17233 
17234   emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17235   emit_jump (label2);
17236 
17237   emit_label (label1);
17238   LABEL_NUSES (label1) = 1;
17239 
17240   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17241   emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17242   emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17243 
17244   emit_label (label2);
17245   LABEL_NUSES (label2) = 1;
17246 
17247   emit_move_insn (op0, res);
17248 }
17249 
17250 /* Emit code for round calculation: OP0 = round (OP1).  */
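/* Illustrative note (exposition only): halfway cases are rounded away
   from zero, e.g. round (2.5) -> 3.0 and round (-2.5) -> -3.0, via
   sgn (a) * floor (fabs (a) + 0.5) as spelled out inside the function.  */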
17251 void ix86_emit_i387_round (rtx op0, rtx op1)
17252 {
17253   machine_mode inmode = GET_MODE (op1);
17254   machine_mode outmode = GET_MODE (op0);
17255   rtx e1 = gen_reg_rtx (XFmode);
17256   rtx e2 = gen_reg_rtx (XFmode);
17257   rtx scratch = gen_reg_rtx (HImode);
17258   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17259   rtx half = const_double_from_real_value (dconsthalf, XFmode);
17260   rtx res = gen_reg_rtx (outmode);
17261   rtx_code_label *jump_label = gen_label_rtx ();
17262   rtx (*floor_insn) (rtx, rtx);
17263   rtx (*neg_insn) (rtx, rtx);
17264   rtx_insn *insn;
17265   rtx tmp;
17266 
17267   switch (inmode)
17268     {
17269     case E_SFmode:
17270     case E_DFmode:
17271       tmp = gen_reg_rtx (XFmode);
17272 
17273       emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17274       op1 = tmp;
17275       break;
17276     case E_XFmode:
17277       break;
17278     default:
17279       gcc_unreachable ();
17280     }
17281 
17282   switch (outmode)
17283     {
17284     case E_SFmode:
17285       floor_insn = gen_frndintxf2_floor;
17286       neg_insn = gen_negsf2;
17287       break;
17288     case E_DFmode:
17289       floor_insn = gen_frndintxf2_floor;
17290       neg_insn = gen_negdf2;
17291       break;
17292     case E_XFmode:
17293       floor_insn = gen_frndintxf2_floor;
17294       neg_insn = gen_negxf2;
17295       break;
17296     case E_HImode:
17297       floor_insn = gen_lfloorxfhi2;
17298       neg_insn = gen_neghi2;
17299       break;
17300     case E_SImode:
17301       floor_insn = gen_lfloorxfsi2;
17302       neg_insn = gen_negsi2;
17303       break;
17304     case E_DImode:
17305       floor_insn = gen_lfloorxfdi2;
17306       neg_insn = gen_negdi2;
17307       break;
17308     default:
17309       gcc_unreachable ();
17310     }
17311 
17312   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17313 
17314   /* scratch = fxam(op1) */
17315   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17316 
17317   /* e1 = fabs(op1) */
17318   emit_insn (gen_absxf2 (e1, op1));
17319 
17320   /* e2 = e1 + 0.5 */
17321   half = force_reg (XFmode, half);
17322   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17323 
17324   /* res = floor(e2) */
17325   switch (outmode)
17326     {
17327     case E_SFmode:
17328     case E_DFmode:
17329       {
17330 	tmp = gen_reg_rtx (XFmode);
17331 
17332 	emit_insn (floor_insn (tmp, e2));
17333 	emit_insn (gen_rtx_SET (res,
17334 				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17335 						UNSPEC_TRUNC_NOOP)));
17336       }
17337       break;
17338     default:
17339       emit_insn (floor_insn (res, e2));
17340     }
17341 
17342   /* flags = signbit(a) */
17343   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17344 
17345   /* if (flags) then res = -res */
17346   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17347 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17348 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17349 			      pc_rtx);
17350   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17351   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17352   JUMP_LABEL (insn) = jump_label;
17353 
17354   emit_insn (neg_insn (res, res));
17355 
17356   emit_label (jump_label);
17357   LABEL_NUSES (jump_label) = 1;
17358 
17359   emit_move_insn (op0, res);
17360 }
17361 
17362 /* Output code to perform a Newton-Raphson approximation of a single precision
17363    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
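/* Illustrative derivation (exposition only): one Newton-Raphson step for
   f (x) = 1/x - b refines an estimate x0 of 1/b as

     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)

   which is exactly the e1 - e0 computed below; the final a * x1 then
   approximates a / b.  Each step roughly doubles the number of correct
   bits in the initial rcp/rcp14/rcp28 estimate.  */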
17364 
17365 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
17366 {
17367   rtx x0, x1, e0, e1;
17368 
17369   x0 = gen_reg_rtx (mode);
17370   e0 = gen_reg_rtx (mode);
17371   e1 = gen_reg_rtx (mode);
17372   x1 = gen_reg_rtx (mode);
17373 
17374   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17375 
17376   b = force_reg (mode, b);
17377 
17378   /* x0 = rcp(b) estimate */
17379   if (mode == V16SFmode || mode == V8DFmode)
17380     {
17381       if (TARGET_AVX512ER)
17382 	{
17383 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17384 						      UNSPEC_RCP28)));
17385 	  /* res = a * x0 */
17386 	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
17387 	  return;
17388 	}
17389       else
17390 	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17391 						    UNSPEC_RCP14)));
17392     }
17393   else
17394     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17395 						UNSPEC_RCP)));
17396 
17397   /* e0 = x0 * b */
17398   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
17399 
17400   /* e0 = x0 * e0 */
17401   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
17402 
17403   /* e1 = x0 + x0 */
17404   emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
17405 
17406   /* x1 = e1 - e0 */
17407   emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
17408 
17409   /* res = a * x1 */
17410   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
17411 }
17412 
17413 /* Output code to perform a Newton-Raphson approximation of a
17414    single precision floating point [reciprocal] square root.  */
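/* Illustrative derivation (exposition only): one Newton-Raphson step for
   f (x) = 1/(x*x) - a refines an estimate x0 of 1/sqrt(a) as

     x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3)

   which matches the rsqrt formula quoted inside the function; for sqrt
   the result is additionally multiplied by a (folded into e0 below).  */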
17415 
17416 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
17417 {
17418   rtx x0, e0, e1, e2, e3, mthree, mhalf;
17419   REAL_VALUE_TYPE r;
17420   int unspec;
17421 
17422   x0 = gen_reg_rtx (mode);
17423   e0 = gen_reg_rtx (mode);
17424   e1 = gen_reg_rtx (mode);
17425   e2 = gen_reg_rtx (mode);
17426   e3 = gen_reg_rtx (mode);
17427 
17428   if (TARGET_AVX512ER && mode == V16SFmode)
17429     {
17430       if (recip)
17431 	/* res = rsqrt28(a) estimate */
17432 	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17433 						     UNSPEC_RSQRT28)));
17434       else
17435 	{
17436 	  /* x0 = rsqrt28(a) estimate */
17437 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17438 						      UNSPEC_RSQRT28)));
17439 	  /* res = rcp28(x0) estimate */
17440 	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
17441 						       UNSPEC_RCP28)));
17442 	}
17443       return;
17444     }
17445 
17446   real_from_integer (&r, VOIDmode, -3, SIGNED);
17447   mthree = const_double_from_real_value (r, SFmode);
17448 
17449   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
17450   mhalf = const_double_from_real_value (r, SFmode);
17451   unspec = UNSPEC_RSQRT;
17452 
17453   if (VECTOR_MODE_P (mode))
17454     {
17455       mthree = ix86_build_const_vector (mode, true, mthree);
17456       mhalf = ix86_build_const_vector (mode, true, mhalf);
17457       /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
17458       if (GET_MODE_SIZE (mode) == 64)
17459 	unspec = UNSPEC_RSQRT14;
17460     }
17461 
17462   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
17463      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
17464 
17465   a = force_reg (mode, a);
17466 
17467   /* x0 = rsqrt(a) estimate */
17468   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17469 					      unspec)));
17470 
17471   /* If a == 0.0, zero the Inf estimate to avoid 0 * Inf = NaN for sqrt(0.0).  */
17472   if (!recip)
17473     {
17474       rtx zero = force_reg (mode, CONST0_RTX(mode));
17475       rtx mask;
17476 
17477       /* Handle masked compare.  */
17478       if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
17479 	{
17480 	  mask = gen_reg_rtx (HImode);
17481 	  /* Imm value 0x4 corresponds to not-equal comparison.  */
17482 	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
17483 	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
17484 	}
17485       else
17486 	{
17487 	  mask = gen_reg_rtx (mode);
17488 	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
17489 	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
17490 	}
17491     }
17492 
17493   mthree = force_reg (mode, mthree);
17494 
17495   /* e0 = x0 * a */
17496   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
17497 
17498   unsigned vector_size = GET_MODE_SIZE (mode);
17499   if (TARGET_FMA
17500       || (TARGET_AVX512F && vector_size == 64)
17501       || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
17502     emit_insn (gen_rtx_SET (e2,
17503 			    gen_rtx_FMA (mode, e0, x0, mthree)));
17504   else
17505     {
17506       /* e1 = e0 * x0 */
17507       emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
17508 
17509       /* e2 = e1 - 3. */
17510       emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
17511     }
17512 
17513   mhalf = force_reg (mode, mhalf);
17514   if (recip)
17515     /* e3 = -.5 * x0 */
17516     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
17517   else
17518     /* e3 = -.5 * e0 */
17519     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
17520   /* ret = e2 * e3 */
17521   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
17522 }
17523 
17524 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
17525    mask for masking out the sign-bit is stored in *SMASK, if that is
17526    non-null.  */
17527 
17528 static rtx
17529 ix86_expand_sse_fabs (rtx op0, rtx *smask)
17530 {
17531   machine_mode vmode, mode = GET_MODE (op0);
17532   rtx xa, mask;
17533 
17534   xa = gen_reg_rtx (mode);
17535   if (mode == SFmode)
17536     vmode = V4SFmode;
17537   else if (mode == DFmode)
17538     vmode = V2DFmode;
17539   else
17540     vmode = mode;
17541   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
17542   if (!VECTOR_MODE_P (mode))
17543     {
17544       /* We need to generate a scalar mode mask in this case.  */
17545       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17546       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17547       mask = gen_reg_rtx (mode);
17548       emit_insn (gen_rtx_SET (mask, tmp));
17549     }
17550   emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
17551 
17552   if (smask)
17553     *smask = mask;
17554 
17555   return xa;
17556 }
17557 
17558 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
17559    swapping the operands if SWAP_OPERANDS is true.  The expanded
17560    code is a forward jump to a newly created label in case the
17561    comparison is true.  The generated label rtx is returned.  */
17562 static rtx_code_label *
17563 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
17564                                   bool swap_operands)
17565 {
17566   bool unordered_compare = ix86_unordered_fp_compare (code);
17567   rtx_code_label *label;
17568   rtx tmp, reg;
17569 
17570   if (swap_operands)
17571     std::swap (op0, op1);
17572 
17573   label = gen_label_rtx ();
17574   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
17575   if (unordered_compare)
17576     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
17577   reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
17578   emit_insn (gen_rtx_SET (reg, tmp));
17579   tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
17580   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
17581 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
17582   tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17583   JUMP_LABEL (tmp) = label;
17584 
17585   return label;
17586 }
17587 
17588 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
17589    using comparison code CODE.  Operands are swapped for the comparison if
17590    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
17591 static rtx
17592 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
17593 			      bool swap_operands)
17594 {
17595   rtx (*insn)(rtx, rtx, rtx, rtx);
17596   machine_mode mode = GET_MODE (op0);
17597   rtx mask = gen_reg_rtx (mode);
17598 
17599   if (swap_operands)
17600     std::swap (op0, op1);
17601 
17602   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
17603 
17604   emit_insn (insn (mask, op0, op1,
17605 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
17606   return mask;
17607 }
17608 
17609 /* Expand copysign: combine the sign of SIGN with the positive value
17610    ABS_VALUE and store the result in RESULT.  If MASK is non-null, it
17611    shall be a mask that masks out the sign bit.  */
17612 
17613 static void
17614 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
17615 {
17616   machine_mode mode = GET_MODE (sign);
17617   rtx sgn = gen_reg_rtx (mode);
17618   if (mask == NULL_RTX)
17619     {
17620       machine_mode vmode;
17621 
17622       if (mode == SFmode)
17623 	vmode = V4SFmode;
17624       else if (mode == DFmode)
17625 	vmode = V2DFmode;
17626       else
17627 	vmode = mode;
17628 
17629       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
17630       if (!VECTOR_MODE_P (mode))
17631 	{
17632 	  /* We need to generate a scalar mode mask in this case.  */
17633 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17634 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17635 	  mask = gen_reg_rtx (mode);
17636 	  emit_insn (gen_rtx_SET (mask, tmp));
17637 	}
17638     }
17639   else
17640     mask = gen_rtx_NOT (mode, mask);
17641   emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
17642   emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
17643 }
17644 
17645 /* Expand SSE sequence for computing lround from OP1 storing
17646    into OP0.  */
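/* Illustrative note (exposition only): the addend is
   nextafter (0.5, 0.0), the largest value strictly below 0.5, rather than
   0.5 itself.  Adding a plain 0.5 could round up across an integer
   boundary; e.g. in SFmode the largest float below 0.5 plus 0.5 rounds
   to 1.0, and the conversion would then yield 1 instead of the correct 0.  */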
17647 
17648 void
17649 ix86_expand_lround (rtx op0, rtx op1)
17650 {
17651   /* C code for the stuff we're doing below:
17652 	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
17653 	return (long)tmp;
17654    */
17655   machine_mode mode = GET_MODE (op1);
17656   const struct real_format *fmt;
17657   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
17658   rtx adj;
17659 
17660   /* load nextafter (0.5, 0.0) */
17661   fmt = REAL_MODE_FORMAT (mode);
17662   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
17663   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
17664 
17665   /* adj = copysign (nextafter (0.5, 0.0), op1) */
17666   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
17667   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
17668 
17669   /* adj = op1 + adj */
17670   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
17671 
17672   /* op0 = (imode)adj */
17673   expand_fix (op0, adj, 0);
17674 }
17675 
17676 /* Expand SSE2 sequence for computing lfloor or lceil
17677    from OP1 storing into OP0.  */
17678 
17679 void
17680 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
17681 {
17682   /* C code for the stuff we're doing below (for do_floor):
17683 	xi = (long)op1;
17684 	xi -= (double)xi > op1 ? 1 : 0;
17685 	return xi;
17686    */
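  /* For the ceil case (!do_floor) the compensation is instead
	xi += (double)xi < op1 ? 1 : 0;
     which the code below gets by swapping the comparison operands and
     adding rather than subtracting 1.  */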
17687   machine_mode fmode = GET_MODE (op1);
17688   machine_mode imode = GET_MODE (op0);
17689   rtx ireg, freg, tmp;
17690   rtx_code_label *label;
17691 
17692   /* reg = (long)op1 */
17693   ireg = gen_reg_rtx (imode);
17694   expand_fix (ireg, op1, 0);
17695 
17696   /* freg = (double)reg */
17697   freg = gen_reg_rtx (fmode);
17698   expand_float (freg, ireg, 0);
17699 
17700   /* ireg = (freg > op1) ? ireg - 1 : ireg */
17701   label = ix86_expand_sse_compare_and_jump (UNLE,
17702 					    freg, op1, !do_floor);
17703   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
17704 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
17705   emit_move_insn (ireg, tmp);
17706 
17707   emit_label (label);
17708   LABEL_NUSES (label) = 1;
17709 
17710   emit_move_insn (op0, ireg);
17711 }
17712 
17713 /* Generate and return a rtx of mode MODE for 2**n where n is the number
17714    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
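/* Illustrative note (exposition only): this is 2**52 for DFmode and
   2**23 for SFmode.  At or above that magnitude every representable
   value is already an integer, so the expanders using it can return the
   input unchanged; below it, adding and then subtracting the constant
   forces the fraction bits out of the significand and thereby rounds to
   an integer in the current rounding mode.  */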
17715 
17716 static rtx
17717 ix86_gen_TWO52 (machine_mode mode)
17718 {
17719   const struct real_format *fmt;
17720   REAL_VALUE_TYPE TWO52r;
17721   rtx TWO52;
17722 
17723   fmt = REAL_MODE_FORMAT (mode);
17724   real_2expN (&TWO52r, fmt->p - 1, mode);
17725   TWO52 = const_double_from_real_value (TWO52r, mode);
17726   TWO52 = force_reg (mode, TWO52);
17727 
17728   return TWO52;
17729 }
17730 
17731 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
17732 
17733 void
17734 ix86_expand_rint (rtx operand0, rtx operand1)
17735 {
17736   /* C code for the stuff we're doing below:
17737 	xa = fabs (operand1);
17738 	if (!isless (xa, 2**52))
17739 	  return operand1;
17740 	two52 = 2**52;
17741 	if (flag_rounding_math)
17742 	  {
17743 	    two52 = copysign (two52, operand1);
17744 	    xa = operand1;
17745 	  }
17746 	xa = xa + two52 - two52;
17747 	return copysign (xa, operand1);
17748    */
17749   machine_mode mode = GET_MODE (operand0);
17750   rtx res, xa, TWO52, mask;
17751   rtx_code_label *label;
17752 
17753   TWO52 = ix86_gen_TWO52 (mode);
17754 
17755   /* Temporary for holding the result, initialized to the input
17756      operand to ease control flow.  */
17757   res = copy_to_reg (operand1);
17758 
17759   /* xa = abs (operand1) */
17760   xa = ix86_expand_sse_fabs (res, &mask);
17761 
17762   /* if (!isless (xa, TWO52)) goto label; */
17763   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17764 
17765   if (flag_rounding_math)
17766     {
17767       ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
17768       xa = res;
17769     }
17770 
17771   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17772   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
17773 
17774   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
17775   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
17776     xa = ix86_expand_sse_fabs (xa, NULL);
17777 
17778   ix86_sse_copysign_to_positive (res, xa, res, mask);
17779 
17780   emit_label (label);
17781   LABEL_NUSES (label) = 1;
17782 
17783   emit_move_insn (operand0, res);
17784 }
17785 
17786 /* Expand SSE2 sequence for computing floor or ceil
17787    from OPERAND1 storing into OPERAND0.  */
17788 void
17789 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
17790 {
17791   /* C code for the stuff we expand below.
17792 	double xa = fabs (x), x2;
17793 	if (!isless (xa, TWO52))
17794 	  return x;
17795 	x2 = (double)(long)x;
17796 
17797      Compensate.  Floor:
17798 	if (x2 > x)
17799 	  x2 -= 1;
17800      Compensate.  Ceil:
17801 	if (x2 < x)
17802 	  x2 += 1;
17803 
17804 	if (HONOR_SIGNED_ZEROS (mode))
17805 	  return copysign (x2, x);
17806 	return x2;
17807    */
17808   machine_mode mode = GET_MODE (operand0);
17809   rtx xa, xi, TWO52, tmp, one, res, mask;
17810   rtx_code_label *label;
17811 
17812   TWO52 = ix86_gen_TWO52 (mode);
17813 
17814   /* Temporary for holding the result, initialized to the input
17815      operand to ease control flow.  */
17816   res = copy_to_reg (operand1);
17817 
17818   /* xa = abs (operand1) */
17819   xa = ix86_expand_sse_fabs (res, &mask);
17820 
17821   /* if (!isless (xa, TWO52)) goto label; */
17822   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17823 
17824   /* xa = (double)(long)x */
17825   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
17826   expand_fix (xi, res, 0);
17827   expand_float (xa, xi, 0);
17828 
17829   /* generate 1.0 */
17830   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
17831 
17832   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
17833   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
17834   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
17835   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
17836 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17837   if (HONOR_SIGNED_ZEROS (mode))
17838     {
17839       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
17840       if (do_floor && flag_rounding_math)
17841 	tmp = ix86_expand_sse_fabs (tmp, NULL);
17842 
17843       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
17844     }
17845   emit_move_insn (res, tmp);
17846 
17847   emit_label (label);
17848   LABEL_NUSES (label) = 1;
17849 
17850   emit_move_insn (operand0, res);
17851 }
17852 
17853 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
17854    into OPERAND0 without relying on DImode truncation via cvttsd2siq
17855    that is only available on 64bit targets.  */
17856 void
17857 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
17858 {
17859   /* C code for the stuff we expand below.
17860 	double xa = fabs (x), x2;
17861 	if (!isless (xa, TWO52))
17862 	  return x;
17863 	xa = xa + TWO52 - TWO52;
17864 	x2 = copysign (xa, x);
17865 
17866      Compensate.  Floor:
17867 	if (x2 > x)
17868 	  x2 -= 1;
17869      Compensate.  Ceil:
17870 	if (x2 < x)
17871 	  x2 += 1;
17872 
17873 	if (HONOR_SIGNED_ZEROS (mode))
17874 	  x2 = copysign (x2, x);
17875 	return x2;
17876    */
17877   machine_mode mode = GET_MODE (operand0);
17878   rtx xa, TWO52, tmp, one, res, mask;
17879   rtx_code_label *label;
17880 
17881   TWO52 = ix86_gen_TWO52 (mode);
17882 
17883   /* Temporary for holding the result, initialized to the input
17884      operand to ease control flow.  */
17885   res = copy_to_reg (operand1);
17886 
17887   /* xa = abs (operand1) */
17888   xa = ix86_expand_sse_fabs (res, &mask);
17889 
17890   /* if (!isless (xa, TWO52)) goto label; */
17891   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17892 
17893   /* xa = xa + TWO52 - TWO52; */
17894   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17895   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
17896 
17897   /* xa = copysign (xa, operand1) */
17898   ix86_sse_copysign_to_positive (xa, xa, res, mask);
17899 
17900   /* generate 1.0 */
17901   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
17902 
17903   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
17904   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
17905   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
17906   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
17907 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17908   if (HONOR_SIGNED_ZEROS (mode))
17909     {
17910       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
17911       if (do_floor && flag_rounding_math)
17912 	tmp = ix86_expand_sse_fabs (tmp, NULL);
17913 
17914       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
17915     }
17916   emit_move_insn (res, tmp);
17917 
17918   emit_label (label);
17919   LABEL_NUSES (label) = 1;
17920 
17921   emit_move_insn (operand0, res);
17922 }
17923 
17924 /* Expand SSE sequence for computing trunc
17925    from OPERAND1 storing into OPERAND0.  */
17926 void
17927 ix86_expand_trunc (rtx operand0, rtx operand1)
17928 {
17929   /* C code for SSE variant we expand below.
17930 	double xa = fabs (x), x2;
17931 	if (!isless (xa, TWO52))
17932 	  return x;
17933 	x2 = (double)(long)x;
17934 	if (HONOR_SIGNED_ZEROS (mode))
17935 	  return copysign (x2, x);
17936 	return x2;
17937    */
17938   machine_mode mode = GET_MODE (operand0);
17939   rtx xa, xi, TWO52, res, mask;
17940   rtx_code_label *label;
17941 
17942   TWO52 = ix86_gen_TWO52 (mode);
17943 
17944   /* Temporary for holding the result, initialized to the input
17945      operand to ease control flow.  */
17946   res = copy_to_reg (operand1);
17947 
17948   /* xa = abs (operand1) */
17949   xa = ix86_expand_sse_fabs (res, &mask);
17950 
17951   /* if (!isless (xa, TWO52)) goto label; */
17952   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17953 
17954   /* xa = (double)(long)x */
17955   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
17956   expand_fix (xi, res, 0);
17957   expand_float (xa, xi, 0);
17958 
17959   if (HONOR_SIGNED_ZEROS (mode))
17960     ix86_sse_copysign_to_positive (xa, xa, res, mask);
17961 
17962   emit_move_insn (res, xa);
17963 
17964   emit_label (label);
17965   LABEL_NUSES (label) = 1;
17966 
17967   emit_move_insn (operand0, res);
17968 }
17969 
17970 /* Expand SSE sequence for computing trunc from OPERAND1 storing
17971    into OPERAND0 without relying on DImode truncation via cvttsd2siq
17972    that is only available on 64bit targets.  */
17973 void
17974 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
17975 {
17976   machine_mode mode = GET_MODE (operand0);
17977   rtx xa, xa2, TWO52, tmp, one, res, mask;
17978   rtx_code_label *label;
17979 
17980   /* C code for SSE variant we expand below.
17981 	double xa = fabs (x), x2;
17982 	if (!isless (xa, TWO52))
17983 	  return x;
17984 	xa2 = xa + TWO52 - TWO52;
17985      Compensate:
17986 	if (xa2 > xa)
17987 	  xa2 -= 1.0;
17988 	x2 = copysign (xa2, x);
17989 	return x2;
17990    */
17991 
17992   TWO52 = ix86_gen_TWO52 (mode);
17993 
17994   /* Temporary for holding the result, initialized to the input
17995      operand to ease control flow.  */
17996   res = copy_to_reg (operand1);
17997 
17998   /* xa = abs (operand1) */
17999   xa = ix86_expand_sse_fabs (res, &mask);
18000 
18001   /* if (!isless (xa, TWO52)) goto label; */
18002   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18003 
18004   /* xa2 = xa + TWO52 - TWO52; */
18005   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18006   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18007 
18008   /* generate 1.0 */
18009   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18010 
18011   /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
18012   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18013   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18014   tmp = expand_simple_binop (mode, MINUS,
18015 			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18016   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
18017   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18018     tmp = ix86_expand_sse_fabs (tmp, NULL);
18019 
18020   /* res = copysign (xa2, operand1) */
18021   ix86_sse_copysign_to_positive (res, tmp, res, mask);
18022 
18023   emit_label (label);
18024   LABEL_NUSES (label) = 1;
18025 
18026   emit_move_insn (operand0, res);
18027 }
18028 
18029 /* Expand SSE sequence for computing round
18030    from OPERAND1 storing into OPERAND0.  */
18031 void
18032 ix86_expand_round (rtx operand0, rtx operand1)
18033 {
18034   /* C code for the stuff we're doing below:
18035 	double xa = fabs (x);
18036 	if (!isless (xa, TWO52))
18037 	  return x;
18038 	xa = (double)(long)(xa + nextafter (0.5, 0.0));
18039 	return copysign (xa, x);
18040    */
18041   machine_mode mode = GET_MODE (operand0);
18042   rtx res, TWO52, xa, xi, half, mask;
18043   rtx_code_label *label;
18044   const struct real_format *fmt;
18045   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18046 
18047   /* Temporary for holding the result, initialized to the input
18048      operand to ease control flow.  */
18049   res = copy_to_reg (operand1);
18050 
18051   TWO52 = ix86_gen_TWO52 (mode);
18052   xa = ix86_expand_sse_fabs (res, &mask);
18053   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18054 
18055   /* load nextafter (0.5, 0.0) */
18056   fmt = REAL_MODE_FORMAT (mode);
18057   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18058   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18059 
18060   /* xa = xa + 0.5 */
18061   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18062   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18063 
18064   /* xa = (double)(int64_t)xa */
18065   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18066   expand_fix (xi, xa, 0);
18067   expand_float (xa, xi, 0);
18068 
18069   /* res = copysign (xa, operand1) */
18070   ix86_sse_copysign_to_positive (res, xa, res, mask);
18071 
18072   emit_label (label);
18073   LABEL_NUSES (label) = 1;
18074 
18075   emit_move_insn (operand0, res);
18076 }
18077 
18078 /* Expand SSE sequence for computing round from OPERAND1 storing
18079    into OPERAND0 without relying on DImode truncation via cvttsd2siq
18080    that is only available on 64bit targets.  */
18081 void
18082 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18083 {
18084   /* C code for the stuff we expand below.
18085 	double xa = fabs (x), xa2, x2;
18086 	if (!isless (xa, TWO52))
18087 	  return x;
18088      Using the absolute value and copying back sign makes
18089      -0.0 -> -0.0 correct.
18090 	xa2 = xa + TWO52 - TWO52;
18091      Compensate.
18092 	dxa = xa2 - xa;
18093 	if (dxa <= -0.5)
18094 	  xa2 += 1;
18095 	else if (dxa > 0.5)
18096 	  xa2 -= 1;
18097 	x2 = copysign (xa2, x);
18098 	return x2;
18099    */
18100   machine_mode mode = GET_MODE (operand0);
18101   rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18102   rtx_code_label *label;
18103 
18104   TWO52 = ix86_gen_TWO52 (mode);
18105 
18106   /* Temporary for holding the result, initialized to the input
18107      operand to ease control flow.  */
18108   res = copy_to_reg (operand1);
18109 
18110   /* xa = abs (operand1) */
18111   xa = ix86_expand_sse_fabs (res, &mask);
18112 
18113   /* if (!isless (xa, TWO52)) goto label; */
18114   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18115 
18116   /* xa2 = xa + TWO52 - TWO52; */
18117   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18118   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18119 
18120   /* dxa = xa2 - xa; */
18121   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18122 
18123   /* generate 0.5, 1.0 and -0.5 */
18124   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18125   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18126   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18127 			       0, OPTAB_DIRECT);
18128 
18129   /* Compensate.  */
18130   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18131   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18132   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18133   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18134   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18135   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18136   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18137   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18138 
18139   /* res = copysign (xa2, operand1) */
18140   ix86_sse_copysign_to_positive (res, xa2, res, mask);
18141 
18142   emit_label (label);
18143   LABEL_NUSES (label) = 1;
18144 
18145   emit_move_insn (operand0, res);
18146 }
18147 
18148 /* Expand SSE sequence for computing round
18149    from OP1 storing into OP0 using sse4 round insn.  */
18150 void
18151 ix86_expand_round_sse4 (rtx op0, rtx op1)
18152 {
18153   machine_mode mode = GET_MODE (op0);
18154   rtx e1, e2, res, half;
18155   const struct real_format *fmt;
18156   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18157   rtx (*gen_copysign) (rtx, rtx, rtx);
18158   rtx (*gen_round) (rtx, rtx, rtx);
18159 
18160   switch (mode)
18161     {
18162     case E_SFmode:
18163       gen_copysign = gen_copysignsf3;
18164       gen_round = gen_sse4_1_roundsf2;
18165       break;
18166     case E_DFmode:
18167       gen_copysign = gen_copysigndf3;
18168       gen_round = gen_sse4_1_rounddf2;
18169       break;
18170     default:
18171       gcc_unreachable ();
18172     }
18173 
18174   /* round (a) = trunc (a + copysign (0.5, a)) */
18175 
18176   /* load nextafter (0.5, 0.0) */
18177   fmt = REAL_MODE_FORMAT (mode);
18178   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18179   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18180   half = const_double_from_real_value (pred_half, mode);
18181 
18182   /* e1 = copysign (0.5, op1) */
18183   e1 = gen_reg_rtx (mode);
18184   emit_insn (gen_copysign (e1, half, op1));
18185 
18186   /* e2 = op1 + e1 */
18187   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18188 
18189   /* res = trunc (e2) */
18190   res = gen_reg_rtx (mode);
18191   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18192 
18193   emit_move_insn (op0, res);
18194 }
18195 
18196 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18197    insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18198    insn every time.  */
18199 
18200 static GTY(()) rtx_insn *vselect_insn;
18201 
18202 /* Initialize vselect_insn.  */
18203 
18204 static void
18205 init_vselect_insn (void)
18206 {
18207   unsigned i;
18208   rtx x;
18209 
18210   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18211   for (i = 0; i < MAX_VECT_LEN; ++i)
18212     XVECEXP (x, 0, i) = const0_rtx;
18213   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18214 							const0_rtx), x);
18215   x = gen_rtx_SET (const0_rtx, x);
18216   start_sequence ();
18217   vselect_insn = emit_insn (x);
18218   end_sequence ();
18219 }
18220 
18221 /* Construct (set target (vec_select op0 (parallel perm))) and
18222    return true if that's a valid instruction in the active ISA.  */
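/* Usage sketch (illustrative, not exhaustive): for a V4SImode TARGET and
   OP0 with PERM = { 2, 3, 0, 1 }, the constructed
   (vec_select:V4SI op0 (parallel [2 3 0 1])) is recognized as pshufd on
   SSE2 targets, so it is emitted and true is returned; if no pattern
   matches, nothing is emitted and false is returned.  */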
18223 
18224 static bool
18225 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18226 		unsigned nelt, bool testing_p)
18227 {
18228   unsigned int i;
18229   rtx x, save_vconcat;
18230   int icode;
18231 
18232   if (vselect_insn == NULL_RTX)
18233     init_vselect_insn ();
18234 
18235   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18236   PUT_NUM_ELEM (XVEC (x, 0), nelt);
18237   for (i = 0; i < nelt; ++i)
18238     XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18239   save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18240   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18241   PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18242   SET_DEST (PATTERN (vselect_insn)) = target;
18243   icode = recog_memoized (vselect_insn);
18244 
18245   if (icode >= 0 && !testing_p)
18246     emit_insn (copy_rtx (PATTERN (vselect_insn)));
18247 
18248   SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18249   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18250   INSN_CODE (vselect_insn) = -1;
18251 
18252   return icode >= 0;
18253 }
18254 
18255 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
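/* Illustrative example: for V4SImode operands and PERM = { 0, 4, 1, 5 },
   the constructed
   (vec_select:V4SI (vec_concat:V8SI op0 op1) (parallel [0 4 1 5]))
   matches the interleave-low (punpckldq) pattern.  */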
18256 
18257 static bool
18258 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18259 			const unsigned char *perm, unsigned nelt,
18260 			bool testing_p)
18261 {
18262   machine_mode v2mode;
18263   rtx x;
18264   bool ok;
18265 
18266   if (vselect_insn == NULL_RTX)
18267     init_vselect_insn ();
18268 
18269   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18270     return false;
18271   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18272   PUT_MODE (x, v2mode);
18273   XEXP (x, 0) = op0;
18274   XEXP (x, 1) = op1;
18275   ok = expand_vselect (target, x, perm, nelt, testing_p);
18276   XEXP (x, 0) = const0_rtx;
18277   XEXP (x, 1) = const0_rtx;
18278   return ok;
18279 }
18280 
18281 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18282    using movss or movsd.  */
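/* Illustrative example: for V4SFmode and PERM = { 4, 1, 2, 3 } (element 0
   taken from op1, elements 1-3 from op0), the permutation is exactly what
   movss performs; it is expressed below as a vec_merge with mask 1.  */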
18283 static bool
18284 expand_vec_perm_movs (struct expand_vec_perm_d *d)
18285 {
18286   machine_mode vmode = d->vmode;
18287   unsigned i, nelt = d->nelt;
18288   rtx x;
18289 
18290   if (d->one_operand_p)
18291     return false;
18292 
18293   if (!(TARGET_SSE && vmode == V4SFmode)
18294       && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
18295       && !(TARGET_SSE2 && vmode == V2DFmode))
18296     return false;
18297 
18298   /* Only the first element is changed.  */
18299   if (d->perm[0] != nelt && d->perm[0] != 0)
18300     return false;
18301   for (i = 1; i < nelt; ++i)
18302     if (d->perm[i] != i + nelt - d->perm[0])
18303       return false;
18304 
18305   if (d->testing_p)
18306     return true;
18307 
18308   if (d->perm[0] == nelt)
18309     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18310   else
18311     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18312 
18313   emit_insn (gen_rtx_SET (d->target, x));
18314 
18315   return true;
18316 }
18317 
18318 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18319    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
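/* Illustrative example: for V8HImode and
   PERM = { 0, 9, 2, 11, 4, 13, 6, 15 } every element stays in its
   position and the odd elements come from op1, so the mask computed below
   is 0xaa and the blend maps to pblendw with immediate 0xaa.  */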
18320 
18321 static bool
18322 expand_vec_perm_blend (struct expand_vec_perm_d *d)
18323 {
18324   machine_mode mmode, vmode = d->vmode;
18325   unsigned i, nelt = d->nelt;
18326   unsigned HOST_WIDE_INT mask;
18327   rtx target, op0, op1, maskop, x;
18328   rtx rperm[32], vperm;
18329 
18330   if (d->one_operand_p)
18331     return false;
18332   if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
18333       && (TARGET_AVX512BW
18334 	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
18335     ;
18336   else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18337     ;
18338   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18339     ;
18340   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
18341 			     || GET_MODE_SIZE (vmode) == 8
18342 			     || GET_MODE_SIZE (vmode) == 4))
18343     ;
18344   else
18345     return false;
18346 
18347   /* This is a blend, not a permute.  Elements must stay in their
18348      respective lanes.  */
18349   for (i = 0; i < nelt; ++i)
18350     {
18351       unsigned e = d->perm[i];
18352       if (!(e == i || e == i + nelt))
18353 	return false;
18354     }
18355 
18356   if (d->testing_p)
18357     return true;
18358 
18359   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
18360      decision should be extracted elsewhere, so that we only try that
18361      sequence once all budget==3 options have been tried.  */
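  /* A sketch of that alternative (illustrative only; maskvec stands for a
     hypothetical constant with all-ones in the elements taken from op1):
	tmp0 = op0 & ~maskvec;		(pandn)
	tmp1 = op1 & maskvec;		(pand)
	target = tmp0 | tmp1;		(por)
     That sequence is not emitted here.  */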
18362   target = d->target;
18363   op0 = d->op0;
18364   op1 = d->op1;
18365   mask = 0;
18366 
18367   switch (vmode)
18368     {
18369     case E_V8DFmode:
18370     case E_V16SFmode:
18371     case E_V4DFmode:
18372     case E_V8SFmode:
18373     case E_V2DFmode:
18374     case E_V4SFmode:
18375     case E_V4HImode:
18376     case E_V8HImode:
18377     case E_V8SImode:
18378     case E_V32HImode:
18379     case E_V64QImode:
18380     case E_V16SImode:
18381     case E_V8DImode:
18382       for (i = 0; i < nelt; ++i)
18383 	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
18384       break;
18385 
18386     case E_V2DImode:
18387       for (i = 0; i < 2; ++i)
18388 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
18389       vmode = V8HImode;
18390       goto do_subreg;
18391 
18392     case E_V2SImode:
18393       for (i = 0; i < 2; ++i)
18394 	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
18395       vmode = V4HImode;
18396       goto do_subreg;
18397 
18398     case E_V4SImode:
18399       for (i = 0; i < 4; ++i)
18400 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18401       vmode = V8HImode;
18402       goto do_subreg;
18403 
18404     case E_V16QImode:
18405       /* See if bytes move in pairs so we can use pblendw with
18406 	 an immediate argument, rather than pblendvb with a vector
18407 	 argument.  */
18408       for (i = 0; i < 16; i += 2)
18409 	if (d->perm[i] + 1 != d->perm[i + 1])
18410 	  {
18411 	  use_pblendvb:
18412 	    for (i = 0; i < nelt; ++i)
18413 	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
18414 
18415 	  finish_pblendvb:
18416 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18417 	    vperm = force_reg (vmode, vperm);
18418 
18419 	    if (GET_MODE_SIZE (vmode) == 4)
18420 	      emit_insn (gen_mmx_pblendvb32 (target, op0, op1, vperm));
18421 	    else if (GET_MODE_SIZE (vmode) == 8)
18422 	      emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
18423 	    else if (GET_MODE_SIZE (vmode) == 16)
18424 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
18425 	    else
18426 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
18427 	    if (target != d->target)
18428 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18429 	    return true;
18430 	  }
18431 
18432       for (i = 0; i < 8; ++i)
18433 	mask |= (d->perm[i * 2] >= 16) << i;
18434       vmode = V8HImode;
18435       /* FALLTHRU */
18436 
18437     do_subreg:
18438       target = gen_reg_rtx (vmode);
18439       op0 = gen_lowpart (vmode, op0);
18440       op1 = gen_lowpart (vmode, op1);
18441       break;
18442 
18443     case E_V8QImode:
18444       for (i = 0; i < 8; i += 2)
18445 	if (d->perm[i] + 1 != d->perm[i + 1])
18446 	  goto use_pblendvb;
18447 
18448       for (i = 0; i < 4; ++i)
18449 	mask |= (d->perm[i * 2] >= 8) << i;
18450       vmode = V4HImode;
18451       goto do_subreg;
18452 
18453     case E_V4QImode:
18454       for (i = 0; i < 4; i += 2)
18455 	if (d->perm[i] + 1 != d->perm[i + 1])
18456 	  goto use_pblendvb;
18457 
18458       for (i = 0; i < 2; ++i)
18459 	mask |= (d->perm[i * 2] >= 4) << i;
18460       vmode = V2HImode;
18461       goto do_subreg;
18462 
18463     case E_V32QImode:
18464       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
18465       for (i = 0; i < 32; i += 2)
18466 	if (d->perm[i] + 1 != d->perm[i + 1])
18467 	  goto use_pblendvb;
18468       /* See if bytes move in quadruplets.  If yes, vpblendd
18469 	 with immediate can be used.  */
18470       for (i = 0; i < 32; i += 4)
18471 	if (d->perm[i] + 2 != d->perm[i + 2])
18472 	  break;
18473       if (i < 32)
18474 	{
18475 	  /* See if bytes move the same in both lanes.  If yes,
18476 	     vpblendw with immediate can be used.  */
18477 	  for (i = 0; i < 16; i += 2)
18478 	    if (d->perm[i] + 16 != d->perm[i + 16])
18479 	      goto use_pblendvb;
18480 
18481 	  /* Use vpblendw.  */
18482 	  for (i = 0; i < 16; ++i)
18483 	    mask |= (d->perm[i * 2] >= 32) << i;
18484 	  vmode = V16HImode;
18485 	  goto do_subreg;
18486 	}
18487 
18488       /* Use vpblendd.  */
18489       for (i = 0; i < 8; ++i)
18490 	mask |= (d->perm[i * 4] >= 32) << i;
18491       vmode = V8SImode;
18492       goto do_subreg;
18493 
18494     case E_V16HImode:
18495       /* See if words move in pairs.  If yes, vpblendd can be used.  */
18496       for (i = 0; i < 16; i += 2)
18497 	if (d->perm[i] + 1 != d->perm[i + 1])
18498 	  break;
18499       if (i < 16)
18500 	{
18501 	  /* See if words move the same in both lanes.  If not,
18502 	     vpblendvb must be used.  */
18503 	  for (i = 0; i < 8; i++)
18504 	    if (d->perm[i] + 8 != d->perm[i + 8])
18505 	      {
18506 		/* Use vpblendvb.  */
18507 		for (i = 0; i < 32; ++i)
18508 		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
18509 
18510 		vmode = V32QImode;
18511 		nelt = 32;
18512 		target = gen_reg_rtx (vmode);
18513 		op0 = gen_lowpart (vmode, op0);
18514 		op1 = gen_lowpart (vmode, op1);
18515 		goto finish_pblendvb;
18516 	      }
18517 
18518 	  /* Use vpblendw.  */
18519 	  for (i = 0; i < 16; ++i)
18520 	    mask |= (d->perm[i] >= 16) << i;
18521 	  break;
18522 	}
18523 
18524       /* Use vpblendd.  */
18525       for (i = 0; i < 8; ++i)
18526 	mask |= (d->perm[i * 2] >= 16) << i;
18527       vmode = V8SImode;
18528       goto do_subreg;
18529 
18530     case E_V4DImode:
18531       /* Use vpblendd.  */
18532       for (i = 0; i < 4; ++i)
18533 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18534       vmode = V8SImode;
18535       goto do_subreg;
18536 
18537     default:
18538       gcc_unreachable ();
18539     }
18540 
18541   switch (vmode)
18542     {
18543     case E_V8DFmode:
18544     case E_V8DImode:
18545       mmode = QImode;
18546       break;
18547     case E_V16SFmode:
18548     case E_V16SImode:
18549       mmode = HImode;
18550       break;
18551     case E_V32HImode:
18552       mmode = SImode;
18553       break;
18554     case E_V64QImode:
18555       mmode = DImode;
18556       break;
18557     default:
18558       mmode = VOIDmode;
18559     }
18560 
18561   if (mmode != VOIDmode)
18562     maskop = force_reg (mmode, gen_int_mode (mask, mmode));
18563   else
18564     maskop = GEN_INT (mask);
18565 
18566   /* This matches five different patterns with the different modes.  */
18567   x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
18568   x = gen_rtx_SET (target, x);
18569   emit_insn (x);
18570   if (target != d->target)
18571     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18572 
18573   return true;
18574 }
18575 
18576 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18577    in terms of the variable form of vpermilps.
18578 
18579    Note that we will have already failed the immediate input vpermilps,
18580    which requires that the high and low part shuffle be identical; the
18581    variable form doesn't require that.  */
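/* Illustrative example: the V8SFmode permutation { 0, 1, 2, 3, 7, 6, 5, 4 }
   stays within each 128-bit lane but shuffles the two lanes differently
   (identity in the low lane, reversal in the high lane), so only the
   variable form used below can handle it.  */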
18582 
18583 static bool
18584 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
18585 {
18586   rtx rperm[8], vperm;
18587   unsigned i;
18588 
18589   if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
18590     return false;
18591 
18592   /* We can only permute within the 128-bit lane.  */
18593   for (i = 0; i < 8; ++i)
18594     {
18595       unsigned e = d->perm[i];
18596       if (i < 4 ? e >= 4 : e < 4)
18597 	return false;
18598     }
18599 
18600   if (d->testing_p)
18601     return true;
18602 
18603   for (i = 0; i < 8; ++i)
18604     {
18605       unsigned e = d->perm[i];
18606 
18607       /* Within each 128-bit lane, the elements of op0 are numbered
18608 	 from 0 and the elements of op1 are numbered from 4.  */
18609       if (e >= 8 + 4)
18610 	e -= 8;
18611       else if (e >= 4)
18612 	e -= 4;
18613 
18614       rperm[i] = GEN_INT (e);
18615     }
18616 
18617   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
18618   vperm = force_reg (V8SImode, vperm);
18619   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
18620 
18621   return true;
18622 }
18623 
18624 /* For V*[QHS]Imode permutations, check whether the same permutation
18625    can be performed in a 2x, 4x or 8x wider inner mode.  */
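/* Illustrative example: a V16QImode permutation that swaps bytes within
   each aligned 16-bit unit, e.g. { 2, 3, 0, 1, 6, 7, 4, 5, ... }, moves
   bytes in even/odd pairs and is rewritten as the equivalent V8HImode
   permutation { 1, 0, 3, 2, ... }.  */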
18626 
18627 static bool
18628 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
18629 			      struct expand_vec_perm_d *nd)
18630 {
18631   int i;
18632   machine_mode mode = VOIDmode;
18633 
18634   switch (d->vmode)
18635     {
18636     case E_V8QImode: mode = V4HImode; break;
18637     case E_V16QImode: mode = V8HImode; break;
18638     case E_V32QImode: mode = V16HImode; break;
18639     case E_V64QImode: mode = V32HImode; break;
18640     case E_V4HImode: mode = V2SImode; break;
18641     case E_V8HImode: mode = V4SImode; break;
18642     case E_V16HImode: mode = V8SImode; break;
18643     case E_V32HImode: mode = V16SImode; break;
18644     case E_V4SImode: mode = V2DImode; break;
18645     case E_V8SImode: mode = V4DImode; break;
18646     case E_V16SImode: mode = V8DImode; break;
18647     default: return false;
18648     }
18649   for (i = 0; i < d->nelt; i += 2)
18650     if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
18651       return false;
18652   nd->vmode = mode;
18653   nd->nelt = d->nelt / 2;
18654   for (i = 0; i < nd->nelt; i++)
18655     nd->perm[i] = d->perm[2 * i] / 2;
18656   if (GET_MODE_INNER (mode) != DImode)
18657     canonicalize_vector_int_perm (nd, nd);
18658   if (nd != d)
18659     {
18660       nd->one_operand_p = d->one_operand_p;
18661       nd->testing_p = d->testing_p;
18662       if (d->op0 == d->op1)
18663 	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
18664       else
18665 	{
18666 	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
18667 	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
18668 	}
18669       if (d->testing_p)
18670 	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
18671       else
18672 	nd->target = gen_reg_rtx (nd->vmode);
18673     }
18674   return true;
18675 }
18676 
18677 /* Return true if permutation D can be performed as VMODE permutation
18678    instead.  */
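/* Illustrative example: the V32QImode permutation
   { 16, 17, ..., 31, 0, 1, ..., 15 } moves bytes in whole 16-byte chunks,
   so it is also valid as the V2TImode permutation { 1, 0 } and can be
   handled by vperm2i128.  */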
18679 
18680 static bool
18681 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
18682 {
18683   unsigned int i, j, chunk;
18684 
18685   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
18686       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
18687       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
18688     return false;
18689 
18690   if (GET_MODE_NUNITS (vmode) >= d->nelt)
18691     return true;
18692 
18693   chunk = d->nelt / GET_MODE_NUNITS (vmode);
18694   for (i = 0; i < d->nelt; i += chunk)
18695     if (d->perm[i] & (chunk - 1))
18696       return false;
18697     else
18698       for (j = 1; j < chunk; ++j)
18699 	if (d->perm[i] + j != d->perm[i + j])
18700 	  return false;
18701 
18702   return true;
18703 }
18704 
18705 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18706    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
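/* Illustrative example: a one-operand V16QImode byte reversal,
   PERM = { 15, 14, ..., 0 }, is not covered by the fixed shuffles tried
   earlier, but pshufb with the control vector { 15, 14, ..., 0 } performs
   it in a single instruction on SSSE3 targets.  */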
18707 
18708 static bool
18709 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
18710 {
18711   unsigned i, nelt, eltsz, mask;
18712   unsigned char perm[64];
18713   machine_mode vmode = V16QImode;
18714   struct expand_vec_perm_d nd;
18715   rtx rperm[64], vperm, target, op0, op1;
18716 
18717   nelt = d->nelt;
18718 
18719   if (!d->one_operand_p)
18720     switch (GET_MODE_SIZE (d->vmode))
18721       {
18722       case 4:
18723 	if (!TARGET_XOP)
18724 	  return false;
18725 	vmode = V4QImode;
18726 	break;
18727 
18728       case 8:
18729 	if (!TARGET_XOP)
18730 	  return false;
18731 	vmode = V8QImode;
18732 	break;
18733 
18734       case 16:
18735 	if (!TARGET_XOP)
18736 	  return false;
18737 	break;
18738 
18739       case 32:
18740 	if (!TARGET_AVX2)
18741 	  return false;
18742 
18743 	if (valid_perm_using_mode_p (V2TImode, d))
18744 	  {
18745 	    if (d->testing_p)
18746 	      return true;
18747 
18748 	    /* Use vperm2i128 insn.  The pattern uses
18749 	       V4DImode instead of V2TImode.  */
18750 	    target = d->target;
18751 	    if (d->vmode != V4DImode)
18752 	      target = gen_reg_rtx (V4DImode);
18753 	    op0 = gen_lowpart (V4DImode, d->op0);
18754 	    op1 = gen_lowpart (V4DImode, d->op1);
18755 	    rperm[0]
18756 	      = GEN_INT ((d->perm[0] / (nelt / 2))
18757 			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
18758 	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
18759 	    if (target != d->target)
18760 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18761 	    return true;
18762 	  }
18763 	/* FALLTHRU */
18764 
18765       default:
18766 	return false;
18767       }
18768   else
18769     switch (GET_MODE_SIZE (d->vmode))
18770       {
18771       case 4:
18772 	if (!TARGET_SSSE3)
18773 	  return false;
18774 	vmode = V4QImode;
18775 	break;
18776 
18777       case 8:
18778 	if (!TARGET_SSSE3)
18779 	  return false;
18780 	vmode = V8QImode;
18781 	break;
18782 
18783       case 16:
18784 	if (!TARGET_SSSE3)
18785 	  return false;
18786 	break;
18787 
18788       case 32:
18789 	if (!TARGET_AVX2)
18790 	  return false;
18791 
18792 	/* V4DImode should be already handled through
18793 	   expand_vselect by vpermq instruction.  */
18794 	gcc_assert (d->vmode != V4DImode);
18795 
18796 	vmode = V32QImode;
18797 	if (d->vmode == V8SImode
18798 	    || d->vmode == V16HImode
18799 	    || d->vmode == V32QImode)
18800 	  {
18801 	    /* First see if vpermq can be used for
18802 	       V8SImode/V16HImode/V32QImode.  */
18803 	    if (valid_perm_using_mode_p (V4DImode, d))
18804 	      {
18805 		for (i = 0; i < 4; i++)
18806 		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
18807 		if (d->testing_p)
18808 		  return true;
18809 		target = gen_reg_rtx (V4DImode);
18810 		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
18811 				    perm, 4, false))
18812 		  {
18813 		    emit_move_insn (d->target,
18814 				    gen_lowpart (d->vmode, target));
18815 		    return true;
18816 		  }
18817 		return false;
18818 	      }
18819 
18820 	    /* Next see if vpermd can be used.  */
18821 	    if (valid_perm_using_mode_p (V8SImode, d))
18822 	      vmode = V8SImode;
18823 	  }
18824 	/* Or if vpermps can be used.  */
18825 	else if (d->vmode == V8SFmode)
18826 	  vmode = V8SImode;
18827 
18828 	if (vmode == V32QImode)
18829 	  {
18830 	    /* vpshufb only works within 128-bit lanes; it is not
18831 	       possible to shuffle bytes between the lanes.  */
18832 	    for (i = 0; i < nelt; ++i)
18833 	      if ((d->perm[i] ^ i) & (nelt / 2))
18834 		return false;
18835 	  }
18836 	break;
18837 
18838       case 64:
18839 	if (!TARGET_AVX512BW)
18840 	  return false;
18841 
18842 	/* If vpermq didn't work, vpshufb won't work either.  */
18843 	if (d->vmode == V8DFmode || d->vmode == V8DImode)
18844 	  return false;
18845 
18846 	vmode = V64QImode;
18847 	if (d->vmode == V16SImode
18848 	    || d->vmode == V32HImode
18849 	    || d->vmode == V64QImode)
18850 	  {
18851 	    /* First see if vpermq can be used for
18852 	       V16SImode/V32HImode/V64QImode.  */
18853 	    if (valid_perm_using_mode_p (V8DImode, d))
18854 	      {
18855 		for (i = 0; i < 8; i++)
18856 		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
18857 		if (d->testing_p)
18858 		  return true;
18859 		target = gen_reg_rtx (V8DImode);
18860 		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
18861 				    perm, 8, false))
18862 		  {
18863 		    emit_move_insn (d->target,
18864 				    gen_lowpart (d->vmode, target));
18865 		    return true;
18866 		  }
18867 		return false;
18868 	      }
18869 
18870 	    /* Next see if vpermd can be used.  */
18871 	    if (valid_perm_using_mode_p (V16SImode, d))
18872 	      vmode = V16SImode;
18873 	  }
18874 	/* Or if vpermps can be used.  */
18875 	else if (d->vmode == V16SFmode)
18876 	  vmode = V16SImode;
18877 	if (vmode == V64QImode)
18878 	  {
18879 	    /* vpshufb only works within 128-bit lanes; it is not
18880 	       possible to shuffle bytes between the lanes.  */
18881 	    for (i = 0; i < nelt; ++i)
18882 	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
18883 		return false;
18884 	  }
18885 	break;
18886 
18887       default:
18888 	return false;
18889       }
18890 
18891   if (d->testing_p)
18892     return true;
18893 
18894   /* Try to avoid variable permutation instruction.  */
18895   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
18896     {
18897       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
18898       return true;
18899     }
18900 
18901   if (vmode == V8SImode)
18902     for (i = 0; i < 8; ++i)
18903       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
18904   else if (vmode == V16SImode)
18905     for (i = 0; i < 16; ++i)
18906       rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
18907   else
18908     {
18909       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18910       if (!d->one_operand_p)
18911 	mask = 2 * nelt - 1;
18912       else if (vmode == V64QImode)
18913 	mask = nelt / 4 - 1;
18914       else if (vmode == V32QImode)
18915 	mask = nelt / 2 - 1;
18916       else
18917 	mask = nelt - 1;
18918 
18919       for (i = 0; i < nelt; ++i)
18920 	{
18921 	  unsigned j, e = d->perm[i] & mask;
18922 	  for (j = 0; j < eltsz; ++j)
18923 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
18924 	}
18925     }
18926 
18927   machine_mode vpmode = vmode;
18928 
18929   if (vmode == V4QImode
18930       || vmode == V8QImode)
18931     {
18932       rtx m128 = GEN_INT (-128);
18933 
18934       /* Remap elements from the second operand, as we have to
18935 	 account for inactive top elements from the first operand.  */
18936       if (!d->one_operand_p)
18937 	{
18938 	  int sz = GET_MODE_SIZE (vmode);
18939 
18940 	  for (i = 0; i < nelt; ++i)
18941 	    {
18942 	      int ival = INTVAL (rperm[i]);
18943 	      if (ival >= sz)
18944 		ival += 16 - sz;
18945 	      rperm[i] = GEN_INT (ival);
18946 	    }
18947 	}
18948 
18949       /* V4QI/V8QI is emulated with V16QI instruction, fill inactive
18950 	 elements in the top positions with zeros.  */
18951       for (i = nelt; i < 16; ++i)
18952 	rperm[i] = m128;
18953 
18954       vpmode = V16QImode;
18955     }
18956 
18957   vperm = gen_rtx_CONST_VECTOR (vpmode,
18958 				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
18959   vperm = force_reg (vpmode, vperm);
18960 
18961   if (vmode == d->vmode)
18962     target = d->target;
18963   else
18964     target = gen_reg_rtx (vmode);
18965 
18966   op0 = gen_lowpart (vmode, d->op0);
18967 
18968   if (d->one_operand_p)
18969     {
18970       rtx (*gen) (rtx, rtx, rtx);
18971 
18972       if (vmode == V4QImode)
18973 	gen = gen_mmx_pshufbv4qi3;
18974       else if (vmode == V8QImode)
18975 	gen = gen_mmx_pshufbv8qi3;
18976       else if (vmode == V16QImode)
18977 	gen = gen_ssse3_pshufbv16qi3;
18978       else if (vmode == V32QImode)
18979 	gen = gen_avx2_pshufbv32qi3;
18980       else if (vmode == V64QImode)
18981 	gen = gen_avx512bw_pshufbv64qi3;
18982       else if (vmode == V8SFmode)
18983 	gen = gen_avx2_permvarv8sf;
18984       else if (vmode == V8SImode)
18985 	gen = gen_avx2_permvarv8si;
18986       else if (vmode == V16SFmode)
18987 	gen = gen_avx512f_permvarv16sf;
18988       else if (vmode == V16SImode)
18989 	gen = gen_avx512f_permvarv16si;
18990       else
18991 	gcc_unreachable ();
18992 
18993       emit_insn (gen (target, op0, vperm));
18994     }
18995   else
18996     {
18997       rtx (*gen) (rtx, rtx, rtx, rtx);
18998 
18999       op1 = gen_lowpart (vmode, d->op1);
19000 
19001       if (vmode == V4QImode)
19002 	gen = gen_mmx_ppermv32;
19003       else if (vmode == V8QImode)
19004 	gen = gen_mmx_ppermv64;
19005       else if (vmode == V16QImode)
19006 	gen = gen_xop_pperm;
19007       else
19008 	gcc_unreachable ();
19009 
19010       emit_insn (gen (target, op0, op1, vperm));
19011     }
19012 
19013   if (target != d->target)
19014     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19015 
19016   return true;
19017 }
19018 
19019 /* Try to expand one-operand permutation with constant mask.  */
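/* Illustrative example: a one-operand V32HImode permutation on an
   AVX512BW target is expanded below as vpermw, with the constant selector
   forced into a register as the permutation operand.  */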
19020 
19021 static bool
19022 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19023 {
19024   machine_mode mode = GET_MODE (d->op0);
19025   machine_mode maskmode = mode;
19026   unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19027   rtx (*gen) (rtx, rtx, rtx) = NULL;
19028   rtx target, op0, mask;
19029   rtx vec[64];
19030 
19031   if (!rtx_equal_p (d->op0, d->op1))
19032     return false;
19033 
19034   if (!TARGET_AVX512F)
19035     return false;
19036 
19037   /* Accept VNxHImode and VNxQImode now.  */
19038   if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19039     return false;
19040 
19041   /* vpermw.  */
19042   if (!TARGET_AVX512BW && inner_size == 2)
19043     return false;
19044 
19045   /* vpermb.  */
19046   if (!TARGET_AVX512VBMI && inner_size == 1)
19047     return false;
19048 
19049   switch (mode)
19050     {
19051     case E_V16SImode:
19052       gen = gen_avx512f_permvarv16si;
19053       break;
19054     case E_V16SFmode:
19055       gen = gen_avx512f_permvarv16sf;
19056       maskmode = V16SImode;
19057       break;
19058     case E_V8DImode:
19059       gen = gen_avx512f_permvarv8di;
19060       break;
19061     case E_V8DFmode:
19062       gen = gen_avx512f_permvarv8df;
19063       maskmode = V8DImode;
19064       break;
19065     case E_V32HImode:
19066       gen = gen_avx512bw_permvarv32hi;
19067       break;
19068     case E_V16HImode:
19069       gen = gen_avx512vl_permvarv16hi;
19070       break;
19071     case E_V8HImode:
19072       gen = gen_avx512vl_permvarv8hi;
19073       break;
19074     case E_V64QImode:
19075       gen = gen_avx512bw_permvarv64qi;
19076       break;
19077     case E_V32QImode:
19078       gen = gen_avx512vl_permvarv32qi;
19079       break;
19080     case E_V16QImode:
19081       gen = gen_avx512vl_permvarv16qi;
19082       break;
19083 
19084     default:
19085       return false;
19086     }
19087 
19088   if (d->testing_p)
19089     return true;
19090 
19091   target = d->target;
19092   op0 = d->op0;
19093   for (int i = 0; i < d->nelt; ++i)
19094     vec[i] = GEN_INT (d->perm[i]);
19095   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19096   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19097   return true;
19098 }
19099 
19100 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19101 
19102 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
19103    in a single instruction.  */
19104 
19105 static bool
19106 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19107 {
19108   unsigned i, nelt = d->nelt;
19109   struct expand_vec_perm_d nd;
19110 
19111   /* Check plain VEC_SELECT first, because AVX has instructions that could
19112      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19113      input where SEL+CONCAT may not.  */
19114   if (d->one_operand_p)
19115     {
19116       int mask = nelt - 1;
19117       bool identity_perm = true;
19118       bool broadcast_perm = true;
19119 
19120       for (i = 0; i < nelt; i++)
19121 	{
19122 	  nd.perm[i] = d->perm[i] & mask;
19123 	  if (nd.perm[i] != i)
19124 	    identity_perm = false;
19125 	  if (nd.perm[i])
19126 	    broadcast_perm = false;
19127 	}
19128 
19129       if (identity_perm)
19130 	{
19131 	  if (!d->testing_p)
19132 	    emit_move_insn (d->target, d->op0);
19133 	  return true;
19134 	}
19135       else if (broadcast_perm && TARGET_AVX2)
19136 	{
19137 	  /* Use vpbroadcast{b,w,d}.  */
19138 	  rtx (*gen) (rtx, rtx) = NULL;
19139 	  switch (d->vmode)
19140 	    {
19141 	    case E_V64QImode:
19142 	      if (TARGET_AVX512BW)
19143 		gen = gen_avx512bw_vec_dupv64qi_1;
19144 	      break;
19145 	    case E_V32QImode:
19146 	      gen = gen_avx2_pbroadcastv32qi_1;
19147 	      break;
19148 	    case E_V32HImode:
19149 	      if (TARGET_AVX512BW)
19150 		gen = gen_avx512bw_vec_dupv32hi_1;
19151 	      break;
19152 	    case E_V16HImode:
19153 	      gen = gen_avx2_pbroadcastv16hi_1;
19154 	      break;
19155 	    case E_V16SImode:
19156 	      if (TARGET_AVX512F)
19157 		gen = gen_avx512f_vec_dupv16si_1;
19158 	      break;
19159 	    case E_V8SImode:
19160 	      gen = gen_avx2_pbroadcastv8si_1;
19161 	      break;
19162 	    case E_V16QImode:
19163 	      gen = gen_avx2_pbroadcastv16qi;
19164 	      break;
19165 	    case E_V8HImode:
19166 	      gen = gen_avx2_pbroadcastv8hi;
19167 	      break;
19168 	    case E_V16SFmode:
19169 	      if (TARGET_AVX512F)
19170 		gen = gen_avx512f_vec_dupv16sf_1;
19171 	      break;
19172 	    case E_V8SFmode:
19173 	      gen = gen_avx2_vec_dupv8sf_1;
19174 	      break;
19175 	    case E_V8DFmode:
19176 	      if (TARGET_AVX512F)
19177 		gen = gen_avx512f_vec_dupv8df_1;
19178 	      break;
19179 	    case E_V8DImode:
19180 	      if (TARGET_AVX512F)
19181 		gen = gen_avx512f_vec_dupv8di_1;
19182 	      break;
19183 	    /* For other modes prefer other shuffles this function creates.  */
19184 	    default: break;
19185 	    }
19186 	  if (gen != NULL)
19187 	    {
19188 	      if (!d->testing_p)
19189 		emit_insn (gen (d->target, d->op0));
19190 	      return true;
19191 	    }
19192 	}
19193 
19194       if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19195 	return true;
19196 
19197       /* There are plenty of patterns in sse.md that are written for
19198 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
19199 	 that should be changed, to avoid the nastiness here.  */
19200 
19201       /* Recognize interleave style patterns, which means incrementing
19202 	 every other permutation operand.  */
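      /* E.g. the single-operand permutation { 0, 0, 1, 1 } becomes
	 { 0, nelt, 1, nelt + 1 } on the doubled operand, which matches the
	 interleave-low pattern (punpckldq for V4SImode).  */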
19203       for (i = 0; i < nelt; i += 2)
19204 	{
19205 	  nd.perm[i] = d->perm[i] & mask;
19206 	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19207 	}
19208       if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19209 				  d->testing_p))
19210 	return true;
19211 
19212       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
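      /* E.g. for V4SFmode, { 2, 3, 0, 1 } becomes { 2, 3, nelt, nelt + 1 }
	 on the doubled operand, which matches the shufps pattern with both
	 inputs equal.  */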
19213       if (nelt >= 4)
19214 	{
19215 	  for (i = 0; i < nelt; i += 4)
19216 	    {
19217 	      nd.perm[i + 0] = d->perm[i + 0] & mask;
19218 	      nd.perm[i + 1] = d->perm[i + 1] & mask;
19219 	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19220 	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19221 	    }
19222 
19223 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19224 				      d->testing_p))
19225 	    return true;
19226 	}
19227     }
19228 
19229   /* Try movss/movsd instructions.  */
19230   if (expand_vec_perm_movs (d))
19231     return true;
19232 
19233   /* Finally, try the fully general two operand permute.  */
19234   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19235 			      d->testing_p))
19236     return true;
19237 
19238   /* Recognize interleave style patterns with reversed operands.  */
19239   if (!d->one_operand_p)
19240     {
19241       for (i = 0; i < nelt; ++i)
19242 	{
19243 	  unsigned e = d->perm[i];
19244 	  if (e >= nelt)
19245 	    e -= nelt;
19246 	  else
19247 	    e += nelt;
19248 	  nd.perm[i] = e;
19249 	}
19250 
19251       if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19252 				  d->testing_p))
19253 	return true;
19254     }
19255 
19256   /* Try the SSE4.1 blend variable merge instructions.  */
19257   if (expand_vec_perm_blend (d))
19258     return true;
19259 
19260   /* Try one of the AVX vpermil variable permutations.  */
19261   if (expand_vec_perm_vpermil (d))
19262     return true;
19263 
19264   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19265      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
19266   if (expand_vec_perm_pshufb (d))
19267     return true;
19268 
19269   /* Try the AVX2 vpalignr instruction.  */
19270   if (expand_vec_perm_palignr (d, true))
19271     return true;
19272 
19273   /* Try the AVX512F vperm{w,b,s,d} instructions.  */
19274   if (ix86_expand_vec_one_operand_perm_avx512 (d))
19275     return true;
19276 
19277   /* Try the AVX512F vpermt2/vpermi2 instructions.  */
19278   if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
19279     return true;
19280 
19281   /* See if we can get the same permutation in different vector integer
19282      mode.  */
19283   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19284     {
19285       if (!d->testing_p)
19286 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19287       return true;
19288     }
19289   return false;
19290 }
19291 
19292 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
19293    in terms of a pair of pshuflw + pshufhw instructions.  */
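/* Illustrative example: the V8HImode permutation
   { 3, 2, 1, 0, 5, 4, 7, 6 } keeps the low four and the high four words
   within their own 64-bit halves, so it is expanded as a pshuflw followed
   by a pshufhw on the intermediate result.  */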
19294 
19295 static bool
19296 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
19297 {
19298   unsigned char perm2[MAX_VECT_LEN];
19299   unsigned i;
19300   bool ok;
19301 
19302   if (d->vmode != V8HImode || !d->one_operand_p)
19303     return false;
19304 
19305   /* The two permutations only operate in 64-bit lanes.  */
19306   for (i = 0; i < 4; ++i)
19307     if (d->perm[i] >= 4)
19308       return false;
19309   for (i = 4; i < 8; ++i)
19310     if (d->perm[i] < 4)
19311       return false;
19312 
19313   if (d->testing_p)
19314     return true;
19315 
19316   /* Emit the pshuflw.  */
19317   memcpy (perm2, d->perm, 4);
19318   for (i = 4; i < 8; ++i)
19319     perm2[i] = i;
19320   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
19321   gcc_assert (ok);
19322 
19323   /* Emit the pshufhw.  */
19324   memcpy (perm2 + 4, d->perm + 4, 4);
19325   for (i = 0; i < 4; ++i)
19326     perm2[i] = i;
19327   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
19328   gcc_assert (ok);
19329 
19330   return true;
19331 }
19332 
19333 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
19334    the permutation using the SSSE3 palignr instruction.  This succeeds
19335    when all of the elements in PERM fit within one vector and we merely
19336    need to shift them down so that a single vector permutation has a
19337    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
19338    the vpalignr instruction itself can perform the requested permutation.  */
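/* Illustrative example: the two-operand V16QImode permutation
   { 3, 4, ..., 18 } selects 16 consecutive bytes of the op1:op0
   concatenation, so palignr with a byte shift of 3 performs it directly
   and the remaining single-operand permutation is the identity.  */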
19339 
19340 static bool
19341 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
19342 {
19343   unsigned i, nelt = d->nelt;
19344   unsigned min, max, minswap, maxswap;
19345   bool in_order, ok, swap = false;
19346   rtx shift, target;
19347   struct expand_vec_perm_d dcopy;
19348 
19349   /* Even with AVX, palignr only operates on 128-bit vectors;
19350      with AVX2, palignr operates on both 128-bit lanes.  */
19351   if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
19352       && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
19353     return false;
19354 
19355   min = 2 * nelt;
19356   max = 0;
19357   minswap = 2 * nelt;
19358   maxswap = 0;
19359   for (i = 0; i < nelt; ++i)
19360     {
19361       unsigned e = d->perm[i];
19362       unsigned eswap = d->perm[i] ^ nelt;
19363       if (GET_MODE_SIZE (d->vmode) == 32)
19364 	{
19365 	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
19366 	  eswap = e ^ (nelt / 2);
19367 	}
19368       if (e < min)
19369 	min = e;
19370       if (e > max)
19371 	max = e;
19372       if (eswap < minswap)
19373 	minswap = eswap;
19374       if (eswap > maxswap)
19375 	maxswap = eswap;
19376     }
19377   if (min == 0
19378       || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
19379     {
19380       if (d->one_operand_p
19381 	  || minswap == 0
19382 	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
19383 				   ? nelt / 2 : nelt))
19384 	return false;
19385       swap = true;
19386       min = minswap;
19387       max = maxswap;
19388     }
19389 
19390   /* Given that we have SSSE3, we know we'll be able to implement the
19391      single operand permutation after the palignr with pshufb for
19392      128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
19393      first.  */
19394   if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
19395     return true;
19396 
19397   dcopy = *d;
19398   if (swap)
19399     {
19400       dcopy.op0 = d->op1;
19401       dcopy.op1 = d->op0;
19402       for (i = 0; i < nelt; ++i)
19403 	dcopy.perm[i] ^= nelt;
19404     }
19405 
19406   in_order = true;
19407   for (i = 0; i < nelt; ++i)
19408     {
19409       unsigned e = dcopy.perm[i];
19410       if (GET_MODE_SIZE (d->vmode) == 32
19411 	  && e >= nelt
19412 	  && (e & (nelt / 2 - 1)) < min)
19413 	e = e - min - (nelt / 2);
19414       else
19415 	e = e - min;
19416       if (e != i)
19417 	in_order = false;
19418       dcopy.perm[i] = e;
19419     }
19420   dcopy.one_operand_p = true;
19421 
19422   if (single_insn_only_p && !in_order)
19423     return false;
19424 
19425   /* For AVX2, test whether we can permute the result in one instruction.  */
19426   if (d->testing_p)
19427     {
19428       if (in_order)
19429 	return true;
19430       dcopy.op1 = dcopy.op0;
19431       return expand_vec_perm_1 (&dcopy);
19432     }
19433 
19434   shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
19435   if (GET_MODE_SIZE (d->vmode) == 16)
19436     {
19437       target = gen_reg_rtx (TImode);
19438       emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
19439 				      gen_lowpart (TImode, dcopy.op0), shift));
19440     }
19441   else
19442     {
19443       target = gen_reg_rtx (V2TImode);
19444       emit_insn (gen_avx2_palignrv2ti (target,
19445 				       gen_lowpart (V2TImode, dcopy.op1),
19446 				       gen_lowpart (V2TImode, dcopy.op0),
19447 				       shift));
19448     }
19449 
19450   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
19451 
19452   /* Test for the degenerate case where the alignment by itself
19453      produces the desired permutation.  */
19454   if (in_order)
19455     {
19456       emit_move_insn (d->target, dcopy.op0);
19457       return true;
19458     }
19459 
19460   ok = expand_vec_perm_1 (&dcopy);
19461   gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
19462 
19463   return ok;
19464 }
19465 
19466 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
19467    the permutation using the SSE4_1 pblendv instruction.  Potentially
19468    reduces a 2-pshufb-plus-ior sequence to 1 pshufb and a pblendv.  */
19469 
19470 static bool
19471 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
19472 {
19473   unsigned i, which, nelt = d->nelt;
19474   struct expand_vec_perm_d dcopy, dcopy1;
19475   machine_mode vmode = d->vmode;
19476   bool ok;
19477 
19478   /* Use the same checks as in expand_vec_perm_blend.  */
19479   if (d->one_operand_p)
19480     return false;
19481   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19482     ;
19483   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19484     ;
19485   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
19486 			     || GET_MODE_SIZE (vmode) == 8
19487 			     || GET_MODE_SIZE (vmode) == 16))
19488     ;
19489   else
19490     return false;
19491 
19492   /* Figure out which permutation elements do not stay in their
19493      respective lanes.  */
19494   for (i = 0, which = 0; i < nelt; ++i)
19495     {
19496       unsigned e = d->perm[i];
19497       if (e != i)
19498 	which |= (e < nelt ? 1 : 2);
19499     }
19500   /* We can pblend the part where elements do not stay in their
19501      respective lanes only when these elements all come from one
19502      half of the permutation.
19503      {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
19504      lanes, but both 8 and 9 >= 8.
19505      {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
19506      respective lanes, and 8 >= 8 but 2 is not.  */
19507   if (which != 1 && which != 2)
19508     return false;
19509   if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
19510     return true;
19511 
19512   /* First we apply one operand permutation to the part where
19513      elements stay not in their respective lanes.  */
19514   dcopy = *d;
19515   if (which == 2)
19516     dcopy.op0 = dcopy.op1 = d->op1;
19517   else
19518     dcopy.op0 = dcopy.op1 = d->op0;
19519   if (!d->testing_p)
19520     dcopy.target = gen_reg_rtx (vmode);
19521   dcopy.one_operand_p = true;
19522 
19523   for (i = 0; i < nelt; ++i)
19524     dcopy.perm[i] = d->perm[i] & (nelt - 1);
19525 
19526   ok = expand_vec_perm_1 (&dcopy);
19527   if (GET_MODE_SIZE (vmode) != 16 && !ok)
19528     return false;
19529   else
19530     gcc_assert (ok);
19531   if (d->testing_p)
19532     return true;
19533 
19534   /* Next we put permuted elements into their positions.  */
19535   dcopy1 = *d;
19536   if (which == 2)
19537     dcopy1.op1 = dcopy.target;
19538   else
19539     dcopy1.op0 = dcopy.target;
19540 
19541   for (i = 0; i < nelt; ++i)
19542     dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
19543 
19544   ok = expand_vec_perm_blend (&dcopy1);
19545   gcc_assert (ok);
19546 
19547   return true;
19548 }
19549 
19550 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
19551 
19552 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
19553    a two vector permutation into a single vector permutation by using
19554    an interleave operation to merge the vectors.  */
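/* Illustrative example: a V8HImode permutation that uses only words from
   the low halves of both inputs, e.g. { 1, 9, 0, 8, 3, 11, 2, 10 }, is
   first merged with punpcklwd and then finished with a single one-operand
   shuffle of the merged vector.  */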
19555 
19556 static bool
19557 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
19558 {
19559   struct expand_vec_perm_d dremap, dfinal;
19560   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
19561   unsigned HOST_WIDE_INT contents;
19562   unsigned char remap[2 * MAX_VECT_LEN];
19563   rtx_insn *seq;
19564   bool ok, same_halves = false;
19565 
19566   if (GET_MODE_SIZE (d->vmode) == 4
19567       || GET_MODE_SIZE (d->vmode) == 8
19568       || GET_MODE_SIZE (d->vmode) == 16)
19569     {
19570       if (d->one_operand_p)
19571 	return false;
19572     }
19573   else if (GET_MODE_SIZE (d->vmode) == 32)
19574     {
19575       if (!TARGET_AVX)
19576 	return false;
19577       /* For 32-byte modes allow even d->one_operand_p.
19578 	 The lack of cross-lane shuffling in some instructions
19579 	 might prevent a single insn shuffle.  */
19580       dfinal = *d;
19581       dfinal.testing_p = true;
19582       /* If expand_vec_perm_interleave3 can expand this into
19583 	 a 3 insn sequence, give up and let it be expanded as
19584 	 a 3 insn sequence.  While that is one insn longer,
19585 	 it doesn't need a memory operand, and in the common
19586 	 case where both the interleave low and high permutations
19587 	 with the same operands are adjacent it needs 4 insns
19588 	 for both after CSE.  */
19589       if (expand_vec_perm_interleave3 (&dfinal))
19590 	return false;
19591     }
19592   else
19593     return false;
19594 
19595   /* Examine from whence the elements come.  */
19596   contents = 0;
19597   for (i = 0; i < nelt; ++i)
19598     contents |= HOST_WIDE_INT_1U << d->perm[i];
19599 
19600   memset (remap, 0xff, sizeof (remap));
19601   dremap = *d;
19602 
19603   if (GET_MODE_SIZE (d->vmode) == 4
19604       || GET_MODE_SIZE (d->vmode) == 8)
19605     {
19606       unsigned HOST_WIDE_INT h1, h2, h3, h4;
19607 
19608       /* Split the two input vectors into 4 halves.  */
19609       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19610       h2 = h1 << nelt2;
19611       h3 = h2 << nelt2;
19612       h4 = h3 << nelt2;
19613 
19614       /* If all elements come from the low halves, use interleave low,
19615 	 and similarly for interleave high.  */
19616       if ((contents & (h1 | h3)) == contents)
19617 	{
19618 	  /* punpckl* */
19619 	  for (i = 0; i < nelt2; ++i)
19620 	    {
19621 	      remap[i] = i * 2;
19622 	      remap[i + nelt] = i * 2 + 1;
19623 	      dremap.perm[i * 2] = i;
19624 	      dremap.perm[i * 2 + 1] = i + nelt;
19625 	    }
19626 	}
19627       else if ((contents & (h2 | h4)) == contents)
19628 	{
19629 	  /* punpckh* */
19630 	  for (i = 0; i < nelt2; ++i)
19631 	    {
19632 	      remap[i + nelt2] = i * 2;
19633 	      remap[i + nelt + nelt2] = i * 2 + 1;
19634 	      dremap.perm[i * 2] = i + nelt2;
19635 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19636 	    }
19637 	}
19638       else
19639 	return false;
19640     }
19641   else if (GET_MODE_SIZE (d->vmode) == 16)
19642     {
19643       unsigned HOST_WIDE_INT h1, h2, h3, h4;
19644 
19645       /* Split the two input vectors into 4 halves.  */
19646       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19647       h2 = h1 << nelt2;
19648       h3 = h2 << nelt2;
19649       h4 = h3 << nelt2;
19650 
19651       /* If all elements come from the low halves, use interleave low, and
19652 	 similarly for interleave high.  If the elements are from mis-matched
19653 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
19654       if ((contents & (h1 | h3)) == contents)
19655 	{
19656 	  /* punpckl* */
19657 	  for (i = 0; i < nelt2; ++i)
19658 	    {
19659 	      remap[i] = i * 2;
19660 	      remap[i + nelt] = i * 2 + 1;
19661 	      dremap.perm[i * 2] = i;
19662 	      dremap.perm[i * 2 + 1] = i + nelt;
19663 	    }
19664 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
19665 	    dremap.vmode = V4SFmode;
19666 	}
19667       else if ((contents & (h2 | h4)) == contents)
19668 	{
19669 	  /* punpckh* */
19670 	  for (i = 0; i < nelt2; ++i)
19671 	    {
19672 	      remap[i + nelt2] = i * 2;
19673 	      remap[i + nelt + nelt2] = i * 2 + 1;
19674 	      dremap.perm[i * 2] = i + nelt2;
19675 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19676 	    }
19677 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
19678 	    dremap.vmode = V4SFmode;
19679 	}
19680       else if ((contents & (h1 | h4)) == contents)
19681 	{
19682 	  /* shufps */
19683 	  for (i = 0; i < nelt2; ++i)
19684 	    {
19685 	      remap[i] = i;
19686 	      remap[i + nelt + nelt2] = i + nelt2;
19687 	      dremap.perm[i] = i;
19688 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
19689 	    }
19690 	  if (nelt != 4)
19691 	    {
19692 	      /* shufpd */
19693 	      dremap.vmode = V2DImode;
19694 	      dremap.nelt = 2;
19695 	      dremap.perm[0] = 0;
19696 	      dremap.perm[1] = 3;
19697 	    }
19698 	}
19699       else if ((contents & (h2 | h3)) == contents)
19700 	{
19701 	  /* shufps */
19702 	  for (i = 0; i < nelt2; ++i)
19703 	    {
19704 	      remap[i + nelt2] = i;
19705 	      remap[i + nelt] = i + nelt2;
19706 	      dremap.perm[i] = i + nelt2;
19707 	      dremap.perm[i + nelt2] = i + nelt;
19708 	    }
19709 	  if (nelt != 4)
19710 	    {
19711 	      /* shufpd */
19712 	      dremap.vmode = V2DImode;
19713 	      dremap.nelt = 2;
19714 	      dremap.perm[0] = 1;
19715 	      dremap.perm[1] = 2;
19716 	    }
19717 	}
19718       else
19719 	return false;
19720     }
19721   else
19722     {
19723       unsigned int nelt4 = nelt / 4, nzcnt = 0;
19724       unsigned HOST_WIDE_INT q[8];
19725       unsigned int nonzero_halves[4];
19726 
19727       /* Split the two input vectors into 8 quarters.  */
19728       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
19729       for (i = 1; i < 8; ++i)
19730 	q[i] = q[0] << (nelt4 * i);
19731       for (i = 0; i < 4; ++i)
19732 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
19733 	  {
19734 	    nonzero_halves[nzcnt] = i;
19735 	    ++nzcnt;
19736 	  }
19737 
19738       if (nzcnt == 1)
19739 	{
19740 	  gcc_assert (d->one_operand_p);
19741 	  nonzero_halves[1] = nonzero_halves[0];
19742 	  same_halves = true;
19743 	}
19744       else if (d->one_operand_p)
19745 	{
19746 	  gcc_assert (nonzero_halves[0] == 0);
19747 	  gcc_assert (nonzero_halves[1] == 1);
19748 	}
19749 
19750       if (nzcnt <= 2)
19751 	{
19752 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
19753 	    {
19754 	      /* Attempt to increase the likelihood that dfinal
19755 		 shuffle will be intra-lane.  */
19756 	      std::swap (nonzero_halves[0], nonzero_halves[1]);
19757 	    }
19758 
19759 	  /* vperm2f128 or vperm2i128.  */
19760 	  for (i = 0; i < nelt2; ++i)
19761 	    {
19762 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
19763 	      remap[i + nonzero_halves[0] * nelt2] = i;
19764 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
19765 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
19766 	    }
19767 
19768 	  if (d->vmode != V8SFmode
19769 	      && d->vmode != V4DFmode
19770 	      && d->vmode != V8SImode)
19771 	    {
19772 	      dremap.vmode = V8SImode;
19773 	      dremap.nelt = 8;
19774 	      for (i = 0; i < 4; ++i)
19775 		{
19776 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
19777 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
19778 		}
19779 	    }
19780 	}
19781       else if (d->one_operand_p)
19782 	return false;
19783       else if (TARGET_AVX2
19784 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
19785 	{
19786 	  /* vpunpckl* */
19787 	  for (i = 0; i < nelt4; ++i)
19788 	    {
19789 	      remap[i] = i * 2;
19790 	      remap[i + nelt] = i * 2 + 1;
19791 	      remap[i + nelt2] = i * 2 + nelt2;
19792 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
19793 	      dremap.perm[i * 2] = i;
19794 	      dremap.perm[i * 2 + 1] = i + nelt;
19795 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
19796 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
19797 	    }
19798 	}
19799       else if (TARGET_AVX2
19800 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
19801 	{
19802 	  /* vpunpckh* */
19803 	  for (i = 0; i < nelt4; ++i)
19804 	    {
19805 	      remap[i + nelt4] = i * 2;
19806 	      remap[i + nelt + nelt4] = i * 2 + 1;
19807 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
19808 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
19809 	      dremap.perm[i * 2] = i + nelt4;
19810 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
19811 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
19812 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
19813 	    }
19814 	}
19815       else
19816 	return false;
19817     }
19818 
19819   /* Use the remapping array set up above to move the elements from their
19820      swizzled locations into their final destinations.  */
19821   dfinal = *d;
19822   for (i = 0; i < nelt; ++i)
19823     {
19824       unsigned e = remap[d->perm[i]];
19825       gcc_assert (e < nelt);
19826       /* If same_halves is true, both halves of the remapped vector are the
19827 	 same.  Avoid cross-lane accesses if possible.  */
19828       if (same_halves && i >= nelt2)
19829 	{
19830 	  gcc_assert (e < nelt2);
19831 	  dfinal.perm[i] = e + nelt2;
19832 	}
19833       else
19834 	dfinal.perm[i] = e;
19835     }
19836   if (!d->testing_p)
19837     {
19838       dremap.target = gen_reg_rtx (dremap.vmode);
19839       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
19840     }
19841   dfinal.op1 = dfinal.op0;
19842   dfinal.one_operand_p = true;
19843 
19844   /* Test if the final remap can be done with a single insn.  For V4SFmode or
19845      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
19846   start_sequence ();
19847   ok = expand_vec_perm_1 (&dfinal);
19848   seq = get_insns ();
19849   end_sequence ();
19850 
19851   if (!ok)
19852     return false;
19853 
19854   if (d->testing_p)
19855     return true;
19856 
19857   if (dremap.vmode != dfinal.vmode)
19858     {
19859       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
19860       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
19861     }
19862 
19863   ok = expand_vec_perm_1 (&dremap);
19864   gcc_assert (ok);
19865 
19866   emit_insn (seq);
19867   return true;
19868 }
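
/* An illustrative sketch of the two-step decomposition above, written as a
   hypothetical standalone helper over plain index arrays (assumption:
   remap[] and dremap_perm[] form an inverse pair, as constructed above).
   dfinal selects from the swizzled vector, so dfinal composed with dremap
   must reproduce the requested permutation.  */
#if 0
#include <assert.h>

static void
sketch_compose_remap (const unsigned *perm, const unsigned *remap,
                      const unsigned *dremap_perm, unsigned nelt,
                      unsigned *dfinal_perm)
{
  for (unsigned i = 0; i < nelt; ++i)
    {
      /* Element perm[i] now lives at position remap[perm[i]].  */
      dfinal_perm[i] = remap[perm[i]];
      /* Applying dremap first and then dfinal yields perm[i] again.  */
      assert (dremap_perm[dfinal_perm[i]] == perm[i]);
    }
}
#endif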
19869 
19870 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
19871    a single-vector cross-lane permutation into vpermq followed
19872    by any of the single-insn permutations.  */
19873 
19874 static bool
19875 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
19876 {
19877   struct expand_vec_perm_d dremap, dfinal;
19878   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
19879   unsigned contents[2];
19880   bool ok;
19881 
19882   if (!(TARGET_AVX2
19883 	&& (d->vmode == V32QImode || d->vmode == V16HImode)
19884 	&& d->one_operand_p))
19885     return false;
19886 
19887   contents[0] = 0;
19888   contents[1] = 0;
19889   for (i = 0; i < nelt2; ++i)
19890     {
19891       contents[0] |= 1u << (d->perm[i] / nelt4);
19892       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
19893     }
19894 
19895   for (i = 0; i < 2; ++i)
19896     {
19897       unsigned int cnt = 0;
19898       for (j = 0; j < 4; ++j)
19899 	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
19900 	  return false;
19901     }
19902 
19903   if (d->testing_p)
19904     return true;
19905 
19906   dremap = *d;
19907   dremap.vmode = V4DImode;
19908   dremap.nelt = 4;
19909   dremap.target = gen_reg_rtx (V4DImode);
19910   dremap.op0 = gen_lowpart (V4DImode, d->op0);
19911   dremap.op1 = dremap.op0;
19912   dremap.one_operand_p = true;
19913   for (i = 0; i < 2; ++i)
19914     {
19915       unsigned int cnt = 0;
19916       for (j = 0; j < 4; ++j)
19917 	if ((contents[i] & (1u << j)) != 0)
19918 	  dremap.perm[2 * i + cnt++] = j;
19919       for (; cnt < 2; ++cnt)
19920 	dremap.perm[2 * i + cnt] = 0;
19921     }
19922 
19923   dfinal = *d;
19924   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
19925   dfinal.op1 = dfinal.op0;
19926   dfinal.one_operand_p = true;
19927   for (i = 0, j = 0; i < nelt; ++i)
19928     {
19929       if (i == nelt2)
19930 	j = 2;
19931       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
19932       if ((d->perm[i] / nelt4) == dremap.perm[j])
19933 	;
19934       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
19935 	dfinal.perm[i] |= nelt4;
19936       else
19937 	gcc_unreachable ();
19938     }
19939 
19940   ok = expand_vec_perm_1 (&dremap);
19941   gcc_assert (ok);
19942 
19943   ok = expand_vec_perm_1 (&dfinal);
19944   gcc_assert (ok);
19945 
19946   return true;
19947 }
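
/* An illustrative standalone sketch (hypothetical helper, plain index
   arrays) of the feasibility test above: vpermq can place only two of the
   four input quarters into each 128-bit lane, so each output half may draw
   from at most two quarters.  */
#if 0
#include <stdbool.h>

static bool
sketch_vpermq_feasible (const unsigned *perm, unsigned nelt)
{
  unsigned nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2] = { 0, 0 };

  /* Record which quarters each output half reads from.  */
  for (unsigned i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (perm[i] / nelt4);
      contents[1] |= 1u << (perm[i + nelt2] / nelt4);
    }

  /* Reject if either half needs more than two quarters.  */
  for (unsigned i = 0; i < 2; ++i)
    {
      unsigned cnt = 0;
      for (unsigned j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
          return false;
    }
  return true;
}
#endif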
19948 
19949 static bool canonicalize_perm (struct expand_vec_perm_d *d);
19950 
19951 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
19952    a vector permutation using two instructions: vperm2f128 or
19953    vperm2i128 followed by any single in-lane permutation.  */
19954 
19955 static bool
19956 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
19957 {
19958   struct expand_vec_perm_d dfirst, dsecond;
19959   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
19960   bool ok;
19961 
19962   if (!TARGET_AVX
19963       || GET_MODE_SIZE (d->vmode) != 32
19964       || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
19965     return false;
19966 
19967   dsecond = *d;
19968   dsecond.one_operand_p = false;
19969   dsecond.testing_p = true;
19970 
19971   /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
19972      immediate.  For perm < 16 the second permutation uses
19973      d->op0 as first operand, for perm >= 16 it uses d->op1
19974      as first operand.  The second operand is the result of
19975      vperm2[fi]128.  */
19976   for (perm = 0; perm < 32; perm++)
19977     {
19978       /* Ignore permutations which do not move anything cross-lane.  */
19979       if (perm < 16)
19980 	{
19981 	  /* The second shuffle for e.g. V4DFmode has
19982 	     0123 and ABCD operands.
19983 	     Ignore AB23, as 23 is already in the second lane
19984 	     of the first operand.  */
19985 	  if ((perm & 0xc) == (1 << 2)) continue;
19986 	  /* And 01CD, as 01 is in the first lane of the first
19987 	     operand.  */
19988 	  if ((perm & 3) == 0) continue;
19989 	  /* And 4567, as then the vperm2[fi]128 doesn't change
19990 	     anything on the original 4567 second operand.  */
19991 	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
19992 	}
19993       else
19994 	{
19995 	  /* The second shuffle for e.g. V4DFmode has
19996 	     4567 and ABCD operands.
19997 	     Ignore AB67, as 67 is already in the second lane
19998 	     of the first operand.  */
19999 	  if ((perm & 0xc) == (3 << 2)) continue;
20000 	  /* And 45CD, as 45 is in the first lane of the first
20001 	     operand.  */
20002 	  if ((perm & 3) == 2) continue;
20003 	  /* And 0123, as then the vperm2[fi]128 doesn't change
20004 	     anything on the original 0123 first operand.  */
20005 	  if ((perm & 0xf) == (1 << 2)) continue;
20006 	}
20007 
20008       for (i = 0; i < nelt; i++)
20009 	{
20010 	  j = d->perm[i] / nelt2;
20011 	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20012 	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20013 	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20014 	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
20015 	  else
20016 	    break;
20017 	}
20018 
20019       if (i == nelt)
20020 	{
20021 	  start_sequence ();
20022 	  ok = expand_vec_perm_1 (&dsecond);
20023 	  end_sequence ();
20024 	}
20025       else
20026 	ok = false;
20027 
20028       if (ok)
20029 	{
20030 	  if (d->testing_p)
20031 	    return true;
20032 
20033 	  /* Found a usable second shuffle.  dfirst will be
20034 	     vperm2f128 on d->op0 and d->op1.  */
20035 	  dsecond.testing_p = false;
20036 	  dfirst = *d;
20037 	  dfirst.target = gen_reg_rtx (d->vmode);
20038 	  for (i = 0; i < nelt; i++)
20039 	    dfirst.perm[i] = (i & (nelt2 - 1))
20040 			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20041 
20042 	  canonicalize_perm (&dfirst);
20043 	  ok = expand_vec_perm_1 (&dfirst);
20044 	  gcc_assert (ok);
20045 
20046 	  /* And dsecond is some single insn shuffle, taking
20047 	     d->op0 and result of vperm2f128 (if perm < 16) or
20048 	     d->op1 and result of vperm2f128 (otherwise).  */
20049 	  if (perm >= 16)
20050 	    dsecond.op0 = dsecond.op1;
20051 	  dsecond.op1 = dfirst.target;
20052 
20053 	  ok = expand_vec_perm_1 (&dsecond);
20054 	  gcc_assert (ok);
20055 
20056 	  return true;
20057 	}
20058 
20059       /* For one operand, the only useful vperm2f128 permutation is 0x01
20060 	 aka lanes swap.  */
20061       if (d->one_operand_p)
20062 	return false;
20063     }
20064 
20065   return false;
20066 }
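
/* An illustrative sketch (hypothetical helpers) of the immediate encoding
   tried in the loop above.  Numbering the four 128-bit source lanes 0-3
   (0-1 from op0, 2-3 from op1), ((perm << 2) | perm) & 0x33 is the
   vperm2[fi]128 immediate; bits 0-1 select the source lane of the low
   output lane and bits 4-5 that of the high output lane.  */
#if 0
static unsigned
sketch_vperm2f128_imm (unsigned perm)
{
  return ((perm << 2) | perm) & 0x33;
}

static unsigned
sketch_vperm2f128_source_lane (unsigned imm, unsigned out_lane)
{
  /* out_lane is 0 (low) or 1 (high).  */
  return (imm >> (4 * out_lane)) & 3;
}
#endif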
20067 
20068 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
20069    a two-vector permutation using two intra-lane interleave insns
20070    and a cross-lane shuffle for 32-byte vectors.  */
20071 
20072 static bool
20073 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20074 {
20075   unsigned i, nelt;
20076   rtx (*gen) (rtx, rtx, rtx);
20077 
20078   if (d->one_operand_p)
20079     return false;
20080   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20081     ;
20082   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20083     ;
20084   else
20085     return false;
20086 
20087   nelt = d->nelt;
20088   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20089     return false;
20090   for (i = 0; i < nelt; i += 2)
20091     if (d->perm[i] != d->perm[0] + i / 2
20092 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20093       return false;
20094 
20095   if (d->testing_p)
20096     return true;
20097 
20098   switch (d->vmode)
20099     {
20100     case E_V32QImode:
20101       if (d->perm[0])
20102 	gen = gen_vec_interleave_highv32qi;
20103       else
20104 	gen = gen_vec_interleave_lowv32qi;
20105       break;
20106     case E_V16HImode:
20107       if (d->perm[0])
20108 	gen = gen_vec_interleave_highv16hi;
20109       else
20110 	gen = gen_vec_interleave_lowv16hi;
20111       break;
20112     case E_V8SImode:
20113       if (d->perm[0])
20114 	gen = gen_vec_interleave_highv8si;
20115       else
20116 	gen = gen_vec_interleave_lowv8si;
20117       break;
20118     case E_V4DImode:
20119       if (d->perm[0])
20120 	gen = gen_vec_interleave_highv4di;
20121       else
20122 	gen = gen_vec_interleave_lowv4di;
20123       break;
20124     case E_V8SFmode:
20125       if (d->perm[0])
20126 	gen = gen_vec_interleave_highv8sf;
20127       else
20128 	gen = gen_vec_interleave_lowv8sf;
20129       break;
20130     case E_V4DFmode:
20131       if (d->perm[0])
20132 	gen = gen_vec_interleave_highv4df;
20133       else
20134 	gen = gen_vec_interleave_lowv4df;
20135       break;
20136     default:
20137       gcc_unreachable ();
20138     }
20139 
20140   emit_insn (gen (d->target, d->op0, d->op1));
20141   return true;
20142 }
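
/* An illustrative standalone sketch (hypothetical helper) of the pattern
   matched above: interleaving the low or high halves of the two operands
   produces { p, p+nelt, p+1, p+1+nelt, ... } with p equal to 0 or nelt/2.  */
#if 0
#include <stdbool.h>

static bool
sketch_is_interleave (const unsigned *perm, unsigned nelt)
{
  if (perm[0] != 0 && perm[0] != nelt / 2)
    return false;
  for (unsigned i = 0; i < nelt; i += 2)
    if (perm[i] != perm[0] + i / 2
        || perm[i + 1] != perm[0] + i / 2 + nelt)
      return false;
  return true;
}
#endif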
20143 
20144 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
20145    a single-vector permutation using a single intra-lane vector
20146    permutation, vperm2f128 swapping the lanes and a vblend* insn blending
20147    the non-swapped and swapped vectors together.  */
20148 
20149 static bool
20150 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
20151 {
20152   struct expand_vec_perm_d dfirst, dsecond;
20153   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
20154   rtx_insn *seq;
20155   bool ok;
20156   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20157 
20158   if (!TARGET_AVX
20159       || TARGET_AVX2
20160       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20161       || !d->one_operand_p)
20162     return false;
20163 
20164   dfirst = *d;
20165   for (i = 0; i < nelt; i++)
20166     dfirst.perm[i] = 0xff;
20167   for (i = 0, msk = 0; i < nelt; i++)
20168     {
20169       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20170       if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
20171 	return false;
20172       dfirst.perm[j] = d->perm[i];
20173       if (j != i)
20174 	msk |= (1 << i);
20175     }
20176   for (i = 0; i < nelt; i++)
20177     if (dfirst.perm[i] == 0xff)
20178       dfirst.perm[i] = i;
20179 
20180   if (!d->testing_p)
20181     dfirst.target = gen_reg_rtx (dfirst.vmode);
20182 
20183   start_sequence ();
20184   ok = expand_vec_perm_1 (&dfirst);
20185   seq = get_insns ();
20186   end_sequence ();
20187 
20188   if (!ok)
20189     return false;
20190 
20191   if (d->testing_p)
20192     return true;
20193 
20194   emit_insn (seq);
20195 
20196   dsecond = *d;
20197   dsecond.op0 = dfirst.target;
20198   dsecond.op1 = dfirst.target;
20199   dsecond.one_operand_p = true;
20200   dsecond.target = gen_reg_rtx (dsecond.vmode);
20201   for (i = 0; i < nelt; i++)
20202     dsecond.perm[i] = i ^ nelt2;
20203 
20204   ok = expand_vec_perm_1 (&dsecond);
20205   gcc_assert (ok);
20206 
20207   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20208   emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
20209   return true;
20210 }
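
/* An illustrative standalone sketch (hypothetical helper, nelt <= 8) of the
   blend-mask construction above: output slot i is taken from the
   lane-swapped copy exactly when the requested element lives in the other
   128-bit lane than slot i, which sets bit i of the vblend* immediate.  */
#if 0
#include <stdbool.h>

static bool
sketch_vblend_mask (const unsigned *perm, unsigned nelt, unsigned *msk_out)
{
  unsigned nelt2 = nelt / 2, msk = 0;
  unsigned slot[8];

  for (unsigned i = 0; i < nelt; ++i)
    slot[i] = 0xff;
  for (unsigned i = 0; i < nelt; ++i)
    {
      /* j is the position that can supply perm[i] without crossing lanes
         after an optional lane swap.  */
      unsigned j = (perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (slot[j] != 0xff && slot[j] != perm[i])
        return false;  /* Two outputs demand different elements.  */
      slot[j] = perm[i];
      if (j != i)
        msk |= 1u << i;
    }
  *msk_out = msk;
  return true;
}
#endif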
20211 
20212 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
20213    a two-vector permutation using two single-vector permutations and
20214    {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
20215    of dfirst or dsecond is an identity permutation.  */
20216 
20217 static bool
20218 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
20219 {
20220   unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
20221   struct expand_vec_perm_d dfirst, dsecond, dfinal;
20222   bool ident1 = true, ident2 = true;
20223 
20224   if (d->one_operand_p)
20225     return false;
20226 
20227   if (GET_MODE_SIZE (d->vmode) == 16)
20228     {
20229       if (!TARGET_SSE)
20230 	return false;
20231       if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
20232 	return false;
20233     }
20234   else if (GET_MODE_SIZE (d->vmode) == 32)
20235     {
20236       if (!TARGET_AVX)
20237 	return false;
20238       if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
20239 	return false;
20240       lane = nelt2;
20241     }
20242   else
20243     return false;
20244 
20245   for (i = 1; i < nelt; i++)
20246     if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
20247       return false;
20248 
20249   dfirst = *d;
20250   dsecond = *d;
20251   dfinal = *d;
20252   dfirst.op1 = dfirst.op0;
20253   dfirst.one_operand_p = true;
20254   dsecond.op0 = dsecond.op1;
20255   dsecond.one_operand_p = true;
20256 
20257   for (i = 0; i < nelt; i++)
20258     if (d->perm[i] >= nelt)
20259       {
20260 	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
20261 	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
20262 	  ident2 = false;
20263 	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
20264 	  = d->perm[i] - nelt;
20265       }
20266     else
20267       {
20268 	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
20269 	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
20270 	  ident1 = false;
20271 	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
20272       }
20273 
20274   if (two_insn && !ident1 && !ident2)
20275     return false;
20276 
20277   if (!d->testing_p)
20278     {
20279       if (!ident1)
20280 	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20281       if (!ident2)
20282 	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20283       if (d->perm[0] >= nelt)
20284 	std::swap (dfinal.op0, dfinal.op1);
20285     }
20286 
20287   bool ok;
20288   rtx_insn *seq1 = NULL, *seq2 = NULL;
20289 
20290   if (!ident1)
20291     {
20292       start_sequence ();
20293       ok = expand_vec_perm_1 (&dfirst);
20294       seq1 = get_insns ();
20295       end_sequence ();
20296 
20297       if (!ok)
20298 	return false;
20299     }
20300 
20301   if (!ident2)
20302     {
20303       start_sequence ();
20304       ok = expand_vec_perm_1 (&dsecond);
20305       seq2 = get_insns ();
20306       end_sequence ();
20307 
20308       if (!ok)
20309 	return false;
20310     }
20311 
20312   if (d->testing_p)
20313     return true;
20314 
20315   for (i = 0; i < nelt; i++)
20316     {
20317       dfinal.perm[i] = i / 2;
20318       if (i >= lane)
20319 	dfinal.perm[i] += lane / 2;
20320       if ((i & 1) != 0)
20321 	dfinal.perm[i] += nelt;
20322     }
20323   emit_insn (seq1);
20324   emit_insn (seq2);
20325   ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
20326 			       dfinal.perm, dfinal.nelt, false);
20327   gcc_assert (ok);
20328   return true;
20329 }
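
/* An illustrative standalone sketch (hypothetical helper) of the shape test
   above: the output must alternate strictly between the two operands, and
   perm[0] decides which operand supplies the even positions.  */
#if 0
#include <stdbool.h>

static bool
sketch_alternates_between_operands (const unsigned *perm, unsigned nelt)
{
  bool even_from_op1 = perm[0] >= nelt;
  for (unsigned i = 1; i < nelt; ++i)
    if ((perm[i] >= nelt) != (even_from_op1 ^ ((i & 1) != 0)))
      return false;
  return true;
}
#endif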
20330 
20331 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
20332    the permutation using two single-vector permutations and the SSE4_1
20333    pblendv instruction.  If two_insn, succeed only if one of dfirst or
20334    dsecond is an identity permutation.  */
20335 
20336 static bool
20337 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
20338 {
20339   unsigned i, nelt = d->nelt;
20340   struct expand_vec_perm_d dfirst, dsecond, dfinal;
20341   machine_mode vmode = d->vmode;
20342   bool ident1 = true, ident2 = true;
20343 
20344   /* Use the same checks as in expand_vec_perm_blend.  */
20345   if (d->one_operand_p)
20346     return false;
20347   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20348     ;
20349   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20350     ;
20351   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
20352 			     || GET_MODE_SIZE (vmode) == 8
20353 			     || GET_MODE_SIZE (vmode) == 4))
20354     ;
20355   else
20356     return false;
20357 
20358   dfirst = *d;
20359   dsecond = *d;
20360   dfinal = *d;
20361   dfirst.op1 = dfirst.op0;
20362   dfirst.one_operand_p = true;
20363   dsecond.op0 = dsecond.op1;
20364   dsecond.one_operand_p = true;
20365 
20366   for (i = 0; i < nelt; ++i)
20367     if (d->perm[i] >= nelt)
20368       {
20369 	dfirst.perm[i] = 0xff;
20370 	dsecond.perm[i] = d->perm[i] - nelt;
20371 	if (d->perm[i] != i + nelt)
20372 	  ident2 = false;
20373       }
20374     else
20375       {
20376 	dsecond.perm[i] = 0xff;
20377 	dfirst.perm[i] = d->perm[i];
20378 	if (d->perm[i] != i)
20379 	  ident1 = false;
20380       }
20381 
20382   if (two_insn && !ident1 && !ident2)
20383     return false;
20384 
20385   /* For now.  Ideally treat 0xff as a wildcard.  */
20386   for (i = 0; i < nelt; ++i)
20387     if (dfirst.perm[i] == 0xff)
20388       {
20389 	if (GET_MODE_SIZE (vmode) == 32
20390 	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
20391 	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20392 	else
20393 	  dfirst.perm[i] = i;
20394       }
20395     else
20396       {
20397 	if (GET_MODE_SIZE (vmode) == 32
20398 	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
20399 	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20400 	else
20401 	  dsecond.perm[i] = i;
20402       }
20403 
20404   if (!d->testing_p)
20405     {
20406       if (!ident1)
20407 	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20408       if (!ident2)
20409 	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20410     }
20411 
20412   bool ok;
20413   rtx_insn *seq1 = NULL, *seq2 = NULL;
20414 
20415   if (!ident1)
20416     {
20417       start_sequence ();
20418       ok = expand_vec_perm_1 (&dfirst);
20419       seq1 = get_insns ();
20420       end_sequence ();
20421 
20422       if (!ok)
20423 	return false;
20424     }
20425 
20426   if (!ident2)
20427     {
20428       start_sequence ();
20429       ok = expand_vec_perm_1 (&dsecond);
20430       seq2 = get_insns ();
20431       end_sequence ();
20432 
20433       if (!ok)
20434 	return false;
20435     }
20436 
20437   if (d->testing_p)
20438     return true;
20439 
20440   for (i = 0; i < nelt; ++i)
20441     dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
20442 
20443   emit_insn (seq1);
20444   emit_insn (seq2);
20445   ok = expand_vec_perm_blend (&dfinal);
20446   gcc_assert (ok);
20447   return true;
20448 }
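
/* An illustrative standalone sketch (hypothetical helper) of the split done
   above: element i comes from op0 when perm[i] < nelt and from op1
   otherwise, so each operand gets its own within-operand permutation and
   the final pblendv only has to pick, per element, which result to keep.  */
#if 0
static void
sketch_split_for_pblendv (const unsigned *perm, unsigned nelt,
                          unsigned *first, unsigned *second,
                          unsigned char *from_op1)
{
  for (unsigned i = 0; i < nelt; ++i)
    if (perm[i] >= nelt)
      {
        first[i] = 0xff;   /* Don't care; filled in later.  */
        second[i] = perm[i] - nelt;
        from_op1[i] = 1;
      }
    else
      {
        second[i] = 0xff;  /* Don't care; filled in later.  */
        first[i] = perm[i];
        from_op1[i] = 0;
      }
}
#endif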
20449 
20450 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
20451    permutation using two vperm2f128 insns, followed by a vshufpd insn
20452    blending the two vectors together.  */
20453 
20454 static bool
20455 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
20456 {
20457   struct expand_vec_perm_d dfirst, dsecond, dthird;
20458   bool ok;
20459 
20460   if (!TARGET_AVX || (d->vmode != V4DFmode))
20461     return false;
20462 
20463   if (d->testing_p)
20464     return true;
20465 
20466   dfirst = *d;
20467   dsecond = *d;
20468   dthird = *d;
20469 
20470   dfirst.perm[0] = (d->perm[0] & ~1);
20471   dfirst.perm[1] = (d->perm[0] & ~1) + 1;
20472   dfirst.perm[2] = (d->perm[2] & ~1);
20473   dfirst.perm[3] = (d->perm[2] & ~1) + 1;
20474   dsecond.perm[0] = (d->perm[1] & ~1);
20475   dsecond.perm[1] = (d->perm[1] & ~1) + 1;
20476   dsecond.perm[2] = (d->perm[3] & ~1);
20477   dsecond.perm[3] = (d->perm[3] & ~1) + 1;
20478   dthird.perm[0] = (d->perm[0] % 2);
20479   dthird.perm[1] = (d->perm[1] % 2) + 4;
20480   dthird.perm[2] = (d->perm[2] % 2) + 2;
20481   dthird.perm[3] = (d->perm[3] % 2) + 6;
20482 
20483   dfirst.target = gen_reg_rtx (dfirst.vmode);
20484   dsecond.target = gen_reg_rtx (dsecond.vmode);
20485   dthird.op0 = dfirst.target;
20486   dthird.op1 = dsecond.target;
20487   dthird.one_operand_p = false;
20488 
20489   canonicalize_perm (&dfirst);
20490   canonicalize_perm (&dsecond);
20491 
20492   ok = expand_vec_perm_1 (&dfirst)
20493        && expand_vec_perm_1 (&dsecond)
20494        && expand_vec_perm_1 (&dthird);
20495 
20496   gcc_assert (ok);
20497 
20498   return true;
20499 }
20500 
20501 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
20502 
20503 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
20504    a two-vector permutation using two intra-lane vector
20505    permutations, vperm2f128 swapping the lanes and a vblend* insn blending
20506    the non-swapped and swapped vectors together.  */
20507 
20508 static bool
20509 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
20510 {
20511   struct expand_vec_perm_d dfirst, dsecond, dthird;
20512   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
20513   rtx_insn *seq1, *seq2;
20514   bool ok;
20515   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20516 
20517   if (!TARGET_AVX
20518       || TARGET_AVX2
20519       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20520       || d->one_operand_p)
20521     return false;
20522 
20523   dfirst = *d;
20524   dsecond = *d;
20525   for (i = 0; i < nelt; i++)
20526     {
20527       dfirst.perm[i] = 0xff;
20528       dsecond.perm[i] = 0xff;
20529     }
20530   for (i = 0, msk = 0; i < nelt; i++)
20531     {
20532       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20533       if (j == i)
20534 	{
20535 	  dfirst.perm[j] = d->perm[i];
20536 	  which1 |= (d->perm[i] < nelt ? 1 : 2);
20537 	}
20538       else
20539 	{
20540 	  dsecond.perm[j] = d->perm[i];
20541 	  which2 |= (d->perm[i] < nelt ? 1 : 2);
20542 	  msk |= (1U << i);
20543 	}
20544     }
20545   if (msk == 0 || msk == (1U << nelt) - 1)
20546     return false;
20547 
20548   if (!d->testing_p)
20549     {
20550       dfirst.target = gen_reg_rtx (dfirst.vmode);
20551       dsecond.target = gen_reg_rtx (dsecond.vmode);
20552     }
20553 
20554   for (i = 0; i < nelt; i++)
20555     {
20556       if (dfirst.perm[i] == 0xff)
20557 	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
20558       if (dsecond.perm[i] == 0xff)
20559 	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
20560     }
20561   canonicalize_perm (&dfirst);
20562   start_sequence ();
20563   ok = ix86_expand_vec_perm_const_1 (&dfirst);
20564   seq1 = get_insns ();
20565   end_sequence ();
20566 
20567   if (!ok)
20568     return false;
20569 
20570   canonicalize_perm (&dsecond);
20571   start_sequence ();
20572   ok = ix86_expand_vec_perm_const_1 (&dsecond);
20573   seq2 = get_insns ();
20574   end_sequence ();
20575 
20576   if (!ok)
20577     return false;
20578 
20579   if (d->testing_p)
20580     return true;
20581 
20582   emit_insn (seq1);
20583   emit_insn (seq2);
20584 
20585   dthird = *d;
20586   dthird.op0 = dsecond.target;
20587   dthird.op1 = dsecond.target;
20588   dthird.one_operand_p = true;
20589   dthird.target = gen_reg_rtx (dthird.vmode);
20590   for (i = 0; i < nelt; i++)
20591     dthird.perm[i] = i ^ nelt2;
20592 
20593   ok = expand_vec_perm_1 (&dthird);
20594   gcc_assert (ok);
20595 
20596   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20597   emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
20598   return true;
20599 }
20600 
20601 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
20602    permutation with two pshufb insns and an ior.  We should have already
20603    failed all two-instruction sequences.  */
20604 
20605 static bool
20606 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
20607 {
20608   rtx rperm[2][16], vperm, l, h, op, m128;
20609   unsigned int i, nelt, eltsz;
20610   machine_mode mode;
20611   rtx (*gen) (rtx, rtx, rtx);
20612 
20613   if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
20614 			&& GET_MODE_SIZE (d->vmode) != 8
20615 			&& GET_MODE_SIZE (d->vmode) != 4))
20616     return false;
20617   gcc_assert (!d->one_operand_p);
20618 
20619   if (d->testing_p)
20620     return true;
20621 
20622   switch (GET_MODE_SIZE (d->vmode))
20623     {
20624     case 4:
20625       mode = V4QImode;
20626       gen = gen_mmx_pshufbv4qi3;
20627       break;
20628     case 8:
20629       mode = V8QImode;
20630       gen = gen_mmx_pshufbv8qi3;
20631       break;
20632     case 16:
20633       mode = V16QImode;
20634       gen = gen_ssse3_pshufbv16qi3;
20635       break;
20636     default:
20637       gcc_unreachable ();
20638     }
20639 
20640   nelt = d->nelt;
20641   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20642 
20643   /* Generate two permutation masks.  If the required element is within
20644      the given vector it is shuffled into the proper lane.  If the required
20645      element is in the other vector, force a zero into the lane by setting
20646      bit 7 in the permutation mask.  */
20647   m128 = GEN_INT (-128);
20648   for (i = 0; i < nelt; ++i)
20649     {
20650       unsigned j, k, e = d->perm[i];
20651       unsigned which = (e >= nelt);
20652       if (e >= nelt)
20653 	e -= nelt;
20654 
20655       for (j = 0; j < eltsz; ++j)
20656 	{
20657 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
20658 	  rperm[1-which][i*eltsz + j] = m128;
20659 	}
20660 
20661       for (k = i*eltsz + j; k < 16; ++k)
20662 	rperm[0][k] = rperm[1][k] = m128;
20663     }
20664 
20665   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
20666   vperm = force_reg (V16QImode, vperm);
20667 
20668   l = gen_reg_rtx (mode);
20669   op = gen_lowpart (mode, d->op0);
20670   emit_insn (gen (l, op, vperm));
20671 
20672   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
20673   vperm = force_reg (V16QImode, vperm);
20674 
20675   h = gen_reg_rtx (mode);
20676   op = gen_lowpart (mode, d->op1);
20677   emit_insn (gen (h, op, vperm));
20678 
20679   op = d->target;
20680   if (d->vmode != mode)
20681     op = gen_reg_rtx (mode);
20682   emit_insn (gen_rtx_SET (op, gen_rtx_IOR (mode, l, h)));
20683   if (op != d->target)
20684     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20685 
20686   return true;
20687 }
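
/* An illustrative standalone sketch (hypothetical helper) of the two pshufb
   control vectors built above, specialized to byte elements (eltsz == 1):
   the mask for the operand that owns perm[i] gets the byte index, the other
   mask gets -128 so pshufb writes zero there, and the final OR keeps the
   wanted byte.  */
#if 0
static void
sketch_pshufb2_masks (const unsigned *perm, unsigned nelt,
                      signed char mask_op0[16], signed char mask_op1[16])
{
  for (unsigned i = 0; i < 16; ++i)
    mask_op0[i] = mask_op1[i] = -128;
  for (unsigned i = 0; i < nelt; ++i)
    {
      unsigned e = perm[i];
      if (e >= nelt)
        mask_op1[i] = (signed char) (e - nelt);  /* Byte comes from op1.  */
      else
        mask_op0[i] = (signed char) e;           /* Byte comes from op0.  */
    }
}
#endif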
20688 
20689 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
20690    with two vpshufb insns, vpermq and vpor.  We should have already failed
20691    all two- or three-instruction sequences.  */
20692 
20693 static bool
20694 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
20695 {
20696   rtx rperm[2][32], vperm, l, h, hp, op, m128;
20697   unsigned int i, nelt, eltsz;
20698 
20699   if (!TARGET_AVX2
20700       || !d->one_operand_p
20701       || (d->vmode != V32QImode && d->vmode != V16HImode))
20702     return false;
20703 
20704   if (d->testing_p)
20705     return true;
20706 
20707   nelt = d->nelt;
20708   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20709 
20710   /* Generate two permutation masks.  If the required element is within
20711      the same lane, it is shuffled in.  If the required element is from the
20712      other lane, force a zero by setting bit 7 in the permutation mask.
20713      The other mask has non-negative elements where an element is
20714      requested from the other lane, but that element is also moved to the
20715      other lane, so that the two V2TImode halves of the vpshufb result
20716      can be swapped.  */
20717   m128 = GEN_INT (-128);
20718   for (i = 0; i < nelt; ++i)
20719     {
20720       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20721       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
20722 
20723       for (j = 0; j < eltsz; ++j)
20724 	{
20725 	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
20726 	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
20727 	}
20728     }
20729 
20730   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
20731   vperm = force_reg (V32QImode, vperm);
20732 
20733   h = gen_reg_rtx (V32QImode);
20734   op = gen_lowpart (V32QImode, d->op0);
20735   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
20736 
20737   /* Swap the 128-bit lanes of h into hp.  */
20738   hp = gen_reg_rtx (V4DImode);
20739   op = gen_lowpart (V4DImode, h);
20740   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
20741 				  const1_rtx));
20742 
20743   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
20744   vperm = force_reg (V32QImode, vperm);
20745 
20746   l = gen_reg_rtx (V32QImode);
20747   op = gen_lowpart (V32QImode, d->op0);
20748   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
20749 
20750   op = d->target;
20751   if (d->vmode != V32QImode)
20752     op = gen_reg_rtx (V32QImode);
20753   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
20754   if (op != d->target)
20755     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20756 
20757   return true;
20758 }
20759 
20760 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
20761    and extract-odd permutations of two V32QImode or V16HImode operands
20762    with two vpshufb insns, vpor and vpermq.  We should have already
20763    failed all two- or three-instruction sequences.  */
20764 
20765 static bool
20766 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
20767 {
20768   rtx rperm[2][32], vperm, l, h, ior, op, m128;
20769   unsigned int i, nelt, eltsz;
20770 
20771   if (!TARGET_AVX2
20772       || d->one_operand_p
20773       || (d->vmode != V32QImode && d->vmode != V16HImode))
20774     return false;
20775 
20776   for (i = 0; i < d->nelt; ++i)
20777     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
20778       return false;
20779 
20780   if (d->testing_p)
20781     return true;
20782 
20783   nelt = d->nelt;
20784   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20785 
20786   /* Generate two permutation masks.  In the first permutation mask
20787      the first quarter will contain indexes for the first half
20788      of the op0, the second quarter will contain bit 7 set, third quarter
20789      will contain indexes for the second half of the op0 and the
20790      last quarter bit 7 set.  In the second permutation mask
20791      the first quarter will contain bit 7 set, the second quarter
20792      indexes for the first half of the op1, the third quarter bit 7 set
20793      and last quarter indexes for the second half of the op1.
20794      I.e. the first mask e.g. for V32QImode extract even will be:
20795      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
20796      (all values masked with 0xf except for -128) and second mask
20797      for extract even will be
20798      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
20799   m128 = GEN_INT (-128);
20800   for (i = 0; i < nelt; ++i)
20801     {
20802       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20803       unsigned which = d->perm[i] >= nelt;
20804       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
20805 
20806       for (j = 0; j < eltsz; ++j)
20807 	{
20808 	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
20809 	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
20810 	}
20811     }
20812 
20813   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
20814   vperm = force_reg (V32QImode, vperm);
20815 
20816   l = gen_reg_rtx (V32QImode);
20817   op = gen_lowpart (V32QImode, d->op0);
20818   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
20819 
20820   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
20821   vperm = force_reg (V32QImode, vperm);
20822 
20823   h = gen_reg_rtx (V32QImode);
20824   op = gen_lowpart (V32QImode, d->op1);
20825   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
20826 
20827   ior = gen_reg_rtx (V32QImode);
20828   emit_insn (gen_iorv32qi3 (ior, l, h));
20829 
20830   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
20831   op = gen_reg_rtx (V4DImode);
20832   ior = gen_lowpart (V4DImode, ior);
20833   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
20834 				  const1_rtx, GEN_INT (3)));
20835   emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20836 
20837   return true;
20838 }
20839 
20840 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
20841    and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or
20842    V32QI operands with two "and" and "pack" or two "shift" and "pack"
20843    insns.  We should have already failed all two-instruction sequences.  */
20844 
20845 static bool
20846 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
20847 {
20848   rtx op, dop0, dop1, t;
20849   unsigned i, odd, c, s, nelt = d->nelt;
20850   bool end_perm = false;
20851   machine_mode half_mode;
20852   rtx (*gen_and) (rtx, rtx, rtx);
20853   rtx (*gen_pack) (rtx, rtx, rtx);
20854   rtx (*gen_shift) (rtx, rtx, rtx);
20855 
20856   if (d->one_operand_p)
20857     return false;
20858 
20859   switch (d->vmode)
20860     {
20861     case E_V4HImode:
20862       /* Required for "pack".  */
20863       if (!TARGET_SSE4_1)
20864 	return false;
20865       c = 0xffff;
20866       s = 16;
20867       half_mode = V2SImode;
20868       gen_and = gen_andv2si3;
20869       gen_pack = gen_mmx_packusdw;
20870       gen_shift = gen_lshrv2si3;
20871       break;
20872     case E_V8HImode:
20873       /* Required for "pack".  */
20874       if (!TARGET_SSE4_1)
20875         return false;
20876       c = 0xffff;
20877       s = 16;
20878       half_mode = V4SImode;
20879       gen_and = gen_andv4si3;
20880       gen_pack = gen_sse4_1_packusdw;
20881       gen_shift = gen_lshrv4si3;
20882       break;
20883     case E_V8QImode:
20884       /* No check as all instructions are SSE2.  */
20885       c = 0xff;
20886       s = 8;
20887       half_mode = V4HImode;
20888       gen_and = gen_andv4hi3;
20889       gen_pack = gen_mmx_packuswb;
20890       gen_shift = gen_lshrv4hi3;
20891       break;
20892     case E_V16QImode:
20893       /* No check as all instructions are SSE2.  */
20894       c = 0xff;
20895       s = 8;
20896       half_mode = V8HImode;
20897       gen_and = gen_andv8hi3;
20898       gen_pack = gen_sse2_packuswb;
20899       gen_shift = gen_lshrv8hi3;
20900       break;
20901     case E_V16HImode:
20902       if (!TARGET_AVX2)
20903         return false;
20904       c = 0xffff;
20905       s = 16;
20906       half_mode = V8SImode;
20907       gen_and = gen_andv8si3;
20908       gen_pack = gen_avx2_packusdw;
20909       gen_shift = gen_lshrv8si3;
20910       end_perm = true;
20911       break;
20912     case E_V32QImode:
20913       if (!TARGET_AVX2)
20914         return false;
20915       c = 0xff;
20916       s = 8;
20917       half_mode = V16HImode;
20918       gen_and = gen_andv16hi3;
20919       gen_pack = gen_avx2_packuswb;
20920       gen_shift = gen_lshrv16hi3;
20921       end_perm = true;
20922       break;
20923     default:
20924       /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
20925 	 are more profitable than general shuffles.  */
20926       return false;
20927     }
20928 
20929   /* Check that permutation is even or odd.  */
20930   odd = d->perm[0];
20931   if (odd > 1)
20932     return false;
20933 
20934   for (i = 1; i < nelt; ++i)
20935     if (d->perm[i] != 2 * i + odd)
20936       return false;
20937 
20938   if (d->testing_p)
20939     return true;
20940 
20941   dop0 = gen_reg_rtx (half_mode);
20942   dop1 = gen_reg_rtx (half_mode);
20943   if (odd == 0)
20944     {
20945       t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
20946       t = force_reg (half_mode, t);
20947       emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
20948       emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
20949     }
20950   else
20951     {
20952       emit_insn (gen_shift (dop0,
20953 			    gen_lowpart (half_mode, d->op0),
20954 			    GEN_INT (s)));
20955       emit_insn (gen_shift (dop1,
20956 			    gen_lowpart (half_mode, d->op1),
20957 			    GEN_INT (s)));
20958     }
20959   /* For the AVX2 256-bit case we need to permute the pack result.  */
20960   if (TARGET_AVX2 && end_perm)
20961     {
20962       op = gen_reg_rtx (d->vmode);
20963       t = gen_reg_rtx (V4DImode);
20964       emit_insn (gen_pack (op, dop0, dop1));
20965       emit_insn (gen_avx2_permv4di_1 (t,
20966 				      gen_lowpart (V4DImode, op),
20967 				      const0_rtx,
20968 				      const2_rtx,
20969 				      const1_rtx,
20970 				      GEN_INT (3)));
20971       emit_move_insn (d->target, gen_lowpart (d->vmode, t));
20972     }
20973   else
20974     emit_insn (gen_pack (d->target, dop0, dop1));
20975 
20976   return true;
20977 }
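
/* An illustrative standalone sketch (hypothetical helper, plain C) of the
   and+pack / shift+pack idea above for byte elements stored in 16-bit
   words: masking keeps the even bytes, a logical right shift by 8 exposes
   the odd bytes, and the unsigned-saturating pack (a no-op here because all
   values already fit in a byte) concatenates the two halves.  The AVX2
   cross-lane fixup (end_perm) is ignored in this sketch.  */
#if 0
#include <stdint.h>

static void
sketch_even_odd_pack_u8 (const uint16_t *w0, const uint16_t *w1,
                         unsigned nwords, int odd, uint8_t *out)
{
  for (unsigned i = 0; i < nwords; ++i)
    out[i] = (uint8_t) (odd ? w0[i] >> 8 : w0[i] & 0xff);
  for (unsigned i = 0; i < nwords; ++i)
    out[nwords + i] = (uint8_t) (odd ? w1[i] >> 8 : w1[i] & 0xff);
}
#endif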
20978 
20979 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
20980    and extract-odd permutations of two V64QI operands
20981    with two "shift", two "trunc" and one "concat" insns for "odd"
20982    and two "trunc" and one "concat" insn for "even".
20983    We should have already failed all two-instruction sequences.  */
20984 
20985 static bool
20986 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
20987 {
20988   rtx t1, t2, t3, t4;
20989   unsigned i, odd, nelt = d->nelt;
20990 
20991   if (!TARGET_AVX512BW
20992       || d->one_operand_p
20993       || d->vmode != V64QImode)
20994     return false;
20995 
20996   /* Check that permutation is even or odd.  */
20997   odd = d->perm[0];
20998   if (odd > 1)
20999     return false;
21000 
21001   for (i = 1; i < nelt; ++i)
21002     if (d->perm[i] != 2 * i + odd)
21003       return false;
21004 
21005   if (d->testing_p)
21006     return true;
21007 
21008 
21009   if (odd)
21010     {
21011       t1 = gen_reg_rtx (V32HImode);
21012       t2 = gen_reg_rtx (V32HImode);
21013       emit_insn (gen_lshrv32hi3 (t1,
21014 				 gen_lowpart (V32HImode, d->op0),
21015 				 GEN_INT (8)));
21016       emit_insn (gen_lshrv32hi3 (t2,
21017 				 gen_lowpart (V32HImode, d->op1),
21018 				 GEN_INT (8)));
21019     }
21020   else
21021     {
21022       t1 = gen_lowpart (V32HImode, d->op0);
21023       t2 = gen_lowpart (V32HImode, d->op1);
21024     }
21025 
21026   t3 = gen_reg_rtx (V32QImode);
21027   t4 = gen_reg_rtx (V32QImode);
21028   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21029   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21030   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21031 
21032   return true;
21033 }
21034 
21035 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
21036    and extract-odd permutations.  */
21037 
21038 static bool
21039 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
21040 {
21041   rtx t1, t2, t3, t4, t5;
21042 
21043   switch (d->vmode)
21044     {
21045     case E_V4DFmode:
21046       if (d->testing_p)
21047 	break;
21048       t1 = gen_reg_rtx (V4DFmode);
21049       t2 = gen_reg_rtx (V4DFmode);
21050 
21051       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
21052       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
21053       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
21054 
21055       /* Now an unpck[lh]pd will produce the result required.  */
21056       if (odd)
21057 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
21058       else
21059 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
21060       emit_insn (t3);
21061       break;
21062 
21063     case E_V8SFmode:
21064       {
21065 	int mask = odd ? 0xdd : 0x88;
21066 
21067 	if (d->testing_p)
21068 	  break;
21069 	t1 = gen_reg_rtx (V8SFmode);
21070 	t2 = gen_reg_rtx (V8SFmode);
21071 	t3 = gen_reg_rtx (V8SFmode);
21072 
21073 	/* Shuffle within the 128-bit lanes to produce:
21074 	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
21075 	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
21076 				      GEN_INT (mask)));
21077 
21078 	/* Shuffle the lanes around to produce:
21079 	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
21080 	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
21081 					    GEN_INT (0x3)));
21082 
21083 	/* Shuffle within the 128-bit lanes to produce:
21084 	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
21085 	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
21086 
21087 	/* Shuffle within the 128-bit lanes to produce:
21088 	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
21089 	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
21090 
21091 	/* Shuffle the lanes around to produce:
21092 	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
21093 	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
21094 					    GEN_INT (0x20)));
21095       }
21096       break;
21097 
21098     case E_V2DFmode:
21099     case E_V4SFmode:
21100     case E_V2DImode:
21101     case E_V2SImode:
21102     case E_V4SImode:
21103     case E_V2HImode:
21104       /* These are always directly implementable by expand_vec_perm_1.  */
21105       gcc_unreachable ();
21106 
21107     case E_V2SFmode:
21108       gcc_assert (TARGET_MMX_WITH_SSE);
21109       /* We have no suitable instructions.  */
21110       if (d->testing_p)
21111 	return false;
21112       break;
21113 
21114     case E_V4QImode:
21115       if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21116 	return expand_vec_perm_pshufb2 (d);
21117       else
21118 	{
21119 	  if (d->testing_p)
21120 	    break;
21121 	  /* We need 2*log2(N)-1 operations to achieve odd/even
21122 	     with interleave. */
21123 	  t1 = gen_reg_rtx (V4QImode);
21124 	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
21125 	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
21126 	  if (odd)
21127 	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
21128 	  else
21129 	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
21130 	  emit_insn (t2);
21131 	}
21132       break;
21133 
21134     case E_V4HImode:
21135       if (TARGET_SSE4_1)
21136 	return expand_vec_perm_even_odd_pack (d);
21137       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21138 	return expand_vec_perm_pshufb2 (d);
21139       else
21140 	{
21141 	  if (d->testing_p)
21142 	    break;
21143 	  /* We need 2*log2(N)-1 operations to achieve odd/even
21144 	     with interleave. */
21145 	  t1 = gen_reg_rtx (V4HImode);
21146 	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
21147 	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
21148 	  if (odd)
21149 	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
21150 	  else
21151 	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
21152 	  emit_insn (t2);
21153 	}
21154       break;
21155 
21156     case E_V8HImode:
21157       if (TARGET_SSE4_1)
21158 	return expand_vec_perm_even_odd_pack (d);
21159       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21160 	return expand_vec_perm_pshufb2 (d);
21161       else
21162 	{
21163 	  if (d->testing_p)
21164 	    break;
21165 	  /* We need 2*log2(N)-1 operations to achieve odd/even
21166 	     with interleave. */
21167 	  t1 = gen_reg_rtx (V8HImode);
21168 	  t2 = gen_reg_rtx (V8HImode);
21169 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
21170 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
21171 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
21172 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
21173 	  if (odd)
21174 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
21175 	  else
21176 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
21177 	  emit_insn (t3);
21178 	}
21179       break;
21180 
21181     case E_V8QImode:
21182     case E_V16QImode:
21183       return expand_vec_perm_even_odd_pack (d);
21184 
21185     case E_V16HImode:
21186     case E_V32QImode:
21187       return expand_vec_perm_even_odd_pack (d);
21188 
21189     case E_V64QImode:
21190       return expand_vec_perm_even_odd_trunc (d);
21191 
21192     case E_V4DImode:
21193       if (!TARGET_AVX2)
21194 	{
21195 	  struct expand_vec_perm_d d_copy = *d;
21196 	  d_copy.vmode = V4DFmode;
21197 	  if (d->testing_p)
21198 	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
21199 	  else
21200 	    d_copy.target = gen_reg_rtx (V4DFmode);
21201 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
21202 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
21203 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21204 	    {
21205 	      if (!d->testing_p)
21206 		emit_move_insn (d->target,
21207 				gen_lowpart (V4DImode, d_copy.target));
21208 	      return true;
21209 	    }
21210 	  return false;
21211 	}
21212 
21213       if (d->testing_p)
21214 	break;
21215 
21216       t1 = gen_reg_rtx (V4DImode);
21217       t2 = gen_reg_rtx (V4DImode);
21218 
21219       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
21220       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
21221       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
21222 
21223       /* Now a vpunpck[lh]qdq will produce the result required.  */
21224       if (odd)
21225 	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
21226       else
21227 	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
21228       emit_insn (t3);
21229       break;
21230 
21231     case E_V8SImode:
21232       if (!TARGET_AVX2)
21233 	{
21234 	  struct expand_vec_perm_d d_copy = *d;
21235 	  d_copy.vmode = V8SFmode;
21236 	  if (d->testing_p)
21237 	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
21238 	  else
21239 	    d_copy.target = gen_reg_rtx (V8SFmode);
21240 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
21241 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
21242 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21243 	    {
21244 	      if (!d->testing_p)
21245 		emit_move_insn (d->target,
21246 				gen_lowpart (V8SImode, d_copy.target));
21247 	      return true;
21248 	    }
21249 	  return false;
21250 	}
21251 
21252       if (d->testing_p)
21253 	break;
21254 
21255       t1 = gen_reg_rtx (V8SImode);
21256       t2 = gen_reg_rtx (V8SImode);
21257       t3 = gen_reg_rtx (V4DImode);
21258       t4 = gen_reg_rtx (V4DImode);
21259       t5 = gen_reg_rtx (V4DImode);
21260 
21261       /* Shuffle the lanes around into
21262 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
21263       emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
21264 				    gen_lowpart (V4DImode, d->op1),
21265 				    GEN_INT (0x20)));
21266       emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
21267 				    gen_lowpart (V4DImode, d->op1),
21268 				    GEN_INT (0x31)));
21269 
21270       /* Swap the 2nd and 3rd position in each lane into
21271 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
21272       emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
21273 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21274       emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
21275 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21276 
21277       /* Now a vpunpck[lh]qdq will produce
21278 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
21279       if (odd)
21280 	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
21281 					   gen_lowpart (V4DImode, t2));
21282       else
21283 	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
21284 					  gen_lowpart (V4DImode, t2));
21285       emit_insn (t3);
21286       emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
21287       break;
21288 
21289     default:
21290       gcc_unreachable ();
21291     }
21292 
21293   return true;
21294 }
21295 
21296 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
21297    extract-even and extract-odd permutations.  */
21298 
21299 static bool
21300 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
21301 {
21302   unsigned i, odd, nelt = d->nelt;
21303 
21304   odd = d->perm[0];
21305   if (odd != 0 && odd != 1)
21306     return false;
21307 
21308   for (i = 1; i < nelt; ++i)
21309     if (d->perm[i] != 2 * i + odd)
21310       return false;
21311 
21312   if (d->vmode == E_V32HImode
21313       && d->testing_p
21314       && !TARGET_AVX512BW)
21315     return false;
21316 
21317   return expand_vec_perm_even_odd_1 (d, odd);
21318 }
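
/* An illustrative standalone sketch (hypothetical helper) of the pattern
   matched above: extract-even is { 0, 2, 4, ... } and extract-odd is
   { 1, 3, 5, ... } over the 2*nelt concatenated input elements.  */
#if 0
#include <stdbool.h>

static bool
sketch_is_extract_even_odd (const unsigned *perm, unsigned nelt,
                            unsigned *odd_out)
{
  unsigned odd = perm[0];
  if (odd != 0 && odd != 1)
    return false;
  for (unsigned i = 1; i < nelt; ++i)
    if (perm[i] != 2 * i + odd)
      return false;
  *odd_out = odd;
  return true;
}
#endif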
21319 
21320 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
21321    permutations.  We assume that expand_vec_perm_1 has already failed.  */
21322 
21323 static bool
21324 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
21325 {
21326   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
21327   machine_mode vmode = d->vmode;
21328   rtx (*gen) (rtx, rtx, rtx);
21329   unsigned char perm2[4];
21330   rtx op0 = d->op0, dest;
21331   bool ok;
21332 
21333   switch (vmode)
21334     {
21335     case E_V4DFmode:
21336     case E_V8SFmode:
21337       /* These are special-cased in sse.md so that we can optionally
21338 	 use the vbroadcast instruction.  They expand to two insns
21339 	 if the input happens to be in a register.  */
21340       gcc_unreachable ();
21341 
21342     case E_V2DFmode:
21343     case E_V2SFmode:
21344     case E_V4SFmode:
21345     case E_V2DImode:
21346     case E_V2SImode:
21347     case E_V4SImode:
21348     case E_V2HImode:
21349     case E_V4HImode:
21350       /* These are always implementable using standard shuffle patterns.  */
21351       gcc_unreachable ();
21352 
21353     case E_V4QImode:
21354       /* This can be implemented via interleave and pshuflw.  */
21355       if (d->testing_p)
21356 	return true;
21357 
21358       if (elt >= nelt2)
21359 	{
21360 	  gen = gen_mmx_punpckhbw_low;
21361 	  elt -= nelt2;
21362 	}
21363       else
21364 	gen = gen_mmx_punpcklbw_low;
21365 
21366       dest = gen_reg_rtx (vmode);
21367       emit_insn (gen (dest, op0, op0));
21368       vmode = get_mode_wider_vector (vmode);
21369       op0 = gen_lowpart (vmode, dest);
21370 
21371       memset (perm2, elt, 2);
21372       dest = gen_reg_rtx (vmode);
21373       ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21374       gcc_assert (ok);
21375 
21376       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21377       return true;
21378 
21379     case E_V8QImode:
21380       /* This can be implemented via interleave.  We save one insn by
21381 	 stopping once we have promoted to V2SImode and then use pshufd.  */
21382       if (d->testing_p)
21383 	return true;
21384       do
21385 	{
21386 	  if (elt >= nelt2)
21387 	    {
21388 	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
21389 				      : gen_mmx_punpckhwd;
21390 	      elt -= nelt2;
21391 	    }
21392 	  else
21393 	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
21394 				    : gen_mmx_punpcklwd;
21395 	  nelt2 /= 2;
21396 
21397 	  dest = gen_reg_rtx (vmode);
21398 	  emit_insn (gen (dest, op0, op0));
21399 	  vmode = get_mode_wider_vector (vmode);
21400 	  op0 = gen_lowpart (vmode, dest);
21401 	}
21402       while (vmode != V2SImode);
21403 
21404       memset (perm2, elt, 2);
21405       dest = gen_reg_rtx (vmode);
21406       ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21407       gcc_assert (ok);
21408 
21409       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21410       return true;
21411 
21412     case E_V8HImode:
21413     case E_V16QImode:
21414       /* These can be implemented via interleave.  We save one insn by
21415 	 stopping once we have promoted to V4SImode and then use pshufd.  */
21416       if (d->testing_p)
21417 	return true;
21418       do
21419 	{
21420 	  if (elt >= nelt2)
21421 	    {
21422 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
21423 				       : gen_vec_interleave_highv8hi;
21424 	      elt -= nelt2;
21425 	    }
21426 	  else
21427 	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
21428 				     : gen_vec_interleave_lowv8hi;
21429 	  nelt2 /= 2;
21430 
21431 	  dest = gen_reg_rtx (vmode);
21432 	  emit_insn (gen (dest, op0, op0));
21433 	  vmode = get_mode_wider_vector (vmode);
21434 	  op0 = gen_lowpart (vmode, dest);
21435 	}
21436       while (vmode != V4SImode);
21437 
21438       memset (perm2, elt, 4);
21439       dest = gen_reg_rtx (vmode);
21440       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
21441       gcc_assert (ok);
21442 
21443       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21444       return true;
21445 
21446     case E_V32QImode:
21447     case E_V16HImode:
21448     case E_V8SImode:
21449     case E_V4DImode:
21450       /* For AVX2 broadcasts of the first element vpbroadcast* or
21451 	 vpermq should be used by expand_vec_perm_1.  */
21452       gcc_assert (!TARGET_AVX2 || d->perm[0]);
21453       return false;
21454 
21455     case E_V64QImode:
21456       gcc_assert (!TARGET_AVX512BW || d->perm[0]);
21457       return false;
21458 
21459     case E_V32HImode:
21460       gcc_assert (!TARGET_AVX512BW);
21461       return false;
21462 
21463     default:
21464       gcc_unreachable ();
21465     }
21466 }
21467 
21468 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
21469    broadcast permutations.  */
21470 
21471 static bool
21472 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
21473 {
21474   unsigned i, elt, nelt = d->nelt;
21475 
21476   if (!d->one_operand_p)
21477     return false;
21478 
21479   elt = d->perm[0];
21480   for (i = 1; i < nelt; ++i)
21481     if (d->perm[i] != elt)
21482       return false;
21483 
21484   return expand_vec_perm_broadcast_1 (d);
21485 }
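
/* An illustrative standalone sketch (hypothetical helper) of the check
   above: a broadcast permutation replicates one source element into every
   output position.  */
#if 0
#include <stdbool.h>

static bool
sketch_is_broadcast (const unsigned *perm, unsigned nelt, unsigned *elt_out)
{
  for (unsigned i = 1; i < nelt; ++i)
    if (perm[i] != perm[0])
      return false;
  *elt_out = perm[0];
  return true;
}
#endif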
21486 
21487 /* Implement arbitrary permutations of two V64QImode operands
21488    with two vperm[it]2w, two vpshufb and one vpor instruction.  */
21489 static bool
21490 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
21491 {
21492   if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
21493     return false;
21494 
21495   if (d->testing_p)
21496     return true;
21497 
21498   struct expand_vec_perm_d ds[2];
21499   rtx rperm[128], vperm, target0, target1;
21500   unsigned int i, nelt;
21501   machine_mode vmode;
21502 
21503   nelt = d->nelt;
21504   vmode = V64QImode;
21505 
21506   for (i = 0; i < 2; i++)
21507     {
21508       ds[i] = *d;
21509       ds[i].vmode = V32HImode;
21510       ds[i].nelt = 32;
21511       ds[i].target = gen_reg_rtx (V32HImode);
21512       ds[i].op0 = gen_lowpart (V32HImode, d->op0);
21513       ds[i].op1 = gen_lowpart (V32HImode, d->op1);
21514     }
21515 
21516   /* Prepare permutations such that the first one takes care of
21517      putting the even bytes into the right positions or one position
21518      higher (ds[0]) and the second one takes care of
21519      putting the odd bytes into the right positions or one position
21520      lower (ds[1]).  */
21521 
21522   for (i = 0; i < nelt; i++)
21523     {
21524       ds[i & 1].perm[i / 2] = d->perm[i] / 2;
21525       if (i & 1)
21526 	{
21527 	  rperm[i] = constm1_rtx;
21528 	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21529 	}
21530       else
21531 	{
21532 	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21533 	  rperm[i + 64] = constm1_rtx;
21534 	}
21535     }
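  /* The two vperm[it]2w expansions below place into word J of ds[0].target
     the source word containing destination byte 2*J, and likewise ds[1]
     for the odd destination bytes.  The vpshufb masks built above then
     pick, within each 16-byte lane, the low or high byte of that word
     ((i & 14) is the even in-lane position, d->perm[i] & 1 selects the
     byte) and zero the remaining bytes, so a final vpor merges the two
     results.  */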
21536 
21537   bool ok = expand_vec_perm_1 (&ds[0]);
21538   gcc_assert (ok);
21539   ds[0].target = gen_lowpart (V64QImode, ds[0].target);
21540 
21541   ok = expand_vec_perm_1 (&ds[1]);
21542   gcc_assert (ok);
21543   ds[1].target = gen_lowpart (V64QImode, ds[1].target);
21544 
21545   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
21546   vperm = force_reg (vmode, vperm);
21547   target0 = gen_reg_rtx (V64QImode);
21548   emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
21549 
21550   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
21551   vperm = force_reg (vmode, vperm);
21552   target1 = gen_reg_rtx (V64QImode);
21553   emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
21554 
21555   emit_insn (gen_iorv64qi3 (d->target, target0, target1));
21556   return true;
21557 }
21558 
21559 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
21560    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
21561    all the shorter instruction sequences.  */
21562 
21563 static bool
21564 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
21565 {
21566   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
21567   unsigned int i, nelt, eltsz;
21568   bool used[4];
21569 
21570   if (!TARGET_AVX2
21571       || d->one_operand_p
21572       || (d->vmode != V32QImode && d->vmode != V16HImode))
21573     return false;
21574 
21575   if (d->testing_p)
21576     return true;
21577 
21578   nelt = d->nelt;
21579   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21580 
21581   /* Generate 4 permutation masks.  If the required element is within
21582      the same lane, it is shuffled in.  If the required element is from
21583      the other lane, force a zero by setting bit 7 in the permutation mask.
21584      The other mask has non-negative elements only where the element is
21585      requested from the other lane; such elements are also moved to the
21586      other lane, so that the result of vpshufb can have its two V2TImode
21587      halves swapped.  */
21588   m128 = GEN_INT (-128);
21589   for (i = 0; i < 32; ++i)
21590     {
21591       rperm[0][i] = m128;
21592       rperm[1][i] = m128;
21593       rperm[2][i] = m128;
21594       rperm[3][i] = m128;
21595     }
21596   used[0] = false;
21597   used[1] = false;
21598   used[2] = false;
21599   used[3] = false;
21600   for (i = 0; i < nelt; ++i)
21601     {
21602       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21603       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21604       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
21605 
21606       for (j = 0; j < eltsz; ++j)
21607 	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
21608       used[which] = true;
21609     }
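  /* rperm[0] and rperm[2] handle bytes that stay within their 128-bit
     lane (taken from op0 and op1 respectively); rperm[1] and rperm[3]
     gather cross-lane bytes into the opposite lane, so that the vpermq
     below, which swaps the two 128-bit halves, moves them into place.  */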
21610 
21611   for (i = 0; i < 2; ++i)
21612     {
21613       if (!used[2 * i + 1])
21614 	{
21615 	  h[i] = NULL_RTX;
21616 	  continue;
21617 	}
21618       vperm = gen_rtx_CONST_VECTOR (V32QImode,
21619 				    gen_rtvec_v (32, rperm[2 * i + 1]));
21620       vperm = force_reg (V32QImode, vperm);
21621       h[i] = gen_reg_rtx (V32QImode);
21622       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
21623       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
21624     }
21625 
21626   /* Swap the 128-bit lanes of h[X].  */
21627   for (i = 0; i < 2; ++i)
21628    {
21629      if (h[i] == NULL_RTX)
21630        continue;
21631      op = gen_reg_rtx (V4DImode);
21632      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
21633 				     const2_rtx, GEN_INT (3), const0_rtx,
21634 				     const1_rtx));
21635      h[i] = gen_lowpart (V32QImode, op);
21636    }
21637 
21638   for (i = 0; i < 2; ++i)
21639     {
21640       if (!used[2 * i])
21641 	{
21642 	  l[i] = NULL_RTX;
21643 	  continue;
21644 	}
21645       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
21646       vperm = force_reg (V32QImode, vperm);
21647       l[i] = gen_reg_rtx (V32QImode);
21648       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
21649       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
21650     }
21651 
21652   for (i = 0; i < 2; ++i)
21653     {
21654       if (h[i] && l[i])
21655 	{
21656 	  op = gen_reg_rtx (V32QImode);
21657 	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
21658 	  l[i] = op;
21659 	}
21660       else if (h[i])
21661 	l[i] = h[i];
21662     }
21663 
21664   gcc_assert (l[0] && l[1]);
21665   op = d->target;
21666   if (d->vmode != V32QImode)
21667     op = gen_reg_rtx (V32QImode);
21668   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
21669   if (op != d->target)
21670     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21671   return true;
21672 }
21673 
21674 /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
21675    taken care of, perform the expansion in D and return true on success.  */
21676 
21677 static bool
21678 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
21679 {
21680   /* Try a single instruction expansion.  */
21681   if (expand_vec_perm_1 (d))
21682     return true;
21683 
21684   /* Try sequences of two instructions.  */
21685 
21686   if (expand_vec_perm_pshuflw_pshufhw (d))
21687     return true;
21688 
21689   if (expand_vec_perm_palignr (d, false))
21690     return true;
21691 
21692   if (expand_vec_perm_interleave2 (d))
21693     return true;
21694 
21695   if (expand_vec_perm_broadcast (d))
21696     return true;
21697 
21698   if (expand_vec_perm_vpermq_perm_1 (d))
21699     return true;
21700 
21701   if (expand_vec_perm_vperm2f128 (d))
21702     return true;
21703 
21704   if (expand_vec_perm_pblendv (d))
21705     return true;
21706 
21707   if (expand_vec_perm_2perm_interleave (d, true))
21708     return true;
21709 
21710   if (expand_vec_perm_2perm_pblendv (d, true))
21711     return true;
21712 
21713   /* Try sequences of three instructions.  */
21714 
21715   if (expand_vec_perm_even_odd_pack (d))
21716     return true;
21717 
21718   if (expand_vec_perm_2vperm2f128_vshuf (d))
21719     return true;
21720 
21721   if (expand_vec_perm_pshufb2 (d))
21722     return true;
21723 
21724   if (expand_vec_perm_interleave3 (d))
21725     return true;
21726 
21727   if (expand_vec_perm_vperm2f128_vblend (d))
21728     return true;
21729 
21730   if (expand_vec_perm_2perm_interleave (d, false))
21731     return true;
21732 
21733   if (expand_vec_perm_2perm_pblendv (d, false))
21734     return true;
21735 
21736   /* Try sequences of four instructions.  */
21737 
21738   if (expand_vec_perm_even_odd_trunc (d))
21739     return true;
21740   if (expand_vec_perm_vpshufb2_vpermq (d))
21741     return true;
21742 
21743   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
21744     return true;
21745 
21746   if (expand_vec_perm_vpermt2_vpshub2 (d))
21747     return true;
21748 
21749   /* ??? Look for narrow permutations whose element orderings would
21750      allow the promotion to a wider mode.  */
21751 
21752   /* ??? Look for sequences of interleave or a wider permute that place
21753      the data into the correct lanes for a half-vector shuffle like
21754      pshuf[lh]w or vpermilps.  */
21755 
21756   /* ??? Look for sequences of interleave that produce the desired results.
21757      The combinatorics of punpck[lh] get pretty ugly... */
21758 
21759   if (expand_vec_perm_even_odd (d))
21760     return true;
21761 
21762   /* Even longer sequences.  */
21763   if (expand_vec_perm_vpshufb4_vpermq2 (d))
21764     return true;
21765 
21766   /* See if we can get the same permutation in different vector integer
21767      mode.  */
21768   struct expand_vec_perm_d nd;
21769   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
21770     {
21771       if (!d->testing_p)
21772 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
21773       return true;
21774     }
21775 
21776   /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
21777   if (expand_vec_perm2_vperm2f128_vblend (d))
21778     return true;
21779 
21780   return false;
21781 }
21782 
21783 /* If a permutation only uses one operand, make it clear. Returns true
21784    if the permutation references both operands.  */
21785 
21786 static bool
21787 canonicalize_perm (struct expand_vec_perm_d *d)
21788 {
21789   int i, which, nelt = d->nelt;
21790 
21791   for (i = which = 0; i < nelt; ++i)
21792     which |= (d->perm[i] < nelt ? 1 : 2);
21793 
21794   d->one_operand_p = true;
21795   switch (which)
21796     {
21797     default:
21798       gcc_unreachable();
21799 
21800     case 3:
21801       if (!rtx_equal_p (d->op0, d->op1))
21802         {
21803 	  d->one_operand_p = false;
21804 	  break;
21805         }
21806       /* The elements of PERM do not suggest that only the first operand
21807 	 is used, but both operands are identical.  Allow easier matching
21808 	 of the permutation by folding the permutation into the single
21809 	 input vector.  */
21810       /* FALLTHRU */
21811 
21812     case 2:
21813       for (i = 0; i < nelt; ++i)
21814         d->perm[i] &= nelt - 1;
21815       d->op0 = d->op1;
21816       break;
21817 
21818     case 1:
21819       d->op1 = d->op0;
21820       break;
21821     }
21822 
21823   return (which == 3);
21824 }
21825 
21826 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
21827 
21828 bool
21829 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
21830 			       rtx op1, const vec_perm_indices &sel)
21831 {
21832   struct expand_vec_perm_d d;
21833   unsigned char perm[MAX_VECT_LEN];
21834   unsigned int i, nelt, which;
21835   bool two_args;
21836 
21837   /* For HF mode vector, convert it to HI using subreg.  */
21838   if (GET_MODE_INNER (vmode) == HFmode)
21839     {
21840       machine_mode orig_mode = vmode;
21841       vmode = mode_for_vector (HImode,
21842 			       GET_MODE_NUNITS (vmode)).require ();
21843       if (target)
21844 	target = lowpart_subreg (vmode, target, orig_mode);
21845       if (op0)
21846 	op0 = lowpart_subreg (vmode, op0, orig_mode);
21847       if (op1)
21848 	op1 = lowpart_subreg (vmode, op1, orig_mode);
21849     }
21850 
21851   d.target = target;
21852   d.op0 = op0;
21853   d.op1 = op1;
21854 
21855   d.vmode = vmode;
21856   gcc_assert (VECTOR_MODE_P (d.vmode));
21857   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
21858   d.testing_p = !target;
21859 
21860   gcc_assert (sel.length () == nelt);
21861   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
21862 
21863   /* Given sufficient ISA support we can just return true here
21864      for selected vector modes.  */
21865   switch (d.vmode)
21866     {
21867     case E_V16SFmode:
21868     case E_V16SImode:
21869     case E_V8DImode:
21870     case E_V8DFmode:
21871       if (!TARGET_AVX512F)
21872 	return false;
21873       /* All implementable with a single vperm[it]2 insn.  */
21874       if (d.testing_p)
21875 	return true;
21876       break;
21877     case E_V32HImode:
21878       if (!TARGET_AVX512F)
21879 	return false;
21880       if (d.testing_p && TARGET_AVX512BW)
21881 	/* All implementable with a single vperm[it]2 insn.  */
21882 	return true;
21883       break;
21884     case E_V64QImode:
21885       if (!TARGET_AVX512F)
21886 	return false;
21887       if (d.testing_p && TARGET_AVX512BW)
21888 	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
21889 	return true;
21890       break;
21891     case E_V8SImode:
21892     case E_V8SFmode:
21893     case E_V4DFmode:
21894     case E_V4DImode:
21895       if (!TARGET_AVX)
21896 	return false;
21897       if (d.testing_p && TARGET_AVX512VL)
21898 	/* All implementable with a single vperm[it]2 insn.  */
21899 	return true;
21900       break;
21901     case E_V16HImode:
21902       if (!TARGET_SSE2)
21903 	return false;
21904       if (d.testing_p && TARGET_AVX2)
21905 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
21906 	return true;
21907       break;
21908     case E_V32QImode:
21909       if (!TARGET_SSE2)
21910 	return false;
21911       if (d.testing_p && TARGET_AVX2)
21912 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
21913 	return true;
21914       break;
21915     case E_V8HImode:
21916     case E_V16QImode:
21917       if (!TARGET_SSE2)
21918 	return false;
21919       /* Fall through.  */
21920     case E_V4SImode:
21921     case E_V4SFmode:
21922       if (!TARGET_SSE)
21923 	return false;
21924       /* All implementable with a single vpperm insn.  */
21925       if (d.testing_p && TARGET_XOP)
21926 	return true;
21927       /* All implementable with 2 pshufb + 1 ior.  */
21928       if (d.testing_p && TARGET_SSSE3)
21929 	return true;
21930       break;
21931     case E_V2SFmode:
21932     case E_V2SImode:
21933     case E_V4HImode:
21934     case E_V8QImode:
21935       if (!TARGET_MMX_WITH_SSE)
21936 	return false;
21937       break;
21938     case E_V2HImode:
21939       if (!TARGET_SSE2)
21940 	return false;
21941       /* All implementable with *punpckwd.  */
21942       if (d.testing_p)
21943 	return true;
21944       break;
21945     case E_V4QImode:
21946       if (!TARGET_SSE2)
21947 	return false;
21948       break;
21949     case E_V2DImode:
21950     case E_V2DFmode:
21951       if (!TARGET_SSE)
21952 	return false;
21953       /* All implementable with shufpd or unpck[lh]pd.  */
21954       if (d.testing_p)
21955 	return true;
21956       break;
21957     default:
21958       return false;
21959     }
21960 
21961   for (i = which = 0; i < nelt; ++i)
21962     {
21963       unsigned char e = sel[i];
21964       gcc_assert (e < 2 * nelt);
21965       d.perm[i] = e;
21966       perm[i] = e;
21967       which |= (e < nelt ? 1 : 2);
21968     }
21969 
21970   if (d.testing_p)
21971     {
21972       /* For all elements from second vector, fold the elements to first.  */
21973       if (which == 2)
21974 	for (i = 0; i < nelt; ++i)
21975 	  d.perm[i] -= nelt;
21976 
21977       /* Check whether the mask can be applied to the vector type.  */
21978       d.one_operand_p = (which != 3);
21979 
21980       /* Implementable with shufps, pshufd or pshuflw.  */
21981       if (d.one_operand_p
21982 	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
21983 	      || d.vmode == V4SImode || d.vmode == V2SImode
21984 	      || d.vmode == V4HImode || d.vmode == V2HImode))
21985 	return true;
21986 
21987       /* Otherwise we have to go through the motions and see if we can
21988 	 figure out how to generate the requested permutation.  */
21989       d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
21990       d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
21991       if (!d.one_operand_p)
21992 	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
21993 
21994       start_sequence ();
21995       bool ret = ix86_expand_vec_perm_const_1 (&d);
21996       end_sequence ();
21997 
21998       return ret;
21999     }
22000 
22001   two_args = canonicalize_perm (&d);
22002 
22003   /* If one of the operands is a zero vector, try to match pmovzx.  */
22004   if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
22005     {
22006       struct expand_vec_perm_d dzero = d;
22007       if (d.op0 == CONST0_RTX (vmode))
22008 	{
22009 	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
22010 	  std::swap (dzero.op0, dzero.op1);
22011 	  for (i = 0; i < nelt; ++i)
22012 	    dzero.perm[i] ^= nelt;
22013 	}
22014       else
22015 	d.op0 = dzero.op0 = force_reg (vmode, d.op0);
22016 
22017       if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
22018 				  dzero.perm, nelt, dzero.testing_p))
22019 	return true;
22020     }
22021 
22022   /* Force operands into registers.  */
22023   rtx nop0 = force_reg (vmode, d.op0);
22024   if (d.op0 == d.op1)
22025     d.op1 = nop0;
22026   d.op0 = nop0;
22027   d.op1 = force_reg (vmode, d.op1);
22028 
22029   if (ix86_expand_vec_perm_const_1 (&d))
22030     return true;
22031 
22032   /* If the selector says both arguments are needed, but the operands are the
22033      same, the above tried to expand with one_operand_p and flattened selector.
22034      If that didn't work, retry without one_operand_p; we succeeded with that
22035      during testing.  */
22036   if (two_args && d.one_operand_p)
22037     {
22038       d.one_operand_p = false;
22039       memcpy (d.perm, perm, sizeof (perm));
22040       return ix86_expand_vec_perm_const_1 (&d);
22041     }
22042 
22043   return false;
22044 }
22045 
22046 void
22047 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22048 {
22049   struct expand_vec_perm_d d;
22050   unsigned i, nelt;
22051 
22052   d.target = targ;
22053   d.op0 = op0;
22054   d.op1 = op1;
22055   d.vmode = GET_MODE (targ);
22056   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22057   d.one_operand_p = false;
22058   d.testing_p = false;
22059 
22060   for (i = 0; i < nelt; ++i)
22061     d.perm[i] = i * 2 + odd;
22062 
22063   /* We'll either be able to implement the permutation directly...  */
22064   if (expand_vec_perm_1 (&d))
22065     return;
22066 
22067   /* ... or we use the special-case patterns.  */
22068   expand_vec_perm_even_odd_1 (&d, odd);
22069 }
22070 
22071 static void
22072 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22073 {
22074   struct expand_vec_perm_d d;
22075   unsigned i, nelt, base;
22076   bool ok;
22077 
22078   d.target = targ;
22079   d.op0 = op0;
22080   d.op1 = op1;
22081   d.vmode = GET_MODE (targ);
22082   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22083   d.one_operand_p = false;
22084   d.testing_p = false;
22085 
22086   base = high_p ? nelt / 2 : 0;
22087   for (i = 0; i < nelt / 2; ++i)
22088     {
22089       d.perm[i * 2] = i + base;
22090       d.perm[i * 2 + 1] = i + base + nelt;
22091     }
22092 
22093   /* Note that for AVX this isn't one instruction.  */
22094   ok = ix86_expand_vec_perm_const_1 (&d);
22095   gcc_assert (ok);
22096 }
22097 
22098 /* This function is similar to ix86_expand_vecop_qihi,
22099    but optimized under AVX512BW by using vpmovwb.
22100    For example, vector MUL generation is optimized to
22101 
22102    vpmovzxbw ymm2, xmm0
22103    vpmovzxbw ymm3, xmm1
22104    vpmullw   ymm4, ymm2, ymm3
22105    vpmovwb   xmm0, ymm4
22106 
22107    which takes fewer instructions than ix86_expand_vecop_qihi.
22108    Return true on success.  */
22109 
22110 static bool
22111 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22112 {
22113   machine_mode himode, qimode = GET_MODE (dest);
22114   rtx hop1, hop2, hdest;
22115   rtx (*gen_extend)(rtx, rtx);
22116   rtx (*gen_truncate)(rtx, rtx);
22117   bool uns_p = (code == ASHIFTRT) ? false : true;
22118 
22119   /* There is no V64HImode to widen V64QImode into.  */
22120   if (qimode == E_V64QImode)
22121     return false;
22122 
22123   /* vpmovwb is only available under AVX512BW.  */
22124   if (!TARGET_AVX512BW)
22125     return false;
22126   if ((qimode == V8QImode || qimode == V16QImode)
22127       && !TARGET_AVX512VL)
22128     return false;
22129   /* Do not generate zmm instructions when 128/256-bit vector width is preferred.  */
22130   if (qimode == V32QImode
22131       && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
22132     return false;
22133 
22134   switch (qimode)
22135     {
22136     case E_V8QImode:
22137       himode = V8HImode;
22138       gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
22139       gen_truncate = gen_truncv8hiv8qi2;
22140       break;
22141     case E_V16QImode:
22142       himode = V16HImode;
22143       gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
22144       gen_truncate = gen_truncv16hiv16qi2;
22145       break;
22146     case E_V32QImode:
22147       himode = V32HImode;
22148       gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
22149       gen_truncate = gen_truncv32hiv32qi2;
22150       break;
22151     default:
22152       gcc_unreachable ();
22153     }
22154 
22155   hop1 = gen_reg_rtx (himode);
22156   hop2 = gen_reg_rtx (himode);
22157   hdest = gen_reg_rtx (himode);
22158   emit_insn (gen_extend (hop1, op1));
22159   emit_insn (gen_extend (hop2, op2));
22160   emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
22161 						      hop1, hop2)));
22162   emit_insn (gen_truncate (dest, hdest));
22163   return true;
22164 }
22165 
22166 /* Expand a vector shift by a constant for V*QImode in terms of the
22167    same operation on V*HImode.  Return true on success.  */
22168 static bool
22169 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
22170 				     rtx dest, rtx op1, rtx op2)
22171 {
22172   machine_mode qimode, himode;
22173   HOST_WIDE_INT and_constant, xor_constant;
22174   HOST_WIDE_INT shift_amount;
22175   rtx vec_const_and, vec_const_xor;
22176   rtx tmp, op1_subreg;
22177   rtx (*gen_shift) (rtx, rtx, rtx);
22178   rtx (*gen_and) (rtx, rtx, rtx);
22179   rtx (*gen_xor) (rtx, rtx, rtx);
22180   rtx (*gen_sub) (rtx, rtx, rtx);
22181 
22182   /* Only optimize shift by constant.  */
22183   if (!CONST_INT_P (op2))
22184     return false;
22185 
22186   qimode = GET_MODE (dest);
22187   shift_amount = INTVAL (op2);
22188   /* Do nothing when the shift amount is greater than or equal to 8.  */
22189   if (shift_amount > 7)
22190     return false;
22191 
22192   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
22193   /* Record sign bit.  */
22194   xor_constant = 1 << (8 - shift_amount - 1);
22195 
22196   /* Mask to zero the upper/lower bits shifted in from the neighboring element.  */
22197   and_constant
22198     = (code == ASHIFT ? 256 - (1 << shift_amount)
22199        : (1 << (8 - shift_amount)) - 1);
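  /* For ASHIFTRT the sign is restored afterwards via (T ^ M) - M, where
     M is xor_constant, the bit position the sign bit lands in after the
     logical shift.  E.g. for shift_amount 2 and the byte 0x80 (-128):
     the masked logical shift gives 0x20, then 0x20 ^ 0x20 = 0 and
     0 - 0x20 = 0xe0 = -32, which is -128 >> 2.  */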
22200 
22201   switch (qimode)
22202     {
22203     case V16QImode:
22204       himode = V8HImode;
22205       gen_shift =
22206 	((code == ASHIFT)
22207 	 ? gen_ashlv8hi3
22208 	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
22209       gen_and = gen_andv16qi3;
22210       gen_xor = gen_xorv16qi3;
22211       gen_sub = gen_subv16qi3;
22212       break;
22213     case V32QImode:
22214       himode = V16HImode;
22215       gen_shift =
22216 	((code == ASHIFT)
22217 	 ? gen_ashlv16hi3
22218 	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
22219       gen_and = gen_andv32qi3;
22220       gen_xor = gen_xorv32qi3;
22221       gen_sub = gen_subv32qi3;
22222       break;
22223     case V64QImode:
22224       himode = V32HImode;
22225       gen_shift =
22226 	((code == ASHIFT)
22227 	 ? gen_ashlv32hi3
22228 	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
22229       gen_and = gen_andv64qi3;
22230       gen_xor = gen_xorv64qi3;
22231       gen_sub = gen_subv64qi3;
22232       break;
22233     default:
22234       gcc_unreachable ();
22235     }
22236 
22237   tmp = gen_reg_rtx (himode);
22238   vec_const_and = gen_reg_rtx (qimode);
22239   op1_subreg = lowpart_subreg (himode, op1, qimode);
22240 
22241   /* For ASHIFT and LSHIFTRT, perform operation like
22242      vpsllw/vpsrlw $shift_amount, %op1, %dest.
22243      vpand %vec_const_and, %dest.  */
22244   emit_insn (gen_shift (tmp, op1_subreg, op2));
22245   emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
22246   emit_move_insn (vec_const_and,
22247 		  ix86_build_const_vector (qimode, true,
22248 					   gen_int_mode (and_constant, QImode)));
22249   emit_insn (gen_and (dest, dest, vec_const_and));
22250 
22251   /* For ASHIFTRT, perform extra operation like
22252      vpxor %vec_const_xor, %dest, %dest
22253      vpsubb %vec_const_xor, %dest, %dest  */
22254   if (code == ASHIFTRT)
22255     {
22256       vec_const_xor = gen_reg_rtx (qimode);
22257       emit_move_insn (vec_const_xor,
22258 		      ix86_build_const_vector (qimode, true,
22259 					       gen_int_mode (xor_constant, QImode)));
22260       emit_insn (gen_xor (dest, dest, vec_const_xor));
22261       emit_insn (gen_sub (dest, dest, vec_const_xor));
22262     }
22263   return true;
22264 }
22265 
22266 /* Expand a vector operation CODE for a V*QImode in terms of the
22267    same operation on V*HImode.  */
22268 
22269 void
22270 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22271 {
22272   machine_mode qimode = GET_MODE (dest);
22273   machine_mode himode;
22274   rtx (*gen_il) (rtx, rtx, rtx);
22275   rtx (*gen_ih) (rtx, rtx, rtx);
22276   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
22277   struct expand_vec_perm_d d;
22278   bool ok, full_interleave;
22279   bool uns_p = false;
22280   int i;
22281 
22282   if (CONST_INT_P (op2)
22283       && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
22284       && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
22285     return;
22286 
22287   if (TARGET_AVX512BW
22288       && VECTOR_MODE_P (GET_MODE (op2))
22289       && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
22290     return;
22291 
22292   switch (qimode)
22293     {
22294     case E_V16QImode:
22295       himode = V8HImode;
22296       gen_il = gen_vec_interleave_lowv16qi;
22297       gen_ih = gen_vec_interleave_highv16qi;
22298       break;
22299     case E_V32QImode:
22300       himode = V16HImode;
22301       gen_il = gen_avx2_interleave_lowv32qi;
22302       gen_ih = gen_avx2_interleave_highv32qi;
22303       break;
22304     case E_V64QImode:
22305       himode = V32HImode;
22306       gen_il = gen_avx512bw_interleave_lowv64qi;
22307       gen_ih = gen_avx512bw_interleave_highv64qi;
22308       break;
22309     default:
22310       gcc_unreachable ();
22311     }
22312 
22313   switch (code)
22314     {
22315     case MULT:
22316       /* Unpack data such that we've got a source byte in each low byte of
22317 	 each word.  We don't care what goes into the high byte of each word.
22318 	 Rather than trying to get zero in there, it is most convenient to
22319 	 let it be a copy of the low byte.  */
22320       op2_l = gen_reg_rtx (qimode);
22321       op2_h = gen_reg_rtx (qimode);
22322       emit_insn (gen_il (op2_l, op2, op2));
22323       emit_insn (gen_ih (op2_h, op2, op2));
22324 
22325       op1_l = gen_reg_rtx (qimode);
22326       op1_h = gen_reg_rtx (qimode);
22327       emit_insn (gen_il (op1_l, op1, op1));
22328       emit_insn (gen_ih (op1_h, op1, op1));
22329       full_interleave = qimode == V16QImode;
22330       break;
22331 
22332     case ASHIFT:
22333     case LSHIFTRT:
22334       uns_p = true;
22335       /* FALLTHRU */
22336     case ASHIFTRT:
22337       op1_l = gen_reg_rtx (himode);
22338       op1_h = gen_reg_rtx (himode);
22339       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
22340       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
22341       /* vashr/vlshr/vashl  */
22342       if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22343 	{
22344 	  rtx tmp = force_reg (qimode, op2);
22345 	  op2_l = gen_reg_rtx (himode);
22346 	  op2_h = gen_reg_rtx (himode);
22347 	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
22348 	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
22349 	}
22350       else
22351 	op2_l = op2_h = op2;
22352 
22353       full_interleave = true;
22354       break;
22355     default:
22356       gcc_unreachable ();
22357     }
22358 
22359   /* Perform vashr/vlshr/vashl.  */
22360   if (code != MULT
22361       && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22362     {
22363       res_l = gen_reg_rtx (himode);
22364       res_h = gen_reg_rtx (himode);
22365       emit_insn (gen_rtx_SET (res_l,
22366 			      simplify_gen_binary (code, himode,
22367 						   op1_l, op2_l)));
22368       emit_insn (gen_rtx_SET (res_h,
22369 			      simplify_gen_binary (code, himode,
22370 						   op1_h, op2_h)));
22371     }
22372   /* Perform mult/ashr/lshr/ashl.  */
22373   else
22374     {
22375       res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
22376 				   1, OPTAB_DIRECT);
22377       res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
22378 				   1, OPTAB_DIRECT);
22379     }
22380 
22381   gcc_assert (res_l && res_h);
22382 
22383   /* Merge the data back into the right place.  */
22384   d.target = dest;
22385   d.op0 = gen_lowpart (qimode, res_l);
22386   d.op1 = gen_lowpart (qimode, res_h);
22387   d.vmode = qimode;
22388   d.nelt = GET_MODE_NUNITS (qimode);
22389   d.one_operand_p = false;
22390   d.testing_p = false;
22391 
22392   if (full_interleave)
22393     {
22394       /* For SSE2, we used a full interleave, so the desired
22395 	 results are in the even elements.  */
22396       for (i = 0; i < d.nelt; ++i)
22397 	d.perm[i] = i * 2;
22398     }
22399   else
22400     {
22401       /* For AVX, the interleave used above was not cross-lane.  So the
22402 	 extraction is evens but with the second and third quarter swapped.
22403 	 Happily, that is even one insn shorter than even extraction.
22404 	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
22405 	 always first from the first and then from the second source operand;
22406 	 the index bits above the low 4 bits remain the same.
22407 	 Thus, for d.nelt == 32 we want permutation
22408 	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
22409 	 and for d.nelt == 64 we want permutation
22410 	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
22411 	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
22412       for (i = 0; i < d.nelt; ++i)
22413 	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
22414     }
22415 
22416   ok = ix86_expand_vec_perm_const_1 (&d);
22417   gcc_assert (ok);
22418 
22419   set_unique_reg_note (get_last_insn (), REG_EQUAL,
22420 		       gen_rtx_fmt_ee (code, qimode, op1, op2));
22421 }
22422 
22423 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
22424    if op is CONST_VECTOR with all odd elements equal to their
22425    preceding element.  */
22426 
22427 static bool
22428 const_vector_equal_evenodd_p (rtx op)
22429 {
22430   machine_mode mode = GET_MODE (op);
22431   int i, nunits = GET_MODE_NUNITS (mode);
22432   if (GET_CODE (op) != CONST_VECTOR
22433       || nunits != CONST_VECTOR_NUNITS (op))
22434     return false;
22435   for (i = 0; i < nunits; i += 2)
22436     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
22437       return false;
22438   return true;
22439 }
22440 
22441 void
22442 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
22443 			       bool uns_p, bool odd_p)
22444 {
22445   machine_mode mode = GET_MODE (op1);
22446   machine_mode wmode = GET_MODE (dest);
22447   rtx x;
22448   rtx orig_op1 = op1, orig_op2 = op2;
22449 
22450   if (!nonimmediate_operand (op1, mode))
22451     op1 = force_reg (mode, op1);
22452   if (!nonimmediate_operand (op2, mode))
22453     op2 = force_reg (mode, op2);
22454 
22455   /* We only play even/odd games with vectors of SImode.  */
22456   gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
22457 
22458   /* If we're looking for the odd results, shift those members down to
22459      the even slots.  For some cpus this is faster than a PSHUFD.  */
22460   if (odd_p)
22461     {
22462       /* For XOP use vpmacsdqh, but only for smult, as it is only
22463 	 signed.  */
22464       if (TARGET_XOP && mode == V4SImode && !uns_p)
22465 	{
22466 	  x = force_reg (wmode, CONST0_RTX (wmode));
22467 	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
22468 	  return;
22469 	}
22470 
22471       x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
22472       if (!const_vector_equal_evenodd_p (orig_op1))
22473 	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
22474 			    x, NULL, 1, OPTAB_DIRECT);
22475       if (!const_vector_equal_evenodd_p (orig_op2))
22476 	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
22477 			    x, NULL, 1, OPTAB_DIRECT);
22478       op1 = gen_lowpart (mode, op1);
22479       op2 = gen_lowpart (mode, op2);
22480     }
22481 
22482   if (mode == V16SImode)
22483     {
22484       if (uns_p)
22485 	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
22486       else
22487 	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
22488     }
22489   else if (mode == V8SImode)
22490     {
22491       if (uns_p)
22492 	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
22493       else
22494 	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
22495     }
22496   else if (uns_p)
22497     x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
22498   else if (TARGET_SSE4_1)
22499     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
22500   else
22501     {
22502       rtx s1, s2, t0, t1, t2;
22503 
22504       /* The easiest way to implement this without PMULDQ is to go through
22505 	 the motions as if we are performing a full 64-bit multiply, except
22506 	 that we need to do less shuffling of the elements.  */
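      /* Writing A = hi(A)*2^32 + lo(A) with hi(A) equal to 0 or -1 (the
	 sign extension), the low 64 bits of the signed product are
	 lo(A)*lo(B) + ((hi(A)*lo(B) + lo(A)*hi(B)) << 32); the widening
	 unsigned even multiplies below provide each of these terms.  */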
22507 
22508       /* Compute the sign-extension, aka highparts, of the two operands.  */
22509       s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22510 				op1, pc_rtx, pc_rtx);
22511       s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22512 				op2, pc_rtx, pc_rtx);
22513 
22514       /* Multiply LO(A) * HI(B), and vice-versa.  */
22515       t1 = gen_reg_rtx (wmode);
22516       t2 = gen_reg_rtx (wmode);
22517       emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
22518       emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
22519 
22520       /* Multiply LO(A) * LO(B).  */
22521       t0 = gen_reg_rtx (wmode);
22522       emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
22523 
22524       /* Combine and shift the highparts into place.  */
22525       t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
22526       t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
22527 			 1, OPTAB_DIRECT);
22528 
22529       /* Combine high and low parts.  */
22530       force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
22531       return;
22532     }
22533   emit_insn (x);
22534 }
22535 
22536 void
22537 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
22538 			    bool uns_p, bool high_p)
22539 {
22540   machine_mode wmode = GET_MODE (dest);
22541   machine_mode mode = GET_MODE (op1);
22542   rtx t1, t2, t3, t4, mask;
22543 
22544   switch (mode)
22545     {
22546     case E_V4SImode:
22547       t1 = gen_reg_rtx (mode);
22548       t2 = gen_reg_rtx (mode);
22549       if (TARGET_XOP && !uns_p)
22550 	{
22551 	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
22552 	     shuffle the elements once so that all elements are in the right
22553 	     place for immediate use: { A C B D }.  */
22554 	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
22555 					const1_rtx, GEN_INT (3)));
22556 	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
22557 					const1_rtx, GEN_INT (3)));
22558 	}
22559       else
22560 	{
22561 	  /* Put the elements into place for the multiply.  */
22562 	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
22563 	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
22564 	  high_p = false;
22565 	}
22566       ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
22567       break;
22568 
22569     case E_V8SImode:
22570       /* Shuffle the elements between the lanes.  After this we
22571 	 have { A B E F | C D G H } for each operand.  */
22572       t1 = gen_reg_rtx (V4DImode);
22573       t2 = gen_reg_rtx (V4DImode);
22574       emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
22575 				      const0_rtx, const2_rtx,
22576 				      const1_rtx, GEN_INT (3)));
22577       emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
22578 				      const0_rtx, const2_rtx,
22579 				      const1_rtx, GEN_INT (3)));
22580 
22581       /* Shuffle the elements within the lanes.  After this we
22582 	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
22583       t3 = gen_reg_rtx (V8SImode);
22584       t4 = gen_reg_rtx (V8SImode);
22585       mask = GEN_INT (high_p
22586 		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
22587 		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
22588       emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
22589       emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
22590 
22591       ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
22592       break;
22593 
22594     case E_V8HImode:
22595     case E_V16HImode:
22596       t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
22597 			 uns_p, OPTAB_DIRECT);
22598       t2 = expand_binop (mode,
22599 			 uns_p ? umul_highpart_optab : smul_highpart_optab,
22600 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
22601       gcc_assert (t1 && t2);
22602 
22603       t3 = gen_reg_rtx (mode);
22604       ix86_expand_vec_interleave (t3, t1, t2, high_p);
22605       emit_move_insn (dest, gen_lowpart (wmode, t3));
22606       break;
22607 
22608     case E_V16QImode:
22609     case E_V32QImode:
22610     case E_V32HImode:
22611     case E_V16SImode:
22612     case E_V64QImode:
22613       t1 = gen_reg_rtx (wmode);
22614       t2 = gen_reg_rtx (wmode);
22615       ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
22616       ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
22617 
22618       emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
22619       break;
22620 
22621     default:
22622       gcc_unreachable ();
22623     }
22624 }
22625 
22626 void
22627 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
22628 {
22629   rtx res_1, res_2, res_3, res_4;
22630 
22631   res_1 = gen_reg_rtx (V4SImode);
22632   res_2 = gen_reg_rtx (V4SImode);
22633   res_3 = gen_reg_rtx (V2DImode);
22634   res_4 = gen_reg_rtx (V2DImode);
22635   ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
22636   ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
22637 
22638   /* Move the results in element 2 down to element 1; we don't care
22639      what goes in elements 2 and 3.  Then we can merge the parts
22640      back together with an interleave.
22641 
22642      Note that two other sequences were tried:
22643      (1) Use interleaves at the start instead of psrldq, which allows
22644      us to use a single shufps to merge things back at the end.
22645      (2) Use shufps here to combine the two vectors, then pshufd to
22646      put the elements in the correct order.
22647      In both cases the cost of the reformatting stall was too high
22648      and the overall sequence slower.  */
22649 
22650   emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
22651 				const0_rtx, const2_rtx,
22652 				const0_rtx, const0_rtx));
22653   emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
22654 				const0_rtx, const2_rtx,
22655 				const0_rtx, const0_rtx));
22656   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
22657 
22658   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
22659 }
22660 
22661 void
22662 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
22663 {
22664   machine_mode mode = GET_MODE (op0);
22665   rtx t1, t2, t3, t4, t5, t6;
22666 
22667   if (TARGET_AVX512DQ && mode == V8DImode)
22668     emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
22669   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
22670     emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
22671   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
22672     emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
22673   else if (TARGET_XOP && mode == V2DImode)
22674     {
22675       /* op1: A,B,C,D, op2: E,F,G,H */
22676       op1 = gen_lowpart (V4SImode, op1);
22677       op2 = gen_lowpart (V4SImode, op2);
22678 
22679       t1 = gen_reg_rtx (V4SImode);
22680       t2 = gen_reg_rtx (V4SImode);
22681       t3 = gen_reg_rtx (V2DImode);
22682       t4 = gen_reg_rtx (V2DImode);
22683 
22684       /* t1: B,A,D,C */
22685       emit_insn (gen_sse2_pshufd_1 (t1, op1,
22686 				    GEN_INT (1),
22687 				    GEN_INT (0),
22688 				    GEN_INT (3),
22689 				    GEN_INT (2)));
22690 
22691       /* t2: (B*E),(A*F),(D*G),(C*H) */
22692       emit_insn (gen_mulv4si3 (t2, t1, op2));
22693 
22694       /* t3: (B*E)+(A*F), (D*G)+(C*H) */
22695       emit_insn (gen_xop_phadddq (t3, t2));
22696 
22697       /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
22698       emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
22699 
22700       /* Multiply lower parts and add all */
22701       t5 = gen_reg_rtx (V2DImode);
22702       emit_insn (gen_vec_widen_umult_even_v4si (t5,
22703 					gen_lowpart (V4SImode, op1),
22704 					gen_lowpart (V4SImode, op2)));
22705       force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
22706     }
22707   else
22708     {
22709       machine_mode nmode;
22710       rtx (*umul) (rtx, rtx, rtx);
22711 
22712       if (mode == V2DImode)
22713 	{
22714 	  umul = gen_vec_widen_umult_even_v4si;
22715 	  nmode = V4SImode;
22716 	}
22717       else if (mode == V4DImode)
22718 	{
22719 	  umul = gen_vec_widen_umult_even_v8si;
22720 	  nmode = V8SImode;
22721 	}
22722       else if (mode == V8DImode)
22723 	{
22724 	  umul = gen_vec_widen_umult_even_v16si;
22725 	  nmode = V16SImode;
22726 	}
22727       else
22728 	gcc_unreachable ();
22729 
22730 
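      /* The 64-bit product modulo 2^64 is
	 lo(A)*lo(B) + ((hi(A)*lo(B) + lo(A)*hi(B)) << 32);
	 the hi(A)*hi(B) term is shifted out entirely.  */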
22731       /* Multiply low parts.  */
22732       t1 = gen_reg_rtx (mode);
22733       emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
22734 
22735       /* Shift input vectors right 32 bits so we can multiply high parts.  */
22736       t6 = GEN_INT (32);
22737       t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
22738       t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
22739 
22740       /* Multiply high parts by low parts.  */
22741       t4 = gen_reg_rtx (mode);
22742       t5 = gen_reg_rtx (mode);
22743       emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
22744       emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
22745 
22746       /* Combine and shift the highparts back.  */
22747       t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
22748       t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
22749 
22750       /* Combine high and low parts.  */
22751       force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
22752     }
22753 
22754   set_unique_reg_note (get_last_insn (), REG_EQUAL,
22755 		       gen_rtx_MULT (mode, op1, op2));
22756 }
22757 
22758 /* Return true if the control transfer instruction INSN
22759    should be encoded with the notrack prefix.  */
22760 
22761 bool
22762 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
22763 {
22764   if (!insn || !((flag_cf_protection & CF_BRANCH)))
22765     return false;
22766 
22767   if (CALL_P (insn))
22768     {
22769       rtx call = get_call_rtx_from (insn);
22770       gcc_assert (call != NULL_RTX);
22771       rtx addr = XEXP (call, 0);
22772 
22773       /* Do not emit 'notrack' if it's not an indirect call.  */
22774       if (MEM_P (addr)
22775 	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
22776 	return false;
22777       else
22778 	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
22779     }
22780 
22781   if (JUMP_P (insn) && !flag_cet_switch)
22782     {
22783       rtx target = JUMP_LABEL (insn);
22784       if (target == NULL_RTX || ANY_RETURN_P (target))
22785 	return false;
22786 
22787       /* Check whether the jump is a switch table jump.  */
22788       rtx_insn *label = as_a<rtx_insn *> (target);
22789       rtx_insn *table = next_insn (label);
22790       if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
22791 	return false;
22792       else
22793 	return true;
22794     }
22795   return false;
22796 }
22797 
22798 /* Calculate integer abs() using only SSE2 instructions.  */
22799 
22800 void
22801 ix86_expand_sse2_abs (rtx target, rtx input)
22802 {
22803   machine_mode mode = GET_MODE (target);
22804   rtx tmp0, tmp1, x;
22805 
22806   switch (mode)
22807     {
22808     case E_V2DImode:
22809     case E_V4DImode:
22810       /* For 64-bit signed integer X, with SSE4.2 use
22811 	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
22812 	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
22813 	 of 32 and emulate the (unimplemented) arithmetic right shift with a
22814 	 logical right shift followed by negation, then proceed as above.  */
22815       if (TARGET_SSE4_2)
22816 	{
22817 	  tmp0 = gen_reg_rtx (mode);
22818 	  tmp1 = gen_reg_rtx (mode);
22819 	  emit_move_insn (tmp1, CONST0_RTX (mode));
22820 	  if (mode == E_V2DImode)
22821 	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
22822 	  else
22823 	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
22824 	}
22825       else
22826 	{
22827 	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
22828 				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
22829 					       - 1), NULL, 0, OPTAB_DIRECT);
22830 	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
22831 	}
22832 
22833       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
22834 				  NULL, 0, OPTAB_DIRECT);
22835       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
22836 			       target, 0, OPTAB_DIRECT);
22837       break;
22838 
22839     case E_V4SImode:
22840       /* For 32-bit signed integer X, the best way to calculate the absolute
22841 	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
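      /* E.g. for X = -5 the arithmetic shift gives S = -1, and
	 (X ^ S) - S is ~X + 1 = 5.  */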
22842       tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
22843 				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
22844 				  NULL, 0, OPTAB_DIRECT);
22845       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
22846 				  NULL, 0, OPTAB_DIRECT);
22847       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
22848 			       target, 0, OPTAB_DIRECT);
22849       break;
22850 
22851     case E_V8HImode:
22852       /* For 16-bit signed integer X, the best way to calculate the absolute
22853 	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
22854       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
22855 
22856       x = expand_simple_binop (mode, SMAX, tmp0, input,
22857 			       target, 0, OPTAB_DIRECT);
22858       break;
22859 
22860     case E_V16QImode:
22861       /* For 8-bit signed integer X, the best way to calculate the absolute
22862 	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
22863 	 as SSE2 provides the PMINUB insn.  */
22864       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
22865 
22866       x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
22867 			       target, 0, OPTAB_DIRECT);
22868       break;
22869 
22870     default:
22871       gcc_unreachable ();
22872     }
22873 
22874   if (x != target)
22875     emit_move_insn (target, x);
22876 }
22877 
22878 /* Expand an extract from a vector register through pextr insn.
22879    Return true if successful.  */
22880 
22881 bool
22882 ix86_expand_pextr (rtx *operands)
22883 {
22884   rtx dst = operands[0];
22885   rtx src = operands[1];
22886 
22887   unsigned int size = INTVAL (operands[2]);
22888   unsigned int pos = INTVAL (operands[3]);
22889 
22890   if (SUBREG_P (dst))
22891     {
22892       /* Reject non-lowpart subregs.  */
22893       if (SUBREG_BYTE (dst) > 0)
22894 	return false;
22895       dst = SUBREG_REG (dst);
22896     }
22897 
22898   if (SUBREG_P (src))
22899     {
22900       pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
22901       src = SUBREG_REG (src);
22902     }
22903 
22904   switch (GET_MODE (src))
22905     {
22906     case E_V16QImode:
22907     case E_V8HImode:
22908     case E_V4SImode:
22909     case E_V2DImode:
22910     case E_V1TImode:
22911       {
22912 	machine_mode srcmode, dstmode;
22913 	rtx d, pat;
22914 
22915 	if (!int_mode_for_size (size, 0).exists (&dstmode))
22916 	  return false;
22917 
22918 	switch (dstmode)
22919 	  {
22920 	  case E_QImode:
22921 	    if (!TARGET_SSE4_1)
22922 	      return false;
22923 	    srcmode = V16QImode;
22924 	    break;
22925 
22926 	  case E_HImode:
22927 	    if (!TARGET_SSE2)
22928 	      return false;
22929 	    srcmode = V8HImode;
22930 	    break;
22931 
22932 	  case E_SImode:
22933 	    if (!TARGET_SSE4_1)
22934 	      return false;
22935 	    srcmode = V4SImode;
22936 	    break;
22937 
22938 	  case E_DImode:
22939 	    gcc_assert (TARGET_64BIT);
22940 	    if (!TARGET_SSE4_1)
22941 	      return false;
22942 	    srcmode = V2DImode;
22943 	    break;
22944 
22945 	  default:
22946 	    return false;
22947 	  }
22948 
22949 	/* Reject extractions from misaligned positions.  */
22950 	if (pos & (size-1))
22951 	  return false;
22952 
22953 	if (GET_MODE (dst) == dstmode)
22954 	  d = dst;
22955 	else
22956 	  d = gen_reg_rtx (dstmode);
22957 
22958 	/* Construct insn pattern.  */
22959 	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
22960 	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
22961 
22962 	/* Let the rtl optimizers know about the zero extension performed.  */
22963 	if (dstmode == QImode || dstmode == HImode)
22964 	  {
22965 	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
22966 	    d = gen_lowpart (SImode, d);
22967 	  }
22968 
22969 	emit_insn (gen_rtx_SET (d, pat));
22970 
22971 	if (d != dst)
22972 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
22973 	return true;
22974       }
22975 
22976     default:
22977       return false;
22978     }
22979 }
22980 
22981 /* Expand an insert into a vector register through pinsr insn.
22982    Return true if successful.  */
22983 
22984 bool
22985 ix86_expand_pinsr (rtx *operands)
22986 {
22987   rtx dst = operands[0];
22988   rtx src = operands[3];
22989 
22990   unsigned int size = INTVAL (operands[1]);
22991   unsigned int pos = INTVAL (operands[2]);
22992 
22993   if (SUBREG_P (dst))
22994     {
22995       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
22996       dst = SUBREG_REG (dst);
22997     }
22998 
22999   switch (GET_MODE (dst))
23000     {
23001     case E_V16QImode:
23002     case E_V8HImode:
23003     case E_V4SImode:
23004     case E_V2DImode:
23005     case E_V1TImode:
23006       {
23007 	machine_mode srcmode, dstmode;
23008 	rtx (*pinsr)(rtx, rtx, rtx, rtx);
23009 	rtx d;
23010 
23011 	if (!int_mode_for_size (size, 0).exists (&srcmode))
23012 	  return false;
23013 
23014 	switch (srcmode)
23015 	  {
23016 	  case E_QImode:
23017 	    if (!TARGET_SSE4_1)
23018 	      return false;
23019 	    dstmode = V16QImode;
23020 	    pinsr = gen_sse4_1_pinsrb;
23021 	    break;
23022 
23023 	  case E_HImode:
23024 	    if (!TARGET_SSE2)
23025 	      return false;
23026 	    dstmode = V8HImode;
23027 	    pinsr = gen_sse2_pinsrw;
23028 	    break;
23029 
23030 	  case E_SImode:
23031 	    if (!TARGET_SSE4_1)
23032 	      return false;
23033 	    dstmode = V4SImode;
23034 	    pinsr = gen_sse4_1_pinsrd;
23035 	    break;
23036 
23037 	  case E_DImode:
23038 	    gcc_assert (TARGET_64BIT);
23039 	    if (!TARGET_SSE4_1)
23040 	      return false;
23041 	    dstmode = V2DImode;
23042 	    pinsr = gen_sse4_1_pinsrq;
23043 	    break;
23044 
23045 	  default:
23046 	    return false;
23047 	  }
23048 
23049 	/* Reject insertions to misaligned positions.  */
23050 	if (pos & (size-1))
23051 	  return false;
23052 
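	/* If the source is a non-lowpart SUBREG, first extract the piece
	   we need into a pseudo of SRCMODE via ix86_expand_pextr.  */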
23053 	if (SUBREG_P (src))
23054 	  {
23055 	    unsigned int srcpos = SUBREG_BYTE (src);
23056 
23057 	    if (srcpos > 0)
23058 	      {
23059 		rtx extr_ops[4];
23060 
23061 		extr_ops[0] = gen_reg_rtx (srcmode);
23062 		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
23063 		extr_ops[2] = GEN_INT (size);
23064 		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
23065 
23066 		if (!ix86_expand_pextr (extr_ops))
23067 		  return false;
23068 
23069 		src = extr_ops[0];
23070 	      }
23071 	    else
23072 	      src = gen_lowpart (srcmode, SUBREG_REG (src));
23073 	  }
23074 
23075 	if (GET_MODE (dst) == dstmode)
23076 	  d = dst;
23077 	else
23078 	  d = gen_reg_rtx (dstmode);
23079 
23080 	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
23081 			  gen_lowpart (srcmode, src),
23082 			  GEN_INT (1 << (pos / size))));
23083 	if (d != dst)
23084 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23085 	return true;
23086       }
23087 
23088     default:
23089       return false;
23090     }
23091 }
23092 
23093 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
23094    of the upper against the lower halves down to SSE register size.  */
23095 
23096 machine_mode
23097 ix86_split_reduction (machine_mode mode)
23098 {
23099   /* Reduce lowpart against highpart until we reach SSE reg width to
23100      avoid cross-lane operations.  */
23101   switch (mode)
23102     {
23103     case E_V8DImode:
23104     case E_V4DImode:
23105       return V2DImode;
23106     case E_V16SImode:
23107     case E_V8SImode:
23108       return V4SImode;
23109     case E_V32HImode:
23110     case E_V16HImode:
23111       return V8HImode;
23112     case E_V64QImode:
23113     case E_V32QImode:
23114       return V16QImode;
23115     case E_V16SFmode:
23116     case E_V8SFmode:
23117       return V4SFmode;
23118     case E_V8DFmode:
23119     case E_V4DFmode:
23120       return V2DFmode;
23121     default:
23122       return mode;
23123     }
23124 }
23125 
23126 /* Generate a call to __divmoddi4.  */
23127 
23128 void
23129 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
23130 			    rtx op0, rtx op1,
23131 			    rtx *quot_p, rtx *rem_p)
23132 {
23133   rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
23134 
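  /* __divmoddi4-style libfuncs return the quotient and store the
     remainder through the pointer passed as the last argument, hence
     the stack temporary whose address is passed below.  */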
23135   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
23136 				      mode, op0, mode, op1, mode,
23137 				      XEXP (rem, 0), Pmode);
23138   *quot_p = quot;
23139   *rem_p = rem;
23140 }
23141 
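/* Expand an atomic fetch-and-CODE (AFTER false) or CODE-and-fetch (AFTER
   true) operation on MEM with value VAL into TARGET, using a
   compare-and-swap loop.  DOUBLEWORD means the value is two words wide
   and the re-load comparison is done one half at a time.  */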
23142 void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
23143 				       enum rtx_code code, bool after,
23144 				       bool doubleword)
23145 {
23146   rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
23147   rtx_code_label *loop_label, *pause_label, *done_label;
23148   machine_mode mode = GET_MODE (target);
23149 
23150   old_reg = gen_reg_rtx (mode);
23151   new_reg = old_reg;
23152   loop_label = gen_label_rtx ();
23153   pause_label = gen_label_rtx ();
23154   done_label = gen_label_rtx ();
23155   old_mem = copy_to_reg (mem);
23156   emit_label (loop_label);
23157   emit_move_insn (old_reg, old_mem);
23158 
23159   /* The result of atomic_fetch_op is the value of MEM before the operation.  */
23160   if (!after)
23161     emit_move_insn (target, old_reg);
23162 
23163   if (code == NOT)
23164     {
23165       new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
23166 				     true, OPTAB_LIB_WIDEN);
23167       new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
23168     }
23169   else
23170     new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
23171 				   true, OPTAB_LIB_WIDEN);
23172 
23173   /* The result of atomic_op_fetch is the value of MEM after the operation.  */
23174   if (after)
23175     emit_move_insn (target, new_reg);
23176 
23177   /* Load memory again inside loop.  */
23178   new_mem = copy_to_reg (mem);
23179   /* Compare mem value with expected value.  */
23180 
23181   if (doubleword)
23182     {
23183       machine_mode half_mode = (mode == DImode)? SImode : DImode;
23184       rtx low_new_mem = gen_lowpart (half_mode, new_mem);
23185       rtx low_old_mem = gen_lowpart (half_mode, old_mem);
23186       rtx high_new_mem = gen_highpart (half_mode, new_mem);
23187       rtx high_old_mem = gen_highpart (half_mode, old_mem);
23188       emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX,
23189 			       half_mode, 1, pause_label,
23190 			       profile_probability::guessed_never ());
23191       emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX,
23192 			       half_mode, 1, pause_label,
23193 			       profile_probability::guessed_never ());
23194     }
23195   else
23196     emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX,
23197 			     GET_MODE (old_mem), 1, pause_label,
23198 			     profile_probability::guessed_never ());
23199 
23200   success = NULL_RTX;
23201   oldval = old_mem;
23202   expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
23203 				  new_reg, false, MEMMODEL_SYNC_SEQ_CST,
23204 				  MEMMODEL_RELAXED);
23205   if (oldval != old_mem)
23206     emit_move_insn (old_mem, oldval);
23207 
23208   emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
23209 			   GET_MODE (success), 1, loop_label,
23210 			   profile_probability::guessed_never ());
23211 
23212   emit_jump_insn (gen_jump (done_label));
23213   emit_barrier ();
23214 
23215   /* If mem is not expected, pause and loop back.  */
23216   emit_label (pause_label);
23217   emit_insn (gen_pause ());
23218   emit_jump_insn (gen_jump (loop_label));
23219   emit_barrier ();
23220   emit_label (done_label);
23221 }
23222 
23223 #include "gt-i386-expand.h"
23224