1 /* Copyright (C) 1988-2021 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9 
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3.  If not see
17 <http://www.gnu.org/licenses/>.  */
18 
19 #define IN_TARGET_CODE 1
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95 
96 /* Split one or more double-mode RTL references into pairs of half-mode
97    references.  The RTL can be REG, offsettable MEM, integer constant, or
98    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
99    split and "num" is its length.  lo_half and hi_half are output arrays
100    that parallel "operands".  */
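/* For example, an E_DImode operand is split into two SImode halves: an
   offsettable MEM is split with adjust_address at byte offsets 0 and 4
   (the low half lives at the lower address on little-endian x86), while
   REGs and constants go through simplify_gen_subreg.  */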
101 
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 		   int num, rtx lo_half[], rtx hi_half[])
105 {
106   machine_mode half_mode;
107   unsigned int byte;
108   rtx mem_op = NULL_RTX;
109   int mem_num = 0;
110 
111   switch (mode)
112     {
113     case E_TImode:
114       half_mode = DImode;
115       break;
116     case E_DImode:
117       half_mode = SImode;
118       break;
119     case E_P2HImode:
120       half_mode = HImode;
121       break;
122     case E_P2QImode:
123       half_mode = QImode;
124       break;
125     default:
126       gcc_unreachable ();
127     }
128 
129   byte = GET_MODE_SIZE (half_mode);
130 
131   while (num--)
132     {
133       rtx op = operands[num];
134 
135       /* simplify_subreg refuses to split volatile memory addresses,
136          but we still have to handle them.  */
137       if (MEM_P (op))
138 	{
139 	  if (mem_op && rtx_equal_p (op, mem_op))
140 	    {
141 	      lo_half[num] = lo_half[mem_num];
142 	      hi_half[num] = hi_half[mem_num];
143 	    }
144 	  else
145 	    {
146 	      mem_op = op;
147 	      mem_num = num;
148 	      lo_half[num] = adjust_address (op, half_mode, 0);
149 	      hi_half[num] = adjust_address (op, half_mode, byte);
150 	    }
151 	}
152       else
153 	{
154 	  lo_half[num] = simplify_gen_subreg (half_mode, op,
155 					      GET_MODE (op) == VOIDmode
156 					      ? mode : GET_MODE (op), 0);
157 
158 	  rtx tmp = simplify_gen_subreg (half_mode, op,
159 					 GET_MODE (op) == VOIDmode
160 					 ? mode : GET_MODE (op), byte);
161 	  /* simplify_gen_subreg will return NULL RTX for the
162 	     high half of the paradoxical subreg. */
163 	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 	}
165     }
166 }
167 
168 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
169    for the target.  */
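/* "xor reg, reg" is shorter and breaks dependencies on the previous
   contents of the register, but it clobbers the flags, which is why the
   PARALLEL below adds an explicit CLOBBER of FLAGS_REG; "mov $0, reg"
   leaves the flags untouched.  */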
170 
171 void
172 ix86_expand_clear (rtx dest)
173 {
174   rtx tmp;
175 
176   /* We play register width games, which are only valid after reload.  */
177   gcc_assert (reload_completed);
178 
179   /* Avoid HImode and its attendant prefix byte.  */
180   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
181     dest = gen_rtx_REG (SImode, REGNO (dest));
182   tmp = gen_rtx_SET (dest, const0_rtx);
183 
184   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
185     {
186       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
187       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
188     }
189 
190   emit_insn (tmp);
191 }
192 
193 void
194 ix86_expand_move (machine_mode mode, rtx operands[])
195 {
196   rtx op0, op1;
197   rtx tmp, addend = NULL_RTX;
198   enum tls_model model;
199 
200   op0 = operands[0];
201   op1 = operands[1];
202 
203   /* Avoid complex sets of likely spilled hard registers before reload.  */
204   if (!ix86_hardreg_mov_ok (op0, op1))
205     {
206       tmp = gen_reg_rtx (mode);
207       operands[0] = tmp;
208       ix86_expand_move (mode, operands);
209       operands[0] = op0;
210       operands[1] = tmp;
211       op1 = tmp;
212     }
213 
214   switch (GET_CODE (op1))
215     {
216     case CONST:
217       tmp = XEXP (op1, 0);
218 
219       if (GET_CODE (tmp) != PLUS
220 	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
221 	break;
222 
223       op1 = XEXP (tmp, 0);
224       addend = XEXP (tmp, 1);
225       /* FALLTHRU */
226 
227     case SYMBOL_REF:
228       model = SYMBOL_REF_TLS_MODEL (op1);
229 
230       if (model)
231 	op1 = legitimize_tls_address (op1, model, true);
232       else if (ix86_force_load_from_GOT_p (op1))
233 	{
234 	  /* Load the external function address via GOT slot to avoid PLT.  */
235 	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
236 				(TARGET_64BIT
237 				 ? UNSPEC_GOTPCREL
238 				 : UNSPEC_GOT));
239 	  op1 = gen_rtx_CONST (Pmode, op1);
240 	  op1 = gen_const_mem (Pmode, op1);
241 	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
242 	}
243       else
244 	{
245 	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
246 	  if (tmp)
247 	    {
248 	      op1 = tmp;
249 	      if (!addend)
250 		break;
251 	    }
252 	  else
253 	    {
254 	      op1 = operands[1];
255 	      break;
256 	    }
257 	}
258 
259       if (addend)
260 	{
261 	  op1 = force_operand (op1, NULL_RTX);
262 	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
263 				     op0, 1, OPTAB_DIRECT);
264 	}
265       else
266 	op1 = force_operand (op1, op0);
267 
268       if (op1 == op0)
269 	return;
270 
271       op1 = convert_to_mode (mode, op1, 1);
272 
273     default:
274       break;
275     }
276 
277   if ((flag_pic || MACHOPIC_INDIRECT)
278       && symbolic_operand (op1, mode))
279     {
280       if (TARGET_MACHO && !TARGET_64BIT)
281 	{
282 #if TARGET_MACHO
283 	  /* dynamic-no-pic */
284 	  if (MACHOPIC_INDIRECT)
285 	    {
286 	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
287 			 ? op0 : gen_reg_rtx (Pmode);
288 	      op1 = machopic_indirect_data_reference (op1, temp);
289 	      if (MACHOPIC_PURE)
290 		op1 = machopic_legitimize_pic_address (op1, mode,
291 						       temp == op1 ? 0 : temp);
292 	    }
293 	  if (op0 != op1 && GET_CODE (op0) != MEM)
294 	    {
295 	      rtx insn = gen_rtx_SET (op0, op1);
296 	      emit_insn (insn);
297 	      return;
298 	    }
299 	  if (GET_CODE (op0) == MEM)
300 	    op1 = force_reg (Pmode, op1);
301 	  else
302 	    {
303 	      rtx temp = op0;
304 	      if (GET_CODE (temp) != REG)
305 		temp = gen_reg_rtx (Pmode);
306 	      temp = legitimize_pic_address (op1, temp);
307 	      if (temp == op0)
308 		return;
309 	      op1 = temp;
310 	    }
311       /* dynamic-no-pic */
312 #endif
313 	}
314       else
315 	{
316 	  if (MEM_P (op0))
317 	    op1 = force_reg (mode, op1);
318 	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
319 	    {
320 	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
321 	      op1 = legitimize_pic_address (op1, reg);
322 	      if (op0 == op1)
323 		return;
324 	      op1 = convert_to_mode (mode, op1, 1);
325 	    }
326 	}
327     }
328   else
329     {
330       if (MEM_P (op0)
331 	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
332 	      || !push_operand (op0, mode))
333 	  && MEM_P (op1))
334 	op1 = force_reg (mode, op1);
335 
336       if (push_operand (op0, mode)
337 	  && ! general_no_elim_operand (op1, mode))
338 	op1 = copy_to_mode_reg (mode, op1);
339 
340       /* Force large constants in 64bit compilation into a register
341 	 to get them CSEed.  */
342       if (can_create_pseudo_p ()
343 	  && (mode == DImode) && TARGET_64BIT
344 	  && immediate_operand (op1, mode)
345 	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
346 	  && !register_operand (op0, mode)
347 	  && optimize)
348 	op1 = copy_to_mode_reg (mode, op1);
349 
350       if (can_create_pseudo_p ()
351 	  && CONST_DOUBLE_P (op1))
352 	{
353 	  /* If we are loading a floating point constant to a register,
354 	     force the value to memory now, since we'll get better code
355 	     out the back end.  */
356 
357 	  op1 = validize_mem (force_const_mem (mode, op1));
358 	  if (!register_operand (op0, mode))
359 	    {
360 	      rtx temp = gen_reg_rtx (mode);
361 	      emit_insn (gen_rtx_SET (temp, op1));
362 	      emit_move_insn (op0, temp);
363 	      return;
364 	    }
365 	}
366     }
367 
368   emit_insn (gen_rtx_SET (op0, op1));
369 }
370 
371 void
372 ix86_expand_vector_move (machine_mode mode, rtx operands[])
373 {
374   rtx op0 = operands[0], op1 = operands[1];
375   /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
376      psABI since the biggest alignment is 4 bytes for IA MCU psABI.  */
377   unsigned int align = (TARGET_IAMCU
378 			? GET_MODE_BITSIZE (mode)
379 			: GET_MODE_ALIGNMENT (mode));
380 
381   if (push_operand (op0, VOIDmode))
382     op0 = emit_move_resolve_push (mode, op0);
383 
384   /* Force constants other than zero into memory.  We do not know how
385      the instructions used to build constants modify the upper 64 bits
386      of the register; once we have that information we may be able
387      to handle some of them more efficiently.  */
388   if (can_create_pseudo_p ()
389       && (CONSTANT_P (op1)
390 	  || (SUBREG_P (op1)
391 	      && CONSTANT_P (SUBREG_REG (op1))))
392       && ((register_operand (op0, mode)
393 	   && !standard_sse_constant_p (op1, mode))
394 	  /* ix86_expand_vector_move_misalign() does not like constants.  */
395 	  || (SSE_REG_MODE_P (mode)
396 	      && MEM_P (op0)
397 	      && MEM_ALIGN (op0) < align)))
398     {
399       if (SUBREG_P (op1))
400 	{
401 	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
402 	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
403 	  if (r)
404 	    r = validize_mem (r);
405 	  else
406 	    r = force_reg (imode, SUBREG_REG (op1));
407 	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
408 	}
409       else
410 	op1 = validize_mem (force_const_mem (mode, op1));
411     }
412 
413   /* We need to check memory alignment for SSE mode since attribute
414      can make operands unaligned.  */
415   if (can_create_pseudo_p ()
416       && SSE_REG_MODE_P (mode)
417       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
418 	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
419     {
420       rtx tmp[2];
421 
422       /* ix86_expand_vector_move_misalign() does not like both
423 	 arguments in memory.  */
424       if (!register_operand (op0, mode)
425 	  && !register_operand (op1, mode))
426 	op1 = force_reg (mode, op1);
427 
428       tmp[0] = op0; tmp[1] = op1;
429       ix86_expand_vector_move_misalign (mode, tmp);
430       return;
431     }
432 
433   /* Make operand1 a register if it isn't already.  */
434   if (can_create_pseudo_p ()
435       && !register_operand (op0, mode)
436       && !register_operand (op1, mode))
437     {
438       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
439       return;
440     }
441 
442   emit_insn (gen_rtx_SET (op0, op1));
443 }
444 
445 /* Split 32-byte AVX unaligned load and store if needed.  */
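/* When the tuning flags ask for it, a 32-byte unaligned access is done
   as two 16-byte halves: loads read both halves and recombine them with
   a VEC_CONCAT, stores write each half with vextractf128.  */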
446 
447 static void
448 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
449 {
450   rtx m;
451   rtx (*extract) (rtx, rtx, rtx);
452   machine_mode mode;
453 
454   if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
455       || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
456     {
457       emit_insn (gen_rtx_SET (op0, op1));
458       return;
459     }
460 
461   rtx orig_op0 = NULL_RTX;
462   mode = GET_MODE (op0);
463   switch (GET_MODE_CLASS (mode))
464     {
465     case MODE_VECTOR_INT:
466     case MODE_INT:
467       if (mode != V32QImode)
468 	{
469 	  if (!MEM_P (op0))
470 	    {
471 	      orig_op0 = op0;
472 	      op0 = gen_reg_rtx (V32QImode);
473 	    }
474 	  else
475 	    op0 = gen_lowpart (V32QImode, op0);
476 	  op1 = gen_lowpart (V32QImode, op1);
477 	  mode = V32QImode;
478 	}
479       break;
480     case MODE_VECTOR_FLOAT:
481       break;
482     default:
483       gcc_unreachable ();
484     }
485 
486   switch (mode)
487     {
488     default:
489       gcc_unreachable ();
490     case E_V32QImode:
491       extract = gen_avx_vextractf128v32qi;
492       mode = V16QImode;
493       break;
494     case E_V8SFmode:
495       extract = gen_avx_vextractf128v8sf;
496       mode = V4SFmode;
497       break;
498     case E_V4DFmode:
499       extract = gen_avx_vextractf128v4df;
500       mode = V2DFmode;
501       break;
502     }
503 
504   if (MEM_P (op1))
505     {
506       rtx r = gen_reg_rtx (mode);
507       m = adjust_address (op1, mode, 0);
508       emit_move_insn (r, m);
509       m = adjust_address (op1, mode, 16);
510       r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
511       emit_move_insn (op0, r);
512     }
513   else if (MEM_P (op0))
514     {
515       m = adjust_address (op0, mode, 0);
516       emit_insn (extract (m, op1, const0_rtx));
517       m = adjust_address (op0, mode, 16);
518       emit_insn (extract (m, copy_rtx (op1), const1_rtx));
519     }
520   else
521     gcc_unreachable ();
522 
523   if (orig_op0)
524     emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
525 }
526 
527 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
528    straight to ix86_expand_vector_move.  */
529 /* Code generation for scalar reg-reg moves of single and double precision data:
530      if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
531        movaps reg, reg
532      else
533        movss reg, reg
534      if (x86_sse_partial_reg_dependency == true)
535        movapd reg, reg
536      else
537        movsd reg, reg
538 
539    Code generation for scalar loads of double precision data:
540      if (x86_sse_split_regs == true)
541        movlpd mem, reg      (gas syntax)
542      else
543        movsd mem, reg
544 
545    Code generation for unaligned packed loads of single precision data
546    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
547      if (x86_sse_unaligned_move_optimal)
548        movups mem, reg
549 
550      if (x86_sse_partial_reg_dependency == true)
551        {
552          xorps  reg, reg
553          movlps mem, reg
554          movhps mem+8, reg
555        }
556      else
557        {
558          movlps mem, reg
559          movhps mem+8, reg
560        }
561 
562    Code generation for unaligned packed loads of double precision data
563    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
564      if (x86_sse_unaligned_move_optimal)
565        movupd mem, reg
566 
567      if (x86_sse_split_regs == true)
568        {
569          movlpd mem, reg
570          movhpd mem+8, reg
571        }
572      else
573        {
574          movsd  mem, reg
575          movhpd mem+8, reg
576        }
577  */
578 
579 void
580 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
581 {
582   rtx op0, op1, m;
583 
584   op0 = operands[0];
585   op1 = operands[1];
586 
587   /* Use unaligned load/store for AVX512 or when optimizing for size.  */
588   if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
589     {
590       emit_insn (gen_rtx_SET (op0, op1));
591       return;
592     }
593 
594   if (TARGET_AVX)
595     {
596       if (GET_MODE_SIZE (mode) == 32)
597 	ix86_avx256_split_vector_move_misalign (op0, op1);
598       else
599 	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
600 	emit_insn (gen_rtx_SET (op0, op1));
601       return;
602     }
603 
604   if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
605       || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
606     {
607       emit_insn (gen_rtx_SET (op0, op1));
608       return;
609     }
610 
611   /* ??? If we have typed data, then it would appear that using
612      movdqu is the only way to get unaligned data loaded with
613      integer type.  */
614   if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
615     {
616       emit_insn (gen_rtx_SET (op0, op1));
617       return;
618     }
619 
620   if (MEM_P (op1))
621     {
622       if (TARGET_SSE2 && mode == V2DFmode)
623         {
624           rtx zero;
625 
626 	  /* When SSE registers are split into halves, we can avoid
627 	     writing to the top half twice.  */
628 	  if (TARGET_SSE_SPLIT_REGS)
629 	    {
630 	      emit_clobber (op0);
631 	      zero = op0;
632 	    }
633 	  else
634 	    {
635 	      /* ??? Not sure about the best option for the Intel chips.
636 		 The following would seem to satisfy; the register is
637 		 entirely cleared, breaking the dependency chain.  We
638 		 then store to the upper half, with a dependency depth
639 		 of one.  A rumor has it that Intel recommends two movsd
640 		 followed by an unpacklpd, but this is unconfirmed.  And
641 		 given that the dependency depth of the unpacklpd would
642 		 still be one, I'm not sure why this would be better.  */
643 	      zero = CONST0_RTX (V2DFmode);
644 	    }
645 
646 	  m = adjust_address (op1, DFmode, 0);
647 	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
648 	  m = adjust_address (op1, DFmode, 8);
649 	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
650 	}
651       else
652         {
653 	  rtx t;
654 
655 	  if (mode != V4SFmode)
656 	    t = gen_reg_rtx (V4SFmode);
657 	  else
658 	    t = op0;
659 
660 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
661 	    emit_move_insn (t, CONST0_RTX (V4SFmode));
662 	  else
663 	    emit_clobber (t);
664 
665 	  m = adjust_address (op1, V2SFmode, 0);
666 	  emit_insn (gen_sse_loadlps (t, t, m));
667 	  m = adjust_address (op1, V2SFmode, 8);
668 	  emit_insn (gen_sse_loadhps (t, t, m));
669 	  if (mode != V4SFmode)
670 	    emit_move_insn (op0, gen_lowpart (mode, t));
671 	}
672     }
673   else if (MEM_P (op0))
674     {
675       if (TARGET_SSE2 && mode == V2DFmode)
676 	{
677 	  m = adjust_address (op0, DFmode, 0);
678 	  emit_insn (gen_sse2_storelpd (m, op1));
679 	  m = adjust_address (op0, DFmode, 8);
680 	  emit_insn (gen_sse2_storehpd (m, op1));
681 	}
682       else
683 	{
684 	  if (mode != V4SFmode)
685 	    op1 = gen_lowpart (V4SFmode, op1);
686 
687 	  m = adjust_address (op0, V2SFmode, 0);
688 	  emit_insn (gen_sse_storelps (m, op1));
689 	  m = adjust_address (op0, V2SFmode, 8);
690 	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
691 	}
692     }
693   else
694     gcc_unreachable ();
695 }
696 
697 /* Move bits 64:95 to bits 32:63.  */
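/* Viewed as V4SImode, the vec_select mask (0 2 0 0) keeps element 0 and
   copies element 2 into element 1, so the low 64 bits of the register
   end up holding elements 0 and 2 of the source; the upper two elements
   are don't-cares here.  */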
698 
699 void
700 ix86_move_vector_high_sse_to_mmx (rtx op)
701 {
702   rtx mask = gen_rtx_PARALLEL (VOIDmode,
703 			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
704 					  GEN_INT (0), GEN_INT (0)));
705   rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
706   op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
707   rtx insn = gen_rtx_SET (dest, op);
708   emit_insn (insn);
709 }
710 
711 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
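/* The 64-bit MMX operands live in the low halves of SSE registers, so a
   pack done in the full SSE mode leaves the useful result in bits 0:31
   and 64:95; ix86_move_vector_high_sse_to_mmx is then used to gather it
   into the low 64 bits.  */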
712 
713 void
714 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
715 {
716   rtx op0 = operands[0];
717   rtx op1 = operands[1];
718   rtx op2 = operands[2];
719 
720   machine_mode dmode = GET_MODE (op0);
721   machine_mode smode = GET_MODE (op1);
722   machine_mode inner_dmode = GET_MODE_INNER (dmode);
723   machine_mode inner_smode = GET_MODE_INNER (smode);
724 
725   /* Get the corresponding SSE mode for destination.  */
726   int nunits = 16 / GET_MODE_SIZE (inner_dmode);
727   machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
728 					    nunits).require ();
729   machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
730 						 nunits / 2).require ();
731 
732   /* Get the corresponding SSE mode for source.  */
733   nunits = 16 / GET_MODE_SIZE (inner_smode);
734   machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
735 					    nunits).require ();
736 
737   /* Generate SSE pack with signed/unsigned saturation.  */
738   rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
739   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
740   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
741 
742   op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
743   op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
744   rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
745 						    op1, op2));
746   emit_insn (insn);
747 
748   ix86_move_vector_high_sse_to_mmx (op0);
749 }
750 
751 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
752 
753 void
754 ix86_split_mmx_punpck (rtx operands[], bool high_p)
755 {
756   rtx op0 = operands[0];
757   rtx op1 = operands[1];
758   rtx op2 = operands[2];
759   machine_mode mode = GET_MODE (op0);
760   rtx mask;
761   /* The corresponding SSE mode.  */
762   machine_mode sse_mode, double_sse_mode;
763 
764   switch (mode)
765     {
766     case E_V8QImode:
767       sse_mode = V16QImode;
768       double_sse_mode = V32QImode;
769       mask = gen_rtx_PARALLEL (VOIDmode,
770 			       gen_rtvec (16,
771 					  GEN_INT (0), GEN_INT (16),
772 					  GEN_INT (1), GEN_INT (17),
773 					  GEN_INT (2), GEN_INT (18),
774 					  GEN_INT (3), GEN_INT (19),
775 					  GEN_INT (4), GEN_INT (20),
776 					  GEN_INT (5), GEN_INT (21),
777 					  GEN_INT (6), GEN_INT (22),
778 					  GEN_INT (7), GEN_INT (23)));
779       break;
780 
781     case E_V4HImode:
782       sse_mode = V8HImode;
783       double_sse_mode = V16HImode;
784       mask = gen_rtx_PARALLEL (VOIDmode,
785 			       gen_rtvec (8,
786 					  GEN_INT (0), GEN_INT (8),
787 					  GEN_INT (1), GEN_INT (9),
788 					  GEN_INT (2), GEN_INT (10),
789 					  GEN_INT (3), GEN_INT (11)));
790       break;
791 
792     case E_V2SImode:
793       sse_mode = V4SImode;
794       double_sse_mode = V8SImode;
795       mask = gen_rtx_PARALLEL (VOIDmode,
796 			       gen_rtvec (4,
797 					  GEN_INT (0), GEN_INT (4),
798 					  GEN_INT (1), GEN_INT (5)));
799       break;
800 
801     default:
802       gcc_unreachable ();
803     }
804 
805   /* Generate SSE punpcklXX.  */
806   rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
807   op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
808   op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
809 
810   op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
811   op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
812   rtx insn = gen_rtx_SET (dest, op2);
813   emit_insn (insn);
814 
815   if (high_p)
816     {
817       /* Move bits 64:127 to bits 0:63.  */
818       mask = gen_rtx_PARALLEL (VOIDmode,
819 			       gen_rtvec (4, GEN_INT (2), GEN_INT (3),
820 					  GEN_INT (0), GEN_INT (0)));
821       dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
822       op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
823       insn = gen_rtx_SET (dest, op1);
824       emit_insn (insn);
825     }
826 }
827 
828 /* Helper function of ix86_fixup_binary_operands to canonicalize
829    operand order.  Returns true if the operands should be swapped.  */
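/* For example, in "dst = src1 + src2" with dst equal to src2, swapping
   the sources lets the two-operand x86 add reuse dst as both the
   destination and the first source.  */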
830 
831 static bool
832 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
833 			     rtx operands[])
834 {
835   rtx dst = operands[0];
836   rtx src1 = operands[1];
837   rtx src2 = operands[2];
838 
839   /* If the operation is not commutative, we can't do anything.  */
840   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
841       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
842     return false;
843 
844   /* Highest priority is that src1 should match dst.  */
845   if (rtx_equal_p (dst, src1))
846     return false;
847   if (rtx_equal_p (dst, src2))
848     return true;
849 
850   /* Next highest priority is that immediate constants come second.  */
851   if (immediate_operand (src2, mode))
852     return false;
853   if (immediate_operand (src1, mode))
854     return true;
855 
856   /* Lowest priority is that memory references should come second.  */
857   if (MEM_P (src2))
858     return false;
859   if (MEM_P (src1))
860     return true;
861 
862   return false;
863 }
864 
865 
866 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
867    destination to use for the operation.  If different from the true
868    destination in operands[0], a copy operation will be required.  */
869 
870 rtx
871 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
872 			    rtx operands[])
873 {
874   rtx dst = operands[0];
875   rtx src1 = operands[1];
876   rtx src2 = operands[2];
877 
878   /* Canonicalize operand order.  */
879   if (ix86_swap_binary_operands_p (code, mode, operands))
880     {
881       /* It is invalid to swap operands of different modes.  */
882       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
883 
884       std::swap (src1, src2);
885     }
886 
887   /* The source operands cannot both be in memory.  */
888   if (MEM_P (src1) && MEM_P (src2))
889     {
890       /* Optimization: Only read from memory once.  */
891       if (rtx_equal_p (src1, src2))
892 	{
893 	  src2 = force_reg (mode, src2);
894 	  src1 = src2;
895 	}
896       else if (rtx_equal_p (dst, src1))
897 	src2 = force_reg (mode, src2);
898       else
899 	src1 = force_reg (mode, src1);
900     }
901 
902   /* If the destination is memory, and we do not have matching source
903      operands, do things in registers.  */
904   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
905     dst = gen_reg_rtx (mode);
906 
907   /* Source 1 cannot be a constant.  */
908   if (CONSTANT_P (src1))
909     src1 = force_reg (mode, src1);
910 
911   /* Source 1 cannot be a non-matching memory.  */
912   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
913     src1 = force_reg (mode, src1);
914 
915   /* Improve address combine.  */
916   if (code == PLUS
917       && GET_MODE_CLASS (mode) == MODE_INT
918       && MEM_P (src2))
919     src2 = force_reg (mode, src2);
920 
921   operands[1] = src1;
922   operands[2] = src2;
923   return dst;
924 }
925 
926 /* Similarly, but assume that the destination has already been
927    set up properly.  */
928 
929 void
930 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
931 				    machine_mode mode, rtx operands[])
932 {
933   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
934   gcc_assert (dst == operands[0]);
935 }
936 
937 /* Attempt to expand a binary operator.  Make the expansion closer to the
938    actual machine than just general_operand, which would allow 3 separate
939    memory references (one output, two input) in a single insn.  */
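/* Most x86 integer ALU instructions are two-operand and set the flags,
   so except for the LEA case below the SET is wrapped in a PARALLEL
   with an explicit clobber of FLAGS_REG.  */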
940 
941 void
942 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
943 			     rtx operands[])
944 {
945   rtx src1, src2, dst, op, clob;
946 
947   dst = ix86_fixup_binary_operands (code, mode, operands);
948   src1 = operands[1];
949   src2 = operands[2];
950 
951  /* Emit the instruction.  */
952 
953   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
954 
955   if (reload_completed
956       && code == PLUS
957       && !rtx_equal_p (dst, src1))
958     {
959       /* This is going to be an LEA; avoid splitting it later.  */
960       emit_insn (op);
961     }
962   else
963     {
964       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
965       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
966     }
967 
968   /* Fix up the destination if needed.  */
969   if (dst != operands[0])
970     emit_move_insn (operands[0], dst);
971 }
972 
973 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
974    the given OPERANDS.  */
975 
976 void
977 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
978 				     rtx operands[])
979 {
980   rtx op1 = NULL_RTX, op2 = NULL_RTX;
981   if (SUBREG_P (operands[1]))
982     {
983       op1 = operands[1];
984       op2 = operands[2];
985     }
986   else if (SUBREG_P (operands[2]))
987     {
988       op1 = operands[2];
989       op2 = operands[1];
990     }
991   /* Optimize (__m128i) d | (__m128i) e and similar code
992      when d and e are float vectors into float vector logical
993      insn.  In C/C++ without using intrinsics there is no other way
994      to express vector logical operation on float vectors than
995      to cast them temporarily to integer vectors.  */
996   if (op1
997       && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
998       && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
999       && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1000       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1001       && SUBREG_BYTE (op1) == 0
1002       && (GET_CODE (op2) == CONST_VECTOR
1003 	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1004 	      && SUBREG_BYTE (op2) == 0))
1005       && can_create_pseudo_p ())
1006     {
1007       rtx dst;
1008       switch (GET_MODE (SUBREG_REG (op1)))
1009 	{
1010 	case E_V4SFmode:
1011 	case E_V8SFmode:
1012 	case E_V16SFmode:
1013 	case E_V2DFmode:
1014 	case E_V4DFmode:
1015 	case E_V8DFmode:
1016 	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1017 	  if (GET_CODE (op2) == CONST_VECTOR)
1018 	    {
1019 	      op2 = gen_lowpart (GET_MODE (dst), op2);
1020 	      op2 = force_reg (GET_MODE (dst), op2);
1021 	    }
1022 	  else
1023 	    {
1024 	      op1 = operands[1];
1025 	      op2 = SUBREG_REG (operands[2]);
1026 	      if (!vector_operand (op2, GET_MODE (dst)))
1027 		op2 = force_reg (GET_MODE (dst), op2);
1028 	    }
1029 	  op1 = SUBREG_REG (op1);
1030 	  if (!vector_operand (op1, GET_MODE (dst)))
1031 	    op1 = force_reg (GET_MODE (dst), op1);
1032 	  emit_insn (gen_rtx_SET (dst,
1033 				  gen_rtx_fmt_ee (code, GET_MODE (dst),
1034 						  op1, op2)));
1035 	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
1036 	  return;
1037 	default:
1038 	  break;
1039 	}
1040     }
1041   if (!vector_operand (operands[1], mode))
1042     operands[1] = force_reg (mode, operands[1]);
1043   if (!vector_operand (operands[2], mode))
1044     operands[2] = force_reg (mode, operands[2]);
1045   ix86_fixup_binary_operands_no_copy (code, mode, operands);
1046   emit_insn (gen_rtx_SET (operands[0],
1047 			  gen_rtx_fmt_ee (code, mode, operands[1],
1048 					  operands[2])));
1049 }
1050 
1051 /* Return TRUE or FALSE depending on whether the binary operator meets the
1052    appropriate constraints.  */
1053 
1054 bool
1055 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1056 			 rtx operands[3])
1057 {
1058   rtx dst = operands[0];
1059   rtx src1 = operands[1];
1060   rtx src2 = operands[2];
1061 
1062   /* The source operands cannot both be in memory.  */
1063   if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1064       && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1065     return false;
1066 
1067   /* Canonicalize operand order for commutative operators.  */
1068   if (ix86_swap_binary_operands_p (code, mode, operands))
1069     std::swap (src1, src2);
1070 
1071   /* If the destination is memory, we must have a matching source operand.  */
1072   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1073     return false;
1074 
1075   /* Source 1 cannot be a constant.  */
1076   if (CONSTANT_P (src1))
1077     return false;
1078 
1079   /* Source 1 cannot be a non-matching memory.  */
1080   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1081     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
1082     return (code == AND
1083 	    && (mode == HImode
1084 		|| mode == SImode
1085 		|| (TARGET_64BIT && mode == DImode))
1086 	    && satisfies_constraint_L (src2));
1087 
1088   return true;
1089 }
1090 
1091 /* Attempt to expand a unary operator.  Make the expansion closer to the
1092    actual machine than just general_operand, which would allow 2 separate
1093    memory references (one output, one input) in a single insn.  */
1094 
1095 void
1096 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1097 			    rtx operands[])
1098 {
1099   bool matching_memory = false;
1100   rtx src, dst, op, clob;
1101 
1102   dst = operands[0];
1103   src = operands[1];
1104 
1105   /* If the destination is memory, and we do not have matching source
1106      operands, do things in registers.  */
1107   if (MEM_P (dst))
1108     {
1109       if (rtx_equal_p (dst, src))
1110 	matching_memory = true;
1111       else
1112 	dst = gen_reg_rtx (mode);
1113     }
1114 
1115   /* When source operand is memory, destination must match.  */
1116   if (MEM_P (src) && !matching_memory)
1117     src = force_reg (mode, src);
1118 
1119   /* Emit the instruction.  */
1120 
1121   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1122 
1123   if (code == NOT)
1124     emit_insn (op);
1125   else
1126     {
1127       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1128       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1129     }
1130 
1131   /* Fix up the destination if needed.  */
1132   if (dst != operands[0])
1133     emit_move_insn (operands[0], dst);
1134 }
1135 
1136 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
1137 
1138 static void
1139 predict_jump (int prob)
1140 {
1141   rtx_insn *insn = get_last_insn ();
1142   gcc_assert (JUMP_P (insn));
1143   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1144 }
1145 
1146 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1147    divisor are within the range [0-255].  */
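/* The split tests (dividend | divisor) against ~0xff; when both values
   fit in 8 bits it uses the HImode/QImode divide, which leaves the
   quotient in AL and the remainder in AH, hence the QImode lowpart and
   the (8, 8) ZERO_EXTRACT used below to pick the results apart.  */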
1148 
1149 void
1150 ix86_split_idivmod (machine_mode mode, rtx operands[],
1151 		    bool unsigned_p)
1152 {
1153   rtx_code_label *end_label, *qimode_label;
1154   rtx div, mod;
1155   rtx_insn *insn;
1156   rtx scratch, tmp0, tmp1, tmp2;
1157   rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1158 
1159   switch (mode)
1160     {
1161     case E_SImode:
1162       if (GET_MODE (operands[0]) == SImode)
1163 	{
1164 	  if (GET_MODE (operands[1]) == SImode)
1165 	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1166 	  else
1167 	    gen_divmod4_1
1168 	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1169 	}
1170       else
1171 	gen_divmod4_1
1172 	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1173       break;
1174 
1175     case E_DImode:
1176       gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1177       break;
1178 
1179     default:
1180       gcc_unreachable ();
1181     }
1182 
1183   end_label = gen_label_rtx ();
1184   qimode_label = gen_label_rtx ();
1185 
1186   scratch = gen_reg_rtx (mode);
1187 
1188   /* Use 8bit unsigned divmod if dividend and divisor are within
1189      the range [0-255].  */
1190   emit_move_insn (scratch, operands[2]);
1191   scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1192 				 scratch, 1, OPTAB_DIRECT);
1193   emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1194   tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1195   tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1196   tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1197 			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1198 			       pc_rtx);
1199   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1200   predict_jump (REG_BR_PROB_BASE * 50 / 100);
1201   JUMP_LABEL (insn) = qimode_label;
1202 
1203   /* Generate the original signed/unsigned divmod.  */
1204   emit_insn (gen_divmod4_1 (operands[0], operands[1],
1205 			    operands[2], operands[3]));
1206 
1207   /* Branch to the end.  */
1208   emit_jump_insn (gen_jump (end_label));
1209   emit_barrier ();
1210 
1211   /* Generate 8bit unsigned divide.  */
1212   emit_label (qimode_label);
1213   /* Don't use operands[0] for result of 8bit divide since not all
1214      registers support QImode ZERO_EXTRACT.  */
1215   tmp0 = lowpart_subreg (HImode, scratch, mode);
1216   tmp1 = lowpart_subreg (HImode, operands[2], mode);
1217   tmp2 = lowpart_subreg (QImode, operands[3], mode);
1218   emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1219 
1220   if (unsigned_p)
1221     {
1222       div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1223       mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1224     }
1225   else
1226     {
1227       div = gen_rtx_DIV (mode, operands[2], operands[3]);
1228       mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1229     }
1230   if (mode == SImode)
1231     {
1232       if (GET_MODE (operands[0]) != SImode)
1233 	div = gen_rtx_ZERO_EXTEND (DImode, div);
1234       if (GET_MODE (operands[1]) != SImode)
1235 	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1236     }
1237 
1238   /* Extract remainder from AH.  */
1239   scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1240   tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1241 			       GEN_INT (8), GEN_INT (8));
1242   insn = emit_move_insn (operands[1], tmp1);
1243   set_unique_reg_note (insn, REG_EQUAL, mod);
1244 
1245   /* Zero extend quotient from AL.  */
1246   tmp1 = gen_lowpart (QImode, tmp0);
1247   insn = emit_insn (gen_extend_insn
1248 		    (operands[0], tmp1,
1249 		     GET_MODE (operands[0]), QImode, 1));
1250   set_unique_reg_note (insn, REG_EQUAL, div);
1251 
1252   emit_label (end_label);
1253 }
1254 
1255 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1256    matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
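/* The emitted RTL has the form
     (parallel [(set DST (CODE:MODE DST SRC))
                (clobber (reg:CC FLAGS_REG))])
   which is the shape the two-operand ALU patterns expect.  */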
1257 
1258 void
1259 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1260 		 rtx dst, rtx src)
1261 {
1262   rtx op, clob;
1263 
1264   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1265   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1266 
1267   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1268 }
1269 
1270 /* Return true if the definition of REGNO1 is nearest to the insn.  */
1271 
1272 static bool
1273 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1274 {
1275   rtx_insn *prev = insn;
1276   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1277 
1278   if (insn == start)
1279     return false;
1280   while (prev && prev != start)
1281     {
1282       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1283 	{
1284 	  prev = PREV_INSN (prev);
1285 	  continue;
1286 	}
1287       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1288 	return true;
1289       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1290 	return false;
1291       prev = PREV_INSN (prev);
1292     }
1293 
1294   /* None of the regs is defined in the bb.  */
1295   return false;
1296 }
1297 
1298 /* Split lea instructions into a sequence of instructions
1299    which are executed on the ALU to avoid AGU stalls.
1300    It is assumed that it is allowed to clobber the flags register
1301    at the lea position.  */
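/* E.g. a lea computing base + index * scale + disp is rebuilt from a
   plain move of one addend followed by ALU additions, with the scale
   emitted as a MULT so that peephole2 does not immediately turn the
   sequence back into a lea.  */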
1302 
1303 void
1304 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1305 {
1306   unsigned int regno0, regno1, regno2;
1307   struct ix86_address parts;
1308   rtx target, tmp;
1309   int ok, adds;
1310 
1311   ok = ix86_decompose_address (operands[1], &parts);
1312   gcc_assert (ok);
1313 
1314   target = gen_lowpart (mode, operands[0]);
1315 
1316   regno0 = true_regnum (target);
1317   regno1 = INVALID_REGNUM;
1318   regno2 = INVALID_REGNUM;
1319 
1320   if (parts.base)
1321     {
1322       parts.base = gen_lowpart (mode, parts.base);
1323       regno1 = true_regnum (parts.base);
1324     }
1325 
1326   if (parts.index)
1327     {
1328       parts.index = gen_lowpart (mode, parts.index);
1329       regno2 = true_regnum (parts.index);
1330     }
1331 
1332   if (parts.disp)
1333     parts.disp = gen_lowpart (mode, parts.disp);
1334 
1335   if (parts.scale > 1)
1336     {
1337       /* Case r1 = r1 + ...  */
1338       if (regno1 == regno0)
1339 	{
1340 	  /* If we have a case r1 = r1 + C * r2 then we
1341 	     should use multiplication which is very
1342 	     expensive.  Assume the cost model is wrong if we
1343 	     have such a case here.  */
1344 	  gcc_assert (regno2 != regno0);
1345 
1346 	  for (adds = parts.scale; adds > 0; adds--)
1347 	    ix86_emit_binop (PLUS, mode, target, parts.index);
1348 	}
1349       else
1350 	{
1351 	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
1352 	  if (regno0 != regno2)
1353 	    emit_insn (gen_rtx_SET (target, parts.index));
1354 
1355 	  /* Use shift for scaling, but emit it as MULT instead
1356 	     to avoid it being immediately peephole2 optimized back
1357 	     into lea.  */
1358 	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1359 
1360 	  if (parts.base)
1361 	    ix86_emit_binop (PLUS, mode, target, parts.base);
1362 
1363 	  if (parts.disp && parts.disp != const0_rtx)
1364 	    ix86_emit_binop (PLUS, mode, target, parts.disp);
1365 	}
1366     }
1367   else if (!parts.base && !parts.index)
1368     {
1369       gcc_assert(parts.disp);
1370       emit_insn (gen_rtx_SET (target, parts.disp));
1371     }
1372   else
1373     {
1374       if (!parts.base)
1375 	{
1376 	  if (regno0 != regno2)
1377 	    emit_insn (gen_rtx_SET (target, parts.index));
1378 	}
1379       else if (!parts.index)
1380 	{
1381 	  if (regno0 != regno1)
1382 	    emit_insn (gen_rtx_SET (target, parts.base));
1383 	}
1384       else
1385 	{
1386 	  if (regno0 == regno1)
1387 	    tmp = parts.index;
1388 	  else if (regno0 == regno2)
1389 	    tmp = parts.base;
1390 	  else
1391 	    {
1392 	      rtx tmp1;
1393 
1394 	      /* Find better operand for SET instruction, depending
1395 		 on which definition is farther from the insn.  */
1396 	      if (find_nearest_reg_def (insn, regno1, regno2))
1397 		tmp = parts.index, tmp1 = parts.base;
1398 	      else
1399 		tmp = parts.base, tmp1 = parts.index;
1400 
1401 	      emit_insn (gen_rtx_SET (target, tmp));
1402 
1403 	      if (parts.disp && parts.disp != const0_rtx)
1404 		ix86_emit_binop (PLUS, mode, target, parts.disp);
1405 
1406 	      ix86_emit_binop (PLUS, mode, target, tmp1);
1407 	      return;
1408 	    }
1409 
1410 	  ix86_emit_binop (PLUS, mode, target, tmp);
1411 	}
1412 
1413       if (parts.disp && parts.disp != const0_rtx)
1414 	ix86_emit_binop (PLUS, mode, target, parts.disp);
1415     }
1416 }
1417 
1418 /* Post-reload splitter for converting an SF or DFmode value in an
1419    SSE register into an unsigned SImode.  */
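/* The idea: if the value is below 0x1p31 a plain signed conversion is
   correct; otherwise 0x1p31 is subtracted first and the sign bit of the
   integer result is flipped back in with the final XOR.  */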
1420 
1421 void
1422 ix86_split_convert_uns_si_sse (rtx operands[])
1423 {
1424   machine_mode vecmode;
1425   rtx value, large, zero_or_two31, input, two31, x;
1426 
1427   large = operands[1];
1428   zero_or_two31 = operands[2];
1429   input = operands[3];
1430   two31 = operands[4];
1431   vecmode = GET_MODE (large);
1432   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1433 
1434   /* Load up the value into the low element.  We must ensure that the other
1435      elements are valid floats -- zero is the easiest such value.  */
1436   if (MEM_P (input))
1437     {
1438       if (vecmode == V4SFmode)
1439 	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1440       else
1441 	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1442     }
1443   else
1444     {
1445       input = gen_rtx_REG (vecmode, REGNO (input));
1446       emit_move_insn (value, CONST0_RTX (vecmode));
1447       if (vecmode == V4SFmode)
1448 	emit_insn (gen_sse_movss (value, value, input));
1449       else
1450 	emit_insn (gen_sse2_movsd (value, value, input));
1451     }
1452 
1453   emit_move_insn (large, two31);
1454   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1455 
1456   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1457   emit_insn (gen_rtx_SET (large, x));
1458 
1459   x = gen_rtx_AND (vecmode, zero_or_two31, large);
1460   emit_insn (gen_rtx_SET (zero_or_two31, x));
1461 
1462   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1463   emit_insn (gen_rtx_SET (value, x));
1464 
1465   large = gen_rtx_REG (V4SImode, REGNO (large));
1466   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1467 
1468   x = gen_rtx_REG (V4SImode, REGNO (value));
1469   if (vecmode == V4SFmode)
1470     emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1471   else
1472     emit_insn (gen_sse2_cvttpd2dq (x, value));
1473   value = x;
1474 
1475   emit_insn (gen_xorv4si3 (value, value, large));
1476 }
1477 
1478 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1479 						 machine_mode mode, rtx target,
1480 						 rtx var, int one_var);
1481 
1482 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1483    Expects the 64-bit DImode to be supplied in a pair of integral
1484    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
1485    -mfpmath=sse, !optimize_size only.  */
1486 
1487 void
1488 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1489 {
1490   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1491   rtx int_xmm, fp_xmm;
1492   rtx biases, exponents;
1493   rtx x;
1494 
1495   int_xmm = gen_reg_rtx (V4SImode);
1496   if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1497     emit_insn (gen_movdi_to_sse (int_xmm, input));
1498   else if (TARGET_SSE_SPLIT_REGS)
1499     {
1500       emit_clobber (int_xmm);
1501       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1502     }
1503   else
1504     {
1505       x = gen_reg_rtx (V2DImode);
1506       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1507       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1508     }
1509 
1510   x = gen_rtx_CONST_VECTOR (V4SImode,
1511 			    gen_rtvec (4, GEN_INT (0x43300000UL),
1512 				       GEN_INT (0x45300000UL),
1513 				       const0_rtx, const0_rtx));
1514   exponents = validize_mem (force_const_mem (V4SImode, x));
1515 
1516   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1517   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1518 
1519   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1520      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1521      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1522      (0x1.0p84 + double(fp_value_hi_xmm)).
1523      Note these exponents differ by 32.  */
1524 
1525   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1526 
1527   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1528      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
1529   real_ldexp (&bias_lo_rvt, &dconst1, 52);
1530   real_ldexp (&bias_hi_rvt, &dconst1, 84);
1531   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1532   x = const_double_from_real_value (bias_hi_rvt, DFmode);
1533   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1534   biases = validize_mem (force_const_mem (V2DFmode, biases));
1535   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1536 
1537   /* Add the upper and lower DFmode values together.  */
1538   if (TARGET_SSE3)
1539     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1540   else
1541     {
1542       x = copy_to_mode_reg (V2DFmode, fp_xmm);
1543       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1544       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1545     }
1546 
1547   ix86_expand_vector_extract (false, target, fp_xmm, 0);
1548 }
1549 
1550 /* Not used, but eases macroization of patterns.  */
1551 void
1552 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1553 {
1554   gcc_unreachable ();
1555 }
1556 
1557 /* Convert an unsigned SImode value into a DFmode.  Only currently used
1558    for SSE, but applicable anywhere.  */
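/* The input is wrapped into the signed range by adding -0x80000000,
   converted with a signed SImode->DFmode conversion, and then 0x1p31 is
   added back as a double.  */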
1559 
1560 void
1561 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1562 {
1563   REAL_VALUE_TYPE TWO31r;
1564   rtx x, fp;
1565 
1566   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1567 			   NULL, 1, OPTAB_DIRECT);
1568 
1569   fp = gen_reg_rtx (DFmode);
1570   emit_insn (gen_floatsidf2 (fp, x));
1571 
1572   real_ldexp (&TWO31r, &dconst1, 31);
1573   x = const_double_from_real_value (TWO31r, DFmode);
1574 
1575   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1576   if (x != target)
1577     emit_move_insn (target, x);
1578 }
1579 
1580 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
1581    32-bit mode; otherwise we have a direct convert instruction.  */
1582 
1583 void
1584 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1585 {
1586   REAL_VALUE_TYPE TWO32r;
1587   rtx fp_lo, fp_hi, x;
1588 
1589   fp_lo = gen_reg_rtx (DFmode);
1590   fp_hi = gen_reg_rtx (DFmode);
1591 
1592   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1593 
1594   real_ldexp (&TWO32r, &dconst1, 32);
1595   x = const_double_from_real_value (TWO32r, DFmode);
1596   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1597 
1598   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1599 
1600   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1601 			   0, OPTAB_DIRECT);
1602   if (x != target)
1603     emit_move_insn (target, x);
1604 }
1605 
1606 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1607    For x86_32, -mfpmath=sse, !optimize_size only.  */
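/* The input is split into its low and high 16-bit halves, each half is
   converted with a signed conversion (both halves are non-negative),
   and the result is recombined as hi * 0x1p16 + lo.  */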
1608 void
1609 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1610 {
1611   REAL_VALUE_TYPE ONE16r;
1612   rtx fp_hi, fp_lo, int_hi, int_lo, x;
1613 
1614   real_ldexp (&ONE16r, &dconst1, 16);
1615   x = const_double_from_real_value (ONE16r, SFmode);
1616   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1617 				      NULL, 0, OPTAB_DIRECT);
1618   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1619 				      NULL, 0, OPTAB_DIRECT);
1620   fp_hi = gen_reg_rtx (SFmode);
1621   fp_lo = gen_reg_rtx (SFmode);
1622   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1623   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1624   fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1625 			       0, OPTAB_DIRECT);
1626   fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1627 			       0, OPTAB_DIRECT);
1628   if (!rtx_equal_p (target, fp_hi))
1629     emit_move_insn (target, fp_hi);
1630 }
1631 
1632 /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
1633    a vector of unsigned ints VAL to vector of floats TARGET.  */
1634 
1635 void
1636 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1637 {
1638   rtx tmp[8];
1639   REAL_VALUE_TYPE TWO16r;
1640   machine_mode intmode = GET_MODE (val);
1641   machine_mode fltmode = GET_MODE (target);
1642   rtx (*cvt) (rtx, rtx);
1643 
1644   if (intmode == V4SImode)
1645     cvt = gen_floatv4siv4sf2;
1646   else
1647     cvt = gen_floatv8siv8sf2;
1648   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1649   tmp[0] = force_reg (intmode, tmp[0]);
1650   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1651 				OPTAB_DIRECT);
1652   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1653 				NULL_RTX, 1, OPTAB_DIRECT);
1654   tmp[3] = gen_reg_rtx (fltmode);
1655   emit_insn (cvt (tmp[3], tmp[1]));
1656   tmp[4] = gen_reg_rtx (fltmode);
1657   emit_insn (cvt (tmp[4], tmp[2]));
1658   real_ldexp (&TWO16r, &dconst1, 16);
1659   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1660   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1661   tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1662 				OPTAB_DIRECT);
1663   tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1664 				OPTAB_DIRECT);
1665   if (tmp[7] != target)
1666     emit_move_insn (target, tmp[7]);
1667 }
1668 
1669 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1670    pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1671    This is done by doing just signed conversion if < 0x1p31, and otherwise by
1672    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
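/* Values of 0x1p31 and above do not fit in a signed SImode result, so
   they are reduced by 0x1p31 before the conversion and the returned
   *XORP mask flips bit 31 of the converted integers to compensate.  */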
1673 
1674 rtx
1675 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1676 {
1677   REAL_VALUE_TYPE TWO31r;
1678   rtx two31r, tmp[4];
1679   machine_mode mode = GET_MODE (val);
1680   machine_mode scalarmode = GET_MODE_INNER (mode);
1681   machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1682   rtx (*cmp) (rtx, rtx, rtx, rtx);
1683   int i;
1684 
1685   for (i = 0; i < 3; i++)
1686     tmp[i] = gen_reg_rtx (mode);
1687   real_ldexp (&TWO31r, &dconst1, 31);
1688   two31r = const_double_from_real_value (TWO31r, scalarmode);
1689   two31r = ix86_build_const_vector (mode, 1, two31r);
1690   two31r = force_reg (mode, two31r);
1691   switch (mode)
1692     {
1693     case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1694     case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1695     case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1696     case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1697     default: gcc_unreachable ();
1698     }
1699   tmp[3] = gen_rtx_LE (mode, two31r, val);
1700   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1701   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1702 				0, OPTAB_DIRECT);
1703   if (intmode == V4SImode || TARGET_AVX2)
1704     *xorp = expand_simple_binop (intmode, ASHIFT,
1705 				 gen_lowpart (intmode, tmp[0]),
1706 				 GEN_INT (31), NULL_RTX, 0,
1707 				 OPTAB_DIRECT);
1708   else
1709     {
1710       rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1711       two31 = ix86_build_const_vector (intmode, 1, two31);
1712       *xorp = expand_simple_binop (intmode, AND,
1713 				   gen_lowpart (intmode, tmp[0]),
1714 				   two31, NULL_RTX, 0,
1715 				   OPTAB_DIRECT);
1716     }
1717   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1718 			      0, OPTAB_DIRECT);
1719 }
1720 
1721 /* Generate code for floating point ABS or NEG.  */
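/* With SSE this is done with bitwise mask operations on the IEEE sign bit:
   roughly, abs (x) is x & ~SIGN_MASK and neg (x) is x ^ SIGN_MASK, where
   SIGN_MASK has only the sign bit of each element set (see
   ix86_build_signbit_mask).  */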
1722 
1723 void
1724 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1725 				rtx operands[])
1726 {
1727   rtx set, dst, src;
1728   bool use_sse = false;
1729   bool vector_mode = VECTOR_MODE_P (mode);
1730   machine_mode vmode = mode;
1731   rtvec par;
1732 
1733   if (vector_mode || mode == TFmode)
1734     use_sse = true;
1735   else if (TARGET_SSE_MATH)
1736     {
1737       use_sse = SSE_FLOAT_MODE_P (mode);
1738       if (mode == SFmode)
1739 	vmode = V4SFmode;
1740       else if (mode == DFmode)
1741 	vmode = V2DFmode;
1742     }
1743 
1744   dst = operands[0];
1745   src = operands[1];
1746 
1747   set = gen_rtx_fmt_e (code, mode, src);
1748   set = gen_rtx_SET (dst, set);
1749 
1750   if (use_sse)
1751     {
1752       rtx mask, use, clob;
1753 
1754       /* NEG and ABS performed with SSE use bitwise mask operations.
1755 	 Create the appropriate mask now.  */
1756       mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1757       use = gen_rtx_USE (VOIDmode, mask);
1758       if (vector_mode || mode == TFmode)
1759 	par = gen_rtvec (2, set, use);
1760       else
1761 	{
1762           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1763 	  par = gen_rtvec (3, set, use, clob);
1764         }
1765     }
1766   else
1767     {
1768       rtx clob;
1769 
1770       /* Changing the sign of FP values is also doable using the integer unit.  */
1771       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1772       par = gen_rtvec (2, set, clob);
1773     }
1774 
1775   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1776 }
1777 
1778 /* Deconstruct a floating point ABS or NEG operation
1779    with integer registers into integer operations.  */
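/* For example, in the SFmode case the sign lives in bit 31 of the SImode
   view of the value, so abs becomes "and $0x7fffffff" and neg becomes
   "xor $0x80000000"; DFmode and XFmode only touch the word that holds the
   sign bit.  */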
1780 
1781 void
1782 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1783 			       rtx operands[])
1784 {
1785   enum rtx_code absneg_op;
1786   rtx dst, set;
1787 
1788   gcc_assert (operands_match_p (operands[0], operands[1]));
1789 
1790   switch (mode)
1791     {
1792     case E_SFmode:
1793       dst = gen_lowpart (SImode, operands[0]);
1794 
1795       if (code == ABS)
1796 	{
1797 	  set = gen_int_mode (0x7fffffff, SImode);
1798 	  absneg_op = AND;
1799 	}
1800       else
1801 	{
1802 	  set = gen_int_mode (0x80000000, SImode);
1803 	  absneg_op = XOR;
1804 	}
1805       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1806       break;
1807 
1808     case E_DFmode:
1809       if (TARGET_64BIT)
1810 	{
1811 	  dst = gen_lowpart (DImode, operands[0]);
1812 	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1813 
1814 	  if (code == ABS)
1815 	    set = const0_rtx;
1816 	  else
1817 	    set = gen_rtx_NOT (DImode, dst);
1818 	}
1819       else
1820 	{
1821 	  dst = gen_highpart (SImode, operands[0]);
1822 
1823 	  if (code == ABS)
1824 	    {
1825 	      set = gen_int_mode (0x7fffffff, SImode);
1826 	      absneg_op = AND;
1827 	    }
1828 	  else
1829 	    {
1830 	      set = gen_int_mode (0x80000000, SImode);
1831 	      absneg_op = XOR;
1832 	    }
1833 	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1834 	}
1835       break;
1836 
1837     case E_XFmode:
1838       dst = gen_rtx_REG (SImode,
1839 			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1840       if (code == ABS)
1841 	{
1842 	  set = GEN_INT (0x7fff);
1843 	  absneg_op = AND;
1844 	}
1845       else
1846 	{
1847 	  set = GEN_INT (0x8000);
1848 	  absneg_op = XOR;
1849 	}
1850       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1851       break;
1852 
1853     default:
1854       gcc_unreachable ();
1855     }
1856 
1857   set = gen_rtx_SET (dst, set);
1858 
1859   rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1860   rtvec par = gen_rtvec (2, set, clob);
1861 
1862   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1863 }
1864 
1865 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
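/* The expansion relies on the bitwise identity
   copysign (x, y) = (x & ~SIGN_MASK) | (y & SIGN_MASK), where SIGN_MASK has
   only the sign bit of each element set; the split routines below implement
   it (with the x & ~SIGN_MASK half folded away when x is a constant).  */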
1866 
1867 void
1868 ix86_expand_copysign (rtx operands[])
1869 {
1870   machine_mode mode, vmode;
1871   rtx dest, op0, op1, mask;
1872 
1873   dest = operands[0];
1874   op0 = operands[1];
1875   op1 = operands[2];
1876 
1877   mode = GET_MODE (dest);
1878 
1879   if (mode == SFmode)
1880     vmode = V4SFmode;
1881   else if (mode == DFmode)
1882     vmode = V2DFmode;
1883   else if (mode == TFmode)
1884     vmode = mode;
1885   else
1886     gcc_unreachable ();
1887 
1888   mask = ix86_build_signbit_mask (vmode, 0, 0);
1889 
1890   if (CONST_DOUBLE_P (op0))
1891     {
1892       if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1893 	op0 = simplify_unary_operation (ABS, mode, op0, mode);
1894 
1895       if (mode == SFmode || mode == DFmode)
1896 	{
1897 	  if (op0 == CONST0_RTX (mode))
1898 	    op0 = CONST0_RTX (vmode);
1899 	  else
1900 	    {
1901 	      rtx v = ix86_build_const_vector (vmode, false, op0);
1902 
1903 	      op0 = force_reg (vmode, v);
1904 	    }
1905 	}
1906       else if (op0 != CONST0_RTX (mode))
1907 	op0 = force_reg (mode, op0);
1908 
1909       emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1910     }
1911   else
1912     {
1913       rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1914 
1915       emit_insn (gen_copysign3_var
1916 		 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1917     }
1918 }
1919 
1920 /* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
1921    be a constant, and so has already been expanded into a vector constant.  */
1922 
1923 void
1924 ix86_split_copysign_const (rtx operands[])
1925 {
1926   machine_mode mode, vmode;
1927   rtx dest, op0, mask, x;
1928 
1929   dest = operands[0];
1930   op0 = operands[1];
1931   mask = operands[3];
1932 
1933   mode = GET_MODE (dest);
1934   vmode = GET_MODE (mask);
1935 
1936   dest = lowpart_subreg (vmode, dest, mode);
1937   x = gen_rtx_AND (vmode, dest, mask);
1938   emit_insn (gen_rtx_SET (dest, x));
1939 
1940   if (op0 != CONST0_RTX (vmode))
1941     {
1942       x = gen_rtx_IOR (vmode, dest, op0);
1943       emit_insn (gen_rtx_SET (dest, x));
1944     }
1945 }
1946 
1947 /* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
1948    so we have to do two masks.  */
1949 
1950 void
1951 ix86_split_copysign_var (rtx operands[])
1952 {
1953   machine_mode mode, vmode;
1954   rtx dest, scratch, op0, op1, mask, nmask, x;
1955 
1956   dest = operands[0];
1957   scratch = operands[1];
1958   op0 = operands[2];
1959   op1 = operands[3];
1960   nmask = operands[4];
1961   mask = operands[5];
1962 
1963   mode = GET_MODE (dest);
1964   vmode = GET_MODE (mask);
1965 
1966   if (rtx_equal_p (op0, op1))
1967     {
1968       /* Shouldn't happen often (it's useless, obviously), but when it does
1969 	 we'd generate incorrect code if we continue below.  */
1970       emit_move_insn (dest, op0);
1971       return;
1972     }
1973 
1974   if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
1975     {
1976       gcc_assert (REGNO (op1) == REGNO (scratch));
1977 
1978       x = gen_rtx_AND (vmode, scratch, mask);
1979       emit_insn (gen_rtx_SET (scratch, x));
1980 
1981       dest = mask;
1982       op0 = lowpart_subreg (vmode, op0, mode);
1983       x = gen_rtx_NOT (vmode, dest);
1984       x = gen_rtx_AND (vmode, x, op0);
1985       emit_insn (gen_rtx_SET (dest, x));
1986     }
1987   else
1988     {
1989       if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
1990 	{
1991 	  x = gen_rtx_AND (vmode, scratch, mask);
1992 	}
1993       else						/* alternative 2,4 */
1994 	{
1995           gcc_assert (REGNO (mask) == REGNO (scratch));
1996           op1 = lowpart_subreg (vmode, op1, mode);
1997 	  x = gen_rtx_AND (vmode, scratch, op1);
1998 	}
1999       emit_insn (gen_rtx_SET (scratch, x));
2000 
2001       if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
2002 	{
2003 	  dest = lowpart_subreg (vmode, op0, mode);
2004 	  x = gen_rtx_AND (vmode, dest, nmask);
2005 	}
2006       else						/* alternative 3,4 */
2007 	{
2008           gcc_assert (REGNO (nmask) == REGNO (dest));
2009 	  dest = nmask;
2010 	  op0 = lowpart_subreg (vmode, op0, mode);
2011 	  x = gen_rtx_AND (vmode, dest, op0);
2012 	}
2013       emit_insn (gen_rtx_SET (dest, x));
2014     }
2015 
2016   x = gen_rtx_IOR (vmode, dest, scratch);
2017   emit_insn (gen_rtx_SET (dest, x));
2018 }
2019 
2020 /* Expand an xorsign operation.  */
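/* xorsign (x, y) is x with its sign flipped when y is negative, i.e.
   roughly x ^ (y & SIGN_MASK); see ix86_split_xorsign below.  */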
2021 
2022 void
2023 ix86_expand_xorsign (rtx operands[])
2024 {
2025   machine_mode mode, vmode;
2026   rtx dest, op0, op1, mask;
2027 
2028   dest = operands[0];
2029   op0 = operands[1];
2030   op1 = operands[2];
2031 
2032   mode = GET_MODE (dest);
2033 
2034   if (mode == SFmode)
2035     vmode = V4SFmode;
2036   else if (mode == DFmode)
2037     vmode = V2DFmode;
2038   else
2039     gcc_unreachable ();
2040 
2041   mask = ix86_build_signbit_mask (vmode, 0, 0);
2042 
2043   emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2044 }
2045 
2046 /* Deconstruct an xorsign operation into bit masks.  */
2047 
2048 void
2049 ix86_split_xorsign (rtx operands[])
2050 {
2051   machine_mode mode, vmode;
2052   rtx dest, op0, mask, x;
2053 
2054   dest = operands[0];
2055   op0 = operands[1];
2056   mask = operands[3];
2057 
2058   mode = GET_MODE (dest);
2059   vmode = GET_MODE (mask);
2060 
2061   dest = lowpart_subreg (vmode, dest, mode);
2062   x = gen_rtx_AND (vmode, dest, mask);
2063   emit_insn (gen_rtx_SET (dest, x));
2064 
2065   op0 = lowpart_subreg (vmode, op0, mode);
2066   x = gen_rtx_XOR (vmode, dest, op0);
2067   emit_insn (gen_rtx_SET (dest, x));
2068 }
2069 
2070 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2071 
2072 void
2073 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2074 {
2075   machine_mode mode = GET_MODE (op0);
2076   rtx tmp;
2077 
2078   /* Handle special case - vector comparison with boolean result; transform
2079      it using the ptest instruction.  */
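  /* I.e. a whole-vector a == b test is turned into "ptest t, t" on
     t = a ^ b, which sets ZF iff every element of t is zero.  */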
2080   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2081     {
2082       rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2083       machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2084 
2085       gcc_assert (code == EQ || code == NE);
2086       /* Generate XOR since we can't check that one operand is a zero vector.  */
2087       tmp = gen_reg_rtx (mode);
2088       emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2089       tmp = gen_lowpart (p_mode, tmp);
2090       emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2091 			      gen_rtx_UNSPEC (CCmode,
2092 					      gen_rtvec (2, tmp, tmp),
2093 					      UNSPEC_PTEST)));
2094       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2095       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2096 				  gen_rtx_LABEL_REF (VOIDmode, label),
2097 				  pc_rtx);
2098       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2099       return;
2100     }
2101 
2102   switch (mode)
2103     {
2104     case E_SFmode:
2105     case E_DFmode:
2106     case E_XFmode:
2107     case E_QImode:
2108     case E_HImode:
2109     case E_SImode:
2110       simple:
2111       tmp = ix86_expand_compare (code, op0, op1);
2112       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2113 				  gen_rtx_LABEL_REF (VOIDmode, label),
2114 				  pc_rtx);
2115       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2116       return;
2117 
2118     case E_DImode:
2119       if (TARGET_64BIT)
2120 	goto simple;
2121       /* For 32-bit targets a DImode comparison may be performed in
2122 	 SSE registers.  To allow this we must avoid the split into
2123 	 SImode, which is achieved by doing the xor in DImode and then
2124 	 comparing with zero (a pattern recognized by the STV pass).
2125 	 We don't compare using xor when optimizing
2126 	 for size.  */
2127       if (!optimize_insn_for_size_p ()
2128 	  && TARGET_STV
2129 	  && (code == EQ || code == NE))
2130 	{
2131 	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2132 	  op1 = const0_rtx;
2133 	}
2134       /* FALLTHRU */
2135     case E_TImode:
2136       /* Expand a double-word branch into multiple compare+branch.  */
2137       {
2138 	rtx lo[2], hi[2];
2139 	rtx_code_label *label2;
2140 	enum rtx_code code1, code2, code3;
2141 	machine_mode submode;
2142 
2143 	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2144 	  {
2145 	    std::swap (op0, op1);
2146 	    code = swap_condition (code);
2147 	  }
2148 
2149 	split_double_mode (mode, &op0, 1, lo+0, hi+0);
2150 	split_double_mode (mode, &op1, 1, lo+1, hi+1);
2151 
2152 	submode = mode == DImode ? SImode : DImode;
2153 
2154 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2155 	   avoid two branches.  This costs one extra insn, so disable when
2156 	   optimizing for size.  */
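	/* I.e. a == b  <=>  ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0, so a
	   single compare of the combined word against zero suffices.  */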
2157 
2158 	if ((code == EQ || code == NE)
2159 	    && (!optimize_insn_for_size_p ()
2160 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
2161 	  {
2162 	    rtx xor0, xor1;
2163 
2164 	    xor1 = hi[0];
2165 	    if (hi[1] != const0_rtx)
2166 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2167 				   NULL_RTX, 0, OPTAB_WIDEN);
2168 
2169 	    xor0 = lo[0];
2170 	    if (lo[1] != const0_rtx)
2171 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2172 				   NULL_RTX, 0, OPTAB_WIDEN);
2173 
2174 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
2175 				NULL_RTX, 0, OPTAB_WIDEN);
2176 
2177 	    ix86_expand_branch (code, tmp, const0_rtx, label);
2178 	    return;
2179 	  }
2180 
2181 	/* Otherwise, if we are doing less-than or greater-or-equal-than,
2182 	   op1 is a constant and the low word is zero, then we can just
2183 	   examine the high word.  Similarly for low word -1 and
2184 	   less-or-equal-than or greater-than.  */
2185 
2186 	if (CONST_INT_P (hi[1]))
2187 	  switch (code)
2188 	    {
2189 	    case LT: case LTU: case GE: case GEU:
2190 	      if (lo[1] == const0_rtx)
2191 		{
2192 		  ix86_expand_branch (code, hi[0], hi[1], label);
2193 		  return;
2194 		}
2195 	      break;
2196 	    case LE: case LEU: case GT: case GTU:
2197 	      if (lo[1] == constm1_rtx)
2198 		{
2199 		  ix86_expand_branch (code, hi[0], hi[1], label);
2200 		  return;
2201 		}
2202 	      break;
2203 	    default:
2204 	      break;
2205 	    }
2206 
2207 	/* Emulate comparisons that do not depend on Zero flag with
2208 	   double-word subtraction.  Note that only Overflow, Sign
2209 	   and Carry flags are valid, so swap arguments and condition
2210 	   of comparisons that would otherwise test Zero flag.  */
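	/* E.g. an unsigned a < b is computed as the borrow of the
	   double-word subtraction a - b: a cmp of the low words followed by
	   an sbb of the high words, branching on the resulting carry
	   (CCCmode) or sign/overflow (CCGZmode) flags.  */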
2211 
2212 	switch (code)
2213 	  {
2214 	  case LE: case LEU: case GT: case GTU:
2215 	    std::swap (lo[0], lo[1]);
2216 	    std::swap (hi[0], hi[1]);
2217 	    code = swap_condition (code);
2218 	    /* FALLTHRU */
2219 
2220 	  case LT: case LTU: case GE: case GEU:
2221 	    {
2222 	      bool uns = (code == LTU || code == GEU);
2223 	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2224 		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2225 
2226 	      if (!nonimmediate_operand (lo[0], submode))
2227 		lo[0] = force_reg (submode, lo[0]);
2228 	      if (!x86_64_general_operand (lo[1], submode))
2229 		lo[1] = force_reg (submode, lo[1]);
2230 
2231 	      if (!register_operand (hi[0], submode))
2232 		hi[0] = force_reg (submode, hi[0]);
2233 	      if ((uns && !nonimmediate_operand (hi[1], submode))
2234 		  || (!uns && !x86_64_general_operand (hi[1], submode)))
2235 		hi[1] = force_reg (submode, hi[1]);
2236 
2237 	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2238 
2239 	      tmp = gen_rtx_SCRATCH (submode);
2240 	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2241 
2242 	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2243 	      ix86_expand_branch (code, tmp, const0_rtx, label);
2244 	      return;
2245 	    }
2246 
2247 	  default:
2248 	    break;
2249 	  }
2250 
2251 	/* Otherwise, we need two or three jumps.  */
2252 
2253 	label2 = gen_label_rtx ();
2254 
2255 	code1 = code;
2256 	code2 = swap_condition (code);
2257 	code3 = unsigned_condition (code);
2258 
2259 	switch (code)
2260 	  {
2261 	  case LT: case GT: case LTU: case GTU:
2262 	    break;
2263 
2264 	  case LE:   code1 = LT;  code2 = GT;  break;
2265 	  case GE:   code1 = GT;  code2 = LT;  break;
2266 	  case LEU:  code1 = LTU; code2 = GTU; break;
2267 	  case GEU:  code1 = GTU; code2 = LTU; break;
2268 
2269 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
2270 	  case NE:   code2 = UNKNOWN; break;
2271 
2272 	  default:
2273 	    gcc_unreachable ();
2274 	  }
2275 
2276 	/*
2277 	 * a < b =>
2278 	 *    if (hi(a) < hi(b)) goto true;
2279 	 *    if (hi(a) > hi(b)) goto false;
2280 	 *    if (lo(a) < lo(b)) goto true;
2281 	 *  false:
2282 	 */
2283 
2284 	if (code1 != UNKNOWN)
2285 	  ix86_expand_branch (code1, hi[0], hi[1], label);
2286 	if (code2 != UNKNOWN)
2287 	  ix86_expand_branch (code2, hi[0], hi[1], label2);
2288 
2289 	ix86_expand_branch (code3, lo[0], lo[1], label);
2290 
2291 	if (code2 != UNKNOWN)
2292 	  emit_label (label2);
2293 	return;
2294       }
2295 
2296     default:
2297       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2298       goto simple;
2299     }
2300 }
2301 
2302 /* Figure out whether to use unordered fp comparisons.  */
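/* Broadly, under IEEE semantics the ordered relational predicates (LT, LE,
   GT, GE, LTGT) are allowed to raise the invalid exception on NaN operands,
   whereas EQ/NE and the UN* predicates must be quiet, so only the latter
   need the non-trapping (unordered) compare instructions.  */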
2303 
2304 static bool
2305 ix86_unordered_fp_compare (enum rtx_code code)
2306 {
2307   if (!TARGET_IEEE_FP)
2308     return false;
2309 
2310   switch (code)
2311     {
2312     case LT:
2313     case LE:
2314     case GT:
2315     case GE:
2316     case LTGT:
2317       return false;
2318 
2319     case EQ:
2320     case NE:
2321 
2322     case UNORDERED:
2323     case ORDERED:
2324     case UNLT:
2325     case UNLE:
2326     case UNGT:
2327     case UNGE:
2328     case UNEQ:
2329       return true;
2330 
2331     default:
2332       gcc_unreachable ();
2333     }
2334 }
2335 
2336 /* Return a comparison code that we can do and that is equivalent to
2337    swap_condition (code), apart possibly from orderedness.
2338    But never change orderedness if TARGET_IEEE_FP, returning
2339    UNKNOWN in that case if necessary.  */
2340 
2341 static enum rtx_code
2342 ix86_fp_swap_condition (enum rtx_code code)
2343 {
2344   switch (code)
2345     {
2346     case GT:                   /* GTU - CF=0 & ZF=0 */
2347       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2348     case GE:                   /* GEU - CF=0 */
2349       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2350     case UNLT:                 /* LTU - CF=1 */
2351       return TARGET_IEEE_FP ? UNKNOWN : GT;
2352     case UNLE:                 /* LEU - CF=1 | ZF=1 */
2353       return TARGET_IEEE_FP ? UNKNOWN : GE;
2354     default:
2355       return swap_condition (code);
2356     }
2357 }
2358 
2359 /* Return the cost of comparison CODE using the best strategy for performance.
2360    All of the following functions use the number of instructions as the cost metric.
2361    In the future this should be tweaked to compute bytes for optimize_size and
2362    take into account the performance of various instructions on various CPUs.  */
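/* For reference, the %ah bit-twiddling counted below corresponds to the
   IX86_FPCMP_ARITH sequences in ix86_expand_fp_compare: roughly an fnstsw
   plus a few test/and/cmp instructions on %ah, which is where the costs of
   4 to 6 come from.  */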
2363 
2364 static int
2365 ix86_fp_comparison_cost (enum rtx_code code)
2366 {
2367   int arith_cost;
2368 
2369   /* The cost of code using bit-twiddling on %ah.  */
2370   switch (code)
2371     {
2372     case UNLE:
2373     case UNLT:
2374     case LTGT:
2375     case GT:
2376     case GE:
2377     case UNORDERED:
2378     case ORDERED:
2379     case UNEQ:
2380       arith_cost = 4;
2381       break;
2382     case LT:
2383     case NE:
2384     case EQ:
2385     case UNGE:
2386       arith_cost = TARGET_IEEE_FP ? 5 : 4;
2387       break;
2388     case LE:
2389     case UNGT:
2390       arith_cost = TARGET_IEEE_FP ? 6 : 4;
2391       break;
2392     default:
2393       gcc_unreachable ();
2394     }
2395 
2396   switch (ix86_fp_comparison_strategy (code))
2397     {
2398     case IX86_FPCMP_COMI:
2399       return arith_cost > 4 ? 3 : 2;
2400     case IX86_FPCMP_SAHF:
2401       return arith_cost > 4 ? 4 : 3;
2402     default:
2403       return arith_cost;
2404     }
2405 }
2406 
2407 /* Swap, force into registers, or otherwise massage the two operands
2408    to a fp comparison.  The operands are updated in place; the new
2409    comparison code is returned.  */
2410 
2411 static enum rtx_code
2412 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2413 {
2414   bool unordered_compare = ix86_unordered_fp_compare (code);
2415   rtx op0 = *pop0, op1 = *pop1;
2416   machine_mode op_mode = GET_MODE (op0);
2417   bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2418 
2419   /* All of the unordered compare instructions only work on registers.
2420      The same is true of the fcomi compare instructions.  The XFmode
2421      compare instructions require registers except when comparing
2422      against zero or when converting operand 1 from fixed point to
2423      floating point.  */
2424 
2425   if (!is_sse
2426       && (unordered_compare
2427 	  || (op_mode == XFmode
2428 	      && ! (standard_80387_constant_p (op0) == 1
2429 		    || standard_80387_constant_p (op1) == 1)
2430 	      && GET_CODE (op1) != FLOAT)
2431 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2432     {
2433       op0 = force_reg (op_mode, op0);
2434       op1 = force_reg (op_mode, op1);
2435     }
2436   else
2437     {
2438       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
2439 	 things around if they appear profitable, otherwise force op0
2440 	 into a register.  */
2441 
2442       if (standard_80387_constant_p (op0) == 0
2443 	  || (MEM_P (op0)
2444 	      && ! (standard_80387_constant_p (op1) == 0
2445 		    || MEM_P (op1))))
2446 	{
2447 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
2448 	  if (new_code != UNKNOWN)
2449 	    {
2450 	      std::swap (op0, op1);
2451 	      code = new_code;
2452 	    }
2453 	}
2454 
2455       if (!REG_P (op0))
2456 	op0 = force_reg (op_mode, op0);
2457 
2458       if (CONSTANT_P (op1))
2459 	{
2460 	  int tmp = standard_80387_constant_p (op1);
2461 	  if (tmp == 0)
2462 	    op1 = validize_mem (force_const_mem (op_mode, op1));
2463 	  else if (tmp == 1)
2464 	    {
2465 	      if (TARGET_CMOVE)
2466 		op1 = force_reg (op_mode, op1);
2467 	    }
2468 	  else
2469 	    op1 = force_reg (op_mode, op1);
2470 	}
2471     }
2472 
2473   /* Try to rearrange the comparison to make it cheaper.  */
2474   if (ix86_fp_comparison_cost (code)
2475       > ix86_fp_comparison_cost (swap_condition (code))
2476       && (REG_P (op1) || can_create_pseudo_p ()))
2477     {
2478       std::swap (op0, op1);
2479       code = swap_condition (code);
2480       if (!REG_P (op0))
2481 	op0 = force_reg (op_mode, op0);
2482     }
2483 
2484   *pop0 = op0;
2485   *pop1 = op1;
2486   return code;
2487 }
2488 
2489 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
2490 
2491 static rtx
2492 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2493 {
2494   bool unordered_compare = ix86_unordered_fp_compare (code);
2495   machine_mode cmp_mode;
2496   rtx tmp, scratch;
2497 
2498   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2499 
2500   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2501   if (unordered_compare)
2502     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2503 
2504   /* Do fcomi/sahf based test when profitable.  */
2505   switch (ix86_fp_comparison_strategy (code))
2506     {
2507     case IX86_FPCMP_COMI:
2508       cmp_mode = CCFPmode;
2509       emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2510       break;
2511 
2512     case IX86_FPCMP_SAHF:
2513       cmp_mode = CCFPmode;
2514       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2515       scratch = gen_reg_rtx (HImode);
2516       emit_insn (gen_rtx_SET (scratch, tmp));
2517       emit_insn (gen_x86_sahf_1 (scratch));
2518       break;
2519 
2520     case IX86_FPCMP_ARITH:
2521       cmp_mode = CCNOmode;
2522       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2523       scratch = gen_reg_rtx (HImode);
2524       emit_insn (gen_rtx_SET (scratch, tmp));
2525 
2526       /* In the unordered case, we have to check C2 for NaN's, which
2527 	 doesn't happen to work out to anything nice combination-wise.
2528 	 So do some bit twiddling on the value we've got in AH to come
2529 	 up with an appropriate set of condition codes.  */
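      /* For reference: after fnstsw the relevant x87 condition bits sit in
	 %ah as C0 = 0x01, C2 = 0x04 and C3 = 0x40, so the recurring 0x45
	 constant below tests all three at once.  */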
2530 
2531       switch (code)
2532 	{
2533 	case GT:
2534 	case UNGT:
2535 	  if (code == GT || !TARGET_IEEE_FP)
2536 	    {
2537 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2538 	      code = EQ;
2539 	    }
2540 	  else
2541 	    {
2542 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2543 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2544 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2545 	      cmp_mode = CCmode;
2546 	      code = GEU;
2547 	    }
2548 	  break;
2549 	case LT:
2550 	case UNLT:
2551 	  if (code == LT && TARGET_IEEE_FP)
2552 	    {
2553 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2554 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2555 	      cmp_mode = CCmode;
2556 	      code = EQ;
2557 	    }
2558 	  else
2559 	    {
2560 	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2561 	      code = NE;
2562 	    }
2563 	  break;
2564 	case GE:
2565 	case UNGE:
2566 	  if (code == GE || !TARGET_IEEE_FP)
2567 	    {
2568 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2569 	      code = EQ;
2570 	    }
2571 	  else
2572 	    {
2573 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2574 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2575 	      code = NE;
2576 	    }
2577 	  break;
2578 	case LE:
2579 	case UNLE:
2580 	  if (code == LE && TARGET_IEEE_FP)
2581 	    {
2582 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2583 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2584 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2585 	      cmp_mode = CCmode;
2586 	      code = LTU;
2587 	    }
2588 	  else
2589 	    {
2590 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2591 	      code = NE;
2592 	    }
2593 	  break;
2594 	case EQ:
2595 	case UNEQ:
2596 	  if (code == EQ && TARGET_IEEE_FP)
2597 	    {
2598 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2599 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2600 	      cmp_mode = CCmode;
2601 	      code = EQ;
2602 	    }
2603 	  else
2604 	    {
2605 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2606 	      code = NE;
2607 	    }
2608 	  break;
2609 	case NE:
2610 	case LTGT:
2611 	  if (code == NE && TARGET_IEEE_FP)
2612 	    {
2613 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2614 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2615 					     GEN_INT (0x40)));
2616 	      code = NE;
2617 	    }
2618 	  else
2619 	    {
2620 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2621 	      code = EQ;
2622 	    }
2623 	  break;
2624 
2625 	case UNORDERED:
2626 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2627 	  code = NE;
2628 	  break;
2629 	case ORDERED:
2630 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2631 	  code = EQ;
2632 	  break;
2633 
2634 	default:
2635 	  gcc_unreachable ();
2636 	}
2637       break;
2638 
2639     default:
2640       gcc_unreachable ();
2641     }
2642 
2643   /* Return the test that should be put into the flags user, i.e.
2644      the bcc, scc, or cmov instruction.  */
2645   return gen_rtx_fmt_ee (code, VOIDmode,
2646 			 gen_rtx_REG (cmp_mode, FLAGS_REG),
2647 			 const0_rtx);
2648 }
2649 
2650 /* Generate insn patterns to do an integer compare of OPERANDS.  */
2651 
2652 static rtx
2653 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2654 {
2655   machine_mode cmpmode;
2656   rtx tmp, flags;
2657 
2658   cmpmode = SELECT_CC_MODE (code, op0, op1);
2659   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2660 
2661   /* This is very simple, but making the interface the same as in the
2662      FP case makes the rest of the code easier.  */
2663   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2664   emit_insn (gen_rtx_SET (flags, tmp));
2665 
2666   /* Return the test that should be put into the flags user, i.e.
2667      the bcc, scc, or cmov instruction.  */
2668   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2669 }
2670 
2671 static rtx
2672 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2673 {
2674   rtx ret;
2675 
2676   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2677     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2678 
2679   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2680     {
2681       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2682       ret = ix86_expand_fp_compare (code, op0, op1);
2683     }
2684   else
2685     ret = ix86_expand_int_compare (code, op0, op1);
2686 
2687   return ret;
2688 }
2689 
2690 void
2691 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2692 {
2693   rtx ret;
2694 
2695   gcc_assert (GET_MODE (dest) == QImode);
2696 
2697   ret = ix86_expand_compare (code, op0, op1);
2698   PUT_MODE (ret, QImode);
2699   emit_insn (gen_rtx_SET (dest, ret));
2700 }
2701 
2702 /* Expand comparison setting or clearing carry flag.  Return true when
2703    successful and set pop for the operation.  */
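/* The idea is to rewrite the comparison into LTU or GEU, whose result lives
   entirely in the carry flag; e.g. a == 0 becomes (unsigned) a < 1 and
   a >= 0 becomes (unsigned) a < 0x80000000 (see the cases below).  */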
2704 static bool
2705 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2706 {
2707   machine_mode mode
2708     = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2709 
2710   /* Do not handle double-mode compares that go through special path.  */
2711   if (mode == (TARGET_64BIT ? TImode : DImode))
2712     return false;
2713 
2714   if (SCALAR_FLOAT_MODE_P (mode))
2715     {
2716       rtx compare_op;
2717       rtx_insn *compare_seq;
2718 
2719       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2720 
2721       /* Shortcut:  the following common codes never translate
2722 	 into carry flag compares.  */
2723       if (code == EQ || code == NE || code == UNEQ || code == LTGT
2724 	  || code == ORDERED || code == UNORDERED)
2725 	return false;
2726 
2727       /* These comparisons require the zero flag; swap the operands so that they no longer do.  */
2728       if ((code == GT || code == UNLE || code == LE || code == UNGT)
2729 	  && !TARGET_IEEE_FP)
2730 	{
2731 	  std::swap (op0, op1);
2732 	  code = swap_condition (code);
2733 	}
2734 
2735       /* Try to expand the comparison and verify that we end up with
2736 	 a carry-flag-based comparison.  This fails only when we decide
2737 	 to expand the comparison using arithmetic, which is not a
2738 	 common scenario.  */
2739       start_sequence ();
2740       compare_op = ix86_expand_fp_compare (code, op0, op1);
2741       compare_seq = get_insns ();
2742       end_sequence ();
2743 
2744       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2745         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2746       else
2747 	code = GET_CODE (compare_op);
2748 
2749       if (code != LTU && code != GEU)
2750 	return false;
2751 
2752       emit_insn (compare_seq);
2753       *pop = compare_op;
2754       return true;
2755     }
2756 
2757   if (!INTEGRAL_MODE_P (mode))
2758     return false;
2759 
2760   switch (code)
2761     {
2762     case LTU:
2763     case GEU:
2764       break;
2765 
2766     /* Convert a==0 into (unsigned)a<1.  */
2767     case EQ:
2768     case NE:
2769       if (op1 != const0_rtx)
2770 	return false;
2771       op1 = const1_rtx;
2772       code = (code == EQ ? LTU : GEU);
2773       break;
2774 
2775     /* Convert a>b into b<a or a>=b-1.  */
2776     case GTU:
2777     case LEU:
2778       if (CONST_INT_P (op1))
2779 	{
2780 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2781 	  /* Bail out on overflow.  We could still swap the operands, but that
2782 	     would force loading the constant into a register.  */
2783 	  if (op1 == const0_rtx
2784 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2785 	    return false;
2786 	  code = (code == GTU ? GEU : LTU);
2787 	}
2788       else
2789 	{
2790 	  std::swap (op0, op1);
2791 	  code = (code == GTU ? LTU : GEU);
2792 	}
2793       break;
2794 
2795     /* Convert a>=0 into (unsigned)a<0x80000000.  */
2796     case LT:
2797     case GE:
2798       if (mode == DImode || op1 != const0_rtx)
2799 	return false;
2800       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2801       code = (code == LT ? GEU : LTU);
2802       break;
2803     case LE:
2804     case GT:
2805       if (mode == DImode || op1 != constm1_rtx)
2806 	return false;
2807       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2808       code = (code == LE ? GEU : LTU);
2809       break;
2810 
2811     default:
2812       return false;
2813     }
2814   /* Swapping operands may cause a constant to appear as the first operand.  */
2815   if (!nonimmediate_operand (op0, VOIDmode))
2816     {
2817       if (!can_create_pseudo_p ())
2818 	return false;
2819       op0 = force_reg (mode, op0);
2820     }
2821   *pop = ix86_expand_compare (code, op0, op1);
2822   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2823   return true;
2824 }
2825 
2826 /* Expand conditional increment or decrement using adc/sbb instructions.
2827    The default case using setcc followed by the conditional move can be
2828    done by generic code.  */
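/* E.g. "x = (unsigned) a < b ? x + 1 : x" becomes "cmp a, b" followed by
   "adc $0, x", letting the carry produced by the compare feed the
   increment; the decrement and reversed-condition forms use sbb or a -1
   addend instead.  */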
2829 bool
2830 ix86_expand_int_addcc (rtx operands[])
2831 {
2832   enum rtx_code code = GET_CODE (operands[1]);
2833   rtx flags;
2834   rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2835   rtx compare_op;
2836   rtx val = const0_rtx;
2837   bool fpcmp = false;
2838   machine_mode mode;
2839   rtx op0 = XEXP (operands[1], 0);
2840   rtx op1 = XEXP (operands[1], 1);
2841 
2842   if (operands[3] != const1_rtx
2843       && operands[3] != constm1_rtx)
2844     return false;
2845   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2846      return false;
2847   code = GET_CODE (compare_op);
2848 
2849   flags = XEXP (compare_op, 0);
2850 
2851   if (GET_MODE (flags) == CCFPmode)
2852     {
2853       fpcmp = true;
2854       code = ix86_fp_compare_code_to_integer (code);
2855     }
2856 
2857   if (code != LTU)
2858     {
2859       val = constm1_rtx;
2860       if (fpcmp)
2861 	PUT_CODE (compare_op,
2862 		  reverse_condition_maybe_unordered
2863 		    (GET_CODE (compare_op)));
2864       else
2865 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2866     }
2867 
2868   mode = GET_MODE (operands[0]);
2869 
2870   /* Construct either adc or sbb insn.  */
2871   if ((code == LTU) == (operands[3] == constm1_rtx))
2872     insn = gen_sub3_carry;
2873   else
2874     insn = gen_add3_carry;
2875 
2876   emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2877 
2878   return true;
2879 }
2880 
2881 bool
2882 ix86_expand_int_movcc (rtx operands[])
2883 {
2884   enum rtx_code code = GET_CODE (operands[1]), compare_code;
2885   rtx_insn *compare_seq;
2886   rtx compare_op;
2887   machine_mode mode = GET_MODE (operands[0]);
2888   bool sign_bit_compare_p = false;
2889   rtx op0 = XEXP (operands[1], 0);
2890   rtx op1 = XEXP (operands[1], 1);
2891 
2892   if (GET_MODE (op0) == TImode
2893       || (GET_MODE (op0) == DImode
2894 	  && !TARGET_64BIT))
2895     return false;
2896 
2897   start_sequence ();
2898   compare_op = ix86_expand_compare (code, op0, op1);
2899   compare_seq = get_insns ();
2900   end_sequence ();
2901 
2902   compare_code = GET_CODE (compare_op);
2903 
2904   if ((op1 == const0_rtx && (code == GE || code == LT))
2905       || (op1 == constm1_rtx && (code == GT || code == LE)))
2906     sign_bit_compare_p = true;
2907 
2908   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2909      HImode insns, we'd be swallowed in word prefix ops.  */
2910 
2911   if ((mode != HImode || TARGET_FAST_PREFIX)
2912       && (mode != (TARGET_64BIT ? TImode : DImode))
2913       && CONST_INT_P (operands[2])
2914       && CONST_INT_P (operands[3]))
2915     {
2916       rtx out = operands[0];
2917       HOST_WIDE_INT ct = INTVAL (operands[2]);
2918       HOST_WIDE_INT cf = INTVAL (operands[3]);
2919       HOST_WIDE_INT diff;
2920 
2921       diff = ct - cf;
2922       /* Sign bit compares are better done using shifts than by
2923 	  using sbb.  */
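      /* The sbb idiom first materializes the comparison as an all-ones or
	 all-zeros mask: after "cmp op0,op1; sbb dest,dest" DEST is -1 when
	 the carry was set and 0 otherwise, and the constants CT/CF are then
	 formed from that mask in the cases below.  */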
2924       if (sign_bit_compare_p
2925 	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2926 	{
2927 	  /* Detect overlap between destination and compare sources.  */
2928 	  rtx tmp = out;
2929 
2930           if (!sign_bit_compare_p)
2931 	    {
2932 	      rtx flags;
2933 	      bool fpcmp = false;
2934 
2935 	      compare_code = GET_CODE (compare_op);
2936 
2937 	      flags = XEXP (compare_op, 0);
2938 
2939 	      if (GET_MODE (flags) == CCFPmode)
2940 		{
2941 		  fpcmp = true;
2942 		  compare_code
2943 		    = ix86_fp_compare_code_to_integer (compare_code);
2944 		}
2945 
2946 	      /* To simplify rest of code, restrict to the GEU case.  */
2947 	      if (compare_code == LTU)
2948 		{
2949 		  std::swap (ct, cf);
2950 		  compare_code = reverse_condition (compare_code);
2951 		  code = reverse_condition (code);
2952 		}
2953 	      else
2954 		{
2955 		  if (fpcmp)
2956 		    PUT_CODE (compare_op,
2957 			      reverse_condition_maybe_unordered
2958 			        (GET_CODE (compare_op)));
2959 		  else
2960 		    PUT_CODE (compare_op,
2961 			      reverse_condition (GET_CODE (compare_op)));
2962 		}
2963 	      diff = ct - cf;
2964 
2965 	      if (reg_overlap_mentioned_p (out, op0)
2966 		  || reg_overlap_mentioned_p (out, op1))
2967 		tmp = gen_reg_rtx (mode);
2968 
2969 	      if (mode == DImode)
2970 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2971 	      else
2972 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
2973 						 flags, compare_op));
2974 	    }
2975 	  else
2976 	    {
2977 	      if (code == GT || code == GE)
2978 		code = reverse_condition (code);
2979 	      else
2980 		{
2981 		  std::swap (ct, cf);
2982 		  diff = ct - cf;
2983 		}
2984 	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2985 	    }
2986 
2987 	  if (diff == 1)
2988 	    {
2989 	      /*
2990 	       * cmpl op0,op1
2991 	       * sbbl dest,dest
2992 	       * [addl dest, ct]
2993 	       *
2994 	       * Size 5 - 8.
2995 	       */
2996 	      if (ct)
2997 		tmp = expand_simple_binop (mode, PLUS,
2998 					   tmp, GEN_INT (ct),
2999 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3000 	    }
3001 	  else if (cf == -1)
3002 	    {
3003 	      /*
3004 	       * cmpl op0,op1
3005 	       * sbbl dest,dest
3006 	       * orl $ct, dest
3007 	       *
3008 	       * Size 8.
3009 	       */
3010 	      tmp = expand_simple_binop (mode, IOR,
3011 					 tmp, GEN_INT (ct),
3012 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3013 	    }
3014 	  else if (diff == -1 && ct)
3015 	    {
3016 	      /*
3017 	       * cmpl op0,op1
3018 	       * sbbl dest,dest
3019 	       * notl dest
3020 	       * [addl dest, cf]
3021 	       *
3022 	       * Size 8 - 11.
3023 	       */
3024 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3025 	      if (cf)
3026 		tmp = expand_simple_binop (mode, PLUS,
3027 					   copy_rtx (tmp), GEN_INT (cf),
3028 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3029 	    }
3030 	  else
3031 	    {
3032 	      /*
3033 	       * cmpl op0,op1
3034 	       * sbbl dest,dest
3035 	       * [notl dest]
3036 	       * andl cf - ct, dest
3037 	       * [addl dest, ct]
3038 	       *
3039 	       * Size 8 - 11.
3040 	       */
3041 
3042 	      if (cf == 0)
3043 		{
3044 		  cf = ct;
3045 		  ct = 0;
3046 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3047 		}
3048 
3049 	      tmp = expand_simple_binop (mode, AND,
3050 					 copy_rtx (tmp),
3051 					 gen_int_mode (cf - ct, mode),
3052 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3053 	      if (ct)
3054 		tmp = expand_simple_binop (mode, PLUS,
3055 					   copy_rtx (tmp), GEN_INT (ct),
3056 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3057 	    }
3058 
3059 	  if (!rtx_equal_p (tmp, out))
3060 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3061 
3062 	  return true;
3063 	}
3064 
3065       if (diff < 0)
3066 	{
3067 	  machine_mode cmp_mode = GET_MODE (op0);
3068 	  enum rtx_code new_code;
3069 
3070 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
3071 	    {
3072 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3073 
3074 	      /* We may be reversing a non-trapping
3075 		 comparison to a trapping comparison.  */
3076 	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
3077 		  && code != EQ && code != NE
3078 		  && code != ORDERED && code != UNORDERED)
3079 		new_code = UNKNOWN;
3080 	      else
3081 		new_code = reverse_condition_maybe_unordered (code);
3082 	    }
3083 	  else
3084 	    new_code = ix86_reverse_condition (code, cmp_mode);
3085 	  if (new_code != UNKNOWN)
3086 	    {
3087 	      std::swap (ct, cf);
3088 	      diff = -diff;
3089 	      code = new_code;
3090 	    }
3091 	}
3092 
3093       compare_code = UNKNOWN;
3094       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3095 	  && CONST_INT_P (op1))
3096 	{
3097 	  if (op1 == const0_rtx
3098 	      && (code == LT || code == GE))
3099 	    compare_code = code;
3100 	  else if (op1 == constm1_rtx)
3101 	    {
3102 	      if (code == LE)
3103 		compare_code = LT;
3104 	      else if (code == GT)
3105 		compare_code = GE;
3106 	    }
3107 	}
3108 
3109       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
3110       if (compare_code != UNKNOWN
3111 	  && GET_MODE (op0) == GET_MODE (out)
3112 	  && (cf == -1 || ct == -1))
3113 	{
3114 	  /* If lea code below could be used, only optimize
3115 	     if it results in a 2 insn sequence.  */
3116 
3117 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3118 		 || diff == 3 || diff == 5 || diff == 9)
3119 	      || (compare_code == LT && ct == -1)
3120 	      || (compare_code == GE && cf == -1))
3121 	    {
3122 	      /*
3123 	       * notl op1	(if necessary)
3124 	       * sarl $31, op1
3125 	       * orl cf, op1
3126 	       */
3127 	      if (ct != -1)
3128 		{
3129 		  cf = ct;
3130 		  ct = -1;
3131 		  code = reverse_condition (code);
3132 		}
3133 
3134 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3135 
3136 	      out = expand_simple_binop (mode, IOR,
3137 					 out, GEN_INT (cf),
3138 					 out, 1, OPTAB_DIRECT);
3139 	      if (out != operands[0])
3140 		emit_move_insn (operands[0], out);
3141 
3142 	      return true;
3143 	    }
3144 	}
3145 
3146 
3147       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3148 	   || diff == 3 || diff == 5 || diff == 9)
3149 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3150 	  && (mode != DImode
3151 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3152 	{
3153 	  /*
3154 	   * xorl dest,dest
3155 	   * cmpl op1,op2
3156 	   * setcc dest
3157 	   * lea cf(dest*(ct-cf)),dest
3158 	   *
3159 	   * Size 14.
3160 	   *
3161 	   * This also catches the degenerate setcc-only case.
3162 	   */
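	  /* DIFF is restricted above to 1, 2, 3, 4, 5, 8 or 9 so that
	     dest * (ct - cf) + cf can be formed with a single lea using a
	     scale factor of 1, 2, 4 or 8, optionally plus the base
	     register.  */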
3163 
3164 	  rtx tmp;
3165 	  int nops;
3166 
3167 	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3168 
3169 	  nops = 0;
3170 	  /* On x86_64 the lea instruction operates on Pmode, so we need
3171 	     to do the arithmetic in the proper mode to match.  */
3172 	  if (diff == 1)
3173 	    tmp = copy_rtx (out);
3174 	  else
3175 	    {
3176 	      rtx out1;
3177 	      out1 = copy_rtx (out);
3178 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3179 	      nops++;
3180 	      if (diff & 1)
3181 		{
3182 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
3183 		  nops++;
3184 		}
3185 	    }
3186 	  if (cf != 0)
3187 	    {
3188 	      tmp = plus_constant (mode, tmp, cf);
3189 	      nops++;
3190 	    }
3191 	  if (!rtx_equal_p (tmp, out))
3192 	    {
3193 	      if (nops == 1)
3194 		out = force_operand (tmp, copy_rtx (out));
3195 	      else
3196 		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3197 	    }
3198 	  if (!rtx_equal_p (out, operands[0]))
3199 	    emit_move_insn (operands[0], copy_rtx (out));
3200 
3201 	  return true;
3202 	}
3203 
3204       /*
3205        * General case:			Jumpful:
3206        *   xorl dest,dest		cmpl op1, op2
3207        *   cmpl op1, op2		movl ct, dest
3208        *   setcc dest			jcc 1f
3209        *   decl dest			movl cf, dest
3210        *   andl (cf-ct),dest		1:
3211        *   addl ct,dest
3212        *
3213        * Size 20.			Size 14.
3214        *
3215        * This is reasonably steep, but branch mispredict costs are
3216        * high on modern cpus, so consider failing only if optimizing
3217        * for space.
3218        */
3219 
3220       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3221 	  && BRANCH_COST (optimize_insn_for_speed_p (),
3222 		  	  false) >= 2)
3223 	{
3224 	  if (cf == 0)
3225 	    {
3226 	      machine_mode cmp_mode = GET_MODE (op0);
3227 	      enum rtx_code new_code;
3228 
3229 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
3230 		{
3231 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3232 
3233 		  /* We may be reversing a non-trapping
3234 		     comparison to a trapping comparison.  */
3235 		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
3236 		      && code != EQ && code != NE
3237 		      && code != ORDERED && code != UNORDERED)
3238 		    new_code = UNKNOWN;
3239 		  else
3240 		    new_code = reverse_condition_maybe_unordered (code);
3241 
3242 		}
3243 	      else
3244 		{
3245 		  new_code = ix86_reverse_condition (code, cmp_mode);
3246 		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
3247 		    compare_code = reverse_condition (compare_code);
3248 		}
3249 
3250 	      if (new_code != UNKNOWN)
3251 		{
3252 		  cf = ct;
3253 		  ct = 0;
3254 		  code = new_code;
3255 		}
3256 	    }
3257 
3258 	  if (compare_code != UNKNOWN)
3259 	    {
3260 	      /* notl op1	(if needed)
3261 		 sarl $31, op1
3262 		 andl (cf-ct), op1
3263 		 addl ct, op1
3264 
3265 		 For x < 0 (resp. x <= -1) there will be no notl,
3266 		 so if possible swap the constants to get rid of the
3267 		 complement.
3268 		 True/false will be -1/0 while code below (store flag
3269 		 followed by decrement) is 0/-1, so the constants need
3270 		 to be exchanged once more.  */
3271 
3272 	      if (compare_code == GE || !cf)
3273 		{
3274 		  code = reverse_condition (code);
3275 		  compare_code = LT;
3276 		}
3277 	      else
3278 		std::swap (ct, cf);
3279 
3280 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3281 	    }
3282 	  else
3283 	    {
3284 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3285 
3286 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3287 					 constm1_rtx,
3288 					 copy_rtx (out), 1, OPTAB_DIRECT);
3289 	    }
3290 
3291 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
3292 				     gen_int_mode (cf - ct, mode),
3293 				     copy_rtx (out), 1, OPTAB_DIRECT);
3294 	  if (ct)
3295 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3296 				       copy_rtx (out), 1, OPTAB_DIRECT);
3297 	  if (!rtx_equal_p (out, operands[0]))
3298 	    emit_move_insn (operands[0], copy_rtx (out));
3299 
3300 	  return true;
3301 	}
3302     }
3303 
3304   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3305     {
3306       /* Try a few more things with specific constants and a variable.  */
3307 
3308       optab op;
3309       rtx var, orig_out, out, tmp;
3310 
3311       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3312 	return false;
3313 
3314       /* If one of the two operands is an interesting constant, load a
3315 	 constant with the above and mask it in with a logical operation.  */
3316 
3317       if (CONST_INT_P (operands[2]))
3318 	{
3319 	  var = operands[3];
3320 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3321 	    operands[3] = constm1_rtx, op = and_optab;
3322 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3323 	    operands[3] = const0_rtx, op = ior_optab;
3324 	  else
3325 	    return false;
3326 	}
3327       else if (CONST_INT_P (operands[3]))
3328 	{
3329 	  var = operands[2];
3330 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3331 	    {
3332 	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3333 		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
3334 	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3335 		operands[1] = simplify_gen_relational (LT, VOIDmode,
3336 						       GET_MODE (op0),
3337 						       op0, const0_rtx);
3338 
3339 	      operands[2] = constm1_rtx;
3340 	      op = and_optab;
3341 	    }
3342 	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3343 	    operands[2] = const0_rtx, op = ior_optab;
3344 	  else
3345 	    return false;
3346 	}
3347       else
3348         return false;
3349 
3350       orig_out = operands[0];
3351       tmp = gen_reg_rtx (mode);
3352       operands[0] = tmp;
3353 
3354       /* Recurse to get the constant loaded.  */
3355       if (!ix86_expand_int_movcc (operands))
3356         return false;
3357 
3358       /* Mask in the interesting variable.  */
3359       out = expand_binop (mode, op, var, tmp, orig_out, 0,
3360 			  OPTAB_WIDEN);
3361       if (!rtx_equal_p (out, orig_out))
3362 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3363 
3364       return true;
3365     }
3366 
3367   /*
3368    * For comparison with above,
3369    *
3370    * movl cf,dest
3371    * movl ct,tmp
3372    * cmpl op1,op2
3373    * cmovcc tmp,dest
3374    *
3375    * Size 15.
3376    */
3377 
3378   if (! nonimmediate_operand (operands[2], mode))
3379     operands[2] = force_reg (mode, operands[2]);
3380   if (! nonimmediate_operand (operands[3], mode))
3381     operands[3] = force_reg (mode, operands[3]);
3382 
3383   if (! register_operand (operands[2], VOIDmode)
3384       && (mode == QImode
3385           || ! register_operand (operands[3], VOIDmode)))
3386     operands[2] = force_reg (mode, operands[2]);
3387 
3388   if (mode == QImode
3389       && ! register_operand (operands[3], VOIDmode))
3390     operands[3] = force_reg (mode, operands[3]);
3391 
3392   emit_insn (compare_seq);
3393   emit_insn (gen_rtx_SET (operands[0],
3394 			  gen_rtx_IF_THEN_ELSE (mode,
3395 						compare_op, operands[2],
3396 						operands[3])));
3397   return true;
3398 }
3399 
3400 /* Detect conditional moves that exactly match min/max operational
3401    semantics.  Note that this is IEEE safe, as long as we don't
3402    interchange the operands.
3403 
3404    Returns FALSE if this conditional move doesn't match a MIN/MAX,
3405    and TRUE if the operation is successful and instructions are emitted.  */
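/* E.g. "x < y ? x : y" maps to minss/minps and "x < y ? y : x" to
   maxss/maxps.  The operand order matters because the SSE min/max
   instructions return the second source operand when the inputs are
   unordered or are both zero, which is exactly why the operands must not
   be interchanged under IEEE semantics.  */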
3406 
3407 static bool
3408 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3409 			   rtx cmp_op1, rtx if_true, rtx if_false)
3410 {
3411   machine_mode mode;
3412   bool is_min;
3413   rtx tmp;
3414 
3415   if (code == LT)
3416     ;
3417   else if (code == UNGE)
3418     std::swap (if_true, if_false);
3419   else
3420     return false;
3421 
3422   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3423     is_min = true;
3424   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3425     is_min = false;
3426   else
3427     return false;
3428 
3429   mode = GET_MODE (dest);
3430 
3431   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3432      but MODE may be a vector mode and thus not appropriate.  */
3433   if (!flag_finite_math_only || flag_signed_zeros)
3434     {
3435       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3436       rtvec v;
3437 
3438       if_true = force_reg (mode, if_true);
3439       v = gen_rtvec (2, if_true, if_false);
3440       tmp = gen_rtx_UNSPEC (mode, v, u);
3441     }
3442   else
3443     {
3444       code = is_min ? SMIN : SMAX;
3445       if (MEM_P (if_true) && MEM_P (if_false))
3446 	if_true = force_reg (mode, if_true);
3447       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3448     }
3449 
3450   emit_insn (gen_rtx_SET (dest, tmp));
3451   return true;
3452 }
3453 
3454 /* Return true if MODE is valid for a vector compare to a mask register;
3455    the same holds for a conditional vector move with a mask register.  */
3456 static bool
3457 ix86_valid_mask_cmp_mode (machine_mode mode)
3458 {
3459   /* XOP has its own vector conditional move instructions.  */
3460   if (TARGET_XOP && !TARGET_AVX512F)
3461     return false;
3462 
3463   /* AVX512F is needed for mask operation.  */
3464   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3465     return false;
3466 
3467   /* AVX512BW is needed for vector QI/HImode,
3468      AVX512VL is needed for 128/256-bit vectors.  */
3469   machine_mode inner_mode = GET_MODE_INNER (mode);
3470   int vector_size = GET_MODE_SIZE (mode);
3471   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3472     return false;
3473 
3474   return vector_size == 64 || TARGET_AVX512VL;
3475 }
3476 
3477 /* Return true if integer mask comparison should be used.  */
3478 static bool
3479 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3480 		     rtx op_true, rtx op_false)
3481 {
3482   if (GET_MODE_SIZE (mode) == 64)
3483     return true;
3484 
3485   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
3486   gcc_assert (!op_true == !op_false);
3487 
3488   /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3489      a vector dest is required.  */
3490   if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3491     return false;
3492 
3493   /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
3494   if (op_false == CONST0_RTX (mode)
3495       || op_true == CONST0_RTX (mode)
3496       || (INTEGRAL_MODE_P (mode)
3497 	  && (op_true == CONSTM1_RTX (mode)
3498 	      || op_false == CONSTM1_RTX (mode))))
3499     return false;
3500 
3501   return true;
3502 }
3503 
3504 /* Expand an SSE comparison.  Return the register with the result.  */
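/* When an AVX512 mask comparison is used, the result is a mask register
   whose mode has one bit per element (QImode for eight or fewer elements);
   otherwise it is a vector of per-element all-ones/all-zeros values in the
   comparison mode.  */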
3505 
3506 static rtx
3507 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3508 		     rtx op_true, rtx op_false)
3509 {
3510   machine_mode mode = GET_MODE (dest);
3511   machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3512 
3513   /* In the general case the mode of the comparison result can differ from the operands' mode.  */
3514   machine_mode cmp_mode;
3515 
3516   /* In AVX512F the result of comparison is an integer mask.  */
3517   bool maskcmp = false;
3518   rtx x;
3519 
3520   if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3521     {
3522       unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3523       maskcmp = true;
3524       cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3525     }
3526   else
3527     cmp_mode = cmp_ops_mode;
3528 
3529   cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3530 
3531   int (*op1_predicate)(rtx, machine_mode)
3532     = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3533 
3534   if (!op1_predicate (cmp_op1, cmp_ops_mode))
3535     cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3536 
3537   if (optimize
3538       || (maskcmp && cmp_mode != mode)
3539       || (op_true && reg_overlap_mentioned_p (dest, op_true))
3540       || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3541     dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3542 
3543   if (maskcmp)
3544     {
3545       bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3546       gcc_assert (ok);
3547       return dest;
3548     }
3549 
3550   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3551 
3552   if (cmp_mode != mode)
3553     {
3554       x = force_reg (cmp_ops_mode, x);
3555       convert_move (dest, x, false);
3556     }
3557   else
3558     emit_insn (gen_rtx_SET (dest, x));
3559 
3560   return dest;
3561 }
3562 
3563 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3564    operations.  This is used for both scalar and vector conditional moves.  */
3565 
3566 void
3567 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3568 {
3569   machine_mode mode = GET_MODE (dest);
3570   machine_mode cmpmode = GET_MODE (cmp);
3571 
3572   /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
3573   if (rtx_equal_p (op_true, op_false))
3574     {
3575       emit_move_insn (dest, op_true);
3576       return;
3577     }
3578 
3579   rtx t2, t3, x;
3580 
3581   /* If we have an integer mask and an FP value then we need
3582      to cast the mask to FP mode.  */
3583   if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3584     {
3585       cmp = force_reg (cmpmode, cmp);
3586       cmp = gen_rtx_SUBREG (mode, cmp, 0);
3587     }
3588 
3589   /* In AVX512F the result of comparison is an integer mask.  */
3590   if (mode != cmpmode
3591       && GET_MODE_CLASS (cmpmode) == MODE_INT)
3592     {
3593       gcc_assert (ix86_valid_mask_cmp_mode (mode));
3594       /* Using vector move with mask register.  */
3595       cmp = force_reg (cmpmode, cmp);
3596       /* Optimize for mask zero.  */
3597       op_true = (op_true != CONST0_RTX (mode)
3598 		 ? force_reg (mode, op_true) : op_true);
3599       op_false = (op_false != CONST0_RTX (mode)
3600 		  ? force_reg (mode, op_false) : op_false);
3601       if (op_true == CONST0_RTX (mode))
3602 	{
3603 	  rtx n = gen_reg_rtx (cmpmode);
3604 	  if (cmpmode == E_DImode && !TARGET_64BIT)
3605 	    emit_insn (gen_knotdi (n, cmp));
3606 	  else
3607 	    emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
3608 	  cmp = n;
3609 	  /* Swap op_true and op_false.  */
3610 	  std::swap (op_true, op_false);
3611 	}
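      /* I.e. "cmp ? 0 : op_false" has been rewritten as "~cmp ? op_false : 0",
	 so the vec_merge below still has only one non-trivial operand.  */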
3612 
3613       rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3614       emit_insn (gen_rtx_SET (dest, vec_merge));
3615       return;
3616     }
3617   else if (vector_all_ones_operand (op_true, mode)
3618 	   && op_false == CONST0_RTX (mode))
3619     {
3620       emit_insn (gen_rtx_SET (dest, cmp));
3621       return;
3622     }
3623   else if (op_false == CONST0_RTX (mode))
3624     {
3625       op_true = force_reg (mode, op_true);
3626       x = gen_rtx_AND (mode, cmp, op_true);
3627       emit_insn (gen_rtx_SET (dest, x));
3628       return;
3629     }
3630   else if (op_true == CONST0_RTX (mode))
3631     {
3632       op_false = force_reg (mode, op_false);
3633       x = gen_rtx_NOT (mode, cmp);
3634       x = gen_rtx_AND (mode, x, op_false);
3635       emit_insn (gen_rtx_SET (dest, x));
3636       return;
3637     }
3638   else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3639     {
3640       op_false = force_reg (mode, op_false);
3641       x = gen_rtx_IOR (mode, cmp, op_false);
3642       emit_insn (gen_rtx_SET (dest, x));
3643       return;
3644     }
3645   else if (TARGET_XOP)
3646     {
3647       op_true = force_reg (mode, op_true);
3648 
3649       if (!nonimmediate_operand (op_false, mode))
3650 	op_false = force_reg (mode, op_false);
3651 
3652       emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3653 							  op_true,
3654 							  op_false)));
3655       return;
3656     }
3657 
3658   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3659   rtx d = dest;
3660 
3661   if (!vector_operand (op_true, mode))
3662     op_true = force_reg (mode, op_true);
3663 
3664   op_false = force_reg (mode, op_false);
3665 
3666   switch (mode)
3667     {
3668     case E_V4SFmode:
3669       if (TARGET_SSE4_1)
3670 	gen = gen_sse4_1_blendvps;
3671       break;
3672     case E_V2DFmode:
3673       if (TARGET_SSE4_1)
3674 	gen = gen_sse4_1_blendvpd;
3675       break;
3676     case E_SFmode:
3677       if (TARGET_SSE4_1)
3678 	{
3679 	  gen = gen_sse4_1_blendvss;
3680 	  op_true = force_reg (mode, op_true);
3681 	}
3682       break;
3683     case E_DFmode:
3684       if (TARGET_SSE4_1)
3685 	{
3686 	  gen = gen_sse4_1_blendvsd;
3687 	  op_true = force_reg (mode, op_true);
3688 	}
3689       break;
3690     case E_V16QImode:
3691     case E_V8HImode:
3692     case E_V4SImode:
3693     case E_V2DImode:
3694       if (TARGET_SSE4_1)
3695 	{
3696 	  gen = gen_sse4_1_pblendvb;
3697 	  if (mode != V16QImode)
3698 	    d = gen_reg_rtx (V16QImode);
3699 	  op_false = gen_lowpart (V16QImode, op_false);
3700 	  op_true = gen_lowpart (V16QImode, op_true);
3701 	  cmp = gen_lowpart (V16QImode, cmp);
3702 	}
3703       break;
3704     case E_V8SFmode:
3705       if (TARGET_AVX)
3706 	gen = gen_avx_blendvps256;
3707       break;
3708     case E_V4DFmode:
3709       if (TARGET_AVX)
3710 	gen = gen_avx_blendvpd256;
3711       break;
3712     case E_V32QImode:
3713     case E_V16HImode:
3714     case E_V8SImode:
3715     case E_V4DImode:
3716       if (TARGET_AVX2)
3717 	{
3718 	  gen = gen_avx2_pblendvb;
3719 	  if (mode != V32QImode)
3720 	    d = gen_reg_rtx (V32QImode);
3721 	  op_false = gen_lowpart (V32QImode, op_false);
3722 	  op_true = gen_lowpart (V32QImode, op_true);
3723 	  cmp = gen_lowpart (V32QImode, cmp);
3724 	}
3725       break;
3726 
3727     case E_V64QImode:
3728       gen = gen_avx512bw_blendmv64qi;
3729       break;
3730     case E_V32HImode:
3731       gen = gen_avx512bw_blendmv32hi;
3732       break;
3733     case E_V16SImode:
3734       gen = gen_avx512f_blendmv16si;
3735       break;
3736     case E_V8DImode:
3737       gen = gen_avx512f_blendmv8di;
3738       break;
3739     case E_V8DFmode:
3740       gen = gen_avx512f_blendmv8df;
3741       break;
3742     case E_V16SFmode:
3743       gen = gen_avx512f_blendmv16sf;
3744       break;
3745 
3746     default:
3747       break;
3748     }
3749 
3750   if (gen != NULL)
3751     {
3752       emit_insn (gen (d, op_false, op_true, cmp));
3753       if (d != dest)
3754 	emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3755     }
3756   else
3757     {
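      /* No blend instruction is available for this mode, so emit the
	 generic dest = (op_true & cmp) | (op_false & ~cmp) sequence with
	 three logical operations.  */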
3758       op_true = force_reg (mode, op_true);
3759 
3760       t2 = gen_reg_rtx (mode);
3761       if (optimize)
3762 	t3 = gen_reg_rtx (mode);
3763       else
3764 	t3 = dest;
3765 
3766       x = gen_rtx_AND (mode, op_true, cmp);
3767       emit_insn (gen_rtx_SET (t2, x));
3768 
3769       x = gen_rtx_NOT (mode, cmp);
3770       x = gen_rtx_AND (mode, x, op_false);
3771       emit_insn (gen_rtx_SET (t3, x));
3772 
3773       x = gen_rtx_IOR (mode, t3, t2);
3774       emit_insn (gen_rtx_SET (dest, x));
3775     }
3776 }
3777 
3778 /* Swap, force into registers, or otherwise massage the two operands
3779    to an sse comparison with a mask result.  Thus we differ a bit from
3780    ix86_prepare_fp_compare_args which expects to produce a flags result.
3781 
3782    The DEST operand exists to help determine whether to commute commutative
3783    operators.  The POP0/POP1 operands are updated in place.  The new
3784    comparison code is returned, or UNKNOWN if not implementable.  */
3785 
3786 static enum rtx_code
3787 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3788 				  rtx *pop0, rtx *pop1)
3789 {
3790   switch (code)
3791     {
3792     case LTGT:
3793     case UNEQ:
3794       /* AVX supports all the needed comparisons.  */
3795       if (TARGET_AVX)
3796 	break;
3797       /* We have no LTGT as an operator.  We could implement it with
3798 	 NE & ORDERED, but this requires an extra temporary.  It's
3799 	 not clear that it's worth it.  */
3800       return UNKNOWN;
3801 
3802     case LT:
3803     case LE:
3804     case UNGT:
3805     case UNGE:
3806       /* These are supported directly.  */
3807       break;
3808 
3809     case EQ:
3810     case NE:
3811     case UNORDERED:
3812     case ORDERED:
3813       /* AVX has 3 operand comparisons, no need to swap anything.  */
3814       if (TARGET_AVX)
3815 	break;
3816       /* For commutative operators, try to canonicalize the destination
3817 	 operand to be first in the comparison - this helps reload to
3818 	 avoid extra moves.  */
3819       if (!dest || !rtx_equal_p (dest, *pop1))
3820 	break;
3821       /* FALLTHRU */
3822 
3823     case GE:
3824     case GT:
3825     case UNLE:
3826     case UNLT:
3827       /* These are not supported directly before AVX, and furthermore
3828 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
3829 	 comparison operands to transform into something that is
3830 	 supported.  */
3831       std::swap (*pop0, *pop1);
3832       code = swap_condition (code);
3833       break;
3834 
3835     default:
3836       gcc_unreachable ();
3837     }
3838 
3839   return code;
3840 }
3841 
3842 /* Expand a floating-point conditional move.  Return true if successful.  */
3843 
3844 bool
3845 ix86_expand_fp_movcc (rtx operands[])
3846 {
3847   machine_mode mode = GET_MODE (operands[0]);
3848   enum rtx_code code = GET_CODE (operands[1]);
3849   rtx tmp, compare_op;
3850   rtx op0 = XEXP (operands[1], 0);
3851   rtx op1 = XEXP (operands[1], 1);
3852 
3853   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3854     {
3855       machine_mode cmode;
3856 
3857       /* Since we've no cmove for sse registers, don't force bad register
3858 	 allocation just to gain access to it.  Deny movcc when the
3859 	 comparison mode doesn't match the move mode.  */
3860       cmode = GET_MODE (op0);
3861       if (cmode == VOIDmode)
3862 	cmode = GET_MODE (op1);
3863       if (cmode != mode)
3864 	return false;
3865 
3866       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3867       if (code == UNKNOWN)
3868 	return false;
3869 
3870       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3871 				     operands[2], operands[3]))
3872 	return true;
3873 
3874       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3875 				 operands[2], operands[3]);
3876       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3877       return true;
3878     }
3879 
3880   if (GET_MODE (op0) == TImode
3881       || (GET_MODE (op0) == DImode
3882 	  && !TARGET_64BIT))
3883     return false;
3884 
3885   /* The floating point conditional move instructions don't directly
3886      support conditions resulting from a signed integer comparison.  */
3887 
3888   compare_op = ix86_expand_compare (code, op0, op1);
3889   if (!fcmov_comparison_operator (compare_op, VOIDmode))
3890     {
3891       tmp = gen_reg_rtx (QImode);
3892       ix86_expand_setcc (tmp, code, op0, op1);
3893 
3894       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3895     }
3896 
3897   emit_insn (gen_rtx_SET (operands[0],
3898 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
3899 						operands[2], operands[3])));
3900 
3901   return true;
3902 }
3903 
3904 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
3905 
3906 static int
3907 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3908 {
3909   switch (code)
3910     {
3911     case EQ:
3912       return 0;
3913     case LT:
3914     case LTU:
3915       return 1;
3916     case LE:
3917     case LEU:
3918       return 2;
3919     case NE:
3920       return 4;
3921     case GE:
3922     case GEU:
3923       return 5;
3924     case GT:
3925     case GTU:
3926       return 6;
3927     default:
3928       gcc_unreachable ();
3929     }
3930 }
3931 
3932 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
3933 
3934 static int
3935 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3936 {
3937   switch (code)
3938     {
3939     case EQ:
3940       return 0x00;
3941     case NE:
3942       return 0x04;
3943     case GT:
3944       return 0x0e;
3945     case LE:
3946       return 0x02;
3947     case GE:
3948       return 0x0d;
3949     case LT:
3950       return 0x01;
3951     case UNLE:
3952       return 0x0a;
3953     case UNLT:
3954       return 0x09;
3955     case UNGE:
3956       return 0x05;
3957     case UNGT:
3958       return 0x06;
3959     case UNEQ:
3960       return 0x18;
3961     case LTGT:
3962       return 0x0c;
3963     case ORDERED:
3964       return 0x07;
3965     case UNORDERED:
3966       return 0x03;
3967     default:
3968       gcc_unreachable ();
3969     }
3970 }
3971 
3972 /* Return immediate value to be used in UNSPEC_PCMP
3973    for comparison CODE in MODE.  */
3974 
3975 static int
3976 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3977 {
3978   if (FLOAT_MODE_P (mode))
3979     return ix86_fp_cmp_code_to_pcmp_immediate (code);
3980   return ix86_int_cmp_code_to_pcmp_immediate (code);
3981 }
3982 
3983 /* Expand AVX-512 vector comparison.  */
3984 
3985 bool
3986 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3987 {
3988   machine_mode mask_mode = GET_MODE (dest);
3989   machine_mode cmp_mode = GET_MODE (cmp_op0);
3990   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3991   int unspec_code;
3992   rtx unspec;
3993 
3994   switch (code)
3995     {
3996     case LEU:
3997     case GTU:
3998     case GEU:
3999     case LTU:
4000       unspec_code = UNSPEC_UNSIGNED_PCMP;
4001       break;
4002 
4003     default:
4004       unspec_code = UNSPEC_PCMP;
4005     }
4006 
4007   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4008 			   unspec_code);
4009   emit_insn (gen_rtx_SET (dest, unspec));
4010 
4011   return true;
4012 }
4013 
4014 /* Expand fp vector comparison.  */
4015 
4016 bool
4017 ix86_expand_fp_vec_cmp (rtx operands[])
4018 {
4019   enum rtx_code code = GET_CODE (operands[1]);
4020   rtx cmp;
4021 
4022   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4023 					   &operands[2], &operands[3]);
4024   if (code == UNKNOWN)
4025     {
4026       rtx temp;
4027       switch (GET_CODE (operands[1]))
4028 	{
4029 	case LTGT:
4030 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4031 				      operands[3], NULL, NULL);
4032 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4033 				     operands[3], NULL, NULL);
4034 	  code = AND;
4035 	  break;
4036 	case UNEQ:
4037 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4038 				      operands[3], NULL, NULL);
4039 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4040 				     operands[3], NULL, NULL);
4041 	  code = IOR;
4042 	  break;
4043 	default:
4044 	  gcc_unreachable ();
4045 	}
4046       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4047 				 OPTAB_DIRECT);
4048     }
4049   else
4050     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4051 			       NULL, NULL);
4052 
4053   if (operands[0] != cmp)
4054     emit_move_insn (operands[0], cmp);
4055 
4056   return true;
4057 }
4058 
4059 static rtx
4060 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4061 			 rtx op_true, rtx op_false, bool *negate)
4062 {
4063   machine_mode data_mode = GET_MODE (dest);
4064   machine_mode mode = GET_MODE (cop0);
4065   rtx x;
4066 
4067   *negate = false;
4068 
4069   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
4070   if (TARGET_XOP
4071       && (mode == V16QImode || mode == V8HImode
4072 	  || mode == V4SImode || mode == V2DImode))
4073     ;
4074   /* AVX512F supports all of the comparisons
4075      on all 128/256/512-bit vector int types.  */
4076   else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4077     ;
4078   else
4079     {
4080       /* Canonicalize the comparison to EQ, GT, GTU.  */
4081       switch (code)
4082 	{
4083 	case EQ:
4084 	case GT:
4085 	case GTU:
4086 	  break;
4087 
4088 	case NE:
4089 	case LE:
4090 	case LEU:
4091 	  code = reverse_condition (code);
4092 	  *negate = true;
4093 	  break;
4094 
4095 	case GE:
4096 	case GEU:
4097 	  code = reverse_condition (code);
4098 	  *negate = true;
4099 	  /* FALLTHRU */
4100 
4101 	case LT:
4102 	case LTU:
4103 	  std::swap (cop0, cop1);
4104 	  code = swap_condition (code);
4105 	  break;
4106 
4107 	default:
4108 	  gcc_unreachable ();
4109 	}
4110 
4111       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
4112       if (mode == V2DImode)
4113 	{
4114 	  switch (code)
4115 	    {
4116 	    case EQ:
4117 	      /* SSE4.1 supports EQ.  */
4118 	      if (!TARGET_SSE4_1)
4119 		return NULL;
4120 	      break;
4121 
4122 	    case GT:
4123 	    case GTU:
4124 	      /* SSE4.2 supports GT/GTU.  */
4125 	      if (!TARGET_SSE4_2)
4126 		return NULL;
4127 	      break;
4128 
4129 	    default:
4130 	      gcc_unreachable ();
4131 	    }
4132 	}
4133 
4134       rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4135       rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4136       if (*negate)
4137 	std::swap (optrue, opfalse);
4138 
4139       /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4140 	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4141 	 min (x, y) == x).  While we add one instruction (the minimum),
4142 	 we remove the two instructions that would otherwise be needed to
4143 	 negate the result.
4144 	 When using masks, do it for SI/DImode element types, as it is shorter
4145 	 than the two subtractions.  */
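      /* As a concrete sketch: for a V4SImode x <= y comparison (with SSE4.1
	 providing smin), this emits tem = smin (x, y) followed by a compare
	 of x == tem, instead of computing x > y and then negating that
	 mask.  */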
4146       if ((code != EQ
4147 	   && GET_MODE_SIZE (mode) != 64
4148 	   && vector_all_ones_operand (opfalse, data_mode)
4149 	   && optrue == CONST0_RTX (data_mode))
4150 	  || (code == GTU
4151 	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4152 	      /* But don't do it when not using integer masks if we would
4153 		 already end up with the right values in the registers.  */
4154 	      && (GET_MODE_SIZE (mode) == 64
4155 		  || !vector_all_ones_operand (optrue, data_mode)
4156 		  || opfalse != CONST0_RTX (data_mode))))
4157 	{
4158 	  rtx (*gen) (rtx, rtx, rtx) = NULL;
4159 
4160 	  switch (mode)
4161 	    {
4162 	    case E_V16SImode:
4163 	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4164 	      break;
4165 	    case E_V8DImode:
4166 	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4167 	      cop0 = force_reg (mode, cop0);
4168 	      cop1 = force_reg (mode, cop1);
4169 	      break;
4170 	    case E_V32QImode:
4171 	      if (TARGET_AVX2)
4172 		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4173 	      break;
4174 	    case E_V16HImode:
4175 	      if (TARGET_AVX2)
4176 		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4177 	      break;
4178 	    case E_V8SImode:
4179 	      if (TARGET_AVX2)
4180 		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4181 	      break;
4182 	    case E_V4DImode:
4183 	      if (TARGET_AVX512VL)
4184 		{
4185 		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4186 		  cop0 = force_reg (mode, cop0);
4187 		  cop1 = force_reg (mode, cop1);
4188 		}
4189 	      break;
4190 	    case E_V16QImode:
4191 	      if (code == GTU && TARGET_SSE2)
4192 		gen = gen_uminv16qi3;
4193 	      else if (code == GT && TARGET_SSE4_1)
4194 		gen = gen_sminv16qi3;
4195 	      break;
4196 	    case E_V8HImode:
4197 	      if (code == GTU && TARGET_SSE4_1)
4198 		gen = gen_uminv8hi3;
4199 	      else if (code == GT && TARGET_SSE2)
4200 		gen = gen_sminv8hi3;
4201 	      break;
4202 	    case E_V4SImode:
4203 	      if (TARGET_SSE4_1)
4204 		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4205 	      break;
4206 	    case E_V2DImode:
4207 	      if (TARGET_AVX512VL)
4208 		{
4209 		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4210 		  cop0 = force_reg (mode, cop0);
4211 		  cop1 = force_reg (mode, cop1);
4212 		}
4213 	      break;
4214 	    default:
4215 	      break;
4216 	    }
4217 
4218 	  if (gen)
4219 	    {
4220 	      rtx tem = gen_reg_rtx (mode);
4221 	      if (!vector_operand (cop0, mode))
4222 		cop0 = force_reg (mode, cop0);
4223 	      if (!vector_operand (cop1, mode))
4224 		cop1 = force_reg (mode, cop1);
4225 	      *negate = !*negate;
4226 	      emit_insn (gen (tem, cop0, cop1));
4227 	      cop1 = tem;
4228 	      code = EQ;
4229 	    }
4230 	}
4231 
4232       /* Unsigned parallel compare is not supported by the hardware.
4233 	 Play some tricks to turn this into a signed comparison
4234 	 against 0.  */
4235       if (code == GTU)
4236 	{
4237 	  cop0 = force_reg (mode, cop0);
4238 
4239 	  switch (mode)
4240 	    {
4241 	    case E_V16SImode:
4242 	    case E_V8DImode:
4243 	    case E_V8SImode:
4244 	    case E_V4DImode:
4245 	    case E_V4SImode:
4246 	    case E_V2DImode:
4247 		{
4248 		  rtx t1, t2, mask;
4249 
4250 		  /* Subtract (-(INT MAX) - 1) from both operands to make
4251 		     them signed.  */
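		  /* E.g. with 32-bit elements the mask is 0x80000000, so the
		     unsigned compare 0x90000000 >u 0x10000000 becomes the
		     signed compare 0x10000000 > 0x90000000, i.e.
		     268435456 > -1879048192, which gives the same result.  */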
4252 		  mask = ix86_build_signbit_mask (mode, true, false);
4253 		  t1 = gen_reg_rtx (mode);
4254 		  emit_insn (gen_sub3_insn (t1, cop0, mask));
4255 
4256 		  t2 = gen_reg_rtx (mode);
4257 		  emit_insn (gen_sub3_insn (t2, cop1, mask));
4258 
4259 		  cop0 = t1;
4260 		  cop1 = t2;
4261 		  code = GT;
4262 		}
4263 	      break;
4264 
4265 	    case E_V64QImode:
4266 	    case E_V32HImode:
4267 	    case E_V32QImode:
4268 	    case E_V16HImode:
4269 	    case E_V16QImode:
4270 	    case E_V8HImode:
4271 	      /* Perform a parallel unsigned saturating subtraction.  */
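	      /* x >u y exactly when the saturating difference x -us y is
		 nonzero, so we compare that difference with zero and let the
		 recorded negation flip the result; e.g. 5 -us 9 saturates
		 to 0, and the negated EQ-with-zero correctly says that
		 5 >u 9 is false.  */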
4272 	      x = gen_reg_rtx (mode);
4273 	      emit_insn (gen_rtx_SET
4274 			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4275 	      cop0 = x;
4276 	      cop1 = CONST0_RTX (mode);
4277 	      code = EQ;
4278 	      *negate = !*negate;
4279 	      break;
4280 
4281 	    default:
4282 	      gcc_unreachable ();
4283 	    }
4284 	}
4285     }
4286 
4287   if (*negate)
4288     std::swap (op_true, op_false);
4289 
4290   /* Allow the comparison to be done in one mode, but the movcc to
4291      happen in another mode.  */
4292   if (data_mode == mode)
4293     {
4294       x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4295 			       op_true, op_false);
4296     }
4297   else
4298     {
4299       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4300       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4301 			       op_true, op_false);
4302       if (GET_MODE (x) == mode)
4303 	x = gen_lowpart (data_mode, x);
4304     }
4305 
4306   return x;
4307 }
4308 
4309 /* Expand integer vector comparison.  */
4310 
4311 bool
4312 ix86_expand_int_vec_cmp (rtx operands[])
4313 {
4314   rtx_code code = GET_CODE (operands[1]);
4315   bool negate = false;
4316   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4317 				     operands[3], NULL, NULL, &negate);
4318 
4319   if (!cmp)
4320     return false;
4321 
4322   if (negate)
4323     cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4324 				   CONST0_RTX (GET_MODE (cmp)),
4325 				   NULL, NULL, &negate);
4326 
4327   gcc_assert (!negate);
4328 
4329   if (operands[0] != cmp)
4330     emit_move_insn (operands[0], cmp);
4331 
4332   return true;
4333 }
4334 
4335 /* Expand a floating-point vector conditional move; a vcond operation
4336    rather than a movcc operation.  */
4337 
4338 bool
4339 ix86_expand_fp_vcond (rtx operands[])
4340 {
4341   enum rtx_code code = GET_CODE (operands[3]);
4342   rtx cmp;
4343 
4344   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4345 					   &operands[4], &operands[5]);
4346   if (code == UNKNOWN)
4347     {
4348       rtx temp;
4349       switch (GET_CODE (operands[3]))
4350 	{
4351 	case LTGT:
4352 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4353 				      operands[5], operands[0], operands[0]);
4354 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4355 				     operands[5], operands[1], operands[2]);
4356 	  code = AND;
4357 	  break;
4358 	case UNEQ:
4359 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4360 				      operands[5], operands[0], operands[0]);
4361 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4362 				     operands[5], operands[1], operands[2]);
4363 	  code = IOR;
4364 	  break;
4365 	default:
4366 	  gcc_unreachable ();
4367 	}
4368       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4369 				 OPTAB_DIRECT);
4370       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4371       return true;
4372     }
4373 
4374   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4375 				 operands[5], operands[1], operands[2]))
4376     return true;
4377 
4378   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4379 			     operands[1], operands[2]);
4380   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4381   return true;
4382 }
4383 
4384 /* Expand a signed/unsigned integral vector conditional move.  */
4385 
4386 bool
4387 ix86_expand_int_vcond (rtx operands[])
4388 {
4389   machine_mode data_mode = GET_MODE (operands[0]);
4390   machine_mode mode = GET_MODE (operands[4]);
4391   enum rtx_code code = GET_CODE (operands[3]);
4392   bool negate = false;
4393   rtx x, cop0, cop1;
4394 
4395   cop0 = operands[4];
4396   cop1 = operands[5];
4397 
4398   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4399      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
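  /* E.g. for V4SImode the arithmetic right shift broadcasts each element's
     sign bit, giving 0 or -1, while the logical right shift leaves just the
     sign bit, giving 0 or 1, so no separate compare is needed.  */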
4400   if ((code == LT || code == GE)
4401       && data_mode == mode
4402       && cop1 == CONST0_RTX (mode)
4403       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4404       && GET_MODE_UNIT_SIZE (data_mode) > 1
4405       && GET_MODE_UNIT_SIZE (data_mode) <= 8
4406       && (GET_MODE_SIZE (data_mode) == 16
4407 	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4408     {
4409       rtx negop = operands[2 - (code == LT)];
4410       int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4411       if (negop == CONST1_RTX (data_mode))
4412 	{
4413 	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4414 					 operands[0], 1, OPTAB_DIRECT);
4415 	  if (res != operands[0])
4416 	    emit_move_insn (operands[0], res);
4417 	  return true;
4418 	}
4419       else if (GET_MODE_INNER (data_mode) != DImode
4420 	       && vector_all_ones_operand (negop, data_mode))
4421 	{
4422 	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4423 					 operands[0], 0, OPTAB_DIRECT);
4424 	  if (res != operands[0])
4425 	    emit_move_insn (operands[0], res);
4426 	  return true;
4427 	}
4428     }
4429 
4430   if (!nonimmediate_operand (cop1, mode))
4431     cop1 = force_reg (mode, cop1);
4432   if (!general_operand (operands[1], data_mode))
4433     operands[1] = force_reg (data_mode, operands[1]);
4434   if (!general_operand (operands[2], data_mode))
4435     operands[2] = force_reg (data_mode, operands[2]);
4436 
4437   x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4438 			       operands[1], operands[2], &negate);
4439 
4440   if (!x)
4441     return false;
4442 
4443   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4444 			 operands[2-negate]);
4445   return true;
4446 }
4447 
4448 static bool
4449 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4450 			      struct expand_vec_perm_d *d)
4451 {
4452   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4453      expander, so args are either in d, or in op0, op1 etc.  */
4454   machine_mode mode = GET_MODE (d ? d->op0 : op0);
4455   machine_mode maskmode = mode;
4456   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4457 
4458   switch (mode)
4459     {
4460     case E_V8HImode:
4461       if (TARGET_AVX512VL && TARGET_AVX512BW)
4462 	gen = gen_avx512vl_vpermt2varv8hi3;
4463       break;
4464     case E_V16HImode:
4465       if (TARGET_AVX512VL && TARGET_AVX512BW)
4466 	gen = gen_avx512vl_vpermt2varv16hi3;
4467       break;
4468     case E_V64QImode:
4469       if (TARGET_AVX512VBMI)
4470 	gen = gen_avx512bw_vpermt2varv64qi3;
4471       break;
4472     case E_V32HImode:
4473       if (TARGET_AVX512BW)
4474 	gen = gen_avx512bw_vpermt2varv32hi3;
4475       break;
4476     case E_V4SImode:
4477       if (TARGET_AVX512VL)
4478 	gen = gen_avx512vl_vpermt2varv4si3;
4479       break;
4480     case E_V8SImode:
4481       if (TARGET_AVX512VL)
4482 	gen = gen_avx512vl_vpermt2varv8si3;
4483       break;
4484     case E_V16SImode:
4485       if (TARGET_AVX512F)
4486 	gen = gen_avx512f_vpermt2varv16si3;
4487       break;
4488     case E_V4SFmode:
4489       if (TARGET_AVX512VL)
4490 	{
4491 	  gen = gen_avx512vl_vpermt2varv4sf3;
4492 	  maskmode = V4SImode;
4493 	}
4494       break;
4495     case E_V8SFmode:
4496       if (TARGET_AVX512VL)
4497 	{
4498 	  gen = gen_avx512vl_vpermt2varv8sf3;
4499 	  maskmode = V8SImode;
4500 	}
4501       break;
4502     case E_V16SFmode:
4503       if (TARGET_AVX512F)
4504 	{
4505 	  gen = gen_avx512f_vpermt2varv16sf3;
4506 	  maskmode = V16SImode;
4507 	}
4508       break;
4509     case E_V2DImode:
4510       if (TARGET_AVX512VL)
4511 	gen = gen_avx512vl_vpermt2varv2di3;
4512       break;
4513     case E_V4DImode:
4514       if (TARGET_AVX512VL)
4515 	gen = gen_avx512vl_vpermt2varv4di3;
4516       break;
4517     case E_V8DImode:
4518       if (TARGET_AVX512F)
4519 	gen = gen_avx512f_vpermt2varv8di3;
4520       break;
4521     case E_V2DFmode:
4522       if (TARGET_AVX512VL)
4523 	{
4524 	  gen = gen_avx512vl_vpermt2varv2df3;
4525 	  maskmode = V2DImode;
4526 	}
4527       break;
4528     case E_V4DFmode:
4529       if (TARGET_AVX512VL)
4530 	{
4531 	  gen = gen_avx512vl_vpermt2varv4df3;
4532 	  maskmode = V4DImode;
4533 	}
4534       break;
4535     case E_V8DFmode:
4536       if (TARGET_AVX512F)
4537 	{
4538 	  gen = gen_avx512f_vpermt2varv8df3;
4539 	  maskmode = V8DImode;
4540 	}
4541       break;
4542     default:
4543       break;
4544     }
4545 
4546   if (gen == NULL)
4547     return false;
4548 
4549   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4550      expander, so args are either in d, or in op0, op1 etc.  */
4551   if (d)
4552     {
4553       rtx vec[64];
4554       target = d->target;
4555       op0 = d->op0;
4556       op1 = d->op1;
4557       for (int i = 0; i < d->nelt; ++i)
4558 	vec[i] = GEN_INT (d->perm[i]);
4559       mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4560     }
4561 
4562   emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4563   return true;
4564 }
4565 
4566 /* Expand a variable vector permutation.  */
4567 
4568 void
4569 ix86_expand_vec_perm (rtx operands[])
4570 {
4571   rtx target = operands[0];
4572   rtx op0 = operands[1];
4573   rtx op1 = operands[2];
4574   rtx mask = operands[3];
4575   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4576   machine_mode mode = GET_MODE (op0);
4577   machine_mode maskmode = GET_MODE (mask);
4578   int w, e, i;
4579   bool one_operand_shuffle = rtx_equal_p (op0, op1);
4580 
4581   /* Number of elements in the vector.  */
4582   w = GET_MODE_NUNITS (mode);
4583   e = GET_MODE_UNIT_SIZE (mode);
4584   gcc_assert (w <= 64);
4585 
4586   if (TARGET_AVX512F && one_operand_shuffle)
4587     {
4588       rtx (*gen) (rtx, rtx, rtx) = NULL;
4589       switch (mode)
4590 	{
4591 	case E_V16SImode:
4592 	  gen = gen_avx512f_permvarv16si;
4593 	  break;
4594 	case E_V16SFmode:
4595 	  gen = gen_avx512f_permvarv16sf;
4596 	  break;
4597 	case E_V8DImode:
4598 	  gen = gen_avx512f_permvarv8di;
4599 	  break;
4600 	case E_V8DFmode:
4601 	  gen = gen_avx512f_permvarv8df;
4602 	  break;
4603 	default:
4604 	  break;
4605 	}
4606       if (gen != NULL)
4607 	{
4608 	  emit_insn (gen (target, op0, mask));
4609 	  return;
4610 	}
4611     }
4612 
4613   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4614     return;
4615 
4616   if (TARGET_AVX2)
4617     {
4618       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4619 	{
4620 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4621 	     a constant shuffle operand.  With a tiny bit of effort we can
4622 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
4623 	     unfortunate but there's no avoiding it.
4624 	     Similarly for V16HImode we don't have instructions for variable
4625 	     shuffling, while for V32QImode we can, after preparing suitable
4626 	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
4627 
4628 	  if (mode == V16HImode)
4629 	    {
4630 	      maskmode = mode = V32QImode;
4631 	      w = 32;
4632 	      e = 1;
4633 	    }
4634 	  else
4635 	    {
4636 	      maskmode = mode = V8SImode;
4637 	      w = 8;
4638 	      e = 4;
4639 	    }
4640 	  t1 = gen_reg_rtx (maskmode);
4641 
4642 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
4643 	       mask = { A B C D }
4644 	       t1 = { A A B B C C D D }.  */
4645 	  for (i = 0; i < w / 2; ++i)
4646 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4647 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4648 	  vt = force_reg (maskmode, vt);
4649 	  mask = gen_lowpart (maskmode, mask);
4650 	  if (maskmode == V8SImode)
4651 	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4652 	  else
4653 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4654 
4655 	  /* Multiply the shuffle indices by two.  */
4656 	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4657 				    OPTAB_DIRECT);
4658 
4659 	  /* Add one to the odd shuffle indices:
4660 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
4661 	  for (i = 0; i < w / 2; ++i)
4662 	    {
4663 	      vec[i * 2] = const0_rtx;
4664 	      vec[i * 2 + 1] = const1_rtx;
4665 	    }
4666 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4667 	  vt = validize_mem (force_const_mem (maskmode, vt));
4668 	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4669 				    OPTAB_DIRECT);
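	  /* E.g. a V4DImode mask of { 1 3 0 2 } has now become the V8SImode
	     control { 2 3 6 7 0 1 4 5 }, which selects the two 32-bit halves
	     of each requested 64-bit element.  */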
4670 
4671 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
4672 	  operands[3] = mask = t1;
4673 	  target = gen_reg_rtx (mode);
4674 	  op0 = gen_lowpart (mode, op0);
4675 	  op1 = gen_lowpart (mode, op1);
4676 	}
4677 
4678       switch (mode)
4679 	{
4680 	case E_V8SImode:
4681 	  /* The VPERMD and VPERMPS instructions already properly ignore
4682 	     the high bits of the shuffle elements.  No need for us to
4683 	     perform an AND ourselves.  */
4684 	  if (one_operand_shuffle)
4685 	    {
4686 	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4687 	      if (target != operands[0])
4688 		emit_move_insn (operands[0],
4689 				gen_lowpart (GET_MODE (operands[0]), target));
4690 	    }
4691 	  else
4692 	    {
4693 	      t1 = gen_reg_rtx (V8SImode);
4694 	      t2 = gen_reg_rtx (V8SImode);
4695 	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4696 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4697 	      goto merge_two;
4698 	    }
4699 	  return;
4700 
4701 	case E_V8SFmode:
4702 	  mask = gen_lowpart (V8SImode, mask);
4703 	  if (one_operand_shuffle)
4704 	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4705 	  else
4706 	    {
4707 	      t1 = gen_reg_rtx (V8SFmode);
4708 	      t2 = gen_reg_rtx (V8SFmode);
4709 	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4710 	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4711 	      goto merge_two;
4712 	    }
4713 	  return;
4714 
4715         case E_V4SImode:
4716 	  /* By combining the two 128-bit input vectors into one 256-bit
4717 	     input vector, we can use VPERMD and VPERMPS for the full
4718 	     two-operand shuffle.  */
4719 	  t1 = gen_reg_rtx (V8SImode);
4720 	  t2 = gen_reg_rtx (V8SImode);
4721 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4722 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4723 	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4724 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4725 	  return;
4726 
4727         case E_V4SFmode:
4728 	  t1 = gen_reg_rtx (V8SFmode);
4729 	  t2 = gen_reg_rtx (V8SImode);
4730 	  mask = gen_lowpart (V4SImode, mask);
4731 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4732 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4733 	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4734 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4735 	  return;
4736 
4737 	case E_V32QImode:
4738 	  t1 = gen_reg_rtx (V32QImode);
4739 	  t2 = gen_reg_rtx (V32QImode);
4740 	  t3 = gen_reg_rtx (V32QImode);
4741 	  vt2 = GEN_INT (-128);
4742 	  vt = gen_const_vec_duplicate (V32QImode, vt2);
4743 	  vt = force_reg (V32QImode, vt);
4744 	  for (i = 0; i < 32; i++)
4745 	    vec[i] = i < 16 ? vt2 : const0_rtx;
4746 	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4747 	  vt2 = force_reg (V32QImode, vt2);
4748 	  /* From mask create two adjusted masks, which contain the same
4749 	     bits as mask in the low 7 bits of each vector element.
4750 	     The first mask will have the most significant bit clear
4751 	     if it requests element from the same 128-bit lane
4752 	     and MSB set if it requests element from the other 128-bit lane.
4753 	     The second mask will have the opposite values of the MSB,
4754 	     and additionally will have its 128-bit lanes swapped.
4755 	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4756 	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
4757 	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4758 	     stands for the other 12 bytes.  */
4759 	  /* The bit whether element is from the same lane or the other
4760 	     lane is bit 4, so shift it up by 3 to the MSB position.  */
4761 	  t5 = gen_reg_rtx (V4DImode);
4762 	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4763 				    GEN_INT (3)));
4764 	  /* Clear MSB bits from the mask just in case it had them set.  */
4765 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4766 	  /* After this t1 will have MSB set for elements from other lane.  */
4767 	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4768 	  /* Clear bits other than MSB.  */
4769 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
4770 	  /* Or in the lower bits from mask into t3.  */
4771 	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
4772 	  /* And invert MSB bits in t1, so MSB is set for elements from the same
4773 	     lane.  */
4774 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
4775 	  /* Swap 128-bit lanes in t3.  */
4776 	  t6 = gen_reg_rtx (V4DImode);
4777 	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4778 					  const2_rtx, GEN_INT (3),
4779 					  const0_rtx, const1_rtx));
4780 	  /* And or in the lower bits from mask into t1.  */
4781 	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
4782 	  if (one_operand_shuffle)
4783 	    {
4784 	      /* Each of these shuffles will put 0s in places where
4785 		 element from the other 128-bit lane is needed, otherwise
4786 		 will shuffle in the requested value.  */
4787 	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4788 						gen_lowpart (V32QImode, t6)));
4789 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4790 	      /* For t3 the 128-bit lanes are swapped again.  */
4791 	      t7 = gen_reg_rtx (V4DImode);
4792 	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4793 					      const2_rtx, GEN_INT (3),
4794 					      const0_rtx, const1_rtx));
4795 	      /* And oring both together leads to the result.  */
4796 	      emit_insn (gen_iorv32qi3 (target, t1,
4797 					gen_lowpart (V32QImode, t7)));
4798 	      if (target != operands[0])
4799 		emit_move_insn (operands[0],
4800 				gen_lowpart (GET_MODE (operands[0]), target));
4801 	      return;
4802 	    }
4803 
4804 	  t4 = gen_reg_rtx (V32QImode);
4805 	  /* Similar to the one_operand_shuffle code above, just repeated
4806 	     twice, once for each operand.  The code at merge_two: then
4807 	     merges the two results together.  */
4808 	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4809 					    gen_lowpart (V32QImode, t6)));
4810 	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4811 					    gen_lowpart (V32QImode, t6)));
4812 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4813 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4814 	  t7 = gen_reg_rtx (V4DImode);
4815 	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4816 					  const2_rtx, GEN_INT (3),
4817 					  const0_rtx, const1_rtx));
4818 	  t8 = gen_reg_rtx (V4DImode);
4819 	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4820 					  const2_rtx, GEN_INT (3),
4821 					  const0_rtx, const1_rtx));
4822 	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4823 	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4824 	  t1 = t4;
4825 	  t2 = t3;
4826 	  goto merge_two;
4827 
4828 	default:
4829 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
4830 	  break;
4831 	}
4832     }
4833 
4834   if (TARGET_XOP)
4835     {
4836       /* The XOP VPPERM insn supports three inputs.  By ignoring the
4837 	 one_operand_shuffle special case, we avoid creating another
4838 	 set of constant vectors in memory.  */
4839       one_operand_shuffle = false;
4840 
4841       /* mask = mask & {2*w-1, ...} */
4842       vt = GEN_INT (2*w - 1);
4843     }
4844   else
4845     {
4846       /* mask = mask & {w-1, ...} */
4847       vt = GEN_INT (w - 1);
4848     }
4849 
4850   vt = gen_const_vec_duplicate (maskmode, vt);
4851   mask = expand_simple_binop (maskmode, AND, mask, vt,
4852 			      NULL_RTX, 0, OPTAB_DIRECT);
4853 
4854   /* For non-QImode operations, convert the word permutation control
4855      into a byte permutation control.  */
4856   if (mode != V16QImode)
4857     {
4858       mask = expand_simple_binop (maskmode, ASHIFT, mask,
4859 				  GEN_INT (exact_log2 (e)),
4860 				  NULL_RTX, 0, OPTAB_DIRECT);
4861 
4862       /* Convert mask to vector of chars.  */
4863       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4864 
4865       /* Replicate each of the input bytes into byte positions:
4866 	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4867 	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4868 	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
4869       for (i = 0; i < 16; ++i)
4870 	vec[i] = GEN_INT (i/e * e);
4871       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4872       vt = validize_mem (force_const_mem (V16QImode, vt));
4873       if (TARGET_XOP)
4874 	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4875       else
4876 	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4877 
4878       /* Convert it into the byte positions by doing
4879 	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
4880       for (i = 0; i < 16; ++i)
4881 	vec[i] = GEN_INT (i % e);
4882       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4883       vt = validize_mem (force_const_mem (V16QImode, vt));
4884       emit_insn (gen_addv16qi3 (mask, mask, vt));
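      /* E.g. for V4SImode (e == 4) a word index of 2 became 8 after the
	 shift, the pshufb replicated it into { 8 8 8 8 } within its element,
	 and the addition just above turned that into the byte selectors
	 { 8 9 10 11 }, i.e. the four bytes of source word 2.  */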
4885     }
4886 
4887   /* The actual shuffle operations all operate on V16QImode.  */
4888   op0 = gen_lowpart (V16QImode, op0);
4889   op1 = gen_lowpart (V16QImode, op1);
4890 
4891   if (TARGET_XOP)
4892     {
4893       if (GET_MODE (target) != V16QImode)
4894 	target = gen_reg_rtx (V16QImode);
4895       emit_insn (gen_xop_pperm (target, op0, op1, mask));
4896       if (target != operands[0])
4897 	emit_move_insn (operands[0],
4898 			gen_lowpart (GET_MODE (operands[0]), target));
4899     }
4900   else if (one_operand_shuffle)
4901     {
4902       if (GET_MODE (target) != V16QImode)
4903 	target = gen_reg_rtx (V16QImode);
4904       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4905       if (target != operands[0])
4906 	emit_move_insn (operands[0],
4907 			gen_lowpart (GET_MODE (operands[0]), target));
4908     }
4909   else
4910     {
4911       rtx xops[6];
4912       bool ok;
4913 
4914       /* Shuffle the two input vectors independently.  */
4915       t1 = gen_reg_rtx (V16QImode);
4916       t2 = gen_reg_rtx (V16QImode);
4917       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4918       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4919 
4920  merge_two:
4921       /* Then merge them together.  The key is whether any given control
4922          element contained a bit set that indicates the second word.  */
4923       mask = operands[3];
4924       vt = GEN_INT (w);
4925       if (maskmode == V2DImode && !TARGET_SSE4_1)
4926 	{
4927 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
4928 	     more shuffle to convert the V2DI input mask into a V4SI
4929 	     input mask.  At that point the masking that ix86_expand_int_vcond
4930 	     performs will work as desired.  */
4931 	  rtx t3 = gen_reg_rtx (V4SImode);
4932 	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4933 				        const0_rtx, const0_rtx,
4934 				        const2_rtx, const2_rtx));
4935 	  mask = t3;
4936 	  maskmode = V4SImode;
4937 	  e = w = 4;
4938 	}
4939 
4940       vt = gen_const_vec_duplicate (maskmode, vt);
4941       vt = force_reg (maskmode, vt);
4942       mask = expand_simple_binop (maskmode, AND, mask, vt,
4943 				  NULL_RTX, 0, OPTAB_DIRECT);
4944 
4945       if (GET_MODE (target) != mode)
4946 	target = gen_reg_rtx (mode);
4947       xops[0] = target;
4948       xops[1] = gen_lowpart (mode, t2);
4949       xops[2] = gen_lowpart (mode, t1);
4950       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4951       xops[4] = mask;
4952       xops[5] = vt;
4953       ok = ix86_expand_int_vcond (xops);
4954       gcc_assert (ok);
4955       if (target != operands[0])
4956 	emit_move_insn (operands[0],
4957 			gen_lowpart (GET_MODE (operands[0]), target));
4958     }
4959 }
4960 
4961 /* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
4962    true if we should do zero extension, else sign extension.  HIGH_P is
4963    true if we want the N/2 high elements, else the low elements.  */
4964 
4965 void
4966 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4967 {
4968   machine_mode imode = GET_MODE (src);
4969   rtx tmp;
4970 
4971   if (TARGET_SSE4_1)
4972     {
4973       rtx (*unpack)(rtx, rtx);
4974       rtx (*extract)(rtx, rtx) = NULL;
4975       machine_mode halfmode = BLKmode;
4976 
4977       switch (imode)
4978 	{
4979 	case E_V64QImode:
4980 	  if (unsigned_p)
4981 	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4982 	  else
4983 	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4984 	  halfmode = V32QImode;
4985 	  extract
4986 	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4987 	  break;
4988 	case E_V32QImode:
4989 	  if (unsigned_p)
4990 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
4991 	  else
4992 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
4993 	  halfmode = V16QImode;
4994 	  extract
4995 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4996 	  break;
4997 	case E_V32HImode:
4998 	  if (unsigned_p)
4999 	    unpack = gen_avx512f_zero_extendv16hiv16si2;
5000 	  else
5001 	    unpack = gen_avx512f_sign_extendv16hiv16si2;
5002 	  halfmode = V16HImode;
5003 	  extract
5004 	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5005 	  break;
5006 	case E_V16HImode:
5007 	  if (unsigned_p)
5008 	    unpack = gen_avx2_zero_extendv8hiv8si2;
5009 	  else
5010 	    unpack = gen_avx2_sign_extendv8hiv8si2;
5011 	  halfmode = V8HImode;
5012 	  extract
5013 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5014 	  break;
5015 	case E_V16SImode:
5016 	  if (unsigned_p)
5017 	    unpack = gen_avx512f_zero_extendv8siv8di2;
5018 	  else
5019 	    unpack = gen_avx512f_sign_extendv8siv8di2;
5020 	  halfmode = V8SImode;
5021 	  extract
5022 	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5023 	  break;
5024 	case E_V8SImode:
5025 	  if (unsigned_p)
5026 	    unpack = gen_avx2_zero_extendv4siv4di2;
5027 	  else
5028 	    unpack = gen_avx2_sign_extendv4siv4di2;
5029 	  halfmode = V4SImode;
5030 	  extract
5031 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5032 	  break;
5033 	case E_V16QImode:
5034 	  if (unsigned_p)
5035 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5036 	  else
5037 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5038 	  break;
5039 	case E_V8HImode:
5040 	  if (unsigned_p)
5041 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
5042 	  else
5043 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
5044 	  break;
5045 	case E_V4SImode:
5046 	  if (unsigned_p)
5047 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
5048 	  else
5049 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
5050 	  break;
5051 	default:
5052 	  gcc_unreachable ();
5053 	}
5054 
5055       if (GET_MODE_SIZE (imode) >= 32)
5056 	{
5057 	  tmp = gen_reg_rtx (halfmode);
5058 	  emit_insn (extract (tmp, src));
5059 	}
5060       else if (high_p)
5061 	{
5062 	  /* Shift higher 8 bytes to lower 8 bytes.  */
5063 	  tmp = gen_reg_rtx (V1TImode);
5064 	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5065 					 GEN_INT (64)));
5066 	  tmp = gen_lowpart (imode, tmp);
5067 	}
5068       else
5069 	tmp = src;
5070 
5071       emit_insn (unpack (dest, tmp));
5072     }
5073   else
5074     {
5075       rtx (*unpack)(rtx, rtx, rtx);
5076 
5077       switch (imode)
5078 	{
5079 	case E_V16QImode:
5080 	  if (high_p)
5081 	    unpack = gen_vec_interleave_highv16qi;
5082 	  else
5083 	    unpack = gen_vec_interleave_lowv16qi;
5084 	  break;
5085 	case E_V8HImode:
5086 	  if (high_p)
5087 	    unpack = gen_vec_interleave_highv8hi;
5088 	  else
5089 	    unpack = gen_vec_interleave_lowv8hi;
5090 	  break;
5091 	case E_V4SImode:
5092 	  if (high_p)
5093 	    unpack = gen_vec_interleave_highv4si;
5094 	  else
5095 	    unpack = gen_vec_interleave_lowv4si;
5096 	  break;
5097 	default:
5098 	  gcc_unreachable ();
5099 	}
5100 
5101       if (unsigned_p)
5102 	tmp = force_reg (imode, CONST0_RTX (imode));
5103       else
5104 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5105 				   src, pc_rtx, pc_rtx);
5106 
5107       rtx tmp2 = gen_reg_rtx (imode);
5108       emit_insn (unpack (tmp2, src, tmp));
5109       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5110     }
5111 }
5112 
5113 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
5114    but works for floating point parameters and non-offsettable memories.
5115    For pushes, it returns just stack offsets; the values will be saved
5116    in the right order.  At most four parts are generated.  */
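/* For instance, on a 32-bit target an XFmode operand is returned as three
   SImode parts, while on a 64-bit target a TFmode operand is returned as two
   DImode parts.  */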
5117 
5118 static int
5119 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5120 {
5121   int size;
5122 
5123   if (!TARGET_64BIT)
5124     size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5125   else
5126     size = (GET_MODE_SIZE (mode) + 4) / 8;
5127 
5128   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5129   gcc_assert (size >= 2 && size <= 4);
5130 
5131   /* Optimize constant pool reference to immediates.  This is used by fp
5132      moves, that force all constants to memory to allow combining.  */
5133   if (MEM_P (operand) && MEM_READONLY_P (operand))
5134     operand = avoid_constant_pool_reference (operand);
5135 
5136   if (MEM_P (operand) && !offsettable_memref_p (operand))
5137     {
5138       /* The only non-offsettable memories we handle are pushes.  */
5139       int ok = push_operand (operand, VOIDmode);
5140 
5141       gcc_assert (ok);
5142 
5143       operand = copy_rtx (operand);
5144       PUT_MODE (operand, word_mode);
5145       parts[0] = parts[1] = parts[2] = parts[3] = operand;
5146       return size;
5147     }
5148 
5149   if (GET_CODE (operand) == CONST_VECTOR)
5150     {
5151       scalar_int_mode imode = int_mode_for_mode (mode).require ();
5152       /* Caution: if we looked through a constant pool memory above,
5153 	 the operand may actually have a different mode now.  That's
5154 	 ok, since we want to pun this all the way back to an integer.  */
5155       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5156       gcc_assert (operand != NULL);
5157       mode = imode;
5158     }
5159 
5160   if (!TARGET_64BIT)
5161     {
5162       if (mode == DImode)
5163 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5164       else
5165 	{
5166 	  int i;
5167 
5168 	  if (REG_P (operand))
5169 	    {
5170 	      gcc_assert (reload_completed);
5171 	      for (i = 0; i < size; i++)
5172 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5173 	    }
5174 	  else if (offsettable_memref_p (operand))
5175 	    {
5176 	      operand = adjust_address (operand, SImode, 0);
5177 	      parts[0] = operand;
5178 	      for (i = 1; i < size; i++)
5179 		parts[i] = adjust_address (operand, SImode, 4 * i);
5180 	    }
5181 	  else if (CONST_DOUBLE_P (operand))
5182 	    {
5183 	      const REAL_VALUE_TYPE *r;
5184 	      long l[4];
5185 
5186 	      r = CONST_DOUBLE_REAL_VALUE (operand);
5187 	      switch (mode)
5188 		{
5189 		case E_TFmode:
5190 		  real_to_target (l, r, mode);
5191 		  parts[3] = gen_int_mode (l[3], SImode);
5192 		  parts[2] = gen_int_mode (l[2], SImode);
5193 		  break;
5194 		case E_XFmode:
5195 		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5196 		     long double may not be 80-bit.  */
5197 		  real_to_target (l, r, mode);
5198 		  parts[2] = gen_int_mode (l[2], SImode);
5199 		  break;
5200 		case E_DFmode:
5201 		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5202 		  break;
5203 		default:
5204 		  gcc_unreachable ();
5205 		}
5206 	      parts[1] = gen_int_mode (l[1], SImode);
5207 	      parts[0] = gen_int_mode (l[0], SImode);
5208 	    }
5209 	  else
5210 	    gcc_unreachable ();
5211 	}
5212     }
5213   else
5214     {
5215       if (mode == TImode)
5216 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5217       if (mode == XFmode || mode == TFmode)
5218 	{
5219 	  machine_mode upper_mode = mode == XFmode ? SImode : DImode;
5220 	  if (REG_P (operand))
5221 	    {
5222 	      gcc_assert (reload_completed);
5223 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5224 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5225 	    }
5226 	  else if (offsettable_memref_p (operand))
5227 	    {
5228 	      operand = adjust_address (operand, DImode, 0);
5229 	      parts[0] = operand;
5230 	      parts[1] = adjust_address (operand, upper_mode, 8);
5231 	    }
5232 	  else if (CONST_DOUBLE_P (operand))
5233 	    {
5234 	      long l[4];
5235 
5236 	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5237 
5238 	      /* real_to_target puts 32-bit pieces in each long.  */
5239 	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5240 				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5241 					  << 32), DImode);
5242 
5243 	      if (upper_mode == SImode)
5244 	        parts[1] = gen_int_mode (l[2], SImode);
5245 	      else
5246 	        parts[1]
5247 		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5248 				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5249 				     << 32), DImode);
5250 	    }
5251 	  else
5252 	    gcc_unreachable ();
5253 	}
5254     }
5255 
5256   return size;
5257 }
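
/* Illustrative example (not from the original source): on a 32-bit target,
   splitting a DFmode constant yields two SImode immediates (the low and
   high 32 bits of the target representation), while splitting an XFmode
   register yields three consecutive SImode hard registers.  */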
5258 
5259 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5260    All required move insns are emitted by this function.  Operands 2-5
5261    are used to hold the destination parts and operands 6-9 the source
5262    parts, in the order in which the moves are emitted.  */
5263 
5264 void
5265 ix86_split_long_move (rtx operands[])
5266 {
5267   rtx part[2][4];
5268   int nparts, i, j;
5269   int push = 0;
5270   int collisions = 0;
5271   machine_mode mode = GET_MODE (operands[0]);
5272   bool collisionparts[4];
5273 
5274   /* The DFmode expanders may ask us to move a double.
5275      For a 64-bit target this is a single move.  By hiding that fact
5276      here we simplify the i386.md splitters.  */
5277   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5278     {
5279       /* Optimize constant pool reference to immediates.  This is used by
5280 	 fp moves, that force all constants to memory to allow combining.  */
5281 
5282       if (MEM_P (operands[1])
5283 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5284 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5285 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
5286       if (push_operand (operands[0], VOIDmode))
5287 	{
5288 	  operands[0] = copy_rtx (operands[0]);
5289 	  PUT_MODE (operands[0], word_mode);
5290 	}
5291       else
5292         operands[0] = gen_lowpart (DImode, operands[0]);
5293       operands[1] = gen_lowpart (DImode, operands[1]);
5294       emit_move_insn (operands[0], operands[1]);
5295       return;
5296     }
5297 
5298   /* The only non-offsettable memory we handle is push.  */
5299   if (push_operand (operands[0], VOIDmode))
5300     push = 1;
5301   else
5302     gcc_assert (!MEM_P (operands[0])
5303 		|| offsettable_memref_p (operands[0]));
5304 
5305   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5306   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5307 
5308   /* When emitting push, take care for source operands on the stack.  */
5309   if (push && MEM_P (operands[1])
5310       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5311     {
5312       rtx src_base = XEXP (part[1][nparts - 1], 0);
5313 
5314       /* Compensate for the stack decrement by 4.  */
5315       if (!TARGET_64BIT && nparts == 3
5316 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5317 	src_base = plus_constant (Pmode, src_base, 4);
5318 
5319       /* src_base refers to the stack pointer and is
5320 	 automatically decreased by emitted push.  */
5321       for (i = 0; i < nparts; i++)
5322 	part[1][i] = change_address (part[1][i],
5323 				     GET_MODE (part[1][i]), src_base);
5324     }
5325 
5326   /* We need to do the copy in the right order in case an address
5327      register of the source overlaps the destination.  */
5328   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5329     {
5330       rtx tmp;
5331 
5332       for (i = 0; i < nparts; i++)
5333 	{
5334 	  collisionparts[i]
5335 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5336 	  if (collisionparts[i])
5337 	    collisions++;
5338 	}
5339 
5340       /* Collision in the middle part can be handled by reordering.  */
5341       if (collisions == 1 && nparts == 3 && collisionparts [1])
5342 	{
5343 	  std::swap (part[0][1], part[0][2]);
5344 	  std::swap (part[1][1], part[1][2]);
5345 	}
5346       else if (collisions == 1
5347 	       && nparts == 4
5348 	       && (collisionparts [1] || collisionparts [2]))
5349 	{
5350 	  if (collisionparts [1])
5351 	    {
5352 	      std::swap (part[0][1], part[0][2]);
5353 	      std::swap (part[1][1], part[1][2]);
5354 	    }
5355 	  else
5356 	    {
5357 	      std::swap (part[0][2], part[0][3]);
5358 	      std::swap (part[1][2], part[1][3]);
5359 	    }
5360 	}
5361 
5362       /* If there are more collisions, we can't handle it by reordering.
5363 	 Do an lea to the last part and use only one colliding move.  */
5364       else if (collisions > 1)
5365 	{
5366 	  rtx base, addr;
5367 
5368 	  collisions = 1;
5369 
5370 	  base = part[0][nparts - 1];
5371 
5372 	  /* Handle the case when the last part isn't valid for lea.
5373 	     Happens in 64-bit mode storing the 12-byte XFmode.  */
5374 	  if (GET_MODE (base) != Pmode)
5375 	    base = gen_rtx_REG (Pmode, REGNO (base));
5376 
5377 	  addr = XEXP (part[1][0], 0);
5378 	  if (TARGET_TLS_DIRECT_SEG_REFS)
5379 	    {
5380 	      struct ix86_address parts;
5381 	      int ok = ix86_decompose_address (addr, &parts);
5382 	      gcc_assert (ok);
5383 	      /* It is not valid to use %gs: or %fs: in lea.  */
5384 	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5385 	    }
5386 	  emit_insn (gen_rtx_SET (base, addr));
5387 	  part[1][0] = replace_equiv_address (part[1][0], base);
5388 	  for (i = 1; i < nparts; i++)
5389 	    {
5390 	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5391 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
5392 	    }
5393 	}
5394     }
5395 
5396   if (push)
5397     {
5398       if (!TARGET_64BIT)
5399 	{
5400 	  if (nparts == 3)
5401 	    {
5402 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5403                 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5404 	      emit_move_insn (part[0][2], part[1][2]);
5405 	    }
5406 	  else if (nparts == 4)
5407 	    {
5408 	      emit_move_insn (part[0][3], part[1][3]);
5409 	      emit_move_insn (part[0][2], part[1][2]);
5410 	    }
5411 	}
5412       else
5413 	{
5414 	  /* In 64-bit mode we don't have a 32-bit push available.  If this is
5415 	     a register, that is OK - we just use the larger counterpart.  We
5416 	     also retype memory - this comes from an attempt to avoid the REX
5417 	     prefix when moving the second half of a TFmode value.  */
5418 	  if (GET_MODE (part[1][1]) == SImode)
5419 	    {
5420 	      switch (GET_CODE (part[1][1]))
5421 		{
5422 		case MEM:
5423 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
5424 		  break;
5425 
5426 		case REG:
5427 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5428 		  break;
5429 
5430 		default:
5431 		  gcc_unreachable ();
5432 		}
5433 
5434 	      if (GET_MODE (part[1][0]) == SImode)
5435 		part[1][0] = part[1][1];
5436 	    }
5437 	}
5438       emit_move_insn (part[0][1], part[1][1]);
5439       emit_move_insn (part[0][0], part[1][0]);
5440       return;
5441     }
5442 
5443   /* Choose correct order to not overwrite the source before it is copied.  */
5444   if ((REG_P (part[0][0])
5445        && REG_P (part[1][1])
5446        && (REGNO (part[0][0]) == REGNO (part[1][1])
5447 	   || (nparts == 3
5448 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
5449 	   || (nparts == 4
5450 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
5451       || (collisions > 0
5452 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5453     {
5454       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5455 	{
5456 	  operands[2 + i] = part[0][j];
5457 	  operands[6 + i] = part[1][j];
5458 	}
5459     }
5460   else
5461     {
5462       for (i = 0; i < nparts; i++)
5463 	{
5464 	  operands[2 + i] = part[0][i];
5465 	  operands[6 + i] = part[1][i];
5466 	}
5467     }
5468 
5469   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
5470   if (optimize_insn_for_size_p ())
5471     {
5472       for (j = 0; j < nparts - 1; j++)
5473 	if (CONST_INT_P (operands[6 + j])
5474 	    && operands[6 + j] != const0_rtx
5475 	    && REG_P (operands[2 + j]))
5476 	  for (i = j; i < nparts - 1; i++)
5477 	    if (CONST_INT_P (operands[7 + i])
5478 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5479 	      operands[7 + i] = operands[2 + j];
5480     }
5481 
5482   for (i = 0; i < nparts; i++)
5483     emit_move_insn (operands[2 + i], operands[6 + i]);
5484 
5485   return;
5486 }
5487 
5488 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5489    left shift by a constant, either using a single shift or
5490    a sequence of add instructions.  */
5491 
5492 static void
5493 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5494 {
5495   if (count == 1
5496       || (count * ix86_cost->add <= ix86_cost->shift_const
5497 	  && !optimize_insn_for_size_p ()))
5498     {
5499       while (count-- > 0)
5500 	emit_insn (gen_add2_insn (operand, operand));
5501     }
5502   else
5503     {
5504       rtx (*insn)(rtx, rtx, rtx);
5505 
5506       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5507       emit_insn (insn (operand, operand, GEN_INT (count)));
5508     }
5509 }
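
/* Illustrative example of the trade-off above (hypothetical cost numbers,
   not taken from any real cost table): if ix86_cost->add were 1 and
   ix86_cost->shift_const were 3, a left shift by 2 would be expanded as
   two add insns (2 * 1 <= 3, when not optimizing for size), while a shift
   by 4 would use a single shift insn; a shift by 1 is always expanded as
   a single add.  */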
5510 
5511 void
5512 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5513 {
5514   rtx (*gen_ashl3)(rtx, rtx, rtx);
5515   rtx (*gen_shld)(rtx, rtx, rtx);
5516   int half_width = GET_MODE_BITSIZE (mode) >> 1;
5517   machine_mode half_mode;
5518 
5519   rtx low[2], high[2];
5520   int count;
5521 
5522   if (CONST_INT_P (operands[2]))
5523     {
5524       split_double_mode (mode, operands, 2, low, high);
5525       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5526 
5527       if (count >= half_width)
5528 	{
5529 	  emit_move_insn (high[0], low[1]);
5530 	  emit_move_insn (low[0], const0_rtx);
5531 
5532 	  if (count > half_width)
5533 	    ix86_expand_ashl_const (high[0], count - half_width, mode);
5534 	}
5535       else
5536 	{
5537 	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5538 
5539 	  if (!rtx_equal_p (operands[0], operands[1]))
5540 	    emit_move_insn (operands[0], operands[1]);
5541 
5542 	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5543 	  ix86_expand_ashl_const (low[0], count, mode);
5544 	}
5545       return;
5546     }
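  /* Illustrative sketch (not part of the original source): on a 32-bit
     target, splitting a DImode shift by the constant 40 takes the
     count >= half_width path above and emits roughly
       high = low_src; low = 0; high <<= 8;
     while a constant count of 5 takes the other path and emits
       shld $5 from low into high; low <<= 5;  */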
5547 
5548   split_double_mode (mode, operands, 1, low, high);
5549   half_mode = mode == DImode ? SImode : DImode;
5550 
5551   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5552 
5553   if (operands[1] == const1_rtx)
5554     {
5555       /* Assuming we've chosen QImode-capable registers, 1 << N
5556 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
5557       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5558 	{
5559 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5560 
5561 	  ix86_expand_clear (low[0]);
5562 	  ix86_expand_clear (high[0]);
5563 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5564 
5565 	  d = gen_lowpart (QImode, low[0]);
5566 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5567 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
5568 	  emit_insn (gen_rtx_SET (d, s));
5569 
5570 	  d = gen_lowpart (QImode, high[0]);
5571 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5572 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
5573 	  emit_insn (gen_rtx_SET (d, s));
5574 	}
5575 
5576       /* Otherwise, we can get the same results by manually performing
5577 	 a bit extract operation on bit 5/6, and then performing the two
5578 	 shifts.  The two methods of getting 0/1 into low/high are exactly
5579 	 the same size.  Avoiding the shift in the bit extract case helps
5580 	 pentium4 a bit; no one else seems to care much either way.  */
5581       else
5582 	{
5583 	  rtx (*gen_lshr3)(rtx, rtx, rtx);
5584 	  rtx (*gen_and3)(rtx, rtx, rtx);
5585 	  rtx (*gen_xor3)(rtx, rtx, rtx);
5586 	  HOST_WIDE_INT bits;
5587 	  rtx x;
5588 
5589 	  if (mode == DImode)
5590 	    {
5591 	      gen_lshr3 = gen_lshrsi3;
5592 	      gen_and3 = gen_andsi3;
5593 	      gen_xor3 = gen_xorsi3;
5594 	      bits = 5;
5595 	    }
5596 	  else
5597 	    {
5598 	      gen_lshr3 = gen_lshrdi3;
5599 	      gen_and3 = gen_anddi3;
5600 	      gen_xor3 = gen_xordi3;
5601 	      bits = 6;
5602 	    }
5603 
5604 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5605 	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5606 	  else
5607 	    x = gen_lowpart (half_mode, operands[2]);
5608 	  emit_insn (gen_rtx_SET (high[0], x));
5609 
5610 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5611 	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5612 	  emit_move_insn (low[0], high[0]);
5613 	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5614 	}
5615 
5616       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5617       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5618       return;
5619     }
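  /* Illustrative sketch (not from the original source): when splitting a
     DImode 1 << N on a 32-bit target with QImode-capable registers, the
     branch above emits roughly
       low = 0; high = 0;
       test $32, N; sete low_byte; setne high_byte;
       low <<= N; high <<= N;
     relying on the hardware masking the 32-bit shift count to 0..31.  */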
5620 
5621   if (operands[1] == constm1_rtx)
5622     {
5623       /* For -1 << N, we can avoid the shld instruction, because we
5624 	 know that we're shifting 0...31/63 ones into a -1.  */
5625       emit_move_insn (low[0], constm1_rtx);
5626       if (optimize_insn_for_size_p ())
5627 	emit_move_insn (high[0], low[0]);
5628       else
5629 	emit_move_insn (high[0], constm1_rtx);
5630     }
5631   else
5632     {
5633       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5634 
5635       if (!rtx_equal_p (operands[0], operands[1]))
5636 	emit_move_insn (operands[0], operands[1]);
5637 
5638       split_double_mode (mode, operands, 1, low, high);
5639       emit_insn (gen_shld (high[0], low[0], operands[2]));
5640     }
5641 
5642   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5643 
5644   if (TARGET_CMOVE && scratch)
5645     {
5646       ix86_expand_clear (scratch);
5647       emit_insn (gen_x86_shift_adj_1
5648 		 (half_mode, high[0], low[0], operands[2], scratch));
5649     }
5650   else
5651     emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5652 }
5653 
5654 void
5655 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5656 {
5657   rtx (*gen_ashr3)(rtx, rtx, rtx)
5658     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5659   rtx (*gen_shrd)(rtx, rtx, rtx);
5660   int half_width = GET_MODE_BITSIZE (mode) >> 1;
5661 
5662   rtx low[2], high[2];
5663   int count;
5664 
5665   if (CONST_INT_P (operands[2]))
5666     {
5667       split_double_mode (mode, operands, 2, low, high);
5668       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5669 
5670       if (count == GET_MODE_BITSIZE (mode) - 1)
5671 	{
5672 	  emit_move_insn (high[0], high[1]);
5673 	  emit_insn (gen_ashr3 (high[0], high[0],
5674 				GEN_INT (half_width - 1)));
5675 	  emit_move_insn (low[0], high[0]);
5676 
5677 	}
5678       else if (count >= half_width)
5679 	{
5680 	  emit_move_insn (low[0], high[1]);
5681 	  emit_move_insn (high[0], low[0]);
5682 	  emit_insn (gen_ashr3 (high[0], high[0],
5683 				GEN_INT (half_width - 1)));
5684 
5685 	  if (count > half_width)
5686 	    emit_insn (gen_ashr3 (low[0], low[0],
5687 				  GEN_INT (count - half_width)));
5688 	}
5689       else
5690 	{
5691 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5692 
5693 	  if (!rtx_equal_p (operands[0], operands[1]))
5694 	    emit_move_insn (operands[0], operands[1]);
5695 
5696 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5697 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5698 	}
5699     }
5700   else
5701     {
5702       machine_mode half_mode;
5703 
5704       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5705 
5706       if (!rtx_equal_p (operands[0], operands[1]))
5707 	emit_move_insn (operands[0], operands[1]);
5708 
5709       split_double_mode (mode, operands, 1, low, high);
5710       half_mode = mode == DImode ? SImode : DImode;
5711 
5712       emit_insn (gen_shrd (low[0], high[0], operands[2]));
5713       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5714 
5715       if (TARGET_CMOVE && scratch)
5716 	{
5717 	  emit_move_insn (scratch, high[0]);
5718 	  emit_insn (gen_ashr3 (scratch, scratch,
5719 				GEN_INT (half_width - 1)));
5720 	  emit_insn (gen_x86_shift_adj_1
5721 		     (half_mode, low[0], high[0], operands[2], scratch));
5722 	}
5723       else
5724 	emit_insn (gen_x86_shift_adj_3
5725 		   (half_mode, low[0], high[0], operands[2]));
5726     }
5727 }
5728 
5729 void
5730 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5731 {
5732   rtx (*gen_lshr3)(rtx, rtx, rtx)
5733     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5734   rtx (*gen_shrd)(rtx, rtx, rtx);
5735   int half_width = GET_MODE_BITSIZE (mode) >> 1;
5736 
5737   rtx low[2], high[2];
5738   int count;
5739 
5740   if (CONST_INT_P (operands[2]))
5741     {
5742       split_double_mode (mode, operands, 2, low, high);
5743       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5744 
5745       if (count >= half_width)
5746 	{
5747 	  emit_move_insn (low[0], high[1]);
5748 	  ix86_expand_clear (high[0]);
5749 
5750 	  if (count > half_width)
5751 	    emit_insn (gen_lshr3 (low[0], low[0],
5752 				  GEN_INT (count - half_width)));
5753 	}
5754       else
5755 	{
5756 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5757 
5758 	  if (!rtx_equal_p (operands[0], operands[1]))
5759 	    emit_move_insn (operands[0], operands[1]);
5760 
5761 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5762 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5763 	}
5764     }
5765   else
5766     {
5767       machine_mode half_mode;
5768 
5769       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5770 
5771       if (!rtx_equal_p (operands[0], operands[1]))
5772 	emit_move_insn (operands[0], operands[1]);
5773 
5774       split_double_mode (mode, operands, 1, low, high);
5775       half_mode = mode == DImode ? SImode : DImode;
5776 
5777       emit_insn (gen_shrd (low[0], high[0], operands[2]));
5778       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5779 
5780       if (TARGET_CMOVE && scratch)
5781 	{
5782 	  ix86_expand_clear (scratch);
5783 	  emit_insn (gen_x86_shift_adj_1
5784 		     (half_mode, low[0], high[0], operands[2], scratch));
5785 	}
5786       else
5787 	emit_insn (gen_x86_shift_adj_2
5788 		   (half_mode, low[0], high[0], operands[2]));
5789     }
5790 }
5791 
5792 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
5793    DImode for constant loop counts.  */
5794 
5795 static machine_mode
5796 counter_mode (rtx count_exp)
5797 {
5798   if (GET_MODE (count_exp) != VOIDmode)
5799     return GET_MODE (count_exp);
5800   if (!CONST_INT_P (count_exp))
5801     return Pmode;
5802   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5803     return DImode;
5804   return SImode;
5805 }
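
/* For example (illustrative): a constant count of 0x1000 yields SImode, a
   constant count of 0x100000000 on a 64-bit target yields DImode, and a
   count already held in a register keeps that register's mode.  */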
5806 
5807 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
5808    by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
5809    size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
5810    equivalent loop to set the memory to VALUE (assumed to be in MODE).
5811 
5812    The size is rounded down to a whole number of chunks moved at once.
5813    SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.  */
5814 
5815 
5816 static void
5817 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5818 			       rtx destptr, rtx srcptr, rtx value,
5819 			       rtx count, machine_mode mode, int unroll,
5820 			       int expected_size, bool issetmem)
5821 {
5822   rtx_code_label *out_label, *top_label;
5823   rtx iter, tmp;
5824   machine_mode iter_mode = counter_mode (count);
5825   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5826   rtx piece_size = GEN_INT (piece_size_n);
5827   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5828   rtx size;
5829   int i;
5830 
5831   top_label = gen_label_rtx ();
5832   out_label = gen_label_rtx ();
5833   iter = gen_reg_rtx (iter_mode);
5834 
5835   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5836 			      NULL, 1, OPTAB_DIRECT);
5837   /* Those two should combine.  */
5838   if (piece_size == const1_rtx)
5839     {
5840       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5841 			       true, out_label);
5842       predict_jump (REG_BR_PROB_BASE * 10 / 100);
5843     }
5844   emit_move_insn (iter, const0_rtx);
5845 
5846   emit_label (top_label);
5847 
5848   tmp = convert_modes (Pmode, iter_mode, iter, true);
5849 
5850   /* This assert could be relaxed - in that case we'll need to compute
5851      the smallest power of two containing PIECE_SIZE_N and pass it to
5852      offset_address.  */
5853   gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5854   destmem = offset_address (destmem, tmp, piece_size_n);
5855   destmem = adjust_address (destmem, mode, 0);
5856 
5857   if (!issetmem)
5858     {
5859       srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5860       srcmem = adjust_address (srcmem, mode, 0);
5861 
5862       /* When unrolling for chips that reorder memory reads and writes,
5863 	 we can save registers by using a single temporary.
5864 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
5865       if (!TARGET_64BIT && 0)
5866 	{
5867 	  for (i = 0; i < unroll; i++)
5868 	    {
5869 	      if (i)
5870 		{
5871 		  destmem = adjust_address (copy_rtx (destmem), mode,
5872 					    GET_MODE_SIZE (mode));
5873 		  srcmem = adjust_address (copy_rtx (srcmem), mode,
5874 					   GET_MODE_SIZE (mode));
5875 		}
5876 	      emit_move_insn (destmem, srcmem);
5877 	    }
5878 	}
5879       else
5880 	{
5881 	  rtx tmpreg[4];
5882 	  gcc_assert (unroll <= 4);
5883 	  for (i = 0; i < unroll; i++)
5884 	    {
5885 	      tmpreg[i] = gen_reg_rtx (mode);
5886 	      if (i)
5887 		srcmem = adjust_address (copy_rtx (srcmem), mode,
5888 					 GET_MODE_SIZE (mode));
5889 	      emit_move_insn (tmpreg[i], srcmem);
5890 	    }
5891 	  for (i = 0; i < unroll; i++)
5892 	    {
5893 	      if (i)
5894 		destmem = adjust_address (copy_rtx (destmem), mode,
5895 					  GET_MODE_SIZE (mode));
5896 	      emit_move_insn (destmem, tmpreg[i]);
5897 	    }
5898 	}
5899     }
5900   else
5901     for (i = 0; i < unroll; i++)
5902       {
5903 	if (i)
5904 	  destmem = adjust_address (copy_rtx (destmem), mode,
5905 				    GET_MODE_SIZE (mode));
5906 	emit_move_insn (destmem, value);
5907       }
5908 
5909   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5910 			     true, OPTAB_LIB_WIDEN);
5911   if (tmp != iter)
5912     emit_move_insn (iter, tmp);
5913 
5914   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5915 			   true, top_label);
5916   if (expected_size != -1)
5917     {
5918       expected_size /= GET_MODE_SIZE (mode) * unroll;
5919       if (expected_size == 0)
5920 	predict_jump (0);
5921       else if (expected_size > REG_BR_PROB_BASE)
5922 	predict_jump (REG_BR_PROB_BASE - 1);
5923       else
5924         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5925 		      / expected_size);
5926     }
5927   else
5928     predict_jump (REG_BR_PROB_BASE * 80 / 100);
5929   iter = ix86_zero_extend_to_Pmode (iter);
5930   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5931 			     true, OPTAB_LIB_WIDEN);
5932   if (tmp != destptr)
5933     emit_move_insn (destptr, tmp);
5934   if (!issetmem)
5935     {
5936       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5937 				 true, OPTAB_LIB_WIDEN);
5938       if (tmp != srcptr)
5939 	emit_move_insn (srcptr, tmp);
5940     }
5941   emit_label (out_label);
5942 }
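
/* Illustrative shape of the loop emitted above for the copy case with
   MODE == SImode and UNROLL == 4 (so each iteration handles 16 bytes);
   the names below are placeholders, not actual operands:

     size = count & ~15;  iter = 0;
   top:
     load four SImode temporaries from src + iter;
     store them to dest + iter;
     iter += 16;
     if (iter < size) goto top;
     dest += iter;  src += iter;
   out:  */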
5943 
5944 /* Divide COUNTREG by SCALE.  */
5945 static rtx
5946 scale_counter (rtx countreg, int scale)
5947 {
5948   rtx sc;
5949 
5950   if (scale == 1)
5951     return countreg;
5952   if (CONST_INT_P (countreg))
5953     return GEN_INT (INTVAL (countreg) / scale);
5954   gcc_assert (REG_P (countreg));
5955 
5956   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5957 			    GEN_INT (exact_log2 (scale)),
5958 			    NULL, 1, OPTAB_DIRECT);
5959   return sc;
5960 }
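
/* For example, with SCALE == 4 a constant count of 25 becomes 6, while a
   register count is shifted right by exact_log2 (4) == 2.  */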
5961 
5962 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
5963    argument.  When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5964    When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5965    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5966    ORIG_VALUE is the original value passed to memset to fill the memory with.
5967    Other arguments have the same meaning as for the previous function.  */
5968 
5969 static void
5970 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5971 			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5972 			   rtx count,
5973 			   machine_mode mode, bool issetmem)
5974 {
5975   rtx destexp;
5976   rtx srcexp;
5977   rtx countreg;
5978   HOST_WIDE_INT rounded_count;
5979 
5980   /* If possible, it is shorter to use rep movs.
5981      TODO: Maybe it is better to move this logic to decide_alg.  */
5982   if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5983       && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
5984       && (!issetmem || orig_value == const0_rtx))
5985     mode = SImode;
5986 
5987   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5988     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5989 
5990   countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5991 						       GET_MODE_SIZE (mode)));
5992   if (mode != QImode)
5993     {
5994       destexp = gen_rtx_ASHIFT (Pmode, countreg,
5995 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5996       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5997     }
5998   else
5999     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
6000   if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
6001     {
6002       rounded_count
6003 	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6004       destmem = shallow_copy_rtx (destmem);
6005       set_mem_size (destmem, rounded_count);
6006     }
6007   else if (MEM_SIZE_KNOWN_P (destmem))
6008     clear_mem_size (destmem);
6009 
6010   if (issetmem)
6011     {
6012       value = force_reg (mode, gen_lowpart (mode, value));
6013       emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
6014     }
6015   else
6016     {
6017       if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
6018 	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6019       if (mode != QImode)
6020 	{
6021 	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6022 				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6023 	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6024 	}
6025       else
6026 	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6027       if (CONST_INT_P (count))
6028 	{
6029 	  rounded_count
6030 	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6031 	  srcmem = shallow_copy_rtx (srcmem);
6032 	  set_mem_size (srcmem, rounded_count);
6033 	}
6034       else
6035 	{
6036 	  if (MEM_SIZE_KNOWN_P (srcmem))
6037 	    clear_mem_size (srcmem);
6038 	}
6039       emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6040 			      destexp, srcexp));
6041     }
6042 }
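
/* Illustrative example (not from the original source): for a copy with
   MODE == SImode, DESTEXP above is DESTPTR + (COUNTREG << 2), i.e. the
   value the destination pointer holds once the "rep movsl" has executed;
   SRCEXP is formed the same way from SRCPTR.  */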
6043 
6044 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6045    DESTMEM.
6046    SRCMEM is passed by pointer so it can be updated on return.
6047    The return value is the updated DESTMEM.  */
6048 static rtx
6049 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6050 	     HOST_WIDE_INT size_to_move)
6051 {
6052   rtx dst = destmem, src = *srcmem, tempreg;
6053   enum insn_code code;
6054   machine_mode move_mode;
6055   int piece_size, i;
6056 
6057   /* Find the widest mode in which we could perform moves.
6058      Start with the biggest power of 2 not greater than SIZE_TO_MOVE and
6059      halve it until a move of that size is supported.  */
6060   piece_size = 1 << floor_log2 (size_to_move);
6061   while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6062 	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6063     {
6064       gcc_assert (piece_size > 1);
6065       piece_size >>= 1;
6066     }
6067 
6068   /* Find the corresponding vector mode with the same size as MOVE_MODE.
6069      MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
6070   if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6071     {
6072       int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6073       if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6074 	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6075 	{
6076 	  move_mode = word_mode;
6077 	  piece_size = GET_MODE_SIZE (move_mode);
6078 	  code = optab_handler (mov_optab, move_mode);
6079 	}
6080     }
6081   gcc_assert (code != CODE_FOR_nothing);
6082 
6083   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6084   src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6085 
6086   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
6087   gcc_assert (size_to_move % piece_size == 0);
6088 
6089   for (i = 0; i < size_to_move; i += piece_size)
6090     {
6091       /* We move from memory to memory, so we'll need to do it via
6092 	 a temporary register.  */
6093       tempreg = gen_reg_rtx (move_mode);
6094       emit_insn (GEN_FCN (code) (tempreg, src));
6095       emit_insn (GEN_FCN (code) (dst, tempreg));
6096 
6097       emit_move_insn (destptr,
6098 		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
6099       emit_move_insn (srcptr,
6100 		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6101 
6102       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6103 					  piece_size);
6104       src = adjust_automodify_address_nv (src, move_mode, srcptr,
6105 					  piece_size);
6106     }
6107 
6108   /* Update DST and SRC rtx.  */
6109   *srcmem = src;
6110   return dst;
6111 }
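
/* Illustrative example (not from the original source): a call with
   SIZE_TO_MOVE == 8 on a 64-bit target picks DImode, loads the eight bytes
   into a fresh temporary, stores the temporary to the destination, and
   advances both DESTPTR and SRCPTR by 8.  */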
6112 
6113 /* Helper function for the string operations below.  Test whether the VALUE
6114    bit of VARIABLE is clear and, if so, jump to the returned label.  */
6115 
6116 static rtx_code_label *
6117 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6118 {
6119   rtx_code_label *label = gen_label_rtx ();
6120   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6121   if (GET_MODE (variable) == DImode)
6122     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6123   else
6124     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6125   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6126 			   1, label);
6127   if (epilogue)
6128     predict_jump (REG_BR_PROB_BASE * 50 / 100);
6129   else
6130     predict_jump (REG_BR_PROB_BASE * 90 / 100);
6131   return label;
6132 }
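
/* Illustrative use (mirroring the epilogue expanders below): calling
   ix86_expand_aligntest (count, 4, true), emitting a 4-byte move, and then
   emitting the returned label makes the move execute only when bit 2 of
   COUNT is set.  */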
6133 
6134 
6135 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
6136 
6137 static void
6138 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6139 			rtx destptr, rtx srcptr, rtx count, int max_size)
6140 {
6141   rtx src, dest;
6142   if (CONST_INT_P (count))
6143     {
6144       HOST_WIDE_INT countval = INTVAL (count);
6145       HOST_WIDE_INT epilogue_size = countval % max_size;
6146       int i;
6147 
6148       /* For now MAX_SIZE should be a power of 2.  This assert could be
6149 	 relaxed, but it'll require a bit more complicated epilogue
6150 	 expanding.  */
6151       gcc_assert ((max_size & (max_size - 1)) == 0);
6152       for (i = max_size; i >= 1; i >>= 1)
6153 	{
6154 	  if (epilogue_size & i)
6155 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6156 	}
6157       return;
6158     }
6159   if (max_size > 8)
6160     {
6161       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6162 				    count, 1, OPTAB_DIRECT);
6163       expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6164 				     count, QImode, 1, 4, false);
6165       return;
6166     }
6167 
6168   /* When there are stringops, we can cheaply increase dest and src pointers.
6169      Otherwise we save code size by maintaining offset (zero is readily
6170      available from preceding rep operation) and using x86 addressing modes.
6171    */
6172   if (TARGET_SINGLE_STRINGOP)
6173     {
6174       if (max_size > 4)
6175 	{
6176 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6177 	  src = change_address (srcmem, SImode, srcptr);
6178 	  dest = change_address (destmem, SImode, destptr);
6179 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6180 	  emit_label (label);
6181 	  LABEL_NUSES (label) = 1;
6182 	}
6183       if (max_size > 2)
6184 	{
6185 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6186 	  src = change_address (srcmem, HImode, srcptr);
6187 	  dest = change_address (destmem, HImode, destptr);
6188 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6189 	  emit_label (label);
6190 	  LABEL_NUSES (label) = 1;
6191 	}
6192       if (max_size > 1)
6193 	{
6194 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6195 	  src = change_address (srcmem, QImode, srcptr);
6196 	  dest = change_address (destmem, QImode, destptr);
6197 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6198 	  emit_label (label);
6199 	  LABEL_NUSES (label) = 1;
6200 	}
6201     }
6202   else
6203     {
6204       rtx offset = force_reg (Pmode, const0_rtx);
6205       rtx tmp;
6206 
6207       if (max_size > 4)
6208 	{
6209 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6210 	  src = change_address (srcmem, SImode, srcptr);
6211 	  dest = change_address (destmem, SImode, destptr);
6212 	  emit_move_insn (dest, src);
6213 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6214 				     true, OPTAB_LIB_WIDEN);
6215 	  if (tmp != offset)
6216 	    emit_move_insn (offset, tmp);
6217 	  emit_label (label);
6218 	  LABEL_NUSES (label) = 1;
6219 	}
6220       if (max_size > 2)
6221 	{
6222 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6223 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6224 	  src = change_address (srcmem, HImode, tmp);
6225 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6226 	  dest = change_address (destmem, HImode, tmp);
6227 	  emit_move_insn (dest, src);
6228 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6229 				     true, OPTAB_LIB_WIDEN);
6230 	  if (tmp != offset)
6231 	    emit_move_insn (offset, tmp);
6232 	  emit_label (label);
6233 	  LABEL_NUSES (label) = 1;
6234 	}
6235       if (max_size > 1)
6236 	{
6237 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6238 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6239 	  src = change_address (srcmem, QImode, tmp);
6240 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6241 	  dest = change_address (destmem, QImode, tmp);
6242 	  emit_move_insn (dest, src);
6243 	  emit_label (label);
6244 	  LABEL_NUSES (label) = 1;
6245 	}
6246     }
6247 }
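
/* Worked example (illustrative): for a constant COUNT of 23 and MAX_SIZE
   of 16, the epilogue size is 23 % 16 == 7, so the constant path above
   emits one 4-byte, one 2-byte and one 1-byte move, matching bits 2, 1
   and 0 of the epilogue size.  */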
6248 
6249 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6250    with value PROMOTED_VAL.
6251    DESTPTR is advanced past the filled bytes as the moves are emitted.
6252    The return value is the updated DESTMEM.  */
6253 static rtx
6254 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6255 	     HOST_WIDE_INT size_to_move)
6256 {
6257   rtx dst = destmem;
6258   enum insn_code code;
6259   machine_mode move_mode;
6260   int piece_size, i;
6261 
6262   /* Find the widest mode in which we could perform moves: normally the
6263      mode of PROMOTED_VAL, narrowed to an integer mode of SIZE_TO_MOVE
6264      bytes when SIZE_TO_MOVE is smaller than that mode.  */
6265   move_mode = GET_MODE (promoted_val);
6266   if (move_mode == VOIDmode)
6267     move_mode = QImode;
6268   if (size_to_move < GET_MODE_SIZE (move_mode))
6269     {
6270       unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6271       move_mode = int_mode_for_size (move_bits, 0).require ();
6272       promoted_val = gen_lowpart (move_mode, promoted_val);
6273     }
6274   piece_size = GET_MODE_SIZE (move_mode);
6275   code = optab_handler (mov_optab, move_mode);
6276   gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6277 
6278   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6279 
6280   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
6281   gcc_assert (size_to_move % piece_size == 0);
6282 
6283   for (i = 0; i < size_to_move; i += piece_size)
6284     {
6285       if (piece_size <= GET_MODE_SIZE (word_mode))
6286 	{
6287 	  emit_insn (gen_strset (destptr, dst, promoted_val));
6288 	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6289 					      piece_size);
6290 	  continue;
6291 	}
6292 
6293       emit_insn (GEN_FCN (code) (dst, promoted_val));
6294 
6295       emit_move_insn (destptr,
6296 		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
6297 
6298       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6299 					  piece_size);
6300     }
6301 
6302   /* Update DST rtx.  */
6303   return dst;
6304 }
6305 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6306 static void
6307 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6308 				 rtx count, int max_size)
6309 {
6310   count = expand_simple_binop (counter_mode (count), AND, count,
6311 			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6312   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6313 				 gen_lowpart (QImode, value), count, QImode,
6314 				 1, max_size / 2, true);
6315 }
6316 
6317 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6318 static void
6319 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6320 			rtx count, int max_size)
6321 {
6322   rtx dest;
6323 
6324   if (CONST_INT_P (count))
6325     {
6326       HOST_WIDE_INT countval = INTVAL (count);
6327       HOST_WIDE_INT epilogue_size = countval % max_size;
6328       int i;
6329 
6330       /* For now MAX_SIZE should be a power of 2.  This assert could be
6331 	 relaxed, but it'll require a bit more complicated epilogue
6332 	 expanding.  */
6333       gcc_assert ((max_size & (max_size - 1)) == 0);
6334       for (i = max_size; i >= 1; i >>= 1)
6335 	{
6336 	  if (epilogue_size & i)
6337 	    {
6338 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6339 		destmem = emit_memset (destmem, destptr, vec_value, i);
6340 	      else
6341 		destmem = emit_memset (destmem, destptr, value, i);
6342 	    }
6343 	}
6344       return;
6345     }
6346   if (max_size > 32)
6347     {
6348       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6349       return;
6350     }
6351   if (max_size > 16)
6352     {
6353       rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6354       if (TARGET_64BIT)
6355 	{
6356 	  dest = change_address (destmem, DImode, destptr);
6357 	  emit_insn (gen_strset (destptr, dest, value));
6358 	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6359 	  emit_insn (gen_strset (destptr, dest, value));
6360 	}
6361       else
6362 	{
6363 	  dest = change_address (destmem, SImode, destptr);
6364 	  emit_insn (gen_strset (destptr, dest, value));
6365 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6366 	  emit_insn (gen_strset (destptr, dest, value));
6367 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6368 	  emit_insn (gen_strset (destptr, dest, value));
6369 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6370 	  emit_insn (gen_strset (destptr, dest, value));
6371 	}
6372       emit_label (label);
6373       LABEL_NUSES (label) = 1;
6374     }
6375   if (max_size > 8)
6376     {
6377       rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6378       if (TARGET_64BIT)
6379 	{
6380 	  dest = change_address (destmem, DImode, destptr);
6381 	  emit_insn (gen_strset (destptr, dest, value));
6382 	}
6383       else
6384 	{
6385 	  dest = change_address (destmem, SImode, destptr);
6386 	  emit_insn (gen_strset (destptr, dest, value));
6387 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6388 	  emit_insn (gen_strset (destptr, dest, value));
6389 	}
6390       emit_label (label);
6391       LABEL_NUSES (label) = 1;
6392     }
6393   if (max_size > 4)
6394     {
6395       rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6396       dest = change_address (destmem, SImode, destptr);
6397       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6398       emit_label (label);
6399       LABEL_NUSES (label) = 1;
6400     }
6401   if (max_size > 2)
6402     {
6403       rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6404       dest = change_address (destmem, HImode, destptr);
6405       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6406       emit_label (label);
6407       LABEL_NUSES (label) = 1;
6408     }
6409   if (max_size > 1)
6410     {
6411       rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6412       dest = change_address (destmem, QImode, destptr);
6413       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6414       emit_label (label);
6415       LABEL_NUSES (label) = 1;
6416     }
6417 }
6418 
6419 /* Decrease COUNTREG by VALUE.  */
6420 static void
6421 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6422 {
6423   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6424 }
6425 
6426 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6427    DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
6428    Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6429    ignored.
6430    Return value is updated DESTMEM.  */
6431 
6432 static rtx
6433 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6434 				  rtx destptr, rtx srcptr, rtx value,
6435 				  rtx vec_value, rtx count, int align,
6436 				  int desired_alignment, bool issetmem)
6437 {
6438   int i;
6439   for (i = 1; i < desired_alignment; i <<= 1)
6440     {
6441       if (align <= i)
6442 	{
6443 	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6444 	  if (issetmem)
6445 	    {
6446 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6447 		destmem = emit_memset (destmem, destptr, vec_value, i);
6448 	      else
6449 		destmem = emit_memset (destmem, destptr, value, i);
6450 	    }
6451 	  else
6452 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6453 	  ix86_adjust_counter (count, i);
6454 	  emit_label (label);
6455 	  LABEL_NUSES (label) = 1;
6456 	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6457 	}
6458     }
6459   return destmem;
6460 }
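
/* Illustrative example (not from the original source): with ALIGN == 1 and
   DESIRED_ALIGNMENT == 8, the loop above emits three guarded blocks that
   copy (or set) 1, 2 and 4 bytes respectively, each executed only when the
   corresponding low bit of DESTPTR is set, leaving DESTPTR 8-byte aligned
   for the main loop.  */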
6461 
6462 /* Test if COUNT & SIZE is nonzero and, if so, expand a cpymem
6463    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6464    and jump to DONE_LABEL.  */
6465 static void
6466 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6467 			       rtx destptr, rtx srcptr,
6468 			       rtx value, rtx vec_value,
6469 			       rtx count, int size,
6470 			       rtx done_label, bool issetmem)
6471 {
6472   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6473   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6474   rtx modesize;
6475   int n;
6476 
6477   /* If we do not have a vector value to copy, we must reduce the size.  */
6478   if (issetmem)
6479     {
6480       if (!vec_value)
6481 	{
6482 	  if (GET_MODE (value) == VOIDmode && size > 8)
6483 	    mode = Pmode;
6484 	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6485 	    mode = GET_MODE (value);
6486 	}
6487       else
6488 	mode = GET_MODE (vec_value), value = vec_value;
6489     }
6490   else
6491     {
6492       /* Choose appropriate vector mode.  */
6493       if (size >= 32)
6494 	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6495       else if (size >= 16)
6496 	mode = TARGET_SSE ? V16QImode : DImode;
6497       srcmem = change_address (srcmem, mode, srcptr);
6498     }
6499   destmem = change_address (destmem, mode, destptr);
6500   modesize = GEN_INT (GET_MODE_SIZE (mode));
6501   gcc_assert (GET_MODE_SIZE (mode) <= size);
6502   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6503     {
6504       if (issetmem)
6505 	emit_move_insn (destmem, gen_lowpart (mode, value));
6506       else
6507 	{
6508           emit_move_insn (destmem, srcmem);
6509           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6510 	}
6511       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6512     }
6513 
6514   destmem = offset_address (destmem, count, 1);
6515   destmem = offset_address (destmem, GEN_INT (-2 * size),
6516 			    GET_MODE_SIZE (mode));
6517   if (!issetmem)
6518     {
6519       srcmem = offset_address (srcmem, count, 1);
6520       srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6521 			       GET_MODE_SIZE (mode));
6522     }
6523   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6524     {
6525       if (issetmem)
6526 	emit_move_insn (destmem, gen_lowpart (mode, value));
6527       else
6528 	{
6529 	  emit_move_insn (destmem, srcmem);
6530 	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6531 	}
6532       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6533     }
6534   emit_jump_insn (gen_jump (done_label));
6535   emit_barrier ();
6536 
6537   emit_label (label);
6538   LABEL_NUSES (label) = 1;
6539 }
6540 
6541 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power
6542    of 2) and get ready for the main memcpy loop by copying the initial
6543    DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting DESTPTR/
6544    SRCPTR/COUNT so that we can proceed with a loop copying SIZE bytes at once.
6545    Do moves in MODE.  DONE_LABEL is a label after the whole copying sequence.
6546    The label is created on demand if *DONE_LABEL is NULL.
6547    MIN_SIZE is the minimal size of the copied block.  This value gets adjusted
6548    for the new bounds after the initial copies.
6549 
6550    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6551    DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
6552    we will dispatch to a library call for large blocks.
6553 
6554    In pseudocode we do:
6555 
6556    if (COUNT < SIZE)
6557      {
6558        Assume that SIZE is 4. Bigger sizes are handled analogously
6559        if (COUNT & 4)
6560 	 {
6561 	    copy 4 bytes from SRCPTR to DESTPTR
6562 	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6563 	    goto done_label
6564 	 }
6565        if (!COUNT)
6566 	 goto done_label;
6567        copy 1 byte from SRCPTR to DESTPTR
6568        if (COUNT & 2)
6569 	 {
6570 	    copy 2 bytes from SRCPTR to DESTPTR
6571 	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6572 	 }
6573      }
6574    else
6575      {
6576        copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6577        copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6578 
6579        OLD_DESTPTR = DESTPTR;
6580        Align DESTPTR up to DESIRED_ALIGN
6581        SRCPTR += DESTPTR - OLD_DESTPTR
6582        COUNT -= DESTPTR - OLD_DESTPTR
6583        if (DYNAMIC_CHECK)
6584 	 Round COUNT down to multiple of SIZE
6585        << optional caller supplied zero size guard is here >>
6586        << optional caller supplied dynamic check is here >>
6587        << caller supplied main copy loop is here >>
6588      }
6589    done_label:
6590   */
6591 static void
6592 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6593 							    rtx *destptr, rtx *srcptr,
6594 							    machine_mode mode,
6595 							    rtx value, rtx vec_value,
6596 							    rtx *count,
6597 							    rtx_code_label **done_label,
6598 							    int size,
6599 							    int desired_align,
6600 							    int align,
6601 							    unsigned HOST_WIDE_INT *min_size,
6602 							    bool dynamic_check,
6603 							    bool issetmem)
6604 {
6605   rtx_code_label *loop_label = NULL, *label;
6606   int n;
6607   rtx modesize;
6608   int prolog_size = 0;
6609   rtx mode_value;
6610 
6611   /* Choose the proper value to copy.  */
6612   if (issetmem && VECTOR_MODE_P (mode))
6613     mode_value = vec_value;
6614   else
6615     mode_value = value;
6616   gcc_assert (GET_MODE_SIZE (mode) <= size);
6617 
6618   /* See if block is big or small, handle small blocks.  */
6619   if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6620     {
6621       int size2 = size;
6622       loop_label = gen_label_rtx ();
6623 
6624       if (!*done_label)
6625 	*done_label = gen_label_rtx ();
6626 
6627       emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6628 			       1, loop_label);
6629       size2 >>= 1;
6630 
6631       /* Handle sizes > 3.  */
6632       for (;size2 > 2; size2 >>= 1)
6633 	expand_small_cpymem_or_setmem (destmem, srcmem,
6634 				       *destptr, *srcptr,
6635 				       value, vec_value,
6636 				       *count,
6637 				       size2, *done_label, issetmem);
6638       /* Nothing to copy?  Jump to DONE_LABEL if so */
6639       emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6640 			       1, *done_label);
6641 
6642       /* Do a byte copy.  */
6643       destmem = change_address (destmem, QImode, *destptr);
6644       if (issetmem)
6645 	emit_move_insn (destmem, gen_lowpart (QImode, value));
6646       else
6647 	{
6648           srcmem = change_address (srcmem, QImode, *srcptr);
6649           emit_move_insn (destmem, srcmem);
6650 	}
6651 
6652       /* Handle sizes 2 and 3.  */
6653       label = ix86_expand_aligntest (*count, 2, false);
6654       destmem = change_address (destmem, HImode, *destptr);
6655       destmem = offset_address (destmem, *count, 1);
6656       destmem = offset_address (destmem, GEN_INT (-2), 2);
6657       if (issetmem)
6658         emit_move_insn (destmem, gen_lowpart (HImode, value));
6659       else
6660 	{
6661 	  srcmem = change_address (srcmem, HImode, *srcptr);
6662 	  srcmem = offset_address (srcmem, *count, 1);
6663 	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6664 	  emit_move_insn (destmem, srcmem);
6665 	}
6666 
6667       emit_label (label);
6668       LABEL_NUSES (label) = 1;
6669       emit_jump_insn (gen_jump (*done_label));
6670       emit_barrier ();
6671     }
6672   else
6673     gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6674 		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6675 
6676   /* Start memcpy for COUNT >= SIZE.  */
6677   if (loop_label)
6678     {
6679        emit_label (loop_label);
6680        LABEL_NUSES (loop_label) = 1;
6681     }
6682 
6683   /* Copy first desired_align bytes.  */
6684   if (!issetmem)
6685     srcmem = change_address (srcmem, mode, *srcptr);
6686   destmem = change_address (destmem, mode, *destptr);
6687   modesize = GEN_INT (GET_MODE_SIZE (mode));
6688   for (n = 0; prolog_size < desired_align - align; n++)
6689     {
6690       if (issetmem)
6691         emit_move_insn (destmem, mode_value);
6692       else
6693 	{
6694           emit_move_insn (destmem, srcmem);
6695           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6696 	}
6697       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6698       prolog_size += GET_MODE_SIZE (mode);
6699     }
6700 
6701 
6702   /* Copy last SIZE bytes.  */
6703   destmem = offset_address (destmem, *count, 1);
6704   destmem = offset_address (destmem,
6705 			    GEN_INT (-size - prolog_size),
6706 			    1);
6707   if (issetmem)
6708     emit_move_insn (destmem, mode_value);
6709   else
6710     {
6711       srcmem = offset_address (srcmem, *count, 1);
6712       srcmem = offset_address (srcmem,
6713 			       GEN_INT (-size - prolog_size),
6714 			       1);
6715       emit_move_insn (destmem, srcmem);
6716     }
6717   for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6718     {
6719       destmem = offset_address (destmem, modesize, 1);
6720       if (issetmem)
6721 	emit_move_insn (destmem, mode_value);
6722       else
6723 	{
6724           srcmem = offset_address (srcmem, modesize, 1);
6725           emit_move_insn (destmem, srcmem);
6726 	}
6727     }
6728 
6729   /* Align destination.  */
6730   if (desired_align > 1 && desired_align > align)
6731     {
6732       rtx saveddest = *destptr;
6733 
6734       gcc_assert (desired_align <= size);
6735       /* Align destptr up, place it to new register.  */
6736       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6737 				      GEN_INT (prolog_size),
6738 				      NULL_RTX, 1, OPTAB_DIRECT);
6739       if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6740 	REG_POINTER (*destptr) = 1;
6741       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6742 				      GEN_INT (-desired_align),
6743 				      *destptr, 1, OPTAB_DIRECT);
6744       /* See how many bytes we skipped.  */
6745       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6746 				       *destptr,
6747 				       saveddest, 1, OPTAB_DIRECT);
6748       /* Adjust srcptr and count.  */
6749       if (!issetmem)
6750 	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6751 				       saveddest, *srcptr, 1, OPTAB_DIRECT);
6752       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6753 				    saveddest, *count, 1, OPTAB_DIRECT);
6754       /* We copied at most size + prolog_size.  */
6755       if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6756 	*min_size
6757 	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6758       else
6759 	*min_size = 0;
6760 
6761       /* Our loops always round down the block size, but for dispatch to
6762          the library we need the precise value.  */
6763       if (dynamic_check)
6764 	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
6765 				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6766     }
6767   else
6768     {
6769       gcc_assert (prolog_size == 0);
6770       /* Decrease count, so we won't end up copying last word twice.  */
6771       if (!CONST_INT_P (*count))
6772 	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6773 				      constm1_rtx, *count, 1, OPTAB_DIRECT);
6774       else
6775 	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6776 				      (unsigned HOST_WIDE_INT)size));
6777       if (*min_size)
6778 	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6779     }
6780 }
6781 
6782 
6783 /* This function is like the previous one, except here we know how many bytes
6784    need to be copied.  That allows us to update alignment not only of DST, which
6785    is returned, but also of SRC, which is passed as a pointer for that
6786    reason.  */
6787 static rtx
6788 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6789 					   rtx srcreg, rtx value, rtx vec_value,
6790 					   int desired_align, int align_bytes,
6791 					   bool issetmem)
6792 {
6793   rtx src = NULL;
6794   rtx orig_dst = dst;
6795   rtx orig_src = NULL;
6796   int piece_size = 1;
6797   int copied_bytes = 0;
6798 
6799   if (!issetmem)
6800     {
6801       gcc_assert (srcp != NULL);
6802       src = *srcp;
6803       orig_src = src;
6804     }
6805 
6806   for (piece_size = 1;
6807        piece_size <= desired_align && copied_bytes < align_bytes;
6808        piece_size <<= 1)
6809     {
6810       if (align_bytes & piece_size)
6811 	{
6812 	  if (issetmem)
6813 	    {
6814 	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6815 		dst = emit_memset (dst, destreg, vec_value, piece_size);
6816 	      else
6817 		dst = emit_memset (dst, destreg, value, piece_size);
6818 	    }
6819 	  else
6820 	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6821 	  copied_bytes += piece_size;
6822 	}
6823     }
6824   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6825     set_mem_align (dst, desired_align * BITS_PER_UNIT);
6826   if (MEM_SIZE_KNOWN_P (orig_dst))
6827     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6828 
6829   if (!issetmem)
6830     {
6831       int src_align_bytes = get_mem_align_offset (src, desired_align
6832 						       * BITS_PER_UNIT);
6833       if (src_align_bytes >= 0)
6834 	src_align_bytes = desired_align - src_align_bytes;
6835       if (src_align_bytes >= 0)
6836 	{
6837 	  unsigned int src_align;
6838 	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6839 	    {
6840 	      if ((src_align_bytes & (src_align - 1))
6841 		   == (align_bytes & (src_align - 1)))
6842 		break;
6843 	    }
6844 	  if (src_align > (unsigned int) desired_align)
6845 	    src_align = desired_align;
6846 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6847 	    set_mem_align (src, src_align * BITS_PER_UNIT);
6848 	}
6849       if (MEM_SIZE_KNOWN_P (orig_src))
6850 	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6851       *srcp = src;
6852     }
6853 
6854   return dst;
6855 }
6856 
6857 /* Return true if ALG can be used in current context.
6858    Assume we expand memset if MEMSET is true.  */
6859 static bool
6860 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6861 {
6862   if (alg == no_stringop)
6863     return false;
6864   if (alg == vector_loop)
6865     return TARGET_SSE || TARGET_AVX;
6866   /* Algorithms using the rep prefix want at least edi and ecx;
6867      additionally, memset wants eax and memcpy wants esi.  Don't
6868      consider such algorithms if the user has appropriated those
6869      registers for their own purposes, or if we have a non-default
6870      address space, since some string insns cannot override the segment.  */
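  /* Illustrative example (an assumption about user code, not taken from
     this file): a global register variable such as
	 register char *p asm ("esi");
     makes %esi fixed, so the memcpy case below rejects the rep-prefix
     algorithms and a loop or library call is chosen instead.  */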
6871   if (alg == rep_prefix_1_byte
6872       || alg == rep_prefix_4_byte
6873       || alg == rep_prefix_8_byte)
6874     {
6875       if (have_as)
6876 	return false;
6877       if (fixed_regs[CX_REG]
6878 	  || fixed_regs[DI_REG]
6879 	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6880 	return false;
6881     }
6882   return true;
6883 }
6884 
6885 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
6886 static enum stringop_alg
6887 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6888 	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6889 	    bool memset, bool zero_memset, bool have_as,
6890 	    int *dynamic_check, bool *noalign, bool recur)
6891 {
6892   const struct stringop_algs *algs;
6893   bool optimize_for_speed;
6894   int max = 0;
6895   const struct processor_costs *cost;
6896   int i;
6897   bool any_alg_usable_p = false;
6898 
6899   *noalign = false;
6900   *dynamic_check = -1;
6901 
6902   /* Even if the string operation call is cold, we still might spend a lot
6903      of time processing large blocks.  */
6904   if (optimize_function_for_size_p (cfun)
6905       || (optimize_insn_for_size_p ()
6906  	  && (max_size < 256
6907               || (expected_size != -1 && expected_size < 256))))
6908     optimize_for_speed = false;
6909   else
6910     optimize_for_speed = true;
6911 
6912   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6913   if (memset)
6914     algs = &cost->memset[TARGET_64BIT != 0];
6915   else
6916     algs = &cost->memcpy[TARGET_64BIT != 0];
6917 
6918   /* Find the maximal size for which a usable, non-libcall algorithm is
6919      defined.  */
6919   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6920     {
6921       enum stringop_alg candidate = algs->size[i].alg;
6922       bool usable = alg_usable_p (candidate, memset, have_as);
6923       any_alg_usable_p |= usable;
6924 
6925       if (candidate != libcall && candidate && usable)
6926 	max = algs->size[i].max;
6927     }
6928 
6929   /* If the expected size is not known but the max size is small enough
6930      that the inline version is a win, set the expected size into
6931      the range.  */
6932   if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6933       && expected_size == -1)
6934     expected_size = min_size / 2 + max_size / 2;
6935 
6936   /* If user specified the algorithm, honor it if possible.  */
6937   if (ix86_stringop_alg != no_stringop
6938       && alg_usable_p (ix86_stringop_alg, memset, have_as))
6939     return ix86_stringop_alg;
6940   /* rep; movq or rep; movl is the smallest variant.  */
6941   else if (!optimize_for_speed)
6942     {
6943       *noalign = true;
6944       if (!count || (count & 3) || (memset && !zero_memset))
6945 	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6946 	       ? rep_prefix_1_byte : loop_1_byte;
6947       else
6948 	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6949 	       ? rep_prefix_4_byte : loop;
6950     }
6951   /* Very tiny blocks are best handled via the loop; REP is expensive to
6952      set up.  */
6953   else if (expected_size != -1 && expected_size < 4)
6954     return loop_1_byte;
6955   else if (expected_size != -1)
6956     {
6957       enum stringop_alg alg = libcall;
6958       bool alg_noalign = false;
6959       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6960 	{
6961 	  /* We get here if the algorithms that were not libcall-based
6962 	     were rep-prefix based and we are unable to use rep prefixes
6963 	     based on global register usage.  Break out of the loop and
6964 	     use the heuristic below.  */
6965 	  if (algs->size[i].max == 0)
6966 	    break;
6967 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6968 	    {
6969 	      enum stringop_alg candidate = algs->size[i].alg;
6970 
6971 	      if (candidate != libcall
6972 		  && alg_usable_p (candidate, memset, have_as))
6973 		{
6974 		  alg = candidate;
6975 		  alg_noalign = algs->size[i].noalign;
6976 		}
6977 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6978 		 last non-libcall inline algorithm.  */
6979 	      if (TARGET_INLINE_ALL_STRINGOPS)
6980 		{
6981 		  /* When the current size is best to be copied by a libcall,
6982 		     but we are still forced to inline, run the heuristic below
6983 		     that will pick code for medium sized blocks.  */
6984 		  if (alg != libcall)
6985 		    {
6986 		      *noalign = alg_noalign;
6987 		      return alg;
6988 		    }
6989 		  else if (!any_alg_usable_p)
6990 		    break;
6991 		}
6992 	      else if (alg_usable_p (candidate, memset, have_as)
6993 		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
6994 			    && candidate == rep_prefix_1_byte
6995 			    /* NB: If min_size != max_size, size is
6996 			       unknown.  */
6997 			    && min_size != max_size))
6998 		{
6999 		  *noalign = algs->size[i].noalign;
7000 		  return candidate;
7001 		}
7002 	    }
7003 	}
7004     }
7005   /* When asked to inline the call anyway, try to pick a meaningful choice.
7006      We look for the maximal size of block that is faster to copy by hand
7007      and take blocks of at most that size, guessing that the average size
7008      will be roughly half of the block.
7009 
7010      If this turns out to be bad, we might simply specify the preferred
7011      choice in ix86_costs.  */
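  /* Worked example, added for illustration: if the usable table entries
     cover blocks only up to max = 2048 bytes, the code below recurses with
     new_expected_size = 2048 / 2 = 1024, and with
     -minline-stringops-dynamically it also sets *dynamic_check = 2048 so
     that larger blocks are still dispatched to the library call.  */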
7012   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7013       && (algs->unknown_size == libcall
7014 	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
7015     {
7016       enum stringop_alg alg;
7017       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
7018 
7019       /* If there aren't any usable algorithms or if recursing already,
7020 	 then recursing on smaller sizes or same size isn't going to
7021 	 find anything.  Just return the simple byte-at-a-time copy loop.  */
7022       if (!any_alg_usable_p || recur)
7023 	{
7024 	  /* Pick something reasonable.  */
7025 	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7026 	    *dynamic_check = 128;
7027 	  return loop_1_byte;
7028 	}
7029       alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7030 			zero_memset, have_as, dynamic_check, noalign, true);
7031       gcc_assert (*dynamic_check == -1);
7032       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7033 	*dynamic_check = max;
7034       else
7035 	gcc_assert (alg != libcall);
7036       return alg;
7037     }
7038   return (alg_usable_p (algs->unknown_size, memset, have_as)
7039 	  ? algs->unknown_size : libcall);
7040 }
7041 
7042 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
7043    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
7044 static int
7045 decide_alignment (int align,
7046 		  enum stringop_alg alg,
7047 		  int expected_size,
7048 		  machine_mode move_mode)
7049 {
7050   int desired_align = 0;
7051 
7052   gcc_assert (alg != no_stringop);
7053 
7054   if (alg == libcall)
7055     return 0;
7056   if (move_mode == VOIDmode)
7057     return 0;
7058 
7059   desired_align = GET_MODE_SIZE (move_mode);
7060   /* PentiumPro has special logic triggering for 8 byte aligned blocks,
7061      copying whole cachelines at once.  */
7062   if (TARGET_PENTIUMPRO
7063       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7064     desired_align = 8;
7065 
7066   if (optimize_size)
7067     desired_align = 1;
7068   if (desired_align < align)
7069     desired_align = align;
7070   if (expected_size != -1 && expected_size < 4)
7071     desired_align = align;
7072 
7073   return desired_align;
7074 }
7075 
7076 
7077 /* Helper function for memset.  For QImode value 0xXY produce
7078    0xXYXYXYXY of the width specified by MODE.  This is essentially
7079    a * 0x01010101, but we can do slightly better than
7080    synth_mult by unwinding the sequence by hand on CPUs with
7081    slow multiply.  */
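/* Worked example, added for illustration, mirroring the constant path in
   the function below: for val = 0xAB and SImode,
       v  = 0xAB;
       v |= v << 8;		v == 0xABAB
       v |= v << 16;		v == 0xABABABAB
   which equals 0xAB * 0x01010101.  */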
7082 static rtx
7083 promote_duplicated_reg (machine_mode mode, rtx val)
7084 {
7085   machine_mode valmode = GET_MODE (val);
7086   rtx tmp;
7087   int nops = mode == DImode ? 3 : 2;
7088 
7089   gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7090   if (val == const0_rtx)
7091     return copy_to_mode_reg (mode, CONST0_RTX (mode));
7092   if (CONST_INT_P (val))
7093     {
7094       HOST_WIDE_INT v = INTVAL (val) & 255;
7095 
7096       v |= v << 8;
7097       v |= v << 16;
7098       if (mode == DImode)
7099         v |= (v << 16) << 16;
7100       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7101     }
7102 
7103   if (valmode == VOIDmode)
7104     valmode = QImode;
7105   if (valmode != QImode)
7106     val = gen_lowpart (QImode, val);
7107   if (mode == QImode)
7108     return val;
7109   if (!TARGET_PARTIAL_REG_STALL)
7110     nops--;
7111   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7112       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7113       <= (ix86_cost->shift_const + ix86_cost->add) * nops
7114           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7115     {
7116       rtx reg = convert_modes (mode, QImode, val, true);
7117       tmp = promote_duplicated_reg (mode, const1_rtx);
7118       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7119 				  OPTAB_DIRECT);
7120     }
7121   else
7122     {
7123       rtx reg = convert_modes (mode, QImode, val, true);
7124 
7125       if (!TARGET_PARTIAL_REG_STALL)
7126 	emit_insn (gen_insv_1 (mode, reg, reg));
7127       else
7128 	{
7129 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7130 				     NULL, 1, OPTAB_DIRECT);
7131 	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7132 				     OPTAB_DIRECT);
7133 	}
7134       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7135 			         NULL, 1, OPTAB_DIRECT);
7136       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7137       if (mode == SImode)
7138 	return reg;
7139       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7140 				 NULL, 1, OPTAB_DIRECT);
7141       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7142       return reg;
7143     }
7144 }
7145 
7146 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7147    will be needed by the main loop copying SIZE_NEEDED chunks and by the
7148    prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
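/* Example of the mode selection below, added for illustration: on a 64-bit
   target, size_needed = 16 with desired_align = 8 and align = 1 broadcasts
   VAL to DImode, while size_needed = 4 with desired_align = align = 4 only
   needs SImode.  */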
7149 static rtx
7150 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7151 				int align)
7152 {
7153   rtx promoted_val;
7154 
7155   if (TARGET_64BIT
7156       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7157     promoted_val = promote_duplicated_reg (DImode, val);
7158   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7159     promoted_val = promote_duplicated_reg (SImode, val);
7160   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7161     promoted_val = promote_duplicated_reg (HImode, val);
7162   else
7163     promoted_val = val;
7164 
7165   return promoted_val;
7166 }
7167 
7168 /* Copy the address to a Pmode register.  This is used for x32 to
7169    truncate DImode TLS address to a SImode register. */
7170 
7171 static rtx
7172 ix86_copy_addr_to_reg (rtx addr)
7173 {
7174   rtx reg;
7175   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7176     {
7177       reg = copy_addr_to_reg (addr);
7178       REG_POINTER (reg) = 1;
7179       return reg;
7180     }
7181   else
7182     {
7183       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7184       reg = copy_to_mode_reg (DImode, addr);
7185       REG_POINTER (reg) = 1;
7186       return gen_rtx_SUBREG (SImode, reg, 0);
7187     }
7188 }
7189 
7190 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
7191    operations when profitable.  The code depends upon architecture, block size
7192    and alignment, but always has one of the following overall structures:
7193 
7194    Aligned move sequence:
7195 
7196      1) Prologue guard: Conditional that jumps up to epilogues for small
7197 	blocks that can be handled by epilogue alone.  This is faster
7198 	but also needed for correctness, since the prologue assumes the block
7199 	is larger than the desired alignment.
7200 
7201 	Optional dynamic check for size and libcall for large
7202 	blocks is emitted here too, with -minline-stringops-dynamically.
7203 
7204      2) Prologue: copy first few bytes in order to get destination
7205 	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
7206 	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7207 	copied.  We emit either a jump tree on power of two sized
7208 	blocks, or a byte loop.
7209 
7210      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7211 	with specified algorithm.
7212 
7213      4) Epilogue: code copying tail of the block that is too small to be
7214 	handled by main body (or up to size guarded by prologue guard).
7215 
7216   Misaligned move sequence
7217 
7218      1) misaligned move prologue/epilogue containing:
7219         a) Prologue handling small memory blocks and jumping to done_label
7220 	   (skipped if blocks are known to be large enough)
7221 	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
7222            needed by single possibly misaligned move
7223 	   (skipped if alignment is not needed)
7224         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7225 
7226      2) Zero size guard dispatching to done_label, if needed
7227 
7228      3) dispatch to library call, if needed,
7229 
7230      4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7231 	with specified algorithm.  */
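/* Rough C-level sketch of the aligned sequence above, added for
   illustration only (memset flavour, ignoring the dynamic library check):

       if (count < epilogue_size_needed)
	 goto epilogue;				      // 1) prologue guard
       while ((uintptr_t) dst & (desired_align - 1)) // 2) prologue
	 *dst++ = val, count--;
       for (; count >= size_needed; count -= size_needed)
	 store SIZE_NEEDED bytes of the broadcast value;  // 3) main body
     epilogue:
       handle the remaining count % size_needed bytes;	  // 4) epilogue  */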
7232 bool
7233 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7234 			   rtx align_exp, rtx expected_align_exp,
7235 			   rtx expected_size_exp, rtx min_size_exp,
7236 			   rtx max_size_exp, rtx probable_max_size_exp,
7237 			   bool issetmem)
7238 {
7239   rtx destreg;
7240   rtx srcreg = NULL;
7241   rtx_code_label *label = NULL;
7242   rtx tmp;
7243   rtx_code_label *jump_around_label = NULL;
7244   HOST_WIDE_INT align = 1;
7245   unsigned HOST_WIDE_INT count = 0;
7246   HOST_WIDE_INT expected_size = -1;
7247   int size_needed = 0, epilogue_size_needed;
7248   int desired_align = 0, align_bytes = 0;
7249   enum stringop_alg alg;
7250   rtx promoted_val = NULL;
7251   rtx vec_promoted_val = NULL;
7252   bool force_loopy_epilogue = false;
7253   int dynamic_check;
7254   bool need_zero_guard = false;
7255   bool noalign;
7256   machine_mode move_mode = VOIDmode;
7257   machine_mode wider_mode;
7258   int unroll_factor = 1;
7259   /* TODO: Once value ranges are available, fill in proper data.  */
7260   unsigned HOST_WIDE_INT min_size = 0;
7261   unsigned HOST_WIDE_INT max_size = -1;
7262   unsigned HOST_WIDE_INT probable_max_size = -1;
7263   bool misaligned_prologue_used = false;
7264   bool have_as;
7265 
7266   if (CONST_INT_P (align_exp))
7267     align = INTVAL (align_exp);
7268   /* i386 can do misaligned access at a reasonably increased cost.  */
7269   if (CONST_INT_P (expected_align_exp)
7270       && INTVAL (expected_align_exp) > align)
7271     align = INTVAL (expected_align_exp);
7272   /* ALIGN is the minimum of destination and source alignment, but we care here
7273      just about destination alignment.  */
7274   else if (!issetmem
7275 	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7276     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7277 
7278   if (CONST_INT_P (count_exp))
7279     {
7280       min_size = max_size = probable_max_size = count = expected_size
7281 	= INTVAL (count_exp);
7282       /* When COUNT is 0, there is nothing to do.  */
7283       if (!count)
7284 	return true;
7285     }
7286   else
7287     {
7288       if (min_size_exp)
7289 	min_size = INTVAL (min_size_exp);
7290       if (max_size_exp)
7291 	max_size = INTVAL (max_size_exp);
7292       if (probable_max_size_exp)
7293 	probable_max_size = INTVAL (probable_max_size_exp);
7294       if (CONST_INT_P (expected_size_exp))
7295 	expected_size = INTVAL (expected_size_exp);
7296      }
7297 
7298   /* Make sure we don't need to care about overflow later on.  */
7299   if (count > (HOST_WIDE_INT_1U << 30))
7300     return false;
7301 
7302   have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7303   if (!issetmem)
7304     have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7305 
7306   /* Step 0: Decide on preferred algorithm, desired alignment and
7307      size of chunks to be copied by main loop.  */
7308   alg = decide_alg (count, expected_size, min_size, probable_max_size,
7309 		    issetmem,
7310 		    issetmem && val_exp == const0_rtx, have_as,
7311 		    &dynamic_check, &noalign, false);
7312 
7313   if (dump_file)
7314     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7315 	     stringop_alg_names[alg]);
7316 
7317   if (alg == libcall)
7318     return false;
7319   gcc_assert (alg != no_stringop);
7320 
7321   /* For now the vector version of memset is generated only for memory zeroing,
7322      as creating the promoted vector value is very cheap in this case.  */
7323   if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7324     alg = unrolled_loop;
7325 
7326   if (!count)
7327     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7328   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7329   if (!issetmem)
7330     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7331 
7332   unroll_factor = 1;
7333   move_mode = word_mode;
7334   switch (alg)
7335     {
7336     case libcall:
7337     case no_stringop:
7338     case last_alg:
7339       gcc_unreachable ();
7340     case loop_1_byte:
7341       need_zero_guard = true;
7342       move_mode = QImode;
7343       break;
7344     case loop:
7345       need_zero_guard = true;
7346       break;
7347     case unrolled_loop:
7348       need_zero_guard = true;
7349       unroll_factor = (TARGET_64BIT ? 4 : 2);
7350       break;
7351     case vector_loop:
7352       need_zero_guard = true;
7353       unroll_factor = 4;
7354       /* Find the widest supported mode.  */
7355       move_mode = word_mode;
7356       while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7357 	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7358 	move_mode = wider_mode;
7359 
7360       if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7361 	move_mode = TImode;
7362 
7363       /* Find the corresponding vector mode with the same size as MOVE_MODE.
7364 	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
7365       if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7366 	{
7367 	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7368 	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7369 	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7370 	    move_mode = word_mode;
7371 	}
7372       gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7373       break;
7374     case rep_prefix_8_byte:
7375       move_mode = DImode;
7376       break;
7377     case rep_prefix_4_byte:
7378       move_mode = SImode;
7379       break;
7380     case rep_prefix_1_byte:
7381       move_mode = QImode;
7382       break;
7383     }
7384   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7385   epilogue_size_needed = size_needed;
7386 
7387   /* If we are going to call any library calls conditionally, make sure any
7388      pending stack adjustment happen before the first conditional branch,
7389      otherwise they will be emitted before the library call only and won't
7390      happen from the other branches.  */
7391   if (dynamic_check != -1)
7392     do_pending_stack_adjust ();
7393 
7394   desired_align = decide_alignment (align, alg, expected_size, move_mode);
7395   if (!TARGET_ALIGN_STRINGOPS || noalign)
7396     align = desired_align;
7397 
7398   /* Step 1: Prologue guard.  */
7399 
7400   /* Alignment code needs count to be in register.  */
7401   if (CONST_INT_P (count_exp) && desired_align > align)
7402     {
7403       if (INTVAL (count_exp) > desired_align
7404 	  && INTVAL (count_exp) > size_needed)
7405 	{
7406 	  align_bytes
7407 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7408 	  if (align_bytes <= 0)
7409 	    align_bytes = 0;
7410 	  else
7411 	    align_bytes = desired_align - align_bytes;
7412 	}
7413       if (align_bytes == 0)
7414 	count_exp = force_reg (counter_mode (count_exp), count_exp);
7415     }
7416   gcc_assert (desired_align >= 1 && align >= 1);
7417 
7418   /* Misaligned move sequences handle both prologue and epilogue at once.
7419      Default code generation results in smaller code for large alignments
7420      and also avoids redundant work when sizes are known precisely.  */
7421   misaligned_prologue_used
7422     = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7423        && MAX (desired_align, epilogue_size_needed) <= 32
7424        && desired_align <= epilogue_size_needed
7425        && ((desired_align > align && !align_bytes)
7426 	   || (!count && epilogue_size_needed > 1)));
7427 
7428   /* Do the cheap promotion to allow better CSE across the
7429      main loop and epilogue (i.e. one load of the big constant in the
7430      front of all code).
7431      For now the misaligned move sequences do not have a fast path
7432      without broadcasting.  */
7433   if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7434     {
7435       if (alg == vector_loop)
7436 	{
7437 	  gcc_assert (val_exp == const0_rtx);
7438 	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7439 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
7440 							 GET_MODE_SIZE (word_mode),
7441 							 desired_align, align);
7442 	}
7443       else
7444 	{
7445 	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7446 							 desired_align, align);
7447 	}
7448     }
7449   /* Misaligned move sequences handle both prologues and epilogues at once.
7450      Default code generation results in smaller code for large alignments and
7451      also avoids redundant work when sizes are known precisely.  */
7452   if (misaligned_prologue_used)
7453     {
7454       /* The misaligned move prologue handles small blocks by itself.  */
7455       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7456 	   (dst, src, &destreg, &srcreg,
7457 	    move_mode, promoted_val, vec_promoted_val,
7458 	    &count_exp,
7459 	    &jump_around_label,
7460             desired_align < align
7461 	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7462 	    desired_align, align, &min_size, dynamic_check, issetmem);
7463       if (!issetmem)
7464         src = change_address (src, BLKmode, srcreg);
7465       dst = change_address (dst, BLKmode, destreg);
7466       set_mem_align (dst, desired_align * BITS_PER_UNIT);
7467       epilogue_size_needed = 0;
7468       if (need_zero_guard
7469 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
7470 	{
7471 	  /* It is possible that we copied enough so the main loop will not
7472 	     execute.  */
7473 	  gcc_assert (size_needed > 1);
7474 	  if (jump_around_label == NULL_RTX)
7475 	    jump_around_label = gen_label_rtx ();
7476 	  emit_cmp_and_jump_insns (count_exp,
7477 				   GEN_INT (size_needed),
7478 				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7479 	  if (expected_size == -1
7480 	      || expected_size < (desired_align - align) / 2 + size_needed)
7481 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7482 	  else
7483 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7484 	}
7485     }
7486   /* Ensure that alignment prologue won't copy past end of block.  */
7487   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7488     {
7489       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7490       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7491 	 Make sure it is power of 2.  */
7492       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
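      /* Worked example, added for illustration: with size_needed = 16 and
	 desired_align - align = 7, the MAX above yields 15 and
	 1 << (floor_log2 (15) + 1) rounds it up to the power of two 16.  */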
7493 
7494       /* To improve performance of small blocks, we jump around the VAL
7495 	 promoting code.  This means that if the promoted VAL is not constant,
7496 	 we might not use it in the epilogue and have to use the byte
7497 	 loop variant.  */
7498       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7499 	force_loopy_epilogue = true;
7500       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7501 	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7502 	{
7503 	  /* If main algorithm works on QImode, no epilogue is needed.
7504 	     For small sizes just don't align anything.  */
7505 	  if (size_needed == 1)
7506 	    desired_align = align;
7507 	  else
7508 	    goto epilogue;
7509 	}
7510       else if (!count
7511 	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7512 	{
7513 	  label = gen_label_rtx ();
7514 	  emit_cmp_and_jump_insns (count_exp,
7515 				   GEN_INT (epilogue_size_needed),
7516 				   LTU, 0, counter_mode (count_exp), 1, label);
7517 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
7518 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7519 	  else
7520 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7521 	}
7522     }
7523 
7524   /* Emit code to decide on runtime whether library call or inline should be
7525      used.  */
7526   if (dynamic_check != -1)
7527     {
7528       if (!issetmem && CONST_INT_P (count_exp))
7529 	{
7530 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7531 	    {
7532 	      emit_block_copy_via_libcall (dst, src, count_exp);
7533 	      count_exp = const0_rtx;
7534 	      goto epilogue;
7535 	    }
7536 	}
7537       else
7538 	{
7539 	  rtx_code_label *hot_label = gen_label_rtx ();
7540 	  if (jump_around_label == NULL_RTX)
7541 	    jump_around_label = gen_label_rtx ();
7542 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7543 				   LEU, 0, counter_mode (count_exp),
7544 				   1, hot_label);
7545 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
7546 	  if (issetmem)
7547 	    set_storage_via_libcall (dst, count_exp, val_exp);
7548 	  else
7549 	    emit_block_copy_via_libcall (dst, src, count_exp);
7550 	  emit_jump (jump_around_label);
7551 	  emit_label (hot_label);
7552 	}
7553     }
7554 
7555   /* Step 2: Alignment prologue.  */
7556   /* Do the expensive promotion once we branched off the small blocks.  */
7557   if (issetmem && !promoted_val)
7558     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7559 						   desired_align, align);
7560 
7561   if (desired_align > align && !misaligned_prologue_used)
7562     {
7563       if (align_bytes == 0)
7564 	{
7565 	  /* Except for the first move in the prologue, we no longer know
7566 	     the constant offset in aliasing info.  It doesn't seem worth
7567 	     the pain to maintain it for the first move, so throw away
7568 	     the info early.  */
7569 	  dst = change_address (dst, BLKmode, destreg);
7570 	  if (!issetmem)
7571 	    src = change_address (src, BLKmode, srcreg);
7572 	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7573 					    promoted_val, vec_promoted_val,
7574 					    count_exp, align, desired_align,
7575 					    issetmem);
7576 	  /* At most desired_align - align bytes are copied.  */
7577 	  if (min_size < (unsigned)(desired_align - align))
7578 	    min_size = 0;
7579 	  else
7580 	    min_size -= desired_align - align;
7581 	}
7582       else
7583 	{
7584 	  /* If we know how many bytes need to be stored before dst is
7585 	     sufficiently aligned, maintain aliasing info accurately.  */
7586 	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7587 							   srcreg,
7588 							   promoted_val,
7589 							   vec_promoted_val,
7590 							   desired_align,
7591 							   align_bytes,
7592 							   issetmem);
7593 
7594 	  count_exp = plus_constant (counter_mode (count_exp),
7595 				     count_exp, -align_bytes);
7596 	  count -= align_bytes;
7597 	  min_size -= align_bytes;
7598 	  max_size -= align_bytes;
7599 	}
7600       if (need_zero_guard
7601 	  && min_size < (unsigned HOST_WIDE_INT) size_needed
7602 	  && (count < (unsigned HOST_WIDE_INT) size_needed
7603 	      || (align_bytes == 0
7604 		  && count < ((unsigned HOST_WIDE_INT) size_needed
7605 			      + desired_align - align))))
7606 	{
7607 	  /* It is possible that we copied enough so the main loop will not
7608 	     execute.  */
7609 	  gcc_assert (size_needed > 1);
7610 	  if (label == NULL_RTX)
7611 	    label = gen_label_rtx ();
7612 	  emit_cmp_and_jump_insns (count_exp,
7613 				   GEN_INT (size_needed),
7614 				   LTU, 0, counter_mode (count_exp), 1, label);
7615 	  if (expected_size == -1
7616 	      || expected_size < (desired_align - align) / 2 + size_needed)
7617 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7618 	  else
7619 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7620 	}
7621     }
7622   if (label && size_needed == 1)
7623     {
7624       emit_label (label);
7625       LABEL_NUSES (label) = 1;
7626       label = NULL;
7627       epilogue_size_needed = 1;
7628       if (issetmem)
7629 	promoted_val = val_exp;
7630     }
7631   else if (label == NULL_RTX && !misaligned_prologue_used)
7632     epilogue_size_needed = size_needed;
7633 
7634   /* Step 3: Main loop.  */
7635 
7636   switch (alg)
7637     {
7638     case libcall:
7639     case no_stringop:
7640     case last_alg:
7641       gcc_unreachable ();
7642     case loop_1_byte:
7643     case loop:
7644     case unrolled_loop:
7645       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7646 				     count_exp, move_mode, unroll_factor,
7647 				     expected_size, issetmem);
7648       break;
7649     case vector_loop:
7650       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7651 				     vec_promoted_val, count_exp, move_mode,
7652 				     unroll_factor, expected_size, issetmem);
7653       break;
7654     case rep_prefix_8_byte:
7655     case rep_prefix_4_byte:
7656     case rep_prefix_1_byte:
7657       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7658 				       val_exp, count_exp, move_mode, issetmem);
7659       break;
7660     }
7661   /* Properly adjust the offsets of src and dest memory for aliasing.  */
7662   if (CONST_INT_P (count_exp))
7663     {
7664       if (!issetmem)
7665 	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7666 					    (count / size_needed) * size_needed);
7667       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7668 					  (count / size_needed) * size_needed);
7669     }
7670   else
7671     {
7672       if (!issetmem)
7673 	src = change_address (src, BLKmode, srcreg);
7674       dst = change_address (dst, BLKmode, destreg);
7675     }
7676 
7677   /* Step 4: Epilogue to copy the remaining bytes.  */
7678  epilogue:
7679   if (label)
7680     {
7681       /* When the main loop is done, COUNT_EXP might hold original count,
7682 	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7683 	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7684 	 bytes. Compensate if needed.  */
7685 
7686       if (size_needed < epilogue_size_needed)
7687 	{
7688 	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7689 				     GEN_INT (size_needed - 1), count_exp, 1,
7690 				     OPTAB_DIRECT);
7691 	  if (tmp != count_exp)
7692 	    emit_move_insn (count_exp, tmp);
7693 	}
7694       emit_label (label);
7695       LABEL_NUSES (label) = 1;
7696     }
7697 
7698   if (count_exp != const0_rtx && epilogue_size_needed > 1)
7699     {
7700       if (force_loopy_epilogue)
7701 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7702 					 epilogue_size_needed);
7703       else
7704 	{
7705 	  if (issetmem)
7706 	    expand_setmem_epilogue (dst, destreg, promoted_val,
7707 				    vec_promoted_val, count_exp,
7708 				    epilogue_size_needed);
7709 	  else
7710 	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7711 				    epilogue_size_needed);
7712 	}
7713     }
7714   if (jump_around_label)
7715     emit_label (jump_around_label);
7716   return true;
7717 }
7718 
7719 /* Expand cmpstrn or memcmp.  */
7720 
7721 bool
7722 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
7723 			       rtx length, rtx align, bool is_cmpstrn)
7724 {
7725   /* Expand strncmp and memcmp only with -minline-all-stringops since
7726      "repz cmpsb" can be much slower than strncmp and memcmp functions
7727      implemented with vector instructions, see
7728 
7729      https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7730    */
7731   if (!TARGET_INLINE_ALL_STRINGOPS)
7732     return false;
7733 
7734   /* Can't use this if the user has appropriated ecx, esi or edi.  */
7735   if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
7736     return false;
7737 
7738   if (is_cmpstrn)
7739     {
7740       /* For strncmp, length is the maximum length, which can be larger
7741 	 than actual string lengths.  We can expand the cmpstrn pattern
7742 	 to "repz cmpsb" only if one of the strings is a constant so
7743 	 that expand_builtin_strncmp() can write the length argument to
7744 	 be the minimum of the const string length and the actual length
7745 	 argument.  Otherwise, "repz cmpsb" may run past the 0 byte.  */
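      /* Illustrative example, not from the original sources: for
	 strncmp (s, "abc", 100) expand_builtin_strncmp () can clamp the
	 length to the constant string's length, so "repz cmpsb" stops at
	 the terminating NUL; for strncmp (s1, s2, 100) with two unknown
	 strings no such clamp is possible and we return false below.  */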
7746       tree t1 = MEM_EXPR (src1);
7747       tree t2 = MEM_EXPR (src2);
7748       if (!((t1 && TREE_CODE (t1) == MEM_REF
7749 	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
7750 	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
7751 		 == STRING_CST))
7752 	    || (t2 && TREE_CODE (t2) == MEM_REF
7753 		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
7754 		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
7755 		    == STRING_CST))))
7756 	return false;
7757     }
7758 
7759   rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
7760   rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
7761   if (addr1 != XEXP (src1, 0))
7762     src1 = replace_equiv_address_nv (src1, addr1);
7763   if (addr2 != XEXP (src2, 0))
7764     src2 = replace_equiv_address_nv (src2, addr2);
7765 
7766   /* NB: Make a copy of the data length to avoid changing the original
7767      data length by cmpstrnqi patterns.  */
7768   length = ix86_zero_extend_to_Pmode (length);
7769   rtx lengthreg = gen_reg_rtx (Pmode);
7770   emit_move_insn (lengthreg, length);
7771 
7772   /* If we are testing strict equality, we can use known alignment to
7773      good advantage.  This may be possible with combine, particularly
7774      once cc0 is dead.  */
7775   if (CONST_INT_P (length))
7776     {
7777       if (length == const0_rtx)
7778 	{
7779 	  emit_move_insn (result, const0_rtx);
7780 	  return true;
7781 	}
7782       emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
7783 				     src1, src2));
7784     }
7785   else
7786     {
7787       emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
7788       emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
7789 				  src1, src2));
7790     }
7791 
7792   rtx out = gen_lowpart (QImode, result);
7793   emit_insn (gen_cmpintqi (out));
7794   emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
7795 
7796   return true;
7797 }
7798 
7799 /* Expand the appropriate insns for doing strlen if not just doing
7800    repnz; scasb
7801 
7802    out = result, initialized with the start address
7803    align_rtx = alignment of the address.
7804    scratch = scratch register, initialized with the start address when
7805 	not aligned, otherwise undefined
7806 
7807    This is just the body. It needs the initializations mentioned above and
7808    some address computing at the end.  These things are done in i386.md.  */
7809 
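/* Sketch of the strategy used below, added for illustration only:

       char *p = start;
       while ((uintptr_t) p & 3)	// check up to 3 unaligned bytes
	 {
	   if (*p == 0)
	     goto done;
	   p++;
	 }
       for (;;)				// then 4 bytes per iteration
	 {
	   unsigned w = *(unsigned *) p;
	   p += 4;
	   if ((w - 0x01010101) & ~w & 0x80808080)	// some byte is 0
	     break;
	 }
       // finally step back to the exact position of the 0 byte
     done: ;  */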
7810 static void
7811 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7812 {
7813   int align;
7814   rtx tmp;
7815   rtx_code_label *align_2_label = NULL;
7816   rtx_code_label *align_3_label = NULL;
7817   rtx_code_label *align_4_label = gen_label_rtx ();
7818   rtx_code_label *end_0_label = gen_label_rtx ();
7819   rtx mem;
7820   rtx tmpreg = gen_reg_rtx (SImode);
7821   rtx scratch = gen_reg_rtx (SImode);
7822   rtx cmp;
7823 
7824   align = 0;
7825   if (CONST_INT_P (align_rtx))
7826     align = INTVAL (align_rtx);
7827 
7828   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
7829 
7830   /* Is there a known alignment and is it less than 4?  */
7831   if (align < 4)
7832     {
7833       rtx scratch1 = gen_reg_rtx (Pmode);
7834       emit_move_insn (scratch1, out);
7835       /* Is there a known alignment and is it not 2? */
7836       if (align != 2)
7837 	{
7838 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7839 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7840 
7841 	  /* Leave just the 3 lower bits.  */
7842 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7843 				    NULL_RTX, 0, OPTAB_WIDEN);
7844 
7845 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7846 				   Pmode, 1, align_4_label);
7847 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7848 				   Pmode, 1, align_2_label);
7849 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7850 				   Pmode, 1, align_3_label);
7851 	}
7852       else
7853         {
7854 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
7855 	     check whether it is aligned to 4 bytes.  */
7856 
7857 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7858 				    NULL_RTX, 0, OPTAB_WIDEN);
7859 
7860 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7861 				   Pmode, 1, align_4_label);
7862         }
7863 
7864       mem = change_address (src, QImode, out);
7865 
7866       /* Now compare the bytes.  */
7867 
7868       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7869       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7870 			       QImode, 1, end_0_label);
7871 
7872       /* Increment the address.  */
7873       emit_insn (gen_add2_insn (out, const1_rtx));
7874 
7875       /* Not needed with an alignment of 2.  */
7876       if (align != 2)
7877 	{
7878 	  emit_label (align_2_label);
7879 
7880 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7881 				   end_0_label);
7882 
7883 	  emit_insn (gen_add2_insn (out, const1_rtx));
7884 
7885 	  emit_label (align_3_label);
7886 	}
7887 
7888       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7889 			       end_0_label);
7890 
7891       emit_insn (gen_add2_insn (out, const1_rtx));
7892     }
7893 
7894   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
7895      align this loop; that only makes the program larger and does not help
7896      to speed it up.  */
7897   emit_label (align_4_label);
7898 
7899   mem = change_address (src, SImode, out);
7900   emit_move_insn (scratch, mem);
7901   emit_insn (gen_add2_insn (out, GEN_INT (4)));
7902 
7903   /* This formula yields a nonzero result iff one of the bytes is zero.
7904      This saves three branches inside the loop and many cycles.  */
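  /* Worked example, added for illustration: for the word 0x41004241, whose
     second-highest byte is zero,
	 (0x41004241 - 0x01010101) & ~0x41004241 & 0x80808080 == 0x00800000,
     which is nonzero, while for 0x41424344 (no zero byte) the same
     expression evaluates to 0.  */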
7905 
7906   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7907   emit_insn (gen_one_cmplsi2 (scratch, scratch));
7908   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7909   emit_insn (gen_andsi3 (tmpreg, tmpreg,
7910 			 gen_int_mode (0x80808080, SImode)));
7911   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7912 			   align_4_label);
7913 
7914   if (TARGET_CMOVE)
7915     {
7916        rtx reg = gen_reg_rtx (SImode);
7917        rtx reg2 = gen_reg_rtx (Pmode);
7918        emit_move_insn (reg, tmpreg);
7919        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7920 
7921        /* If zero is not in the first two bytes, move two bytes forward.  */
7922        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7923        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7924        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7925        emit_insn (gen_rtx_SET (tmpreg,
7926 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
7927 						     reg,
7928 						     tmpreg)));
7929        /* Emit lea manually to avoid clobbering of flags.  */
7930        emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7931 
7932        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7933        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7934        emit_insn (gen_rtx_SET (out,
7935 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7936 						     reg2,
7937 						     out)));
7938     }
7939   else
7940     {
7941        rtx_code_label *end_2_label = gen_label_rtx ();
7942        /* Is zero in the first two bytes? */
7943 
7944        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7945        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7946        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7947        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7948                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7949                             pc_rtx);
7950        tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7951        JUMP_LABEL (tmp) = end_2_label;
7952 
7953        /* Not in the first two.  Move two bytes forward.  */
7954        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7955        emit_insn (gen_add2_insn (out, const2_rtx));
7956 
7957        emit_label (end_2_label);
7958 
7959     }
7960 
7961   /* Avoid branch in fixing the byte.  */
7962   tmpreg = gen_lowpart (QImode, tmpreg);
7963   emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7964   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7965   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7966   emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7967 
7968   emit_label (end_0_label);
7969 }
7970 
7971 /* Expand strlen.  */
7972 
7973 bool
7974 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7975 {
7976   if (TARGET_UNROLL_STRLEN
7977       && TARGET_INLINE_ALL_STRINGOPS
7978       && eoschar == const0_rtx
7979       && optimize > 1)
7980     {
7981       /* The generic case of the strlen expander is long.  Avoid its
7982 	 expansion unless TARGET_INLINE_ALL_STRINGOPS.  */
7983       rtx addr = force_reg (Pmode, XEXP (src, 0));
7984       /* Well it seems that some optimizer does not combine a call like
7985 	 foo(strlen(bar), strlen(bar));
7986 	 when the move and the subtraction are done here.  It does calculate
7987 	 the length just once when these instructions are done inside of
7988 	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
7989 	 often used and I use one fewer register for the lifetime of
7990 	 output_strlen_unroll() this is better.  */
7991 
7992       emit_move_insn (out, addr);
7993 
7994       ix86_expand_strlensi_unroll_1 (out, src, align);
7995 
7996       /* strlensi_unroll_1 returns the address of the zero at the end of
7997 	 the string, like memchr(), so compute the length by subtracting
7998 	 the start address.  */
7999       emit_insn (gen_sub2_insn (out, addr));
8000       return true;
8001     }
8002   else
8003     return false;
8004 }
8005 
8006 /* For a given symbol (function), construct code to compute the address of its
8007    PLT entry in the large x86-64 PIC model.  */
8008 
8009 static rtx
8010 construct_plt_address (rtx symbol)
8011 {
8012   rtx tmp, unspec;
8013 
8014   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
8015   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
8016   gcc_assert (Pmode == DImode);
8017 
8018   tmp = gen_reg_rtx (Pmode);
8019   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
8020 
8021   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
8022   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
8023   return tmp;
8024 }
8025 
8026 /* Additional registers that are clobbered by SYSV calls.  */
8027 
8028 static int const x86_64_ms_sysv_extra_clobbered_registers
8029 		 [NUM_X86_64_MS_CLOBBERED_REGS] =
8030 {
8031   SI_REG, DI_REG,
8032   XMM6_REG, XMM7_REG,
8033   XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
8034   XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
8035 };
8036 
8037 rtx_insn *
8038 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
8039 		  rtx callarg2,
8040 		  rtx pop, bool sibcall)
8041 {
8042   rtx vec[3];
8043   rtx use = NULL, call;
8044   unsigned int vec_len = 0;
8045   tree fndecl;
8046 
8047   if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8048     {
8049       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
8050       if (fndecl
8051 	  && (lookup_attribute ("interrupt",
8052 				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
8053 	error ("interrupt service routine cannot be called directly");
8054     }
8055   else
8056     fndecl = NULL_TREE;
8057 
8058   if (pop == const0_rtx)
8059     pop = NULL;
8060   gcc_assert (!TARGET_64BIT || !pop);
8061 
8062   rtx addr = XEXP (fnaddr, 0);
8063   if (TARGET_MACHO && !TARGET_64BIT)
8064     {
8065 #if TARGET_MACHO
8066       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8067 	fnaddr = machopic_indirect_call_target (fnaddr);
8068 #endif
8069     }
8070   else
8071     {
8072       /* Static functions and indirect calls don't need the pic register.  Also,
8073 	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8074 	 it an indirect call.  */
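      /* For example (an added note, assuming the x86-64 small PIC model):
	 with -fno-plt a call to an external function foo becomes an
	 indirect call through the GOT, roughly "call *foo@GOTPCREL(%rip)",
	 which is what the UNSPEC_GOTPCREL branch below constructs.  */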
8075       if (flag_pic
8076 	  && GET_CODE (addr) == SYMBOL_REF
8077 	  && !SYMBOL_REF_LOCAL_P (addr))
8078 	{
8079 	  if (flag_plt
8080 	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
8081 		  || !lookup_attribute ("noplt",
8082 					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
8083 	    {
8084 	      if (!TARGET_64BIT
8085 		  || (ix86_cmodel == CM_LARGE_PIC
8086 		      && DEFAULT_ABI != MS_ABI))
8087 		{
8088 		  use_reg (&use, gen_rtx_REG (Pmode,
8089 					      REAL_PIC_OFFSET_TABLE_REGNUM));
8090 		  if (ix86_use_pseudo_pic_reg ())
8091 		    emit_move_insn (gen_rtx_REG (Pmode,
8092 						 REAL_PIC_OFFSET_TABLE_REGNUM),
8093 				    pic_offset_table_rtx);
8094 		}
8095 	    }
8096 	  else if (!TARGET_PECOFF && !TARGET_MACHO)
8097 	    {
8098 	      if (TARGET_64BIT
8099 		  && ix86_cmodel == CM_LARGE_PIC
8100 		  && DEFAULT_ABI != MS_ABI)
8101 		{
8102 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8103 					   UNSPEC_GOT);
8104 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8105 		  fnaddr = force_reg (Pmode, fnaddr);
8106 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
8107 		}
8108 	      else if (TARGET_64BIT)
8109 		{
8110 		  fnaddr = gen_rtx_UNSPEC (Pmode,
8111 					   gen_rtvec (1, addr),
8112 					   UNSPEC_GOTPCREL);
8113 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8114 		}
8115 	      else
8116 		{
8117 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8118 					   UNSPEC_GOT);
8119 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8120 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8121 					 fnaddr);
8122 		}
8123 	      fnaddr = gen_const_mem (Pmode, fnaddr);
8124 	      /* Pmode may not be the same as word_mode for x32, which
8125 		 doesn't support indirect branch via 32-bit memory slot.
8126 		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8127 		 indirect branch via x32 GOT slot is OK.  */
8128 	      if (GET_MODE (fnaddr) != word_mode)
8129 		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8130 	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
8131 	    }
8132 	}
8133     }
8134 
8135   /* Skip setting up RAX register for -mskip-rax-setup when there are no
8136      parameters passed in vector registers.  */
8137   if (TARGET_64BIT
8138       && (INTVAL (callarg2) > 0
8139 	  || (INTVAL (callarg2) == 0
8140 	      && (TARGET_SSE || !flag_skip_rax_setup))))
8141     {
8142       rtx al = gen_rtx_REG (QImode, AX_REG);
8143       emit_move_insn (al, callarg2);
8144       use_reg (&use, al);
8145     }
8146 
8147   if (ix86_cmodel == CM_LARGE_PIC
8148       && !TARGET_PECOFF
8149       && MEM_P (fnaddr)
8150       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8151       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8152     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8153   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8154      branch via x32 GOT slot is OK.  */
8155   else if (!(TARGET_X32
8156 	     && MEM_P (fnaddr)
8157 	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8158 	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8159 	   && (sibcall
8160 	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8161 	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8162     {
8163       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8164       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8165     }
8166 
8167   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8168 
8169   if (retval)
8170     call = gen_rtx_SET (retval, call);
8171   vec[vec_len++] = call;
8172 
8173   if (pop)
8174     {
8175       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8176       pop = gen_rtx_SET (stack_pointer_rtx, pop);
8177       vec[vec_len++] = pop;
8178     }
8179 
8180   if (cfun->machine->no_caller_saved_registers
8181       && (!fndecl
8182 	  || (!TREE_THIS_VOLATILE (fndecl)
8183 	      && !lookup_attribute ("no_caller_saved_registers",
8184 				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8185     {
8186       static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8187       bool is_64bit_ms_abi = (TARGET_64BIT
8188 			      && ix86_function_abi (fndecl) == MS_ABI);
8189       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8190 
8191       /* If there are no caller-saved registers, add all registers
8192 	 that are clobbered by the call which returns.  */
8193       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8194 	if (!fixed_regs[i]
8195 	    && (ix86_call_used_regs[i] == 1
8196 		|| (ix86_call_used_regs[i] & c_mask))
8197 	    && !STACK_REGNO_P (i)
8198 	    && !MMX_REGNO_P (i))
8199 	  clobber_reg (&use,
8200 		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8201     }
8202   else if (TARGET_64BIT_MS_ABI
8203 	   && (!callarg2 || INTVAL (callarg2) != -2))
8204     {
8205       unsigned i;
8206 
8207       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8208 	{
8209 	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8210 	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8211 
8212 	  clobber_reg (&use, gen_rtx_REG (mode, regno));
8213 	}
8214 
8215       /* Set here, but it may get cleared later.  */
8216       if (TARGET_CALL_MS2SYSV_XLOGUES)
8217 	{
8218 	  if (!TARGET_SSE)
8219 	    ;
8220 
8221 	  /* Don't break hot-patched functions.  */
8222 	  else if (ix86_function_ms_hook_prologue (current_function_decl))
8223 	    ;
8224 
8225 	  /* TODO: Cases not yet examined.  */
8226 	  else if (flag_split_stack)
8227 	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8228 
8229 	  else
8230 	    {
8231 	      gcc_assert (!reload_completed);
8232 	      cfun->machine->call_ms2sysv = true;
8233 	    }
8234 	}
8235     }
8236 
8237   if (TARGET_MACHO && TARGET_64BIT && !sibcall
8238       && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
8239 	  || !fndecl || TREE_PUBLIC (fndecl)))
8240     {
8241       /* We allow public functions defined in a TU to bind locally for PIC
8242 	 code (the default) on 64bit Mach-O.
8243 	 If such functions are not inlined, we cannot tell at compile-time if
8244 	 they will be called via the lazy symbol resolver (this can depend on
8245 	 options given at link-time).  Therefore, we must assume that the lazy
8246 	 resolver could be used which clobbers R11 and R10.  */
8247       clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
8248       clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
8249     }
8250 
8251   if (vec_len > 1)
8252     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8253   rtx_insn *call_insn = emit_call_insn (call);
8254   if (use)
8255     CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8256 
8257   return call_insn;
8258 }
8259 
8260 /* Split a simple return popping POPC bytes from the stack into an indirect
8261    branch with stack adjustment.  */
8262 
8263 void
8264 ix86_split_simple_return_pop_internal (rtx popc)
8265 {
8266   struct machine_function *m = cfun->machine;
8267   rtx ecx = gen_rtx_REG (SImode, CX_REG);
8268   rtx_insn *insn;
8269 
8270   /* There is no "pascal" calling convention in any 64bit ABI.  */
8271   gcc_assert (!TARGET_64BIT);
8272 
8273   insn = emit_insn (gen_pop (ecx));
8274   m->fs.cfa_offset -= UNITS_PER_WORD;
8275   m->fs.sp_offset -= UNITS_PER_WORD;
8276 
8277   rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8278   x = gen_rtx_SET (stack_pointer_rtx, x);
8279   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8280   add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8281   RTX_FRAME_RELATED_P (insn) = 1;
8282 
8283   x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8284   x = gen_rtx_SET (stack_pointer_rtx, x);
8285   insn = emit_insn (x);
8286   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8287   RTX_FRAME_RELATED_P (insn) = 1;
8288 
8289   /* Now return address is in ECX.  */
8290   emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8291 }
8292 
8293 /* Errors in the source file can cause expand_expr to return const0_rtx
8294    where we expect a vector.  To avoid crashing, use one of the vector
8295    clear instructions.  */
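/* For example, safe_vector_operand (const0_rtx, V4SImode) returns
   CONST0_RTX (V4SImode), a zero vector that the move patterns can
   materialize with a register-clearing instruction.  */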
8296 
8297 static rtx
8298 safe_vector_operand (rtx x, machine_mode mode)
8299 {
8300   if (x == const0_rtx)
8301     x = CONST0_RTX (mode);
8302   return x;
8303 }
8304 
8305 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
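/* Illustrative example: for a two-operand builtin such as
   __builtin_ia32_paddw128 (a binop entry in i386-builtin.def), the net
   effect of the expansion below is a single insn of the shape

       (set (reg:V8HI target)
	    (plus:V8HI (reg:V8HI op0) (reg:V8HI op1)))

   with operands copied into registers whenever they fail the insn
   predicates.  */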
8306 
8307 static rtx
8308 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8309 {
8310   rtx pat;
8311   tree arg0 = CALL_EXPR_ARG (exp, 0);
8312   tree arg1 = CALL_EXPR_ARG (exp, 1);
8313   rtx op0 = expand_normal (arg0);
8314   rtx op1 = expand_normal (arg1);
8315   machine_mode tmode = insn_data[icode].operand[0].mode;
8316   machine_mode mode0 = insn_data[icode].operand[1].mode;
8317   machine_mode mode1 = insn_data[icode].operand[2].mode;
8318 
8319   if (VECTOR_MODE_P (mode0))
8320     op0 = safe_vector_operand (op0, mode0);
8321   if (VECTOR_MODE_P (mode1))
8322     op1 = safe_vector_operand (op1, mode1);
8323 
8324   if (optimize || !target
8325       || GET_MODE (target) != tmode
8326       || !insn_data[icode].operand[0].predicate (target, tmode))
8327     target = gen_reg_rtx (tmode);
8328 
8329   if (GET_MODE (op1) == SImode && mode1 == TImode)
8330     {
8331       rtx x = gen_reg_rtx (V4SImode);
8332       emit_insn (gen_sse2_loadd (x, op1));
8333       op1 = gen_lowpart (TImode, x);
8334     }
8335 
8336   if (!insn_data[icode].operand[1].predicate (op0, mode0))
8337     op0 = copy_to_mode_reg (mode0, op0);
8338   if (!insn_data[icode].operand[2].predicate (op1, mode1))
8339     op1 = copy_to_mode_reg (mode1, op1);
8340 
8341   pat = GEN_FCN (icode) (target, op0, op1);
8342   if (! pat)
8343     return 0;
8344 
8345   emit_insn (pat);
8346 
8347   return target;
8348 }
8349 
8350 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
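/* The M_TYPE classification below only decides how many call arguments
   to expand and which extra operand, if any, to append:

     - the *_CMP types pass a comparison rtx built from SUB_CODE, i.e.
       GEN_FCN (icode) (target, (sub_code a b), a, b);
     - the *_TF types pass SUB_CODE itself as a const_int;
     - the *_IMM and *_I types require their last argument to be an
       immediate and diagnose or canonicalize it otherwise.  */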
8351 
8352 static rtx
8353 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8354 			       enum ix86_builtin_func_type m_type,
8355 			       enum rtx_code sub_code)
8356 {
8357   rtx pat;
8358   unsigned int i, nargs;
8359   bool comparison_p = false;
8360   bool tf_p = false;
8361   bool last_arg_constant = false;
8362   int num_memory = 0;
8363   rtx xops[4];
8364 
8365   machine_mode tmode = insn_data[icode].operand[0].mode;
8366 
8367   switch (m_type)
8368     {
8369     case MULTI_ARG_4_DF2_DI_I:
8370     case MULTI_ARG_4_DF2_DI_I1:
8371     case MULTI_ARG_4_SF2_SI_I:
8372     case MULTI_ARG_4_SF2_SI_I1:
8373       nargs = 4;
8374       last_arg_constant = true;
8375       break;
8376 
8377     case MULTI_ARG_3_SF:
8378     case MULTI_ARG_3_DF:
8379     case MULTI_ARG_3_SF2:
8380     case MULTI_ARG_3_DF2:
8381     case MULTI_ARG_3_DI:
8382     case MULTI_ARG_3_SI:
8383     case MULTI_ARG_3_SI_DI:
8384     case MULTI_ARG_3_HI:
8385     case MULTI_ARG_3_HI_SI:
8386     case MULTI_ARG_3_QI:
8387     case MULTI_ARG_3_DI2:
8388     case MULTI_ARG_3_SI2:
8389     case MULTI_ARG_3_HI2:
8390     case MULTI_ARG_3_QI2:
8391       nargs = 3;
8392       break;
8393 
8394     case MULTI_ARG_2_SF:
8395     case MULTI_ARG_2_DF:
8396     case MULTI_ARG_2_DI:
8397     case MULTI_ARG_2_SI:
8398     case MULTI_ARG_2_HI:
8399     case MULTI_ARG_2_QI:
8400       nargs = 2;
8401       break;
8402 
8403     case MULTI_ARG_2_DI_IMM:
8404     case MULTI_ARG_2_SI_IMM:
8405     case MULTI_ARG_2_HI_IMM:
8406     case MULTI_ARG_2_QI_IMM:
8407       nargs = 2;
8408       last_arg_constant = true;
8409       break;
8410 
8411     case MULTI_ARG_1_SF:
8412     case MULTI_ARG_1_DF:
8413     case MULTI_ARG_1_SF2:
8414     case MULTI_ARG_1_DF2:
8415     case MULTI_ARG_1_DI:
8416     case MULTI_ARG_1_SI:
8417     case MULTI_ARG_1_HI:
8418     case MULTI_ARG_1_QI:
8419     case MULTI_ARG_1_SI_DI:
8420     case MULTI_ARG_1_HI_DI:
8421     case MULTI_ARG_1_HI_SI:
8422     case MULTI_ARG_1_QI_DI:
8423     case MULTI_ARG_1_QI_SI:
8424     case MULTI_ARG_1_QI_HI:
8425       nargs = 1;
8426       break;
8427 
8428     case MULTI_ARG_2_DI_CMP:
8429     case MULTI_ARG_2_SI_CMP:
8430     case MULTI_ARG_2_HI_CMP:
8431     case MULTI_ARG_2_QI_CMP:
8432       nargs = 2;
8433       comparison_p = true;
8434       break;
8435 
8436     case MULTI_ARG_2_SF_TF:
8437     case MULTI_ARG_2_DF_TF:
8438     case MULTI_ARG_2_DI_TF:
8439     case MULTI_ARG_2_SI_TF:
8440     case MULTI_ARG_2_HI_TF:
8441     case MULTI_ARG_2_QI_TF:
8442       nargs = 2;
8443       tf_p = true;
8444       break;
8445 
8446     default:
8447       gcc_unreachable ();
8448     }
8449 
8450   if (optimize || !target
8451       || GET_MODE (target) != tmode
8452       || !insn_data[icode].operand[0].predicate (target, tmode))
8453     target = gen_reg_rtx (tmode);
8454   else if (memory_operand (target, tmode))
8455     num_memory++;
8456 
8457   gcc_assert (nargs <= ARRAY_SIZE (xops));
8458 
8459   for (i = 0; i < nargs; i++)
8460     {
8461       tree arg = CALL_EXPR_ARG (exp, i);
8462       rtx op = expand_normal (arg);
8463       int adjust = (comparison_p) ? 1 : 0;
8464       machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8465 
8466       if (last_arg_constant && i == nargs - 1)
8467 	{
8468 	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8469 	    {
8470 	      enum insn_code new_icode = icode;
8471 	      switch (icode)
8472 		{
8473 		case CODE_FOR_xop_vpermil2v2df3:
8474 		case CODE_FOR_xop_vpermil2v4sf3:
8475 		case CODE_FOR_xop_vpermil2v4df3:
8476 		case CODE_FOR_xop_vpermil2v8sf3:
8477 		  error ("the last argument must be a 2-bit immediate");
8478 		  return gen_reg_rtx (tmode);
8479 		case CODE_FOR_xop_rotlv2di3:
8480 		  new_icode = CODE_FOR_rotlv2di3;
8481 		  goto xop_rotl;
8482 		case CODE_FOR_xop_rotlv4si3:
8483 		  new_icode = CODE_FOR_rotlv4si3;
8484 		  goto xop_rotl;
8485 		case CODE_FOR_xop_rotlv8hi3:
8486 		  new_icode = CODE_FOR_rotlv8hi3;
8487 		  goto xop_rotl;
8488 		case CODE_FOR_xop_rotlv16qi3:
8489 		  new_icode = CODE_FOR_rotlv16qi3;
8490 		xop_rotl:
8491 		  if (CONST_INT_P (op))
8492 		    {
8493 		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8494 		      op = GEN_INT (INTVAL (op) & mask);
8495 		      gcc_checking_assert
8496 			(insn_data[icode].operand[i + 1].predicate (op, mode));
8497 		    }
8498 		  else
8499 		    {
8500 		      gcc_checking_assert
8501 			(nargs == 2
8502 			 && insn_data[new_icode].operand[0].mode == tmode
8503 			 && insn_data[new_icode].operand[1].mode == tmode
8504 			 && insn_data[new_icode].operand[2].mode == mode
8505 			 && insn_data[new_icode].operand[0].predicate
8506 			    == insn_data[icode].operand[0].predicate
8507 			 && insn_data[new_icode].operand[1].predicate
8508 			    == insn_data[icode].operand[1].predicate);
8509 		      icode = new_icode;
8510 		      goto non_constant;
8511 		    }
8512 		  break;
8513 		default:
8514 		  gcc_unreachable ();
8515 		}
8516 	    }
8517 	}
8518       else
8519 	{
8520 	non_constant:
8521 	  if (VECTOR_MODE_P (mode))
8522 	    op = safe_vector_operand (op, mode);
8523 
8524 	  /* If we aren't optimizing, only allow one memory operand to be
8525 	     generated.  */
8526 	  if (memory_operand (op, mode))
8527 	    num_memory++;
8528 
8529 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8530 
8531 	  if (optimize
8532 	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8533 	      || num_memory > 1)
8534 	    op = force_reg (mode, op);
8535 	}
8536 
8537       xops[i] = op;
8538     }
8539 
8540   switch (nargs)
8541     {
8542     case 1:
8543       pat = GEN_FCN (icode) (target, xops[0]);
8544       break;
8545 
8546     case 2:
8547       if (tf_p)
8548 	pat = GEN_FCN (icode) (target, xops[0], xops[1],
8549 			       GEN_INT ((int)sub_code));
8550       else if (! comparison_p)
8551 	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
8552       else
8553 	{
8554 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8555 				       xops[0], xops[1]);
8556 
8557 	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
8558 	}
8559       break;
8560 
8561     case 3:
8562       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
8563       break;
8564 
8565     case 4:
8566       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
8567       break;
8568 
8569     default:
8570       gcc_unreachable ();
8571     }
8572 
8573   if (! pat)
8574     return 0;
8575 
8576   emit_insn (pat);
8577   return target;
8578 }
8579 
8580 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8581    insns with vec_merge.  */
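/* Illustrative shape of such a pattern (e.g. the scalar sqrt builtin):

       (set (reg:V4SF target)
	    (vec_merge:V4SF (sqrt:V4SF (reg:V4SF op0))
			    (reg:V4SF op0)
			    (const_int 1)))

   Only element 0 is computed; the remaining elements are copied from
   OP0, which is why the same operand is passed twice below.  */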
8582 
8583 static rtx
8584 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8585 				    rtx target)
8586 {
8587   rtx pat;
8588   tree arg0 = CALL_EXPR_ARG (exp, 0);
8589   rtx op1, op0 = expand_normal (arg0);
8590   machine_mode tmode = insn_data[icode].operand[0].mode;
8591   machine_mode mode0 = insn_data[icode].operand[1].mode;
8592 
8593   if (optimize || !target
8594       || GET_MODE (target) != tmode
8595       || !insn_data[icode].operand[0].predicate (target, tmode))
8596     target = gen_reg_rtx (tmode);
8597 
8598   if (VECTOR_MODE_P (mode0))
8599     op0 = safe_vector_operand (op0, mode0);
8600 
8601   if ((optimize && !register_operand (op0, mode0))
8602       || !insn_data[icode].operand[1].predicate (op0, mode0))
8603     op0 = copy_to_mode_reg (mode0, op0);
8604 
8605   op1 = op0;
8606   if (!insn_data[icode].operand[2].predicate (op1, mode0))
8607     op1 = copy_to_mode_reg (mode0, op1);
8608 
8609   pat = GEN_FCN (icode) (target, op0, op1);
8610   if (! pat)
8611     return 0;
8612   emit_insn (pat);
8613   return target;
8614 }
8615 
8616 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
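/* The SWAP argument exists for builtins whose comparison has no direct
   SSE encoding; e.g. a greater-than builtin can be expanded with the
   less-than pattern once the operands are exchanged.  The concrete
   mapping is encoded in the builtin tables, not here.  */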
8617 
8618 static rtx
8619 ix86_expand_sse_compare (const struct builtin_description *d,
8620 			 tree exp, rtx target, bool swap)
8621 {
8622   rtx pat;
8623   tree arg0 = CALL_EXPR_ARG (exp, 0);
8624   tree arg1 = CALL_EXPR_ARG (exp, 1);
8625   rtx op0 = expand_normal (arg0);
8626   rtx op1 = expand_normal (arg1);
8627   rtx op2;
8628   machine_mode tmode = insn_data[d->icode].operand[0].mode;
8629   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8630   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8631   enum rtx_code comparison = d->comparison;
8632 
8633   if (VECTOR_MODE_P (mode0))
8634     op0 = safe_vector_operand (op0, mode0);
8635   if (VECTOR_MODE_P (mode1))
8636     op1 = safe_vector_operand (op1, mode1);
8637 
8638   /* Swap operands if we have a comparison that isn't available in
8639      hardware.  */
8640   if (swap)
8641     std::swap (op0, op1);
8642 
8643   if (optimize || !target
8644       || GET_MODE (target) != tmode
8645       || !insn_data[d->icode].operand[0].predicate (target, tmode))
8646     target = gen_reg_rtx (tmode);
8647 
8648   if ((optimize && !register_operand (op0, mode0))
8649       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8650     op0 = copy_to_mode_reg (mode0, op0);
8651   if ((optimize && !register_operand (op1, mode1))
8652       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8653     op1 = copy_to_mode_reg (mode1, op1);
8654 
8655   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8656   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8657   if (! pat)
8658     return 0;
8659   emit_insn (pat);
8660   return target;
8661 }
8662 
8663 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
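/* The comi patterns only set the flags register; the builtin's value is
   materialized below by a setcc-style store into the low byte of a
   zeroed SImode pseudo, which corresponds roughly to

       comiss	%xmm1, %xmm0
       setCC	%al

   for the V4SF case (register choice illustrative).  */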
8664 
8665 static rtx
8666 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8667 		      rtx target)
8668 {
8669   rtx pat;
8670   tree arg0 = CALL_EXPR_ARG (exp, 0);
8671   tree arg1 = CALL_EXPR_ARG (exp, 1);
8672   rtx op0 = expand_normal (arg0);
8673   rtx op1 = expand_normal (arg1);
8674   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8675   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8676   enum rtx_code comparison = d->comparison;
8677 
8678   if (VECTOR_MODE_P (mode0))
8679     op0 = safe_vector_operand (op0, mode0);
8680   if (VECTOR_MODE_P (mode1))
8681     op1 = safe_vector_operand (op1, mode1);
8682 
8683   target = gen_reg_rtx (SImode);
8684   emit_move_insn (target, const0_rtx);
8685   target = gen_rtx_SUBREG (QImode, target, 0);
8686 
8687   if ((optimize && !register_operand (op0, mode0))
8688       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8689     op0 = copy_to_mode_reg (mode0, op0);
8690   if ((optimize && !register_operand (op1, mode1))
8691       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8692     op1 = copy_to_mode_reg (mode1, op1);
8693 
8694   pat = GEN_FCN (d->icode) (op0, op1);
8695   if (! pat)
8696     return 0;
8697   emit_insn (pat);
8698   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8699 			  gen_rtx_fmt_ee (comparison, QImode,
8700 					  SET_DEST (pat),
8701 					  const0_rtx)));
8702 
8703   return SUBREG_REG (target);
8704 }
8705 
8706 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
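/* For these builtins D->comparison does not hold a comparison code;
   it carries the rounding-mode immediate that is appended as the last
   operand of the pattern (see the GEN_INT (d->comparison) below).  */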
8707 
8708 static rtx
8709 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8710 		       rtx target)
8711 {
8712   rtx pat;
8713   tree arg0 = CALL_EXPR_ARG (exp, 0);
8714   rtx op1, op0 = expand_normal (arg0);
8715   machine_mode tmode = insn_data[d->icode].operand[0].mode;
8716   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8717 
8718   if (optimize || target == 0
8719       || GET_MODE (target) != tmode
8720       || !insn_data[d->icode].operand[0].predicate (target, tmode))
8721     target = gen_reg_rtx (tmode);
8722 
8723   if (VECTOR_MODE_P (mode0))
8724     op0 = safe_vector_operand (op0, mode0);
8725 
8726   if ((optimize && !register_operand (op0, mode0))
8727       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8728     op0 = copy_to_mode_reg (mode0, op0);
8729 
8730   op1 = GEN_INT (d->comparison);
8731 
8732   pat = GEN_FCN (d->icode) (target, op0, op1);
8733   if (! pat)
8734     return 0;
8735   emit_insn (pat);
8736   return target;
8737 }
8738 
8739 static rtx
8740 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8741 				     tree exp, rtx target)
8742 {
8743   rtx pat;
8744   tree arg0 = CALL_EXPR_ARG (exp, 0);
8745   tree arg1 = CALL_EXPR_ARG (exp, 1);
8746   rtx op0 = expand_normal (arg0);
8747   rtx op1 = expand_normal (arg1);
8748   rtx op2;
8749   machine_mode tmode = insn_data[d->icode].operand[0].mode;
8750   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8751   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8752 
8753   if (optimize || target == 0
8754       || GET_MODE (target) != tmode
8755       || !insn_data[d->icode].operand[0].predicate (target, tmode))
8756     target = gen_reg_rtx (tmode);
8757 
8758   op0 = safe_vector_operand (op0, mode0);
8759   op1 = safe_vector_operand (op1, mode1);
8760 
8761   if ((optimize && !register_operand (op0, mode0))
8762       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8763     op0 = copy_to_mode_reg (mode0, op0);
8764   if ((optimize && !register_operand (op1, mode1))
8765       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8766     op1 = copy_to_mode_reg (mode1, op1);
8767 
8768   op2 = GEN_INT (d->comparison);
8769 
8770   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8771   if (! pat)
8772     return 0;
8773   emit_insn (pat);
8774   return target;
8775 }
8776 
8777 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
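/* As with the comi case, the ptest patterns only set the flags
   register; D->comparison selects which flag becomes the 0/1 result,
   for instance EQ for the testz forms, which test the zero flag.  */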
8778 
8779 static rtx
8780 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8781 		       rtx target)
8782 {
8783   rtx pat;
8784   tree arg0 = CALL_EXPR_ARG (exp, 0);
8785   tree arg1 = CALL_EXPR_ARG (exp, 1);
8786   rtx op0 = expand_normal (arg0);
8787   rtx op1 = expand_normal (arg1);
8788   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8789   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8790   enum rtx_code comparison = d->comparison;
8791 
8792   if (VECTOR_MODE_P (mode0))
8793     op0 = safe_vector_operand (op0, mode0);
8794   if (VECTOR_MODE_P (mode1))
8795     op1 = safe_vector_operand (op1, mode1);
8796 
8797   target = gen_reg_rtx (SImode);
8798   emit_move_insn (target, const0_rtx);
8799   target = gen_rtx_SUBREG (QImode, target, 0);
8800 
8801   if ((optimize && !register_operand (op0, mode0))
8802       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8803     op0 = copy_to_mode_reg (mode0, op0);
8804   if ((optimize && !register_operand (op1, mode1))
8805       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8806     op1 = copy_to_mode_reg (mode1, op1);
8807 
8808   pat = GEN_FCN (d->icode) (op0, op1);
8809   if (! pat)
8810     return 0;
8811   emit_insn (pat);
8812   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8813 			  gen_rtx_fmt_ee (comparison, QImode,
8814 					  SET_DEST (pat),
8815 					  const0_rtx)));
8816 
8817   return SUBREG_REG (target);
8818 }
8819 
8820 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
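/* These builtins come in three flavors, distinguished below by D->code
   and D->flag: pcmpestri returns the index (mode TMODE0), pcmpestrm
   returns the mask (mode TMODE1), and the remaining variants return a
   single condition bit extracted from the flags register.  */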
8821 
8822 static rtx
8823 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8824 			  tree exp, rtx target)
8825 {
8826   rtx pat;
8827   tree arg0 = CALL_EXPR_ARG (exp, 0);
8828   tree arg1 = CALL_EXPR_ARG (exp, 1);
8829   tree arg2 = CALL_EXPR_ARG (exp, 2);
8830   tree arg3 = CALL_EXPR_ARG (exp, 3);
8831   tree arg4 = CALL_EXPR_ARG (exp, 4);
8832   rtx scratch0, scratch1;
8833   rtx op0 = expand_normal (arg0);
8834   rtx op1 = expand_normal (arg1);
8835   rtx op2 = expand_normal (arg2);
8836   rtx op3 = expand_normal (arg3);
8837   rtx op4 = expand_normal (arg4);
8838   machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8839 
8840   tmode0 = insn_data[d->icode].operand[0].mode;
8841   tmode1 = insn_data[d->icode].operand[1].mode;
8842   modev2 = insn_data[d->icode].operand[2].mode;
8843   modei3 = insn_data[d->icode].operand[3].mode;
8844   modev4 = insn_data[d->icode].operand[4].mode;
8845   modei5 = insn_data[d->icode].operand[5].mode;
8846   modeimm = insn_data[d->icode].operand[6].mode;
8847 
8848   if (VECTOR_MODE_P (modev2))
8849     op0 = safe_vector_operand (op0, modev2);
8850   if (VECTOR_MODE_P (modev4))
8851     op2 = safe_vector_operand (op2, modev4);
8852 
8853   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8854     op0 = copy_to_mode_reg (modev2, op0);
8855   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8856     op1 = copy_to_mode_reg (modei3, op1);
8857   if ((optimize && !register_operand (op2, modev4))
8858       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8859     op2 = copy_to_mode_reg (modev4, op2);
8860   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8861     op3 = copy_to_mode_reg (modei5, op3);
8862 
8863   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8864     {
8865       error ("the fifth argument must be an 8-bit immediate");
8866       return const0_rtx;
8867     }
8868 
8869   if (d->code == IX86_BUILTIN_PCMPESTRI128)
8870     {
8871       if (optimize || !target
8872 	  || GET_MODE (target) != tmode0
8873 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8874 	target = gen_reg_rtx (tmode0);
8875 
8876       scratch1 = gen_reg_rtx (tmode1);
8877 
8878       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8879     }
8880   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8881     {
8882       if (optimize || !target
8883 	  || GET_MODE (target) != tmode1
8884 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8885 	target = gen_reg_rtx (tmode1);
8886 
8887       scratch0 = gen_reg_rtx (tmode0);
8888 
8889       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8890     }
8891   else
8892     {
8893       gcc_assert (d->flag);
8894 
8895       scratch0 = gen_reg_rtx (tmode0);
8896       scratch1 = gen_reg_rtx (tmode1);
8897 
8898       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8899     }
8900 
8901   if (! pat)
8902     return 0;
8903 
8904   emit_insn (pat);
8905 
8906   if (d->flag)
8907     {
8908       target = gen_reg_rtx (SImode);
8909       emit_move_insn (target, const0_rtx);
8910       target = gen_rtx_SUBREG (QImode, target, 0);
8911 
8912       emit_insn
8913 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8914 		      gen_rtx_fmt_ee (EQ, QImode,
8915 				      gen_rtx_REG ((machine_mode) d->flag,
8916 						   FLAGS_REG),
8917 				      const0_rtx)));
8918       return SUBREG_REG (target);
8919     }
8920   else
8921     return target;
8922 }
8923 
8924 
8925 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
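/* Same structure as the pcmpestr case above, minus the two explicit
   length operands.  */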
8926 
8927 static rtx
8928 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8929 			  tree exp, rtx target)
8930 {
8931   rtx pat;
8932   tree arg0 = CALL_EXPR_ARG (exp, 0);
8933   tree arg1 = CALL_EXPR_ARG (exp, 1);
8934   tree arg2 = CALL_EXPR_ARG (exp, 2);
8935   rtx scratch0, scratch1;
8936   rtx op0 = expand_normal (arg0);
8937   rtx op1 = expand_normal (arg1);
8938   rtx op2 = expand_normal (arg2);
8939   machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8940 
8941   tmode0 = insn_data[d->icode].operand[0].mode;
8942   tmode1 = insn_data[d->icode].operand[1].mode;
8943   modev2 = insn_data[d->icode].operand[2].mode;
8944   modev3 = insn_data[d->icode].operand[3].mode;
8945   modeimm = insn_data[d->icode].operand[4].mode;
8946 
8947   if (VECTOR_MODE_P (modev2))
8948     op0 = safe_vector_operand (op0, modev2);
8949   if (VECTOR_MODE_P (modev3))
8950     op1 = safe_vector_operand (op1, modev3);
8951 
8952   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8953     op0 = copy_to_mode_reg (modev2, op0);
8954   if ((optimize && !register_operand (op1, modev3))
8955       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8956     op1 = copy_to_mode_reg (modev3, op1);
8957 
8958   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8959     {
8960       error ("the third argument must be an 8-bit immediate");
8961       return const0_rtx;
8962     }
8963 
8964   if (d->code == IX86_BUILTIN_PCMPISTRI128)
8965     {
8966       if (optimize || !target
8967 	  || GET_MODE (target) != tmode0
8968 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8969 	target = gen_reg_rtx (tmode0);
8970 
8971       scratch1 = gen_reg_rtx (tmode1);
8972 
8973       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8974     }
8975   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8976     {
8977       if (optimize || !target
8978 	  || GET_MODE (target) != tmode1
8979 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8980 	target = gen_reg_rtx (tmode1);
8981 
8982       scratch0 = gen_reg_rtx (tmode0);
8983 
8984       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8985     }
8986   else
8987     {
8988       gcc_assert (d->flag);
8989 
8990       scratch0 = gen_reg_rtx (tmode0);
8991       scratch1 = gen_reg_rtx (tmode1);
8992 
8993       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8994     }
8995 
8996   if (! pat)
8997     return 0;
8998 
8999   emit_insn (pat);
9000 
9001   if (d->flag)
9002     {
9003       target = gen_reg_rtx (SImode);
9004       emit_move_insn (target, const0_rtx);
9005       target = gen_rtx_SUBREG (QImode, target, 0);
9006 
9007       emit_insn
9008 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9009 		      gen_rtx_fmt_ee (EQ, QImode,
9010 				      gen_rtx_REG ((machine_mode) d->flag,
9011 						   FLAGS_REG),
9012 				      const0_rtx)));
9013       return SUBREG_REG (target);
9014     }
9015   else
9016     return target;
9017 }
9018 
9019 /* Fix up modeless constants to fit the required mode.  */
9020 
9021 static rtx
9022 fixup_modeless_constant (rtx x, machine_mode mode)
9023 {
9024   if (GET_MODE (x) == VOIDmode)
9025     x = convert_to_mode (mode, x, 1);
9026   return x;
9027 }
9028 
9029 /* Subroutine of ix86_expand_builtin to take care of insns with a
9030    variable number of operands.  */
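/* Conventions used by the big switch below: NARGS is the number of
   call arguments to expand, NARGS_CONSTANT is how many trailing
   arguments must be immediates, MASK_POS shifts that check when mask
   operands follow them, and a non-VOIDmode RMODE marks builtins whose
   declared type differs from the mode of the insn pattern, which is
   bridged with lowpart subregs.  */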
9031 
9032 static rtx
9033 ix86_expand_args_builtin (const struct builtin_description *d,
9034 			  tree exp, rtx target)
9035 {
9036   rtx pat, real_target;
9037   unsigned int i, nargs;
9038   unsigned int nargs_constant = 0;
9039   unsigned int mask_pos = 0;
9040   int num_memory = 0;
9041   rtx xops[6];
9042   bool second_arg_count = false;
9043   enum insn_code icode = d->icode;
9044   const struct insn_data_d *insn_p = &insn_data[icode];
9045   machine_mode tmode = insn_p->operand[0].mode;
9046   machine_mode rmode = VOIDmode;
9047   bool swap = false;
9048   enum rtx_code comparison = d->comparison;
9049 
9050   switch ((enum ix86_builtin_func_type) d->flag)
9051     {
9052     case V2DF_FTYPE_V2DF_ROUND:
9053     case V4DF_FTYPE_V4DF_ROUND:
9054     case V8DF_FTYPE_V8DF_ROUND:
9055     case V4SF_FTYPE_V4SF_ROUND:
9056     case V8SF_FTYPE_V8SF_ROUND:
9057     case V16SF_FTYPE_V16SF_ROUND:
9058     case V4SI_FTYPE_V4SF_ROUND:
9059     case V8SI_FTYPE_V8SF_ROUND:
9060     case V16SI_FTYPE_V16SF_ROUND:
9061       return ix86_expand_sse_round (d, exp, target);
9062     case V4SI_FTYPE_V2DF_V2DF_ROUND:
9063     case V8SI_FTYPE_V4DF_V4DF_ROUND:
9064     case V16SI_FTYPE_V8DF_V8DF_ROUND:
9065       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
9066     case INT_FTYPE_V8SF_V8SF_PTEST:
9067     case INT_FTYPE_V4DI_V4DI_PTEST:
9068     case INT_FTYPE_V4DF_V4DF_PTEST:
9069     case INT_FTYPE_V4SF_V4SF_PTEST:
9070     case INT_FTYPE_V2DI_V2DI_PTEST:
9071     case INT_FTYPE_V2DF_V2DF_PTEST:
9072       return ix86_expand_sse_ptest (d, exp, target);
9073     case FLOAT128_FTYPE_FLOAT128:
9074     case FLOAT_FTYPE_FLOAT:
9075     case INT_FTYPE_INT:
9076     case UINT_FTYPE_UINT:
9077     case UINT16_FTYPE_UINT16:
9078     case UINT64_FTYPE_INT:
9079     case UINT64_FTYPE_UINT64:
9080     case INT64_FTYPE_INT64:
9081     case INT64_FTYPE_V4SF:
9082     case INT64_FTYPE_V2DF:
9083     case INT_FTYPE_V16QI:
9084     case INT_FTYPE_V8QI:
9085     case INT_FTYPE_V8SF:
9086     case INT_FTYPE_V4DF:
9087     case INT_FTYPE_V4SF:
9088     case INT_FTYPE_V2DF:
9089     case INT_FTYPE_V32QI:
9090     case V16QI_FTYPE_V16QI:
9091     case V8SI_FTYPE_V8SF:
9092     case V8SI_FTYPE_V4SI:
9093     case V8HI_FTYPE_V8HI:
9094     case V8HI_FTYPE_V16QI:
9095     case V8QI_FTYPE_V8QI:
9096     case V8SF_FTYPE_V8SF:
9097     case V8SF_FTYPE_V8SI:
9098     case V8SF_FTYPE_V4SF:
9099     case V8SF_FTYPE_V8HI:
9100     case V4SI_FTYPE_V4SI:
9101     case V4SI_FTYPE_V16QI:
9102     case V4SI_FTYPE_V4SF:
9103     case V4SI_FTYPE_V8SI:
9104     case V4SI_FTYPE_V8HI:
9105     case V4SI_FTYPE_V4DF:
9106     case V4SI_FTYPE_V2DF:
9107     case V4HI_FTYPE_V4HI:
9108     case V4DF_FTYPE_V4DF:
9109     case V4DF_FTYPE_V4SI:
9110     case V4DF_FTYPE_V4SF:
9111     case V4DF_FTYPE_V2DF:
9112     case V4SF_FTYPE_V4SF:
9113     case V4SF_FTYPE_V4SI:
9114     case V4SF_FTYPE_V8SF:
9115     case V4SF_FTYPE_V4DF:
9116     case V4SF_FTYPE_V8HI:
9117     case V4SF_FTYPE_V2DF:
9118     case V2DI_FTYPE_V2DI:
9119     case V2DI_FTYPE_V16QI:
9120     case V2DI_FTYPE_V8HI:
9121     case V2DI_FTYPE_V4SI:
9122     case V2DF_FTYPE_V2DF:
9123     case V2DF_FTYPE_V4SI:
9124     case V2DF_FTYPE_V4DF:
9125     case V2DF_FTYPE_V4SF:
9126     case V2DF_FTYPE_V2SI:
9127     case V2SI_FTYPE_V2SI:
9128     case V2SI_FTYPE_V4SF:
9129     case V2SI_FTYPE_V2SF:
9130     case V2SI_FTYPE_V2DF:
9131     case V2SF_FTYPE_V2SF:
9132     case V2SF_FTYPE_V2SI:
9133     case V32QI_FTYPE_V32QI:
9134     case V32QI_FTYPE_V16QI:
9135     case V16HI_FTYPE_V16HI:
9136     case V16HI_FTYPE_V8HI:
9137     case V8SI_FTYPE_V8SI:
9138     case V16HI_FTYPE_V16QI:
9139     case V8SI_FTYPE_V16QI:
9140     case V4DI_FTYPE_V16QI:
9141     case V8SI_FTYPE_V8HI:
9142     case V4DI_FTYPE_V8HI:
9143     case V4DI_FTYPE_V4SI:
9144     case V4DI_FTYPE_V2DI:
9145     case UQI_FTYPE_UQI:
9146     case UHI_FTYPE_UHI:
9147     case USI_FTYPE_USI:
9148     case USI_FTYPE_UQI:
9149     case USI_FTYPE_UHI:
9150     case UDI_FTYPE_UDI:
9151     case UHI_FTYPE_V16QI:
9152     case USI_FTYPE_V32QI:
9153     case UDI_FTYPE_V64QI:
9154     case V16QI_FTYPE_UHI:
9155     case V32QI_FTYPE_USI:
9156     case V64QI_FTYPE_UDI:
9157     case V8HI_FTYPE_UQI:
9158     case V16HI_FTYPE_UHI:
9159     case V32HI_FTYPE_USI:
9160     case V4SI_FTYPE_UQI:
9161     case V8SI_FTYPE_UQI:
9162     case V4SI_FTYPE_UHI:
9163     case V8SI_FTYPE_UHI:
9164     case UQI_FTYPE_V8HI:
9165     case UHI_FTYPE_V16HI:
9166     case USI_FTYPE_V32HI:
9167     case UQI_FTYPE_V4SI:
9168     case UQI_FTYPE_V8SI:
9169     case UHI_FTYPE_V16SI:
9170     case UQI_FTYPE_V2DI:
9171     case UQI_FTYPE_V4DI:
9172     case UQI_FTYPE_V8DI:
9173     case V16SI_FTYPE_UHI:
9174     case V2DI_FTYPE_UQI:
9175     case V4DI_FTYPE_UQI:
9176     case V16SI_FTYPE_INT:
9177     case V16SF_FTYPE_V8SF:
9178     case V16SI_FTYPE_V8SI:
9179     case V16SF_FTYPE_V4SF:
9180     case V16SI_FTYPE_V4SI:
9181     case V16SI_FTYPE_V16SF:
9182     case V16SI_FTYPE_V16SI:
9183     case V64QI_FTYPE_V64QI:
9184     case V32HI_FTYPE_V32HI:
9185     case V16SF_FTYPE_V16SF:
9186     case V8DI_FTYPE_UQI:
9187     case V8DI_FTYPE_V8DI:
9188     case V8DF_FTYPE_V4DF:
9189     case V8DF_FTYPE_V2DF:
9190     case V8DF_FTYPE_V8DF:
9191     case V4DI_FTYPE_V4DI:
9192     case V16HI_FTYPE_V16SF:
9193     case V8HI_FTYPE_V8SF:
9194     case V8HI_FTYPE_V4SF:
9195       nargs = 1;
9196       break;
9197     case V4SF_FTYPE_V4SF_VEC_MERGE:
9198     case V2DF_FTYPE_V2DF_VEC_MERGE:
9199       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9200     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9201     case V16QI_FTYPE_V16QI_V16QI:
9202     case V16QI_FTYPE_V8HI_V8HI:
9203     case V16SF_FTYPE_V16SF_V16SF:
9204     case V8QI_FTYPE_V8QI_V8QI:
9205     case V8QI_FTYPE_V4HI_V4HI:
9206     case V8HI_FTYPE_V8HI_V8HI:
9207     case V8HI_FTYPE_V16QI_V16QI:
9208     case V8HI_FTYPE_V4SI_V4SI:
9209     case V8SF_FTYPE_V8SF_V8SF:
9210     case V8SF_FTYPE_V8SF_V8SI:
9211     case V8DF_FTYPE_V8DF_V8DF:
9212     case V4SI_FTYPE_V4SI_V4SI:
9213     case V4SI_FTYPE_V8HI_V8HI:
9214     case V4SI_FTYPE_V2DF_V2DF:
9215     case V4HI_FTYPE_V4HI_V4HI:
9216     case V4HI_FTYPE_V8QI_V8QI:
9217     case V4HI_FTYPE_V2SI_V2SI:
9218     case V4DF_FTYPE_V4DF_V4DF:
9219     case V4DF_FTYPE_V4DF_V4DI:
9220     case V4SF_FTYPE_V4SF_V4SF:
9221     case V4SF_FTYPE_V4SF_V4SI:
9222     case V4SF_FTYPE_V4SF_V2SI:
9223     case V4SF_FTYPE_V4SF_V2DF:
9224     case V4SF_FTYPE_V4SF_UINT:
9225     case V4SF_FTYPE_V4SF_DI:
9226     case V4SF_FTYPE_V4SF_SI:
9227     case V2DI_FTYPE_V2DI_V2DI:
9228     case V2DI_FTYPE_V16QI_V16QI:
9229     case V2DI_FTYPE_V4SI_V4SI:
9230     case V2DI_FTYPE_V2DI_V16QI:
9231     case V2SI_FTYPE_V2SI_V2SI:
9232     case V2SI_FTYPE_V4HI_V4HI:
9233     case V2SI_FTYPE_V2SF_V2SF:
9234     case V2DF_FTYPE_V2DF_V2DF:
9235     case V2DF_FTYPE_V2DF_V4SF:
9236     case V2DF_FTYPE_V2DF_V2DI:
9237     case V2DF_FTYPE_V2DF_DI:
9238     case V2DF_FTYPE_V2DF_SI:
9239     case V2DF_FTYPE_V2DF_UINT:
9240     case V2SF_FTYPE_V2SF_V2SF:
9241     case V1DI_FTYPE_V1DI_V1DI:
9242     case V1DI_FTYPE_V8QI_V8QI:
9243     case V1DI_FTYPE_V2SI_V2SI:
9244     case V32QI_FTYPE_V16HI_V16HI:
9245     case V16HI_FTYPE_V8SI_V8SI:
9246     case V64QI_FTYPE_V64QI_V64QI:
9247     case V32QI_FTYPE_V32QI_V32QI:
9248     case V16HI_FTYPE_V32QI_V32QI:
9249     case V16HI_FTYPE_V16HI_V16HI:
9250     case V8SI_FTYPE_V4DF_V4DF:
9251     case V8SI_FTYPE_V8SI_V8SI:
9252     case V8SI_FTYPE_V16HI_V16HI:
9253     case V4DI_FTYPE_V4DI_V4DI:
9254     case V4DI_FTYPE_V8SI_V8SI:
9255     case V8DI_FTYPE_V64QI_V64QI:
9256       if (comparison == UNKNOWN)
9257 	return ix86_expand_binop_builtin (icode, exp, target);
9258       nargs = 2;
9259       break;
9260     case V4SF_FTYPE_V4SF_V4SF_SWAP:
9261     case V2DF_FTYPE_V2DF_V2DF_SWAP:
9262       gcc_assert (comparison != UNKNOWN);
9263       nargs = 2;
9264       swap = true;
9265       break;
9266     case V16HI_FTYPE_V16HI_V8HI_COUNT:
9267     case V16HI_FTYPE_V16HI_SI_COUNT:
9268     case V8SI_FTYPE_V8SI_V4SI_COUNT:
9269     case V8SI_FTYPE_V8SI_SI_COUNT:
9270     case V4DI_FTYPE_V4DI_V2DI_COUNT:
9271     case V4DI_FTYPE_V4DI_INT_COUNT:
9272     case V8HI_FTYPE_V8HI_V8HI_COUNT:
9273     case V8HI_FTYPE_V8HI_SI_COUNT:
9274     case V4SI_FTYPE_V4SI_V4SI_COUNT:
9275     case V4SI_FTYPE_V4SI_SI_COUNT:
9276     case V4HI_FTYPE_V4HI_V4HI_COUNT:
9277     case V4HI_FTYPE_V4HI_SI_COUNT:
9278     case V2DI_FTYPE_V2DI_V2DI_COUNT:
9279     case V2DI_FTYPE_V2DI_SI_COUNT:
9280     case V2SI_FTYPE_V2SI_V2SI_COUNT:
9281     case V2SI_FTYPE_V2SI_SI_COUNT:
9282     case V1DI_FTYPE_V1DI_V1DI_COUNT:
9283     case V1DI_FTYPE_V1DI_SI_COUNT:
9284       nargs = 2;
9285       second_arg_count = true;
9286       break;
9287     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9288     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9289     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9290     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9291     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9292     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9293     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9294     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9295     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9296     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9297     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9298     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9299     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9300     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9301     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9302     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9303     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9304     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9305       nargs = 4;
9306       second_arg_count = true;
9307       break;
9308     case UINT64_FTYPE_UINT64_UINT64:
9309     case UINT_FTYPE_UINT_UINT:
9310     case UINT_FTYPE_UINT_USHORT:
9311     case UINT_FTYPE_UINT_UCHAR:
9312     case UINT16_FTYPE_UINT16_INT:
9313     case UINT8_FTYPE_UINT8_INT:
9314     case UQI_FTYPE_UQI_UQI:
9315     case UHI_FTYPE_UHI_UHI:
9316     case USI_FTYPE_USI_USI:
9317     case UDI_FTYPE_UDI_UDI:
9318     case V16SI_FTYPE_V8DF_V8DF:
9319     case V32HI_FTYPE_V16SF_V16SF:
9320     case V16HI_FTYPE_V8SF_V8SF:
9321     case V8HI_FTYPE_V4SF_V4SF:
9322     case V16HI_FTYPE_V16SF_UHI:
9323     case V8HI_FTYPE_V8SF_UQI:
9324     case V8HI_FTYPE_V4SF_UQI:
9325       nargs = 2;
9326       break;
9327     case V2DI_FTYPE_V2DI_INT_CONVERT:
9328       nargs = 2;
9329       rmode = V1TImode;
9330       nargs_constant = 1;
9331       break;
9332     case V4DI_FTYPE_V4DI_INT_CONVERT:
9333       nargs = 2;
9334       rmode = V2TImode;
9335       nargs_constant = 1;
9336       break;
9337     case V8DI_FTYPE_V8DI_INT_CONVERT:
9338       nargs = 2;
9339       rmode = V4TImode;
9340       nargs_constant = 1;
9341       break;
9342     case V8HI_FTYPE_V8HI_INT:
9343     case V8HI_FTYPE_V8SF_INT:
9344     case V16HI_FTYPE_V16SF_INT:
9345     case V8HI_FTYPE_V4SF_INT:
9346     case V8SF_FTYPE_V8SF_INT:
9347     case V4SF_FTYPE_V16SF_INT:
9348     case V16SF_FTYPE_V16SF_INT:
9349     case V4SI_FTYPE_V4SI_INT:
9350     case V4SI_FTYPE_V8SI_INT:
9351     case V4HI_FTYPE_V4HI_INT:
9352     case V4DF_FTYPE_V4DF_INT:
9353     case V4DF_FTYPE_V8DF_INT:
9354     case V4SF_FTYPE_V4SF_INT:
9355     case V4SF_FTYPE_V8SF_INT:
9356     case V2DI_FTYPE_V2DI_INT:
9357     case V2DF_FTYPE_V2DF_INT:
9358     case V2DF_FTYPE_V4DF_INT:
9359     case V16HI_FTYPE_V16HI_INT:
9360     case V8SI_FTYPE_V8SI_INT:
9361     case V16SI_FTYPE_V16SI_INT:
9362     case V4SI_FTYPE_V16SI_INT:
9363     case V4DI_FTYPE_V4DI_INT:
9364     case V2DI_FTYPE_V4DI_INT:
9365     case V4DI_FTYPE_V8DI_INT:
9366     case UQI_FTYPE_UQI_UQI_CONST:
9367     case UHI_FTYPE_UHI_UQI:
9368     case USI_FTYPE_USI_UQI:
9369     case UDI_FTYPE_UDI_UQI:
9370       nargs = 2;
9371       nargs_constant = 1;
9372       break;
9373     case V16QI_FTYPE_V16QI_V16QI_V16QI:
9374     case V8SF_FTYPE_V8SF_V8SF_V8SF:
9375     case V4DF_FTYPE_V4DF_V4DF_V4DF:
9376     case V4SF_FTYPE_V4SF_V4SF_V4SF:
9377     case V2DF_FTYPE_V2DF_V2DF_V2DF:
9378     case V32QI_FTYPE_V32QI_V32QI_V32QI:
9379     case UHI_FTYPE_V16SI_V16SI_UHI:
9380     case UQI_FTYPE_V8DI_V8DI_UQI:
9381     case V16HI_FTYPE_V16SI_V16HI_UHI:
9382     case V16QI_FTYPE_V16SI_V16QI_UHI:
9383     case V16QI_FTYPE_V8DI_V16QI_UQI:
9384     case V16SF_FTYPE_V16SF_V16SF_UHI:
9385     case V16SF_FTYPE_V4SF_V16SF_UHI:
9386     case V16SI_FTYPE_SI_V16SI_UHI:
9387     case V16SI_FTYPE_V16HI_V16SI_UHI:
9388     case V16SI_FTYPE_V16QI_V16SI_UHI:
9389     case V8SF_FTYPE_V4SF_V8SF_UQI:
9390     case V4DF_FTYPE_V2DF_V4DF_UQI:
9391     case V8SI_FTYPE_V4SI_V8SI_UQI:
9392     case V8SI_FTYPE_SI_V8SI_UQI:
9393     case V4SI_FTYPE_V4SI_V4SI_UQI:
9394     case V4SI_FTYPE_SI_V4SI_UQI:
9395     case V4DI_FTYPE_V2DI_V4DI_UQI:
9396     case V4DI_FTYPE_DI_V4DI_UQI:
9397     case V2DI_FTYPE_V2DI_V2DI_UQI:
9398     case V2DI_FTYPE_DI_V2DI_UQI:
9399     case V64QI_FTYPE_V64QI_V64QI_UDI:
9400     case V64QI_FTYPE_V16QI_V64QI_UDI:
9401     case V64QI_FTYPE_QI_V64QI_UDI:
9402     case V32QI_FTYPE_V32QI_V32QI_USI:
9403     case V32QI_FTYPE_V16QI_V32QI_USI:
9404     case V32QI_FTYPE_QI_V32QI_USI:
9405     case V16QI_FTYPE_V16QI_V16QI_UHI:
9406     case V16QI_FTYPE_QI_V16QI_UHI:
9407     case V32HI_FTYPE_V8HI_V32HI_USI:
9408     case V32HI_FTYPE_HI_V32HI_USI:
9409     case V16HI_FTYPE_V8HI_V16HI_UHI:
9410     case V16HI_FTYPE_HI_V16HI_UHI:
9411     case V8HI_FTYPE_V8HI_V8HI_UQI:
9412     case V8HI_FTYPE_HI_V8HI_UQI:
9413     case V8SF_FTYPE_V8HI_V8SF_UQI:
9414     case V4SF_FTYPE_V8HI_V4SF_UQI:
9415     case V8SI_FTYPE_V8SF_V8SI_UQI:
9416     case V4SI_FTYPE_V4SF_V4SI_UQI:
9417     case V4DI_FTYPE_V4SF_V4DI_UQI:
9418     case V2DI_FTYPE_V4SF_V2DI_UQI:
9419     case V4SF_FTYPE_V4DI_V4SF_UQI:
9420     case V4SF_FTYPE_V2DI_V4SF_UQI:
9421     case V4DF_FTYPE_V4DI_V4DF_UQI:
9422     case V2DF_FTYPE_V2DI_V2DF_UQI:
9423     case V16QI_FTYPE_V8HI_V16QI_UQI:
9424     case V16QI_FTYPE_V16HI_V16QI_UHI:
9425     case V16QI_FTYPE_V4SI_V16QI_UQI:
9426     case V16QI_FTYPE_V8SI_V16QI_UQI:
9427     case V8HI_FTYPE_V4SI_V8HI_UQI:
9428     case V8HI_FTYPE_V8SI_V8HI_UQI:
9429     case V16QI_FTYPE_V2DI_V16QI_UQI:
9430     case V16QI_FTYPE_V4DI_V16QI_UQI:
9431     case V8HI_FTYPE_V2DI_V8HI_UQI:
9432     case V8HI_FTYPE_V4DI_V8HI_UQI:
9433     case V4SI_FTYPE_V2DI_V4SI_UQI:
9434     case V4SI_FTYPE_V4DI_V4SI_UQI:
9435     case V32QI_FTYPE_V32HI_V32QI_USI:
9436     case UHI_FTYPE_V16QI_V16QI_UHI:
9437     case USI_FTYPE_V32QI_V32QI_USI:
9438     case UDI_FTYPE_V64QI_V64QI_UDI:
9439     case UQI_FTYPE_V8HI_V8HI_UQI:
9440     case UHI_FTYPE_V16HI_V16HI_UHI:
9441     case USI_FTYPE_V32HI_V32HI_USI:
9442     case UQI_FTYPE_V4SI_V4SI_UQI:
9443     case UQI_FTYPE_V8SI_V8SI_UQI:
9444     case UQI_FTYPE_V2DI_V2DI_UQI:
9445     case UQI_FTYPE_V4DI_V4DI_UQI:
9446     case V4SF_FTYPE_V2DF_V4SF_UQI:
9447     case V4SF_FTYPE_V4DF_V4SF_UQI:
9448     case V16SI_FTYPE_V16SI_V16SI_UHI:
9449     case V16SI_FTYPE_V4SI_V16SI_UHI:
9450     case V2DI_FTYPE_V4SI_V2DI_UQI:
9451     case V2DI_FTYPE_V8HI_V2DI_UQI:
9452     case V2DI_FTYPE_V16QI_V2DI_UQI:
9453     case V4DI_FTYPE_V4DI_V4DI_UQI:
9454     case V4DI_FTYPE_V4SI_V4DI_UQI:
9455     case V4DI_FTYPE_V8HI_V4DI_UQI:
9456     case V4DI_FTYPE_V16QI_V4DI_UQI:
9457     case V4DI_FTYPE_V4DF_V4DI_UQI:
9458     case V2DI_FTYPE_V2DF_V2DI_UQI:
9459     case V4SI_FTYPE_V4DF_V4SI_UQI:
9460     case V4SI_FTYPE_V2DF_V4SI_UQI:
9461     case V4SI_FTYPE_V8HI_V4SI_UQI:
9462     case V4SI_FTYPE_V16QI_V4SI_UQI:
9463     case V4DI_FTYPE_V4DI_V4DI_V4DI:
9464     case V8DF_FTYPE_V2DF_V8DF_UQI:
9465     case V8DF_FTYPE_V4DF_V8DF_UQI:
9466     case V8DF_FTYPE_V8DF_V8DF_UQI:
9467     case V8SF_FTYPE_V8SF_V8SF_UQI:
9468     case V8SF_FTYPE_V8SI_V8SF_UQI:
9469     case V4DF_FTYPE_V4DF_V4DF_UQI:
9470     case V4SF_FTYPE_V4SF_V4SF_UQI:
9471     case V2DF_FTYPE_V2DF_V2DF_UQI:
9472     case V2DF_FTYPE_V4SF_V2DF_UQI:
9473     case V2DF_FTYPE_V4SI_V2DF_UQI:
9474     case V4SF_FTYPE_V4SI_V4SF_UQI:
9475     case V4DF_FTYPE_V4SF_V4DF_UQI:
9476     case V4DF_FTYPE_V4SI_V4DF_UQI:
9477     case V8SI_FTYPE_V8SI_V8SI_UQI:
9478     case V8SI_FTYPE_V8HI_V8SI_UQI:
9479     case V8SI_FTYPE_V16QI_V8SI_UQI:
9480     case V8DF_FTYPE_V8SI_V8DF_UQI:
9481     case V8DI_FTYPE_DI_V8DI_UQI:
9482     case V16SF_FTYPE_V8SF_V16SF_UHI:
9483     case V16SI_FTYPE_V8SI_V16SI_UHI:
9484     case V16HI_FTYPE_V16HI_V16HI_UHI:
9485     case V8HI_FTYPE_V16QI_V8HI_UQI:
9486     case V16HI_FTYPE_V16QI_V16HI_UHI:
9487     case V32HI_FTYPE_V32HI_V32HI_USI:
9488     case V32HI_FTYPE_V32QI_V32HI_USI:
9489     case V8DI_FTYPE_V16QI_V8DI_UQI:
9490     case V8DI_FTYPE_V2DI_V8DI_UQI:
9491     case V8DI_FTYPE_V4DI_V8DI_UQI:
9492     case V8DI_FTYPE_V8DI_V8DI_UQI:
9493     case V8DI_FTYPE_V8HI_V8DI_UQI:
9494     case V8DI_FTYPE_V8SI_V8DI_UQI:
9495     case V8HI_FTYPE_V8DI_V8HI_UQI:
9496     case V8SI_FTYPE_V8DI_V8SI_UQI:
9497     case V4SI_FTYPE_V4SI_V4SI_V4SI:
9498     case V16SI_FTYPE_V16SI_V16SI_V16SI:
9499     case V8DI_FTYPE_V8DI_V8DI_V8DI:
9500     case V32HI_FTYPE_V32HI_V32HI_V32HI:
9501     case V2DI_FTYPE_V2DI_V2DI_V2DI:
9502     case V16HI_FTYPE_V16HI_V16HI_V16HI:
9503     case V8SI_FTYPE_V8SI_V8SI_V8SI:
9504     case V8HI_FTYPE_V8HI_V8HI_V8HI:
9505     case V32HI_FTYPE_V16SF_V16SF_USI:
9506     case V16HI_FTYPE_V8SF_V8SF_UHI:
9507     case V8HI_FTYPE_V4SF_V4SF_UQI:
9508     case V16HI_FTYPE_V16SF_V16HI_UHI:
9509     case V8HI_FTYPE_V8SF_V8HI_UQI:
9510     case V8HI_FTYPE_V4SF_V8HI_UQI:
9511     case V16SF_FTYPE_V16SF_V32HI_V32HI:
9512     case V8SF_FTYPE_V8SF_V16HI_V16HI:
9513     case V4SF_FTYPE_V4SF_V8HI_V8HI:
9514       nargs = 3;
9515       break;
9516     case V32QI_FTYPE_V32QI_V32QI_INT:
9517     case V16HI_FTYPE_V16HI_V16HI_INT:
9518     case V16QI_FTYPE_V16QI_V16QI_INT:
9519     case V4DI_FTYPE_V4DI_V4DI_INT:
9520     case V8HI_FTYPE_V8HI_V8HI_INT:
9521     case V8SI_FTYPE_V8SI_V8SI_INT:
9522     case V8SI_FTYPE_V8SI_V4SI_INT:
9523     case V8SF_FTYPE_V8SF_V8SF_INT:
9524     case V8SF_FTYPE_V8SF_V4SF_INT:
9525     case V4SI_FTYPE_V4SI_V4SI_INT:
9526     case V4DF_FTYPE_V4DF_V4DF_INT:
9527     case V16SF_FTYPE_V16SF_V16SF_INT:
9528     case V16SF_FTYPE_V16SF_V4SF_INT:
9529     case V16SI_FTYPE_V16SI_V4SI_INT:
9530     case V4DF_FTYPE_V4DF_V2DF_INT:
9531     case V4SF_FTYPE_V4SF_V4SF_INT:
9532     case V2DI_FTYPE_V2DI_V2DI_INT:
9533     case V4DI_FTYPE_V4DI_V2DI_INT:
9534     case V2DF_FTYPE_V2DF_V2DF_INT:
9535     case UQI_FTYPE_V8DI_V8UDI_INT:
9536     case UQI_FTYPE_V8DF_V8DF_INT:
9537     case UQI_FTYPE_V2DF_V2DF_INT:
9538     case UQI_FTYPE_V4SF_V4SF_INT:
9539     case UHI_FTYPE_V16SI_V16SI_INT:
9540     case UHI_FTYPE_V16SF_V16SF_INT:
9541     case V64QI_FTYPE_V64QI_V64QI_INT:
9542     case V32HI_FTYPE_V32HI_V32HI_INT:
9543     case V16SI_FTYPE_V16SI_V16SI_INT:
9544     case V8DI_FTYPE_V8DI_V8DI_INT:
9545       nargs = 3;
9546       nargs_constant = 1;
9547       break;
9548     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9549       nargs = 3;
9550       rmode = V4DImode;
9551       nargs_constant = 1;
9552       break;
9553     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9554       nargs = 3;
9555       rmode = V2DImode;
9556       nargs_constant = 1;
9557       break;
9558     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9559       nargs = 3;
9560       rmode = DImode;
9561       nargs_constant = 1;
9562       break;
9563     case V2DI_FTYPE_V2DI_UINT_UINT:
9564       nargs = 3;
9565       nargs_constant = 2;
9566       break;
9567     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9568       nargs = 3;
9569       rmode = V8DImode;
9570       nargs_constant = 1;
9571       break;
9572     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9573       nargs = 5;
9574       rmode = V8DImode;
9575       mask_pos = 2;
9576       nargs_constant = 1;
9577       break;
9578     case QI_FTYPE_V8DF_INT_UQI:
9579     case QI_FTYPE_V4DF_INT_UQI:
9580     case QI_FTYPE_V2DF_INT_UQI:
9581     case HI_FTYPE_V16SF_INT_UHI:
9582     case QI_FTYPE_V8SF_INT_UQI:
9583     case QI_FTYPE_V4SF_INT_UQI:
9584     case V4SI_FTYPE_V4SI_V4SI_UHI:
9585     case V8SI_FTYPE_V8SI_V8SI_UHI:
9586       nargs = 3;
9587       mask_pos = 1;
9588       nargs_constant = 1;
9589       break;
9590     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9591       nargs = 5;
9592       rmode = V4DImode;
9593       mask_pos = 2;
9594       nargs_constant = 1;
9595       break;
9596     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9597       nargs = 5;
9598       rmode = V2DImode;
9599       mask_pos = 2;
9600       nargs_constant = 1;
9601       break;
9602     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9603     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9604     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9605     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9606     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9607     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9608     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9609     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9610     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9611     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9612     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9613     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9614     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9615     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9616     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9617     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9618     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9619     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9620     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9621     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9622     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9623     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9624     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9625     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9626     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9627     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9628     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9629     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9630     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9631     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9632     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9633     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9634     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9635     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9636     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9637     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9638     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9639     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9640     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9641     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9642     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9643     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9644     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9645     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9646     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9647     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9648     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9649     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9650     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9651     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9652     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9653     case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9654     case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9655     case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9656       nargs = 4;
9657       break;
9658     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9659     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9660     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9661     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9662     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9663       nargs = 4;
9664       nargs_constant = 1;
9665       break;
9666     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9667     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9668     case QI_FTYPE_V4DF_V4DF_INT_UQI:
9669     case QI_FTYPE_V8SF_V8SF_INT_UQI:
9670     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9671     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9672     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9673     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9674     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9675     case USI_FTYPE_V32QI_V32QI_INT_USI:
9676     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9677     case USI_FTYPE_V32HI_V32HI_INT_USI:
9678     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9679     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9680       nargs = 4;
9681       mask_pos = 1;
9682       nargs_constant = 1;
9683       break;
9684     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9685       nargs = 4;
9686       nargs_constant = 2;
9687       break;
9688     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9689     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9690     case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9691     case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9692     case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9693       nargs = 4;
9694       break;
9695     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9696     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9697       mask_pos = 1;
9698       nargs = 4;
9699       nargs_constant = 1;
9700       break;
9701     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9702     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9703     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9704     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9705     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9706     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9707     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9708     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9709     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9710     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9711     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9712     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9713     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9714     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9715     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9716     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9717     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9718     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9719     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9720     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9721     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9722     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9723     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9724     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9725     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9726     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9727     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9728     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9729     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9730     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9731       nargs = 4;
9732       mask_pos = 2;
9733       nargs_constant = 1;
9734       break;
9735     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9736     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9737     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9738     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9739     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9740     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9741     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9742     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9743     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9744     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9745     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9746     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9747     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9748     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9749     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9750     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9751     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9752     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9753     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9754     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9755     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9756     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9757     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9758     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9759     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9760     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9761     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9762       nargs = 5;
9763       mask_pos = 2;
9764       nargs_constant = 1;
9765       break;
9766     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9767     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9768     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9769     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9770     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9771     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9772     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9773     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9774     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9775     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9776       nargs = 5;
9777       mask_pos = 1;
9778       nargs_constant = 1;
9779       break;
9780     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9781     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9782     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9783     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9784     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9785     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9786     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9787     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9788     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9789     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9790     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9791     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9792       nargs = 5;
9793       mask_pos = 1;
9794       nargs_constant = 2;
9795       break;
9796 
9797     default:
9798       gcc_unreachable ();
9799     }
9800 
9801   gcc_assert (nargs <= ARRAY_SIZE (xops));
9802 
9803   if (comparison != UNKNOWN)
9804     {
9805       gcc_assert (nargs == 2);
9806       return ix86_expand_sse_compare (d, exp, target, swap);
9807     }
9808 
9809   if (rmode == VOIDmode || rmode == tmode)
9810     {
9811       if (optimize
9812 	  || target == 0
9813 	  || GET_MODE (target) != tmode
9814 	  || !insn_p->operand[0].predicate (target, tmode))
9815 	target = gen_reg_rtx (tmode);
9816       else if (memory_operand (target, tmode))
9817 	num_memory++;
9818       real_target = target;
9819     }
9820   else
9821     {
9822       real_target = gen_reg_rtx (tmode);
9823       target = lowpart_subreg (rmode, real_target, tmode);
9824     }
9825 
9826   for (i = 0; i < nargs; i++)
9827     {
9828       tree arg = CALL_EXPR_ARG (exp, i);
9829       rtx op = expand_normal (arg);
9830       machine_mode mode = insn_p->operand[i + 1].mode;
9831       bool match = insn_p->operand[i + 1].predicate (op, mode);
9832 
9833       if (second_arg_count && i == 1)
9834 	{
9835 	  /* SIMD shift insns take either an 8-bit immediate or a
9836 	     register as the count, but the builtin functions take an
9837 	     int.  If the count doesn't match, put it in a register.
9838 	     The instructions use a 64-bit count; if OP is only
9839 	     32-bit, zero-extend it, since negative shift counts are
9840 	     undefined behavior and zero-extension is more
9841 	     efficient.  */
9842 	  if (!match)
9843 	    {
9844 	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
9845 		op = convert_modes (mode, GET_MODE (op), op, 1);
9846 	      else
9847 		op = lowpart_subreg (mode, op, GET_MODE (op));
9848 	      if (!insn_p->operand[i + 1].predicate (op, mode))
9849 		op = copy_to_reg (op);
9850 	    }
9851 	}
9852       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9853 	       (!mask_pos && (nargs - i) <= nargs_constant))
9854 	{
9855 	  if (!match)
9856 	    switch (icode)
9857 	      {
9858 	      case CODE_FOR_avx_vinsertf128v4di:
9859 	      case CODE_FOR_avx_vextractf128v4di:
9860 		error ("the last argument must be a 1-bit immediate");
9861 		return const0_rtx;
9862 
9863 	      case CODE_FOR_avx512f_cmpv8di3_mask:
9864 	      case CODE_FOR_avx512f_cmpv16si3_mask:
9865 	      case CODE_FOR_avx512f_ucmpv8di3_mask:
9866 	      case CODE_FOR_avx512f_ucmpv16si3_mask:
9867 	      case CODE_FOR_avx512vl_cmpv4di3_mask:
9868 	      case CODE_FOR_avx512vl_cmpv8si3_mask:
9869 	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
9870 	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
9871 	      case CODE_FOR_avx512vl_cmpv2di3_mask:
9872 	      case CODE_FOR_avx512vl_cmpv4si3_mask:
9873 	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
9874 	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
9875 		error ("the last argument must be a 3-bit immediate");
9876 		return const0_rtx;
9877 
9878 	      case CODE_FOR_sse4_1_roundsd:
9879 	      case CODE_FOR_sse4_1_roundss:
9880 
9881 	      case CODE_FOR_sse4_1_roundpd:
9882 	      case CODE_FOR_sse4_1_roundps:
9883 	      case CODE_FOR_avx_roundpd256:
9884 	      case CODE_FOR_avx_roundps256:
9885 
9886 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9887 	      case CODE_FOR_sse4_1_roundps_sfix:
9888 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9889 	      case CODE_FOR_avx_roundps_sfix256:
9890 
9891 	      case CODE_FOR_sse4_1_blendps:
9892 	      case CODE_FOR_avx_blendpd256:
9893 	      case CODE_FOR_avx_vpermilv4df:
9894 	      case CODE_FOR_avx_vpermilv4df_mask:
9895 	      case CODE_FOR_avx512f_getmantv8df_mask:
9896 	      case CODE_FOR_avx512f_getmantv16sf_mask:
9897 	      case CODE_FOR_avx512vl_getmantv8sf_mask:
9898 	      case CODE_FOR_avx512vl_getmantv4df_mask:
9899 	      case CODE_FOR_avx512vl_getmantv4sf_mask:
9900 	      case CODE_FOR_avx512vl_getmantv2df_mask:
9901 	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
9902 	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9903 	      case CODE_FOR_avx512dq_rangepv4df_mask:
9904 	      case CODE_FOR_avx512dq_rangepv8sf_mask:
9905 	      case CODE_FOR_avx512dq_rangepv2df_mask:
9906 	      case CODE_FOR_avx512dq_rangepv4sf_mask:
9907 	      case CODE_FOR_avx_shufpd256_mask:
9908 		error ("the last argument must be a 4-bit immediate");
9909 		return const0_rtx;
9910 
9911 	      case CODE_FOR_sha1rnds4:
9912 	      case CODE_FOR_sse4_1_blendpd:
9913 	      case CODE_FOR_avx_vpermilv2df:
9914 	      case CODE_FOR_avx_vpermilv2df_mask:
9915 	      case CODE_FOR_xop_vpermil2v2df3:
9916 	      case CODE_FOR_xop_vpermil2v4sf3:
9917 	      case CODE_FOR_xop_vpermil2v4df3:
9918 	      case CODE_FOR_xop_vpermil2v8sf3:
9919 	      case CODE_FOR_avx512f_vinsertf32x4_mask:
9920 	      case CODE_FOR_avx512f_vinserti32x4_mask:
9921 	      case CODE_FOR_avx512f_vextractf32x4_mask:
9922 	      case CODE_FOR_avx512f_vextracti32x4_mask:
9923 	      case CODE_FOR_sse2_shufpd:
9924 	      case CODE_FOR_sse2_shufpd_mask:
9925 	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
9926 	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
9927 	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
9928 	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
9929 		error ("the last argument must be a 2-bit immediate");
9930 		return const0_rtx;
9931 
9932 	      case CODE_FOR_avx_vextractf128v4df:
9933 	      case CODE_FOR_avx_vextractf128v8sf:
9934 	      case CODE_FOR_avx_vextractf128v8si:
9935 	      case CODE_FOR_avx_vinsertf128v4df:
9936 	      case CODE_FOR_avx_vinsertf128v8sf:
9937 	      case CODE_FOR_avx_vinsertf128v8si:
9938 	      case CODE_FOR_avx512f_vinsertf64x4_mask:
9939 	      case CODE_FOR_avx512f_vinserti64x4_mask:
9940 	      case CODE_FOR_avx512f_vextractf64x4_mask:
9941 	      case CODE_FOR_avx512f_vextracti64x4_mask:
9942 	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
9943 	      case CODE_FOR_avx512dq_vinserti32x8_mask:
9944 	      case CODE_FOR_avx512vl_vinsertv4df:
9945 	      case CODE_FOR_avx512vl_vinsertv4di:
9946 	      case CODE_FOR_avx512vl_vinsertv8sf:
9947 	      case CODE_FOR_avx512vl_vinsertv8si:
9948 		error ("the last argument must be a 1-bit immediate");
9949 		return const0_rtx;
9950 
9951 	      case CODE_FOR_avx_vmcmpv2df3:
9952 	      case CODE_FOR_avx_vmcmpv4sf3:
9953 	      case CODE_FOR_avx_cmpv2df3:
9954 	      case CODE_FOR_avx_cmpv4sf3:
9955 	      case CODE_FOR_avx_cmpv4df3:
9956 	      case CODE_FOR_avx_cmpv8sf3:
9957 	      case CODE_FOR_avx512f_cmpv8df3_mask:
9958 	      case CODE_FOR_avx512f_cmpv16sf3_mask:
9959 	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
9960 	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9961 		error ("the last argument must be a 5-bit immediate");
9962 		return const0_rtx;
9963 
9964 	      default:
9965 		switch (nargs_constant)
9966 		  {
9967 		  case 2:
9968 		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9969 			(!mask_pos && (nargs - i) == nargs_constant))
9970 		      {
9971 			error ("the next to last argument must be an 8-bit immediate");
9972 			break;
9973 		      }
9974 		    /* FALLTHRU */
9975 		  case 1:
9976 		    error ("the last argument must be an 8-bit immediate");
9977 		    break;
9978 		  default:
9979 		    gcc_unreachable ();
9980 		  }
9981 		return const0_rtx;
9982 	      }
9983 	}
9984       else
9985 	{
9986 	  if (VECTOR_MODE_P (mode))
9987 	    op = safe_vector_operand (op, mode);
9988 
9989 	  /* If we aren't optimizing, only allow one memory operand to
9990 	     be generated.  */
9991 	  if (memory_operand (op, mode))
9992 	    num_memory++;
9993 
9994 	  op = fixup_modeless_constant (op, mode);
9995 
9996 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9997 	    {
9998 	      if (optimize || !match || num_memory > 1)
9999 		op = copy_to_mode_reg (mode, op);
10000 	    }
10001 	  else
10002 	    {
10003 	      op = copy_to_reg (op);
10004 	      op = lowpart_subreg (mode, op, GET_MODE (op));
10005 	    }
10006 	}
10007 
10008       xops[i] = op;
10009     }
10010 
10011   switch (nargs)
10012     {
10013     case 1:
10014       pat = GEN_FCN (icode) (real_target, xops[0]);
10015       break;
10016     case 2:
10017       pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
10018       break;
10019     case 3:
10020       pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
10021       break;
10022     case 4:
10023       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10024 			     xops[2], xops[3]);
10025       break;
10026     case 5:
10027       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10028 			     xops[2], xops[3], xops[4]);
10029       break;
10030     case 6:
10031       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10032 			     xops[2], xops[3], xops[4], xops[5]);
10033       break;
10034     default:
10035       gcc_unreachable ();
10036     }
10037 
10038   if (! pat)
10039     return 0;
10040 
10041   emit_insn (pat);
10042   return target;
10043 }
10044 
10045 /* Transform a pattern of the following layout:
10046      (set A
10047        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
10048    into:
10049      (set A B)
10050    i.e. drop the embedded rounding operand C.  */
10051 
10052 static rtx
10053 ix86_erase_embedded_rounding (rtx pat)
10054 {
10055   if (GET_CODE (pat) == INSN)
10056     pat = PATTERN (pat);
10057 
10058   gcc_assert (GET_CODE (pat) == SET);
10059   rtx src = SET_SRC (pat);
10060   gcc_assert (XVECLEN (src, 0) == 2);
10061   rtx p0 = XVECEXP (src, 0, 0);
10062   gcc_assert (GET_CODE (src) == UNSPEC
10063 	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
10064   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
10065   return res;
10066 }
10067 
10068 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
10069    with rounding.  */
10070 static rtx
10071 ix86_expand_sse_comi_round (const struct builtin_description *d,
10072 			    tree exp, rtx target)
10073 {
10074   rtx pat, set_dst;
10075   tree arg0 = CALL_EXPR_ARG (exp, 0);
10076   tree arg1 = CALL_EXPR_ARG (exp, 1);
10077   tree arg2 = CALL_EXPR_ARG (exp, 2);
10078   tree arg3 = CALL_EXPR_ARG (exp, 3);
10079   rtx op0 = expand_normal (arg0);
10080   rtx op1 = expand_normal (arg1);
10081   rtx op2 = expand_normal (arg2);
10082   rtx op3 = expand_normal (arg3);
10083   enum insn_code icode = d->icode;
10084   const struct insn_data_d *insn_p = &insn_data[icode];
10085   machine_mode mode0 = insn_p->operand[0].mode;
10086   machine_mode mode1 = insn_p->operand[1].mode;
10087 
10088   /* See avxintrin.h for values.  */
10089   static const enum rtx_code comparisons[32] =
10090     {
10091       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10092       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
10093       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10094       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
10095     };
10096   static const bool ordereds[32] =
10097     {
10098       true,  true,  true,  false, false, false, false, true,
10099       false, false, false, true,  true,  true,  true,  false,
10100       true,  true,  true,  false, false, false, false, true,
10101       false, false, false, true,  true,  true,  true,  false
10102     };
10103   static const bool non_signalings[32] =
10104     {
10105       true,  false, false, true,  true,  false, false, true,
10106       true,  false, false, true,  true,  false, false, true,
10107       false, true,  true,  false, false, true,  true,  false,
10108       false, true,  true,  false, false, true,  true,  false
10109     };
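  /* For example, index 0x01 (_CMP_LT_OS) maps to LT, ordered and
     signaling, while index 0x03 (_CMP_UNORD_Q) maps to UNORDERED,
     unordered and non-signaling; indices 16-31 repeat the first
     sixteen predicates.  */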
10110 
10111   if (!CONST_INT_P (op2))
10112     {
10113       error ("the third argument must be a comparison constant");
10114       return const0_rtx;
10115     }
10116   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10117     {
10118       error ("incorrect comparison mode");
10119       return const0_rtx;
10120     }
10121 
10122   if (!insn_p->operand[2].predicate (op3, SImode))
10123     {
10124       error ("incorrect rounding operand");
10125       return const0_rtx;
10126     }
10127 
10128   if (VECTOR_MODE_P (mode0))
10129     op0 = safe_vector_operand (op0, mode0);
10130   if (VECTOR_MODE_P (mode1))
10131     op1 = safe_vector_operand (op1, mode1);
10132 
10133   enum rtx_code comparison = comparisons[INTVAL (op2)];
10134   bool ordered = ordereds[INTVAL (op2)];
10135   bool non_signaling = non_signalings[INTVAL (op2)];
10136   rtx const_val = const0_rtx;
10137 
10138   bool check_unordered = false;
10139   machine_mode mode = CCFPmode;
10140   switch (comparison)
10141     {
10142     case ORDERED:
10143       if (!ordered)
10144 	{
10145 	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
10146 	  if (!non_signaling)
10147 	    ordered = true;
10148 	  mode = CCSmode;
10149 	}
10150       else
10151 	{
10152 	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
10153 	  if (non_signaling)
10154 	    ordered = false;
10155 	  mode = CCPmode;
10156 	}
10157       comparison = NE;
10158       break;
10159     case UNORDERED:
10160       if (ordered)
10161 	{
10162 	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
10163 	  if (non_signaling)
10164 	    ordered = false;
10165 	  mode = CCSmode;
10166 	}
10167       else
10168 	{
10169 	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
10170 	  if (!non_signaling)
10171 	    ordered = true;
10172 	  mode = CCPmode;
10173 	}
10174       comparison = EQ;
10175       break;
10176 
10177     case LE:	/* -> GE  */
10178     case LT:	/* -> GT  */
10179     case UNGE:	/* -> UNLE  */
10180     case UNGT:	/* -> UNLT  */
10181       std::swap (op0, op1);
10182       comparison = swap_condition (comparison);
10183       /* FALLTHRU */
10184     case GT:
10185     case GE:
10186     case UNEQ:
10187     case UNLT:
10188     case UNLE:
10189     case LTGT:
10190       /* These are supported by CCFPmode.  NB: Use ordered/signaling
10191 	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
10192 	 with NAN operands.  */
10193       if (ordered == non_signaling)
10194 	ordered = !ordered;
10195       break;
10196     case EQ:
10197       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
10198 	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
10199       check_unordered = true;
10200       mode = CCZmode;
10201       break;
10202     case NE:
10203       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
10204 	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
10205       gcc_assert (!ordered);
10206       check_unordered = true;
10207       mode = CCZmode;
10208       const_val = const1_rtx;
10209       break;
10210     default:
10211       gcc_unreachable ();
10212     }
10213 
10214   target = gen_reg_rtx (SImode);
10215   emit_move_insn (target, const_val);
10216   target = gen_rtx_SUBREG (QImode, target, 0);
10217 
10218   if ((optimize && !register_operand (op0, mode0))
10219       || !insn_p->operand[0].predicate (op0, mode0))
10220     op0 = copy_to_mode_reg (mode0, op0);
10221   if ((optimize && !register_operand (op1, mode1))
10222       || !insn_p->operand[1].predicate (op1, mode1))
10223     op1 = copy_to_mode_reg (mode1, op1);
10224 
10225   /*
10226      1. COMI: ordered and signaling.
10227      2. UCOMI: unordered and non-signaling.
10228    */
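  /* The builtin's icode is the ordered/signaling COMI pattern; switch
     to the matching unordered/non-signaling UCOMI pattern (SF or DF
     flavor) when a quiet predicate was requested.  */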
10229   if (non_signaling)
10230     icode = (icode == CODE_FOR_sse_comi_round
10231 	     ? CODE_FOR_sse_ucomi_round
10232 	     : CODE_FOR_sse2_ucomi_round);
10233 
10234   pat = GEN_FCN (icode) (op0, op1, op3);
10235   if (! pat)
10236     return 0;
10237 
10238   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
10239   if (INTVAL (op3) == NO_ROUND)
10240     {
10241       pat = ix86_erase_embedded_rounding (pat);
10242       if (! pat)
10243 	return 0;
10244 
10245       set_dst = SET_DEST (pat);
10246     }
10247   else
10248     {
10249       gcc_assert (GET_CODE (pat) == SET);
10250       set_dst = SET_DEST (pat);
10251     }
10252 
10253   emit_insn (pat);
10254 
10255   rtx_code_label *label = NULL;
10256 
10257   /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10258      sufficient with NAN operands.  */
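  /* If the inputs compare unordered (PF set), branch over the
     flag-to-register store below so TARGET keeps the default loaded
     above: 0 for ordered EQ, 1 for unordered NE.  */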
10259   if (check_unordered)
10260     {
10261       gcc_assert (comparison == EQ || comparison == NE);
10262 
10263       rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10264       label = gen_label_rtx ();
10265       rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10266       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10267 				  gen_rtx_LABEL_REF (VOIDmode, label),
10268 				  pc_rtx);
10269       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10270     }
10271 
10272   /* NB: Set CCFPmode and check a different CCmode which is
10273      a subset of CCFPmode.  */
10274   if (GET_MODE (set_dst) != mode)
10275     {
10276       gcc_assert (mode == CCAmode || mode == CCCmode
10277 		  || mode == CCOmode || mode == CCPmode
10278 		  || mode == CCSmode || mode == CCZmode);
10279       set_dst = gen_rtx_REG (mode, FLAGS_REG);
10280     }
10281 
10282   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10283 			  gen_rtx_fmt_ee (comparison, QImode,
10284 					  set_dst,
10285 					  const0_rtx)));
10286 
10287   if (label)
10288     emit_label (label);
10289 
10290   return SUBREG_REG (target);
10291 }
10292 
10293 static rtx
10294 ix86_expand_round_builtin (const struct builtin_description *d,
10295 			   tree exp, rtx target)
10296 {
10297   rtx pat;
10298   unsigned int i, nargs;
10299   rtx xops[6];
10300   enum insn_code icode = d->icode;
10301   const struct insn_data_d *insn_p = &insn_data[icode];
10302   machine_mode tmode = insn_p->operand[0].mode;
10303   unsigned int nargs_constant = 0;
10304   unsigned int redundant_embed_rnd = 0;
10305 
10306   switch ((enum ix86_builtin_func_type) d->flag)
10307     {
10308     case UINT64_FTYPE_V2DF_INT:
10309     case UINT64_FTYPE_V4SF_INT:
10310     case UINT_FTYPE_V2DF_INT:
10311     case UINT_FTYPE_V4SF_INT:
10312     case INT64_FTYPE_V2DF_INT:
10313     case INT64_FTYPE_V4SF_INT:
10314     case INT_FTYPE_V2DF_INT:
10315     case INT_FTYPE_V4SF_INT:
10316       nargs = 2;
10317       break;
10318     case V4SF_FTYPE_V4SF_UINT_INT:
10319     case V4SF_FTYPE_V4SF_UINT64_INT:
10320     case V2DF_FTYPE_V2DF_UINT64_INT:
10321     case V4SF_FTYPE_V4SF_INT_INT:
10322     case V4SF_FTYPE_V4SF_INT64_INT:
10323     case V2DF_FTYPE_V2DF_INT64_INT:
10324     case V4SF_FTYPE_V4SF_V4SF_INT:
10325     case V2DF_FTYPE_V2DF_V2DF_INT:
10326     case V4SF_FTYPE_V4SF_V2DF_INT:
10327     case V2DF_FTYPE_V2DF_V4SF_INT:
10328       nargs = 3;
10329       break;
10330     case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10331     case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10332     case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10333     case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10334     case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10335     case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10336     case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10337     case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10338     case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10339     case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10340     case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10341     case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10342     case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10343     case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10344       nargs = 4;
10345       break;
10346     case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10347     case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10348       nargs_constant = 2;
10349       nargs = 4;
10350       break;
10351     case INT_FTYPE_V4SF_V4SF_INT_INT:
10352     case INT_FTYPE_V2DF_V2DF_INT_INT:
10353       return ix86_expand_sse_comi_round (d, exp, target);
10354     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10355     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10356     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10357     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10358     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10359     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10360     case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
10361     case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10362     case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10363     case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
10364       nargs = 5;
10365       break;
10366     case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10367     case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10368     case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
10369     case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
10370       nargs_constant = 4;
10371       nargs = 5;
10372       break;
10373     case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10374     case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10375     case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10376     case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10377       nargs_constant = 3;
10378       nargs = 5;
10379       break;
10380     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10381     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10382     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10383     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10384     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10385     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10386       nargs = 6;
10387       nargs_constant = 4;
10388       break;
10389     case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10390     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10391     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10392     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10393       nargs = 6;
10394       nargs_constant = 3;
10395       break;
10396     default:
10397       gcc_unreachable ();
10398     }
10399   gcc_assert (nargs <= ARRAY_SIZE (xops));
10400 
10401   if (optimize
10402       || target == 0
10403       || GET_MODE (target) != tmode
10404       || !insn_p->operand[0].predicate (target, tmode))
10405     target = gen_reg_rtx (tmode);
10406 
10407   for (i = 0; i < nargs; i++)
10408     {
10409       tree arg = CALL_EXPR_ARG (exp, i);
10410       rtx op = expand_normal (arg);
10411       machine_mode mode = insn_p->operand[i + 1].mode;
10412       bool match = insn_p->operand[i + 1].predicate (op, mode);
10413 
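      /* Argument number nargs - nargs_constant is the immediate
	 operand (if any); the last argument is the embedded rounding
	 operand, handled in the next branch.  */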
10414       if (i == nargs - nargs_constant)
10415 	{
10416 	  if (!match)
10417 	    {
10418 	      switch (icode)
10419 		{
10420 		case CODE_FOR_avx512f_getmantv8df_mask_round:
10421 		case CODE_FOR_avx512f_getmantv16sf_mask_round:
10422 		case CODE_FOR_avx512f_vgetmantv2df_round:
10423 		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10424 		case CODE_FOR_avx512f_vgetmantv4sf_round:
10425 		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10426 		  error ("the immediate argument must be a 4-bit immediate");
10427 		  return const0_rtx;
10428 		case CODE_FOR_avx512f_cmpv8df3_mask_round:
10429 		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10430 		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10431 		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10432 		  error ("the immediate argument must be a 5-bit immediate");
10433 		  return const0_rtx;
10434 		default:
10435 		  error ("the immediate argument must be an 8-bit immediate");
10436 		  return const0_rtx;
10437 		}
10438 	    }
10439 	}
10440       else if (i == nargs - 1)
10441 	{
10442 	  if (!insn_p->operand[nargs].predicate (op, SImode))
10443 	    {
10444 	      error ("incorrect rounding operand");
10445 	      return const0_rtx;
10446 	    }
10447 
10448 	  /* If there is no rounding, use the normal version of the pattern.  */
10449 	  if (INTVAL (op) == NO_ROUND)
10450 	    redundant_embed_rnd = 1;
10451 	}
10452       else
10453 	{
10454 	  if (VECTOR_MODE_P (mode))
10455 	    op = safe_vector_operand (op, mode);
10456 
10457 	  op = fixup_modeless_constant (op, mode);
10458 
10459 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10460 	    {
10461 	      if (optimize || !match)
10462 		op = copy_to_mode_reg (mode, op);
10463 	    }
10464 	  else
10465 	    {
10466 	      op = copy_to_reg (op);
10467 	      op = lowpart_subreg (mode, op, GET_MODE (op));
10468 	    }
10469 	}
10470 
10471       xops[i] = op;
10472     }
10473 
10474   switch (nargs)
10475     {
10476     case 1:
10477       pat = GEN_FCN (icode) (target, xops[0]);
10478       break;
10479     case 2:
10480       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10481       break;
10482     case 3:
10483       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10484       break;
10485     case 4:
10486       pat = GEN_FCN (icode) (target, xops[0], xops[1],
10487 			     xops[2], xops[3]);
10488       break;
10489     case 5:
10490       pat = GEN_FCN (icode) (target, xops[0], xops[1],
10491 			     xops[2], xops[3], xops[4]);
10492       break;
10493     case 6:
10494       pat = GEN_FCN (icode) (target, xops[0], xops[1],
10495 			     xops[2], xops[3], xops[4], xops[5]);
10496       break;
10497     default:
10498       gcc_unreachable ();
10499     }
10500 
10501   if (!pat)
10502     return 0;
10503 
10504   if (redundant_embed_rnd)
10505     pat = ix86_erase_embedded_rounding (pat);
10506 
10507   emit_insn (pat);
10508   return target;
10509 }
10510 
10511 /* Subroutine of ix86_expand_builtin to take care of special insns
10512    with variable number of operands.  */
10513 
10514 static rtx
10515 ix86_expand_special_args_builtin (const struct builtin_description *d,
10516 				  tree exp, rtx target)
10517 {
10518   tree arg;
10519   rtx pat, op;
10520   unsigned int i, nargs, arg_adjust, memory;
10521   bool aligned_mem = false;
10522   rtx xops[3];
10523   enum insn_code icode = d->icode;
10524   const struct insn_data_d *insn_p = &insn_data[icode];
10525   machine_mode tmode = insn_p->operand[0].mode;
10526   enum { load, store } klass;
10527 
10528   switch ((enum ix86_builtin_func_type) d->flag)
10529     {
10530     case VOID_FTYPE_VOID:
10531       emit_insn (GEN_FCN (icode) (target));
10532       return 0;
10533     case VOID_FTYPE_UINT64:
10534     case VOID_FTYPE_UNSIGNED:
10535       nargs = 0;
10536       klass = store;
10537       memory = 0;
10538       break;
10539 
10540     case INT_FTYPE_VOID:
10541     case USHORT_FTYPE_VOID:
10542     case UINT64_FTYPE_VOID:
10543     case UINT_FTYPE_VOID:
10544     case UINT8_FTYPE_VOID:
10545     case UNSIGNED_FTYPE_VOID:
10546       nargs = 0;
10547       klass = load;
10548       memory = 0;
10549       break;
10550     case UINT64_FTYPE_PUNSIGNED:
10551     case V2DI_FTYPE_PV2DI:
10552     case V4DI_FTYPE_PV4DI:
10553     case V32QI_FTYPE_PCCHAR:
10554     case V16QI_FTYPE_PCCHAR:
10555     case V8SF_FTYPE_PCV4SF:
10556     case V8SF_FTYPE_PCFLOAT:
10557     case V4SF_FTYPE_PCFLOAT:
10558     case V4DF_FTYPE_PCV2DF:
10559     case V4DF_FTYPE_PCDOUBLE:
10560     case V2DF_FTYPE_PCDOUBLE:
10561     case VOID_FTYPE_PVOID:
10562     case V8DI_FTYPE_PV8DI:
10563       nargs = 1;
10564       klass = load;
10565       memory = 0;
10566       switch (icode)
10567 	{
10568 	case CODE_FOR_sse4_1_movntdqa:
10569 	case CODE_FOR_avx2_movntdqa:
10570 	case CODE_FOR_avx512f_movntdqa:
10571 	  aligned_mem = true;
10572 	  break;
10573 	default:
10574 	  break;
10575 	}
10576       break;
10577     case VOID_FTYPE_PV2SF_V4SF:
10578     case VOID_FTYPE_PV8DI_V8DI:
10579     case VOID_FTYPE_PV4DI_V4DI:
10580     case VOID_FTYPE_PV2DI_V2DI:
10581     case VOID_FTYPE_PCHAR_V32QI:
10582     case VOID_FTYPE_PCHAR_V16QI:
10583     case VOID_FTYPE_PFLOAT_V16SF:
10584     case VOID_FTYPE_PFLOAT_V8SF:
10585     case VOID_FTYPE_PFLOAT_V4SF:
10586     case VOID_FTYPE_PDOUBLE_V8DF:
10587     case VOID_FTYPE_PDOUBLE_V4DF:
10588     case VOID_FTYPE_PDOUBLE_V2DF:
10589     case VOID_FTYPE_PLONGLONG_LONGLONG:
10590     case VOID_FTYPE_PULONGLONG_ULONGLONG:
10591     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10592     case VOID_FTYPE_PINT_INT:
10593       nargs = 1;
10594       klass = store;
10595       /* Reserve memory operand for target.  */
10596       memory = ARRAY_SIZE (xops);
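      /* I.e. point MEMORY past the last argument slot so that none of
	 the source operands is wrapped in a MEM; the destination MEM
	 is built from the first call argument in the store path
	 below.  */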
10597       switch (icode)
10598 	{
10599 	/* These builtins and instructions require the memory
10600 	   to be properly aligned.  */
10601 	case CODE_FOR_avx_movntv4di:
10602 	case CODE_FOR_sse2_movntv2di:
10603 	case CODE_FOR_avx_movntv8sf:
10604 	case CODE_FOR_sse_movntv4sf:
10605 	case CODE_FOR_sse4a_vmmovntv4sf:
10606 	case CODE_FOR_avx_movntv4df:
10607 	case CODE_FOR_sse2_movntv2df:
10608 	case CODE_FOR_sse4a_vmmovntv2df:
10609 	case CODE_FOR_sse2_movntidi:
10610 	case CODE_FOR_sse_movntq:
10611 	case CODE_FOR_sse2_movntisi:
10612 	case CODE_FOR_avx512f_movntv16sf:
10613 	case CODE_FOR_avx512f_movntv8df:
10614 	case CODE_FOR_avx512f_movntv8di:
10615 	  aligned_mem = true;
10616 	  break;
10617 	default:
10618 	  break;
10619 	}
10620       break;
10621     case VOID_FTYPE_PVOID_PCVOID:
10622       nargs = 1;
10623       klass = store;
10624       memory = 0;
10625       break;
10626 
10627     case V4SF_FTYPE_V4SF_PCV2SF:
10628     case V2DF_FTYPE_V2DF_PCDOUBLE:
10629       nargs = 2;
10630       klass = load;
10631       memory = 1;
10632       break;
10633     case V8SF_FTYPE_PCV8SF_V8SI:
10634     case V4DF_FTYPE_PCV4DF_V4DI:
10635     case V4SF_FTYPE_PCV4SF_V4SI:
10636     case V2DF_FTYPE_PCV2DF_V2DI:
10637     case V8SI_FTYPE_PCV8SI_V8SI:
10638     case V4DI_FTYPE_PCV4DI_V4DI:
10639     case V4SI_FTYPE_PCV4SI_V4SI:
10640     case V2DI_FTYPE_PCV2DI_V2DI:
10641     case VOID_FTYPE_INT_INT64:
10642       nargs = 2;
10643       klass = load;
10644       memory = 0;
10645       break;
10646     case VOID_FTYPE_PV8DF_V8DF_UQI:
10647     case VOID_FTYPE_PV4DF_V4DF_UQI:
10648     case VOID_FTYPE_PV2DF_V2DF_UQI:
10649     case VOID_FTYPE_PV16SF_V16SF_UHI:
10650     case VOID_FTYPE_PV8SF_V8SF_UQI:
10651     case VOID_FTYPE_PV4SF_V4SF_UQI:
10652     case VOID_FTYPE_PV8DI_V8DI_UQI:
10653     case VOID_FTYPE_PV4DI_V4DI_UQI:
10654     case VOID_FTYPE_PV2DI_V2DI_UQI:
10655     case VOID_FTYPE_PV16SI_V16SI_UHI:
10656     case VOID_FTYPE_PV8SI_V8SI_UQI:
10657     case VOID_FTYPE_PV4SI_V4SI_UQI:
10658     case VOID_FTYPE_PV64QI_V64QI_UDI:
10659     case VOID_FTYPE_PV32HI_V32HI_USI:
10660     case VOID_FTYPE_PV32QI_V32QI_USI:
10661     case VOID_FTYPE_PV16QI_V16QI_UHI:
10662     case VOID_FTYPE_PV16HI_V16HI_UHI:
10663     case VOID_FTYPE_PV8HI_V8HI_UQI:
10664       switch (icode)
10665 	{
10666 	/* These builtins and instructions require the memory
10667 	   to be properly aligned.  */
10668 	case CODE_FOR_avx512f_storev16sf_mask:
10669 	case CODE_FOR_avx512f_storev16si_mask:
10670 	case CODE_FOR_avx512f_storev8df_mask:
10671 	case CODE_FOR_avx512f_storev8di_mask:
10672 	case CODE_FOR_avx512vl_storev8sf_mask:
10673 	case CODE_FOR_avx512vl_storev8si_mask:
10674 	case CODE_FOR_avx512vl_storev4df_mask:
10675 	case CODE_FOR_avx512vl_storev4di_mask:
10676 	case CODE_FOR_avx512vl_storev4sf_mask:
10677 	case CODE_FOR_avx512vl_storev4si_mask:
10678 	case CODE_FOR_avx512vl_storev2df_mask:
10679 	case CODE_FOR_avx512vl_storev2di_mask:
10680 	  aligned_mem = true;
10681 	  break;
10682 	default:
10683 	  break;
10684 	}
10685       /* FALLTHRU */
10686     case VOID_FTYPE_PV8SF_V8SI_V8SF:
10687     case VOID_FTYPE_PV4DF_V4DI_V4DF:
10688     case VOID_FTYPE_PV4SF_V4SI_V4SF:
10689     case VOID_FTYPE_PV2DF_V2DI_V2DF:
10690     case VOID_FTYPE_PV8SI_V8SI_V8SI:
10691     case VOID_FTYPE_PV4DI_V4DI_V4DI:
10692     case VOID_FTYPE_PV4SI_V4SI_V4SI:
10693     case VOID_FTYPE_PV2DI_V2DI_V2DI:
10694     case VOID_FTYPE_PV8SI_V8DI_UQI:
10695     case VOID_FTYPE_PV8HI_V8DI_UQI:
10696     case VOID_FTYPE_PV16HI_V16SI_UHI:
10697     case VOID_FTYPE_PUDI_V8DI_UQI:
10698     case VOID_FTYPE_PV16QI_V16SI_UHI:
10699     case VOID_FTYPE_PV4SI_V4DI_UQI:
10700     case VOID_FTYPE_PUDI_V2DI_UQI:
10701     case VOID_FTYPE_PUDI_V4DI_UQI:
10702     case VOID_FTYPE_PUSI_V2DI_UQI:
10703     case VOID_FTYPE_PV8HI_V8SI_UQI:
10704     case VOID_FTYPE_PUDI_V4SI_UQI:
10705     case VOID_FTYPE_PUSI_V4DI_UQI:
10706     case VOID_FTYPE_PUHI_V2DI_UQI:
10707     case VOID_FTYPE_PUDI_V8SI_UQI:
10708     case VOID_FTYPE_PUSI_V4SI_UQI:
10709     case VOID_FTYPE_PCHAR_V64QI_UDI:
10710     case VOID_FTYPE_PCHAR_V32QI_USI:
10711     case VOID_FTYPE_PCHAR_V16QI_UHI:
10712     case VOID_FTYPE_PSHORT_V32HI_USI:
10713     case VOID_FTYPE_PSHORT_V16HI_UHI:
10714     case VOID_FTYPE_PSHORT_V8HI_UQI:
10715     case VOID_FTYPE_PINT_V16SI_UHI:
10716     case VOID_FTYPE_PINT_V8SI_UQI:
10717     case VOID_FTYPE_PINT_V4SI_UQI:
10718     case VOID_FTYPE_PINT64_V8DI_UQI:
10719     case VOID_FTYPE_PINT64_V4DI_UQI:
10720     case VOID_FTYPE_PINT64_V2DI_UQI:
10721     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10722     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10723     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10724     case VOID_FTYPE_PFLOAT_V16SF_UHI:
10725     case VOID_FTYPE_PFLOAT_V8SF_UQI:
10726     case VOID_FTYPE_PFLOAT_V4SF_UQI:
10727     case VOID_FTYPE_PV32QI_V32HI_USI:
10728     case VOID_FTYPE_PV16QI_V16HI_UHI:
10729     case VOID_FTYPE_PUDI_V8HI_UQI:
10730       nargs = 2;
10731       klass = store;
10732       /* Reserve memory operand for target.  */
10733       memory = ARRAY_SIZE (xops);
10734       break;
10735     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10736     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10737     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10738     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10739     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10740     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10741     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10742     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10743     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10744     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10745     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10746     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10747     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10748     case V32HI_FTYPE_PCV32HI_V32HI_USI:
10749     case V32QI_FTYPE_PCV32QI_V32QI_USI:
10750     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10751     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10752     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10753       switch (icode)
10754 	{
10755 	/* These builtins and instructions require the memory
10756 	   to be properly aligned.  */
10757 	case CODE_FOR_avx512f_loadv16sf_mask:
10758 	case CODE_FOR_avx512f_loadv16si_mask:
10759 	case CODE_FOR_avx512f_loadv8df_mask:
10760 	case CODE_FOR_avx512f_loadv8di_mask:
10761 	case CODE_FOR_avx512vl_loadv8sf_mask:
10762 	case CODE_FOR_avx512vl_loadv8si_mask:
10763 	case CODE_FOR_avx512vl_loadv4df_mask:
10764 	case CODE_FOR_avx512vl_loadv4di_mask:
10765 	case CODE_FOR_avx512vl_loadv4sf_mask:
10766 	case CODE_FOR_avx512vl_loadv4si_mask:
10767 	case CODE_FOR_avx512vl_loadv2df_mask:
10768 	case CODE_FOR_avx512vl_loadv2di_mask:
10769 	case CODE_FOR_avx512bw_loadv64qi_mask:
10770 	case CODE_FOR_avx512vl_loadv32qi_mask:
10771 	case CODE_FOR_avx512vl_loadv16qi_mask:
10772 	case CODE_FOR_avx512bw_loadv32hi_mask:
10773 	case CODE_FOR_avx512vl_loadv16hi_mask:
10774 	case CODE_FOR_avx512vl_loadv8hi_mask:
10775 	  aligned_mem = true;
10776 	  break;
10777 	default:
10778 	  break;
10779 	}
10780       /* FALLTHRU */
10781     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10782     case V32QI_FTYPE_PCCHAR_V32QI_USI:
10783     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10784     case V32HI_FTYPE_PCSHORT_V32HI_USI:
10785     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10786     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10787     case V16SI_FTYPE_PCINT_V16SI_UHI:
10788     case V8SI_FTYPE_PCINT_V8SI_UQI:
10789     case V4SI_FTYPE_PCINT_V4SI_UQI:
10790     case V8DI_FTYPE_PCINT64_V8DI_UQI:
10791     case V4DI_FTYPE_PCINT64_V4DI_UQI:
10792     case V2DI_FTYPE_PCINT64_V2DI_UQI:
10793     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10794     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10795     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10796     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10797     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10798     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10799       nargs = 3;
10800       klass = load;
10801       memory = 0;
10802       break;
10803     default:
10804       gcc_unreachable ();
10805     }
10806 
10807   gcc_assert (nargs <= ARRAY_SIZE (xops));
10808 
10809   if (klass == store)
10810     {
10811       arg = CALL_EXPR_ARG (exp, 0);
10812       op = expand_normal (arg);
10813       gcc_assert (target == 0);
10814       if (memory)
10815 	{
10816 	  op = ix86_zero_extend_to_Pmode (op);
10817 	  target = gen_rtx_MEM (tmode, op);
10818 	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10819 	     on it.  Try to improve it using get_pointer_alignment,
10820 	     and if the special builtin is one that requires strict
10821 	     mode alignment, also from its GET_MODE_ALIGNMENT.
10822 	     Failure to do so could lead to ix86_legitimate_combined_insn
10823 	     rejecting all changes to such insns.  */
10824 	  unsigned int align = get_pointer_alignment (arg);
10825 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10826 	    align = GET_MODE_ALIGNMENT (tmode);
10827 	  if (MEM_ALIGN (target) < align)
10828 	    set_mem_align (target, align);
10829 	}
10830       else
10831 	target = force_reg (tmode, op);
10832       arg_adjust = 1;
10833     }
10834   else
10835     {
10836       arg_adjust = 0;
10837       if (optimize
10838 	  || target == 0
10839 	  || !register_operand (target, tmode)
10840 	  || GET_MODE (target) != tmode)
10841 	target = gen_reg_rtx (tmode);
10842     }
10843 
10844   for (i = 0; i < nargs; i++)
10845     {
10846       machine_mode mode = insn_p->operand[i + 1].mode;
10847 
10848       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10849       op = expand_normal (arg);
10850 
10851       if (i == memory)
10852 	{
10853 	  /* This must be the memory operand.  */
10854 	  op = ix86_zero_extend_to_Pmode (op);
10855 	  op = gen_rtx_MEM (mode, op);
10856 	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10857 	     on it.  Try to improve it using get_pointer_alignment,
10858 	     and if the special builtin is one that requires strict
10859 	     mode alignment, also from its GET_MODE_ALIGNMENT.
10860 	     Failure to do so could lead to ix86_legitimate_combined_insn
10861 	     rejecting all changes to such insns.  */
10862 	  unsigned int align = get_pointer_alignment (arg);
10863 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10864 	    align = GET_MODE_ALIGNMENT (mode);
10865 	  if (MEM_ALIGN (op) < align)
10866 	    set_mem_align (op, align);
10867 	}
10868       else
10869 	{
10870 	  /* This must be register.  */
10871 	  if (VECTOR_MODE_P (mode))
10872 	    op = safe_vector_operand (op, mode);
10873 
10874 	  op = fixup_modeless_constant (op, mode);
10875 
10876 	  /* NB: A 3-operand load implies a mask load or v{p}expand*,
10877 	     and the mask operand should be at the end.
10878 	     Keep an all-ones mask, which will be simplified by the expander.  */
10879 	  if (nargs == 3 && i == 2 && klass == load
10880 	      && constm1_operand (op, mode)
10881 	      && insn_p->operand[i].predicate (op, mode))
10882 	    ;
10883 	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10884 	    op = copy_to_mode_reg (mode, op);
10885 	  else
10886 	    {
10887 	      op = copy_to_reg (op);
10888 	      op = lowpart_subreg (mode, op, GET_MODE (op));
10889 	    }
10890 	}
10891 
10892       xops[i] = op;
10893     }
10894 
10895   switch (nargs)
10896     {
10897     case 0:
10898       pat = GEN_FCN (icode) (target);
10899       break;
10900     case 1:
10901       pat = GEN_FCN (icode) (target, xops[0]);
10902       break;
10903     case 2:
10904       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10905       break;
10906     case 3:
10907       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10908       break;
10909     default:
10910       gcc_unreachable ();
10911     }
10912 
10913   if (! pat)
10914     return 0;
10915 
10916   emit_insn (pat);
10917   return klass == store ? 0 : target;
10918 }
10919 
10920 /* Return the integer constant in ARG.  Constrain it to be in the range
10921    of the subparts of VEC_TYPE; issue an error if not.  */
10922 
10923 static int
10924 get_element_number (tree vec_type, tree arg)
10925 {
10926   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10927 
10928   if (!tree_fits_uhwi_p (arg)
10929       || (elt = tree_to_uhwi (arg), elt > max))
10930     {
10931       error ("selector must be an integer constant in the range "
10932 	     "[0, %wi]", max);
10933       return 0;
10934     }
10935 
10936   return elt;
10937 }
10938 
10939 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10940    ix86_expand_vector_init.  We DO have language-level syntax for this, in
10941    the form of  (type){ init-list }.  Except that since we can't place emms
10942    instructions from inside the compiler, we can't allow the use of MMX
10943    registers unless the user explicitly asks for it.  So we do *not* define
10944    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
10945    we have builtins invoked by mmintrin.h that give us license to emit
10946    these sorts of instructions.  */
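/* For instance, _mm_set_pi16 in mmintrin.h is written in terms of
   __builtin_ia32_vec_init_v4hi, which is expanded through
   ix86_expand_vec_init_builtin below.  */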
10947 
10948 static rtx
10949 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10950 {
10951   machine_mode tmode = TYPE_MODE (type);
10952   machine_mode inner_mode = GET_MODE_INNER (tmode);
10953   int i, n_elt = GET_MODE_NUNITS (tmode);
10954   rtvec v = rtvec_alloc (n_elt);
10955 
10956   gcc_assert (VECTOR_MODE_P (tmode));
10957   gcc_assert (call_expr_nargs (exp) == n_elt);
10958 
10959   for (i = 0; i < n_elt; ++i)
10960     {
10961       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10962       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10963     }
10964 
10965   if (!target || !register_operand (target, tmode))
10966     target = gen_reg_rtx (tmode);
10967 
10968   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10969   return target;
10970 }
10971 
10972 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10973    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
10974    had a language-level syntax for referencing vector elements.  */
10975 
10976 static rtx
10977 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10978 {
10979   machine_mode tmode, mode0;
10980   tree arg0, arg1;
10981   int elt;
10982   rtx op0;
10983 
10984   arg0 = CALL_EXPR_ARG (exp, 0);
10985   arg1 = CALL_EXPR_ARG (exp, 1);
10986 
10987   op0 = expand_normal (arg0);
10988   elt = get_element_number (TREE_TYPE (arg0), arg1);
10989 
10990   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10991   mode0 = TYPE_MODE (TREE_TYPE (arg0));
10992   gcc_assert (VECTOR_MODE_P (mode0));
10993 
10994   op0 = force_reg (mode0, op0);
10995 
10996   if (optimize || !target || !register_operand (target, tmode))
10997     target = gen_reg_rtx (tmode);
10998 
10999   ix86_expand_vector_extract (true, target, op0, elt);
11000 
11001   return target;
11002 }
11003 
11004 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
11005    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
11006    a language-level syntax for referencing vector elements.  */
11007 
11008 static rtx
11009 ix86_expand_vec_set_builtin (tree exp)
11010 {
11011   machine_mode tmode, mode1;
11012   tree arg0, arg1, arg2;
11013   int elt;
11014   rtx op0, op1, target;
11015 
11016   arg0 = CALL_EXPR_ARG (exp, 0);
11017   arg1 = CALL_EXPR_ARG (exp, 1);
11018   arg2 = CALL_EXPR_ARG (exp, 2);
11019 
11020   tmode = TYPE_MODE (TREE_TYPE (arg0));
11021   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
11022   gcc_assert (VECTOR_MODE_P (tmode));
11023 
11024   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
11025   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
11026   elt = get_element_number (TREE_TYPE (arg0), arg2);
11027 
11028   if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
11029     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
11030 
11031   op0 = force_reg (tmode, op0);
11032   op1 = force_reg (mode1, op1);
11033 
11034   /* OP0 is the source of these builtin functions and shouldn't be
11035      modified.  Create a copy, use it and return it as target.  */
11036   target = gen_reg_rtx (tmode);
11037   emit_move_insn (target, op0);
11038   ix86_expand_vector_set (true, target, op1, elt);
11039 
11040   return target;
11041 }
11042 
11043 /* Expand an expression EXP that calls a built-in function,
11044    with result going to TARGET if that's convenient
11045    (and in mode MODE if that's convenient).
11046    SUBTARGET may be used as the target for computing one of EXP's operands.
11047    IGNORE is nonzero if the value is to be ignored.  */
11048 
11049 rtx
11050 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
11051 		     machine_mode mode, int ignore)
11052 {
11053   size_t i;
11054   enum insn_code icode, icode2;
11055   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11056   tree arg0, arg1, arg2, arg3, arg4;
11057   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
11058   machine_mode mode0, mode1, mode2, mode3, mode4;
11059   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
11060 
11061   /* For CPU builtins that can be folded, fold first and expand the fold.  */
11062   switch (fcode)
11063     {
11064     case IX86_BUILTIN_CPU_INIT:
11065       {
11066 	/* Make it call __cpu_indicator_init in libgcc. */
11067 	tree call_expr, fndecl, type;
11068         type = build_function_type_list (integer_type_node, NULL_TREE);
11069 	fndecl = build_fn_decl ("__cpu_indicator_init", type);
11070 	call_expr = build_call_expr (fndecl, 0);
11071 	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11072       }
11073     case IX86_BUILTIN_CPU_IS:
11074     case IX86_BUILTIN_CPU_SUPPORTS:
11075       {
11076 	tree arg0 = CALL_EXPR_ARG (exp, 0);
11077 	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11078 	gcc_assert (fold_expr != NULL_TREE);
11079 	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11080       }
11081     }
11082 
11083   HOST_WIDE_INT isa = ix86_isa_flags;
11084   HOST_WIDE_INT isa2 = ix86_isa_flags2;
11085   HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11086   HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11087   /* The general case is we require all the ISAs specified in bisa{,2}
11088      to be enabled.
11089      The exceptions are:
11090      OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11091      OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11092      OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11093      (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
11094        OPTION_MASK_ISA2_AVXVNNI
11095      where for each such pair either ISA alone is sufficient; anything
11096      else ORed into the mask must still be enabled as usual.
11097      OPTION_MASK_ISA_MMX in bisa is also satisfied by TARGET_MMX_WITH_SSE.  */
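  /* For example, a builtin whose mask is
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 can be expanded when
     either -mfma or -mfma4 alone is enabled.  */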
11098   if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11099        == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11100       && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11101     isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11102 
11103   if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11104        == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11105       && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11106     isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11107 
11108   if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11109        == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11110       && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11111     isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11112 
11113   if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11114 	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11115        || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
11116       && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11117 	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11118 	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
11119     {
11120       isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
11121       isa2 |= OPTION_MASK_ISA2_AVXVNNI;
11122     }
11123 
11124   if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
11125       /* __builtin_ia32_maskmovq requires MMX registers.  */
11126       && fcode != IX86_BUILTIN_MASKMOVQ)
11127     {
11128       bisa &= ~OPTION_MASK_ISA_MMX;
11129       bisa |= OPTION_MASK_ISA_SSE2;
11130     }
11131 
11132   if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11133     {
11134       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11135       if (TARGET_ABI_X32)
11136 	bisa |= OPTION_MASK_ABI_X32;
11137       else
11138 	bisa |= OPTION_MASK_ABI_64;
11139       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11140 				       (enum fpmath_unit) 0,
11141 				       (enum prefer_vector_width) 0,
11142 				       false, add_abi_p);
11143       if (!opts)
11144 	error ("%qE needs unknown isa option", fndecl);
11145       else
11146 	{
11147 	  gcc_assert (opts != NULL);
11148 	  error ("%qE needs isa option %s", fndecl, opts);
11149 	  free (opts);
11150 	}
11151       return expand_call (exp, target, ignore);
11152     }
11153 
11154   switch (fcode)
11155     {
11156     case IX86_BUILTIN_MASKMOVQ:
11157     case IX86_BUILTIN_MASKMOVDQU:
11158       icode = (fcode == IX86_BUILTIN_MASKMOVQ
11159 	       ? CODE_FOR_mmx_maskmovq
11160 	       : CODE_FOR_sse2_maskmovdqu);
11161       /* Note the arg order is different from the operand order.  */
11162       arg1 = CALL_EXPR_ARG (exp, 0);
11163       arg2 = CALL_EXPR_ARG (exp, 1);
11164       arg0 = CALL_EXPR_ARG (exp, 2);
11165       op0 = expand_normal (arg0);
11166       op1 = expand_normal (arg1);
11167       op2 = expand_normal (arg2);
11168       mode0 = insn_data[icode].operand[0].mode;
11169       mode1 = insn_data[icode].operand[1].mode;
11170       mode2 = insn_data[icode].operand[2].mode;
11171 
11172       op0 = ix86_zero_extend_to_Pmode (op0);
11173       op0 = gen_rtx_MEM (mode1, op0);
11174 
11175       if (!insn_data[icode].operand[0].predicate (op0, mode0))
11176 	op0 = copy_to_mode_reg (mode0, op0);
11177       if (!insn_data[icode].operand[1].predicate (op1, mode1))
11178 	op1 = copy_to_mode_reg (mode1, op1);
11179       if (!insn_data[icode].operand[2].predicate (op2, mode2))
11180 	op2 = copy_to_mode_reg (mode2, op2);
11181       pat = GEN_FCN (icode) (op0, op1, op2);
11182       if (! pat)
11183 	return 0;
11184       emit_insn (pat);
11185       return 0;
11186 
11187     case IX86_BUILTIN_LDMXCSR:
11188       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11189       target = assign_386_stack_local (SImode, SLOT_TEMP);
11190       emit_move_insn (target, op0);
11191       emit_insn (gen_sse_ldmxcsr (target));
11192       return 0;
11193 
11194     case IX86_BUILTIN_STMXCSR:
11195       target = assign_386_stack_local (SImode, SLOT_TEMP);
11196       emit_insn (gen_sse_stmxcsr (target));
11197       return copy_to_mode_reg (SImode, target);
11198 
11199     case IX86_BUILTIN_CLFLUSH:
11200 	arg0 = CALL_EXPR_ARG (exp, 0);
11201 	op0 = expand_normal (arg0);
11202 	icode = CODE_FOR_sse2_clflush;
11203 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11204 	  op0 = ix86_zero_extend_to_Pmode (op0);
11205 
11206 	emit_insn (gen_sse2_clflush (op0));
11207 	return 0;
11208 
11209     case IX86_BUILTIN_CLWB:
11210 	arg0 = CALL_EXPR_ARG (exp, 0);
11211 	op0 = expand_normal (arg0);
11212 	icode = CODE_FOR_clwb;
11213 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11214 	  op0 = ix86_zero_extend_to_Pmode (op0);
11215 
11216 	emit_insn (gen_clwb (op0));
11217 	return 0;
11218 
11219     case IX86_BUILTIN_CLFLUSHOPT:
11220 	arg0 = CALL_EXPR_ARG (exp, 0);
11221 	op0 = expand_normal (arg0);
11222 	icode = CODE_FOR_clflushopt;
11223 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11224 	  op0 = ix86_zero_extend_to_Pmode (op0);
11225 
11226 	emit_insn (gen_clflushopt (op0));
11227 	return 0;
11228 
11229     case IX86_BUILTIN_MONITOR:
11230     case IX86_BUILTIN_MONITORX:
11231       arg0 = CALL_EXPR_ARG (exp, 0);
11232       arg1 = CALL_EXPR_ARG (exp, 1);
11233       arg2 = CALL_EXPR_ARG (exp, 2);
11234       op0 = expand_normal (arg0);
11235       op1 = expand_normal (arg1);
11236       op2 = expand_normal (arg2);
11237       if (!REG_P (op0))
11238 	op0 = ix86_zero_extend_to_Pmode (op0);
11239       if (!REG_P (op1))
11240 	op1 = copy_to_mode_reg (SImode, op1);
11241       if (!REG_P (op2))
11242 	op2 = copy_to_mode_reg (SImode, op2);
11243 
11244       emit_insn (fcode == IX86_BUILTIN_MONITOR
11245 		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11246 		 : gen_monitorx (Pmode, op0, op1, op2));
11247       return 0;
11248 
11249     case IX86_BUILTIN_MWAIT:
11250       arg0 = CALL_EXPR_ARG (exp, 0);
11251       arg1 = CALL_EXPR_ARG (exp, 1);
11252       op0 = expand_normal (arg0);
11253       op1 = expand_normal (arg1);
11254       if (!REG_P (op0))
11255 	op0 = copy_to_mode_reg (SImode, op0);
11256       if (!REG_P (op1))
11257 	op1 = copy_to_mode_reg (SImode, op1);
11258       emit_insn (gen_sse3_mwait (op0, op1));
11259       return 0;
11260 
11261     case IX86_BUILTIN_MWAITX:
11262       arg0 = CALL_EXPR_ARG (exp, 0);
11263       arg1 = CALL_EXPR_ARG (exp, 1);
11264       arg2 = CALL_EXPR_ARG (exp, 2);
11265       op0 = expand_normal (arg0);
11266       op1 = expand_normal (arg1);
11267       op2 = expand_normal (arg2);
11268       if (!REG_P (op0))
11269 	op0 = copy_to_mode_reg (SImode, op0);
11270       if (!REG_P (op1))
11271 	op1 = copy_to_mode_reg (SImode, op1);
11272       if (!REG_P (op2))
11273 	op2 = copy_to_mode_reg (SImode, op2);
11274       emit_insn (gen_mwaitx (op0, op1, op2));
11275       return 0;
11276 
11277     case IX86_BUILTIN_UMONITOR:
11278       arg0 = CALL_EXPR_ARG (exp, 0);
11279       op0 = expand_normal (arg0);
11280 
11281       op0 = ix86_zero_extend_to_Pmode (op0);
11282       emit_insn (gen_umonitor (Pmode, op0));
11283       return 0;
11284 
11285     case IX86_BUILTIN_UMWAIT:
11286     case IX86_BUILTIN_TPAUSE:
11287       arg0 = CALL_EXPR_ARG (exp, 0);
11288       arg1 = CALL_EXPR_ARG (exp, 1);
11289       op0 = expand_normal (arg0);
11290       op1 = expand_normal (arg1);
11291 
11292       if (!REG_P (op0))
11293 	op0 = copy_to_mode_reg (SImode, op0);
11294 
11295       op1 = force_reg (DImode, op1);
11296 
11297       if (TARGET_64BIT)
11298 	{
11299 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11300 				     NULL, 1, OPTAB_DIRECT);
11301 	  switch (fcode)
11302 	    {
11303 	    case IX86_BUILTIN_UMWAIT:
11304 	      icode = CODE_FOR_umwait_rex64;
11305 	      break;
11306 	    case IX86_BUILTIN_TPAUSE:
11307 	      icode = CODE_FOR_tpause_rex64;
11308 	      break;
11309 	    default:
11310 	      gcc_unreachable ();
11311 	    }
11312 
11313 	  op2 = gen_lowpart (SImode, op2);
11314 	  op1 = gen_lowpart (SImode, op1);
11315 	  pat = GEN_FCN (icode) (op0, op1, op2);
11316 	}
11317       else
11318 	{
11319 	  switch (fcode)
11320 	    {
11321 	    case IX86_BUILTIN_UMWAIT:
11322 	      icode = CODE_FOR_umwait;
11323 	      break;
11324 	    case IX86_BUILTIN_TPAUSE:
11325 	      icode = CODE_FOR_tpause;
11326 	      break;
11327 	    default:
11328 	      gcc_unreachable ();
11329 	    }
11330 	  pat = GEN_FCN (icode) (op0, op1);
11331 	}
11332 
11333       if (!pat)
11334 	return 0;
11335 
11336       emit_insn (pat);
11337 
11338       if (target == 0
11339 	  || !register_operand (target, QImode))
11340 	target = gen_reg_rtx (QImode);
11341 
11342       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11343 			const0_rtx);
11344       emit_insn (gen_rtx_SET (target, pat));
11345 
11346       return target;
11347 
11348     case IX86_BUILTIN_TESTUI:
11349       emit_insn (gen_testui ());
11350 
11351       if (target == 0
11352 	  || !register_operand (target, QImode))
11353 	target = gen_reg_rtx (QImode);
11354 
11355       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11356 			 const0_rtx);
11357       emit_insn (gen_rtx_SET (target, pat));
11358 
11359       return target;
11360 
11361     case IX86_BUILTIN_CLZERO:
11362       arg0 = CALL_EXPR_ARG (exp, 0);
11363       op0 = expand_normal (arg0);
11364       if (!REG_P (op0))
11365 	op0 = ix86_zero_extend_to_Pmode (op0);
11366       emit_insn (gen_clzero (Pmode, op0));
11367       return 0;
11368 
11369     case IX86_BUILTIN_CLDEMOTE:
11370       arg0 = CALL_EXPR_ARG (exp, 0);
11371       op0 = expand_normal (arg0);
11372       icode = CODE_FOR_cldemote;
11373       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11374 	op0 = ix86_zero_extend_to_Pmode (op0);
11375 
11376       emit_insn (gen_cldemote (op0));
11377       return 0;
11378 
11379     case IX86_BUILTIN_LOADIWKEY:
11380       {
11381 	arg0 = CALL_EXPR_ARG (exp, 0);
11382 	arg1 = CALL_EXPR_ARG (exp, 1);
11383 	arg2 = CALL_EXPR_ARG (exp, 2);
11384 	arg3 = CALL_EXPR_ARG (exp, 3);
11385 
11386 	op0 = expand_normal (arg0);
11387 	op1 = expand_normal (arg1);
11388 	op2 = expand_normal (arg2);
11389 	op3 = expand_normal (arg3);
11390 
11391 	if (!REG_P (op0))
11392 	  op0 = copy_to_mode_reg (V2DImode, op0);
11393 	if (!REG_P (op1))
11394 	  op1 = copy_to_mode_reg (V2DImode, op1);
11395 	if (!REG_P (op2))
11396 	  op2 = copy_to_mode_reg (V2DImode, op2);
11397 	if (!REG_P (op3))
11398 	  op3 = copy_to_mode_reg (SImode, op3);
11399 
11400 	emit_insn (gen_loadiwkey (op0, op1, op2, op3));
11401 
11402 	return 0;
11403       }
11404 
11405     case IX86_BUILTIN_AESDEC128KLU8:
11406       icode = CODE_FOR_aesdec128klu8;
11407       goto aesdecenc_expand;
11408 
11409     case IX86_BUILTIN_AESDEC256KLU8:
11410       icode = CODE_FOR_aesdec256klu8;
11411       goto aesdecenc_expand;
11412 
11413     case IX86_BUILTIN_AESENC128KLU8:
11414       icode = CODE_FOR_aesenc128klu8;
11415       goto aesdecenc_expand;
11416 
11417     case IX86_BUILTIN_AESENC256KLU8:
11418       icode = CODE_FOR_aesenc256klu8;
11419 
11420     aesdecenc_expand:
11421 
11422       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
11423       arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
11424       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11425 
11426       op0 = expand_normal (arg0);
11427       op1 = expand_normal (arg1);
11428       op2 = expand_normal (arg2);
11429 
11430       if (!address_operand (op0, V2DImode))
11431 	{
11432 	  op0 = convert_memory_address (Pmode, op0);
11433 	  op0 = copy_addr_to_reg (op0);
11434 	}
11435       op0 = gen_rtx_MEM (V2DImode, op0);
11436 
11437       if (!REG_P (op1))
11438 	op1 = copy_to_mode_reg (V2DImode, op1);
11439 
11440       if (!address_operand (op2, VOIDmode))
11441 	{
11442 	  op2 = convert_memory_address (Pmode, op2);
11443 	  op2 = copy_addr_to_reg (op2);
11444 	}
11445       op2 = gen_rtx_MEM (BLKmode, op2);
11446 
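      /* The AES*KL patterns transform op1 in place using the key handle
	 at op2 and report success or failure in ZF; capture that status
	 in TARGET and then store the result block back through op0.  */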
11447       emit_insn (GEN_FCN (icode) (op1, op1, op2));
11448 
11449       if (target == 0)
11450 	target = gen_reg_rtx (QImode);
11451 
11452       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11453 			const0_rtx);
11454       emit_insn (gen_rtx_SET (target, pat));
11455 
11456       emit_insn (gen_rtx_SET (op0, op1));
11457 
11458       return target;
11459 
11460     case IX86_BUILTIN_AESDECWIDE128KLU8:
11461       icode = CODE_FOR_aesdecwide128klu8;
11462       goto wideaesdecenc_expand;
11463 
11464     case IX86_BUILTIN_AESDECWIDE256KLU8:
11465       icode = CODE_FOR_aesdecwide256klu8;
11466       goto wideaesdecenc_expand;
11467 
11468     case IX86_BUILTIN_AESENCWIDE128KLU8:
11469       icode = CODE_FOR_aesencwide128klu8;
11470       goto wideaesdecenc_expand;
11471 
11472     case IX86_BUILTIN_AESENCWIDE256KLU8:
11473       icode = CODE_FOR_aesencwide256klu8;
11474 
11475     wideaesdecenc_expand:
11476 
11477       rtx xmm_regs[8];
11478       rtx op;
11479 
11480       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
11481       arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
11482       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11483 
11484       op0 = expand_normal (arg0);
11485       op1 = expand_normal (arg1);
11486       op2 = expand_normal (arg2);
11487 
11488       if (!address_operand (op2, VOIDmode))
11489 	{
11490 	  op2 = convert_memory_address (Pmode, op2);
11491 	  op2 = copy_addr_to_reg (op2);
11492 	}
11493       op2 = gen_rtx_MEM (BLKmode, op2);
11494 
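      /* The wide AES*KL patterns operate on xmm0-xmm7 implicitly: load
	 the eight input blocks into those hard registers, emit the insn,
	 capture the ZF status and store the blocks back out below.  */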
11495       for (i = 0; i < 8; i++)
11496 	{
11497 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11498 
11499 	  op = gen_rtx_MEM (V2DImode,
11500 			    plus_constant (Pmode, op1, (i * 16)));
11501 
11502 	  emit_move_insn (xmm_regs[i], op);
11503 	}
11504 
11505       emit_insn (GEN_FCN (icode) (op2));
11506 
11507       if (target == 0)
11508 	target = gen_reg_rtx (QImode);
11509 
11510       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11511 			const0_rtx);
11512       emit_insn (gen_rtx_SET (target, pat));
11513 
11514       for (i = 0; i < 8; i++)
11515 	{
11516 	  op = gen_rtx_MEM (V2DImode,
11517 			    plus_constant (Pmode, op0, (i * 16)));
11518 	  emit_move_insn (op, xmm_regs[i]);
11519 	}
11520 
11521       return target;
11522 
11523     case IX86_BUILTIN_ENCODEKEY128U32:
11524       {
11525 	rtx op, xmm_regs[7];
11526 
11527 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11528 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
11529 	arg2 = CALL_EXPR_ARG (exp, 2); // void *h
11530 
11531 	op0 = expand_normal (arg0);
11532 	op1 = expand_normal (arg1);
11533 	op2 = expand_normal (arg2);
11534 
11535 	if (!REG_P (op0))
11536 	  op0 = copy_to_mode_reg (SImode, op0);
11537 
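	/* The encodekey128 pattern takes the input key from xmm0 and
	   produces its output handle in xmm0-xmm2, so stage the key
	   there and copy the three result registers out to *h below.  */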
11538 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11539 	emit_move_insn (op, op1);
11540 
11541 	for (i = 0; i < 3; i++)
11542 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11543 
11544 	if (target == 0)
11545 	  target = gen_reg_rtx (SImode);
11546 
11547 	emit_insn (gen_encodekey128u32 (target, op0));
11548 
11549 	for (i = 0; i < 3; i++)
11550 	  {
11551 	    op = gen_rtx_MEM (V2DImode,
11552 			      plus_constant (Pmode, op2, (i * 16)));
11553 	    emit_move_insn (op, xmm_regs[i]);
11554 	  }
11555 
11556 	return target;
11557       }
11558     case IX86_BUILTIN_ENCODEKEY256U32:
11559       {
11560 	rtx op, xmm_regs[7];
11561 
11562 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11563 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
11564 	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
11565 	arg3 = CALL_EXPR_ARG (exp, 3); // void *h
11566 
11567 	op0 = expand_normal (arg0);
11568 	op1 = expand_normal (arg1);
11569 	op2 = expand_normal (arg2);
11570 	op3 = expand_normal (arg3);
11571 
11572 	if (!REG_P (op0))
11573 	  op0 = copy_to_mode_reg (SImode, op0);
11574 
11575 	/* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
11576 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11577 	emit_move_insn (op, op1);
11578 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
11579 	emit_move_insn (op, op2);
11580 
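	/* The 64-byte output handle is produced in xmm0-xmm3; record
	   those hard registers so the handle can be copied out to *h
	   after the insn.  */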
11581 	for (i = 0; i < 4; i++)
11582 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11583 
11584 	if (target == 0)
11585 	  target = gen_reg_rtx (SImode);
11586 
11587 	emit_insn (gen_encodekey256u32 (target, op0));
11588 
11589 	for (i = 0; i < 4; i++)
11590 	  {
11591 	    op = gen_rtx_MEM (V2DImode,
11592 			      plus_constant (Pmode, op3, (i * 16)));
11593 	    emit_move_insn (op, xmm_regs[i]);
11594 	  }
11595 
11596 	return target;
11597       }
11598 
11599     case IX86_BUILTIN_VEC_INIT_V2SI:
11600     case IX86_BUILTIN_VEC_INIT_V4HI:
11601     case IX86_BUILTIN_VEC_INIT_V8QI:
11602       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11603 
11604     case IX86_BUILTIN_VEC_EXT_V2DF:
11605     case IX86_BUILTIN_VEC_EXT_V2DI:
11606     case IX86_BUILTIN_VEC_EXT_V4SF:
11607     case IX86_BUILTIN_VEC_EXT_V4SI:
11608     case IX86_BUILTIN_VEC_EXT_V8HI:
11609     case IX86_BUILTIN_VEC_EXT_V2SI:
11610     case IX86_BUILTIN_VEC_EXT_V4HI:
11611     case IX86_BUILTIN_VEC_EXT_V16QI:
11612       return ix86_expand_vec_ext_builtin (exp, target);
11613 
11614     case IX86_BUILTIN_VEC_SET_V2DI:
11615     case IX86_BUILTIN_VEC_SET_V4SF:
11616     case IX86_BUILTIN_VEC_SET_V4SI:
11617     case IX86_BUILTIN_VEC_SET_V8HI:
11618     case IX86_BUILTIN_VEC_SET_V4HI:
11619     case IX86_BUILTIN_VEC_SET_V16QI:
11620       return ix86_expand_vec_set_builtin (exp);
11621 
11622     case IX86_BUILTIN_NANQ:
11623     case IX86_BUILTIN_NANSQ:
11624       return expand_call (exp, target, ignore);
11625 
11626     case IX86_BUILTIN_RDPID:
11627 
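      /* rdpid writes a full word_mode register; in 64-bit mode narrow
	 the DImode result to the SImode value the builtin returns.  */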
11628       op0 = gen_reg_rtx (word_mode);
11629 
11630       if (TARGET_64BIT)
11631 	{
11632 	  insn = gen_rdpid_rex64 (op0);
11633 	  op0 = convert_to_mode (SImode, op0, 1);
11634 	}
11635       else
11636 	insn = gen_rdpid (op0);
11637 
11638       emit_insn (insn);
11639 
11640       if (target == 0
11641 	  || !register_operand (target, SImode))
11642 	target = gen_reg_rtx (SImode);
11643 
11644       emit_move_insn (target, op0);
11645       return target;
11646 
11647     case IX86_BUILTIN_2INTERSECTD512:
11648     case IX86_BUILTIN_2INTERSECTQ512:
11649     case IX86_BUILTIN_2INTERSECTD256:
11650     case IX86_BUILTIN_2INTERSECTQ256:
11651     case IX86_BUILTIN_2INTERSECTD128:
11652     case IX86_BUILTIN_2INTERSECTQ128:
11653       arg0 = CALL_EXPR_ARG (exp, 0);
11654       arg1 = CALL_EXPR_ARG (exp, 1);
11655       arg2 = CALL_EXPR_ARG (exp, 2);
11656       arg3 = CALL_EXPR_ARG (exp, 3);
11657       op0 = expand_normal (arg0);
11658       op1 = expand_normal (arg1);
11659       op2 = expand_normal (arg2);
11660       op3 = expand_normal (arg3);
11661 
11662       if (!address_operand (op0, VOIDmode))
11663 	{
11664 	  op0 = convert_memory_address (Pmode, op0);
11665 	  op0 = copy_addr_to_reg (op0);
11666 	}
11667       if (!address_operand (op1, VOIDmode))
11668 	{
11669 	  op1 = convert_memory_address (Pmode, op1);
11670 	  op1 = copy_addr_to_reg (op1);
11671 	}
11672 
11673       switch (fcode)
11674 	{
11675 	case IX86_BUILTIN_2INTERSECTD512:
11676 	  mode4 = P2HImode;
11677 	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11678 	  break;
11679 	case IX86_BUILTIN_2INTERSECTQ512:
11680 	  mode4 = P2QImode;
11681 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11682 	  break;
11683 	case IX86_BUILTIN_2INTERSECTD256:
11684 	  mode4 = P2QImode;
11685 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11686 	  break;
11687 	case IX86_BUILTIN_2INTERSECTQ256:
11688 	  mode4 = P2QImode;
11689 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11690 	  break;
11691 	case IX86_BUILTIN_2INTERSECTD128:
11692 	  mode4 = P2QImode;
11693 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11694 	  break;
11695 	case IX86_BUILTIN_2INTERSECTQ128:
11696 	  mode4 = P2QImode;
11697 	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11698 	  break;
11699 	default:
11700 	  gcc_unreachable ();
11701 	}
11702 
11703       mode2 = insn_data[icode].operand[1].mode;
11704       mode3 = insn_data[icode].operand[2].mode;
11705       if (!insn_data[icode].operand[1].predicate (op2, mode2))
11706 	op2 = copy_to_mode_reg (mode2, op2);
11707       if (!insn_data[icode].operand[2].predicate (op3, mode3))
11708 	op3 = copy_to_mode_reg (mode3, op3);
11709 
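      /* The vp2intersect patterns return a pair of mask registers as a
	 single P2HI/P2QI value; emit the insn, then store the low mask
	 through the first pointer argument and the high mask through
	 the second.  */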
11710       op4 = gen_reg_rtx (mode4);
11711       emit_insn (GEN_FCN (icode) (op4, op2, op3));
11712       mode0 = mode4 == P2HImode ? HImode : QImode;
11713       emit_move_insn (gen_rtx_MEM (mode0, op0),
11714 		      gen_lowpart (mode0, op4));
11715       emit_move_insn (gen_rtx_MEM (mode0, op1),
11716 		      gen_highpart (mode0, op4));
11717 
11718       return 0;
11719 
11720     case IX86_BUILTIN_RDPMC:
11721     case IX86_BUILTIN_RDTSC:
11722     case IX86_BUILTIN_RDTSCP:
11723     case IX86_BUILTIN_XGETBV:
11724 
11725       op0 = gen_reg_rtx (DImode);
11726       op1 = gen_reg_rtx (DImode);
11727 
11728       if (fcode == IX86_BUILTIN_RDPMC)
11729 	{
11730 	  arg0 = CALL_EXPR_ARG (exp, 0);
11731 	  op2 = expand_normal (arg0);
11732 	  if (!register_operand (op2, SImode))
11733 	    op2 = copy_to_mode_reg (SImode, op2);
11734 
11735 	  insn = (TARGET_64BIT
11736 		  ? gen_rdpmc_rex64 (op0, op1, op2)
11737 		  : gen_rdpmc (op0, op2));
11738 	  emit_insn (insn);
11739 	}
11740       else if (fcode == IX86_BUILTIN_XGETBV)
11741 	{
11742 	  arg0 = CALL_EXPR_ARG (exp, 0);
11743 	  op2 = expand_normal (arg0);
11744 	  if (!register_operand (op2, SImode))
11745 	    op2 = copy_to_mode_reg (SImode, op2);
11746 
11747 	  insn = (TARGET_64BIT
11748 		  ? gen_xgetbv_rex64 (op0, op1, op2)
11749 		  : gen_xgetbv (op0, op2));
11750 	  emit_insn (insn);
11751 	}
11752       else if (fcode == IX86_BUILTIN_RDTSC)
11753 	{
11754 	  insn = (TARGET_64BIT
11755 		  ? gen_rdtsc_rex64 (op0, op1)
11756 		  : gen_rdtsc (op0));
11757 	  emit_insn (insn);
11758 	}
11759       else
11760 	{
11761 	  op2 = gen_reg_rtx (SImode);
11762 
11763 	  insn = (TARGET_64BIT
11764 		  ? gen_rdtscp_rex64 (op0, op1, op2)
11765 		  : gen_rdtscp (op0, op2));
11766 	  emit_insn (insn);
11767 
11768 	  arg0 = CALL_EXPR_ARG (exp, 0);
11769 	  op4 = expand_normal (arg0);
11770 	  if (!address_operand (op4, VOIDmode))
11771 	    {
11772 	      op4 = convert_memory_address (Pmode, op4);
11773 	      op4 = copy_addr_to_reg (op4);
11774 	    }
11775 	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11776 	}
11777 
11778       if (target == 0
11779 	  || !register_operand (target, DImode))
11780         target = gen_reg_rtx (DImode);
11781 
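      /* In 64-bit mode the two 32-bit result halves arrive
	 zero-extended in op0 and op1; combine them into the DImode
	 return value as op0 | (op1 << 32).  */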
11782       if (TARGET_64BIT)
11783 	{
11784 	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11785 				     op1, 1, OPTAB_DIRECT);
11786 	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
11787 				     op0, 1, OPTAB_DIRECT);
11788 	}
11789 
11790       emit_move_insn (target, op0);
11791       return target;
11792 
11793     case IX86_BUILTIN_ENQCMD:
11794     case IX86_BUILTIN_ENQCMDS:
11795     case IX86_BUILTIN_MOVDIR64B:
11796 
11797       arg0 = CALL_EXPR_ARG (exp, 0);
11798       arg1 = CALL_EXPR_ARG (exp, 1);
11799       op0 = expand_normal (arg0);
11800       op1 = expand_normal (arg1);
11801 
11802       op0 = ix86_zero_extend_to_Pmode (op0);
11803       if (!address_operand (op1, VOIDmode))
11804       {
11805 	op1 = convert_memory_address (Pmode, op1);
11806 	op1 = copy_addr_to_reg (op1);
11807       }
11808       op1 = gen_rtx_MEM (XImode, op1);
11809 
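      /* movdir64b just moves the 64-byte payload; enqcmd/enqcmds also
	 report a status in ZF, which is copied into the low byte of
	 TARGET below.  */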
11810       if (fcode == IX86_BUILTIN_MOVDIR64B)
11811 	{
11812 	  emit_insn (gen_movdir64b (Pmode, op0, op1));
11813 	  return 0;
11814 	}
11815       else
11816 	{
11817 	  if (target == 0
11818 	      || !register_operand (target, SImode))
11819 	    target = gen_reg_rtx (SImode);
11820 
11821 	  emit_move_insn (target, const0_rtx);
11822 	  target = gen_rtx_SUBREG (QImode, target, 0);
11823 
11824 	  int unspecv = (fcode == IX86_BUILTIN_ENQCMD
11825 			 ? UNSPECV_ENQCMD
11826 			 : UNSPECV_ENQCMDS);
11827 	  icode = code_for_enqcmd (unspecv, Pmode);
11828 	  emit_insn (GEN_FCN (icode) (op0, op1));
11829 
11830 	  emit_insn
11831 	    (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11832 			  gen_rtx_fmt_ee (EQ, QImode,
11833 					  gen_rtx_REG (CCZmode, FLAGS_REG),
11834 					  const0_rtx)));
11835 	  return SUBREG_REG (target);
11836 	}
11837 
11838     case IX86_BUILTIN_FXSAVE:
11839     case IX86_BUILTIN_FXRSTOR:
11840     case IX86_BUILTIN_FXSAVE64:
11841     case IX86_BUILTIN_FXRSTOR64:
11842     case IX86_BUILTIN_FNSTENV:
11843     case IX86_BUILTIN_FLDENV:
11844       mode0 = BLKmode;
11845       switch (fcode)
11846 	{
11847 	case IX86_BUILTIN_FXSAVE:
11848 	  icode = CODE_FOR_fxsave;
11849 	  break;
11850 	case IX86_BUILTIN_FXRSTOR:
11851 	  icode = CODE_FOR_fxrstor;
11852 	  break;
11853 	case IX86_BUILTIN_FXSAVE64:
11854 	  icode = CODE_FOR_fxsave64;
11855 	  break;
11856 	case IX86_BUILTIN_FXRSTOR64:
11857 	  icode = CODE_FOR_fxrstor64;
11858 	  break;
11859 	case IX86_BUILTIN_FNSTENV:
11860 	  icode = CODE_FOR_fnstenv;
11861 	  break;
11862 	case IX86_BUILTIN_FLDENV:
11863 	  icode = CODE_FOR_fldenv;
11864 	  break;
11865 	default:
11866 	  gcc_unreachable ();
11867 	}
11868 
11869       arg0 = CALL_EXPR_ARG (exp, 0);
11870       op0 = expand_normal (arg0);
11871 
11872       if (!address_operand (op0, VOIDmode))
11873 	{
11874 	  op0 = convert_memory_address (Pmode, op0);
11875 	  op0 = copy_addr_to_reg (op0);
11876 	}
11877       op0 = gen_rtx_MEM (mode0, op0);
11878 
11879       pat = GEN_FCN (icode) (op0);
11880       if (pat)
11881 	emit_insn (pat);
11882       return 0;
11883 
11884     case IX86_BUILTIN_XSETBV:
11885       arg0 = CALL_EXPR_ARG (exp, 0);
11886       arg1 = CALL_EXPR_ARG (exp, 1);
11887       op0 = expand_normal (arg0);
11888       op1 = expand_normal (arg1);
11889 
11890       if (!REG_P (op0))
11891 	op0 = copy_to_mode_reg (SImode, op0);
11892 
11893       op1 = force_reg (DImode, op1);
11894 
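      /* The 64-bit xsetbv pattern takes the value as two SImode halves,
	 so split op1 with a logical shift before emitting it.  */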
11895       if (TARGET_64BIT)
11896 	{
11897 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11898 				     NULL, 1, OPTAB_DIRECT);
11899 
11900 	  icode = CODE_FOR_xsetbv_rex64;
11901 
11902 	  op2 = gen_lowpart (SImode, op2);
11903 	  op1 = gen_lowpart (SImode, op1);
11904 	  pat = GEN_FCN (icode) (op0, op1, op2);
11905 	}
11906       else
11907 	{
11908 	  icode = CODE_FOR_xsetbv;
11909 
11910 	  pat = GEN_FCN (icode) (op0, op1);
11911 	}
11912       if (pat)
11913 	emit_insn (pat);
11914       return 0;
11915 
11916     case IX86_BUILTIN_XSAVE:
11917     case IX86_BUILTIN_XRSTOR:
11918     case IX86_BUILTIN_XSAVE64:
11919     case IX86_BUILTIN_XRSTOR64:
11920     case IX86_BUILTIN_XSAVEOPT:
11921     case IX86_BUILTIN_XSAVEOPT64:
11922     case IX86_BUILTIN_XSAVES:
11923     case IX86_BUILTIN_XRSTORS:
11924     case IX86_BUILTIN_XSAVES64:
11925     case IX86_BUILTIN_XRSTORS64:
11926     case IX86_BUILTIN_XSAVEC:
11927     case IX86_BUILTIN_XSAVEC64:
11928       arg0 = CALL_EXPR_ARG (exp, 0);
11929       arg1 = CALL_EXPR_ARG (exp, 1);
11930       op0 = expand_normal (arg0);
11931       op1 = expand_normal (arg1);
11932 
11933       if (!address_operand (op0, VOIDmode))
11934 	{
11935 	  op0 = convert_memory_address (Pmode, op0);
11936 	  op0 = copy_addr_to_reg (op0);
11937 	}
11938       op0 = gen_rtx_MEM (BLKmode, op0);
11939 
11940       op1 = force_reg (DImode, op1);
11941 
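      /* The xsave/xrstor family patterns take the 64-bit feature mask
	 as two SImode halves in 64-bit mode, so split op1 before
	 picking the 64-bit variant of each pattern.  */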
11942       if (TARGET_64BIT)
11943 	{
11944 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11945 				     NULL, 1, OPTAB_DIRECT);
11946 	  switch (fcode)
11947 	    {
11948 	    case IX86_BUILTIN_XSAVE:
11949 	      icode = CODE_FOR_xsave_rex64;
11950 	      break;
11951 	    case IX86_BUILTIN_XRSTOR:
11952 	      icode = CODE_FOR_xrstor_rex64;
11953 	      break;
11954 	    case IX86_BUILTIN_XSAVE64:
11955 	      icode = CODE_FOR_xsave64;
11956 	      break;
11957 	    case IX86_BUILTIN_XRSTOR64:
11958 	      icode = CODE_FOR_xrstor64;
11959 	      break;
11960 	    case IX86_BUILTIN_XSAVEOPT:
11961 	      icode = CODE_FOR_xsaveopt_rex64;
11962 	      break;
11963 	    case IX86_BUILTIN_XSAVEOPT64:
11964 	      icode = CODE_FOR_xsaveopt64;
11965 	      break;
11966 	    case IX86_BUILTIN_XSAVES:
11967 	      icode = CODE_FOR_xsaves_rex64;
11968 	      break;
11969 	    case IX86_BUILTIN_XRSTORS:
11970 	      icode = CODE_FOR_xrstors_rex64;
11971 	      break;
11972 	    case IX86_BUILTIN_XSAVES64:
11973 	      icode = CODE_FOR_xsaves64;
11974 	      break;
11975 	    case IX86_BUILTIN_XRSTORS64:
11976 	      icode = CODE_FOR_xrstors64;
11977 	      break;
11978 	    case IX86_BUILTIN_XSAVEC:
11979 	      icode = CODE_FOR_xsavec_rex64;
11980 	      break;
11981 	    case IX86_BUILTIN_XSAVEC64:
11982 	      icode = CODE_FOR_xsavec64;
11983 	      break;
11984 	    default:
11985 	      gcc_unreachable ();
11986 	    }
11987 
11988 	  op2 = gen_lowpart (SImode, op2);
11989 	  op1 = gen_lowpart (SImode, op1);
11990 	  pat = GEN_FCN (icode) (op0, op1, op2);
11991 	}
11992       else
11993 	{
11994 	  switch (fcode)
11995 	    {
11996 	    case IX86_BUILTIN_XSAVE:
11997 	      icode = CODE_FOR_xsave;
11998 	      break;
11999 	    case IX86_BUILTIN_XRSTOR:
12000 	      icode = CODE_FOR_xrstor;
12001 	      break;
12002 	    case IX86_BUILTIN_XSAVEOPT:
12003 	      icode = CODE_FOR_xsaveopt;
12004 	      break;
12005 	    case IX86_BUILTIN_XSAVES:
12006 	      icode = CODE_FOR_xsaves;
12007 	      break;
12008 	    case IX86_BUILTIN_XRSTORS:
12009 	      icode = CODE_FOR_xrstors;
12010 	      break;
12011 	    case IX86_BUILTIN_XSAVEC:
12012 	      icode = CODE_FOR_xsavec;
12013 	      break;
12014 	    default:
12015 	      gcc_unreachable ();
12016 	    }
12017 	  pat = GEN_FCN (icode) (op0, op1);
12018 	}
12019 
12020       if (pat)
12021 	emit_insn (pat);
12022       return 0;
12023 
12024     case IX86_BUILTIN_LLWPCB:
12025       arg0 = CALL_EXPR_ARG (exp, 0);
12026       op0 = expand_normal (arg0);
12027 
12028       if (!register_operand (op0, Pmode))
12029 	op0 = ix86_zero_extend_to_Pmode (op0);
12030       emit_insn (gen_lwp_llwpcb (Pmode, op0));
12031       return 0;
12032 
12033     case IX86_BUILTIN_SLWPCB:
12034       if (!target
12035 	  || !register_operand (target, Pmode))
12036 	target = gen_reg_rtx (Pmode);
12037       emit_insn (gen_lwp_slwpcb (Pmode, target));
12038       return target;
12039 
12040     case IX86_BUILTIN_LWPVAL32:
12041     case IX86_BUILTIN_LWPVAL64:
12042     case IX86_BUILTIN_LWPINS32:
12043     case IX86_BUILTIN_LWPINS64:
12044       mode = ((fcode == IX86_BUILTIN_LWPVAL32
12045 	       || fcode == IX86_BUILTIN_LWPINS32)
12046 	      ? SImode : DImode);
12047 
12048       if (fcode == IX86_BUILTIN_LWPVAL32
12049 	  || fcode == IX86_BUILTIN_LWPVAL64)
12050 	icode = code_for_lwp_lwpval (mode);
12051       else
12052 	icode = code_for_lwp_lwpins (mode);
12053 
12054       arg0 = CALL_EXPR_ARG (exp, 0);
12055       arg1 = CALL_EXPR_ARG (exp, 1);
12056       arg2 = CALL_EXPR_ARG (exp, 2);
12057       op0 = expand_normal (arg0);
12058       op1 = expand_normal (arg1);
12059       op2 = expand_normal (arg2);
12060       mode0 = insn_data[icode].operand[0].mode;
12061 
12062       if (!insn_data[icode].operand[0].predicate (op0, mode0))
12063 	op0 = copy_to_mode_reg (mode0, op0);
12064       if (!insn_data[icode].operand[1].predicate (op1, SImode))
12065 	op1 = copy_to_mode_reg (SImode, op1);
12066 
12067       if (!CONST_INT_P (op2))
12068 	{
12069 	  error ("the last argument must be a 32-bit immediate");
12070 	  return const0_rtx;
12071 	}
12072 
12073       emit_insn (GEN_FCN (icode) (op0, op1, op2));
12074 
12075       if (fcode == IX86_BUILTIN_LWPINS32
12076 	  || fcode == IX86_BUILTIN_LWPINS64)
12077 	{
12078 	  if (target == 0
12079 	      || !nonimmediate_operand (target, QImode))
12080 	    target = gen_reg_rtx (QImode);
12081 
12082 	  pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12083 			    const0_rtx);
12084 	  emit_insn (gen_rtx_SET (target, pat));
12085 
12086 	  return target;
12087 	}
12088       else
12089 	return 0;
12090 
12091     case IX86_BUILTIN_BEXTRI32:
12092     case IX86_BUILTIN_BEXTRI64:
12093       mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
12094 
12095       arg0 = CALL_EXPR_ARG (exp, 0);
12096       arg1 = CALL_EXPR_ARG (exp, 1);
12097       op0 = expand_normal (arg0);
12098       op1 = expand_normal (arg1);
12099 
12100       if (!CONST_INT_P (op1))
12101 	{
12102 	  error ("last argument must be an immediate");
12103 	  return const0_rtx;
12104 	}
12105       else
12106 	{
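	  /* The immediate encodes the start bit in its low byte and the
	     field length in bits 8-15.  Out-of-range extractions are
	     folded to zero or clamped below so the tbm_bextri pattern
	     only ever sees a valid bit range.  */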
12107 	  unsigned char lsb_index = UINTVAL (op1);
12108 	  unsigned char length = UINTVAL (op1) >> 8;
12109 
12110 	  unsigned char bitsize = GET_MODE_BITSIZE (mode);
12111 
12112 	  icode = code_for_tbm_bextri (mode);
12113 
12114 	  mode1 = insn_data[icode].operand[1].mode;
12115 	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
12116 	    op0 = copy_to_mode_reg (mode1, op0);
12117 
12118 	  mode0 = insn_data[icode].operand[0].mode;
12119 	  if (target == 0
12120 	      || !register_operand (target, mode0))
12121 	    target = gen_reg_rtx (mode0);
12122 
12123 	  if (length == 0 || lsb_index >= bitsize)
12124 	    {
12125 	      emit_move_insn (target, const0_rtx);
12126 	      return target;
12127 	    }
12128 
12129 	  if (length + lsb_index > bitsize)
12130 	    length = bitsize - lsb_index;
12131 
12132 	  op1 = GEN_INT (length);
12133 	  op2 = GEN_INT (lsb_index);
12134 
12135 	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
12136 	  return target;
12137 	}
12138 
12139     case IX86_BUILTIN_RDRAND16_STEP:
12140       mode = HImode;
12141       goto rdrand_step;
12142 
12143     case IX86_BUILTIN_RDRAND32_STEP:
12144       mode = SImode;
12145       goto rdrand_step;
12146 
12147     case IX86_BUILTIN_RDRAND64_STEP:
12148       mode = DImode;
12149 
12150 rdrand_step:
12151       arg0 = CALL_EXPR_ARG (exp, 0);
12152       op1 = expand_normal (arg0);
12153       if (!address_operand (op1, VOIDmode))
12154 	{
12155 	  op1 = convert_memory_address (Pmode, op1);
12156 	  op1 = copy_addr_to_reg (op1);
12157 	}
12158 
12159       op0 = gen_reg_rtx (mode);
12160       emit_insn (gen_rdrand (mode, op0));
12161 
12162       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12163 
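      /* The builtin returns 1 on success and 0 on failure.  rdrand
	 leaves its destination zero when no random value is available,
	 so the SImode conditional move below selects between the
	 constant 1 and that (zero) result on the carry flag, avoiding a
	 separate setcc.  */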
12164       op1 = force_reg (SImode, const1_rtx);
12165 
12166       /* Emit SImode conditional move.  */
12167       if (mode == HImode)
12168 	{
12169 	  if (TARGET_ZERO_EXTEND_WITH_AND
12170 	      && optimize_function_for_speed_p (cfun))
12171 	    {
12172 	      op2 = force_reg (SImode, const0_rtx);
12173 
12174 	      emit_insn (gen_movstricthi
12175 			 (gen_lowpart (HImode, op2), op0));
12176 	    }
12177 	  else
12178 	    {
12179 	      op2 = gen_reg_rtx (SImode);
12180 
12181 	      emit_insn (gen_zero_extendhisi2 (op2, op0));
12182 	    }
12183 	}
12184       else if (mode == SImode)
12185 	op2 = op0;
12186       else
12187 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
12188 
12189       if (target == 0
12190 	  || !register_operand (target, SImode))
12191 	target = gen_reg_rtx (SImode);
12192 
12193       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
12194 			 const0_rtx);
12195       emit_insn (gen_rtx_SET (target,
12196 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
12197       return target;
12198 
12199     case IX86_BUILTIN_RDSEED16_STEP:
12200       mode = HImode;
12201       goto rdseed_step;
12202 
12203     case IX86_BUILTIN_RDSEED32_STEP:
12204       mode = SImode;
12205       goto rdseed_step;
12206 
12207     case IX86_BUILTIN_RDSEED64_STEP:
12208       mode = DImode;
12209 
12210 rdseed_step:
12211       arg0 = CALL_EXPR_ARG (exp, 0);
12212       op1 = expand_normal (arg0);
12213       if (!address_operand (op1, VOIDmode))
12214 	{
12215 	  op1 = convert_memory_address (Pmode, op1);
12216 	  op1 = copy_addr_to_reg (op1);
12217 	}
12218 
12219       op0 = gen_reg_rtx (mode);
12220       emit_insn (gen_rdseed (mode, op0));
12221 
12222       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12223 
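      /* Unlike the rdrand expansion above, read the carry flag with a
	 setcc into a QImode temporary and zero-extend it into the
	 SImode result.  */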
12224       op2 = gen_reg_rtx (QImode);
12225 
12226       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12227                          const0_rtx);
12228       emit_insn (gen_rtx_SET (op2, pat));
12229 
12230       if (target == 0
12231 	  || !register_operand (target, SImode))
12232         target = gen_reg_rtx (SImode);
12233 
12234       emit_insn (gen_zero_extendqisi2 (target, op2));
12235       return target;
12236 
12237     case IX86_BUILTIN_SBB32:
12238       icode = CODE_FOR_subborrowsi;
12239       icode2 = CODE_FOR_subborrowsi_0;
12240       mode0 = SImode;
12241       mode1 = DImode;
12242       mode2 = CCmode;
12243       goto handlecarry;
12244 
12245     case IX86_BUILTIN_SBB64:
12246       icode = CODE_FOR_subborrowdi;
12247       icode2 = CODE_FOR_subborrowdi_0;
12248       mode0 = DImode;
12249       mode1 = TImode;
12250       mode2 = CCmode;
12251       goto handlecarry;
12252 
12253     case IX86_BUILTIN_ADDCARRYX32:
12254       icode = CODE_FOR_addcarrysi;
12255       icode2 = CODE_FOR_addcarrysi_0;
12256       mode0 = SImode;
12257       mode1 = DImode;
12258       mode2 = CCCmode;
12259       goto handlecarry;
12260 
12261     case IX86_BUILTIN_ADDCARRYX64:
12262       icode = CODE_FOR_addcarrydi;
12263       icode2 = CODE_FOR_addcarrydi_0;
12264       mode0 = DImode;
12265       mode1 = TImode;
12266       mode2 = CCCmode;
12267 
12268     handlecarry:
12269       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
12270       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
12271       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
12272       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
12273 
12274       op1 = expand_normal (arg0);
12275       if (!integer_zerop (arg0))
12276 	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
12277 
12278       op2 = expand_normal (arg1);
12279       if (!register_operand (op2, mode0))
12280 	op2 = copy_to_mode_reg (mode0, op2);
12281 
12282       op3 = expand_normal (arg2);
12283       if (!register_operand (op3, mode0))
12284 	op3 = copy_to_mode_reg (mode0, op3);
12285 
12286       op4 = expand_normal (arg3);
12287       if (!address_operand (op4, VOIDmode))
12288 	{
12289 	  op4 = convert_memory_address (Pmode, op4);
12290 	  op4 = copy_addr_to_reg (op4);
12291 	}
12292 
12293       op0 = gen_reg_rtx (mode0);
12294       if (integer_zerop (arg0))
12295 	{
12296 	  /* If arg0 is 0, optimize right away into an add or sub
12297 	     instruction that sets CCCmode flags.  */
12298 	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
12299 	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
12300 	}
12301       else
12302 	{
12303 	  /* Generate CF from input operand.  */
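	  /* Adding 0xff to the QImode carry-in sets CF exactly when
	     that byte is nonzero.  */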
12304 	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
12305 
12306 	  /* Generate instruction that consumes CF.  */
12307 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
12308 	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
12309 	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
12310 	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
12311 	}
12312 
12313       /* Return current CF value.  */
12314       if (target == 0)
12315         target = gen_reg_rtx (QImode);
12316 
12317       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
12318       emit_insn (gen_rtx_SET (target, pat));
12319 
12320       /* Store the result.  */
12321       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
12322 
12323       return target;
12324 
12325     case IX86_BUILTIN_READ_FLAGS:
12326       emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
12327 
12328       if (optimize
12329 	  || target == NULL_RTX
12330 	  || !nonimmediate_operand (target, word_mode)
12331 	  || GET_MODE (target) != word_mode)
12332 	target = gen_reg_rtx (word_mode);
12333 
12334       emit_insn (gen_pop (target));
12335       return target;
12336 
12337     case IX86_BUILTIN_WRITE_FLAGS:
12338 
12339       arg0 = CALL_EXPR_ARG (exp, 0);
12340       op0 = expand_normal (arg0);
12341       if (!general_no_elim_operand (op0, word_mode))
12342 	op0 = copy_to_mode_reg (word_mode, op0);
12343 
12344       emit_insn (gen_push (op0));
12345       emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
12346       return 0;
12347 
12348     case IX86_BUILTIN_KTESTC8:
12349       icode = CODE_FOR_ktestqi;
12350       mode3 = CCCmode;
12351       goto kortest;
12352 
12353     case IX86_BUILTIN_KTESTZ8:
12354       icode = CODE_FOR_ktestqi;
12355       mode3 = CCZmode;
12356       goto kortest;
12357 
12358     case IX86_BUILTIN_KTESTC16:
12359       icode = CODE_FOR_ktesthi;
12360       mode3 = CCCmode;
12361       goto kortest;
12362 
12363     case IX86_BUILTIN_KTESTZ16:
12364       icode = CODE_FOR_ktesthi;
12365       mode3 = CCZmode;
12366       goto kortest;
12367 
12368     case IX86_BUILTIN_KTESTC32:
12369       icode = CODE_FOR_ktestsi;
12370       mode3 = CCCmode;
12371       goto kortest;
12372 
12373     case IX86_BUILTIN_KTESTZ32:
12374       icode = CODE_FOR_ktestsi;
12375       mode3 = CCZmode;
12376       goto kortest;
12377 
12378     case IX86_BUILTIN_KTESTC64:
12379       icode = CODE_FOR_ktestdi;
12380       mode3 = CCCmode;
12381       goto kortest;
12382 
12383     case IX86_BUILTIN_KTESTZ64:
12384       icode = CODE_FOR_ktestdi;
12385       mode3 = CCZmode;
12386       goto kortest;
12387 
12388     case IX86_BUILTIN_KORTESTC8:
12389       icode = CODE_FOR_kortestqi;
12390       mode3 = CCCmode;
12391       goto kortest;
12392 
12393     case IX86_BUILTIN_KORTESTZ8:
12394       icode = CODE_FOR_kortestqi;
12395       mode3 = CCZmode;
12396       goto kortest;
12397 
12398     case IX86_BUILTIN_KORTESTC16:
12399       icode = CODE_FOR_kortesthi;
12400       mode3 = CCCmode;
12401       goto kortest;
12402 
12403     case IX86_BUILTIN_KORTESTZ16:
12404       icode = CODE_FOR_kortesthi;
12405       mode3 = CCZmode;
12406       goto kortest;
12407 
12408     case IX86_BUILTIN_KORTESTC32:
12409       icode = CODE_FOR_kortestsi;
12410       mode3 = CCCmode;
12411       goto kortest;
12412 
12413     case IX86_BUILTIN_KORTESTZ32:
12414       icode = CODE_FOR_kortestsi;
12415       mode3 = CCZmode;
12416       goto kortest;
12417 
12418     case IX86_BUILTIN_KORTESTC64:
12419       icode = CODE_FOR_kortestdi;
12420       mode3 = CCCmode;
12421       goto kortest;
12422 
12423     case IX86_BUILTIN_KORTESTZ64:
12424       icode = CODE_FOR_kortestdi;
12425       mode3 = CCZmode;
12426 
12427     kortest:
12428       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
12429       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
12430       op0 = expand_normal (arg0);
12431       op1 = expand_normal (arg1);
12432 
12433       mode0 = insn_data[icode].operand[0].mode;
12434       mode1 = insn_data[icode].operand[1].mode;
12435 
12436       if (GET_MODE (op0) != VOIDmode)
12437 	op0 = force_reg (GET_MODE (op0), op0);
12438 
12439       op0 = gen_lowpart (mode0, op0);
12440 
12441       if (!insn_data[icode].operand[0].predicate (op0, mode0))
12442 	op0 = copy_to_mode_reg (mode0, op0);
12443 
12444       if (GET_MODE (op1) != VOIDmode)
12445 	op1 = force_reg (GET_MODE (op1), op1);
12446 
12447       op1 = gen_lowpart (mode1, op1);
12448 
12449       if (!insn_data[icode].operand[1].predicate (op1, mode1))
12450 	op1 = copy_to_mode_reg (mode1, op1);
12451 
12452       target = gen_reg_rtx (QImode);
12453 
12454       /* Emit kortest.  */
12455       emit_insn (GEN_FCN (icode) (op0, op1));
12456       /* And use setcc to return result from flags.  */
12457       ix86_expand_setcc (target, EQ,
12458 			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12459       return target;
12460 
12461     case IX86_BUILTIN_GATHERSIV2DF:
12462       icode = CODE_FOR_avx2_gathersiv2df;
12463       goto gather_gen;
12464     case IX86_BUILTIN_GATHERSIV4DF:
12465       icode = CODE_FOR_avx2_gathersiv4df;
12466       goto gather_gen;
12467     case IX86_BUILTIN_GATHERDIV2DF:
12468       icode = CODE_FOR_avx2_gatherdiv2df;
12469       goto gather_gen;
12470     case IX86_BUILTIN_GATHERDIV4DF:
12471       icode = CODE_FOR_avx2_gatherdiv4df;
12472       goto gather_gen;
12473     case IX86_BUILTIN_GATHERSIV4SF:
12474       icode = CODE_FOR_avx2_gathersiv4sf;
12475       goto gather_gen;
12476     case IX86_BUILTIN_GATHERSIV8SF:
12477       icode = CODE_FOR_avx2_gathersiv8sf;
12478       goto gather_gen;
12479     case IX86_BUILTIN_GATHERDIV4SF:
12480       icode = CODE_FOR_avx2_gatherdiv4sf;
12481       goto gather_gen;
12482     case IX86_BUILTIN_GATHERDIV8SF:
12483       icode = CODE_FOR_avx2_gatherdiv8sf;
12484       goto gather_gen;
12485     case IX86_BUILTIN_GATHERSIV2DI:
12486       icode = CODE_FOR_avx2_gathersiv2di;
12487       goto gather_gen;
12488     case IX86_BUILTIN_GATHERSIV4DI:
12489       icode = CODE_FOR_avx2_gathersiv4di;
12490       goto gather_gen;
12491     case IX86_BUILTIN_GATHERDIV2DI:
12492       icode = CODE_FOR_avx2_gatherdiv2di;
12493       goto gather_gen;
12494     case IX86_BUILTIN_GATHERDIV4DI:
12495       icode = CODE_FOR_avx2_gatherdiv4di;
12496       goto gather_gen;
12497     case IX86_BUILTIN_GATHERSIV4SI:
12498       icode = CODE_FOR_avx2_gathersiv4si;
12499       goto gather_gen;
12500     case IX86_BUILTIN_GATHERSIV8SI:
12501       icode = CODE_FOR_avx2_gathersiv8si;
12502       goto gather_gen;
12503     case IX86_BUILTIN_GATHERDIV4SI:
12504       icode = CODE_FOR_avx2_gatherdiv4si;
12505       goto gather_gen;
12506     case IX86_BUILTIN_GATHERDIV8SI:
12507       icode = CODE_FOR_avx2_gatherdiv8si;
12508       goto gather_gen;
12509     case IX86_BUILTIN_GATHERALTSIV4DF:
12510       icode = CODE_FOR_avx2_gathersiv4df;
12511       goto gather_gen;
12512     case IX86_BUILTIN_GATHERALTDIV8SF:
12513       icode = CODE_FOR_avx2_gatherdiv8sf;
12514       goto gather_gen;
12515     case IX86_BUILTIN_GATHERALTSIV4DI:
12516       icode = CODE_FOR_avx2_gathersiv4di;
12517       goto gather_gen;
12518     case IX86_BUILTIN_GATHERALTDIV8SI:
12519       icode = CODE_FOR_avx2_gatherdiv8si;
12520       goto gather_gen;
12521     case IX86_BUILTIN_GATHER3SIV16SF:
12522       icode = CODE_FOR_avx512f_gathersiv16sf;
12523       goto gather_gen;
12524     case IX86_BUILTIN_GATHER3SIV8DF:
12525       icode = CODE_FOR_avx512f_gathersiv8df;
12526       goto gather_gen;
12527     case IX86_BUILTIN_GATHER3DIV16SF:
12528       icode = CODE_FOR_avx512f_gatherdiv16sf;
12529       goto gather_gen;
12530     case IX86_BUILTIN_GATHER3DIV8DF:
12531       icode = CODE_FOR_avx512f_gatherdiv8df;
12532       goto gather_gen;
12533     case IX86_BUILTIN_GATHER3SIV16SI:
12534       icode = CODE_FOR_avx512f_gathersiv16si;
12535       goto gather_gen;
12536     case IX86_BUILTIN_GATHER3SIV8DI:
12537       icode = CODE_FOR_avx512f_gathersiv8di;
12538       goto gather_gen;
12539     case IX86_BUILTIN_GATHER3DIV16SI:
12540       icode = CODE_FOR_avx512f_gatherdiv16si;
12541       goto gather_gen;
12542     case IX86_BUILTIN_GATHER3DIV8DI:
12543       icode = CODE_FOR_avx512f_gatherdiv8di;
12544       goto gather_gen;
12545     case IX86_BUILTIN_GATHER3ALTSIV8DF:
12546       icode = CODE_FOR_avx512f_gathersiv8df;
12547       goto gather_gen;
12548     case IX86_BUILTIN_GATHER3ALTDIV16SF:
12549       icode = CODE_FOR_avx512f_gatherdiv16sf;
12550       goto gather_gen;
12551     case IX86_BUILTIN_GATHER3ALTSIV8DI:
12552       icode = CODE_FOR_avx512f_gathersiv8di;
12553       goto gather_gen;
12554     case IX86_BUILTIN_GATHER3ALTDIV16SI:
12555       icode = CODE_FOR_avx512f_gatherdiv16si;
12556       goto gather_gen;
12557     case IX86_BUILTIN_GATHER3SIV2DF:
12558       icode = CODE_FOR_avx512vl_gathersiv2df;
12559       goto gather_gen;
12560     case IX86_BUILTIN_GATHER3SIV4DF:
12561       icode = CODE_FOR_avx512vl_gathersiv4df;
12562       goto gather_gen;
12563     case IX86_BUILTIN_GATHER3DIV2DF:
12564       icode = CODE_FOR_avx512vl_gatherdiv2df;
12565       goto gather_gen;
12566     case IX86_BUILTIN_GATHER3DIV4DF:
12567       icode = CODE_FOR_avx512vl_gatherdiv4df;
12568       goto gather_gen;
12569     case IX86_BUILTIN_GATHER3SIV4SF:
12570       icode = CODE_FOR_avx512vl_gathersiv4sf;
12571       goto gather_gen;
12572     case IX86_BUILTIN_GATHER3SIV8SF:
12573       icode = CODE_FOR_avx512vl_gathersiv8sf;
12574       goto gather_gen;
12575     case IX86_BUILTIN_GATHER3DIV4SF:
12576       icode = CODE_FOR_avx512vl_gatherdiv4sf;
12577       goto gather_gen;
12578     case IX86_BUILTIN_GATHER3DIV8SF:
12579       icode = CODE_FOR_avx512vl_gatherdiv8sf;
12580       goto gather_gen;
12581     case IX86_BUILTIN_GATHER3SIV2DI:
12582       icode = CODE_FOR_avx512vl_gathersiv2di;
12583       goto gather_gen;
12584     case IX86_BUILTIN_GATHER3SIV4DI:
12585       icode = CODE_FOR_avx512vl_gathersiv4di;
12586       goto gather_gen;
12587     case IX86_BUILTIN_GATHER3DIV2DI:
12588       icode = CODE_FOR_avx512vl_gatherdiv2di;
12589       goto gather_gen;
12590     case IX86_BUILTIN_GATHER3DIV4DI:
12591       icode = CODE_FOR_avx512vl_gatherdiv4di;
12592       goto gather_gen;
12593     case IX86_BUILTIN_GATHER3SIV4SI:
12594       icode = CODE_FOR_avx512vl_gathersiv4si;
12595       goto gather_gen;
12596     case IX86_BUILTIN_GATHER3SIV8SI:
12597       icode = CODE_FOR_avx512vl_gathersiv8si;
12598       goto gather_gen;
12599     case IX86_BUILTIN_GATHER3DIV4SI:
12600       icode = CODE_FOR_avx512vl_gatherdiv4si;
12601       goto gather_gen;
12602     case IX86_BUILTIN_GATHER3DIV8SI:
12603       icode = CODE_FOR_avx512vl_gatherdiv8si;
12604       goto gather_gen;
12605     case IX86_BUILTIN_GATHER3ALTSIV4DF:
12606       icode = CODE_FOR_avx512vl_gathersiv4df;
12607       goto gather_gen;
12608     case IX86_BUILTIN_GATHER3ALTDIV8SF:
12609       icode = CODE_FOR_avx512vl_gatherdiv8sf;
12610       goto gather_gen;
12611     case IX86_BUILTIN_GATHER3ALTSIV4DI:
12612       icode = CODE_FOR_avx512vl_gathersiv4di;
12613       goto gather_gen;
12614     case IX86_BUILTIN_GATHER3ALTDIV8SI:
12615       icode = CODE_FOR_avx512vl_gatherdiv8si;
12616       goto gather_gen;
12617     case IX86_BUILTIN_SCATTERSIV16SF:
12618       icode = CODE_FOR_avx512f_scattersiv16sf;
12619       goto scatter_gen;
12620     case IX86_BUILTIN_SCATTERSIV8DF:
12621       icode = CODE_FOR_avx512f_scattersiv8df;
12622       goto scatter_gen;
12623     case IX86_BUILTIN_SCATTERDIV16SF:
12624       icode = CODE_FOR_avx512f_scatterdiv16sf;
12625       goto scatter_gen;
12626     case IX86_BUILTIN_SCATTERDIV8DF:
12627       icode = CODE_FOR_avx512f_scatterdiv8df;
12628       goto scatter_gen;
12629     case IX86_BUILTIN_SCATTERSIV16SI:
12630       icode = CODE_FOR_avx512f_scattersiv16si;
12631       goto scatter_gen;
12632     case IX86_BUILTIN_SCATTERSIV8DI:
12633       icode = CODE_FOR_avx512f_scattersiv8di;
12634       goto scatter_gen;
12635     case IX86_BUILTIN_SCATTERDIV16SI:
12636       icode = CODE_FOR_avx512f_scatterdiv16si;
12637       goto scatter_gen;
12638     case IX86_BUILTIN_SCATTERDIV8DI:
12639       icode = CODE_FOR_avx512f_scatterdiv8di;
12640       goto scatter_gen;
12641     case IX86_BUILTIN_SCATTERSIV8SF:
12642       icode = CODE_FOR_avx512vl_scattersiv8sf;
12643       goto scatter_gen;
12644     case IX86_BUILTIN_SCATTERSIV4SF:
12645       icode = CODE_FOR_avx512vl_scattersiv4sf;
12646       goto scatter_gen;
12647     case IX86_BUILTIN_SCATTERSIV4DF:
12648       icode = CODE_FOR_avx512vl_scattersiv4df;
12649       goto scatter_gen;
12650     case IX86_BUILTIN_SCATTERSIV2DF:
12651       icode = CODE_FOR_avx512vl_scattersiv2df;
12652       goto scatter_gen;
12653     case IX86_BUILTIN_SCATTERDIV8SF:
12654       icode = CODE_FOR_avx512vl_scatterdiv8sf;
12655       goto scatter_gen;
12656     case IX86_BUILTIN_SCATTERDIV4SF:
12657       icode = CODE_FOR_avx512vl_scatterdiv4sf;
12658       goto scatter_gen;
12659     case IX86_BUILTIN_SCATTERDIV4DF:
12660       icode = CODE_FOR_avx512vl_scatterdiv4df;
12661       goto scatter_gen;
12662     case IX86_BUILTIN_SCATTERDIV2DF:
12663       icode = CODE_FOR_avx512vl_scatterdiv2df;
12664       goto scatter_gen;
12665     case IX86_BUILTIN_SCATTERSIV8SI:
12666       icode = CODE_FOR_avx512vl_scattersiv8si;
12667       goto scatter_gen;
12668     case IX86_BUILTIN_SCATTERSIV4SI:
12669       icode = CODE_FOR_avx512vl_scattersiv4si;
12670       goto scatter_gen;
12671     case IX86_BUILTIN_SCATTERSIV4DI:
12672       icode = CODE_FOR_avx512vl_scattersiv4di;
12673       goto scatter_gen;
12674     case IX86_BUILTIN_SCATTERSIV2DI:
12675       icode = CODE_FOR_avx512vl_scattersiv2di;
12676       goto scatter_gen;
12677     case IX86_BUILTIN_SCATTERDIV8SI:
12678       icode = CODE_FOR_avx512vl_scatterdiv8si;
12679       goto scatter_gen;
12680     case IX86_BUILTIN_SCATTERDIV4SI:
12681       icode = CODE_FOR_avx512vl_scatterdiv4si;
12682       goto scatter_gen;
12683     case IX86_BUILTIN_SCATTERDIV4DI:
12684       icode = CODE_FOR_avx512vl_scatterdiv4di;
12685       goto scatter_gen;
12686     case IX86_BUILTIN_SCATTERDIV2DI:
12687       icode = CODE_FOR_avx512vl_scatterdiv2di;
12688       goto scatter_gen;
12689     case IX86_BUILTIN_GATHERPFDPD:
12690       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12691       goto vec_prefetch_gen;
12692     case IX86_BUILTIN_SCATTERALTSIV8DF:
12693       icode = CODE_FOR_avx512f_scattersiv8df;
12694       goto scatter_gen;
12695     case IX86_BUILTIN_SCATTERALTDIV16SF:
12696       icode = CODE_FOR_avx512f_scatterdiv16sf;
12697       goto scatter_gen;
12698     case IX86_BUILTIN_SCATTERALTSIV8DI:
12699       icode = CODE_FOR_avx512f_scattersiv8di;
12700       goto scatter_gen;
12701     case IX86_BUILTIN_SCATTERALTDIV16SI:
12702       icode = CODE_FOR_avx512f_scatterdiv16si;
12703       goto scatter_gen;
12704     case IX86_BUILTIN_SCATTERALTSIV4DF:
12705       icode = CODE_FOR_avx512vl_scattersiv4df;
12706       goto scatter_gen;
12707     case IX86_BUILTIN_SCATTERALTDIV8SF:
12708       icode = CODE_FOR_avx512vl_scatterdiv8sf;
12709       goto scatter_gen;
12710     case IX86_BUILTIN_SCATTERALTSIV4DI:
12711       icode = CODE_FOR_avx512vl_scattersiv4di;
12712       goto scatter_gen;
12713     case IX86_BUILTIN_SCATTERALTDIV8SI:
12714       icode = CODE_FOR_avx512vl_scatterdiv8si;
12715       goto scatter_gen;
12716     case IX86_BUILTIN_SCATTERALTSIV2DF:
12717       icode = CODE_FOR_avx512vl_scattersiv2df;
12718       goto scatter_gen;
12719     case IX86_BUILTIN_SCATTERALTDIV4SF:
12720       icode = CODE_FOR_avx512vl_scatterdiv4sf;
12721       goto scatter_gen;
12722     case IX86_BUILTIN_SCATTERALTSIV2DI:
12723       icode = CODE_FOR_avx512vl_scattersiv2di;
12724       goto scatter_gen;
12725     case IX86_BUILTIN_SCATTERALTDIV4SI:
12726       icode = CODE_FOR_avx512vl_scatterdiv4si;
12727       goto scatter_gen;
12728     case IX86_BUILTIN_GATHERPFDPS:
12729       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12730       goto vec_prefetch_gen;
12731     case IX86_BUILTIN_GATHERPFQPD:
12732       icode = CODE_FOR_avx512pf_gatherpfv8didf;
12733       goto vec_prefetch_gen;
12734     case IX86_BUILTIN_GATHERPFQPS:
12735       icode = CODE_FOR_avx512pf_gatherpfv8disf;
12736       goto vec_prefetch_gen;
12737     case IX86_BUILTIN_SCATTERPFDPD:
12738       icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12739       goto vec_prefetch_gen;
12740     case IX86_BUILTIN_SCATTERPFDPS:
12741       icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12742       goto vec_prefetch_gen;
12743     case IX86_BUILTIN_SCATTERPFQPD:
12744       icode = CODE_FOR_avx512pf_scatterpfv8didf;
12745       goto vec_prefetch_gen;
12746     case IX86_BUILTIN_SCATTERPFQPS:
12747       icode = CODE_FOR_avx512pf_scatterpfv8disf;
12748       goto vec_prefetch_gen;
12749 
12750     gather_gen:
12751       rtx half;
12752       rtx (*gen) (rtx, rtx);
12753 
12754       arg0 = CALL_EXPR_ARG (exp, 0);
12755       arg1 = CALL_EXPR_ARG (exp, 1);
12756       arg2 = CALL_EXPR_ARG (exp, 2);
12757       arg3 = CALL_EXPR_ARG (exp, 3);
12758       arg4 = CALL_EXPR_ARG (exp, 4);
12759       op0 = expand_normal (arg0);
12760       op1 = expand_normal (arg1);
12761       op2 = expand_normal (arg2);
12762       op3 = expand_normal (arg3);
12763       op4 = expand_normal (arg4);
12764       /* Note the arg order is different from the operand order.  */
12765       mode0 = insn_data[icode].operand[1].mode;
12766       mode2 = insn_data[icode].operand[3].mode;
12767       mode3 = insn_data[icode].operand[4].mode;
12768       mode4 = insn_data[icode].operand[5].mode;
12769 
12770       if (target == NULL_RTX
12771 	  || GET_MODE (target) != insn_data[icode].operand[0].mode
12772 	  || !insn_data[icode].operand[0].predicate (target,
12773 						     GET_MODE (target)))
12774 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12775       else
12776 	subtarget = target;
12777 
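      /* The *ALT* gather variants pair index and data vectors with
	 different element counts; only the low half of the wider vector
	 participates, so extract it here (and narrow the mask operand
	 to match where needed) before forming the operands.  */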
12778       switch (fcode)
12779 	{
12780 	case IX86_BUILTIN_GATHER3ALTSIV8DF:
12781 	case IX86_BUILTIN_GATHER3ALTSIV8DI:
12782 	  half = gen_reg_rtx (V8SImode);
12783 	  if (!nonimmediate_operand (op2, V16SImode))
12784 	    op2 = copy_to_mode_reg (V16SImode, op2);
12785 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
12786 	  op2 = half;
12787 	  break;
12788 	case IX86_BUILTIN_GATHER3ALTSIV4DF:
12789 	case IX86_BUILTIN_GATHER3ALTSIV4DI:
12790 	case IX86_BUILTIN_GATHERALTSIV4DF:
12791 	case IX86_BUILTIN_GATHERALTSIV4DI:
12792 	  half = gen_reg_rtx (V4SImode);
12793 	  if (!nonimmediate_operand (op2, V8SImode))
12794 	    op2 = copy_to_mode_reg (V8SImode, op2);
12795 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
12796 	  op2 = half;
12797 	  break;
12798 	case IX86_BUILTIN_GATHER3ALTDIV16SF:
12799 	case IX86_BUILTIN_GATHER3ALTDIV16SI:
12800 	  half = gen_reg_rtx (mode0);
12801 	  if (mode0 == V8SFmode)
12802 	    gen = gen_vec_extract_lo_v16sf;
12803 	  else
12804 	    gen = gen_vec_extract_lo_v16si;
12805 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
12806 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12807 	  emit_insn (gen (half, op0));
12808 	  op0 = half;
12809 	  op3 = lowpart_subreg (QImode, op3, HImode);
12810 	  break;
12811 	case IX86_BUILTIN_GATHER3ALTDIV8SF:
12812 	case IX86_BUILTIN_GATHER3ALTDIV8SI:
12813 	case IX86_BUILTIN_GATHERALTDIV8SF:
12814 	case IX86_BUILTIN_GATHERALTDIV8SI:
12815 	  half = gen_reg_rtx (mode0);
12816 	  if (mode0 == V4SFmode)
12817 	    gen = gen_vec_extract_lo_v8sf;
12818 	  else
12819 	    gen = gen_vec_extract_lo_v8si;
12820 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
12821 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12822 	  emit_insn (gen (half, op0));
12823 	  op0 = half;
12824 	  if (VECTOR_MODE_P (GET_MODE (op3)))
12825 	    {
12826 	      half = gen_reg_rtx (mode0);
12827 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
12828 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12829 	      emit_insn (gen (half, op3));
12830 	      op3 = half;
12831 	    }
12832 	  break;
12833 	default:
12834 	  break;
12835 	}
12836 
12837       /* Force the memory operand to use only a base register
12838 	 here.  We don't want to do this to the memory operands of
12839 	 other builtin functions.  */
12840       op1 = ix86_zero_extend_to_Pmode (op1);
12841 
12842       if (!insn_data[icode].operand[1].predicate (op0, mode0))
12843 	op0 = copy_to_mode_reg (mode0, op0);
12844       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12845 	op1 = copy_to_mode_reg (Pmode, op1);
12846       if (!insn_data[icode].operand[3].predicate (op2, mode2))
12847 	op2 = copy_to_mode_reg (mode2, op2);
12848 
12849       op3 = fixup_modeless_constant (op3, mode3);
12850 
12851       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12852 	{
12853 	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
12854 	    op3 = copy_to_mode_reg (mode3, op3);
12855 	}
12856       else
12857 	{
12858 	  op3 = copy_to_reg (op3);
12859 	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12860 	}
12861       if (!insn_data[icode].operand[5].predicate (op4, mode4))
12862 	{
12863           error ("the last argument must be scale 1, 2, 4, 8");
12864           return const0_rtx;
12865 	}
12866 
12867       /* Optimize.  If mask is known to have all high bits set,
12868 	 replace op0 with pc_rtx to signal that the instruction
12869 	 overwrites the whole destination and doesn't use its
12870 	 previous contents.  */
12871       if (optimize)
12872 	{
12873 	  if (TREE_CODE (arg3) == INTEGER_CST)
12874 	    {
12875 	      if (integer_all_onesp (arg3))
12876 		op0 = pc_rtx;
12877 	    }
12878 	  else if (TREE_CODE (arg3) == VECTOR_CST)
12879 	    {
12880 	      unsigned int negative = 0;
12881 	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12882 		{
12883 		  tree cst = VECTOR_CST_ELT (arg3, i);
12884 		  if (TREE_CODE (cst) == INTEGER_CST
12885 		      && tree_int_cst_sign_bit (cst))
12886 		    negative++;
12887 		  else if (TREE_CODE (cst) == REAL_CST
12888 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12889 		    negative++;
12890 		}
12891 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12892 		op0 = pc_rtx;
12893 	    }
12894 	  else if (TREE_CODE (arg3) == SSA_NAME
12895 		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12896 	    {
12897 	      /* Recognize also when mask is like:
12898 		 __v2df src = _mm_setzero_pd ();
12899 		 __v2df mask = _mm_cmpeq_pd (src, src);
12900 		 or
12901 		 __v8sf src = _mm256_setzero_ps ();
12902 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12903 		 as that is a cheaper way to load all ones into
12904 		 a register than having to load a constant from
12905 		 memory.  */
12906 	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12907 	      if (is_gimple_call (def_stmt))
12908 		{
12909 		  tree fndecl = gimple_call_fndecl (def_stmt);
12910 		  if (fndecl
12911 		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12912 		    switch (DECL_MD_FUNCTION_CODE (fndecl))
12913 		      {
12914 		      case IX86_BUILTIN_CMPPD:
12915 		      case IX86_BUILTIN_CMPPS:
12916 		      case IX86_BUILTIN_CMPPD256:
12917 		      case IX86_BUILTIN_CMPPS256:
12918 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12919 			  break;
12920 			/* FALLTHRU */
12921 		      case IX86_BUILTIN_CMPEQPD:
12922 		      case IX86_BUILTIN_CMPEQPS:
12923 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12924 			    && initializer_zerop (gimple_call_arg (def_stmt,
12925 								   1)))
12926 			  op0 = pc_rtx;
12927 			break;
12928 		      default:
12929 			break;
12930 		      }
12931 		}
12932 	    }
12933 	}
12934 
12935       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12936       if (! pat)
12937 	return const0_rtx;
12938       emit_insn (pat);
12939 
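      /* For gathers whose requested result is only half of the
	 pattern's vector mode, copy the low half of SUBTARGET into
	 TARGET.  */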
12940       switch (fcode)
12941 	{
12942 	case IX86_BUILTIN_GATHER3DIV16SF:
12943 	  if (target == NULL_RTX)
12944 	    target = gen_reg_rtx (V8SFmode);
12945 	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12946 	  break;
12947 	case IX86_BUILTIN_GATHER3DIV16SI:
12948 	  if (target == NULL_RTX)
12949 	    target = gen_reg_rtx (V8SImode);
12950 	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12951 	  break;
12952 	case IX86_BUILTIN_GATHER3DIV8SF:
12953 	case IX86_BUILTIN_GATHERDIV8SF:
12954 	  if (target == NULL_RTX)
12955 	    target = gen_reg_rtx (V4SFmode);
12956 	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12957 	  break;
12958 	case IX86_BUILTIN_GATHER3DIV8SI:
12959 	case IX86_BUILTIN_GATHERDIV8SI:
12960 	  if (target == NULL_RTX)
12961 	    target = gen_reg_rtx (V4SImode);
12962 	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12963 	  break;
12964 	default:
12965 	  target = subtarget;
12966 	  break;
12967 	}
12968       return target;
12969 
12970     scatter_gen:
12971       arg0 = CALL_EXPR_ARG (exp, 0);
12972       arg1 = CALL_EXPR_ARG (exp, 1);
12973       arg2 = CALL_EXPR_ARG (exp, 2);
12974       arg3 = CALL_EXPR_ARG (exp, 3);
12975       arg4 = CALL_EXPR_ARG (exp, 4);
12976       op0 = expand_normal (arg0);
12977       op1 = expand_normal (arg1);
12978       op2 = expand_normal (arg2);
12979       op3 = expand_normal (arg3);
12980       op4 = expand_normal (arg4);
12981       mode1 = insn_data[icode].operand[1].mode;
12982       mode2 = insn_data[icode].operand[2].mode;
12983       mode3 = insn_data[icode].operand[3].mode;
12984       mode4 = insn_data[icode].operand[4].mode;
12985 
12986       /* The scatter instruction stores operand op3 to memory with
12987 	 indices from op2 and scale from op4 under writemask op1.
12988 	 If index operand op2 has more elements than source operand
12989 	 op3, only its low half is used, and vice versa.  */
12990       switch (fcode)
12991 	{
12992 	case IX86_BUILTIN_SCATTERALTSIV8DF:
12993 	case IX86_BUILTIN_SCATTERALTSIV8DI:
12994 	  half = gen_reg_rtx (V8SImode);
12995 	  if (!nonimmediate_operand (op2, V16SImode))
12996 	    op2 = copy_to_mode_reg (V16SImode, op2);
12997 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
12998 	  op2 = half;
12999 	  break;
13000 	case IX86_BUILTIN_SCATTERALTDIV16SF:
13001 	case IX86_BUILTIN_SCATTERALTDIV16SI:
13002 	  half = gen_reg_rtx (mode3);
13003 	  if (mode3 == V8SFmode)
13004 	    gen = gen_vec_extract_lo_v16sf;
13005 	  else
13006 	    gen = gen_vec_extract_lo_v16si;
13007 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
13008 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13009 	  emit_insn (gen (half, op3));
13010 	  op3 = half;
13011 	  break;
13012 	case IX86_BUILTIN_SCATTERALTSIV4DF:
13013 	case IX86_BUILTIN_SCATTERALTSIV4DI:
13014 	  half = gen_reg_rtx (V4SImode);
13015 	  if (!nonimmediate_operand (op2, V8SImode))
13016 	    op2 = copy_to_mode_reg (V8SImode, op2);
13017 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
13018 	  op2 = half;
13019 	  break;
13020 	case IX86_BUILTIN_SCATTERALTDIV8SF:
13021 	case IX86_BUILTIN_SCATTERALTDIV8SI:
13022 	  half = gen_reg_rtx (mode3);
13023 	  if (mode3 == V4SFmode)
13024 	    gen = gen_vec_extract_lo_v8sf;
13025 	  else
13026 	    gen = gen_vec_extract_lo_v8si;
13027 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
13028 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13029 	  emit_insn (gen (half, op3));
13030 	  op3 = half;
13031 	  break;
13032 	case IX86_BUILTIN_SCATTERALTSIV2DF:
13033 	case IX86_BUILTIN_SCATTERALTSIV2DI:
13034 	  if (!nonimmediate_operand (op2, V4SImode))
13035 	    op2 = copy_to_mode_reg (V4SImode, op2);
13036 	  break;
13037 	case IX86_BUILTIN_SCATTERALTDIV4SF:
13038 	case IX86_BUILTIN_SCATTERALTDIV4SI:
13039 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
13040 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13041 	  break;
13042 	default:
13043 	  break;
13044 	}
13045 
13046       /* Force the memory operand to use only a base register
13047 	 here.  We don't want to do this to the memory operands of
13048 	 other builtin functions.  */
13049       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
13050 
13051       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13052 	op0 = copy_to_mode_reg (Pmode, op0);
13053 
13054       op1 = fixup_modeless_constant (op1, mode1);
13055 
13056       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
13057 	{
13058 	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
13059 	    op1 = copy_to_mode_reg (mode1, op1);
13060 	}
13061       else
13062 	{
13063 	  op1 = copy_to_reg (op1);
13064 	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
13065 	}
13066 
13067       if (!insn_data[icode].operand[2].predicate (op2, mode2))
13068 	op2 = copy_to_mode_reg (mode2, op2);
13069 
13070       if (!insn_data[icode].operand[3].predicate (op3, mode3))
13071 	op3 = copy_to_mode_reg (mode3, op3);
13072 
13073       if (!insn_data[icode].operand[4].predicate (op4, mode4))
13074 	{
13075 	  error ("the last argument must be scale 1, 2, 4, 8");
13076 	  return const0_rtx;
13077 	}
13078 
13079       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13080       if (! pat)
13081 	return const0_rtx;
13082 
13083       emit_insn (pat);
13084       return 0;
13085 
13086     vec_prefetch_gen:
13087       arg0 = CALL_EXPR_ARG (exp, 0);
13088       arg1 = CALL_EXPR_ARG (exp, 1);
13089       arg2 = CALL_EXPR_ARG (exp, 2);
13090       arg3 = CALL_EXPR_ARG (exp, 3);
13091       arg4 = CALL_EXPR_ARG (exp, 4);
13092       op0 = expand_normal (arg0);
13093       op1 = expand_normal (arg1);
13094       op2 = expand_normal (arg2);
13095       op3 = expand_normal (arg3);
13096       op4 = expand_normal (arg4);
13097       mode0 = insn_data[icode].operand[0].mode;
13098       mode1 = insn_data[icode].operand[1].mode;
13099       mode3 = insn_data[icode].operand[3].mode;
13100       mode4 = insn_data[icode].operand[4].mode;
13101 
13102       op0 = fixup_modeless_constant (op0, mode0);
13103 
13104       if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
13105 	{
13106 	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
13107 	    op0 = copy_to_mode_reg (mode0, op0);
13108 	}
13109       else
13110 	{
13111 	  op0 = copy_to_reg (op0);
13112 	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
13113 	}
13114 
13115       if (!insn_data[icode].operand[1].predicate (op1, mode1))
13116 	op1 = copy_to_mode_reg (mode1, op1);
13117 
13118       /* Force the memory operand into a single base register here.  We
13119 	 don't want to do this for the memory operands of other builtin
13120 	 functions.  */
13121       op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
13122 
13123       if (!insn_data[icode].operand[2].predicate (op2, Pmode))
13124 	op2 = copy_to_mode_reg (Pmode, op2);
13125 
13126       if (!insn_data[icode].operand[3].predicate (op3, mode3))
13127 	{
13128 	  error ("the fourth argument must be scale 1, 2, 4, 8");
13129 	  return const0_rtx;
13130 	}
13131 
13132       if (!insn_data[icode].operand[4].predicate (op4, mode4))
13133 	{
13134 	  error ("incorrect hint operand");
13135 	  return const0_rtx;
13136 	}
13137 
13138       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13139       if (! pat)
13140 	return const0_rtx;
13141 
13142       emit_insn (pat);
13143 
13144       return 0;
13145 
13146     case IX86_BUILTIN_XABORT:
13147       icode = CODE_FOR_xabort;
13148       arg0 = CALL_EXPR_ARG (exp, 0);
13149       op0 = expand_normal (arg0);
13150       mode0 = insn_data[icode].operand[0].mode;
13151       if (!insn_data[icode].operand[0].predicate (op0, mode0))
13152 	{
13153 	  error ("the argument to %<xabort%> intrinsic must "
13154 		 "be an 8-bit immediate");
13155 	  return const0_rtx;
13156 	}
13157       emit_insn (gen_xabort (op0));
13158       return 0;
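      /* This expands the _xabort intrinsic; the predicate check above
	 requires the argument to fold to an 8-bit compile-time constant,
	 so e.g. _xabort (0x42) is accepted while _xabort (status) with a
	 non-constant status is rejected with the error above (illustrative
	 user-level sketch, not code from this file).  */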
13159 
13160     case IX86_BUILTIN_RDSSPD:
13161     case IX86_BUILTIN_RDSSPQ:
13162       mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
13163 
13164       if (target == 0
13165 	  || !register_operand (target, mode))
13166 	target = gen_reg_rtx (mode);
13167 
13168       op0 = force_reg (mode, const0_rtx);
13169 
13170       emit_insn (gen_rdssp (mode, target, op0));
13171       return target;
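      /* The source register is zeroed first because RDSSP is treated as a
	 NOP when shadow stacks are not enabled, so the builtin then returns
	 0 instead of an uninitialized value (matching the documented
	 behaviour of the CET shadow-stack-pointer read intrinsics).  */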
13172 
13173     case IX86_BUILTIN_INCSSPD:
13174     case IX86_BUILTIN_INCSSPQ:
13175       mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
13176 
13177       arg0 = CALL_EXPR_ARG (exp, 0);
13178       op0 = expand_normal (arg0);
13179 
13180       op0 = force_reg (mode, op0);
13181 
13182       emit_insn (gen_incssp (mode, op0));
13183       return 0;
13184 
13185     case IX86_BUILTIN_HRESET:
13186       icode = CODE_FOR_hreset;
13187       arg0 = CALL_EXPR_ARG (exp, 0);
13188       op0 = expand_normal (arg0);
13189       op0 = force_reg (SImode, op0);
13190       emit_insn (gen_hreset (op0));
13191       return 0;
13192 
13193     case IX86_BUILTIN_RSTORSSP:
13194     case IX86_BUILTIN_CLRSSBSY:
13195       arg0 = CALL_EXPR_ARG (exp, 0);
13196       op0 = expand_normal (arg0);
13197       icode = (fcode == IX86_BUILTIN_RSTORSSP
13198 	       ? CODE_FOR_rstorssp
13199 	       : CODE_FOR_clrssbsy);
13200 
13201       if (!address_operand (op0, VOIDmode))
13202 	{
13203 	  op0 = convert_memory_address (Pmode, op0);
13204 	  op0 = copy_addr_to_reg (op0);
13205 	}
13206       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
13207       return 0;
13208 
13209     case IX86_BUILTIN_WRSSD:
13210     case IX86_BUILTIN_WRSSQ:
13211     case IX86_BUILTIN_WRUSSD:
13212     case IX86_BUILTIN_WRUSSQ:
13213       mode = ((fcode == IX86_BUILTIN_WRSSD
13214 	       || fcode == IX86_BUILTIN_WRUSSD)
13215 	      ? SImode : DImode);
13216 
13217       arg0 = CALL_EXPR_ARG (exp, 0);
13218       op0 = expand_normal (arg0);
13219       arg1 = CALL_EXPR_ARG (exp, 1);
13220       op1 = expand_normal (arg1);
13221 
13222       op0 = force_reg (mode, op0);
13223 
13224       if (!address_operand (op1, VOIDmode))
13225 	{
13226 	  op1 = convert_memory_address (Pmode, op1);
13227 	  op1 = copy_addr_to_reg (op1);
13228 	}
13229       op1 = gen_rtx_MEM (mode, op1);
13230 
13231       icode = ((fcode == IX86_BUILTIN_WRSSD
13232 		|| fcode == IX86_BUILTIN_WRSSQ)
13233 	       ? code_for_wrss (mode)
13234 	       : code_for_wruss (mode));
13235       emit_insn (GEN_FCN (icode) (op0, op1));
13236 
13237       return 0;
13238 
13239     case IX86_BUILTIN_VZEROUPPER:
13240       cfun->machine->has_explicit_vzeroupper = true;
13241       break;
13242 
13243     default:
13244       break;
13245     }
13246 
13247   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13248       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
13249     {
13250       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
13251       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
13252 					       target);
13253     }
13254 
13255   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
13256       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
13257     {
13258       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
13259       rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
13260       rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
13261       rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
13262       int masked = 1;
13263       machine_mode mode, wide_mode, nar_mode;
13264 
13265       nar_mode  = V4SFmode;
13266       mode      = V16SFmode;
13267       wide_mode = V64SFmode;
13268       fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
13269       fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
13270 
13271       switch (fcode)
13272 	{
13273 	case IX86_BUILTIN_4FMAPS:
13274 	  fcn = gen_avx5124fmaddps_4fmaddps;
13275 	  masked = 0;
13276 	  goto v4fma_expand;
13277 
13278 	case IX86_BUILTIN_4DPWSSD:
13279 	  nar_mode  = V4SImode;
13280 	  mode      = V16SImode;
13281 	  wide_mode = V64SImode;
13282 	  fcn = gen_avx5124vnniw_vp4dpwssd;
13283 	  masked = 0;
13284 	  goto v4fma_expand;
13285 
13286 	case IX86_BUILTIN_4DPWSSDS:
13287 	  nar_mode  = V4SImode;
13288 	  mode      = V16SImode;
13289 	  wide_mode = V64SImode;
13290 	  fcn = gen_avx5124vnniw_vp4dpwssds;
13291 	  masked = 0;
13292 	  goto v4fma_expand;
13293 
13294 	case IX86_BUILTIN_4FNMAPS:
13295 	  fcn = gen_avx5124fmaddps_4fnmaddps;
13296 	  masked = 0;
13297 	  goto v4fma_expand;
13298 
13299 	case IX86_BUILTIN_4FNMAPS_MASK:
13300 	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
13301 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
13302 	  goto v4fma_expand;
13303 
13304 	case IX86_BUILTIN_4DPWSSD_MASK:
13305 	  nar_mode  = V4SImode;
13306 	  mode      = V16SImode;
13307 	  wide_mode = V64SImode;
13308 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
13309 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
13310 	  goto v4fma_expand;
13311 
13312 	case IX86_BUILTIN_4DPWSSDS_MASK:
13313 	  nar_mode  = V4SImode;
13314 	  mode      = V16SImode;
13315 	  wide_mode = V64SImode;
13316 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
13317 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
13318 	  goto v4fma_expand;
13319 
13320 	case IX86_BUILTIN_4FMAPS_MASK:
13321 	  {
13322 	    tree args[4];
13323 	    rtx ops[4];
13324 	    rtx wide_reg;
13325 	    rtx accum;
13326 	    rtx addr;
13327 	    rtx mem;
13328 
13329 v4fma_expand:
13330 	    wide_reg = gen_reg_rtx (wide_mode);
13331 	    for (i = 0; i < 4; i++)
13332 	      {
13333 		args[i] = CALL_EXPR_ARG (exp, i);
13334 		ops[i] = expand_normal (args[i]);
13335 
13336 		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
13337 				ops[i]);
13338 	      }
13339 
13340 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13341 	    accum = force_reg (mode, accum);
13342 
13343 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13344 	    addr = force_reg (Pmode, addr);
13345 
13346 	    mem = gen_rtx_MEM (nar_mode, addr);
13347 
13348 	    target = gen_reg_rtx (mode);
13349 
13350 	    emit_move_insn (target, accum);
13351 
13352 	    if (! masked)
13353 	      emit_insn (fcn (target, accum, wide_reg, mem));
13354 	    else
13355 	      {
13356 		rtx merge, mask;
13357 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13358 
13359 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13360 
13361 		if (CONST_INT_P (mask))
13362 		  mask = fixup_modeless_constant (mask, HImode);
13363 
13364 		mask = force_reg (HImode, mask);
13365 
13366 		if (GET_MODE (mask) != HImode)
13367 		  mask = gen_rtx_SUBREG (HImode, mask, 0);
13368 
13369 		/* If merge is 0 then we're about to emit z-masked variant.  */
13370 		if (const0_operand (merge, mode))
13371 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13372 		/* If merge is the same as accum then emit merge-masked variant.  */
13373 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13374 		  {
13375 		    merge = force_reg (mode, merge);
13376 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13377 		  }
13378 		/* Merging with something unknown can happen if we z-mask with -O0.  */
13379 		else
13380 		  {
13381 		    target = gen_reg_rtx (mode);
13382 		    emit_move_insn (target, merge);
13383 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13384 		  }
13385 	      }
13386 	    return target;
13387 	  }
13388 
13389 	case IX86_BUILTIN_4FNMASS:
13390 	  fcn = gen_avx5124fmaddps_4fnmaddss;
13391 	  masked = 0;
13392 	  goto s4fma_expand;
13393 
13394 	case IX86_BUILTIN_4FMASS:
13395 	  fcn = gen_avx5124fmaddps_4fmaddss;
13396 	  masked = 0;
13397 	  goto s4fma_expand;
13398 
13399 	case IX86_BUILTIN_4FNMASS_MASK:
13400 	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13401 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13402 	  goto s4fma_expand;
13403 
13404 	case IX86_BUILTIN_4FMASS_MASK:
13405 	  {
13406 	    tree args[4];
13407 	    rtx ops[4];
13408 	    rtx wide_reg;
13409 	    rtx accum;
13410 	    rtx addr;
13411 	    rtx mem;
13412 
13413 	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13414 	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13415 
13416 s4fma_expand:
13417 	    mode = V4SFmode;
13418 	    wide_reg = gen_reg_rtx (V64SFmode);
13419 	    for (i = 0; i < 4; i++)
13420 	      {
13421 		rtx tmp;
13422 		args[i] = CALL_EXPR_ARG (exp, i);
13423 		ops[i] = expand_normal (args[i]);
13424 
13425 		tmp = gen_reg_rtx (SFmode);
13426 		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13427 
13428 		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13429 				gen_rtx_SUBREG (V16SFmode, tmp, 0));
13430 	      }
13431 
13432 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13433 	    accum = force_reg (V4SFmode, accum);
13434 
13435 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13436 	    addr = force_reg (Pmode, addr);
13437 
13438 	    mem = gen_rtx_MEM (V4SFmode, addr);
13439 
13440 	    target = gen_reg_rtx (V4SFmode);
13441 
13442 	    emit_move_insn (target, accum);
13443 
13444 	    if (! masked)
13445 	      emit_insn (fcn (target, accum, wide_reg, mem));
13446 	    else
13447 	      {
13448 		rtx merge, mask;
13449 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13450 
13451 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13452 
13453 		if (CONST_INT_P (mask))
13454 		  mask = fixup_modeless_constant (mask, QImode);
13455 
13456 		mask = force_reg (QImode, mask);
13457 
13458 		if (GET_MODE (mask) != QImode)
13459 		  mask = gen_rtx_SUBREG (QImode, mask, 0);
13460 
13461 		/* If merge is 0 then we're about to emit z-masked variant.  */
13462 		if (const0_operand (merge, mode))
13463 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13464 		/* If merge is the same as accum then emit merge-masked
13465 		   variant.  */
13466 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13467 		  {
13468 		    merge = force_reg (mode, merge);
13469 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13470 		  }
13471 		/* Merging with something unknown can happen if we z-mask
13472 		   with -O0.  */
13473 		else
13474 		  {
13475 		    target = gen_reg_rtx (mode);
13476 		    emit_move_insn (target, merge);
13477 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13478 		  }
13479 	      }
13480 	    return target;
13481 	  }
13482 	case IX86_BUILTIN_RDPID:
13483 	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13484 						   target);
13485 	case IX86_BUILTIN_FABSQ:
13486 	case IX86_BUILTIN_COPYSIGNQ:
13487 	  if (!TARGET_SSE)
13488 	    /* Emit a normal call if SSE isn't available.  */
13489 	    return expand_call (exp, target, ignore);
13490 	  /* FALLTHRU */
13491 	default:
13492 	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13493 	}
13494     }
13495 
13496   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13497       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13498     {
13499       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13500       return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13501     }
13502 
13503   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13504       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13505     {
13506       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13507       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13508     }
13509 
13510   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13511       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13512     {
13513       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13514       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13515     }
13516 
13517   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13518       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13519     {
13520       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13521       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13522     }
13523 
13524   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13525       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13526     {
13527       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13528       const struct builtin_description *d = bdesc_multi_arg + i;
13529       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13530 					    (enum ix86_builtin_func_type)
13531 					    d->flag, d->comparison);
13532     }
13533 
13534   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13535       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13536     {
13537       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13538       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13539 					       target);
13540     }
13541 
13542   gcc_unreachable ();
13543 }
13544 
13545 /* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
13546    fill target with val via vec_duplicate.  */
13547 
13548 static bool
13549 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13550 {
13551   bool ok;
13552   rtx_insn *insn;
13553   rtx dup;
13554 
13555   /* First attempt to recognize VAL as-is.  */
13556   dup = gen_vec_duplicate (mode, val);
13557   insn = emit_insn (gen_rtx_SET (target, dup));
13558   if (recog_memoized (insn) < 0)
13559     {
13560       rtx_insn *seq;
13561       machine_mode innermode = GET_MODE_INNER (mode);
13562       rtx reg;
13563 
13564       /* If that fails, force VAL into a register.  */
13565 
13566       start_sequence ();
13567       reg = force_reg (innermode, val);
13568       if (GET_MODE (reg) != innermode)
13569 	reg = gen_lowpart (innermode, reg);
13570       SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13571       seq = get_insns ();
13572       end_sequence ();
13573       if (seq)
13574 	emit_insn_before (seq, insn);
13575 
13576       ok = recog_memoized (insn) >= 0;
13577       gcc_assert (ok);
13578     }
13579   return true;
13580 }
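
/* For instance, broadcasting an SImode value into V8SImode first tries to
   emit the set below directly (a sketch of the RTL this function builds):

     (set (reg:V8SI target) (vec_duplicate:V8SI (reg:SI val)))

   Only if that SET is not recognized is VAL forced into a register of the
   element mode (taking its lowpart if needed) and the source rewritten
   before re-running recognition.  */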
13581 
13582 /* Get a vector mode of the same size as the original but with elements
13583    twice as wide.  This is only guaranteed to apply to integral vectors.  */
13584 
13585 static machine_mode
13586 get_mode_wider_vector (machine_mode o)
13587 {
13588   /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
13589   machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13590   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13591   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13592   return n;
13593 }
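
/* For example, get_mode_wider_vector is expected to map V16QImode to
   V8HImode and V8HImode to V4SImode: the same vector size with half as
   many, twice-as-wide elements, exactly what the asserts above require.  */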
13594 
13595 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13596 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13597 
13598 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13599    with all elements equal to VAR.  Return true if successful.  */
13600 
13601 static bool
13602 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13603 				   rtx target, rtx val)
13604 {
13605   bool ok;
13606 
13607   switch (mode)
13608     {
13609     case E_V2SImode:
13610     case E_V2SFmode:
13611       if (!mmx_ok)
13612 	return false;
13613       /* FALLTHRU */
13614 
13615     case E_V4DFmode:
13616     case E_V4DImode:
13617     case E_V8SFmode:
13618     case E_V8SImode:
13619     case E_V2DFmode:
13620     case E_V2DImode:
13621     case E_V4SFmode:
13622     case E_V4SImode:
13623     case E_V16SImode:
13624     case E_V8DImode:
13625     case E_V16SFmode:
13626     case E_V8DFmode:
13627       return ix86_vector_duplicate_value (mode, target, val);
13628 
13629     case E_V4HImode:
13630       if (!mmx_ok)
13631 	return false;
13632       if (TARGET_SSE || TARGET_3DNOW_A)
13633 	{
13634 	  rtx x;
13635 
13636 	  val = gen_lowpart (SImode, val);
13637 	  x = gen_rtx_TRUNCATE (HImode, val);
13638 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
13639 	  emit_insn (gen_rtx_SET (target, x));
13640 	  return true;
13641 	}
13642       goto widen;
13643 
13644     case E_V8QImode:
13645       if (!mmx_ok)
13646 	return false;
13647       goto widen;
13648 
13649     case E_V8HImode:
13650       if (TARGET_AVX2)
13651 	return ix86_vector_duplicate_value (mode, target, val);
13652 
13653       if (TARGET_SSE2)
13654 	{
13655 	  struct expand_vec_perm_d dperm;
13656 	  rtx tmp1, tmp2;
13657 
13658 	permute:
13659 	  memset (&dperm, 0, sizeof (dperm));
13660 	  dperm.target = target;
13661 	  dperm.vmode = mode;
13662 	  dperm.nelt = GET_MODE_NUNITS (mode);
13663 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13664 	  dperm.one_operand_p = true;
13665 
13666 	  /* Extend to SImode using a paradoxical SUBREG.  */
13667 	  tmp1 = gen_reg_rtx (SImode);
13668 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
13669 
13670 	  /* Insert the SImode value as low element of a V4SImode vector. */
13671 	  tmp2 = gen_reg_rtx (V4SImode);
13672 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13673 	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13674 
13675 	  ok = (expand_vec_perm_1 (&dperm)
13676 		|| expand_vec_perm_broadcast_1 (&dperm));
13677 	  gcc_assert (ok);
13678 	  return ok;
13679 	}
13680       goto widen;
13681 
13682     case E_V16QImode:
13683       if (TARGET_AVX2)
13684 	return ix86_vector_duplicate_value (mode, target, val);
13685 
13686       if (TARGET_SSE2)
13687 	goto permute;
13688       goto widen;
13689 
13690     widen:
13691       /* Replicate the value once into the next wider mode and recurse.  */
13692       {
13693 	machine_mode smode, wsmode, wvmode;
13694 	rtx x;
13695 
13696 	smode = GET_MODE_INNER (mode);
13697 	wvmode = get_mode_wider_vector (mode);
13698 	wsmode = GET_MODE_INNER (wvmode);
13699 
13700 	val = convert_modes (wsmode, smode, val, true);
13701 	x = expand_simple_binop (wsmode, ASHIFT, val,
13702 				 GEN_INT (GET_MODE_BITSIZE (smode)),
13703 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13704 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13705 
13706 	x = gen_reg_rtx (wvmode);
13707 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13708 	gcc_assert (ok);
13709 	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13710 	return ok;
13711       }
13712 
13713     case E_V16HImode:
13714     case E_V32QImode:
13715       if (TARGET_AVX2)
13716 	return ix86_vector_duplicate_value (mode, target, val);
13717       else
13718 	{
13719 	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13720 	  rtx x = gen_reg_rtx (hvmode);
13721 
13722 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13723 	  gcc_assert (ok);
13724 
13725 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
13726 	  emit_insn (gen_rtx_SET (target, x));
13727 	}
13728       return true;
13729 
13730     case E_V64QImode:
13731     case E_V32HImode:
13732       if (TARGET_AVX512BW)
13733 	return ix86_vector_duplicate_value (mode, target, val);
13734       else
13735 	{
13736 	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13737 	  rtx x = gen_reg_rtx (hvmode);
13738 
13739 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13740 	  gcc_assert (ok);
13741 
13742 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
13743 	  emit_insn (gen_rtx_SET (target, x));
13744 	}
13745       return true;
13746 
13747     default:
13748       return false;
13749     }
13750 }
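
/* Illustration of the "widen" path above: broadcasting the HImode value
   0x1234 without SSE2 first forms the SImode scalar 0x12341234 by shift and
   IOR, recurses to broadcast that into the wider vector mode returned by
   get_mode_wider_vector, and finally reinterprets the result in the
   original mode via gen_lowpart.  */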
13751 
13752 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13753    whose ONE_VAR element is VAR, and other elements are zero.  Return true
13754    if successful.  */
13755 
13756 static bool
13757 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13758 				     rtx target, rtx var, int one_var)
13759 {
13760   machine_mode vsimode;
13761   rtx new_target;
13762   rtx x, tmp;
13763   bool use_vector_set = false;
13764   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13765 
13766   switch (mode)
13767     {
13768     case E_V2DImode:
13769       /* For SSE4.1, we normally use vector set.  But if the second
13770 	 element is zero and inter-unit moves are OK, we use movq
13771 	 instead.  */
13772       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13773 			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
13774 			     && one_var == 0));
13775       break;
13776     case E_V16QImode:
13777     case E_V4SImode:
13778     case E_V4SFmode:
13779       use_vector_set = TARGET_SSE4_1;
13780       break;
13781     case E_V8HImode:
13782       use_vector_set = TARGET_SSE2;
13783       break;
13784     case E_V8QImode:
13785       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13786       break;
13787     case E_V4HImode:
13788       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13789       break;
13790     case E_V32QImode:
13791     case E_V16HImode:
13792       use_vector_set = TARGET_AVX;
13793       break;
13794     case E_V8SImode:
13795       use_vector_set = TARGET_AVX;
13796       gen_vec_set_0 = gen_vec_setv8si_0;
13797       break;
13798     case E_V8SFmode:
13799       use_vector_set = TARGET_AVX;
13800       gen_vec_set_0 = gen_vec_setv8sf_0;
13801       break;
13802     case E_V4DFmode:
13803       use_vector_set = TARGET_AVX;
13804       gen_vec_set_0 = gen_vec_setv4df_0;
13805       break;
13806     case E_V4DImode:
13807       /* Use ix86_expand_vector_set in 64bit mode only.  */
13808       use_vector_set = TARGET_AVX && TARGET_64BIT;
13809       gen_vec_set_0 = gen_vec_setv4di_0;
13810       break;
13811     case E_V16SImode:
13812       use_vector_set = TARGET_AVX512F && one_var == 0;
13813       gen_vec_set_0 = gen_vec_setv16si_0;
13814       break;
13815     case E_V16SFmode:
13816       use_vector_set = TARGET_AVX512F && one_var == 0;
13817       gen_vec_set_0 = gen_vec_setv16sf_0;
13818       break;
13819     case E_V8DFmode:
13820       use_vector_set = TARGET_AVX512F && one_var == 0;
13821       gen_vec_set_0 = gen_vec_setv8df_0;
13822       break;
13823     case E_V8DImode:
13824       /* Use ix86_expand_vector_set in 64bit mode only.  */
13825       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13826       gen_vec_set_0 = gen_vec_setv8di_0;
13827       break;
13828     default:
13829       break;
13830     }
13831 
13832   if (use_vector_set)
13833     {
13834       if (gen_vec_set_0 && one_var == 0)
13835 	{
13836 	  var = force_reg (GET_MODE_INNER (mode), var);
13837 	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13838 	  return true;
13839 	}
13840       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13841       var = force_reg (GET_MODE_INNER (mode), var);
13842       ix86_expand_vector_set (mmx_ok, target, var, one_var);
13843       return true;
13844     }
13845 
13846   switch (mode)
13847     {
13848     case E_V2SFmode:
13849     case E_V2SImode:
13850       if (!mmx_ok)
13851 	return false;
13852       /* FALLTHRU */
13853 
13854     case E_V2DFmode:
13855     case E_V2DImode:
13856       if (one_var != 0)
13857 	return false;
13858       var = force_reg (GET_MODE_INNER (mode), var);
13859       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13860       emit_insn (gen_rtx_SET (target, x));
13861       return true;
13862 
13863     case E_V4SFmode:
13864     case E_V4SImode:
13865       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13866 	new_target = gen_reg_rtx (mode);
13867       else
13868 	new_target = target;
13869       var = force_reg (GET_MODE_INNER (mode), var);
13870       x = gen_rtx_VEC_DUPLICATE (mode, var);
13871       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13872       emit_insn (gen_rtx_SET (new_target, x));
13873       if (one_var != 0)
13874 	{
13875 	  /* We need to shuffle the value to the correct position, so
13876 	     create a new pseudo to store the intermediate result.  */
13877 
13878 	  /* With SSE2, we can use the integer shuffle insns.  */
13879 	  if (mode != V4SFmode && TARGET_SSE2)
13880 	    {
13881 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13882 					    const1_rtx,
13883 					    GEN_INT (one_var == 1 ? 0 : 1),
13884 					    GEN_INT (one_var == 2 ? 0 : 1),
13885 					    GEN_INT (one_var == 3 ? 0 : 1)));
13886 	      if (target != new_target)
13887 		emit_move_insn (target, new_target);
13888 	      return true;
13889 	    }
13890 
13891 	  /* Otherwise convert the intermediate result to V4SFmode and
13892 	     use the SSE1 shuffle instructions.  */
13893 	  if (mode != V4SFmode)
13894 	    {
13895 	      tmp = gen_reg_rtx (V4SFmode);
13896 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13897 	    }
13898 	  else
13899 	    tmp = new_target;
13900 
13901 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13902 				       const1_rtx,
13903 				       GEN_INT (one_var == 1 ? 0 : 1),
13904 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
13905 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13906 
13907 	  if (mode != V4SFmode)
13908 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13909 	  else if (tmp != target)
13910 	    emit_move_insn (target, tmp);
13911 	}
13912       else if (target != new_target)
13913 	emit_move_insn (target, new_target);
13914       return true;
13915 
13916     case E_V8HImode:
13917     case E_V16QImode:
13918       vsimode = V4SImode;
13919       goto widen;
13920     case E_V4HImode:
13921     case E_V8QImode:
13922       if (!mmx_ok)
13923 	return false;
13924       vsimode = V2SImode;
13925       goto widen;
13926     widen:
13927       if (one_var != 0)
13928 	return false;
13929 
13930       /* Zero extend the variable element to SImode and recurse.  */
13931       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13932 
13933       x = gen_reg_rtx (vsimode);
13934       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13935 						var, one_var))
13936 	gcc_unreachable ();
13937 
13938       emit_move_insn (target, gen_lowpart (mode, x));
13939       return true;
13940 
13941     default:
13942       return false;
13943     }
13944 }
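
/* E.g. for V4SFmode without SSE4.1 and ONE_VAR == 0, the code above emits
   roughly (a sketch of the RTL built from VEC_DUPLICATE and VEC_MERGE):

     (set (reg:V4SF new_target)
          (vec_merge:V4SF (vec_duplicate:V4SF (reg:SF var))
                          (const_vector:V4SF [0 0 0 0])
                          (const_int 1)))

   i.e. the scalar-move style pattern, and only shuffles afterwards when the
   nonzero element is not element 0.  */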
13945 
13946 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13947    consisting of the values in VALS.  It is known that all elements
13948    except ONE_VAR are constants.  Return true if successful.  */
13949 
13950 static bool
13951 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13952 				 rtx target, rtx vals, int one_var)
13953 {
13954   rtx var = XVECEXP (vals, 0, one_var);
13955   machine_mode wmode;
13956   rtx const_vec, x;
13957 
13958   const_vec = copy_rtx (vals);
13959   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13960   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13961 
13962   switch (mode)
13963     {
13964     case E_V2DFmode:
13965     case E_V2DImode:
13966     case E_V2SFmode:
13967     case E_V2SImode:
13968       /* For the two element vectors, it's just as easy to use
13969 	 the general case.  */
13970       return false;
13971 
13972     case E_V4DImode:
13973       /* Use ix86_expand_vector_set in 64bit mode only.  */
13974       if (!TARGET_64BIT)
13975 	return false;
13976       /* FALLTHRU */
13977     case E_V4DFmode:
13978     case E_V8SFmode:
13979     case E_V8SImode:
13980     case E_V16HImode:
13981     case E_V32QImode:
13982     case E_V4SFmode:
13983     case E_V4SImode:
13984     case E_V8HImode:
13985     case E_V4HImode:
13986       break;
13987 
13988     case E_V16QImode:
13989       if (TARGET_SSE4_1)
13990 	break;
13991       wmode = V8HImode;
13992       goto widen;
13993     case E_V8QImode:
13994       if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13995 	break;
13996       wmode = V4HImode;
13997       goto widen;
13998     widen:
13999       /* There's no way to set one QImode entry easily.  Combine
14000 	 the variable value with its adjacent constant value, and
14001 	 promote to an HImode set.  */
14002       x = XVECEXP (vals, 0, one_var ^ 1);
14003       if (one_var & 1)
14004 	{
14005 	  var = convert_modes (HImode, QImode, var, true);
14006 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
14007 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
14008 	  x = GEN_INT (INTVAL (x) & 0xff);
14009 	}
14010       else
14011 	{
14012 	  var = convert_modes (HImode, QImode, var, true);
14013 	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
14014 	}
14015       if (x != const0_rtx)
14016 	var = expand_simple_binop (HImode, IOR, var, x, var,
14017 				   1, OPTAB_LIB_WIDEN);
14018 
14019       x = gen_reg_rtx (wmode);
14020       emit_move_insn (x, gen_lowpart (wmode, const_vec));
14021       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
14022 
14023       emit_move_insn (target, gen_lowpart (mode, x));
14024       return true;
14025 
14026     default:
14027       return false;
14028     }
14029 
14030   emit_move_insn (target, const_vec);
14031   ix86_expand_vector_set (mmx_ok, target, var, one_var);
14032   return true;
14033 }
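
/* Example for the QImode "widen" path above: for V16QImode without SSE4.1
   and ONE_VAR == 5, the variable byte is zero-extended to HImode and
   shifted into the high half, the low byte of the adjacent constant
   element 4 is IORed in, and the combined HImode value is inserted at
   position 5 >> 1 == 2 of the V8HImode view of the constant vector.  */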
14034 
14035 /* A subroutine of ix86_expand_vector_init_general.  Use vector
14036    concatenate to handle the most general case: all values variable,
14037    and none identical.  */
14038 
14039 static void
14040 ix86_expand_vector_init_concat (machine_mode mode,
14041 				rtx target, rtx *ops, int n)
14042 {
14043   machine_mode half_mode = VOIDmode;
14044   rtx half[2];
14045   rtvec v;
14046   int i, j;
14047 
14048   switch (n)
14049     {
14050     case 2:
14051       switch (mode)
14052 	{
14053 	case E_V16SImode:
14054 	  half_mode = V8SImode;
14055 	  break;
14056 	case E_V16SFmode:
14057 	  half_mode = V8SFmode;
14058 	  break;
14059 	case E_V8DImode:
14060 	  half_mode = V4DImode;
14061 	  break;
14062 	case E_V8DFmode:
14063 	  half_mode = V4DFmode;
14064 	  break;
14065 	case E_V8SImode:
14066 	  half_mode = V4SImode;
14067 	  break;
14068 	case E_V8SFmode:
14069 	  half_mode = V4SFmode;
14070 	  break;
14071 	case E_V4DImode:
14072 	  half_mode = V2DImode;
14073 	  break;
14074 	case E_V4DFmode:
14075 	  half_mode = V2DFmode;
14076 	  break;
14077 	case E_V4SImode:
14078 	  half_mode = V2SImode;
14079 	  break;
14080 	case E_V4SFmode:
14081 	  half_mode = V2SFmode;
14082 	  break;
14083 	case E_V2DImode:
14084 	  half_mode = DImode;
14085 	  break;
14086 	case E_V2SImode:
14087 	  half_mode = SImode;
14088 	  break;
14089 	case E_V2DFmode:
14090 	  half_mode = DFmode;
14091 	  break;
14092 	case E_V2SFmode:
14093 	  half_mode = SFmode;
14094 	  break;
14095 	default:
14096 	  gcc_unreachable ();
14097 	}
14098 
14099       if (!register_operand (ops[1], half_mode))
14100 	ops[1] = force_reg (half_mode, ops[1]);
14101       if (!register_operand (ops[0], half_mode))
14102 	ops[0] = force_reg (half_mode, ops[0]);
14103       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
14104 							  ops[1])));
14105       break;
14106 
14107     case 4:
14108       switch (mode)
14109 	{
14110 	case E_V4DImode:
14111 	  half_mode = V2DImode;
14112 	  break;
14113 	case E_V4DFmode:
14114 	  half_mode = V2DFmode;
14115 	  break;
14116 	case E_V4SImode:
14117 	  half_mode = V2SImode;
14118 	  break;
14119 	case E_V4SFmode:
14120 	  half_mode = V2SFmode;
14121 	  break;
14122 	default:
14123 	  gcc_unreachable ();
14124 	}
14125       goto half;
14126 
14127     case 8:
14128       switch (mode)
14129 	{
14130 	case E_V8DImode:
14131 	  half_mode = V4DImode;
14132 	  break;
14133 	case E_V8DFmode:
14134 	  half_mode = V4DFmode;
14135 	  break;
14136 	case E_V8SImode:
14137 	  half_mode = V4SImode;
14138 	  break;
14139 	case E_V8SFmode:
14140 	  half_mode = V4SFmode;
14141 	  break;
14142 	default:
14143 	  gcc_unreachable ();
14144 	}
14145       goto half;
14146 
14147     case 16:
14148       switch (mode)
14149 	{
14150 	case E_V16SImode:
14151 	  half_mode = V8SImode;
14152 	  break;
14153 	case E_V16SFmode:
14154 	  half_mode = V8SFmode;
14155 	  break;
14156 	default:
14157 	  gcc_unreachable ();
14158 	}
14159       goto half;
14160 
14161 half:
14162       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
14163       i = n - 1;
14164       for (j = 1; j != -1; j--)
14165 	{
14166 	  half[j] = gen_reg_rtx (half_mode);
14167 	  switch (n >> 1)
14168 	    {
14169 	    case 2:
14170 	      v = gen_rtvec (2, ops[i-1], ops[i]);
14171 	      i -= 2;
14172 	      break;
14173 	    case 4:
14174 	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
14175 	      i -= 4;
14176 	      break;
14177 	    case 8:
14178 	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
14179 			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
14180 	      i -= 8;
14181 	      break;
14182 	    default:
14183 	      gcc_unreachable ();
14184 	    }
14185 	  ix86_expand_vector_init (false, half[j],
14186 				   gen_rtx_PARALLEL (half_mode, v));
14187 	}
14188 
14189       ix86_expand_vector_init_concat (mode, target, half, 2);
14190       break;
14191 
14192     default:
14193       gcc_unreachable ();
14194     }
14195 }
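
/* E.g. a V8SImode build with N == 8 groups the inputs into two PARALLELs of
   four (processed backward, see the FIXME above), initializes a V4SImode
   register from each, and then concatenates the two halves:

     (set (reg:V8SI target) (vec_concat:V8SI (reg:V4SI lo) (reg:V4SI hi)))

   (an illustrative sketch of what this routine emits).  */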
14196 
14197 /* A subroutine of ix86_expand_vector_init_general.  Use vector
14198    interleave to handle the most general case: all values variable,
14199    and none identical.  */
14200 
14201 static void
14202 ix86_expand_vector_init_interleave (machine_mode mode,
14203 				    rtx target, rtx *ops, int n)
14204 {
14205   machine_mode first_imode, second_imode, third_imode, inner_mode;
14206   int i, j;
14207   rtx op0, op1;
14208   rtx (*gen_load_even) (rtx, rtx, rtx);
14209   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
14210   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
14211 
14212   switch (mode)
14213     {
14214     case E_V8HImode:
14215       gen_load_even = gen_vec_setv8hi;
14216       gen_interleave_first_low = gen_vec_interleave_lowv4si;
14217       gen_interleave_second_low = gen_vec_interleave_lowv2di;
14218       inner_mode = HImode;
14219       first_imode = V4SImode;
14220       second_imode = V2DImode;
14221       third_imode = VOIDmode;
14222       break;
14223     case E_V16QImode:
14224       gen_load_even = gen_vec_setv16qi;
14225       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
14226       gen_interleave_second_low = gen_vec_interleave_lowv4si;
14227       inner_mode = QImode;
14228       first_imode = V8HImode;
14229       second_imode = V4SImode;
14230       third_imode = V2DImode;
14231       break;
14232     default:
14233       gcc_unreachable ();
14234     }
14235 
14236   for (i = 0; i < n; i++)
14237     {
14238       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
14239       op0 = gen_reg_rtx (SImode);
14240       emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
14241 
14242       /* Insert the SImode value as low element of V4SImode vector. */
14243       op1 = gen_reg_rtx (V4SImode);
14244       op0 = gen_rtx_VEC_MERGE (V4SImode,
14245 			       gen_rtx_VEC_DUPLICATE (V4SImode,
14246 						      op0),
14247 			       CONST0_RTX (V4SImode),
14248 			       const1_rtx);
14249       emit_insn (gen_rtx_SET (op1, op0));
14250 
14251       /* Cast the V4SImode vector back to a vector in the original mode.  */
14252       op0 = gen_reg_rtx (mode);
14253       emit_move_insn (op0, gen_lowpart (mode, op1));
14254 
14255       /* Load even elements into the second position.  */
14256       emit_insn (gen_load_even (op0,
14257 				force_reg (inner_mode,
14258 					   ops [i + i + 1]),
14259 				const1_rtx));
14260 
14261       /* Cast vector to FIRST_IMODE vector.  */
14262       ops[i] = gen_reg_rtx (first_imode);
14263       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
14264     }
14265 
14266   /* Interleave low FIRST_IMODE vectors.  */
14267   for (i = j = 0; i < n; i += 2, j++)
14268     {
14269       op0 = gen_reg_rtx (first_imode);
14270       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
14271 
14272       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
14273       ops[j] = gen_reg_rtx (second_imode);
14274       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
14275     }
14276 
14277   /* Interleave low SECOND_IMODE vectors.  */
14278   switch (second_imode)
14279     {
14280     case E_V4SImode:
14281       for (i = j = 0; i < n / 2; i += 2, j++)
14282 	{
14283 	  op0 = gen_reg_rtx (second_imode);
14284 	  emit_insn (gen_interleave_second_low (op0, ops[i],
14285 						ops[i + 1]));
14286 
14287 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14288 	     vector.  */
14289 	  ops[j] = gen_reg_rtx (third_imode);
14290 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
14291 	}
14292       second_imode = V2DImode;
14293       gen_interleave_second_low = gen_vec_interleave_lowv2di;
14294       /* FALLTHRU */
14295 
14296     case E_V2DImode:
14297       op0 = gen_reg_rtx (second_imode);
14298       emit_insn (gen_interleave_second_low (op0, ops[0],
14299 					    ops[1]));
14300 
14301       /* Cast the SECOND_IMODE vector back to a vector in the original
14302 	 mode.  */
14303       emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
14304       break;
14305 
14306     default:
14307       gcc_unreachable ();
14308     }
14309 }
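
/* E.g. for V8HImode the first loop above packs element pairs {0,1}, {2,3},
   {4,5}, {6,7} into SImode lane 0 of four vectors (the even-indexed element
   via a scalar move, the odd one via GEN_LOAD_EVEN at position 1), the
   V4SImode interleave-low (roughly punpckldq) then combines these into two
   vectors holding elements 0..3 and 4..7, and the final V2DImode
   interleave-low (roughly punpcklqdq) assembles the full result.  */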
14310 
14311 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
14312    all values variable, and none identical.  */
14313 
14314 static void
14315 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
14316 				 rtx target, rtx vals)
14317 {
14318   rtx ops[64], op0, op1, op2, op3, op4, op5;
14319   machine_mode half_mode = VOIDmode;
14320   machine_mode quarter_mode = VOIDmode;
14321   int n, i;
14322 
14323   switch (mode)
14324     {
14325     case E_V2SFmode:
14326     case E_V2SImode:
14327       if (!mmx_ok && !TARGET_SSE)
14328 	break;
14329       /* FALLTHRU */
14330 
14331     case E_V16SImode:
14332     case E_V16SFmode:
14333     case E_V8DFmode:
14334     case E_V8DImode:
14335     case E_V8SFmode:
14336     case E_V8SImode:
14337     case E_V4DFmode:
14338     case E_V4DImode:
14339     case E_V4SFmode:
14340     case E_V4SImode:
14341     case E_V2DFmode:
14342     case E_V2DImode:
14343       n = GET_MODE_NUNITS (mode);
14344       for (i = 0; i < n; i++)
14345 	ops[i] = XVECEXP (vals, 0, i);
14346       ix86_expand_vector_init_concat (mode, target, ops, n);
14347       return;
14348 
14349     case E_V2TImode:
14350       for (i = 0; i < 2; i++)
14351 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14352       op0 = gen_reg_rtx (V4DImode);
14353       ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
14354       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14355       return;
14356 
14357     case E_V4TImode:
14358       for (i = 0; i < 4; i++)
14359 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14360       ops[4] = gen_reg_rtx (V4DImode);
14361       ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
14362       ops[5] = gen_reg_rtx (V4DImode);
14363       ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14364       op0 = gen_reg_rtx (V8DImode);
14365       ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14366       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14367       return;
14368 
14369     case E_V32QImode:
14370       half_mode = V16QImode;
14371       goto half;
14372 
14373     case E_V16HImode:
14374       half_mode = V8HImode;
14375       goto half;
14376 
14377 half:
14378       n = GET_MODE_NUNITS (mode);
14379       for (i = 0; i < n; i++)
14380 	ops[i] = XVECEXP (vals, 0, i);
14381       op0 = gen_reg_rtx (half_mode);
14382       op1 = gen_reg_rtx (half_mode);
14383       ix86_expand_vector_init_interleave (half_mode, op0, ops,
14384 					  n >> 2);
14385       ix86_expand_vector_init_interleave (half_mode, op1,
14386 					  &ops [n >> 1], n >> 2);
14387       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14388       return;
14389 
14390     case E_V64QImode:
14391       quarter_mode = V16QImode;
14392       half_mode = V32QImode;
14393       goto quarter;
14394 
14395     case E_V32HImode:
14396       quarter_mode = V8HImode;
14397       half_mode = V16HImode;
14398       goto quarter;
14399 
14400 quarter:
14401       n = GET_MODE_NUNITS (mode);
14402       for (i = 0; i < n; i++)
14403 	ops[i] = XVECEXP (vals, 0, i);
14404       op0 = gen_reg_rtx (quarter_mode);
14405       op1 = gen_reg_rtx (quarter_mode);
14406       op2 = gen_reg_rtx (quarter_mode);
14407       op3 = gen_reg_rtx (quarter_mode);
14408       op4 = gen_reg_rtx (half_mode);
14409       op5 = gen_reg_rtx (half_mode);
14410       ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14411 					  n >> 3);
14412       ix86_expand_vector_init_interleave (quarter_mode, op1,
14413 					  &ops [n >> 2], n >> 3);
14414       ix86_expand_vector_init_interleave (quarter_mode, op2,
14415 					  &ops [n >> 1], n >> 3);
14416       ix86_expand_vector_init_interleave (quarter_mode, op3,
14417 					  &ops [(n >> 1) | (n >> 2)], n >> 3);
14418       emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14419       emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14420       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14421       return;
14422 
14423     case E_V16QImode:
14424       if (!TARGET_SSE4_1)
14425 	break;
14426       /* FALLTHRU */
14427 
14428     case E_V8HImode:
14429       if (!TARGET_SSE2)
14430 	break;
14431 
14432       /* Don't use ix86_expand_vector_init_interleave if we can't
14433 	 move from GPR to SSE register directly.  */
14434       if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14435 	break;
14436 
14437       n = GET_MODE_NUNITS (mode);
14438       for (i = 0; i < n; i++)
14439 	ops[i] = XVECEXP (vals, 0, i);
14440       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14441       return;
14442 
14443     case E_V4HImode:
14444     case E_V8QImode:
14445       break;
14446 
14447     default:
14448       gcc_unreachable ();
14449     }
14450 
14451     {
14452       int i, j, n_elts, n_words, n_elt_per_word;
14453       machine_mode inner_mode;
14454       rtx words[4], shift;
14455 
14456       inner_mode = GET_MODE_INNER (mode);
14457       n_elts = GET_MODE_NUNITS (mode);
14458       n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14459       n_elt_per_word = n_elts / n_words;
14460       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14461 
14462       for (i = 0; i < n_words; ++i)
14463 	{
14464 	  rtx word = NULL_RTX;
14465 
14466 	  for (j = 0; j < n_elt_per_word; ++j)
14467 	    {
14468 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14469 	      elt = convert_modes (word_mode, inner_mode, elt, true);
14470 
14471 	      if (j == 0)
14472 		word = elt;
14473 	      else
14474 		{
14475 		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14476 					      word, 1, OPTAB_LIB_WIDEN);
14477 		  word = expand_simple_binop (word_mode, IOR, word, elt,
14478 					      word, 1, OPTAB_LIB_WIDEN);
14479 		}
14480 	    }
14481 
14482 	  words[i] = word;
14483 	}
14484 
14485       if (n_words == 1)
14486 	emit_move_insn (target, gen_lowpart (mode, words[0]));
14487       else if (n_words == 2)
14488 	{
14489 	  rtx tmp = gen_reg_rtx (mode);
14490 	  emit_clobber (tmp);
14491 	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14492 	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14493 	  emit_move_insn (target, tmp);
14494 	}
14495       else if (n_words == 4)
14496 	{
14497 	  rtx tmp = gen_reg_rtx (V4SImode);
14498 	  gcc_assert (word_mode == SImode);
14499 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14500 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14501 	  emit_move_insn (target, gen_lowpart (mode, tmp));
14502 	}
14503       else
14504 	gcc_unreachable ();
14505     }
14506 }
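
/* Example for the word-packing fallback above: initializing V4HImode on a
   32-bit target packs elements 1:0 into one SImode word and 3:2 into the
   other (each word is filled from its highest lane down, so lane 0 lands in
   the low bits, matching the little-endian lane layout), and the two words
   are then written into the word_mode low and high parts of the result.  */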
14507 
14508 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
14509    instructions unless MMX_OK is true.  */
14510 
14511 void
14512 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14513 {
14514   machine_mode mode = GET_MODE (target);
14515   machine_mode inner_mode = GET_MODE_INNER (mode);
14516   int n_elts = GET_MODE_NUNITS (mode);
14517   int n_var = 0, one_var = -1;
14518   bool all_same = true, all_const_zero = true;
14519   int i;
14520   rtx x;
14521 
14522   /* First, handle initialization from vector elts.  */
14523   if (n_elts != XVECLEN (vals, 0))
14524     {
14525       rtx subtarget = target;
14526       x = XVECEXP (vals, 0, 0);
14527       gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14528       if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14529 	{
14530 	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14531 	  if (inner_mode == QImode
14532 	      || inner_mode == HImode
14533 	      || inner_mode == TImode)
14534 	    {
14535 	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14536 	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
14537 	      n_bits /= GET_MODE_SIZE (elt_mode);
14538 	      mode = mode_for_vector (elt_mode, n_bits).require ();
14539 	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
14540 	      ops[0] = gen_lowpart (inner_mode, ops[0]);
14541 	      ops[1] = gen_lowpart (inner_mode, ops[1]);
14542 	      subtarget = gen_reg_rtx (mode);
14543 	    }
14544 	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14545 	  if (subtarget != target)
14546 	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14547 	  return;
14548 	}
14549       gcc_unreachable ();
14550     }
14551 
14552   for (i = 0; i < n_elts; ++i)
14553     {
14554       x = XVECEXP (vals, 0, i);
14555       if (!(CONST_SCALAR_INT_P (x)
14556 	    || CONST_DOUBLE_P (x)
14557 	    || CONST_FIXED_P (x)))
14558 	n_var++, one_var = i;
14559       else if (x != CONST0_RTX (inner_mode))
14560 	all_const_zero = false;
14561       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14562 	all_same = false;
14563     }
14564 
14565   /* Constants are best loaded from the constant pool.  */
14566   if (n_var == 0)
14567     {
14568       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14569       return;
14570     }
14571 
14572   /* If all values are identical, broadcast the value.  */
14573   if (all_same
14574       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14575 					    XVECEXP (vals, 0, 0)))
14576     return;
14577 
14578   /* Values where only one field is non-constant are best loaded from
14579      the pool and overwritten via move later.  */
14580   if (n_var == 1)
14581     {
14582       if (all_const_zero
14583 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14584 						  XVECEXP (vals, 0, one_var),
14585 						  one_var))
14586 	return;
14587 
14588       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14589 	return;
14590     }
14591 
14592   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14593 }
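
/* Dispatch examples for the routine above (illustrative, assuming the
   chosen helper succeeds): with a variable x, (V4SI){x, x, x, x} goes to
   ix86_expand_vector_init_duplicate, (V4SI){x, 0, 0, 0} to
   ix86_expand_vector_init_one_nonzero, (V4SI){1, 2, 3, x} to
   ix86_expand_vector_init_one_var, and a fully variable vector falls
   through to ix86_expand_vector_init_general.  */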
14594 
14595 /* Implemented as
14596    V setg (V v, int idx, T val)
14597    {
14598      V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
14599      V valv = (V){val, val, val, val, val, val, val, val};
14600      V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
14601      v = (v & ~mask) | (valv & mask);
14602      return v;
14603    }.  */
14604 void
14605 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
14606 {
14607   rtx vec[64];
14608   machine_mode mode = GET_MODE (target);
14609   machine_mode cmp_mode = mode;
14610   int n_elts = GET_MODE_NUNITS (mode);
14611   rtx valv, idxv, constv, idx_tmp;
14612   bool ok = false;
14613 
14614   /* 512-bit vector byte/word broadcast and comparison are only available
14615      under TARGET_AVX512BW; without it, split the 512-bit vector into two
14616      256-bit vectors.  */
14617   if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
14618     {
14619       gcc_assert (TARGET_AVX512F);
14620       rtx vhi, vlo, idx_hi;
14621       machine_mode half_mode;
14622       rtx (*extract_hi)(rtx, rtx);
14623       rtx (*extract_lo)(rtx, rtx);
14624 
14625       if (mode == V32HImode)
14626 	{
14627 	  half_mode = V16HImode;
14628 	  extract_hi = gen_vec_extract_hi_v32hi;
14629 	  extract_lo = gen_vec_extract_lo_v32hi;
14630 	}
14631       else
14632 	{
14633 	  half_mode = V32QImode;
14634 	  extract_hi = gen_vec_extract_hi_v64qi;
14635 	  extract_lo = gen_vec_extract_lo_v64qi;
14636 	}
14637 
14638       vhi = gen_reg_rtx (half_mode);
14639       vlo = gen_reg_rtx (half_mode);
14640       idx_hi = gen_reg_rtx (GET_MODE (idx));
14641       emit_insn (extract_hi (vhi, target));
14642       emit_insn (extract_lo (vlo, target));
14643       vec[0] = idx_hi;
14644       vec[1] = idx;
14645       vec[2] = GEN_INT (n_elts/2);
14646       ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
14647       ix86_expand_vector_set_var (vhi, val, idx_hi);
14648       ix86_expand_vector_set_var (vlo, val, idx);
14649       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
14650       return;
14651     }
14652 
14653   if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
14654     {
14655       switch (mode)
14656 	{
14657 	case E_V2DFmode:
14658 	  cmp_mode = V2DImode;
14659 	  break;
14660 	case E_V4DFmode:
14661 	  cmp_mode = V4DImode;
14662 	  break;
14663 	case E_V8DFmode:
14664 	  cmp_mode = V8DImode;
14665 	  break;
14666 	case E_V4SFmode:
14667 	  cmp_mode = V4SImode;
14668 	  break;
14669 	case E_V8SFmode:
14670 	  cmp_mode = V8SImode;
14671 	  break;
14672 	case E_V16SFmode:
14673 	  cmp_mode = V16SImode;
14674 	  break;
14675 	default:
14676 	  gcc_unreachable ();
14677 	}
14678     }
14679 
14680   for (int i = 0; i != n_elts; i++)
14681     vec[i] = GEN_INT (i);
14682   constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
14683   valv = gen_reg_rtx (mode);
14684   idxv = gen_reg_rtx (cmp_mode);
14685   idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
14686 
14687   ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
14688   gcc_assert (ok);
14689   ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
14690   gcc_assert (ok);
14691   vec[0] = target;
14692   vec[1] = valv;
14693   vec[2] = target;
14694   vec[3] = gen_rtx_EQ (mode, idxv, constv);
14695   vec[4] = idxv;
14696   vec[5] = constv;
14697   ok = ix86_expand_int_vcond (vec);
14698   gcc_assert (ok);
14699 }
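
/* Worked instance of the scheme described above: for V4SImode with
   idx == 2, idxv is {2, 2, 2, 2}, comparing it against constv {0, 1, 2, 3}
   yields mask {0, 0, -1, 0}, and the final vcond blends val into element 2
   while leaving the other elements of target unchanged.  */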
14700 
14701 void
14702 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14703 {
14704   machine_mode mode = GET_MODE (target);
14705   machine_mode inner_mode = GET_MODE_INNER (mode);
14706   machine_mode half_mode;
14707   bool use_vec_merge = false;
14708   rtx tmp;
14709   static rtx (*gen_extract[6][2]) (rtx, rtx)
14710     = {
14711 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14712 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14713 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14714 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14715 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14716 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14717       };
14718   static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14719     = {
14720 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14721 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14722 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14723 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14724 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14725 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14726       };
14727   int i, j, n;
14728   machine_mode mmode = VOIDmode;
14729   rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14730 
14731   switch (mode)
14732     {
14733     case E_V2SImode:
14734       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14735       if (use_vec_merge)
14736 	break;
14737       /* FALLTHRU */
14738 
14739     case E_V2SFmode:
14740       if (mmx_ok)
14741 	{
14742 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14743 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14744 	  if (elt == 0)
14745 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14746 	  else
14747 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14748 	  emit_insn (gen_rtx_SET (target, tmp));
14749 	  return;
14750 	}
14751       break;
14752 
14753     case E_V2DImode:
14754       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14755       if (use_vec_merge)
14756 	break;
14757 
14758       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14759       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14760       if (elt == 0)
14761 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14762       else
14763 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14764       emit_insn (gen_rtx_SET (target, tmp));
14765       return;
14766 
14767     case E_V2DFmode:
14768       /* NB: For ELT == 0, use standard scalar operation patterns which
14769 	 preserve the rest of the vector for combiner:
14770 
14771 	 (vec_merge:V2DF
14772 	   (vec_duplicate:V2DF (reg:DF))
14773 	   (reg:V2DF)
14774 	   (const_int 1))
14775        */
14776       if (elt == 0)
14777 	goto do_vec_merge;
14778 
14779       {
14780 	rtx op0, op1;
14781 
14782 	/* For the two element vectors, we implement a VEC_CONCAT with
14783 	   the extraction of the other element.  */
14784 
14785 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14786 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14787 
14788 	if (elt == 0)
14789 	  op0 = val, op1 = tmp;
14790 	else
14791 	  op0 = tmp, op1 = val;
14792 
14793 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14794 	emit_insn (gen_rtx_SET (target, tmp));
14795       }
14796       return;
14797 
14798     case E_V4SFmode:
14799       use_vec_merge = TARGET_SSE4_1;
14800       if (use_vec_merge)
14801 	break;
14802 
14803       switch (elt)
14804 	{
14805 	case 0:
14806 	  use_vec_merge = true;
14807 	  break;
14808 
14809 	case 1:
14810 	  /* tmp = target = A B C D */
14811 	  tmp = copy_to_reg (target);
14812 	  /* target = A A B B */
14813 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14814 	  /* target = X A B B */
14815 	  ix86_expand_vector_set (false, target, val, 0);
14816 	  /* target = A X C D  */
14817 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14818 					  const1_rtx, const0_rtx,
14819 					  GEN_INT (2+4), GEN_INT (3+4)));
14820 	  return;
14821 
14822 	case 2:
14823 	  /* tmp = target = A B C D */
14824 	  tmp = copy_to_reg (target);
14825 	  /* tmp = X B C D */
14826 	  ix86_expand_vector_set (false, tmp, val, 0);
14827 	  /* target = A B X D */
14828 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14829 					  const0_rtx, const1_rtx,
14830 					  GEN_INT (0+4), GEN_INT (3+4)));
14831 	  return;
14832 
14833 	case 3:
14834 	  /* tmp = target = A B C D */
14835 	  tmp = copy_to_reg (target);
14836 	  /* tmp = X B C D */
14837 	  ix86_expand_vector_set (false, tmp, val, 0);
14838 	  /* target = A B C X */
14839 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14840 					  const0_rtx, const1_rtx,
14841 					  GEN_INT (2+4), GEN_INT (0+4)));
14842 	  return;
14843 
14844 	default:
14845 	  gcc_unreachable ();
14846 	}
14847       break;
14848 
14849     case E_V4SImode:
14850       use_vec_merge = TARGET_SSE4_1;
14851       if (use_vec_merge)
14852 	break;
14853 
14854       /* Element 0 handled by vec_merge below.  */
14855       if (elt == 0)
14856 	{
14857 	  use_vec_merge = true;
14858 	  break;
14859 	}
14860 
14861       if (TARGET_SSE2)
14862 	{
14863 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
14864 	     store into element 0, then shuffle them back.  */
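	  /* For example, for ELT == 2 the control below becomes
	     { 2, 1, 0, 3 }, a transposition of elements 0 and 2.  A
	     transposition is its own inverse, which is why the identical
	     pshufd control works both before and after the insert.  */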
14865 
14866 	  rtx order[4];
14867 
14868 	  order[0] = GEN_INT (elt);
14869 	  order[1] = const1_rtx;
14870 	  order[2] = const2_rtx;
14871 	  order[3] = GEN_INT (3);
14872 	  order[elt] = const0_rtx;
14873 
14874 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14875 					order[1], order[2], order[3]));
14876 
14877 	  ix86_expand_vector_set (false, target, val, 0);
14878 
14879 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14880 					order[1], order[2], order[3]));
14881 	}
14882       else
14883 	{
14884 	  /* For SSE1, we have to reuse the V4SF code.  */
14885 	  rtx t = gen_reg_rtx (V4SFmode);
14886 	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
14887 	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14888 	  emit_move_insn (target, gen_lowpart (mode, t));
14889 	}
14890       return;
14891 
14892     case E_V8HImode:
14893       use_vec_merge = TARGET_SSE2;
14894       break;
14895     case E_V4HImode:
14896       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14897       break;
14898 
14899     case E_V16QImode:
14900       use_vec_merge = TARGET_SSE4_1;
14901       break;
14902 
14903     case E_V8QImode:
14904       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14905       break;
14906 
14907     case E_V32QImode:
14908       half_mode = V16QImode;
14909       j = 0;
14910       n = 16;
14911       goto half;
14912 
14913     case E_V16HImode:
14914       half_mode = V8HImode;
14915       j = 1;
14916       n = 8;
14917       goto half;
14918 
14919     case E_V8SImode:
14920       half_mode = V4SImode;
14921       j = 2;
14922       n = 4;
14923       goto half;
14924 
14925     case E_V4DImode:
14926       half_mode = V2DImode;
14927       j = 3;
14928       n = 2;
14929       goto half;
14930 
14931     case E_V8SFmode:
14932       half_mode = V4SFmode;
14933       j = 4;
14934       n = 4;
14935       goto half;
14936 
14937     case E_V4DFmode:
14938       half_mode = V2DFmode;
14939       j = 5;
14940       n = 2;
14941       goto half;
14942 
14943 half:
14944       /* Compute offset.  */
14945       i = elt / n;
14946       elt %= n;
14947 
14948       gcc_assert (i <= 1);
14949 
14950       /* Extract the half.  */
14951       tmp = gen_reg_rtx (half_mode);
14952       emit_insn (gen_extract[j][i] (tmp, target));
14953 
14954       /* Put val in tmp at elt.  */
14955       ix86_expand_vector_set (false, tmp, val, elt);
14956 
14957       /* Put it back.  */
14958       emit_insn (gen_insert[j][i] (target, target, tmp));
14959       return;
14960 
14961     case E_V8DFmode:
14962       if (TARGET_AVX512F)
14963 	{
14964 	  mmode = QImode;
14965 	  gen_blendm = gen_avx512f_blendmv8df;
14966 	}
14967       break;
14968 
14969     case E_V8DImode:
14970       if (TARGET_AVX512F)
14971 	{
14972 	  mmode = QImode;
14973 	  gen_blendm = gen_avx512f_blendmv8di;
14974 	}
14975       break;
14976 
14977     case E_V16SFmode:
14978       if (TARGET_AVX512F)
14979 	{
14980 	  mmode = HImode;
14981 	  gen_blendm = gen_avx512f_blendmv16sf;
14982 	}
14983       break;
14984 
14985     case E_V16SImode:
14986       if (TARGET_AVX512F)
14987 	{
14988 	  mmode = HImode;
14989 	  gen_blendm = gen_avx512f_blendmv16si;
14990 	}
14991       break;
14992 
14993     case E_V32HImode:
14994       if (TARGET_AVX512BW)
14995 	{
14996 	  mmode = SImode;
14997 	  gen_blendm = gen_avx512bw_blendmv32hi;
14998 	}
14999       else if (TARGET_AVX512F)
15000 	{
15001 	  half_mode = E_V8HImode;
15002 	  n = 8;
15003 	  goto quarter;
15004 	}
15005       break;
15006 
15007     case E_V64QImode:
15008       if (TARGET_AVX512BW)
15009 	{
15010 	  mmode = DImode;
15011 	  gen_blendm = gen_avx512bw_blendmv64qi;
15012 	}
15013       else if (TARGET_AVX512F)
15014 	{
15015 	  half_mode = E_V16QImode;
15016 	  n = 16;
15017 	  goto quarter;
15018 	}
15019       break;
15020 
15021 quarter:
15022       /* Compute offset.  */
15023       i = elt / n;
15024       elt %= n;
15025 
15026       gcc_assert (i <= 3);
15027 
15028       {
15029 	/* Extract the quarter.  */
15030 	tmp = gen_reg_rtx (V4SImode);
15031 	rtx tmp2 = gen_lowpart (V16SImode, target);
15032 	rtx mask = gen_reg_rtx (QImode);
15033 
15034 	emit_move_insn (mask, constm1_rtx);
15035 	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
15036 						   tmp, mask));
15037 
15038 	tmp2 = gen_reg_rtx (half_mode);
15039 	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
15040 	tmp = tmp2;
15041 
15042 	/* Put val in tmp at elt.  */
15043 	ix86_expand_vector_set (false, tmp, val, elt);
15044 
15045 	/* Put it back.  */
15046 	tmp2 = gen_reg_rtx (V16SImode);
15047 	rtx tmp3 = gen_lowpart (V16SImode, target);
15048 	mask = gen_reg_rtx (HImode);
15049 	emit_move_insn (mask, constm1_rtx);
15050 	tmp = gen_lowpart (V4SImode, tmp);
15051 	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
15052 						  tmp3, mask));
15053 	emit_move_insn (target, gen_lowpart (mode, tmp2));
15054       }
15055       return;
15056 
15057     default:
15058       break;
15059     }
15060 
15061   if (mmode != VOIDmode)
15062     {
15063       tmp = gen_reg_rtx (mode);
15064       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
15065       /* The avx512*_blendm<mode> expanders have different operand order
15066 	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
15067 	 elements where the mask is set and second input operand otherwise,
15068 	 in {sse,avx}*_*blend* the first input operand is used for elements
15069 	 where the mask is clear and second input operand otherwise.  */
15070       emit_insn (gen_blendm (target, target, tmp,
15071 			     force_reg (mmode,
15072 					gen_int_mode (HOST_WIDE_INT_1U << elt,
15073 						      mmode))));
15074     }
15075   else if (use_vec_merge)
15076     {
15077 do_vec_merge:
15078       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
15079       tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
15080 			       GEN_INT (HOST_WIDE_INT_1U << elt));
15081       emit_insn (gen_rtx_SET (target, tmp));
15082     }
15083   else
15084     {
15085       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15086 
15087       emit_move_insn (mem, target);
15088 
15089       tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
15090       emit_move_insn (tmp, val);
15091 
15092       emit_move_insn (target, mem);
15093     }
15094 }
15095 
15096 void
15097 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
15098 {
15099   machine_mode mode = GET_MODE (vec);
15100   machine_mode inner_mode = GET_MODE_INNER (mode);
15101   bool use_vec_extr = false;
15102   rtx tmp;
15103 
15104   switch (mode)
15105     {
15106     case E_V2SImode:
15107       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15108       if (use_vec_extr)
15109 	break;
15110       /* FALLTHRU */
15111 
15112     case E_V2SFmode:
15113       if (!mmx_ok)
15114 	break;
15115       /* FALLTHRU */
15116 
15117     case E_V2DFmode:
15118     case E_V2DImode:
15119     case E_V2TImode:
15120     case E_V4TImode:
15121       use_vec_extr = true;
15122       break;
15123 
15124     case E_V4SFmode:
15125       use_vec_extr = TARGET_SSE4_1;
15126       if (use_vec_extr)
15127 	break;
15128 
15129       switch (elt)
15130 	{
15131 	case 0:
15132 	  tmp = vec;
15133 	  break;
15134 
15135 	case 1:
15136 	case 3:
15137 	  tmp = gen_reg_rtx (mode);
15138 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
15139 				       GEN_INT (elt), GEN_INT (elt),
15140 				       GEN_INT (elt+4), GEN_INT (elt+4)));
15141 	  break;
15142 
15143 	case 2:
15144 	  tmp = gen_reg_rtx (mode);
15145 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
15146 	  break;
15147 
15148 	default:
15149 	  gcc_unreachable ();
15150 	}
15151       vec = tmp;
15152       use_vec_extr = true;
15153       elt = 0;
15154       break;
15155 
15156     case E_V4SImode:
15157       use_vec_extr = TARGET_SSE4_1;
15158       if (use_vec_extr)
15159 	break;
15160 
15161       if (TARGET_SSE2)
15162 	{
15163 	  switch (elt)
15164 	    {
15165 	    case 0:
15166 	      tmp = vec;
15167 	      break;
15168 
15169 	    case 1:
15170 	    case 3:
15171 	      tmp = gen_reg_rtx (mode);
15172 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
15173 					    GEN_INT (elt), GEN_INT (elt),
15174 					    GEN_INT (elt), GEN_INT (elt)));
15175 	      break;
15176 
15177 	    case 2:
15178 	      tmp = gen_reg_rtx (mode);
15179 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
15180 	      break;
15181 
15182 	    default:
15183 	      gcc_unreachable ();
15184 	    }
15185 	  vec = tmp;
15186 	  use_vec_extr = true;
15187 	  elt = 0;
15188 	}
15189       else
15190 	{
15191 	  /* For SSE1, we have to reuse the V4SF code.  */
15192 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
15193 				      gen_lowpart (V4SFmode, vec), elt);
15194 	  return;
15195 	}
15196       break;
15197 
15198     case E_V8HImode:
15199       use_vec_extr = TARGET_SSE2;
15200       break;
15201     case E_V4HImode:
15202       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
15203       break;
15204 
15205     case E_V16QImode:
15206       use_vec_extr = TARGET_SSE4_1;
15207       if (!use_vec_extr
15208 	  && TARGET_SSE2
15209 	  && elt == 0
15210 	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
15211 	{
15212 	  tmp = gen_reg_rtx (SImode);
15213 	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
15214 				      0);
15215 	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
15216 	  return;
15217 	}
15218       break;
15219 
15220     case E_V8SFmode:
15221       if (TARGET_AVX)
15222 	{
15223 	  tmp = gen_reg_rtx (V4SFmode);
15224 	  if (elt < 4)
15225 	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
15226 	  else
15227 	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
15228 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
15229 	  return;
15230 	}
15231       break;
15232 
15233     case E_V4DFmode:
15234       if (TARGET_AVX)
15235 	{
15236 	  tmp = gen_reg_rtx (V2DFmode);
15237 	  if (elt < 2)
15238 	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
15239 	  else
15240 	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
15241 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
15242 	  return;
15243 	}
15244       break;
15245 
15246     case E_V32QImode:
15247       if (TARGET_AVX)
15248 	{
15249 	  tmp = gen_reg_rtx (V16QImode);
15250 	  if (elt < 16)
15251 	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
15252 	  else
15253 	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
15254 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
15255 	  return;
15256 	}
15257       break;
15258 
15259     case E_V16HImode:
15260       if (TARGET_AVX)
15261 	{
15262 	  tmp = gen_reg_rtx (V8HImode);
15263 	  if (elt < 8)
15264 	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
15265 	  else
15266 	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
15267 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
15268 	  return;
15269 	}
15270       break;
15271 
15272     case E_V8SImode:
15273       if (TARGET_AVX)
15274 	{
15275 	  tmp = gen_reg_rtx (V4SImode);
15276 	  if (elt < 4)
15277 	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
15278 	  else
15279 	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
15280 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
15281 	  return;
15282 	}
15283       break;
15284 
15285     case E_V4DImode:
15286       if (TARGET_AVX)
15287 	{
15288 	  tmp = gen_reg_rtx (V2DImode);
15289 	  if (elt < 2)
15290 	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
15291 	  else
15292 	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
15293 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
15294 	  return;
15295 	}
15296       break;
15297 
15298     case E_V32HImode:
15299       if (TARGET_AVX512BW)
15300 	{
15301 	  tmp = gen_reg_rtx (V16HImode);
15302 	  if (elt < 16)
15303 	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
15304 	  else
15305 	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
15306 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
15307 	  return;
15308 	}
15309       break;
15310 
15311     case E_V64QImode:
15312       if (TARGET_AVX512BW)
15313 	{
15314 	  tmp = gen_reg_rtx (V32QImode);
15315 	  if (elt < 32)
15316 	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
15317 	  else
15318 	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
15319 	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
15320 	  return;
15321 	}
15322       break;
15323 
15324     case E_V16SFmode:
15325       tmp = gen_reg_rtx (V8SFmode);
15326       if (elt < 8)
15327 	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
15328       else
15329 	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
15330       ix86_expand_vector_extract (false, target, tmp, elt & 7);
15331       return;
15332 
15333     case E_V8DFmode:
15334       tmp = gen_reg_rtx (V4DFmode);
15335       if (elt < 4)
15336 	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
15337       else
15338 	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
15339       ix86_expand_vector_extract (false, target, tmp, elt & 3);
15340       return;
15341 
15342     case E_V16SImode:
15343       tmp = gen_reg_rtx (V8SImode);
15344       if (elt < 8)
15345 	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
15346       else
15347 	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
15348       ix86_expand_vector_extract (false, target, tmp, elt & 7);
15349       return;
15350 
15351     case E_V8DImode:
15352       tmp = gen_reg_rtx (V4DImode);
15353       if (elt < 4)
15354 	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
15355       else
15356 	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
15357       ix86_expand_vector_extract (false, target, tmp, elt & 3);
15358       return;
15359 
15360     case E_V8QImode:
15361       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15362       /* ??? Could extract the appropriate HImode element and shift.  */
15363       break;
15364 
15365     default:
15366       break;
15367     }
15368 
15369   if (use_vec_extr)
15370     {
15371       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
15372       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
15373 
15374       /* Let the rtl optimizers know about the zero extension performed.  */
15375       if (inner_mode == QImode || inner_mode == HImode)
15376 	{
15377 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
15378 	  target = gen_lowpart (SImode, target);
15379 	}
15380 
15381       emit_insn (gen_rtx_SET (target, tmp));
15382     }
15383   else
15384     {
15385       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15386 
15387       emit_move_insn (mem, vec);
15388 
15389       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
15390       emit_move_insn (target, tmp);
15391     }
15392 }
15393 
15394 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15395    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15396    The upper bits of DEST are undefined, though they shouldn't cause
15397    exceptions (some bits from src or all zeros are ok).  */
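/* For instance, for V4SImode and I == 128 the code below emits a logical
   right shift of the whole 128-bit value by 64 bits, so elements 2 and 3
   of SRC end up in elements 0 and 1 of DEST and the upper half is zeroed,
   which satisfies the contract above.  */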
15398 
15399 static void
15400 emit_reduc_half (rtx dest, rtx src, int i)
15401 {
15402   rtx tem, d = dest;
15403   switch (GET_MODE (src))
15404     {
15405     case E_V4SFmode:
15406       if (i == 128)
15407 	tem = gen_sse_movhlps (dest, src, src);
15408       else
15409 	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
15410 				   GEN_INT (1 + 4), GEN_INT (1 + 4));
15411       break;
15412     case E_V2DFmode:
15413       tem = gen_vec_interleave_highv2df (dest, src, src);
15414       break;
15415     case E_V16QImode:
15416     case E_V8HImode:
15417     case E_V4SImode:
15418     case E_V2DImode:
15419       d = gen_reg_rtx (V1TImode);
15420       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
15421 				GEN_INT (i / 2));
15422       break;
15423     case E_V8SFmode:
15424       if (i == 256)
15425 	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
15426       else
15427 	tem = gen_avx_shufps256 (dest, src, src,
15428 				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
15429       break;
15430     case E_V4DFmode:
15431       if (i == 256)
15432 	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
15433       else
15434 	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
15435       break;
15436     case E_V32QImode:
15437     case E_V16HImode:
15438     case E_V8SImode:
15439     case E_V4DImode:
15440       if (i == 256)
15441 	{
15442 	  if (GET_MODE (dest) != V4DImode)
15443 	    d = gen_reg_rtx (V4DImode);
15444 	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
15445 				   gen_lowpart (V4DImode, src),
15446 				   const1_rtx);
15447 	}
15448       else
15449 	{
15450 	  d = gen_reg_rtx (V2TImode);
15451 	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
15452 				    GEN_INT (i / 2));
15453 	}
15454       break;
15455     case E_V64QImode:
15456     case E_V32HImode:
15457       if (i < 64)
15458 	{
15459 	  d = gen_reg_rtx (V4TImode);
15460 	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
15461 					GEN_INT (i / 2));
15462 	  break;
15463 	}
15464       /* FALLTHRU */
15465     case E_V16SImode:
15466     case E_V16SFmode:
15467     case E_V8DImode:
15468     case E_V8DFmode:
15469       if (i > 128)
15470 	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
15471 					gen_lowpart (V16SImode, src),
15472 					gen_lowpart (V16SImode, src),
15473 					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
15474 					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
15475 					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
15476 					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
15477 					GEN_INT (0xC), GEN_INT (0xD),
15478 					GEN_INT (0xE), GEN_INT (0xF),
15479 					GEN_INT (0x10), GEN_INT (0x11),
15480 					GEN_INT (0x12), GEN_INT (0x13),
15481 					GEN_INT (0x14), GEN_INT (0x15),
15482 					GEN_INT (0x16), GEN_INT (0x17));
15483       else
15484 	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
15485 				    gen_lowpart (V16SImode, src),
15486 				    GEN_INT (i == 128 ? 0x2 : 0x1),
15487 				    GEN_INT (0x3),
15488 				    GEN_INT (0x3),
15489 				    GEN_INT (0x3),
15490 				    GEN_INT (i == 128 ? 0x6 : 0x5),
15491 				    GEN_INT (0x7),
15492 				    GEN_INT (0x7),
15493 				    GEN_INT (0x7),
15494 				    GEN_INT (i == 128 ? 0xA : 0x9),
15495 				    GEN_INT (0xB),
15496 				    GEN_INT (0xB),
15497 				    GEN_INT (0xB),
15498 				    GEN_INT (i == 128 ? 0xE : 0xD),
15499 				    GEN_INT (0xF),
15500 				    GEN_INT (0xF),
15501 				    GEN_INT (0xF));
15502       break;
15503     default:
15504       gcc_unreachable ();
15505     }
15506   emit_insn (tem);
15507   if (d != dest)
15508     emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15509 }
15510 
15511 /* Expand a vector reduction.  FN is the binary pattern to reduce;
15512    DEST is the destination; IN is the input vector.  */
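/* As an illustration, for a V8SFmode reduction the loop below runs with
   I = 256, 128 and 64; each step combines VEC with a copy whose upper
   half (of the I bits still being reduced) was brought down by
   emit_reduc_half, so after the last step element 0 of DEST holds the
   reduction of all eight elements.  */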
15513 
15514 void
15515 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15516 {
15517   rtx half, dst, vec = in;
15518   machine_mode mode = GET_MODE (in);
15519   int i;
15520 
15521   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
15522   if (TARGET_SSE4_1
15523       && mode == V8HImode
15524       && fn == gen_uminv8hi3)
15525     {
15526       emit_insn (gen_sse4_1_phminposuw (dest, in));
15527       return;
15528     }
15529 
15530   for (i = GET_MODE_BITSIZE (mode);
15531        i > GET_MODE_UNIT_BITSIZE (mode);
15532        i >>= 1)
15533     {
15534       half = gen_reg_rtx (mode);
15535       emit_reduc_half (half, vec, i);
15536       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15537 	dst = dest;
15538       else
15539 	dst = gen_reg_rtx (mode);
15540       emit_insn (fn (dst, half, vec));
15541       vec = dst;
15542     }
15543 }
15544 
15545 /* Output code to perform a conditional jump to LABEL, if C2 flag in
15546    FP status register is set.  */
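/* Implementation note: fnstsw leaves the x87 condition bits in the high
   byte of the status word (C0, C1, C2 and C3 in bits 0, 1, 2 and 6 of
   that byte), so the TEST against 0x04 below isolates C2.  On the sahf
   path that byte is copied into EFLAGS, where C2 lands in PF, and the
   UNORDERED test on the flags register is effectively a jp.  */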
15547 
15548 void
15549 ix86_emit_fp_unordered_jump (rtx label)
15550 {
15551   rtx reg = gen_reg_rtx (HImode);
15552   rtx_insn *insn;
15553   rtx temp;
15554 
15555   emit_insn (gen_x86_fnstsw_1 (reg));
15556 
15557   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15558     {
15559       emit_insn (gen_x86_sahf_1 (reg));
15560 
15561       temp = gen_rtx_REG (CCmode, FLAGS_REG);
15562       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15563     }
15564   else
15565     {
15566       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15567 
15568       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15569       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15570     }
15571 
15572   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15573 			      gen_rtx_LABEL_REF (VOIDmode, label),
15574 			      pc_rtx);
15575   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15576   predict_jump (REG_BR_PROB_BASE * 10 / 100);
15577   JUMP_LABEL (insn) = label;
15578 }
15579 
15580 /* Output code to perform a sinh XFmode calculation.  */
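/* The identity used below: with u = expm1 (|x|),
   exp (|x|) - exp (-|x|) = u / (u + 1) + u, so
   sinh (x) = copysign (0.5 * (u / (u + 1) + u), x).
   Going through expm1 avoids the cancellation that the direct
   0.5 * (exp (x) - exp (-x)) form would suffer for small |x|.  */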
15581 
15582 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15583 {
15584   rtx e1 = gen_reg_rtx (XFmode);
15585   rtx e2 = gen_reg_rtx (XFmode);
15586   rtx scratch = gen_reg_rtx (HImode);
15587   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15588   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15589   rtx cst1, tmp;
15590   rtx_code_label *jump_label = gen_label_rtx ();
15591   rtx_insn *insn;
15592 
15593   /* scratch = fxam (op1) */
15594   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15595 
15596   /* e1 = expm1 (|op1|) */
15597   emit_insn (gen_absxf2 (e2, op1));
15598   emit_insn (gen_expm1xf2 (e1, e2));
15599 
15600   /* e2 = e1 / (e1 + 1.0) + e1 */
15601   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15602   emit_insn (gen_addxf3 (e2, e1, cst1));
15603   emit_insn (gen_divxf3 (e2, e1, e2));
15604   emit_insn (gen_addxf3 (e2, e2, e1));
15605 
15606   /* flags = signbit (op1) */
15607   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15608 
15609   /* if (flags) then e2 = -e2 */
15610   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15611 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15612 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15613 			      pc_rtx);
15614   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15615   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15616   JUMP_LABEL (insn) = jump_label;
15617 
15618   emit_insn (gen_negxf2 (e2, e2));
15619 
15620   emit_label (jump_label);
15621   LABEL_NUSES (jump_label) = 1;
15622 
15623   /* op0 = 0.5 * e2 */
15624   half = force_reg (XFmode, half);
15625   emit_insn (gen_mulxf3 (op0, e2, half));
15626 }
15627 
15628 /* Output code to perform a cosh XFmode calculation.  */
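/* This is the textbook identity cosh (x) = 0.5 * (exp (x) + 1 / exp (x)),
   evaluated with a single exponential.  */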
15629 
15630 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15631 {
15632   rtx e1 = gen_reg_rtx (XFmode);
15633   rtx e2 = gen_reg_rtx (XFmode);
15634   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15635   rtx cst1;
15636 
15637   /* e1 = exp (op1) */
15638   emit_insn (gen_expxf2 (e1, op1));
15639 
15640   /* e2 = e1 + 1.0 / e1 */
15641   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15642   emit_insn (gen_divxf3 (e2, cst1, e1));
15643   emit_insn (gen_addxf3 (e2, e1, e2));
15644 
15645   /* op0 = 0.5 * e2 */
15646   half = force_reg (XFmode, half);
15647   emit_insn (gen_mulxf3 (op0, e2, half));
15648 }
15649 
15650 /* Output code to perform a tanh XFmode calculation.  */
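/* The identity used below: with u = expm1 (-2 * |x|),
   tanh (|x|) = -u / (u + 2), and tanh is odd.  The code computes
   u / (u + 2) and negates it unless op1 is negative, which gives
   tanh (op1) in both cases.  */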
15651 
15652 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15653 {
15654   rtx e1 = gen_reg_rtx (XFmode);
15655   rtx e2 = gen_reg_rtx (XFmode);
15656   rtx scratch = gen_reg_rtx (HImode);
15657   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15658   rtx cst2, tmp;
15659   rtx_code_label *jump_label = gen_label_rtx ();
15660   rtx_insn *insn;
15661 
15662   /* scratch = fxam (op1) */
15663   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15664 
15665   /* e1 = expm1 (-|2 * op1|) */
15666   emit_insn (gen_addxf3 (e2, op1, op1));
15667   emit_insn (gen_absxf2 (e2, e2));
15668   emit_insn (gen_negxf2 (e2, e2));
15669   emit_insn (gen_expm1xf2 (e1, e2));
15670 
15671   /* e2 = e1 / (e1 + 2.0) */
15672   cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15673   emit_insn (gen_addxf3 (e2, e1, cst2));
15674   emit_insn (gen_divxf3 (e2, e1, e2));
15675 
15676   /* flags = signbit (op1) */
15677   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15678 
15679   /* if (!flags) then e2 = -e2 */
15680   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15681 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
15682 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15683 			      pc_rtx);
15684   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15685   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15686   JUMP_LABEL (insn) = jump_label;
15687 
15688   emit_insn (gen_negxf2 (e2, e2));
15689 
15690   emit_label (jump_label);
15691   LABEL_NUSES (jump_label) = 1;
15692 
15693   emit_move_insn (op0, e2);
15694 }
15695 
15696 /* Output code to perform an asinh XFmode calculation.  */
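/* The identity used below: since sqrt (x*x + 1) - 1
   = x*x / (sqrt (x*x + 1) + 1), we have
   asinh (|x|) = log1p (|x| + x*x / (sqrt (x*x + 1) + 1)),
   and asinh (x) = copysign (asinh (|x|), x).  The log1p form stays
   accurate for arguments near zero.  */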
15697 
15698 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15699 {
15700   rtx e1 = gen_reg_rtx (XFmode);
15701   rtx e2 = gen_reg_rtx (XFmode);
15702   rtx scratch = gen_reg_rtx (HImode);
15703   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15704   rtx cst1, tmp;
15705   rtx_code_label *jump_label = gen_label_rtx ();
15706   rtx_insn *insn;
15707 
15708   /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15709   emit_insn (gen_mulxf3 (e1, op1, op1));
15710   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15711   emit_insn (gen_addxf3 (e2, e1, cst1));
15712   emit_insn (gen_sqrtxf2 (e2, e2));
15713   emit_insn (gen_addxf3 (e2, e2, cst1));
15714 
15715   /* e1 = e1 / e2 */
15716   emit_insn (gen_divxf3 (e1, e1, e2));
15717 
15718   /* scratch = fxam (op1) */
15719   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15720 
15721   /* e1 = e1 + |op1| */
15722   emit_insn (gen_absxf2 (e2, op1));
15723   emit_insn (gen_addxf3 (e1, e1, e2));
15724 
15725   /* e2 = log1p (e1) */
15726   ix86_emit_i387_log1p (e2, e1);
15727 
15728   /* flags = signbit (op1) */
15729   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15730 
15731   /* if (flags) then e2 = -e2 */
15732   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15733 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15734 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15735 			      pc_rtx);
15736   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15737   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15738   JUMP_LABEL (insn) = jump_label;
15739 
15740   emit_insn (gen_negxf2 (e2, e2));
15741 
15742   emit_label (jump_label);
15743   LABEL_NUSES (jump_label) = 1;
15744 
15745   emit_move_insn (op0, e2);
15746 }
15747 
15748 /* Output code to perform an acosh XFmode calculation.  */
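/* This uses acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1)), which for
   x >= 1 is the usual log (x + sqrt (x*x - 1)).  */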
15749 
15750 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15751 {
15752   rtx e1 = gen_reg_rtx (XFmode);
15753   rtx e2 = gen_reg_rtx (XFmode);
15754   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15755 
15756   /* e2 = sqrt (op1 + 1.0) */
15757   emit_insn (gen_addxf3 (e2, op1, cst1));
15758   emit_insn (gen_sqrtxf2 (e2, e2));
15759 
15760   /* e1 = sqrt (op1 - 1.0) */
15761   emit_insn (gen_subxf3 (e1, op1, cst1));
15762   emit_insn (gen_sqrtxf2 (e1, e1));
15763 
15764   /* e1 = e1 * e2 */
15765   emit_insn (gen_mulxf3 (e1, e1, e2));
15766 
15767   /* e1 = e1 + op1 */
15768   emit_insn (gen_addxf3 (e1, e1, op1));
15769 
15770   /* op0 = log (e1) */
15771   emit_insn (gen_logxf2 (op0, e1));
15772 }
15773 
15774 /* Output code to perform an atanh XFmode calculation.  */
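/* The identity used below: with t = |x| and u = -2*t / (t + 1),
   log1p (u) = log ((1 - t) / (1 + t)) = -2 * atanh (t), hence
   atanh (x) = copysign (-0.5 * log1p (-2*|x| / (|x| + 1)), x); the sign
   is applied by the conditional negation after the log1p below.  */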
15775 
15776 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15777 {
15778   rtx e1 = gen_reg_rtx (XFmode);
15779   rtx e2 = gen_reg_rtx (XFmode);
15780   rtx scratch = gen_reg_rtx (HImode);
15781   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15782   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15783   rtx cst1, tmp;
15784   rtx_code_label *jump_label = gen_label_rtx ();
15785   rtx_insn *insn;
15786 
15787   /* scratch = fxam (op1) */
15788   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15789 
15790   /* e2 = |op1| */
15791   emit_insn (gen_absxf2 (e2, op1));
15792 
15793   /* e1 = -(e2 + e2) / (e2 + 1.0) */
15794   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15795   emit_insn (gen_addxf3 (e1, e2, cst1));
15796   emit_insn (gen_addxf3 (e2, e2, e2));
15797   emit_insn (gen_negxf2 (e2, e2));
15798   emit_insn (gen_divxf3 (e1, e2, e1));
15799 
15800   /* e2 = log1p (e1) */
15801   ix86_emit_i387_log1p (e2, e1);
15802 
15803   /* flags = signbit (op1) */
15804   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15805 
15806   /* if (!flags) then e2 = -e2 */
15807   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15808 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
15809 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15810 			      pc_rtx);
15811   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15812   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15813   JUMP_LABEL (insn) = jump_label;
15814 
15815   emit_insn (gen_negxf2 (e2, e2));
15816 
15817   emit_label (jump_label);
15818   LABEL_NUSES (jump_label) = 1;
15819 
15820   /* op0 = 0.5 * e2 */
15821   half = force_reg (XFmode, half);
15822   emit_insn (gen_mulxf3 (op0, e2, half));
15823 }
15824 
15825 /* Output code to perform a log1p XFmode calculation.  */
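/* fyl2xp1 computes y * log2 (x + 1) but is only specified for
   |x| < 1 - sqrt (2) / 2 (about 0.29289, the constant tested below);
   larger arguments fall back to fyl2x on 1.0 + x.  In both cases y is
   the fldln2 constant ln (2), which turns the base-2 logarithm into a
   natural logarithm.  */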
15826 
15827 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15828 {
15829   rtx_code_label *label1 = gen_label_rtx ();
15830   rtx_code_label *label2 = gen_label_rtx ();
15831 
15832   rtx tmp = gen_reg_rtx (XFmode);
15833   rtx res = gen_reg_rtx (XFmode);
15834   rtx cst, cstln2, cst1;
15835   rtx_insn *insn;
15836 
15837   cst = const_double_from_real_value
15838     (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15839   cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15840 
15841   emit_insn (gen_absxf2 (tmp, op1));
15842 
15843   cst = force_reg (XFmode, cst);
15844   ix86_expand_branch (GE, tmp, cst, label1);
15845   predict_jump (REG_BR_PROB_BASE * 10 / 100);
15846   insn = get_last_insn ();
15847   JUMP_LABEL (insn) = label1;
15848 
15849   emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15850   emit_jump (label2);
15851 
15852   emit_label (label1);
15853   LABEL_NUSES (label1) = 1;
15854 
15855   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15856   emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15857   emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15858 
15859   emit_label (label2);
15860   LABEL_NUSES (label2) = 1;
15861 
15862   emit_move_insn (op0, res);
15863 }
15864 
15865 /* Emit code for round calculation.  */
15866 void ix86_emit_i387_round (rtx op0, rtx op1)
15867 {
15868   machine_mode inmode = GET_MODE (op1);
15869   machine_mode outmode = GET_MODE (op0);
15870   rtx e1 = gen_reg_rtx (XFmode);
15871   rtx e2 = gen_reg_rtx (XFmode);
15872   rtx scratch = gen_reg_rtx (HImode);
15873   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15874   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15875   rtx res = gen_reg_rtx (outmode);
15876   rtx_code_label *jump_label = gen_label_rtx ();
15877   rtx (*floor_insn) (rtx, rtx);
15878   rtx (*neg_insn) (rtx, rtx);
15879   rtx_insn *insn;
15880   rtx tmp;
15881 
15882   switch (inmode)
15883     {
15884     case E_SFmode:
15885     case E_DFmode:
15886       tmp = gen_reg_rtx (XFmode);
15887 
15888       emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15889       op1 = tmp;
15890       break;
15891     case E_XFmode:
15892       break;
15893     default:
15894       gcc_unreachable ();
15895     }
15896 
15897   switch (outmode)
15898     {
15899     case E_SFmode:
15900       floor_insn = gen_frndintxf2_floor;
15901       neg_insn = gen_negsf2;
15902       break;
15903     case E_DFmode:
15904       floor_insn = gen_frndintxf2_floor;
15905       neg_insn = gen_negdf2;
15906       break;
15907     case E_XFmode:
15908       floor_insn = gen_frndintxf2_floor;
15909       neg_insn = gen_negxf2;
15910       break;
15911     case E_HImode:
15912       floor_insn = gen_lfloorxfhi2;
15913       neg_insn = gen_neghi2;
15914       break;
15915     case E_SImode:
15916       floor_insn = gen_lfloorxfsi2;
15917       neg_insn = gen_negsi2;
15918       break;
15919     case E_DImode:
15920       floor_insn = gen_lfloorxfdi2;
15921       neg_insn = gen_negdi2;
15922       break;
15923     default:
15924       gcc_unreachable ();
15925     }
15926 
15927   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15928 
15929   /* scratch = fxam(op1) */
15930   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15931 
15932   /* e1 = fabs(op1) */
15933   emit_insn (gen_absxf2 (e1, op1));
15934 
15935   /* e2 = e1 + 0.5 */
15936   half = force_reg (XFmode, half);
15937   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15938 
15939   /* res = floor(e2) */
15940   switch (outmode)
15941     {
15942     case E_SFmode:
15943     case E_DFmode:
15944       {
15945 	tmp = gen_reg_rtx (XFmode);
15946 
15947 	emit_insn (floor_insn (tmp, e2));
15948 	emit_insn (gen_rtx_SET (res,
15949 				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15950 						UNSPEC_TRUNC_NOOP)));
15951       }
15952       break;
15953     default:
15954       emit_insn (floor_insn (res, e2));
15955     }
15956 
15957   /* flags = signbit(a) */
15958   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15959 
15960   /* if (flags) then res = -res */
15961   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15962 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15963 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15964 			      pc_rtx);
15965   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15966   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15967   JUMP_LABEL (insn) = jump_label;
15968 
15969   emit_insn (neg_insn (res, res));
15970 
15971   emit_label (jump_label);
15972   LABEL_NUSES (jump_label) = 1;
15973 
15974   emit_move_insn (op0, res);
15975 }
15976 
15977 /* Output code to perform a Newton-Raphson approximation of a single precision
15978    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
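/* One Newton-Raphson step refines a reciprocal estimate x0 of 1/b as
   x1 = x0 * (2 - b * x0), evaluated below as (x0 + x0) - (b * x0) * x0;
   the step roughly squares the relative error of the estimate.  With
   AVX512ER the RCP28 estimate is already accurate enough for single
   precision, so no refinement step is emitted.  */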
15979 
15980 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15981 {
15982   rtx x0, x1, e0, e1;
15983 
15984   x0 = gen_reg_rtx (mode);
15985   e0 = gen_reg_rtx (mode);
15986   e1 = gen_reg_rtx (mode);
15987   x1 = gen_reg_rtx (mode);
15988 
15989   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15990 
15991   b = force_reg (mode, b);
15992 
15993   /* x0 = rcp(b) estimate */
15994   if (mode == V16SFmode || mode == V8DFmode)
15995     {
15996       if (TARGET_AVX512ER)
15997 	{
15998 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15999 						      UNSPEC_RCP28)));
16000 	  /* res = a * x0 */
16001 	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
16002 	  return;
16003 	}
16004       else
16005 	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
16006 						    UNSPEC_RCP14)));
16007     }
16008   else
16009     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
16010 						UNSPEC_RCP)));
16011 
16012   /* e0 = x0 * b */
16013   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
16014 
16015   /* e0 = x0 * e0 */
16016   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
16017 
16018   /* e1 = x0 + x0 */
16019   emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
16020 
16021   /* x1 = e1 - e0 */
16022   emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
16023 
16024   /* res = a * x1 */
16025   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
16026 }
16027 
16028 /* Output code to perform a Newton-Raphson approximation of a
16029    single precision floating point [reciprocal] square root.  */
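/* A Newton-Raphson step for rsqrt from an estimate x0 of 1/sqrt(a) is
   x1 = 0.5 * x0 * (3 - a * x0 * x0), written below as
   -0.5 * x0 * (a * x0 * x0 - 3) so the inner term can use an FMA.
   Scaling the final multiply by a * x0 instead of x0 (see the two
   formulas further down) turns the refined rsqrt into sqrt, since
   sqrt (a) = a / sqrt (a).  */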
16030 
16031 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
16032 {
16033   rtx x0, e0, e1, e2, e3, mthree, mhalf;
16034   REAL_VALUE_TYPE r;
16035   int unspec;
16036 
16037   x0 = gen_reg_rtx (mode);
16038   e0 = gen_reg_rtx (mode);
16039   e1 = gen_reg_rtx (mode);
16040   e2 = gen_reg_rtx (mode);
16041   e3 = gen_reg_rtx (mode);
16042 
16043   if (TARGET_AVX512ER && mode == V16SFmode)
16044     {
16045       if (recip)
16046 	/* res = rsqrt28(a) estimate */
16047 	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16048 						     UNSPEC_RSQRT28)));
16049       else
16050 	{
16051 	  /* x0 = rsqrt28(a) estimate */
16052 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16053 						      UNSPEC_RSQRT28)));
16054 	  /* res = rcp28(x0) estimate */
16055 	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
16056 						       UNSPEC_RCP28)));
16057 	}
16058       return;
16059     }
16060 
16061   real_from_integer (&r, VOIDmode, -3, SIGNED);
16062   mthree = const_double_from_real_value (r, SFmode);
16063 
16064   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
16065   mhalf = const_double_from_real_value (r, SFmode);
16066   unspec = UNSPEC_RSQRT;
16067 
16068   if (VECTOR_MODE_P (mode))
16069     {
16070       mthree = ix86_build_const_vector (mode, true, mthree);
16071       mhalf = ix86_build_const_vector (mode, true, mhalf);
16072       /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
16073       if (GET_MODE_SIZE (mode) == 64)
16074 	unspec = UNSPEC_RSQRT14;
16075     }
16076 
16077   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16078      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
16079 
16080   a = force_reg (mode, a);
16081 
16082   /* x0 = rsqrt(a) estimate */
16083   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16084 					      unspec)));
16085 
16086   /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
16087   if (!recip)
16088     {
16089       rtx zero = force_reg (mode, CONST0_RTX(mode));
16090       rtx mask;
16091 
16092       /* Handle masked compare.  */
16093       if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
16094 	{
16095 	  mask = gen_reg_rtx (HImode);
16096 	  /* Imm value 0x4 corresponds to not-equal comparison.  */
16097 	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
16098 	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
16099 	}
16100       else
16101 	{
16102 	  mask = gen_reg_rtx (mode);
16103 	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
16104 	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
16105 	}
16106     }
16107 
16108   mthree = force_reg (mode, mthree);
16109 
16110   /* e0 = x0 * a */
16111   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
16112 
16113   unsigned vector_size = GET_MODE_SIZE (mode);
16114   if (TARGET_FMA
16115       || (TARGET_AVX512F && vector_size == 64)
16116       || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
16117     emit_insn (gen_rtx_SET (e2,
16118 			    gen_rtx_FMA (mode, e0, x0, mthree)));
16119   else
16120     {
16121       /* e1 = e0 * x0 */
16122       emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
16123 
16124       /* e2 = e1 - 3. */
16125       emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
16126     }
16127 
16128   mhalf = force_reg (mode, mhalf);
16129   if (recip)
16130     /* e3 = -.5 * x0 */
16131     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
16132   else
16133     /* e3 = -.5 * e0 */
16134     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
16135   /* ret = e2 * e3 */
16136   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
16137 }
16138 
16139 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
16140    mask for masking out the sign-bit is stored in *SMASK, if that is
16141    non-null.  */
16142 
16143 static rtx
16144 ix86_expand_sse_fabs (rtx op0, rtx *smask)
16145 {
16146   machine_mode vmode, mode = GET_MODE (op0);
16147   rtx xa, mask;
16148 
16149   xa = gen_reg_rtx (mode);
16150   if (mode == SFmode)
16151     vmode = V4SFmode;
16152   else if (mode == DFmode)
16153     vmode = V2DFmode;
16154   else
16155     vmode = mode;
16156   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
16157   if (!VECTOR_MODE_P (mode))
16158     {
16159       /* We need to generate a scalar mode mask in this case.  */
16160       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16161       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16162       mask = gen_reg_rtx (mode);
16163       emit_insn (gen_rtx_SET (mask, tmp));
16164     }
16165   emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
16166 
16167   if (smask)
16168     *smask = mask;
16169 
16170   return xa;
16171 }
16172 
16173 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
16174    swapping the operands if SWAP_OPERANDS is true.  The expanded
16175    code is a forward jump to a newly created label in case the
16176    comparison is true.  The generated label rtx is returned.  */
16177 static rtx_code_label *
16178 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
16179                                   bool swap_operands)
16180 {
16181   bool unordered_compare = ix86_unordered_fp_compare (code);
16182   rtx_code_label *label;
16183   rtx tmp, reg;
16184 
16185   if (swap_operands)
16186     std::swap (op0, op1);
16187 
16188   label = gen_label_rtx ();
16189   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
16190   if (unordered_compare)
16191     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
16192   reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
16193   emit_insn (gen_rtx_SET (reg, tmp));
16194   tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
16195   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
16196 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
16197   tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16198   JUMP_LABEL (tmp) = label;
16199 
16200   return label;
16201 }
16202 
16203 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
16204    using comparison code CODE.  Operands are swapped for the comparison if
16205    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
16206 static rtx
16207 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
16208 			      bool swap_operands)
16209 {
16210   rtx (*insn)(rtx, rtx, rtx, rtx);
16211   machine_mode mode = GET_MODE (op0);
16212   rtx mask = gen_reg_rtx (mode);
16213 
16214   if (swap_operands)
16215     std::swap (op0, op1);
16216 
16217   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
16218 
16219   emit_insn (insn (mask, op0, op1,
16220 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
16221   return mask;
16222 }
16223 
16224 /* Expand copysign from SIGN to the positive value ABS_VALUE
16225    storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
16226    the sign-bit.  */
16227 
16228 static void
16229 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
16230 {
16231   machine_mode mode = GET_MODE (sign);
16232   rtx sgn = gen_reg_rtx (mode);
16233   if (mask == NULL_RTX)
16234     {
16235       machine_mode vmode;
16236 
16237       if (mode == SFmode)
16238 	vmode = V4SFmode;
16239       else if (mode == DFmode)
16240 	vmode = V2DFmode;
16241       else
16242 	vmode = mode;
16243 
16244       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
16245       if (!VECTOR_MODE_P (mode))
16246 	{
16247 	  /* We need to generate a scalar mode mask in this case.  */
16248 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16249 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16250 	  mask = gen_reg_rtx (mode);
16251 	  emit_insn (gen_rtx_SET (mask, tmp));
16252 	}
16253     }
16254   else
16255     mask = gen_rtx_NOT (mode, mask);
16256   emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
16257   emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
16258 }
16259 
16260 /* Expand SSE sequence for computing lround from OP1 storing
16261    into OP0.  */
16262 
16263 void
16264 ix86_expand_lround (rtx op0, rtx op1)
16265 {
16266   /* C code for the stuff we're doing below:
16267 	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
16268 	return (long)tmp;
16269    */
16270   machine_mode mode = GET_MODE (op1);
16271   const struct real_format *fmt;
16272   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16273   rtx adj;
16274 
16275   /* load nextafter (0.5, 0.0) */
16276   fmt = REAL_MODE_FORMAT (mode);
16277   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16278   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16279 
16280   /* adj = copysign (0.5, op1) */
16281   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
16282   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
16283 
16284   /* adj = op1 + adj */
16285   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
16286 
16287   /* op0 = (imode)adj */
16288   expand_fix (op0, adj, 0);
16289 }
16290 
16291 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
16292    into OPERAND0.  */
16293 
16294 void
16295 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
16296 {
16297   /* C code for the stuff we're doing below (for do_floor):
16298 	xi = (long)op1;
16299 	xi -= (double)xi > op1 ? 1 : 0;
16300 	return xi;
16301    */
16302   machine_mode fmode = GET_MODE (op1);
16303   machine_mode imode = GET_MODE (op0);
16304   rtx ireg, freg, tmp;
16305   rtx_code_label *label;
16306 
16307   /* reg = (long)op1 */
16308   ireg = gen_reg_rtx (imode);
16309   expand_fix (ireg, op1, 0);
16310 
16311   /* freg = (double)reg */
16312   freg = gen_reg_rtx (fmode);
16313   expand_float (freg, ireg, 0);
16314 
16315   /* ireg = (freg > op1) ? ireg - 1 : ireg */
16316   label = ix86_expand_sse_compare_and_jump (UNLE,
16317 					    freg, op1, !do_floor);
16318   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
16319 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
16320   emit_move_insn (ireg, tmp);
16321 
16322   emit_label (label);
16323   LABEL_NUSES (label) = 1;
16324 
16325   emit_move_insn (op0, ireg);
16326 }
16327 
16328 /* Generate and return a rtx of mode MODE for 2**n where n is the number
16329    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
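/* For DFmode this is 2**52.  Adding it to a nonnegative x < 2**52 gives a
   value whose unit in the last place is 1, so x + 2**52 - 2**52 is x
   rounded to an integer in the current rounding mode.  The rint, floor,
   ceil and trunc expanders below rely on this, and skip the sequence when
   |x| >= 2**52 because such values are already integral.  */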
16330 
16331 static rtx
16332 ix86_gen_TWO52 (machine_mode mode)
16333 {
16334   const struct real_format *fmt;
16335   REAL_VALUE_TYPE TWO52r;
16336   rtx TWO52;
16337 
16338   fmt = REAL_MODE_FORMAT (mode);
16339   real_2expN (&TWO52r, fmt->p - 1, mode);
16340   TWO52 = const_double_from_real_value (TWO52r, mode);
16341   TWO52 = force_reg (mode, TWO52);
16342 
16343   return TWO52;
16344 }
16345 
16346 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
16347 
16348 void
16349 ix86_expand_rint (rtx operand0, rtx operand1)
16350 {
16351   /* C code for the stuff we're doing below:
16352 	xa = fabs (operand1);
16353 	if (!isless (xa, 2**52))
16354 	  return operand1;
16355 	two52 = 2**52;
16356 	if (flag_rounding_math)
16357 	  {
16358 	    two52 = copysign (two52, operand1);
16359 	    xa = operand1;
16360 	  }
16361 	xa = xa + two52 - two52;
16362 	return copysign (xa, operand1);
16363    */
16364   machine_mode mode = GET_MODE (operand0);
16365   rtx res, xa, TWO52, mask;
16366   rtx_code_label *label;
16367 
16368   TWO52 = ix86_gen_TWO52 (mode);
16369 
16370   /* Temporary for holding the result, initialized to the input
16371      operand to ease control flow.  */
16372   res = copy_to_reg (operand1);
16373 
16374   /* xa = abs (operand1) */
16375   xa = ix86_expand_sse_fabs (res, &mask);
16376 
16377   /* if (!isless (xa, TWO52)) goto label; */
16378   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16379 
16380   if (flag_rounding_math)
16381     {
16382       ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
16383       xa = res;
16384     }
16385 
16386   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16387   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16388 
16389   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16390   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16391     xa = ix86_expand_sse_fabs (xa, NULL);
16392 
16393   ix86_sse_copysign_to_positive (res, xa, res, mask);
16394 
16395   emit_label (label);
16396   LABEL_NUSES (label) = 1;
16397 
16398   emit_move_insn (operand0, res);
16399 }
16400 
16401 /* Expand SSE2 sequence for computing floor or ceil
16402    from OPERAND1 storing into OPERAND0.  */
16403 void
16404 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
16405 {
16406   /* C code for the stuff we expand below.
16407 	double xa = fabs (x), x2;
16408 	if (!isless (xa, TWO52))
16409 	  return x;
16410 	x2 = (double)(long)x;
16411 
16412      Compensate.  Floor:
16413 	if (x2 > x)
16414 	  x2 -= 1;
16415      Compensate.  Ceil:
16416 	if (x2 < x)
16417 	  x2 += 1;
16418 
16419 	if (HONOR_SIGNED_ZEROS (mode))
16420 	  return copysign (x2, x);
16421 	return x2;
16422    */
16423   machine_mode mode = GET_MODE (operand0);
16424   rtx xa, xi, TWO52, tmp, one, res, mask;
16425   rtx_code_label *label;
16426 
16427   TWO52 = ix86_gen_TWO52 (mode);
16428 
16429   /* Temporary for holding the result, initialized to the input
16430      operand to ease control flow.  */
16431   res = copy_to_reg (operand1);
16432 
16433   /* xa = abs (operand1) */
16434   xa = ix86_expand_sse_fabs (res, &mask);
16435 
16436   /* if (!isless (xa, TWO52)) goto label; */
16437   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16438 
16439   /* xa = (double)(long)x */
16440   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16441   expand_fix (xi, res, 0);
16442   expand_float (xa, xi, 0);
16443 
16444   /* generate 1.0 */
16445   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16446 
16447   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16448   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16449   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16450   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16451 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16452   if (HONOR_SIGNED_ZEROS (mode))
16453     {
16454       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16455       if (do_floor && flag_rounding_math)
16456 	tmp = ix86_expand_sse_fabs (tmp, NULL);
16457 
16458       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16459     }
16460   emit_move_insn (res, tmp);
16461 
16462   emit_label (label);
16463   LABEL_NUSES (label) = 1;
16464 
16465   emit_move_insn (operand0, res);
16466 }
16467 
16468 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16469    into OPERAND0 without relying on DImode truncation via cvttsd2siq
16470    that is only available on 64bit targets.  */
16471 void
16472 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
16473 {
16474   /* C code for the stuff we expand below.
16475 	double xa = fabs (x), x2;
16476 	if (!isless (xa, TWO52))
16477 	  return x;
16478 	xa = xa + TWO52 - TWO52;
16479 	x2 = copysign (xa, x);
16480 
16481      Compensate.  Floor:
16482 	if (x2 > x)
16483 	  x2 -= 1;
16484      Compensate.  Ceil:
16485 	if (x2 < x)
16486 	  x2 += 1;
16487 
16488 	if (HONOR_SIGNED_ZEROS (mode))
16489 	  x2 = copysign (x2, x);
16490 	return x2;
16491    */
16492   machine_mode mode = GET_MODE (operand0);
16493   rtx xa, TWO52, tmp, one, res, mask;
16494   rtx_code_label *label;
16495 
16496   TWO52 = ix86_gen_TWO52 (mode);
16497 
16498   /* Temporary for holding the result, initialized to the input
16499      operand to ease control flow.  */
16500   res = copy_to_reg (operand1);
16501 
16502   /* xa = abs (operand1) */
16503   xa = ix86_expand_sse_fabs (res, &mask);
16504 
16505   /* if (!isless (xa, TWO52)) goto label; */
16506   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16507 
16508   /* xa = xa + TWO52 - TWO52; */
16509   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16510   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16511 
16512   /* xa = copysign (xa, operand1) */
16513   ix86_sse_copysign_to_positive (xa, xa, res, mask);
16514 
16515   /* generate 1.0 */
16516   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16517 
16518   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16519   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16520   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16521   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16522 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16523   if (HONOR_SIGNED_ZEROS (mode))
16524     {
16525       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16526       if (do_floor && flag_rounding_math)
16527 	tmp = ix86_expand_sse_fabs (tmp, NULL);
16528 
16529       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16530     }
16531   emit_move_insn (res, tmp);
16532 
16533   emit_label (label);
16534   LABEL_NUSES (label) = 1;
16535 
16536   emit_move_insn (operand0, res);
16537 }
16538 
16539 /* Expand SSE sequence for computing trunc
16540    from OPERAND1 storing into OPERAND0.  */
16541 void
16542 ix86_expand_trunc (rtx operand0, rtx operand1)
16543 {
16544   /* C code for SSE variant we expand below.
16545 	double xa = fabs (x), x2;
16546 	if (!isless (xa, TWO52))
16547 	  return x;
16548 	x2 = (double)(long)x;
16549 	if (HONOR_SIGNED_ZEROS (mode))
16550 	  return copysign (x2, x);
16551 	return x2;
16552    */
16553   machine_mode mode = GET_MODE (operand0);
16554   rtx xa, xi, TWO52, res, mask;
16555   rtx_code_label *label;
16556 
16557   TWO52 = ix86_gen_TWO52 (mode);
16558 
16559   /* Temporary for holding the result, initialized to the input
16560      operand to ease control flow.  */
16561   res = copy_to_reg (operand1);
16562 
16563   /* xa = abs (operand1) */
16564   xa = ix86_expand_sse_fabs (res, &mask);
16565 
16566   /* if (!isless (xa, TWO52)) goto label; */
16567   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16568 
16569   /* xa = (double)(long)x */
16570   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16571   expand_fix (xi, res, 0);
16572   expand_float (xa, xi, 0);
16573 
16574   if (HONOR_SIGNED_ZEROS (mode))
16575     ix86_sse_copysign_to_positive (xa, xa, res, mask);
16576 
16577   emit_move_insn (res, xa);
16578 
16579   emit_label (label);
16580   LABEL_NUSES (label) = 1;
16581 
16582   emit_move_insn (operand0, res);
16583 }
16584 
16585 /* Expand SSE sequence for computing trunc from OPERAND1, storing
16586    into OPERAND0, without relying on DImode truncation via cvttsd2siq,
16587    which is only available on 64-bit targets.  */
16588 void
16589 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16590 {
16591   machine_mode mode = GET_MODE (operand0);
16592   rtx xa, xa2, TWO52, tmp, one, res, mask;
16593   rtx_code_label *label;
16594 
16595   /* C code for SSE variant we expand below.
16596 	double xa = fabs (x), x2;
16597 	if (!isless (xa, TWO52))
16598 	  return x;
16599 	xa2 = xa + TWO52 - TWO52;
16600      Compensate:
16601 	if (xa2 > xa)
16602 	  xa2 -= 1.0;
16603 	x2 = copysign (xa2, x);
16604 	return x2;
16605    */
16606 
16607   TWO52 = ix86_gen_TWO52 (mode);
16608 
16609   /* Temporary for holding the result, initialized to the input
16610      operand to ease control flow.  */
16611   res = copy_to_reg (operand1);
16612 
16613   /* xa = abs (operand1) */
16614   xa = ix86_expand_sse_fabs (res, &mask);
16615 
16616   /* if (!isless (xa, TWO52)) goto label; */
16617   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16618 
16619   /* xa2 = xa + TWO52 - TWO52; */
16620   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16621   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16622 
16623   /* generate 1.0 */
16624   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16625 
16626   /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
16627   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
16628   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16629   tmp = expand_simple_binop (mode, MINUS,
16630 			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16631   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16632   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16633     tmp = ix86_expand_sse_fabs (tmp, NULL);
16634 
16635   /* res = copysign (xa2, operand1) */
16636   ix86_sse_copysign_to_positive (res, tmp, res, mask);
16637 
16638   emit_label (label);
16639   LABEL_NUSES (label) = 1;
16640 
16641   emit_move_insn (operand0, res);
16642 }
16643 
16644 /* Expand SSE sequence for computing round
16645    from OPERAND1 storing into OPERAND0.  */
16646 void
16647 ix86_expand_round (rtx operand0, rtx operand1)
16648 {
16649   /* C code for the stuff we're doing below:
16650 	double xa = fabs (x);
16651 	if (!isless (xa, TWO52))
16652 	  return x;
16653 	xa = (double)(long)(xa + nextafter (0.5, 0.0));
16654 	return copysign (xa, x);
16655    */
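  /* Why nextafter (0.5, 0.0) instead of 0.5 (an illustrative note, not
     part of the expansion): with pred_half = 0.5 - 2**-54,

	round (0.49999999999999994)

     stays below 1.0 before the truncation and correctly yields 0.0,
     whereas adding exactly 0.5 would round the sum up to 1.0 under
     round-to-nearest-even and give the wrong result 1.0.  */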
16656   machine_mode mode = GET_MODE (operand0);
16657   rtx res, TWO52, xa, xi, half, mask;
16658   rtx_code_label *label;
16659   const struct real_format *fmt;
16660   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16661 
16662   /* Temporary for holding the result, initialized to the input
16663      operand to ease control flow.  */
16664   res = copy_to_reg (operand1);
16665 
16666   TWO52 = ix86_gen_TWO52 (mode);
16667   xa = ix86_expand_sse_fabs (res, &mask);
16668   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16669 
16670   /* load nextafter (0.5, 0.0) */
16671   fmt = REAL_MODE_FORMAT (mode);
16672   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16673   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16674 
16675   /* xa = xa + 0.5 */
16676   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16677   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16678 
16679   /* xa = (double)(int64_t)xa */
16680   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16681   expand_fix (xi, xa, 0);
16682   expand_float (xa, xi, 0);
16683 
16684   /* res = copysign (xa, operand1) */
16685   ix86_sse_copysign_to_positive (res, xa, res, mask);
16686 
16687   emit_label (label);
16688   LABEL_NUSES (label) = 1;
16689 
16690   emit_move_insn (operand0, res);
16691 }
16692 
16693 /* Expand SSE sequence for computing round from OPERAND1, storing
16694    into OPERAND0, without relying on DImode truncation via cvttsd2siq,
16695    which is only available on 64-bit targets.  */
16696 void
16697 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16698 {
16699   /* C code for the stuff we expand below.
16700 	double xa = fabs (x), xa2, x2;
16701 	if (!isless (xa, TWO52))
16702 	  return x;
16703      Using the absolute value and copying back sign makes
16704      -0.0 -> -0.0 correct.
16705 	xa2 = xa + TWO52 - TWO52;
16706      Compensate.
16707 	dxa = xa2 - xa;
16708 	if (dxa <= -0.5)
16709 	  xa2 += 1;
16710 	else if (dxa > 0.5)
16711 	  xa2 -= 1;
16712 	x2 = copysign (xa2, x);
16713 	return x2;
16714    */
16715   machine_mode mode = GET_MODE (operand0);
16716   rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16717   rtx_code_label *label;
16718 
16719   TWO52 = ix86_gen_TWO52 (mode);
16720 
16721   /* Temporary for holding the result, initialized to the input
16722      operand to ease control flow.  */
16723   res = copy_to_reg (operand1);
16724 
16725   /* xa = abs (operand1) */
16726   xa = ix86_expand_sse_fabs (res, &mask);
16727 
16728   /* if (!isless (xa, TWO52)) goto label; */
16729   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16730 
16731   /* xa2 = xa + TWO52 - TWO52; */
16732   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16733   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16734 
16735   /* dxa = xa2 - xa; */
16736   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16737 
16738   /* generate 0.5, 1.0 and -0.5 */
16739   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16740   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16741   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16742 			       0, OPTAB_DIRECT);
16743 
16744   /* Compensate.  */
16745   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16746   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16747   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16748   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16749   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16750   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16751   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16752   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16753 
16754   /* res = copysign (xa2, operand1) */
16755   ix86_sse_copysign_to_positive (res, xa2, res, mask);
16756 
16757   emit_label (label);
16758   LABEL_NUSES (label) = 1;
16759 
16760   emit_move_insn (operand0, res);
16761 }
16762 
16763 /* Expand SSE sequence for computing round
16764    from OP1, storing into OP0, using the SSE4.1 round insn.  */
16765 void
16766 ix86_expand_round_sse4 (rtx op0, rtx op1)
16767 {
16768   machine_mode mode = GET_MODE (op0);
16769   rtx e1, e2, res, half;
16770   const struct real_format *fmt;
16771   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16772   rtx (*gen_copysign) (rtx, rtx, rtx);
16773   rtx (*gen_round) (rtx, rtx, rtx);
16774 
16775   switch (mode)
16776     {
16777     case E_SFmode:
16778       gen_copysign = gen_copysignsf3;
16779       gen_round = gen_sse4_1_roundsf2;
16780       break;
16781     case E_DFmode:
16782       gen_copysign = gen_copysigndf3;
16783       gen_round = gen_sse4_1_rounddf2;
16784       break;
16785     default:
16786       gcc_unreachable ();
16787     }
16788 
16789   /* round (a) = trunc (a + copysign (0.5, a)) */
16790 
16791   /* load nextafter (0.5, 0.0) */
16792   fmt = REAL_MODE_FORMAT (mode);
16793   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16794   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16795   half = const_double_from_real_value (pred_half, mode);
16796 
16797   /* e1 = copysign (0.5, op1) */
16798   e1 = gen_reg_rtx (mode);
16799   emit_insn (gen_copysign (e1, half, op1));
16800 
16801   /* e2 = op1 + e1 */
16802   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16803 
16804   /* res = trunc (e2) */
16805   res = gen_reg_rtx (mode);
16806   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16807 
16808   emit_move_insn (op0, res);
16809 }
16810 
16811 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16812    insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16813    insn every time.  */
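/* An illustrative sketch (not from the source) of how the cached insn
   is reused: expand_vselect temporarily rewrites it in place to e.g.

	(set (reg:V4SF target)
	     (vec_select:V4SF (reg:V4SF op0)
			      (parallel [(const_int 0) (const_int 2)
					 (const_int 1) (const_int 3)])))

   asks recog_memoized whether the active ISA has a matching pattern,
   and then restores the placeholder operands.  */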
16814 
16815 static GTY(()) rtx_insn *vselect_insn;
16816 
16817 /* Initialize vselect_insn.  */
16818 
16819 static void
16820 init_vselect_insn (void)
16821 {
16822   unsigned i;
16823   rtx x;
16824 
16825   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16826   for (i = 0; i < MAX_VECT_LEN; ++i)
16827     XVECEXP (x, 0, i) = const0_rtx;
16828   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16829 							const0_rtx), x);
16830   x = gen_rtx_SET (const0_rtx, x);
16831   start_sequence ();
16832   vselect_insn = emit_insn (x);
16833   end_sequence ();
16834 }
16835 
16836 /* Construct (set target (vec_select op0 (parallel perm))) and
16837    return true if that's a valid instruction in the active ISA.  */
16838 
16839 static bool
16840 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16841 		unsigned nelt, bool testing_p)
16842 {
16843   unsigned int i;
16844   rtx x, save_vconcat;
16845   int icode;
16846 
16847   if (vselect_insn == NULL_RTX)
16848     init_vselect_insn ();
16849 
16850   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16851   PUT_NUM_ELEM (XVEC (x, 0), nelt);
16852   for (i = 0; i < nelt; ++i)
16853     XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16854   save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16855   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16856   PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16857   SET_DEST (PATTERN (vselect_insn)) = target;
16858   icode = recog_memoized (vselect_insn);
16859 
16860   if (icode >= 0 && !testing_p)
16861     emit_insn (copy_rtx (PATTERN (vselect_insn)));
16862 
16863   SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16864   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16865   INSN_CODE (vselect_insn) = -1;
16866 
16867   return icode >= 0;
16868 }
16869 
16870 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
16871 
16872 static bool
16873 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16874 			const unsigned char *perm, unsigned nelt,
16875 			bool testing_p)
16876 {
16877   machine_mode v2mode;
16878   rtx x;
16879   bool ok;
16880 
16881   if (vselect_insn == NULL_RTX)
16882     init_vselect_insn ();
16883 
16884   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16885     return false;
16886   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16887   PUT_MODE (x, v2mode);
16888   XEXP (x, 0) = op0;
16889   XEXP (x, 1) = op1;
16890   ok = expand_vselect (target, x, perm, nelt, testing_p);
16891   XEXP (x, 0) = const0_rtx;
16892   XEXP (x, 1) = const0_rtx;
16893   return ok;
16894 }
16895 
16896 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16897    using movss or movsd.  */
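/* An illustrative example (not from the source): for V4SFmode with
   operands A = { a0 a1 a2 a3 } and B = { b0 b1 b2 b3 }, the only
   accepted permutations are

	perm = { 4, 1, 2, 3 }	giving { b0 a1 a2 a3 }
	perm = { 0, 5, 6, 7 }	giving { a0 b1 b2 b3 }

   i.e. only element 0 switches operands, which is exactly what movss
   provides.  */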
16898 static bool
16899 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16900 {
16901   machine_mode vmode = d->vmode;
16902   unsigned i, nelt = d->nelt;
16903   rtx x;
16904 
16905   if (d->one_operand_p)
16906     return false;
16907 
16908   if (!(TARGET_SSE && vmode == V4SFmode)
16909       && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16910       && !(TARGET_SSE2 && vmode == V2DFmode))
16911     return false;
16912 
16913   /* Only the first element is changed.  */
16914   if (d->perm[0] != nelt && d->perm[0] != 0)
16915     return false;
16916   for (i = 1; i < nelt; ++i)
16917     if (d->perm[i] != i + nelt - d->perm[0])
16918       return false;
16919 
16920   if (d->testing_p)
16921     return true;
16922 
16923   if (d->perm[0] == nelt)
16924     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16925   else
16926     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16927 
16928   emit_insn (gen_rtx_SET (d->target, x));
16929 
16930   return true;
16931 }
16932 
16933 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16934    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
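/* An illustrative example (not from the source): a blend never moves
   an element; it only chooses, per position, between the two inputs.
   For V8HImode,

	perm = { 0, 9, 2, 11, 4, 13, 6, 15 }

   is a valid blend (each index is either i or i + 8) and becomes a
   pblendw with immediate mask 0xaa, whereas { 1, 9, 2, 11, ... } is
   rejected because element 0 would have to move.  */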
16935 
16936 static bool
16937 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16938 {
16939   machine_mode mmode, vmode = d->vmode;
16940   unsigned i, nelt = d->nelt;
16941   unsigned HOST_WIDE_INT mask;
16942   rtx target, op0, op1, maskop, x;
16943   rtx rperm[32], vperm;
16944 
16945   if (d->one_operand_p)
16946     return false;
16947   if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16948       && (TARGET_AVX512BW
16949 	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
16950     ;
16951   else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16952     ;
16953   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16954     ;
16955   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16956     ;
16957   else
16958     return false;
16959 
16960   /* This is a blend, not a permute.  Elements must stay in their
16961      respective lanes.  */
16962   for (i = 0; i < nelt; ++i)
16963     {
16964       unsigned e = d->perm[i];
16965       if (!(e == i || e == i + nelt))
16966 	return false;
16967     }
16968 
16969   if (d->testing_p)
16970     return true;
16971 
16972   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
16973      decision should be extracted elsewhere, so that we only try that
16974      sequence once all budget==3 options have been tried.  */
16975   target = d->target;
16976   op0 = d->op0;
16977   op1 = d->op1;
16978   mask = 0;
16979 
16980   switch (vmode)
16981     {
16982     case E_V8DFmode:
16983     case E_V16SFmode:
16984     case E_V4DFmode:
16985     case E_V8SFmode:
16986     case E_V2DFmode:
16987     case E_V4SFmode:
16988     case E_V8HImode:
16989     case E_V8SImode:
16990     case E_V32HImode:
16991     case E_V64QImode:
16992     case E_V16SImode:
16993     case E_V8DImode:
16994       for (i = 0; i < nelt; ++i)
16995 	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16996       break;
16997 
16998     case E_V2DImode:
16999       for (i = 0; i < 2; ++i)
17000 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
17001       vmode = V8HImode;
17002       goto do_subreg;
17003 
17004     case E_V4SImode:
17005       for (i = 0; i < 4; ++i)
17006 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17007       vmode = V8HImode;
17008       goto do_subreg;
17009 
17010     case E_V16QImode:
17011       /* See if bytes move in pairs so we can use pblendw with
17012 	 an immediate argument, rather than pblendvb with a vector
17013 	 argument.  */
17014       for (i = 0; i < 16; i += 2)
17015 	if (d->perm[i] + 1 != d->perm[i + 1])
17016 	  {
17017 	  use_pblendvb:
17018 	    for (i = 0; i < nelt; ++i)
17019 	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
17020 
17021 	  finish_pblendvb:
17022 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17023 	    vperm = force_reg (vmode, vperm);
17024 
17025 	    if (GET_MODE_SIZE (vmode) == 16)
17026 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
17027 	    else
17028 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
17029 	    if (target != d->target)
17030 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17031 	    return true;
17032 	  }
17033 
17034       for (i = 0; i < 8; ++i)
17035 	mask |= (d->perm[i * 2] >= 16) << i;
17036       vmode = V8HImode;
17037       /* FALLTHRU */
17038 
17039     do_subreg:
17040       target = gen_reg_rtx (vmode);
17041       op0 = gen_lowpart (vmode, op0);
17042       op1 = gen_lowpart (vmode, op1);
17043       break;
17044 
17045     case E_V32QImode:
17046       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
17047       for (i = 0; i < 32; i += 2)
17048 	if (d->perm[i] + 1 != d->perm[i + 1])
17049 	  goto use_pblendvb;
17050       /* See if bytes move in quadruplets.  If yes, vpblendd
17051 	 with immediate can be used.  */
17052       for (i = 0; i < 32; i += 4)
17053 	if (d->perm[i] + 2 != d->perm[i + 2])
17054 	  break;
17055       if (i < 32)
17056 	{
17057 	  /* See if bytes move the same in both lanes.  If yes,
17058 	     vpblendw with immediate can be used.  */
17059 	  for (i = 0; i < 16; i += 2)
17060 	    if (d->perm[i] + 16 != d->perm[i + 16])
17061 	      goto use_pblendvb;
17062 
17063 	  /* Use vpblendw.  */
17064 	  for (i = 0; i < 16; ++i)
17065 	    mask |= (d->perm[i * 2] >= 32) << i;
17066 	  vmode = V16HImode;
17067 	  goto do_subreg;
17068 	}
17069 
17070       /* Use vpblendd.  */
17071       for (i = 0; i < 8; ++i)
17072 	mask |= (d->perm[i * 4] >= 32) << i;
17073       vmode = V8SImode;
17074       goto do_subreg;
17075 
17076     case E_V16HImode:
17077       /* See if words move in pairs.  If yes, vpblendd can be used.  */
17078       for (i = 0; i < 16; i += 2)
17079 	if (d->perm[i] + 1 != d->perm[i + 1])
17080 	  break;
17081       if (i < 16)
17082 	{
17083 	  /* See if words move the same in both lanes.  If not,
17084 	     vpblendvb must be used.  */
17085 	  for (i = 0; i < 8; i++)
17086 	    if (d->perm[i] + 8 != d->perm[i + 8])
17087 	      {
17088 		/* Use vpblendvb.  */
17089 		for (i = 0; i < 32; ++i)
17090 		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
17091 
17092 		vmode = V32QImode;
17093 		nelt = 32;
17094 		target = gen_reg_rtx (vmode);
17095 		op0 = gen_lowpart (vmode, op0);
17096 		op1 = gen_lowpart (vmode, op1);
17097 		goto finish_pblendvb;
17098 	      }
17099 
17100 	  /* Use vpblendw.  */
17101 	  for (i = 0; i < 16; ++i)
17102 	    mask |= (d->perm[i] >= 16) << i;
17103 	  break;
17104 	}
17105 
17106       /* Use vpblendd.  */
17107       for (i = 0; i < 8; ++i)
17108 	mask |= (d->perm[i * 2] >= 16) << i;
17109       vmode = V8SImode;
17110       goto do_subreg;
17111 
17112     case E_V4DImode:
17113       /* Use vpblendd.  */
17114       for (i = 0; i < 4; ++i)
17115 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17116       vmode = V8SImode;
17117       goto do_subreg;
17118 
17119     default:
17120       gcc_unreachable ();
17121     }
17122 
17123   switch (vmode)
17124     {
17125     case E_V8DFmode:
17126     case E_V8DImode:
17127       mmode = QImode;
17128       break;
17129     case E_V16SFmode:
17130     case E_V16SImode:
17131       mmode = HImode;
17132       break;
17133     case E_V32HImode:
17134       mmode = SImode;
17135       break;
17136     case E_V64QImode:
17137       mmode = DImode;
17138       break;
17139     default:
17140       mmode = VOIDmode;
17141     }
17142 
17143   if (mmode != VOIDmode)
17144     maskop = force_reg (mmode, gen_int_mode (mask, mmode));
17145   else
17146     maskop = GEN_INT (mask);
17147 
17148   /* This matches five different patterns with the different modes.  */
17149   x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
17150   x = gen_rtx_SET (target, x);
17151   emit_insn (x);
17152   if (target != d->target)
17153     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17154 
17155   return true;
17156 }
17157 
17158 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17159    in terms of the variable form of vpermilps.
17160 
17161    Note that we will have already failed the immediate input vpermilps,
17162    which requires that the high and low part shuffle be identical; the
17163    variable form doesn't require that.  */
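/* An illustrative example (not from the source): for V8SFmode the two
   128-bit lanes are elements 0-3 and 4-7, so

	perm = { 3, 2, 1, 0, 5, 5, 7, 6 }

   is accepted here (every index stays in its own lane, and the two
   lane patterns may differ, which the immediate form cannot express),
   while { 4, 1, 2, 3, 0, 5, 6, 7 } is rejected because elements would
   cross lanes.  */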
17164 
17165 static bool
17166 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
17167 {
17168   rtx rperm[8], vperm;
17169   unsigned i;
17170 
17171   if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
17172     return false;
17173 
17174   /* We can only permute within the 128-bit lane.  */
17175   for (i = 0; i < 8; ++i)
17176     {
17177       unsigned e = d->perm[i];
17178       if (i < 4 ? e >= 4 : e < 4)
17179 	return false;
17180     }
17181 
17182   if (d->testing_p)
17183     return true;
17184 
17185   for (i = 0; i < 8; ++i)
17186     {
17187       unsigned e = d->perm[i];
17188 
17189       /* Within each 128-bit lane, the elements of op0 are numbered
17190 	 from 0 and the elements of op1 are numbered from 4.  */
17191       if (e >= 8 + 4)
17192 	e -= 8;
17193       else if (e >= 4)
17194 	e -= 4;
17195 
17196       rperm[i] = GEN_INT (e);
17197     }
17198 
17199   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
17200   vperm = force_reg (V8SImode, vperm);
17201   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
17202 
17203   return true;
17204 }
17205 
17206 /* Return true if permutation D can be performed as a VMODE permutation
17207    instead.  */
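/* An illustrative example (not from the source): the V16QImode
   permutation

	{ 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 }

   moves bytes in aligned pairs, so it can equally be expressed as the
   V8HImode permutation { 1, 0, 3, 2, 5, 4, 7, 6 }, and this function
   returns true for vmode == V8HImode.  */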
17208 
17209 static bool
17210 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
17211 {
17212   unsigned int i, j, chunk;
17213 
17214   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
17215       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
17216       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
17217     return false;
17218 
17219   if (GET_MODE_NUNITS (vmode) >= d->nelt)
17220     return true;
17221 
17222   chunk = d->nelt / GET_MODE_NUNITS (vmode);
17223   for (i = 0; i < d->nelt; i += chunk)
17224     if (d->perm[i] & (chunk - 1))
17225       return false;
17226     else
17227       for (j = 1; j < chunk; ++j)
17228 	if (d->perm[i] + j != d->perm[i + j])
17229 	  return false;
17230 
17231   return true;
17232 }
17233 
17234 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17235    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
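/* An illustrative example (not from the source): a one-operand
   V16QImode permutation such as

	{ 0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8 }

   is handled by loading that byte selector as a V16QI constant and
   emitting a single pshufb; a two-operand 16-byte permutation needs
   XOP vpperm instead, and the 32- and 64-byte cases use the wider
   variable-permute instructions listed above when the lane
   restrictions allow it.  */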
17236 
17237 static bool
17238 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
17239 {
17240   unsigned i, nelt, eltsz, mask;
17241   unsigned char perm[64];
17242   machine_mode vmode = V16QImode;
17243   rtx rperm[64], vperm, target, op0, op1;
17244 
17245   nelt = d->nelt;
17246 
17247   if (!d->one_operand_p)
17248     {
17249       if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
17250 	{
17251 	  if (TARGET_AVX2
17252 	      && valid_perm_using_mode_p (V2TImode, d))
17253 	    {
17254 	      if (d->testing_p)
17255 		return true;
17256 
17257 	      /* Use vperm2i128 insn.  The pattern uses
17258 		 V4DImode instead of V2TImode.  */
17259 	      target = d->target;
17260 	      if (d->vmode != V4DImode)
17261 		target = gen_reg_rtx (V4DImode);
17262 	      op0 = gen_lowpart (V4DImode, d->op0);
17263 	      op1 = gen_lowpart (V4DImode, d->op1);
17264 	      rperm[0]
17265 		= GEN_INT ((d->perm[0] / (nelt / 2))
17266 			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
17267 	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
17268 	      if (target != d->target)
17269 		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17270 	      return true;
17271 	    }
17272 	  return false;
17273 	}
17274     }
17275   else
17276     {
17277       if (GET_MODE_SIZE (d->vmode) == 16)
17278 	{
17279 	  if (!TARGET_SSSE3)
17280 	    return false;
17281 	}
17282       else if (GET_MODE_SIZE (d->vmode) == 32)
17283 	{
17284 	  if (!TARGET_AVX2)
17285 	    return false;
17286 
17287 	  /* V4DImode should be already handled through
17288 	     expand_vselect by vpermq instruction.  */
17289 	  gcc_assert (d->vmode != V4DImode);
17290 
17291 	  vmode = V32QImode;
17292 	  if (d->vmode == V8SImode
17293 	      || d->vmode == V16HImode
17294 	      || d->vmode == V32QImode)
17295 	    {
17296 	      /* First see if vpermq can be used for
17297 		 V8SImode/V16HImode/V32QImode.  */
17298 	      if (valid_perm_using_mode_p (V4DImode, d))
17299 		{
17300 		  for (i = 0; i < 4; i++)
17301 		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
17302 		  if (d->testing_p)
17303 		    return true;
17304 		  target = gen_reg_rtx (V4DImode);
17305 		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
17306 				      perm, 4, false))
17307 		    {
17308 		      emit_move_insn (d->target,
17309 				      gen_lowpart (d->vmode, target));
17310 		      return true;
17311 		    }
17312 		  return false;
17313 		}
17314 
17315 	      /* Next see if vpermd can be used.  */
17316 	      if (valid_perm_using_mode_p (V8SImode, d))
17317 		vmode = V8SImode;
17318 	    }
17319 	  /* Or if vpermps can be used.  */
17320 	  else if (d->vmode == V8SFmode)
17321 	    vmode = V8SImode;
17322 
17323 	  if (vmode == V32QImode)
17324 	    {
17325 	      /* vpshufb only works within 128-bit lanes; it cannot
17326 		 shuffle bytes across lanes.  */
17327 	      for (i = 0; i < nelt; ++i)
17328 		if ((d->perm[i] ^ i) & (nelt / 2))
17329 		  return false;
17330 	    }
17331 	}
17332       else if (GET_MODE_SIZE (d->vmode) == 64)
17333 	{
17334 	  if (!TARGET_AVX512BW)
17335 	    return false;
17336 
17337 	  /* If vpermq didn't work, vpshufb won't work either.  */
17338 	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
17339 	    return false;
17340 
17341 	  vmode = V64QImode;
17342 	  if (d->vmode == V16SImode
17343 	      || d->vmode == V32HImode
17344 	      || d->vmode == V64QImode)
17345 	    {
17346 	      /* First see if vpermq can be used for
17347 		 V16SImode/V32HImode/V64QImode.  */
17348 	      if (valid_perm_using_mode_p (V8DImode, d))
17349 		{
17350 		  for (i = 0; i < 8; i++)
17351 		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
17352 		  if (d->testing_p)
17353 		    return true;
17354 		  target = gen_reg_rtx (V8DImode);
17355 		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
17356 				      perm, 8, false))
17357 		    {
17358 		      emit_move_insn (d->target,
17359 				      gen_lowpart (d->vmode, target));
17360 		      return true;
17361 		    }
17362 		  return false;
17363 		}
17364 
17365 	      /* Next see if vpermd can be used.  */
17366 	      if (valid_perm_using_mode_p (V16SImode, d))
17367 		vmode = V16SImode;
17368 	    }
17369 	  /* Or if vpermps can be used.  */
17370 	  else if (d->vmode == V16SFmode)
17371 	    vmode = V16SImode;
17372 	  if (vmode == V64QImode)
17373 	    {
17374 	      /* vpshufb only works within 128-bit lanes; it cannot
17375 		 shuffle bytes across lanes.  */
17376 	      for (i = 0; i < nelt; ++i)
17377 		if ((d->perm[i] ^ i) & (3 * nelt / 4))
17378 		  return false;
17379 	    }
17380 	}
17381       else
17382 	return false;
17383     }
17384 
17385   if (d->testing_p)
17386     return true;
17387 
17388   if (vmode == V8SImode)
17389     for (i = 0; i < 8; ++i)
17390       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
17391   else if (vmode == V16SImode)
17392     for (i = 0; i < 16; ++i)
17393       rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
17394   else
17395     {
17396       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
17397       if (!d->one_operand_p)
17398 	mask = 2 * nelt - 1;
17399       else if (vmode == V16QImode)
17400 	mask = nelt - 1;
17401       else if (vmode == V64QImode)
17402 	mask = nelt / 4 - 1;
17403       else
17404 	mask = nelt / 2 - 1;
17405 
17406       for (i = 0; i < nelt; ++i)
17407 	{
17408 	  unsigned j, e = d->perm[i] & mask;
17409 	  for (j = 0; j < eltsz; ++j)
17410 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
17411 	}
17412     }
17413 
17414   vperm = gen_rtx_CONST_VECTOR (vmode,
17415 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
17416   vperm = force_reg (vmode, vperm);
17417 
17418   target = d->target;
17419   if (d->vmode != vmode)
17420     target = gen_reg_rtx (vmode);
17421   op0 = gen_lowpart (vmode, d->op0);
17422   if (d->one_operand_p)
17423     {
17424       if (vmode == V16QImode)
17425 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
17426       else if (vmode == V32QImode)
17427 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
17428       else if (vmode == V64QImode)
17429 	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
17430       else if (vmode == V8SFmode)
17431 	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
17432       else if (vmode == V8SImode)
17433 	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
17434       else if (vmode == V16SFmode)
17435 	emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
17436       else if (vmode == V16SImode)
17437 	emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
17438       else
17439 	gcc_unreachable ();
17440     }
17441   else
17442     {
17443       op1 = gen_lowpart (vmode, d->op1);
17444       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
17445     }
17446   if (target != d->target)
17447     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17448 
17449   return true;
17450 }
17451 
17452 /* For V*[QHS]Imode permutations, check whether the same permutation
17453    can instead be performed in a 2x, 4x or 8x wider inner mode.  */
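/* An illustrative example (not from the source): the V16QImode
   permutation

	{ 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 }

   keeps adjacent pairs together, so it canonicalizes first to the
   V8HImode permutation { 2, 3, 0, 1, 6, 7, 4, 5 } and then, since the
   pairs are still adjacent, to the V4SImode permutation { 1, 0, 3, 2 },
   which a single pshufd can implement.  */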
17454 
17455 static bool
17456 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
17457 			      struct expand_vec_perm_d *nd)
17458 {
17459   int i;
17460   machine_mode mode = VOIDmode;
17461 
17462   switch (d->vmode)
17463     {
17464     case E_V16QImode: mode = V8HImode; break;
17465     case E_V32QImode: mode = V16HImode; break;
17466     case E_V64QImode: mode = V32HImode; break;
17467     case E_V8HImode: mode = V4SImode; break;
17468     case E_V16HImode: mode = V8SImode; break;
17469     case E_V32HImode: mode = V16SImode; break;
17470     case E_V4SImode: mode = V2DImode; break;
17471     case E_V8SImode: mode = V4DImode; break;
17472     case E_V16SImode: mode = V8DImode; break;
17473     default: return false;
17474     }
17475   for (i = 0; i < d->nelt; i += 2)
17476     if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
17477       return false;
17478   nd->vmode = mode;
17479   nd->nelt = d->nelt / 2;
17480   for (i = 0; i < nd->nelt; i++)
17481     nd->perm[i] = d->perm[2 * i] / 2;
17482   if (GET_MODE_INNER (mode) != DImode)
17483     canonicalize_vector_int_perm (nd, nd);
17484   if (nd != d)
17485     {
17486       nd->one_operand_p = d->one_operand_p;
17487       nd->testing_p = d->testing_p;
17488       if (d->op0 == d->op1)
17489 	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
17490       else
17491 	{
17492 	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
17493 	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
17494 	}
17495       if (d->testing_p)
17496 	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
17497       else
17498 	nd->target = gen_reg_rtx (nd->vmode);
17499     }
17500   return true;
17501 }
17502 
17503 /* Try to expand one-operand permutation with constant mask.  */
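/* An illustrative example (not from the source): for V16SImode with

	d->perm = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }

   the permutation is materialized as a V16SI constant, forced into a
   register, and a single vpermd (gen_avx512f_permvarv16si) performs
   the full cross-lane reversal.  */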
17504 
17505 static bool
17506 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17507 {
17508   machine_mode mode = GET_MODE (d->op0);
17509   machine_mode maskmode = mode;
17510   rtx (*gen) (rtx, rtx, rtx) = NULL;
17511   rtx target, op0, mask;
17512   rtx vec[64];
17513 
17514   if (!rtx_equal_p (d->op0, d->op1))
17515     return false;
17516 
17517   if (!TARGET_AVX512F)
17518     return false;
17519 
17520   switch (mode)
17521     {
17522     case E_V16SImode:
17523       gen = gen_avx512f_permvarv16si;
17524       break;
17525     case E_V16SFmode:
17526       gen = gen_avx512f_permvarv16sf;
17527       maskmode = V16SImode;
17528       break;
17529     case E_V8DImode:
17530       gen = gen_avx512f_permvarv8di;
17531       break;
17532     case E_V8DFmode:
17533       gen = gen_avx512f_permvarv8df;
17534       maskmode = V8DImode;
17535       break;
17536     default:
17537       return false;
17538     }
17539 
17540   target = d->target;
17541   op0 = d->op0;
17542   for (int i = 0; i < d->nelt; ++i)
17543     vec[i] = GEN_INT (d->perm[i]);
17544   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17545   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17546   return true;
17547 }
17548 
17549 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17550 
17551 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
17552    in a single instruction.  */
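/* Illustrative single-insn cases handled below (examples, not from the
   source): the identity { 0, 1, ..., nelt-1 } becomes a plain move,
   the all-zero permutation { 0, 0, ..., 0 } becomes a vpbroadcast when
   AVX2 is available, and an interleave such as { 0, nelt, 1, nelt+1, ... }
   is matched through the SEL+CONCAT (punpckl*) patterns.  */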
17553 
17554 static bool
17555 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17556 {
17557   unsigned i, nelt = d->nelt;
17558   struct expand_vec_perm_d nd;
17559 
17560   /* Check plain VEC_SELECT first, because AVX has instructions that could
17561      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17562      input where SEL+CONCAT may not.  */
17563   if (d->one_operand_p)
17564     {
17565       int mask = nelt - 1;
17566       bool identity_perm = true;
17567       bool broadcast_perm = true;
17568 
17569       for (i = 0; i < nelt; i++)
17570 	{
17571 	  nd.perm[i] = d->perm[i] & mask;
17572 	  if (nd.perm[i] != i)
17573 	    identity_perm = false;
17574 	  if (nd.perm[i])
17575 	    broadcast_perm = false;
17576 	}
17577 
17578       if (identity_perm)
17579 	{
17580 	  if (!d->testing_p)
17581 	    emit_move_insn (d->target, d->op0);
17582 	  return true;
17583 	}
17584       else if (broadcast_perm && TARGET_AVX2)
17585 	{
17586 	  /* Use vpbroadcast{b,w,d}.  */
17587 	  rtx (*gen) (rtx, rtx) = NULL;
17588 	  switch (d->vmode)
17589 	    {
17590 	    case E_V64QImode:
17591 	      if (TARGET_AVX512BW)
17592 		gen = gen_avx512bw_vec_dupv64qi_1;
17593 	      break;
17594 	    case E_V32QImode:
17595 	      gen = gen_avx2_pbroadcastv32qi_1;
17596 	      break;
17597 	    case E_V32HImode:
17598 	      if (TARGET_AVX512BW)
17599 		gen = gen_avx512bw_vec_dupv32hi_1;
17600 	      break;
17601 	    case E_V16HImode:
17602 	      gen = gen_avx2_pbroadcastv16hi_1;
17603 	      break;
17604 	    case E_V16SImode:
17605 	      if (TARGET_AVX512F)
17606 		gen = gen_avx512f_vec_dupv16si_1;
17607 	      break;
17608 	    case E_V8SImode:
17609 	      gen = gen_avx2_pbroadcastv8si_1;
17610 	      break;
17611 	    case E_V16QImode:
17612 	      gen = gen_avx2_pbroadcastv16qi;
17613 	      break;
17614 	    case E_V8HImode:
17615 	      gen = gen_avx2_pbroadcastv8hi;
17616 	      break;
17617 	    case E_V16SFmode:
17618 	      if (TARGET_AVX512F)
17619 		gen = gen_avx512f_vec_dupv16sf_1;
17620 	      break;
17621 	    case E_V8SFmode:
17622 	      gen = gen_avx2_vec_dupv8sf_1;
17623 	      break;
17624 	    case E_V8DFmode:
17625 	      if (TARGET_AVX512F)
17626 		gen = gen_avx512f_vec_dupv8df_1;
17627 	      break;
17628 	    case E_V8DImode:
17629 	      if (TARGET_AVX512F)
17630 		gen = gen_avx512f_vec_dupv8di_1;
17631 	      break;
17632 	    /* For other modes prefer other shuffles this function creates.  */
17633 	    default: break;
17634 	    }
17635 	  if (gen != NULL)
17636 	    {
17637 	      if (!d->testing_p)
17638 		emit_insn (gen (d->target, d->op0));
17639 	      return true;
17640 	    }
17641 	}
17642 
17643       if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17644 	return true;
17645 
17646       /* There are plenty of patterns in sse.md that are written for
17647 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
17648 	 that should be changed, to avoid the nastiness here.  */
17649 
17650       /* Recognize interleave style patterns, which means incrementing
17651 	 every other permutation operand.  */
17652       for (i = 0; i < nelt; i += 2)
17653 	{
17654 	  nd.perm[i] = d->perm[i] & mask;
17655 	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17656 	}
17657       if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17658 				  d->testing_p))
17659 	return true;
17660 
17661       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
17662       if (nelt >= 4)
17663 	{
17664 	  for (i = 0; i < nelt; i += 4)
17665 	    {
17666 	      nd.perm[i + 0] = d->perm[i + 0] & mask;
17667 	      nd.perm[i + 1] = d->perm[i + 1] & mask;
17668 	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17669 	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17670 	    }
17671 
17672 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17673 				      d->testing_p))
17674 	    return true;
17675 	}
17676     }
17677 
17678   /* Try movss/movsd instructions.  */
17679   if (expand_vec_perm_movs (d))
17680     return true;
17681 
17682   /* Finally, try the fully general two operand permute.  */
17683   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17684 			      d->testing_p))
17685     return true;
17686 
17687   /* Recognize interleave style patterns with reversed operands.  */
17688   if (!d->one_operand_p)
17689     {
17690       for (i = 0; i < nelt; ++i)
17691 	{
17692 	  unsigned e = d->perm[i];
17693 	  if (e >= nelt)
17694 	    e -= nelt;
17695 	  else
17696 	    e += nelt;
17697 	  nd.perm[i] = e;
17698 	}
17699 
17700       if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17701 				  d->testing_p))
17702 	return true;
17703     }
17704 
17705   /* Try the SSE4.1 blend variable merge instructions.  */
17706   if (expand_vec_perm_blend (d))
17707     return true;
17708 
17709   /* Try one of the AVX vpermil variable permutations.  */
17710   if (expand_vec_perm_vpermil (d))
17711     return true;
17712 
17713   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17714      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
17715   if (expand_vec_perm_pshufb (d))
17716     return true;
17717 
17718   /* Try the AVX2 vpalignr instruction.  */
17719   if (expand_vec_perm_palignr (d, true))
17720     return true;
17721 
17722   /* Try the AVX512F vperm{s,d} instructions.  */
17723   if (ix86_expand_vec_one_operand_perm_avx512 (d))
17724     return true;
17725 
17726   /* Try the AVX512F vpermt2/vpermi2 instructions.  */
17727   if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17728     return true;
17729 
17730   /* See if we can get the same permutation in different vector integer
17731      mode.  */
17732   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17733     {
17734       if (!d->testing_p)
17735 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17736       return true;
17737     }
17738   return false;
17739 }
17740 
17741 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17742    in terms of a pair of pshuflw + pshufhw instructions.  */
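/* An illustrative example (not from the source): the V8HImode
   permutation

	{ 3, 2, 1, 0, 6, 7, 4, 5 }

   keeps indices 0-3 in the low quadword and 4-7 in the high quadword,
   so it is emitted as pshuflw with { 3, 2, 1, 0 } followed by pshufhw
   with { 6, 7, 4, 5 }, each pass leaving the other half untouched.  */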
17743 
17744 static bool
17745 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17746 {
17747   unsigned char perm2[MAX_VECT_LEN];
17748   unsigned i;
17749   bool ok;
17750 
17751   if (d->vmode != V8HImode || !d->one_operand_p)
17752     return false;
17753 
17754   /* The two permutations only operate in 64-bit lanes.  */
17755   for (i = 0; i < 4; ++i)
17756     if (d->perm[i] >= 4)
17757       return false;
17758   for (i = 4; i < 8; ++i)
17759     if (d->perm[i] < 4)
17760       return false;
17761 
17762   if (d->testing_p)
17763     return true;
17764 
17765   /* Emit the pshuflw.  */
17766   memcpy (perm2, d->perm, 4);
17767   for (i = 4; i < 8; ++i)
17768     perm2[i] = i;
17769   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17770   gcc_assert (ok);
17771 
17772   /* Emit the pshufhw.  */
17773   memcpy (perm2 + 4, d->perm + 4, 4);
17774   for (i = 0; i < 4; ++i)
17775     perm2[i] = i;
17776   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17777   gcc_assert (ok);
17778 
17779   return true;
17780 }
17781 
17782 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17783    the permutation using the SSSE3 palignr instruction.  This succeeds
17784    when all of the elements in PERM fit within one vector and we merely
17785    need to shift them down so that a single vector permutation has a
17786    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
17787    the vpalignr instruction itself can perform the requested permutation.  */
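/* An illustrative example (not from the source): for V8HImode with
   operands A (elements 0-7) and B (elements 8-15),

	perm = { 3, 4, 5, 6, 7, 8, 9, 10 }

   uses a contiguous window of eight elements starting at min == 3, so
   a single palignr by 3 elements of the B:A concatenation already
   yields the requested order; a permutation such as

	perm = { 10, 3, 9, 4, 8, 5, 7, 6 }

   still fits in the same window but needs the follow-up single-vector
   shuffle (pshufb) on the palignr result.  */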
17788 
17789 static bool
17790 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17791 {
17792   unsigned i, nelt = d->nelt;
17793   unsigned min, max, minswap, maxswap;
17794   bool in_order, ok, swap = false;
17795   rtx shift, target;
17796   struct expand_vec_perm_d dcopy;
17797 
17798   /* Even with AVX, palignr only operates on 128-bit vectors;
17799      with AVX2 it operates on each 128-bit lane separately.  */
17800   if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17801       && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17802     return false;
17803 
17804   min = 2 * nelt;
17805   max = 0;
17806   minswap = 2 * nelt;
17807   maxswap = 0;
17808   for (i = 0; i < nelt; ++i)
17809     {
17810       unsigned e = d->perm[i];
17811       unsigned eswap = d->perm[i] ^ nelt;
17812       if (GET_MODE_SIZE (d->vmode) == 32)
17813 	{
17814 	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17815 	  eswap = e ^ (nelt / 2);
17816 	}
17817       if (e < min)
17818 	min = e;
17819       if (e > max)
17820 	max = e;
17821       if (eswap < minswap)
17822 	minswap = eswap;
17823       if (eswap > maxswap)
17824 	maxswap = eswap;
17825     }
17826   if (min == 0
17827       || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17828     {
17829       if (d->one_operand_p
17830 	  || minswap == 0
17831 	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17832 				   ? nelt / 2 : nelt))
17833 	return false;
17834       swap = true;
17835       min = minswap;
17836       max = maxswap;
17837     }
17838 
17839   /* Given that we have SSSE3, we know we'll be able to implement the
17840      single operand permutation after the palignr with pshufb for
17841      128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
17842      first.  */
17843   if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17844     return true;
17845 
17846   dcopy = *d;
17847   if (swap)
17848     {
17849       dcopy.op0 = d->op1;
17850       dcopy.op1 = d->op0;
17851       for (i = 0; i < nelt; ++i)
17852 	dcopy.perm[i] ^= nelt;
17853     }
17854 
17855   in_order = true;
17856   for (i = 0; i < nelt; ++i)
17857     {
17858       unsigned e = dcopy.perm[i];
17859       if (GET_MODE_SIZE (d->vmode) == 32
17860 	  && e >= nelt
17861 	  && (e & (nelt / 2 - 1)) < min)
17862 	e = e - min - (nelt / 2);
17863       else
17864 	e = e - min;
17865       if (e != i)
17866 	in_order = false;
17867       dcopy.perm[i] = e;
17868     }
17869   dcopy.one_operand_p = true;
17870 
17871   if (single_insn_only_p && !in_order)
17872     return false;
17873 
17874   /* For AVX2, test whether we can permute the result in one instruction.  */
17875   if (d->testing_p)
17876     {
17877       if (in_order)
17878 	return true;
17879       dcopy.op1 = dcopy.op0;
17880       return expand_vec_perm_1 (&dcopy);
17881     }
17882 
17883   shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17884   if (GET_MODE_SIZE (d->vmode) == 16)
17885     {
17886       target = gen_reg_rtx (TImode);
17887       emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17888 				      gen_lowpart (TImode, dcopy.op0), shift));
17889     }
17890   else
17891     {
17892       target = gen_reg_rtx (V2TImode);
17893       emit_insn (gen_avx2_palignrv2ti (target,
17894 				       gen_lowpart (V2TImode, dcopy.op1),
17895 				       gen_lowpart (V2TImode, dcopy.op0),
17896 				       shift));
17897     }
17898 
17899   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17900 
17901   /* Test for the degenerate case where the alignment by itself
17902      produces the desired permutation.  */
17903   if (in_order)
17904     {
17905       emit_move_insn (d->target, dcopy.op0);
17906       return true;
17907     }
17908 
17909   ok = expand_vec_perm_1 (&dcopy);
17910   gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17911 
17912   return ok;
17913 }
17914 
17915 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17916    the permutation using the SSE4_1 pblendv instruction.  Potentially
17917    reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */
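/* An illustrative example (not from the source): for V8HImode,

	perm = { 0, 1, 13, 3, 4, 10, 6, 7 }

   leaves only elements 2 and 5 out of place, and both come from the
   second operand, so the routine first permutes op1 alone with the
   one-operand shuffle { 0, 1, 5, 3, 4, 2, 6, 7 } and then blends that
   result with op0 using a single pblendw, instead of two pshufbs plus
   an or.  */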
17918 
17919 static bool
17920 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17921 {
17922   unsigned i, which, nelt = d->nelt;
17923   struct expand_vec_perm_d dcopy, dcopy1;
17924   machine_mode vmode = d->vmode;
17925   bool ok;
17926 
17927   /* Use the same checks as in expand_vec_perm_blend.  */
17928   if (d->one_operand_p)
17929     return false;
17930   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17931     ;
17932   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17933     ;
17934   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17935     ;
17936   else
17937     return false;
17938 
17939   /* Figure out which permutation elements do not stay in their
17940      respective lanes.  */
17941   for (i = 0, which = 0; i < nelt; ++i)
17942     {
17943       unsigned e = d->perm[i];
17944       if (e != i)
17945 	which |= (e < nelt ? 1 : 2);
17946     }
17947   /* We can pblend the part where elements do not stay in their
17948      respective lanes only when these elements all come from the
17949      same half of the permutation.
17950      {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
17951      lanes, but both are >= 8.
17952      {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
17953      respective lanes, and while 8 >= 8, 2 is not.  */
17954   if (which != 1 && which != 2)
17955     return false;
17956   if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17957     return true;
17958 
17959   /* First we apply a one-operand permutation to the part whose
17960      elements do not stay in their respective lanes.  */
17961   dcopy = *d;
17962   if (which == 2)
17963     dcopy.op0 = dcopy.op1 = d->op1;
17964   else
17965     dcopy.op0 = dcopy.op1 = d->op0;
17966   if (!d->testing_p)
17967     dcopy.target = gen_reg_rtx (vmode);
17968   dcopy.one_operand_p = true;
17969 
17970   for (i = 0; i < nelt; ++i)
17971     dcopy.perm[i] = d->perm[i] & (nelt - 1);
17972 
17973   ok = expand_vec_perm_1 (&dcopy);
17974   if (GET_MODE_SIZE (vmode) != 16 && !ok)
17975     return false;
17976   else
17977     gcc_assert (ok);
17978   if (d->testing_p)
17979     return true;
17980 
17981   /* Next we put permuted elements into their positions.  */
17982   dcopy1 = *d;
17983   if (which == 2)
17984     dcopy1.op1 = dcopy.target;
17985   else
17986     dcopy1.op0 = dcopy.target;
17987 
17988   for (i = 0; i < nelt; ++i)
17989     dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17990 
17991   ok = expand_vec_perm_blend (&dcopy1);
17992   gcc_assert (ok);
17993 
17994   return true;
17995 }
17996 
17997 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17998 
17999 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
18000    a two vector permutation into a single vector permutation by using
18001    an interleave operation to merge the vectors.  */
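/* An illustrative example (not from the source): for V4SImode with
   operands A = { a0 a1 a2 a3 } and B = { b0 b1 b2 b3 },

	perm = { 1, 4, 0, 5 }

   only references the low halves of both inputs, so dremap becomes the
   interleave-low { 0, 4, 1, 5 } (punpckldq, giving { a0 b0 a1 b1 }),
   after which the single-vector shuffle dfinal = { 2, 1, 0, 3 } on the
   interleaved value produces the requested { a1 b0 a0 b1 }.  */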
18002 
18003 static bool
18004 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
18005 {
18006   struct expand_vec_perm_d dremap, dfinal;
18007   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
18008   unsigned HOST_WIDE_INT contents;
18009   unsigned char remap[2 * MAX_VECT_LEN];
18010   rtx_insn *seq;
18011   bool ok, same_halves = false;
18012 
18013   if (GET_MODE_SIZE (d->vmode) == 16)
18014     {
18015       if (d->one_operand_p)
18016 	return false;
18017     }
18018   else if (GET_MODE_SIZE (d->vmode) == 32)
18019     {
18020       if (!TARGET_AVX)
18021 	return false;
18022       /* For 32-byte modes allow even d->one_operand_p.
18023 	 The lack of cross-lane shuffling in some instructions
18024 	 might prevent a single insn shuffle.  */
18025       dfinal = *d;
18026       dfinal.testing_p = true;
18027       /* If expand_vec_perm_interleave3 can expand this into
18028 	 a 3 insn sequence, give up and let it be expanded as
18029 	 3 insn sequence.  While that is one insn longer,
18030 	 it doesn't need a memory operand and in the common
18031 	 case that both interleave low and high permutations
18032 	 with the same operands are adjacent needs 4 insns
18033 	 for both after CSE.  */
18034       if (expand_vec_perm_interleave3 (&dfinal))
18035 	return false;
18036     }
18037   else
18038     return false;
18039 
18040   /* Examine from whence the elements come.  */
18041   contents = 0;
18042   for (i = 0; i < nelt; ++i)
18043     contents |= HOST_WIDE_INT_1U << d->perm[i];
18044 
18045   memset (remap, 0xff, sizeof (remap));
18046   dremap = *d;
18047 
18048   if (GET_MODE_SIZE (d->vmode) == 16)
18049     {
18050       unsigned HOST_WIDE_INT h1, h2, h3, h4;
18051 
18052       /* Split the two input vectors into 4 halves.  */
18053       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
18054       h2 = h1 << nelt2;
18055       h3 = h2 << nelt2;
18056       h4 = h3 << nelt2;
18057 
18058       /* If the elements all come from the low halves, use interleave low;
18059 	 similarly for interleave high.  If the elements come from mis-matched
18060 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
18061       if ((contents & (h1 | h3)) == contents)
18062 	{
18063 	  /* punpckl* */
18064 	  for (i = 0; i < nelt2; ++i)
18065 	    {
18066 	      remap[i] = i * 2;
18067 	      remap[i + nelt] = i * 2 + 1;
18068 	      dremap.perm[i * 2] = i;
18069 	      dremap.perm[i * 2 + 1] = i + nelt;
18070 	    }
18071 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
18072 	    dremap.vmode = V4SFmode;
18073 	}
18074       else if ((contents & (h2 | h4)) == contents)
18075 	{
18076 	  /* punpckh* */
18077 	  for (i = 0; i < nelt2; ++i)
18078 	    {
18079 	      remap[i + nelt2] = i * 2;
18080 	      remap[i + nelt + nelt2] = i * 2 + 1;
18081 	      dremap.perm[i * 2] = i + nelt2;
18082 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
18083 	    }
18084 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
18085 	    dremap.vmode = V4SFmode;
18086 	}
18087       else if ((contents & (h1 | h4)) == contents)
18088 	{
18089 	  /* shufps */
18090 	  for (i = 0; i < nelt2; ++i)
18091 	    {
18092 	      remap[i] = i;
18093 	      remap[i + nelt + nelt2] = i + nelt2;
18094 	      dremap.perm[i] = i;
18095 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
18096 	    }
18097 	  if (nelt != 4)
18098 	    {
18099 	      /* shufpd */
18100 	      dremap.vmode = V2DImode;
18101 	      dremap.nelt = 2;
18102 	      dremap.perm[0] = 0;
18103 	      dremap.perm[1] = 3;
18104 	    }
18105 	}
18106       else if ((contents & (h2 | h3)) == contents)
18107 	{
18108 	  /* shufps */
18109 	  for (i = 0; i < nelt2; ++i)
18110 	    {
18111 	      remap[i + nelt2] = i;
18112 	      remap[i + nelt] = i + nelt2;
18113 	      dremap.perm[i] = i + nelt2;
18114 	      dremap.perm[i + nelt2] = i + nelt;
18115 	    }
18116 	  if (nelt != 4)
18117 	    {
18118 	      /* shufpd */
18119 	      dremap.vmode = V2DImode;
18120 	      dremap.nelt = 2;
18121 	      dremap.perm[0] = 1;
18122 	      dremap.perm[1] = 2;
18123 	    }
18124 	}
18125       else
18126 	return false;
18127     }
18128   else
18129     {
18130       unsigned int nelt4 = nelt / 4, nzcnt = 0;
18131       unsigned HOST_WIDE_INT q[8];
18132       unsigned int nonzero_halves[4];
18133 
18134       /* Split the two input vectors into 8 quarters.  */
18135       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
18136       for (i = 1; i < 8; ++i)
18137 	q[i] = q[0] << (nelt4 * i);
18138       for (i = 0; i < 4; ++i)
18139 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
18140 	  {
18141 	    nonzero_halves[nzcnt] = i;
18142 	    ++nzcnt;
18143 	  }
18144 
18145       if (nzcnt == 1)
18146 	{
18147 	  gcc_assert (d->one_operand_p);
18148 	  nonzero_halves[1] = nonzero_halves[0];
18149 	  same_halves = true;
18150 	}
18151       else if (d->one_operand_p)
18152 	{
18153 	  gcc_assert (nonzero_halves[0] == 0);
18154 	  gcc_assert (nonzero_halves[1] == 1);
18155 	}
18156 
18157       if (nzcnt <= 2)
18158 	{
18159 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
18160 	    {
18161 	      /* Attempt to increase the likelihood that dfinal
18162 		 shuffle will be intra-lane.  */
18163 	      std::swap (nonzero_halves[0], nonzero_halves[1]);
18164 	    }
18165 
18166 	  /* vperm2f128 or vperm2i128.  */
18167 	  for (i = 0; i < nelt2; ++i)
18168 	    {
18169 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
18170 	      remap[i + nonzero_halves[0] * nelt2] = i;
18171 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
18172 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
18173 	    }
18174 
18175 	  if (d->vmode != V8SFmode
18176 	      && d->vmode != V4DFmode
18177 	      && d->vmode != V8SImode)
18178 	    {
18179 	      dremap.vmode = V8SImode;
18180 	      dremap.nelt = 8;
18181 	      for (i = 0; i < 4; ++i)
18182 		{
18183 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
18184 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
18185 		}
18186 	    }
18187 	}
18188       else if (d->one_operand_p)
18189 	return false;
18190       else if (TARGET_AVX2
18191 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
18192 	{
18193 	  /* vpunpckl* */
18194 	  for (i = 0; i < nelt4; ++i)
18195 	    {
18196 	      remap[i] = i * 2;
18197 	      remap[i + nelt] = i * 2 + 1;
18198 	      remap[i + nelt2] = i * 2 + nelt2;
18199 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
18200 	      dremap.perm[i * 2] = i;
18201 	      dremap.perm[i * 2 + 1] = i + nelt;
18202 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
18203 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
18204 	    }
18205 	}
18206       else if (TARGET_AVX2
18207 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
18208 	{
18209 	  /* vpunpckh* */
18210 	  for (i = 0; i < nelt4; ++i)
18211 	    {
18212 	      remap[i + nelt4] = i * 2;
18213 	      remap[i + nelt + nelt4] = i * 2 + 1;
18214 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
18215 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
18216 	      dremap.perm[i * 2] = i + nelt4;
18217 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
18218 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
18219 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
18220 	    }
18221 	}
18222       else
18223 	return false;
18224     }
18225 
18226   /* Use the remapping array set up above to move the elements from their
18227      swizzled locations into their final destinations.  */
18228   dfinal = *d;
18229   for (i = 0; i < nelt; ++i)
18230     {
18231       unsigned e = remap[d->perm[i]];
18232       gcc_assert (e < nelt);
18233       /* If same_halves is true, both halves of the remapped vector are the
18234 	 same.  Avoid cross-lane accesses if possible.  */
18235       if (same_halves && i >= nelt2)
18236 	{
18237 	  gcc_assert (e < nelt2);
18238 	  dfinal.perm[i] = e + nelt2;
18239 	}
18240       else
18241 	dfinal.perm[i] = e;
18242     }
18243   if (!d->testing_p)
18244     {
18245       dremap.target = gen_reg_rtx (dremap.vmode);
18246       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18247     }
18248   dfinal.op1 = dfinal.op0;
18249   dfinal.one_operand_p = true;
18250 
18251   /* Test if the final remap can be done with a single insn.  For V4SFmode or
18252      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
18253   start_sequence ();
18254   ok = expand_vec_perm_1 (&dfinal);
18255   seq = get_insns ();
18256   end_sequence ();
18257 
18258   if (!ok)
18259     return false;
18260 
18261   if (d->testing_p)
18262     return true;
18263 
18264   if (dremap.vmode != dfinal.vmode)
18265     {
18266       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
18267       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
18268     }
18269 
18270   ok = expand_vec_perm_1 (&dremap);
18271   gcc_assert (ok);
18272 
18273   emit_insn (seq);
18274   return true;
18275 }
18276 
18277 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
18278    a single vector cross-lane permutation into vpermq followed
18279    by any of the single insn permutations.  */
18280 
18281 static bool
18282 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
18283 {
18284   struct expand_vec_perm_d dremap, dfinal;
18285   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
18286   unsigned contents[2];
18287   bool ok;
18288 
18289   if (!(TARGET_AVX2
18290 	&& (d->vmode == V32QImode || d->vmode == V16HImode)
18291 	&& d->one_operand_p))
18292     return false;
18293 
18294   contents[0] = 0;
18295   contents[1] = 0;
18296   for (i = 0; i < nelt2; ++i)
18297     {
18298       contents[0] |= 1u << (d->perm[i] / nelt4);
18299       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
18300     }
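  /* contents[k] is thus the set of 64-bit quarters of the input that half k
     of the result draws from; the loop below rejects the permutation if
     either half needs more than two quarters, since a single vpermq can
     place only two quarters into each 128-bit half.  */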
18301 
18302   for (i = 0; i < 2; ++i)
18303     {
18304       unsigned int cnt = 0;
18305       for (j = 0; j < 4; ++j)
18306 	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
18307 	  return false;
18308     }
18309 
18310   if (d->testing_p)
18311     return true;
18312 
18313   dremap = *d;
18314   dremap.vmode = V4DImode;
18315   dremap.nelt = 4;
18316   dremap.target = gen_reg_rtx (V4DImode);
18317   dremap.op0 = gen_lowpart (V4DImode, d->op0);
18318   dremap.op1 = dremap.op0;
18319   dremap.one_operand_p = true;
18320   for (i = 0; i < 2; ++i)
18321     {
18322       unsigned int cnt = 0;
18323       for (j = 0; j < 4; ++j)
18324 	if ((contents[i] & (1u << j)) != 0)
18325 	  dremap.perm[2 * i + cnt++] = j;
18326       for (; cnt < 2; ++cnt)
18327 	dremap.perm[2 * i + cnt] = 0;
18328     }
18329 
18330   dfinal = *d;
18331   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18332   dfinal.op1 = dfinal.op0;
18333   dfinal.one_operand_p = true;
18334   for (i = 0, j = 0; i < nelt; ++i)
18335     {
18336       if (i == nelt2)
18337 	j = 2;
18338       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
18339       if ((d->perm[i] / nelt4) == dremap.perm[j])
18340 	;
18341       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
18342 	dfinal.perm[i] |= nelt4;
18343       else
18344 	gcc_unreachable ();
18345     }
18346 
18347   ok = expand_vec_perm_1 (&dremap);
18348   gcc_assert (ok);
18349 
18350   ok = expand_vec_perm_1 (&dfinal);
18351   gcc_assert (ok);
18352 
18353   return true;
18354 }
18355 
18356 static bool canonicalize_perm (struct expand_vec_perm_d *d);
18357 
18358 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
18359    a vector permutation using two instructions, vperm2f128 resp.
18360    vperm2i128 followed by any single in-lane permutation.  */
18361 
18362 static bool
18363 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
18364 {
18365   struct expand_vec_perm_d dfirst, dsecond;
18366   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
18367   bool ok;
18368 
18369   if (!TARGET_AVX
18370       || GET_MODE_SIZE (d->vmode) != 32
18371       || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
18372     return false;
18373 
18374   dsecond = *d;
18375   dsecond.one_operand_p = false;
18376   dsecond.testing_p = true;
18377 
18378   /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
18379      immediate.  For perm < 16 the second permutation uses
18380      d->op0 as first operand, for perm >= 16 it uses d->op1
18381      as first operand.  The second operand is the result of
18382      vperm2[fi]128.  */
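  /* For example, perm == 9 (binary 1001) asks for lane 1 (the high lane of
     d->op0) in the low half and lane 2 (the low lane of d->op1) in the high
     half, giving the immediate ((9 << 2) | 9) & 0x33 == 0x21.  */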
18383   for (perm = 0; perm < 32; perm++)
18384     {
18385       /* Ignore permutations which do not move anything cross-lane.  */
18386       if (perm < 16)
18387 	{
18388 	  /* The second shuffle for e.g. V4DFmode has
18389 	     0123 and ABCD operands.
18390 	     Ignore AB23, as 23 is already in the second lane
18391 	     of the first operand.  */
18392 	  if ((perm & 0xc) == (1 << 2)) continue;
18393 	  /* And 01CD, as 01 is in the first lane of the first
18394 	     operand.  */
18395 	  if ((perm & 3) == 0) continue;
18396 	  /* And 4567, as then the vperm2[fi]128 doesn't change
18397 	     anything on the original 4567 second operand.  */
18398 	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
18399 	}
18400       else
18401 	{
18402 	  /* The second shuffle for e.g. V4DFmode has
18403 	     4567 and ABCD operands.
18404 	     Ignore AB67, as 67 is already in the second lane
18405 	     of the first operand.  */
18406 	  if ((perm & 0xc) == (3 << 2)) continue;
18407 	  /* And 45CD, as 45 is in the first lane of the first
18408 	     operand.  */
18409 	  if ((perm & 3) == 2) continue;
18410 	  /* And 0123, as then the vperm2[fi]128 doesn't change
18411 	     anything on the original 0123 first operand.  */
18412 	  if ((perm & 0xf) == (1 << 2)) continue;
18413 	}
18414 
18415       for (i = 0; i < nelt; i++)
18416 	{
18417 	  j = d->perm[i] / nelt2;
18418 	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
18419 	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
18420 	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
18421 	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
18422 	  else
18423 	    break;
18424 	}
18425 
18426       if (i == nelt)
18427 	{
18428 	  start_sequence ();
18429 	  ok = expand_vec_perm_1 (&dsecond);
18430 	  end_sequence ();
18431 	}
18432       else
18433 	ok = false;
18434 
18435       if (ok)
18436 	{
18437 	  if (d->testing_p)
18438 	    return true;
18439 
18440 	  /* Found a usable second shuffle.  dfirst will be
18441 	     vperm2f128 on d->op0 and d->op1.  */
18442 	  dsecond.testing_p = false;
18443 	  dfirst = *d;
18444 	  dfirst.target = gen_reg_rtx (d->vmode);
18445 	  for (i = 0; i < nelt; i++)
18446 	    dfirst.perm[i] = (i & (nelt2 - 1))
18447 			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
18448 
18449 	  canonicalize_perm (&dfirst);
18450 	  ok = expand_vec_perm_1 (&dfirst);
18451 	  gcc_assert (ok);
18452 
18453 	  /* And dsecond is some single insn shuffle, taking
18454 	     d->op0 and result of vperm2f128 (if perm < 16) or
18455 	     d->op1 and result of vperm2f128 (otherwise).  */
18456 	  if (perm >= 16)
18457 	    dsecond.op0 = dsecond.op1;
18458 	  dsecond.op1 = dfirst.target;
18459 
18460 	  ok = expand_vec_perm_1 (&dsecond);
18461 	  gcc_assert (ok);
18462 
18463 	  return true;
18464 	}
18465 
18466       /* For one operand, the only useful vperm2f128 permutation is 0x01
18467 	 aka lanes swap.  */
18468       if (d->one_operand_p)
18469 	return false;
18470     }
18471 
18472   return false;
18473 }
18474 
18475 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
18476    a two vector permutation using 2 intra-lane interleave insns
18477    and cross-lane shuffle for 32-byte vectors.  */
18478 
18479 static bool
18480 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
18481 {
18482   unsigned i, nelt;
18483   rtx (*gen) (rtx, rtx, rtx);
18484 
18485   if (d->one_operand_p)
18486     return false;
18487   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
18488     ;
18489   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
18490     ;
18491   else
18492     return false;
18493 
18494   nelt = d->nelt;
18495   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
18496     return false;
18497   for (i = 0; i < nelt; i += 2)
18498     if (d->perm[i] != d->perm[0] + i / 2
18499 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
18500       return false;
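  /* For example, for V8SImode the two accepted permutations are
     { 0, 8, 1, 9, 2, 10, 3, 11 } and { 4, 12, 5, 13, 6, 14, 7, 15 }.  */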
18501 
18502   if (d->testing_p)
18503     return true;
18504 
18505   switch (d->vmode)
18506     {
18507     case E_V32QImode:
18508       if (d->perm[0])
18509 	gen = gen_vec_interleave_highv32qi;
18510       else
18511 	gen = gen_vec_interleave_lowv32qi;
18512       break;
18513     case E_V16HImode:
18514       if (d->perm[0])
18515 	gen = gen_vec_interleave_highv16hi;
18516       else
18517 	gen = gen_vec_interleave_lowv16hi;
18518       break;
18519     case E_V8SImode:
18520       if (d->perm[0])
18521 	gen = gen_vec_interleave_highv8si;
18522       else
18523 	gen = gen_vec_interleave_lowv8si;
18524       break;
18525     case E_V4DImode:
18526       if (d->perm[0])
18527 	gen = gen_vec_interleave_highv4di;
18528       else
18529 	gen = gen_vec_interleave_lowv4di;
18530       break;
18531     case E_V8SFmode:
18532       if (d->perm[0])
18533 	gen = gen_vec_interleave_highv8sf;
18534       else
18535 	gen = gen_vec_interleave_lowv8sf;
18536       break;
18537     case E_V4DFmode:
18538       if (d->perm[0])
18539 	gen = gen_vec_interleave_highv4df;
18540       else
18541 	gen = gen_vec_interleave_lowv4df;
18542       break;
18543     default:
18544       gcc_unreachable ();
18545     }
18546 
18547   emit_insn (gen (d->target, d->op0, d->op1));
18548   return true;
18549 }
18550 
18551 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
18552    a single vector permutation using a single intra-lane vector
18553    permutation, vperm2f128 swapping the lanes and vblend* insn blending
18554    the non-swapped and swapped vectors together.  */
18555 
18556 static bool
18557 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18558 {
18559   struct expand_vec_perm_d dfirst, dsecond;
18560   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18561   rtx_insn *seq;
18562   bool ok;
18563   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18564 
18565   if (!TARGET_AVX
18566       || TARGET_AVX2
18567       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18568       || !d->one_operand_p)
18569     return false;
18570 
18571   dfirst = *d;
18572   for (i = 0; i < nelt; i++)
18573     dfirst.perm[i] = 0xff;
18574   for (i = 0, msk = 0; i < nelt; i++)
18575     {
18576       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18577       if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18578 	return false;
18579       dfirst.perm[j] = d->perm[i];
18580       if (j != i)
18581 	msk |= (1 << i);
18582     }
18583   for (i = 0; i < nelt; i++)
18584     if (dfirst.perm[i] == 0xff)
18585       dfirst.perm[i] = i;
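  /* dfirst is now an intra-lane permutation that places every requested
     element at its final position when it already lives in the right lane,
     or at the mirrored position in the other lane otherwise; the lane swap
     dsecond below brings the latter into place and msk tells vblend* which
     positions to take from the swapped copy.  */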
18586 
18587   if (!d->testing_p)
18588     dfirst.target = gen_reg_rtx (dfirst.vmode);
18589 
18590   start_sequence ();
18591   ok = expand_vec_perm_1 (&dfirst);
18592   seq = get_insns ();
18593   end_sequence ();
18594 
18595   if (!ok)
18596     return false;
18597 
18598   if (d->testing_p)
18599     return true;
18600 
18601   emit_insn (seq);
18602 
18603   dsecond = *d;
18604   dsecond.op0 = dfirst.target;
18605   dsecond.op1 = dfirst.target;
18606   dsecond.one_operand_p = true;
18607   dsecond.target = gen_reg_rtx (dsecond.vmode);
18608   for (i = 0; i < nelt; i++)
18609     dsecond.perm[i] = i ^ nelt2;
18610 
18611   ok = expand_vec_perm_1 (&dsecond);
18612   gcc_assert (ok);
18613 
18614   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18615   emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18616   return true;
18617 }
18618 
18619 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
18620    permutation using two vperm2f128, followed by a vshufpd insn blending
18621    the two vectors together.  */
18622 
18623 static bool
18624 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18625 {
18626   struct expand_vec_perm_d dfirst, dsecond, dthird;
18627   bool ok;
18628 
18629   if (!TARGET_AVX || (d->vmode != V4DFmode))
18630     return false;
18631 
18632   if (d->testing_p)
18633     return true;
18634 
18635   dfirst = *d;
18636   dsecond = *d;
18637   dthird = *d;
18638 
18639   dfirst.perm[0] = (d->perm[0] & ~1);
18640   dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18641   dfirst.perm[2] = (d->perm[2] & ~1);
18642   dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18643   dsecond.perm[0] = (d->perm[1] & ~1);
18644   dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18645   dsecond.perm[2] = (d->perm[3] & ~1);
18646   dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18647   dthird.perm[0] = (d->perm[0] % 2);
18648   dthird.perm[1] = (d->perm[1] % 2) + 4;
18649   dthird.perm[2] = (d->perm[2] % 2) + 2;
18650   dthird.perm[3] = (d->perm[3] % 2) + 6;
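  /* For example, for d->perm == { 2, 5, 1, 7 } this gives
     dfirst == { 2, 3, 0, 1 }, dsecond == { 4, 5, 6, 7 } and
     dthird == { 0, 5, 3, 7 }, i.e. each vperm2f128 gathers the 128-bit
     pairs that contain the requested elements and the final vshufpd picks
     the right double out of each pair.  */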
18651 
18652   dfirst.target = gen_reg_rtx (dfirst.vmode);
18653   dsecond.target = gen_reg_rtx (dsecond.vmode);
18654   dthird.op0 = dfirst.target;
18655   dthird.op1 = dsecond.target;
18656   dthird.one_operand_p = false;
18657 
18658   canonicalize_perm (&dfirst);
18659   canonicalize_perm (&dsecond);
18660 
18661   ok = expand_vec_perm_1 (&dfirst)
18662        && expand_vec_perm_1 (&dsecond)
18663        && expand_vec_perm_1 (&dthird);
18664 
18665   gcc_assert (ok);
18666 
18667   return true;
18668 }
18669 
18670 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18671 
18672 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
18673    a two vector permutation using two intra-lane vector
18674    permutations, vperm2f128 swapping the lanes and vblend* insn blending
18675    the non-swapped and swapped vectors together.  */
18676 
18677 static bool
18678 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18679 {
18680   struct expand_vec_perm_d dfirst, dsecond, dthird;
18681   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18682   rtx_insn *seq1, *seq2;
18683   bool ok;
18684   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18685 
18686   if (!TARGET_AVX
18687       || TARGET_AVX2
18688       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18689       || d->one_operand_p)
18690     return false;
18691 
18692   dfirst = *d;
18693   dsecond = *d;
18694   for (i = 0; i < nelt; i++)
18695     {
18696       dfirst.perm[i] = 0xff;
18697       dsecond.perm[i] = 0xff;
18698     }
18699   for (i = 0, msk = 0; i < nelt; i++)
18700     {
18701       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18702       if (j == i)
18703 	{
18704 	  dfirst.perm[j] = d->perm[i];
18705 	  which1 |= (d->perm[i] < nelt ? 1 : 2);
18706 	}
18707       else
18708 	{
18709 	  dsecond.perm[j] = d->perm[i];
18710 	  which2 |= (d->perm[i] < nelt ? 1 : 2);
18711 	  msk |= (1U << i);
18712 	}
18713     }
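  /* dfirst covers the positions whose source element is already in the
     right lane, dsecond (at lane-mirrored positions) those that are not;
     msk marks the latter so the final vblend* can take them from the
     lane-swapped copy of dsecond's result.  */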
18714   if (msk == 0 || msk == (1U << nelt) - 1)
18715     return false;
18716 
18717   if (!d->testing_p)
18718     {
18719       dfirst.target = gen_reg_rtx (dfirst.vmode);
18720       dsecond.target = gen_reg_rtx (dsecond.vmode);
18721     }
18722 
18723   for (i = 0; i < nelt; i++)
18724     {
18725       if (dfirst.perm[i] == 0xff)
18726 	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18727       if (dsecond.perm[i] == 0xff)
18728 	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18729     }
18730   canonicalize_perm (&dfirst);
18731   start_sequence ();
18732   ok = ix86_expand_vec_perm_const_1 (&dfirst);
18733   seq1 = get_insns ();
18734   end_sequence ();
18735 
18736   if (!ok)
18737     return false;
18738 
18739   canonicalize_perm (&dsecond);
18740   start_sequence ();
18741   ok = ix86_expand_vec_perm_const_1 (&dsecond);
18742   seq2 = get_insns ();
18743   end_sequence ();
18744 
18745   if (!ok)
18746     return false;
18747 
18748   if (d->testing_p)
18749     return true;
18750 
18751   emit_insn (seq1);
18752   emit_insn (seq2);
18753 
18754   dthird = *d;
18755   dthird.op0 = dsecond.target;
18756   dthird.op1 = dsecond.target;
18757   dthird.one_operand_p = true;
18758   dthird.target = gen_reg_rtx (dthird.vmode);
18759   for (i = 0; i < nelt; i++)
18760     dthird.perm[i] = i ^ nelt2;
18761 
18762   ok = expand_vec_perm_1 (&dthird);
18763   gcc_assert (ok);
18764 
18765   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18766   emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18767   return true;
18768 }
18769 
18770 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
18771    permutation with two pshufb insns and an ior.  We should have already
18772    failed all two instruction sequences.  */
18773 
18774 static bool
18775 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18776 {
18777   rtx rperm[2][16], vperm, l, h, op, m128;
18778   unsigned int i, nelt, eltsz;
18779 
18780   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18781     return false;
18782   gcc_assert (!d->one_operand_p);
18783 
18784   if (d->testing_p)
18785     return true;
18786 
18787   nelt = d->nelt;
18788   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18789 
18790   /* Generate two permutation masks.  If the required element is within
18791      the given vector it is shuffled into the proper lane.  If the required
18792      element is in the other vector, force a zero into the lane by setting
18793      bit 7 in the permutation mask.  */
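  /* For example, for V16QImode with d->perm[0] == 18, byte 0 of the first
     mask is -128 (so pshufb zeroes byte 0 of the op0 copy) and byte 0 of
     the second mask is 2 (so byte 0 of the op1 copy becomes op1's byte 2);
     the final ior merges the two.  */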
18794   m128 = GEN_INT (-128);
18795   for (i = 0; i < nelt; ++i)
18796     {
18797       unsigned j, e = d->perm[i];
18798       unsigned which = (e >= nelt);
18799       if (e >= nelt)
18800 	e -= nelt;
18801 
18802       for (j = 0; j < eltsz; ++j)
18803 	{
18804 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18805 	  rperm[1-which][i*eltsz + j] = m128;
18806 	}
18807     }
18808 
18809   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18810   vperm = force_reg (V16QImode, vperm);
18811 
18812   l = gen_reg_rtx (V16QImode);
18813   op = gen_lowpart (V16QImode, d->op0);
18814   emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18815 
18816   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18817   vperm = force_reg (V16QImode, vperm);
18818 
18819   h = gen_reg_rtx (V16QImode);
18820   op = gen_lowpart (V16QImode, d->op1);
18821   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18822 
18823   op = d->target;
18824   if (d->vmode != V16QImode)
18825     op = gen_reg_rtx (V16QImode);
18826   emit_insn (gen_iorv16qi3 (op, l, h));
18827   if (op != d->target)
18828     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18829 
18830   return true;
18831 }
18832 
18833 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
18834    with two vpshufb insns, vpermq and vpor.  We should have already failed
18835    all two or three instruction sequences.  */
18836 
18837 static bool
18838 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18839 {
18840   rtx rperm[2][32], vperm, l, h, hp, op, m128;
18841   unsigned int i, nelt, eltsz;
18842 
18843   if (!TARGET_AVX2
18844       || !d->one_operand_p
18845       || (d->vmode != V32QImode && d->vmode != V16HImode))
18846     return false;
18847 
18848   if (d->testing_p)
18849     return true;
18850 
18851   nelt = d->nelt;
18852   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18853 
18854   /* Generate two permutation masks.  If the required element is within
18855      the same lane, it is shuffled in.  If the required element is from
18856      the other lane, force a zero by setting bit 7 in the permutation mask.
18857      The other mask has non-negative elements where an element is
18858      requested from the other lane, but also moved to the other lane,
18859      so that the result of vpshufb can have the two V2TImode halves
18860      swapped.  */
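  /* For example, for V32QImode with d->perm[0] == 20 (a high-lane byte
     wanted in the low lane), the second mask gets index 4 at byte 16, so
     the pshufb result holds op0's byte 20 there and the vpermq lane swap
     below moves it to byte 0; the first mask gets -128 at byte 0.  */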
18861   m128 = GEN_INT (-128);
18862   for (i = 0; i < nelt; ++i)
18863     {
18864       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18865       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18866 
18867       for (j = 0; j < eltsz; ++j)
18868 	{
18869 	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18870 	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18871 	}
18872     }
18873 
18874   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18875   vperm = force_reg (V32QImode, vperm);
18876 
18877   h = gen_reg_rtx (V32QImode);
18878   op = gen_lowpart (V32QImode, d->op0);
18879   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18880 
18881   /* Swap the 128-bit lanes of h into hp.  */
18882   hp = gen_reg_rtx (V4DImode);
18883   op = gen_lowpart (V4DImode, h);
18884   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18885 				  const1_rtx));
18886 
18887   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18888   vperm = force_reg (V32QImode, vperm);
18889 
18890   l = gen_reg_rtx (V32QImode);
18891   op = gen_lowpart (V32QImode, d->op0);
18892   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18893 
18894   op = d->target;
18895   if (d->vmode != V32QImode)
18896     op = gen_reg_rtx (V32QImode);
18897   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18898   if (op != d->target)
18899     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18900 
18901   return true;
18902 }
18903 
18904 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
18905    and extract-odd permutations of two V32QImode or V16HImode operands
18906    with two vpshufb insns, vpor and vpermq.  We should have already
18907    failed all two or three instruction sequences.  */
18908 
18909 static bool
18910 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18911 {
18912   rtx rperm[2][32], vperm, l, h, ior, op, m128;
18913   unsigned int i, nelt, eltsz;
18914 
18915   if (!TARGET_AVX2
18916       || d->one_operand_p
18917       || (d->vmode != V32QImode && d->vmode != V16HImode))
18918     return false;
18919 
18920   for (i = 0; i < d->nelt; ++i)
18921     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18922       return false;
18923 
18924   if (d->testing_p)
18925     return true;
18926 
18927   nelt = d->nelt;
18928   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18929 
18930   /* Generate two permutation masks.  In the first permutation mask
18931      the first quarter will contain indexes for the first half
18932      of the op0, the second quarter will contain bit 7 set, third quarter
18933      will contain indexes for the second half of the op0 and the
18934      last quarter bit 7 set.  In the second permutation mask
18935      the first quarter will contain bit 7 set, the second quarter
18936      indexes for the first half of the op1, the third quarter bit 7 set
18937      and last quarter indexes for the second half of the op1.
18938      I.e. the first mask e.g. for V32QImode extract even will be:
18939      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18940      (all values masked with 0xf except for -128) and second mask
18941      for extract even will be
18942      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
18943   m128 = GEN_INT (-128);
18944   for (i = 0; i < nelt; ++i)
18945     {
18946       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18947       unsigned which = d->perm[i] >= nelt;
18948       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18949 
18950       for (j = 0; j < eltsz; ++j)
18951 	{
18952 	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18953 	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18954 	}
18955     }
18956 
18957   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18958   vperm = force_reg (V32QImode, vperm);
18959 
18960   l = gen_reg_rtx (V32QImode);
18961   op = gen_lowpart (V32QImode, d->op0);
18962   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18963 
18964   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18965   vperm = force_reg (V32QImode, vperm);
18966 
18967   h = gen_reg_rtx (V32QImode);
18968   op = gen_lowpart (V32QImode, d->op1);
18969   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18970 
18971   ior = gen_reg_rtx (V32QImode);
18972   emit_insn (gen_iorv32qi3 (ior, l, h));
18973 
18974   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
18975   op = gen_reg_rtx (V4DImode);
18976   ior = gen_lowpart (V4DImode, ior);
18977   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18978 				  const1_rtx, GEN_INT (3)));
18979   emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18980 
18981   return true;
18982 }
18983 
18984 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
18985    and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18986    with two "and" and "pack" or two "shift" and "pack" insns.  We should
18987    have already failed all two instruction sequences.  */
18988 
18989 static bool
18990 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18991 {
18992   rtx op, dop0, dop1, t;
18993   unsigned i, odd, c, s, nelt = d->nelt;
18994   bool end_perm = false;
18995   machine_mode half_mode;
18996   rtx (*gen_and) (rtx, rtx, rtx);
18997   rtx (*gen_pack) (rtx, rtx, rtx);
18998   rtx (*gen_shift) (rtx, rtx, rtx);
18999 
19000   if (d->one_operand_p)
19001     return false;
19002 
19003   switch (d->vmode)
19004     {
19005     case E_V8HImode:
19006       /* Required for "pack".  */
19007       if (!TARGET_SSE4_1)
19008         return false;
19009       c = 0xffff;
19010       s = 16;
19011       half_mode = V4SImode;
19012       gen_and = gen_andv4si3;
19013       gen_pack = gen_sse4_1_packusdw;
19014       gen_shift = gen_lshrv4si3;
19015       break;
19016     case E_V16QImode:
19017       /* No check as all instructions are SSE2.  */
19018       c = 0xff;
19019       s = 8;
19020       half_mode = V8HImode;
19021       gen_and = gen_andv8hi3;
19022       gen_pack = gen_sse2_packuswb;
19023       gen_shift = gen_lshrv8hi3;
19024       break;
19025     case E_V16HImode:
19026       if (!TARGET_AVX2)
19027         return false;
19028       c = 0xffff;
19029       s = 16;
19030       half_mode = V8SImode;
19031       gen_and = gen_andv8si3;
19032       gen_pack = gen_avx2_packusdw;
19033       gen_shift = gen_lshrv8si3;
19034       end_perm = true;
19035       break;
19036     case E_V32QImode:
19037       if (!TARGET_AVX2)
19038         return false;
19039       c = 0xff;
19040       s = 8;
19041       half_mode = V16HImode;
19042       gen_and = gen_andv16hi3;
19043       gen_pack = gen_avx2_packuswb;
19044       gen_shift = gen_lshrv16hi3;
19045       end_perm = true;
19046       break;
19047     default:
19048       /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
19049 	 general shuffles.  */
19050       return false;
19051     }
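  /* For example, for a V16QImode extract-even both inputs are viewed as
     V8HImode, each element is masked with 0x00ff and packuswb concatenates
     the resulting low bytes; for extract-odd the logical shift right by 8
     is used instead of the mask.  */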
19052 
19053   /* Check that permutation is even or odd.  */
19054   odd = d->perm[0];
19055   if (odd > 1)
19056     return false;
19057 
19058   for (i = 1; i < nelt; ++i)
19059     if (d->perm[i] != 2 * i + odd)
19060       return false;
19061 
19062   if (d->testing_p)
19063     return true;
19064 
19065   dop0 = gen_reg_rtx (half_mode);
19066   dop1 = gen_reg_rtx (half_mode);
19067   if (odd == 0)
19068     {
19069       t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
19070       t = force_reg (half_mode, t);
19071       emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
19072       emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
19073     }
19074   else
19075     {
19076       emit_insn (gen_shift (dop0,
19077 			    gen_lowpart (half_mode, d->op0),
19078 			    GEN_INT (s)));
19079       emit_insn (gen_shift (dop1,
19080 			    gen_lowpart (half_mode, d->op1),
19081 			    GEN_INT (s)));
19082     }
19083   /* For the AVX2 256-bit case we need to permute the pack result.  */
19084   if (TARGET_AVX2 && end_perm)
19085     {
19086       op = gen_reg_rtx (d->vmode);
19087       t = gen_reg_rtx (V4DImode);
19088       emit_insn (gen_pack (op, dop0, dop1));
19089       emit_insn (gen_avx2_permv4di_1 (t,
19090 				      gen_lowpart (V4DImode, op),
19091 				      const0_rtx,
19092 				      const2_rtx,
19093 				      const1_rtx,
19094 				      GEN_INT (3)));
19095       emit_move_insn (d->target, gen_lowpart (d->vmode, t));
19096     }
19097   else
19098     emit_insn (gen_pack (d->target, dop0, dop1));
19099 
19100   return true;
19101 }
19102 
19103 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
19104    and extract-odd permutations of two V64QI operands
19105    with two "shifts", two "truncs" and one "concat" insns for "odd"
19106    and two "truncs" and one "concat" insn for "even".
19107    We should have already failed all two instruction sequences.  */
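/* For the "even" case the V32HImode view of each operand is simply
   truncated to V32QImode, which keeps the even bytes; for "odd" a logical
   shift right by 8 is done first so that the odd bytes are the ones kept.  */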
19108 
19109 static bool
19110 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
19111 {
19112   rtx t1, t2, t3, t4;
19113   unsigned i, odd, nelt = d->nelt;
19114 
19115   if (!TARGET_AVX512BW
19116       || d->one_operand_p
19117       || d->vmode != V64QImode)
19118     return false;
19119 
19120   /* Check that permutation is even or odd.  */
19121   odd = d->perm[0];
19122   if (odd > 1)
19123     return false;
19124 
19125   for (i = 1; i < nelt; ++i)
19126     if (d->perm[i] != 2 * i + odd)
19127       return false;
19128 
19129   if (d->testing_p)
19130     return true;
19131 
19132 
19133   if (odd)
19134     {
19135       t1 = gen_reg_rtx (V32HImode);
19136       t2 = gen_reg_rtx (V32HImode);
19137       emit_insn (gen_lshrv32hi3 (t1,
19138 				 gen_lowpart (V32HImode, d->op0),
19139 				 GEN_INT (8)));
19140       emit_insn (gen_lshrv32hi3 (t2,
19141 				 gen_lowpart (V32HImode, d->op1),
19142 				 GEN_INT (8)));
19143     }
19144   else
19145     {
19146       t1 = gen_lowpart (V32HImode, d->op0);
19147       t2 = gen_lowpart (V32HImode, d->op1);
19148     }
19149 
19150   t3 = gen_reg_rtx (V32QImode);
19151   t4 = gen_reg_rtx (V32QImode);
19152   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
19153   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
19154   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
19155 
19156   return true;
19157 }
19158 
19159 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
19160    and extract-odd permutations.  */
19161 
19162 static bool
19163 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
19164 {
19165   rtx t1, t2, t3, t4, t5;
19166 
19167   switch (d->vmode)
19168     {
19169     case E_V4DFmode:
19170       if (d->testing_p)
19171 	break;
19172       t1 = gen_reg_rtx (V4DFmode);
19173       t2 = gen_reg_rtx (V4DFmode);
19174 
19175       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
19176       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
19177       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
19178 
19179       /* Now an unpck[lh]pd will produce the result required.  */
19180       if (odd)
19181 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
19182       else
19183 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
19184       emit_insn (t3);
19185       break;
19186 
19187     case E_V8SFmode:
19188       {
19189 	int mask = odd ? 0xdd : 0x88;
19190 
19191 	if (d->testing_p)
19192 	  break;
19193 	t1 = gen_reg_rtx (V8SFmode);
19194 	t2 = gen_reg_rtx (V8SFmode);
19195 	t3 = gen_reg_rtx (V8SFmode);
19196 
19197 	/* Shuffle within the 128-bit lanes to produce:
19198 	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
19199 	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
19200 				      GEN_INT (mask)));
19201 
19202 	/* Shuffle the lanes around to produce:
19203 	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
19204 	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
19205 					    GEN_INT (0x3)));
19206 
19207 	/* Shuffle within the 128-bit lanes to produce:
19208 	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
19209 	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
19210 
19211 	/* Shuffle within the 128-bit lanes to produce:
19212 	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
19213 	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
19214 
19215 	/* Shuffle the lanes around to produce:
19216 	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
19217 	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
19218 					    GEN_INT (0x20)));
19219       }
19220       break;
19221 
19222     case E_V2DFmode:
19223     case E_V4SFmode:
19224     case E_V2DImode:
19225     case E_V2SImode:
19226     case E_V4SImode:
19227       /* These are always directly implementable by expand_vec_perm_1.  */
19228       gcc_unreachable ();
19229 
19230     case E_V2SFmode:
19231       gcc_assert (TARGET_MMX_WITH_SSE);
19232       /* We have no suitable instructions.  */
19233       if (d->testing_p)
19234 	return false;
19235       break;
19236 
19237     case E_V4HImode:
19238       if (d->testing_p)
19239 	break;
19240       /* We need 2*log2(N)-1 operations to achieve odd/even
19241 	 with interleave. */
19242       t1 = gen_reg_rtx (V4HImode);
19243       emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
19244       emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
19245       if (odd)
19246 	t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
19247       else
19248 	t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
19249       emit_insn (t2);
19250       break;
19251 
19252     case E_V8HImode:
19253       if (TARGET_SSE4_1)
19254 	return expand_vec_perm_even_odd_pack (d);
19255       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
19256 	return expand_vec_perm_pshufb2 (d);
19257       else
19258 	{
19259 	  if (d->testing_p)
19260 	    break;
19261 	  /* We need 2*log2(N)-1 operations to achieve odd/even
19262 	     with interleave. */
19263 	  t1 = gen_reg_rtx (V8HImode);
19264 	  t2 = gen_reg_rtx (V8HImode);
19265 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
19266 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
19267 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
19268 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
19269 	  if (odd)
19270 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
19271 	  else
19272 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
19273 	  emit_insn (t3);
19274 	}
19275       break;
19276 
19277     case E_V16QImode:
19278       return expand_vec_perm_even_odd_pack (d);
19279 
19280     case E_V16HImode:
19281     case E_V32QImode:
19282       return expand_vec_perm_even_odd_pack (d);
19283 
19284     case E_V64QImode:
19285       return expand_vec_perm_even_odd_trunc (d);
19286 
19287     case E_V4DImode:
19288       if (!TARGET_AVX2)
19289 	{
19290 	  struct expand_vec_perm_d d_copy = *d;
19291 	  d_copy.vmode = V4DFmode;
19292 	  if (d->testing_p)
19293 	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
19294 	  else
19295 	    d_copy.target = gen_reg_rtx (V4DFmode);
19296 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
19297 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
19298 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19299 	    {
19300 	      if (!d->testing_p)
19301 		emit_move_insn (d->target,
19302 				gen_lowpart (V4DImode, d_copy.target));
19303 	      return true;
19304 	    }
19305 	  return false;
19306 	}
19307 
19308       if (d->testing_p)
19309 	break;
19310 
19311       t1 = gen_reg_rtx (V4DImode);
19312       t2 = gen_reg_rtx (V4DImode);
19313 
19314       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
19315       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
19316       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
19317 
19318       /* Now an vpunpck[lh]qdq will produce the result required.  */
19319       if (odd)
19320 	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
19321       else
19322 	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
19323       emit_insn (t3);
19324       break;
19325 
19326     case E_V8SImode:
19327       if (!TARGET_AVX2)
19328 	{
19329 	  struct expand_vec_perm_d d_copy = *d;
19330 	  d_copy.vmode = V8SFmode;
19331 	  if (d->testing_p)
19332 	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
19333 	  else
19334 	    d_copy.target = gen_reg_rtx (V8SFmode);
19335 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
19336 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
19337 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19338 	    {
19339 	      if (!d->testing_p)
19340 		emit_move_insn (d->target,
19341 				gen_lowpart (V8SImode, d_copy.target));
19342 	      return true;
19343 	    }
19344 	  return false;
19345 	}
19346 
19347       if (d->testing_p)
19348 	break;
19349 
19350       t1 = gen_reg_rtx (V8SImode);
19351       t2 = gen_reg_rtx (V8SImode);
19352       t3 = gen_reg_rtx (V4DImode);
19353       t4 = gen_reg_rtx (V4DImode);
19354       t5 = gen_reg_rtx (V4DImode);
19355 
19356       /* Shuffle the lanes around into
19357 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
19358       emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
19359 				    gen_lowpart (V4DImode, d->op1),
19360 				    GEN_INT (0x20)));
19361       emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
19362 				    gen_lowpart (V4DImode, d->op1),
19363 				    GEN_INT (0x31)));
19364 
19365       /* Swap the 2nd and 3rd position in each lane into
19366 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
19367       emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
19368 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19369       emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
19370 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19371 
19372       /* Now an vpunpck[lh]qdq will produce
19373 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
19374       if (odd)
19375 	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
19376 					   gen_lowpart (V4DImode, t2));
19377       else
19378 	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
19379 					  gen_lowpart (V4DImode, t2));
19380       emit_insn (t3);
19381       emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
19382       break;
19383 
19384     default:
19385       gcc_unreachable ();
19386     }
19387 
19388   return true;
19389 }
19390 
19391 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
19392    extract-even and extract-odd permutations.  */
19393 
19394 static bool
19395 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
19396 {
19397   unsigned i, odd, nelt = d->nelt;
19398 
19399   odd = d->perm[0];
19400   if (odd != 0 && odd != 1)
19401     return false;
19402 
19403   for (i = 1; i < nelt; ++i)
19404     if (d->perm[i] != 2 * i + odd)
19405       return false;
19406 
19407   return expand_vec_perm_even_odd_1 (d, odd);
19408 }
19409 
19410 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
19411    permutations.  We assume that expand_vec_perm_1 has already failed.  */
19412 
19413 static bool
19414 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
19415 {
19416   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
19417   machine_mode vmode = d->vmode;
19418   unsigned char perm2[4];
19419   rtx op0 = d->op0, dest;
19420   bool ok;
19421 
19422   switch (vmode)
19423     {
19424     case E_V4DFmode:
19425     case E_V8SFmode:
19426       /* These are special-cased in sse.md so that we can optionally
19427 	 use the vbroadcast instruction.  They expand to two insns
19428 	 if the input happens to be in a register.  */
19429       gcc_unreachable ();
19430 
19431     case E_V2DFmode:
19432     case E_V2SFmode:
19433     case E_V4SFmode:
19434     case E_V2DImode:
19435     case E_V2SImode:
19436     case E_V4SImode:
19437       /* These are always implementable using standard shuffle patterns.  */
19438       gcc_unreachable ();
19439 
19440     case E_V8HImode:
19441     case E_V16QImode:
19442       /* These can be implemented via interleave.  We save one insn by
19443 	 stopping once we have promoted to V4SImode and then using pshufd.  */
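      /* For example, to broadcast byte 5 of a V16QImode vector: punpcklbw
	 duplicates it into halfword 5, punpckhwd then duplicates that into
	 dword 1, and the final pshufd replicates dword 1 across the
	 vector.  */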
19444       if (d->testing_p)
19445 	return true;
19446       do
19447 	{
19448 	  rtx dest;
19449 	  rtx (*gen) (rtx, rtx, rtx)
19450 	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
19451 				 : gen_vec_interleave_lowv8hi;
19452 
19453 	  if (elt >= nelt2)
19454 	    {
19455 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
19456 				       : gen_vec_interleave_highv8hi;
19457 	      elt -= nelt2;
19458 	    }
19459 	  nelt2 /= 2;
19460 
19461 	  dest = gen_reg_rtx (vmode);
19462 	  emit_insn (gen (dest, op0, op0));
19463 	  vmode = get_mode_wider_vector (vmode);
19464 	  op0 = gen_lowpart (vmode, dest);
19465 	}
19466       while (vmode != V4SImode);
19467 
19468       memset (perm2, elt, 4);
19469       dest = gen_reg_rtx (V4SImode);
19470       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
19471       gcc_assert (ok);
19472       if (!d->testing_p)
19473 	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
19474       return true;
19475 
19476     case E_V64QImode:
19477     case E_V32QImode:
19478     case E_V16HImode:
19479     case E_V8SImode:
19480     case E_V4DImode:
19481       /* For AVX2 broadcasts of the first element vpbroadcast* or
19482 	 vpermq should be used by expand_vec_perm_1.  */
19483       gcc_assert (!TARGET_AVX2 || d->perm[0]);
19484       return false;
19485 
19486     default:
19487       gcc_unreachable ();
19488     }
19489 }
19490 
19491 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
19492    broadcast permutations.  */
19493 
19494 static bool
19495 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
19496 {
19497   unsigned i, elt, nelt = d->nelt;
19498 
19499   if (!d->one_operand_p)
19500     return false;
19501 
19502   elt = d->perm[0];
19503   for (i = 1; i < nelt; ++i)
19504     if (d->perm[i] != elt)
19505       return false;
19506 
19507   return expand_vec_perm_broadcast_1 (d);
19508 }
19509 
19510 /* Implement arbitrary permutations of two V64QImode operands
19511    with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
19512 static bool
19513 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
19514 {
19515   if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
19516     return false;
19517 
19518   if (d->testing_p)
19519     return true;
19520 
19521   struct expand_vec_perm_d ds[2];
19522   rtx rperm[128], vperm, target0, target1;
19523   unsigned int i, nelt;
19524   machine_mode vmode;
19525 
19526   nelt = d->nelt;
19527   vmode = V64QImode;
19528 
19529   for (i = 0; i < 2; i++)
19530     {
19531       ds[i] = *d;
19532       ds[i].vmode = V32HImode;
19533       ds[i].nelt = 32;
19534       ds[i].target = gen_reg_rtx (V32HImode);
19535       ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19536       ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19537     }
19538 
19539   /* Prepare permutations such that the first one takes care of
19540      putting the even bytes into the right positions or one position
19541      higher (ds[0]) and the second one takes care of
19542      putting the odd bytes into the right positions or one position
19543      below (ds[1]).  */
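  /* For example, if d->perm[0] == 13, then ds[0].perm[0] == 6, i.e. the
     V32HImode permutation brings halfword 6 (bytes 12 and 13) into
     halfword 0, and the pshufb mask byte rperm[0] == 1 then selects the
     odd byte of that halfword, i.e. original byte 13.  */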
19544 
19545   for (i = 0; i < nelt; i++)
19546     {
19547       ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19548       if (i & 1)
19549 	{
19550 	  rperm[i] = constm1_rtx;
19551 	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19552 	}
19553       else
19554 	{
19555 	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19556 	  rperm[i + 64] = constm1_rtx;
19557 	}
19558     }
19559 
19560   bool ok = expand_vec_perm_1 (&ds[0]);
19561   gcc_assert (ok);
19562   ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19563 
19564   ok = expand_vec_perm_1 (&ds[1]);
19565   gcc_assert (ok);
19566   ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19567 
19568   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19569   vperm = force_reg (vmode, vperm);
19570   target0 = gen_reg_rtx (V64QImode);
19571   emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19572 
19573   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19574   vperm = force_reg (vmode, vperm);
19575   target1 = gen_reg_rtx (V64QImode);
19576   emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19577 
19578   emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19579   return true;
19580 }
19581 
19582 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
19583    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
19584    all the shorter instruction sequences.  */
19585 
19586 static bool
19587 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19588 {
19589   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19590   unsigned int i, nelt, eltsz;
19591   bool used[4];
19592 
19593   if (!TARGET_AVX2
19594       || d->one_operand_p
19595       || (d->vmode != V32QImode && d->vmode != V16HImode))
19596     return false;
19597 
19598   if (d->testing_p)
19599     return true;
19600 
19601   nelt = d->nelt;
19602   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19603 
19604   /* Generate 4 permutation masks.  If the required element is within
19605      the same lane, it is shuffled in.  If the required element is from
19606      the other lane, force a zero by setting bit 7 in the permutation mask.
19607      The cross-lane masks have non-negative elements where an element is
19608      requested from the other lane, but also moved to the other lane,
19609      so that the result of vpshufb can have the two V2TImode halves
19610      swapped.  */
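  /* Masks 0 and 1 select from d->op0, masks 2 and 3 from d->op1; the odd
     numbered masks are the cross-lane ones whose pshufb results get their
     128-bit lanes swapped by vpermq below.  */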
19611   m128 = GEN_INT (-128);
19612   for (i = 0; i < 32; ++i)
19613     {
19614       rperm[0][i] = m128;
19615       rperm[1][i] = m128;
19616       rperm[2][i] = m128;
19617       rperm[3][i] = m128;
19618     }
19619   used[0] = false;
19620   used[1] = false;
19621   used[2] = false;
19622   used[3] = false;
19623   for (i = 0; i < nelt; ++i)
19624     {
19625       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19626       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19627       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19628 
19629       for (j = 0; j < eltsz; ++j)
19630 	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19631       used[which] = true;
19632     }
19633 
19634   for (i = 0; i < 2; ++i)
19635     {
19636       if (!used[2 * i + 1])
19637 	{
19638 	  h[i] = NULL_RTX;
19639 	  continue;
19640 	}
19641       vperm = gen_rtx_CONST_VECTOR (V32QImode,
19642 				    gen_rtvec_v (32, rperm[2 * i + 1]));
19643       vperm = force_reg (V32QImode, vperm);
19644       h[i] = gen_reg_rtx (V32QImode);
19645       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19646       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19647     }
19648 
19649   /* Swap the 128-bit lanes of h[X].  */
19650   for (i = 0; i < 2; ++i)
19651    {
19652      if (h[i] == NULL_RTX)
19653        continue;
19654      op = gen_reg_rtx (V4DImode);
19655      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19656 				     const2_rtx, GEN_INT (3), const0_rtx,
19657 				     const1_rtx));
19658      h[i] = gen_lowpart (V32QImode, op);
19659    }
19660 
19661   for (i = 0; i < 2; ++i)
19662     {
19663       if (!used[2 * i])
19664 	{
19665 	  l[i] = NULL_RTX;
19666 	  continue;
19667 	}
19668       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19669       vperm = force_reg (V32QImode, vperm);
19670       l[i] = gen_reg_rtx (V32QImode);
19671       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19672       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19673     }
19674 
19675   for (i = 0; i < 2; ++i)
19676     {
19677       if (h[i] && l[i])
19678 	{
19679 	  op = gen_reg_rtx (V32QImode);
19680 	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19681 	  l[i] = op;
19682 	}
19683       else if (h[i])
19684 	l[i] = h[i];
19685     }
19686 
19687   gcc_assert (l[0] && l[1]);
19688   op = d->target;
19689   if (d->vmode != V32QImode)
19690     op = gen_reg_rtx (V32QImode);
19691   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19692   if (op != d->target)
19693     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19694   return true;
19695 }
19696 
19697 /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
19698    taken care of, perform the expansion in D and return true on success.  */
19699 
19700 static bool
19701 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19702 {
19703   /* Try a single instruction expansion.  */
19704   if (expand_vec_perm_1 (d))
19705     return true;
19706 
19707   /* Try sequences of two instructions.  */
19708 
19709   if (expand_vec_perm_pshuflw_pshufhw (d))
19710     return true;
19711 
19712   if (expand_vec_perm_palignr (d, false))
19713     return true;
19714 
19715   if (expand_vec_perm_interleave2 (d))
19716     return true;
19717 
19718   if (expand_vec_perm_broadcast (d))
19719     return true;
19720 
19721   if (expand_vec_perm_vpermq_perm_1 (d))
19722     return true;
19723 
19724   if (expand_vec_perm_vperm2f128 (d))
19725     return true;
19726 
19727   if (expand_vec_perm_pblendv (d))
19728     return true;
19729 
19730   /* Try sequences of three instructions.  */
19731 
19732   if (expand_vec_perm_even_odd_pack (d))
19733     return true;
19734 
19735   if (expand_vec_perm_2vperm2f128_vshuf (d))
19736     return true;
19737 
19738   if (expand_vec_perm_pshufb2 (d))
19739     return true;
19740 
19741   if (expand_vec_perm_interleave3 (d))
19742     return true;
19743 
19744   if (expand_vec_perm_vperm2f128_vblend (d))
19745     return true;
19746 
19747   /* Try sequences of four instructions.  */
19748 
19749   if (expand_vec_perm_even_odd_trunc (d))
19750     return true;
19751   if (expand_vec_perm_vpshufb2_vpermq (d))
19752     return true;
19753 
19754   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19755     return true;
19756 
19757   if (expand_vec_perm_vpermt2_vpshub2 (d))
19758     return true;
19759 
19760   /* ??? Look for narrow permutations whose element orderings would
19761      allow the promotion to a wider mode.  */
19762 
19763   /* ??? Look for sequences of interleave or a wider permute that place
19764      the data into the correct lanes for a half-vector shuffle like
19765      pshuf[lh]w or vpermilps.  */
19766 
19767   /* ??? Look for sequences of interleave that produce the desired results.
19768      The combinatorics of punpck[lh] get pretty ugly... */
19769 
19770   if (expand_vec_perm_even_odd (d))
19771     return true;
19772 
19773   /* Even longer sequences.  */
19774   if (expand_vec_perm_vpshufb4_vpermq2 (d))
19775     return true;
19776 
19777   /* See if we can get the same permutation in different vector integer
19778      mode.  */
19779   struct expand_vec_perm_d nd;
19780   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19781     {
19782       if (!d->testing_p)
19783 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19784       return true;
19785     }
19786 
19787   /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
19788   if (expand_vec_perm2_vperm2f128_vblend (d))
19789     return true;
19790 
19791   return false;
19792 }
19793 
19794 /* If a permutation only uses one operand, make it clear. Returns true
19795    if the permutation references both operands.  */
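/* For example, with nelt == 4 a selector { 4, 5, 6, 7 } references only the
   second operand; it is folded to { 0, 1, 2, 3 } with op0 replaced by op1,
   and false is returned.  */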
19796 
19797 static bool
19798 canonicalize_perm (struct expand_vec_perm_d *d)
19799 {
19800   int i, which, nelt = d->nelt;
19801 
19802   for (i = which = 0; i < nelt; ++i)
19803     which |= (d->perm[i] < nelt ? 1 : 2);
19804 
19805   d->one_operand_p = true;
19806   switch (which)
19807     {
19808     default:
19809       gcc_unreachable();
19810 
19811     case 3:
19812       if (!rtx_equal_p (d->op0, d->op1))
19813         {
19814 	  d->one_operand_p = false;
19815 	  break;
19816         }
19817       /* The elements of PERM do not suggest that only the first operand
19818 	 is used, but both operands are identical.  Allow easier matching
19819 	 of the permutation by folding the permutation into the single
19820 	 input vector.  */
19821       /* FALLTHRU */
19822 
19823     case 2:
19824       for (i = 0; i < nelt; ++i)
19825         d->perm[i] &= nelt - 1;
19826       d->op0 = d->op1;
19827       break;
19828 
19829     case 1:
19830       d->op1 = d->op0;
19831       break;
19832     }
19833 
19834   return (which == 3);
19835 }
19836 
19837 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
19838 
19839 bool
19840 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19841 			       rtx op1, const vec_perm_indices &sel)
19842 {
19843   struct expand_vec_perm_d d;
19844   unsigned char perm[MAX_VECT_LEN];
19845   unsigned int i, nelt, which;
19846   bool two_args;
19847 
19848   d.target = target;
19849   d.op0 = op0;
19850   d.op1 = op1;
19851 
19852   d.vmode = vmode;
19853   gcc_assert (VECTOR_MODE_P (d.vmode));
19854   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19855   d.testing_p = !target;
19856 
19857   gcc_assert (sel.length () == nelt);
19858   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19859 
19860   /* Given sufficient ISA support we can just return true here
19861      for selected vector modes.  */
19862   switch (d.vmode)
19863     {
19864     case E_V16SFmode:
19865     case E_V16SImode:
19866     case E_V8DImode:
19867     case E_V8DFmode:
19868       if (!TARGET_AVX512F)
19869 	return false;
19870       /* All implementable with a single vperm[it]2 insn.  */
19871       if (d.testing_p)
19872 	return true;
19873       break;
19874     case E_V32HImode:
19875       if (!TARGET_AVX512BW)
19876 	return false;
19877       if (d.testing_p)
19878 	/* All implementable with a single vperm[it]2 insn.  */
19879 	return true;
19880       break;
19881     case E_V64QImode:
19882       if (!TARGET_AVX512BW)
19883 	return false;
19884       if (d.testing_p)
19885 	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
19886 	return true;
19887       break;
19888     case E_V8SImode:
19889     case E_V8SFmode:
19890     case E_V4DFmode:
19891     case E_V4DImode:
19892       if (!TARGET_AVX)
19893 	return false;
19894       if (d.testing_p && TARGET_AVX512VL)
19895 	/* All implementable with a single vperm[it]2 insn.  */
19896 	return true;
19897       break;
19898     case E_V16HImode:
19899       if (!TARGET_SSE2)
19900 	return false;
19901       if (d.testing_p && TARGET_AVX2)
19902 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
19903 	return true;
19904       break;
19905     case E_V32QImode:
19906       if (!TARGET_SSE2)
19907 	return false;
19908       if (d.testing_p && TARGET_AVX2)
19909 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
19910 	return true;
19911       break;
19912     case E_V8HImode:
19913     case E_V16QImode:
19914       if (!TARGET_SSE2)
19915 	return false;
19916       /* Fall through.  */
19917     case E_V4SImode:
19918     case E_V4SFmode:
19919       if (!TARGET_SSE)
19920 	return false;
19921       /* All implementable with a single vpperm insn.  */
19922       if (d.testing_p && TARGET_XOP)
19923 	return true;
19924       /* All implementable with 2 pshufb + 1 ior.  */
19925       if (d.testing_p && TARGET_SSSE3)
19926 	return true;
19927       break;
19928     case E_V2SFmode:
19929     case E_V2SImode:
19930     case E_V4HImode:
19931       if (!TARGET_MMX_WITH_SSE)
19932 	return false;
19933       break;
19934     case E_V2DImode:
19935     case E_V2DFmode:
19936       if (!TARGET_SSE)
19937 	return false;
19938       /* All implementable with shufpd or unpck[lh]pd.  */
19939       if (d.testing_p)
19940 	return true;
19941       break;
19942     default:
19943       return false;
19944     }
19945 
19946   for (i = which = 0; i < nelt; ++i)
19947     {
19948       unsigned char e = sel[i];
19949       gcc_assert (e < 2 * nelt);
19950       d.perm[i] = e;
19951       perm[i] = e;
19952       which |= (e < nelt ? 1 : 2);
19953     }
19954 
19955   if (d.testing_p)
19956     {
19957       /* If all elements are from the second vector, fold them to the first.  */
19958       if (which == 2)
19959 	for (i = 0; i < nelt; ++i)
19960 	  d.perm[i] -= nelt;
19961 
19962       /* Check whether the mask can be applied to the vector type.  */
19963       d.one_operand_p = (which != 3);
19964 
19965       /* Implementable with shufps or pshufd.  */
19966       if (d.one_operand_p
19967 	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
19968 	      || d.vmode == V4SImode || d.vmode == V2SImode))
19969 	return true;
19970 
19971       /* Otherwise we have to go through the motions and see if we can
19972 	 figure out how to generate the requested permutation.  */
19973       d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19974       d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19975       if (!d.one_operand_p)
19976 	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19977 
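      /* Any insns emitted here use the placeholder registers created
	 above and are thrown away again by end_sequence; only whether
	 the expansion succeeds matters.  */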
19978       start_sequence ();
19979       bool ret = ix86_expand_vec_perm_const_1 (&d);
19980       end_sequence ();
19981 
19982       return ret;
19983     }
19984 
19985   two_args = canonicalize_perm (&d);
19986 
19987   /* If one of the operands is a zero vector, try to match pmovzx.  */
19988   if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
19989     {
19990       struct expand_vec_perm_d dzero = d;
19991       if (d.op0 == CONST0_RTX (vmode))
19992 	{
19993 	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
19994 	  std::swap (dzero.op0, dzero.op1);
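	  /* The operands were swapped above, so also flip the operand
	     selection bit of every index so the selector still refers
	     to the same values.  */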
19995 	  for (i = 0; i < nelt; ++i)
19996 	    dzero.perm[i] ^= nelt;
19997 	}
19998       else
19999 	d.op0 = dzero.op0 = force_reg (vmode, d.op0);
20000 
20001       if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
20002 				  dzero.perm, nelt, dzero.testing_p))
20003 	return true;
20004     }
20005 
20006   /* Force operands into registers.  */
20007   rtx nop0 = force_reg (vmode, d.op0);
20008   if (d.op0 == d.op1)
20009     d.op1 = nop0;
20010   d.op0 = nop0;
20011   d.op1 = force_reg (vmode, d.op1);
20012 
20013   if (ix86_expand_vec_perm_const_1 (&d))
20014     return true;
20015 
20016   /* If the selector says both arguments are needed, but the operands are the
20017      same, the above tried to expand with one_operand_p and flattened selector.
20018      If that didn't work, retry without one_operand_p; we succeeded with that
20019      during testing.  */
20020   if (two_args && d.one_operand_p)
20021     {
20022       d.one_operand_p = false;
20023       memcpy (d.perm, perm, sizeof (perm));
20024       return ix86_expand_vec_perm_const_1 (&d);
20025     }
20026 
20027   return false;
20028 }
20029 
20030 void
20031 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
20032 {
20033   struct expand_vec_perm_d d;
20034   unsigned i, nelt;
20035 
20036   d.target = targ;
20037   d.op0 = op0;
20038   d.op1 = op1;
20039   d.vmode = GET_MODE (targ);
20040   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
20041   d.one_operand_p = false;
20042   d.testing_p = false;
20043 
20044   for (i = 0; i < nelt; ++i)
20045     d.perm[i] = i * 2 + odd;
20046 
20047   /* We'll either be able to implement the permutation directly...  */
20048   if (expand_vec_perm_1 (&d))
20049     return;
20050 
20051   /* ... or we use the special-case patterns.  */
20052   expand_vec_perm_even_odd_1 (&d, odd);
20053 }
20054 
20055 static void
20056 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
20057 {
20058   struct expand_vec_perm_d d;
20059   unsigned i, nelt, base;
20060   bool ok;
20061 
20062   d.target = targ;
20063   d.op0 = op0;
20064   d.op1 = op1;
20065   d.vmode = GET_MODE (targ);
20066   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
20067   d.one_operand_p = false;
20068   d.testing_p = false;
20069 
20070   base = high_p ? nelt / 2 : 0;
20071   for (i = 0; i < nelt / 2; ++i)
20072     {
20073       d.perm[i * 2] = i + base;
20074       d.perm[i * 2 + 1] = i + base + nelt;
20075     }
20076 
20077   /* Note that for AVX this isn't one instruction.  */
20078   ok = ix86_expand_vec_perm_const_1 (&d);
20079   gcc_assert (ok);
20080 }
20081 
20082 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
20083    under TARGET_AVX512BW.  E.g. for v16qi a * b, it generates
20084 
20085    vpmovzxbw ymm2, xmm0
20086    vpmovzxbw ymm3, xmm1
20087    vpmullw   ymm4, ymm2, ymm3
20088    vpmovwb   xmm0, ymm4
20089 
20090    which takes fewer instructions than ix86_expand_vecop_qihi would.
20091    Return true on success.  */
20092 
20093 bool
20094 ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
20095 {
20096   machine_mode himode, qimode = GET_MODE (dest);
20097   rtx hop1, hop2, hdest;
20098   rtx (*gen_extend)(rtx, rtx);
20099   rtx (*gen_truncate)(rtx, rtx);
20100 
20101   /* There's no V64HImode multiplication instruction.  */
20102   if (qimode == E_V64QImode)
20103     return false;
20104 
20105   /* vpmovwb only available under AVX512BW.  */
20106   if (!TARGET_AVX512BW)
20107     return false;
20108   if ((qimode == V8QImode || qimode == V16QImode)
20109       && !TARGET_AVX512VL)
20110     return false;
20111   /* Don't generate zmm insns when 128/256-bit vector width is preferred.  */
20112   if (qimode == V32QImode
20113       && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
20114     return false;
20115 
20116   switch (qimode)
20117     {
20118     case E_V8QImode:
20119       himode = V8HImode;
20120       gen_extend = gen_zero_extendv8qiv8hi2;
20121       gen_truncate = gen_truncv8hiv8qi2;
20122       break;
20123     case E_V16QImode:
20124       himode = V16HImode;
20125       gen_extend = gen_zero_extendv16qiv16hi2;
20126       gen_truncate = gen_truncv16hiv16qi2;
20127       break;
20128     case E_V32QImode:
20129       himode = V32HImode;
20130       gen_extend = gen_zero_extendv32qiv32hi2;
20131       gen_truncate = gen_truncv32hiv32qi2;
20132       break;
20133     default:
20134       gcc_unreachable ();
20135     }
20136 
20137   hop1 = gen_reg_rtx (himode);
20138   hop2 = gen_reg_rtx (himode);
20139   hdest = gen_reg_rtx (himode);
20140   emit_insn (gen_extend (hop1, op1));
20141   emit_insn (gen_extend (hop2, op2));
20142   emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
20143 						      hop1, hop2)));
20144   emit_insn (gen_truncate (dest, hdest));
20145   return true;
20146 }
20147 
20148 /* Expand a vector shift by a constant for V*QImode in terms of the same
20149    operation on V*HImode.  Return true on success.  */
20150 bool
20151 ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20152 {
20153   machine_mode qimode, himode;
20154   HOST_WIDE_INT and_constant, xor_constant;
20155   HOST_WIDE_INT shift_amount;
20156   rtx vec_const_and, vec_const_xor;
20157   rtx tmp, op1_subreg;
20158   rtx (*gen_shift) (rtx, rtx, rtx);
20159   rtx (*gen_and) (rtx, rtx, rtx);
20160   rtx (*gen_xor) (rtx, rtx, rtx);
20161   rtx (*gen_sub) (rtx, rtx, rtx);
20162 
20163   /* Only optimize shift by constant.  */
20164   if (!CONST_INT_P (op2))
20165     return false;
20166 
20167   qimode = GET_MODE (dest);
20168   shift_amount = INTVAL (op2);
20169   /* Do nothing when the shift amount is greater than or equal to 8.  */
20170   if (shift_amount > 7)
20171     return false;
20172 
20173   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
20174   /* Record sign bit.  */
20175   xor_constant = 1 << (8 - shift_amount - 1);
20176 
20177   /* Zero the upper/lower bits shifted in from the adjacent byte element.  */
20178   and_constant
20179     = (code == ASHIFT ? 256 - (1 << shift_amount)
20180        : (1 << (8 - shift_amount)) - 1);
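  /* E.g. for shift_amount == 2 this is 0xfc for ASHIFT and 0x3f for the
     right shifts, clearing in each byte the bit positions that are
     filled from outside that byte by the word-sized shift below.  */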
20181 
20182   switch (qimode)
20183     {
20184     case V16QImode:
20185       himode = V8HImode;
20186       gen_shift =
20187 	((code == ASHIFT)
20188 	 ? gen_ashlv8hi3
20189 	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
20190       gen_and = gen_andv16qi3;
20191       gen_xor = gen_xorv16qi3;
20192       gen_sub = gen_subv16qi3;
20193       break;
20194     case V32QImode:
20195       himode = V16HImode;
20196       gen_shift =
20197 	((code == ASHIFT)
20198 	 ? gen_ashlv16hi3
20199 	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
20200       gen_and = gen_andv32qi3;
20201       gen_xor = gen_xorv32qi3;
20202       gen_sub = gen_subv32qi3;
20203       break;
20204     case V64QImode:
20205       himode = V32HImode;
20206       gen_shift =
20207 	((code == ASHIFT)
20208 	 ? gen_ashlv32hi3
20209 	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
20210       gen_and = gen_andv64qi3;
20211       gen_xor = gen_xorv64qi3;
20212       gen_sub = gen_subv64qi3;
20213       break;
20214     default:
20215       gcc_unreachable ();
20216     }
20217 
20218   tmp = gen_reg_rtx (himode);
20219   vec_const_and = gen_reg_rtx (qimode);
20220   op1_subreg = lowpart_subreg (himode, op1, qimode);
20221 
20222   /* For ASHIFT and LSHIFTRT, perform operation like
20223      vpsllw/vpsrlw $shift_amount, %op1, %dest.
20224      vpand %vec_const_and, %dest.  */
20225   emit_insn (gen_shift (tmp, op1_subreg, op2));
20226   emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
20227   emit_move_insn (vec_const_and,
20228 		  ix86_build_const_vector (qimode, true,
20229 					   gen_int_mode (and_constant, QImode)));
20230   emit_insn (gen_and (dest, dest, vec_const_and));
20231 
20232   /* For ASHIFTRT, perform extra operation like
20233      vpxor %vec_const_xor, %dest, %dest
20234      vpsubb %vec_const_xor, %dest, %dest  */
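  /* After the word-sized shift and the AND above, every byte holds the
     logical right shift of the original byte.  With M == xor_constant
     (the shifted sign-bit position), (X ^ M) - M sign-extends it again,
     e.g. for shift_amount == 2, 0x80 becomes 0x20 and
     (0x20 ^ 0x20) - 0x20 == 0xe0, i.e. -128 >> 2.  */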
20235   if (code == ASHIFTRT)
20236     {
20237       vec_const_xor = gen_reg_rtx (qimode);
20238       emit_move_insn (vec_const_xor,
20239 		      ix86_build_const_vector (qimode, true,
20240 					       gen_int_mode (xor_constant, QImode)));
20241       emit_insn (gen_xor (dest, dest, vec_const_xor));
20242       emit_insn (gen_sub (dest, dest, vec_const_xor));
20243     }
20244   return true;
20245 }
20246 
20247 /* Expand a vector operation CODE for a V*QImode in terms of the
20248    same operation on V*HImode.  */
20249 
20250 void
20251 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20252 {
20253   machine_mode qimode = GET_MODE (dest);
20254   machine_mode himode;
20255   rtx (*gen_il) (rtx, rtx, rtx);
20256   rtx (*gen_ih) (rtx, rtx, rtx);
20257   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
20258   struct expand_vec_perm_d d;
20259   bool ok, full_interleave;
20260   bool uns_p = false;
20261   int i;
20262 
20263   switch (qimode)
20264     {
20265     case E_V16QImode:
20266       himode = V8HImode;
20267       gen_il = gen_vec_interleave_lowv16qi;
20268       gen_ih = gen_vec_interleave_highv16qi;
20269       break;
20270     case E_V32QImode:
20271       himode = V16HImode;
20272       gen_il = gen_avx2_interleave_lowv32qi;
20273       gen_ih = gen_avx2_interleave_highv32qi;
20274       break;
20275     case E_V64QImode:
20276       himode = V32HImode;
20277       gen_il = gen_avx512bw_interleave_lowv64qi;
20278       gen_ih = gen_avx512bw_interleave_highv64qi;
20279       break;
20280     default:
20281       gcc_unreachable ();
20282     }
20283 
20284   op2_l = op2_h = op2;
20285   switch (code)
20286     {
20287     case MULT:
20288       /* Unpack data such that we've got a source byte in each low byte of
20289 	 each word.  We don't care what goes into the high byte of each word.
20290 	 Rather than trying to get zero in there, most convenient is to let
20291 	 it be a copy of the low byte.  */
20292       op2_l = gen_reg_rtx (qimode);
20293       op2_h = gen_reg_rtx (qimode);
20294       emit_insn (gen_il (op2_l, op2, op2));
20295       emit_insn (gen_ih (op2_h, op2, op2));
20296 
20297       op1_l = gen_reg_rtx (qimode);
20298       op1_h = gen_reg_rtx (qimode);
20299       emit_insn (gen_il (op1_l, op1, op1));
20300       emit_insn (gen_ih (op1_h, op1, op1));
20301       full_interleave = qimode == V16QImode;
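      /* Only the SSE2 V16QImode interleave spans the whole register;
	 the AVX2/AVX512BW forms stay within 128-bit lanes, which the
	 final permutation below has to compensate for.  */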
20302       break;
20303 
20304     case ASHIFT:
20305     case LSHIFTRT:
20306       uns_p = true;
20307       /* FALLTHRU */
20308     case ASHIFTRT:
20309       op1_l = gen_reg_rtx (himode);
20310       op1_h = gen_reg_rtx (himode);
20311       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
20312       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
20313       full_interleave = true;
20314       break;
20315     default:
20316       gcc_unreachable ();
20317     }
20318 
20319   /* Perform the operation.  */
20320   res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
20321 			       1, OPTAB_DIRECT);
20322   res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
20323 			       1, OPTAB_DIRECT);
20324   gcc_assert (res_l && res_h);
20325 
20326   /* Merge the data back into the right place.  */
20327   d.target = dest;
20328   d.op0 = gen_lowpart (qimode, res_l);
20329   d.op1 = gen_lowpart (qimode, res_h);
20330   d.vmode = qimode;
20331   d.nelt = GET_MODE_NUNITS (qimode);
20332   d.one_operand_p = false;
20333   d.testing_p = false;
20334 
20335   if (full_interleave)
20336     {
20337       /* For SSE2, we used a full interleave, so the desired
20338 	 results are in the even elements.  */
20339       for (i = 0; i < d.nelt; ++i)
20340 	d.perm[i] = i * 2;
20341     }
20342   else
20343     {
20344       /* For AVX, the interleave used above was not cross-lane.  So the
20345 	 extraction is evens but with the second and third quarter swapped.
20346 	 Happily, that is even one insn shorter than even extraction.
20347 	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
20348 	 always first from the first and then from the second source operand,
20349 	 the index bits above the low 4 bits remain the same.
20350 	 Thus, for d.nelt == 32 we want permutation
20351 	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20352 	 and for d.nelt == 64 we want permutation
20353 	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20354 	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
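      /* E.g. for d.nelt == 32 and i == 9 the formula below gives
	 (18 & 14) + 32 + 0 == 34, matching the tenth entry above.  */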
20355       for (i = 0; i < d.nelt; ++i)
20356 	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
20357     }
20358 
20359   ok = ix86_expand_vec_perm_const_1 (&d);
20360   gcc_assert (ok);
20361 
20362   set_unique_reg_note (get_last_insn (), REG_EQUAL,
20363 		       gen_rtx_fmt_ee (code, qimode, op1, op2));
20364 }
20365 
20366 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
20367    if op is CONST_VECTOR with all odd elements equal to their
20368    preceding element.  */
20369 
20370 static bool
20371 const_vector_equal_evenodd_p (rtx op)
20372 {
20373   machine_mode mode = GET_MODE (op);
20374   int i, nunits = GET_MODE_NUNITS (mode);
20375   if (GET_CODE (op) != CONST_VECTOR
20376       || nunits != CONST_VECTOR_NUNITS (op))
20377     return false;
20378   for (i = 0; i < nunits; i += 2)
20379     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
20380       return false;
20381   return true;
20382 }
20383 
20384 void
20385 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
20386 			       bool uns_p, bool odd_p)
20387 {
20388   machine_mode mode = GET_MODE (op1);
20389   machine_mode wmode = GET_MODE (dest);
20390   rtx x;
20391   rtx orig_op1 = op1, orig_op2 = op2;
20392 
20393   if (!nonimmediate_operand (op1, mode))
20394     op1 = force_reg (mode, op1);
20395   if (!nonimmediate_operand (op2, mode))
20396     op2 = force_reg (mode, op2);
20397 
20398   /* We only play even/odd games with vectors of SImode.  */
20399   gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
20400 
20401   /* If we're looking for the odd results, shift those members down to
20402      the even slots.  For some cpus this is faster than a PSHUFD.  */
20403   if (odd_p)
20404     {
20405       /* For XOP use vpmacsdqh, but only for smult, as it is only
20406 	 signed.  */
20407       if (TARGET_XOP && mode == V4SImode && !uns_p)
20408 	{
20409 	  x = force_reg (wmode, CONST0_RTX (wmode));
20410 	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
20411 	  return;
20412 	}
20413 
20414       x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
20415       if (!const_vector_equal_evenodd_p (orig_op1))
20416 	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
20417 			    x, NULL, 1, OPTAB_DIRECT);
20418       if (!const_vector_equal_evenodd_p (orig_op2))
20419 	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
20420 			    x, NULL, 1, OPTAB_DIRECT);
20421       op1 = gen_lowpart (mode, op1);
20422       op2 = gen_lowpart (mode, op2);
20423     }
20424 
20425   if (mode == V16SImode)
20426     {
20427       if (uns_p)
20428 	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
20429       else
20430 	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
20431     }
20432   else if (mode == V8SImode)
20433     {
20434       if (uns_p)
20435 	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
20436       else
20437 	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
20438     }
20439   else if (uns_p)
20440     x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
20441   else if (TARGET_SSE4_1)
20442     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
20443   else
20444     {
20445       rtx s1, s2, t0, t1, t2;
20446 
20447       /* The easiest way to implement this without PMULDQ is to go through
20448 	 the motions as if we are performing a full 64-bit multiply.  With
20449 	 the exception that we need to do less shuffling of the elements.  */
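      /* For 32-bit signed A and B with unsigned bit patterns a and b,
	 A * B == a * b - ((A < 0 ? b : 0) + (B < 0 ? a : 0)) * 2^32
	 modulo 2^64.  The all-ones comparison masks below supply those
	 subtrahends through the unsigned widening multiplies, because
	 (2^32 - 1) * b == -b (mod 2^32).  */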
20450 
20451       /* Compute the sign-extension, aka highparts, of the two operands.  */
20452       s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20453 				op1, pc_rtx, pc_rtx);
20454       s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20455 				op2, pc_rtx, pc_rtx);
20456 
20457       /* Multiply LO(A) * HI(B), and vice-versa.  */
20458       t1 = gen_reg_rtx (wmode);
20459       t2 = gen_reg_rtx (wmode);
20460       emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
20461       emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
20462 
20463       /* Multiply LO(A) * LO(B).  */
20464       t0 = gen_reg_rtx (wmode);
20465       emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
20466 
20467       /* Combine and shift the highparts into place.  */
20468       t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
20469       t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
20470 			 1, OPTAB_DIRECT);
20471 
20472       /* Combine high and low parts.  */
20473       force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
20474       return;
20475     }
20476   emit_insn (x);
20477 }
20478 
20479 void
20480 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
20481 			    bool uns_p, bool high_p)
20482 {
20483   machine_mode wmode = GET_MODE (dest);
20484   machine_mode mode = GET_MODE (op1);
20485   rtx t1, t2, t3, t4, mask;
20486 
20487   switch (mode)
20488     {
20489     case E_V4SImode:
20490       t1 = gen_reg_rtx (mode);
20491       t2 = gen_reg_rtx (mode);
20492       if (TARGET_XOP && !uns_p)
20493 	{
20494 	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
20495 	     shuffle the elements once so that all elements are in the right
20496 	     place for immediate use: { A C B D }.  */
20497 	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
20498 					const1_rtx, GEN_INT (3)));
20499 	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
20500 					const1_rtx, GEN_INT (3)));
20501 	}
20502       else
20503 	{
20504 	  /* Put the elements into place for the multiply.  */
20505 	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
20506 	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
20507 	  high_p = false;
20508 	}
20509       ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
20510       break;
20511 
20512     case E_V8SImode:
20513       /* Shuffle the elements between the lanes.  After this we
20514 	 have { A B E F | C D G H } for each operand.  */
20515       t1 = gen_reg_rtx (V4DImode);
20516       t2 = gen_reg_rtx (V4DImode);
20517       emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
20518 				      const0_rtx, const2_rtx,
20519 				      const1_rtx, GEN_INT (3)));
20520       emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
20521 				      const0_rtx, const2_rtx,
20522 				      const1_rtx, GEN_INT (3)));
20523 
20524       /* Shuffle the elements within the lanes.  After this we
20525 	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
20526       t3 = gen_reg_rtx (V8SImode);
20527       t4 = gen_reg_rtx (V8SImode);
20528       mask = GEN_INT (high_p
20529 		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
20530 		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
20531       emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
20532       emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
20533 
20534       ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
20535       break;
20536 
20537     case E_V8HImode:
20538     case E_V16HImode:
20539       t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
20540 			 uns_p, OPTAB_DIRECT);
20541       t2 = expand_binop (mode,
20542 			 uns_p ? umul_highpart_optab : smul_highpart_optab,
20543 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
20544       gcc_assert (t1 && t2);
20545 
20546       t3 = gen_reg_rtx (mode);
20547       ix86_expand_vec_interleave (t3, t1, t2, high_p);
20548       emit_move_insn (dest, gen_lowpart (wmode, t3));
20549       break;
20550 
20551     case E_V16QImode:
20552     case E_V32QImode:
20553     case E_V32HImode:
20554     case E_V16SImode:
20555     case E_V64QImode:
20556       t1 = gen_reg_rtx (wmode);
20557       t2 = gen_reg_rtx (wmode);
20558       ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
20559       ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
20560 
20561       emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
20562       break;
20563 
20564     default:
20565       gcc_unreachable ();
20566     }
20567 }
20568 
20569 void
20570 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
20571 {
20572   rtx res_1, res_2, res_3, res_4;
20573 
20574   res_1 = gen_reg_rtx (V4SImode);
20575   res_2 = gen_reg_rtx (V4SImode);
20576   res_3 = gen_reg_rtx (V2DImode);
20577   res_4 = gen_reg_rtx (V2DImode);
20578   ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
20579   ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
20580 
20581   /* Move the results in element 2 down to element 1; we don't care
20582      what goes in elements 2 and 3.  Then we can merge the parts
20583      back together with an interleave.
20584 
20585      Note that two other sequences were tried:
20586      (1) Use interleaves at the start instead of psrldq, which allows
20587      us to use a single shufps to merge things back at the end.
20588      (2) Use shufps here to combine the two vectors, then pshufd to
20589      put the elements in the correct order.
20590      In both cases the cost of the reformatting stall was too high
20591      and the overall sequence slower.  */
20592 
20593   emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
20594 				const0_rtx, const2_rtx,
20595 				const0_rtx, const0_rtx));
20596   emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
20597 				const0_rtx, const2_rtx,
20598 				const0_rtx, const0_rtx));
20599   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
20600 
20601   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
20602 }
20603 
20604 void
20605 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
20606 {
20607   machine_mode mode = GET_MODE (op0);
20608   rtx t1, t2, t3, t4, t5, t6;
20609 
20610   if (TARGET_AVX512DQ && mode == V8DImode)
20611     emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
20612   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
20613     emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
20614   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
20615     emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
20616   else if (TARGET_XOP && mode == V2DImode)
20617     {
20618       /* op1: A,B,C,D, op2: E,F,G,H */
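      /* For the low qword, (A + (B << 32)) * (E + (F << 32))
	 == A*E + ((A*F + B*E) << 32) (mod 2^64); the high qword works
	 the same way with C, D, G, H.  */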
20619       op1 = gen_lowpart (V4SImode, op1);
20620       op2 = gen_lowpart (V4SImode, op2);
20621 
20622       t1 = gen_reg_rtx (V4SImode);
20623       t2 = gen_reg_rtx (V4SImode);
20624       t3 = gen_reg_rtx (V2DImode);
20625       t4 = gen_reg_rtx (V2DImode);
20626 
20627       /* t1: B,A,D,C */
20628       emit_insn (gen_sse2_pshufd_1 (t1, op1,
20629 				    GEN_INT (1),
20630 				    GEN_INT (0),
20631 				    GEN_INT (3),
20632 				    GEN_INT (2)));
20633 
20634       /* t2: (B*E),(A*F),(D*G),(C*H) */
20635       emit_insn (gen_mulv4si3 (t2, t1, op2));
20636 
20637       /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20638       emit_insn (gen_xop_phadddq (t3, t2));
20639 
20640       /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20641       emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
20642 
20643       /* Multiply lower parts and add all.  */
20644       t5 = gen_reg_rtx (V2DImode);
20645       emit_insn (gen_vec_widen_umult_even_v4si (t5,
20646 					gen_lowpart (V4SImode, op1),
20647 					gen_lowpart (V4SImode, op2)));
20648       force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
20649     }
20650   else
20651     {
20652       machine_mode nmode;
20653       rtx (*umul) (rtx, rtx, rtx);
20654 
20655       if (mode == V2DImode)
20656 	{
20657 	  umul = gen_vec_widen_umult_even_v4si;
20658 	  nmode = V4SImode;
20659 	}
20660       else if (mode == V4DImode)
20661 	{
20662 	  umul = gen_vec_widen_umult_even_v8si;
20663 	  nmode = V8SImode;
20664 	}
20665       else if (mode == V8DImode)
20666 	{
20667 	  umul = gen_vec_widen_umult_even_v16si;
20668 	  nmode = V16SImode;
20669 	}
20670       else
20671 	gcc_unreachable ();
20672 
20673 
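      /* Writing each qword as lo + (hi << 32), the product is
	 lo1 * lo2 + ((lo1 * hi2 + hi1 * lo2) << 32) (mod 2^64),
	 which the steps below compute with unsigned widening
	 multiplies and shifts.  */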
20674       /* Multiply low parts.  */
20675       t1 = gen_reg_rtx (mode);
20676       emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
20677 
20678       /* Shift input vectors right 32 bits so we can multiply high parts.  */
20679       t6 = GEN_INT (32);
20680       t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
20681       t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
20682 
20683       /* Multiply high parts by low parts.  */
20684       t4 = gen_reg_rtx (mode);
20685       t5 = gen_reg_rtx (mode);
20686       emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
20687       emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
20688 
20689       /* Combine and shift the highparts back.  */
20690       t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
20691       t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
20692 
20693       /* Combine high and low parts.  */
20694       force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
20695     }
20696 
20697   set_unique_reg_note (get_last_insn (), REG_EQUAL,
20698 		       gen_rtx_MULT (mode, op1, op2));
20699 }
20700 
20701 /* Return 1 if control transfer instruction INSN
20702    should be encoded with the notrack prefix.  */
20703 
20704 bool
20705 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
20706 {
20707   if (!insn || !((flag_cf_protection & CF_BRANCH)))
20708     return false;
20709 
20710   if (CALL_P (insn))
20711     {
20712       rtx call = get_call_rtx_from (insn);
20713       gcc_assert (call != NULL_RTX);
20714       rtx addr = XEXP (call, 0);
20715 
20716       /* Do not emit 'notrack' if it's not an indirect call.  */
20717       if (MEM_P (addr)
20718 	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
20719 	return false;
20720       else
20721 	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
20722     }
20723 
20724   if (JUMP_P (insn) && !flag_cet_switch)
20725     {
20726       rtx target = JUMP_LABEL (insn);
20727       if (target == NULL_RTX || ANY_RETURN_P (target))
20728 	return false;
20729 
20730       /* Check whether the jump is a switch table jump.  */
20731       rtx_insn *label = as_a<rtx_insn *> (target);
20732       rtx_insn *table = next_insn (label);
20733       if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20734 	return false;
20735       else
20736 	return true;
20737     }
20738   return false;
20739 }
20740 
20741 /* Calculate integer abs() using only SSE2 instructions.  */
20742 
20743 void
20744 ix86_expand_sse2_abs (rtx target, rtx input)
20745 {
20746   machine_mode mode = GET_MODE (target);
20747   rtx tmp0, tmp1, x;
20748 
20749   switch (mode)
20750     {
20751     case E_V2DImode:
20752     case E_V4DImode:
20753       /* For 64-bit signed integer X, with SSE4.2 use
20754 	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20755 	 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
20756 	 32, and form the mask via a logical right shift and negation, since
20757 	 the 64-bit arithmetic right shift is unimplemented.  */
20758       if (TARGET_SSE4_2)
20759 	{
20760 	  tmp0 = gen_reg_rtx (mode);
20761 	  tmp1 = gen_reg_rtx (mode);
20762 	  emit_move_insn (tmp1, CONST0_RTX (mode));
20763 	  if (mode == E_V2DImode)
20764 	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20765 	  else
20766 	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20767 	}
20768       else
20769 	{
20770 	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20771 				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20772 					       - 1), NULL, 0, OPTAB_DIRECT);
20773 	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20774 	}
20775 
20776       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20777 				  NULL, 0, OPTAB_DIRECT);
20778       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20779 			       target, 0, OPTAB_DIRECT);
20780       break;
20781 
20782     case E_V4SImode:
20783       /* For 32-bit signed integer X, the best way to calculate the absolute
20784 	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
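      /* E.g. for X == -5, (signed) X >> 31 == -1,
	 and (-1 ^ -5) - (-1) == 4 + 1 == 5.  */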
20785       tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20786 				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20787 				  NULL, 0, OPTAB_DIRECT);
20788       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20789 				  NULL, 0, OPTAB_DIRECT);
20790       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20791 			       target, 0, OPTAB_DIRECT);
20792       break;
20793 
20794     case E_V8HImode:
20795       /* For 16-bit signed integer X, the best way to calculate the absolute
20796 	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
20797       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20798 
20799       x = expand_simple_binop (mode, SMAX, tmp0, input,
20800 			       target, 0, OPTAB_DIRECT);
20801       break;
20802 
20803     case E_V16QImode:
20804       /* For 8-bit signed integer X, the best way to calculate the absolute
20805 	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20806 	 as SSE2 provides the PMINUB insn.  */
20807       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20808 
20809       x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20810 			       target, 0, OPTAB_DIRECT);
20811       break;
20812 
20813     default:
20814       gcc_unreachable ();
20815     }
20816 
20817   if (x != target)
20818     emit_move_insn (target, x);
20819 }
20820 
20821 /* Expand an extract from a vector register through pextr insn.
20822    Return true if successful.  */
20823 
20824 bool
20825 ix86_expand_pextr (rtx *operands)
20826 {
20827   rtx dst = operands[0];
20828   rtx src = operands[1];
20829 
20830   unsigned int size = INTVAL (operands[2]);
20831   unsigned int pos = INTVAL (operands[3]);
20832 
20833   if (SUBREG_P (dst))
20834     {
20835       /* Reject non-lowpart subregs.  */
20836       if (SUBREG_BYTE (dst) > 0)
20837 	return false;
20838       dst = SUBREG_REG (dst);
20839     }
20840 
20841   if (SUBREG_P (src))
20842     {
20843       pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20844       src = SUBREG_REG (src);
20845     }
20846 
20847   switch (GET_MODE (src))
20848     {
20849     case E_V16QImode:
20850     case E_V8HImode:
20851     case E_V4SImode:
20852     case E_V2DImode:
20853     case E_V1TImode:
20854       {
20855 	machine_mode srcmode, dstmode;
20856 	rtx d, pat;
20857 
20858 	if (!int_mode_for_size (size, 0).exists (&dstmode))
20859 	  return false;
20860 
20861 	switch (dstmode)
20862 	  {
20863 	  case E_QImode:
20864 	    if (!TARGET_SSE4_1)
20865 	      return false;
20866 	    srcmode = V16QImode;
20867 	    break;
20868 
20869 	  case E_HImode:
20870 	    if (!TARGET_SSE2)
20871 	      return false;
20872 	    srcmode = V8HImode;
20873 	    break;
20874 
20875 	  case E_SImode:
20876 	    if (!TARGET_SSE4_1)
20877 	      return false;
20878 	    srcmode = V4SImode;
20879 	    break;
20880 
20881 	  case E_DImode:
20882 	    gcc_assert (TARGET_64BIT);
20883 	    if (!TARGET_SSE4_1)
20884 	      return false;
20885 	    srcmode = V2DImode;
20886 	    break;
20887 
20888 	  default:
20889 	    return false;
20890 	  }
20891 
20892 	/* Reject extractions from misaligned positions.  */
20893 	if (pos & (size-1))
20894 	  return false;
20895 
20896 	if (GET_MODE (dst) == dstmode)
20897 	  d = dst;
20898 	else
20899 	  d = gen_reg_rtx (dstmode);
20900 
20901 	/* Construct insn pattern.  */
20902 	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20903 	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20904 
20905 	/* Let the rtl optimizers know about the zero extension performed.  */
20906 	if (dstmode == QImode || dstmode == HImode)
20907 	  {
20908 	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20909 	    d = gen_lowpart (SImode, d);
20910 	  }
20911 
20912 	emit_insn (gen_rtx_SET (d, pat));
20913 
20914 	if (d != dst)
20915 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20916 	return true;
20917       }
20918 
20919     default:
20920       return false;
20921     }
20922 }
20923 
20924 /* Expand an insert into a vector register through pinsr insn.
20925    Return true if successful.  */
20926 
20927 bool
20928 ix86_expand_pinsr (rtx *operands)
20929 {
20930   rtx dst = operands[0];
20931   rtx src = operands[3];
20932 
20933   unsigned int size = INTVAL (operands[1]);
20934   unsigned int pos = INTVAL (operands[2]);
20935 
20936   if (SUBREG_P (dst))
20937     {
20938       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20939       dst = SUBREG_REG (dst);
20940     }
20941 
20942   switch (GET_MODE (dst))
20943     {
20944     case E_V16QImode:
20945     case E_V8HImode:
20946     case E_V4SImode:
20947     case E_V2DImode:
20948     case E_V1TImode:
20949       {
20950 	machine_mode srcmode, dstmode;
20951 	rtx (*pinsr)(rtx, rtx, rtx, rtx);
20952 	rtx d;
20953 
20954 	if (!int_mode_for_size (size, 0).exists (&srcmode))
20955 	  return false;
20956 
20957 	switch (srcmode)
20958 	  {
20959 	  case E_QImode:
20960 	    if (!TARGET_SSE4_1)
20961 	      return false;
20962 	    dstmode = V16QImode;
20963 	    pinsr = gen_sse4_1_pinsrb;
20964 	    break;
20965 
20966 	  case E_HImode:
20967 	    if (!TARGET_SSE2)
20968 	      return false;
20969 	    dstmode = V8HImode;
20970 	    pinsr = gen_sse2_pinsrw;
20971 	    break;
20972 
20973 	  case E_SImode:
20974 	    if (!TARGET_SSE4_1)
20975 	      return false;
20976 	    dstmode = V4SImode;
20977 	    pinsr = gen_sse4_1_pinsrd;
20978 	    break;
20979 
20980 	  case E_DImode:
20981 	    gcc_assert (TARGET_64BIT);
20982 	    if (!TARGET_SSE4_1)
20983 	      return false;
20984 	    dstmode = V2DImode;
20985 	    pinsr = gen_sse4_1_pinsrq;
20986 	    break;
20987 
20988 	  default:
20989 	    return false;
20990 	  }
20991 
20992 	/* Reject insertions to misaligned positions.  */
20993 	if (pos & (size-1))
20994 	  return false;
20995 
20996 	if (SUBREG_P (src))
20997 	  {
20998 	    unsigned int srcpos = SUBREG_BYTE (src);
20999 
21000 	    if (srcpos > 0)
21001 	      {
21002 		rtx extr_ops[4];
21003 
21004 		extr_ops[0] = gen_reg_rtx (srcmode);
21005 		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
21006 		extr_ops[2] = GEN_INT (size);
21007 		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
21008 
21009 		if (!ix86_expand_pextr (extr_ops))
21010 		  return false;
21011 
21012 		src = extr_ops[0];
21013 	      }
21014 	    else
21015 	      src = gen_lowpart (srcmode, SUBREG_REG (src));
21016 	  }
21017 
21018 	if (GET_MODE (dst) == dstmode)
21019 	  d = dst;
21020 	else
21021 	  d = gen_reg_rtx (dstmode);
21022 
21023 	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
21024 			  gen_lowpart (srcmode, src),
21025 			  GEN_INT (1 << (pos / size))));
21026 	if (d != dst)
21027 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
21028 	return true;
21029       }
21030 
21031     default:
21032       return false;
21033     }
21034 }
21035 
21036 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
21037    of the upper half against the lower half, down to SSE register size.  */
21038 
21039 machine_mode
21040 ix86_split_reduction (machine_mode mode)
21041 {
21042   /* Reduce lowpart against highpart until we reach SSE reg width to
21043      avoid cross-lane operations.  */
21044   switch (mode)
21045     {
21046     case E_V8DImode:
21047     case E_V4DImode:
21048       return V2DImode;
21049     case E_V16SImode:
21050     case E_V8SImode:
21051       return V4SImode;
21052     case E_V32HImode:
21053     case E_V16HImode:
21054       return V8HImode;
21055     case E_V64QImode:
21056     case E_V32QImode:
21057       return V16QImode;
21058     case E_V16SFmode:
21059     case E_V8SFmode:
21060       return V4SFmode;
21061     case E_V8DFmode:
21062     case E_V4DFmode:
21063       return V2DFmode;
21064     default:
21065       return mode;
21066     }
21067 }
21068 
21069 /* Generate call to __divmoddi4.  */
21070 
21071 void
21072 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
21073 			    rtx op0, rtx op1,
21074 			    rtx *quot_p, rtx *rem_p)
21075 {
21076   rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
21077 
21078   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
21079 				      mode, op0, mode, op1, mode,
21080 				      XEXP (rem, 0), Pmode);
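  /* The quotient is the value returned by the libcall; the remainder is
     stored by the callee through the stack slot whose address was passed
     as the last argument.  */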
21081   *quot_p = quot;
21082   *rem_p = rem;
21083 }
21084 
21085 #include "gt-i386-expand.h"
21086