1 /* Subroutines used for code generation on IA-32.
2    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
3    2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013
4    Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12 
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 
65 enum upper_128bits_state
66 {
67   unknown = 0,
68   unused,
69   used
70 };
71 
72 typedef struct block_info_def
73 {
74   /* State of the upper 128bits of AVX registers at exit.  */
75   enum upper_128bits_state state;
76   /* TRUE if state of the upper 128bits of AVX registers is unchanged
77      in this block.  */
78   bool unchanged;
79   /* TRUE if block has been processed.  */
80   bool processed;
81   /* TRUE if block has been scanned.  */
82   bool scanned;
83   /* Previous state of the upper 128bits of AVX registers at entry.  */
84   enum upper_128bits_state prev;
85 } *block_info;
86 
87 #define BLOCK_INFO(B)   ((block_info) (B)->aux)
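/* BLOCK_INFO keeps the per-block state in the basic block's AUX field;
   it is allocated with alloc_aux_for_blocks and released with
   free_aux_for_blocks in move_or_delete_vzeroupper below.  */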
88 
89 enum call_avx256_state
90 {
91   /* Callee returns 256bit AVX register.  */
92   callee_return_avx256 = -1,
93   /* Callee returns and passes 256bit AVX register.  */
94   callee_return_pass_avx256,
95   /* Callee passes 256bit AVX register.  */
96   callee_pass_avx256,
97 	  /* Callee doesn't return nor pass 256bit AVX register, or no
98 	     256bit AVX register in function return.  */
99   call_no_avx256,
100   /* vzeroupper intrinsic.  */
101   vzeroupper_intrinsic
102 };
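/* These values are encoded as the operand of the vzeroupper
   UNSPEC_VOLATILE pattern and recovered below in
   move_or_delete_vzeroupper_2 via INTVAL (XVECEXP (pat, 0, 0)).  */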
103 
104 /* Check if a 256bit AVX register is referenced in stores.   */
105 
106 static void
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
108 {
109   if ((REG_P (dest)
110        && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111       || (GET_CODE (set) == SET
112 	  && REG_P (SET_SRC (set))
113 	  && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114     {
115       enum upper_128bits_state *state
116 	= (enum upper_128bits_state *) data;
117       *state = used;
118     }
119 }
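/* Note: check_avx256_stores has the callback signature expected by
   note_stores; move_or_delete_vzeroupper_2 passes a pointer to the
   current upper_128bits_state as the DATA argument.  */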
120 
121 /* Helper function for move_or_delete_vzeroupper_1.  Look for vzeroupper
122    in basic block BB.  Delete it if upper 128bit AVX registers are
123    unused.  If it isn't deleted, move it to just before a jump insn.
124 
125    STATE is state of the upper 128bits of AVX registers at entry.  */
126 
127 static void
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 			     enum upper_128bits_state state)
130 {
131   rtx insn, bb_end;
132   rtx vzeroupper_insn = NULL_RTX;
133   rtx pat;
134   int avx256;
135   bool unchanged;
136 
137   if (BLOCK_INFO (bb)->unchanged)
138     {
139       if (dump_file)
140 	fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
141 		 bb->index, state);
142 
143       BLOCK_INFO (bb)->state = state;
144       return;
145     }
146 
147   if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148     {
149       if (dump_file)
150 	fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 		 bb->index, BLOCK_INFO (bb)->state);
152       return;
153     }
154 
155   BLOCK_INFO (bb)->prev = state;
156 
157   if (dump_file)
158     fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
159 	     bb->index, state);
160 
161   unchanged = true;
162 
163   /* BB_END changes when it is deleted.  */
164   bb_end = BB_END (bb);
165   insn = BB_HEAD (bb);
166   while (insn != bb_end)
167     {
168       insn = NEXT_INSN (insn);
169 
170       if (!NONDEBUG_INSN_P (insn))
171 	continue;
172 
173       /* Move vzeroupper before jump/call.  */
174       if (JUMP_P (insn) || CALL_P (insn))
175 	{
176 	  if (!vzeroupper_insn)
177 	    continue;
178 
179 	  if (PREV_INSN (insn) != vzeroupper_insn)
180 	    {
181 	      if (dump_file)
182 		{
183 		  fprintf (dump_file, "Move vzeroupper after:\n");
184 		  print_rtl_single (dump_file, PREV_INSN (insn));
185 		  fprintf (dump_file, "before:\n");
186 		  print_rtl_single (dump_file, insn);
187 		}
188 	      reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
189 				  PREV_INSN (insn));
190 	    }
191 	  vzeroupper_insn = NULL_RTX;
192 	  continue;
193 	}
194 
195       pat = PATTERN (insn);
196 
197       /* Check insn for vzeroupper intrinsic.  */
198       if (GET_CODE (pat) == UNSPEC_VOLATILE
199 	  && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 	{
201 	  if (dump_file)
202 	    {
203 	      /* Found vzeroupper intrinsic.  */
204 	      fprintf (dump_file, "Found vzeroupper:\n");
205 	      print_rtl_single (dump_file, insn);
206 	    }
207 	}
208       else
209 	{
210 	  /* Check insn for vzeroall intrinsic.  */
211 	  if (GET_CODE (pat) == PARALLEL
212 	      && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 	      && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 	    {
215 	      state = unused;
216 	      unchanged = false;
217 
218 	      /* Delete pending vzeroupper insertion.  */
219 	      if (vzeroupper_insn)
220 		{
221 		  delete_insn (vzeroupper_insn);
222 		  vzeroupper_insn = NULL_RTX;
223 		}
224 	    }
225 	  else if (state != used)
226 	    {
227 	      note_stores (pat, check_avx256_stores, &state);
228 	      if (state == used)
229 		unchanged = false;
230 	    }
231 	  continue;
232 	}
233 
234       /* Process vzeroupper intrinsic.  */
235       avx256 = INTVAL (XVECEXP (pat, 0, 0));
236 
237       if (state == unused)
238 	{
239 	  /* Since the upper 128bits are cleared, callee must not pass
240 	     256bit AVX register.  We only need to check if callee
241 	     returns 256bit AVX register.  */
242 	  if (avx256 == callee_return_avx256)
243 	    {
244 	      state = used;
245 	      unchanged = false;
246 	    }
247 
248 	  /* Remove unnecessary vzeroupper since upper 128bits are
249 	     cleared.  */
250 	  if (dump_file)
251 	    {
252 	      fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 	      print_rtl_single (dump_file, insn);
254 	    }
255 	  delete_insn (insn);
256 	}
257       else
258 	{
259 	  /* Set state to UNUSED if callee doesn't return 256bit AVX
260 	     register.  */
261 	  if (avx256 != callee_return_pass_avx256)
262 	    state = unused;
263 
264 	  if (avx256 == callee_return_pass_avx256
265 	      || avx256 == callee_pass_avx256)
266 	    {
267 	      /* Must remove vzeroupper since callee passes in 256bit
268 		 AVX register.  */
269 	      if (dump_file)
270 		{
271 		  fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 		  print_rtl_single (dump_file, insn);
273 		}
274 	      delete_insn (insn);
275 	    }
276 	  else
277 	    {
278 	      vzeroupper_insn = insn;
279 	      unchanged = false;
280 	    }
281 	}
282     }
283 
284   BLOCK_INFO (bb)->state = state;
285   BLOCK_INFO (bb)->unchanged = unchanged;
286   BLOCK_INFO (bb)->scanned = true;
287 
288   if (dump_file)
289     fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 	     bb->index, unchanged ? "unchanged" : "changed",
291 	     state);
292 }
293 
294 /* Helper function for move_or_delete_vzeroupper.  Process vzeroupper
295    in BLOCK and check its predecessor blocks.  Treat UNKNOWN state
296    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
297    state is changed.  */
298 
299 static bool
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 {
302   edge e;
303   edge_iterator ei;
304   enum upper_128bits_state state, old_state, new_state;
305   bool seen_unknown;
306 
307   if (dump_file)
308     fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 	     block->index, BLOCK_INFO (block)->processed);
310 
311   if (BLOCK_INFO (block)->processed)
312     return false;
313 
314   state = unused;
315 
316   /* Check all predecessor edges of this block.  */
317   seen_unknown = false;
318   FOR_EACH_EDGE (e, ei, block->preds)
319     {
320       if (e->src == block)
321 	continue;
322       switch (BLOCK_INFO (e->src)->state)
323 	{
324 	case unknown:
325 	  if (!unknown_is_unused)
326 	    seen_unknown = true;
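	  /* FALLTHRU: fall through to the unused case.  */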
327 	case unused:
328 	  break;
329 	case used:
330 	  state = used;
331 	  goto done;
332 	}
333     }
334 
335   if (seen_unknown)
336     state = unknown;
337 
338 done:
339   old_state = BLOCK_INFO (block)->state;
340   move_or_delete_vzeroupper_2 (block, state);
341   new_state = BLOCK_INFO (block)->state;
342 
343   if (state != unknown || new_state == used)
344     BLOCK_INFO (block)->processed = true;
345 
346   /* Need to rescan if the upper 128bits of AVX registers are changed
347      to USED at exit.  */
348   if (new_state != old_state)
349     {
350       if (new_state == used)
351 	cfun->machine->rescan_vzeroupper_p = 1;
352       return true;
353     }
354   else
355     return false;
356 }
357 
358 /* Go through the instruction stream looking for vzeroupper.  Delete
359    it if upper 128bit AVX registers are unused.  If it isn't deleted,
360    move it to just before a jump insn.  */
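/* A rough sketch of the pass structure that follows: successors of the
   entry block are seeded from cfun->machine->caller_pass_avx256_p, then a
   forward data-flow problem is iterated over the remaining blocks using
   two fibonacci heaps (WORKLIST for the current round, PENDING for the
   next) keyed by reverse completion order, and rounds repeat while some
   block sets cfun->machine->rescan_vzeroupper_p.  */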
361 
362 static void
363 move_or_delete_vzeroupper (void)
364 {
365   edge e;
366   edge_iterator ei;
367   basic_block bb;
368   fibheap_t worklist, pending, fibheap_swap;
369   sbitmap visited, in_worklist, in_pending, sbitmap_swap;
370   int *bb_order;
371   int *rc_order;
372   int i;
373 
374   /* Set up block info for each basic block.  */
375   alloc_aux_for_blocks (sizeof (struct block_info_def));
376 
377   /* Process outgoing edges of entry point.  */
378   if (dump_file)
379     fprintf (dump_file, "Process outgoing edges of entry point\n");
380 
381   FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382     {
383       move_or_delete_vzeroupper_2 (e->dest,
384 				   cfun->machine->caller_pass_avx256_p
385 				   ? used : unused);
386       BLOCK_INFO (e->dest)->processed = true;
387     }
388 
389   /* Compute reverse completion order of depth first search of the CFG
390      so that the data-flow runs faster.  */
391   rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392   bb_order = XNEWVEC (int, last_basic_block);
393   pre_and_rev_post_order_compute (NULL, rc_order, false);
394   for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395     bb_order[rc_order[i]] = i;
396   free (rc_order);
397 
398   worklist = fibheap_new ();
399   pending = fibheap_new ();
400   visited = sbitmap_alloc (last_basic_block);
401   in_worklist = sbitmap_alloc (last_basic_block);
402   in_pending = sbitmap_alloc (last_basic_block);
403   sbitmap_zero (in_worklist);
404 
405   /* Don't check outgoing edges of entry point.  */
406   sbitmap_ones (in_pending);
407   FOR_EACH_BB (bb)
408     if (BLOCK_INFO (bb)->processed)
409       RESET_BIT (in_pending, bb->index);
410     else
411       {
412 	move_or_delete_vzeroupper_1 (bb, false);
413 	fibheap_insert (pending, bb_order[bb->index], bb);
414       }
415 
416   if (dump_file)
417     fprintf (dump_file, "Check remaining basic blocks\n");
418 
419   while (!fibheap_empty (pending))
420     {
421       fibheap_swap = pending;
422       pending = worklist;
423       worklist = fibheap_swap;
424       sbitmap_swap = in_pending;
425       in_pending = in_worklist;
426       in_worklist = sbitmap_swap;
427 
428       sbitmap_zero (visited);
429 
430       cfun->machine->rescan_vzeroupper_p = 0;
431 
432       while (!fibheap_empty (worklist))
433 	{
434 	  bb = (basic_block) fibheap_extract_min (worklist);
435 	  RESET_BIT (in_worklist, bb->index);
436 	  gcc_assert (!TEST_BIT (visited, bb->index));
437 	  if (!TEST_BIT (visited, bb->index))
438 	    {
439 	      edge_iterator ei;
440 
441 	      SET_BIT (visited, bb->index);
442 
443 	      if (move_or_delete_vzeroupper_1 (bb, false))
444 		FOR_EACH_EDGE (e, ei, bb->succs)
445 		  {
446 		    if (e->dest == EXIT_BLOCK_PTR
447 			|| BLOCK_INFO (e->dest)->processed)
448 		      continue;
449 
450 		    if (TEST_BIT (visited, e->dest->index))
451 		      {
452 			if (!TEST_BIT (in_pending, e->dest->index))
453 			  {
454 			    /* Send E->DEST to next round.  */
455 			    SET_BIT (in_pending, e->dest->index);
456 			    fibheap_insert (pending,
457 					    bb_order[e->dest->index],
458 					    e->dest);
459 			  }
460 		      }
461 		    else if (!TEST_BIT (in_worklist, e->dest->index))
462 		      {
463 			/* Add E->DEST to current round.  */
464 			SET_BIT (in_worklist, e->dest->index);
465 			fibheap_insert (worklist, bb_order[e->dest->index],
466 					e->dest);
467 		      }
468 		  }
469 	    }
470 	}
471 
472       if (!cfun->machine->rescan_vzeroupper_p)
473 	break;
474     }
475 
476   free (bb_order);
477   fibheap_delete (worklist);
478   fibheap_delete (pending);
479   sbitmap_free (visited);
480   sbitmap_free (in_worklist);
481   sbitmap_free (in_pending);
482 
483   if (dump_file)
484     fprintf (dump_file, "Process remaining basic blocks\n");
485 
486   FOR_EACH_BB (bb)
487     move_or_delete_vzeroupper_1 (bb, true);
488 
489   free_aux_for_blocks ();
490 }
491 
492 static rtx legitimize_dllimport_symbol (rtx, bool);
493 
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
496 #endif
497 
498 /* Return index of given mode in mult and division cost tables.  */
499 #define MODE_INDEX(mode)					\
500   ((mode) == QImode ? 0						\
501    : (mode) == HImode ? 1					\
502    : (mode) == SImode ? 2					\
503    : (mode) == DImode ? 3					\
504    : 4)
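/* For example, MODE_INDEX (SImode) == 2 selects the SI entry of the
   five-element multiply and divide cost arrays below; any mode other
   than QImode/HImode/SImode/DImode falls into the final "other" slot
   (index 4).  */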
505 
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
508 #define COSTS_N_BYTES(N) ((N) * 2)
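/* Under that assumption, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
   i.e. a two byte addition in the size table is weighted the same as a
   one-insn addition in the speed tables.  */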
509 
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
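/* Each stringop entry in the cost tables below pairs an algorithm for
   unknown block sizes with a list of {max_size, algorithm} ranges, where
   a max_size of -1 means "any larger size".  The two tables per memcpy
   and memset slot are presumably selected by target word size, which is
   why 32-bit only CPUs leave the second one as DUMMY_STRINGOP_ALGS.  */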
511 
512 const
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514   COSTS_N_BYTES (2),			/* cost of an add instruction */
515   COSTS_N_BYTES (3),			/* cost of a lea instruction */
516   COSTS_N_BYTES (2),			/* variable shift costs */
517   COSTS_N_BYTES (3),			/* constant shift costs */
518   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
519    COSTS_N_BYTES (3),			/*				 HI */
520    COSTS_N_BYTES (3),			/*				 SI */
521    COSTS_N_BYTES (3),			/*				 DI */
522    COSTS_N_BYTES (5)},			/*			      other */
523   0,					/* cost of multiply per each bit set */
524   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
525    COSTS_N_BYTES (3),			/*			    HI */
526    COSTS_N_BYTES (3),			/*			    SI */
527    COSTS_N_BYTES (3),			/*			    DI */
528    COSTS_N_BYTES (5)},			/*			    other */
529   COSTS_N_BYTES (3),			/* cost of movsx */
530   COSTS_N_BYTES (3),			/* cost of movzx */
531   0,					/* "large" insn */
532   2,					/* MOVE_RATIO */
533   2,				     /* cost for loading QImode using movzbl */
534   {2, 2, 2},				/* cost of loading integer registers
535 					   in QImode, HImode and SImode.
536 					   Relative to reg-reg move (2).  */
537   {2, 2, 2},				/* cost of storing integer registers */
538   2,					/* cost of reg,reg fld/fst */
539   {2, 2, 2},				/* cost of loading fp registers
540 					   in SFmode, DFmode and XFmode */
541   {2, 2, 2},				/* cost of storing fp registers
542 					   in SFmode, DFmode and XFmode */
543   3,					/* cost of moving MMX register */
544   {3, 3},				/* cost of loading MMX registers
545 					   in SImode and DImode */
546   {3, 3},				/* cost of storing MMX registers
547 					   in SImode and DImode */
548   3,					/* cost of moving SSE register */
549   {3, 3, 3},				/* cost of loading SSE registers
550 					   in SImode, DImode and TImode */
551   {3, 3, 3},				/* cost of storing SSE registers
552 					   in SImode, DImode and TImode */
553   3,					/* MMX or SSE register to integer */
554   0,					/* size of l1 cache  */
555   0,					/* size of l2 cache  */
556   0,					/* size of prefetch block */
557   0,					/* number of parallel prefetches */
558   2,					/* Branch cost */
559   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
560   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
561   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
562   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
563   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
564   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
565   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566    {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568    {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569   1,					/* scalar_stmt_cost.  */
570   1,					/* scalar load_cost.  */
571   1,					/* scalar_store_cost.  */
572   1,					/* vec_stmt_cost.  */
573   1,					/* vec_to_scalar_cost.  */
574   1,					/* scalar_to_vec_cost.  */
575   1,					/* vec_align_load_cost.  */
576   1,					/* vec_unalign_load_cost.  */
577   1,					/* vec_store_cost.  */
578   1,					/* cond_taken_branch_cost.  */
579   1,					/* cond_not_taken_branch_cost.  */
580 };
581 
582 /* Processor costs (relative to an add) */
583 static const
584 struct processor_costs i386_cost = {	/* 386 specific costs */
585   COSTS_N_INSNS (1),			/* cost of an add instruction */
586   COSTS_N_INSNS (1),			/* cost of a lea instruction */
587   COSTS_N_INSNS (3),			/* variable shift costs */
588   COSTS_N_INSNS (2),			/* constant shift costs */
589   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
590    COSTS_N_INSNS (6),			/*				 HI */
591    COSTS_N_INSNS (6),			/*				 SI */
592    COSTS_N_INSNS (6),			/*				 DI */
593    COSTS_N_INSNS (6)},			/*			      other */
594   COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
595   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
596    COSTS_N_INSNS (23),			/*			    HI */
597    COSTS_N_INSNS (23),			/*			    SI */
598    COSTS_N_INSNS (23),			/*			    DI */
599    COSTS_N_INSNS (23)},			/*			    other */
600   COSTS_N_INSNS (3),			/* cost of movsx */
601   COSTS_N_INSNS (2),			/* cost of movzx */
602   15,					/* "large" insn */
603   3,					/* MOVE_RATIO */
604   4,				     /* cost for loading QImode using movzbl */
605   {2, 4, 2},				/* cost of loading integer registers
606 					   in QImode, HImode and SImode.
607 					   Relative to reg-reg move (2).  */
608   {2, 4, 2},				/* cost of storing integer registers */
609   2,					/* cost of reg,reg fld/fst */
610   {8, 8, 8},				/* cost of loading fp registers
611 					   in SFmode, DFmode and XFmode */
612   {8, 8, 8},				/* cost of storing fp registers
613 					   in SFmode, DFmode and XFmode */
614   2,					/* cost of moving MMX register */
615   {4, 8},				/* cost of loading MMX registers
616 					   in SImode and DImode */
617   {4, 8},				/* cost of storing MMX registers
618 					   in SImode and DImode */
619   2,					/* cost of moving SSE register */
620   {4, 8, 16},				/* cost of loading SSE registers
621 					   in SImode, DImode and TImode */
622   {4, 8, 16},				/* cost of storing SSE registers
623 					   in SImode, DImode and TImode */
624   3,					/* MMX or SSE register to integer */
625   0,					/* size of l1 cache  */
626   0,					/* size of l2 cache  */
627   0,					/* size of prefetch block */
628   0,					/* number of parallel prefetches */
629   1,					/* Branch cost */
630   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
631   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
632   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
633   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
634   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
635   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
636   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637    DUMMY_STRINGOP_ALGS},
638   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639    DUMMY_STRINGOP_ALGS},
640   1,					/* scalar_stmt_cost.  */
641   1,					/* scalar load_cost.  */
642   1,					/* scalar_store_cost.  */
643   1,					/* vec_stmt_cost.  */
644   1,					/* vec_to_scalar_cost.  */
645   1,					/* scalar_to_vec_cost.  */
646   1,					/* vec_align_load_cost.  */
647   2,					/* vec_unalign_load_cost.  */
648   1,					/* vec_store_cost.  */
649   3,					/* cond_taken_branch_cost.  */
650   1,					/* cond_not_taken_branch_cost.  */
651 };
652 
653 static const
654 struct processor_costs i486_cost = {	/* 486 specific costs */
655   COSTS_N_INSNS (1),			/* cost of an add instruction */
656   COSTS_N_INSNS (1),			/* cost of a lea instruction */
657   COSTS_N_INSNS (3),			/* variable shift costs */
658   COSTS_N_INSNS (2),			/* constant shift costs */
659   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
660    COSTS_N_INSNS (12),			/*				 HI */
661    COSTS_N_INSNS (12),			/*				 SI */
662    COSTS_N_INSNS (12),			/*				 DI */
663    COSTS_N_INSNS (12)},			/*			      other */
664   1,					/* cost of multiply per each bit set */
665   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
666    COSTS_N_INSNS (40),			/*			    HI */
667    COSTS_N_INSNS (40),			/*			    SI */
668    COSTS_N_INSNS (40),			/*			    DI */
669    COSTS_N_INSNS (40)},			/*			    other */
670   COSTS_N_INSNS (3),			/* cost of movsx */
671   COSTS_N_INSNS (2),			/* cost of movzx */
672   15,					/* "large" insn */
673   3,					/* MOVE_RATIO */
674   4,				     /* cost for loading QImode using movzbl */
675   {2, 4, 2},				/* cost of loading integer registers
676 					   in QImode, HImode and SImode.
677 					   Relative to reg-reg move (2).  */
678   {2, 4, 2},				/* cost of storing integer registers */
679   2,					/* cost of reg,reg fld/fst */
680   {8, 8, 8},				/* cost of loading fp registers
681 					   in SFmode, DFmode and XFmode */
682   {8, 8, 8},				/* cost of storing fp registers
683 					   in SFmode, DFmode and XFmode */
684   2,					/* cost of moving MMX register */
685   {4, 8},				/* cost of loading MMX registers
686 					   in SImode and DImode */
687   {4, 8},				/* cost of storing MMX registers
688 					   in SImode and DImode */
689   2,					/* cost of moving SSE register */
690   {4, 8, 16},				/* cost of loading SSE registers
691 					   in SImode, DImode and TImode */
692   {4, 8, 16},				/* cost of storing SSE registers
693 					   in SImode, DImode and TImode */
694   3,					/* MMX or SSE register to integer */
695   4,					/* size of l1 cache.  486 has 8kB cache
696 					   shared for code and data, so 4kB is
697 					   not really precise.  */
698   4,					/* size of l2 cache  */
699   0,					/* size of prefetch block */
700   0,					/* number of parallel prefetches */
701   1,					/* Branch cost */
702   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
703   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
704   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
705   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
706   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
707   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
708   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709    DUMMY_STRINGOP_ALGS},
710   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711    DUMMY_STRINGOP_ALGS},
712   1,					/* scalar_stmt_cost.  */
713   1,					/* scalar load_cost.  */
714   1,					/* scalar_store_cost.  */
715   1,					/* vec_stmt_cost.  */
716   1,					/* vec_to_scalar_cost.  */
717   1,					/* scalar_to_vec_cost.  */
718   1,					/* vec_align_load_cost.  */
719   2,					/* vec_unalign_load_cost.  */
720   1,					/* vec_store_cost.  */
721   3,					/* cond_taken_branch_cost.  */
722   1,					/* cond_not_taken_branch_cost.  */
723 };
724 
725 static const
726 struct processor_costs pentium_cost = {
727   COSTS_N_INSNS (1),			/* cost of an add instruction */
728   COSTS_N_INSNS (1),			/* cost of a lea instruction */
729   COSTS_N_INSNS (4),			/* variable shift costs */
730   COSTS_N_INSNS (1),			/* constant shift costs */
731   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
732    COSTS_N_INSNS (11),			/*				 HI */
733    COSTS_N_INSNS (11),			/*				 SI */
734    COSTS_N_INSNS (11),			/*				 DI */
735    COSTS_N_INSNS (11)},			/*			      other */
736   0,					/* cost of multiply per each bit set */
737   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
738    COSTS_N_INSNS (25),			/*			    HI */
739    COSTS_N_INSNS (25),			/*			    SI */
740    COSTS_N_INSNS (25),			/*			    DI */
741    COSTS_N_INSNS (25)},			/*			    other */
742   COSTS_N_INSNS (3),			/* cost of movsx */
743   COSTS_N_INSNS (2),			/* cost of movzx */
744   8,					/* "large" insn */
745   6,					/* MOVE_RATIO */
746   6,				     /* cost for loading QImode using movzbl */
747   {2, 4, 2},				/* cost of loading integer registers
748 					   in QImode, HImode and SImode.
749 					   Relative to reg-reg move (2).  */
750   {2, 4, 2},				/* cost of storing integer registers */
751   2,					/* cost of reg,reg fld/fst */
752   {2, 2, 6},				/* cost of loading fp registers
753 					   in SFmode, DFmode and XFmode */
754   {4, 4, 6},				/* cost of storing fp registers
755 					   in SFmode, DFmode and XFmode */
756   8,					/* cost of moving MMX register */
757   {8, 8},				/* cost of loading MMX registers
758 					   in SImode and DImode */
759   {8, 8},				/* cost of storing MMX registers
760 					   in SImode and DImode */
761   2,					/* cost of moving SSE register */
762   {4, 8, 16},				/* cost of loading SSE registers
763 					   in SImode, DImode and TImode */
764   {4, 8, 16},				/* cost of storing SSE registers
765 					   in SImode, DImode and TImode */
766   3,					/* MMX or SSE register to integer */
767   8,					/* size of l1 cache.  */
768   8,					/* size of l2 cache  */
769   0,					/* size of prefetch block */
770   0,					/* number of parallel prefetches */
771   2,					/* Branch cost */
772   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
773   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
774   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
775   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
776   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
777   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
778   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779    DUMMY_STRINGOP_ALGS},
780   {{libcall, {{-1, rep_prefix_4_byte}}},
781    DUMMY_STRINGOP_ALGS},
782   1,					/* scalar_stmt_cost.  */
783   1,					/* scalar load_cost.  */
784   1,					/* scalar_store_cost.  */
785   1,					/* vec_stmt_cost.  */
786   1,					/* vec_to_scalar_cost.  */
787   1,					/* scalar_to_vec_cost.  */
788   1,					/* vec_align_load_cost.  */
789   2,					/* vec_unalign_load_cost.  */
790   1,					/* vec_store_cost.  */
791   3,					/* cond_taken_branch_cost.  */
792   1,					/* cond_not_taken_branch_cost.  */
793 };
794 
795 static const
796 struct processor_costs pentiumpro_cost = {
797   COSTS_N_INSNS (1),			/* cost of an add instruction */
798   COSTS_N_INSNS (1),			/* cost of a lea instruction */
799   COSTS_N_INSNS (1),			/* variable shift costs */
800   COSTS_N_INSNS (1),			/* constant shift costs */
801   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
802    COSTS_N_INSNS (4),			/*				 HI */
803    COSTS_N_INSNS (4),			/*				 SI */
804    COSTS_N_INSNS (4),			/*				 DI */
805    COSTS_N_INSNS (4)},			/*			      other */
806   0,					/* cost of multiply per each bit set */
807   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
808    COSTS_N_INSNS (17),			/*			    HI */
809    COSTS_N_INSNS (17),			/*			    SI */
810    COSTS_N_INSNS (17),			/*			    DI */
811    COSTS_N_INSNS (17)},			/*			    other */
812   COSTS_N_INSNS (1),			/* cost of movsx */
813   COSTS_N_INSNS (1),			/* cost of movzx */
814   8,					/* "large" insn */
815   6,					/* MOVE_RATIO */
816   2,				     /* cost for loading QImode using movzbl */
817   {4, 4, 4},				/* cost of loading integer registers
818 					   in QImode, HImode and SImode.
819 					   Relative to reg-reg move (2).  */
820   {2, 2, 2},				/* cost of storing integer registers */
821   2,					/* cost of reg,reg fld/fst */
822   {2, 2, 6},				/* cost of loading fp registers
823 					   in SFmode, DFmode and XFmode */
824   {4, 4, 6},				/* cost of storing fp registers
825 					   in SFmode, DFmode and XFmode */
826   2,					/* cost of moving MMX register */
827   {2, 2},				/* cost of loading MMX registers
828 					   in SImode and DImode */
829   {2, 2},				/* cost of storing MMX registers
830 					   in SImode and DImode */
831   2,					/* cost of moving SSE register */
832   {2, 2, 8},				/* cost of loading SSE registers
833 					   in SImode, DImode and TImode */
834   {2, 2, 8},				/* cost of storing SSE registers
835 					   in SImode, DImode and TImode */
836   3,					/* MMX or SSE register to integer */
837   8,					/* size of l1 cache.  */
838   256,					/* size of l2 cache  */
839   32,					/* size of prefetch block */
840   6,					/* number of parallel prefetches */
841   2,					/* Branch cost */
842   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
843   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
844   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
845   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
846   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
847   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
848   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
849      (we ensure the alignment).  For small blocks an inline loop is still a
850      noticeable win; for bigger blocks either rep movsl or rep movsb is
851      the way to go.  Rep movsb apparently has a more expensive startup time
852      in the CPU, but after 4K the difference is down in the noise.  */
853   {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855    DUMMY_STRINGOP_ALGS},
856   {{rep_prefix_4_byte, {{1024, unrolled_loop},
857   			{8192, rep_prefix_4_byte}, {-1, libcall}}},
858    DUMMY_STRINGOP_ALGS},
859   1,					/* scalar_stmt_cost.  */
860   1,					/* scalar load_cost.  */
861   1,					/* scalar_store_cost.  */
862   1,					/* vec_stmt_cost.  */
863   1,					/* vec_to_scalar_cost.  */
864   1,					/* scalar_to_vec_cost.  */
865   1,					/* vec_align_load_cost.  */
866   2,					/* vec_unalign_load_cost.  */
867   1,					/* vec_store_cost.  */
868   3,					/* cond_taken_branch_cost.  */
869   1,					/* cond_not_taken_branch_cost.  */
870 };
871 
872 static const
873 struct processor_costs geode_cost = {
874   COSTS_N_INSNS (1),			/* cost of an add instruction */
875   COSTS_N_INSNS (1),			/* cost of a lea instruction */
876   COSTS_N_INSNS (2),			/* variable shift costs */
877   COSTS_N_INSNS (1),			/* constant shift costs */
878   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
879    COSTS_N_INSNS (4),			/*				 HI */
880    COSTS_N_INSNS (7),			/*				 SI */
881    COSTS_N_INSNS (7),			/*				 DI */
882    COSTS_N_INSNS (7)},			/*			      other */
883   0,					/* cost of multiply per each bit set */
884   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
885    COSTS_N_INSNS (23),			/*			    HI */
886    COSTS_N_INSNS (39),			/*			    SI */
887    COSTS_N_INSNS (39),			/*			    DI */
888    COSTS_N_INSNS (39)},			/*			    other */
889   COSTS_N_INSNS (1),			/* cost of movsx */
890   COSTS_N_INSNS (1),			/* cost of movzx */
891   8,					/* "large" insn */
892   4,					/* MOVE_RATIO */
893   1,				     /* cost for loading QImode using movzbl */
894   {1, 1, 1},				/* cost of loading integer registers
895 					   in QImode, HImode and SImode.
896 					   Relative to reg-reg move (2).  */
897   {1, 1, 1},				/* cost of storing integer registers */
898   1,					/* cost of reg,reg fld/fst */
899   {1, 1, 1},				/* cost of loading fp registers
900 					   in SFmode, DFmode and XFmode */
901   {4, 6, 6},				/* cost of storing fp registers
902 					   in SFmode, DFmode and XFmode */
903 
904   1,					/* cost of moving MMX register */
905   {1, 1},				/* cost of loading MMX registers
906 					   in SImode and DImode */
907   {1, 1},				/* cost of storing MMX registers
908 					   in SImode and DImode */
909   1,					/* cost of moving SSE register */
910   {1, 1, 1},				/* cost of loading SSE registers
911 					   in SImode, DImode and TImode */
912   {1, 1, 1},				/* cost of storing SSE registers
913 					   in SImode, DImode and TImode */
914   1,					/* MMX or SSE register to integer */
915   64,					/* size of l1 cache.  */
916   128,					/* size of l2 cache.  */
917   32,					/* size of prefetch block */
918   1,					/* number of parallel prefetches */
919   1,					/* Branch cost */
920   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
921   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
922   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
923   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
924   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
925   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
926   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927    DUMMY_STRINGOP_ALGS},
928   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929    DUMMY_STRINGOP_ALGS},
930   1,					/* scalar_stmt_cost.  */
931   1,					/* scalar load_cost.  */
932   1,					/* scalar_store_cost.  */
933   1,					/* vec_stmt_cost.  */
934   1,					/* vec_to_scalar_cost.  */
935   1,					/* scalar_to_vec_cost.  */
936   1,					/* vec_align_load_cost.  */
937   2,					/* vec_unalign_load_cost.  */
938   1,					/* vec_store_cost.  */
939   3,					/* cond_taken_branch_cost.  */
940   1,					/* cond_not_taken_branch_cost.  */
941 };
942 
943 static const
944 struct processor_costs k6_cost = {
945   COSTS_N_INSNS (1),			/* cost of an add instruction */
946   COSTS_N_INSNS (2),			/* cost of a lea instruction */
947   COSTS_N_INSNS (1),			/* variable shift costs */
948   COSTS_N_INSNS (1),			/* constant shift costs */
949   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
950    COSTS_N_INSNS (3),			/*				 HI */
951    COSTS_N_INSNS (3),			/*				 SI */
952    COSTS_N_INSNS (3),			/*				 DI */
953    COSTS_N_INSNS (3)},			/*			      other */
954   0,					/* cost of multiply per each bit set */
955   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
956    COSTS_N_INSNS (18),			/*			    HI */
957    COSTS_N_INSNS (18),			/*			    SI */
958    COSTS_N_INSNS (18),			/*			    DI */
959    COSTS_N_INSNS (18)},			/*			    other */
960   COSTS_N_INSNS (2),			/* cost of movsx */
961   COSTS_N_INSNS (2),			/* cost of movzx */
962   8,					/* "large" insn */
963   4,					/* MOVE_RATIO */
964   3,				     /* cost for loading QImode using movzbl */
965   {4, 5, 4},				/* cost of loading integer registers
966 					   in QImode, HImode and SImode.
967 					   Relative to reg-reg move (2).  */
968   {2, 3, 2},				/* cost of storing integer registers */
969   4,					/* cost of reg,reg fld/fst */
970   {6, 6, 6},				/* cost of loading fp registers
971 					   in SFmode, DFmode and XFmode */
972   {4, 4, 4},				/* cost of storing fp registers
973 					   in SFmode, DFmode and XFmode */
974   2,					/* cost of moving MMX register */
975   {2, 2},				/* cost of loading MMX registers
976 					   in SImode and DImode */
977   {2, 2},				/* cost of storing MMX registers
978 					   in SImode and DImode */
979   2,					/* cost of moving SSE register */
980   {2, 2, 8},				/* cost of loading SSE registers
981 					   in SImode, DImode and TImode */
982   {2, 2, 8},				/* cost of storing SSE registers
983 					   in SImode, DImode and TImode */
984   6,					/* MMX or SSE register to integer */
985   32,					/* size of l1 cache.  */
986   32,					/* size of l2 cache.  Some models
987 					   have integrated l2 cache, but
988 					   optimizing for k6 is not important
989 					   enough to worry about that.  */
990   32,					/* size of prefetch block */
991   1,					/* number of parallel prefetches */
992   1,					/* Branch cost */
993   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
994   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
995   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
996   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
997   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
998   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
999   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000    DUMMY_STRINGOP_ALGS},
1001   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002    DUMMY_STRINGOP_ALGS},
1003   1,					/* scalar_stmt_cost.  */
1004   1,					/* scalar load_cost.  */
1005   1,					/* scalar_store_cost.  */
1006   1,					/* vec_stmt_cost.  */
1007   1,					/* vec_to_scalar_cost.  */
1008   1,					/* scalar_to_vec_cost.  */
1009   1,					/* vec_align_load_cost.  */
1010   2,					/* vec_unalign_load_cost.  */
1011   1,					/* vec_store_cost.  */
1012   3,					/* cond_taken_branch_cost.  */
1013   1,					/* cond_not_taken_branch_cost.  */
1014 };
1015 
1016 static const
1017 struct processor_costs athlon_cost = {
1018   COSTS_N_INSNS (1),			/* cost of an add instruction */
1019   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1020   COSTS_N_INSNS (1),			/* variable shift costs */
1021   COSTS_N_INSNS (1),			/* constant shift costs */
1022   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
1023    COSTS_N_INSNS (5),			/*				 HI */
1024    COSTS_N_INSNS (5),			/*				 SI */
1025    COSTS_N_INSNS (5),			/*				 DI */
1026    COSTS_N_INSNS (5)},			/*			      other */
1027   0,					/* cost of multiply per each bit set */
1028   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1029    COSTS_N_INSNS (26),			/*			    HI */
1030    COSTS_N_INSNS (42),			/*			    SI */
1031    COSTS_N_INSNS (74),			/*			    DI */
1032    COSTS_N_INSNS (74)},			/*			    other */
1033   COSTS_N_INSNS (1),			/* cost of movsx */
1034   COSTS_N_INSNS (1),			/* cost of movzx */
1035   8,					/* "large" insn */
1036   9,					/* MOVE_RATIO */
1037   4,				     /* cost for loading QImode using movzbl */
1038   {3, 4, 3},				/* cost of loading integer registers
1039 					   in QImode, HImode and SImode.
1040 					   Relative to reg-reg move (2).  */
1041   {3, 4, 3},				/* cost of storing integer registers */
1042   4,					/* cost of reg,reg fld/fst */
1043   {4, 4, 12},				/* cost of loading fp registers
1044 					   in SFmode, DFmode and XFmode */
1045   {6, 6, 8},				/* cost of storing fp registers
1046 					   in SFmode, DFmode and XFmode */
1047   2,					/* cost of moving MMX register */
1048   {4, 4},				/* cost of loading MMX registers
1049 					   in SImode and DImode */
1050   {4, 4},				/* cost of storing MMX registers
1051 					   in SImode and DImode */
1052   2,					/* cost of moving SSE register */
1053   {4, 4, 6},				/* cost of loading SSE registers
1054 					   in SImode, DImode and TImode */
1055   {4, 4, 5},				/* cost of storing SSE registers
1056 					   in SImode, DImode and TImode */
1057   5,					/* MMX or SSE register to integer */
1058   64,					/* size of l1 cache.  */
1059   256,					/* size of l2 cache.  */
1060   64,					/* size of prefetch block */
1061   6,					/* number of parallel prefetches */
1062   5,					/* Branch cost */
1063   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1064   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1065   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
1066   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1067   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1068   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1069   /* For some reason, Athlon deals better with the REP prefix (relative to
1070      loops) than K8 does.  Alignment becomes important after 8 bytes for
1071      memcpy and 128 bytes for memset.  */
1072   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073    DUMMY_STRINGOP_ALGS},
1074   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075    DUMMY_STRINGOP_ALGS},
1076   1,					/* scalar_stmt_cost.  */
1077   1,					/* scalar load_cost.  */
1078   1,					/* scalar_store_cost.  */
1079   1,					/* vec_stmt_cost.  */
1080   1,					/* vec_to_scalar_cost.  */
1081   1,					/* scalar_to_vec_cost.  */
1082   1,					/* vec_align_load_cost.  */
1083   2,					/* vec_unalign_load_cost.  */
1084   1,					/* vec_store_cost.  */
1085   3,					/* cond_taken_branch_cost.  */
1086   1,					/* cond_not_taken_branch_cost.  */
1087 };
1088 
1089 static const
1090 struct processor_costs k8_cost = {
1091   COSTS_N_INSNS (1),			/* cost of an add instruction */
1092   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1093   COSTS_N_INSNS (1),			/* variable shift costs */
1094   COSTS_N_INSNS (1),			/* constant shift costs */
1095   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1096    COSTS_N_INSNS (4),			/*				 HI */
1097    COSTS_N_INSNS (3),			/*				 SI */
1098    COSTS_N_INSNS (4),			/*				 DI */
1099    COSTS_N_INSNS (5)},			/*			      other */
1100   0,					/* cost of multiply per each bit set */
1101   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1102    COSTS_N_INSNS (26),			/*			    HI */
1103    COSTS_N_INSNS (42),			/*			    SI */
1104    COSTS_N_INSNS (74),			/*			    DI */
1105    COSTS_N_INSNS (74)},			/*			    other */
1106   COSTS_N_INSNS (1),			/* cost of movsx */
1107   COSTS_N_INSNS (1),			/* cost of movzx */
1108   8,					/* "large" insn */
1109   9,					/* MOVE_RATIO */
1110   4,				     /* cost for loading QImode using movzbl */
1111   {3, 4, 3},				/* cost of loading integer registers
1112 					   in QImode, HImode and SImode.
1113 					   Relative to reg-reg move (2).  */
1114   {3, 4, 3},				/* cost of storing integer registers */
1115   4,					/* cost of reg,reg fld/fst */
1116   {4, 4, 12},				/* cost of loading fp registers
1117 					   in SFmode, DFmode and XFmode */
1118   {6, 6, 8},				/* cost of storing fp registers
1119 					   in SFmode, DFmode and XFmode */
1120   2,					/* cost of moving MMX register */
1121   {3, 3},				/* cost of loading MMX registers
1122 					   in SImode and DImode */
1123   {4, 4},				/* cost of storing MMX registers
1124 					   in SImode and DImode */
1125   2,					/* cost of moving SSE register */
1126   {4, 3, 6},				/* cost of loading SSE registers
1127 					   in SImode, DImode and TImode */
1128   {4, 4, 5},				/* cost of storing SSE registers
1129 					   in SImode, DImode and TImode */
1130   5,					/* MMX or SSE register to integer */
1131   64,					/* size of l1 cache.  */
1132   512,					/* size of l2 cache.  */
1133   64,					/* size of prefetch block */
1134   /* New AMD processors never drop prefetches; if they cannot be performed
1135      immediately, they are queued.  We set the number of simultaneous prefetches
1136      to a large constant to reflect this (it is probably not a good idea to leave
1137      the number of prefetches entirely unlimited, as their execution also takes
1138      some time).  */
1139   100,					/* number of parallel prefetches */
1140   3,					/* Branch cost */
1141   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1142   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1143   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1144   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1145   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1146   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1147   /* K8 has an optimized REP instruction for medium sized blocks, but for very
1148      small blocks it is better to use a loop.  For large blocks, a libcall can
1149      do nontemporal accesses and beat inline code considerably.  */
1150   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152   {{libcall, {{8, loop}, {24, unrolled_loop},
1153 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155   4,					/* scalar_stmt_cost.  */
1156   2,					/* scalar load_cost.  */
1157   2,					/* scalar_store_cost.  */
1158   5,					/* vec_stmt_cost.  */
1159   0,					/* vec_to_scalar_cost.  */
1160   2,					/* scalar_to_vec_cost.  */
1161   2,					/* vec_align_load_cost.  */
1162   3,					/* vec_unalign_load_cost.  */
1163   3,					/* vec_store_cost.  */
1164   3,					/* cond_taken_branch_cost.  */
1165   2,					/* cond_not_taken_branch_cost.  */
1166 };
1167 
1168 struct processor_costs amdfam10_cost = {
1169   COSTS_N_INSNS (1),			/* cost of an add instruction */
1170   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1171   COSTS_N_INSNS (1),			/* variable shift costs */
1172   COSTS_N_INSNS (1),			/* constant shift costs */
1173   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1174    COSTS_N_INSNS (4),			/*				 HI */
1175    COSTS_N_INSNS (3),			/*				 SI */
1176    COSTS_N_INSNS (4),			/*				 DI */
1177    COSTS_N_INSNS (5)},			/*			      other */
1178   0,					/* cost of multiply per each bit set */
1179   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1180    COSTS_N_INSNS (35),			/*			    HI */
1181    COSTS_N_INSNS (51),			/*			    SI */
1182    COSTS_N_INSNS (83),			/*			    DI */
1183    COSTS_N_INSNS (83)},			/*			    other */
1184   COSTS_N_INSNS (1),			/* cost of movsx */
1185   COSTS_N_INSNS (1),			/* cost of movzx */
1186   8,					/* "large" insn */
1187   9,					/* MOVE_RATIO */
1188   4,				     /* cost for loading QImode using movzbl */
1189   {3, 4, 3},				/* cost of loading integer registers
1190 					   in QImode, HImode and SImode.
1191 					   Relative to reg-reg move (2).  */
1192   {3, 4, 3},				/* cost of storing integer registers */
1193   4,					/* cost of reg,reg fld/fst */
1194   {4, 4, 12},				/* cost of loading fp registers
1195 		   			   in SFmode, DFmode and XFmode */
1196   {6, 6, 8},				/* cost of storing fp registers
1197  		   			   in SFmode, DFmode and XFmode */
1198   2,					/* cost of moving MMX register */
1199   {3, 3},				/* cost of loading MMX registers
1200 					   in SImode and DImode */
1201   {4, 4},				/* cost of storing MMX registers
1202 					   in SImode and DImode */
1203   2,					/* cost of moving SSE register */
1204   {4, 4, 3},				/* cost of loading SSE registers
1205 					   in SImode, DImode and TImode */
1206   {4, 4, 5},				/* cost of storing SSE registers
1207 					   in SImode, DImode and TImode */
1208   3,					/* MMX or SSE register to integer */
1209   					/* On K8:
1210   					    MOVD reg64, xmmreg Double FSTORE 4
1211 					    MOVD reg32, xmmreg Double FSTORE 4
1212 					   On AMDFAM10:
1213 					    MOVD reg64, xmmreg Double FADD 3
1214 							       1/1  1/1
1215 					    MOVD reg32, xmmreg Double FADD 3
1216 							       1/1  1/1 */
1217   64,					/* size of l1 cache.  */
1218   512,					/* size of l2 cache.  */
1219   64,					/* size of prefetch block */
1220   /* New AMD processors never drop prefetches; if they cannot be performed
1221      immediately, they are queued.  We set the number of simultaneous prefetches
1222      to a large constant to reflect this (it is probably not a good idea to leave
1223      the number of prefetches entirely unlimited, as their execution also takes
1224      some time).  */
1225   100,					/* number of parallel prefetches */
1226   2,					/* Branch cost */
1227   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1228   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1229   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1230   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1231   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1232   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1233 
1234   /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
1235      very small blocks it is better to use a loop.  For large blocks, a libcall
1236      can do nontemporal accesses and beat inline code considerably.  */
1237   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239   {{libcall, {{8, loop}, {24, unrolled_loop},
1240 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242   4,					/* scalar_stmt_cost.  */
1243   2,					/* scalar load_cost.  */
1244   2,					/* scalar_store_cost.  */
1245   6,					/* vec_stmt_cost.  */
1246   0,					/* vec_to_scalar_cost.  */
1247   2,					/* scalar_to_vec_cost.  */
1248   2,					/* vec_align_load_cost.  */
1249   2,					/* vec_unalign_load_cost.  */
1250   2,					/* vec_store_cost.  */
1251   2,					/* cond_taken_branch_cost.  */
1252   1,					/* cond_not_taken_branch_cost.  */
1253 };
1254 
1255 struct processor_costs bdver1_cost = {
1256   COSTS_N_INSNS (1),			/* cost of an add instruction */
1257   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1258   COSTS_N_INSNS (1),			/* variable shift costs */
1259   COSTS_N_INSNS (1),			/* constant shift costs */
1260   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1261    COSTS_N_INSNS (4),			/*				 HI */
1262    COSTS_N_INSNS (4),			/*				 SI */
1263    COSTS_N_INSNS (6),			/*				 DI */
1264    COSTS_N_INSNS (6)},			/*			      other */
1265   0,					/* cost of multiply per each bit set */
1266   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1267    COSTS_N_INSNS (35),			/*			    HI */
1268    COSTS_N_INSNS (51),			/*			    SI */
1269    COSTS_N_INSNS (83),			/*			    DI */
1270    COSTS_N_INSNS (83)},			/*			    other */
1271   COSTS_N_INSNS (1),			/* cost of movsx */
1272   COSTS_N_INSNS (1),			/* cost of movzx */
1273   8,					/* "large" insn */
1274   9,					/* MOVE_RATIO */
1275   4,				     /* cost for loading QImode using movzbl */
1276   {5, 5, 4},				/* cost of loading integer registers
1277 					   in QImode, HImode and SImode.
1278 					   Relative to reg-reg move (2).  */
1279   {4, 4, 4},				/* cost of storing integer registers */
1280   2,					/* cost of reg,reg fld/fst */
1281   {5, 5, 12},				/* cost of loading fp registers
1282 		   			   in SFmode, DFmode and XFmode */
1283   {4, 4, 8},				/* cost of storing fp registers
1284  		   			   in SFmode, DFmode and XFmode */
1285   2,					/* cost of moving MMX register */
1286   {4, 4},				/* cost of loading MMX registers
1287 					   in SImode and DImode */
1288   {4, 4},				/* cost of storing MMX registers
1289 					   in SImode and DImode */
1290   2,					/* cost of moving SSE register */
1291   {4, 4, 4},				/* cost of loading SSE registers
1292 					   in SImode, DImode and TImode */
1293   {4, 4, 4},				/* cost of storing SSE registers
1294 					   in SImode, DImode and TImode */
1295   2,					/* MMX or SSE register to integer */
1296   					/* On K8:
1297 					    MOVD reg64, xmmreg Double FSTORE 4
1298 					    MOVD reg32, xmmreg Double FSTORE 4
1299 					   On AMDFAM10:
1300 					    MOVD reg64, xmmreg Double FADD 3
1301 							       1/1  1/1
1302 					    MOVD reg32, xmmreg Double FADD 3
1303 							       1/1  1/1 */
1304   16,					/* size of l1 cache.  */
1305   2048,					/* size of l2 cache.  */
1306   64,					/* size of prefetch block */
1307   /* New AMD processors never drop prefetches; if they cannot be performed
1308      immediately, they are queued.  We set the number of simultaneous prefetches
1309      to a large constant to reflect this (it is probably not a good idea to leave
1310      the number of prefetches entirely unlimited, as their execution also takes
1311      some time).  */
1312   100,					/* number of parallel prefetches */
1313   2,					/* Branch cost */
1314   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1315   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1316   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1317   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1318   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1319   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1320 
1321   /* BDVER1 has an optimized REP instruction for medium sized blocks, but for
1322      very small blocks it is better to use a loop.  For large blocks, a libcall
1323      can do nontemporal accesses and beat inline code considerably.  */
1324   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326   {{libcall, {{8, loop}, {24, unrolled_loop},
1327 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329   6,					/* scalar_stmt_cost.  */
1330   4,					/* scalar load_cost.  */
1331   4,					/* scalar_store_cost.  */
1332   6,					/* vec_stmt_cost.  */
1333   0,					/* vec_to_scalar_cost.  */
1334   2,					/* scalar_to_vec_cost.  */
1335   4,					/* vec_align_load_cost.  */
1336   4,					/* vec_unalign_load_cost.  */
1337   4,					/* vec_store_cost.  */
1338   2,					/* cond_taken_branch_cost.  */
1339   1,					/* cond_not_taken_branch_cost.  */
1340 };
1341 
1342 struct processor_costs bdver2_cost = {
1343   COSTS_N_INSNS (1),			/* cost of an add instruction */
1344   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1345   COSTS_N_INSNS (1),			/* variable shift costs */
1346   COSTS_N_INSNS (1),			/* constant shift costs */
1347   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1348    COSTS_N_INSNS (4),			/*				 HI */
1349    COSTS_N_INSNS (4),			/*				 SI */
1350    COSTS_N_INSNS (6),			/*				 DI */
1351    COSTS_N_INSNS (6)},			/*			      other */
1352   0,					/* cost of multiply per each bit set */
1353   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1354    COSTS_N_INSNS (35),			/*			    HI */
1355    COSTS_N_INSNS (51),			/*			    SI */
1356    COSTS_N_INSNS (83),			/*			    DI */
1357    COSTS_N_INSNS (83)},			/*			    other */
1358   COSTS_N_INSNS (1),			/* cost of movsx */
1359   COSTS_N_INSNS (1),			/* cost of movzx */
1360   8,					/* "large" insn */
1361   9,					/* MOVE_RATIO */
1362   4,				     /* cost for loading QImode using movzbl */
1363   {5, 5, 4},				/* cost of loading integer registers
1364 					   in QImode, HImode and SImode.
1365 					   Relative to reg-reg move (2).  */
1366   {4, 4, 4},				/* cost of storing integer registers */
1367   2,					/* cost of reg,reg fld/fst */
1368   {5, 5, 12},				/* cost of loading fp registers
1369 		   			   in SFmode, DFmode and XFmode */
1370   {4, 4, 8},				/* cost of storing fp registers
1371  		   			   in SFmode, DFmode and XFmode */
1372   2,					/* cost of moving MMX register */
1373   {4, 4},				/* cost of loading MMX registers
1374 					   in SImode and DImode */
1375   {4, 4},				/* cost of storing MMX registers
1376 					   in SImode and DImode */
1377   2,					/* cost of moving SSE register */
1378   {4, 4, 4},				/* cost of loading SSE registers
1379 					   in SImode, DImode and TImode */
1380   {4, 4, 4},				/* cost of storing SSE registers
1381 					   in SImode, DImode and TImode */
1382   2,					/* MMX or SSE register to integer */
1383   					/* On K8:
1384 					    MOVD reg64, xmmreg Double FSTORE 4
1385 					    MOVD reg32, xmmreg Double FSTORE 4
1386 					   On AMDFAM10:
1387 					    MOVD reg64, xmmreg Double FADD 3
1388 							       1/1  1/1
1389 					    MOVD reg32, xmmreg Double FADD 3
1390 							       1/1  1/1 */
1391   16,					/* size of l1 cache.  */
1392   2048,					/* size of l2 cache.  */
1393   64,					/* size of prefetch block */
1394   /* New AMD processors never drop prefetches; if they cannot be performed
1395      immediately, they are queued.  We set the number of simultaneous prefetches
1396      to a large constant to reflect this (it is probably not a good idea to leave
1397      the number of prefetches entirely unlimited, as their execution also takes
1398      some time).  */
1399   100,					/* number of parallel prefetches */
1400   2,					/* Branch cost */
1401   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1402   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1403   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1404   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1405   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1406   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1407 
1408   /*  BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1409       very small blocks it is better to use a loop.  For large blocks, a libcall
1410       can do nontemporal accesses and beat inline code considerably.  */
1411   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413   {{libcall, {{8, loop}, {24, unrolled_loop},
1414 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416   6,					/* scalar_stmt_cost.  */
1417   4,					/* scalar load_cost.  */
1418   4,					/* scalar_store_cost.  */
1419   6,					/* vec_stmt_cost.  */
1420   0,					/* vec_to_scalar_cost.  */
1421   2,					/* scalar_to_vec_cost.  */
1422   4,					/* vec_align_load_cost.  */
1423   4,					/* vec_unalign_load_cost.  */
1424   4,					/* vec_store_cost.  */
1425   2,					/* cond_taken_branch_cost.  */
1426   1,					/* cond_not_taken_branch_cost.  */
1427 };
1428 
1429 struct processor_costs btver1_cost = {
1430   COSTS_N_INSNS (1),			/* cost of an add instruction */
1431   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1432   COSTS_N_INSNS (1),			/* variable shift costs */
1433   COSTS_N_INSNS (1),			/* constant shift costs */
1434   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1435    COSTS_N_INSNS (4),			/*				 HI */
1436    COSTS_N_INSNS (3),			/*				 SI */
1437    COSTS_N_INSNS (4),			/*				 DI */
1438    COSTS_N_INSNS (5)},			/*			      other */
1439   0,					/* cost of multiply per each bit set */
1440   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1441    COSTS_N_INSNS (35),			/*			    HI */
1442    COSTS_N_INSNS (51),			/*			    SI */
1443    COSTS_N_INSNS (83),			/*			    DI */
1444    COSTS_N_INSNS (83)},			/*			    other */
1445   COSTS_N_INSNS (1),			/* cost of movsx */
1446   COSTS_N_INSNS (1),			/* cost of movzx */
1447   8,					/* "large" insn */
1448   9,					/* MOVE_RATIO */
1449   4,				     /* cost for loading QImode using movzbl */
1450   {3, 4, 3},				/* cost of loading integer registers
1451 					   in QImode, HImode and SImode.
1452 					   Relative to reg-reg move (2).  */
1453   {3, 4, 3},				/* cost of storing integer registers */
1454   4,					/* cost of reg,reg fld/fst */
1455   {4, 4, 12},				/* cost of loading fp registers
1456 					   in SFmode, DFmode and XFmode */
1457   {6, 6, 8},				/* cost of storing fp registers
1458 					   in SFmode, DFmode and XFmode */
1459   2,					/* cost of moving MMX register */
1460   {3, 3},				/* cost of loading MMX registers
1461 					   in SImode and DImode */
1462   {4, 4},				/* cost of storing MMX registers
1463 					   in SImode and DImode */
1464   2,					/* cost of moving SSE register */
1465   {4, 4, 3},				/* cost of loading SSE registers
1466 					   in SImode, DImode and TImode */
1467   {4, 4, 5},				/* cost of storing SSE registers
1468 					   in SImode, DImode and TImode */
1469   3,					/* MMX or SSE register to integer */
1470 					/* On K8:
1471 					   MOVD reg64, xmmreg Double FSTORE 4
1472 					   MOVD reg32, xmmreg Double FSTORE 4
1473 					   On AMDFAM10:
1474 					   MOVD reg64, xmmreg Double FADD 3
1475 							       1/1  1/1
1476 					    MOVD reg32, xmmreg Double FADD 3
1477 							       1/1  1/1 */
1478   32,					/* size of l1 cache.  */
1479   512,					/* size of l2 cache.  */
1480   64,					/* size of prefetch block */
1481   100,					/* number of parallel prefetches */
1482   2,					/* Branch cost */
1483   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1484   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1485   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1486   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1487   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1488   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1489 
1490   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1491      very small blocks it is better to use a loop.  For large blocks, a libcall
1492      can do nontemporal accesses and beat inline code considerably.  */
1493   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495   {{libcall, {{8, loop}, {24, unrolled_loop},
1496 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498   4,					/* scalar_stmt_cost.  */
1499   2,					/* scalar load_cost.  */
1500   2,					/* scalar_store_cost.  */
1501   6,					/* vec_stmt_cost.  */
1502   0,					/* vec_to_scalar_cost.  */
1503   2,					/* scalar_to_vec_cost.  */
1504   2,					/* vec_align_load_cost.  */
1505   2,					/* vec_unalign_load_cost.  */
1506   2,					/* vec_store_cost.  */
1507   2,					/* cond_taken_branch_cost.  */
1508   1,					/* cond_not_taken_branch_cost.  */
1509 };
1510 
1511 static const
1512 struct processor_costs pentium4_cost = {
1513   COSTS_N_INSNS (1),			/* cost of an add instruction */
1514   COSTS_N_INSNS (3),			/* cost of a lea instruction */
1515   COSTS_N_INSNS (4),			/* variable shift costs */
1516   COSTS_N_INSNS (4),			/* constant shift costs */
1517   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
1518    COSTS_N_INSNS (15),			/*				 HI */
1519    COSTS_N_INSNS (15),			/*				 SI */
1520    COSTS_N_INSNS (15),			/*				 DI */
1521    COSTS_N_INSNS (15)},			/*			      other */
1522   0,					/* cost of multiply per each bit set */
1523   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
1524    COSTS_N_INSNS (56),			/*			    HI */
1525    COSTS_N_INSNS (56),			/*			    SI */
1526    COSTS_N_INSNS (56),			/*			    DI */
1527    COSTS_N_INSNS (56)},			/*			    other */
1528   COSTS_N_INSNS (1),			/* cost of movsx */
1529   COSTS_N_INSNS (1),			/* cost of movzx */
1530   16,					/* "large" insn */
1531   6,					/* MOVE_RATIO */
1532   2,				     /* cost for loading QImode using movzbl */
1533   {4, 5, 4},				/* cost of loading integer registers
1534 					   in QImode, HImode and SImode.
1535 					   Relative to reg-reg move (2).  */
1536   {2, 3, 2},				/* cost of storing integer registers */
1537   2,					/* cost of reg,reg fld/fst */
1538   {2, 2, 6},				/* cost of loading fp registers
1539 					   in SFmode, DFmode and XFmode */
1540   {4, 4, 6},				/* cost of storing fp registers
1541 					   in SFmode, DFmode and XFmode */
1542   2,					/* cost of moving MMX register */
1543   {2, 2},				/* cost of loading MMX registers
1544 					   in SImode and DImode */
1545   {2, 2},				/* cost of storing MMX registers
1546 					   in SImode and DImode */
1547   12,					/* cost of moving SSE register */
1548   {12, 12, 12},				/* cost of loading SSE registers
1549 					   in SImode, DImode and TImode */
1550   {2, 2, 8},				/* cost of storing SSE registers
1551 					   in SImode, DImode and TImode */
1552   10,					/* MMX or SSE register to integer */
1553   8,					/* size of l1 cache.  */
1554   256,					/* size of l2 cache.  */
1555   64,					/* size of prefetch block */
1556   6,					/* number of parallel prefetches */
1557   2,					/* Branch cost */
1558   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1559   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
1560   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
1561   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1562   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1563   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
1564   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565    DUMMY_STRINGOP_ALGS},
1566   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567    {-1, libcall}}},
1568    DUMMY_STRINGOP_ALGS},
1569   1,					/* scalar_stmt_cost.  */
1570   1,					/* scalar load_cost.  */
1571   1,					/* scalar_store_cost.  */
1572   1,					/* vec_stmt_cost.  */
1573   1,					/* vec_to_scalar_cost.  */
1574   1,					/* scalar_to_vec_cost.  */
1575   1,					/* vec_align_load_cost.  */
1576   2,					/* vec_unalign_load_cost.  */
1577   1,					/* vec_store_cost.  */
1578   3,					/* cond_taken_branch_cost.  */
1579   1,					/* cond_not_taken_branch_cost.  */
1580 };
1581 
1582 static const
1583 struct processor_costs nocona_cost = {
1584   COSTS_N_INSNS (1),			/* cost of an add instruction */
1585   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1586   COSTS_N_INSNS (1),			/* variable shift costs */
1587   COSTS_N_INSNS (1),			/* constant shift costs */
1588   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
1589    COSTS_N_INSNS (10),			/*				 HI */
1590    COSTS_N_INSNS (10),			/*				 SI */
1591    COSTS_N_INSNS (10),			/*				 DI */
1592    COSTS_N_INSNS (10)},			/*			      other */
1593   0,					/* cost of multiply per each bit set */
1594   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
1595    COSTS_N_INSNS (66),			/*			    HI */
1596    COSTS_N_INSNS (66),			/*			    SI */
1597    COSTS_N_INSNS (66),			/*			    DI */
1598    COSTS_N_INSNS (66)},			/*			    other */
1599   COSTS_N_INSNS (1),			/* cost of movsx */
1600   COSTS_N_INSNS (1),			/* cost of movzx */
1601   16,					/* "large" insn */
1602   17,					/* MOVE_RATIO */
1603   4,				     /* cost for loading QImode using movzbl */
1604   {4, 4, 4},				/* cost of loading integer registers
1605 					   in QImode, HImode and SImode.
1606 					   Relative to reg-reg move (2).  */
1607   {4, 4, 4},				/* cost of storing integer registers */
1608   3,					/* cost of reg,reg fld/fst */
1609   {12, 12, 12},				/* cost of loading fp registers
1610 					   in SFmode, DFmode and XFmode */
1611   {4, 4, 4},				/* cost of storing fp registers
1612 					   in SFmode, DFmode and XFmode */
1613   6,					/* cost of moving MMX register */
1614   {12, 12},				/* cost of loading MMX registers
1615 					   in SImode and DImode */
1616   {12, 12},				/* cost of storing MMX registers
1617 					   in SImode and DImode */
1618   6,					/* cost of moving SSE register */
1619   {12, 12, 12},				/* cost of loading SSE registers
1620 					   in SImode, DImode and TImode */
1621   {12, 12, 12},				/* cost of storing SSE registers
1622 					   in SImode, DImode and TImode */
1623   8,					/* MMX or SSE register to integer */
1624   8,					/* size of l1 cache.  */
1625   1024,					/* size of l2 cache.  */
1626   128,					/* size of prefetch block */
1627   8,					/* number of parallel prefetches */
1628   1,					/* Branch cost */
1629   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1630   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1631   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
1632   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
1633   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
1634   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
1635   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636    {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 	      {100000, unrolled_loop}, {-1, libcall}}}},
1638   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639    {-1, libcall}}},
1640    {libcall, {{24, loop}, {64, unrolled_loop},
1641 	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642   1,					/* scalar_stmt_cost.  */
1643   1,					/* scalar load_cost.  */
1644   1,					/* scalar_store_cost.  */
1645   1,					/* vec_stmt_cost.  */
1646   1,					/* vec_to_scalar_cost.  */
1647   1,					/* scalar_to_vec_cost.  */
1648   1,					/* vec_align_load_cost.  */
1649   2,					/* vec_unalign_load_cost.  */
1650   1,					/* vec_store_cost.  */
1651   3,					/* cond_taken_branch_cost.  */
1652   1,					/* cond_not_taken_branch_cost.  */
1653 };
1654 
1655 static const
1656 struct processor_costs atom_cost = {
1657   COSTS_N_INSNS (1),			/* cost of an add instruction */
1658   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1659   COSTS_N_INSNS (1),			/* variable shift costs */
1660   COSTS_N_INSNS (1),			/* constant shift costs */
1661   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1662    COSTS_N_INSNS (4),			/*				 HI */
1663    COSTS_N_INSNS (3),			/*				 SI */
1664    COSTS_N_INSNS (4),			/*				 DI */
1665    COSTS_N_INSNS (2)},			/*			      other */
1666   0,					/* cost of multiply per each bit set */
1667   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1668    COSTS_N_INSNS (26),			/*			    HI */
1669    COSTS_N_INSNS (42),			/*			    SI */
1670    COSTS_N_INSNS (74),			/*			    DI */
1671    COSTS_N_INSNS (74)},			/*			    other */
1672   COSTS_N_INSNS (1),			/* cost of movsx */
1673   COSTS_N_INSNS (1),			/* cost of movzx */
1674   8,					/* "large" insn */
1675   17,					/* MOVE_RATIO */
1676   4,					/* cost for loading QImode using movzbl */
1677   {4, 4, 4},				/* cost of loading integer registers
1678 					   in QImode, HImode and SImode.
1679 					   Relative to reg-reg move (2).  */
1680   {4, 4, 4},				/* cost of storing integer registers */
1681   4,					/* cost of reg,reg fld/fst */
1682   {12, 12, 12},				/* cost of loading fp registers
1683 					   in SFmode, DFmode and XFmode */
1684   {6, 6, 8},				/* cost of storing fp registers
1685 					   in SFmode, DFmode and XFmode */
1686   2,					/* cost of moving MMX register */
1687   {8, 8},				/* cost of loading MMX registers
1688 					   in SImode and DImode */
1689   {8, 8},				/* cost of storing MMX registers
1690 					   in SImode and DImode */
1691   2,					/* cost of moving SSE register */
1692   {8, 8, 8},				/* cost of loading SSE registers
1693 					   in SImode, DImode and TImode */
1694   {8, 8, 8},				/* cost of storing SSE registers
1695 					   in SImode, DImode and TImode */
1696   5,					/* MMX or SSE register to integer */
1697   32,					/* size of l1 cache.  */
1698   256,					/* size of l2 cache.  */
1699   64,					/* size of prefetch block */
1700   6,					/* number of parallel prefetches */
1701   3,					/* Branch cost */
1702   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1703   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1704   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1705   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1706   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1707   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1708   {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709    {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 	  {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711   {{libcall, {{8, loop}, {15, unrolled_loop},
1712 	  {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713    {libcall, {{24, loop}, {32, unrolled_loop},
1714 	  {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715   1,					/* scalar_stmt_cost.  */
1716   1,					/* scalar load_cost.  */
1717   1,					/* scalar_store_cost.  */
1718   1,					/* vec_stmt_cost.  */
1719   1,					/* vec_to_scalar_cost.  */
1720   1,					/* scalar_to_vec_cost.  */
1721   1,					/* vec_align_load_cost.  */
1722   2,					/* vec_unalign_load_cost.  */
1723   1,					/* vec_store_cost.  */
1724   3,					/* cond_taken_branch_cost.  */
1725   1,					/* cond_not_taken_branch_cost.  */
1726 };
1727 
1728 /* Generic64 should produce code tuned for Nocona and K8.  */
1729 static const
1730 struct processor_costs generic64_cost = {
1731   COSTS_N_INSNS (1),			/* cost of an add instruction */
1732   /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1733      this cost, however, our current implementation of synth_mult results in
1734      the use of unnecessary temporary registers, causing regressions on several
1735      SPECfp benchmarks.  */
1736   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1737   COSTS_N_INSNS (1),			/* variable shift costs */
1738   COSTS_N_INSNS (1),			/* constant shift costs */
1739   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1740    COSTS_N_INSNS (4),			/*				 HI */
1741    COSTS_N_INSNS (3),			/*				 SI */
1742    COSTS_N_INSNS (4),			/*				 DI */
1743    COSTS_N_INSNS (2)},			/*			      other */
1744   0,					/* cost of multiply per each bit set */
1745   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1746    COSTS_N_INSNS (26),			/*			    HI */
1747    COSTS_N_INSNS (42),			/*			    SI */
1748    COSTS_N_INSNS (74),			/*			    DI */
1749    COSTS_N_INSNS (74)},			/*			    other */
1750   COSTS_N_INSNS (1),			/* cost of movsx */
1751   COSTS_N_INSNS (1),			/* cost of movzx */
1752   8,					/* "large" insn */
1753   17,					/* MOVE_RATIO */
1754   4,				     /* cost for loading QImode using movzbl */
1755   {4, 4, 4},				/* cost of loading integer registers
1756 					   in QImode, HImode and SImode.
1757 					   Relative to reg-reg move (2).  */
1758   {4, 4, 4},				/* cost of storing integer registers */
1759   4,					/* cost of reg,reg fld/fst */
1760   {12, 12, 12},				/* cost of loading fp registers
1761 					   in SFmode, DFmode and XFmode */
1762   {6, 6, 8},				/* cost of storing fp registers
1763 					   in SFmode, DFmode and XFmode */
1764   2,					/* cost of moving MMX register */
1765   {8, 8},				/* cost of loading MMX registers
1766 					   in SImode and DImode */
1767   {8, 8},				/* cost of storing MMX registers
1768 					   in SImode and DImode */
1769   2,					/* cost of moving SSE register */
1770   {8, 8, 8},				/* cost of loading SSE registers
1771 					   in SImode, DImode and TImode */
1772   {8, 8, 8},				/* cost of storing SSE registers
1773 					   in SImode, DImode and TImode */
1774   5,					/* MMX or SSE register to integer */
1775   32,					/* size of l1 cache.  */
1776   512,					/* size of l2 cache.  */
1777   64,					/* size of prefetch block */
1778   6,					/* number of parallel prefetches */
1779   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1780      value is increased to the perhaps more appropriate value of 5.  */
1781   3,					/* Branch cost */
1782   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1783   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1784   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1785   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1786   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1787   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1788   {DUMMY_STRINGOP_ALGS,
1789    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790   {DUMMY_STRINGOP_ALGS,
1791    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792   1,					/* scalar_stmt_cost.  */
1793   1,					/* scalar load_cost.  */
1794   1,					/* scalar_store_cost.  */
1795   1,					/* vec_stmt_cost.  */
1796   1,					/* vec_to_scalar_cost.  */
1797   1,					/* scalar_to_vec_cost.  */
1798   1,					/* vec_align_load_cost.  */
1799   2,					/* vec_unalign_load_cost.  */
1800   1,					/* vec_store_cost.  */
1801   3,					/* cond_taken_branch_cost.  */
1802   1,					/* cond_not_taken_branch_cost.  */
1803 };
1804 
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806    Athlon and K8.  */
1807 static const
1808 struct processor_costs generic32_cost = {
1809   COSTS_N_INSNS (1),			/* cost of an add instruction */
1810   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1811   COSTS_N_INSNS (1),			/* variable shift costs */
1812   COSTS_N_INSNS (1),			/* constant shift costs */
1813   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1814    COSTS_N_INSNS (4),			/*				 HI */
1815    COSTS_N_INSNS (3),			/*				 SI */
1816    COSTS_N_INSNS (4),			/*				 DI */
1817    COSTS_N_INSNS (2)},			/*			      other */
1818   0,					/* cost of multiply per each bit set */
1819   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1820    COSTS_N_INSNS (26),			/*			    HI */
1821    COSTS_N_INSNS (42),			/*			    SI */
1822    COSTS_N_INSNS (74),			/*			    DI */
1823    COSTS_N_INSNS (74)},			/*			    other */
1824   COSTS_N_INSNS (1),			/* cost of movsx */
1825   COSTS_N_INSNS (1),			/* cost of movzx */
1826   8,					/* "large" insn */
1827   17,					/* MOVE_RATIO */
1828   4,				     /* cost for loading QImode using movzbl */
1829   {4, 4, 4},				/* cost of loading integer registers
1830 					   in QImode, HImode and SImode.
1831 					   Relative to reg-reg move (2).  */
1832   {4, 4, 4},				/* cost of storing integer registers */
1833   4,					/* cost of reg,reg fld/fst */
1834   {12, 12, 12},				/* cost of loading fp registers
1835 					   in SFmode, DFmode and XFmode */
1836   {6, 6, 8},				/* cost of storing fp registers
1837 					   in SFmode, DFmode and XFmode */
1838   2,					/* cost of moving MMX register */
1839   {8, 8},				/* cost of loading MMX registers
1840 					   in SImode and DImode */
1841   {8, 8},				/* cost of storing MMX registers
1842 					   in SImode and DImode */
1843   2,					/* cost of moving SSE register */
1844   {8, 8, 8},				/* cost of loading SSE registers
1845 					   in SImode, DImode and TImode */
1846   {8, 8, 8},				/* cost of storing SSE registers
1847 					   in SImode, DImode and TImode */
1848   5,					/* MMX or SSE register to integer */
1849   32,					/* size of l1 cache.  */
1850   256,					/* size of l2 cache.  */
1851   64,					/* size of prefetch block */
1852   6,					/* number of parallel prefetches */
1853   3,					/* Branch cost */
1854   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1855   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1856   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1857   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1858   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1859   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1860   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861    DUMMY_STRINGOP_ALGS},
1862   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863    DUMMY_STRINGOP_ALGS},
1864   1,					/* scalar_stmt_cost.  */
1865   1,					/* scalar load_cost.  */
1866   1,					/* scalar_store_cost.  */
1867   1,					/* vec_stmt_cost.  */
1868   1,					/* vec_to_scalar_cost.  */
1869   1,					/* scalar_to_vec_cost.  */
1870   1,					/* vec_align_load_cost.  */
1871   2,					/* vec_unalign_load_cost.  */
1872   1,					/* vec_store_cost.  */
1873   3,					/* cond_taken_branch_cost.  */
1874   1,					/* cond_not_taken_branch_cost.  */
1875 };
1876 
1877 const struct processor_costs *ix86_cost = &pentium_cost;
1878 
1879 /* Processor feature/optimization bitmasks.  */
1880 #define m_386 (1<<PROCESSOR_I386)
1881 #define m_486 (1<<PROCESSOR_I486)
1882 #define m_PENT (1<<PROCESSOR_PENTIUM)
1883 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1884 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1885 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1886 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1887 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1888 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1889 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1890 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1891 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1892 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1893 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1894 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1895 #define m_ATOM (1<<PROCESSOR_ATOM)
1896 
1897 #define m_GEODE (1<<PROCESSOR_GEODE)
1898 #define m_K6 (1<<PROCESSOR_K6)
1899 #define m_K6_GEODE (m_K6 | m_GEODE)
1900 #define m_K8 (1<<PROCESSOR_K8)
1901 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1902 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1903 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1904 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1905 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1906 #define m_BDVER	(m_BDVER1 | m_BDVER2)
1907 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1908 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909 
1910 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1911 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912 
1913 /* Generic instruction choice should be common subset of supported CPUs
1914    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
1915 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916 
1917 /* Feature tests against the various tunings.  */
1918 unsigned char ix86_tune_features[X86_TUNE_LAST];
1919 
1920 /* Feature tests against the various tunings used to create ix86_tune_features
1921    based on the processor mask.  */
1922 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1923   /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1924      negatively, so enabling it for Generic64 seems like a good code size
1925      tradeoff.  We can't enable it for 32bit generic because it does not
1926      work well with PPro based chips.  */
1927   m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928 
1929   /* X86_TUNE_PUSH_MEMORY */
1930   m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931 
1932   /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1933   m_486 | m_PENT,
1934 
1935   /* X86_TUNE_UNROLL_STRLEN */
1936   m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1937 
1938   /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were added to the P4
1939      based on simulation results, but after the P4 shipped no performance
1940      benefit was observed from them, and they also increase code size.
1941      As a result, icc never generates branch hints.  */
1942   0,
1943 
1944   /* X86_TUNE_DOUBLE_WITH_ADD */
1945   ~m_386,
1946 
1947   /* X86_TUNE_USE_SAHF */
1948   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1949 
1950   /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1951      partial dependencies.  */
1952   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE  | m_GENERIC,
1953 
1954   /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1955      register stalls with the Generic32 compilation setting as well.  However,
1956      in the current implementation the partial register stalls are not eliminated
1957      very well - they can be introduced via subregs synthesized by combine
1958      and can happen in caller/callee saving sequences.  Because this option
1959      pays back little on PPro based chips and is in conflict with the partial reg
1960      dependencies used by Athlon/P4 based chips, it is better to leave it off
1961      for generic32 for now.  */
1962   m_PPRO,
1963 
1964   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1965   m_CORE2I7 | m_GENERIC,
1966 
1967   /* X86_TUNE_USE_HIMODE_FIOP */
1968   m_386 | m_486 | m_K6_GEODE,
1969 
1970   /* X86_TUNE_USE_SIMODE_FIOP */
1971   ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1972 
1973   /* X86_TUNE_USE_MOV0 */
1974   m_K6,
1975 
1976   /* X86_TUNE_USE_CLTD */
1977   ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1978 
1979   /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
1980   m_PENT4,
1981 
1982   /* X86_TUNE_SPLIT_LONG_MOVES */
1983   m_PPRO,
1984 
1985   /* X86_TUNE_READ_MODIFY_WRITE */
1986   ~m_PENT,
1987 
1988   /* X86_TUNE_READ_MODIFY */
1989   ~(m_PENT | m_PPRO),
1990 
1991   /* X86_TUNE_PROMOTE_QIMODE */
1992   m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1993 
1994   /* X86_TUNE_FAST_PREFIX */
1995   ~(m_386 | m_486 | m_PENT),
1996 
1997   /* X86_TUNE_SINGLE_STRINGOP */
1998   m_386 | m_P4_NOCONA,
1999 
2000   /* X86_TUNE_QIMODE_MATH */
2001   ~0,
2002 
2003   /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2004      register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL this option
2005      might be considered for Generic32 if our scheme for avoiding partial
2006      stalls was more effective.  */
2007   ~m_PPRO,
2008 
2009   /* X86_TUNE_PROMOTE_QI_REGS */
2010   0,
2011 
2012   /* X86_TUNE_PROMOTE_HI_REGS */
2013   m_PPRO,
2014 
2015   /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2016      over esp addition.  */
2017   m_386 | m_486 | m_PENT | m_PPRO,
2018 
2019   /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2020      over esp addition.  */
2021   m_PENT,
2022 
2023   /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2024      over esp subtraction.  */
2025   m_386 | m_486 | m_PENT | m_K6_GEODE,
2026 
2027   /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2028      over esp subtraction.  */
2029   m_PENT | m_K6_GEODE,
2030 
2031   /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2032      for DFmode copies */
2033   ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2034 
2035   /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2036   m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2037 
2038   /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2039      conflict here between PPro/Pentium4 based chips that treat 128bit
2040      SSE registers as single units and K8 based chips that divide SSE
2041      registers into two 64bit halves.  This knob promotes all store destinations
2042      to be 128bit to allow register renaming on 128bit SSE units, but usually
2043      results in one extra microop on 64bit SSE units.  Experimental results
2044      show that disabling this option on P4 brings over 20% SPECfp regression,
2045      while enabling it on K8 brings roughly a 2.4% regression that can be partly
2046      masked by careful scheduling of moves.  */
2047   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2048 
2049   /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2050   m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2051 
2052   /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2053   m_COREI7 | m_BDVER,
2054 
2055   /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2056   m_BDVER,
2057 
2058   /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2059      are resolved on SSE register parts instead of whole registers, so we may
2060      maintain just the lower part of scalar values in the proper format, leaving
2061      the upper part undefined.  */
2062   m_ATHLON_K8,
2063 
2064   /* X86_TUNE_SSE_TYPELESS_STORES */
2065   m_AMD_MULTIPLE,
2066 
2067   /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2068   m_PPRO | m_P4_NOCONA,
2069 
2070   /* X86_TUNE_MEMORY_MISMATCH_STALL */
2071   m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2072 
2073   /* X86_TUNE_PROLOGUE_USING_MOVE */
2074   m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2075 
2076   /* X86_TUNE_EPILOGUE_USING_MOVE */
2077   m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2078 
2079   /* X86_TUNE_SHIFT1 */
2080   ~m_486,
2081 
2082   /* X86_TUNE_USE_FFREEP */
2083   m_AMD_MULTIPLE,
2084 
2085   /* X86_TUNE_INTER_UNIT_MOVES */
2086   ~(m_AMD_MULTIPLE | m_GENERIC),
2087 
2088   /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2089   ~(m_AMDFAM10 | m_BDVER),
2090 
2091   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2092      than 4 branch instructions in the 16 byte window.  */
2093   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2094 
2095   /* X86_TUNE_SCHEDULE */
2096   m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2097 
2098   /* X86_TUNE_USE_BT */
2099   m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2100 
2101   /* X86_TUNE_USE_INCDEC */
2102   ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2103 
2104   /* X86_TUNE_PAD_RETURNS */
2105   m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2106 
2107   /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions.  */
2108   m_ATOM,
2109 
2110   /* X86_TUNE_EXT_80387_CONSTANTS */
2111   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2112 
2113   /* X86_TUNE_SHORTEN_X87_SSE */
2114   ~m_K8,
2115 
2116   /* X86_TUNE_AVOID_VECTOR_DECODE */
2117   m_CORE2I7_64 | m_K8 | m_GENERIC64,
2118 
2119   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2120      and SImode multiplies, but the 386 and 486 do HImode multiplies faster.  */
2121   ~(m_386 | m_486),
2122 
2123   /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2124      vector path on AMD machines.  */
2125   m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2126 
2127   /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2128      machines.  */
2129   m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2130 
2131   /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2132      than a MOV.  */
2133   m_PENT,
2134 
2135   /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2136      but one byte longer.  */
2137   m_PENT,
2138 
2139   /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2140      operand that cannot be represented using a modRM byte.  The XOR
2141      replacement is long decoded, so this split helps here as well.  */
2142   m_K6,
2143 
2144   /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2145      from FP to FP. */
2146   m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2147 
2148   /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2149      from integer to FP. */
2150   m_AMDFAM10,
2151 
2152   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2153      with a subsequent conditional jump instruction into a single
2154      compare-and-branch uop.  */
2155   m_BDVER,
2156 
2157   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2158      will impact LEA instruction selection. */
2159   m_ATOM,
2160 
2161   /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2162      instructions.  */
2163   ~m_ATOM,
2164 
2165   /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2166      at -O3.  For the moment, the prefetching seems badly tuned for Intel
2167      chips.  */
2168   m_K6_GEODE | m_AMD_MULTIPLE,
2169 
2170   /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2171      the auto-vectorizer.  */
2172   m_BDVER,
2173 
2174   /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2175      during reassociation of integer computation.  */
2176   m_ATOM,
2177 
2178   /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2179      during reassociation of fp computation.  */
2180   m_ATOM
2181 };
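/* A sketch of how the masks above are consumed: ix86_option_override_internal
   (later in this file) computes the bit for the active -mtune processor and
   tests each entry against it, roughly

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   so an entry such as m_PENT | m_K6_GEODE simply lists the processors for
   which the corresponding X86_TUNE_* flag is enabled.  */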
2182 
2183 /* Feature tests against the various architecture variations.  */
2184 unsigned char ix86_arch_features[X86_ARCH_LAST];
2185 
2186 /* Feature tests against the various architecture variations, used to create
2187    ix86_arch_features based on the processor mask.  */
2188 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2189   /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
2190   ~(m_386 | m_486 | m_PENT | m_K6),
2191 
2192   /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
2193   ~m_386,
2194 
2195   /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2196   ~(m_386 | m_486),
2197 
2198   /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
2199   ~m_386,
2200 
2201   /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
2202   ~m_386,
2203 };
2204 
2205 static const unsigned int x86_accumulate_outgoing_args
2206   = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2207 
2208 static const unsigned int x86_arch_always_fancy_math_387
2209   = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2210 
2211 static const unsigned int x86_avx256_split_unaligned_load
2212   = m_COREI7 | m_GENERIC;
2213 
2214 static const unsigned int x86_avx256_split_unaligned_store
2215   = m_COREI7 | m_BDVER | m_GENERIC;
2216 
2217 /* In case the average insn count for a single function invocation is
2218    lower than this constant, emit fast (but longer) prologue and
2219    epilogue code.  */
2220 #define FAST_PROLOGUE_INSN_COUNT 20
2221 
2222 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
2223 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2224 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2225 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2226 
2227 /* Array of the smallest class containing reg number REGNO, indexed by
2228    REGNO.  Used by REGNO_REG_CLASS in i386.h.  */
2229 
2230 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2231 {
2232   /* ax, dx, cx, bx */
2233   AREG, DREG, CREG, BREG,
2234   /* si, di, bp, sp */
2235   SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2236   /* FP registers */
2237   FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2238   FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2239   /* arg pointer */
2240   NON_Q_REGS,
2241   /* flags, fpsr, fpcr, frame */
2242   NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2243   /* SSE registers */
2244   SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2245   SSE_REGS, SSE_REGS,
2246   /* MMX registers */
2247   MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2248   MMX_REGS, MMX_REGS,
2249   /* REX registers */
2250   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2252   /* SSE REX registers */
2253   SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2254   SSE_REGS, SSE_REGS,
2255 };
2256 
2257 /* The "default" register map used in 32bit mode.  */
2258 
2259 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2260 {
2261   0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
2262   12, 13, 14, 15, 16, 17, 18, 19,	/* fp regs */
2263   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
2264   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE */
2265   29, 30, 31, 32, 33, 34, 35, 36,       /* MMX */
2266   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
2267   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
2268 };
2269 
2270 /* The "default" register map used in 64bit mode.  */
2271 
2272 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2273 {
2274   0, 1, 2, 3, 4, 5, 6, 7,		/* general regs */
2275   33, 34, 35, 36, 37, 38, 39, 40,	/* fp regs */
2276   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
2277   17, 18, 19, 20, 21, 22, 23, 24,	/* SSE */
2278   41, 42, 43, 44, 45, 46, 47, 48,       /* MMX */
2279   8,9,10,11,12,13,14,15,		/* extended integer registers */
2280   25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
2281 };
2282 
2283 /* Define the register numbers to be used in Dwarf debugging information.
2284    The SVR4 reference port C compiler uses the following register numbers
2285    in its Dwarf output code:
2286 	0 for %eax (gcc regno = 0)
2287 	1 for %ecx (gcc regno = 2)
2288 	2 for %edx (gcc regno = 1)
2289 	3 for %ebx (gcc regno = 3)
2290 	4 for %esp (gcc regno = 7)
2291 	5 for %ebp (gcc regno = 6)
2292 	6 for %esi (gcc regno = 4)
2293 	7 for %edi (gcc regno = 5)
2294    The following three DWARF register numbers are never generated by
2295    the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2296    believes these numbers have these meanings.
2297 	8  for %eip    (no gcc equivalent)
2298 	9  for %eflags (gcc regno = 17)
2299 	10 for %trapno (no gcc equivalent)
2300    It is not at all clear how we should number the FP stack registers
2301    for the x86 architecture.  If the version of SDB on x86/svr4 were
2302    a bit less brain dead with respect to floating-point then we would
2303    have a precedent to follow with respect to DWARF register numbers
2304    for x86 FP registers, but the SDB on x86/svr4 is so completely
2305    broken with respect to FP registers that it is hardly worth thinking
2306    of it as something to strive for compatibility with.
2307    The version of x86/svr4 SDB I have at the moment does (partially)
2308    seem to believe that DWARF register number 11 is associated with
2309    the x86 register %st(0), but that's about all.  Higher DWARF
2310    register numbers don't seem to be associated with anything in
2311    particular, and even for DWARF regno 11, SDB only seems to under-
2312    stand that it should say that a variable lives in %st(0) (when
2313    asked via an `=' command) if we said it was in DWARF regno 11,
2314    but SDB still prints garbage when asked for the value of the
2315    variable in question (via a `/' command).
2316    (Also note that the labels SDB prints for various FP stack regs
2317    when doing an `x' command are all wrong.)
2318    Note that these problems generally don't affect the native SVR4
2319    C compiler because it doesn't allow the use of -O with -g and
2320    because when it is *not* optimizing, it allocates a memory
2321    location for each floating-point variable, and the memory
2322    location is what gets described in the DWARF AT_location
2323    attribute for the variable in question.
2324    Regardless of the severe mental illness of the x86/svr4 SDB, we
2325    do something sensible here and we use the following DWARF
2326    register numbers.  Note that these are all stack-top-relative
2327    numbers.
2328 	11 for %st(0) (gcc regno = 8)
2329 	12 for %st(1) (gcc regno = 9)
2330 	13 for %st(2) (gcc regno = 10)
2331 	14 for %st(3) (gcc regno = 11)
2332 	15 for %st(4) (gcc regno = 12)
2333 	16 for %st(5) (gcc regno = 13)
2334 	17 for %st(6) (gcc regno = 14)
2335 	18 for %st(7) (gcc regno = 15)
2336 */
2337 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2338 {
2339   0, 2, 1, 3, 6, 7, 5, 4,		/* general regs */
2340   11, 12, 13, 14, 15, 16, 17, 18,	/* fp regs */
2341   -1, 9, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
2342   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE registers */
2343   29, 30, 31, 32, 33, 34, 35, 36,	/* MMX registers */
2344   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
2345   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
2346 };
2347 
2348 /* Define parameter passing and return registers.  */
2349 
2350 static int const x86_64_int_parameter_registers[6] =
2351 {
2352   DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2353 };
2354 
2355 static int const x86_64_ms_abi_int_parameter_registers[4] =
2356 {
2357   CX_REG, DX_REG, R8_REG, R9_REG
2358 };
2359 
2360 static int const x86_64_int_return_registers[4] =
2361 {
2362   AX_REG, DX_REG, DI_REG, SI_REG
2363 };
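/* For illustration of the register orders above: under the SysV AMD64 ABI
   the first two integer arguments of a call arrive in %rdi and %rsi, while
   the MS ABI uses %rcx and %rdx; integer values are returned in %rax (with
   %rdx holding the high half of a 128-bit result).  */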
2364 
2365 /* Define the structure for the machine field in struct function.  */
2366 
2367 struct GTY(()) stack_local_entry {
2368   unsigned short mode;
2369   unsigned short n;
2370   rtx rtl;
2371   struct stack_local_entry *next;
2372 };
2373 
2374 /* Structure describing stack frame layout.
2375    Stack grows downward:
2376 
2377    [arguments]
2378 					<- ARG_POINTER
2379    saved pc
2380 
2381    saved static chain			if ix86_static_chain_on_stack
2382 
2383    saved frame pointer			if frame_pointer_needed
2384 					<- HARD_FRAME_POINTER
2385    [saved regs]
2386 					<- regs_save_offset
2387    [padding0]
2388 
2389    [saved SSE regs]
2390 					<- sse_regs_save_offset
2391    [padding1]          |
2392 		       |		<- FRAME_POINTER
2393    [va_arg registers]  |
2394 		       |
2395    [frame]	       |
2396 		       |
2397    [padding2]	       | = to_allocate
2398 					<- STACK_POINTER
2399   */
2400 struct ix86_frame
2401 {
2402   int nsseregs;
2403   int nregs;
2404   int va_arg_size;
2405   int red_zone_size;
2406   int outgoing_arguments_size;
2407   HOST_WIDE_INT frame;
2408 
2409   /* The offsets relative to ARG_POINTER.  */
2410   HOST_WIDE_INT frame_pointer_offset;
2411   HOST_WIDE_INT hard_frame_pointer_offset;
2412   HOST_WIDE_INT stack_pointer_offset;
2413   HOST_WIDE_INT hfp_save_offset;
2414   HOST_WIDE_INT reg_save_offset;
2415   HOST_WIDE_INT sse_reg_save_offset;
2416 
2417   /* When save_regs_using_mov is set, emit prologue using
2418      move instead of push instructions.  */
2419   bool save_regs_using_mov;
2420 };
2421 
2422 /* Which cpu are we scheduling for.  */
2423 enum attr_cpu ix86_schedule;
2424 
2425 /* Which cpu are we optimizing for.  */
2426 enum processor_type ix86_tune;
2427 
2428 /* Which instruction set architecture to use.  */
2429 enum processor_type ix86_arch;
2430 
2431 /* True if processor has SSE prefetch instruction.  */
2432 int x86_prefetch_sse;
2433 
2434 /* True if processor has prefetchw instruction.  */
2435 int x86_prefetchw;
2436 
2437 /* -mstackrealign option */
2438 static const char ix86_force_align_arg_pointer_string[]
2439   = "force_align_arg_pointer";
2440 
2441 static rtx (*ix86_gen_leave) (void);
2442 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2445 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2446 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2449 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2450 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
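/* These generator hooks are filled in by ix86_option_override_internal
   according to the target word size; a rough sketch of what happens there
   (illustrative only, see the actual assignments further down):

     ix86_gen_leave = TARGET_64BIT ? gen_leave_rex64 : gen_leave;
     ix86_gen_add3  = TARGET_64BIT ? gen_adddi3 : gen_addsi3;  */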
2451 
2452 /* Preferred alignment for stack boundary in bits.  */
2453 unsigned int ix86_preferred_stack_boundary;
2454 
2455 /* Alignment for incoming stack boundary in bits specified at
2456    command line.  */
2457 static unsigned int ix86_user_incoming_stack_boundary;
2458 
2459 /* Default alignment for incoming stack boundary in bits.  */
2460 static unsigned int ix86_default_incoming_stack_boundary;
2461 
2462 /* Alignment for incoming stack boundary in bits.  */
2463 unsigned int ix86_incoming_stack_boundary;
2464 
2465 /* Calling abi specific va_list type nodes.  */
2466 static GTY(()) tree sysv_va_list_type_node;
2467 static GTY(()) tree ms_va_list_type_node;
2468 
2469 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL.  */
2470 char internal_label_prefix[16];
2471 int internal_label_prefix_len;
2472 
2473 /* Fence to use after loop using movnt.  */
2474 tree x86_mfence;
2475 
2476 /* Register class used for passing a given 64bit part of the argument.
2477    These represent classes as documented by the psABI, with the exception
2478    of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2479    uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2480 
2481    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2482    whenever possible (the upper half contains padding).  */
2483 enum x86_64_reg_class
2484   {
2485     X86_64_NO_CLASS,
2486     X86_64_INTEGER_CLASS,
2487     X86_64_INTEGERSI_CLASS,
2488     X86_64_SSE_CLASS,
2489     X86_64_SSESF_CLASS,
2490     X86_64_SSEDF_CLASS,
2491     X86_64_SSEUP_CLASS,
2492     X86_64_X87_CLASS,
2493     X86_64_X87UP_CLASS,
2494     X86_64_COMPLEX_X87_CLASS,
2495     X86_64_MEMORY_CLASS
2496   };
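/* A hedged example of the psABI classification implemented later by
   classify_argument: a struct { double d; long l; } occupies two eightbytes
   and is classified as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }, so D
   is passed in an SSE register and L in a general-purpose register.  */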
2497 
2498 #define MAX_CLASSES 4
2499 
2500 /* Table of constants used by fldpi, fldln2, etc....  */
2501 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2502 static bool ext_80387_constants_init = 0;
2503 
2504 
2505 static struct machine_function * ix86_init_machine_status (void);
2506 static rtx ix86_function_value (const_tree, const_tree, bool);
2507 static bool ix86_function_value_regno_p (const unsigned int);
2508 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2509 						const_tree);
2510 static rtx ix86_static_chain (const_tree, bool);
2511 static int ix86_function_regparm (const_tree, const_tree);
2512 static void ix86_compute_frame_layout (struct ix86_frame *);
2513 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2514 						 rtx, rtx, int);
2515 static void ix86_add_new_builtins (HOST_WIDE_INT);
2516 static tree ix86_canonical_va_list_type (tree);
2517 static void predict_jump (int);
2518 static unsigned int split_stack_prologue_scratch_regno (void);
2519 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2520 
2521 enum ix86_function_specific_strings
2522 {
2523   IX86_FUNCTION_SPECIFIC_ARCH,
2524   IX86_FUNCTION_SPECIFIC_TUNE,
2525   IX86_FUNCTION_SPECIFIC_MAX
2526 };
2527 
2528 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2529 				 const char *, enum fpmath_unit, bool);
2530 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2531 static void ix86_function_specific_save (struct cl_target_option *);
2532 static void ix86_function_specific_restore (struct cl_target_option *);
2533 static void ix86_function_specific_print (FILE *, int,
2534 					  struct cl_target_option *);
2535 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2536 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2537 						 struct gcc_options *);
2538 static bool ix86_can_inline_p (tree, tree);
2539 static void ix86_set_current_function (tree);
2540 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2541 
2542 static enum calling_abi ix86_function_abi (const_tree);
2543 
2544 
2545 #ifndef SUBTARGET32_DEFAULT_CPU
2546 #define SUBTARGET32_DEFAULT_CPU "i386"
2547 #endif
2548 
2549 /* The svr4 ABI for the i386 says that records and unions are returned
2550    in memory.  */
2551 #ifndef DEFAULT_PCC_STRUCT_RETURN
2552 #define DEFAULT_PCC_STRUCT_RETURN 1
2553 #endif
2554 
2555 /* Whether -mtune= or -march= were specified */
2556 static int ix86_tune_defaulted;
2557 static int ix86_arch_specified;
2558 
2559 /* Vectorization library interface and handlers.  */
2560 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2561 
2562 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2563 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
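/* ix86_veclib_handler is pointed at one of the two handlers above when
   -mveclibabi=svml or -mveclibabi=acml is given (see the option handling
   below); the vectorizer then uses it to map builtins such as sin and cos
   to the corresponding vectorized library entry points.  */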
2564 
2565 /* Processor target table, indexed by processor number */
2566 struct ptt
2567 {
2568   const struct processor_costs *cost;		/* Processor costs */
2569   const int align_loop;				/* Default alignments.  */
2570   const int align_loop_max_skip;
2571   const int align_jump;
2572   const int align_jump_max_skip;
2573   const int align_func;
2574 };
2575 
2576 static const struct ptt processor_target_table[PROCESSOR_max] =
2577 {
2578   {&i386_cost, 4, 3, 4, 3, 4},
2579   {&i486_cost, 16, 15, 16, 15, 16},
2580   {&pentium_cost, 16, 7, 16, 7, 16},
2581   {&pentiumpro_cost, 16, 15, 16, 10, 16},
2582   {&geode_cost, 0, 0, 0, 0, 0},
2583   {&k6_cost, 32, 7, 32, 7, 32},
2584   {&athlon_cost, 16, 7, 16, 7, 16},
2585   {&pentium4_cost, 0, 0, 0, 0, 0},
2586   {&k8_cost, 16, 7, 16, 7, 16},
2587   {&nocona_cost, 0, 0, 0, 0, 0},
2588   /* Core 2 32-bit.  */
2589   {&generic32_cost, 16, 10, 16, 10, 16},
2590   /* Core 2 64-bit.  */
2591   {&generic64_cost, 16, 10, 16, 10, 16},
2592   /* Core i7 32-bit.  */
2593   {&generic32_cost, 16, 10, 16, 10, 16},
2594   /* Core i7 64-bit.  */
2595   {&generic64_cost, 16, 10, 16, 10, 16},
2596   {&generic32_cost, 16, 7, 16, 7, 16},
2597   {&generic64_cost, 16, 10, 16, 10, 16},
2598   {&amdfam10_cost, 32, 24, 32, 7, 32},
2599   {&bdver1_cost, 32, 24, 32, 7, 32},
2600   {&bdver2_cost, 32, 24, 32, 7, 32},
2601   {&btver1_cost, 32, 24, 32, 7, 32},
2602   {&atom_cost, 16, 15, 16, 7, 16}
2603 };
2604 
2605 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2606 {
2607   "generic",
2608   "i386",
2609   "i486",
2610   "pentium",
2611   "pentium-mmx",
2612   "pentiumpro",
2613   "pentium2",
2614   "pentium3",
2615   "pentium4",
2616   "pentium-m",
2617   "prescott",
2618   "nocona",
2619   "core2",
2620   "corei7",
2621   "atom",
2622   "geode",
2623   "k6",
2624   "k6-2",
2625   "k6-3",
2626   "athlon",
2627   "athlon-4",
2628   "k8",
2629   "amdfam10",
2630   "bdver1",
2631   "bdver2",
2632   "btver1"
2633 };
2634 
2635 /* Return true if a red-zone is in use.  */
2636 
2637 static inline bool
2638 ix86_using_red_zone (void)
2639 {
2640   return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2641 }
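/* (The red zone is the 128-byte area below the stack pointer that the SysV
   AMD64 ABI guarantees is not clobbered by signal or interrupt handlers;
   the MS ABI provides no such guarantee, hence the exclusion above.)  */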
2642 
2643 /* Return a string that documents the current -m options.  The caller is
2644    responsible for freeing the string.  */
2645 
2646 static char *
2647 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2648 		    const char *tune, enum fpmath_unit fpmath,
2649 		    bool add_nl_p)
2650 {
2651   struct ix86_target_opts
2652   {
2653     const char *option;		/* option string */
2654     HOST_WIDE_INT mask;		/* isa mask options */
2655   };
2656 
2657   /* This table is ordered so that options like -msse4.2 that imply
2658      preceding options are matched first.  */
2659   static struct ix86_target_opts isa_opts[] =
2660   {
2661     { "-m64",		OPTION_MASK_ISA_64BIT },
2662     { "-mfma4",		OPTION_MASK_ISA_FMA4 },
2663     { "-mfma",		OPTION_MASK_ISA_FMA },
2664     { "-mxop",		OPTION_MASK_ISA_XOP },
2665     { "-mlwp",		OPTION_MASK_ISA_LWP },
2666     { "-msse4a",	OPTION_MASK_ISA_SSE4A },
2667     { "-msse4.2",	OPTION_MASK_ISA_SSE4_2 },
2668     { "-msse4.1",	OPTION_MASK_ISA_SSE4_1 },
2669     { "-mssse3",	OPTION_MASK_ISA_SSSE3 },
2670     { "-msse3",		OPTION_MASK_ISA_SSE3 },
2671     { "-msse2",		OPTION_MASK_ISA_SSE2 },
2672     { "-msse",		OPTION_MASK_ISA_SSE },
2673     { "-m3dnow",	OPTION_MASK_ISA_3DNOW },
2674     { "-m3dnowa",	OPTION_MASK_ISA_3DNOW_A },
2675     { "-mmmx",		OPTION_MASK_ISA_MMX },
2676     { "-mabm",		OPTION_MASK_ISA_ABM },
2677     { "-mbmi",		OPTION_MASK_ISA_BMI },
2678     { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
2679     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
2680     { "-mtbm",		OPTION_MASK_ISA_TBM },
2681     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
2682     { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
2683     { "-mcrc32",	OPTION_MASK_ISA_CRC32 },
2684     { "-maes",		OPTION_MASK_ISA_AES },
2685     { "-mpclmul",	OPTION_MASK_ISA_PCLMUL },
2686     { "-mfsgsbase",	OPTION_MASK_ISA_FSGSBASE },
2687     { "-mrdrnd",	OPTION_MASK_ISA_RDRND },
2688     { "-mf16c",		OPTION_MASK_ISA_F16C },
2689   };
2690 
2691   /* Flag options.  */
2692   static struct ix86_target_opts flag_opts[] =
2693   {
2694     { "-m128bit-long-double",		MASK_128BIT_LONG_DOUBLE },
2695     { "-m80387",			MASK_80387 },
2696     { "-maccumulate-outgoing-args",	MASK_ACCUMULATE_OUTGOING_ARGS },
2697     { "-malign-double",			MASK_ALIGN_DOUBLE },
2698     { "-mcld",				MASK_CLD },
2699     { "-mfp-ret-in-387",		MASK_FLOAT_RETURNS },
2700     { "-mieee-fp",			MASK_IEEE_FP },
2701     { "-minline-all-stringops",		MASK_INLINE_ALL_STRINGOPS },
2702     { "-minline-stringops-dynamically",	MASK_INLINE_STRINGOPS_DYNAMICALLY },
2703     { "-mms-bitfields",			MASK_MS_BITFIELD_LAYOUT },
2704     { "-mno-align-stringops",		MASK_NO_ALIGN_STRINGOPS },
2705     { "-mno-fancy-math-387",		MASK_NO_FANCY_MATH_387 },
2706     { "-mno-push-args",			MASK_NO_PUSH_ARGS },
2707     { "-mno-red-zone",			MASK_NO_RED_ZONE },
2708     { "-momit-leaf-frame-pointer",	MASK_OMIT_LEAF_FRAME_POINTER },
2709     { "-mrecip",			MASK_RECIP },
2710     { "-mrtd",				MASK_RTD },
2711     { "-msseregparm",			MASK_SSEREGPARM },
2712     { "-mstack-arg-probe",		MASK_STACK_PROBE },
2713     { "-mtls-direct-seg-refs",		MASK_TLS_DIRECT_SEG_REFS },
2714     { "-mvect8-ret-in-mem",		MASK_VECT8_RETURNS },
2715     { "-m8bit-idiv",			MASK_USE_8BIT_IDIV },
2716     { "-mvzeroupper",			MASK_VZEROUPPER },
2717     { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2718     { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE},
2719     { "-mprefer-avx128",		MASK_PREFER_AVX128},
2720   };
2721 
2722   const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2723 
2724   char isa_other[40];
2725   char target_other[40];
2726   unsigned num = 0;
2727   unsigned i, j;
2728   char *ret;
2729   char *ptr;
2730   size_t len;
2731   size_t line_len;
2732   size_t sep_len;
2733 
2734   memset (opts, '\0', sizeof (opts));
2735 
2736   /* Add -march= option.  */
2737   if (arch)
2738     {
2739       opts[num][0] = "-march=";
2740       opts[num++][1] = arch;
2741     }
2742 
2743   /* Add -mtune= option.  */
2744   if (tune)
2745     {
2746       opts[num][0] = "-mtune=";
2747       opts[num++][1] = tune;
2748     }
2749 
2750   /* Pick out the ISA options that are set.  */
2751   for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2752     {
2753       if ((isa & isa_opts[i].mask) != 0)
2754 	{
2755 	  opts[num++][0] = isa_opts[i].option;
2756 	  isa &= ~ isa_opts[i].mask;
2757 	}
2758     }
2759 
2760   if (isa && add_nl_p)
2761     {
2762       opts[num++][0] = isa_other;
2763       sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2764 	       isa);
2765     }
2766 
2767   /* Add flag options.  */
2768   for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2769     {
2770       if ((flags & flag_opts[i].mask) != 0)
2771 	{
2772 	  opts[num++][0] = flag_opts[i].option;
2773 	  flags &= ~ flag_opts[i].mask;
2774 	}
2775     }
2776 
2777   if (flags && add_nl_p)
2778     {
2779       opts[num++][0] = target_other;
2780       sprintf (target_other, "(other flags: %#x)", flags);
2781     }
2782 
2783   /* Add -mfpmath= option.  */
2784   if (fpmath)
2785     {
2786       opts[num][0] = "-mfpmath=";
2787       switch ((int) fpmath)
2788 	{
2789 	case FPMATH_387:
2790 	  opts[num++][1] = "387";
2791 	  break;
2792 
2793 	case FPMATH_SSE:
2794 	  opts[num++][1] = "sse";
2795 	  break;
2796 
2797 	case FPMATH_387 | FPMATH_SSE:
2798 	  opts[num++][1] = "sse+387";
2799 	  break;
2800 
2801 	default:
2802 	  gcc_unreachable ();
2803 	}
2804     }
2805 
2806   /* Any options?  */
2807   if (num == 0)
2808     return NULL;
2809 
2810   gcc_assert (num < ARRAY_SIZE (opts));
2811 
2812   /* Size the string.  */
2813   len = 0;
2814   sep_len = (add_nl_p) ? 3 : 1;
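  /* Each separator is at most " \\\n" (3 bytes) when line wrapping is
     requested, otherwise a single space; the separator counted for the
     first entry is never emitted and leaves room for the trailing NUL.  */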
2815   for (i = 0; i < num; i++)
2816     {
2817       len += sep_len;
2818       for (j = 0; j < 2; j++)
2819 	if (opts[i][j])
2820 	  len += strlen (opts[i][j]);
2821     }
2822 
2823   /* Build the string.  */
2824   ret = ptr = (char *) xmalloc (len);
2825   line_len = 0;
2826 
2827   for (i = 0; i < num; i++)
2828     {
2829       size_t len2[2];
2830 
2831       for (j = 0; j < 2; j++)
2832 	len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2833 
2834       if (i != 0)
2835 	{
2836 	  *ptr++ = ' ';
2837 	  line_len++;
2838 
2839 	  if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2840 	    {
2841 	      *ptr++ = '\\';
2842 	      *ptr++ = '\n';
2843 	      line_len = 0;
2844 	    }
2845 	}
2846 
2847       for (j = 0; j < 2; j++)
2848 	if (opts[i][j])
2849 	  {
2850 	    memcpy (ptr, opts[i][j], len2[j]);
2851 	    ptr += len2[j];
2852 	    line_len += len2[j];
2853 	  }
2854     }
2855 
2856   *ptr = '\0';
2857   gcc_assert (ret + len >= ptr);
2858 
2859   return ret;
2860 }
2861 
2862 /* Return true if profiling code should be emitted before the
2863    prologue, and false otherwise.  Note: on x86, unsupported uses of
2864    the "hotfix" style (-mfentry) are diagnosed with sorry () elsewhere.  */
2865 static bool
2866 ix86_profile_before_prologue (void)
2867 {
2868   return flag_fentry != 0;
2869 }
2870 
2871 /* Function that is callable from the debugger to print the current
2872    options.  */
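/* For example, from a debugger attached to cc1 one would typically
   use "call ix86_debug_options ()"; the output goes to stderr.  */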
2873 void
2874 ix86_debug_options (void)
2875 {
2876   char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2877 				   ix86_arch_string, ix86_tune_string,
2878 				   ix86_fpmath, true);
2879 
2880   if (opts)
2881     {
2882       fprintf (stderr, "%s\n\n", opts);
2883       free (opts);
2884     }
2885   else
2886     fputs ("<no options>\n\n", stderr);
2887 
2888   return;
2889 }
2890 
2891 /* Override various settings based on options.  If MAIN_ARGS_P, the
2892    options are from the command line, otherwise they are from
2893    attributes.  */
2894 
2895 static void
2896 ix86_option_override_internal (bool main_args_p)
2897 {
2898   int i;
2899   unsigned int ix86_arch_mask, ix86_tune_mask;
2900   const bool ix86_tune_specified = (ix86_tune_string != NULL);
2901   const char *prefix;
2902   const char *suffix;
2903   const char *sw;
2904 
2905 #define PTA_3DNOW	 	(HOST_WIDE_INT_1 << 0)
2906 #define PTA_3DNOW_A	 	(HOST_WIDE_INT_1 << 1)
2907 #define PTA_64BIT		(HOST_WIDE_INT_1 << 2)
2908 #define PTA_ABM			(HOST_WIDE_INT_1 << 3)
2909 #define PTA_AES		 	(HOST_WIDE_INT_1 << 4)
2910 #define PTA_AVX			(HOST_WIDE_INT_1 << 5)
2911 #define PTA_BMI		 	(HOST_WIDE_INT_1 << 6)
2912 #define PTA_CX16		(HOST_WIDE_INT_1 << 7)
2913 #define PTA_F16C		(HOST_WIDE_INT_1 << 8)
2914 #define PTA_FMA			(HOST_WIDE_INT_1 << 9)
2915 #define PTA_FMA4	 	(HOST_WIDE_INT_1 << 10)
2916 #define PTA_FSGSBASE		(HOST_WIDE_INT_1 << 11)
2917 #define PTA_LWP		 	(HOST_WIDE_INT_1 << 12)
2918 #define PTA_LZCNT	 	(HOST_WIDE_INT_1 << 13)
2919 #define PTA_MMX			(HOST_WIDE_INT_1 << 14)
2920 #define PTA_MOVBE		(HOST_WIDE_INT_1 << 15)
2921 #define PTA_NO_SAHF		(HOST_WIDE_INT_1 << 16)
2922 #define PTA_PCLMUL		(HOST_WIDE_INT_1 << 17)
2923 #define PTA_POPCNT		(HOST_WIDE_INT_1 << 18)
2924 #define PTA_PREFETCH_SSE	(HOST_WIDE_INT_1 << 19)
2925 #define PTA_RDRND	 	(HOST_WIDE_INT_1 << 20)
2926 #define PTA_SSE			(HOST_WIDE_INT_1 << 21)
2927 #define PTA_SSE2		(HOST_WIDE_INT_1 << 22)
2928 #define PTA_SSE3		(HOST_WIDE_INT_1 << 23)
2929 #define PTA_SSE4_1	 	(HOST_WIDE_INT_1 << 24)
2930 #define PTA_SSE4_2	 	(HOST_WIDE_INT_1 << 25)
2931 #define PTA_SSE4A		(HOST_WIDE_INT_1 << 26)
2932 #define PTA_SSSE3		(HOST_WIDE_INT_1 << 27)
2933 #define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
2934 #define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
2935 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
2936 #define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
2937 #define PTA_PREFETCHW		(HOST_WIDE_INT_1 << 32)
2938 
2939 /* if this reaches 64, need to widen struct pta flags below */
2940 
2941   static struct pta
2942     {
2943       const char *const name;		/* processor name or nickname.  */
2944       const enum processor_type processor;
2945       const enum attr_cpu schedule;
2946       const unsigned HOST_WIDE_INT flags;
2947     }
2948   const processor_alias_table[] =
2949     {
2950       {"i386", PROCESSOR_I386, CPU_NONE, 0},
2951       {"i486", PROCESSOR_I486, CPU_NONE, 0},
2952       {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2953       {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2954       {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2955       {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2956       {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2957       {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2958       {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2959       {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2960       {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2961       {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2962       {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2963 	PTA_MMX | PTA_SSE},
2964       {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2965 	PTA_MMX | PTA_SSE},
2966       {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2967 	PTA_MMX | PTA_SSE | PTA_SSE2},
2968       {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2969 	PTA_MMX |PTA_SSE | PTA_SSE2},
2970       {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2971 	PTA_MMX | PTA_SSE | PTA_SSE2},
2972       {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2973 	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2974       {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2975 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 	| PTA_CX16 | PTA_NO_SAHF},
2977       {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2978 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 	| PTA_SSSE3 | PTA_CX16},
2980       {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2981 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2982 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2983       {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2984 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2985 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2986 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2987       {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2988 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2989 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2990 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2991 	| PTA_RDRND | PTA_F16C},
2992       {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2993 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2995 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2996 	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2997 	| PTA_FMA | PTA_MOVBE},
2998       {"atom", PROCESSOR_ATOM, CPU_ATOM,
2999 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3000 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3001       {"geode", PROCESSOR_GEODE, CPU_GEODE,
3002 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3003       {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3004       {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3005       {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3006       {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3007 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3008       {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3009 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3010       {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3011 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3012       {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3013 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3014       {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3015 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3016       {"x86-64", PROCESSOR_K8, CPU_K8,
3017 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3018       {"k8", PROCESSOR_K8, CPU_K8,
3019 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 	| PTA_SSE2 | PTA_NO_SAHF},
3021       {"k8-sse3", PROCESSOR_K8, CPU_K8,
3022 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024       {"opteron", PROCESSOR_K8, CPU_K8,
3025 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 	| PTA_SSE2 | PTA_NO_SAHF},
3027       {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3028 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030       {"athlon64", PROCESSOR_K8, CPU_K8,
3031 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 	| PTA_SSE2 | PTA_NO_SAHF},
3033       {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3034 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3036       {"athlon-fx", PROCESSOR_K8, CPU_K8,
3037 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 	| PTA_SSE2 | PTA_NO_SAHF},
3039       {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3040 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3041 	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3042       {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3043 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3044 	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3045       {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3046 	PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3047 	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3048 	| PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3049 	| PTA_FMA4 | PTA_XOP | PTA_LWP},
3050       {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3051 	PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3052 	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3053 	| PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3054 	| PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3055 	| PTA_FMA},
3056       {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3057 	PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3058 	| PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3059       {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3060 	0 /* flags are only used for -march switch.  */ },
3061       {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3062 	PTA_64BIT /* flags are only used for -march switch.  */ },
3063     };
3064 
3065   /* -mrecip options.  */
3066   static struct
3067     {
3068       const char *string;           /* option name */
3069       unsigned int mask;            /* mask bits to set */
3070     }
3071   const recip_options[] =
3072     {
3073       { "all",       RECIP_MASK_ALL },
3074       { "none",      RECIP_MASK_NONE },
3075       { "div",       RECIP_MASK_DIV },
3076       { "sqrt",      RECIP_MASK_SQRT },
3077       { "vec-div",   RECIP_MASK_VEC_DIV },
3078       { "vec-sqrt",  RECIP_MASK_VEC_SQRT },
3079     };
3080 
3081   int const pta_size = ARRAY_SIZE (processor_alias_table);
3082 
3083   /* Set up prefix/suffix so the error messages refer to either the command
3084      line argument, or the attribute(target).  */
3085   if (main_args_p)
3086     {
3087       prefix = "-m";
3088       suffix = "";
3089       sw = "switch";
3090     }
3091   else
3092     {
3093       prefix = "option(\"";
3094       suffix = "\")";
3095       sw = "attribute";
3096     }
3097 
3098 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3099   SUBTARGET_OVERRIDE_OPTIONS;
3100 #endif
3101 
3102 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3103   SUBSUBTARGET_OVERRIDE_OPTIONS;
3104 #endif
3105 
3106   if (TARGET_X32)
3107     ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3108 
3109   /* -fPIC is the default for x86_64.  */
3110   if (TARGET_MACHO && TARGET_64BIT)
3111     flag_pic = 2;
3112 
3113   /* Need to check -mtune=generic first.  */
3114   if (ix86_tune_string)
3115     {
3116       if (!strcmp (ix86_tune_string, "generic")
3117 	  || !strcmp (ix86_tune_string, "i686")
3118 	  /* As special support for cross compilers we read -mtune=native
3119 	     as -mtune=generic.  With native compilers we won't see the
3120 	     -mtune=native, as it was changed by the driver.  */
3121 	  || !strcmp (ix86_tune_string, "native"))
3122 	{
3123 	  if (TARGET_64BIT)
3124 	    ix86_tune_string = "generic64";
3125 	  else
3126 	    ix86_tune_string = "generic32";
3127 	}
3128       /* If this call is for setting the option attribute, allow the
3129 	 generic32/generic64 that was previously set.  */
3130       else if (!main_args_p
3131 	       && (!strcmp (ix86_tune_string, "generic32")
3132 		   || !strcmp (ix86_tune_string, "generic64")))
3133 	;
3134       else if (!strncmp (ix86_tune_string, "generic", 7))
3135         error ("bad value (%s) for %stune=%s %s",
3136 	       ix86_tune_string, prefix, suffix, sw);
3137       else if (!strcmp (ix86_tune_string, "x86-64"))
3138         warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3139                  "%stune=k8%s or %stune=generic%s instead as appropriate",
3140                  prefix, suffix, prefix, suffix, prefix, suffix);
3141     }
3142   else
3143     {
3144       if (ix86_arch_string)
3145 	ix86_tune_string = ix86_arch_string;
3146       if (!ix86_tune_string)
3147 	{
3148 	  ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3149 	  ix86_tune_defaulted = 1;
3150 	}
3151 
3152       /* ix86_tune_string is set to ix86_arch_string or defaulted.  We
3153 	 need to use a sensible tune option.  */
3154       if (!strcmp (ix86_tune_string, "generic")
3155 	  || !strcmp (ix86_tune_string, "x86-64")
3156 	  || !strcmp (ix86_tune_string, "i686"))
3157 	{
3158 	  if (TARGET_64BIT)
3159 	    ix86_tune_string = "generic64";
3160 	  else
3161 	    ix86_tune_string = "generic32";
3162 	}
3163     }
3164 
3165   if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3166     {
3167       /* rep; movq isn't available in 32-bit code.  */
3168       error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3169       ix86_stringop_alg = no_stringop;
3170     }
3171 
3172   if (!ix86_arch_string)
3173     ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3174   else
3175     ix86_arch_specified = 1;
3176 
3177   if (!global_options_set.x_ix86_abi)
3178     ix86_abi = DEFAULT_ABI;
3179 
3180   if (global_options_set.x_ix86_cmodel)
3181     {
3182       switch (ix86_cmodel)
3183 	{
3184 	case CM_SMALL:
3185 	case CM_SMALL_PIC:
3186 	  if (flag_pic)
3187 	    ix86_cmodel = CM_SMALL_PIC;
3188 	  if (!TARGET_64BIT)
3189 	    error ("code model %qs not supported in the %s bit mode",
3190 		   "small", "32");
3191 	  break;
3192 
3193 	case CM_MEDIUM:
3194 	case CM_MEDIUM_PIC:
3195 	  if (flag_pic)
3196 	    ix86_cmodel = CM_MEDIUM_PIC;
3197 	  if (!TARGET_64BIT)
3198 	    error ("code model %qs not supported in the %s bit mode",
3199 		   "medium", "32");
3200 	  else if (TARGET_X32)
3201 	    error ("code model %qs not supported in x32 mode",
3202 		   "medium");
3203 	  break;
3204 
3205 	case CM_LARGE:
3206 	case CM_LARGE_PIC:
3207 	  if (flag_pic)
3208 	    ix86_cmodel = CM_LARGE_PIC;
3209 	  if (!TARGET_64BIT)
3210 	    error ("code model %qs not supported in the %s bit mode",
3211 		   "large", "32");
3212 	  else if (TARGET_X32)
3213 	    error ("code model %qs not supported in x32 mode",
3214 		   "large");
3215 	  break;
3216 
3217 	case CM_32:
3218 	  if (flag_pic)
3219 	    error ("code model %s does not support PIC mode", "32");
3220 	  if (TARGET_64BIT)
3221 	    error ("code model %qs not supported in the %s bit mode",
3222 		   "32", "64");
3223 	  break;
3224 
3225 	case CM_KERNEL:
3226 	  if (flag_pic)
3227 	    {
3228 	      error ("code model %s does not support PIC mode", "kernel");
3229 	      ix86_cmodel = CM_32;
3230 	    }
3231 	  if (!TARGET_64BIT)
3232 	    error ("code model %qs not supported in the %s bit mode",
3233 		   "kernel", "32");
3234 	  break;
3235 
3236 	default:
3237 	  gcc_unreachable ();
3238 	}
3239     }
3240   else
3241     {
3242       /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3243 	 use of rip-relative addressing.  This eliminates fixups that
3244 	 would otherwise be needed if this object is to be placed in a
3245 	 DLL, and is essentially just as efficient as direct addressing.  */
3246       if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3247 	ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3248       else if (TARGET_64BIT)
3249 	ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3250       else
3251         ix86_cmodel = CM_32;
3252     }
3253   if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3254     {
3255       error ("-masm=intel not supported in this configuration");
3256       ix86_asm_dialect = ASM_ATT;
3257     }
3258   if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3259     sorry ("%i-bit mode not compiled in",
3260 	   (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3261 
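  /* Look up -march= in the processor alias table and enable every ISA
     the selected CPU implies, unless the user has already set that ISA
     explicitly (recorded in ix86_isa_flags_explicit).  */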
3262   for (i = 0; i < pta_size; i++)
3263     if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3264       {
3265 	ix86_schedule = processor_alias_table[i].schedule;
3266 	ix86_arch = processor_alias_table[i].processor;
3267 	/* Default cpu tuning to the architecture.  */
3268 	ix86_tune = ix86_arch;
3269 
3270 	if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3271 	  error ("CPU you selected does not support x86-64 "
3272 		 "instruction set");
3273 
3274 	if (processor_alias_table[i].flags & PTA_MMX
3275 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3276 	  ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3277 	if (processor_alias_table[i].flags & PTA_3DNOW
3278 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3279 	  ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3280 	if (processor_alias_table[i].flags & PTA_3DNOW_A
3281 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3282 	  ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3283 	if (processor_alias_table[i].flags & PTA_SSE
3284 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3285 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3286 	if (processor_alias_table[i].flags & PTA_SSE2
3287 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3288 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3289 	if (processor_alias_table[i].flags & PTA_SSE3
3290 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3291 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3292 	if (processor_alias_table[i].flags & PTA_SSSE3
3293 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3294 	  ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3295 	if (processor_alias_table[i].flags & PTA_SSE4_1
3296 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3297 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3298 	if (processor_alias_table[i].flags & PTA_SSE4_2
3299 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3300 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3301 	if (processor_alias_table[i].flags & PTA_AVX
3302 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3303 	  ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3304 	if (processor_alias_table[i].flags & PTA_AVX2
3305 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3306 	  ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3307 	if (processor_alias_table[i].flags & PTA_FMA
3308 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3309 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3310 	if (processor_alias_table[i].flags & PTA_SSE4A
3311 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3312 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3313 	if (processor_alias_table[i].flags & PTA_FMA4
3314 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3315 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3316 	if (processor_alias_table[i].flags & PTA_XOP
3317 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3318 	  ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3319 	if (processor_alias_table[i].flags & PTA_LWP
3320 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3321 	  ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3322 	if (processor_alias_table[i].flags & PTA_ABM
3323 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3324 	  ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3325 	if (processor_alias_table[i].flags & PTA_BMI
3326 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3327 	  ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3328 	if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3329 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3330 	  ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3331 	if (processor_alias_table[i].flags & PTA_TBM
3332 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3333 	  ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3334 	if (processor_alias_table[i].flags & PTA_BMI2
3335 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3336 	  ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3337 	if (processor_alias_table[i].flags & PTA_CX16
3338 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3339 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3340 	if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3341 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3342 	  ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3343 	if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3344 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3345 	  ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3346 	if (processor_alias_table[i].flags & PTA_MOVBE
3347 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3348 	  ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3349 	if (processor_alias_table[i].flags & PTA_AES
3350 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3351 	  ix86_isa_flags |= OPTION_MASK_ISA_AES;
3352 	if (processor_alias_table[i].flags & PTA_PCLMUL
3353 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3354 	  ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3355 	if (processor_alias_table[i].flags & PTA_FSGSBASE
3356 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3357 	  ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3358 	if (processor_alias_table[i].flags & PTA_RDRND
3359 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3360 	  ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3361 	if (processor_alias_table[i].flags & PTA_F16C
3362 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3363 	  ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3364 	if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3365 	  x86_prefetch_sse = true;
3366 	if (processor_alias_table[i].flags & PTA_PREFETCHW)
3367 	  x86_prefetchw = true;
3368 
3369 	break;
3370       }
3371 
3372   if (!strcmp (ix86_arch_string, "generic"))
3373     error ("generic CPU can be used only for %stune=%s %s",
3374 	   prefix, suffix, sw);
3375   else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3376     error ("bad value (%s) for %sarch=%s %s",
3377 	   ix86_arch_string, prefix, suffix, sw);
3378 
3379   ix86_arch_mask = 1u << ix86_arch;
3380   for (i = 0; i < X86_ARCH_LAST; ++i)
3381     ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3382 
3383   for (i = 0; i < pta_size; i++)
3384     if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3385       {
3386 	ix86_schedule = processor_alias_table[i].schedule;
3387 	ix86_tune = processor_alias_table[i].processor;
3388 	if (TARGET_64BIT)
3389 	  {
3390 	    if (!(processor_alias_table[i].flags & PTA_64BIT))
3391 	      {
3392 		if (ix86_tune_defaulted)
3393 		  {
3394 		    ix86_tune_string = "x86-64";
3395 		    for (i = 0; i < pta_size; i++)
3396 		      if (! strcmp (ix86_tune_string,
3397 				    processor_alias_table[i].name))
3398 			break;
3399 		    ix86_schedule = processor_alias_table[i].schedule;
3400 		    ix86_tune = processor_alias_table[i].processor;
3401 		  }
3402 		else
3403 		  error ("CPU you selected does not support x86-64 "
3404 			 "instruction set");
3405 	      }
3406 	  }
3407 	else
3408 	  {
3409 	    /* Adjust tuning when compiling for 32-bit ABI.  */
3410 	    switch (ix86_tune)
3411 	      {
3412 	      case PROCESSOR_GENERIC64:
3413 		ix86_tune = PROCESSOR_GENERIC32;
3414 		ix86_schedule = CPU_PENTIUMPRO;
3415 		break;
3416 
3417 	      case PROCESSOR_CORE2_64:
3418 		ix86_tune = PROCESSOR_CORE2_32;
3419 		break;
3420 
3421 	      case PROCESSOR_COREI7_64:
3422 		ix86_tune = PROCESSOR_COREI7_32;
3423 		break;
3424 
3425 	      default:
3426 		break;
3427 	      }
3428 	  }
3429 	/* Intel CPUs have always interpreted SSE prefetch instructions as
3430 	   NOPs; so, we can enable SSE prefetch instructions even when
3431 	   -mtune (rather than -march) points us to a processor that has them.
3432 	   However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3433 	   higher processors.  */
3434 	if (TARGET_CMOV
3435 	    && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3436 	  x86_prefetch_sse = true;
3437 	break;
3438       }
3439 
3440   if (ix86_tune_specified && i == pta_size)
3441     error ("bad value (%s) for %stune=%s %s",
3442 	   ix86_tune_string, prefix, suffix, sw);
3443 
3444   ix86_tune_mask = 1u << ix86_tune;
3445   for (i = 0; i < X86_TUNE_LAST; ++i)
3446     ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3447 
3448 #ifndef USE_IX86_FRAME_POINTER
3449 #define USE_IX86_FRAME_POINTER 0
3450 #endif
3451 
3452 #ifndef USE_X86_64_FRAME_POINTER
3453 #define USE_X86_64_FRAME_POINTER 0
3454 #endif
3455 
3456   /* Set the default values for switches whose default depends on TARGET_64BIT
3457      in case they weren't overwritten by command line options.  */
3458   if (TARGET_64BIT)
3459     {
3460       if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3461 	flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3462       if (flag_asynchronous_unwind_tables == 2)
3463 	flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3464       if (flag_pcc_struct_return == 2)
3465 	flag_pcc_struct_return = 0;
3466     }
3467   else
3468     {
3469       if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3470 	flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3471       if (flag_asynchronous_unwind_tables == 2)
3472 	flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3473       if (flag_pcc_struct_return == 2)
3474 	flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3475     }
3476 
3477   if (optimize_size)
3478     ix86_cost = &ix86_size_cost;
3479   else
3480     ix86_cost = processor_target_table[ix86_tune].cost;
3481 
3482   /* Arrange to set up i386_stack_locals for all functions.  */
3483   init_machine_status = ix86_init_machine_status;
3484 
3485   /* Validate -mregparm= value.  */
3486   if (global_options_set.x_ix86_regparm)
3487     {
3488       if (TARGET_64BIT)
3489 	warning (0, "-mregparm is ignored in 64-bit mode");
3490       if (ix86_regparm > REGPARM_MAX)
3491 	{
3492 	  error ("-mregparm=%d is not between 0 and %d",
3493 		 ix86_regparm, REGPARM_MAX);
3494 	  ix86_regparm = 0;
3495 	}
3496     }
3497   if (TARGET_64BIT)
3498     ix86_regparm = REGPARM_MAX;
3499 
3500   /* Default align_* from the processor table.  */
3501   if (align_loops == 0)
3502     {
3503       align_loops = processor_target_table[ix86_tune].align_loop;
3504       align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3505     }
3506   if (align_jumps == 0)
3507     {
3508       align_jumps = processor_target_table[ix86_tune].align_jump;
3509       align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3510     }
3511   if (align_functions == 0)
3512     {
3513       align_functions = processor_target_table[ix86_tune].align_func;
3514     }
3515 
3516   /* Provide default for -mbranch-cost= value.  */
3517   if (!global_options_set.x_ix86_branch_cost)
3518     ix86_branch_cost = ix86_cost->branch_cost;
3519 
3520   if (TARGET_64BIT)
3521     {
3522       target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3523 
3524       /* Enable by default the SSE and MMX builtins.  Do allow the user to
3525 	 explicitly disable any of these.  In particular, disabling SSE and
3526 	 MMX for kernel code is extremely useful.  */
3527       if (!ix86_arch_specified)
3528       ix86_isa_flags
3529 	|= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3530 	     | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3531 
3532       if (TARGET_RTD)
3533 	warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3534     }
3535   else
3536     {
3537       target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3538 
3539       if (!ix86_arch_specified)
3540       ix86_isa_flags
3541 	|= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3542 
3543       /* The i386 ABI does not specify a red zone.  It still makes sense to use one
3544          when the programmer takes care to keep the stack from being destroyed.  */
3545       if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3546         target_flags |= MASK_NO_RED_ZONE;
3547     }
3548 
3549   /* Keep nonleaf frame pointers.  */
3550   if (flag_omit_frame_pointer)
3551     target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3552   else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3553     flag_omit_frame_pointer = 1;
3554 
3555   /* If we're doing fast math, we don't care about comparison order
3556      wrt NaNs.  This lets us use a shorter comparison sequence.  */
3557   if (flag_finite_math_only)
3558     target_flags &= ~MASK_IEEE_FP;
3559 
3560   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3561      since the insns won't need emulation.  */
3562   if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3563     target_flags &= ~MASK_NO_FANCY_MATH_387;
3564 
3565   /* Likewise, if the target doesn't have a 387, or we've specified
3566      software floating point, don't use 387 inline intrinsics.  */
3567   if (!TARGET_80387)
3568     target_flags |= MASK_NO_FANCY_MATH_387;
3569 
3570   /* Turn on MMX builtins for -msse.  */
3571   if (TARGET_SSE)
3572     {
3573       ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3574       x86_prefetch_sse = true;
3575     }
3576 
3577   /* Turn on popcnt instruction for -msse4.2 or -mabm.  */
3578   if (TARGET_SSE4_2 || TARGET_ABM)
3579     ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3580 
3581   /* Turn on lzcnt instruction for -mabm.  */
3582   if (TARGET_ABM)
3583     ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3584 
3585   /* Validate -mpreferred-stack-boundary= value or default it to
3586      PREFERRED_STACK_BOUNDARY_DEFAULT.  */
3587   ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3588   if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3589     {
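      /* The argument is a log2 of bytes: e.g. -mpreferred-stack-boundary=4
	 yields (1 << 4) * BITS_PER_UNIT = 128 bits, i.e. a 16-byte boundary.  */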
3590       int min = (TARGET_64BIT ? 4 : 2);
3591       int max = (TARGET_SEH ? 4 : 12);
3592 
3593       if (ix86_preferred_stack_boundary_arg < min
3594 	  || ix86_preferred_stack_boundary_arg > max)
3595 	{
3596 	  if (min == max)
3597 	    error ("-mpreferred-stack-boundary is not supported "
3598 		   "for this target");
3599 	  else
3600 	    error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3601 		   ix86_preferred_stack_boundary_arg, min, max);
3602 	}
3603       else
3604 	ix86_preferred_stack_boundary
3605 	  = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3606     }
3607 
3608   /* Set the default value for -mstackrealign.  */
3609   if (ix86_force_align_arg_pointer == -1)
3610     ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3611 
3612   ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3613 
3614   /* Validate -mincoming-stack-boundary= value or default it to
3615      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
3616   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3617   if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3618     {
3619       if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3620 	  || ix86_incoming_stack_boundary_arg > 12)
3621 	error ("-mincoming-stack-boundary=%d is not between %d and 12",
3622 	       ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3623       else
3624 	{
3625 	  ix86_user_incoming_stack_boundary
3626 	    = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3627 	  ix86_incoming_stack_boundary
3628 	    = ix86_user_incoming_stack_boundary;
3629 	}
3630     }
3631 
3632   /* Accept -msseregparm only if at least SSE support is enabled.  */
3633   if (TARGET_SSEREGPARM
3634       && ! TARGET_SSE)
3635     error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3636 
3637   if (global_options_set.x_ix86_fpmath)
3638     {
3639       if (ix86_fpmath & FPMATH_SSE)
3640 	{
3641 	  if (!TARGET_SSE)
3642 	    {
3643 	      warning (0, "SSE instruction set disabled, using 387 arithmetics");
3644 	      ix86_fpmath = FPMATH_387;
3645 	    }
3646 	  else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3647 	    {
3648 	      warning (0, "387 instruction set disabled, using SSE arithmetics");
3649 	      ix86_fpmath = FPMATH_SSE;
3650 	    }
3651 	}
3652     }
3653   else
3654     ix86_fpmath = TARGET_FPMATH_DEFAULT;
3655 
3656   /* If the i387 is disabled, then do not return values in it. */
3657   if (!TARGET_80387)
3658     target_flags &= ~MASK_FLOAT_RETURNS;
3659 
3660   /* Use an external vectorized library when vectorizing intrinsic calls.  */
3661   if (global_options_set.x_ix86_veclibabi_type)
3662     switch (ix86_veclibabi_type)
3663       {
3664       case ix86_veclibabi_type_svml:
3665 	ix86_veclib_handler = ix86_veclibabi_svml;
3666 	break;
3667 
3668       case ix86_veclibabi_type_acml:
3669 	ix86_veclib_handler = ix86_veclibabi_acml;
3670 	break;
3671 
3672       default:
3673 	gcc_unreachable ();
3674       }
3675 
3676   if ((!USE_IX86_FRAME_POINTER
3677        || (x86_accumulate_outgoing_args & ix86_tune_mask))
3678       && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3679       && !optimize_size)
3680     target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3681 
3682   /* ??? Unwind info is not correct around the CFG unless either a frame
3683      pointer is present or M_A_O_A is set.  Fixing this requires rewriting
3684      unwind info generation to be aware of the CFG and propagating states
3685      around edges.  */
3686   if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3687        || flag_exceptions || flag_non_call_exceptions)
3688       && flag_omit_frame_pointer
3689       && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3690     {
3691       if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3692 	warning (0, "unwind tables currently require either a frame pointer "
3693 		 "or %saccumulate-outgoing-args%s for correctness",
3694 		 prefix, suffix);
3695       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3696     }
3697 
3698   /* If stack probes are required, the space used for large function
3699      arguments on the stack must also be probed, so enable
3700      -maccumulate-outgoing-args so this happens in the prologue.  */
3701   if (TARGET_STACK_PROBE
3702       && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3703     {
3704       if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3705 	warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3706 		 "for correctness", prefix, suffix);
3707       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3708     }
3709 
3710   /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
3711   {
3712     char *p;
3713     ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3714     p = strchr (internal_label_prefix, 'X');
3715     internal_label_prefix_len = p - internal_label_prefix;
3716     *p = '\0';
3717   }
3718 
3719   /* When a scheduling description is not available, disable the scheduler
3720      pass so it won't slow down compilation and make x87 code slower.  */
3721   if (!TARGET_SCHEDULE)
3722     flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3723 
3724   maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3725 			 ix86_cost->simultaneous_prefetches,
3726 			 global_options.x_param_values,
3727 			 global_options_set.x_param_values);
3728   maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3729 			 global_options.x_param_values,
3730 			 global_options_set.x_param_values);
3731   maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3732 			 global_options.x_param_values,
3733 			 global_options_set.x_param_values);
3734   maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3735 			 global_options.x_param_values,
3736 			 global_options_set.x_param_values);
3737 
3738   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3739   if (flag_prefetch_loop_arrays < 0
3740       && HAVE_prefetch
3741       && optimize >= 3
3742       && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3743     flag_prefetch_loop_arrays = 1;
3744 
3745   /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3746      can be optimized to ap = __builtin_next_arg (0).  */
3747   if (!TARGET_64BIT && !flag_split_stack)
3748     targetm.expand_builtin_va_start = NULL;
3749 
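  /* Select the word-size-dependent RTL generator functions once here,
     so later code can emit DImode or SImode patterns without checking
     TARGET_64BIT each time.  */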
3750   if (TARGET_64BIT)
3751     {
3752       ix86_gen_leave = gen_leave_rex64;
3753       ix86_gen_add3 = gen_adddi3;
3754       ix86_gen_sub3 = gen_subdi3;
3755       ix86_gen_sub3_carry = gen_subdi3_carry;
3756       ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3757       ix86_gen_monitor = gen_sse3_monitor64;
3758       ix86_gen_andsp = gen_anddi3;
3759       ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760       ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761       ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762     }
3763   else
3764     {
3765       ix86_gen_leave = gen_leave;
3766       ix86_gen_add3 = gen_addsi3;
3767       ix86_gen_sub3 = gen_subsi3;
3768       ix86_gen_sub3_carry = gen_subsi3_carry;
3769       ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770       ix86_gen_monitor = gen_sse3_monitor;
3771       ix86_gen_andsp = gen_andsi3;
3772       ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3773       ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3774       ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3775     }
3776 
3777 #ifdef USE_IX86_CLD
3778   /* Use -mcld by default for 32-bit code if configured with --enable-cld.  */
3779   if (!TARGET_64BIT)
3780     target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3782 
3783   if (!TARGET_64BIT && flag_pic)
3784     {
3785       if (flag_fentry > 0)
3786         sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 	       "with -fpic");
3788       flag_fentry = 0;
3789     }
3790   else if (TARGET_SEH)
3791     {
3792       if (flag_fentry == 0)
3793 	sorry ("-mno-fentry isn%'t compatible with SEH");
3794       flag_fentry = 1;
3795     }
3796   else if (flag_fentry < 0)
3797    {
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799      flag_fentry = 1;
3800 #else
3801      flag_fentry = 0;
3802 #endif
3803    }
3804 
3805   if (TARGET_AVX)
3806     {
3807       /* When not optimizing for size, enable the vzeroupper optimization
3808 	 for TARGET_AVX with -fexpensive-optimizations, and split 32-byte
3809 	 AVX unaligned loads/stores.  */
3810       if (!optimize_size)
3811 	{
3812 	  if (flag_expensive_optimizations
3813 	      && !(target_flags_explicit & MASK_VZEROUPPER))
3814 	    target_flags |= MASK_VZEROUPPER;
3815 	  if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 	      && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 	  if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 	      && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 	  /* Enable 128-bit AVX instruction generation for the auto-vectorizer.  */
3822 	  if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3823 	    target_flags |= MASK_PREFER_AVX128;
3824 	}
3825     }
3826   else
3827     {
3828       /* Disable vzeroupper pass if TARGET_AVX is disabled.  */
3829       target_flags &= ~MASK_VZEROUPPER;
3830     }
3831 
3832   if (ix86_recip_name)
3833     {
3834       char *p = ASTRDUP (ix86_recip_name);
3835       char *q;
3836       unsigned int mask, i;
3837       bool invert;
3838 
3839       while ((q = strtok (p, ",")) != NULL)
3840 	{
3841 	  p = NULL;
3842 	  if (*q == '!')
3843 	    {
3844 	      invert = true;
3845 	      q++;
3846 	    }
3847 	  else
3848 	    invert = false;
3849 
3850 	  if (!strcmp (q, "default"))
3851 	    mask = RECIP_MASK_ALL;
3852 	  else
3853 	    {
3854 	      for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3855 		if (!strcmp (q, recip_options[i].string))
3856 		  {
3857 		    mask = recip_options[i].mask;
3858 		    break;
3859 		  }
3860 
3861 	      if (i == ARRAY_SIZE (recip_options))
3862 		{
3863 		  error ("unknown option for -mrecip=%s", q);
3864 		  invert = false;
3865 		  mask = RECIP_MASK_NONE;
3866 		}
3867 	    }
3868 
3869 	  recip_mask_explicit |= mask;
3870 	  if (invert)
3871 	    recip_mask &= ~mask;
3872 	  else
3873 	    recip_mask |= mask;
3874 	}
3875     }
3876 
3877   if (TARGET_RECIP)
3878     recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3879   else if (target_flags_explicit & MASK_RECIP)
3880     recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3881 
3882   /* Save the initial options in case the user uses function specific
3883      options.  */
3884   if (main_args_p)
3885     target_option_default_node = target_option_current_node
3886       = build_target_option_node ();
3887 }
3888 
3889 /* Return TRUE if VAL is passed in a register with a 256bit AVX mode.  */
3890 
3891 static bool
3892 function_pass_avx256_p (const_rtx val)
3893 {
3894   if (!val)
3895     return false;
3896 
3897   if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3898     return true;
3899 
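  /* A value passed or returned in multiple pieces appears as a PARALLEL;
     look for any piece that lives in a 256-bit AVX (or OImode) register.  */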
3900   if (GET_CODE (val) == PARALLEL)
3901     {
3902       int i;
3903       rtx r;
3904 
3905       for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3906 	{
3907 	  r = XVECEXP (val, 0, i);
3908 	  if (GET_CODE (r) == EXPR_LIST
3909 	      && XEXP (r, 0)
3910 	      && REG_P (XEXP (r, 0))
3911 	      && (GET_MODE (XEXP (r, 0)) == OImode
3912 		  || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3913 	    return true;
3914 	}
3915     }
3916 
3917   return false;
3918 }
3919 
3920 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
3921 
3922 static void
3923 ix86_option_override (void)
3924 {
3925   ix86_option_override_internal (true);
3926 }
3927 
3928 /* Update register usage after having seen the compiler flags.  */
3929 
3930 static void
3931 ix86_conditional_register_usage (void)
3932 {
3933   int i;
3934   unsigned int j;
3935 
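  /* Resolve mode-dependent entries in the initial register tables: a
     value of 2 there means fixed/call-used only for 32-bit code, and 3
     only for 64-bit code.  */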
3936   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3937     {
3938       if (fixed_regs[i] > 1)
3939 	fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3940       if (call_used_regs[i] > 1)
3941 	call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3942     }
3943 
3944   /* The PIC register, if it exists, is fixed.  */
3945   j = PIC_OFFSET_TABLE_REGNUM;
3946   if (j != INVALID_REGNUM)
3947     fixed_regs[j] = call_used_regs[j] = 1;
3948 
3949   /* The 64-bit MS_ABI changes the set of call-used registers.  */
3950   if (TARGET_64BIT_MS_ABI)
3951     {
3952       call_used_regs[SI_REG] = 0;
3953       call_used_regs[DI_REG] = 0;
3954       call_used_regs[XMM6_REG] = 0;
3955       call_used_regs[XMM7_REG] = 0;
3956       for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 	call_used_regs[i] = 0;
3958     }
3959 
3960   /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3961      other call-clobbered regs for 64-bit.  */
3962   if (TARGET_64BIT)
3963     {
3964       CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3965 
3966       for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3967 	if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3968 	    && call_used_regs[i])
3969 	  SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3970     }
3971 
3972   /* If MMX is disabled, squash the registers.  */
3973   if (! TARGET_MMX)
3974     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975       if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3976 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3977 
3978   /* If SSE is disabled, squash the registers.  */
3979   if (! TARGET_SSE)
3980     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3981       if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3982 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983 
3984   /* If the FPU is disabled, squash the registers.  */
3985   if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3986     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3987       if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3988 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3989 
3990   /* If 32-bit, squash the 64-bit registers.  */
3991   if (! TARGET_64BIT)
3992     {
3993       for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3994 	reg_names[i] = "";
3995       for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3996 	reg_names[i] = "";
3997     }
3998 }
3999 
4000 
4001 /* Save the current options */
4002 
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006   ptr->arch = ix86_arch;
4007   ptr->schedule = ix86_schedule;
4008   ptr->tune = ix86_tune;
4009   ptr->branch_cost = ix86_branch_cost;
4010   ptr->tune_defaulted = ix86_tune_defaulted;
4011   ptr->arch_specified = ix86_arch_specified;
4012   ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013   ptr->ix86_target_flags_explicit = target_flags_explicit;
4014   ptr->x_recip_mask_explicit = recip_mask_explicit;
4015 
4016   /* The fields are char but the variables are not; make sure the
4017      values fit in the fields.  */
4018   gcc_assert (ptr->arch == ix86_arch);
4019   gcc_assert (ptr->schedule == ix86_schedule);
4020   gcc_assert (ptr->tune == ix86_tune);
4021   gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023 
4024 /* Restore the current options */
4025 
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029   enum processor_type old_tune = ix86_tune;
4030   enum processor_type old_arch = ix86_arch;
4031   unsigned int ix86_arch_mask, ix86_tune_mask;
4032   int i;
4033 
4034   ix86_arch = (enum processor_type) ptr->arch;
4035   ix86_schedule = (enum attr_cpu) ptr->schedule;
4036   ix86_tune = (enum processor_type) ptr->tune;
4037   ix86_branch_cost = ptr->branch_cost;
4038   ix86_tune_defaulted = ptr->tune_defaulted;
4039   ix86_arch_specified = ptr->arch_specified;
4040   ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041   target_flags_explicit = ptr->ix86_target_flags_explicit;
4042   recip_mask_explicit = ptr->x_recip_mask_explicit;
4043 
4044   /* Recreate the arch feature tests if the arch changed */
4045   if (old_arch != ix86_arch)
4046     {
4047       ix86_arch_mask = 1u << ix86_arch;
4048       for (i = 0; i < X86_ARCH_LAST; ++i)
4049 	ix86_arch_features[i]
4050 	  = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051     }
4052 
4053   /* Recreate the tune optimization tests */
4054   if (old_tune != ix86_tune)
4055     {
4056       ix86_tune_mask = 1u << ix86_tune;
4057       for (i = 0; i < X86_TUNE_LAST; ++i)
4058 	ix86_tune_features[i]
4059 	  = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060     }
4061 }
4062 
4063 /* Print the current options */
4064 
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 			      struct cl_target_option *ptr)
4068 {
4069   char *target_string
4070     = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 			  NULL, NULL, ptr->x_ix86_fpmath, false);
4072 
4073   fprintf (file, "%*sarch = %d (%s)\n",
4074 	   indent, "",
4075 	   ptr->arch,
4076 	   ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 	    ? cpu_names[ptr->arch]
4078 	    : "<unknown>"));
4079 
4080   fprintf (file, "%*stune = %d (%s)\n",
4081 	   indent, "",
4082 	   ptr->tune,
4083 	   ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 	    ? cpu_names[ptr->tune]
4085 	    : "<unknown>"));
4086 
4087   fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088 
4089   if (target_string)
4090     {
4091       fprintf (file, "%*s%s\n", indent, "", target_string);
4092       free (target_string);
4093     }
4094 }
4095 
4096 
4097 /* Inner function to process the attribute((target(...))): take an argument
4098    and set the current options from it.  If the argument is a list,
4099    recursively go over the list.  */
4100 
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 				     struct gcc_options *enum_opts_set)
4104 {
4105   char *next_optstr;
4106   bool ret = true;
4107 
4108 #define IX86_ATTR_ISA(S,O)   { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O)   { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
4113 
4114   enum ix86_opt_type
4115   {
4116     ix86_opt_unknown,
4117     ix86_opt_yes,
4118     ix86_opt_no,
4119     ix86_opt_str,
4120     ix86_opt_enum,
4121     ix86_opt_isa
4122   };
4123 
4124   static const struct
4125   {
4126     const char *string;
4127     size_t len;
4128     enum ix86_opt_type type;
4129     int opt;
4130     int mask;
4131   } attrs[] = {
4132     /* isa options */
4133     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
4134     IX86_ATTR_ISA ("abm",	OPT_mabm),
4135     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
4136     IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
4137     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
4138     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
4139     IX86_ATTR_ISA ("aes",	OPT_maes),
4140     IX86_ATTR_ISA ("avx",	OPT_mavx),
4141     IX86_ATTR_ISA ("avx2",	OPT_mavx2),
4142     IX86_ATTR_ISA ("mmx",	OPT_mmmx),
4143     IX86_ATTR_ISA ("pclmul",	OPT_mpclmul),
4144     IX86_ATTR_ISA ("popcnt",	OPT_mpopcnt),
4145     IX86_ATTR_ISA ("sse",	OPT_msse),
4146     IX86_ATTR_ISA ("sse2",	OPT_msse2),
4147     IX86_ATTR_ISA ("sse3",	OPT_msse3),
4148     IX86_ATTR_ISA ("sse4",	OPT_msse4),
4149     IX86_ATTR_ISA ("sse4.1",	OPT_msse4_1),
4150     IX86_ATTR_ISA ("sse4.2",	OPT_msse4_2),
4151     IX86_ATTR_ISA ("sse4a",	OPT_msse4a),
4152     IX86_ATTR_ISA ("ssse3",	OPT_mssse3),
4153     IX86_ATTR_ISA ("fma4",	OPT_mfma4),
4154     IX86_ATTR_ISA ("fma",	OPT_mfma),
4155     IX86_ATTR_ISA ("xop",	OPT_mxop),
4156     IX86_ATTR_ISA ("lwp",	OPT_mlwp),
4157     IX86_ATTR_ISA ("fsgsbase",	OPT_mfsgsbase),
4158     IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
4159     IX86_ATTR_ISA ("f16c",	OPT_mf16c),
4160 
4161     /* enum options */
4162     IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
4163 
4164     /* string options */
4165     IX86_ATTR_STR ("arch=",	IX86_FUNCTION_SPECIFIC_ARCH),
4166     IX86_ATTR_STR ("tune=",	IX86_FUNCTION_SPECIFIC_TUNE),
4167 
4168     /* flag options */
4169     IX86_ATTR_YES ("cld",
4170 		   OPT_mcld,
4171 		   MASK_CLD),
4172 
4173     IX86_ATTR_NO ("fancy-math-387",
4174 		  OPT_mfancy_math_387,
4175 		  MASK_NO_FANCY_MATH_387),
4176 
4177     IX86_ATTR_YES ("ieee-fp",
4178 		   OPT_mieee_fp,
4179 		   MASK_IEEE_FP),
4180 
4181     IX86_ATTR_YES ("inline-all-stringops",
4182 		   OPT_minline_all_stringops,
4183 		   MASK_INLINE_ALL_STRINGOPS),
4184 
4185     IX86_ATTR_YES ("inline-stringops-dynamically",
4186 		   OPT_minline_stringops_dynamically,
4187 		   MASK_INLINE_STRINGOPS_DYNAMICALLY),
4188 
4189     IX86_ATTR_NO ("align-stringops",
4190 		  OPT_mno_align_stringops,
4191 		  MASK_NO_ALIGN_STRINGOPS),
4192 
4193     IX86_ATTR_YES ("recip",
4194 		   OPT_mrecip,
4195 		   MASK_RECIP),
4196 
4197   };
4198 
4199   /* If this is a list, recurse to get the options.  */
4200   if (TREE_CODE (args) == TREE_LIST)
4201     {
4202       bool ret = true;
4203 
4204       for (; args; args = TREE_CHAIN (args))
4205 	if (TREE_VALUE (args)
4206 	    && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4207 						     p_strings, enum_opts_set))
4208 	  ret = false;
4209 
4210       return ret;
4211     }
4212 
4213   else if (TREE_CODE (args) != STRING_CST)
4214     gcc_unreachable ();
4215 
4216   /* Handle multiple arguments separated by commas.  */
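  /* For example, attribute((target("sse4.2,no-avx,arch=core2"))) is
     handled here one comma-separated token at a time.  */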
4217   next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4218 
4219   while (next_optstr && *next_optstr != '\0')
4220     {
4221       char *p = next_optstr;
4222       char *orig_p = p;
4223       char *comma = strchr (next_optstr, ',');
4224       const char *opt_string;
4225       size_t len, opt_len;
4226       int opt;
4227       bool opt_set_p;
4228       char ch;
4229       unsigned i;
4230       enum ix86_opt_type type = ix86_opt_unknown;
4231       int mask = 0;
4232 
4233       if (comma)
4234 	{
4235 	  *comma = '\0';
4236 	  len = comma - next_optstr;
4237 	  next_optstr = comma + 1;
4238 	}
4239       else
4240 	{
4241 	  len = strlen (p);
4242 	  next_optstr = NULL;
4243 	}
4244 
4245       /* Recognize no-xxx.  */
4246       if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4247 	{
4248 	  opt_set_p = false;
4249 	  p += 3;
4250 	  len -= 3;
4251 	}
4252       else
4253 	opt_set_p = true;
4254 
4255       /* Find the option.  */
4256       ch = *p;
4257       opt = N_OPTS;
4258       for (i = 0; i < ARRAY_SIZE (attrs); i++)
4259 	{
4260 	  type = attrs[i].type;
4261 	  opt_len = attrs[i].len;
4262 	  if (ch == attrs[i].string[0]
4263 	      && ((type != ix86_opt_str && type != ix86_opt_enum)
4264 		  ? len == opt_len
4265 		  : len > opt_len)
4266 	      && memcmp (p, attrs[i].string, opt_len) == 0)
4267 	    {
4268 	      opt = attrs[i].opt;
4269 	      mask = attrs[i].mask;
4270 	      opt_string = attrs[i].string;
4271 	      break;
4272 	    }
4273 	}
4274 
4275       /* Process the option.  */
4276       if (opt == N_OPTS)
4277 	{
4278 	  error ("attribute(target(\"%s\")) is unknown", orig_p);
4279 	  ret = false;
4280 	}
4281 
4282       else if (type == ix86_opt_isa)
4283 	{
4284 	  struct cl_decoded_option decoded;
4285 
4286 	  generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4287 	  ix86_handle_option (&global_options, &global_options_set,
4288 			      &decoded, input_location);
4289 	}
4290 
4291       else if (type == ix86_opt_yes || type == ix86_opt_no)
4292 	{
4293 	  if (type == ix86_opt_no)
4294 	    opt_set_p = !opt_set_p;
4295 
4296 	  if (opt_set_p)
4297 	    target_flags |= mask;
4298 	  else
4299 	    target_flags &= ~mask;
4300 	}
4301 
4302       else if (type == ix86_opt_str)
4303 	{
4304 	  if (p_strings[opt])
4305 	    {
4306 	      error ("option(\"%s\") was already specified", opt_string);
4307 	      ret = false;
4308 	    }
4309 	  else
4310 	    p_strings[opt] = xstrdup (p + opt_len);
4311 	}
4312 
4313       else if (type == ix86_opt_enum)
4314 	{
4315 	  bool arg_ok;
4316 	  int value;
4317 
4318 	  arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4319 	  if (arg_ok)
4320 	    set_option (&global_options, enum_opts_set, opt, value,
4321 			p + opt_len, DK_UNSPECIFIED, input_location,
4322 			global_dc);
4323 	  else
4324 	    {
4325 	      error ("attribute(target(\"%s\")) is unknown", orig_p);
4326 	      ret = false;
4327 	    }
4328 	}
4329 
4330       else
4331 	gcc_unreachable ();
4332     }
4333 
4334   return ret;
4335 }
4336 
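/* Editorial illustration (not from the original sources): the strings parsed
   by ix86_valid_target_attribute_inner_p come from declarations such as

     int foo (int) __attribute__ ((target ("sse4.2,no-avx,arch=core2")));

   Each comma-separated entry is matched against the attrs[] table above; a
   leading "no-" clears the corresponding flag instead of setting it, and
   "arch="/"tune=" entries are stored as string options.  */
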
4337 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL.  */
4338 
4339 tree
4340 ix86_valid_target_attribute_tree (tree args)
4341 {
4342   const char *orig_arch_string = ix86_arch_string;
4343   const char *orig_tune_string = ix86_tune_string;
4344   enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4345   int orig_tune_defaulted = ix86_tune_defaulted;
4346   int orig_arch_specified = ix86_arch_specified;
4347   char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4348   tree t = NULL_TREE;
4349   int i;
4350   struct cl_target_option *def
4351     = TREE_TARGET_OPTION (target_option_default_node);
4352   struct gcc_options enum_opts_set;
4353 
4354   memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4355 
4356   /* Process each of the options on the chain.  */
4357   if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4358 					     &enum_opts_set))
4359     return NULL_TREE;
4360 
4361   /* If the changed options are different from the default, rerun
4362      ix86_option_override_internal, and then save the options away.
4363      The string options are attribute options, and will be undone
4364      when we copy the save structure.  */
4365   if (ix86_isa_flags != def->x_ix86_isa_flags
4366       || target_flags != def->x_target_flags
4367       || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4368       || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4369       || enum_opts_set.x_ix86_fpmath)
4370     {
4371       /* If we are using the default tune= or arch=, undo the string assigned,
4372 	 and use the default.  */
4373       if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4374 	ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4375       else if (!orig_arch_specified)
4376 	ix86_arch_string = NULL;
4377 
4378       if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4379 	ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4380       else if (orig_tune_defaulted)
4381 	ix86_tune_string = NULL;
4382 
4383       /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
4384       if (enum_opts_set.x_ix86_fpmath)
4385 	global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4386       else if (!TARGET_64BIT && TARGET_SSE)
4387 	{
4388 	  ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4389 	  global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4390 	}
4391 
4392       /* Do any overrides, such as arch=xxx, or tune=xxx support.  */
4393       ix86_option_override_internal (false);
4394 
4395       /* Add any builtin functions with the new isa if any.  */
4396       ix86_add_new_builtins (ix86_isa_flags);
4397 
4398       /* Save the current options unless we are validating options for
4399 	 #pragma.  */
4400       t = build_target_option_node ();
4401 
4402       ix86_arch_string = orig_arch_string;
4403       ix86_tune_string = orig_tune_string;
4404       global_options_set.x_ix86_fpmath = orig_fpmath_set;
4405 
4406       /* Free up memory allocated to hold the strings.  */
4407       for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4408 	free (option_strings[i]);
4409     }
4410 
4411   return t;
4412 }
4413 
4414 /* Hook to validate attribute((target("string"))).  */
4415 
4416 static bool
4417 ix86_valid_target_attribute_p (tree fndecl,
4418 			       tree ARG_UNUSED (name),
4419 			       tree args,
4420 			       int ARG_UNUSED (flags))
4421 {
4422   struct cl_target_option cur_target;
4423   bool ret = true;
4424   tree old_optimize = build_optimization_node ();
4425   tree new_target, new_optimize;
4426   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4427 
4428   /* If the function changed the optimization levels as well as setting target
4429      options, start with the optimizations specified.  */
4430   if (func_optimize && func_optimize != old_optimize)
4431     cl_optimization_restore (&global_options,
4432 			     TREE_OPTIMIZATION (func_optimize));
4433 
4434   /* The target attributes may also change some optimization flags, so update
4435      the optimization options if necessary.  */
4436   cl_target_option_save (&cur_target, &global_options);
4437   new_target = ix86_valid_target_attribute_tree (args);
4438   new_optimize = build_optimization_node ();
4439 
4440   if (!new_target)
4441     ret = false;
4442 
4443   else if (fndecl)
4444     {
4445       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4446 
4447       if (old_optimize != new_optimize)
4448 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4449     }
4450 
4451   cl_target_option_restore (&global_options, &cur_target);
4452 
4453   if (old_optimize != new_optimize)
4454     cl_optimization_restore (&global_options,
4455 			     TREE_OPTIMIZATION (old_optimize));
4456 
4457   return ret;
4458 }
4459 
4460 
4461 /* Hook to determine if one function can safely inline another.  */
4462 
4463 static bool
4464 ix86_can_inline_p (tree caller, tree callee)
4465 {
4466   bool ret = false;
4467   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4468   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4469 
4470   /* If callee has no option attributes, then it is ok to inline.  */
4471   if (!callee_tree)
4472     ret = true;
4473 
4474   /* If caller has no option attributes, but callee does then it is not ok to
4475      inline.  */
4476   else if (!caller_tree)
4477     ret = false;
4478 
4479   else
4480     {
4481       struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4482       struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4483 
4484       /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4485 	 function can inline an SSE2 function but an SSE2 function can't
4486 	 inline an SSE4 function.  */
4487       if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4488 	  != callee_opts->x_ix86_isa_flags)
4489 	ret = false;
4490 
4491       /* See if we have the same non-isa options.  */
4492       else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4493 	ret = false;
4494 
4495       /* See if arch, tune, etc. are the same.  */
4496       else if (caller_opts->arch != callee_opts->arch)
4497 	ret = false;
4498 
4499       else if (caller_opts->tune != callee_opts->tune)
4500 	ret = false;
4501 
4502       else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4503 	ret = false;
4504 
4505       else if (caller_opts->branch_cost != callee_opts->branch_cost)
4506 	ret = false;
4507 
4508       else
4509 	ret = true;
4510     }
4511 
4512   return ret;
4513 }
4514 
4515 
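/* Illustrative example (editorial, not part of the original code) of the
   subset rule implemented above:

     static void callee (void) __attribute__ ((target ("sse2")));
     void caller (void) __attribute__ ((target ("avx")));

   The callee's ISA flags are a subset of the caller's (AVX implies SSE2),
   so inlining callee into caller may be allowed; the reverse direction is
   rejected.  */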
4516 /* Remember the last target of ix86_set_current_function.  */
4517 static GTY(()) tree ix86_previous_fndecl;
4518 
4519 /* Establish appropriate back-end context for processing the function
4520    FNDECL.  The argument might be NULL to indicate processing at top
4521    level, outside of any function scope.  */
4522 static void
4523 ix86_set_current_function (tree fndecl)
4524 {
4525   /* Only change the context if the function changes.  This hook is called
4526      several times in the course of compiling a function, and we don't want to
4527      slow things down too much or call target_reinit when it isn't safe.  */
4528   if (fndecl && fndecl != ix86_previous_fndecl)
4529     {
4530       tree old_tree = (ix86_previous_fndecl
4531 		       ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4532 		       : NULL_TREE);
4533 
4534       tree new_tree = (fndecl
4535 		       ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4536 		       : NULL_TREE);
4537 
4538       ix86_previous_fndecl = fndecl;
4539       if (old_tree == new_tree)
4540 	;
4541 
4542       else if (new_tree)
4543 	{
4544 	  cl_target_option_restore (&global_options,
4545 				    TREE_TARGET_OPTION (new_tree));
4546 	  target_reinit ();
4547 	}
4548 
4549       else if (old_tree)
4550 	{
4551 	  struct cl_target_option *def
4552 	    = TREE_TARGET_OPTION (target_option_current_node);
4553 
4554 	  cl_target_option_restore (&global_options, def);
4555 	  target_reinit ();
4556 	}
4557     }
4558 }
4559 
4560 
4561 /* Return true if this goes in large data/bss.  */
4562 
4563 static bool
4564 ix86_in_large_data_p (tree exp)
4565 {
4566   if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4567     return false;
4568 
4569   /* Functions are never large data.  */
4570   if (TREE_CODE (exp) == FUNCTION_DECL)
4571     return false;
4572 
4573   if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4574     {
4575       const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4576       if (strcmp (section, ".ldata") == 0
4577 	  || strcmp (section, ".lbss") == 0)
4578 	return true;
4579       return false;
4580     }
4581   else
4582     {
4583       HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4584 
4585       /* If this is an incomplete type with size 0, then we can't put it
4586 	 in data because it might be too big when completed.  */
4587       if (!size || size > ix86_section_threshold)
4588 	return true;
4589     }
4590 
4591   return false;
4592 }
4593 
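/* Illustration (editorial, not from the original sources): with
   -mcmodel=medium and the default -mlarge-data-threshold of 65536,

     static char big[1 << 20];		-> treated as large data (.lbss)
     static char tiny[64];		-> stays in the ordinary .bss

   Variables placed explicitly in ".ldata" or ".lbss" via the section
   attribute are treated as large data regardless of size.  */
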
4594 /* Switch to the appropriate section for output of DECL.
4595    DECL is either a `VAR_DECL' node or a constant of some sort.
4596    RELOC indicates whether forming the initial value of DECL requires
4597    link-time relocations.  */
4598 
4599 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4600 	ATTRIBUTE_UNUSED;
4601 
4602 static section *
4603 x86_64_elf_select_section (tree decl, int reloc,
4604 			   unsigned HOST_WIDE_INT align)
4605 {
4606   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4607       && ix86_in_large_data_p (decl))
4608     {
4609       const char *sname = NULL;
4610       unsigned int flags = SECTION_WRITE;
4611       switch (categorize_decl_for_section (decl, reloc))
4612 	{
4613 	case SECCAT_DATA:
4614 	  sname = ".ldata";
4615 	  break;
4616 	case SECCAT_DATA_REL:
4617 	  sname = ".ldata.rel";
4618 	  break;
4619 	case SECCAT_DATA_REL_LOCAL:
4620 	  sname = ".ldata.rel.local";
4621 	  break;
4622 	case SECCAT_DATA_REL_RO:
4623 	  sname = ".ldata.rel.ro";
4624 	  break;
4625 	case SECCAT_DATA_REL_RO_LOCAL:
4626 	  sname = ".ldata.rel.ro.local";
4627 	  break;
4628 	case SECCAT_BSS:
4629 	  sname = ".lbss";
4630 	  flags |= SECTION_BSS;
4631 	  break;
4632 	case SECCAT_RODATA:
4633 	case SECCAT_RODATA_MERGE_STR:
4634 	case SECCAT_RODATA_MERGE_STR_INIT:
4635 	case SECCAT_RODATA_MERGE_CONST:
4636 	  sname = ".lrodata";
4637 	  flags = 0;
4638 	  break;
4639 	case SECCAT_SRODATA:
4640 	case SECCAT_SDATA:
4641 	case SECCAT_SBSS:
4642 	  gcc_unreachable ();
4643 	case SECCAT_TEXT:
4644 	case SECCAT_TDATA:
4645 	case SECCAT_TBSS:
4646 	  /* We don't split these for the medium model.  Place them into
4647 	     default sections and hope for the best.  */
4648 	  break;
4649 	}
4650       if (sname)
4651 	{
4652 	  /* We might get called with string constants, but get_named_section
4653 	     doesn't like them as they are not DECLs.  Also, we need to set
4654 	     flags in that case.  */
4655 	  if (!DECL_P (decl))
4656 	    return get_section (sname, flags, NULL);
4657 	  return get_named_section (decl, sname, reloc);
4658 	}
4659     }
4660   return default_elf_select_section (decl, reloc, align);
4661 }
4662 
4663 /* Build up a unique section name, expressed as a
4664    STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4665    RELOC indicates whether the initial value of EXP requires
4666    link-time relocations.  */
4667 
4668 static void ATTRIBUTE_UNUSED
4669 x86_64_elf_unique_section (tree decl, int reloc)
4670 {
4671   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4672       && ix86_in_large_data_p (decl))
4673     {
4674       const char *prefix = NULL;
4675       /* We only need to use .gnu.linkonce if we don't have COMDAT groups.  */
4676       bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4677 
4678       switch (categorize_decl_for_section (decl, reloc))
4679 	{
4680 	case SECCAT_DATA:
4681 	case SECCAT_DATA_REL:
4682 	case SECCAT_DATA_REL_LOCAL:
4683 	case SECCAT_DATA_REL_RO:
4684 	case SECCAT_DATA_REL_RO_LOCAL:
4685           prefix = one_only ? ".ld" : ".ldata";
4686 	  break;
4687 	case SECCAT_BSS:
4688           prefix = one_only ? ".lb" : ".lbss";
4689 	  break;
4690 	case SECCAT_RODATA:
4691 	case SECCAT_RODATA_MERGE_STR:
4692 	case SECCAT_RODATA_MERGE_STR_INIT:
4693 	case SECCAT_RODATA_MERGE_CONST:
4694           prefix = one_only ? ".lr" : ".lrodata";
4695 	  break;
4696 	case SECCAT_SRODATA:
4697 	case SECCAT_SDATA:
4698 	case SECCAT_SBSS:
4699 	  gcc_unreachable ();
4700 	case SECCAT_TEXT:
4701 	case SECCAT_TDATA:
4702 	case SECCAT_TBSS:
4703 	  /* We don't split these for the medium model.  Place them into
4704 	     default sections and hope for the best.  */
4705 	  break;
4706 	}
4707       if (prefix)
4708 	{
4709 	  const char *name, *linkonce;
4710 	  char *string;
4711 
4712 	  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4713 	  name = targetm.strip_name_encoding (name);
4714 
4715 	  /* If we're using one_only, then there needs to be a .gnu.linkonce
4716 	     prefix to the section name.  */
4717 	  linkonce = one_only ? ".gnu.linkonce" : "";
4718 
4719 	  string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4720 
4721 	  DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4722 	  return;
4723 	}
4724     }
4725   default_unique_section (decl, reloc);
4726 }
4727 
4728 #ifdef COMMON_ASM_OP
4729 /* This says how to output assembler code to declare an
4730    uninitialized external linkage data object.
4731 
4732    For medium-model x86-64 we need to use the .largecomm directive for
4733    large objects.  */
4734 void
4735 x86_elf_aligned_common (FILE *file,
4736 			const char *name, unsigned HOST_WIDE_INT size,
4737 			int align)
4738 {
4739   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4740       && size > (unsigned int)ix86_section_threshold)
4741     fputs (".largecomm\t", file);
4742   else
4743     fputs (COMMON_ASM_OP, file);
4744   assemble_name (file, name);
4745   fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4746 	   size, align / BITS_PER_UNIT);
4747 }
4748 #endif
4749 
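/* Illustrative output (editorial note, not from the original sources): for a
   1 MiB common symbol "buf" aligned to 32 bytes under -mcmodel=medium, the
   function above emits

     .largecomm	buf,1048576,32

   whereas a symbol below the section threshold is emitted with the ordinary
   COMMON_ASM_OP (".comm").  */
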
4750 /* Utility function for targets to use in implementing
4751    ASM_OUTPUT_ALIGNED_BSS.  */
4752 
4753 void
4754 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4755 			const char *name, unsigned HOST_WIDE_INT size,
4756 			int align)
4757 {
4758   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4759       && size > (unsigned int)ix86_section_threshold)
4760     switch_to_section (get_named_section (decl, ".lbss", 0));
4761   else
4762     switch_to_section (bss_section);
4763   ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4764 #ifdef ASM_DECLARE_OBJECT_NAME
4765   last_assemble_variable_decl = decl;
4766   ASM_DECLARE_OBJECT_NAME (file, name, decl);
4767 #else
4768   /* The standard thing is just to output a label for the object.  */
4769   ASM_OUTPUT_LABEL (file, name);
4770 #endif /* ASM_DECLARE_OBJECT_NAME */
4771   ASM_OUTPUT_SKIP (file, size ? size : 1);
4772 }
4773 
4774 /* Decide whether we must probe the stack before any space allocation
4775    on this target.  It's essentially TARGET_STACK_PROBE except when
4776    -fstack-check causes the stack to be already probed differently.  */
4777 
4778 bool
4779 ix86_target_stack_probe (void)
4780 {
4781   /* Do not probe the stack twice if static stack checking is enabled.  */
4782   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4783     return false;
4784 
4785   return TARGET_STACK_PROBE;
4786 }
4787 
4788 /* Decide whether we can make a sibling call to a function.  DECL is the
4789    declaration of the function being targeted by the call and EXP is the
4790    CALL_EXPR representing the call.  */
4791 
4792 static bool
4793 ix86_function_ok_for_sibcall (tree decl, tree exp)
4794 {
4795   tree type, decl_or_type;
4796   rtx a, b;
4797 
4798   /* If we are generating position-independent code, we cannot sibcall
4799      optimize any indirect call, or a direct call to a global function,
4800      as the PLT requires %ebx be live. (Darwin does not have a PLT.)  */
4801   if (!TARGET_MACHO
4802       && !TARGET_64BIT
4803       && flag_pic
4804       && (!decl || !targetm.binds_local_p (decl)))
4805     return false;
4806 
4807   /* If we need to align the outgoing stack, then sibcalling would
4808      unalign the stack, which may break the called function.  */
4809   if (ix86_minimum_incoming_stack_boundary (true)
4810       < PREFERRED_STACK_BOUNDARY)
4811     return false;
4812 
4813   if (decl)
4814     {
4815       decl_or_type = decl;
4816       type = TREE_TYPE (decl);
4817     }
4818   else
4819     {
4820       /* We're looking at the CALL_EXPR, we need the type of the function.  */
4821       type = CALL_EXPR_FN (exp);		/* pointer expression */
4822       type = TREE_TYPE (type);			/* pointer type */
4823       type = TREE_TYPE (type);			/* function type */
4824       decl_or_type = type;
4825     }
4826 
4827   /* Check that the return value locations are the same.  For example,
4828      if we are returning floats on the 80387 register stack, we cannot
4829      make a sibcall from a function that doesn't return a float to a
4830      function that does or, conversely, from a function that does return
4831      a float to a function that doesn't; the necessary stack adjustment
4832      would not be executed.  This is also the place we notice
4833      differences in the return value ABI.  Note that it is ok for one
4834      of the functions to have void return type as long as the return
4835      value of the other is passed in a register.  */
4836   a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4837   b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4838 			   cfun->decl, false);
4839   if (STACK_REG_P (a) || STACK_REG_P (b))
4840     {
4841       if (!rtx_equal_p (a, b))
4842 	return false;
4843     }
4844   else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4845     {
4846       /* Disable sibcall if we need to generate vzeroupper after
4847 	 callee returns.  */
4848       if (TARGET_VZEROUPPER
4849 	  && cfun->machine->callee_return_avx256_p
4850 	  && !cfun->machine->caller_return_avx256_p)
4851 	return false;
4852     }
4853   else if (!rtx_equal_p (a, b))
4854     return false;
4855 
4856   if (TARGET_64BIT)
4857     {
4858       /* The SYSV ABI has more call-clobbered registers;
4859 	 disallow sibcalls from MS to SYSV.  */
4860       if (cfun->machine->call_abi == MS_ABI
4861 	  && ix86_function_type_abi (type) == SYSV_ABI)
4862 	return false;
4863     }
4864   else
4865     {
4866       /* If this call is indirect, we'll need to be able to use a
4867 	 call-clobbered register for the address of the target function.
4868 	 Make sure that all such registers are not used for passing
4869 	 parameters.  Note that DLLIMPORT functions are indirect.  */
4870       if (!decl
4871 	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4872 	{
4873 	  if (ix86_function_regparm (type, NULL) >= 3)
4874 	    {
4875 	      /* ??? Need to count the actual number of registers to be used,
4876 		 not the possible number of registers.  Fix later.  */
4877 	      return false;
4878 	    }
4879 	}
4880     }
4881 
4882   /* Otherwise okay.  That also includes certain types of indirect calls.  */
4883   return true;
4884 }
4885 
4886 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4887    and "sseregparm" calling convention attributes;
4888    arguments as in struct attribute_spec.handler.  */
4889 
4890 static tree
4891 ix86_handle_cconv_attribute (tree *node, tree name,
4892 				   tree args,
4893 				   int flags ATTRIBUTE_UNUSED,
4894 				   bool *no_add_attrs)
4895 {
4896   if (TREE_CODE (*node) != FUNCTION_TYPE
4897       && TREE_CODE (*node) != METHOD_TYPE
4898       && TREE_CODE (*node) != FIELD_DECL
4899       && TREE_CODE (*node) != TYPE_DECL)
4900     {
4901       warning (OPT_Wattributes, "%qE attribute only applies to functions",
4902 	       name);
4903       *no_add_attrs = true;
4904       return NULL_TREE;
4905     }
4906 
4907   /* Can combine regparm with all attributes but fastcall and thiscall.  */
4908   if (is_attribute_p ("regparm", name))
4909     {
4910       tree cst;
4911 
4912       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913         {
4914 	  error ("fastcall and regparm attributes are not compatible");
4915 	}
4916 
4917       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4918 	{
4919 	  error ("regparm and thiscall attributes are not compatible");
4920 	}
4921 
4922       cst = TREE_VALUE (args);
4923       if (TREE_CODE (cst) != INTEGER_CST)
4924 	{
4925 	  warning (OPT_Wattributes,
4926 		   "%qE attribute requires an integer constant argument",
4927 		   name);
4928 	  *no_add_attrs = true;
4929 	}
4930       else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4931 	{
4932 	  warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4933 		   name, REGPARM_MAX);
4934 	  *no_add_attrs = true;
4935 	}
4936 
4937       return NULL_TREE;
4938     }
4939 
4940   if (TARGET_64BIT)
4941     {
4942       /* Do not warn when emulating the MS ABI.  */
4943       if ((TREE_CODE (*node) != FUNCTION_TYPE
4944 	   && TREE_CODE (*node) != METHOD_TYPE)
4945 	  || ix86_function_type_abi (*node) != MS_ABI)
4946 	warning (OPT_Wattributes, "%qE attribute ignored",
4947 	         name);
4948       *no_add_attrs = true;
4949       return NULL_TREE;
4950     }
4951 
4952   /* Can combine fastcall with stdcall (redundant) and sseregparm.  */
4953   if (is_attribute_p ("fastcall", name))
4954     {
4955       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4956         {
4957 	  error ("fastcall and cdecl attributes are not compatible");
4958 	}
4959       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4960         {
4961 	  error ("fastcall and stdcall attributes are not compatible");
4962 	}
4963       if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4964         {
4965 	  error ("fastcall and regparm attributes are not compatible");
4966 	}
4967       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4968 	{
4969 	  error ("fastcall and thiscall attributes are not compatible");
4970 	}
4971     }
4972 
4973   /* Can combine stdcall with fastcall (redundant), regparm and
4974      sseregparm.  */
4975   else if (is_attribute_p ("stdcall", name))
4976     {
4977       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4978         {
4979 	  error ("stdcall and cdecl attributes are not compatible");
4980 	}
4981       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4982         {
4983 	  error ("stdcall and fastcall attributes are not compatible");
4984 	}
4985       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4986 	{
4987 	  error ("stdcall and thiscall attributes are not compatible");
4988 	}
4989     }
4990 
4991   /* Can combine cdecl with regparm and sseregparm.  */
4992   else if (is_attribute_p ("cdecl", name))
4993     {
4994       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4995         {
4996 	  error ("stdcall and cdecl attributes are not compatible");
4997 	}
4998       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4999         {
5000 	  error ("fastcall and cdecl attributes are not compatible");
5001 	}
5002       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5003 	{
5004 	  error ("cdecl and thiscall attributes are not compatible");
5005 	}
5006     }
5007   else if (is_attribute_p ("thiscall", name))
5008     {
5009       if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5010 	warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5011 	         name);
5012       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5013 	{
5014 	  error ("stdcall and thiscall attributes are not compatible");
5015 	}
5016       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5017 	{
5018 	  error ("fastcall and thiscall attributes are not compatible");
5019 	}
5020       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5021 	{
5022 	  error ("cdecl and thiscall attributes are not compatible");
5023 	}
5024     }
5025 
5026   /* Can combine sseregparm with all attributes.  */
5027 
5028   return NULL_TREE;
5029 }
5030 
5031 /* The transactional memory builtins are implicitly regparm or fastcall
5032    depending on the ABI.  Override the generic do-nothing attribute that
5033    these builtins were declared with, and replace it with one of the two
5034    attributes that we expect elsewhere.  */
5035 
5036 static tree
5037 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5038     				  tree args ATTRIBUTE_UNUSED,
5039 				  int flags ATTRIBUTE_UNUSED,
5040 				  bool *no_add_attrs)
5041 {
5042   tree alt;
5043 
5044   /* In no case do we want to add the placeholder attribute.  */
5045   *no_add_attrs = true;
5046 
5047   /* The 64-bit ABI is unchanged for transactional memory.  */
5048   if (TARGET_64BIT)
5049     return NULL_TREE;
5050   /* ??? Is there a better way to validate 32-bit Windows?  We have
5051   /* ??? Is there a better way to validate 32-bit windows?  We have
5052      cfun->machine->call_abi, but that seems to be set only for 64-bit.  */
5053   if (CHECK_STACK_LIMIT > 0)
5054     alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5055   else
5056     {
5057       alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5058       alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5059     }
5060   decl_attributes (node, alt, flags);
5061 
5062   return NULL_TREE;
5063 }
5064 
5065 /* This function determines from TYPE the calling-convention.  */
5066 
5067 unsigned int
5068 ix86_get_callcvt (const_tree type)
5069 {
5070   unsigned int ret = 0;
5071   bool is_stdarg;
5072   tree attrs;
5073 
5074   if (TARGET_64BIT)
5075     return IX86_CALLCVT_CDECL;
5076 
5077   attrs = TYPE_ATTRIBUTES (type);
5078   if (attrs != NULL_TREE)
5079     {
5080       if (lookup_attribute ("cdecl", attrs))
5081 	ret |= IX86_CALLCVT_CDECL;
5082       else if (lookup_attribute ("stdcall", attrs))
5083 	ret |= IX86_CALLCVT_STDCALL;
5084       else if (lookup_attribute ("fastcall", attrs))
5085 	ret |= IX86_CALLCVT_FASTCALL;
5086       else if (lookup_attribute ("thiscall", attrs))
5087 	ret |= IX86_CALLCVT_THISCALL;
5088 
5089       /* Regparm isn't allowed for thiscall and fastcall.  */
5090       if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5091 	{
5092 	  if (lookup_attribute ("regparm", attrs))
5093 	    ret |= IX86_CALLCVT_REGPARM;
5094 	  if (lookup_attribute ("sseregparm", attrs))
5095 	    ret |= IX86_CALLCVT_SSEREGPARM;
5096 	}
5097 
5098       if (IX86_BASE_CALLCVT(ret) != 0)
5099 	return ret;
5100     }
5101 
5102   is_stdarg = stdarg_p (type);
5103   if (TARGET_RTD && !is_stdarg)
5104     return IX86_CALLCVT_STDCALL | ret;
5105 
5106   if (ret != 0
5107       || is_stdarg
5108       || TREE_CODE (type) != METHOD_TYPE
5109       || ix86_function_type_abi (type) != MS_ABI)
5110     return IX86_CALLCVT_CDECL | ret;
5111 
5112   return IX86_CALLCVT_THISCALL;
5113 }
5114 
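/* Illustrative mapping (editorial addition) for 32-bit declarations:

     void f1 (int);				-> IX86_CALLCVT_CDECL
     void __attribute__ ((stdcall)) f2 (int);	-> IX86_CALLCVT_STDCALL
     void __attribute__ ((fastcall)) f3 (int);	-> IX86_CALLCVT_FASTCALL
     void __attribute__ ((regparm (3))) f4 (int);
				-> IX86_CALLCVT_CDECL | IX86_CALLCVT_REGPARM

   With -mrtd, a non-stdarg function without an explicit convention gets
   IX86_CALLCVT_STDCALL instead of IX86_CALLCVT_CDECL.  */
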
5115 /* Return 0 if the attributes for two types are incompatible, 1 if they
5116    are compatible, and 2 if they are nearly compatible (which causes a
5117    warning to be generated).  */
5118 
5119 static int
5120 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5121 {
5122   unsigned int ccvt1, ccvt2;
5123 
5124   if (TREE_CODE (type1) != FUNCTION_TYPE
5125       && TREE_CODE (type1) != METHOD_TYPE)
5126     return 1;
5127 
5128   ccvt1 = ix86_get_callcvt (type1);
5129   ccvt2 = ix86_get_callcvt (type2);
5130   if (ccvt1 != ccvt2)
5131     return 0;
5132   if (ix86_function_regparm (type1, NULL)
5133       != ix86_function_regparm (type2, NULL))
5134     return 0;
5135 
5136   return 1;
5137 }
5138 
5139 /* Return the regparm value for a function with the indicated TYPE and DECL.
5140    DECL may be NULL when calling function indirectly
5141    or considering a libcall.  */
5142 
5143 static int
5144 ix86_function_regparm (const_tree type, const_tree decl)
5145 {
5146   tree attr;
5147   int regparm;
5148   unsigned int ccvt;
5149 
5150   if (TARGET_64BIT)
5151     return (ix86_function_type_abi (type) == SYSV_ABI
5152 	    ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5153   ccvt = ix86_get_callcvt (type);
5154   regparm = ix86_regparm;
5155 
5156   if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5157     {
5158       attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5159       if (attr)
5160 	{
5161 	  regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5162 	  return regparm;
5163 	}
5164     }
5165   else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5166     return 2;
5167   else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5168     return 1;
5169 
5170   /* Use register calling convention for local functions when possible.  */
5171   if (decl
5172       && TREE_CODE (decl) == FUNCTION_DECL
5173       && optimize
5174       && !(profile_flag && !flag_fentry))
5175     {
5176       /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
5177       struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5178       if (i && i->local && i->can_change_signature)
5179 	{
5180 	  int local_regparm, globals = 0, regno;
5181 
5182 	  /* Make sure no regparm register is taken by a
5183 	     fixed register variable.  */
5184 	  for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5185 	    if (fixed_regs[local_regparm])
5186 	      break;
5187 
5188 	  /* We don't want to use regparm(3) for nested functions as
5189 	     these use a static chain pointer in the third argument.  */
5190 	  if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5191 	    local_regparm = 2;
5192 
5193 	  /* In 32-bit mode save a register for the split stack.  */
5194 	  if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5195 	    local_regparm = 2;
5196 
5197 	  /* Each fixed register usage increases register pressure,
5198 	     so fewer registers should be used for argument passing.
5199 	     This functionality can be overridden by an explicit
5200 	     regparm value.  */
5201 	  for (regno = 0; regno <= DI_REG; regno++)
5202 	    if (fixed_regs[regno])
5203 	      globals++;
5204 
5205 	  local_regparm
5206 	    = globals < local_regparm ? local_regparm - globals : 0;
5207 
5208 	  if (local_regparm > regparm)
5209 	    regparm = local_regparm;
5210 	}
5211     }
5212 
5213   return regparm;
5214 }
5215 
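/* For illustration (editorial): given

     void __attribute__ ((regparm (2))) f (int a, int b);

   the function above returns 2, so both arguments travel in registers
   (EAX, EDX).  fastcall likewise yields 2 (ECX, EDX) and thiscall yields 1
   (ECX); for plain local functions the loop above may raise the value as
   far as REGPARM_MAX when no fixed registers get in the way.  */
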
5216 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5217    DFmode (2) arguments in SSE registers for a function with the
5218    indicated TYPE and DECL.  DECL may be NULL when calling function
5219    indirectly or considering a libcall.  Otherwise return 0.  */
5220 
5221 static int
5222 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5223 {
5224   gcc_assert (!TARGET_64BIT);
5225 
5226   /* Use SSE registers to pass SFmode and DFmode arguments if requested
5227      by the sseregparm attribute.  */
5228   if (TARGET_SSEREGPARM
5229       || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5230     {
5231       if (!TARGET_SSE)
5232 	{
5233 	  if (warn)
5234 	    {
5235 	      if (decl)
5236 		error ("calling %qD with attribute sseregparm without "
5237 		       "SSE/SSE2 enabled", decl);
5238 	      else
5239 		error ("calling %qT with attribute sseregparm without "
5240 		       "SSE/SSE2 enabled", type);
5241 	    }
5242 	  return 0;
5243 	}
5244 
5245       return 2;
5246     }
5247 
5248   /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5249      (and DFmode for SSE2) arguments in SSE registers.  */
5250   if (decl && TARGET_SSE_MATH && optimize
5251       && !(profile_flag && !flag_fentry))
5252     {
5253       /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
5254       struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5255       if (i && i->local && i->can_change_signature)
5256 	return TARGET_SSE2 ? 2 : 1;
5257     }
5258 
5259   return 0;
5260 }
5261 
5262 /* Return true if EAX is live at the start of the function.  Used by
5263    ix86_expand_prologue to determine if we need special help before
5264    calling allocate_stack_worker.  */
5265 
5266 static bool
5267 ix86_eax_live_at_start_p (void)
5268 {
5269   /* Cheat.  Don't bother working forward from ix86_function_regparm
5270      to the function type to whether an actual argument is located in
5271      eax.  Instead just look at cfg info, which is still close enough
5272      to correct at this point.  This gives false positives for broken
5273      functions that might use uninitialized data that happens to be
5274      allocated in eax, but who cares?  */
5275   return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5276 }
5277 
5278 static bool
5279 ix86_keep_aggregate_return_pointer (tree fntype)
5280 {
5281   tree attr;
5282 
5283   if (!TARGET_64BIT)
5284     {
5285       attr = lookup_attribute ("callee_pop_aggregate_return",
5286 			       TYPE_ATTRIBUTES (fntype));
5287       if (attr)
5288 	return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5289 
5290       /* For 32-bit MS-ABI the default is to keep aggregate
5291          return pointer.  */
5292       if (ix86_function_type_abi (fntype) == MS_ABI)
5293 	return true;
5294     }
5295   return KEEP_AGGREGATE_RETURN_POINTER != 0;
5296 }
5297 
5298 /* Value is the number of bytes of arguments automatically
5299    popped when returning from a subroutine call.
5300    FUNDECL is the declaration node of the function (as a tree),
5301    FUNTYPE is the data type of the function (as a tree),
5302    or for a library call it is an identifier node for the subroutine name.
5303    SIZE is the number of bytes of arguments passed on the stack.
5304 
5305    On the 80386, the RTD insn may be used to pop them if the number
5306      of args is fixed, but if the number is variable then the caller
5307      must pop them all.  RTD can't be used for library calls now
5308      because the library is compiled with the Unix compiler.
5309    Use of RTD is a selectable option, since it is incompatible with
5310    standard Unix calling sequences.  If the option is not selected,
5311    the caller must always pop the args.
5312 
5313    The attribute stdcall is equivalent to RTD on a per module basis.  */
5314 
5315 static int
5316 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5317 {
5318   unsigned int ccvt;
5319 
5320   /* None of the 64-bit ABIs pop arguments.  */
5321   if (TARGET_64BIT)
5322     return 0;
5323 
5324   ccvt = ix86_get_callcvt (funtype);
5325 
5326   if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5327 	       | IX86_CALLCVT_THISCALL)) != 0
5328       && ! stdarg_p (funtype))
5329     return size;
5330 
5331   /* Lose any fake structure return argument if it is passed on the stack.  */
5332   if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5333       && !ix86_keep_aggregate_return_pointer (funtype))
5334     {
5335       int nregs = ix86_function_regparm (funtype, fundecl);
5336       if (nregs == 0)
5337 	return GET_MODE_SIZE (Pmode);
5338     }
5339 
5340   return 0;
5341 }
5342 
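/* Worked example (editorial, not from the original sources): for

     void __attribute__ ((stdcall)) f (int a, int b);

   SIZE is 8 on ia32, so the function above returns 8 and the callee pops
   its own arguments (a "ret $8").  A plain cdecl function returns 0 and
   leaves the cleanup to the caller.  */
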
5343 /* Argument support functions.  */
5344 
5345 /* Return true when register may be used to pass function parameters.  */
5346 bool
5347 ix86_function_arg_regno_p (int regno)
5348 {
5349   int i;
5350   const int *parm_regs;
5351 
5352   if (!TARGET_64BIT)
5353     {
5354       if (TARGET_MACHO)
5355         return (regno < REGPARM_MAX
5356                 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5357       else
5358         return (regno < REGPARM_MAX
5359 	        || (TARGET_MMX && MMX_REGNO_P (regno)
5360 	  	    && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5361 	        || (TARGET_SSE && SSE_REGNO_P (regno)
5362 		    && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5363     }
5364 
5365   if (TARGET_MACHO)
5366     {
5367       if (SSE_REGNO_P (regno) && TARGET_SSE)
5368         return true;
5369     }
5370   else
5371     {
5372       if (TARGET_SSE && SSE_REGNO_P (regno)
5373           && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5374         return true;
5375     }
5376 
5377   /* TODO: The function should depend on the current function's ABI, but
5378      builtins.c would need updating then.  Therefore we use the
5379      default ABI.  */
5380 
5381   /* RAX is used as hidden argument to va_arg functions.  */
5382   if (ix86_abi == SYSV_ABI && regno == AX_REG)
5383     return true;
5384 
5385   if (ix86_abi == MS_ABI)
5386     parm_regs = x86_64_ms_abi_int_parameter_registers;
5387   else
5388     parm_regs = x86_64_int_parameter_registers;
5389   for (i = 0; i < (ix86_abi == MS_ABI
5390 		   ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5391     if (regno == parm_regs[i])
5392       return true;
5393   return false;
5394 }
5395 
5396 /* Return if we do not know how to pass TYPE solely in registers.  */
5397 
5398 static bool
5399 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5400 {
5401   if (must_pass_in_stack_var_size_or_pad (mode, type))
5402     return true;
5403 
5404   /* For 32-bit, we want TImode aggregates to go on the stack.  But watch out!
5405      The layout_type routine is crafty and tries to trick us into passing
5406      currently unsupported vector types on the stack by using TImode.  */
5407   return (!TARGET_64BIT && mode == TImode
5408 	  && type && TREE_CODE (type) != VECTOR_TYPE);
5409 }
5410 
5411 /* Return the size, in bytes, of the area reserved for arguments passed
5412    in registers for the function represented by FNDECL, depending on the
5413    ABI used.  */
5414 int
5415 ix86_reg_parm_stack_space (const_tree fndecl)
5416 {
5417   enum calling_abi call_abi = SYSV_ABI;
5418   if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5419     call_abi = ix86_function_abi (fndecl);
5420   else
5421     call_abi = ix86_function_type_abi (fndecl);
5422   if (TARGET_64BIT && call_abi == MS_ABI)
5423     return 32;
5424   return 0;
5425 }
5426 
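/* Editorial note: the 32 bytes returned above for the 64-bit MS ABI are the
   "shadow space" the caller must reserve for the four register parameters,
   e.g. a call such as

     extern int __attribute__ ((ms_abi)) f (int, int);
     int g (void) { return f (1, 2); }

   still allocates 32 bytes of outgoing stack even though both arguments are
   passed in registers.  */
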
5427 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5428    calling ABI used.  */
5429 enum calling_abi
5430 ix86_function_type_abi (const_tree fntype)
5431 {
5432   if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5433     {
5434       enum calling_abi abi = ix86_abi;
5435       if (abi == SYSV_ABI)
5436 	{
5437 	  if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5438 	    abi = MS_ABI;
5439 	}
5440       else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5441 	abi = SYSV_ABI;
5442       return abi;
5443     }
5444   return ix86_abi;
5445 }
5446 
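/* Illustration (editorial): on a SYSV-default target,

     void __attribute__ ((ms_abi)) f (void);	-> MS_ABI
     void g (void);				-> SYSV_ABI (ix86_abi)

   and symmetrically the "sysv_abi" attribute overrides an MS_ABI default.  */
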
5447 static bool
5448 ix86_function_ms_hook_prologue (const_tree fn)
5449 {
5450   if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5451     {
5452       if (decl_function_context (fn) != NULL_TREE)
5453 	error_at (DECL_SOURCE_LOCATION (fn),
5454 		  "ms_hook_prologue is not compatible with nested function");
5455       else
5456         return true;
5457     }
5458   return false;
5459 }
5460 
5461 static enum calling_abi
5462 ix86_function_abi (const_tree fndecl)
5463 {
5464   if (! fndecl)
5465     return ix86_abi;
5466   return ix86_function_type_abi (TREE_TYPE (fndecl));
5467 }
5468 
5469 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5470    calling ABI used.  */
5471 enum calling_abi
5472 ix86_cfun_abi (void)
5473 {
5474   if (! cfun)
5475     return ix86_abi;
5476   return cfun->machine->call_abi;
5477 }
5478 
5479 /* Write the extra assembler code needed to declare a function properly.  */
5480 
5481 void
5482 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5483 				tree decl)
5484 {
5485   bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5486 
5487   if (is_ms_hook)
5488     {
5489       int i, filler_count = (TARGET_64BIT ? 32 : 16);
5490       unsigned int filler_cc = 0xcccccccc;
5491 
5492       for (i = 0; i < filler_count; i += 4)
5493         fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5494     }
5495 
5496 #ifdef SUBTARGET_ASM_UNWIND_INIT
5497   SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5498 #endif
5499 
5500   ASM_OUTPUT_LABEL (asm_out_file, fname);
5501 
5502   /* Output magic byte marker, if hot-patch attribute is set.  */
5503   if (is_ms_hook)
5504     {
5505       if (TARGET_64BIT)
5506 	{
5507 	  /* leaq [%rsp + 0], %rsp  */
5508 	  asm_fprintf (asm_out_file, ASM_BYTE
5509 		       "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5510 	}
5511       else
5512 	{
5513           /* movl.s %edi, %edi
5514 	     push   %ebp
5515 	     movl.s %esp, %ebp */
5516 	  asm_fprintf (asm_out_file, ASM_BYTE
5517 		       "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5518 	}
5519     }
5520 }
5521 
5522 /* regclass.c  */
5523 extern void init_regs (void);
5524 
5525 /* Implementation of the call ABI switching target hook.  The call
5526    register sets specific to FNDECL are selected.  See also
5527    ix86_conditional_register_usage for more details.  */
5528 void
5529 ix86_call_abi_override (const_tree fndecl)
5530 {
5531   if (fndecl == NULL_TREE)
5532     cfun->machine->call_abi = ix86_abi;
5533   else
5534     cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5535 }
5536 
5537 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5538    Avoid the expensive re-initialization of init_regs each time we switch
5539    function context, since it is needed only during RTL expansion.  */
5540 static void
5541 ix86_maybe_switch_abi (void)
5542 {
5543   if (TARGET_64BIT
5544       && call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5545     reinit_regs ();
5546 }
5547 
5548 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5549    for a call to a function whose data type is FNTYPE.
5550    For a library call, FNTYPE is 0.  */
5551 
5552 void
5553 init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
5554 		      tree fntype,	/* tree ptr for function decl */
5555 		      rtx libname,	/* SYMBOL_REF of library name or 0 */
5556 		      tree fndecl,
5557 		      int caller)
5558 {
5559   struct cgraph_local_info *i;
5560   tree fnret_type;
5561 
5562   memset (cum, 0, sizeof (*cum));
5563 
5564   /* Initialize for the current callee.  */
5565   if (caller)
5566     {
5567       cfun->machine->callee_pass_avx256_p = false;
5568       cfun->machine->callee_return_avx256_p = false;
5569     }
5570 
5571   if (fndecl)
5572     {
5573       i = cgraph_local_info (fndecl);
5574       cum->call_abi = ix86_function_abi (fndecl);
5575       fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5576     }
5577   else
5578     {
5579       i = NULL;
5580       cum->call_abi = ix86_function_type_abi (fntype);
5581       if (fntype)
5582 	fnret_type = TREE_TYPE (fntype);
5583       else
5584 	fnret_type = NULL;
5585     }
5586 
5587   if (TARGET_VZEROUPPER && fnret_type)
5588     {
5589       rtx fnret_value = ix86_function_value (fnret_type, fntype,
5590 					     false);
5591       if (function_pass_avx256_p (fnret_value))
5592 	{
5593 	  /* The return value of this function uses 256bit AVX modes.  */
5594 	  if (caller)
5595 	    {
5596 	      cfun->machine->callee_return_avx256_p = true;
5597 	      cum->callee_return_avx256_p = true;
5598 	    }
5599 	  else
5600 	    cfun->machine->caller_return_avx256_p = true;
5601 	}
5602     }
5603 
5604   cum->caller = caller;
5605 
5606   /* Set up the number of registers to use for passing arguments.  */
5607 
5608   if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5609     sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5610 	   "or subtarget optimization implying it");
5611   cum->nregs = ix86_regparm;
5612   if (TARGET_64BIT)
5613     {
5614       cum->nregs = (cum->call_abi == SYSV_ABI
5615                    ? X86_64_REGPARM_MAX
5616                    : X86_64_MS_REGPARM_MAX);
5617     }
5618   if (TARGET_SSE)
5619     {
5620       cum->sse_nregs = SSE_REGPARM_MAX;
5621       if (TARGET_64BIT)
5622         {
5623           cum->sse_nregs = (cum->call_abi == SYSV_ABI
5624                            ? X86_64_SSE_REGPARM_MAX
5625                            : X86_64_MS_SSE_REGPARM_MAX);
5626         }
5627     }
5628   if (TARGET_MMX)
5629     cum->mmx_nregs = MMX_REGPARM_MAX;
5630   cum->warn_avx = true;
5631   cum->warn_sse = true;
5632   cum->warn_mmx = true;
5633 
5634   /* Because the type might mismatch between caller and callee, we need to
5635      use the actual type of the function for local calls.
5636      FIXME: cgraph_analyze can be told to actually record if a function uses
5637      va_start, so for local functions maybe_vaarg can be made more
5638      aggressive, helping K&R code.
5639      FIXME: once the type system is fixed, we won't need this code anymore.  */
5640   if (i && i->local && i->can_change_signature)
5641     fntype = TREE_TYPE (fndecl);
5642   cum->maybe_vaarg = (fntype
5643 		      ? (!prototype_p (fntype) || stdarg_p (fntype))
5644 		      : !libname);
5645 
5646   if (!TARGET_64BIT)
5647     {
5648       /* If there are variable arguments, then we won't pass anything
5649          in registers in 32-bit mode. */
5650       if (stdarg_p (fntype))
5651 	{
5652 	  cum->nregs = 0;
5653 	  cum->sse_nregs = 0;
5654 	  cum->mmx_nregs = 0;
5655 	  cum->warn_avx = 0;
5656 	  cum->warn_sse = 0;
5657 	  cum->warn_mmx = 0;
5658 	  return;
5659 	}
5660 
5661       /* Use ecx and edx registers if function has fastcall attribute,
5662 	 else look for regparm information.  */
5663       if (fntype)
5664 	{
5665 	  unsigned int ccvt = ix86_get_callcvt (fntype);
5666 	  if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5667 	    {
5668 	      cum->nregs = 1;
5669 	      cum->fastcall = 1; /* Same first register as in fastcall.  */
5670 	    }
5671 	  else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5672 	    {
5673 	      cum->nregs = 2;
5674 	      cum->fastcall = 1;
5675 	    }
5676 	  else
5677 	    cum->nregs = ix86_function_regparm (fntype, fndecl);
5678 	}
5679 
5680       /* Set up the number of SSE registers used for passing SFmode
5681 	 and DFmode arguments.  Warn for mismatching ABI.  */
5682       cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5683     }
5684 }
5685 
5686 /* Return the "natural" mode for TYPE.  In most cases, this is just TYPE_MODE.
5687    But in the case of vector types, it is some vector mode.
5688 
5689    When we have only some of our vector isa extensions enabled, then there
5690    are some modes for which vector_mode_supported_p is false.  For these
5691    modes, the generic vector support in gcc will choose some non-vector mode
5692    in order to implement the type.  By computing the natural mode, we'll
5693    select the proper ABI location for the operand and not depend on whatever
5694    the middle-end decides to do with these vector types.
5695 
5696    The middle-end can't deal with vector types > 16 bytes.  In this
5697    case, we return the original mode and warn about the ABI change if
5698    CUM isn't NULL.  */
5699 
5700 static enum machine_mode
5701 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5702 {
5703   enum machine_mode mode = TYPE_MODE (type);
5704 
5705   if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5706     {
5707       HOST_WIDE_INT size = int_size_in_bytes (type);
5708       if ((size == 8 || size == 16 || size == 32)
5709 	  /* ??? Generic code allows us to create width 1 vectors.  Ignore.  */
5710 	  && TYPE_VECTOR_SUBPARTS (type) > 1)
5711 	{
5712 	  enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5713 
5714 	  if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5715 	    mode = MIN_MODE_VECTOR_FLOAT;
5716 	  else
5717 	    mode = MIN_MODE_VECTOR_INT;
5718 
5719 	  /* Get the mode which has this inner mode and number of units.  */
5720 	  for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5721 	    if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5722 		&& GET_MODE_INNER (mode) == innermode)
5723 	      {
5724 		if (size == 32 && !TARGET_AVX)
5725 		  {
5726 		    static bool warnedavx;
5727 
5728 		    if (cum
5729 			&& !warnedavx
5730 			&& cum->warn_avx)
5731 		      {
5732 			warnedavx = true;
5733 			warning (0, "AVX vector argument without AVX "
5734 				 "enabled changes the ABI");
5735 		      }
5736 		    return TYPE_MODE (type);
5737 		  }
5738 		else
5739 		  return mode;
5740 	      }
5741 
5742 	  gcc_unreachable ();
5743 	}
5744     }
5745 
5746   return mode;
5747 }
5748 
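/* Example (editorial, not from the original sources): for

     typedef float v8sf __attribute__ ((vector_size (32)));

   the loop above finds V8SFmode.  With -mavx that mode is returned, so the
   argument is passed in a single 256-bit register; without -mavx the
   original TYPE_MODE is returned and a one-time "AVX vector argument
   without AVX enabled changes the ABI" warning may be issued.  */
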
5749 /* We want to pass a value in REGNO whose "natural" mode is MODE.  However,
5750    this may not agree with the mode that the type system has chosen for the
5751    register, which is ORIG_MODE.  If ORIG_MODE is not BLKmode, then we can
5752    go ahead and use it.  Otherwise we have to build a PARALLEL instead.  */
5753 
5754 static rtx
5755 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5756 		     unsigned int regno)
5757 {
5758   rtx tmp;
5759 
5760   if (orig_mode != BLKmode)
5761     tmp = gen_rtx_REG (orig_mode, regno);
5762   else
5763     {
5764       tmp = gen_rtx_REG (mode, regno);
5765       tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5766       tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5767     }
5768 
5769   return tmp;
5770 }
5771 
5772 /* x86-64 register passing implementation.  See x86-64 ABI for details.  Goal
5773    of this code is to classify each 8bytes of incoming argument by the register
5774    class and assign registers accordingly.  */
5775 
5776 /* Return the union class of CLASS1 and CLASS2.
5777    See the x86-64 PS ABI for details.  */
5778 
5779 static enum x86_64_reg_class
5780 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5781 {
5782   /* Rule #1: If both classes are equal, this is the resulting class.  */
5783   if (class1 == class2)
5784     return class1;
5785 
5786   /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5787      the other class.  */
5788   if (class1 == X86_64_NO_CLASS)
5789     return class2;
5790   if (class2 == X86_64_NO_CLASS)
5791     return class1;
5792 
5793   /* Rule #3: If one of the classes is MEMORY, the result is MEMORY.  */
5794   if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5795     return X86_64_MEMORY_CLASS;
5796 
5797   /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
5798   if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5799       || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5800     return X86_64_INTEGERSI_CLASS;
5801   if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5802       || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5803     return X86_64_INTEGER_CLASS;
5804 
5805   /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5806      MEMORY is used.  */
5807   if (class1 == X86_64_X87_CLASS
5808       || class1 == X86_64_X87UP_CLASS
5809       || class1 == X86_64_COMPLEX_X87_CLASS
5810       || class2 == X86_64_X87_CLASS
5811       || class2 == X86_64_X87UP_CLASS
5812       || class2 == X86_64_COMPLEX_X87_CLASS)
5813     return X86_64_MEMORY_CLASS;
5814 
5815   /* Rule #6: Otherwise class SSE is used.  */
5816   return X86_64_SSE_CLASS;
5817 }
5818 
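/* Worked example (editorial): for

     struct s { int i; float f; };

   the single 8-byte chunk sees an INTEGERSI field and an SSESF field;
   rule #4 above merges them into X86_64_INTEGERSI_CLASS, so the whole
   structure is passed in a general-purpose register.  */
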
5819 /* Classify the argument of type TYPE and mode MODE.
5820    CLASSES will be filled by the register class used to pass each word
5821    of the operand.  The number of words is returned.  If the parameter
5822    should be passed in memory, 0 is returned.  As a special case for
5823    zero-sized containers, classes[0] will be NO_CLASS and 1 is returned.
5824 
5825    BIT_OFFSET is used internally for handling records and specifies the
5826    offset in bits modulo 256 to avoid overflow cases.
5827 
5828    See the x86-64 PS ABI for details.  */
5830 
5831 static int
5832 classify_argument (enum machine_mode mode, const_tree type,
5833 		   enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5834 {
5835   HOST_WIDE_INT bytes =
5836     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5837   int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5838 
5839   /* Variable sized entities are always passed/returned in memory.  */
5840   if (bytes < 0)
5841     return 0;
5842 
5843   if (mode != VOIDmode
5844       && targetm.calls.must_pass_in_stack (mode, type))
5845     return 0;
5846 
5847   if (type && AGGREGATE_TYPE_P (type))
5848     {
5849       int i;
5850       tree field;
5851       enum x86_64_reg_class subclasses[MAX_CLASSES];
5852 
5853       /* On x86-64 we pass structures larger than 32 bytes on the stack.  */
5854       if (bytes > 32)
5855 	return 0;
5856 
5857       for (i = 0; i < words; i++)
5858 	classes[i] = X86_64_NO_CLASS;
5859 
5860       /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
5861 	 signal the memory class, so handle it as a special case.  */
5862       if (!words)
5863 	{
5864 	  classes[0] = X86_64_NO_CLASS;
5865 	  return 1;
5866 	}
5867 
5868       /* Classify each field of record and merge classes.  */
5869       switch (TREE_CODE (type))
5870 	{
5871 	case RECORD_TYPE:
5872 	  /* And now merge the fields of structure.  */
5873 	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5874 	    {
5875 	      if (TREE_CODE (field) == FIELD_DECL)
5876 		{
5877 		  int num;
5878 
5879 		  if (TREE_TYPE (field) == error_mark_node)
5880 		    continue;
5881 
5882 		  /* Bitfields are always classified as integer.  Handle them
5883 		     early, since later code would consider them to be
5884 		     misaligned integers.  */
5885 		  if (DECL_BIT_FIELD (field))
5886 		    {
5887 		      for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5888 			   i < ((int_bit_position (field) + (bit_offset % 64))
5889 			        + tree_low_cst (DECL_SIZE (field), 0)
5890 				+ 63) / 8 / 8; i++)
5891 			classes[i] =
5892 			  merge_classes (X86_64_INTEGER_CLASS,
5893 					 classes[i]);
5894 		    }
5895 		  else
5896 		    {
5897 		      int pos;
5898 
5899 		      type = TREE_TYPE (field);
5900 
5901 		      /* Flexible array member is ignored.  */
5902 		      if (TYPE_MODE (type) == BLKmode
5903 			  && TREE_CODE (type) == ARRAY_TYPE
5904 			  && TYPE_SIZE (type) == NULL_TREE
5905 			  && TYPE_DOMAIN (type) != NULL_TREE
5906 			  && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5907 			      == NULL_TREE))
5908 			{
5909 			  static bool warned;
5910 
5911 			  if (!warned && warn_psabi)
5912 			    {
5913 			      warned = true;
5914 			      inform (input_location,
5915 				      "the ABI of passing struct with"
5916 				      " a flexible array member has"
5917 				      " changed in GCC 4.4");
5918 			    }
5919 			  continue;
5920 			}
5921 		      num = classify_argument (TYPE_MODE (type), type,
5922 					       subclasses,
5923 					       (int_bit_position (field)
5924 						+ bit_offset) % 256);
5925 		      if (!num)
5926 			return 0;
5927 		      pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5928 		      for (i = 0; i < num && (i + pos) < words; i++)
5929 			classes[i + pos] =
5930 			  merge_classes (subclasses[i], classes[i + pos]);
5931 		    }
5932 		}
5933 	    }
5934 	  break;
5935 
5936 	case ARRAY_TYPE:
5937 	  /* Arrays are handled as small records.  */
5938 	  {
5939 	    int num;
5940 	    num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5941 				     TREE_TYPE (type), subclasses, bit_offset);
5942 	    if (!num)
5943 	      return 0;
5944 
5945 	    /* The partial classes are now full classes.  */
5946 	    if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5947 	      subclasses[0] = X86_64_SSE_CLASS;
5948 	    if (subclasses[0] == X86_64_INTEGERSI_CLASS
5949 		&& !((bit_offset % 64) == 0 && bytes == 4))
5950 	      subclasses[0] = X86_64_INTEGER_CLASS;
5951 
5952 	    for (i = 0; i < words; i++)
5953 	      classes[i] = subclasses[i % num];
5954 
5955 	    break;
5956 	  }
5957 	case UNION_TYPE:
5958 	case QUAL_UNION_TYPE:
5959 	  /* Unions are similar to RECORD_TYPE but offset is
5960 	     always 0.  */
5961 	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5962 	    {
5963 	      if (TREE_CODE (field) == FIELD_DECL)
5964 		{
5965 		  int num;
5966 
5967 		  if (TREE_TYPE (field) == error_mark_node)
5968 		    continue;
5969 
5970 		  num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5971 					   TREE_TYPE (field), subclasses,
5972 					   bit_offset);
5973 		  if (!num)
5974 		    return 0;
5975 		  for (i = 0; i < num; i++)
5976 		    classes[i] = merge_classes (subclasses[i], classes[i]);
5977 		}
5978 	    }
5979 	  break;
5980 
5981 	default:
5982 	  gcc_unreachable ();
5983 	}
5984 
5985       if (words > 2)
5986 	{
5987 	  /* When the size exceeds 16 bytes, the argument is passed
5988 	     in memory unless the first class is X86_64_SSE_CLASS
5989 	     and all of the remaining ones are
5990 	     X86_64_SSEUP_CLASS.  */
5991 	  if (classes[0] != X86_64_SSE_CLASS)
5992 	      return 0;
5993 
5994 	  for (i = 1; i < words; i++)
5995 	    if (classes[i] != X86_64_SSEUP_CLASS)
5996 	      return 0;
5997 	}
5998 
5999       /* Final merger cleanup.  */
6000       for (i = 0; i < words; i++)
6001 	{
6002 	  /* If one class is MEMORY, everything should be passed in
6003 	     memory.  */
6004 	  if (classes[i] == X86_64_MEMORY_CLASS)
6005 	    return 0;
6006 
6007 	  /* X86_64_SSEUP_CLASS should always be preceded by
6008 	     X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
6009 	  if (classes[i] == X86_64_SSEUP_CLASS
6010 	      && classes[i - 1] != X86_64_SSE_CLASS
6011 	      && classes[i - 1] != X86_64_SSEUP_CLASS)
6012 	    {
6013 	      /* The first one should never be X86_64_SSEUP_CLASS.  */
6014 	      gcc_assert (i != 0);
6015 	      classes[i] = X86_64_SSE_CLASS;
6016 	    }
6017 
6018 	  /*  If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6019 	       everything should be passed in memory.  */
6020 	  if (classes[i] == X86_64_X87UP_CLASS
6021 	      && (classes[i - 1] != X86_64_X87_CLASS))
6022 	    {
6023 	      static bool warned;
6024 
6025 	      /* The first one should never be X86_64_X87UP_CLASS.  */
6026 	      gcc_assert (i != 0);
6027 	      if (!warned && warn_psabi)
6028 		{
6029 		  warned = true;
6030 		  inform (input_location,
6031 			  "the ABI of passing union with long double"
6032 			  " has changed in GCC 4.4");
6033 		}
6034 	      return 0;
6035 	    }
6036 	}
6037       return words;
6038     }
6039 
6040   /* Compute the alignment needed.  We align all types to their natural
6041      boundaries, with the exception of XFmode, which is aligned to 64 bits.  */
6042   if (mode != VOIDmode && mode != BLKmode)
6043     {
6044       int mode_alignment = GET_MODE_BITSIZE (mode);
6045 
6046       if (mode == XFmode)
6047 	mode_alignment = 128;
6048       else if (mode == XCmode)
6049 	mode_alignment = 256;
6050       if (COMPLEX_MODE_P (mode))
6051 	mode_alignment /= 2;
6052       /* Misaligned fields are always returned in memory.  */
6053       if (bit_offset % mode_alignment)
6054 	return 0;
6055     }
6056 
6057   /* For V1xx modes, just use the base mode.  */
6058   if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6059       && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6060     mode = GET_MODE_INNER (mode);
6061 
6062   /* Classification of atomic types.  */
6063   switch (mode)
6064     {
6065     case SDmode:
6066     case DDmode:
6067       classes[0] = X86_64_SSE_CLASS;
6068       return 1;
6069     case TDmode:
6070       classes[0] = X86_64_SSE_CLASS;
6071       classes[1] = X86_64_SSEUP_CLASS;
6072       return 2;
6073     case DImode:
6074     case SImode:
6075     case HImode:
6076     case QImode:
6077     case CSImode:
6078     case CHImode:
6079     case CQImode:
6080       {
6081 	int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6082 
6083 	if (size <= 32)
6084 	  {
6085 	    classes[0] = X86_64_INTEGERSI_CLASS;
6086 	    return 1;
6087 	  }
6088 	else if (size <= 64)
6089 	  {
6090 	    classes[0] = X86_64_INTEGER_CLASS;
6091 	    return 1;
6092 	  }
6093 	else if (size <= 64+32)
6094 	  {
6095 	    classes[0] = X86_64_INTEGER_CLASS;
6096 	    classes[1] = X86_64_INTEGERSI_CLASS;
6097 	    return 2;
6098 	  }
6099 	else if (size <= 64+64)
6100 	  {
6101 	    classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6102 	    return 2;
6103 	  }
6104 	else
6105 	  gcc_unreachable ();
6106       }
6107     case CDImode:
6108     case TImode:
6109       classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6110       return 2;
6111     case COImode:
6112     case OImode:
6113       /* OImode shouldn't be used directly.  */
6114       gcc_unreachable ();
6115     case CTImode:
6116       return 0;
6117     case SFmode:
6118       if (!(bit_offset % 64))
6119 	classes[0] = X86_64_SSESF_CLASS;
6120       else
6121 	classes[0] = X86_64_SSE_CLASS;
6122       return 1;
6123     case DFmode:
6124       classes[0] = X86_64_SSEDF_CLASS;
6125       return 1;
6126     case XFmode:
6127       classes[0] = X86_64_X87_CLASS;
6128       classes[1] = X86_64_X87UP_CLASS;
6129       return 2;
6130     case TFmode:
6131       classes[0] = X86_64_SSE_CLASS;
6132       classes[1] = X86_64_SSEUP_CLASS;
6133       return 2;
6134     case SCmode:
6135       classes[0] = X86_64_SSE_CLASS;
6136       if (!(bit_offset % 64))
6137 	return 1;
6138       else
6139 	{
6140 	  static bool warned;
6141 
6142 	  if (!warned && warn_psabi)
6143 	    {
6144 	      warned = true;
6145 	      inform (input_location,
6146 		      "the ABI of passing structure with complex float"
6147 		      " member has changed in GCC 4.4");
6148 	    }
6149 	  classes[1] = X86_64_SSESF_CLASS;
6150 	  return 2;
6151 	}
6152     case DCmode:
6153       classes[0] = X86_64_SSEDF_CLASS;
6154       classes[1] = X86_64_SSEDF_CLASS;
6155       return 2;
6156     case XCmode:
6157       classes[0] = X86_64_COMPLEX_X87_CLASS;
6158       return 1;
6159     case TCmode:
6160       /* This mode is larger than 16 bytes.  */
6161       return 0;
6162     case V8SFmode:
6163     case V8SImode:
6164     case V32QImode:
6165     case V16HImode:
6166     case V4DFmode:
6167     case V4DImode:
6168       classes[0] = X86_64_SSE_CLASS;
6169       classes[1] = X86_64_SSEUP_CLASS;
6170       classes[2] = X86_64_SSEUP_CLASS;
6171       classes[3] = X86_64_SSEUP_CLASS;
6172       return 4;
6173     case V4SFmode:
6174     case V4SImode:
6175     case V16QImode:
6176     case V8HImode:
6177     case V2DFmode:
6178     case V2DImode:
6179       classes[0] = X86_64_SSE_CLASS;
6180       classes[1] = X86_64_SSEUP_CLASS;
6181       return 2;
6182     case V1TImode:
6183     case V1DImode:
6184     case V2SFmode:
6185     case V2SImode:
6186     case V4HImode:
6187     case V8QImode:
6188       classes[0] = X86_64_SSE_CLASS;
6189       return 1;
6190     case BLKmode:
6191     case VOIDmode:
6192       return 0;
6193     default:
6194       gcc_assert (VECTOR_MODE_P (mode));
6195 
6196       if (bytes > 16)
6197 	return 0;
6198 
6199       gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6200 
6201       if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6202 	classes[0] = X86_64_INTEGERSI_CLASS;
6203       else
6204 	classes[0] = X86_64_INTEGER_CLASS;
6205       classes[1] = X86_64_INTEGER_CLASS;
6206       return 1 + (bytes > 8);
6207     }
6208 }
6209 
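/* Editorial sketch (not part of the original source, compiled out): how
   the classification above plays out for a caller on x86-64 System V
   targets.  The first struct spans two eightbytes classified as
   {SSE, INTEGER}, so a call passes the double in %xmm0 and the long in
   %rdi; the second exceeds 16 bytes without forming an SSE/SSEUP run and
   is therefore passed in memory.  The names are hypothetical.  */
#if 0
struct sse_int_pair { double d; long l; };   /* 16 bytes: {SSE, INTEGER}   */
struct three_words  { long a, b, c; };       /* 24 bytes: passed in memory */
extern void callee (struct sse_int_pair p, struct three_words q);
#endif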
6210 /* Examine the argument and set the number of registers required in each
6211    class.  Return 0 iff the parameter should be passed in memory.  */
6212 static int
6213 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6214 		  int *int_nregs, int *sse_nregs)
6215 {
6216   enum x86_64_reg_class regclass[MAX_CLASSES];
6217   int n = classify_argument (mode, type, regclass, 0);
6218 
6219   *int_nregs = 0;
6220   *sse_nregs = 0;
6221   if (!n)
6222     return 0;
6223   for (n--; n >= 0; n--)
6224     switch (regclass[n])
6225       {
6226       case X86_64_INTEGER_CLASS:
6227       case X86_64_INTEGERSI_CLASS:
6228 	(*int_nregs)++;
6229 	break;
6230       case X86_64_SSE_CLASS:
6231       case X86_64_SSESF_CLASS:
6232       case X86_64_SSEDF_CLASS:
6233 	(*sse_nregs)++;
6234 	break;
6235       case X86_64_NO_CLASS:
6236       case X86_64_SSEUP_CLASS:
6237 	break;
6238       case X86_64_X87_CLASS:
6239       case X86_64_X87UP_CLASS:
6240 	if (!in_return)
6241 	  return 0;
6242 	break;
6243       case X86_64_COMPLEX_X87_CLASS:
6244 	return in_return ? 2 : 0;
6245       case X86_64_MEMORY_CLASS:
6246 	gcc_unreachable ();
6247       }
6248   return 1;
6249 }
6250 
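/* Editorial sketch (assumption, compiled out): what examine_argument
   reports for two common cases.  PAIR_TYPE stands for a hypothetical
   struct { double d; long l; } tree node.  */
#if 0
  {
    int int_nregs, sse_nregs;

    /* One GPR plus one SSE register; returns nonzero.  */
    examine_argument (BLKmode, pair_type, /*in_return=*/0,
		      &int_nregs, &sse_nregs);

    /* long double is classified X87/X87UP: as an argument (in_return == 0)
       this returns 0 and the value goes to memory, while as a return value
       it is allowed and comes back in %st(0).  */
    examine_argument (XFmode, long_double_type_node, /*in_return=*/0,
		      &int_nregs, &sse_nregs);
  }
#endif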
6251 /* Construct container for the argument used by GCC interface.  See
6252    FUNCTION_ARG for the detailed description.  */
6253 
6254 static rtx
6255 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6256 		     const_tree type, int in_return, int nintregs, int nsseregs,
6257 		     const int *intreg, int sse_regno)
6258 {
6259   /* The following variables hold the static issued_error state.  */
6260   static bool issued_sse_arg_error;
6261   static bool issued_sse_ret_error;
6262   static bool issued_x87_ret_error;
6263 
6264   enum machine_mode tmpmode;
6265   int bytes =
6266     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6267   enum x86_64_reg_class regclass[MAX_CLASSES];
6268   int n;
6269   int i;
6270   int nexps = 0;
6271   int needed_sseregs, needed_intregs;
6272   rtx exp[MAX_CLASSES];
6273   rtx ret;
6274 
6275   n = classify_argument (mode, type, regclass, 0);
6276   if (!n)
6277     return NULL;
6278   if (!examine_argument (mode, type, in_return, &needed_intregs,
6279 			 &needed_sseregs))
6280     return NULL;
6281   if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6282     return NULL;
6283 
6284   /* We allowed the user to turn off SSE for kernel mode.  Don't crash if
6285      some less clueful developer tries to use floating-point anyway.  */
6286   if (needed_sseregs && !TARGET_SSE)
6287     {
6288       if (in_return)
6289 	{
6290 	  if (!issued_sse_ret_error)
6291 	    {
6292 	      error ("SSE register return with SSE disabled");
6293 	      issued_sse_ret_error = true;
6294 	    }
6295 	}
6296       else if (!issued_sse_arg_error)
6297 	{
6298 	  error ("SSE register argument with SSE disabled");
6299 	  issued_sse_arg_error = true;
6300 	}
6301       return NULL;
6302     }
6303 
6304   /* Likewise, error if the ABI requires us to return values in the
6305      x87 registers and the user specified -mno-80387.  */
6306   if (!TARGET_80387 && in_return)
6307     for (i = 0; i < n; i++)
6308       if (regclass[i] == X86_64_X87_CLASS
6309 	  || regclass[i] == X86_64_X87UP_CLASS
6310 	  || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6311 	{
6312 	  if (!issued_x87_ret_error)
6313 	    {
6314 	      error ("x87 register return with x87 disabled");
6315 	      issued_x87_ret_error = true;
6316 	    }
6317 	  return NULL;
6318 	}
6319 
6320   /* First construct simple cases.  Avoid SCmode, since we want to use
6321      a single register to pass this type.  */
6322   if (n == 1 && mode != SCmode)
6323     switch (regclass[0])
6324       {
6325       case X86_64_INTEGER_CLASS:
6326       case X86_64_INTEGERSI_CLASS:
6327 	return gen_rtx_REG (mode, intreg[0]);
6328       case X86_64_SSE_CLASS:
6329       case X86_64_SSESF_CLASS:
6330       case X86_64_SSEDF_CLASS:
6331 	if (mode != BLKmode)
6332 	  return gen_reg_or_parallel (mode, orig_mode,
6333 				      SSE_REGNO (sse_regno));
6334 	break;
6335       case X86_64_X87_CLASS:
6336       case X86_64_COMPLEX_X87_CLASS:
6337 	return gen_rtx_REG (mode, FIRST_STACK_REG);
6338       case X86_64_NO_CLASS:
6339 	/* Zero sized array, struct or class.  */
6340 	return NULL;
6341       default:
6342 	gcc_unreachable ();
6343       }
6344   if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6345       && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6346     return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6347   if (n == 4
6348       && regclass[0] == X86_64_SSE_CLASS
6349       && regclass[1] == X86_64_SSEUP_CLASS
6350       && regclass[2] == X86_64_SSEUP_CLASS
6351       && regclass[3] == X86_64_SSEUP_CLASS
6352       && mode != BLKmode)
6353     return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6354 
6355   if (n == 2
6356       && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6357     return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6358   if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6359       && regclass[1] == X86_64_INTEGER_CLASS
6360       && (mode == CDImode || mode == TImode || mode == TFmode)
6361       && intreg[0] + 1 == intreg[1])
6362     return gen_rtx_REG (mode, intreg[0]);
6363 
6364   /* Otherwise figure out the entries of the PARALLEL.  */
6365   for (i = 0; i < n; i++)
6366     {
6367       int pos;
6368 
6369       switch (regclass[i])
6370         {
6371 	  case X86_64_NO_CLASS:
6372 	    break;
6373 	  case X86_64_INTEGER_CLASS:
6374 	  case X86_64_INTEGERSI_CLASS:
6375 	    /* Merge TImodes on aligned occasions here too.  */
6376 	    if (i * 8 + 8 > bytes)
6377 	      tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6378 	    else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6379 	      tmpmode = SImode;
6380 	    else
6381 	      tmpmode = DImode;
6382 	    /* We've requested 24 bytes we don't have a mode for.  Use DImode.  */
6383 	    if (tmpmode == BLKmode)
6384 	      tmpmode = DImode;
6385 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6386 					       gen_rtx_REG (tmpmode, *intreg),
6387 					       GEN_INT (i*8));
6388 	    intreg++;
6389 	    break;
6390 	  case X86_64_SSESF_CLASS:
6391 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6392 					       gen_rtx_REG (SFmode,
6393 							    SSE_REGNO (sse_regno)),
6394 					       GEN_INT (i*8));
6395 	    sse_regno++;
6396 	    break;
6397 	  case X86_64_SSEDF_CLASS:
6398 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6399 					       gen_rtx_REG (DFmode,
6400 							    SSE_REGNO (sse_regno)),
6401 					       GEN_INT (i*8));
6402 	    sse_regno++;
6403 	    break;
6404 	  case X86_64_SSE_CLASS:
6405 	    pos = i;
6406 	    switch (n)
6407 	      {
6408 	      case 1:
6409 		tmpmode = DImode;
6410 		break;
6411 	      case 2:
6412 		if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6413 		  {
6414 		    tmpmode = TImode;
6415 		    i++;
6416 		  }
6417 		else
6418 		  tmpmode = DImode;
6419 		break;
6420 	      case 4:
6421 		gcc_assert (i == 0
6422 			    && regclass[1] == X86_64_SSEUP_CLASS
6423 			    && regclass[2] == X86_64_SSEUP_CLASS
6424 			    && regclass[3] == X86_64_SSEUP_CLASS);
6425 		tmpmode = OImode;
6426 		i += 3;
6427 		break;
6428 	      default:
6429 		gcc_unreachable ();
6430 	      }
6431 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6432 					       gen_rtx_REG (tmpmode,
6433 							    SSE_REGNO (sse_regno)),
6434 					       GEN_INT (pos*8));
6435 	    sse_regno++;
6436 	    break;
6437 	  default:
6438 	    gcc_unreachable ();
6439 	}
6440     }
6441 
6442   /* Empty aligned struct, union or class.  */
6443   if (nexps == 0)
6444     return NULL;
6445 
6446   ret =  gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6447   for (i = 0; i < nexps; i++)
6448     XVECEXP (ret, 0, i) = exp [i];
6449   return ret;
6450 }
6451 
6452 /* Update the data in CUM to advance over an argument of mode MODE
6453    and data type TYPE.  (TYPE is null for libcalls where that information
6454    may not be available.)  */
6455 
6456 static void
6457 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6458 			 const_tree type, HOST_WIDE_INT bytes,
6459 			 HOST_WIDE_INT words)
6460 {
6461   switch (mode)
6462     {
6463     default:
6464       break;
6465 
6466     case BLKmode:
6467       if (bytes < 0)
6468 	break;
6469       /* FALLTHRU */
6470 
6471     case DImode:
6472     case SImode:
6473     case HImode:
6474     case QImode:
6475       cum->words += words;
6476       cum->nregs -= words;
6477       cum->regno += words;
6478 
6479       if (cum->nregs <= 0)
6480 	{
6481 	  cum->nregs = 0;
6482 	  cum->regno = 0;
6483 	}
6484       break;
6485 
6486     case OImode:
6487       /* OImode shouldn't be used directly.  */
6488       gcc_unreachable ();
6489 
6490     case DFmode:
6491       if (cum->float_in_sse < 2)
6492 	break;
6493     case SFmode:
6494       if (cum->float_in_sse < 1)
6495 	break;
6496       /* FALLTHRU */
6497 
6498     case V8SFmode:
6499     case V8SImode:
6500     case V32QImode:
6501     case V16HImode:
6502     case V4DFmode:
6503     case V4DImode:
6504     case TImode:
6505     case V16QImode:
6506     case V8HImode:
6507     case V4SImode:
6508     case V2DImode:
6509     case V4SFmode:
6510     case V2DFmode:
6511       if (!type || !AGGREGATE_TYPE_P (type))
6512 	{
6513 	  cum->sse_words += words;
6514 	  cum->sse_nregs -= 1;
6515 	  cum->sse_regno += 1;
6516 	  if (cum->sse_nregs <= 0)
6517 	    {
6518 	      cum->sse_nregs = 0;
6519 	      cum->sse_regno = 0;
6520 	    }
6521 	}
6522       break;
6523 
6524     case V8QImode:
6525     case V4HImode:
6526     case V2SImode:
6527     case V2SFmode:
6528     case V1TImode:
6529     case V1DImode:
6530       if (!type || !AGGREGATE_TYPE_P (type))
6531 	{
6532 	  cum->mmx_words += words;
6533 	  cum->mmx_nregs -= 1;
6534 	  cum->mmx_regno += 1;
6535 	  if (cum->mmx_nregs <= 0)
6536 	    {
6537 	      cum->mmx_nregs = 0;
6538 	      cum->mmx_regno = 0;
6539 	    }
6540 	}
6541       break;
6542     }
6543 }
6544 
6545 static void
6546 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6547 			 const_tree type, HOST_WIDE_INT words, bool named)
6548 {
6549   int int_nregs, sse_nregs;
6550 
6551   /* Unnamed 256bit vector mode parameters are passed on stack.  */
6552   /* Unnamed 256bit vector mode parameters are passed on the stack.  */
6553     return;
6554 
6555   if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6556       && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6557     {
6558       cum->nregs -= int_nregs;
6559       cum->sse_nregs -= sse_nregs;
6560       cum->regno += int_nregs;
6561       cum->sse_regno += sse_nregs;
6562     }
6563   else
6564     {
6565       int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6566       cum->words = (cum->words + align - 1) & ~(align - 1);
6567       cum->words += words;
6568     }
6569 }
6570 
6571 static void
6572 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6573 			    HOST_WIDE_INT words)
6574 {
6575   /* Otherwise, this should be passed indirect.  */
6576   /* Otherwise, this should have been passed indirectly.  */
6577 
6578   cum->words += words;
6579   if (cum->nregs > 0)
6580     {
6581       cum->nregs -= 1;
6582       cum->regno += 1;
6583     }
6584 }
6585 
6586 /* Update the data in CUM to advance over an argument of mode MODE and
6587    data type TYPE.  (TYPE is null for libcalls where that information
6588    may not be available.)  */
6589 
6590 static void
6591 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6592 			   const_tree type, bool named)
6593 {
6594   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6595   HOST_WIDE_INT bytes, words;
6596 
6597   if (mode == BLKmode)
6598     bytes = int_size_in_bytes (type);
6599   else
6600     bytes = GET_MODE_SIZE (mode);
6601   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6602 
6603   if (type)
6604     mode = type_natural_mode (type, NULL);
6605 
6606   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6607     function_arg_advance_ms_64 (cum, bytes, words);
6608   else if (TARGET_64BIT)
6609     function_arg_advance_64 (cum, mode, type, words, named);
6610   else
6611     function_arg_advance_32 (cum, mode, type, bytes, words);
6612 }
6613 
6614 /* Define where to put the arguments to a function.
6615    Value is zero to push the argument on the stack,
6616    or a hard register in which to store the argument.
6617 
6618    MODE is the argument's machine mode.
6619    TYPE is the data type of the argument (as a tree).
6620     This is null for libcalls where that information may
6621     not be available.
6622    CUM is a variable of type CUMULATIVE_ARGS which gives info about
6623     the preceding args and about the function being called.
6624    NAMED is nonzero if this argument is a named parameter
6625     (otherwise it is an extra parameter matching an ellipsis).  */
6626 
6627 static rtx
6628 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6629 		 enum machine_mode orig_mode, const_tree type,
6630 		 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6631 {
6632   static bool warnedsse, warnedmmx;
6633 
6634   /* Avoid the AL settings for the Unix64 ABI.  */
6635   if (mode == VOIDmode)
6636     return constm1_rtx;
6637 
6638   switch (mode)
6639     {
6640     default:
6641       break;
6642 
6643     case BLKmode:
6644       if (bytes < 0)
6645 	break;
6646       /* FALLTHRU */
6647     case DImode:
6648     case SImode:
6649     case HImode:
6650     case QImode:
6651       if (words <= cum->nregs)
6652 	{
6653 	  int regno = cum->regno;
6654 
6655 	  /* Fastcall allocates the first two DWORD (SImode) or
6656             smaller arguments to ECX and EDX, provided the
6657             argument is not an aggregate type.  */
6658 	  if (cum->fastcall)
6659 	    {
6660 	      if (mode == BLKmode
6661 		  || mode == DImode
6662 		  || (type && AGGREGATE_TYPE_P (type)))
6663 	        break;
6664 
6665 	      /* ECX, not EAX, is the first allocated register.  */
6666 	      if (regno == AX_REG)
6667 		regno = CX_REG;
6668 	    }
6669 	  return gen_rtx_REG (mode, regno);
6670 	}
6671       break;
6672 
6673     case DFmode:
6674       if (cum->float_in_sse < 2)
6675 	break;
6676     case SFmode:
6677       if (cum->float_in_sse < 1)
6678 	break;
6679       /* FALLTHRU */
6680     case TImode:
6681       /* In 32bit, we pass TImode in xmm registers.  */
6682     case V16QImode:
6683     case V8HImode:
6684     case V4SImode:
6685     case V2DImode:
6686     case V4SFmode:
6687     case V2DFmode:
6688       if (!type || !AGGREGATE_TYPE_P (type))
6689 	{
6690 	  if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6691 	    {
6692 	      warnedsse = true;
6693 	      warning (0, "SSE vector argument without SSE enabled "
6694 		       "changes the ABI");
6695 	    }
6696 	  if (cum->sse_nregs)
6697 	    return gen_reg_or_parallel (mode, orig_mode,
6698 				        cum->sse_regno + FIRST_SSE_REG);
6699 	}
6700       break;
6701 
6702     case OImode:
6703       /* OImode shouldn't be used directly.  */
6704       gcc_unreachable ();
6705 
6706     case V8SFmode:
6707     case V8SImode:
6708     case V32QImode:
6709     case V16HImode:
6710     case V4DFmode:
6711     case V4DImode:
6712       if (!type || !AGGREGATE_TYPE_P (type))
6713 	{
6714 	  if (cum->sse_nregs)
6715 	    return gen_reg_or_parallel (mode, orig_mode,
6716 				        cum->sse_regno + FIRST_SSE_REG);
6717 	}
6718       break;
6719 
6720     case V8QImode:
6721     case V4HImode:
6722     case V2SImode:
6723     case V2SFmode:
6724     case V1TImode:
6725     case V1DImode:
6726       if (!type || !AGGREGATE_TYPE_P (type))
6727 	{
6728 	  if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6729 	    {
6730 	      warnedmmx = true;
6731 	      warning (0, "MMX vector argument without MMX enabled "
6732 		       "changes the ABI");
6733 	    }
6734 	  if (cum->mmx_nregs)
6735 	    return gen_reg_or_parallel (mode, orig_mode,
6736 				        cum->mmx_regno + FIRST_MMX_REG);
6737 	}
6738       break;
6739     }
6740 
6741   return NULL_RTX;
6742 }
6743 
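/* Editorial sketch (compiled out): the 32-bit behaviour implemented above.
   With -m32, a fastcall function receives its first two DWORD-or-smaller
   non-aggregate arguments in %ecx and %edx and the rest on the stack; SSE
   vector arguments go in %xmm registers when the type is not an aggregate.  */
#if 0
extern void __attribute__ ((fastcall)) fc (int a, int b, int c);
/* a -> %ecx, b -> %edx, c -> stack.  */
typedef int v4si __attribute__ ((vector_size (16)));
extern void vec_arg (v4si v);          /* v -> %xmm0 with SSE enabled.  */
#endif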
6744 static rtx
6745 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6746 		 enum machine_mode orig_mode, const_tree type, bool named)
6747 {
6748   /* Handle a hidden AL argument containing the number of SSE registers
6749      used for varargs x86-64 functions.  */
6750   if (mode == VOIDmode)
6751     return GEN_INT (cum->maybe_vaarg
6752 		    ? (cum->sse_nregs < 0
6753 		       ? X86_64_SSE_REGPARM_MAX
6754 		       : cum->sse_regno)
6755 		    : -1);
6756 
6757   switch (mode)
6758     {
6759     default:
6760       break;
6761 
6762     case V8SFmode:
6763     case V8SImode:
6764     case V32QImode:
6765     case V16HImode:
6766     case V4DFmode:
6767     case V4DImode:
6768       /* Unnamed 256bit vector mode parameters are passed on stack.  */
6769       /* Unnamed 256bit vector mode parameters are passed on the stack.  */
6770 	return NULL;
6771       break;
6772     }
6773 
6774   return construct_container (mode, orig_mode, type, 0, cum->nregs,
6775 			      cum->sse_nregs,
6776 			      &x86_64_int_parameter_registers [cum->regno],
6777 			      cum->sse_regno);
6778 }
6779 
6780 static rtx
6781 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6782 		    enum machine_mode orig_mode, bool named,
6783 		    HOST_WIDE_INT bytes)
6784 {
6785   unsigned int regno;
6786 
6787   /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6788      We use the value -2 to specify that the current function call is MSABI.  */
6789   if (mode == VOIDmode)
6790     return GEN_INT (-2);
6791 
6792   /* If we've run out of registers, it goes on the stack.  */
6793   if (cum->nregs == 0)
6794     return NULL_RTX;
6795 
6796   regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6797 
6798   /* Only floating point modes are passed in anything but integer regs.  */
6799   if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6800     {
6801       if (named)
6802 	regno = cum->regno + FIRST_SSE_REG;
6803       else
6804 	{
6805 	  rtx t1, t2;
6806 
6807 	  /* Unnamed floating parameters are passed in both the
6808 	     SSE and integer registers.  */
6809 	  t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6810 	  t2 = gen_rtx_REG (mode, regno);
6811 	  t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6812 	  t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6813 	  return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6814 	}
6815     }
6816   /* Handle aggregate types passed in registers.  */
6817   if (orig_mode == BLKmode)
6818     {
6819       if (bytes > 0 && bytes <= 8)
6820         mode = (bytes > 4 ? DImode : SImode);
6821       if (mode == BLKmode)
6822         mode = DImode;
6823     }
6824 
6825   return gen_reg_or_parallel (mode, orig_mode, regno);
6826 }
6827 
6828 /* Return where to put the arguments to a function.
6829    Return zero to push the argument on the stack, or a hard register in which to store the argument.
6830 
6831    MODE is the argument's machine mode.  TYPE is the data type of the
6832    argument.  It is null for libcalls where that information may not be
6833    available.  CUM gives information about the preceding args and about
6834    the function being called.  NAMED is nonzero if this argument is a
6835    named parameter (otherwise it is an extra parameter matching an
6836    ellipsis).  */
6837 
6838 static rtx
6839 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6840 		   const_tree type, bool named)
6841 {
6842   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6843   enum machine_mode mode = omode;
6844   HOST_WIDE_INT bytes, words;
6845   rtx arg;
6846 
6847   if (mode == BLKmode)
6848     bytes = int_size_in_bytes (type);
6849   else
6850     bytes = GET_MODE_SIZE (mode);
6851   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6852 
6853   /* To simplify the code below, represent vector types with a vector mode
6854      even if MMX/SSE are not active.  */
6855   if (type && TREE_CODE (type) == VECTOR_TYPE)
6856     mode = type_natural_mode (type, cum);
6857 
6858   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6859     arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6860   else if (TARGET_64BIT)
6861     arg = function_arg_64 (cum, mode, omode, type, named);
6862   else
6863     arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6864 
6865   if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6866     {
6867       /* This argument uses 256bit AVX modes.  */
6868       if (cum->caller)
6869 	cum->callee_pass_avx256_p = true;
6870       else
6871 	cfun->machine->caller_pass_avx256_p = true;
6872     }
6873 
6874   if (cum->caller && mode == VOIDmode)
6875     {
6876       /* This function is called with MODE == VOIDmode immediately
6877 	 before the call instruction is emitted.  We copy callee 256bit
6878 	 AVX info from the current CUM here.  */
6879       cfun->machine->callee_return_avx256_p = cum->callee_return_avx256_p;
6880       cfun->machine->callee_pass_avx256_p = cum->callee_pass_avx256_p;
6881     }
6882 
6883   return arg;
6884 }
6885 
6886 /* A C expression that indicates when an argument must be passed by
6887    reference.  If nonzero for an argument, a copy of that argument is
6888    made in memory and a pointer to the argument is passed instead of
6889    the argument itself.  The pointer is passed in whatever way is
6890    appropriate for passing a pointer to that type.  */
6891 
6892 static bool
6893 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6894 			enum machine_mode mode ATTRIBUTE_UNUSED,
6895 			const_tree type, bool named ATTRIBUTE_UNUSED)
6896 {
6897   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6898 
6899   /* See Windows x64 Software Convention.  */
6900   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6901     {
6902       int msize = (int) GET_MODE_SIZE (mode);
6903       if (type)
6904 	{
6905 	  /* Arrays are passed by reference.  */
6906 	  if (TREE_CODE (type) == ARRAY_TYPE)
6907 	    return true;
6908 
6909 	  if (AGGREGATE_TYPE_P (type))
6910 	    {
6911 	      /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6912 	         are passed by reference.  */
6913 	      msize = int_size_in_bytes (type);
6914 	    }
6915 	}
6916 
6917       /* __m128 is passed by reference.  */
6918       switch (msize) {
6919       case 1: case 2: case 4: case 8:
6920         break;
6921       default:
6922         return true;
6923       }
6924     }
6925   else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6926     return 1;
6927 
6928   return 0;
6929 }
6930 
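/* Editorial sketch (compiled out): the Win64 rule implemented above.
   Only arguments whose size is exactly 1, 2, 4 or 8 bytes occupy a
   register slot directly; arrays, __m128 and any other aggregate are
   copied and a pointer to the copy is passed instead.  */
#if 0
struct w8  { int a, b; };       /* 8 bytes:  passed directly (in %rcx here) */
struct w12 { int a, b, c; };    /* 12 bytes: passed by hidden reference     */
extern void win64_callee (struct w8 x, struct w12 y);
#endif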
6931 /* Return true when TYPE should be 128bit aligned for 32bit argument
6932    passing ABI.  XXX: This function is obsolete and is only used for
6933    checking psABI compatibility with previous versions of GCC.  */
6934 
6935 static bool
6936 ix86_compat_aligned_value_p (const_tree type)
6937 {
6938   enum machine_mode mode = TYPE_MODE (type);
6939   if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6940        || mode == TDmode
6941        || mode == TFmode
6942        || mode == TCmode)
6943       && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6944     return true;
6945   if (TYPE_ALIGN (type) < 128)
6946     return false;
6947 
6948   if (AGGREGATE_TYPE_P (type))
6949     {
6950       /* Walk the aggregates recursively.  */
6951       switch (TREE_CODE (type))
6952 	{
6953 	case RECORD_TYPE:
6954 	case UNION_TYPE:
6955 	case QUAL_UNION_TYPE:
6956 	  {
6957 	    tree field;
6958 
6959 	    /* Walk all the structure fields.  */
6960 	    for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6961 	      {
6962 		if (TREE_CODE (field) == FIELD_DECL
6963 		    && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6964 		  return true;
6965 	      }
6966 	    break;
6967 	  }
6968 
6969 	case ARRAY_TYPE:
6970 	  /* Just in case some language passes arrays by value.  */
6971 	  if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6972 	    return true;
6973 	  break;
6974 
6975 	default:
6976 	  gcc_unreachable ();
6977 	}
6978     }
6979   return false;
6980 }
6981 
6982 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6983    XXX: This function is obsolete and is only used for checking psABI
6984    compatibility with previous versions of GCC.  */
6985 
6986 static unsigned int
6987 ix86_compat_function_arg_boundary (enum machine_mode mode,
6988 				   const_tree type, unsigned int align)
6989 {
6990   /* In 32bit, only _Decimal128 and __float128 are aligned to their
6991      natural boundaries.  */
6992   if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6993     {
6994       /* i386 ABI defines all arguments to be 4 byte aligned.  We have to
6995 	 make an exception for SSE modes since these require 128bit
6996 	 alignment.
6997 
6998 	 The handling here differs from field_alignment.  ICC aligns MMX
6999 	 arguments to 4 byte boundaries, while structure fields are aligned
7000 	 to 8 byte boundaries.  */
7001       if (!type)
7002 	{
7003 	  if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7004 	    align = PARM_BOUNDARY;
7005 	}
7006       else
7007 	{
7008 	  if (!ix86_compat_aligned_value_p (type))
7009 	    align = PARM_BOUNDARY;
7010 	}
7011     }
7012   if (align > BIGGEST_ALIGNMENT)
7013     align = BIGGEST_ALIGNMENT;
7014   return align;
7015 }
7016 
7017 /* Return true when TYPE should be 128bit aligned for 32bit argument
7018    passing ABI.  */
7019 
7020 static bool
7021 ix86_contains_aligned_value_p (const_tree type)
7022 {
7023   enum machine_mode mode = TYPE_MODE (type);
7024 
7025   if (mode == XFmode || mode == XCmode)
7026     return false;
7027 
7028   if (TYPE_ALIGN (type) < 128)
7029     return false;
7030 
7031   if (AGGREGATE_TYPE_P (type))
7032     {
7033       /* Walk the aggregates recursively.  */
7034       switch (TREE_CODE (type))
7035 	{
7036 	case RECORD_TYPE:
7037 	case UNION_TYPE:
7038 	case QUAL_UNION_TYPE:
7039 	  {
7040 	    tree field;
7041 
7042 	    /* Walk all the structure fields.  */
7043 	    for (field = TYPE_FIELDS (type);
7044 		 field;
7045 		 field = DECL_CHAIN (field))
7046 	      {
7047 		if (TREE_CODE (field) == FIELD_DECL
7048 		    && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7049 		  return true;
7050 	      }
7051 	    break;
7052 	  }
7053 
7054 	case ARRAY_TYPE:
7055 	  /* Just in case some language passes arrays by value.  */
7056 	  if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7057 	    return true;
7058 	  break;
7059 
7060 	default:
7061 	  gcc_unreachable ();
7062 	}
7063     }
7064   else
7065     return TYPE_ALIGN (type) >= 128;
7066 
7067   return false;
7068 }
7069 
7070 /* Gives the alignment boundary, in bits, of an argument with the
7071    specified mode and type.  */
7072 
7073 static unsigned int
7074 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7075 {
7076   unsigned int align;
7077   if (type)
7078     {
7079       /* The main variant type is used for the call, so convert TYPE
7080 	 to its main variant.  */
7081       type = TYPE_MAIN_VARIANT (type);
7082       align = TYPE_ALIGN (type);
7083     }
7084   else
7085     align = GET_MODE_ALIGNMENT (mode);
7086   if (align < PARM_BOUNDARY)
7087     align = PARM_BOUNDARY;
7088   else
7089     {
7090       static bool warned;
7091       unsigned int saved_align = align;
7092 
7093       if (!TARGET_64BIT)
7094 	{
7095 	  /* i386 ABI defines XFmode arguments to be 4 byte aligned.  */
7096 	  if (!type)
7097 	    {
7098 	      if (mode == XFmode || mode == XCmode)
7099 		align = PARM_BOUNDARY;
7100 	    }
7101 	  else if (!ix86_contains_aligned_value_p (type))
7102 	    align = PARM_BOUNDARY;
7103 
7104 	  if (align < 128)
7105 	    align = PARM_BOUNDARY;
7106 	}
7107 
7108       if (warn_psabi
7109 	  && !warned
7110 	  && align != ix86_compat_function_arg_boundary (mode, type,
7111 							 saved_align))
7112 	{
7113 	  warned = true;
7114 	  inform (input_location,
7115 		  "The ABI for passing parameters with %d-byte"
7116 		  " alignment has changed in GCC 4.6",
7117 		  align / BITS_PER_UNIT);
7118 	}
7119     }
7120 
7121   return align;
7122 }
7123 
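/* Editorial sketch (compiled out): argument alignment under the rules
   above with -m32.  A plain int stays at the 4-byte PARM_BOUNDARY, a
   16-byte SSE vector is bumped to a 16-byte boundary, and long double
   (XFmode) keeps the historical 4-byte i386 alignment.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));
extern void align_demo (int i, v4sf v, long double ld);
#endif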
7124 /* Return true if N is a possible register number of function value.  */
7125 
7126 static bool
7127 ix86_function_value_regno_p (const unsigned int regno)
7128 {
7129   switch (regno)
7130     {
7131     case AX_REG:
7132       return true;
7133 
7134     case FIRST_FLOAT_REG:
7135       /* TODO: The function should depend on current function ABI but
7136        builtins.c would need updating then. Therefore we use the
7137        default ABI.  */
7138       if (TARGET_64BIT && ix86_abi == MS_ABI)
7139 	return false;
7140       return TARGET_FLOAT_RETURNS_IN_80387;
7141 
7142     case FIRST_SSE_REG:
7143       return TARGET_SSE;
7144 
7145     case FIRST_MMX_REG:
7146       if (TARGET_MACHO || TARGET_64BIT)
7147 	return false;
7148       return TARGET_MMX;
7149     }
7150 
7151   return false;
7152 }
7153 
7154 /* Define how to find the value returned by a function.
7155    VALTYPE is the data type of the value (as a tree).
7156    If the precise function being called is known, FUNC is its FUNCTION_DECL;
7157    otherwise, FUNC is 0.  */
7158 
7159 static rtx
7160 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7161 		   const_tree fntype, const_tree fn)
7162 {
7163   unsigned int regno;
7164 
7165   /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7166      we normally prevent this case when mmx is not available.  However
7167      some ABIs may require the result to be returned like DImode.  */
7168   if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7169     regno = FIRST_MMX_REG;
7170 
7171   /* 16-byte vector modes in %xmm0.  See ix86_return_in_memory for where
7172      we prevent this case when sse is not available.  However some ABIs
7173      may require the result to be returned like integer TImode.  */
7174   else if (mode == TImode
7175 	   || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7176     regno = FIRST_SSE_REG;
7177 
7178   /* 32-byte vector modes in %ymm0.   */
7179   else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7180     regno = FIRST_SSE_REG;
7181 
7182   /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387).  */
7183   else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7184     regno = FIRST_FLOAT_REG;
7185   else
7186     /* Most things go in %eax.  */
7187     regno = AX_REG;
7188 
7189   /* Override FP return register with %xmm0 for local functions when
7190      SSE math is enabled or for functions with sseregparm attribute.  */
7191   if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7192     {
7193       int sse_level = ix86_function_sseregparm (fntype, fn, false);
7194       if ((sse_level >= 1 && mode == SFmode)
7195 	  || (sse_level == 2 && mode == DFmode))
7196 	regno = FIRST_SSE_REG;
7197     }
7198 
7199   /* OImode shouldn't be used directly.  */
7200   gcc_assert (mode != OImode);
7201 
7202   return gen_rtx_REG (orig_mode, regno);
7203 }
7204 
7205 static rtx
7206 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7207 		   const_tree valtype)
7208 {
7209   rtx ret;
7210 
7211   /* Handle libcalls, which don't provide a type node.  */
7212   if (valtype == NULL)
7213     {
7214       unsigned int regno;
7215 
7216       switch (mode)
7217 	{
7218 	case SFmode:
7219 	case SCmode:
7220 	case DFmode:
7221 	case DCmode:
7222 	case TFmode:
7223 	case SDmode:
7224 	case DDmode:
7225 	case TDmode:
7226 	  regno = FIRST_SSE_REG;
7227 	  break;
7228 	case XFmode:
7229 	case XCmode:
7230 	  regno = FIRST_FLOAT_REG;
7231 	  break;
7232 	case TCmode:
7233 	  return NULL;
7234 	default:
7235 	  regno = AX_REG;
7236 	}
7237 
7238       return gen_rtx_REG (mode, regno);
7239     }
7240   else if (POINTER_TYPE_P (valtype))
7241     {
7242       /* Pointers are always returned in Pmode. */
7243       mode = Pmode;
7244     }
7245 
7246   ret = construct_container (mode, orig_mode, valtype, 1,
7247 			     X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7248 			     x86_64_int_return_registers, 0);
7249 
7250   /* For zero-sized structures, construct_container returns NULL, but we need
7251      to keep the rest of the compiler happy by returning a meaningful value.  */
7252   if (!ret)
7253     ret = gen_rtx_REG (orig_mode, AX_REG);
7254 
7255   return ret;
7256 }
7257 
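/* Editorial sketch (compiled out): typical x86-64 SysV return registers
   chosen by the code above.  */
#if 0
extern double          ret_double (void);    /* %xmm0           */
extern long double     ret_ldouble (void);   /* %st(0)          */
extern _Complex double ret_cdouble (void);   /* %xmm0 and %xmm1 */
extern void *          ret_pointer (void);   /* %rax            */
#endif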
7258 static rtx
7259 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7260 {
7261   unsigned int regno = AX_REG;
7262 
7263   if (TARGET_SSE)
7264     {
7265       switch (GET_MODE_SIZE (mode))
7266         {
7267         case 16:
7268           if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7269 	     && !COMPLEX_MODE_P (mode))
7270 	    regno = FIRST_SSE_REG;
7271 	  break;
7272 	case 8:
7273 	case 4:
7274 	  if (mode == SFmode || mode == DFmode)
7275 	    regno = FIRST_SSE_REG;
7276 	  break;
7277 	default:
7278 	  break;
7279         }
7280     }
7281   return gen_rtx_REG (orig_mode, regno);
7282 }
7283 
7284 static rtx
7285 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7286 		       enum machine_mode orig_mode, enum machine_mode mode)
7287 {
7288   const_tree fn, fntype;
7289 
7290   fn = NULL_TREE;
7291   if (fntype_or_decl && DECL_P (fntype_or_decl))
7292     fn = fntype_or_decl;
7293   fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7294 
7295   if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7296     return function_value_ms_64 (orig_mode, mode);
7297   else if (TARGET_64BIT)
7298     return function_value_64 (orig_mode, mode, valtype);
7299   else
7300     return function_value_32 (orig_mode, mode, fntype, fn);
7301 }
7302 
7303 static rtx
7304 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7305 		     bool outgoing ATTRIBUTE_UNUSED)
7306 {
7307   enum machine_mode mode, orig_mode;
7308 
7309   orig_mode = TYPE_MODE (valtype);
7310   mode = type_natural_mode (valtype, NULL);
7311   return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7312 }
7313 
7314 /* Pointer function arguments and return values are promoted to Pmode.  */
7315 
7316 static enum machine_mode
7317 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7318 			    int *punsignedp, const_tree fntype,
7319 			    int for_return)
7320 {
7321   if (type != NULL_TREE && POINTER_TYPE_P (type))
7322     {
7323       *punsignedp = POINTERS_EXTEND_UNSIGNED;
7324       return Pmode;
7325     }
7326   return default_promote_function_mode (type, mode, punsignedp, fntype,
7327 					for_return);
7328 }
7329 
7330 rtx
7331 ix86_libcall_value (enum machine_mode mode)
7332 {
7333   return ix86_function_value_1 (NULL, NULL, mode, mode);
7334 }
7335 
7336 /* Return true iff type is returned in memory.  */
7337 
7338 static bool ATTRIBUTE_UNUSED
7339 return_in_memory_32 (const_tree type, enum machine_mode mode)
7340 {
7341   HOST_WIDE_INT size;
7342 
7343   if (mode == BLKmode)
7344     return true;
7345 
7346   size = int_size_in_bytes (type);
7347 
7348   if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7349     return false;
7350 
7351   if (VECTOR_MODE_P (mode) || mode == TImode)
7352     {
7353       /* User-created vectors small enough to fit in EAX.  */
7354       if (size < 8)
7355 	return false;
7356 
7357       /* MMX/3dNow values are returned in MM0,
7358 	 except when it doesn't exist or the ABI prescribes otherwise.  */
7359       if (size == 8)
7360 	return !TARGET_MMX || TARGET_VECT8_RETURNS;
7361 
7362       /* SSE values are returned in XMM0, except when it doesn't exist.  */
7363       if (size == 16)
7364 	return !TARGET_SSE;
7365 
7366       /* AVX values are returned in YMM0, except when it doesn't exist.  */
7367       if (size == 32)
7368 	return !TARGET_AVX;
7369     }
7370 
7371   if (mode == XFmode)
7372     return false;
7373 
7374   if (size > 12)
7375     return true;
7376 
7377   /* OImode shouldn't be used directly.  */
7378   gcc_assert (mode != OImode);
7379 
7380   return false;
7381 }
7382 
7383 static bool ATTRIBUTE_UNUSED
7384 return_in_memory_64 (const_tree type, enum machine_mode mode)
7385 {
7386   int needed_intregs, needed_sseregs;
7387   return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7388 }
7389 
7390 static bool ATTRIBUTE_UNUSED
7391 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7392 {
7393   HOST_WIDE_INT size = int_size_in_bytes (type);
7394 
7395   /* __m128 is returned in xmm0.  */
7396   if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7397       && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7398     return false;
7399 
7400   /* Otherwise, the size must be exactly in [1248]. */
7401   return size != 1 && size != 2 && size != 4 && size != 8;
7402 }
7403 
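/* Editorial sketch (compiled out): the Win64 return rule implemented
   above.  Aggregates of exactly 1, 2, 4 or 8 bytes come back in %rax and
   __m128-style 16-byte vectors in %xmm0; every other aggregate is
   returned through a hidden pointer supplied by the caller.  */
#if 0
struct r8  { int a, b; };      /* 8 bytes:  returned in %rax            */
struct r12 { int a, b, c; };   /* 12 bytes: returned via hidden pointer */
extern struct r8  get_r8 (void);
extern struct r12 get_r12 (void);
#endif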
7404 static bool
7405 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7406 {
7407 #ifdef SUBTARGET_RETURN_IN_MEMORY
7408   return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7409 #else
7410   const enum machine_mode mode = type_natural_mode (type, NULL);
7411 
7412   if (TARGET_64BIT)
7413     {
7414       if (ix86_function_type_abi (fntype) == MS_ABI)
7415 	return return_in_memory_ms_64 (type, mode);
7416       else
7417 	return return_in_memory_64 (type, mode);
7418     }
7419   else
7420     return return_in_memory_32 (type, mode);
7421 #endif
7422 }
7423 
7424 /* When returning SSE vector types, we have a choice of either
7425      (1) being abi incompatible with a -march switch, or
7426      (2) generating an error.
7427    Given no good solution, I think the safest thing is one warning.
7428    The user won't be able to use -Werror, but....
7429 
7430    Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7431    called in response to actually generating a caller or callee that
7432    uses such a type.  As opposed to TARGET_RETURN_IN_MEMORY, which is called
7433    via aggregate_value_p for general type probing from tree-ssa.  */
7434 
7435 static rtx
7436 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7437 {
7438   static bool warnedsse, warnedmmx;
7439 
7440   if (!TARGET_64BIT && type)
7441     {
7442       /* Look at the return type of the function, not the function type.  */
7443       enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7444 
7445       if (!TARGET_SSE && !warnedsse)
7446 	{
7447 	  if (mode == TImode
7448 	      || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7449 	    {
7450 	      warnedsse = true;
7451 	      warning (0, "SSE vector return without SSE enabled "
7452 		       "changes the ABI");
7453 	    }
7454 	}
7455 
7456       if (!TARGET_MMX && !warnedmmx)
7457 	{
7458 	  if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7459 	    {
7460 	      warnedmmx = true;
7461 	      warning (0, "MMX vector return without MMX enabled "
7462 		       "changes the ABI");
7463 	    }
7464 	}
7465     }
7466 
7467   return NULL;
7468 }
7469 
7470 
7471 /* Create the va_list data type.  */
7472 
7473 /* Returns the calling convention specific va_list data type.
7474    The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI.  */
7475 
7476 static tree
7477 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7478 {
7479   tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7480 
7481   /* For i386 we use plain pointer to argument area.  */
7482   if (!TARGET_64BIT || abi == MS_ABI)
7483     return build_pointer_type (char_type_node);
7484 
7485   record = lang_hooks.types.make_type (RECORD_TYPE);
7486   type_decl = build_decl (BUILTINS_LOCATION,
7487 			  TYPE_DECL, get_identifier ("__va_list_tag"), record);
7488 
7489   f_gpr = build_decl (BUILTINS_LOCATION,
7490 		      FIELD_DECL, get_identifier ("gp_offset"),
7491 		      unsigned_type_node);
7492   f_fpr = build_decl (BUILTINS_LOCATION,
7493 		      FIELD_DECL, get_identifier ("fp_offset"),
7494 		      unsigned_type_node);
7495   f_ovf = build_decl (BUILTINS_LOCATION,
7496 		      FIELD_DECL, get_identifier ("overflow_arg_area"),
7497 		      ptr_type_node);
7498   f_sav = build_decl (BUILTINS_LOCATION,
7499 		      FIELD_DECL, get_identifier ("reg_save_area"),
7500 		      ptr_type_node);
7501 
7502   va_list_gpr_counter_field = f_gpr;
7503   va_list_fpr_counter_field = f_fpr;
7504 
7505   DECL_FIELD_CONTEXT (f_gpr) = record;
7506   DECL_FIELD_CONTEXT (f_fpr) = record;
7507   DECL_FIELD_CONTEXT (f_ovf) = record;
7508   DECL_FIELD_CONTEXT (f_sav) = record;
7509 
7510   TYPE_STUB_DECL (record) = type_decl;
7511   TYPE_NAME (record) = type_decl;
7512   TYPE_FIELDS (record) = f_gpr;
7513   DECL_CHAIN (f_gpr) = f_fpr;
7514   DECL_CHAIN (f_fpr) = f_ovf;
7515   DECL_CHAIN (f_ovf) = f_sav;
7516 
7517   layout_type (record);
7518 
7519   /* The correct type is an array type of one element.  */
7520   return build_array_type (record, build_index_type (size_zero_node));
7521 }
7522 
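/* Editorial sketch (compiled out): the record laid out above corresponds
   to the familiar SysV x86-64 va_list, declared as a one-element array of
   this tag.  The names below are illustrative only.  */
#if 0
typedef struct example_va_list_tag {
  unsigned int gp_offset;     /* byte offset of the next GPR in reg_save_area */
  unsigned int fp_offset;     /* byte offset of the next SSE slot             */
  void *overflow_arg_area;    /* next argument passed on the stack            */
  void *reg_save_area;        /* base of the register save area               */
} example_va_list[1];
#endif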
7523 /* Setup the builtin va_list data type and for 64-bit the additional
7524    calling convention specific va_list data types.  */
7525 
7526 static tree
7527 ix86_build_builtin_va_list (void)
7528 {
7529   tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7530 
7531   /* Initialize abi specific va_list builtin types.  */
7532   if (TARGET_64BIT)
7533     {
7534       tree t;
7535       if (ix86_abi == MS_ABI)
7536         {
7537           t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7538           if (TREE_CODE (t) != RECORD_TYPE)
7539             t = build_variant_type_copy (t);
7540           sysv_va_list_type_node = t;
7541         }
7542       else
7543         {
7544           t = ret;
7545           if (TREE_CODE (t) != RECORD_TYPE)
7546             t = build_variant_type_copy (t);
7547           sysv_va_list_type_node = t;
7548         }
7549       if (ix86_abi != MS_ABI)
7550         {
7551           t = ix86_build_builtin_va_list_abi (MS_ABI);
7552           if (TREE_CODE (t) != RECORD_TYPE)
7553             t = build_variant_type_copy (t);
7554           ms_va_list_type_node = t;
7555         }
7556       else
7557         {
7558           t = ret;
7559           if (TREE_CODE (t) != RECORD_TYPE)
7560             t = build_variant_type_copy (t);
7561           ms_va_list_type_node = t;
7562         }
7563     }
7564 
7565   return ret;
7566 }
7567 
7568 /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
7569 
7570 static void
7571 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7572 {
7573   rtx save_area, mem;
7574   alias_set_type set;
7575   int i, max;
7576 
7577   /* GPR size of varargs save area.  */
7578   if (cfun->va_list_gpr_size)
7579     ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7580   else
7581     ix86_varargs_gpr_size = 0;
7582 
7583   /* FPR size of varargs save area.  We don't need it if we don't pass
7584      anything in SSE registers.  */
7585   if (TARGET_SSE && cfun->va_list_fpr_size)
7586     ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7587   else
7588     ix86_varargs_fpr_size = 0;
7589 
7590   if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7591     return;
7592 
7593   save_area = frame_pointer_rtx;
7594   set = get_varargs_alias_set ();
7595 
7596   max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7597   if (max > X86_64_REGPARM_MAX)
7598     max = X86_64_REGPARM_MAX;
7599 
7600   for (i = cum->regno; i < max; i++)
7601     {
7602       mem = gen_rtx_MEM (Pmode,
7603 			 plus_constant (save_area, i * UNITS_PER_WORD));
7604       MEM_NOTRAP_P (mem) = 1;
7605       set_mem_alias_set (mem, set);
7606       emit_move_insn (mem, gen_rtx_REG (Pmode,
7607 					x86_64_int_parameter_registers[i]));
7608     }
7609 
7610   if (ix86_varargs_fpr_size)
7611     {
7612       enum machine_mode smode;
7613       rtx label, test;
7614 
7615       /* Now emit code to save SSE registers.  The AX parameter contains the number
7616 	 of SSE parameter registers used to call this function, though all we
7617 	 actually check here is the zero/non-zero status.  */
7618 
7619       label = gen_label_rtx ();
7620       test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7621       emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7622 				      label));
7623 
7624       /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7625 	 we used movdqa (i.e. TImode) instead?  Perhaps even better would
7626 	 be if we could determine the real mode of the data, via a hook
7627 	 into pass_stdarg.  Ignore all that for now.  */
7628       smode = V4SFmode;
7629       if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7630 	crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7631 
7632       max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7633       if (max > X86_64_SSE_REGPARM_MAX)
7634 	max = X86_64_SSE_REGPARM_MAX;
7635 
7636       for (i = cum->sse_regno; i < max; ++i)
7637 	{
7638 	  mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7639 	  mem = gen_rtx_MEM (smode, mem);
7640 	  MEM_NOTRAP_P (mem) = 1;
7641 	  set_mem_alias_set (mem, set);
7642 	  set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7643 
7644 	  emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7645 	}
7646 
7647       emit_label (label);
7648     }
7649 }
7650 
7651 static void
7652 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7653 {
7654   alias_set_type set = get_varargs_alias_set ();
7655   int i;
7656 
7657   /* Reset to zero, as a SysV va_arg may have been used
7658      before.  */
7659   ix86_varargs_gpr_size = 0;
7660   ix86_varargs_fpr_size = 0;
7661 
7662   for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7663     {
7664       rtx reg, mem;
7665 
7666       mem = gen_rtx_MEM (Pmode,
7667 			 plus_constant (virtual_incoming_args_rtx,
7668 					i * UNITS_PER_WORD));
7669       MEM_NOTRAP_P (mem) = 1;
7670       set_mem_alias_set (mem, set);
7671 
7672       reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7673       emit_move_insn (mem, reg);
7674     }
7675 }
7676 
7677 static void
7678 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7679 			     tree type, int *pretend_size ATTRIBUTE_UNUSED,
7680 			     int no_rtl)
7681 {
7682   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7683   CUMULATIVE_ARGS next_cum;
7684   tree fntype;
7685 
7686   /* This argument doesn't appear to be used anymore.  Which is good,
7687      because the old code here didn't suppress rtl generation.  */
7688   gcc_assert (!no_rtl);
7689 
7690   if (!TARGET_64BIT)
7691     return;
7692 
7693   fntype = TREE_TYPE (current_function_decl);
7694 
7695   /* For varargs, we do not want to skip the dummy va_dcl argument.
7696      For stdargs, we do want to skip the last named argument.  */
7697   next_cum = *cum;
7698   if (stdarg_p (fntype))
7699     ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7700 			       true);
7701 
7702   if (cum->call_abi == MS_ABI)
7703     setup_incoming_varargs_ms_64 (&next_cum);
7704   else
7705     setup_incoming_varargs_64 (&next_cum);
7706 }
7707 
7708 /* Check whether TYPE is the plain char-pointer kind of va_list.  */
7709 
7710 static bool
7711 is_va_list_char_pointer (tree type)
7712 {
7713   tree canonic;
7714 
7715   /* For 32-bit it is always true.  */
7716   if (!TARGET_64BIT)
7717     return true;
7718   canonic = ix86_canonical_va_list_type (type);
7719   return (canonic == ms_va_list_type_node
7720           || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7721 }
7722 
7723 /* Implement va_start.  */
7724 
7725 static void
7726 ix86_va_start (tree valist, rtx nextarg)
7727 {
7728   HOST_WIDE_INT words, n_gpr, n_fpr;
7729   tree f_gpr, f_fpr, f_ovf, f_sav;
7730   tree gpr, fpr, ovf, sav, t;
7731   tree type;
7732   rtx ovf_rtx;
7733 
7734   if (flag_split_stack
7735       && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7736     {
7737       unsigned int scratch_regno;
7738 
7739       /* When we are splitting the stack, we can't refer to the stack
7740 	 arguments using internal_arg_pointer, because they may be on
7741 	 the old stack.  The split stack prologue will arrange to
7742 	 leave a pointer to the old stack arguments in a scratch
7743 	 register, which we here copy to a pseudo-register.  The split
7744 	 stack prologue can't set the pseudo-register directly because
7745 	 it (the prologue) runs before any registers have been saved.  */
7746 
7747       scratch_regno = split_stack_prologue_scratch_regno ();
7748       if (scratch_regno != INVALID_REGNUM)
7749 	{
7750 	  rtx reg, seq;
7751 
7752 	  reg = gen_reg_rtx (Pmode);
7753 	  cfun->machine->split_stack_varargs_pointer = reg;
7754 
7755 	  start_sequence ();
7756 	  emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7757 	  seq = get_insns ();
7758 	  end_sequence ();
7759 
7760 	  push_topmost_sequence ();
7761 	  emit_insn_after (seq, entry_of_function ());
7762 	  pop_topmost_sequence ();
7763 	}
7764     }
7765 
7766   /* Only 64-bit targets need something special.  */
7767   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7768     {
7769       if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7770 	std_expand_builtin_va_start (valist, nextarg);
7771       else
7772 	{
7773 	  rtx va_r, next;
7774 
7775 	  va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7776 	  next = expand_binop (ptr_mode, add_optab,
7777 			       cfun->machine->split_stack_varargs_pointer,
7778 			       crtl->args.arg_offset_rtx,
7779 			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
7780 	  convert_move (va_r, next, 0);
7781 	}
7782       return;
7783     }
7784 
7785   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7786   f_fpr = DECL_CHAIN (f_gpr);
7787   f_ovf = DECL_CHAIN (f_fpr);
7788   f_sav = DECL_CHAIN (f_ovf);
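  /* For reference, these fields correspond to the SysV x86-64 va_list
     layout, roughly (an illustrative sketch, not the declaration GCC
     builds):

	typedef struct {
	  unsigned int gp_offset;		-- f_gpr
	  unsigned int fp_offset;		-- f_fpr
	  void *overflow_arg_area;		-- f_ovf
	  void *reg_save_area;			-- f_sav
	} __va_list_tag;  */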
7789 
7790   valist = build_simple_mem_ref (valist);
7791   TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7792   /* The following should be folded into the MEM_REF offset.  */
7793   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7794 		f_gpr, NULL_TREE);
7795   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7796 		f_fpr, NULL_TREE);
7797   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7798 		f_ovf, NULL_TREE);
7799   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7800 		f_sav, NULL_TREE);
7801 
7802   /* Count number of gp and fp argument registers used.  */
7803   words = crtl->args.info.words;
7804   n_gpr = crtl->args.info.regno;
7805   n_fpr = crtl->args.info.sse_regno;
7806 
7807   if (cfun->va_list_gpr_size)
7808     {
7809       type = TREE_TYPE (gpr);
7810       t = build2 (MODIFY_EXPR, type,
7811 		  gpr, build_int_cst (type, n_gpr * 8));
7812       TREE_SIDE_EFFECTS (t) = 1;
7813       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7814     }
7815 
7816   if (TARGET_SSE && cfun->va_list_fpr_size)
7817     {
7818       type = TREE_TYPE (fpr);
7819       t = build2 (MODIFY_EXPR, type, fpr,
7820 		  build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7821       TREE_SIDE_EFFECTS (t) = 1;
7822       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7823     }
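  /* For example (illustrative only): in a function such as
     "int f (int fmt, ...)" the single named argument consumes one GP
     register, so n_gpr is 1 and gp_offset starts at 8; fp_offset always
     starts past the 6 * 8 = 48 bytes reserved for GP registers, i.e. at
     48 + 16 * n_fpr.  */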
7824 
7825   /* Find the overflow area.  */
7826   type = TREE_TYPE (ovf);
7827   if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7828     ovf_rtx = crtl->args.internal_arg_pointer;
7829   else
7830     ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7831   t = make_tree (type, ovf_rtx);
7832   if (words != 0)
7833     t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7834   t = build2 (MODIFY_EXPR, type, ovf, t);
7835   TREE_SIDE_EFFECTS (t) = 1;
7836   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7837 
7838   if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7839     {
7840       /* Find the register save area.
7841 	 The function prologue saves it right above the stack frame.  */
7842       type = TREE_TYPE (sav);
7843       t = make_tree (type, frame_pointer_rtx);
7844       if (!ix86_varargs_gpr_size)
7845 	t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7846       t = build2 (MODIFY_EXPR, type, sav, t);
7847       TREE_SIDE_EFFECTS (t) = 1;
7848       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7849     }
7850 }
7851 
7852 /* Implement va_arg.  */
7853 
7854 static tree
7855 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7856 		      gimple_seq *post_p)
7857 {
7858   static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7859   tree f_gpr, f_fpr, f_ovf, f_sav;
7860   tree gpr, fpr, ovf, sav, t;
7861   int size, rsize;
7862   tree lab_false, lab_over = NULL_TREE;
7863   tree addr, t2;
7864   rtx container;
7865   int indirect_p = 0;
7866   tree ptrtype;
7867   enum machine_mode nat_mode;
7868   unsigned int arg_boundary;
7869 
7870   /* Only 64-bit targets need something special.  */
7871   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7872     return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7873 
7874   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7875   f_fpr = DECL_CHAIN (f_gpr);
7876   f_ovf = DECL_CHAIN (f_fpr);
7877   f_sav = DECL_CHAIN (f_ovf);
7878 
7879   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7880 		build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7881   valist = build_va_arg_indirect_ref (valist);
7882   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7883   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7884   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7885 
7886   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7887   if (indirect_p)
7888     type = build_pointer_type (type);
7889   size = int_size_in_bytes (type);
7890   rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7891 
7892   nat_mode = type_natural_mode (type, NULL);
7893   switch (nat_mode)
7894     {
7895     case V8SFmode:
7896     case V8SImode:
7897     case V32QImode:
7898     case V16HImode:
7899     case V4DFmode:
7900     case V4DImode:
7901       /* Unnamed 256bit vector mode parameters are passed on stack.  */
7902       if (!TARGET_64BIT_MS_ABI)
7903 	{
7904 	  container = NULL;
7905 	  break;
7906 	}
7907 
7908     default:
7909       container = construct_container (nat_mode, TYPE_MODE (type),
7910 				       type, 0, X86_64_REGPARM_MAX,
7911 				       X86_64_SSE_REGPARM_MAX, intreg,
7912 				       0);
7913       break;
7914     }
7915 
7916   /* Pull the value out of the saved registers.  */
7917 
7918   addr = create_tmp_var (ptr_type_node, "addr");
7919 
7920   if (container)
7921     {
7922       int needed_intregs, needed_sseregs;
7923       bool need_temp;
7924       tree int_addr, sse_addr;
7925 
7926       lab_false = create_artificial_label (UNKNOWN_LOCATION);
7927       lab_over = create_artificial_label (UNKNOWN_LOCATION);
7928 
7929       examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7930 
7931       need_temp = (!REG_P (container)
7932 		   && ((needed_intregs && TYPE_ALIGN (type) > 64)
7933 		       || TYPE_ALIGN (type) > 128));
7934 
7935       /* If we are passing a structure, verify that it occupies a consecutive
7936          block in the register save area.  If not, we need to do moves.  */
7937       if (!need_temp && !REG_P (container))
7938 	{
7939 	  /* Verify that all registers are strictly consecutive.  */
7940 	  if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7941 	    {
7942 	      int i;
7943 
7944 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7945 		{
7946 		  rtx slot = XVECEXP (container, 0, i);
7947 		  if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7948 		      || INTVAL (XEXP (slot, 1)) != i * 16)
7949 		    need_temp = 1;
7950 		}
7951 	    }
7952 	  else
7953 	    {
7954 	      int i;
7955 
7956 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7957 		{
7958 		  rtx slot = XVECEXP (container, 0, i);
7959 		  if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7960 		      || INTVAL (XEXP (slot, 1)) != i * 8)
7961 		    need_temp = 1;
7962 		}
7963 	    }
7964 	}
7965       if (!need_temp)
7966 	{
7967 	  int_addr = addr;
7968 	  sse_addr = addr;
7969 	}
7970       else
7971 	{
7972 	  int_addr = create_tmp_var (ptr_type_node, "int_addr");
7973 	  sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7974 	}
7975 
7976       /* First ensure that we fit completely in registers.  */
7977       if (needed_intregs)
7978 	{
7979 	  t = build_int_cst (TREE_TYPE (gpr),
7980 			     (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7981 	  t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7982 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7983 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7984 	  gimplify_and_add (t, pre_p);
7985 	}
7986       if (needed_sseregs)
7987 	{
7988 	  t = build_int_cst (TREE_TYPE (fpr),
7989 			     (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7990 			     + X86_64_REGPARM_MAX * 8);
7991 	  t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7992 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7993 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7994 	  gimplify_and_add (t, pre_p);
7995 	}
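      /* For example (illustrative values): with needed_intregs == 2 and
	 X86_64_REGPARM_MAX == 6 the test above is "gp_offset >= 40",
	 i.e. fewer than two 8-byte GP slots remain; with
	 needed_sseregs == 1 and X86_64_SSE_REGPARM_MAX == 8 the SSE test
	 is "fp_offset >= 48 + 8 * 16".  */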
7996 
7997       /* Compute index to start of area used for integer regs.  */
7998       if (needed_intregs)
7999 	{
8000 	  /* int_addr = gpr + sav; */
8001 	  t = fold_build_pointer_plus (sav, gpr);
8002 	  gimplify_assign (int_addr, t, pre_p);
8003 	}
8004       if (needed_sseregs)
8005 	{
8006 	  /* sse_addr = fpr + sav; */
8007 	  t = fold_build_pointer_plus (sav, fpr);
8008 	  gimplify_assign (sse_addr, t, pre_p);
8009 	}
8010       if (need_temp)
8011 	{
8012 	  int i, prev_size = 0;
8013 	  tree temp = create_tmp_var (type, "va_arg_tmp");
8014 
8015 	  /* addr = &temp; */
8016 	  t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8017 	  gimplify_assign (addr, t, pre_p);
8018 
8019 	  for (i = 0; i < XVECLEN (container, 0); i++)
8020 	    {
8021 	      rtx slot = XVECEXP (container, 0, i);
8022 	      rtx reg = XEXP (slot, 0);
8023 	      enum machine_mode mode = GET_MODE (reg);
8024 	      tree piece_type;
8025 	      tree addr_type;
8026 	      tree daddr_type;
8027 	      tree src_addr, src;
8028 	      int src_offset;
8029 	      tree dest_addr, dest;
8030 	      int cur_size = GET_MODE_SIZE (mode);
8031 
8032 	      gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8033 	      prev_size = INTVAL (XEXP (slot, 1));
8034 	      if (prev_size + cur_size > size)
8035 		{
8036 		  cur_size = size - prev_size;
8037 		  mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8038 		  if (mode == BLKmode)
8039 		    mode = QImode;
8040 		}
8041 	      piece_type = lang_hooks.types.type_for_mode (mode, 1);
8042 	      if (mode == GET_MODE (reg))
8043 		addr_type = build_pointer_type (piece_type);
8044 	      else
8045 		addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8046 							 true);
8047 	      daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8048 							true);
8049 
8050 	      if (SSE_REGNO_P (REGNO (reg)))
8051 		{
8052 		  src_addr = sse_addr;
8053 		  src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8054 		}
8055 	      else
8056 		{
8057 		  src_addr = int_addr;
8058 		  src_offset = REGNO (reg) * 8;
8059 		}
8060 	      src_addr = fold_convert (addr_type, src_addr);
8061 	      src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8062 
8063 	      dest_addr = fold_convert (daddr_type, addr);
8064 	      dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8065 	      if (cur_size == GET_MODE_SIZE (mode))
8066 		{
8067 		  src = build_va_arg_indirect_ref (src_addr);
8068 		  dest = build_va_arg_indirect_ref (dest_addr);
8069 
8070 		  gimplify_assign (dest, src, pre_p);
8071 		}
8072 	      else
8073 		{
8074 		  tree copy
8075 		    = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8076 				       3, dest_addr, src_addr,
8077 				       size_int (cur_size));
8078 		  gimplify_and_add (copy, pre_p);
8079 		}
8080 	      prev_size += cur_size;
8081 	    }
8082 	}
8083 
8084       if (needed_intregs)
8085 	{
8086 	  t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8087 		      build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8088 	  gimplify_assign (gpr, t, pre_p);
8089 	}
8090 
8091       if (needed_sseregs)
8092 	{
8093 	  t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8094 		      build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8095 	  gimplify_assign (fpr, t, pre_p);
8096 	}
8097 
8098       gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8099 
8100       gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8101     }
8102 
8103   /* ... otherwise out of the overflow area.  */
8104 
8105   /* When the caller aligns a parameter on the stack, if the parameter
8106      alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8107      aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  We match the callee
8108      here with the caller.  */
8109   arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8110   if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8111     arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8112 
8113   /* Care for on-stack alignment if needed.  */
8114   if (arg_boundary <= 64 || size == 0)
8115     t = ovf;
8116   else
8117     {
8118       HOST_WIDE_INT align = arg_boundary / 8;
8119       t = fold_build_pointer_plus_hwi (ovf, align - 1);
8120       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8121 		  build_int_cst (TREE_TYPE (t), -align));
8122     }
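  /* The else arm above is the usual round-up-to-a-multiple trick,
     roughly: aligned = (ovf + align - 1) & -align.  E.g. (illustrative
     values) with a 32-byte boundary an overflow pointer ending in 0x28
     is rounded up to the next address ending in 0x40.  */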
8123 
8124   gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8125   gimplify_assign (addr, t, pre_p);
8126 
8127   t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8128   gimplify_assign (unshare_expr (ovf), t, pre_p);
8129 
8130   if (container)
8131     gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8132 
8133   ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8134   addr = fold_convert (ptrtype, addr);
8135 
8136   if (indirect_p)
8137     addr = build_va_arg_indirect_ref (addr);
8138   return build_va_arg_indirect_ref (addr);
8139 }
8140 
8141 /* Return true if OPNUM's MEM should be matched
8142    in movabs* patterns.  */
8143 
8144 bool
8145 ix86_check_movabs (rtx insn, int opnum)
8146 {
8147   rtx set, mem;
8148 
8149   set = PATTERN (insn);
8150   if (GET_CODE (set) == PARALLEL)
8151     set = XVECEXP (set, 0, 0);
8152   gcc_assert (GET_CODE (set) == SET);
8153   mem = XEXP (set, opnum);
8154   while (GET_CODE (mem) == SUBREG)
8155     mem = SUBREG_REG (mem);
8156   gcc_assert (MEM_P (mem));
8157   return volatile_ok || !MEM_VOLATILE_P (mem);
8158 }
8159 
8160 /* Initialize the table of extra 80387 mathematical constants.  */
8161 
8162 static void
8163 init_ext_80387_constants (void)
8164 {
8165   static const char * cst[5] =
8166   {
8167     "0.3010299956639811952256464283594894482",  /* 0: fldlg2  */
8168     "0.6931471805599453094286904741849753009",  /* 1: fldln2  */
8169     "1.4426950408889634073876517827983434472",  /* 2: fldl2e  */
8170     "3.3219280948873623478083405569094566090",  /* 3: fldl2t  */
8171     "3.1415926535897932385128089594061862044",  /* 4: fldpi   */
8172   };
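  /* These strings are, in order, log10(2), ln(2), log2(e), log2(10)
     and pi, matching the fldlg2/fldln2/fldl2e/fldl2t/fldpi insns
     noted above.  */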
8173   int i;
8174 
8175   for (i = 0; i < 5; i++)
8176     {
8177       real_from_string (&ext_80387_constants_table[i], cst[i]);
8178       /* Ensure each constant is rounded to XFmode precision.  */
8179       real_convert (&ext_80387_constants_table[i],
8180 		    XFmode, &ext_80387_constants_table[i]);
8181     }
8182 
8183   ext_80387_constants_init = 1;
8184 }
8185 
8186 /* Return non-zero if the constant is something that
8187    can be loaded with a special instruction.  */
8188 
8189 int
8190 standard_80387_constant_p (rtx x)
8191 {
8192   enum machine_mode mode = GET_MODE (x);
8193 
8194   REAL_VALUE_TYPE r;
8195 
8196   if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8197     return -1;
8198 
8199   if (x == CONST0_RTX (mode))
8200     return 1;
8201   if (x == CONST1_RTX (mode))
8202     return 2;
8203 
8204   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8205 
8206   /* For XFmode constants, try to find a special 80387 instruction when
8207      optimizing for size or on those CPUs that benefit from them.  */
8208   if (mode == XFmode
8209       && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8210     {
8211       int i;
8212 
8213       if (! ext_80387_constants_init)
8214 	init_ext_80387_constants ();
8215 
8216       for (i = 0; i < 5; i++)
8217         if (real_identical (&r, &ext_80387_constants_table[i]))
8218 	  return i + 3;
8219     }
8220 
8221   /* Load of the constant -0.0 or -1.0 will be split as
8222      fldz;fchs or fld1;fchs sequence.  */
8223   if (real_isnegzero (&r))
8224     return 8;
8225   if (real_identical (&r, &dconstm1))
8226     return 9;
8227 
8228   return 0;
8229 }
8230 
8231 /* Return the opcode of the special instruction to be used to load
8232    the constant X.  */
8233 
8234 const char *
8235 standard_80387_constant_opcode (rtx x)
8236 {
8237   switch (standard_80387_constant_p (x))
8238     {
8239     case 1:
8240       return "fldz";
8241     case 2:
8242       return "fld1";
8243     case 3:
8244       return "fldlg2";
8245     case 4:
8246       return "fldln2";
8247     case 5:
8248       return "fldl2e";
8249     case 6:
8250       return "fldl2t";
8251     case 7:
8252       return "fldpi";
8253     case 8:
8254     case 9:
8255       return "#";
8256     default:
8257       gcc_unreachable ();
8258     }
8259 }
8260 
8261 /* Return the CONST_DOUBLE representing the 80387 constant that is
8262    loaded by the specified special instruction.  The argument IDX
8263    matches the return value from standard_80387_constant_p.  */
8264 
8265 rtx
8266 standard_80387_constant_rtx (int idx)
8267 {
8268   int i;
8269 
8270   if (! ext_80387_constants_init)
8271     init_ext_80387_constants ();
8272 
8273   switch (idx)
8274     {
8275     case 3:
8276     case 4:
8277     case 5:
8278     case 6:
8279     case 7:
8280       i = idx - 3;
8281       break;
8282 
8283     default:
8284       gcc_unreachable ();
8285     }
8286 
8287   return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8288 				       XFmode);
8289 }
8290 
8291 /* Return 1 if X is all 0s and 2 if X is all 1s
8292    in a supported SSE/AVX vector mode; return 0 otherwise.  */
8293 
8294 int
8295 standard_sse_constant_p (rtx x)
8296 {
8297   enum machine_mode mode = GET_MODE (x);
8298 
8299   if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8300     return 1;
8301   if (vector_all_ones_operand (x, mode))
8302     switch (mode)
8303       {
8304       case V16QImode:
8305       case V8HImode:
8306       case V4SImode:
8307       case V2DImode:
8308 	if (TARGET_SSE2)
8309 	  return 2;
8310       case V32QImode:
8311       case V16HImode:
8312       case V8SImode:
8313       case V4DImode:
8314 	if (TARGET_AVX2)
8315 	  return 2;
8316       default:
8317 	break;
8318       }
8319 
8320   return 0;
8321 }
8322 
8323 /* Return the opcode of the special instruction to be used to load
8324    the constant X.  */
8325 
8326 const char *
8327 standard_sse_constant_opcode (rtx insn, rtx x)
8328 {
8329   switch (standard_sse_constant_p (x))
8330     {
8331     case 1:
8332       switch (get_attr_mode (insn))
8333 	{
8334 	case MODE_TI:
8335 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8336 	    return "%vpxor\t%0, %d0";
8337 	case MODE_V2DF:
8338 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8339 	    return "%vxorpd\t%0, %d0";
8340 	case MODE_V4SF:
8341 	  return "%vxorps\t%0, %d0";
8342 
8343 	case MODE_OI:
8344 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8345 	    return "vpxor\t%x0, %x0, %x0";
8346 	case MODE_V4DF:
8347 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8348 	    return "vxorpd\t%x0, %x0, %x0";
8349 	case MODE_V8SF:
8350 	  return "vxorps\t%x0, %x0, %x0";
8351 
8352 	default:
8353 	  break;
8354 	}
8355 
8356     case 2:
8357       if (TARGET_AVX)
8358 	return "vpcmpeqd\t%0, %0, %0";
8359       else
8360 	return "pcmpeqd\t%0, %0";
8361 
8362     default:
8363       break;
8364     }
8365   gcc_unreachable ();
8366 }
8367 
8368 /* Return true if OP contains a symbol reference.  */
8369 
8370 bool
8371 symbolic_reference_mentioned_p (rtx op)
8372 {
8373   const char *fmt;
8374   int i;
8375 
8376   if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8377     return true;
8378 
8379   fmt = GET_RTX_FORMAT (GET_CODE (op));
8380   for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8381     {
8382       if (fmt[i] == 'E')
8383 	{
8384 	  int j;
8385 
8386 	  for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8387 	    if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8388 	      return true;
8389 	}
8390 
8391       else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8392 	return true;
8393     }
8394 
8395   return false;
8396 }
8397 
8398 /* Return true if it is appropriate to emit `ret' instructions in the
8399    body of a function.  Do this only if the epilogue is simple, needing a
8400    couple of insns.  Prior to reloading, we can't tell how many registers
8401    must be saved, so return false then.  Return false if there is no frame
8402    marker to de-allocate.  */
8403 
8404 bool
8405 ix86_can_use_return_insn_p (void)
8406 {
8407   struct ix86_frame frame;
8408 
8409   if (! reload_completed || frame_pointer_needed)
8410     return 0;
8411 
8412   /* Don't allow more than 32k pop, since that's all we can do
8413      with one instruction.  */
8414   if (crtl->args.pops_args && crtl->args.size >= 32768)
8415     return 0;
8416 
8417   ix86_compute_frame_layout (&frame);
8418   return (frame.stack_pointer_offset == UNITS_PER_WORD
8419 	  && (frame.nregs + frame.nsseregs) == 0);
8420 }
8421 
8422 /* Value should be nonzero if functions must have frame pointers.
8423    Zero means the frame pointer need not be set up (and parms may
8424    be accessed via the stack pointer) in functions that seem suitable.  */
8425 
8426 static bool
8427 ix86_frame_pointer_required (void)
8428 {
8429   /* If we accessed previous frames, then the generated code expects
8430      to be able to access the saved ebp value in our frame.  */
8431   if (cfun->machine->accesses_prev_frame)
8432     return true;
8433 
8434   /* Several x86 OSes need a frame pointer for other reasons,
8435      usually pertaining to setjmp.  */
8436   if (SUBTARGET_FRAME_POINTER_REQUIRED)
8437     return true;
8438 
8439   /* For older 32-bit runtimes setjmp requires valid frame-pointer.  */
8440   if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8441     return true;
8442 
8443   /* For Win64 SEH, very large frames need a frame pointer, as the maximum
8444      stack allocation is 4GB.  */
8445   if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8446     return true;
8447 
8448   /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8449      turns off the frame pointer by default.  Turn it back on now if
8450      we've not got a leaf function.  */
8451   if (TARGET_OMIT_LEAF_FRAME_POINTER
8452       && (!current_function_is_leaf
8453 	  || ix86_current_function_calls_tls_descriptor))
8454     return true;
8455 
8456   if (crtl->profile && !flag_fentry)
8457     return true;
8458 
8459   return false;
8460 }
8461 
8462 /* Record that the current function accesses previous call frames.  */
8463 
8464 void
8465 ix86_setup_frame_addresses (void)
8466 {
8467   cfun->machine->accesses_prev_frame = 1;
8468 }
8469 
8470 #ifndef USE_HIDDEN_LINKONCE
8471 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8472 #  define USE_HIDDEN_LINKONCE 1
8473 # else
8474 #  define USE_HIDDEN_LINKONCE 0
8475 # endif
8476 #endif
8477 
8478 static int pic_labels_used;
8479 
8480 /* Fills in the label name that should be used for a pc thunk for
8481    the given register.  */
8482 
8483 static void
8484 get_pc_thunk_name (char name[32], unsigned int regno)
8485 {
8486   gcc_assert (!TARGET_64BIT);
8487 
8488   if (USE_HIDDEN_LINKONCE)
8489     sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8490   else
8491     ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8492 }
8493 
8494 
8495 /* This function generates the "get PC" thunks used for -fpic: each thunk
8496    loads its register with the return address of the caller and then returns.  */
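/* For the %ebx thunk, the emitted body is essentially (a sketch of the
   output of the loop below, not verbatim compiler output):

	__x86.get_pc_thunk.bx:
		movl	(%esp), %ebx
		ret  */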
8497 
8498 static void
8499 ix86_code_end (void)
8500 {
8501   rtx xops[2];
8502   int regno;
8503 
8504   for (regno = AX_REG; regno <= SP_REG; regno++)
8505     {
8506       char name[32];
8507       tree decl;
8508 
8509       if (!(pic_labels_used & (1 << regno)))
8510 	continue;
8511 
8512       get_pc_thunk_name (name, regno);
8513 
8514       decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8515 			 get_identifier (name),
8516 			 build_function_type_list (void_type_node, NULL_TREE));
8517       DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8518 				       NULL_TREE, void_type_node);
8519       TREE_PUBLIC (decl) = 1;
8520       TREE_STATIC (decl) = 1;
8521 
8522 #if TARGET_MACHO
8523       if (TARGET_MACHO)
8524 	{
8525 	  switch_to_section (darwin_sections[text_coal_section]);
8526 	  fputs ("\t.weak_definition\t", asm_out_file);
8527 	  assemble_name (asm_out_file, name);
8528 	  fputs ("\n\t.private_extern\t", asm_out_file);
8529 	  assemble_name (asm_out_file, name);
8530 	  putc ('\n', asm_out_file);
8531 	  ASM_OUTPUT_LABEL (asm_out_file, name);
8532 	  DECL_WEAK (decl) = 1;
8533 	}
8534       else
8535 #endif
8536       if (USE_HIDDEN_LINKONCE)
8537 	{
8538 	  DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8539 
8540 	  targetm.asm_out.unique_section (decl, 0);
8541 	  switch_to_section (get_named_section (decl, NULL, 0));
8542 
8543 	  targetm.asm_out.globalize_label (asm_out_file, name);
8544 	  fputs ("\t.hidden\t", asm_out_file);
8545 	  assemble_name (asm_out_file, name);
8546 	  putc ('\n', asm_out_file);
8547 	  ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8548 	}
8549       else
8550 	{
8551 	  switch_to_section (text_section);
8552 	  ASM_OUTPUT_LABEL (asm_out_file, name);
8553 	}
8554 
8555       DECL_INITIAL (decl) = make_node (BLOCK);
8556       current_function_decl = decl;
8557       init_function_start (decl);
8558       first_function_block_is_cold = false;
8559       /* Make sure unwind info is emitted for the thunk if needed.  */
8560       final_start_function (emit_barrier (), asm_out_file, 1);
8561 
8562       /* Pad stack IP move with 4 instructions (two NOPs count
8563 	 as one instruction).  */
8564       if (TARGET_PAD_SHORT_FUNCTION)
8565 	{
8566 	  int i = 8;
8567 
8568 	  while (i--)
8569 	    fputs ("\tnop\n", asm_out_file);
8570 	}
8571 
8572       xops[0] = gen_rtx_REG (Pmode, regno);
8573       xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8574       output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8575       fputs ("\tret\n", asm_out_file);
8576       final_end_function ();
8577       init_insn_lengths ();
8578       free_after_compilation (cfun);
8579       set_cfun (NULL);
8580       current_function_decl = NULL;
8581     }
8582 
8583   if (flag_split_stack)
8584     file_end_indicate_split_stack ();
8585 }
8586 
8587 /* Emit code for the SET_GOT patterns.  */
8588 
8589 const char *
8590 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8591 {
8592   rtx xops[3];
8593 
8594   xops[0] = dest;
8595 
8596   if (TARGET_VXWORKS_RTP && flag_pic)
8597     {
8598       /* Load (*VXWORKS_GOTT_BASE) into the PIC register.  */
8599       xops[2] = gen_rtx_MEM (Pmode,
8600 			     gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8601       output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8602 
8603       /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8604 	 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8605 	 an unadorned address.  */
8606       xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8607       SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8608       output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8609       return "";
8610     }
8611 
8612   xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8613 
8614   if (!flag_pic)
8615     {
8616       xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8617 
8618       output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8619 
8620 #if TARGET_MACHO
8621       /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
8622          is what will be referenced by the Mach-O PIC subsystem.  */
8623       if (!label)
8624 	ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8625 #endif
8626 
8627       targetm.asm_out.internal_label (asm_out_file, "L",
8628 				      CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8629     }
8630   else
8631     {
8632       char name[32];
8633       get_pc_thunk_name (name, REGNO (dest));
8634       pic_labels_used |= 1 << REGNO (dest);
8635 
8636       xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8637       xops[2] = gen_rtx_MEM (QImode, xops[2]);
8638       output_asm_insn ("call\t%X2", xops);
8639       /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
8640          is what will be referenced by the Mach-O PIC subsystem.  */
8641 #if TARGET_MACHO
8642       if (!label)
8643 	ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8644       else
8645         targetm.asm_out.internal_label (asm_out_file, "L",
8646 					   CODE_LABEL_NUMBER (label));
8647 #endif
8648     }
8649 
8650   if (!TARGET_MACHO)
8651     output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8652 
8653   return "";
8654 }
8655 
8656 /* Generate a "push" pattern for input ARG.  */
8657 
8658 static rtx
8659 gen_push (rtx arg)
8660 {
8661   struct machine_function *m = cfun->machine;
8662 
8663   if (m->fs.cfa_reg == stack_pointer_rtx)
8664     m->fs.cfa_offset += UNITS_PER_WORD;
8665   m->fs.sp_offset += UNITS_PER_WORD;
8666 
8667   return gen_rtx_SET (VOIDmode,
8668 		      gen_rtx_MEM (Pmode,
8669 				   gen_rtx_PRE_DEC (Pmode,
8670 						    stack_pointer_rtx)),
8671 		      arg);
8672 }
8673 
8674 /* Generate a "pop" pattern for input ARG.  */
8675 
8676 static rtx
8677 gen_pop (rtx arg)
8678 {
8679   return gen_rtx_SET (VOIDmode,
8680 		      arg,
8681 		      gen_rtx_MEM (Pmode,
8682 				   gen_rtx_POST_INC (Pmode,
8683 						     stack_pointer_rtx)));
8684 }
8685 
8686 /* Return the number of an unused call-clobbered register available for the
8687    entire function, or INVALID_REGNUM if there is none.  */
8688 
8689 static unsigned int
8690 ix86_select_alt_pic_regnum (void)
8691 {
8692   if (current_function_is_leaf
8693       && !crtl->profile
8694       && !ix86_current_function_calls_tls_descriptor)
8695     {
8696       int i, drap;
8697       /* Can't use the same register for both PIC and DRAP.  */
8698       if (crtl->drap_reg)
8699 	drap = REGNO (crtl->drap_reg);
8700       else
8701 	drap = -1;
8702       for (i = 2; i >= 0; --i)
8703         if (i != drap && !df_regs_ever_live_p (i))
8704 	  return i;
8705     }
8706 
8707   return INVALID_REGNUM;
8708 }
8709 
8710 /* Return TRUE if we need to save REGNO.  */
8711 
8712 static bool
8713 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8714 {
8715   if (pic_offset_table_rtx
8716       && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8717       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8718 	  || crtl->profile
8719 	  || crtl->calls_eh_return
8720 	  || crtl->uses_const_pool))
8721     return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8722 
8723   if (crtl->calls_eh_return && maybe_eh_return)
8724     {
8725       unsigned i;
8726       for (i = 0; ; i++)
8727 	{
8728 	  unsigned test = EH_RETURN_DATA_REGNO (i);
8729 	  if (test == INVALID_REGNUM)
8730 	    break;
8731 	  if (test == regno)
8732 	    return true;
8733 	}
8734     }
8735 
8736   if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8737     return true;
8738 
8739   return (df_regs_ever_live_p (regno)
8740 	  && !call_used_regs[regno]
8741 	  && !fixed_regs[regno]
8742 	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8743 }
8744 
8745 /* Return number of saved general purpose registers.  */
8746 
8747 static int
8748 ix86_nsaved_regs (void)
8749 {
8750   int nregs = 0;
8751   int regno;
8752 
8753   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8754     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8755       nregs ++;
8756   return nregs;
8757 }
8758 
8759 /* Return number of saved SSE registers.  */
8760 
8761 static int
8762 ix86_nsaved_sseregs (void)
8763 {
8764   int nregs = 0;
8765   int regno;
8766 
8767   if (!TARGET_64BIT_MS_ABI)
8768     return 0;
8769   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8770     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8771       nregs ++;
8772   return nregs;
8773 }
8774 
8775 /* Given FROM and TO register numbers, say whether this elimination is
8776    allowed.  If stack alignment is needed, we can only replace argument
8777    pointer with hard frame pointer, or replace frame pointer with stack
8778    pointer.  Otherwise, frame pointer elimination is automatically
8779    handled and all other eliminations are valid.  */
8780 
8781 static bool
8782 ix86_can_eliminate (const int from, const int to)
8783 {
8784   if (stack_realign_fp)
8785     return ((from == ARG_POINTER_REGNUM
8786 	     && to == HARD_FRAME_POINTER_REGNUM)
8787 	    || (from == FRAME_POINTER_REGNUM
8788 		&& to == STACK_POINTER_REGNUM));
8789   else
8790     return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8791 }
8792 
8793 /* Return the offset between two registers, one to be eliminated, and the other
8794    its replacement, at the start of a routine.  */
8795 
8796 HOST_WIDE_INT
8797 ix86_initial_elimination_offset (int from, int to)
8798 {
8799   struct ix86_frame frame;
8800   ix86_compute_frame_layout (&frame);
8801 
8802   if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8803     return frame.hard_frame_pointer_offset;
8804   else if (from == FRAME_POINTER_REGNUM
8805 	   && to == HARD_FRAME_POINTER_REGNUM)
8806     return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8807   else
8808     {
8809       gcc_assert (to == STACK_POINTER_REGNUM);
8810 
8811       if (from == ARG_POINTER_REGNUM)
8812 	return frame.stack_pointer_offset;
8813 
8814       gcc_assert (from == FRAME_POINTER_REGNUM);
8815       return frame.stack_pointer_offset - frame.frame_pointer_offset;
8816     }
8817 }
8818 
8819 /* In a dynamically-aligned function, we can't know the offset from
8820    stack pointer to frame pointer, so we must ensure that setjmp
8821    eliminates fp against the hard fp (%ebp) rather than trying to
8822    index from %esp up to the top of the frame across a gap that is
8823    of unknown (at compile-time) size.  */
8824 static rtx
8825 ix86_builtin_setjmp_frame_value (void)
8826 {
8827   return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8828 }
8829 
8830 /* When using -fsplit-stack, the allocation routines set a field in
8831    the TCB to the bottom of the stack plus this much space, measured
8832    in bytes.  */
8833 
8834 #define SPLIT_STACK_AVAILABLE 256
8835 
8836 /* Fill in *FRAME, describing the frame of the currently compiled function.  */
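/* For orientation, the layout computed below looks roughly like this
   (an illustrative sketch; offsets grow downwards from the return
   address, and some areas are only present under the noted
   conditions):

	[return address]
	[pushed static chain]		(ix86_static_chain_on_stack)
	[saved frame pointer]		(frame_pointer_needed)
	  <- hard_frame_pointer_offset	(non-SEH)
	[GP register save area]		<- reg_save_offset
	[SSE register save area]	<- sse_reg_save_offset
	[va_arg register save area]
	  <- frame_pointer_offset
	[local variables]
	[outgoing arguments area]
	  <- stack_pointer_offset  */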
8837 
8838 static void
8839 ix86_compute_frame_layout (struct ix86_frame *frame)
8840 {
8841   unsigned int stack_alignment_needed;
8842   HOST_WIDE_INT offset;
8843   unsigned int preferred_alignment;
8844   HOST_WIDE_INT size = get_frame_size ();
8845   HOST_WIDE_INT to_allocate;
8846 
8847   frame->nregs = ix86_nsaved_regs ();
8848   frame->nsseregs = ix86_nsaved_sseregs ();
8849 
8850   stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8851   preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8852 
8853   /* The 64-bit MS ABI seems to require the stack always to be 16-byte aligned,
8854      except in function prologues and in leaf functions.  */
8855   if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8856       && (!current_function_is_leaf || cfun->calls_alloca != 0
8857           || ix86_current_function_calls_tls_descriptor))
8858     {
8859       preferred_alignment = 16;
8860       stack_alignment_needed = 16;
8861       crtl->preferred_stack_boundary = 128;
8862       crtl->stack_alignment_needed = 128;
8863     }
8864 
8865   gcc_assert (!size || stack_alignment_needed);
8866   gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8867   gcc_assert (preferred_alignment <= stack_alignment_needed);
8868 
8869   /* For SEH we have to limit the amount of code movement into the prologue.
8870      At present we do this via a BLOCKAGE, at which point there's very little
8871      scheduling that can be done, which means that there's very little point
8872      in doing anything except PUSHs.  */
8873   if (TARGET_SEH)
8874     cfun->machine->use_fast_prologue_epilogue = false;
8875 
8876   /* During reload iteration the number of registers saved can change.
8877      Recompute the value as needed.  Do not recompute when the number of
8878      registers didn't change, as reload does multiple calls to the function
8879      and does not expect the decision to change within a single iteration.  */
8880   else if (!optimize_function_for_size_p (cfun)
8881            && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8882     {
8883       int count = frame->nregs;
8884       struct cgraph_node *node = cgraph_get_node (current_function_decl);
8885 
8886       cfun->machine->use_fast_prologue_epilogue_nregs = count;
8887 
8888       /* The fast prologue uses move instead of push to save registers.  This
8889          is significantly longer, but also executes faster as modern hardware
8890          can execute the moves in parallel, but can't do that for push/pop.
8891 
8892 	 Be careful about choosing which prologue to emit:  when the function
8893 	 takes many instructions to execute we may use the slow version, and
8894 	 likewise when the function is known to be outside a hot spot (this is
8895 	 known with feedback only).  Weight the size of the function by the
8896 	 number of registers to save, as it is cheap to use one or two push
8897 	 instructions but very slow to use many of them.  */
8898       if (count)
8899 	count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8900       if (node->frequency < NODE_FREQUENCY_NORMAL
8901 	  || (flag_branch_probabilities
8902 	      && node->frequency < NODE_FREQUENCY_HOT))
8903         cfun->machine->use_fast_prologue_epilogue = false;
8904       else
8905         cfun->machine->use_fast_prologue_epilogue
8906 	   = !expensive_function_p (count);
8907     }
8908 
8909   frame->save_regs_using_mov
8910     = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8911        /* If static stack checking is enabled and done with probes,
8912 	  the registers need to be saved before allocating the frame.  */
8913        && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8914 
8915   /* Skip return address.  */
8916   offset = UNITS_PER_WORD;
8917 
8918   /* Skip pushed static chain.  */
8919   if (ix86_static_chain_on_stack)
8920     offset += UNITS_PER_WORD;
8921 
8922   /* Skip saved base pointer.  */
8923   if (frame_pointer_needed)
8924     offset += UNITS_PER_WORD;
8925   frame->hfp_save_offset = offset;
8926 
8927   /* The traditional frame pointer location is at the top of the frame.  */
8928   frame->hard_frame_pointer_offset = offset;
8929 
8930   /* Register save area */
8931   offset += frame->nregs * UNITS_PER_WORD;
8932   frame->reg_save_offset = offset;
8933 
8934   /* On SEH target, registers are pushed just before the frame pointer
8935      location.  */
8936   if (TARGET_SEH)
8937     frame->hard_frame_pointer_offset = offset;
8938 
8939   /* Align and set SSE register save area.  */
8940   if (frame->nsseregs)
8941     {
8942       /* The only ABI that has saved SSE registers (Win64) also has a
8943          16-byte aligned default stack, and thus we don't need to be
8944 	 within the re-aligned local stack frame to save them.  */
8945       gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8946       offset = (offset + 16 - 1) & -16;
8947       offset += frame->nsseregs * 16;
8948     }
8949   frame->sse_reg_save_offset = offset;
8950 
8951   /* The re-aligned stack starts here.  Values before this point are not
8952      directly comparable with values below this point.  In order to make
8953      sure that no value happens to be the same before and after, force
8954      the alignment computation below to add a non-zero value.  */
8955   if (stack_realign_fp)
8956     offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8957 
8958   /* Va-arg area */
8959   frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8960   offset += frame->va_arg_size;
8961 
8962   /* Align start of frame for local function.  */
8963   if (stack_realign_fp
8964       || offset != frame->sse_reg_save_offset
8965       || size != 0
8966       || !current_function_is_leaf
8967       || cfun->calls_alloca
8968       || ix86_current_function_calls_tls_descriptor)
8969     offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8970 
8971   /* Frame pointer points here.  */
8972   frame->frame_pointer_offset = offset;
8973 
8974   offset += size;
8975 
8976   /* Add outgoing arguments area.  Can be skipped if we eliminated
8977      all the function calls as dead code.
8978      Skipping is however impossible when function calls alloca.  Alloca
8979      expander assumes that the last crtl->outgoing_args_size
8980      bytes of the stack frame are unused.  */
8981   if (ACCUMULATE_OUTGOING_ARGS
8982       && (!current_function_is_leaf || cfun->calls_alloca
8983 	  || ix86_current_function_calls_tls_descriptor))
8984     {
8985       offset += crtl->outgoing_args_size;
8986       frame->outgoing_arguments_size = crtl->outgoing_args_size;
8987     }
8988   else
8989     frame->outgoing_arguments_size = 0;
8990 
8991   /* Align stack boundary.  Only needed if we're calling another function
8992      or using alloca.  */
8993   if (!current_function_is_leaf || cfun->calls_alloca
8994       || ix86_current_function_calls_tls_descriptor)
8995     offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8996 
8997   /* We've reached end of stack frame.  */
8998   frame->stack_pointer_offset = offset;
8999 
9000   /* Size prologue needs to allocate.  */
9001   to_allocate = offset - frame->sse_reg_save_offset;
9002 
9003   if ((!to_allocate && frame->nregs <= 1)
9004       || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9005     frame->save_regs_using_mov = false;
9006 
9007   if (ix86_using_red_zone ()
9008       && current_function_sp_is_unchanging
9009       && current_function_is_leaf
9010       && !ix86_current_function_calls_tls_descriptor)
9011     {
9012       frame->red_zone_size = to_allocate;
9013       if (frame->save_regs_using_mov)
9014 	frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9015       if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9016 	frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9017     }
9018   else
9019     frame->red_zone_size = 0;
9020   frame->stack_pointer_offset -= frame->red_zone_size;
9021 
9022   /* The SEH frame pointer location is near the bottom of the frame.
9023      This is enforced by the fact that the difference between the
9024      stack pointer and the frame pointer is limited to 240 bytes in
9025      the unwind data structure.  */
9026   if (TARGET_SEH)
9027     {
9028       HOST_WIDE_INT diff;
9029 
9030       /* If we can leave the frame pointer where it is, do so.  Also, returns
9031 	 the establisher frame for __builtin_frame_address (0).  */
9032       diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9033       if (diff <= SEH_MAX_FRAME_SIZE
9034 	  && (diff > 240 || (diff & 15) != 0)
9035 	  && !crtl->accesses_prior_frames)
9036 	{
9037 	  /* Ideally we'd determine what portion of the local stack frame
9038 	     (within the constraint of the lowest 240) is most heavily used.
9039 	     But without that complication, simply bias the frame pointer
9040 	     by 128 bytes so as to maximize the amount of the local stack
9041 	     frame that is addressable with 8-bit offsets.  */
9042 	  frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9043 	}
9044     }
9045 }
9046 
9047 /* This is semi-inlined memory_address_length, but simplified
9048    since we know that we're always dealing with reg+offset, and
9049    to avoid having to create and discard all that rtl.  */
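/* Illustrative examples (not exhaustive): a zero offset from %eax needs
   no displacement (length 0), but %ebp or %r13 still need a disp8
   (length 1); offsets in -128..127 use a disp8 (length 1), anything
   else a disp32 (length 4); %esp and %r12 additionally need a SIB
   byte.  */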
9050 
9051 static inline int
9052 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9053 {
9054   int len = 4;
9055 
9056   if (offset == 0)
9057     {
9058       /* EBP and R13 cannot be encoded without an offset.  */
9059       len = (regno == BP_REG || regno == R13_REG);
9060     }
9061   else if (IN_RANGE (offset, -128, 127))
9062     len = 1;
9063 
9064   /* ESP and R12 must be encoded with a SIB byte.  */
9065   if (regno == SP_REG || regno == R12_REG)
9066     len++;
9067 
9068   return len;
9069 }
9070 
9071 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9072    The valid base registers are taken from CFUN->MACHINE->FS.  */
9073 
9074 static rtx
9075 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9076 {
9077   const struct machine_function *m = cfun->machine;
9078   rtx base_reg = NULL;
9079   HOST_WIDE_INT base_offset = 0;
9080 
9081   if (m->use_fast_prologue_epilogue)
9082     {
9083       /* Choose the base register most likely to allow the most scheduling
9084          opportunities.  Generally FP is valid throughout the function,
9085          while DRAP must be reloaded within the epilogue.  But choose either
9086          over the SP due to increased encoding size.  */
9087 
9088       if (m->fs.fp_valid)
9089 	{
9090 	  base_reg = hard_frame_pointer_rtx;
9091 	  base_offset = m->fs.fp_offset - cfa_offset;
9092 	}
9093       else if (m->fs.drap_valid)
9094 	{
9095 	  base_reg = crtl->drap_reg;
9096 	  base_offset = 0 - cfa_offset;
9097 	}
9098       else if (m->fs.sp_valid)
9099 	{
9100 	  base_reg = stack_pointer_rtx;
9101 	  base_offset = m->fs.sp_offset - cfa_offset;
9102 	}
9103     }
9104   else
9105     {
9106       HOST_WIDE_INT toffset;
9107       int len = 16, tlen;
9108 
9109       /* Choose the base register with the smallest address encoding.
9110          With a tie, choose FP > DRAP > SP.  */
9111       if (m->fs.sp_valid)
9112 	{
9113 	  base_reg = stack_pointer_rtx;
9114 	  base_offset = m->fs.sp_offset - cfa_offset;
9115           len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9116 	}
9117       if (m->fs.drap_valid)
9118 	{
9119 	  toffset = 0 - cfa_offset;
9120 	  tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9121 	  if (tlen <= len)
9122 	    {
9123 	      base_reg = crtl->drap_reg;
9124 	      base_offset = toffset;
9125 	      len = tlen;
9126 	    }
9127 	}
9128       if (m->fs.fp_valid)
9129 	{
9130 	  toffset = m->fs.fp_offset - cfa_offset;
9131 	  tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9132 	  if (tlen <= len)
9133 	    {
9134 	      base_reg = hard_frame_pointer_rtx;
9135 	      base_offset = toffset;
9136 	      len = tlen;
9137 	    }
9138 	}
9139     }
9140   gcc_assert (base_reg != NULL);
9141 
9142   return plus_constant (base_reg, base_offset);
9143 }
9144 
9145 /* Emit code to save registers in the prologue.  */
9146 
9147 static void
9148 ix86_emit_save_regs (void)
9149 {
9150   unsigned int regno;
9151   rtx insn;
9152 
9153   for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9154     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9155       {
9156 	insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9157 	RTX_FRAME_RELATED_P (insn) = 1;
9158       }
9159 }
9160 
9161 /* Emit a single register save at CFA - CFA_OFFSET.  */
9162 
9163 static void
9164 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9165 			      HOST_WIDE_INT cfa_offset)
9166 {
9167   struct machine_function *m = cfun->machine;
9168   rtx reg = gen_rtx_REG (mode, regno);
9169   rtx mem, addr, base, insn;
9170 
9171   addr = choose_baseaddr (cfa_offset);
9172   mem = gen_frame_mem (mode, addr);
9173 
9174   /* For SSE saves, we need to indicate the 128-bit alignment.  */
9175   set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9176 
9177   insn = emit_move_insn (mem, reg);
9178   RTX_FRAME_RELATED_P (insn) = 1;
9179 
9180   base = addr;
9181   if (GET_CODE (base) == PLUS)
9182     base = XEXP (base, 0);
9183   gcc_checking_assert (REG_P (base));
9184 
9185   /* When saving registers into a re-aligned local stack frame, avoid
9186      any tricky guessing by dwarf2out.  */
9187   if (m->fs.realigned)
9188     {
9189       gcc_checking_assert (stack_realign_drap);
9190 
9191       if (regno == REGNO (crtl->drap_reg))
9192 	{
9193 	  /* A bit of a hack.  We force the DRAP register to be saved in
9194 	     the re-aligned stack frame, which provides us with a copy
9195 	     of the CFA that will last past the prologue.  Install it.  */
9196 	  gcc_checking_assert (cfun->machine->fs.fp_valid);
9197 	  addr = plus_constant (hard_frame_pointer_rtx,
9198 				cfun->machine->fs.fp_offset - cfa_offset);
9199 	  mem = gen_rtx_MEM (mode, addr);
9200 	  add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9201 	}
9202       else
9203 	{
9204 	  /* The frame pointer is a stable reference within the
9205 	     aligned frame.  Use it.  */
9206 	  gcc_checking_assert (cfun->machine->fs.fp_valid);
9207 	  addr = plus_constant (hard_frame_pointer_rtx,
9208 				cfun->machine->fs.fp_offset - cfa_offset);
9209 	  mem = gen_rtx_MEM (mode, addr);
9210 	  add_reg_note (insn, REG_CFA_EXPRESSION,
9211 			gen_rtx_SET (VOIDmode, mem, reg));
9212 	}
9213     }
9214 
9215   /* The memory may not be relative to the current CFA register,
9216      which means that we may need to generate a new pattern for
9217      use by the unwind info.  */
9218   else if (base != m->fs.cfa_reg)
9219     {
9220       addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9221       mem = gen_rtx_MEM (mode, addr);
9222       add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9223     }
9224 }
9225 
9226 /* Emit code to save registers using MOV insns.
9227    First register is stored at CFA - CFA_OFFSET.  */
9228 static void
9229 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9230 {
9231   unsigned int regno;
9232 
9233   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9234     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9235       {
9236         ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9237 	cfa_offset -= UNITS_PER_WORD;
9238       }
9239 }
9240 
9241 /* Emit code to save SSE registers using MOV insns.
9242    First register is stored at CFA - CFA_OFFSET.  */
9243 static void
9244 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9245 {
9246   unsigned int regno;
9247 
9248   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9249     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9250       {
9251 	ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9252 	cfa_offset -= 16;
9253       }
9254 }
9255 
9256 static GTY(()) rtx queued_cfa_restores;
9257 
9258 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9259    manipulation insn.  The value is on the stack at CFA - CFA_OFFSET.
9260    Don't add the note if the previously saved value will be left untouched
9261    within stack red-zone till return, as unwinders can find the same value
9262    in the register and on the stack.  */
9263 
9264 static void
9265 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9266 {
9267   if (!crtl->shrink_wrapped
9268       && cfa_offset <= cfun->machine->fs.red_zone_offset)
9269     return;
9270 
9271   if (insn)
9272     {
9273       add_reg_note (insn, REG_CFA_RESTORE, reg);
9274       RTX_FRAME_RELATED_P (insn) = 1;
9275     }
9276   else
9277     queued_cfa_restores
9278       = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9279 }
9280 
9281 /* Add queued REG_CFA_RESTORE notes if any to INSN.  */
9282 
9283 static void
9284 ix86_add_queued_cfa_restore_notes (rtx insn)
9285 {
9286   rtx last;
9287   if (!queued_cfa_restores)
9288     return;
9289   for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9290     ;
9291   XEXP (last, 1) = REG_NOTES (insn);
9292   REG_NOTES (insn) = queued_cfa_restores;
9293   queued_cfa_restores = NULL_RTX;
9294   RTX_FRAME_RELATED_P (insn) = 1;
9295 }
9296 
9297 /* Expand prologue or epilogue stack adjustment.
9298    The pattern exists to put a dependency on all ebp-based memory accesses.
9299    STYLE should be negative if instructions should be marked as frame related,
9300    zero if %r11 register is live and cannot be freely used and positive
9301    otherwise.  */
9302 
9303 static void
9304 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9305 			   int style, bool set_cfa)
9306 {
9307   struct machine_function *m = cfun->machine;
9308   rtx insn;
9309   bool add_frame_related_expr = false;
9310 
9311   if (! TARGET_64BIT)
9312     insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9313   else if (x86_64_immediate_operand (offset, DImode))
9314     insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9315   else
9316     {
9317       rtx tmp;
9318       /* r11 is used by indirect sibcall return as well, set before the
9319 	 epilogue and used after the epilogue.  */
9320       if (style)
9321         tmp = gen_rtx_REG (DImode, R11_REG);
9322       else
9323 	{
9324 	  gcc_assert (src != hard_frame_pointer_rtx
9325 		      && dest != hard_frame_pointer_rtx);
9326 	  tmp = hard_frame_pointer_rtx;
9327 	}
9328       insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9329       if (style < 0)
9330 	add_frame_related_expr = true;
9331 
9332       insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9333     }
9334 
9335   insn = emit_insn (insn);
9336   if (style >= 0)
9337     ix86_add_queued_cfa_restore_notes (insn);
9338 
9339   if (set_cfa)
9340     {
9341       rtx r;
9342 
9343       gcc_assert (m->fs.cfa_reg == src);
9344       m->fs.cfa_offset += INTVAL (offset);
9345       m->fs.cfa_reg = dest;
9346 
9347       r = gen_rtx_PLUS (Pmode, src, offset);
9348       r = gen_rtx_SET (VOIDmode, dest, r);
9349       add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9350       RTX_FRAME_RELATED_P (insn) = 1;
9351     }
9352   else if (style < 0)
9353     {
9354       RTX_FRAME_RELATED_P (insn) = 1;
9355       if (add_frame_related_expr)
9356 	{
9357 	  rtx r = gen_rtx_PLUS (Pmode, src, offset);
9358 	  r = gen_rtx_SET (VOIDmode, dest, r);
9359 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9360 	}
9361     }
9362 
9363   if (dest == stack_pointer_rtx)
9364     {
9365       HOST_WIDE_INT ooffset = m->fs.sp_offset;
9366       bool valid = m->fs.sp_valid;
9367 
9368       if (src == hard_frame_pointer_rtx)
9369 	{
9370 	  valid = m->fs.fp_valid;
9371 	  ooffset = m->fs.fp_offset;
9372 	}
9373       else if (src == crtl->drap_reg)
9374 	{
9375 	  valid = m->fs.drap_valid;
9376 	  ooffset = 0;
9377 	}
9378       else
9379 	{
9380 	  /* Else there are two possibilities: SP itself, which we set
9381 	     up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9382 	     taken care of by hand along the eh_return path.  */
9383 	  gcc_checking_assert (src == stack_pointer_rtx
9384 			       || offset == const0_rtx);
9385 	}
9386 
9387       m->fs.sp_offset = ooffset - INTVAL (offset);
9388       m->fs.sp_valid = valid;
9389     }
9390 }
9391 
9392 /* Find an available register to be used as dynamic realign argument
9393    pointer register.  Such a register will be written in the prologue and
9394    used at the beginning of the body, so it must not be
9395 	1. parameter passing register.
9396 	2. GOT pointer.
9397    We reuse static-chain register if it is available.  Otherwise, we
9398    use DI for i386 and R13 for x86-64.  We chose R13 since it has
9399    shorter encoding.
9400 
9401    Return: the regno of the chosen register.  */
9402 
9403 static unsigned int
9404 find_drap_reg (void)
9405 {
9406   tree decl = cfun->decl;
9407 
9408   if (TARGET_64BIT)
9409     {
9410       /* Use R13 for a nested function or a function needing a static chain.
9411 	 Since a function with a tail call may use any caller-saved
9412 	 register in the epilogue, DRAP must not use a caller-saved
9413 	 register in such a case.  */
9414       if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9415 	return R13_REG;
9416 
9417       return R10_REG;
9418     }
9419   else
9420     {
9421       /* Use DI for a nested function or a function needing a static chain.
9422 	 Since a function with a tail call may use any caller-saved
9423 	 register in the epilogue, DRAP must not use a caller-saved
9424 	 register in such a case.  */
9425       if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9426 	return DI_REG;
9427 
9428       /* Reuse static chain register if it isn't used for parameter
9429          passing.  */
9430       if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9431 	{
9432 	  unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9433 	  if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9434 	    return CX_REG;
9435 	}
9436       return DI_REG;
9437     }
9438 }
9439 
9440 /* Return minimum incoming stack alignment.  */
9441 
9442 static unsigned int
9443 ix86_minimum_incoming_stack_boundary (bool sibcall)
9444 {
9445   unsigned int incoming_stack_boundary;
9446 
9447   /* Prefer the boundary specified on the command line.  */
9448   if (ix86_user_incoming_stack_boundary)
9449     incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9450   /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9451      when -mstackrealign is used, this is not a sibcall check, and the
9452      estimated stack alignment is 128 bits.  */
9453   else if (!sibcall
9454 	   && !TARGET_64BIT
9455 	   && ix86_force_align_arg_pointer
9456 	   && crtl->stack_alignment_estimated == 128)
9457     incoming_stack_boundary = MIN_STACK_BOUNDARY;
9458   else
9459     incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9460 
9461   /* Incoming stack alignment can be changed on individual functions
9462      via force_align_arg_pointer attribute.  We use the smallest
9463      incoming stack boundary.  */
9464   if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9465       && lookup_attribute (ix86_force_align_arg_pointer_string,
9466 			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9467     incoming_stack_boundary = MIN_STACK_BOUNDARY;
9468 
9469   /* The incoming stack frame has to be aligned at least at
9470      parm_stack_boundary.  */
9471   if (incoming_stack_boundary < crtl->parm_stack_boundary)
9472     incoming_stack_boundary = crtl->parm_stack_boundary;
9473 
9474   /* The stack at the entry of main is aligned by the runtime.  We use
9475      the smallest incoming stack boundary.  */
9476   if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9477       && DECL_NAME (current_function_decl)
9478       && MAIN_NAME_P (DECL_NAME (current_function_decl))
9479       && DECL_FILE_SCOPE_P (current_function_decl))
9480     incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9481 
9482   return incoming_stack_boundary;
9483 }
9484 
9485 /* Update incoming stack boundary and estimated stack alignment.  */
9486 
9487 static void
9488 ix86_update_stack_boundary (void)
9489 {
9490   ix86_incoming_stack_boundary
9491     = ix86_minimum_incoming_stack_boundary (false);
9492 
9493   /* x86_64 varargs needs 16-byte stack alignment for the register save
9494      area.  */
9495   if (TARGET_64BIT
9496       && cfun->stdarg
9497       && crtl->stack_alignment_estimated < 128)
9498     crtl->stack_alignment_estimated = 128;
9499 }
9500 
9501 /* Handle the TARGET_GET_DRAP_RTX hook.  Return NULL if no DRAP is
9502    needed or an rtx for DRAP otherwise.  */
9503 
9504 static rtx
9505 ix86_get_drap_rtx (void)
9506 {
9507   if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9508     crtl->need_drap = true;
9509 
9510   if (stack_realign_drap)
9511     {
9512       /* Assign DRAP to vDRAP and return vDRAP.  */
9513       unsigned int regno = find_drap_reg ();
9514       rtx drap_vreg;
9515       rtx arg_ptr;
9516       rtx seq, insn;
9517 
9518       arg_ptr = gen_rtx_REG (Pmode, regno);
9519       crtl->drap_reg = arg_ptr;
9520 
9521       start_sequence ();
9522       drap_vreg = copy_to_reg (arg_ptr);
9523       seq = get_insns ();
9524       end_sequence ();
9525 
9526       insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9527       if (!optimize)
9528 	{
9529 	  add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9530 	  RTX_FRAME_RELATED_P (insn) = 1;
9531 	}
9532       return drap_vreg;
9533     }
9534   else
9535     return NULL;
9536 }
9537 
9538 /* Handle the TARGET_INTERNAL_ARG_POINTER hook.  */
9539 
9540 static rtx
9541 ix86_internal_arg_pointer (void)
9542 {
9543   return virtual_incoming_args_rtx;
9544 }
9545 
9546 struct scratch_reg {
9547   rtx reg;
9548   bool saved;
9549 };
9550 
9551 /* Return a short-lived scratch register for use on function entry.
9552    In 32-bit mode, it is valid only after the registers are saved
9553    in the prologue.  This register must be released by means of
9554    release_scratch_register_on_entry once it is dead.  */
9555 
9556 static void
9557 get_scratch_register_on_entry (struct scratch_reg *sr)
9558 {
9559   int regno;
9560 
9561   sr->saved = false;
9562 
9563   if (TARGET_64BIT)
9564     {
9565       /* We always use R11 in 64-bit mode.  */
9566       regno = R11_REG;
9567     }
9568   else
9569     {
9570       tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9571       bool fastcall_p
9572 	= lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9573       bool thiscall_p
9574 	= lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9575       bool static_chain_p = DECL_STATIC_CHAIN (decl);
9576       int regparm = ix86_function_regparm (fntype, decl);
9577       int drap_regno
9578 	= crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9579 
9580       /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9581 	  for the static chain register.  */
9582       if ((regparm < 1 || (fastcall_p && !static_chain_p))
9583 	  && drap_regno != AX_REG)
9584 	regno = AX_REG;
9585       /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9586 	  for the static chain register.  */
9587       else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9588         regno = AX_REG;
9589       else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9590 	regno = DX_REG;
9591       /* ecx is the static chain register.  */
9592       else if (regparm < 3 && !fastcall_p && !thiscall_p
9593 	       && !static_chain_p
9594 	       && drap_regno != CX_REG)
9595 	regno = CX_REG;
9596       else if (ix86_save_reg (BX_REG, true))
9597 	regno = BX_REG;
9598       /* esi is the static chain register.  */
9599       else if (!(regparm == 3 && static_chain_p)
9600 	       && ix86_save_reg (SI_REG, true))
9601 	regno = SI_REG;
9602       else if (ix86_save_reg (DI_REG, true))
9603 	regno = DI_REG;
9604       else
9605 	{
9606 	  regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9607 	  sr->saved = true;
9608 	}
9609     }
9610 
9611   sr->reg = gen_rtx_REG (Pmode, regno);
9612   if (sr->saved)
9613     {
9614       rtx insn = emit_insn (gen_push (sr->reg));
9615       RTX_FRAME_RELATED_P (insn) = 1;
9616     }
9617 }
9618 
9619 /* Release a scratch register obtained from the preceding function.  */
9620 
9621 static void
9622 release_scratch_register_on_entry (struct scratch_reg *sr)
9623 {
9624   if (sr->saved)
9625     {
9626       struct machine_function *m = cfun->machine;
9627       rtx x, insn = emit_insn (gen_pop (sr->reg));
9628 
9629       /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop.  */
9630       RTX_FRAME_RELATED_P (insn) = 1;
9631       x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9632       x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9633       add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9634       m->fs.sp_offset -= UNITS_PER_WORD;
9635     }
9636 }
9637 
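/* Unless the target overrides STACK_CHECK_PROBE_INTERVAL_EXP, it defaults
   to 12, so the probe interval below is normally 4096 bytes (one page).  */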
9638 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9639 
9640 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.  */
9641 
9642 static void
9643 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9644 {
9645   /* We skip the probe for the first interval + a small dope of 4 words and
9646      probe that many bytes past the specified size to maintain a protection
9647      area at the bottom of the stack.  */
9648   const int dope = 4 * UNITS_PER_WORD;
9649   rtx size_rtx = GEN_INT (size), last;
9650 
9651   /* See if we have a constant small number of probes to generate.  If so,
9652      that's the easy case.  The run-time loop is made up of 11 insns in the
9653      generic case while the compile-time loop is made up of 3+2*(n-1) insns
9654      for n # of intervals.  */
9655   if (size <= 5 * PROBE_INTERVAL)
9656     {
9657       HOST_WIDE_INT i, adjust;
9658       bool first_probe = true;
9659 
9660       /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9661 	 values of N from 1 until it exceeds SIZE.  If only one probe is
9662 	 needed, this will not generate any code.  Then adjust and probe
9663 	 to PROBE_INTERVAL + SIZE.  */
9664       for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9665 	{
9666 	  if (first_probe)
9667 	    {
9668 	      adjust = 2 * PROBE_INTERVAL + dope;
9669 	      first_probe = false;
9670 	    }
9671 	  else
9672 	    adjust = PROBE_INTERVAL;
9673 
9674 	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9675 				  plus_constant (stack_pointer_rtx, -adjust)));
9676 	  emit_stack_probe (stack_pointer_rtx);
9677 	}
9678 
9679       if (first_probe)
9680 	adjust = size + PROBE_INTERVAL + dope;
9681       else
9682         adjust = size + PROBE_INTERVAL - i;
9683 
9684       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9685 			      plus_constant (stack_pointer_rtx, -adjust)));
9686       emit_stack_probe (stack_pointer_rtx);
9687 
9688       /* Adjust back to account for the additional first interval.  */
9689       last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9690 				     plus_constant (stack_pointer_rtx,
9691 						    PROBE_INTERVAL + dope)));
9692     }
9693 
9694   /* Otherwise, do the same as above, but in a loop.  Note that we must be
9695      extra careful with variables wrapping around because we might be at
9696      the very top (or the very bottom) of the address space and we have
9697      to be able to handle this case properly; in particular, we use an
9698      equality test for the loop condition.  */
9699   else
9700     {
9701       HOST_WIDE_INT rounded_size;
9702       struct scratch_reg sr;
9703 
9704       get_scratch_register_on_entry (&sr);
9705 
9706 
9707       /* Step 1: round SIZE to the previous multiple of the interval.  */
9708 
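      /* Since PROBE_INTERVAL is a power of two, ANDing with -PROBE_INTERVAL
	 clears the low-order bits, rounding SIZE down to a multiple of the
	 interval.  */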
9709       rounded_size = size & -PROBE_INTERVAL;
9710 
9711 
9712       /* Step 2: compute initial and final value of the loop counter.  */
9713 
9714       /* SP = SP_0 + PROBE_INTERVAL.  */
9715       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9716 			      plus_constant (stack_pointer_rtx,
9717 					     - (PROBE_INTERVAL + dope))));
9718 
9719       /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE.  */
9720       emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9721       emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9722 			      gen_rtx_PLUS (Pmode, sr.reg,
9723 					    stack_pointer_rtx)));
9724 
9725 
9726       /* Step 3: the loop
9727 
9728 	 while (SP != LAST_ADDR)
9729 	   {
9730 	     SP = SP + PROBE_INTERVAL
9731 	     probe at SP
9732 	   }
9733 
9734 	 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9735 	 values of N from 1 until it is equal to ROUNDED_SIZE.  */
9736 
9737       emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9738 
9739 
9740       /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9741 	 assert at compile-time that SIZE is equal to ROUNDED_SIZE.  */
9742 
9743       if (size != rounded_size)
9744 	{
9745 	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9746 			          plus_constant (stack_pointer_rtx,
9747 						 rounded_size - size)));
9748 	  emit_stack_probe (stack_pointer_rtx);
9749 	}
9750 
9751       /* Adjust back to account for the additional first interval.  */
9752       last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9753 				     plus_constant (stack_pointer_rtx,
9754 						    PROBE_INTERVAL + dope)));
9755 
9756       release_scratch_register_on_entry (&sr);
9757     }
9758 
9759   gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9760 
9761   /* Even if the stack pointer isn't the CFA register, we need to correctly
9762      describe the adjustments made to it, in particular differentiate the
9763      frame-related ones from the frame-unrelated ones.  */
9764   if (size > 0)
9765     {
9766       rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9767       XVECEXP (expr, 0, 0)
9768 	= gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9769 		       plus_constant (stack_pointer_rtx, -size));
9770       XVECEXP (expr, 0, 1)
9771 	= gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9772 		       plus_constant (stack_pointer_rtx,
9773 				      PROBE_INTERVAL + dope + size));
9774       add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9775       RTX_FRAME_RELATED_P (last) = 1;
9776 
9777       cfun->machine->fs.sp_offset += size;
9778     }
9779 
9780   /* Make sure nothing is scheduled before we are done.  */
9781   emit_insn (gen_blockage ());
9782 }
9783 
9784 /* Adjust the stack pointer up to REG while probing it.  */
9785 
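/* For illustration only: on a 32-bit target with the default 4096-byte
   probe interval, and assuming %eax is the scratch register holding
   LAST_ADDR, the loop emitted below looks roughly like

	.LPSRL0:
		cmpl	%eax, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:

   where the "orl $0" is the probe that touches each newly exposed page
   without changing its contents.  */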
9786 const char *
9787 output_adjust_stack_and_probe (rtx reg)
9788 {
9789   static int labelno = 0;
9790   char loop_lab[32], end_lab[32];
9791   rtx xops[2];
9792 
9793   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9794   ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9795 
9796   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9797 
9798   /* Jump to END_LAB if SP == LAST_ADDR.  */
9799   xops[0] = stack_pointer_rtx;
9800   xops[1] = reg;
9801   output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9802   fputs ("\tje\t", asm_out_file);
9803   assemble_name_raw (asm_out_file, end_lab);
9804   fputc ('\n', asm_out_file);
9805 
9806   /* SP = SP + PROBE_INTERVAL.  */
9807   xops[1] = GEN_INT (PROBE_INTERVAL);
9808   output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9809 
9810   /* Probe at SP.  */
9811   xops[1] = const0_rtx;
9812   output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9813 
9814   fprintf (asm_out_file, "\tjmp\t");
9815   assemble_name_raw (asm_out_file, loop_lab);
9816   fputc ('\n', asm_out_file);
9817 
9818   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9819 
9820   return "";
9821 }
9822 
9823 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9824    inclusive.  These are offsets from the current stack pointer.  */
9825 
9826 static void
9827 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9828 {
9829   /* See if we have a constant small number of probes to generate.  If so,
9830      that's the easy case.  The run-time loop is made up of 7 insns in the
9831      generic case while the compile-time loop is made up of n insns for n #
9832      of intervals.  */
9833   if (size <= 7 * PROBE_INTERVAL)
9834     {
9835       HOST_WIDE_INT i;
9836 
9837       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9838 	 it exceeds SIZE.  If only one probe is needed, this will not
9839 	 generate any code.  Then probe at FIRST + SIZE.  */
9840       for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9841 	emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9842 
9843       emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9844     }
9845 
9846   /* Otherwise, do the same as above, but in a loop.  Note that we must be
9847      extra careful with variables wrapping around because we might be at
9848      the very top (or the very bottom) of the address space and we have
9849      to be able to handle this case properly; in particular, we use an
9850      equality test for the loop condition.  */
9851   else
9852     {
9853       HOST_WIDE_INT rounded_size, last;
9854       struct scratch_reg sr;
9855 
9856       get_scratch_register_on_entry (&sr);
9857 
9858 
9859       /* Step 1: round SIZE to the previous multiple of the interval.  */
9860 
9861       rounded_size = size & -PROBE_INTERVAL;
9862 
9863 
9864       /* Step 2: compute initial and final value of the loop counter.  */
9865 
9866       /* TEST_OFFSET = FIRST.  */
9867       emit_move_insn (sr.reg, GEN_INT (-first));
9868 
9869       /* LAST_OFFSET = FIRST + ROUNDED_SIZE.  */
9870       last = first + rounded_size;
9871 
9872 
9873       /* Step 3: the loop
9874 
9875 	 while (TEST_ADDR != LAST_ADDR)
9876 	   {
9877 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9878 	     probe at TEST_ADDR
9879 	   }
9880 
9881          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9882          until it is equal to ROUNDED_SIZE.  */
9883 
9884       emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9885 
9886 
9887       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9888 	 that SIZE is equal to ROUNDED_SIZE.  */
9889 
9890       if (size != rounded_size)
9891 	emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9892 						       stack_pointer_rtx,
9893 						       sr.reg),
9894 					 rounded_size - size));
9895 
9896       release_scratch_register_on_entry (&sr);
9897     }
9898 
9899   /* Make sure nothing is scheduled before we are done.  */
9900   emit_insn (gen_blockage ());
9901 }
9902 
9903 /* Probe a range of stack addresses from REG to END, inclusive.  These are
9904    offsets from the current stack pointer.  */
9905 
9906 const char *
9907 output_probe_stack_range (rtx reg, rtx end)
9908 {
9909   static int labelno = 0;
9910   char loop_lab[32], end_lab[32];
9911   rtx xops[3];
9912 
9913   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9914   ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9915 
9916   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9917 
9918   /* Jump to END_LAB if TEST_ADDR == LAST_ADDR.  */
9919   xops[0] = reg;
9920   xops[1] = end;
9921   output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9922   fputs ("\tje\t", asm_out_file);
9923   assemble_name_raw (asm_out_file, end_lab);
9924   fputc ('\n', asm_out_file);
9925 
9926   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
9927   xops[1] = GEN_INT (PROBE_INTERVAL);
9928   output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9929 
9930   /* Probe at TEST_ADDR.  */
9931   xops[0] = stack_pointer_rtx;
9932   xops[1] = reg;
9933   xops[2] = const0_rtx;
9934   output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9935 
9936   fprintf (asm_out_file, "\tjmp\t");
9937   assemble_name_raw (asm_out_file, loop_lab);
9938   fputc ('\n', asm_out_file);
9939 
9940   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9941 
9942   return "";
9943 }
9944 
9945 /* Finalize the stack_realign_needed flag, which will guide the prologue/epilogue
9946    to be generated in the correct form.  */
9947 static void
9948 ix86_finalize_stack_realign_flags (void)
9949 {
9950   /* Check if stack realignment is really needed after reload, and
9951      store the result in cfun.  */
9952   unsigned int incoming_stack_boundary
9953     = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9954        ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9955   unsigned int stack_realign = (incoming_stack_boundary
9956 				< (current_function_is_leaf
9957 				   ? crtl->max_used_stack_slot_alignment
9958 				   : crtl->stack_alignment_needed));
9959 
9960   if (crtl->stack_realign_finalized)
9961     {
9962       /* After stack_realign_needed is finalized, we can no longer
9963 	 change it.  */
9964       gcc_assert (crtl->stack_realign_needed == stack_realign);
9965       return;
9966     }
9967 
9968   /* If the only reason for frame_pointer_needed is that we conservatively
9969      assumed stack realignment might be needed, but in the end nothing that
9970      needed the stack alignment had been spilled, clear frame_pointer_needed
9971      and say we don't need stack realignment.  */
9972   if (stack_realign
9973       && !crtl->need_drap
9974       && frame_pointer_needed
9975       && current_function_is_leaf
9976       && flag_omit_frame_pointer
9977       && current_function_sp_is_unchanging
9978       && !ix86_current_function_calls_tls_descriptor
9979       && !crtl->accesses_prior_frames
9980       && !cfun->calls_alloca
9981       && !crtl->calls_eh_return
9982       && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9983       && !ix86_frame_pointer_required ()
9984       && get_frame_size () == 0
9985       && ix86_nsaved_sseregs () == 0
9986       && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9987     {
9988       HARD_REG_SET set_up_by_prologue, prologue_used;
9989       basic_block bb;
9990 
9991       CLEAR_HARD_REG_SET (prologue_used);
9992       CLEAR_HARD_REG_SET (set_up_by_prologue);
9993       add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9994       add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9995       add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9996 			   HARD_FRAME_POINTER_REGNUM);
9997       FOR_EACH_BB (bb)
9998         {
9999           rtx insn;
10000 	  FOR_BB_INSNS (bb, insn)
10001 	    if (NONDEBUG_INSN_P (insn)
10002 		&& requires_stack_frame_p (insn, prologue_used,
10003 					   set_up_by_prologue))
10004 	      {
10005 		crtl->stack_realign_needed = stack_realign;
10006 		crtl->stack_realign_finalized = true;
10007 		return;
10008 	      }
10009 	}
10010 
10011       frame_pointer_needed = false;
10012       stack_realign = false;
10013       crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10014       crtl->stack_alignment_needed = incoming_stack_boundary;
10015       crtl->stack_alignment_estimated = incoming_stack_boundary;
10016       if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10017 	crtl->preferred_stack_boundary = incoming_stack_boundary;
10018       df_finish_pass (true);
10019       df_scan_alloc (NULL);
10020       df_scan_blocks ();
10021       df_compute_regs_ever_live (true);
10022       df_analyze ();
10023     }
10024 
10025   crtl->stack_realign_needed = stack_realign;
10026   crtl->stack_realign_finalized = true;
10027 }
10028 
10029 /* Expand the prologue into a bunch of separate insns.  */
10030 
10031 void
10032 ix86_expand_prologue (void)
10033 {
10034   struct machine_function *m = cfun->machine;
10035   rtx insn, t;
10036   bool pic_reg_used;
10037   struct ix86_frame frame;
10038   HOST_WIDE_INT allocate;
10039   bool int_registers_saved;
10040   bool sse_registers_saved;
10041 
10042   ix86_finalize_stack_realign_flags ();
10043 
10044   /* DRAP should not coexist with stack_realign_fp */
10045   gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10046 
10047   memset (&m->fs, 0, sizeof (m->fs));
10048 
10049   /* Initialize CFA state for before the prologue.  */
10050   m->fs.cfa_reg = stack_pointer_rtx;
10051   m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10052 
10053   /* Track SP offset to the CFA.  We continue tracking this after we've
10054      swapped the CFA register away from SP.  In the case of re-alignment
10055      this is fudged; we're interested in offsets within the local frame.  */
10056   m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10057   m->fs.sp_valid = true;
10058 
10059   ix86_compute_frame_layout (&frame);
10060 
10061   if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10062     {
10063       /* We should have already generated an error for any use of
10064          ms_hook on a nested function.  */
10065       gcc_checking_assert (!ix86_static_chain_on_stack);
10066 
10067       /* Check if profiling is active and we shall use the
10068          profiling-before-prologue variant.  If so, sorry.  */
10069       if (crtl->profile && flag_fentry != 0)
10070         sorry ("ms_hook_prologue attribute isn%'t compatible "
10071 	       "with -mfentry for 32-bit");
10072 
10073       /* In ix86_asm_output_function_label we emitted:
10074 	 8b ff     movl.s %edi,%edi
10075 	 55        push   %ebp
10076 	 8b ec     movl.s %esp,%ebp
10077 
10078 	 This matches the hookable function prologue in Win32 API
10079 	 functions in Microsoft Windows XP Service Pack 2 and newer.
10080 	 Wine uses this to enable Windows apps to hook the Win32 API
10081 	 functions provided by Wine.
10082 
10083 	 What that means is that we've already set up the frame pointer.  */
10084 
10085       if (frame_pointer_needed
10086 	  && !(crtl->drap_reg && crtl->stack_realign_needed))
10087 	{
10088 	  rtx push, mov;
10089 
10090 	  /* We've decided to use the frame pointer already set up.
10091 	     Describe this to the unwinder by pretending that both
10092 	     push and mov insns happen right here.
10093 
10094 	     Putting the unwind info here at the end of the ms_hook
10095 	     is done so that we can make absolutely certain we get
10096 	     the required byte sequence at the start of the function,
10097 	     rather than relying on an assembler that can produce
10098 	     the exact encoding required.
10099 
10100 	     However it does mean (in the unpatched case) that we have
10101 	     a 1 insn window where the asynchronous unwind info is
10102 	     incorrect.  However, if we placed the unwind info at
10103 	     its correct location we would have incorrect unwind info
10104 	     in the patched case.  Which is probably all moot since
10105 	     I don't expect Wine generates dwarf2 unwind info for the
10106 	     system libraries that use this feature.  */
10107 
10108 	  insn = emit_insn (gen_blockage ());
10109 
10110 	  push = gen_push (hard_frame_pointer_rtx);
10111 	  mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10112 			     stack_pointer_rtx);
10113 	  RTX_FRAME_RELATED_P (push) = 1;
10114 	  RTX_FRAME_RELATED_P (mov) = 1;
10115 
10116 	  RTX_FRAME_RELATED_P (insn) = 1;
10117 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10118 			gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10119 
10120 	  /* Note that gen_push incremented m->fs.cfa_offset, even
10121 	     though we didn't emit the push insn here.  */
10122 	  m->fs.cfa_reg = hard_frame_pointer_rtx;
10123 	  m->fs.fp_offset = m->fs.cfa_offset;
10124 	  m->fs.fp_valid = true;
10125 	}
10126       else
10127 	{
10128 	  /* The frame pointer is not needed so pop %ebp again.
10129 	     This leaves us with a pristine state.  */
10130 	  emit_insn (gen_pop (hard_frame_pointer_rtx));
10131 	}
10132     }
10133 
10134   /* The first insn of a function that accepts its static chain on the
10135      stack is to push the register that would be filled in by a direct
10136      call.  This insn will be skipped by the trampoline.  */
10137   else if (ix86_static_chain_on_stack)
10138     {
10139       insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10140       emit_insn (gen_blockage ());
10141 
10142       /* We don't want to interpret this push insn as a register save,
10143 	 only as a stack adjustment.  The real copy of the register as
10144 	 a save will be done later, if needed.  */
10145       t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10146       t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10147       add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10148       RTX_FRAME_RELATED_P (insn) = 1;
10149     }
10150 
10151   /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10152      DRAP is needed and stack realignment is really needed after reload.  */
10153   if (stack_realign_drap)
10154     {
10155       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10156 
10157       /* Only need to push parameter pointer reg if it is caller saved.  */
10158       if (!call_used_regs[REGNO (crtl->drap_reg)])
10159 	{
10160 	  /* Push arg pointer reg */
10161 	  insn = emit_insn (gen_push (crtl->drap_reg));
10162 	  RTX_FRAME_RELATED_P (insn) = 1;
10163 	}
10164 
10165       /* Grab the argument pointer.  */
10166       t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10167       insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10168       RTX_FRAME_RELATED_P (insn) = 1;
10169       m->fs.cfa_reg = crtl->drap_reg;
10170       m->fs.cfa_offset = 0;
10171 
10172       /* Align the stack.  */
10173       insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10174 					stack_pointer_rtx,
10175 					GEN_INT (-align_bytes)));
10176       RTX_FRAME_RELATED_P (insn) = 1;
10177 
10178       /* Replicate the return address on the stack so that return
10179 	 address can be reached via (argp - 1) slot.  This is needed
10180 	 to implement macro RETURN_ADDR_RTX and intrinsic function
10181 	 expand_builtin_return_addr etc.  */
10182       t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10183       t = gen_frame_mem (Pmode, t);
10184       insn = emit_insn (gen_push (t));
10185       RTX_FRAME_RELATED_P (insn) = 1;
10186 
10187       /* For the purposes of frame and register save area addressing,
10188 	 we've started over with a new frame.  */
10189       m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10190       m->fs.realigned = true;
10191     }
10192 
10193   int_registers_saved = (frame.nregs == 0);
10194   sse_registers_saved = (frame.nsseregs == 0);
10195 
10196   if (frame_pointer_needed && !m->fs.fp_valid)
10197     {
10198       /* Note: AT&T enter does NOT have reversed args.  Enter is probably
10199          slower on all targets.  Also sdb doesn't like it.  */
10200       insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10201       RTX_FRAME_RELATED_P (insn) = 1;
10202 
10203       /* Push registers now, before setting the frame pointer
10204 	 on SEH target.  */
10205       if (!int_registers_saved
10206 	  && TARGET_SEH
10207 	  && !frame.save_regs_using_mov)
10208 	{
10209 	  ix86_emit_save_regs ();
10210 	  int_registers_saved = true;
10211 	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10212 	}
10213 
10214       if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10215 	{
10216 	  insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10217 	  RTX_FRAME_RELATED_P (insn) = 1;
10218 
10219 	  if (m->fs.cfa_reg == stack_pointer_rtx)
10220 	    m->fs.cfa_reg = hard_frame_pointer_rtx;
10221 	  m->fs.fp_offset = m->fs.sp_offset;
10222 	  m->fs.fp_valid = true;
10223 	}
10224     }
10225 
10226   if (!int_registers_saved)
10227     {
10228       /* If saving registers via PUSH, do so now.  */
10229       if (!frame.save_regs_using_mov)
10230 	{
10231 	  ix86_emit_save_regs ();
10232 	  int_registers_saved = true;
10233 	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10234 	}
10235 
10236       /* When using the red zone we may start saving registers before allocating
10237 	 the stack frame, saving one cycle of the prologue.  However, avoid
10238 	 doing this if we have to probe the stack; at least on x86_64 the
10239 	 stack probe can turn into a call that clobbers a red zone location. */
10240       else if (ix86_using_red_zone ()
10241 	       && (! TARGET_STACK_PROBE
10242 		   || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10243 	{
10244 	  ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10245 	  int_registers_saved = true;
10246 	}
10247     }
10248 
10249   if (stack_realign_fp)
10250     {
10251       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10252       gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10253 
10254       /* The computation of the size of the re-aligned stack frame means
10255 	 that we must allocate the size of the register save area before
10256 	 performing the actual alignment.  Otherwise we cannot guarantee
10257 	 that there's enough storage above the realignment point.  */
10258       if (m->fs.sp_offset != frame.sse_reg_save_offset)
10259         pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10260 				   GEN_INT (m->fs.sp_offset
10261 					    - frame.sse_reg_save_offset),
10262 				   -1, false);
10263 
10264       /* Align the stack.  */
10265       insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10266 					stack_pointer_rtx,
10267 					GEN_INT (-align_bytes)));
10268 
10269       /* For the purposes of register save area addressing, the stack
10270          pointer is no longer valid.  As for the value of sp_offset,
10271 	 see ix86_compute_frame_layout, which we need to match in order
10272 	 to pass verification of stack_pointer_offset at the end.  */
10273       m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10274       m->fs.sp_valid = false;
10275     }
10276 
10277   allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10278 
10279   if (flag_stack_usage_info)
10280     {
10281       /* We start to count from ARG_POINTER.  */
10282       HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10283 
10284       /* If it was realigned, take into account the fake frame.  */
10285       if (stack_realign_drap)
10286 	{
10287 	  if (ix86_static_chain_on_stack)
10288 	    stack_size += UNITS_PER_WORD;
10289 
10290 	  if (!call_used_regs[REGNO (crtl->drap_reg)])
10291 	    stack_size += UNITS_PER_WORD;
10292 
10293 	  /* This over-estimates by 1 minimal-stack-alignment-unit but
10294 	     mitigates that by counting in the new return address slot.  */
10295 	  current_function_dynamic_stack_size
10296 	    += crtl->stack_alignment_needed / BITS_PER_UNIT;
10297 	}
10298 
10299       current_function_static_stack_size = stack_size;
10300     }
10301 
10302   /* On SEH targets with a very large frame size, allocate an area to save
10303      SSE registers (as the very large allocation won't be described).  */
10304   if (TARGET_SEH
10305       && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10306       && !sse_registers_saved)
10307     {
10308       HOST_WIDE_INT sse_size =
10309 	frame.sse_reg_save_offset - frame.reg_save_offset;
10310 
10311       gcc_assert (int_registers_saved);
10312 
10313       /* No need to do stack checking as the area will be immediately
10314 	 written.  */
10315       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10316 			         GEN_INT (-sse_size), -1,
10317 				 m->fs.cfa_reg == stack_pointer_rtx);
10318       allocate -= sse_size;
10319       ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10320       sse_registers_saved = true;
10321     }
10322 
10323   /* The stack has already been decremented by the instruction calling us
10324      so probe if the size is non-negative to preserve the protection area.  */
10325   if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10326     {
10327       /* We expect the registers to be saved when probes are used.  */
10328       gcc_assert (int_registers_saved);
10329 
10330       if (STACK_CHECK_MOVING_SP)
10331 	{
10332 	  ix86_adjust_stack_and_probe (allocate);
10333 	  allocate = 0;
10334 	}
10335       else
10336 	{
10337 	  HOST_WIDE_INT size = allocate;
10338 
10339 	  if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10340 	    size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10341 
10342 	  if (TARGET_STACK_PROBE)
10343 	    ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10344 	  else
10345 	    ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10346 	}
10347     }
10348 
10349   if (allocate == 0)
10350     ;
10351   else if (!ix86_target_stack_probe ()
10352 	   || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10353     {
10354       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10355 			         GEN_INT (-allocate), -1,
10356 			         m->fs.cfa_reg == stack_pointer_rtx);
10357     }
10358   else
10359     {
10360       rtx eax = gen_rtx_REG (Pmode, AX_REG);
10361       rtx r10 = NULL;
10362       rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10363       const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10364       bool eax_live = false;
10365       bool r10_live = false;
10366 
10367       if (TARGET_64BIT)
10368         r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10369       if (!TARGET_64BIT_MS_ABI)
10370         eax_live = ix86_eax_live_at_start_p ();
10371 
10372       /* Note that SEH directives need to continue tracking the stack
10373 	 pointer even after the frame pointer has been set up.  */
10374       if (eax_live)
10375 	{
10376 	  insn = emit_insn (gen_push (eax));
10377 	  allocate -= UNITS_PER_WORD;
10378 	  if (sp_is_cfa_reg || TARGET_SEH)
10379 	    {
10380 	      if (sp_is_cfa_reg)
10381 		m->fs.cfa_offset += UNITS_PER_WORD;
10382 	      RTX_FRAME_RELATED_P (insn) = 1;
10383 	    }
10384 	}
10385 
10386       if (r10_live)
10387 	{
10388 	  r10 = gen_rtx_REG (Pmode, R10_REG);
10389 	  insn = emit_insn (gen_push (r10));
10390 	  allocate -= UNITS_PER_WORD;
10391 	  if (sp_is_cfa_reg || TARGET_SEH)
10392 	    {
10393 	      if (sp_is_cfa_reg)
10394 		m->fs.cfa_offset += UNITS_PER_WORD;
10395 	      RTX_FRAME_RELATED_P (insn) = 1;
10396 	    }
10397 	}
10398 
10399       emit_move_insn (eax, GEN_INT (allocate));
10400       emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10401 
10402       /* Use the fact that AX still contains ALLOCATE.  */
10403       adjust_stack_insn = (TARGET_64BIT
10404 			   ? gen_pro_epilogue_adjust_stack_di_sub
10405 			   : gen_pro_epilogue_adjust_stack_si_sub);
10406 
10407       insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10408 					   stack_pointer_rtx, eax));
10409 
10410       if (sp_is_cfa_reg || TARGET_SEH)
10411 	{
10412 	  if (sp_is_cfa_reg)
10413 	    m->fs.cfa_offset += allocate;
10414 	  RTX_FRAME_RELATED_P (insn) = 1;
10415 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10416 			gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10417 				     plus_constant (stack_pointer_rtx,
10418 						    -allocate)));
10419 	}
10420       m->fs.sp_offset += allocate;
10421 
10422       if (r10_live && eax_live)
10423         {
10424 	  t = choose_baseaddr (m->fs.sp_offset - allocate);
10425 	  emit_move_insn (r10, gen_frame_mem (Pmode, t));
10426 	  t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10427 	  emit_move_insn (eax, gen_frame_mem (Pmode, t));
10428 	}
10429       else if (eax_live || r10_live)
10430 	{
10431 	  t = choose_baseaddr (m->fs.sp_offset - allocate);
10432 	  emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10433 	}
10434     }
10435   gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10436 
10437   /* If we haven't already set up the frame pointer, do so now.  */
10438   if (frame_pointer_needed && !m->fs.fp_valid)
10439     {
10440       insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10441 			    GEN_INT (frame.stack_pointer_offset
10442 				     - frame.hard_frame_pointer_offset));
10443       insn = emit_insn (insn);
10444       RTX_FRAME_RELATED_P (insn) = 1;
10445       add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10446 
10447       if (m->fs.cfa_reg == stack_pointer_rtx)
10448 	m->fs.cfa_reg = hard_frame_pointer_rtx;
10449       m->fs.fp_offset = frame.hard_frame_pointer_offset;
10450       m->fs.fp_valid = true;
10451     }
10452 
10453   if (!int_registers_saved)
10454     ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10455   if (!sse_registers_saved)
10456     ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10457 
10458   pic_reg_used = false;
10459   if (pic_offset_table_rtx
10460       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10461 	  || crtl->profile))
10462     {
10463       unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10464 
10465       if (alt_pic_reg_used != INVALID_REGNUM)
10466 	SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10467 
10468       pic_reg_used = true;
10469     }
10470 
10471   if (pic_reg_used)
10472     {
10473       if (TARGET_64BIT)
10474 	{
10475 	  if (ix86_cmodel == CM_LARGE_PIC)
10476 	    {
10477               rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10478 	      rtx label = gen_label_rtx ();
10479 	      emit_label (label);
10480 	      LABEL_PRESERVE_P (label) = 1;
10481 	      gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10482 	      insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10483 	      insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10484 	      insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10485 					    pic_offset_table_rtx, tmp_reg));
10486 	    }
10487 	  else
10488             insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10489 	}
10490       else
10491 	{
10492           insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10493 	  RTX_FRAME_RELATED_P (insn) = 1;
10494 	  add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10495 	}
10496     }
10497 
10498   /* In the pic_reg_used case, make sure that the got load isn't deleted
10499      when mcount needs it.  Blockage to avoid call movement across mcount
10500      call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10501      note.  */
10502   if (crtl->profile && !flag_fentry && pic_reg_used)
10503     emit_insn (gen_prologue_use (pic_offset_table_rtx));
10504 
10505   if (crtl->drap_reg && !crtl->stack_realign_needed)
10506     {
10507       /* vDRAP is set up, but after reload it turns out stack realignment
10508          isn't necessary; here we emit prologue code to set up DRAP
10509          without the stack realignment adjustment.  */
10510       t = choose_baseaddr (0);
10511       emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10512     }
10513 
10514   /* Prevent instructions from being scheduled into register save push
10515      sequence when access to the redzone area is done through frame pointer.
10516      The offset between the frame pointer and the stack pointer is calculated
10517      relative to the value of the stack pointer at the end of the function
10518      prologue, and moving instructions that access redzone area via frame
10519      pointer inside push sequence violates this assumption.  */
10520   if (frame_pointer_needed && frame.red_zone_size)
10521     emit_insn (gen_memory_blockage ());
10522 
10523   /* Emit cld instruction if stringops are used in the function.  */
10524   if (TARGET_CLD && ix86_current_function_needs_cld)
10525     emit_insn (gen_cld ());
10526 
10527   /* SEH requires that the prologue end within 256 bytes of the start of
10528      the function.  Prevent instruction schedules that would extend that.
10529      Further, prevent alloca modifications to the stack pointer from being
10530      combined with prologue modifications.  */
10531   if (TARGET_SEH)
10532     emit_insn (gen_prologue_use (stack_pointer_rtx));
10533 }
10534 
10535 /* Emit code to restore REG using a POP insn.  */
10536 
10537 static void
10538 ix86_emit_restore_reg_using_pop (rtx reg)
10539 {
10540   struct machine_function *m = cfun->machine;
10541   rtx insn = emit_insn (gen_pop (reg));
10542 
10543   ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10544   m->fs.sp_offset -= UNITS_PER_WORD;
10545 
10546   if (m->fs.cfa_reg == crtl->drap_reg
10547       && REGNO (reg) == REGNO (crtl->drap_reg))
10548     {
10549       /* Previously we'd represented the CFA as an expression
10550 	 like *(%ebp - 8).  We've just popped that value from
10551 	 the stack, which means we need to reset the CFA to
10552 	 the drap register.  This will remain until we restore
10553 	 the stack pointer.  */
10554       add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10555       RTX_FRAME_RELATED_P (insn) = 1;
10556 
10557       /* This means that the DRAP register is valid for addressing too.  */
10558       m->fs.drap_valid = true;
10559       return;
10560     }
10561 
10562   if (m->fs.cfa_reg == stack_pointer_rtx)
10563     {
10564       rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10565       x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10566       add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10567       RTX_FRAME_RELATED_P (insn) = 1;
10568 
10569       m->fs.cfa_offset -= UNITS_PER_WORD;
10570     }
10571 
10572   /* When the frame pointer is the CFA, and we pop it, we are
10573      swapping back to the stack pointer as the CFA.  This happens
10574      for stack frames that don't allocate other data, so we assume
10575      the stack pointer is now pointing at the return address, i.e.
10576      the function entry state, which makes the offset one word.  */
10577   if (reg == hard_frame_pointer_rtx)
10578     {
10579       m->fs.fp_valid = false;
10580       if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10581 	{
10582 	  m->fs.cfa_reg = stack_pointer_rtx;
10583 	  m->fs.cfa_offset -= UNITS_PER_WORD;
10584 
10585 	  add_reg_note (insn, REG_CFA_DEF_CFA,
10586 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10587 				      GEN_INT (m->fs.cfa_offset)));
10588 	  RTX_FRAME_RELATED_P (insn) = 1;
10589 	}
10590     }
10591 }
10592 
10593 /* Emit code to restore saved registers using POP insns.  */
10594 
10595 static void
10596 ix86_emit_restore_regs_using_pop (void)
10597 {
10598   unsigned int regno;
10599 
10600   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10601     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10602       ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10603 }
10604 
10605 /* Emit code and notes for the LEAVE instruction.  */
10606 
10607 static void
10608 ix86_emit_leave (void)
10609 {
10610   struct machine_function *m = cfun->machine;
10611   rtx insn = emit_insn (ix86_gen_leave ());
10612 
10613   ix86_add_queued_cfa_restore_notes (insn);
10614 
10615   gcc_assert (m->fs.fp_valid);
10616   m->fs.sp_valid = true;
10617   m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10618   m->fs.fp_valid = false;
10619 
10620   if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10621     {
10622       m->fs.cfa_reg = stack_pointer_rtx;
10623       m->fs.cfa_offset = m->fs.sp_offset;
10624 
10625       add_reg_note (insn, REG_CFA_DEF_CFA,
10626 		    plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10627       RTX_FRAME_RELATED_P (insn) = 1;
10628     }
10629   ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10630 			     m->fs.fp_offset);
10631 }
10632 
10633 /* Emit code to restore saved registers using MOV insns.
10634    First register is restored from CFA - CFA_OFFSET.  */
10635 static void
10636 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10637 				  bool maybe_eh_return)
10638 {
10639   struct machine_function *m = cfun->machine;
10640   unsigned int regno;
10641 
10642   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10643     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10644       {
10645 	rtx reg = gen_rtx_REG (Pmode, regno);
10646 	rtx insn, mem;
10647 
10648 	mem = choose_baseaddr (cfa_offset);
10649 	mem = gen_frame_mem (Pmode, mem);
10650 	insn = emit_move_insn (reg, mem);
10651 
10652         if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10653 	  {
10654 	    /* Previously we'd represented the CFA as an expression
10655 	       like *(%ebp - 8).  We've just popped that value from
10656 	       the stack, which means we need to reset the CFA to
10657 	       the drap register.  This will remain until we restore
10658 	       the stack pointer.  */
10659 	    add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10660 	    RTX_FRAME_RELATED_P (insn) = 1;
10661 
10662 	    /* This means that the DRAP register is valid for addressing.  */
10663 	    m->fs.drap_valid = true;
10664 	  }
10665 	else
10666 	  ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10667 
10668 	cfa_offset -= UNITS_PER_WORD;
10669       }
10670 }
10671 
10672 /* Emit code to restore saved registers using MOV insns.
10673    First register is restored from CFA - CFA_OFFSET.  */
10674 static void
10675 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10676 				      bool maybe_eh_return)
10677 {
10678   unsigned int regno;
10679 
10680   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10681     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10682       {
10683 	rtx reg = gen_rtx_REG (V4SFmode, regno);
10684 	rtx mem;
10685 
10686 	mem = choose_baseaddr (cfa_offset);
10687 	mem = gen_rtx_MEM (V4SFmode, mem);
10688 	set_mem_align (mem, 128);
10689 	emit_move_insn (reg, mem);
10690 
10691 	ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10692 
10693 	cfa_offset -= 16;
10694       }
10695 }
10696 
10697 /* Emit vzeroupper if needed.  */
10698 
10699 void
10700 ix86_maybe_emit_epilogue_vzeroupper (void)
10701 {
10702   if (TARGET_VZEROUPPER
10703       && !TREE_THIS_VOLATILE (cfun->decl)
10704       && !cfun->machine->caller_return_avx256_p)
10705     emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10706 }
10707 
10708 /* Restore function stack, frame, and registers.  */
10709 
10710 void
10711 ix86_expand_epilogue (int style)
10712 {
10713   struct machine_function *m = cfun->machine;
10714   struct machine_frame_state frame_state_save = m->fs;
10715   struct ix86_frame frame;
10716   bool restore_regs_via_mov;
10717   bool using_drap;
10718 
10719   ix86_finalize_stack_realign_flags ();
10720   ix86_compute_frame_layout (&frame);
10721 
10722   m->fs.sp_valid = (!frame_pointer_needed
10723 		    || (current_function_sp_is_unchanging
10724 			&& !stack_realign_fp));
10725   gcc_assert (!m->fs.sp_valid
10726 	      || m->fs.sp_offset == frame.stack_pointer_offset);
10727 
10728   /* The FP must be valid if the frame pointer is present.  */
10729   gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10730   gcc_assert (!m->fs.fp_valid
10731 	      || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10732 
10733   /* We must have *some* valid pointer to the stack frame.  */
10734   gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10735 
10736   /* The DRAP is never valid at this point.  */
10737   gcc_assert (!m->fs.drap_valid);
10738 
10739   /* See the comment about red zone and frame
10740      pointer usage in ix86_expand_prologue.  */
10741   if (frame_pointer_needed && frame.red_zone_size)
10742     emit_insn (gen_memory_blockage ());
10743 
10744   using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10745   gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10746 
10747   /* Determine the CFA offset of the end of the red-zone.  */
10748   m->fs.red_zone_offset = 0;
10749   if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10750     {
10751       /* The red-zone begins below the return address.  */
10752       m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10753 
10754       /* When the register save area is in the aligned portion of
10755          the stack, determine the maximum runtime displacement that
10756 	 matches up with the aligned frame.  */
10757       if (stack_realign_drap)
10758 	m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10759 				  + UNITS_PER_WORD);
10760     }
10761 
10762   /* Special care must be taken for the normal return case of a function
10763      using eh_return: the eax and edx registers are marked as saved, but
10764      not restored along this path.  Adjust the save location to match.  */
10765   if (crtl->calls_eh_return && style != 2)
10766     frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10767 
10768   /* EH_RETURN requires the use of moves to function properly.  */
10769   if (crtl->calls_eh_return)
10770     restore_regs_via_mov = true;
10771   /* SEH requires the use of pops to identify the epilogue.  */
10772   else if (TARGET_SEH)
10773     restore_regs_via_mov = false;
10774   /* If we're only restoring one register and sp is not valid, then
10775      use a move instruction to restore the register, since it's
10776      less work than reloading sp and popping the register.  */
10777   else if (!m->fs.sp_valid && frame.nregs <= 1)
10778     restore_regs_via_mov = true;
10779   else if (TARGET_EPILOGUE_USING_MOVE
10780 	   && cfun->machine->use_fast_prologue_epilogue
10781 	   && (frame.nregs > 1
10782 	       || m->fs.sp_offset != frame.reg_save_offset))
10783     restore_regs_via_mov = true;
10784   else if (frame_pointer_needed
10785 	   && !frame.nregs
10786 	   && m->fs.sp_offset != frame.reg_save_offset)
10787     restore_regs_via_mov = true;
10788   else if (frame_pointer_needed
10789 	   && TARGET_USE_LEAVE
10790 	   && cfun->machine->use_fast_prologue_epilogue
10791 	   && frame.nregs == 1)
10792     restore_regs_via_mov = true;
10793   else
10794     restore_regs_via_mov = false;
10795 
10796   if (restore_regs_via_mov || frame.nsseregs)
10797     {
10798       /* Ensure that the entire register save area is addressable via
10799 	 the stack pointer, if we will restore via sp.  */
10800       if (TARGET_64BIT
10801 	  && m->fs.sp_offset > 0x7fffffff
10802 	  && !(m->fs.fp_valid || m->fs.drap_valid)
10803 	  && (frame.nsseregs + frame.nregs) != 0)
10804 	{
10805 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10806 				     GEN_INT (m->fs.sp_offset
10807 					      - frame.sse_reg_save_offset),
10808 				     style,
10809 				     m->fs.cfa_reg == stack_pointer_rtx);
10810 	}
10811     }
10812 
10813   /* If there are any SSE registers to restore, then we have to do it
10814      via moves, since there's obviously no pop for SSE regs.  */
10815   if (frame.nsseregs)
10816     ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10817 					  style == 2);
10818 
10819   if (restore_regs_via_mov)
10820     {
10821       rtx t;
10822 
10823       if (frame.nregs)
10824 	ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10825 
10826       /* eh_return epilogues need %ecx added to the stack pointer.  */
10827       if (style == 2)
10828 	{
10829 	  rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10830 
10831 	  /* Stack align doesn't work with eh_return.  */
10832 	  gcc_assert (!stack_realign_drap);
10833 	  /* Neither do regparm nested functions.  */
10834 	  gcc_assert (!ix86_static_chain_on_stack);
10835 
10836 	  if (frame_pointer_needed)
10837 	    {
10838 	      t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10839 	      t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10840 	      emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10841 
10842 	      t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10843 	      insn = emit_move_insn (hard_frame_pointer_rtx, t);
10844 
10845 	      /* Note that we use SA as a temporary CFA, as the return
10846 		 address is at the proper place relative to it.  We
10847 		 pretend this happens at the FP restore insn because
10848 		 prior to this insn the FP would be stored at the wrong
10849 		 offset relative to SA, and after this insn we have no
10850 		 other reasonable register to use for the CFA.  We don't
10851 		 bother resetting the CFA to the SP for the duration of
10852 		 the return insn.  */
10853 	      add_reg_note (insn, REG_CFA_DEF_CFA,
10854 			    plus_constant (sa, UNITS_PER_WORD));
10855 	      ix86_add_queued_cfa_restore_notes (insn);
10856 	      add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10857 	      RTX_FRAME_RELATED_P (insn) = 1;
10858 
10859 	      m->fs.cfa_reg = sa;
10860 	      m->fs.cfa_offset = UNITS_PER_WORD;
10861 	      m->fs.fp_valid = false;
10862 
10863 	      pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10864 					 const0_rtx, style, false);
10865 	    }
10866 	  else
10867 	    {
10868 	      t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10869 	      t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10870 	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10871 	      ix86_add_queued_cfa_restore_notes (insn);
10872 
10873 	      gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10874 	      if (m->fs.cfa_offset != UNITS_PER_WORD)
10875 		{
10876 		  m->fs.cfa_offset = UNITS_PER_WORD;
10877 		  add_reg_note (insn, REG_CFA_DEF_CFA,
10878 				plus_constant (stack_pointer_rtx,
10879 					       UNITS_PER_WORD));
10880 		  RTX_FRAME_RELATED_P (insn) = 1;
10881 		}
10882 	    }
10883 	  m->fs.sp_offset = UNITS_PER_WORD;
10884 	  m->fs.sp_valid = true;
10885 	}
10886     }
10887   else
10888     {
10889       /* SEH requires that the function end with (1) a stack adjustment
10890 	 if necessary, (2) a sequence of pops, and (3) a return or
10891 	 jump instruction.  Prevent insns from the function body from
10892 	 being scheduled into this sequence.  */
10893       if (TARGET_SEH)
10894 	{
10895 	  /* Prevent a catch region from being adjacent to the standard
10896 	     epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda nor
10897 	     several other flags that would be interesting to test are
10898 	     set up yet.  */
10899 	  if (flag_non_call_exceptions)
10900 	    emit_insn (gen_nops (const1_rtx));
10901 	  else
10902 	    emit_insn (gen_blockage ());
10903 	}
10904 
10905       /* The first step is to deallocate the stack frame so that we can
10906 	 pop the registers.  Also do it on SEH targets for very large
10907 	 frames, as the emitted instructions aren't allowed by the ABI in
10908 	 epilogues.  */
10909       if (!m->fs.sp_valid
10910  	  || (TARGET_SEH
10911 	      && (m->fs.sp_offset - frame.reg_save_offset
10912 		  >= SEH_MAX_FRAME_SIZE)))
10913 	{
10914 	  pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10915 				     GEN_INT (m->fs.fp_offset
10916 					      - frame.reg_save_offset),
10917 				     style, false);
10918 	}
10919       else if (m->fs.sp_offset != frame.reg_save_offset)
10920 	{
10921 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10922 				     GEN_INT (m->fs.sp_offset
10923 					      - frame.reg_save_offset),
10924 				     style,
10925 				     m->fs.cfa_reg == stack_pointer_rtx);
10926 	}
10927 
10928       ix86_emit_restore_regs_using_pop ();
10929     }
10930 
10931   /* If we used a frame pointer and haven't already got rid of it,
10932      then do so now.  */
10933   if (m->fs.fp_valid)
10934     {
10935       /* If the stack pointer is valid and pointing at the frame
10936 	 pointer store address, then we only need a pop.  */
10937       if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10938 	ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10939       /* A `leave' results in shorter dependency chains on CPUs that
10940 	 are able to grok it fast.  */
10941       else if (TARGET_USE_LEAVE
10942 	       || optimize_function_for_size_p (cfun)
10943 	       || !cfun->machine->use_fast_prologue_epilogue)
10944 	ix86_emit_leave ();
10945       else
10946         {
10947 	  pro_epilogue_adjust_stack (stack_pointer_rtx,
10948 				     hard_frame_pointer_rtx,
10949 				     const0_rtx, style, !using_drap);
10950 	  ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10951         }
10952     }
10953 
10954   if (using_drap)
10955     {
10956       int param_ptr_offset = UNITS_PER_WORD;
10957       rtx insn;
10958 
10959       gcc_assert (stack_realign_drap);
10960 
10961       if (ix86_static_chain_on_stack)
10962 	param_ptr_offset += UNITS_PER_WORD;
10963       if (!call_used_regs[REGNO (crtl->drap_reg)])
10964 	param_ptr_offset += UNITS_PER_WORD;
10965 
10966       insn = emit_insn (gen_rtx_SET
10967 			(VOIDmode, stack_pointer_rtx,
10968 			 gen_rtx_PLUS (Pmode,
10969 				       crtl->drap_reg,
10970 				       GEN_INT (-param_ptr_offset))));
10971       m->fs.cfa_reg = stack_pointer_rtx;
10972       m->fs.cfa_offset = param_ptr_offset;
10973       m->fs.sp_offset = param_ptr_offset;
10974       m->fs.realigned = false;
10975 
10976       add_reg_note (insn, REG_CFA_DEF_CFA,
10977 		    gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10978 				  GEN_INT (param_ptr_offset)));
10979       RTX_FRAME_RELATED_P (insn) = 1;
10980 
10981       if (!call_used_regs[REGNO (crtl->drap_reg)])
10982 	ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10983     }
10984 
10985   /* At this point the stack pointer must be valid, and we must have
10986      restored all of the registers.  We may not have deallocated the
10987      entire stack frame.  We've delayed this until now because it may
10988      be possible to merge the local stack deallocation with the
10989      deallocation forced by ix86_static_chain_on_stack.   */
10990   gcc_assert (m->fs.sp_valid);
10991   gcc_assert (!m->fs.fp_valid);
10992   gcc_assert (!m->fs.realigned);
10993   if (m->fs.sp_offset != UNITS_PER_WORD)
10994     {
10995       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10996 				 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10997 				 style, true);
10998     }
10999   else
11000     ix86_add_queued_cfa_restore_notes (get_last_insn ());
11001 
11002   /* Sibcall epilogues don't want a return instruction.  */
11003   if (style == 0)
11004     {
11005       m->fs = frame_state_save;
11006       return;
11007     }
11008 
11009   /* Emit vzeroupper if needed.  */
11010   ix86_maybe_emit_epilogue_vzeroupper ();
11011 
11012   if (crtl->args.pops_args && crtl->args.size)
11013     {
11014       rtx popc = GEN_INT (crtl->args.pops_args);
11015 
11016       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
11017 	 address, do an explicit add, and jump indirectly to the caller.  */
11018 
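      /* An illustrative sketch (for exposition only, not code emitted
	 verbatim by this function) of the sequence built below when more
	 than 64K bytes of arguments must be popped on a 32-bit target:

	     popl  %ecx            # fetch the return address
	     addl  $POPC, %esp     # pop the arguments with an explicit add
	     jmp   *%ecx           # return indirectly to the caller

	 where POPC stands for crtl->args.pops_args.  */
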
11019       if (crtl->args.pops_args >= 65536)
11020 	{
11021 	  rtx ecx = gen_rtx_REG (SImode, CX_REG);
11022 	  rtx insn;
11023 
11024 	  /* There is no "pascal" calling convention in any 64bit ABI.  */
11025 	  gcc_assert (!TARGET_64BIT);
11026 
11027 	  insn = emit_insn (gen_pop (ecx));
11028 	  m->fs.cfa_offset -= UNITS_PER_WORD;
11029 	  m->fs.sp_offset -= UNITS_PER_WORD;
11030 
11031 	  add_reg_note (insn, REG_CFA_ADJUST_CFA,
11032 			copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11033 	  add_reg_note (insn, REG_CFA_REGISTER,
11034 			gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11035 	  RTX_FRAME_RELATED_P (insn) = 1;
11036 
11037 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11038 				     popc, -1, true);
11039 	  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11040 	}
11041       else
11042 	emit_jump_insn (gen_simple_return_pop_internal (popc));
11043     }
11044   else
11045     emit_jump_insn (gen_simple_return_internal ());
11046 
11047   /* Restore the state back to the state from the prologue,
11048      so that it's correct for the next epilogue.  */
11049   m->fs = frame_state_save;
11050 }
11051 
11052 /* Reset state that code generation for the function may have modified.  */
11053 
11054 static void
11055 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11056 			       HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11057 {
11058   if (pic_offset_table_rtx)
11059     SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11060 #if TARGET_MACHO
11061   /* Mach-O doesn't support labels at the end of objects, so if
11062      it looks like we might want one, insert a NOP.  */
11063   {
11064     rtx insn = get_last_insn ();
11065     rtx deleted_debug_label = NULL_RTX;
11066     while (insn
11067 	   && NOTE_P (insn)
11068 	   && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11069       {
11070 	/* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11071 	   notes only, instead set their CODE_LABEL_NUMBER to -1,
11072 	   otherwise there would be code generation differences
11073 	   in between -g and -g0.  */
11074 	if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11075 	  deleted_debug_label = insn;
11076 	insn = PREV_INSN (insn);
11077       }
11078     if (insn
11079 	&& (LABEL_P (insn)
11080 	    || (NOTE_P (insn)
11081 		&& NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11082       fputs ("\tnop\n", file);
11083     else if (deleted_debug_label)
11084       for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11085 	if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11086 	  CODE_LABEL_NUMBER (insn) = -1;
11087   }
11088 #endif
11089 
11090 }
11091 
11092 /* Return a scratch register to use in the split stack prologue.  The
11093    split stack prologue is used for -fsplit-stack.  It consists of the
11094    first instructions in the function, even before the regular prologue.
11095    The scratch register can be any caller-saved register which is not
11096    used for parameters or for the static chain.  */
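
/* A brief summary of the choices made below, stated here only for quick
   reference (the code is authoritative): 64-bit targets use %r11;
   fastcall uses %eax and cannot support a static chain; thiscall uses
   %edx, or %eax when there is a static chain; otherwise %ecx is used, or
   %edx when the static chain occupies %ecx, and three register
   parameters are not supported at all.  */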
11097 
11098 static unsigned int
11099 split_stack_prologue_scratch_regno (void)
11100 {
11101   if (TARGET_64BIT)
11102     return R11_REG;
11103   else
11104     {
11105       bool is_fastcall, is_thiscall;
11106       int regparm;
11107 
11108       is_fastcall = (lookup_attribute ("fastcall",
11109 				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11110 		     != NULL);
11111       is_thiscall = (lookup_attribute ("thiscall",
11112 				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11113 		     != NULL);
11114       regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11115 
11116       if (is_fastcall)
11117 	{
11118 	  if (DECL_STATIC_CHAIN (cfun->decl))
11119 	    {
11120 	      sorry ("-fsplit-stack does not support fastcall with "
11121 		     "nested function");
11122 	      return INVALID_REGNUM;
11123 	    }
11124 	  return AX_REG;
11125 	}
11126       else if (is_thiscall)
11127         {
11128 	  if (!DECL_STATIC_CHAIN (cfun->decl))
11129 	    return DX_REG;
11130 	  return AX_REG;
11131 	}
11132       else if (regparm < 3)
11133 	{
11134 	  if (!DECL_STATIC_CHAIN (cfun->decl))
11135 	    return CX_REG;
11136 	  else
11137 	    {
11138 	      if (regparm >= 2)
11139 		{
11140 		  sorry ("-fsplit-stack does not support 2 register "
11141 			 "parameters for a nested function");
11142 		  return INVALID_REGNUM;
11143 		}
11144 	      return DX_REG;
11145 	    }
11146 	}
11147       else
11148 	{
11149 	  /* FIXME: We could make this work by pushing a register
11150 	     around the addition and comparison.  */
11151 	  sorry ("-fsplit-stack does not support 3 register parameters");
11152 	  return INVALID_REGNUM;
11153 	}
11154     }
11155 }
11156 
11157 /* A SYMBOL_REF for the function which allocates new stack space for
11158    -fsplit-stack.  */
11159 
11160 static GTY(()) rtx split_stack_fn;
11161 
11162 /* A SYMBOL_REF for the variant of the __morestack function used with
11163    the large code model.  */
11164 
11165 static GTY(()) rtx split_stack_fn_large;
11166 
11167 /* Handle -fsplit-stack.  These are the first instructions in the
11168    function, even before the regular prologue.  */
11169 
11170 void
11171 ix86_expand_split_stack_prologue (void)
11172 {
11173   struct ix86_frame frame;
11174   HOST_WIDE_INT allocate;
11175   unsigned HOST_WIDE_INT args_size;
11176   rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11177   rtx scratch_reg = NULL_RTX;
11178   rtx varargs_label = NULL_RTX;
11179   rtx fn;
11180 
11181   gcc_assert (flag_split_stack && reload_completed);
11182 
11183   ix86_finalize_stack_realign_flags ();
11184   ix86_compute_frame_layout (&frame);
11185   allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11186 
11187   /* This is the label we will branch to if we have enough stack
11188      space.  We expect the basic block reordering pass to reverse this
11189      branch if optimizing, so that we branch in the unlikely case.  */
11190   label = gen_label_rtx ();
11191 
11192   /* We need to compare the stack pointer minus the frame size with
11193      the stack boundary in the TCB.  The stack boundary always gives
11194      us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11195      can compare directly.  Otherwise we need to do an addition.  */
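
  /* A hedged sketch of the check built below, assuming a 64-bit Linux
     target where the stack boundary in the TCB is read through %fs (the
     actual segment and offset come from how UNSPEC_STACK_CHECK is
     printed, not from this comment; FRAME, OFFSET and the label are
     illustrative placeholders):

	 lea	-FRAME(%rsp), %r11	# only if FRAME >= SPLIT_STACK_AVAILABLE
	 cmp	%fs:OFFSET, %r11	# or compare %rsp directly for small frames
	 jae	.Lenough		# enough stack, skip the __morestack call  */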
11196 
11197   limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11198 			  UNSPEC_STACK_CHECK);
11199   limit = gen_rtx_CONST (Pmode, limit);
11200   limit = gen_rtx_MEM (Pmode, limit);
11201   if (allocate < SPLIT_STACK_AVAILABLE)
11202     current = stack_pointer_rtx;
11203   else
11204     {
11205       unsigned int scratch_regno;
11206       rtx offset;
11207 
11208       /* We need a scratch register to hold the stack pointer minus
11209 	 the required frame size.  Since this is the very start of the
11210 	 function, the scratch register can be any caller-saved
11211 	 register which is not used for parameters.  */
11212       offset = GEN_INT (- allocate);
11213       scratch_regno = split_stack_prologue_scratch_regno ();
11214       if (scratch_regno == INVALID_REGNUM)
11215 	return;
11216       scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11217       if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11218 	{
11219 	  /* We don't use ix86_gen_add3 in this case because it will
11220 	     want to split to lea, but when not optimizing the insn
11221 	     will not be split after this point.  */
11222 	  emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11223 				  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11224 						offset)));
11225 	}
11226       else
11227 	{
11228 	  emit_move_insn (scratch_reg, offset);
11229 	  emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11230 				 stack_pointer_rtx));
11231 	}
11232       current = scratch_reg;
11233     }
11234 
11235   ix86_expand_branch (GEU, current, limit, label);
11236   jump_insn = get_last_insn ();
11237   JUMP_LABEL (jump_insn) = label;
11238 
11239   /* Mark the jump as very likely to be taken.  */
11240   add_reg_note (jump_insn, REG_BR_PROB,
11241 		GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11242 
11243   if (split_stack_fn == NULL_RTX)
11244     split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11245   fn = split_stack_fn;
11246 
11247   /* Get more stack space.  We pass in the desired stack space and the
11248      size of the arguments to copy to the new stack.  In 32-bit mode
11249      we push the parameters; __morestack will return on a new stack
11250      anyhow.  In 64-bit mode we pass the parameters in r10 and
11251      r11.  */
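
  /* Roughly (an illustrative sketch of the call built below for the small
     code model; FRAME_SIZE stands for `allocate' and ARGS_SIZE for
     crtl->args.size):

	 32-bit:   pushl $ARGS_SIZE
		   pushl $FRAME_SIZE
		   call  __morestack

	 64-bit:   movq  $FRAME_SIZE, %r10
		   movq  $ARGS_SIZE, %r11
		   call  __morestack  */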
11252   allocate_rtx = GEN_INT (allocate);
11253   args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11254   call_fusage = NULL_RTX;
11255   if (TARGET_64BIT)
11256     {
11257       rtx reg10, reg11;
11258 
11259       reg10 = gen_rtx_REG (Pmode, R10_REG);
11260       reg11 = gen_rtx_REG (Pmode, R11_REG);
11261 
11262       /* If this function uses a static chain, it will be in %r10.
11263 	 Preserve it across the call to __morestack.  */
11264       if (DECL_STATIC_CHAIN (cfun->decl))
11265 	{
11266 	  rtx rax;
11267 
11268 	  rax = gen_rtx_REG (Pmode, AX_REG);
11269 	  emit_move_insn (rax, reg10);
11270 	  use_reg (&call_fusage, rax);
11271 	}
11272 
11273       if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11274 	{
11275 	  HOST_WIDE_INT argval;
11276 
11277 	  /* When using the large model we need to load the address
11278 	     into a register, and we've run out of registers.  So we
11279 	     switch to a different calling convention, and we call a
11280 	     different function: __morestack_large_model.  We pass the
11281 	     argument size in the upper 32 bits of r10 and pass the
11282 	     frame size in the lower 32 bits.  */
11283 	  gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11284 	  gcc_assert ((args_size & 0xffffffff) == args_size);
11285 
11286 	  if (split_stack_fn_large == NULL_RTX)
11287 	    split_stack_fn_large =
11288 	      gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11289 
11290 	  if (ix86_cmodel == CM_LARGE_PIC)
11291 	    {
11292 	      rtx label, x;
11293 
11294 	      label = gen_label_rtx ();
11295 	      emit_label (label);
11296 	      LABEL_PRESERVE_P (label) = 1;
11297 	      emit_insn (gen_set_rip_rex64 (reg10, label));
11298 	      emit_insn (gen_set_got_offset_rex64 (reg11, label));
11299 	      emit_insn (gen_adddi3 (reg10, reg10, reg11));
11300 	      x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11301 				  UNSPEC_GOT);
11302 	      x = gen_rtx_CONST (Pmode, x);
11303 	      emit_move_insn (reg11, x);
11304 	      x = gen_rtx_PLUS (Pmode, reg10, reg11);
11305 	      x = gen_const_mem (Pmode, x);
11306 	      emit_move_insn (reg11, x);
11307 	    }
11308 	  else
11309 	    emit_move_insn (reg11, split_stack_fn_large);
11310 
11311 	  fn = reg11;
11312 
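	  /* Pack both 32-bit values into one register.  Purely as an
	     illustration: args_size == 0x20 and allocate == 0x1000 give
	     r10 == 0x0000002000001000.  The double shift by 16 computes
	     the same value as a single shift by 32; writing it this way
	     presumably avoids a full-word shift count on some hosts (an
	     assumption, not documented here).  */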
11313 	  argval = ((args_size << 16) << 16) + allocate;
11314 	  emit_move_insn (reg10, GEN_INT (argval));
11315 	}
11316       else
11317 	{
11318 	  emit_move_insn (reg10, allocate_rtx);
11319 	  emit_move_insn (reg11, GEN_INT (args_size));
11320 	  use_reg (&call_fusage, reg11);
11321 	}
11322 
11323       use_reg (&call_fusage, reg10);
11324     }
11325   else
11326     {
11327       emit_insn (gen_push (GEN_INT (args_size)));
11328       emit_insn (gen_push (allocate_rtx));
11329     }
11330   call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11331 				GEN_INT (UNITS_PER_WORD), constm1_rtx,
11332 				NULL_RTX, false);
11333   add_function_usage_to (call_insn, call_fusage);
11334 
11335   /* In order to make call/return prediction work right, we now need
11336      to execute a return instruction.  See
11337      libgcc/config/i386/morestack.S for the details on how this works.
11338 
11339      For flow purposes gcc must not see this as a return
11340      instruction--we need control flow to continue at the subsequent
11341      label.  Therefore, we use an unspec.  */
11342   gcc_assert (crtl->args.pops_args < 65536);
11343   emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11344 
11345   /* If we are in 64-bit mode and this function uses a static chain,
11346      we saved %r10 in %rax before calling __morestack.  */
11347   if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11348     emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11349 		    gen_rtx_REG (Pmode, AX_REG));
11350 
11351   /* If this function calls va_start, we need to store a pointer to
11352      the arguments on the old stack, because they may not have been
11353      all copied to the new stack.  At this point the old stack can be
11354      found at the frame pointer value used by __morestack, because
11355      __morestack has set that up before calling back to us.  Here we
11356      store that pointer in a scratch register, and in
11357      ix86_expand_prologue we store the scratch register in a stack
11358      slot.  */
11359   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11360     {
11361       unsigned int scratch_regno;
11362       rtx frame_reg;
11363       int words;
11364 
11365       scratch_regno = split_stack_prologue_scratch_regno ();
11366       scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11367       frame_reg = gen_rtx_REG (Pmode, BP_REG);
11368 
11369       /* 64-bit:
11370 	 fp -> old fp value
11371 	       return address within this function
11372 	       return address of caller of this function
11373 	       stack arguments
11374 	 So we add three words to get to the stack arguments.
11375 
11376 	 32-bit:
11377 	 fp -> old fp value
11378 	       return address within this function
11379                first argument to __morestack
11380                second argument to __morestack
11381                return address of caller of this function
11382                stack arguments
11383          So we add five words to get to the stack arguments.
11384       */
11385       words = TARGET_64BIT ? 3 : 5;
11386       emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11387 			      gen_rtx_PLUS (Pmode, frame_reg,
11388 					    GEN_INT (words * UNITS_PER_WORD))));
11389 
11390       varargs_label = gen_label_rtx ();
11391       emit_jump_insn (gen_jump (varargs_label));
11392       JUMP_LABEL (get_last_insn ()) = varargs_label;
11393 
11394       emit_barrier ();
11395     }
11396 
11397   emit_label (label);
11398   LABEL_NUSES (label) = 1;
11399 
11400   /* If this function calls va_start, we now have to set the scratch
11401      register for the case where we do not call __morestack.  In this
11402      case we need to set it based on the stack pointer.  */
11403   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11404     {
11405       emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11406 			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11407 					    GEN_INT (UNITS_PER_WORD))));
11408 
11409       emit_label (varargs_label);
11410       LABEL_NUSES (varargs_label) = 1;
11411     }
11412 }
11413 
11414 /* We may have to tell the dataflow pass that the split stack prologue
11415    is initializing a scratch register.  */
11416 
11417 static void
11418 ix86_live_on_entry (bitmap regs)
11419 {
11420   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11421     {
11422       gcc_assert (flag_split_stack);
11423       bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11424     }
11425 }
11426 
11427 /* Determine whether OP (the SUBREG_REG of a SUBREG) is suitable for use in an address.  */
11428 
11429 static bool
11430 ix86_address_subreg_operand (rtx op)
11431 {
11432   enum machine_mode mode;
11433 
11434   if (!REG_P (op))
11435     return false;
11436 
11437   mode = GET_MODE (op);
11438 
11439   if (GET_MODE_CLASS (mode) != MODE_INT)
11440     return false;
11441 
11442   /* Don't allow SUBREGs that span more than a word.  They can lead to spill
11443      failures when the register is one word out of a two-word structure.  */
11444   if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11445     return false;
11446 
11447   /* Allow only SUBREGs of non-eliminable hard registers.  */
11448   return register_no_elim_operand (op, mode);
11449 }
11450 
11451 /* Extract the parts of an RTL expression that is a valid memory address
11452    for an instruction.  Return 0 if the structure of the address is grossly
11453    off.  Return -1 if the address contains an ASHIFT, so it is not strictly
11454    valid but is still used for computing the length of an lea instruction.  */
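
/* For example (an illustrative decomposition, assuming a 32-bit target),
   the address

       (plus:SI (plus:SI (mult:SI (reg:SI %eax) (const_int 4))
			 (reg:SI %ebx))
		(const_int 16))

   yields base = %ebx, index = %eax, scale = 4 and disp = 16, i.e. the
   operand written 16(%ebx,%eax,4) in AT&T syntax.  */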
11455 
11456 int
11457 ix86_decompose_address (rtx addr, struct ix86_address *out)
11458 {
11459   rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11460   rtx base_reg, index_reg;
11461   HOST_WIDE_INT scale = 1;
11462   rtx scale_rtx = NULL_RTX;
11463   rtx tmp;
11464   int retval = 1;
11465   enum ix86_address_seg seg = SEG_DEFAULT;
11466 
11467   /* Allow zero-extended SImode addresses;
11468      they will be emitted with the addr32 prefix.  */
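  /* For instance (illustrative only), a memory operand whose address is
     (zero_extend:DI (reg:SI %eax)) ends up printed with the 0x67 address
     size prefix, roughly "addr32 movl (%eax), %edx", so the address is
     computed from the 32-bit register.  */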
11469   if (TARGET_64BIT && GET_MODE (addr) == DImode)
11470     {
11471       if (GET_CODE (addr) == ZERO_EXTEND
11472 	  && GET_MODE (XEXP (addr, 0)) == SImode)
11473 	{
11474 	  addr = XEXP (addr, 0);
11475 	  if (CONST_INT_P (addr))
11476 	    return 0;
11477 	}
11478       else if (GET_CODE (addr) == AND
11479 	       && const_32bit_mask (XEXP (addr, 1), DImode))
11480 	{
11481 	  addr = XEXP (addr, 0);
11482 
11483 	  /* Adjust SUBREGs.  */
11484 	  if (GET_CODE (addr) == SUBREG
11485 	      && GET_MODE (SUBREG_REG (addr)) == SImode)
11486 	    {
11487 	      addr = SUBREG_REG (addr);
11488 	      if (CONST_INT_P (addr))
11489 		return 0;
11490 	    }
11491 	  else if (GET_MODE (addr) == DImode)
11492 	    addr = gen_rtx_SUBREG (SImode, addr, 0);
11493 	  else if (GET_MODE (addr) != VOIDmode)
11494 	    return 0;
11495 	}
11496     }
11497 
11498   /* Allow SImode subregs of DImode addresses;
11499      they will be emitted with the addr32 prefix.  */
11500   if (TARGET_64BIT && GET_MODE (addr) == SImode)
11501     {
11502       if (GET_CODE (addr) == SUBREG
11503 	  && GET_MODE (SUBREG_REG (addr)) == DImode)
11504 	{
11505 	  addr = SUBREG_REG (addr);
11506 	  if (CONST_INT_P (addr))
11507 	    return 0;
11508 	}
11509     }
11510 
11511   if (REG_P (addr))
11512     base = addr;
11513   else if (GET_CODE (addr) == SUBREG)
11514     {
11515       if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11516 	base = addr;
11517       else
11518 	return 0;
11519     }
11520   else if (GET_CODE (addr) == PLUS)
11521     {
11522       rtx addends[4], op;
11523       int n = 0, i;
11524 
11525       op = addr;
11526       do
11527 	{
11528 	  if (n >= 4)
11529 	    return 0;
11530 	  addends[n++] = XEXP (op, 1);
11531 	  op = XEXP (op, 0);
11532 	}
11533       while (GET_CODE (op) == PLUS);
11534       if (n >= 4)
11535 	return 0;
11536       addends[n] = op;
11537 
11538       for (i = n; i >= 0; --i)
11539 	{
11540 	  op = addends[i];
11541 	  switch (GET_CODE (op))
11542 	    {
11543 	    case MULT:
11544 	      if (index)
11545 		return 0;
11546 	      index = XEXP (op, 0);
11547 	      scale_rtx = XEXP (op, 1);
11548 	      break;
11549 
11550 	    case ASHIFT:
11551 	      if (index)
11552 		return 0;
11553 	      index = XEXP (op, 0);
11554 	      tmp = XEXP (op, 1);
11555 	      if (!CONST_INT_P (tmp))
11556 		return 0;
11557 	      scale = INTVAL (tmp);
11558 	      if ((unsigned HOST_WIDE_INT) scale > 3)
11559 		return 0;
11560 	      scale = 1 << scale;
11561 	      break;
11562 
11563 	    case UNSPEC:
11564 	      if (XINT (op, 1) == UNSPEC_TP
11565 	          && TARGET_TLS_DIRECT_SEG_REFS
11566 	          && seg == SEG_DEFAULT)
11567 		seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11568 	      else
11569 		return 0;
11570 	      break;
11571 
11572 	    case SUBREG:
11573 	      if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11574 		return 0;
11575 	      /* FALLTHRU */
11576 
11577 	    case REG:
11578 	      if (!base)
11579 		base = op;
11580 	      else if (!index)
11581 		index = op;
11582 	      else
11583 		return 0;
11584 	      break;
11585 
11586 	    case CONST:
11587 	    case CONST_INT:
11588 	    case SYMBOL_REF:
11589 	    case LABEL_REF:
11590 	      if (disp)
11591 		return 0;
11592 	      disp = op;
11593 	      break;
11594 
11595 	    default:
11596 	      return 0;
11597 	    }
11598 	}
11599     }
11600   else if (GET_CODE (addr) == MULT)
11601     {
11602       index = XEXP (addr, 0);		/* index*scale */
11603       scale_rtx = XEXP (addr, 1);
11604     }
11605   else if (GET_CODE (addr) == ASHIFT)
11606     {
11607       /* We're called for lea too, which implements ashift on occasion.  */
11608       index = XEXP (addr, 0);
11609       tmp = XEXP (addr, 1);
11610       if (!CONST_INT_P (tmp))
11611 	return 0;
11612       scale = INTVAL (tmp);
11613       if ((unsigned HOST_WIDE_INT) scale > 3)
11614 	return 0;
11615       scale = 1 << scale;
11616       retval = -1;
11617     }
11618   else if (CONST_INT_P (addr))
11619     {
11620       if (!x86_64_immediate_operand (addr, VOIDmode))
11621 	return 0;
11622 
11623       /* Constant addresses are sign-extended to 64 bits; we have to
11624 	 prevent addresses from 0x80000000 to 0xffffffff in x32 mode.  */
11625       if (TARGET_X32
11626 	  && val_signbit_known_set_p (SImode, INTVAL (addr)))
11627 	return 0;
11628 
11629       disp = addr;
11630     }
11631   else
11632     disp = addr;			/* displacement */
11633 
11634   if (index)
11635     {
11636       if (REG_P (index))
11637 	;
11638       else if (GET_CODE (index) == SUBREG
11639 	       && ix86_address_subreg_operand (SUBREG_REG (index)))
11640 	;
11641       else
11642 	return 0;
11643     }
11644 
11645   /* Extract the integral value of scale.  */
11646   if (scale_rtx)
11647     {
11648       if (!CONST_INT_P (scale_rtx))
11649 	return 0;
11650       scale = INTVAL (scale_rtx);
11651     }
11652 
11653   base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11654   index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11655 
11656   /* Avoid useless 0 displacement.  */
11657   if (disp == const0_rtx && (base || index))
11658     disp = NULL_RTX;
11659 
11660   /* Allow the arg pointer and stack pointer as an index if there is no scaling.  */
11661   if (base_reg && index_reg && scale == 1
11662       && (index_reg == arg_pointer_rtx
11663 	  || index_reg == frame_pointer_rtx
11664 	  || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11665     {
11666       rtx tmp;
11667       tmp = base, base = index, index = tmp;
11668       tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11669     }
11670 
11671   /* Special case: %ebp cannot be encoded as a base without a displacement.
11672      Similarly %r13.  */
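  /* Background on the encoding (stated for exposition): with mod = 00 in
     the ModR/M byte, a base field of 101 means "disp32, no base", so
     (%ebp) has no encoding of its own and must be emitted as 0(%ebp)
     with a byte displacement; %r13 shares the same low three bits of its
     encoding and has the same restriction.  */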
11673   if (!disp
11674       && base_reg
11675       && (base_reg == hard_frame_pointer_rtx
11676 	  || base_reg == frame_pointer_rtx
11677 	  || base_reg == arg_pointer_rtx
11678 	  || (REG_P (base_reg)
11679 	      && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11680 		  || REGNO (base_reg) == R13_REG))))
11681     disp = const0_rtx;
11682 
11683   /* Special case: on K6, [%esi] causes the instruction to be vector
11684      decoded.  Avoid this by transforming it to [%esi+0].
11685      Reload calls address legitimization without cfun defined, so we need
11686      to test cfun for being non-NULL.  */
11687   if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11688       && base_reg && !index_reg && !disp
11689       && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11690     disp = const0_rtx;
11691 
11692   /* Special case: encode reg+reg instead of reg*2.  */
11693   if (!base && index && scale == 2)
11694     base = index, base_reg = index_reg, scale = 1;
11695 
11696   /* Special case: scaling cannot be encoded without base or displacement.  */
11697   if (!base && !disp && index && scale != 1)
11698     disp = const0_rtx;
11699 
11700   out->base = base;
11701   out->index = index;
11702   out->disp = disp;
11703   out->scale = scale;
11704   out->seg = seg;
11705 
11706   return retval;
11707 }
11708 
11709 /* Return the cost of the memory address X.
11710    For i386, it is better to use a complex address than to let gcc copy
11711    the address into a reg and make a new pseudo.  But not if the address
11712    requires two regs - that would mean more pseudos with longer
11713    lifetimes.  */
11714 static int
11715 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11716 {
11717   struct ix86_address parts;
11718   int cost = 1;
11719   int ok = ix86_decompose_address (x, &parts);
11720 
11721   gcc_assert (ok);
11722 
11723   if (parts.base && GET_CODE (parts.base) == SUBREG)
11724     parts.base = SUBREG_REG (parts.base);
11725   if (parts.index && GET_CODE (parts.index) == SUBREG)
11726     parts.index = SUBREG_REG (parts.index);
11727 
11728   /* Attempt to minimize number of registers in the address.  */
11729   if ((parts.base
11730        && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11731       || (parts.index
11732 	  && (!REG_P (parts.index)
11733 	      || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11734     cost++;
11735 
11736   if (parts.base
11737       && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11738       && parts.index
11739       && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11740       && parts.base != parts.index)
11741     cost++;
11742 
11743   /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11744      since its predecode logic can't detect the length of such instructions
11745      and decoding degenerates to vector decoding.  Increase the cost of such
11746      addresses here.  The penalty is at least 2 cycles.  It may be worthwhile
11747      to split such addresses or even to refuse them altogether.
11748 
11749      The following addressing modes are affected:
11750       [base+scale*index]
11751       [scale*index+disp]
11752       [base+index]
11753 
11754      The first and last cases may be avoidable by explicitly coding a zero
11755      displacement in the memory address, but I don't have an AMD-K6 machine
11756      handy to check this theory.  */
11757 
11758   if (TARGET_K6
11759       && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11760 	  || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11761 	  || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11762     cost += 10;
11763 
11764   return cost;
11765 }
11766 
11767 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11768    this is used to form addresses of local data when -fPIC is in
11769    use.  */
11770 
11771 static bool
11772 darwin_local_data_pic (rtx disp)
11773 {
11774   return (GET_CODE (disp) == UNSPEC
11775 	  && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11776 }
11777 
11778 /* Determine if a given RTX is a valid constant.  We already know this
11779    satisfies CONSTANT_P.  */
11780 
11781 static bool
11782 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11783 {
11784   switch (GET_CODE (x))
11785     {
11786     case CONST:
11787       x = XEXP (x, 0);
11788 
11789       if (GET_CODE (x) == PLUS)
11790 	{
11791 	  if (!CONST_INT_P (XEXP (x, 1)))
11792 	    return false;
11793 	  x = XEXP (x, 0);
11794 	}
11795 
11796       if (TARGET_MACHO && darwin_local_data_pic (x))
11797 	return true;
11798 
11799       /* Only some unspecs are valid as "constants".  */
11800       if (GET_CODE (x) == UNSPEC)
11801 	switch (XINT (x, 1))
11802 	  {
11803 	  case UNSPEC_GOT:
11804 	  case UNSPEC_GOTOFF:
11805 	  case UNSPEC_PLTOFF:
11806 	    return TARGET_64BIT;
11807 	  case UNSPEC_TPOFF:
11808 	  case UNSPEC_NTPOFF:
11809 	    x = XVECEXP (x, 0, 0);
11810 	    return (GET_CODE (x) == SYMBOL_REF
11811 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11812 	  case UNSPEC_DTPOFF:
11813 	    x = XVECEXP (x, 0, 0);
11814 	    return (GET_CODE (x) == SYMBOL_REF
11815 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11816 	  default:
11817 	    return false;
11818 	  }
11819 
11820       /* We must have drilled down to a symbol.  */
11821       if (GET_CODE (x) == LABEL_REF)
11822 	return true;
11823       if (GET_CODE (x) != SYMBOL_REF)
11824 	return false;
11825       /* FALLTHRU */
11826 
11827     case SYMBOL_REF:
11828       /* TLS symbols are never valid.  */
11829       if (SYMBOL_REF_TLS_MODEL (x))
11830 	return false;
11831 
11832       /* DLLIMPORT symbols are never valid.  */
11833       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11834 	  && SYMBOL_REF_DLLIMPORT_P (x))
11835 	return false;
11836 
11837 #if TARGET_MACHO
11838       /* mdynamic-no-pic */
11839       if (MACHO_DYNAMIC_NO_PIC_P)
11840 	return machopic_symbol_defined_p (x);
11841 #endif
11842       break;
11843 
11844     case CONST_DOUBLE:
11845       if (GET_MODE (x) == TImode
11846 	  && x != CONST0_RTX (TImode)
11847           && !TARGET_64BIT)
11848 	return false;
11849       break;
11850 
11851     case CONST_VECTOR:
11852       if (!standard_sse_constant_p (x))
11853 	return false;
11854 
11855     default:
11856       break;
11857     }
11858 
11859   /* Otherwise we handle everything else in the move patterns.  */
11860   return true;
11861 }
11862 
11863 /* Determine if it's legal to put X into the constant pool.  This
11864    is not possible for the address of thread-local symbols, which
11865    is checked above.  */
11866 
11867 static bool
11868 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11869 {
11870   /* We can always put integral constants and vectors in memory.  */
11871   switch (GET_CODE (x))
11872     {
11873     case CONST_INT:
11874     case CONST_DOUBLE:
11875     case CONST_VECTOR:
11876       return false;
11877 
11878     default:
11879       break;
11880     }
11881   return !ix86_legitimate_constant_p (mode, x);
11882 }
11883 
11884 
11885 /* Nonzero if the constant value X is a legitimate general operand
11886    when generating PIC code.  It is given that flag_pic is on and
11887    that X satisfies CONSTANT_P or is a CONST_DOUBLE.  */
11888 
11889 bool
11890 legitimate_pic_operand_p (rtx x)
11891 {
11892   rtx inner;
11893 
11894   switch (GET_CODE (x))
11895     {
11896     case CONST:
11897       inner = XEXP (x, 0);
11898       if (GET_CODE (inner) == PLUS
11899 	  && CONST_INT_P (XEXP (inner, 1)))
11900 	inner = XEXP (inner, 0);
11901 
11902       /* Only some unspecs are valid as "constants".  */
11903       if (GET_CODE (inner) == UNSPEC)
11904 	switch (XINT (inner, 1))
11905 	  {
11906 	  case UNSPEC_GOT:
11907 	  case UNSPEC_GOTOFF:
11908 	  case UNSPEC_PLTOFF:
11909 	    return TARGET_64BIT;
11910 	  case UNSPEC_TPOFF:
11911 	    x = XVECEXP (inner, 0, 0);
11912 	    return (GET_CODE (x) == SYMBOL_REF
11913 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11914 	  case UNSPEC_MACHOPIC_OFFSET:
11915 	    return legitimate_pic_address_disp_p (x);
11916 	  default:
11917 	    return false;
11918 	  }
11919       /* FALLTHRU */
11920 
11921     case SYMBOL_REF:
11922     case LABEL_REF:
11923       return legitimate_pic_address_disp_p (x);
11924 
11925     default:
11926       return true;
11927     }
11928 }
11929 
11930 /* Determine if a given CONST RTX is a valid memory displacement
11931    in PIC mode.  */
11932 
11933 bool
11934 legitimate_pic_address_disp_p (rtx disp)
11935 {
11936   bool saw_plus;
11937 
11938   /* In 64bit mode we can allow direct addresses of symbols and labels
11939      when they are not dynamic symbols.  */
11940   if (TARGET_64BIT)
11941     {
11942       rtx op0 = disp, op1;
11943 
11944       switch (GET_CODE (disp))
11945 	{
11946 	case LABEL_REF:
11947 	  return true;
11948 
11949 	case CONST:
11950 	  if (GET_CODE (XEXP (disp, 0)) != PLUS)
11951 	    break;
11952 	  op0 = XEXP (XEXP (disp, 0), 0);
11953 	  op1 = XEXP (XEXP (disp, 0), 1);
11954 	  if (!CONST_INT_P (op1)
11955 	      || INTVAL (op1) >= 16*1024*1024
11956 	      || INTVAL (op1) < -16*1024*1024)
11957             break;
11958 	  if (GET_CODE (op0) == LABEL_REF)
11959 	    return true;
11960 	  if (GET_CODE (op0) == CONST
11961 	      && GET_CODE (XEXP (op0, 0)) == UNSPEC
11962 	      && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11963 	    return true;
11964 	  if (GET_CODE (op0) == UNSPEC
11965 	      && XINT (op0, 1) == UNSPEC_PCREL)
11966 	    return true;
11967 	  if (GET_CODE (op0) != SYMBOL_REF)
11968 	    break;
11969 	  /* FALLTHRU */
11970 
11971 	case SYMBOL_REF:
11972 	  /* TLS references should always be enclosed in UNSPEC.  */
11973 	  if (SYMBOL_REF_TLS_MODEL (op0))
11974 	    return false;
11975 	  if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11976 	      && ix86_cmodel != CM_LARGE_PIC)
11977 	    return true;
11978 	  break;
11979 
11980 	default:
11981 	  break;
11982 	}
11983     }
11984   if (GET_CODE (disp) != CONST)
11985     return false;
11986   disp = XEXP (disp, 0);
11987 
11988   if (TARGET_64BIT)
11989     {
11990       /* It is unsafe to allow PLUS expressions here.  This limits the allowed
11991          distance from GOT tables, but we should not need these anyway.  */
11992       if (GET_CODE (disp) != UNSPEC
11993 	  || (XINT (disp, 1) != UNSPEC_GOTPCREL
11994 	      && XINT (disp, 1) != UNSPEC_GOTOFF
11995 	      && XINT (disp, 1) != UNSPEC_PCREL
11996 	      && XINT (disp, 1) != UNSPEC_PLTOFF))
11997 	return false;
11998 
11999       if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12000 	  && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12001 	return false;
12002       return true;
12003     }
12004 
12005   saw_plus = false;
12006   if (GET_CODE (disp) == PLUS)
12007     {
12008       if (!CONST_INT_P (XEXP (disp, 1)))
12009 	return false;
12010       disp = XEXP (disp, 0);
12011       saw_plus = true;
12012     }
12013 
12014   if (TARGET_MACHO && darwin_local_data_pic (disp))
12015     return true;
12016 
12017   if (GET_CODE (disp) != UNSPEC)
12018     return false;
12019 
12020   switch (XINT (disp, 1))
12021     {
12022     case UNSPEC_GOT:
12023       if (saw_plus)
12024 	return false;
12025       /* We need to check for both symbols and labels because VxWorks loads
12026 	 text labels with @GOT rather than @GOTOFF.  See gotoff_operand for
12027 	 details.  */
12028       return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12029 	      || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12030     case UNSPEC_GOTOFF:
12031       /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12032 	 While the ABI also specifies a 32bit relocation, we don't produce
12033 	 it in the small PIC model at all.  */
12034       if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12035 	   || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12036 	  && !TARGET_64BIT)
12037         return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12038       return false;
12039     case UNSPEC_GOTTPOFF:
12040     case UNSPEC_GOTNTPOFF:
12041     case UNSPEC_INDNTPOFF:
12042       if (saw_plus)
12043 	return false;
12044       disp = XVECEXP (disp, 0, 0);
12045       return (GET_CODE (disp) == SYMBOL_REF
12046 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12047     case UNSPEC_NTPOFF:
12048       disp = XVECEXP (disp, 0, 0);
12049       return (GET_CODE (disp) == SYMBOL_REF
12050 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12051     case UNSPEC_DTPOFF:
12052       disp = XVECEXP (disp, 0, 0);
12053       return (GET_CODE (disp) == SYMBOL_REF
12054 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12055     }
12056 
12057   return false;
12058 }
12059 
12060 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS.  Returns true if the
12061    invalid part of the memory address X was pushed for reload, in which
12062    case the calling macro should goto WIN; returns false if X was left
12063    unchanged.  */
12064 
12065 bool
12066 ix86_legitimize_reload_address (rtx x,
12067 				enum machine_mode mode ATTRIBUTE_UNUSED,
12068 				int opnum, int type,
12069 				int ind_levels ATTRIBUTE_UNUSED)
12070 {
12071   /* Reload can generate:
12072 
12073      (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12074 		       (reg:DI 97))
12075 	      (reg:DI 2 cx))
12076 
12077      This RTX is rejected from ix86_legitimate_address_p due to
12078      non-strictness of base register 97.  Following this rejection,
12079      reload pushes all three components into separate registers,
12080      creating invalid memory address RTX.
12081 
12082      The following code reloads only the invalid part of the
12083      memory address RTX.  */
12084 
12085   if (GET_CODE (x) == PLUS
12086       && REG_P (XEXP (x, 1))
12087       && GET_CODE (XEXP (x, 0)) == PLUS
12088       && REG_P (XEXP (XEXP (x, 0), 1)))
12089     {
12090       rtx base, index;
12091       bool something_reloaded = false;
12092 
12093       base = XEXP (XEXP (x, 0), 1);
12094       if (!REG_OK_FOR_BASE_STRICT_P (base))
12095 	{
12096 	  push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12097 		       BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12098 		       opnum, (enum reload_type)type);
12099 	  something_reloaded = true;
12100 	}
12101 
12102       index = XEXP (x, 1);
12103       if (!REG_OK_FOR_INDEX_STRICT_P (index))
12104 	{
12105 	  push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12106 		       INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12107 		       opnum, (enum reload_type)type);
12108 	  something_reloaded = true;
12109 	}
12110 
12111       gcc_assert (something_reloaded);
12112       return true;
12113     }
12114 
12115   return false;
12116 }
12117 
12118 /* Recognizes RTL expressions that are valid memory addresses for an
12119    instruction.  The MODE argument is the machine mode for the MEM
12120    expression that wants to use this address.
12121 
12122    It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
12123    convert common non-canonical forms to canonical form so that they will
12124    be recognized.  */
12125 
12126 static bool
12127 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12128 		           rtx addr, bool strict)
12129 {
12130   struct ix86_address parts;
12131   rtx base, index, disp;
12132   HOST_WIDE_INT scale;
12133 
12134   if (ix86_decompose_address (addr, &parts) <= 0)
12135     /* Decomposition failed.  */
12136     return false;
12137 
12138   base = parts.base;
12139   index = parts.index;
12140   disp = parts.disp;
12141   scale = parts.scale;
12142 
12143   /* Validate base register.  */
12144   if (base)
12145     {
12146       rtx reg;
12147 
12148       if (REG_P (base))
12149   	reg = base;
12150       else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12151 	reg = SUBREG_REG (base);
12152       else
12153 	/* Base is not a register.  */
12154 	return false;
12155 
12156       if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12157 	return false;
12158 
12159       if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12160 	  || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12161 	/* Base is not valid.  */
12162 	return false;
12163     }
12164 
12165   /* Validate index register.  */
12166   if (index)
12167     {
12168       rtx reg;
12169 
12170       if (REG_P (index))
12171   	reg = index;
12172       else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12173 	reg = SUBREG_REG (index);
12174       else
12175 	/* Index is not a register.  */
12176 	return false;
12177 
12178       if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12179 	return false;
12180 
12181       if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12182 	  || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12183 	/* Index is not valid.  */
12184 	return false;
12185     }
12186 
12187   /* Index and base should have the same mode.  */
12188   if (base && index
12189       && GET_MODE (base) != GET_MODE (index))
12190     return false;
12191 
12192   /* Validate scale factor.  */
12193   if (scale != 1)
12194     {
12195       if (!index)
12196 	/* Scale without index.  */
12197 	return false;
12198 
12199       if (scale != 2 && scale != 4 && scale != 8)
12200 	/* Scale is not a valid multiplier.  */
12201 	return false;
12202     }
12203 
12204   /* Validate displacement.  */
12205   if (disp)
12206     {
12207       if (GET_CODE (disp) == CONST
12208 	  && GET_CODE (XEXP (disp, 0)) == UNSPEC
12209 	  && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12210 	switch (XINT (XEXP (disp, 0), 1))
12211 	  {
12212 	  /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12213 	     used.  While the ABI also specifies 32bit relocations, we don't
12214 	     produce them at all and use IP-relative addressing instead.  */
12215 	  case UNSPEC_GOT:
12216 	  case UNSPEC_GOTOFF:
12217 	    gcc_assert (flag_pic);
12218 	    if (!TARGET_64BIT)
12219 	      goto is_legitimate_pic;
12220 
12221 	    /* 64bit address unspec.  */
12222 	    return false;
12223 
12224 	  case UNSPEC_GOTPCREL:
12225 	  case UNSPEC_PCREL:
12226 	    gcc_assert (flag_pic);
12227 	    goto is_legitimate_pic;
12228 
12229 	  case UNSPEC_GOTTPOFF:
12230 	  case UNSPEC_GOTNTPOFF:
12231 	  case UNSPEC_INDNTPOFF:
12232 	  case UNSPEC_NTPOFF:
12233 	  case UNSPEC_DTPOFF:
12234 	    break;
12235 
12236 	  case UNSPEC_STACK_CHECK:
12237 	    gcc_assert (flag_split_stack);
12238 	    break;
12239 
12240 	  default:
12241 	    /* Invalid address unspec.  */
12242 	    return false;
12243 	  }
12244 
12245       else if (SYMBOLIC_CONST (disp)
12246 	       && (flag_pic
12247 		   || (TARGET_MACHO
12248 #if TARGET_MACHO
12249 		       && MACHOPIC_INDIRECT
12250 		       && !machopic_operand_p (disp)
12251 #endif
12252 	       )))
12253 	{
12254 
12255 	is_legitimate_pic:
12256 	  if (TARGET_64BIT && (index || base))
12257 	    {
12258 	      /* foo@dtpoff(%rX) is ok.  */
12259 	      if (GET_CODE (disp) != CONST
12260 		  || GET_CODE (XEXP (disp, 0)) != PLUS
12261 		  || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12262 		  || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12263 		  || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12264 		      && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12265 		/* Non-constant pic memory reference.  */
12266 		return false;
12267 	    }
12268 	  else if ((!TARGET_MACHO || flag_pic)
12269 		    && ! legitimate_pic_address_disp_p (disp))
12270 	    /* Displacement is an invalid pic construct.  */
12271 	    return false;
12272 #if TARGET_MACHO
12273 	  else if (MACHO_DYNAMIC_NO_PIC_P
12274 		   && !ix86_legitimate_constant_p (Pmode, disp))
12275 	    /* displacement must be referenced via non_lazy_pointer */
12276 	    return false;
12277 #endif
12278 
12279           /* This code used to verify that a symbolic pic displacement
12280 	     includes the pic_offset_table_rtx register.
12281 
12282 	     While this is a good idea, unfortunately these constructs may
12283 	     be created by the "adds using lea" optimization for incorrect
12284 	     code like:
12285 
12286 	     int a;
12287 	     int foo(int i)
12288 	       {
12289 	         return *(&a+i);
12290 	       }
12291 
12292 	     This code is nonsensical, but results in addressing the
12293 	     GOT table with a pic_offset_table_rtx base.  We can't
12294 	     just refuse it easily, since it gets matched by the
12295 	     "addsi3" pattern, which later gets split to lea when the
12296 	     output register differs from the input.  While this
12297 	     could be handled by a separate addsi pattern for this case
12298 	     that never results in lea, disabling this test seems to be
12299 	     the easier and correct fix for the crash.  */
12300 	}
12301       else if (GET_CODE (disp) != LABEL_REF
12302 	       && !CONST_INT_P (disp)
12303 	       && (GET_CODE (disp) != CONST
12304 		   || !ix86_legitimate_constant_p (Pmode, disp))
12305 	       && (GET_CODE (disp) != SYMBOL_REF
12306 		   || !ix86_legitimate_constant_p (Pmode, disp)))
12307 	/* Displacement is not constant.  */
12308 	return false;
12309       else if (TARGET_64BIT
12310 	       && !x86_64_immediate_operand (disp, VOIDmode))
12311 	/* Displacement is out of range.  */
12312 	return false;
12313     }
12314 
12315   /* Everything looks valid.  */
12316   return true;
12317 }
12318 
12319 /* Determine if a given RTX is a valid constant address.  */
12320 
12321 bool
12322 constant_address_p (rtx x)
12323 {
12324   return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12325 }
12326 
12327 /* Return a unique alias set for the GOT.  */
12328 
12329 static alias_set_type
12330 ix86_GOT_alias_set (void)
12331 {
12332   static alias_set_type set = -1;
12333   if (set == -1)
12334     set = new_alias_set ();
12335   return set;
12336 }
12337 
12338 /* Return a legitimate reference for ORIG (an address) using the
12339    register REG.  If REG is 0, a new pseudo is generated.
12340 
12341    There are two types of references that must be handled:
12342 
12343    1. Global data references must load the address from the GOT, via
12344       the PIC reg.  An insn is emitted to do this load, and the reg is
12345       returned.
12346 
12347    2. Static data references, constant pool addresses, and code labels
12348       compute the address as an offset from the GOT, whose base is in
12349       the PIC reg.  Static data objects have SYMBOL_FLAG_LOCAL set to
12350       differentiate them from global data objects.  The returned
12351       address is the PIC reg + an unspec constant.
12352 
12353    TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12354    reg also appears in the address.  */
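
/* As an illustration only (32-bit code with %ebx holding the PIC
   register; the exact instructions emitted depend on context):

       case 1, global data "x":   movl  x@GOT(%ebx), %reg     # load &x from the GOT
       case 2, static data "y":   leal  y@GOTOFF(%ebx), %reg  # PIC reg plus offset  */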
12355 
12356 static rtx
12357 legitimize_pic_address (rtx orig, rtx reg)
12358 {
12359   rtx addr = orig;
12360   rtx new_rtx = orig;
12361 
12362 #if TARGET_MACHO
12363   if (TARGET_MACHO && !TARGET_64BIT)
12364     {
12365       if (reg == 0)
12366 	reg = gen_reg_rtx (Pmode);
12367       /* Use the generic Mach-O PIC machinery.  */
12368       return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12369     }
12370 #endif
12371 
12372   if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12373     new_rtx = addr;
12374   else if (TARGET_64BIT
12375 	   && ix86_cmodel != CM_SMALL_PIC
12376 	   && gotoff_operand (addr, Pmode))
12377     {
12378       rtx tmpreg;
12379       /* This symbol may be referenced via a displacement from the PIC
12380 	 base address (@GOTOFF).  */
12381 
12382       if (reload_in_progress)
12383 	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12384       if (GET_CODE (addr) == CONST)
12385 	addr = XEXP (addr, 0);
12386       if (GET_CODE (addr) == PLUS)
12387 	  {
12388             new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12389 				      UNSPEC_GOTOFF);
12390 	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12391 	  }
12392 	else
12393           new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12394       new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12395       if (!reg)
12396         tmpreg = gen_reg_rtx (Pmode);
12397       else
12398 	tmpreg = reg;
12399       emit_move_insn (tmpreg, new_rtx);
12400 
12401       if (reg != 0)
12402 	{
12403 	  new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12404 					 tmpreg, 1, OPTAB_DIRECT);
12405 	  new_rtx = reg;
12406 	}
12407       else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12408     }
12409   else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12410     {
12411       /* This symbol may be referenced via a displacement from the PIC
12412 	 base address (@GOTOFF).  */
12413 
12414       if (reload_in_progress)
12415 	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12416       if (GET_CODE (addr) == CONST)
12417 	addr = XEXP (addr, 0);
12418       if (GET_CODE (addr) == PLUS)
12419 	  {
12420             new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12421 				      UNSPEC_GOTOFF);
12422 	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12423 	  }
12424 	else
12425           new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12426       new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12427       new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12428 
12429       if (reg != 0)
12430 	{
12431 	  emit_move_insn (reg, new_rtx);
12432 	  new_rtx = reg;
12433 	}
12434     }
12435   else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12436 	   /* We can't use @GOTOFF for text labels on VxWorks;
12437 	      see gotoff_operand.  */
12438 	   || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12439     {
12440       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12441         {
12442           if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12443             return legitimize_dllimport_symbol (addr, true);
12444           if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12445               && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12446               && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12447             {
12448               rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12449               return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12450             }
12451         }
12452 
12453       /* For x64 PE-COFF there is no GOT table, so we use the address
12454          directly.  */
12455       if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12456       {
12457 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12458 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12459 
12460 	  if (reg == 0)
12461 	    reg = gen_reg_rtx (Pmode);
12462   	  emit_move_insn (reg, new_rtx);
12463 	  new_rtx = reg;
12464       }
12465       else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12466 	{
12467 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12468 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12469 	  new_rtx = gen_const_mem (Pmode, new_rtx);
12470 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12471 
12472 	  if (reg == 0)
12473 	    reg = gen_reg_rtx (Pmode);
12474 	  /* Use gen_movsi directly; otherwise the address is loaded
12475 	     into a register for CSE.  We don't want to CSE these addresses;
12476 	     instead we CSE addresses from the GOT table, so skip this.  */
12477 	  emit_insn (gen_movsi (reg, new_rtx));
12478 	  new_rtx = reg;
12479 	}
12480       else
12481 	{
12482 	  /* This symbol must be referenced via a load from the
12483 	     Global Offset Table (@GOT).  */
12484 
12485 	  if (reload_in_progress)
12486 	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12487 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12488 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12489 	  if (TARGET_64BIT)
12490 	    new_rtx = force_reg (Pmode, new_rtx);
12491 	  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12492 	  new_rtx = gen_const_mem (Pmode, new_rtx);
12493 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12494 
12495 	  if (reg == 0)
12496 	    reg = gen_reg_rtx (Pmode);
12497 	  emit_move_insn (reg, new_rtx);
12498 	  new_rtx = reg;
12499 	}
12500     }
12501   else
12502     {
12503       if (CONST_INT_P (addr)
12504 	  && !x86_64_immediate_operand (addr, VOIDmode))
12505 	{
12506 	  if (reg)
12507 	    {
12508 	      emit_move_insn (reg, addr);
12509 	      new_rtx = reg;
12510 	    }
12511 	  else
12512 	    new_rtx = force_reg (Pmode, addr);
12513 	}
12514       else if (GET_CODE (addr) == CONST)
12515 	{
12516 	  addr = XEXP (addr, 0);
12517 
12518 	  /* We must match stuff we generated earlier.  Assume the only
12519 	     unspecs that can get here are ours.  Not that we could do
12520 	     anything with them anyway....  */
12521 	  if (GET_CODE (addr) == UNSPEC
12522 	      || (GET_CODE (addr) == PLUS
12523 		  && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12524 	    return orig;
12525 	  gcc_assert (GET_CODE (addr) == PLUS);
12526 	}
12527       if (GET_CODE (addr) == PLUS)
12528 	{
12529 	  rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12530 
12531 	  /* Check first to see if this is a constant offset from a @GOTOFF
12532 	     symbol reference.  */
12533 	  if (gotoff_operand (op0, Pmode)
12534 	      && CONST_INT_P (op1))
12535 	    {
12536 	      if (!TARGET_64BIT)
12537 		{
12538 		  if (reload_in_progress)
12539 		    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12540 		  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12541 					    UNSPEC_GOTOFF);
12542 		  new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12543 		  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12544 		  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12545 
12546 		  if (reg != 0)
12547 		    {
12548 		      emit_move_insn (reg, new_rtx);
12549 		      new_rtx = reg;
12550 		    }
12551 		}
12552 	      else
12553 		{
12554 		  if (INTVAL (op1) < -16*1024*1024
12555 		      || INTVAL (op1) >= 16*1024*1024)
12556 		    {
12557 		      if (!x86_64_immediate_operand (op1, Pmode))
12558 			op1 = force_reg (Pmode, op1);
12559 		      new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12560 		    }
12561 		}
12562 	    }
12563 	  else
12564 	    {
12565 	      rtx base = legitimize_pic_address (op0, reg);
12566 	      enum machine_mode mode = GET_MODE (base);
12567 	      new_rtx
12568 	        = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12569 
12570 	      if (CONST_INT_P (new_rtx))
12571 		{
12572 		  if (INTVAL (new_rtx) < -16*1024*1024
12573 		      || INTVAL (new_rtx) >= 16*1024*1024)
12574 		    {
12575 		      if (!x86_64_immediate_operand (new_rtx, mode))
12576 			new_rtx = force_reg (mode, new_rtx);
12577 		      new_rtx
12578 		        = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12579 		    }
12580 		  else
12581 		    new_rtx = plus_constant (base, INTVAL (new_rtx));
12582 		}
12583 	      else
12584 		{
12585 		  if (GET_CODE (new_rtx) == PLUS
12586 		      && CONSTANT_P (XEXP (new_rtx, 1)))
12587 		    {
12588 		      base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12589 		      new_rtx = XEXP (new_rtx, 1);
12590 		    }
12591 		  new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12592 		}
12593 	    }
12594 	}
12595     }
12596   return new_rtx;
12597 }
12598 
12599 /* Load the thread pointer.  If TO_REG is true, force it into a register.  */
12600 
12601 static rtx
12602 get_thread_pointer (bool to_reg)
12603 {
12604   rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12605 
12606   if (GET_MODE (tp) != Pmode)
12607     tp = convert_to_mode (Pmode, tp, 1);
12608 
12609   if (to_reg)
12610     tp = copy_addr_to_reg (tp);
12611 
12612   return tp;
12613 }
12614 
12615 /* Construct the SYMBOL_REF for the tls_get_addr function.  */
12616 
12617 static GTY(()) rtx ix86_tls_symbol;
12618 
12619 static rtx
12620 ix86_tls_get_addr (void)
12621 {
12622   if (!ix86_tls_symbol)
12623     {
12624       const char *sym
12625 	= ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12626 	   ? "___tls_get_addr" : "__tls_get_addr");
12627 
12628       ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12629     }
12630 
12631   return ix86_tls_symbol;
12632 }
12633 
12634 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol.  */
12635 
12636 static GTY(()) rtx ix86_tls_module_base_symbol;
12637 
12638 rtx
12639 ix86_tls_module_base (void)
12640 {
12641   if (!ix86_tls_module_base_symbol)
12642     {
12643       ix86_tls_module_base_symbol
12644 	= gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12645 
12646       SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12647 	|= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12648     }
12649 
12650   return ix86_tls_module_base_symbol;
12651 }
12652 
12653 /* A subroutine of ix86_legitimize_address and ix86_expand_move.  FOR_MOV is
12654    false if we expect this to be used for a memory address and true if
12655    we expect to load the address into a register.  */
12656 
12657 static rtx
12658 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12659 {
12660   rtx dest, base, off;
12661   rtx pic = NULL_RTX, tp = NULL_RTX;
12662   int type;
12663 
12664   switch (model)
12665     {
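    /* General-dynamic TLS: the variable's address is computed at run time
       by a call to __tls_get_addr (or, for GNU2 TLS, by a TLS descriptor
       sequence), so it works for symbols defined in any module.  */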
12666     case TLS_MODEL_GLOBAL_DYNAMIC:
12667       dest = gen_reg_rtx (Pmode);
12668 
12669       if (!TARGET_64BIT)
12670 	{
12671 	  if (flag_pic)
12672 	    pic = pic_offset_table_rtx;
12673 	  else
12674 	    {
12675 	      pic = gen_reg_rtx (Pmode);
12676 	      emit_insn (gen_set_got (pic));
12677 	    }
12678 	}
12679 
12680       if (TARGET_GNU2_TLS)
12681 	{
12682 	  if (TARGET_64BIT)
12683 	    emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12684 	  else
12685 	    emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12686 
12687 	  tp = get_thread_pointer (true);
12688 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12689 
12690 	  if (GET_MODE (x) != Pmode)
12691 	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
12692 
12693 	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12694 	}
12695       else
12696 	{
12697 	  rtx caddr = ix86_tls_get_addr ();
12698 
12699 	  if (TARGET_64BIT)
12700 	    {
12701 	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
12702 	      rtx insns;
12703 
12704 	      start_sequence ();
12705 	      emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12706 	      insns = get_insns ();
12707 	      end_sequence ();
12708 
12709 	      if (GET_MODE (x) != Pmode)
12710 		x = gen_rtx_ZERO_EXTEND (Pmode, x);
12711 
12712 	      RTL_CONST_CALL_P (insns) = 1;
12713 	      emit_libcall_block (insns, dest, rax, x);
12714 	    }
12715 	  else
12716 	    emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12717 	}
12718       break;
12719 
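    /* Local-dynamic TLS: obtain the module's TLS base once (again via
       __tls_get_addr or the GNU2 descriptor sequence), then add the
       variable's @dtpoff offset, which is known at link time.  */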
12720     case TLS_MODEL_LOCAL_DYNAMIC:
12721       base = gen_reg_rtx (Pmode);
12722 
12723       if (!TARGET_64BIT)
12724 	{
12725 	  if (flag_pic)
12726 	    pic = pic_offset_table_rtx;
12727 	  else
12728 	    {
12729 	      pic = gen_reg_rtx (Pmode);
12730 	      emit_insn (gen_set_got (pic));
12731 	    }
12732 	}
12733 
12734       if (TARGET_GNU2_TLS)
12735 	{
12736 	  rtx tmp = ix86_tls_module_base ();
12737 
12738 	  if (TARGET_64BIT)
12739 	    emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12740 	  else
12741 	    emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12742 
12743 	  tp = get_thread_pointer (true);
12744 	  set_unique_reg_note (get_last_insn (), REG_EQUAL,
12745 			       gen_rtx_MINUS (Pmode, tmp, tp));
12746 	}
12747       else
12748 	{
12749 	  rtx caddr = ix86_tls_get_addr ();
12750 
12751 	  if (TARGET_64BIT)
12752 	    {
12753 	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
12754 	      rtx insns, eqv;
12755 
12756 	      start_sequence ();
12757 	      emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12758 	      insns = get_insns ();
12759 	      end_sequence ();
12760 
12761 	      /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12762 		 share the LD_BASE result with other LD model accesses.  */
12763 	      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12764 				    UNSPEC_TLS_LD_BASE);
12765 
12766 	      RTL_CONST_CALL_P (insns) = 1;
12767 	      emit_libcall_block (insns, base, rax, eqv);
12768 	    }
12769 	  else
12770 	    emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12771 	}
12772 
12773       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12774       off = gen_rtx_CONST (Pmode, off);
12775 
12776       dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12777 
12778       if (TARGET_GNU2_TLS)
12779 	{
12780 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12781 
12782 	  if (GET_MODE (x) != Pmode)
12783 	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
12784 
12785 	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12786 	}
12787       break;
12788 
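    /* Initial-exec TLS: the variable's offset from the thread pointer is
       loaded from the GOT (@gottpoff / @indntpoff) and combined with the
       thread pointer; no call is needed at run time.  */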
12789     case TLS_MODEL_INITIAL_EXEC:
12790       if (TARGET_64BIT)
12791 	{
12792 	  if (TARGET_SUN_TLS)
12793 	    {
12794 	      /* The Sun linker took the AMD64 TLS spec literally
12795 		 and can only handle %rax as the destination of the
12796 		 initial executable code sequence.  */
12797 
12798 	      dest = gen_reg_rtx (Pmode);
12799 	      emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12800 	      return dest;
12801 	    }
12802 
12803 	  pic = NULL;
12804 	  type = UNSPEC_GOTNTPOFF;
12805 	}
12806       else if (flag_pic)
12807 	{
12808 	  if (reload_in_progress)
12809 	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12810 	  pic = pic_offset_table_rtx;
12811 	  type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12812 	}
12813       else if (!TARGET_ANY_GNU_TLS)
12814 	{
12815 	  pic = gen_reg_rtx (Pmode);
12816 	  emit_insn (gen_set_got (pic));
12817 	  type = UNSPEC_GOTTPOFF;
12818 	}
12819       else
12820 	{
12821 	  pic = NULL;
12822 	  type = UNSPEC_INDNTPOFF;
12823 	}
12824 
12825       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12826       off = gen_rtx_CONST (Pmode, off);
12827       if (pic)
12828 	off = gen_rtx_PLUS (Pmode, pic, off);
12829       off = gen_const_mem (Pmode, off);
12830       set_mem_alias_set (off, ix86_GOT_alias_set ());
12831 
12832       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12833 	{
12834           base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12835 	  off = force_reg (Pmode, off);
12836 	  return gen_rtx_PLUS (Pmode, base, off);
12837 	}
12838       else
12839 	{
12840 	  base = get_thread_pointer (true);
12841 	  dest = gen_reg_rtx (Pmode);
12842 	  emit_insn (gen_subsi3 (dest, base, off));
12843 	}
12844       break;
12845 
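    /* Local-exec TLS: the offset from the thread pointer is known at
       link time (@tpoff / @ntpoff), so no GOT access is needed at all.  */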
12846     case TLS_MODEL_LOCAL_EXEC:
12847       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12848 			    (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12849 			    ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12850       off = gen_rtx_CONST (Pmode, off);
12851 
12852       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12853 	{
12854 	  base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12855 	  return gen_rtx_PLUS (Pmode, base, off);
12856 	}
12857       else
12858 	{
12859 	  base = get_thread_pointer (true);
12860 	  dest = gen_reg_rtx (Pmode);
12861 	  emit_insn (gen_subsi3 (dest, base, off));
12862 	}
12863       break;
12864 
12865     default:
12866       gcc_unreachable ();
12867     }
12868 
12869   return dest;
12870 }
12871 
12872 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12873    to symbol DECL.  */
12874 
12875 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12876   htab_t dllimport_map;
12877 
12878 static tree
12879 get_dllimport_decl (tree decl)
12880 {
12881   struct tree_map *h, in;
12882   void **loc;
12883   const char *name;
12884   const char *prefix;
12885   size_t namelen, prefixlen;
12886   char *imp_name;
12887   tree to;
12888   rtx rtl;
12889 
12890   if (!dllimport_map)
12891     dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12892 
12893   in.hash = htab_hash_pointer (decl);
12894   in.base.from = decl;
12895   loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12896   h = (struct tree_map *) *loc;
12897   if (h)
12898     return h->to;
12899 
12900   *loc = h = ggc_alloc_tree_map ();
12901   h->hash = in.hash;
12902   h->base.from = decl;
12903   h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12904 			   VAR_DECL, NULL, ptr_type_node);
12905   DECL_ARTIFICIAL (to) = 1;
12906   DECL_IGNORED_P (to) = 1;
12907   DECL_EXTERNAL (to) = 1;
12908   TREE_READONLY (to) = 1;
12909 
12910   name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12911   name = targetm.strip_name_encoding (name);
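  /* The import pointer is named __imp_<symbol>.  When the target's user
     label prefix is "_" (and the symbol is not a fastcall name), the
     assembler-level name carries an extra underscore, hence "__imp__";
     the leading '*' tells the assembler layer to emit the name verbatim,
     without prepending another prefix.  */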
12912   prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12913     ? "*__imp_" : "*__imp__";
12914   namelen = strlen (name);
12915   prefixlen = strlen (prefix);
12916   imp_name = (char *) alloca (namelen + prefixlen + 1);
12917   memcpy (imp_name, prefix, prefixlen);
12918   memcpy (imp_name + prefixlen, name, namelen + 1);
12919 
12920   name = ggc_alloc_string (imp_name, namelen + prefixlen);
12921   rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12922   SET_SYMBOL_REF_DECL (rtl, to);
12923   SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12924 
12925   rtl = gen_const_mem (Pmode, rtl);
12926   set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12927 
12928   SET_DECL_RTL (to, rtl);
12929   SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12930 
12931   return to;
12932 }
12933 
12934 /* Expand SYMBOL into its corresponding dllimport symbol.  WANT_REG is
12935    true if we require the result to be a register.  */
12936 
12937 static rtx
12938 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12939 {
12940   tree imp_decl;
12941   rtx x;
12942 
12943   gcc_assert (SYMBOL_REF_DECL (symbol));
12944   imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12945 
12946   x = DECL_RTL (imp_decl);
12947   if (want_reg)
12948     x = force_reg (Pmode, x);
12949   return x;
12950 }
12951 
12952 /* Try machine-dependent ways of modifying an illegitimate address
12953    to be legitimate.  If we find one, return the new, valid address.
12954    This macro is used in only one place: `memory_address' in explow.c.
12955 
12956    OLDX is the address as it was before break_out_memory_refs was called.
12957    In some cases it is useful to look at this to decide what needs to be done.
12958 
12959    It is always safe for this macro to do nothing.  It exists to recognize
12960    opportunities to optimize the output.
12961 
12962    For the 80386, we handle X+REG by loading X into a register R and
12963    using R+REG.  R will go in a general reg and indexing will be used.
12964    However, if REG is a broken-out memory address or multiplication,
12965    nothing needs to be done because REG can certainly go in a general reg.
12966 
12967    When -fpic is used, special handling is needed for symbolic references.
12968    See comments by legitimize_pic_address in i386.c for details.  */
12969 
12970 static rtx
12971 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12972 			 enum machine_mode mode)
12973 {
12974   int changed = 0;
12975   unsigned log;
12976 
12977   log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12978   if (log)
12979     return legitimize_tls_address (x, (enum tls_model) log, false);
12980   if (GET_CODE (x) == CONST
12981       && GET_CODE (XEXP (x, 0)) == PLUS
12982       && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12983       && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12984     {
12985       rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12986 				      (enum tls_model) log, false);
12987       return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12988     }
12989 
12990   if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12991     {
12992       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12993 	return legitimize_dllimport_symbol (x, true);
12994       if (GET_CODE (x) == CONST
12995 	  && GET_CODE (XEXP (x, 0)) == PLUS
12996 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12997 	  && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12998 	{
12999 	  rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13000 	  return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13001 	}
13002     }
13003 
13004   if (flag_pic && SYMBOLIC_CONST (x))
13005     return legitimize_pic_address (x, 0);
13006 
13007 #if TARGET_MACHO
13008   if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13009     return machopic_indirect_data_reference (x, 0);
13010 #endif
13011 
13012   /* Canonicalize shifts by 0, 1, 2, 3 into a multiply.  */
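  /* For example, (ashift reg 2) becomes (mult reg 4), which matches the
     scaled-index forms of the x86 addressing modes.  */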
13013   if (GET_CODE (x) == ASHIFT
13014       && CONST_INT_P (XEXP (x, 1))
13015       && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13016     {
13017       changed = 1;
13018       log = INTVAL (XEXP (x, 1));
13019       x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13020 			GEN_INT (1 << log));
13021     }
13022 
13023   if (GET_CODE (x) == PLUS)
13024     {
13025       /* Canonicalize shifts by 0, 1, 2, 3 into a multiply.  */
13026 
13027       if (GET_CODE (XEXP (x, 0)) == ASHIFT
13028 	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13029 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13030 	{
13031 	  changed = 1;
13032 	  log = INTVAL (XEXP (XEXP (x, 0), 1));
13033 	  XEXP (x, 0) = gen_rtx_MULT (Pmode,
13034 				      force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13035 				      GEN_INT (1 << log));
13036 	}
13037 
13038       if (GET_CODE (XEXP (x, 1)) == ASHIFT
13039 	  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13040 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13041 	{
13042 	  changed = 1;
13043 	  log = INTVAL (XEXP (XEXP (x, 1), 1));
13044 	  XEXP (x, 1) = gen_rtx_MULT (Pmode,
13045 				      force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13046 				      GEN_INT (1 << log));
13047 	}
13048 
13049       /* Put multiply first if it isn't already.  */
13050       if (GET_CODE (XEXP (x, 1)) == MULT)
13051 	{
13052 	  rtx tmp = XEXP (x, 0);
13053 	  XEXP (x, 0) = XEXP (x, 1);
13054 	  XEXP (x, 1) = tmp;
13055 	  changed = 1;
13056 	}
13057 
13058       /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13059 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  This can be
13060 	 created by virtual register instantiation, register elimination, and
13061 	 similar optimizations.  */
13062       if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13063 	{
13064 	  changed = 1;
13065 	  x = gen_rtx_PLUS (Pmode,
13066 			    gen_rtx_PLUS (Pmode, XEXP (x, 0),
13067 					  XEXP (XEXP (x, 1), 0)),
13068 			    XEXP (XEXP (x, 1), 1));
13069 	}
13070 
13071       /* Canonicalize
13072 	 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13073 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  */
13074       else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13075 	       && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13076 	       && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13077 	       && CONSTANT_P (XEXP (x, 1)))
13078 	{
13079 	  rtx constant;
13080 	  rtx other = NULL_RTX;
13081 
13082 	  if (CONST_INT_P (XEXP (x, 1)))
13083 	    {
13084 	      constant = XEXP (x, 1);
13085 	      other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13086 	    }
13087 	  else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13088 	    {
13089 	      constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13090 	      other = XEXP (x, 1);
13091 	    }
13092 	  else
13093 	    constant = 0;
13094 
13095 	  if (constant)
13096 	    {
13097 	      changed = 1;
13098 	      x = gen_rtx_PLUS (Pmode,
13099 				gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13100 					      XEXP (XEXP (XEXP (x, 0), 1), 0)),
13101 				plus_constant (other, INTVAL (constant)));
13102 	    }
13103 	}
13104 
13105       if (changed && ix86_legitimate_address_p (mode, x, false))
13106 	return x;
13107 
13108       if (GET_CODE (XEXP (x, 0)) == MULT)
13109 	{
13110 	  changed = 1;
13111 	  XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13112 	}
13113 
13114       if (GET_CODE (XEXP (x, 1)) == MULT)
13115 	{
13116 	  changed = 1;
13117 	  XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13118 	}
13119 
13120       if (changed
13121 	  && REG_P (XEXP (x, 1))
13122 	  && REG_P (XEXP (x, 0)))
13123 	return x;
13124 
13125       if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13126 	{
13127 	  changed = 1;
13128 	  x = legitimize_pic_address (x, 0);
13129 	}
13130 
13131       if (changed && ix86_legitimate_address_p (mode, x, false))
13132 	return x;
13133 
13134       if (REG_P (XEXP (x, 0)))
13135 	{
13136 	  rtx temp = gen_reg_rtx (Pmode);
13137 	  rtx val  = force_operand (XEXP (x, 1), temp);
13138 	  if (val != temp)
13139 	    {
13140 	      if (GET_MODE (val) != Pmode)
13141 		val = convert_to_mode (Pmode, val, 1);
13142 	      emit_move_insn (temp, val);
13143 	    }
13144 
13145 	  XEXP (x, 1) = temp;
13146 	  return x;
13147 	}
13148 
13149       else if (REG_P (XEXP (x, 1)))
13150 	{
13151 	  rtx temp = gen_reg_rtx (Pmode);
13152 	  rtx val  = force_operand (XEXP (x, 0), temp);
13153 	  if (val != temp)
13154 	    {
13155 	      if (GET_MODE (val) != Pmode)
13156 		val = convert_to_mode (Pmode, val, 1);
13157 	      emit_move_insn (temp, val);
13158 	    }
13159 
13160 	  XEXP (x, 0) = temp;
13161 	  return x;
13162 	}
13163     }
13164 
13165   return x;
13166 }
13167 
13168 /* Print an integer constant expression in assembler syntax.  Addition
13169    and subtraction are the only arithmetic that may appear in these
13170    expressions.  FILE is the stdio stream to write to, X is the rtx, and
13171    CODE is the operand print code from the output string.  */
13172 
13173 static void
13174 output_pic_addr_const (FILE *file, rtx x, int code)
13175 {
13176   char buf[256];
13177 
13178   switch (GET_CODE (x))
13179     {
13180     case PC:
13181       gcc_assert (flag_pic);
13182       putc ('.', file);
13183       break;
13184 
13185     case SYMBOL_REF:
13186       if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13187 	output_addr_const (file, x);
13188       else
13189 	{
13190 	  const char *name = XSTR (x, 0);
13191 
13192 	  /* Mark the decl as referenced so that cgraph will
13193 	     output the function.  */
13194 	  if (SYMBOL_REF_DECL (x))
13195 	    mark_decl_referenced (SYMBOL_REF_DECL (x));
13196 
13197 #if TARGET_MACHO
13198 	  if (MACHOPIC_INDIRECT
13199 	      && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13200 	    name = machopic_indirection_name (x, /*stub_p=*/true);
13201 #endif
13202 	  assemble_name (file, name);
13203 	}
13204       if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13205 	  && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13206 	fputs ("@PLT", file);
13207       break;
13208 
13209     case LABEL_REF:
13210       x = XEXP (x, 0);
13211       /* FALLTHRU */
13212     case CODE_LABEL:
13213       ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13214       assemble_name (asm_out_file, buf);
13215       break;
13216 
13217     case CONST_INT:
13218       fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13219       break;
13220 
13221     case CONST:
13222       /* This used to output parentheses around the expression,
13223 	 but that does not work on the 386 (either ATT or BSD assembler).  */
13224       output_pic_addr_const (file, XEXP (x, 0), code);
13225       break;
13226 
13227     case CONST_DOUBLE:
13228       if (GET_MODE (x) == VOIDmode)
13229 	{
13230 	  /* We can use %d if the number is <32 bits and positive.  */
13231 	  if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13232 	    fprintf (file, "0x%lx%08lx",
13233 		     (unsigned long) CONST_DOUBLE_HIGH (x),
13234 		     (unsigned long) CONST_DOUBLE_LOW (x));
13235 	  else
13236 	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13237 	}
13238       else
13239 	/* We can't handle floating point constants;
13240 	   TARGET_PRINT_OPERAND must handle them.  */
13241 	output_operand_lossage ("floating constant misused");
13242       break;
13243 
13244     case PLUS:
13245       /* Some assemblers need integer constants to appear first.  */
13246       if (CONST_INT_P (XEXP (x, 0)))
13247 	{
13248 	  output_pic_addr_const (file, XEXP (x, 0), code);
13249 	  putc ('+', file);
13250 	  output_pic_addr_const (file, XEXP (x, 1), code);
13251 	}
13252       else
13253 	{
13254 	  gcc_assert (CONST_INT_P (XEXP (x, 1)));
13255 	  output_pic_addr_const (file, XEXP (x, 1), code);
13256 	  putc ('+', file);
13257 	  output_pic_addr_const (file, XEXP (x, 0), code);
13258 	}
13259       break;
13260 
13261     case MINUS:
13262       if (!TARGET_MACHO)
13263 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13264       output_pic_addr_const (file, XEXP (x, 0), code);
13265       putc ('-', file);
13266       output_pic_addr_const (file, XEXP (x, 1), code);
13267       if (!TARGET_MACHO)
13268 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13269       break;
13270 
13271      case UNSPEC:
13272        if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13273 	 {
13274 	   bool f = i386_asm_output_addr_const_extra (file, x);
13275 	   gcc_assert (f);
13276 	   break;
13277 	 }
13278 
13279        gcc_assert (XVECLEN (x, 0) == 1);
13280        output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13281        switch (XINT (x, 1))
13282 	{
13283 	case UNSPEC_GOT:
13284 	  fputs ("@GOT", file);
13285 	  break;
13286 	case UNSPEC_GOTOFF:
13287 	  fputs ("@GOTOFF", file);
13288 	  break;
13289 	case UNSPEC_PLTOFF:
13290 	  fputs ("@PLTOFF", file);
13291 	  break;
13292 	case UNSPEC_PCREL:
13293 	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13294 		 "(%rip)" : "[rip]", file);
13295 	  break;
13296 	case UNSPEC_GOTPCREL:
13297 	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13298 		 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13299 	  break;
13300 	case UNSPEC_GOTTPOFF:
13301 	  /* FIXME: This might be @TPOFF in Sun ld too.  */
13302 	  fputs ("@gottpoff", file);
13303 	  break;
13304 	case UNSPEC_TPOFF:
13305 	  fputs ("@tpoff", file);
13306 	  break;
13307 	case UNSPEC_NTPOFF:
13308 	  if (TARGET_64BIT)
13309 	    fputs ("@tpoff", file);
13310 	  else
13311 	    fputs ("@ntpoff", file);
13312 	  break;
13313 	case UNSPEC_DTPOFF:
13314 	  fputs ("@dtpoff", file);
13315 	  break;
13316 	case UNSPEC_GOTNTPOFF:
13317 	  if (TARGET_64BIT)
13318 	    fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13319 		   "@gottpoff(%rip)": "@gottpoff[rip]", file);
13320 	  else
13321 	    fputs ("@gotntpoff", file);
13322 	  break;
13323 	case UNSPEC_INDNTPOFF:
13324 	  fputs ("@indntpoff", file);
13325 	  break;
13326 #if TARGET_MACHO
13327 	case UNSPEC_MACHOPIC_OFFSET:
13328 	  putc ('-', file);
13329 	  machopic_output_function_base_name (file);
13330 	  break;
13331 #endif
13332 	default:
13333 	  output_operand_lossage ("invalid UNSPEC as operand");
13334 	  break;
13335 	}
13336        break;
13337 
13338     default:
13339       output_operand_lossage ("invalid expression as operand");
13340     }
13341 }
13342 
13343 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13344    We need to emit DTP-relative relocations.  */
13345 
13346 static void ATTRIBUTE_UNUSED
13347 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13348 {
13349   fputs (ASM_LONG, file);
13350   output_addr_const (file, x);
13351   fputs ("@dtpoff", file);
13352   switch (size)
13353     {
13354     case 4:
13355       break;
13356     case 8:
13357       fputs (", 0", file);
13358       break;
13359     default:
13360       gcc_unreachable ();
13361    }
13362 }
13363 
13364 /* Return true if X is a representation of the PIC register.  This copes
13365    with calls from ix86_find_base_term, where the register might have
13366    been replaced by a cselib value.  */
13367 
13368 static bool
13369 ix86_pic_register_p (rtx x)
13370 {
13371   if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13372     return (pic_offset_table_rtx
13373 	    && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13374   else
13375     return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13376 }
13377 
13378 /* Helper function for ix86_delegitimize_address.
13379    Attempt to delegitimize TLS local-exec accesses.  */
13380 
13381 static rtx
13382 ix86_delegitimize_tls_address (rtx orig_x)
13383 {
13384   rtx x = orig_x, unspec;
13385   struct ix86_address addr;
13386 
13387   if (!TARGET_TLS_DIRECT_SEG_REFS)
13388     return orig_x;
13389   if (MEM_P (x))
13390     x = XEXP (x, 0);
13391   if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13392     return orig_x;
13393   if (ix86_decompose_address (x, &addr) == 0
13394       || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13395       || addr.disp == NULL_RTX
13396       || GET_CODE (addr.disp) != CONST)
13397     return orig_x;
13398   unspec = XEXP (addr.disp, 0);
13399   if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13400     unspec = XEXP (unspec, 0);
13401   if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13402     return orig_x;
13403   x = XVECEXP (unspec, 0, 0);
13404   gcc_assert (GET_CODE (x) == SYMBOL_REF);
13405   if (unspec != XEXP (addr.disp, 0))
13406     x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13407   if (addr.index)
13408     {
13409       rtx idx = addr.index;
13410       if (addr.scale != 1)
13411 	idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13412       x = gen_rtx_PLUS (Pmode, idx, x);
13413     }
13414   if (addr.base)
13415     x = gen_rtx_PLUS (Pmode, addr.base, x);
13416   if (MEM_P (orig_x))
13417     x = replace_equiv_address_nv (orig_x, x);
13418   return x;
13419 }
13420 
13421 /* In the name of slightly smaller debug output, and to cater to
13422    general assembler lossage, recognize PIC+GOTOFF and turn it back
13423    into a direct symbol reference.
13424 
13425    On Darwin, this is necessary to avoid a crash, because Darwin
13426    has a different PIC label for each routine but the DWARF debugging
13427    information is not associated with any particular routine, so it's
13428    necessary to remove references to the PIC label from RTL stored by
13429    the DWARF output code.  */
13430 
13431 static rtx
13432 ix86_delegitimize_address (rtx x)
13433 {
13434   rtx orig_x = delegitimize_mem_from_attrs (x);
13435   /* addend is NULL or some rtx if x is something+GOTOFF where
13436      something doesn't include the PIC register.  */
13437   rtx addend = NULL_RTX;
13438   /* reg_addend is NULL or a multiple of some register.  */
13439   rtx reg_addend = NULL_RTX;
13440   /* const_addend is NULL or a const_int.  */
13441   rtx const_addend = NULL_RTX;
13442   /* This is the result, or NULL.  */
13443   rtx result = NULL_RTX;
13444 
13445   x = orig_x;
13446 
13447   if (MEM_P (x))
13448     x = XEXP (x, 0);
13449 
13450   if (TARGET_64BIT)
13451     {
13452       if (GET_CODE (x) == CONST
13453           && GET_CODE (XEXP (x, 0)) == PLUS
13454           && GET_MODE (XEXP (x, 0)) == Pmode
13455           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13456           && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13457           && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13458         {
13459 	  rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13460 	  x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13461 	  if (MEM_P (orig_x))
13462 	    x = replace_equiv_address_nv (orig_x, x);
13463 	  return x;
13464 	}
13465       if (GET_CODE (x) != CONST
13466 	  || GET_CODE (XEXP (x, 0)) != UNSPEC
13467 	  || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13468 	      && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13469 	  || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13470 	return ix86_delegitimize_tls_address (orig_x);
13471       x = XVECEXP (XEXP (x, 0), 0, 0);
13472       if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13473 	{
13474 	  x = simplify_gen_subreg (GET_MODE (orig_x), x,
13475 				   GET_MODE (x), 0);
13476 	  if (x == NULL_RTX)
13477 	    return orig_x;
13478 	}
13479       return x;
13480     }
13481 
13482   if (GET_CODE (x) != PLUS
13483       || GET_CODE (XEXP (x, 1)) != CONST)
13484     return ix86_delegitimize_tls_address (orig_x);
13485 
13486   if (ix86_pic_register_p (XEXP (x, 0)))
13487     /* %ebx + GOT/GOTOFF */
13488     ;
13489   else if (GET_CODE (XEXP (x, 0)) == PLUS)
13490     {
13491       /* %ebx + %reg * scale + GOT/GOTOFF */
13492       reg_addend = XEXP (x, 0);
13493       if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13494 	reg_addend = XEXP (reg_addend, 1);
13495       else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13496 	reg_addend = XEXP (reg_addend, 0);
13497       else
13498 	{
13499 	  reg_addend = NULL_RTX;
13500 	  addend = XEXP (x, 0);
13501 	}
13502     }
13503   else
13504     addend = XEXP (x, 0);
13505 
13506   x = XEXP (XEXP (x, 1), 0);
13507   if (GET_CODE (x) == PLUS
13508       && CONST_INT_P (XEXP (x, 1)))
13509     {
13510       const_addend = XEXP (x, 1);
13511       x = XEXP (x, 0);
13512     }
13513 
13514   if (GET_CODE (x) == UNSPEC
13515       && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13516 	  || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13517     result = XVECEXP (x, 0, 0);
13518 
13519   if (TARGET_MACHO && darwin_local_data_pic (x)
13520       && !MEM_P (orig_x))
13521     result = XVECEXP (x, 0, 0);
13522 
13523   if (! result)
13524     return ix86_delegitimize_tls_address (orig_x);
13525 
13526   if (const_addend)
13527     result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13528   if (reg_addend)
13529     result = gen_rtx_PLUS (Pmode, reg_addend, result);
13530   if (addend)
13531     {
13532       /* If the rest of original X doesn't involve the PIC register, add
13533 	 addend and subtract pic_offset_table_rtx.  This can happen e.g.
13534 	 for code like:
13535 	 leal (%ebx, %ecx, 4), %ecx
13536 	 ...
13537 	 movl foo@GOTOFF(%ecx), %edx
13538 	 in which case we return (%ecx - %ebx) + foo.  */
13539       if (pic_offset_table_rtx)
13540         result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13541 						     pic_offset_table_rtx),
13542 			       result);
13543       else
13544 	return orig_x;
13545     }
13546   if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13547     {
13548       result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13549       if (result == NULL_RTX)
13550 	return orig_x;
13551     }
13552   return result;
13553 }
13554 
13555 /* If X is a machine specific address (i.e. a symbol or label being
13556    referenced as a displacement from the GOT implemented using an
13557    UNSPEC), then return the base term.  Otherwise return X.  */
13558 
13559 rtx
13560 ix86_find_base_term (rtx x)
13561 {
13562   rtx term;
13563 
13564   if (TARGET_64BIT)
13565     {
13566       if (GET_CODE (x) != CONST)
13567 	return x;
13568       term = XEXP (x, 0);
13569       if (GET_CODE (term) == PLUS
13570 	  && (CONST_INT_P (XEXP (term, 1))
13571 	      || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13572 	term = XEXP (term, 0);
13573       if (GET_CODE (term) != UNSPEC
13574 	  || (XINT (term, 1) != UNSPEC_GOTPCREL
13575 	      && XINT (term, 1) != UNSPEC_PCREL))
13576 	return x;
13577 
13578       return XVECEXP (term, 0, 0);
13579     }
13580 
13581   return ix86_delegitimize_address (x);
13582 }
13583 
13584 static void
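/* Print to FILE the one- or two-letter condition-code suffix (e.g. "e",
   "ne", "b", "ge") for comparison CODE in flags mode MODE.  If REVERSE is
   nonzero, print the suffix for the reversed condition.  FP selects the
   alternate spellings ("nbe", "nb", ...) used for fcmov on assemblers
   where the usual names lose.  */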
13585 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13586 		    int fp, FILE *file)
13587 {
13588   const char *suffix;
13589 
13590   if (mode == CCFPmode || mode == CCFPUmode)
13591     {
13592       code = ix86_fp_compare_code_to_integer (code);
13593       mode = CCmode;
13594     }
13595   if (reverse)
13596     code = reverse_condition (code);
13597 
13598   switch (code)
13599     {
13600     case EQ:
13601       switch (mode)
13602 	{
13603 	case CCAmode:
13604 	  suffix = "a";
13605 	  break;
13606 
13607 	case CCCmode:
13608 	  suffix = "c";
13609 	  break;
13610 
13611 	case CCOmode:
13612 	  suffix = "o";
13613 	  break;
13614 
13615 	case CCSmode:
13616 	  suffix = "s";
13617 	  break;
13618 
13619 	default:
13620 	  suffix = "e";
13621 	}
13622       break;
13623     case NE:
13624       switch (mode)
13625 	{
13626 	case CCAmode:
13627 	  suffix = "na";
13628 	  break;
13629 
13630 	case CCCmode:
13631 	  suffix = "nc";
13632 	  break;
13633 
13634 	case CCOmode:
13635 	  suffix = "no";
13636 	  break;
13637 
13638 	case CCSmode:
13639 	  suffix = "ns";
13640 	  break;
13641 
13642 	default:
13643 	  suffix = "ne";
13644 	}
13645       break;
13646     case GT:
13647       gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13648       suffix = "g";
13649       break;
13650     case GTU:
13651       /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13652 	 Those same assemblers have the same but opposite lossage on cmov.  */
13653       if (mode == CCmode)
13654 	suffix = fp ? "nbe" : "a";
13655       else if (mode == CCCmode)
13656 	suffix = "b";
13657       else
13658 	gcc_unreachable ();
13659       break;
13660     case LT:
13661       switch (mode)
13662 	{
13663 	case CCNOmode:
13664 	case CCGOCmode:
13665 	  suffix = "s";
13666 	  break;
13667 
13668 	case CCmode:
13669 	case CCGCmode:
13670 	  suffix = "l";
13671 	  break;
13672 
13673 	default:
13674 	  gcc_unreachable ();
13675 	}
13676       break;
13677     case LTU:
13678       gcc_assert (mode == CCmode || mode == CCCmode);
13679       suffix = "b";
13680       break;
13681     case GE:
13682       switch (mode)
13683 	{
13684 	case CCNOmode:
13685 	case CCGOCmode:
13686 	  suffix = "ns";
13687 	  break;
13688 
13689 	case CCmode:
13690 	case CCGCmode:
13691 	  suffix = "ge";
13692 	  break;
13693 
13694 	default:
13695 	  gcc_unreachable ();
13696 	}
13697       break;
13698     case GEU:
13699       /* ??? As above.  */
13700       gcc_assert (mode == CCmode || mode == CCCmode);
13701       suffix = fp ? "nb" : "ae";
13702       break;
13703     case LE:
13704       gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13705       suffix = "le";
13706       break;
13707     case LEU:
13708       /* ??? As above.  */
13709       if (mode == CCmode)
13710 	suffix = "be";
13711       else if (mode == CCCmode)
13712 	suffix = fp ? "nb" : "ae";
13713       else
13714 	gcc_unreachable ();
13715       break;
13716     case UNORDERED:
13717       suffix = fp ? "u" : "p";
13718       break;
13719     case ORDERED:
13720       suffix = fp ? "nu" : "np";
13721       break;
13722     default:
13723       gcc_unreachable ();
13724     }
13725   fputs (suffix, file);
13726 }
13727 
13728 /* Print the name of register X to FILE based on its machine mode and number.
13729    If CODE is 'w', pretend the mode is HImode.
13730    If CODE is 'b', pretend the mode is QImode.
13731    If CODE is 'k', pretend the mode is SImode.
13732    If CODE is 'q', pretend the mode is DImode.
13733    If CODE is 'x', pretend the mode is V4SFmode.
13734    If CODE is 't', pretend the mode is V8SFmode.
13735    If CODE is 'h', pretend the reg is the 'high' byte register.
13736    If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13737    If CODE is 'd', duplicate the operand for an AVX instruction.
13738  */
13739 
13740 void
13741 print_reg (rtx x, int code, FILE *file)
13742 {
13743   const char *reg;
13744   unsigned int regno;
13745   bool duplicated = code == 'd' && TARGET_AVX;
13746 
13747   if (ASSEMBLER_DIALECT == ASM_ATT)
13748     putc ('%', file);
13749 
13750   if (x == pc_rtx)
13751     {
13752       gcc_assert (TARGET_64BIT);
13753       fputs ("rip", file);
13754       return;
13755     }
13756 
13757   regno = true_regnum (x);
13758   gcc_assert (regno != ARG_POINTER_REGNUM
13759 	      && regno != FRAME_POINTER_REGNUM
13760 	      && regno != FLAGS_REG
13761 	      && regno != FPSR_REG
13762 	      && regno != FPCR_REG);
13763 
13764   if (code == 'w' || MMX_REG_P (x))
13765     code = 2;
13766   else if (code == 'b')
13767     code = 1;
13768   else if (code == 'k')
13769     code = 4;
13770   else if (code == 'q')
13771     code = 8;
13772   else if (code == 'y')
13773     code = 3;
13774   else if (code == 'h')
13775     code = 0;
13776   else if (code == 'x')
13777     code = 16;
13778   else if (code == 't')
13779     code = 32;
13780   else
13781     code = GET_MODE_SIZE (GET_MODE (x));
13782 
13783   /* Irritatingly, AMD extended registers use a different naming convention
13784      from the normal registers: "r%d[bwd]".  */
13785   if (REX_INT_REGNO_P (regno))
13786     {
13787       gcc_assert (TARGET_64BIT);
13788       putc ('r', file);
13789       fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13790       switch (code)
13791 	{
13792 	  case 0:
13793 	    error ("extended registers have no high halves");
13794 	    break;
13795 	  case 1:
13796 	    putc ('b', file);
13797 	    break;
13798 	  case 2:
13799 	    putc ('w', file);
13800 	    break;
13801 	  case 4:
13802 	    putc ('d', file);
13803 	    break;
13804 	  case 8:
13805 	    /* no suffix */
13806 	    break;
13807 	  default:
13808 	    error ("unsupported operand size for extended register");
13809 	    break;
13810 	}
13811       return;
13812     }
13813 
13814   reg = NULL;
13815   switch (code)
13816     {
13817     case 3:
13818       if (STACK_TOP_P (x))
13819 	{
13820 	  reg = "st(0)";
13821 	  break;
13822 	}
13823       /* FALLTHRU */
13824     case 8:
13825     case 4:
13826     case 12:
13827       if (! ANY_FP_REG_P (x))
13828 	putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13829       /* FALLTHRU */
13830     case 16:
13831     case 2:
13832     normal:
13833       reg = hi_reg_name[regno];
13834       break;
13835     case 1:
13836       if (regno >= ARRAY_SIZE (qi_reg_name))
13837 	goto normal;
13838       reg = qi_reg_name[regno];
13839       break;
13840     case 0:
13841       if (regno >= ARRAY_SIZE (qi_high_reg_name))
13842 	goto normal;
13843       reg = qi_high_reg_name[regno];
13844       break;
13845     case 32:
13846       if (SSE_REG_P (x))
13847 	{
13848 	  gcc_assert (!duplicated);
13849 	  putc ('y', file);
13850 	  fputs (hi_reg_name[regno] + 1, file);
13851 	  return;
13852 	}
13853       break;
13854     default:
13855       gcc_unreachable ();
13856     }
13857 
13858   fputs (reg, file);
13859   if (duplicated)
13860     {
13861       if (ASSEMBLER_DIALECT == ASM_ATT)
13862 	fprintf (file, ", %%%s", reg);
13863       else
13864 	fprintf (file, ", %s", reg);
13865     }
13866 }
13867 
13868 /* Locate some local-dynamic symbol still in use by this function
13869    so that we can print its name in some tls_local_dynamic_base
13870    pattern.  */
13871 
13872 static int
13873 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13874 {
13875   rtx x = *px;
13876 
13877   if (GET_CODE (x) == SYMBOL_REF
13878       && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13879     {
13880       cfun->machine->some_ld_name = XSTR (x, 0);
13881       return 1;
13882     }
13883 
13884   return 0;
13885 }
13886 
13887 static const char *
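/* Return the name of some local-dynamic TLS symbol used in the current
   function, caching the result in cfun->machine->some_ld_name, or NULL
   if the function contains no such reference.  */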
13888 get_some_local_dynamic_name (void)
13889 {
13890   rtx insn;
13891 
13892   if (cfun->machine->some_ld_name)
13893     return cfun->machine->some_ld_name;
13894 
13895   for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13896     if (NONDEBUG_INSN_P (insn)
13897 	&& for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13898       return cfun->machine->some_ld_name;
13899 
13900   return NULL;
13901 }
13902 
13903 /* Meaning of CODE:
13904    L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13905    C -- print opcode suffix for set/cmov insn.
13906    c -- like C, but print reversed condition
13907    F,f -- likewise, but for floating-point.
13908    O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13909         otherwise nothing
13910    R -- print the prefix for register names.
13911    z -- print the opcode suffix for the size of the current operand.
13912    Z -- likewise, with special suffixes for x87 instructions.
13913    * -- print a star (in certain assembler syntax)
13914    A -- print an absolute memory reference.
13915    E -- print address with DImode register names if TARGET_64BIT.
13916    w -- print the operand as if it's a "word" (HImode) even if it isn't.
13917    s -- print a shift double count, followed by the assembler's argument
13918 	delimiter.
13919    b -- print the QImode name of the register for the indicated operand.
13920 	%b0 would print %al if operands[0] is reg 0.
13921    w --  likewise, print the HImode name of the register.
13922    k --  likewise, print the SImode name of the register.
13923    q --  likewise, print the DImode name of the register.
13924    x --  likewise, print the V4SFmode name of the register.
13925    t --  likewise, print the V8SFmode name of the register.
13926    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13927    y -- print "st(0)" instead of "st" as a register.
13928    d -- print duplicated register operand for an AVX instruction.
13929    D -- print condition for SSE cmp instruction.
13930    P -- if PIC, print an @PLT suffix.
13931    p -- print raw symbol name.
13932    X -- don't print any sort of PIC '@' suffix for a symbol.
13933    & -- print some in-use local-dynamic symbol name.
13934    H -- print a memory address offset by 8; used for sse high-parts
13935    Y -- print condition for XOP pcom* instruction.
13936    + -- print a branch hint as 'cs' or 'ds' prefix
13937    ; -- print a semicolon (after prefixes, due to a bug in older gas).
13938    ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13939    @ -- print a segment register of thread base pointer load
13940  */
13941 
13942 void
13943 ix86_print_operand (FILE *file, rtx x, int code)
13944 {
13945   if (code)
13946     {
13947       switch (code)
13948 	{
13949 	case '*':
13950 	  if (ASSEMBLER_DIALECT == ASM_ATT)
13951 	    putc ('*', file);
13952 	  return;
13953 
13954 	case '&':
13955 	  {
13956 	    const char *name = get_some_local_dynamic_name ();
13957 	    if (name == NULL)
13958 	      output_operand_lossage ("'%%&' used without any "
13959 				      "local dynamic TLS references");
13960 	    else
13961 	      assemble_name (file, name);
13962 	    return;
13963 	  }
13964 
13965 	case 'A':
13966 	  switch (ASSEMBLER_DIALECT)
13967 	    {
13968 	    case ASM_ATT:
13969 	      putc ('*', file);
13970 	      break;
13971 
13972 	    case ASM_INTEL:
13973 	      /* Intel syntax.  For absolute addresses, registers should not
13974 		 be surrounded by brackets.  */
13975 	      if (!REG_P (x))
13976 		{
13977 		  putc ('[', file);
13978 		  ix86_print_operand (file, x, 0);
13979 		  putc (']', file);
13980 		  return;
13981 		}
13982 	      break;
13983 
13984 	    default:
13985 	      gcc_unreachable ();
13986 	    }
13987 
13988 	  ix86_print_operand (file, x, 0);
13989 	  return;
13990 
13991 	case 'E':
13992 	  /* Wrap address in an UNSPEC to declare special handling.  */
13993 	  if (TARGET_64BIT)
13994 	    x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13995 
13996 	  output_address (x);
13997 	  return;
13998 
13999 	case 'L':
14000 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14001 	    putc ('l', file);
14002 	  return;
14003 
14004 	case 'W':
14005 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14006 	    putc ('w', file);
14007 	  return;
14008 
14009 	case 'B':
14010 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14011 	    putc ('b', file);
14012 	  return;
14013 
14014 	case 'Q':
14015 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14016 	    putc ('l', file);
14017 	  return;
14018 
14019 	case 'S':
14020 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14021 	    putc ('s', file);
14022 	  return;
14023 
14024 	case 'T':
14025 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14026 	    putc ('t', file);
14027 	  return;
14028 
14029 	case 'z':
14030 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14031 	    {
14032 	      /* Opcodes don't get size suffixes if using Intel syntax.  */
14033 	      if (ASSEMBLER_DIALECT == ASM_INTEL)
14034 		return;
14035 
14036 	      switch (GET_MODE_SIZE (GET_MODE (x)))
14037 		{
14038 		case 1:
14039 		  putc ('b', file);
14040 		  return;
14041 
14042 		case 2:
14043 		  putc ('w', file);
14044 		  return;
14045 
14046 		case 4:
14047 		  putc ('l', file);
14048 		  return;
14049 
14050 		case 8:
14051 		  putc ('q', file);
14052 		  return;
14053 
14054 		default:
14055 		  output_operand_lossage
14056 		    ("invalid operand size for operand code '%c'", code);
14057 		  return;
14058 		}
14059 	    }
14060 
14061 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14062 	    warning
14063 	      (0, "non-integer operand used with operand code '%c'", code);
14064 	  /* FALLTHRU */
14065 
14066 	case 'Z':
14067 	  /* 387 opcodes don't get size suffixes if using Intel syntax.  */
14068 	  if (ASSEMBLER_DIALECT == ASM_INTEL)
14069 	    return;
14070 
14071 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14072 	    {
14073 	      switch (GET_MODE_SIZE (GET_MODE (x)))
14074 		{
14075 		case 2:
14076 #ifdef HAVE_AS_IX86_FILDS
14077 		  putc ('s', file);
14078 #endif
14079 		  return;
14080 
14081 		case 4:
14082 		  putc ('l', file);
14083 		  return;
14084 
14085 		case 8:
14086 #ifdef HAVE_AS_IX86_FILDQ
14087 		  putc ('q', file);
14088 #else
14089 		  fputs ("ll", file);
14090 #endif
14091 		  return;
14092 
14093 		default:
14094 		  break;
14095 		}
14096 	    }
14097 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14098 	    {
14099 	      /* 387 opcodes don't get size suffixes
14100 		 if the operands are registers.  */
14101 	      if (STACK_REG_P (x))
14102 		return;
14103 
14104 	      switch (GET_MODE_SIZE (GET_MODE (x)))
14105 		{
14106 		case 4:
14107 		  putc ('s', file);
14108 		  return;
14109 
14110 		case 8:
14111 		  putc ('l', file);
14112 		  return;
14113 
14114 		case 12:
14115 		case 16:
14116 		  putc ('t', file);
14117 		  return;
14118 
14119 		default:
14120 		  break;
14121 		}
14122 	    }
14123 	  else
14124 	    {
14125 	      output_operand_lossage
14126 		("invalid operand type used with operand code '%c'", code);
14127 	      return;
14128 	    }
14129 
14130 	  output_operand_lossage
14131 	    ("invalid operand size for operand code '%c'", code);
14132 	  return;
14133 
14134 	case 'd':
14135 	case 'b':
14136 	case 'w':
14137 	case 'k':
14138 	case 'q':
14139 	case 'h':
14140 	case 't':
14141 	case 'y':
14142 	case 'x':
14143 	case 'X':
14144 	case 'P':
14145 	case 'p':
14146 	  break;
14147 
14148 	case 's':
14149 	  if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14150 	    {
14151 	      ix86_print_operand (file, x, 0);
14152 	      fputs (", ", file);
14153 	    }
14154 	  return;
14155 
14156 	case 'D':
14157 	  /* A little bit of braindamage here.  The SSE compare instructions
14158 	     use completely different names for the comparisons than the fp
14159 	     conditional moves do.  */
14160 	  if (TARGET_AVX)
14161 	    {
14162 	      switch (GET_CODE (x))
14163 		{
14164 		case EQ:
14165 		  fputs ("eq", file);
14166 		  break;
14167 		case UNEQ:
14168 		  fputs ("eq_us", file);
14169 		  break;
14170 		case LT:
14171 		  fputs ("lt", file);
14172 		  break;
14173 		case UNLT:
14174 		  fputs ("nge", file);
14175 		  break;
14176 		case LE:
14177 		  fputs ("le", file);
14178 		  break;
14179 		case UNLE:
14180 		  fputs ("ngt", file);
14181 		  break;
14182 		case UNORDERED:
14183 		  fputs ("unord", file);
14184 		  break;
14185 		case NE:
14186 		  fputs ("neq", file);
14187 		  break;
14188 		case LTGT:
14189 		  fputs ("neq_oq", file);
14190 		  break;
14191 		case GE:
14192 		  fputs ("ge", file);
14193 		  break;
14194 		case UNGE:
14195 		  fputs ("nlt", file);
14196 		  break;
14197 		case GT:
14198 		  fputs ("gt", file);
14199 		  break;
14200 		case UNGT:
14201 		  fputs ("nle", file);
14202 		  break;
14203 		case ORDERED:
14204 		  fputs ("ord", file);
14205 		  break;
14206 		default:
14207 		  output_operand_lossage ("operand is not a condition code, "
14208 					  "invalid operand code 'D'");
14209 		  return;
14210 		}
14211 	    }
14212 	  else
14213 	    {
14214 	      switch (GET_CODE (x))
14215 		{
14216 		case EQ:
14217 		case UNEQ:
14218 		  fputs ("eq", file);
14219 		  break;
14220 		case LT:
14221 		case UNLT:
14222 		  fputs ("lt", file);
14223 		  break;
14224 		case LE:
14225 		case UNLE:
14226 		  fputs ("le", file);
14227 		  break;
14228 		case UNORDERED:
14229 		  fputs ("unord", file);
14230 		  break;
14231 		case NE:
14232 		case LTGT:
14233 		  fputs ("neq", file);
14234 		  break;
14235 		case UNGE:
14236 		case GE:
14237 		  fputs ("nlt", file);
14238 		  break;
14239 		case UNGT:
14240 		case GT:
14241 		  fputs ("nle", file);
14242 		  break;
14243 		case ORDERED:
14244 		  fputs ("ord", file);
14245 		  break;
14246 		default:
14247 		  output_operand_lossage ("operand is not a condition code, "
14248 					  "invalid operand code 'D'");
14249 		  return;
14250 		}
14251 	    }
14252 	  return;
14253 	case 'O':
14254 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14255 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14256 	    {
14257 	      switch (GET_MODE (x))
14258 		{
14259 		case HImode: putc ('w', file); break;
14260 		case SImode:
14261 		case SFmode: putc ('l', file); break;
14262 		case DImode:
14263 		case DFmode: putc ('q', file); break;
14264 		default: gcc_unreachable ();
14265 		}
14266 	      putc ('.', file);
14267 	    }
14268 #endif
14269 	  return;
14270 	case 'C':
14271 	  if (!COMPARISON_P (x))
14272 	    {
14273 	      output_operand_lossage ("operand is neither a constant nor a "
14274 				      "condition code, invalid operand code "
14275 				      "'C'");
14276 	      return;
14277 	    }
14278 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14279 	  return;
14280 	case 'F':
14281 	  if (!COMPARISON_P (x))
14282 	    {
14283 	      output_operand_lossage ("operand is neither a constant nor a "
14284 				      "condition code, invalid operand code "
14285 				      "'F'");
14286 	      return;
14287 	    }
14288 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14289 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14290 	    putc ('.', file);
14291 #endif
14292 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14293 	  return;
14294 
14295 	  /* Like above, but reverse condition */
14296 	case 'c':
14297 	  /* Check to see if argument to %c is really a constant
14298 	     and not a condition code which needs to be reversed.  */
14299 	  if (!COMPARISON_P (x))
14300 	    {
14301 	      output_operand_lossage ("operand is neither a constant nor a "
14302 				      "condition code, invalid operand "
14303 				      "code 'c'");
14304 	      return;
14305 	    }
14306 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14307 	  return;
14308 	case 'f':
14309 	  if (!COMPARISON_P (x))
14310 	    {
14311 	      output_operand_lossage ("operand is neither a constant nor a "
14312 				      "condition code, invalid operand "
14313 				      "code 'f'");
14314 	      return;
14315 	    }
14316 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14317 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14318 	    putc ('.', file);
14319 #endif
14320 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14321 	  return;
14322 
14323 	case 'H':
14324 	  if (!offsettable_memref_p (x))
14325 	    {
14326 	      output_operand_lossage ("operand is not an offsettable memory "
14327 				      "reference, invalid operand "
14328 				      "code 'H'");
14329 	      return;
14330 	    }
14331 	  /* It doesn't actually matter what mode we use here, as we're
14332 	     only going to use this for printing.  */
14333 	  x = adjust_address_nv (x, DImode, 8);
14334 	  break;
14335 
14336 	case '+':
14337 	  {
14338 	    rtx x;
14339 
14340 	    if (!optimize
14341 	        || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
14342 	      return;
14343 
14344 	    x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14345 	    if (x)
14346 	      {
14347 		int pred_val = INTVAL (XEXP (x, 0));
14348 
14349 		if (pred_val < REG_BR_PROB_BASE * 45 / 100
14350 		    || pred_val > REG_BR_PROB_BASE * 55 / 100)
14351 		  {
14352 		    int taken = pred_val > REG_BR_PROB_BASE / 2;
14353 		    int cputaken = final_forward_branch_p (current_output_insn) == 0;
14354 
14355 		    /* Emit hints only when the default branch prediction
14356 		       heuristics would fail.  */
14357 		    if (taken != cputaken)
14358 		      {
14359 			/* We use the 3e (DS) prefix for taken branches and
14360 			   the 2e (CS) prefix for not-taken branches.  */
14361 			if (taken)
14362 			  fputs ("ds ; ", file);
14363 			else
14364 			  fputs ("cs ; ", file);
14365 		      }
14366 		  }
14367 	      }
14368 	    return;
14369 	  }
14370 
14371 	case 'Y':
14372 	  switch (GET_CODE (x))
14373 	    {
14374 	    case NE:
14375 	      fputs ("neq", file);
14376 	      break;
14377 	    case EQ:
14378 	      fputs ("eq", file);
14379 	      break;
14380 	    case GE:
14381 	    case GEU:
14382 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14383 	      break;
14384 	    case GT:
14385 	    case GTU:
14386 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14387 	      break;
14388 	    case LE:
14389 	    case LEU:
14390 	      fputs ("le", file);
14391 	      break;
14392 	    case LT:
14393 	    case LTU:
14394 	      fputs ("lt", file);
14395 	      break;
14396 	    case UNORDERED:
14397 	      fputs ("unord", file);
14398 	      break;
14399 	    case ORDERED:
14400 	      fputs ("ord", file);
14401 	      break;
14402 	    case UNEQ:
14403 	      fputs ("ueq", file);
14404 	      break;
14405 	    case UNGE:
14406 	      fputs ("nlt", file);
14407 	      break;
14408 	    case UNGT:
14409 	      fputs ("nle", file);
14410 	      break;
14411 	    case UNLE:
14412 	      fputs ("ule", file);
14413 	      break;
14414 	    case UNLT:
14415 	      fputs ("ult", file);
14416 	      break;
14417 	    case LTGT:
14418 	      fputs ("une", file);
14419 	      break;
14420 	    default:
14421 	      output_operand_lossage ("operand is not a condition code, "
14422 				      "invalid operand code 'Y'");
14423 	      return;
14424 	    }
14425 	  return;
14426 
14427 	case ';':
14428 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14429 	  putc (';', file);
14430 #endif
14431 	  return;
14432 
14433 	case '@':
14434 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14435 	    putc ('%', file);
14436 
14437 	  /* The kernel uses a different segment register for performance
14438 	     reasons; a system call would not have to trash the userspace
14439 	     segment register, which would be expensive.  */
14440 	  if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14441 	    fputs ("fs", file);
14442 	  else
14443 	    fputs ("gs", file);
14444 	  return;
14445 
14446 	case '~':
14447 	  putc (TARGET_AVX2 ? 'i' : 'f', file);
14448 	  return;
14449 
14450 	default:
14451 	    output_operand_lossage ("invalid operand code '%c'", code);
14452 	}
14453     }
14454 
14455   if (REG_P (x))
14456     print_reg (x, code, file);
14457 
14458   else if (MEM_P (x))
14459     {
14460       /* No `byte ptr' prefix for call instructions or BLKmode operands.  */
14461       if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14462 	  && GET_MODE (x) != BLKmode)
14463 	{
14464 	  const char * size;
14465 	  switch (GET_MODE_SIZE (GET_MODE (x)))
14466 	    {
14467 	    case 1: size = "BYTE"; break;
14468 	    case 2: size = "WORD"; break;
14469 	    case 4: size = "DWORD"; break;
14470 	    case 8: size = "QWORD"; break;
14471 	    case 12: size = "TBYTE"; break;
14472 	    case 16:
14473 	      if (GET_MODE (x) == XFmode)
14474 		size = "TBYTE";
14475               else
14476 		size = "XMMWORD";
14477               break;
14478 	    case 32: size = "YMMWORD"; break;
14479 	    default:
14480 	      gcc_unreachable ();
14481 	    }
14482 
14483 	  /* Check for explicit size override (codes 'b', 'w', 'k',
14484 	     'q' and 'x')  */
14485 	  if (code == 'b')
14486 	    size = "BYTE";
14487 	  else if (code == 'w')
14488 	    size = "WORD";
14489 	  else if (code == 'k')
14490 	    size = "DWORD";
14491 	  else if (code == 'q')
14492 	    size = "QWORD";
14493 	  else if (code == 'x')
14494 	    size = "XMMWORD";
14495 
14496 	  fputs (size, file);
14497 	  fputs (" PTR ", file);
14498 	}
14499 
14500       x = XEXP (x, 0);
14501       /* Avoid (%rip) for call operands.  */
14502       if (CONSTANT_ADDRESS_P (x) && code == 'P'
14503 	  && !CONST_INT_P (x))
14504 	output_addr_const (file, x);
14505       else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14506 	output_operand_lossage ("invalid constraints for operand");
14507       else
14508 	output_address (x);
14509     }
14510 
14511   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14512     {
14513       REAL_VALUE_TYPE r;
14514       long l;
14515 
14516       REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14517       REAL_VALUE_TO_TARGET_SINGLE (r, l);
14518 
14519       if (ASSEMBLER_DIALECT == ASM_ATT)
14520 	putc ('$', file);
14521       /* Sign extend 32bit SFmode immediate to 8 bytes.  */
14522       if (code == 'q')
14523 	fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14524 		 (unsigned long long) (int) l);
14525       else
14526 	fprintf (file, "0x%08x", (unsigned int) l);
14527     }
14528 
14529   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14530     {
14531       REAL_VALUE_TYPE r;
14532       long l[2];
14533 
14534       REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14535       REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14536 
14537       if (ASSEMBLER_DIALECT == ASM_ATT)
14538 	putc ('$', file);
14539       fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14540     }
14541 
14542   /* These float cases don't actually occur as immediate operands.  */
14543   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14544     {
14545       char dstr[30];
14546 
14547       real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14548       fputs (dstr, file);
14549     }
14550 
14551   else
14552     {
14553       /* We have patterns that allow zero sets of memory, for instance.
14554 	 In 64-bit mode, we should probably support all 8-byte vectors,
14555 	 since we can in fact encode that into an immediate.  */
14556       if (GET_CODE (x) == CONST_VECTOR)
14557 	{
14558 	  gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14559 	  x = const0_rtx;
14560 	}
14561 
14562       if (code != 'P' && code != 'p')
14563 	{
14564 	  if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14565 	    {
14566 	      if (ASSEMBLER_DIALECT == ASM_ATT)
14567 		putc ('$', file);
14568 	    }
14569 	  else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14570 		   || GET_CODE (x) == LABEL_REF)
14571 	    {
14572 	      if (ASSEMBLER_DIALECT == ASM_ATT)
14573 		putc ('$', file);
14574 	      else
14575 		fputs ("OFFSET FLAT:", file);
14576 	    }
14577 	}
14578       if (CONST_INT_P (x))
14579 	fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14580       else if (flag_pic || MACHOPIC_INDIRECT)
14581 	output_pic_addr_const (file, x, code);
14582       else
14583 	output_addr_const (file, x);
14584     }
14585 }
14586 
14587 static bool
14588 ix86_print_operand_punct_valid_p (unsigned char code)
14589 {
14590   return (code == '@' || code == '*' || code == '+'
14591 	  || code == '&' || code == ';' || code == '~');
14592 }
14593 
14594 /* Print a memory operand whose address is ADDR.  */
14595 
14596 static void
14597 ix86_print_operand_address (FILE *file, rtx addr)
14598 {
14599   struct ix86_address parts;
14600   rtx base, index, disp;
14601   int scale;
14602   int ok;
14603   bool vsib = false;
14604   int code = 0;
14605 
14606   if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14607     {
14608       ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14609       gcc_assert (parts.index == NULL_RTX);
14610       parts.index = XVECEXP (addr, 0, 1);
14611       parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14612       addr = XVECEXP (addr, 0, 0);
14613       vsib = true;
14614     }
14615   else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14616     {
14617       gcc_assert (TARGET_64BIT);
14618       ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14619       code = 'q';
14620     }
14621   else
14622     ok = ix86_decompose_address (addr, &parts);
14623 
14624   gcc_assert (ok);
14625 
14626   base = parts.base;
14627   index = parts.index;
14628   disp = parts.disp;
14629   scale = parts.scale;
14630 
14631   switch (parts.seg)
14632     {
14633     case SEG_DEFAULT:
14634       break;
14635     case SEG_FS:
14636     case SEG_GS:
14637       if (ASSEMBLER_DIALECT == ASM_ATT)
14638 	putc ('%', file);
14639       fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14640       break;
14641     default:
14642       gcc_unreachable ();
14643     }
14644 
14645   /* Use one byte shorter RIP relative addressing for 64bit mode.  */
14646   if (TARGET_64BIT && !base && !index)
14647     {
14648       rtx symbol = disp;
14649 
14650       if (GET_CODE (disp) == CONST
14651 	  && GET_CODE (XEXP (disp, 0)) == PLUS
14652 	  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14653 	symbol = XEXP (XEXP (disp, 0), 0);
14654 
14655       if (GET_CODE (symbol) == LABEL_REF
14656 	  || (GET_CODE (symbol) == SYMBOL_REF
14657 	      && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14658 	base = pc_rtx;
14659     }
14660   if (!base && !index)
14661     {
14662 	      /* A displacement-only address requires special attention.  */
14663 
14664       if (CONST_INT_P (disp))
14665 	{
14666 	  if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14667 	    fputs ("ds:", file);
14668 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14669 	}
14670       else if (flag_pic)
14671 	output_pic_addr_const (file, disp, 0);
14672       else
14673 	output_addr_const (file, disp);
14674     }
14675   else
14676     {
14677       /* Print SImode register names to force addr32 prefix.  */
14678       if (SImode_address_operand (addr, VOIDmode))
14679 	{
14680 #ifdef ENABLE_CHECKING
14681 	  gcc_assert (TARGET_64BIT);
14682 	  switch (GET_CODE (addr))
14683 	    {
14684 	    case SUBREG:
14685 	      gcc_assert (GET_MODE (addr) == SImode);
14686 	      gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14687 	      break;
14688 	    case ZERO_EXTEND:
14689 	    case AND:
14690 	      gcc_assert (GET_MODE (addr) == DImode);
14691 	      break;
14692 	    default:
14693 	      gcc_unreachable ();
14694 	    }
14695 #endif
14696 	  gcc_assert (!code);
14697 	  code = 'k';
14698 	}
14699       else if (code == 0
14700 	       && TARGET_X32
14701 	       && disp
14702 	       && CONST_INT_P (disp)
14703 	       && INTVAL (disp) < -16*1024*1024)
14704 	{
14705 	  /* X32 runs in 64-bit mode, where displacement, DISP, in
14706 	     address DISP(%r64), is encoded as 32-bit immediate sign-
14707 	     extended from 32-bit to 64-bit.  For -0x40000300(%r64),
14708 	     address is %r64 + 0xffffffffbffffd00.  When %r64 <
14709 	     0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14710 	     which is invalid for x32.  The correct address is %r64
14711 	     - 0x40000300 == 0xf7ffdd64.  To properly encode
14712 	     -0x40000300(%r64) for x32, we zero-extend negative
14713 	     displacement by forcing addr32 prefix which truncates
14714 	     0xfffffffff7ffdd64 to 0xf7ffdd64.  In theory, we should
14715 	     zero-extend all negative displacements, including -1(%rsp).
14716 	     However, for small negative displacements, sign-extension
14717 	     won't cause overflow.  We only zero-extend negative
14718 	     displacements if they are less than -16*1024*1024, which is also used
14719 	     to check legitimate address displacements for PIC.  */
14720 	  code = 'k';
14721 	}
14722 
14723       if (ASSEMBLER_DIALECT == ASM_ATT)
14724 	{
14725 	  if (disp)
14726 	    {
14727 	      if (flag_pic)
14728 		output_pic_addr_const (file, disp, 0);
14729 	      else if (GET_CODE (disp) == LABEL_REF)
14730 		output_asm_label (disp);
14731 	      else
14732 		output_addr_const (file, disp);
14733 	    }
14734 
14735 	  putc ('(', file);
14736 	  if (base)
14737 	    print_reg (base, code, file);
14738 	  if (index)
14739 	    {
14740 	      putc (',', file);
14741 	      print_reg (index, vsib ? 0 : code, file);
14742 	      if (scale != 1 || vsib)
14743 		fprintf (file, ",%d", scale);
14744 	    }
14745 	  putc (')', file);
14746 	}
14747       else
14748 	{
14749 	  rtx offset = NULL_RTX;
14750 
14751 	  if (disp)
14752 	    {
14753 	      /* Pull out the offset of a symbol; print any symbol itself.  */
14754 	      if (GET_CODE (disp) == CONST
14755 		  && GET_CODE (XEXP (disp, 0)) == PLUS
14756 		  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14757 		{
14758 		  offset = XEXP (XEXP (disp, 0), 1);
14759 		  disp = gen_rtx_CONST (VOIDmode,
14760 					XEXP (XEXP (disp, 0), 0));
14761 		}
14762 
14763 	      if (flag_pic)
14764 		output_pic_addr_const (file, disp, 0);
14765 	      else if (GET_CODE (disp) == LABEL_REF)
14766 		output_asm_label (disp);
14767 	      else if (CONST_INT_P (disp))
14768 		offset = disp;
14769 	      else
14770 		output_addr_const (file, disp);
14771 	    }
14772 
14773 	  putc ('[', file);
14774 	  if (base)
14775 	    {
14776 	      print_reg (base, code, file);
14777 	      if (offset)
14778 		{
14779 		  if (INTVAL (offset) >= 0)
14780 		    putc ('+', file);
14781 		  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14782 		}
14783 	    }
14784 	  else if (offset)
14785 	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14786 	  else
14787 	    putc ('0', file);
14788 
14789 	  if (index)
14790 	    {
14791 	      putc ('+', file);
14792 	      print_reg (index, vsib ? 0 : code, file);
14793 	      if (scale != 1 || vsib)
14794 		fprintf (file, "*%d", scale);
14795 	    }
14796 	  putc (']', file);
14797 	}
14798     }
14799 }
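/* Illustrative example (not part of the original source): for an address
   decomposed as base %rbp, index %rax, scale 4 and displacement -8, the
   function above prints "-8(%rbp,%rax,4)" under ASM_ATT and
   "[rbp-8+rax*4]" under ASM_INTEL; segment overrides and the addr32
   'k' register-size override are prepended/applied as described above.  */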
14800 
14801 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
14802 
14803 static bool
14804 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14805 {
14806   rtx op;
14807 
14808   if (GET_CODE (x) != UNSPEC)
14809     return false;
14810 
14811   op = XVECEXP (x, 0, 0);
14812   switch (XINT (x, 1))
14813     {
14814     case UNSPEC_GOTTPOFF:
14815       output_addr_const (file, op);
14816       /* FIXME: This might be @TPOFF in Sun ld.  */
14817       fputs ("@gottpoff", file);
14818       break;
14819     case UNSPEC_TPOFF:
14820       output_addr_const (file, op);
14821       fputs ("@tpoff", file);
14822       break;
14823     case UNSPEC_NTPOFF:
14824       output_addr_const (file, op);
14825       if (TARGET_64BIT)
14826 	fputs ("@tpoff", file);
14827       else
14828 	fputs ("@ntpoff", file);
14829       break;
14830     case UNSPEC_DTPOFF:
14831       output_addr_const (file, op);
14832       fputs ("@dtpoff", file);
14833       break;
14834     case UNSPEC_GOTNTPOFF:
14835       output_addr_const (file, op);
14836       if (TARGET_64BIT)
14837 	fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14838 	       "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14839       else
14840 	fputs ("@gotntpoff", file);
14841       break;
14842     case UNSPEC_INDNTPOFF:
14843       output_addr_const (file, op);
14844       fputs ("@indntpoff", file);
14845       break;
14846 #if TARGET_MACHO
14847     case UNSPEC_MACHOPIC_OFFSET:
14848       output_addr_const (file, op);
14849       putc ('-', file);
14850       machopic_output_function_base_name (file);
14851       break;
14852 #endif
14853 
14854     case UNSPEC_STACK_CHECK:
14855       {
14856 	int offset;
14857 
14858 	gcc_assert (flag_split_stack);
14859 
14860 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14861 	offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14862 #else
14863 	gcc_unreachable ();
14864 #endif
14865 
14866 	fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14867       }
14868       break;
14869 
14870     default:
14871       return false;
14872     }
14873 
14874   return true;
14875 }
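/* Illustrative example (not part of the original source): for
   (unspec [(symbol_ref "x")] UNSPEC_NTPOFF) this hook prints "x@ntpoff"
   on 32-bit targets and "x@tpoff" on 64-bit targets, matching the TLS
   relocation suffixes understood by the GNU assembler.  */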
14876 
14877 /* Split one or more double-mode RTL references into pairs of half-mode
14878    references.  The RTL can be REG, offsettable MEM, integer constant, or
14879    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
14880    split and "num" is its length.  lo_half and hi_half are output arrays
14881    that parallel "operands".  */
14882 
14883 void
14884 split_double_mode (enum machine_mode mode, rtx operands[],
14885 		   int num, rtx lo_half[], rtx hi_half[])
14886 {
14887   enum machine_mode half_mode;
14888   unsigned int byte;
14889 
14890   switch (mode)
14891     {
14892     case TImode:
14893       half_mode = DImode;
14894       break;
14895     case DImode:
14896       half_mode = SImode;
14897       break;
14898     default:
14899       gcc_unreachable ();
14900     }
14901 
14902   byte = GET_MODE_SIZE (half_mode);
14903 
14904   while (num--)
14905     {
14906       rtx op = operands[num];
14907 
14908 	      /* simplify_subreg refuses to split volatile memory addresses,
14909          but we still have to handle them.  */
14910       if (MEM_P (op))
14911 	{
14912 	  lo_half[num] = adjust_address (op, half_mode, 0);
14913 	  hi_half[num] = adjust_address (op, half_mode, byte);
14914 	}
14915       else
14916 	{
14917 	  lo_half[num] = simplify_gen_subreg (half_mode, op,
14918 					      GET_MODE (op) == VOIDmode
14919 					      ? mode : GET_MODE (op), 0);
14920 	  hi_half[num] = simplify_gen_subreg (half_mode, op,
14921 					      GET_MODE (op) == VOIDmode
14922 					      ? mode : GET_MODE (op), byte);
14923 	}
14924     }
14925 }
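/* Illustrative example (not part of the original source): on a 32-bit
   target, splitting the DImode memory operand (mem:DI (reg:SI %eax))
   yields lo_half = (mem:SI (reg:SI %eax)) and
   hi_half = (mem:SI (plus:SI (reg:SI %eax) (const_int 4))),
   i.e. two SImode words at offsets 0 and 4.  */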
14926 
14927 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14928    MINUS, MULT or DIV.  OPERANDS are the insn operands, where operands[3]
14929    is the expression of the binary operation.  The output may either be
14930    emitted here, or returned to the caller, like all output_* functions.
14931 
14932    There is no guarantee that the operands are the same mode, as they
14933    might be within FLOAT or FLOAT_EXTEND expressions.  */
14934 
14935 #ifndef SYSV386_COMPAT
14936 /* Set to 1 for compatibility with brain-damaged assemblers.  No-one
14937    wants to fix the assemblers because that causes incompatibility
14938    with gcc.  No-one wants to fix gcc because that causes
14939    incompatibility with assemblers...  You can use the option of
14940    -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way.  */
14941 #define SYSV386_COMPAT 1
14942 #endif
14943 
14944 const char *
14945 output_387_binary_op (rtx insn, rtx *operands)
14946 {
14947   static char buf[40];
14948   const char *p;
14949   const char *ssep;
14950   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14951 
14952 #ifdef ENABLE_CHECKING
14953   /* Even if we do not want to check the inputs, this documents input
14954      constraints.  Which helps in understanding the following code.  */
14955   if (STACK_REG_P (operands[0])
14956       && ((REG_P (operands[1])
14957 	   && REGNO (operands[0]) == REGNO (operands[1])
14958 	   && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14959 	  || (REG_P (operands[2])
14960 	      && REGNO (operands[0]) == REGNO (operands[2])
14961 	      && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14962       && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14963     ; /* ok */
14964   else
14965     gcc_assert (is_sse);
14966 #endif
14967 
14968   switch (GET_CODE (operands[3]))
14969     {
14970     case PLUS:
14971       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14972 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14973 	p = "fiadd";
14974       else
14975 	p = "fadd";
14976       ssep = "vadd";
14977       break;
14978 
14979     case MINUS:
14980       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14981 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14982 	p = "fisub";
14983       else
14984 	p = "fsub";
14985       ssep = "vsub";
14986       break;
14987 
14988     case MULT:
14989       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14990 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14991 	p = "fimul";
14992       else
14993 	p = "fmul";
14994       ssep = "vmul";
14995       break;
14996 
14997     case DIV:
14998       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14999 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15000 	p = "fidiv";
15001       else
15002 	p = "fdiv";
15003       ssep = "vdiv";
15004       break;
15005 
15006     default:
15007       gcc_unreachable ();
15008     }
15009 
15010   if (is_sse)
15011    {
15012      if (TARGET_AVX)
15013        {
15014 	 strcpy (buf, ssep);
15015 	 if (GET_MODE (operands[0]) == SFmode)
15016 	   strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15017 	 else
15018 	   strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15019        }
15020      else
15021        {
15022 	 strcpy (buf, ssep + 1);
15023 	 if (GET_MODE (operands[0]) == SFmode)
15024 	   strcat (buf, "ss\t{%2, %0|%0, %2}");
15025 	 else
15026 	   strcat (buf, "sd\t{%2, %0|%0, %2}");
15027        }
15028       return buf;
15029    }
15030   strcpy (buf, p);
15031 
15032   switch (GET_CODE (operands[3]))
15033     {
15034     case MULT:
15035     case PLUS:
15036       if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15037 	{
15038 	  rtx temp = operands[2];
15039 	  operands[2] = operands[1];
15040 	  operands[1] = temp;
15041 	}
15042 
15043 	      /* We know operands[0] == operands[1].  */
15044 
15045       if (MEM_P (operands[2]))
15046 	{
15047 	  p = "%Z2\t%2";
15048 	  break;
15049 	}
15050 
15051       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15052 	{
15053 	  if (STACK_TOP_P (operands[0]))
15054 	    /* How is it that we are storing to a dead operand[2]?
15055 	       Well, presumably operands[1] is dead too.  We can't
15056 	       store the result to st(0) as st(0) gets popped on this
15057 	       instruction.  Instead store to operands[2] (which I
15058 	       think has to be st(1)).  st(1) will be popped later.
15059 	       gcc <= 2.8.1 didn't have this check and generated
15060 	       assembly code that the Unixware assembler rejected.  */
15061 	    p = "p\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
15062 	  else
15063 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
15064 	  break;
15065 	}
15066 
15067       if (STACK_TOP_P (operands[0]))
15068 	p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
15069       else
15070 	p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
15071       break;
15072 
15073     case MINUS:
15074     case DIV:
15075       if (MEM_P (operands[1]))
15076 	{
15077 	  p = "r%Z1\t%1";
15078 	  break;
15079 	}
15080 
15081       if (MEM_P (operands[2]))
15082 	{
15083 	  p = "%Z2\t%2";
15084 	  break;
15085 	}
15086 
15087       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15088 	{
15089 #if SYSV386_COMPAT
15090 	  /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15091 	     derived assemblers, confusingly reverse the direction of
15092 	     the operation for fsub{r} and fdiv{r} when the
15093 	     destination register is not st(0).  The Intel assembler
15094 	     doesn't have this brain damage.  Read !SYSV386_COMPAT to
15095 	     figure out what the hardware really does.  */
15096 	  if (STACK_TOP_P (operands[0]))
15097 	    p = "{p\t%0, %2|rp\t%2, %0}";
15098 	  else
15099 	    p = "{rp\t%2, %0|p\t%0, %2}";
15100 #else
15101 	  if (STACK_TOP_P (operands[0]))
15102 	    /* As above for fmul/fadd, we can't store to st(0).  */
15103 	    p = "rp\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
15104 	  else
15105 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
15106 #endif
15107 	  break;
15108 	}
15109 
15110       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15111 	{
15112 #if SYSV386_COMPAT
15113 	  if (STACK_TOP_P (operands[0]))
15114 	    p = "{rp\t%0, %1|p\t%1, %0}";
15115 	  else
15116 	    p = "{p\t%1, %0|rp\t%0, %1}";
15117 #else
15118 	  if (STACK_TOP_P (operands[0]))
15119 	    p = "p\t{%0, %1|%1, %0}";	/* st(1) = st(1) op st(0); pop */
15120 	  else
15121 	    p = "rp\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2); pop */
15122 #endif
15123 	  break;
15124 	}
15125 
15126       if (STACK_TOP_P (operands[0]))
15127 	{
15128 	  if (STACK_TOP_P (operands[1]))
15129 	    p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
15130 	  else
15131 	    p = "r\t{%y1, %0|%0, %y1}";	/* st(0) = st(r1) op st(0) */
15132 	  break;
15133 	}
15134       else if (STACK_TOP_P (operands[1]))
15135 	{
15136 #if SYSV386_COMPAT
15137 	  p = "{\t%1, %0|r\t%0, %1}";
15138 #else
15139 	  p = "r\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2) */
15140 #endif
15141 	}
15142       else
15143 	{
15144 #if SYSV386_COMPAT
15145 	  p = "{r\t%2, %0|\t%0, %2}";
15146 #else
15147 	  p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
15148 #endif
15149 	}
15150       break;
15151 
15152     default:
15153       gcc_unreachable ();
15154     }
15155 
15156   strcat (buf, p);
15157   return buf;
15158 }
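/* Illustrative examples (not part of the original source): for an SFmode
   addition where is_sse holds, the returned template is
   "vaddss\t{%2, %1, %0|%0, %1, %2}" with AVX or
   "addss\t{%2, %0|%0, %2}" without it; for an x87 PLUS whose second
   operand is a memory reference, the template becomes "fadd%Z2\t%2".  */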
15159 
15160 /* Return needed mode for entity in optimize_mode_switching pass.  */
15161 
15162 int
15163 ix86_mode_needed (int entity, rtx insn)
15164 {
15165   enum attr_i387_cw mode;
15166 
15167   /* The mode UNINITIALIZED is used to store the control word after a
15168      function call or ASM pattern.  The mode ANY specifies that the function
15169      has no requirements on the control word and makes no changes to the
15170      bits we are interested in.  */
15171 
15172   if (CALL_P (insn)
15173       || (NONJUMP_INSN_P (insn)
15174 	  && (asm_noperands (PATTERN (insn)) >= 0
15175 	      || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15176     return I387_CW_UNINITIALIZED;
15177 
15178   if (recog_memoized (insn) < 0)
15179     return I387_CW_ANY;
15180 
15181   mode = get_attr_i387_cw (insn);
15182 
15183   switch (entity)
15184     {
15185     case I387_TRUNC:
15186       if (mode == I387_CW_TRUNC)
15187 	return mode;
15188       break;
15189 
15190     case I387_FLOOR:
15191       if (mode == I387_CW_FLOOR)
15192 	return mode;
15193       break;
15194 
15195     case I387_CEIL:
15196       if (mode == I387_CW_CEIL)
15197 	return mode;
15198       break;
15199 
15200     case I387_MASK_PM:
15201       if (mode == I387_CW_MASK_PM)
15202 	return mode;
15203       break;
15204 
15205     default:
15206       gcc_unreachable ();
15207     }
15208 
15209   return I387_CW_ANY;
15210 }
15211 
15212 /* Output code to initialize the control word copies used by the trunc?f?i
15213    and rounding patterns.  MODE selects which rounding/precision-mask
15214    setting the stored control word copy should contain.  */
15215 
15216 void
15217 emit_i387_cw_initialization (int mode)
15218 {
15219   rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15220   rtx new_mode;
15221 
15222   enum ix86_stack_slot slot;
15223 
15224   rtx reg = gen_reg_rtx (HImode);
15225 
15226   emit_insn (gen_x86_fnstcw_1 (stored_mode));
15227   emit_move_insn (reg, copy_rtx (stored_mode));
15228 
15229   if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15230       || optimize_function_for_size_p (cfun))
15231     {
15232       switch (mode)
15233 	{
15234 	case I387_CW_TRUNC:
15235 	  /* round toward zero (truncate) */
15236 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15237 	  slot = SLOT_CW_TRUNC;
15238 	  break;
15239 
15240 	case I387_CW_FLOOR:
15241 	  /* round down toward -oo */
15242 	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15243 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15244 	  slot = SLOT_CW_FLOOR;
15245 	  break;
15246 
15247 	case I387_CW_CEIL:
15248 	  /* round up toward +oo */
15249 	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15250 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15251 	  slot = SLOT_CW_CEIL;
15252 	  break;
15253 
15254 	case I387_CW_MASK_PM:
15255 	  /* mask precision exception for nearbyint() */
15256 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15257 	  slot = SLOT_CW_MASK_PM;
15258 	  break;
15259 
15260 	default:
15261 	  gcc_unreachable ();
15262 	}
15263     }
15264   else
15265     {
15266       switch (mode)
15267 	{
15268 	case I387_CW_TRUNC:
15269 	  /* round toward zero (truncate) */
15270 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15271 	  slot = SLOT_CW_TRUNC;
15272 	  break;
15273 
15274 	case I387_CW_FLOOR:
15275 	  /* round down toward -oo */
15276 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15277 	  slot = SLOT_CW_FLOOR;
15278 	  break;
15279 
15280 	case I387_CW_CEIL:
15281 	  /* round up toward +oo */
15282 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15283 	  slot = SLOT_CW_CEIL;
15284 	  break;
15285 
15286 	case I387_CW_MASK_PM:
15287 	  /* mask precision exception for nearbyint() */
15288 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15289 	  slot = SLOT_CW_MASK_PM;
15290 	  break;
15291 
15292 	default:
15293 	  gcc_unreachable ();
15294 	}
15295     }
15296 
15297   gcc_assert (slot < MAX_386_STACK_LOCALS);
15298 
15299   new_mode = assign_386_stack_local (HImode, slot);
15300   emit_move_insn (new_mode, reg);
15301 }
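/* Illustrative note (not part of the original source): bits 11:10 of the
   x87 control word select the rounding mode (00 nearest, 01 down, 10 up,
   11 toward zero), which is why the code above ORs in 0x0c00 for
   truncation, or clears the field and ORs in 0x0400 for floor and 0x0800
   for ceil; bit 5 (0x0020) masks the precision exception for
   nearbyint().  */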
15302 
15303 /* Output code for INSN to convert a float to a signed int.  OPERANDS
15304    are the insn operands.  The output may be [HSD]Imode and the input
15305    operand may be [SDX]Fmode.  */
15306 
15307 const char *
15308 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15309 {
15310   int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15311   int dimode_p = GET_MODE (operands[0]) == DImode;
15312   int round_mode = get_attr_i387_cw (insn);
15313 
15314   /* Jump through a hoop or two for DImode, since the hardware has no
15315      non-popping instruction.  We used to do this a different way, but
15316      that was somewhat fragile and broke with post-reload splitters.  */
15317   if ((dimode_p || fisttp) && !stack_top_dies)
15318     output_asm_insn ("fld\t%y1", operands);
15319 
15320   gcc_assert (STACK_TOP_P (operands[1]));
15321   gcc_assert (MEM_P (operands[0]));
15322   gcc_assert (GET_MODE (operands[1]) != TFmode);
15323 
15324   if (fisttp)
15325       output_asm_insn ("fisttp%Z0\t%0", operands);
15326   else
15327     {
15328       if (round_mode != I387_CW_ANY)
15329 	output_asm_insn ("fldcw\t%3", operands);
15330       if (stack_top_dies || dimode_p)
15331 	output_asm_insn ("fistp%Z0\t%0", operands);
15332       else
15333 	output_asm_insn ("fist%Z0\t%0", operands);
15334       if (round_mode != I387_CW_ANY)
15335 	output_asm_insn ("fldcw\t%2", operands);
15336     }
15337 
15338   return "";
15339 }
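/* Illustrative example (not part of the original source): an SImode
   conversion that needs truncating rounding and whose x87 stack top dies
   would typically emit
	fldcw	%3
	fistpl	%0
	fldcw	%2
   i.e. switch the control word, store-and-pop, then restore it; with
   SSE3's fisttp the fldcw dance is not needed.  */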
15340 
15341 /* Output code for x87 ffreep insn.  The OPNO argument, which may only
15342    have the values zero or one, indicates the ffreep insn's operand
15343    from the OPERANDS array.  */
15344 
15345 static const char *
15346 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15347 {
15348   if (TARGET_USE_FFREEP)
15349 #ifdef HAVE_AS_IX86_FFREEP
15350     return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15351 #else
15352     {
15353       static char retval[32];
15354       int regno = REGNO (operands[opno]);
15355 
15356       gcc_assert (FP_REGNO_P (regno));
15357 
15358       regno -= FIRST_STACK_REG;
15359 
15360       snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15361       return retval;
15362     }
15363 #endif
15364 
15365   return opno ? "fstp\t%y1" : "fstp\t%y0";
15366 }
15367 
15368 
15369 /* Output code for INSN to compare OPERANDS.  EFLAGS_P is 1 when fcomi
15370    should be used.  UNORDERED_P is true when fucom should be used.  */
15371 
15372 const char *
15373 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15374 {
15375   int stack_top_dies;
15376   rtx cmp_op0, cmp_op1;
15377   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15378 
15379   if (eflags_p)
15380     {
15381       cmp_op0 = operands[0];
15382       cmp_op1 = operands[1];
15383     }
15384   else
15385     {
15386       cmp_op0 = operands[1];
15387       cmp_op1 = operands[2];
15388     }
15389 
15390   if (is_sse)
15391     {
15392       if (GET_MODE (operands[0]) == SFmode)
15393 	if (unordered_p)
15394 	  return "%vucomiss\t{%1, %0|%0, %1}";
15395 	else
15396 	  return "%vcomiss\t{%1, %0|%0, %1}";
15397       else
15398 	if (unordered_p)
15399 	  return "%vucomisd\t{%1, %0|%0, %1}";
15400 	else
15401 	  return "%vcomisd\t{%1, %0|%0, %1}";
15402     }
15403 
15404   gcc_assert (STACK_TOP_P (cmp_op0));
15405 
15406   stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15407 
15408   if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15409     {
15410       if (stack_top_dies)
15411 	{
15412 	  output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15413 	  return output_387_ffreep (operands, 1);
15414 	}
15415       else
15416 	return "ftst\n\tfnstsw\t%0";
15417     }
15418 
15419   if (STACK_REG_P (cmp_op1)
15420       && stack_top_dies
15421       && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15422       && REGNO (cmp_op1) != FIRST_STACK_REG)
15423     {
15424       /* If both the top of the 387 stack and the other operand (which is
15425 	 also a stack register) die, then this must be a `fcompp' float
15426 	 compare.  */
15427 
15428       if (eflags_p)
15429 	{
15430 	  /* There is no double popping fcomi variant.  Fortunately,
15431 	     eflags is immune from the fstp's cc clobbering.  */
15432 	  if (unordered_p)
15433 	    output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15434 	  else
15435 	    output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15436 	  return output_387_ffreep (operands, 0);
15437 	}
15438       else
15439 	{
15440 	  if (unordered_p)
15441 	    return "fucompp\n\tfnstsw\t%0";
15442 	  else
15443 	    return "fcompp\n\tfnstsw\t%0";
15444 	}
15445     }
15446   else
15447     {
15448       /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies.  */
15449 
15450       static const char * const alt[16] =
15451       {
15452 	"fcom%Z2\t%y2\n\tfnstsw\t%0",
15453 	"fcomp%Z2\t%y2\n\tfnstsw\t%0",
15454 	"fucom%Z2\t%y2\n\tfnstsw\t%0",
15455 	"fucomp%Z2\t%y2\n\tfnstsw\t%0",
15456 
15457 	"ficom%Z2\t%y2\n\tfnstsw\t%0",
15458 	"ficomp%Z2\t%y2\n\tfnstsw\t%0",
15459 	NULL,
15460 	NULL,
15461 
15462 	"fcomi\t{%y1, %0|%0, %y1}",
15463 	"fcomip\t{%y1, %0|%0, %y1}",
15464 	"fucomi\t{%y1, %0|%0, %y1}",
15465 	"fucomip\t{%y1, %0|%0, %y1}",
15466 
15467 	NULL,
15468 	NULL,
15469 	NULL,
15470 	NULL
15471       };
15472 
15473       int mask;
15474       const char *ret;
15475 
15476       mask  = eflags_p << 3;
15477       mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15478       mask |= unordered_p << 1;
15479       mask |= stack_top_dies;
15480 
15481       gcc_assert (mask < 16);
15482       ret = alt[mask];
15483       gcc_assert (ret);
15484 
15485       return ret;
15486     }
15487 }
15488 
15489 void
15490 ix86_output_addr_vec_elt (FILE *file, int value)
15491 {
15492   const char *directive = ASM_LONG;
15493 
15494 #ifdef ASM_QUAD
15495   if (TARGET_LP64)
15496     directive = ASM_QUAD;
15497 #else
15498   gcc_assert (!TARGET_64BIT);
15499 #endif
15500 
15501   fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15502 }
15503 
15504 void
15505 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15506 {
15507   const char *directive = ASM_LONG;
15508 
15509 #ifdef ASM_QUAD
15510   if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15511     directive = ASM_QUAD;
15512 #else
15513   gcc_assert (!TARGET_64BIT);
15514 #endif
15515   /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand.  */
15516   if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15517     fprintf (file, "%s%s%d-%s%d\n",
15518 	     directive, LPREFIX, value, LPREFIX, rel);
15519   else if (HAVE_AS_GOTOFF_IN_DATA)
15520     fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15521 #if TARGET_MACHO
15522   else if (TARGET_MACHO)
15523     {
15524       fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15525       machopic_output_function_base_name (file);
15526       putc ('\n', file);
15527     }
15528 #endif
15529   else
15530     asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15531 		 GOT_SYMBOL_NAME, LPREFIX, value);
15532 }
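/* Illustrative example (not part of the original source): assuming
   LPREFIX is ".L", a 64-bit target emits jump-table entries such as
   ".long .L5-.L3" (or ".quad" when CASE_VECTOR_MODE is DImode), while a
   32-bit PIC target with @GOTOFF support in data emits
   ".long .L5@GOTOFF".  */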
15533 
15534 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15535    for the target.  */
15536 
15537 void
15538 ix86_expand_clear (rtx dest)
15539 {
15540   rtx tmp;
15541 
15542   /* We play register width games, which are only valid after reload.  */
15543   gcc_assert (reload_completed);
15544 
15545   /* Avoid HImode and its attendant prefix byte.  */
15546   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15547     dest = gen_rtx_REG (SImode, REGNO (dest));
15548   tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15549 
15550   /* This predicate should match that for movsi_xor and movdi_xor_rex64.  */
15551   if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15552     {
15553       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15554       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15555     }
15556 
15557   emit_insn (tmp);
15558 }
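/* Illustrative example (not part of the original source): clearing
   (reg:SI %eax) normally generates the xor form, assembling to
   "xorl %eax, %eax" (with a flags clobber attached); the plain
   "movl $0, %eax" form is only kept when TARGET_USE_MOV0 is set and the
   insn is being optimized for size.  */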
15559 
15560 /* X is an unchanging MEM.  If it is a constant pool reference, return
15561    the constant pool rtx, else NULL.  */
15562 
15563 rtx
15564 maybe_get_pool_constant (rtx x)
15565 {
15566   x = ix86_delegitimize_address (XEXP (x, 0));
15567 
15568   if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15569     return get_pool_constant (x);
15570 
15571   return NULL_RTX;
15572 }
15573 
15574 void
15575 ix86_expand_move (enum machine_mode mode, rtx operands[])
15576 {
15577   rtx op0, op1;
15578   enum tls_model model;
15579 
15580   op0 = operands[0];
15581   op1 = operands[1];
15582 
15583   if (GET_CODE (op1) == SYMBOL_REF)
15584     {
15585       model = SYMBOL_REF_TLS_MODEL (op1);
15586       if (model)
15587 	{
15588 	  op1 = legitimize_tls_address (op1, model, true);
15589 	  op1 = force_operand (op1, op0);
15590 	  if (op1 == op0)
15591 	    return;
15592 	  if (GET_MODE (op1) != mode)
15593 	    op1 = convert_to_mode (mode, op1, 1);
15594 	}
15595       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15596 	       && SYMBOL_REF_DLLIMPORT_P (op1))
15597 	op1 = legitimize_dllimport_symbol (op1, false);
15598     }
15599   else if (GET_CODE (op1) == CONST
15600 	   && GET_CODE (XEXP (op1, 0)) == PLUS
15601 	   && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15602     {
15603       rtx addend = XEXP (XEXP (op1, 0), 1);
15604       rtx symbol = XEXP (XEXP (op1, 0), 0);
15605       rtx tmp = NULL;
15606 
15607       model = SYMBOL_REF_TLS_MODEL (symbol);
15608       if (model)
15609 	tmp = legitimize_tls_address (symbol, model, true);
15610       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15611 	       && SYMBOL_REF_DLLIMPORT_P (symbol))
15612 	tmp = legitimize_dllimport_symbol (symbol, true);
15613 
15614       if (tmp)
15615 	{
15616 	  tmp = force_operand (tmp, NULL);
15617 	  tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15618 				     op0, 1, OPTAB_DIRECT);
15619 	  if (tmp == op0)
15620 	    return;
15621 	  op1 = convert_to_mode (mode, tmp, 1);
15622 	}
15623     }
15624 
15625   if ((flag_pic || MACHOPIC_INDIRECT)
15626       && symbolic_operand (op1, mode))
15627     {
15628       if (TARGET_MACHO && !TARGET_64BIT)
15629 	{
15630 #if TARGET_MACHO
15631 	  /* dynamic-no-pic */
15632 	  if (MACHOPIC_INDIRECT)
15633 	    {
15634 	      rtx temp = ((reload_in_progress
15635 			   || ((op0 && REG_P (op0))
15636 			       && mode == Pmode))
15637 			  ? op0 : gen_reg_rtx (Pmode));
15638 	      op1 = machopic_indirect_data_reference (op1, temp);
15639 	      if (MACHOPIC_PURE)
15640 		op1 = machopic_legitimize_pic_address (op1, mode,
15641 						       temp == op1 ? 0 : temp);
15642 	    }
15643 	  if (op0 != op1 && GET_CODE (op0) != MEM)
15644 	    {
15645 	      rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15646 	      emit_insn (insn);
15647 	      return;
15648 	    }
15649 	  if (GET_CODE (op0) == MEM)
15650 	    op1 = force_reg (Pmode, op1);
15651 	  else
15652 	    {
15653 	      rtx temp = op0;
15654 	      if (GET_CODE (temp) != REG)
15655 		temp = gen_reg_rtx (Pmode);
15656 	      temp = legitimize_pic_address (op1, temp);
15657 	      if (temp == op0)
15658 	    return;
15659 	      op1 = temp;
15660 	    }
15661       /* dynamic-no-pic */
15662 #endif
15663 	}
15664       else
15665 	{
15666 	  if (MEM_P (op0))
15667 	    op1 = force_reg (mode, op1);
15668 	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15669 	    {
15670 	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15671 	      op1 = legitimize_pic_address (op1, reg);
15672 	      if (op0 == op1)
15673 		return;
15674 	      if (GET_MODE (op1) != mode)
15675 		op1 = convert_to_mode (mode, op1, 1);
15676 	    }
15677 	}
15678     }
15679   else
15680     {
15681       if (MEM_P (op0)
15682 	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15683 	      || !push_operand (op0, mode))
15684 	  && MEM_P (op1))
15685 	op1 = force_reg (mode, op1);
15686 
15687       if (push_operand (op0, mode)
15688 	  && ! general_no_elim_operand (op1, mode))
15689 	op1 = copy_to_mode_reg (mode, op1);
15690 
15691       /* Force large constants in 64bit compilation into register
15692 	 to get them CSEed.  */
15693       if (can_create_pseudo_p ()
15694 	  && (mode == DImode) && TARGET_64BIT
15695 	  && immediate_operand (op1, mode)
15696 	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
15697 	  && !register_operand (op0, mode)
15698 	  && optimize)
15699 	op1 = copy_to_mode_reg (mode, op1);
15700 
15701       if (can_create_pseudo_p ()
15702 	  && FLOAT_MODE_P (mode)
15703 	  && GET_CODE (op1) == CONST_DOUBLE)
15704 	{
15705 	  /* If we are loading a floating point constant to a register,
15706 	     force the value to memory now, since we'll get better code
15707 	     out the back end.  */
15708 
15709 	  op1 = validize_mem (force_const_mem (mode, op1));
15710 	  if (!register_operand (op0, mode))
15711 	    {
15712 	      rtx temp = gen_reg_rtx (mode);
15713 	      emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15714 	      emit_move_insn (op0, temp);
15715 	      return;
15716 	    }
15717 	}
15718     }
15719 
15720   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15721 }
15722 
15723 void
15724 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15725 {
15726   rtx op0 = operands[0], op1 = operands[1];
15727   unsigned int align = GET_MODE_ALIGNMENT (mode);
15728 
15729   /* Force constants other than zero into memory.  We do not know how
15730      the instructions used to build constants modify the upper 64 bits
15731      of the register; once we have that information we may be able
15732      to handle some of them more efficiently.  */
15733   if (can_create_pseudo_p ()
15734       && register_operand (op0, mode)
15735       && (CONSTANT_P (op1)
15736 	  || (GET_CODE (op1) == SUBREG
15737 	      && CONSTANT_P (SUBREG_REG (op1))))
15738       && !standard_sse_constant_p (op1))
15739     op1 = validize_mem (force_const_mem (mode, op1));
15740 
15741   /* We need to check memory alignment for SSE modes since attributes
15742      can make operands unaligned.  */
15743   if (can_create_pseudo_p ()
15744       && SSE_REG_MODE_P (mode)
15745       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15746 	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15747     {
15748       rtx tmp[2];
15749 
15750       /* ix86_expand_vector_move_misalign() does not like constants ... */
15751       if (CONSTANT_P (op1)
15752 	  || (GET_CODE (op1) == SUBREG
15753 	      && CONSTANT_P (SUBREG_REG (op1))))
15754 	op1 = validize_mem (force_const_mem (mode, op1));
15755 
15756       /* ... nor both arguments in memory.  */
15757       if (!register_operand (op0, mode)
15758 	  && !register_operand (op1, mode))
15759 	op1 = force_reg (mode, op1);
15760 
15761       tmp[0] = op0; tmp[1] = op1;
15762       ix86_expand_vector_move_misalign (mode, tmp);
15763       return;
15764     }
15765 
15766   /* Make operand1 a register if it isn't already.  */
15767   if (can_create_pseudo_p ()
15768       && !register_operand (op0, mode)
15769       && !register_operand (op1, mode))
15770     {
15771       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15772       return;
15773     }
15774 
15775   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15776 }
15777 
15778 /* Split 32-byte AVX unaligned load and store if needed.  */
15779 
15780 static void
15781 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15782 {
15783   rtx m;
15784   rtx (*extract) (rtx, rtx, rtx);
15785   rtx (*load_unaligned) (rtx, rtx);
15786   rtx (*store_unaligned) (rtx, rtx);
15787   enum machine_mode mode;
15788 
15789   switch (GET_MODE (op0))
15790     {
15791     default:
15792       gcc_unreachable ();
15793     case V32QImode:
15794       extract = gen_avx_vextractf128v32qi;
15795       load_unaligned = gen_avx_loaddqu256;
15796       store_unaligned = gen_avx_storedqu256;
15797       mode = V16QImode;
15798       break;
15799     case V8SFmode:
15800       extract = gen_avx_vextractf128v8sf;
15801       load_unaligned = gen_avx_loadups256;
15802       store_unaligned = gen_avx_storeups256;
15803       mode = V4SFmode;
15804       break;
15805     case V4DFmode:
15806       extract = gen_avx_vextractf128v4df;
15807       load_unaligned = gen_avx_loadupd256;
15808       store_unaligned = gen_avx_storeupd256;
15809       mode = V2DFmode;
15810       break;
15811     }
15812 
15813   if (MEM_P (op1))
15814     {
15815       if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15816 	{
15817 	  rtx r = gen_reg_rtx (mode);
15818 	  m = adjust_address (op1, mode, 0);
15819 	  emit_move_insn (r, m);
15820 	  m = adjust_address (op1, mode, 16);
15821 	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15822 	  emit_move_insn (op0, r);
15823 	}
15824       else
15825 	emit_insn (load_unaligned (op0, op1));
15826     }
15827   else if (MEM_P (op0))
15828     {
15829       if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15830 	{
15831 	  m = adjust_address (op0, mode, 0);
15832 	  emit_insn (extract (m, op1, const0_rtx));
15833 	  m = adjust_address (op0, mode, 16);
15834 	  emit_insn (extract (m, op1, const1_rtx));
15835 	}
15836       else
15837 	emit_insn (store_unaligned (op0, op1));
15838     }
15839   else
15840     gcc_unreachable ();
15841 }
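/* Illustrative example (not part of the original source): with
   -mavx256-split-unaligned-load, an unaligned 32-byte V8SF load is
   expanded as a 16-byte load of the low half followed by a VEC_CONCAT
   with the high half, which typically assembles to a vmovups/vinsertf128
   pair; the store counterpart uses two vextractf128 stores.  Without the
   split options a single 32-byte vmovups/vmovupd/vmovdqu is emitted.  */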
15842 
15843 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
15844    straight to ix86_expand_vector_move.  */
15845 /* Code generation for scalar reg-reg moves of single and double precision data:
15846      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15847        movaps reg, reg
15848      else
15849        movss reg, reg
15850      if (x86_sse_partial_reg_dependency == true)
15851        movapd reg, reg
15852      else
15853        movsd reg, reg
15854 
15855    Code generation for scalar loads of double precision data:
15856      if (x86_sse_split_regs == true)
15857        movlpd mem, reg      (gas syntax)
15858      else
15859        movsd mem, reg
15860 
15861    Code generation for unaligned packed loads of single precision data
15862    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15863      if (x86_sse_unaligned_move_optimal)
15864        movups mem, reg
15865 
15866      if (x86_sse_partial_reg_dependency == true)
15867        {
15868          xorps  reg, reg
15869          movlps mem, reg
15870          movhps mem+8, reg
15871        }
15872      else
15873        {
15874          movlps mem, reg
15875          movhps mem+8, reg
15876        }
15877 
15878    Code generation for unaligned packed loads of double precision data
15879    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15880      if (x86_sse_unaligned_move_optimal)
15881        movupd mem, reg
15882 
15883      if (x86_sse_split_regs == true)
15884        {
15885          movlpd mem, reg
15886          movhpd mem+8, reg
15887        }
15888      else
15889        {
15890          movsd  mem, reg
15891          movhpd mem+8, reg
15892        }
15893  */
15894 
15895 void
15896 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15897 {
15898   rtx op0, op1, m;
15899   rtx (*move_unaligned) (rtx, rtx);
15900 
15901   op0 = operands[0];
15902   op1 = operands[1];
15903 
15904   if (TARGET_AVX)
15905     {
15906       switch (GET_MODE_CLASS (mode))
15907 	{
15908 	case MODE_VECTOR_INT:
15909 	case MODE_INT:
15910 	  switch (GET_MODE_SIZE (mode))
15911 	    {
15912 	    case 16:
15913 	      /*  If we're optimizing for size, movups is the smallest.  */
15914 	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15915 		{
15916 		  if (MEM_P (op1))
15917 		    move_unaligned = gen_sse_loadups;
15918 		  else if (MEM_P (op0))
15919 		    move_unaligned = gen_sse_storeups;
15920 		  else
15921 		    gcc_unreachable ();
15922 
15923 		  op0 = gen_lowpart (V4SFmode, op0);
15924 		  op1 = gen_lowpart (V4SFmode, op1);
15925 		  emit_insn (move_unaligned (op0, op1));
15926 		  return;
15927 		}
15928 	      if (MEM_P (op1))
15929 		move_unaligned = gen_sse2_loaddqu;
15930 	      else if (MEM_P (op0))
15931 		move_unaligned = gen_sse2_storedqu;
15932 	      else
15933 		gcc_unreachable ();
15934 
15935 	      op0 = gen_lowpart (V16QImode, op0);
15936 	      op1 = gen_lowpart (V16QImode, op1);
15937 	      emit_insn (move_unaligned (op0, op1));
15938 	      break;
15939 	    case 32:
15940 	      op0 = gen_lowpart (V32QImode, op0);
15941 	      op1 = gen_lowpart (V32QImode, op1);
15942 	      ix86_avx256_split_vector_move_misalign (op0, op1);
15943 	      break;
15944 	    default:
15945 	      gcc_unreachable ();
15946 	    }
15947 	  break;
15948 	case MODE_VECTOR_FLOAT:
15949 	  op0 = gen_lowpart (mode, op0);
15950 	  op1 = gen_lowpart (mode, op1);
15951 
15952 	  switch (mode)
15953 	    {
15954 	    case V4SFmode:
15955 	      if (MEM_P (op1))
15956 		move_unaligned = gen_sse_loadups;
15957 	      else if (MEM_P (op0))
15958 		move_unaligned = gen_sse_storeups;
15959 	      else
15960 		gcc_unreachable ();
15961 
15962 	      emit_insn (move_unaligned (op0, op1));
15963 	      break;
15964 	    case V8SFmode:
15965 	      ix86_avx256_split_vector_move_misalign (op0, op1);
15966 	      break;
15967 	    case V2DFmode:
15968 	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15969 		{
15970 		  if (MEM_P (op1))
15971 		    move_unaligned = gen_sse_loadups;
15972 		  else if (MEM_P (op0))
15973 		    move_unaligned = gen_sse_storeups;
15974 		  else
15975 		    gcc_unreachable ();
15976 
15977 		  op0 = gen_lowpart (V4SFmode, op0);
15978 		  op1 = gen_lowpart (V4SFmode, op1);
15979 		  emit_insn (move_unaligned (op0, op1));
15980 		  return;
15981 		}
15982 	      if (MEM_P (op1))
15983 		move_unaligned = gen_sse2_loadupd;
15984 	      else if (MEM_P (op0))
15985 		move_unaligned = gen_sse2_storeupd;
15986 	      else
15987 		gcc_unreachable ();
15988 
15989 	      emit_insn (move_unaligned (op0, op1));
15990 	      break;
15991 	    case V4DFmode:
15992 	      ix86_avx256_split_vector_move_misalign (op0, op1);
15993 	      break;
15994 	    default:
15995 	      gcc_unreachable ();
15996 	    }
15997 	  break;
15998 
15999 	default:
16000 	  gcc_unreachable ();
16001 	}
16002 
16003       return;
16004     }
16005 
16006   if (MEM_P (op1))
16007     {
16008       /* If we're optimizing for size, movups is the smallest.  */
16009       if (optimize_insn_for_size_p ()
16010 	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
16011 	{
16012 	  op0 = gen_lowpart (V4SFmode, op0);
16013 	  op1 = gen_lowpart (V4SFmode, op1);
16014 	  emit_insn (gen_sse_loadups (op0, op1));
16015 	  return;
16016 	}
16017 
16018       /* ??? If we have typed data, then it would appear that using
16019 	 movdqu is the only way to get unaligned data loaded with
16020 	 integer type.  */
16021       if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16022 	{
16023 	  op0 = gen_lowpart (V16QImode, op0);
16024 	  op1 = gen_lowpart (V16QImode, op1);
16025 	  emit_insn (gen_sse2_loaddqu (op0, op1));
16026 	  return;
16027 	}
16028 
16029       if (TARGET_SSE2 && mode == V2DFmode)
16030         {
16031           rtx zero;
16032 
16033 	  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
16034 	    {
16035 	      op0 = gen_lowpart (V2DFmode, op0);
16036 	      op1 = gen_lowpart (V2DFmode, op1);
16037 	      emit_insn (gen_sse2_loadupd (op0, op1));
16038 	      return;
16039 	    }
16040 
16041 	  /* When SSE registers are split into halves, we can avoid
16042 	     writing to the top half twice.  */
16043 	  if (TARGET_SSE_SPLIT_REGS)
16044 	    {
16045 	      emit_clobber (op0);
16046 	      zero = op0;
16047 	    }
16048 	  else
16049 	    {
16050 	      /* ??? Not sure about the best option for the Intel chips.
16051 		 The following would seem to satisfy; the register is
16052 		 entirely cleared, breaking the dependency chain.  We
16053 		 then store to the upper half, with a dependency depth
16054 		 of one.  A rumor has it that Intel recommends two movsd
16055 		 followed by an unpacklpd, but this is unconfirmed.  And
16056 		 given that the dependency depth of the unpacklpd would
16057 		 still be one, I'm not sure why this would be better.  */
16058 	      zero = CONST0_RTX (V2DFmode);
16059 	    }
16060 
16061 	  m = adjust_address (op1, DFmode, 0);
16062 	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
16063 	  m = adjust_address (op1, DFmode, 8);
16064 	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
16065 	}
16066       else
16067         {
16068 	  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
16069 	    {
16070 	      op0 = gen_lowpart (V4SFmode, op0);
16071 	      op1 = gen_lowpart (V4SFmode, op1);
16072 	      emit_insn (gen_sse_loadups (op0, op1));
16073 	      return;
16074             }
16075 
16076 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16077 	    emit_move_insn (op0, CONST0_RTX (mode));
16078 	  else
16079 	    emit_clobber (op0);
16080 
16081 	  if (mode != V4SFmode)
16082 	    op0 = gen_lowpart (V4SFmode, op0);
16083 	  m = adjust_address (op1, V2SFmode, 0);
16084 	  emit_insn (gen_sse_loadlps (op0, op0, m));
16085 	  m = adjust_address (op1, V2SFmode, 8);
16086 	  emit_insn (gen_sse_loadhps (op0, op0, m));
16087 	}
16088     }
16089   else if (MEM_P (op0))
16090     {
16091       /* If we're optimizing for size, movups is the smallest.  */
16092       if (optimize_insn_for_size_p ()
16093 	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
16094 	{
16095 	  op0 = gen_lowpart (V4SFmode, op0);
16096 	  op1 = gen_lowpart (V4SFmode, op1);
16097 	  emit_insn (gen_sse_storeups (op0, op1));
16098 	  return;
16099 	}
16100 
16101       /* ??? Similar to above, only less clear because of "typeless"
16102 	 stores.  */
16103       if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
16104 	  && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16105         {
16106 	  op0 = gen_lowpart (V16QImode, op0);
16107 	  op1 = gen_lowpart (V16QImode, op1);
16108 	  emit_insn (gen_sse2_storedqu (op0, op1));
16109 	  return;
16110 	}
16111 
16112       if (TARGET_SSE2 && mode == V2DFmode)
16113 	{
16114 	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
16115 	    {
16116 	      op0 = gen_lowpart (V2DFmode, op0);
16117 	      op1 = gen_lowpart (V2DFmode, op1);
16118 	      emit_insn (gen_sse2_storeupd (op0, op1));
16119 	    }
16120 	  else
16121 	    {
16122 	      m = adjust_address (op0, DFmode, 0);
16123 	      emit_insn (gen_sse2_storelpd (m, op1));
16124 	      m = adjust_address (op0, DFmode, 8);
16125 	      emit_insn (gen_sse2_storehpd (m, op1));
16126 	    }
16127 	}
16128       else
16129 	{
16130 	  if (mode != V4SFmode)
16131 	    op1 = gen_lowpart (V4SFmode, op1);
16132 
16133 	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
16134 	    {
16135 	      op0 = gen_lowpart (V4SFmode, op0);
16136 	      emit_insn (gen_sse_storeups (op0, op1));
16137 	    }
16138 	  else
16139 	    {
16140 	      m = adjust_address (op0, V2SFmode, 0);
16141 	      emit_insn (gen_sse_storelps (m, op1));
16142 	      m = adjust_address (op0, V2SFmode, 8);
16143 	      emit_insn (gen_sse_storehps (m, op1));
16144 	    }
16145 	}
16146     }
16147   else
16148     gcc_unreachable ();
16149 }
16150 
16151 /* Expand a push in MODE.  This is some mode for which we do not support
16152    proper push instructions, at least from the registers that we expect
16153    the value to live in.  */
16154 
16155 void
16156 ix86_expand_push (enum machine_mode mode, rtx x)
16157 {
16158   rtx tmp;
16159 
16160   tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16161 			     GEN_INT (-GET_MODE_SIZE (mode)),
16162 			     stack_pointer_rtx, 1, OPTAB_DIRECT);
16163   if (tmp != stack_pointer_rtx)
16164     emit_move_insn (stack_pointer_rtx, tmp);
16165 
16166   tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16167 
16168   /* When we push an operand onto the stack, it has to be aligned at least
16169      at the function argument boundary.  However, since we don't have
16170      the argument type, we can't determine the actual argument
16171      boundary.  */
16172   emit_move_insn (tmp, x);
16173 }
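/* Illustrative example (not part of the original source): pushing a
   TImode value on x86-64 expands to an explicit stack-pointer adjustment,
   e.g. "subq $16, %rsp", followed by an ordinary move of the value into
   the memory at (%rsp); the exact store instructions depend on where the
   value currently lives and on the alignment caveat noted above.  */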
16174 
16175 /* Helper function of ix86_fixup_binary_operands to canonicalize
16176    operand order.  Returns true if the operands should be swapped.  */
16177 
16178 static bool
16179 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16180 			     rtx operands[])
16181 {
16182   rtx dst = operands[0];
16183   rtx src1 = operands[1];
16184   rtx src2 = operands[2];
16185 
16186   /* If the operation is not commutative, we can't do anything.  */
16187   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16188     return false;
16189 
16190   /* Highest priority is that src1 should match dst.  */
16191   if (rtx_equal_p (dst, src1))
16192     return false;
16193   if (rtx_equal_p (dst, src2))
16194     return true;
16195 
16196   /* Next highest priority is that immediate constants come second.  */
16197   if (immediate_operand (src2, mode))
16198     return false;
16199   if (immediate_operand (src1, mode))
16200     return true;
16201 
16202   /* Lowest priority is that memory references should come second.  */
16203   if (MEM_P (src2))
16204     return false;
16205   if (MEM_P (src1))
16206     return true;
16207 
16208   return false;
16209 }
16210 
16211 
16212 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
16213    destination to use for the operation.  If different from the true
16214    destination in operands[0], a copy operation will be required.  */
16215 
16216 rtx
16217 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16218 			    rtx operands[])
16219 {
16220   rtx dst = operands[0];
16221   rtx src1 = operands[1];
16222   rtx src2 = operands[2];
16223 
16224   /* Canonicalize operand order.  */
16225   if (ix86_swap_binary_operands_p (code, mode, operands))
16226     {
16227       rtx temp;
16228 
16229       /* It is invalid to swap operands of different modes.  */
16230       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16231 
16232       temp = src1;
16233       src1 = src2;
16234       src2 = temp;
16235     }
16236 
16237   /* Both source operands cannot be in memory.  */
16238   if (MEM_P (src1) && MEM_P (src2))
16239     {
16240       /* Optimization: Only read from memory once.  */
16241       if (rtx_equal_p (src1, src2))
16242 	{
16243 	  src2 = force_reg (mode, src2);
16244 	  src1 = src2;
16245 	}
16246       else
16247 	src2 = force_reg (mode, src2);
16248     }
16249 
16250   /* If the destination is memory, and we do not have matching source
16251      operands, do things in registers.  */
16252   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16253     dst = gen_reg_rtx (mode);
16254 
16255   /* Source 1 cannot be a constant.  */
16256   if (CONSTANT_P (src1))
16257     src1 = force_reg (mode, src1);
16258 
16259   /* Source 1 cannot be a non-matching memory.  */
16260   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16261     src1 = force_reg (mode, src1);
16262 
16263   /* Improve address combine.  */
16264   if (code == PLUS
16265       && GET_MODE_CLASS (mode) == MODE_INT
16266       && MEM_P (src2))
16267     src2 = force_reg (mode, src2);
16268 
16269   operands[1] = src1;
16270   operands[2] = src2;
16271   return dst;
16272 }
16273 
16274 /* Similarly, but assume that the destination has already been
16275    set up properly.  */
16276 
16277 void
16278 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16279 				    enum machine_mode mode, rtx operands[])
16280 {
16281   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16282   gcc_assert (dst == operands[0]);
16283 }
16284 
16285 /* Attempt to expand a binary operator.  Make the expansion closer to the
16286    actual machine than just general_operand, which will allow 3 separate
16287    memory references (one output, two input) in a single insn.  */
16288 
16289 void
16290 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16291 			     rtx operands[])
16292 {
16293   rtx src1, src2, dst, op, clob;
16294 
16295   dst = ix86_fixup_binary_operands (code, mode, operands);
16296   src1 = operands[1];
16297   src2 = operands[2];
16298 
16299   /* Emit the instruction.  */
16300 
16301   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16302   if (reload_in_progress)
16303     {
16304       /* Reload doesn't know about the flags register, and doesn't know that
16305          it doesn't want to clobber it.  We can only do this with PLUS.  */
16306       gcc_assert (code == PLUS);
16307       emit_insn (op);
16308     }
16309   else if (reload_completed
16310 	   && code == PLUS
16311 	   && !rtx_equal_p (dst, src1))
16312     {
16313       /* This is going to be an LEA; avoid splitting it later.  */
16314       emit_insn (op);
16315     }
16316   else
16317     {
16318       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16319       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16320     }
16321 
16322   /* Fix up the destination if needed.  */
16323   if (dst != operands[0])
16324     emit_move_insn (operands[0], dst);
16325 }
16326 
16327 /* Return TRUE or FALSE depending on whether the binary operator meets the
16328    appropriate constraints.  */
16329 
16330 bool
16331 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16332 			 rtx operands[3])
16333 {
16334   rtx dst = operands[0];
16335   rtx src1 = operands[1];
16336   rtx src2 = operands[2];
16337 
16338   /* Both source operands cannot be in memory.  */
16339   if (MEM_P (src1) && MEM_P (src2))
16340     return false;
16341 
16342   /* Canonicalize operand order for commutative operators.  */
16343   if (ix86_swap_binary_operands_p (code, mode, operands))
16344     {
16345       rtx temp = src1;
16346       src1 = src2;
16347       src2 = temp;
16348     }
16349 
16350   /* If the destination is memory, we must have a matching source operand.  */
16351   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16352       return false;
16353 
16354   /* Source 1 cannot be a constant.  */
16355   if (CONSTANT_P (src1))
16356     return false;
16357 
16358   /* Source 1 cannot be a non-matching memory.  */
16359   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16360     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
16361     return (code == AND
16362 	    && (mode == HImode
16363 		|| mode == SImode
16364 		|| (TARGET_64BIT && mode == DImode))
16365 	    && satisfies_constraint_L (src2));
16366 
16367   return true;
16368 }
16369 
16370 /* Attempt to expand a unary operator.  Make the expansion closer to the
16371    actual machine than just general_operand, which will allow 2 separate
16372    memory references (one output, one input) in a single insn.  */
16373 
16374 void
16375 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16376 			    rtx operands[])
16377 {
16378   int matching_memory;
16379   rtx src, dst, op, clob;
16380 
16381   dst = operands[0];
16382   src = operands[1];
16383 
16384   /* If the destination is memory, and we do not have matching source
16385      operands, do things in registers.  */
16386   matching_memory = 0;
16387   if (MEM_P (dst))
16388     {
16389       if (rtx_equal_p (dst, src))
16390 	matching_memory = 1;
16391       else
16392 	dst = gen_reg_rtx (mode);
16393     }
16394 
16395   /* When source operand is memory, destination must match.  */
16396   if (MEM_P (src) && !matching_memory)
16397     src = force_reg (mode, src);
16398 
16399   /* Emit the instruction.  */
16400 
16401   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16402   if (reload_in_progress || code == NOT)
16403     {
16404       /* Reload doesn't know about the flags register, and doesn't know that
16405          it doesn't want to clobber it.  */
16406       gcc_assert (code == NOT);
16407       emit_insn (op);
16408     }
16409   else
16410     {
16411       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16412       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16413     }
16414 
16415   /* Fix up the destination if needed.  */
16416   if (dst != operands[0])
16417     emit_move_insn (operands[0], dst);
16418 }
16419 
16420 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16421    divisor are within the range [0-255].  */
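/* Rough shape of the generated code (illustrative pseudo-assembly only):
       or    scratch, op2, op3
       test  scratch, 0xffffff00
       je    .Lqimode
       <full-width signed/unsigned divmod>
       jmp   .Lend
     .Lqimode:
       <8-bit unsigned divide; quotient in AL, remainder in AH>
     .Lend:  */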
16422 
16423 void
16424 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16425 		    bool signed_p)
16426 {
16427   rtx end_label, qimode_label;
16428   rtx insn, div, mod;
16429   rtx scratch, tmp0, tmp1, tmp2;
16430   rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16431   rtx (*gen_zero_extend) (rtx, rtx);
16432   rtx (*gen_test_ccno_1) (rtx, rtx);
16433 
16434   switch (mode)
16435     {
16436     case SImode:
16437       gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16438       gen_test_ccno_1 = gen_testsi_ccno_1;
16439       gen_zero_extend = gen_zero_extendqisi2;
16440       break;
16441     case DImode:
16442       gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16443       gen_test_ccno_1 = gen_testdi_ccno_1;
16444       gen_zero_extend = gen_zero_extendqidi2;
16445       break;
16446     default:
16447       gcc_unreachable ();
16448     }
16449 
16450   end_label = gen_label_rtx ();
16451   qimode_label = gen_label_rtx ();
16452 
16453   scratch = gen_reg_rtx (mode);
16454 
16455   /* Use 8bit unsigned divmod if dividend and divisor are within
16456      the range [0-255].  */
16457   emit_move_insn (scratch, operands[2]);
16458   scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16459 				 scratch, 1, OPTAB_DIRECT);
16460   emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16461   tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16462   tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16463   tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16464 			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16465 			       pc_rtx);
16466   insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16467   predict_jump (REG_BR_PROB_BASE * 50 / 100);
16468   JUMP_LABEL (insn) = qimode_label;
16469 
16470   /* Generate original signed/unsigned divmod.  */
16471   div = gen_divmod4_1 (operands[0], operands[1],
16472 		       operands[2], operands[3]);
16473   emit_insn (div);
16474 
16475   /* Branch to the end.  */
16476   emit_jump_insn (gen_jump (end_label));
16477   emit_barrier ();
16478 
16479   /* Generate 8bit unsigned divide.  */
16480   emit_label (qimode_label);
16481   /* Don't use operands[0] for result of 8bit divide since not all
16482      registers support QImode ZERO_EXTRACT.  */
16483   tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16484   tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16485   tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16486   emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16487 
16488   if (signed_p)
16489     {
16490       div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16491       mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16492     }
16493   else
16494     {
16495       div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16496       mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16497     }
16498 
16499   /* Extract remainder from AH.  */
16500   tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16501   if (REG_P (operands[1]))
16502     insn = emit_move_insn (operands[1], tmp1);
16503   else
16504     {
16505       /* Need a new scratch register since the old one has the result
16506 	 of the 8bit divide.  */
16507       scratch = gen_reg_rtx (mode);
16508       emit_move_insn (scratch, tmp1);
16509       insn = emit_move_insn (operands[1], scratch);
16510     }
16511   set_unique_reg_note (insn, REG_EQUAL, mod);
16512 
16513   /* Zero extend quotient from AL.  */
16514   tmp1 = gen_lowpart (QImode, tmp0);
16515   insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16516   set_unique_reg_note (insn, REG_EQUAL, div);
16517 
16518   emit_label (end_label);
16519 }
16520 
16521 #define LEA_MAX_STALL (3)
16522 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16523 
16524 /* Increase given DISTANCE in half-cycles according to
16525    dependencies between PREV and NEXT instructions.
16526    Add 1 half-cycle if there is no dependency and
16527    go to the next cycle if there is some dependency.  */
16528 
16529 static unsigned int
16530 increase_distance (rtx prev, rtx next, unsigned int distance)
16531 {
16532   df_ref *use_rec;
16533   df_ref *def_rec;
16534 
16535   if (!prev || !next)
16536     return distance + (distance & 1) + 2;
16537 
16538   if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16539     return distance + 1;
16540 
16541   for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16542     for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16543       if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16544 	  && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16545 	return distance + (distance & 1) + 2;
16546 
16547   return distance + 1;
16548 }
16549 
16550 /* Return true if instruction INSN defines register number
16551    REGNO1 or REGNO2.  */
16552 
16553 static bool
16554 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16555 		  rtx insn)
16556 {
16557   df_ref *def_rec;
16558 
16559   for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16560     if (DF_REF_REG_DEF_P (*def_rec)
16561 	&& !DF_REF_IS_ARTIFICIAL (*def_rec)
16562 	&& (regno1 == DF_REF_REGNO (*def_rec)
16563 	    || regno2 == DF_REF_REGNO (*def_rec)))
16564       {
16565 	return true;
16566       }
16567 
16568   return false;
16569 }
16570 
16571 /* Return true if instruction INSN uses register number
16572    REGNO as part of an address expression.  */
16573 
16574 static bool
16575 insn_uses_reg_mem (unsigned int regno, rtx insn)
16576 {
16577   df_ref *use_rec;
16578 
16579   for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16580     if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16581       return true;
16582 
16583   return false;
16584 }
16585 
16586 /* Search backward for non-agu definition of register number REGNO1
16587    or register number REGNO2 in basic block starting from instruction
16588    START up to head of basic block or instruction INSN.
16589 
16590    Set *FOUND to true if a definition was found and to false
16591    otherwise.
16592 
16593    The distance in half-cycles between START and the found instruction
16594    or the head of the BB is added to DISTANCE and returned.  */
16595 
16596 static int
16597 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16598 			       rtx insn, int distance,
16599 			       rtx start, bool *found)
16600 {
16601   basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16602   rtx prev = start;
16603   rtx next = NULL;
16604 
16605   *found = false;
16606 
16607   while (prev
16608 	 && prev != insn
16609 	 && distance < LEA_SEARCH_THRESHOLD)
16610     {
16611       if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16612 	{
16613 	  distance = increase_distance (prev, next, distance);
16614 	  if (insn_defines_reg (regno1, regno2, prev))
16615 	    {
16616 	      if (recog_memoized (prev) < 0
16617 		  || get_attr_type (prev) != TYPE_LEA)
16618 		{
16619 		  *found = true;
16620 		  return distance;
16621 		}
16622 	    }
16623 
16624 	  next = prev;
16625 	}
16626       if (prev == BB_HEAD (bb))
16627 	break;
16628 
16629       prev = PREV_INSN (prev);
16630     }
16631 
16632   return distance;
16633 }
16634 
16635 /* Search backward for non-agu definition of register number REGNO1
16636    or register number REGNO2 in INSN's basic block until
16637    1. Pass LEA_SEARCH_THRESHOLD instructions, or
16638    2. Reach neighbour BBs boundary, or
16639    3. Reach agu definition.
16640    Returns the distance between the non-agu definition point and INSN.
16641    If no definition point, returns -1.  */
16642 
16643 static int
16644 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16645 			 rtx insn)
16646 {
16647   basic_block bb = BLOCK_FOR_INSN (insn);
16648   int distance = 0;
16649   bool found = false;
16650 
16651   if (insn != BB_HEAD (bb))
16652     distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16653 					      distance, PREV_INSN (insn),
16654 					      &found);
16655 
16656   if (!found && distance < LEA_SEARCH_THRESHOLD)
16657     {
16658       edge e;
16659       edge_iterator ei;
16660       bool simple_loop = false;
16661 
16662       FOR_EACH_EDGE (e, ei, bb->preds)
16663 	if (e->src == bb)
16664 	  {
16665 	    simple_loop = true;
16666 	    break;
16667 	  }
16668 
16669       if (simple_loop)
16670 	distance = distance_non_agu_define_in_bb (regno1, regno2,
16671 						  insn, distance,
16672 						  BB_END (bb), &found);
16673       else
16674 	{
16675 	  int shortest_dist = -1;
16676 	  bool found_in_bb = false;
16677 
16678 	  FOR_EACH_EDGE (e, ei, bb->preds)
16679 	    {
16680 	      int bb_dist
16681 		= distance_non_agu_define_in_bb (regno1, regno2,
16682 						 insn, distance,
16683 						 BB_END (e->src),
16684 						 &found_in_bb);
16685 	      if (found_in_bb)
16686 		{
16687 		  if (shortest_dist < 0)
16688 		    shortest_dist = bb_dist;
16689 		  else if (bb_dist > 0)
16690 		    shortest_dist = MIN (bb_dist, shortest_dist);
16691 
16692 		  found = true;
16693 		}
16694 	    }
16695 
16696 	  distance = shortest_dist;
16697 	}
16698     }
16699 
16700   /* get_attr_type may modify recog data.  We want to make sure
16701      that recog data is valid for instruction INSN, on which
16702      distance_non_agu_define is called.  INSN is unchanged here.  */
16703   extract_insn_cached (insn);
16704 
16705   if (!found)
16706     return -1;
16707 
16708   return distance >> 1;
16709 }
16710 
16711 /* Return the distance in half-cycles between INSN and the next
16712    insn that uses register number REGNO in a memory address, added
16713    to DISTANCE.  Return -1 if REGNO is set.
16714 
16715    Set *FOUND to true if a register usage was found and
16716    to false otherwise.
16717    Set *REDEFINED to true if a register redefinition was
16718    found and to false otherwise.  */
16719 
16720 static int
16721 distance_agu_use_in_bb (unsigned int regno,
16722 			rtx insn, int distance, rtx start,
16723 			bool *found, bool *redefined)
16724 {
16725   basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16726   rtx next = start;
16727   rtx prev = NULL;
16728 
16729   *found = false;
16730   *redefined = false;
16731 
16732   while (next
16733 	 && next != insn
16734 	 && distance < LEA_SEARCH_THRESHOLD)
16735     {
16736       if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16737 	{
16738 	  distance = increase_distance(prev, next, distance);
16739 	  if (insn_uses_reg_mem (regno, next))
16740 	    {
16741 	      /* Return DISTANCE if OP0 is used in memory
16742 		 address in NEXT.  */
16743 	      *found = true;
16744 	      return distance;
16745 	    }
16746 
16747 	  if (insn_defines_reg (regno, INVALID_REGNUM, next))
16748 	    {
16749 	      /* Return -1 if OP0 is set in NEXT.  */
16750 	      *redefined = true;
16751 	      return -1;
16752 	    }
16753 
16754 	  prev = next;
16755 	}
16756 
16757       if (next == BB_END (bb))
16758 	break;
16759 
16760       next = NEXT_INSN (next);
16761     }
16762 
16763   return distance;
16764 }
16765 
16766 /* Return the distance between INSN and the next insn that uses
16767    register number REGNO0 in a memory address.  Return -1 if no such
16768    use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
16769 
16770 static int
16771 distance_agu_use (unsigned int regno0, rtx insn)
16772 {
16773   basic_block bb = BLOCK_FOR_INSN (insn);
16774   int distance = 0;
16775   bool found = false;
16776   bool redefined = false;
16777 
16778   if (insn != BB_END (bb))
16779     distance = distance_agu_use_in_bb (regno0, insn, distance,
16780 				       NEXT_INSN (insn),
16781 				       &found, &redefined);
16782 
16783   if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16784     {
16785       edge e;
16786       edge_iterator ei;
16787       bool simple_loop = false;
16788 
16789       FOR_EACH_EDGE (e, ei, bb->succs)
16790         if (e->dest == bb)
16791 	  {
16792 	    simple_loop = true;
16793 	    break;
16794 	  }
16795 
16796       if (simple_loop)
16797 	distance = distance_agu_use_in_bb (regno0, insn,
16798 					   distance, BB_HEAD (bb),
16799 					   &found, &redefined);
16800       else
16801 	{
16802 	  int shortest_dist = -1;
16803 	  bool found_in_bb = false;
16804 	  bool redefined_in_bb = false;
16805 
16806 	  FOR_EACH_EDGE (e, ei, bb->succs)
16807 	    {
16808 	      int bb_dist
16809 		= distance_agu_use_in_bb (regno0, insn,
16810 					  distance, BB_HEAD (e->dest),
16811 					  &found_in_bb, &redefined_in_bb);
16812 	      if (found_in_bb)
16813 		{
16814 		  if (shortest_dist < 0)
16815 		    shortest_dist = bb_dist;
16816 		  else if (bb_dist > 0)
16817 		    shortest_dist = MIN (bb_dist, shortest_dist);
16818 
16819 		  found = true;
16820 		}
16821 	    }
16822 
16823 	  distance = shortest_dist;
16824 	}
16825     }
16826 
16827   if (!found || redefined)
16828     return -1;
16829 
16830   return distance >> 1;
16831 }
16832 
16833 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16834    there is a dilemma of choosing LEA or ADD.
16835    Negative value: ADD is preferred over LEA.
16836    Zero: Neutral.
16837    Positive value: LEA is preferred over ADD.  */
16838 #define IX86_LEA_PRIORITY 0
16839 
16840 /* Return true if using the lea INSN has a performance advantage
16841    over a sequence of instructions.  The instruction sequence has
16842    SPLIT_COST cycles higher latency than the lea latency.  */
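/* For instance (illustrative, assuming SPLIT_COST is 0): if the non-AGU
   definition of an input is 2 distance units back and the next AGU use
   of the result is 4 units ahead, then dist_define (2) < dist_use (4),
   so lea does not outperform the split sequence and we return false.  */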
16843 
16844 static bool
16845 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16846 		      unsigned int regno2, int split_cost)
16847 {
16848   int dist_define, dist_use;
16849 
16850   dist_define = distance_non_agu_define (regno1, regno2, insn);
16851   dist_use = distance_agu_use (regno0, insn);
16852 
16853   if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16854     {
16855       /* If there is no non-AGU operand definition, no AGU
16856 	 operand usage and the split cost is 0, then both the lea
16857 	 and non-lea variants have the same priority.  Currently
16858 	 we prefer lea for 64-bit code and non-lea for 32-bit
16859 	 code.  */
16860       if (dist_use < 0 && split_cost == 0)
16861 	return TARGET_64BIT || IX86_LEA_PRIORITY;
16862       else
16863 	return true;
16864     }
16865 
16866   /* With a longer definition distance, lea is preferable.
16867      Here we adjust the distance to take the splitting cost and
16868      the lea priority into account.  */
16869   dist_define += split_cost + IX86_LEA_PRIORITY;
16870 
16871   /* If there is no use in a memory address then we just check
16872      that the split cost does not exceed the AGU stall.  */
16873   if (dist_use < 0)
16874     return dist_define >= LEA_MAX_STALL;
16875 
16876   /* If this insn has both backward non-agu dependence and forward
16877      agu dependence, the one with the shorter distance takes effect.  */
16878   return dist_define >= dist_use;
16879 }
16880 
16881 /* Return true if it is legal to clobber flags by INSN and
16882    false otherwise.  */
16883 
16884 static bool
16885 ix86_ok_to_clobber_flags (rtx insn)
16886 {
16887   basic_block bb = BLOCK_FOR_INSN (insn);
16888   df_ref *use;
16889   bitmap live;
16890 
16891   while (insn)
16892     {
16893       if (NONDEBUG_INSN_P (insn))
16894 	{
16895 	  for (use = DF_INSN_USES (insn); *use; use++)
16896 	    if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16897 	      return false;
16898 
16899 	  if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16900 	    return true;
16901 	}
16902 
16903       if (insn == BB_END (bb))
16904 	break;
16905 
16906       insn = NEXT_INSN (insn);
16907     }
16908 
16909   live = df_get_live_out(bb);
16910   return !REGNO_REG_SET_P (live, FLAGS_REG);
16911 }
16912 
16913 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16914    move and add to avoid AGU stalls.  */
16915 
16916 bool
16917 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16918 {
16919   unsigned int regno0 = true_regnum (operands[0]);
16920   unsigned int regno1 = true_regnum (operands[1]);
16921   unsigned int regno2 = true_regnum (operands[2]);
16922 
16923   /* Check if we need to optimize.  */
16924   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16925     return false;
16926 
16927   /* Check it is correct to split here.  */
16928   if (!ix86_ok_to_clobber_flags(insn))
16929     return false;
16930 
16931   /* We only need to split adds whose destination operand does not
16932      match either source (the non-destructive form).  */
16933   if (regno0 == regno1 || regno0 == regno2)
16934     return false;
16935   else
16936     return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16937 }
16938 
16939 /* Return true if we should emit lea instruction instead of mov
16940    instruction.  */
16941 
16942 bool
16943 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16944 {
16945   unsigned int regno0;
16946   unsigned int regno1;
16947 
16948   /* Check if we need to optimize.  */
16949   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16950     return false;
16951 
16952   /* Use lea for reg to reg moves only.  */
16953   if (!REG_P (operands[0]) || !REG_P (operands[1]))
16954     return false;
16955 
16956   regno0 = true_regnum (operands[0]);
16957   regno1 = true_regnum (operands[1]);
16958 
16959   return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
16960 }
16961 
16962 /* Return true if we need to split the lea into a sequence of
16963    instructions to avoid AGU stalls.  */
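/* As an illustration (not exhaustive): for "lea r0, [r1 + r2 + 4]" with
   r0 distinct from both r1 and r2, the split cost computed below is
   1 (mov) + 1 (add index) + 1 (add disp) - 1 (saved lea) = 2.  */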
16964 
16965 bool
16966 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16967 {
16968   unsigned int regno0 = true_regnum (operands[0]) ;
16969   unsigned int regno1 = INVALID_REGNUM;
16970   unsigned int regno2 = INVALID_REGNUM;
16971   int split_cost = 0;
16972   struct ix86_address parts;
16973   int ok;
16974 
16975   /* FIXME: Handle zero-extended addresses.  */
16976   if (GET_CODE (operands[1]) == ZERO_EXTEND
16977       || GET_CODE (operands[1]) == AND)
16978     return false;
16979 
16980   /* Check if we need to optimize.  */
16981   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16982     return false;
16983 
16984   /* Check it is correct to split here.  */
16985   if (!ix86_ok_to_clobber_flags(insn))
16986     return false;
16987 
16988   ok = ix86_decompose_address (operands[1], &parts);
16989   gcc_assert (ok);
16990 
16991   /* There should be at least two components in the address.  */
16992   if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
16993       + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
16994     return false;
16995 
16996   /* We should not split into an add if a non-legitimate PIC
16997      operand is used as the displacement.  */
16998   if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16999     return false;
17000 
17001   if (parts.base)
17002     regno1 = true_regnum (parts.base);
17003   if (parts.index)
17004     regno2 = true_regnum (parts.index);
17005 
17006   /* Compute how many cycles we will add to execution time
17007      if we split the lea into a sequence of instructions.  */
17008   if (parts.base || parts.index)
17009     {
17010       /* Have to use a mov instruction if the non-destructive
17011 	 destination form is used.  */
17012       if (regno1 != regno0 && regno2 != regno0)
17013 	split_cost += 1;
17014 
17015       /* Have to add index to base if both exist.  */
17016       if (parts.base && parts.index)
17017 	split_cost += 1;
17018 
17019       /* Have to use shift and adds if scale is 2 or greater.  */
17020       if (parts.scale > 1)
17021 	{
17022 	  if (regno0 != regno1)
17023 	    split_cost += 1;
17024 	  else if (regno2 == regno0)
17025 	    split_cost += 4;
17026 	  else
17027 	    split_cost += parts.scale;
17028 	}
17029 
17030       /* Have to use add instruction with immediate if
17031 	 disp is nonzero.  */
17032       if (parts.disp && parts.disp != const0_rtx)
17033 	split_cost += 1;
17034 
17035       /* Subtract the price of lea.  */
17036       split_cost -= 1;
17037     }
17038 
17039   return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17040 }
17041 
17042 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17043    matches destination.  RTX includes clobber of FLAGS_REG.  */
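/* For example, ix86_emit_binop (PLUS, SImode, r0, r1) emits
   (parallel [(set r0 (plus:SI r0 r1))
              (clobber (reg:CC FLAGS_REG))]).  */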
17044 
17045 static void
17046 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17047 		 rtx dst, rtx src)
17048 {
17049   rtx op, clob;
17050 
17051   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17052   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17053 
17054   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17055 }
17056 
17057 /* Split lea instructions into a sequence of instructions
17058    which are executed on the ALU to avoid AGU stalls.
17059    It is assumed that it is allowed to clobber the flags register
17060    at the lea position.  */
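/* Illustrative example only: "lea r0, [r1 + r2*4 + 8]" with r0 distinct
   from r1 and r2 may be split into
       mov r0, r2
       shl r0, 2
       add r0, r1
       add r0, 8
   all of which execute on the ALU.  */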
17061 
17062 extern void
17063 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
17064 {
17065   unsigned int regno0 = true_regnum (operands[0]) ;
17066   unsigned int regno1 = INVALID_REGNUM;
17067   unsigned int regno2 = INVALID_REGNUM;
17068   struct ix86_address parts;
17069   rtx tmp;
17070   int ok, adds;
17071 
17072   ok = ix86_decompose_address (operands[1], &parts);
17073   gcc_assert (ok);
17074 
17075   if (parts.base)
17076     {
17077       if (GET_MODE (parts.base) != mode)
17078 	parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
17079       regno1 = true_regnum (parts.base);
17080     }
17081 
17082   if (parts.index)
17083     {
17084       if (GET_MODE (parts.index) != mode)
17085 	parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
17086       regno2 = true_regnum (parts.index);
17087     }
17088 
17089   if (parts.scale > 1)
17090     {
17091       /* Case r1 = r1 + ...  */
17092       if (regno1 == regno0)
17093 	{
17094 	  /* If we have a case r1 = r1 + C * r1 then we
17095 	     should use multiplication which is very
17096 	     expensive.  Assume the cost model is wrong if we
17097 	     reach such a case here.  */
17098 	  gcc_assert (regno2 != regno0);
17099 
17100 	  for (adds = parts.scale; adds > 0; adds--)
17101 	    ix86_emit_binop (PLUS, mode, operands[0], parts.index);
17102 	}
17103       else
17104 	{
17105 	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
17106 	  if (regno0 != regno2)
17107 	    emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17108 
17109 	  /* Use shift for scaling.  */
17110 	  ix86_emit_binop (ASHIFT, mode, operands[0],
17111 			   GEN_INT (exact_log2 (parts.scale)));
17112 
17113 	  if (parts.base)
17114 	    ix86_emit_binop (PLUS, mode, operands[0], parts.base);
17115 
17116 	  if (parts.disp && parts.disp != const0_rtx)
17117 	    ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17118 	}
17119     }
17120   else if (!parts.base && !parts.index)
17121     {
17122       gcc_assert(parts.disp);
17123       emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
17124     }
17125   else
17126     {
17127       if (!parts.base)
17128 	{
17129 	  if (regno0 != regno2)
17130 	    emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17131 	}
17132       else if (!parts.index)
17133 	{
17134 	  if (regno0 != regno1)
17135 	    emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17136 	}
17137       else
17138 	{
17139 	  if (regno0 == regno1)
17140 	    tmp = parts.index;
17141 	  else if (regno0 == regno2)
17142 	    tmp = parts.base;
17143 	  else
17144 	    {
17145 	      emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17146 	      tmp = parts.index;
17147 	    }
17148 
17149 	  ix86_emit_binop (PLUS, mode, operands[0], tmp);
17150 	}
17151 
17152       if (parts.disp && parts.disp != const0_rtx)
17153 	ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17154     }
17155 }
17156 
17157 /* Return true if it is ok to optimize an ADD operation to LEA
17158    operation to avoid flag register consumption.  For most processors,
17159    ADD is faster than LEA.  For processors like ATOM, if the
17160    destination register of the LEA holds an actual address which will be
17161    used soon, LEA is better; otherwise ADD is better.  */
17162 
17163 bool
17164 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17165 {
17166   unsigned int regno0 = true_regnum (operands[0]);
17167   unsigned int regno1 = true_regnum (operands[1]);
17168   unsigned int regno2 = true_regnum (operands[2]);
17169 
17170   /* If a = b + c (a != b && a != c), we must use the lea form.  */
17171   if (regno0 != regno1 && regno0 != regno2)
17172     return true;
17173 
17174   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17175     return false;
17176 
17177   return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17178 }
17179 
17180 /* Return true if destination reg of SET_BODY is shift count of
17181    USE_BODY.  */
17182 
17183 static bool
17184 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17185 {
17186   rtx set_dest;
17187   rtx shift_rtx;
17188   int i;
17189 
17190   /* Retrieve destination of SET_BODY.  */
17191   switch (GET_CODE (set_body))
17192     {
17193     case SET:
17194       set_dest = SET_DEST (set_body);
17195       if (!set_dest || !REG_P (set_dest))
17196 	return false;
17197       break;
17198     case PARALLEL:
17199       for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17200 	if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17201 					  use_body))
17202 	  return true;
17203     default:
17204       return false;
17205       break;
17206     }
17207 
17208   /* Retrieve shift count of USE_BODY.  */
17209   switch (GET_CODE (use_body))
17210     {
17211     case SET:
17212       shift_rtx = XEXP (use_body, 1);
17213       break;
17214     case PARALLEL:
17215       for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17216 	if (ix86_dep_by_shift_count_body (set_body,
17217 					  XVECEXP (use_body, 0, i)))
17218 	  return true;
17219     default:
17220       return false;
17221       break;
17222     }
17223 
17224   if (shift_rtx
17225       && (GET_CODE (shift_rtx) == ASHIFT
17226 	  || GET_CODE (shift_rtx) == LSHIFTRT
17227 	  || GET_CODE (shift_rtx) == ASHIFTRT
17228 	  || GET_CODE (shift_rtx) == ROTATE
17229 	  || GET_CODE (shift_rtx) == ROTATERT))
17230     {
17231       rtx shift_count = XEXP (shift_rtx, 1);
17232 
17233       /* Return true if shift count is dest of SET_BODY.  */
17234       if (REG_P (shift_count)
17235 	  && true_regnum (set_dest) == true_regnum (shift_count))
17236 	return true;
17237     }
17238 
17239   return false;
17240 }
17241 
17242 /* Return true if destination reg of SET_INSN is shift count of
17243    USE_INSN.  */
17244 
17245 bool
17246 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17247 {
17248   return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17249 				       PATTERN (use_insn));
17250 }
17251 
17252 /* Return TRUE or FALSE depending on whether the unary operator meets the
17253    appropriate constraints.  */
17254 
17255 bool
17256 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17257 			enum machine_mode mode ATTRIBUTE_UNUSED,
17258 			rtx operands[2] ATTRIBUTE_UNUSED)
17259 {
17260   /* If one of operands is memory, source and destination must match.  */
17261   if ((MEM_P (operands[0])
17262        || MEM_P (operands[1]))
17263       && ! rtx_equal_p (operands[0], operands[1]))
17264     return false;
17265   return true;
17266 }
17267 
17268 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17269    are ok, keeping in mind the possible movddup alternative.  */
17270 
17271 bool
17272 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17273 {
17274   if (MEM_P (operands[0]))
17275     return rtx_equal_p (operands[0], operands[1 + high]);
17276   if (MEM_P (operands[1]) && MEM_P (operands[2]))
17277     return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17278   return true;
17279 }
17280 
17281 /* Post-reload splitter for converting an SF or DFmode value in an
17282    SSE register into an unsigned SImode.  */
17283 
17284 void
17285 ix86_split_convert_uns_si_sse (rtx operands[])
17286 {
17287   enum machine_mode vecmode;
17288   rtx value, large, zero_or_two31, input, two31, x;
17289 
17290   large = operands[1];
17291   zero_or_two31 = operands[2];
17292   input = operands[3];
17293   two31 = operands[4];
17294   vecmode = GET_MODE (large);
17295   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17296 
17297   /* Load up the value into the low element.  We must ensure that the other
17298      elements are valid floats -- zero is the easiest such value.  */
17299   if (MEM_P (input))
17300     {
17301       if (vecmode == V4SFmode)
17302 	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17303       else
17304 	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17305     }
17306   else
17307     {
17308       input = gen_rtx_REG (vecmode, REGNO (input));
17309       emit_move_insn (value, CONST0_RTX (vecmode));
17310       if (vecmode == V4SFmode)
17311 	emit_insn (gen_sse_movss (value, value, input));
17312       else
17313 	emit_insn (gen_sse2_movsd (value, value, input));
17314     }
17315 
17316   emit_move_insn (large, two31);
17317   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17318 
17319   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17320   emit_insn (gen_rtx_SET (VOIDmode, large, x));
17321 
17322   x = gen_rtx_AND (vecmode, zero_or_two31, large);
17323   emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17324 
17325   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17326   emit_insn (gen_rtx_SET (VOIDmode, value, x));
17327 
17328   large = gen_rtx_REG (V4SImode, REGNO (large));
17329   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17330 
17331   x = gen_rtx_REG (V4SImode, REGNO (value));
17332   if (vecmode == V4SFmode)
17333     emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17334   else
17335     emit_insn (gen_sse2_cvttpd2dq (x, value));
17336   value = x;
17337 
17338   emit_insn (gen_xorv4si3 (value, value, large));
17339 }
17340 
17341 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17342    Expects the 64-bit DImode to be supplied in a pair of integral
17343    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
17344    -mfpmath=sse, !optimize_size only.  */
17345 
17346 void
17347 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17348 {
17349   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17350   rtx int_xmm, fp_xmm;
17351   rtx biases, exponents;
17352   rtx x;
17353 
17354   int_xmm = gen_reg_rtx (V4SImode);
17355   if (TARGET_INTER_UNIT_MOVES)
17356     emit_insn (gen_movdi_to_sse (int_xmm, input));
17357   else if (TARGET_SSE_SPLIT_REGS)
17358     {
17359       emit_clobber (int_xmm);
17360       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17361     }
17362   else
17363     {
17364       x = gen_reg_rtx (V2DImode);
17365       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17366       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17367     }
17368 
17369   x = gen_rtx_CONST_VECTOR (V4SImode,
17370 			    gen_rtvec (4, GEN_INT (0x43300000UL),
17371 				       GEN_INT (0x45300000UL),
17372 				       const0_rtx, const0_rtx));
17373   exponents = validize_mem (force_const_mem (V4SImode, x));
17374 
17375   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17376   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17377 
17378   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17379      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17380      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17381      (0x1.0p84 + double(fp_value_hi_xmm)).
17382      Note these exponents differ by 32.  */
17383 
17384   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17385 
17386   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17387      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
17388   real_ldexp (&bias_lo_rvt, &dconst1, 52);
17389   real_ldexp (&bias_hi_rvt, &dconst1, 84);
17390   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17391   x = const_double_from_real_value (bias_hi_rvt, DFmode);
17392   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17393   biases = validize_mem (force_const_mem (V2DFmode, biases));
17394   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17395 
17396   /* Add the upper and lower DFmode values together.  */
17397   if (TARGET_SSE3)
17398     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17399   else
17400     {
17401       x = copy_to_mode_reg (V2DFmode, fp_xmm);
17402       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17403       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17404     }
17405 
17406   ix86_expand_vector_extract (false, target, fp_xmm, 0);
17407 }
17408 
17409 /* Not used, but eases macroization of patterns.  */
17410 void
17411 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17412 				  rtx input ATTRIBUTE_UNUSED)
17413 {
17414   gcc_unreachable ();
17415 }
17416 
17417 /* Convert an unsigned SImode value into a DFmode.  Only currently used
17418    for SSE, but applicable anywhere.  */
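/* The sequence below amounts to computing
       (double) (int) (x - 0x80000000u) + 0x1p31
   which is exact for every 32-bit unsigned value X.  */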
17419 
17420 void
17421 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17422 {
17423   REAL_VALUE_TYPE TWO31r;
17424   rtx x, fp;
17425 
17426   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17427 			   NULL, 1, OPTAB_DIRECT);
17428 
17429   fp = gen_reg_rtx (DFmode);
17430   emit_insn (gen_floatsidf2 (fp, x));
17431 
17432   real_ldexp (&TWO31r, &dconst1, 31);
17433   x = const_double_from_real_value (TWO31r, DFmode);
17434 
17435   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17436   if (x != target)
17437     emit_move_insn (target, x);
17438 }
17439 
17440 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
17441    32-bit mode; otherwise we have a direct convert instruction.  */
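/* Conceptually the result is assembled as
       (double) (signed) high32 * 0x1p32 + (double) (unsigned) low32
   where the unsigned low half goes through
   ix86_expand_convert_uns_sidf_sse.  */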
17442 
17443 void
17444 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17445 {
17446   REAL_VALUE_TYPE TWO32r;
17447   rtx fp_lo, fp_hi, x;
17448 
17449   fp_lo = gen_reg_rtx (DFmode);
17450   fp_hi = gen_reg_rtx (DFmode);
17451 
17452   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17453 
17454   real_ldexp (&TWO32r, &dconst1, 32);
17455   x = const_double_from_real_value (TWO32r, DFmode);
17456   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17457 
17458   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17459 
17460   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17461 			   0, OPTAB_DIRECT);
17462   if (x != target)
17463     emit_move_insn (target, x);
17464 }
17465 
17466 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17467    For x86_32, -mfpmath=sse, !optimize_size only.  */
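/* The conversion below amounts to
       (float) (x >> 16) * 0x1p16f + (float) (x & 0xffff)
   where both 16-bit halves are exactly representable in SFmode.  */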
17468 void
17469 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17470 {
17471   REAL_VALUE_TYPE ONE16r;
17472   rtx fp_hi, fp_lo, int_hi, int_lo, x;
17473 
17474   real_ldexp (&ONE16r, &dconst1, 16);
17475   x = const_double_from_real_value (ONE16r, SFmode);
17476   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17477 				      NULL, 0, OPTAB_DIRECT);
17478   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17479 				      NULL, 0, OPTAB_DIRECT);
17480   fp_hi = gen_reg_rtx (SFmode);
17481   fp_lo = gen_reg_rtx (SFmode);
17482   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17483   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17484   fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17485 			       0, OPTAB_DIRECT);
17486   fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17487 			       0, OPTAB_DIRECT);
17488   if (!rtx_equal_p (target, fp_hi))
17489     emit_move_insn (target, fp_hi);
17490 }
17491 
17492 /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
17493    a vector of unsigned ints VAL to a vector of floats TARGET.  */
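/* This is the element-wise analogue of ix86_expand_convert_uns_sisf_sse:
   each lane is split into its low and high 16 bits, both halves are
   converted with the signed vector conversion, and the high half is
   scaled by 0x1p16 before the final addition.  */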
17494 
17495 void
17496 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17497 {
17498   rtx tmp[8];
17499   REAL_VALUE_TYPE TWO16r;
17500   enum machine_mode intmode = GET_MODE (val);
17501   enum machine_mode fltmode = GET_MODE (target);
17502   rtx (*cvt) (rtx, rtx);
17503 
17504   if (intmode == V4SImode)
17505     cvt = gen_floatv4siv4sf2;
17506   else
17507     cvt = gen_floatv8siv8sf2;
17508   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17509   tmp[0] = force_reg (intmode, tmp[0]);
17510   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17511 				OPTAB_DIRECT);
17512   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17513 				NULL_RTX, 1, OPTAB_DIRECT);
17514   tmp[3] = gen_reg_rtx (fltmode);
17515   emit_insn (cvt (tmp[3], tmp[1]));
17516   tmp[4] = gen_reg_rtx (fltmode);
17517   emit_insn (cvt (tmp[4], tmp[2]));
17518   real_ldexp (&TWO16r, &dconst1, 16);
17519   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17520   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17521   tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17522 				OPTAB_DIRECT);
17523   tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17524 				OPTAB_DIRECT);
17525   if (tmp[7] != target)
17526     emit_move_insn (target, tmp[7]);
17527 }
17528 
17529 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17530    pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17531    This is done by doing just signed conversion if < 0x1p31, and otherwise by
17532    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
17533 
17534 rtx
17535 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17536 {
17537   REAL_VALUE_TYPE TWO31r;
17538   rtx two31r, tmp[4];
17539   enum machine_mode mode = GET_MODE (val);
17540   enum machine_mode scalarmode = GET_MODE_INNER (mode);
17541   enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17542   rtx (*cmp) (rtx, rtx, rtx, rtx);
17543   int i;
17544 
17545   for (i = 0; i < 3; i++)
17546     tmp[i] = gen_reg_rtx (mode);
17547   real_ldexp (&TWO31r, &dconst1, 31);
17548   two31r = const_double_from_real_value (TWO31r, scalarmode);
17549   two31r = ix86_build_const_vector (mode, 1, two31r);
17550   two31r = force_reg (mode, two31r);
17551   switch (mode)
17552     {
17553     case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17554     case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17555     case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17556     case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17557     default: gcc_unreachable ();
17558     }
17559   tmp[3] = gen_rtx_LE (mode, two31r, val);
17560   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17561   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17562 				0, OPTAB_DIRECT);
17563   if (intmode == V4SImode || TARGET_AVX2)
17564     *xorp = expand_simple_binop (intmode, ASHIFT,
17565 				 gen_lowpart (intmode, tmp[0]),
17566 				 GEN_INT (31), NULL_RTX, 0,
17567 				 OPTAB_DIRECT);
17568   else
17569     {
17570       rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17571       two31 = ix86_build_const_vector (intmode, 1, two31);
17572       *xorp = expand_simple_binop (intmode, AND,
17573 				   gen_lowpart (intmode, tmp[0]),
17574 				   two31, NULL_RTX, 0,
17575 				   OPTAB_DIRECT);
17576     }
17577   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17578 			      0, OPTAB_DIRECT);
17579 }
17580 
17581 /* A subroutine of ix86_build_signbit_mask.  If VECT is true,
17582    then replicate the value for all elements of the vector
17583    register.  */
17584 
17585 rtx
17586 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17587 {
17588   int i, n_elt;
17589   rtvec v;
17590   enum machine_mode scalar_mode;
17591 
17592   switch (mode)
17593     {
17594     case V32QImode:
17595     case V16QImode:
17596     case V16HImode:
17597     case V8HImode:
17598     case V8SImode:
17599     case V4SImode:
17600     case V4DImode:
17601     case V2DImode:
17602       gcc_assert (vect);
17603     case V8SFmode:
17604     case V4SFmode:
17605     case V4DFmode:
17606     case V2DFmode:
17607       n_elt = GET_MODE_NUNITS (mode);
17608       v = rtvec_alloc (n_elt);
17609       scalar_mode = GET_MODE_INNER (mode);
17610 
17611       RTVEC_ELT (v, 0) = value;
17612 
17613       for (i = 1; i < n_elt; ++i)
17614 	RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17615 
17616       return gen_rtx_CONST_VECTOR (mode, v);
17617 
17618     default:
17619       gcc_unreachable ();
17620     }
17621 }
17622 
17623 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17624    and ix86_expand_int_vcond.  Create a mask for the sign bit in MODE
17625    for an SSE register.  If VECT is true, then replicate the mask for
17626    all elements of the vector register.  If INVERT is true, then create
17627    a mask excluding the sign bit.  */
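/* For example, for V4SFmode with VECT true and INVERT false this yields
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } viewed as floats;
   with INVERT true each element is instead 0x7fffffff.  */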
17628 
17629 rtx
17630 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17631 {
17632   enum machine_mode vec_mode, imode;
17633   HOST_WIDE_INT hi, lo;
17634   int shift = 63;
17635   rtx v;
17636   rtx mask;
17637 
17638   /* Find the sign bit, sign extended to 2*HWI.  */
17639   switch (mode)
17640     {
17641     case V8SImode:
17642     case V4SImode:
17643     case V8SFmode:
17644     case V4SFmode:
17645       vec_mode = mode;
17646       mode = GET_MODE_INNER (mode);
17647       imode = SImode;
17648       lo = 0x80000000, hi = lo < 0;
17649       break;
17650 
17651     case V4DImode:
17652     case V2DImode:
17653     case V4DFmode:
17654     case V2DFmode:
17655       vec_mode = mode;
17656       mode = GET_MODE_INNER (mode);
17657       imode = DImode;
17658       if (HOST_BITS_PER_WIDE_INT >= 64)
17659 	lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17660       else
17661 	lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17662       break;
17663 
17664     case TImode:
17665     case TFmode:
17666       vec_mode = VOIDmode;
17667       if (HOST_BITS_PER_WIDE_INT >= 64)
17668 	{
17669 	  imode = TImode;
17670 	  lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17671 	}
17672       else
17673 	{
17674 	  rtvec vec;
17675 
17676 	  imode = DImode;
17677 	  lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17678 
17679 	  if (invert)
17680 	    {
17681 	      lo = ~lo, hi = ~hi;
17682 	      v = constm1_rtx;
17683 	    }
17684 	  else
17685 	    v = const0_rtx;
17686 
17687 	  mask = immed_double_const (lo, hi, imode);
17688 
17689 	  vec = gen_rtvec (2, v, mask);
17690 	  v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17691 	  v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17692 
17693 	  return v;
17694 	}
17695      break;
17696 
17697     default:
17698       gcc_unreachable ();
17699     }
17700 
17701   if (invert)
17702     lo = ~lo, hi = ~hi;
17703 
17704   /* Force this value into the low part of a fp vector constant.  */
17705   mask = immed_double_const (lo, hi, imode);
17706   mask = gen_lowpart (mode, mask);
17707 
17708   if (vec_mode == VOIDmode)
17709     return force_reg (mode, mask);
17710 
17711   v = ix86_build_const_vector (vec_mode, vect, mask);
17712   return force_reg (vec_mode, v);
17713 }
17714 
17715 /* Generate code for floating point ABS or NEG.  */
17716 
17717 void
17718 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17719 				rtx operands[])
17720 {
17721   rtx mask, set, dst, src;
17722   bool use_sse = false;
17723   bool vector_mode = VECTOR_MODE_P (mode);
17724   enum machine_mode vmode = mode;
17725 
17726   if (vector_mode)
17727     use_sse = true;
17728   else if (mode == TFmode)
17729     use_sse = true;
17730   else if (TARGET_SSE_MATH)
17731     {
17732       use_sse = SSE_FLOAT_MODE_P (mode);
17733       if (mode == SFmode)
17734 	vmode = V4SFmode;
17735       else if (mode == DFmode)
17736 	vmode = V2DFmode;
17737     }
17738 
17739   /* NEG and ABS performed with SSE use bitwise mask operations.
17740      Create the appropriate mask now.  */
17741   if (use_sse)
17742     mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17743   else
17744     mask = NULL_RTX;
17745 
17746   dst = operands[0];
17747   src = operands[1];
17748 
17749   set = gen_rtx_fmt_e (code, mode, src);
17750   set = gen_rtx_SET (VOIDmode, dst, set);
17751 
17752   if (mask)
17753     {
17754       rtx use, clob;
17755       rtvec par;
17756 
17757       use = gen_rtx_USE (VOIDmode, mask);
17758       if (vector_mode)
17759 	par = gen_rtvec (2, set, use);
17760       else
17761 	{
17762           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17763 	  par = gen_rtvec (3, set, use, clob);
17764         }
17765       emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17766     }
17767   else
17768     emit_insn (set);
17769 }
17770 
17771 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
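/* Conceptually, copysign (x, y) = (x & ~SIGNMASK) | (y & SIGNMASK); the
   masks built here and the split routines below carry out exactly that
   bitwise combination in SSE registers.  */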
17772 
17773 void
17774 ix86_expand_copysign (rtx operands[])
17775 {
17776   enum machine_mode mode, vmode;
17777   rtx dest, op0, op1, mask, nmask;
17778 
17779   dest = operands[0];
17780   op0 = operands[1];
17781   op1 = operands[2];
17782 
17783   mode = GET_MODE (dest);
17784 
17785   if (mode == SFmode)
17786     vmode = V4SFmode;
17787   else if (mode == DFmode)
17788     vmode = V2DFmode;
17789   else
17790     vmode = mode;
17791 
17792   if (GET_CODE (op0) == CONST_DOUBLE)
17793     {
17794       rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17795 
17796       if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17797 	op0 = simplify_unary_operation (ABS, mode, op0, mode);
17798 
17799       if (mode == SFmode || mode == DFmode)
17800 	{
17801 	  if (op0 == CONST0_RTX (mode))
17802 	    op0 = CONST0_RTX (vmode);
17803 	  else
17804 	    {
17805 	      rtx v = ix86_build_const_vector (vmode, false, op0);
17806 
17807 	      op0 = force_reg (vmode, v);
17808 	    }
17809 	}
17810       else if (op0 != CONST0_RTX (mode))
17811 	op0 = force_reg (mode, op0);
17812 
17813       mask = ix86_build_signbit_mask (vmode, 0, 0);
17814 
17815       if (mode == SFmode)
17816 	copysign_insn = gen_copysignsf3_const;
17817       else if (mode == DFmode)
17818 	copysign_insn = gen_copysigndf3_const;
17819       else
17820 	copysign_insn = gen_copysigntf3_const;
17821 
17822 	emit_insn (copysign_insn (dest, op0, op1, mask));
17823     }
17824   else
17825     {
17826       rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17827 
17828       nmask = ix86_build_signbit_mask (vmode, 0, 1);
17829       mask = ix86_build_signbit_mask (vmode, 0, 0);
17830 
17831       if (mode == SFmode)
17832 	copysign_insn = gen_copysignsf3_var;
17833       else if (mode == DFmode)
17834 	copysign_insn = gen_copysigndf3_var;
17835       else
17836 	copysign_insn = gen_copysigntf3_var;
17837 
17838       emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17839     }
17840 }
17841 
17842 /* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
17843    be a constant, and so has already been expanded into a vector constant.  */
17844 
17845 void
17846 ix86_split_copysign_const (rtx operands[])
17847 {
17848   enum machine_mode mode, vmode;
17849   rtx dest, op0, mask, x;
17850 
17851   dest = operands[0];
17852   op0 = operands[1];
17853   mask = operands[3];
17854 
17855   mode = GET_MODE (dest);
17856   vmode = GET_MODE (mask);
17857 
17858   dest = simplify_gen_subreg (vmode, dest, mode, 0);
17859   x = gen_rtx_AND (vmode, dest, mask);
17860   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17861 
17862   if (op0 != CONST0_RTX (vmode))
17863     {
17864       x = gen_rtx_IOR (vmode, dest, op0);
17865       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17866     }
17867 }
17868 
17869 /* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
17870    so we have to do two masks.  */
17871 
17872 void
17873 ix86_split_copysign_var (rtx operands[])
17874 {
17875   enum machine_mode mode, vmode;
17876   rtx dest, scratch, op0, op1, mask, nmask, x;
17877 
17878   dest = operands[0];
17879   scratch = operands[1];
17880   op0 = operands[2];
17881   op1 = operands[3];
17882   nmask = operands[4];
17883   mask = operands[5];
17884 
17885   mode = GET_MODE (dest);
17886   vmode = GET_MODE (mask);
17887 
17888   if (rtx_equal_p (op0, op1))
17889     {
17890       /* Shouldn't happen often (it's useless, obviously), but when it does
17891 	 we'd generate incorrect code if we continue below.  */
17892       emit_move_insn (dest, op0);
17893       return;
17894     }
17895 
17896   if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
17897     {
17898       gcc_assert (REGNO (op1) == REGNO (scratch));
17899 
17900       x = gen_rtx_AND (vmode, scratch, mask);
17901       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17902 
17903       dest = mask;
17904       op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17905       x = gen_rtx_NOT (vmode, dest);
17906       x = gen_rtx_AND (vmode, x, op0);
17907       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17908     }
17909   else
17910     {
17911       if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
17912 	{
17913 	  x = gen_rtx_AND (vmode, scratch, mask);
17914 	}
17915       else						/* alternative 2,4 */
17916 	{
17917           gcc_assert (REGNO (mask) == REGNO (scratch));
17918           op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17919 	  x = gen_rtx_AND (vmode, scratch, op1);
17920 	}
17921       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17922 
17923       if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
17924 	{
17925 	  dest = simplify_gen_subreg (vmode, op0, mode, 0);
17926 	  x = gen_rtx_AND (vmode, dest, nmask);
17927 	}
17928       else						/* alternative 3,4 */
17929 	{
17930           gcc_assert (REGNO (nmask) == REGNO (dest));
17931 	  dest = nmask;
17932 	  op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17933 	  x = gen_rtx_AND (vmode, dest, op0);
17934 	}
17935       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17936     }
17937 
17938   x = gen_rtx_IOR (vmode, dest, scratch);
17939   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17940 }
17941 
17942 /* Return TRUE or FALSE depending on whether the first SET in INSN
17943    has source and destination with matching CC modes, and that the
17944    CC mode is at least as constrained as REQ_MODE.  */
17945 
17946 bool
17947 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17948 {
17949   rtx set;
17950   enum machine_mode set_mode;
17951 
17952   set = PATTERN (insn);
17953   if (GET_CODE (set) == PARALLEL)
17954     set = XVECEXP (set, 0, 0);
17955   gcc_assert (GET_CODE (set) == SET);
17956   gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17957 
17958   set_mode = GET_MODE (SET_DEST (set));
17959   switch (set_mode)
17960     {
17961     case CCNOmode:
17962       if (req_mode != CCNOmode
17963 	  && (req_mode != CCmode
17964 	      || XEXP (SET_SRC (set), 1) != const0_rtx))
17965 	return false;
17966       break;
17967     case CCmode:
17968       if (req_mode == CCGCmode)
17969 	return false;
17970       /* FALLTHRU */
17971     case CCGCmode:
17972       if (req_mode == CCGOCmode || req_mode == CCNOmode)
17973 	return false;
17974       /* FALLTHRU */
17975     case CCGOCmode:
17976       if (req_mode == CCZmode)
17977 	return false;
17978       /* FALLTHRU */
17979     case CCZmode:
17980       break;
17981 
17982     case CCAmode:
17983     case CCCmode:
17984     case CCOmode:
17985     case CCSmode:
17986       if (set_mode != req_mode)
17987 	return false;
17988       break;
17989 
17990     default:
17991       gcc_unreachable ();
17992     }
17993 
17994   return GET_MODE (SET_SRC (set)) == set_mode;
17995 }
17996 
17997 /* Generate insn patterns to do an integer compare of OPERANDS.  */
17998 
17999 static rtx
18000 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18001 {
18002   enum machine_mode cmpmode;
18003   rtx tmp, flags;
18004 
18005   cmpmode = SELECT_CC_MODE (code, op0, op1);
18006   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18007 
18008   /* This is very simple, but making the interface the same as in the
18009      FP case makes the rest of the code easier.  */
18010   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18011   emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18012 
18013   /* Return the test that should be put into the flags user, i.e.
18014      the bcc, scc, or cmov instruction.  */
18015   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18016 }
18017 
18018 /* Figure out whether to use ordered or unordered fp comparisons.
18019    Return the appropriate mode to use.  */
18020 
18021 enum machine_mode
18022 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18023 {
18024   /* ??? In order to make all comparisons reversible, we do all comparisons
18025      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
18026      all forms of trapping and nontrapping comparisons, we can make inequality
18027      comparisons trapping again, since it results in better code when using
18028      FCOM based compares.  */
18029   return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18030 }
18031 
18032 enum machine_mode
18033 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18034 {
18035   enum machine_mode mode = GET_MODE (op0);
18036 
18037   if (SCALAR_FLOAT_MODE_P (mode))
18038     {
18039       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18040       return ix86_fp_compare_mode (code);
18041     }
18042 
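  /* For the unsigned codes below, a comparison whose first operand is a PLUS
     (or MINUS) of the other operand is typically an overflow check such as
     the unsigned idiom (a + b) < a; such a test needs only the carry flag,
     which is why CCCmode is chosen for it.  */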
18043   switch (code)
18044     {
18045       /* Only zero flag is needed.  */
18046     case EQ:			/* ZF=0 */
18047     case NE:			/* ZF!=0 */
18048       return CCZmode;
18049       /* Codes needing carry flag.  */
18050     case GEU:			/* CF=0 */
18051     case LTU:			/* CF=1 */
18052       /* Detect overflow checks.  They need just the carry flag.  */
18053       if (GET_CODE (op0) == PLUS
18054 	  && rtx_equal_p (op1, XEXP (op0, 0)))
18055 	return CCCmode;
18056       else
18057 	return CCmode;
18058     case GTU:			/* CF=0 & ZF=0 */
18059     case LEU:			/* CF=1 | ZF=1 */
18060       /* Detect overflow checks.  They need just the carry flag.  */
18061       if (GET_CODE (op0) == MINUS
18062 	  && rtx_equal_p (op1, XEXP (op0, 0)))
18063 	return CCCmode;
18064       else
18065 	return CCmode;
18066       /* Codes possibly doable only with sign flag when
18067          comparing against zero.  */
18068     case GE:			/* SF=OF   or   SF=0 */
18069     case LT:			/* SF<>OF  or   SF=1 */
18070       if (op1 == const0_rtx)
18071 	return CCGOCmode;
18072       else
18073 	/* For other cases Carry flag is not required.  */
18074 	return CCGCmode;
18075       /* Codes doable only with the sign flag when comparing
18076          against zero, but we lack a jump instruction for that,
18077          so we have to use relational tests against overflow,
18078          which thus needs to be zero.  */
18079     case GT:			/* ZF=0 & SF=OF */
18080     case LE:			/* ZF=1 | SF<>OF */
18081       if (op1 == const0_rtx)
18082 	return CCNOmode;
18083       else
18084 	return CCGCmode;
18085       /* The strcmp pattern does (use flags), and combine may ask us
18086 	 for the proper mode.  */
18087     case USE:
18088       return CCmode;
18089     default:
18090       gcc_unreachable ();
18091     }
18092 }
18093 
18094 /* Return the fixed registers used for condition codes.  */
18095 
18096 static bool
18097 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18098 {
18099   *p1 = FLAGS_REG;
18100   *p2 = FPSR_REG;
18101   return true;
18102 }
18103 
18104 /* If two condition code modes are compatible, return a condition code
18105    mode which is compatible with both.  Otherwise, return
18106    VOIDmode.  */
18107 
18108 static enum machine_mode
18109 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18110 {
18111   if (m1 == m2)
18112     return m1;
18113 
18114   if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18115     return VOIDmode;
18116 
18117   if ((m1 == CCGCmode && m2 == CCGOCmode)
18118       || (m1 == CCGOCmode && m2 == CCGCmode))
18119     return CCGCmode;
18120 
18121   switch (m1)
18122     {
18123     default:
18124       gcc_unreachable ();
18125 
18126     case CCmode:
18127     case CCGCmode:
18128     case CCGOCmode:
18129     case CCNOmode:
18130     case CCAmode:
18131     case CCCmode:
18132     case CCOmode:
18133     case CCSmode:
18134     case CCZmode:
18135       switch (m2)
18136 	{
18137 	default:
18138 	  return VOIDmode;
18139 
18140 	case CCmode:
18141 	case CCGCmode:
18142 	case CCGOCmode:
18143 	case CCNOmode:
18144 	case CCAmode:
18145 	case CCCmode:
18146 	case CCOmode:
18147 	case CCSmode:
18148 	case CCZmode:
18149 	  return CCmode;
18150 	}
18151 
18152     case CCFPmode:
18153     case CCFPUmode:
18154       /* These are only compatible with themselves, which we already
18155 	 checked above.  */
18156       return VOIDmode;
18157     }
18158 }
18159 
18160 
18161 /* Return a comparison we can do that is equivalent to
18162    swap_condition (code), apart possibly from orderedness.
18163    But never change orderedness if TARGET_IEEE_FP, returning
18164    UNKNOWN in that case if necessary.  */
18165 
18166 static enum rtx_code
18167 ix86_fp_swap_condition (enum rtx_code code)
18168 {
18169   switch (code)
18170     {
18171     case GT:                   /* GTU - CF=0 & ZF=0 */
18172       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18173     case GE:                   /* GEU - CF=0 */
18174       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18175     case UNLT:                 /* LTU - CF=1 */
18176       return TARGET_IEEE_FP ? UNKNOWN : GT;
18177     case UNLE:                 /* LEU - CF=1 | ZF=1 */
18178       return TARGET_IEEE_FP ? UNKNOWN : GE;
18179     default:
18180       return swap_condition (code);
18181     }
18182 }
18183 
18184 /* Return the cost of comparison CODE using the best strategy for performance.
18185    All of the following functions use the number of instructions as a cost metric.
18186    In the future this should be tweaked to compute bytes for optimize_size and
18187    take into account the performance of various instructions on various CPUs.  */
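/* Very roughly, the counts below correspond to fcomi+jcc for IX86_FPCMP_COMI,
   fnstsw+sahf+jcc for IX86_FPCMP_SAHF, and fnstsw plus the %ah bit-twiddling
   sequences emitted by ix86_expand_fp_compare for IX86_FPCMP_ARITH.  */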
18188 
18189 static int
18190 ix86_fp_comparison_cost (enum rtx_code code)
18191 {
18192   int arith_cost;
18193 
18194   /* The cost of code using bit-twiddling on %ah.  */
18195   switch (code)
18196     {
18197     case UNLE:
18198     case UNLT:
18199     case LTGT:
18200     case GT:
18201     case GE:
18202     case UNORDERED:
18203     case ORDERED:
18204     case UNEQ:
18205       arith_cost = 4;
18206       break;
18207     case LT:
18208     case NE:
18209     case EQ:
18210     case UNGE:
18211       arith_cost = TARGET_IEEE_FP ? 5 : 4;
18212       break;
18213     case LE:
18214     case UNGT:
18215       arith_cost = TARGET_IEEE_FP ? 6 : 4;
18216       break;
18217     default:
18218       gcc_unreachable ();
18219     }
18220 
18221   switch (ix86_fp_comparison_strategy (code))
18222     {
18223     case IX86_FPCMP_COMI:
18224       return arith_cost > 4 ? 3 : 2;
18225     case IX86_FPCMP_SAHF:
18226       return arith_cost > 4 ? 4 : 3;
18227     default:
18228       return arith_cost;
18229     }
18230 }
18231 
18232 /* Return strategy to use for floating-point.  We assume that fcomi is always
18233    preferable where available, since that is also true when looking at size
18234    (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
18235 
18236 enum ix86_fpcmp_strategy
18237 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18238 {
18239   /* Do fcomi/sahf based test when profitable.  */
18240 
18241   if (TARGET_CMOVE)
18242     return IX86_FPCMP_COMI;
18243 
18244   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18245     return IX86_FPCMP_SAHF;
18246 
18247   return IX86_FPCMP_ARITH;
18248 }
18249 
18250 /* Swap, force into registers, or otherwise massage the two operands
18251    to a fp comparison.  The operands are updated in place; the new
18252    comparison code is returned.  */
18253 
18254 static enum rtx_code
18255 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18256 {
18257   enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18258   rtx op0 = *pop0, op1 = *pop1;
18259   enum machine_mode op_mode = GET_MODE (op0);
18260   int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18261 
18262   /* All of the unordered compare instructions only work on registers.
18263      The same is true of the fcomi compare instructions.  The XFmode
18264      compare instructions require registers except when comparing
18265      against zero or when converting operand 1 from fixed point to
18266      floating point.  */
18267 
18268   if (!is_sse
18269       && (fpcmp_mode == CCFPUmode
18270 	  || (op_mode == XFmode
18271 	      && ! (standard_80387_constant_p (op0) == 1
18272 		    || standard_80387_constant_p (op1) == 1)
18273 	      && GET_CODE (op1) != FLOAT)
18274 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18275     {
18276       op0 = force_reg (op_mode, op0);
18277       op1 = force_reg (op_mode, op1);
18278     }
18279   else
18280     {
18281       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
18282 	 things around if they appear profitable, otherwise force op0
18283 	 into a register.  */
18284 
18285       if (standard_80387_constant_p (op0) == 0
18286 	  || (MEM_P (op0)
18287 	      && ! (standard_80387_constant_p (op1) == 0
18288 		    || MEM_P (op1))))
18289 	{
18290 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
18291 	  if (new_code != UNKNOWN)
18292 	    {
18293 	      rtx tmp;
18294 	      tmp = op0, op0 = op1, op1 = tmp;
18295 	      code = new_code;
18296 	    }
18297 	}
18298 
18299       if (!REG_P (op0))
18300 	op0 = force_reg (op_mode, op0);
18301 
18302       if (CONSTANT_P (op1))
18303 	{
18304 	  int tmp = standard_80387_constant_p (op1);
18305 	  if (tmp == 0)
18306 	    op1 = validize_mem (force_const_mem (op_mode, op1));
18307 	  else if (tmp == 1)
18308 	    {
18309 	      if (TARGET_CMOVE)
18310 		op1 = force_reg (op_mode, op1);
18311 	    }
18312 	  else
18313 	    op1 = force_reg (op_mode, op1);
18314 	}
18315     }
18316 
18317   /* Try to rearrange the comparison to make it cheaper.  */
18318   if (ix86_fp_comparison_cost (code)
18319       > ix86_fp_comparison_cost (swap_condition (code))
18320       && (REG_P (op1) || can_create_pseudo_p ()))
18321     {
18322       rtx tmp;
18323       tmp = op0, op0 = op1, op1 = tmp;
18324       code = swap_condition (code);
18325       if (!REG_P (op0))
18326 	op0 = force_reg (op_mode, op0);
18327     }
18328 
18329   *pop0 = op0;
18330   *pop1 = op1;
18331   return code;
18332 }
18333 
18334 /* Convert comparison codes we use to represent FP comparison to integer
18335    code that will result in proper branch.  Return UNKNOWN if no such code
18336    is available.  */
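/* A short sketch of why this mapping works: fcomi (and sahf after fnstsw)
   sets ZF/PF/CF the way an unsigned integer compare would, with CF set for
   "less than", ZF set for "equal", and all three set for an unordered
   result.  FP GT therefore behaves like GTU, GE like GEU, UNLT like LTU,
   and so on.  */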
18337 
18338 enum rtx_code
18339 ix86_fp_compare_code_to_integer (enum rtx_code code)
18340 {
18341   switch (code)
18342     {
18343     case GT:
18344       return GTU;
18345     case GE:
18346       return GEU;
18347     case ORDERED:
18348     case UNORDERED:
18349       return code;
18350       break;
18351     case UNEQ:
18352       return EQ;
18353       break;
18354     case UNLT:
18355       return LTU;
18356       break;
18357     case UNLE:
18358       return LEU;
18359       break;
18360     case LTGT:
18361       return NE;
18362       break;
18363     default:
18364       return UNKNOWN;
18365     }
18366 }
18367 
18368 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
18369 
18370 static rtx
18371 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18372 {
18373   enum machine_mode fpcmp_mode, intcmp_mode;
18374   rtx tmp, tmp2;
18375 
18376   fpcmp_mode = ix86_fp_compare_mode (code);
18377   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18378 
18379   /* Do fcomi/sahf based test when profitable.  */
18380   switch (ix86_fp_comparison_strategy (code))
18381     {
18382     case IX86_FPCMP_COMI:
18383       intcmp_mode = fpcmp_mode;
18384       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18385       tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18386 			 tmp);
18387       emit_insn (tmp);
18388       break;
18389 
18390     case IX86_FPCMP_SAHF:
18391       intcmp_mode = fpcmp_mode;
18392       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18393       tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18394 			 tmp);
18395 
18396       if (!scratch)
18397 	scratch = gen_reg_rtx (HImode);
18398       tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18399       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18400       break;
18401 
18402     case IX86_FPCMP_ARITH:
18403       /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first.  */
18404       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18405       tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18406       if (!scratch)
18407 	scratch = gen_reg_rtx (HImode);
18408       emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18409 
18410       /* In the unordered case, we have to check C2 for NaN's, which
18411 	 doesn't happen to work out to anything nice combination-wise.
18412 	 So do some bit twiddling on the value we've got in AH to come
18413 	 up with an appropriate set of condition codes.  */
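      /* As a rough guide to the magic constants below: the fnstsw result in
	 SCRATCH has C0, C2 and C3 in its high byte (the traditional %ah) as
	 0x01, 0x04 and 0x40 respectively, so e.g. 0x45 tests C3|C2|C0 (all
	 clear only for a ">" result) while 0x40 tests C3 alone (set for
	 "equal" and also when unordered).  */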
18414 
18415       intcmp_mode = CCNOmode;
18416       switch (code)
18417 	{
18418 	case GT:
18419 	case UNGT:
18420 	  if (code == GT || !TARGET_IEEE_FP)
18421 	    {
18422 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18423 	      code = EQ;
18424 	    }
18425 	  else
18426 	    {
18427 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18428 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18429 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18430 	      intcmp_mode = CCmode;
18431 	      code = GEU;
18432 	    }
18433 	  break;
18434 	case LT:
18435 	case UNLT:
18436 	  if (code == LT && TARGET_IEEE_FP)
18437 	    {
18438 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18439 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18440 	      intcmp_mode = CCmode;
18441 	      code = EQ;
18442 	    }
18443 	  else
18444 	    {
18445 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18446 	      code = NE;
18447 	    }
18448 	  break;
18449 	case GE:
18450 	case UNGE:
18451 	  if (code == GE || !TARGET_IEEE_FP)
18452 	    {
18453 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18454 	      code = EQ;
18455 	    }
18456 	  else
18457 	    {
18458 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18459 	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18460 	      code = NE;
18461 	    }
18462 	  break;
18463 	case LE:
18464 	case UNLE:
18465 	  if (code == LE && TARGET_IEEE_FP)
18466 	    {
18467 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18468 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18469 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18470 	      intcmp_mode = CCmode;
18471 	      code = LTU;
18472 	    }
18473 	  else
18474 	    {
18475 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18476 	      code = NE;
18477 	    }
18478 	  break;
18479 	case EQ:
18480 	case UNEQ:
18481 	  if (code == EQ && TARGET_IEEE_FP)
18482 	    {
18483 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18484 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18485 	      intcmp_mode = CCmode;
18486 	      code = EQ;
18487 	    }
18488 	  else
18489 	    {
18490 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18491 	      code = NE;
18492 	    }
18493 	  break;
18494 	case NE:
18495 	case LTGT:
18496 	  if (code == NE && TARGET_IEEE_FP)
18497 	    {
18498 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18499 	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18500 					     GEN_INT (0x40)));
18501 	      code = NE;
18502 	    }
18503 	  else
18504 	    {
18505 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18506 	      code = EQ;
18507 	    }
18508 	  break;
18509 
18510 	case UNORDERED:
18511 	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18512 	  code = NE;
18513 	  break;
18514 	case ORDERED:
18515 	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18516 	  code = EQ;
18517 	  break;
18518 
18519 	default:
18520 	  gcc_unreachable ();
18521 	}
18522 	break;
18523 
18524     default:
18525       gcc_unreachable();
18526     }
18527 
18528   /* Return the test that should be put into the flags user, i.e.
18529      the bcc, scc, or cmov instruction.  */
18530   return gen_rtx_fmt_ee (code, VOIDmode,
18531 			 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18532 			 const0_rtx);
18533 }
18534 
18535 static rtx
18536 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18537 {
18538   rtx ret;
18539 
18540   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18541     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18542 
18543   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18544     {
18545       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18546       ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18547     }
18548   else
18549     ret = ix86_expand_int_compare (code, op0, op1);
18550 
18551   return ret;
18552 }
18553 
18554 void
18555 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18556 {
18557   enum machine_mode mode = GET_MODE (op0);
18558   rtx tmp;
18559 
18560   switch (mode)
18561     {
18562     case SFmode:
18563     case DFmode:
18564     case XFmode:
18565     case QImode:
18566     case HImode:
18567     case SImode:
18568       simple:
18569       tmp = ix86_expand_compare (code, op0, op1);
18570       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18571 				  gen_rtx_LABEL_REF (VOIDmode, label),
18572 				  pc_rtx);
18573       emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18574       return;
18575 
18576     case DImode:
18577       if (TARGET_64BIT)
18578 	goto simple;
18579     case TImode:
18580       /* Expand DImode branch into multiple compare+branch.  */
18581       {
18582 	rtx lo[2], hi[2], label2;
18583 	enum rtx_code code1, code2, code3;
18584 	enum machine_mode submode;
18585 
18586 	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18587 	  {
18588 	    tmp = op0, op0 = op1, op1 = tmp;
18589 	    code = swap_condition (code);
18590 	  }
18591 
18592 	split_double_mode (mode, &op0, 1, lo+0, hi+0);
18593 	split_double_mode (mode, &op1, 1, lo+1, hi+1);
18594 
18595 	submode = mode == DImode ? SImode : DImode;
18596 
18597 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18598 	   avoid two branches.  This costs one extra insn, so disable when
18599 	   optimizing for size.  */
18600 
18601 	if ((code == EQ || code == NE)
18602 	    && (!optimize_insn_for_size_p ()
18603 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
18604 	  {
18605 	    rtx xor0, xor1;
18606 
18607 	    xor1 = hi[0];
18608 	    if (hi[1] != const0_rtx)
18609 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18610 				   NULL_RTX, 0, OPTAB_WIDEN);
18611 
18612 	    xor0 = lo[0];
18613 	    if (lo[1] != const0_rtx)
18614 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18615 				   NULL_RTX, 0, OPTAB_WIDEN);
18616 
18617 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
18618 				NULL_RTX, 0, OPTAB_WIDEN);
18619 
18620 	    ix86_expand_branch (code, tmp, const0_rtx, label);
18621 	    return;
18622 	  }
18623 
18624 	/* Otherwise, if we are doing less-than or greater-or-equal-than,
18625 	   op1 is a constant and the low word is zero, then we can just
18626 	   examine the high word.  Similarly for low word -1 and
18627 	   less-or-equal-than or greater-than.  */
18628 
18629 	if (CONST_INT_P (hi[1]))
18630 	  switch (code)
18631 	    {
18632 	    case LT: case LTU: case GE: case GEU:
18633 	      if (lo[1] == const0_rtx)
18634 		{
18635 		  ix86_expand_branch (code, hi[0], hi[1], label);
18636 		  return;
18637 		}
18638 	      break;
18639 	    case LE: case LEU: case GT: case GTU:
18640 	      if (lo[1] == constm1_rtx)
18641 		{
18642 		  ix86_expand_branch (code, hi[0], hi[1], label);
18643 		  return;
18644 		}
18645 	      break;
18646 	    default:
18647 	      break;
18648 	    }
18649 
18650 	/* Otherwise, we need two or three jumps.  */
18651 
18652 	label2 = gen_label_rtx ();
18653 
18654 	code1 = code;
18655 	code2 = swap_condition (code);
18656 	code3 = unsigned_condition (code);
18657 
18658 	switch (code)
18659 	  {
18660 	  case LT: case GT: case LTU: case GTU:
18661 	    break;
18662 
18663 	  case LE:   code1 = LT;  code2 = GT;  break;
18664 	  case GE:   code1 = GT;  code2 = LT;  break;
18665 	  case LEU:  code1 = LTU; code2 = GTU; break;
18666 	  case GEU:  code1 = GTU; code2 = LTU; break;
18667 
18668 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
18669 	  case NE:   code2 = UNKNOWN; break;
18670 
18671 	  default:
18672 	    gcc_unreachable ();
18673 	  }
18674 
18675 	/*
18676 	 * a < b =>
18677 	 *    if (hi(a) < hi(b)) goto true;
18678 	 *    if (hi(a) > hi(b)) goto false;
18679 	 *    if (lo(a) < lo(b)) goto true;
18680 	 *  false:
18681 	 */
18682 
18683 	if (code1 != UNKNOWN)
18684 	  ix86_expand_branch (code1, hi[0], hi[1], label);
18685 	if (code2 != UNKNOWN)
18686 	  ix86_expand_branch (code2, hi[0], hi[1], label2);
18687 
18688 	ix86_expand_branch (code3, lo[0], lo[1], label);
18689 
18690 	if (code2 != UNKNOWN)
18691 	  emit_label (label2);
18692 	return;
18693       }
18694 
18695     default:
18696       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18697       goto simple;
18698     }
18699 }
18700 
18701 /* Split branch based on floating point condition.  */
18702 void
18703 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18704 		      rtx target1, rtx target2, rtx tmp, rtx pushed)
18705 {
18706   rtx condition;
18707   rtx i;
18708 
18709   if (target2 != pc_rtx)
18710     {
18711       rtx tmp = target2;
18712       code = reverse_condition_maybe_unordered (code);
18713       target2 = target1;
18714       target1 = tmp;
18715     }
18716 
18717   condition = ix86_expand_fp_compare (code, op1, op2,
18718 				      tmp);
18719 
18720   /* Remove pushed operand from stack.  */
18721   if (pushed)
18722     ix86_free_from_memory (GET_MODE (pushed));
18723 
18724   i = emit_jump_insn (gen_rtx_SET
18725 		      (VOIDmode, pc_rtx,
18726 		       gen_rtx_IF_THEN_ELSE (VOIDmode,
18727 					     condition, target1, target2)));
18728   if (split_branch_probability >= 0)
18729     add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18730 }
18731 
18732 void
18733 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18734 {
18735   rtx ret;
18736 
18737   gcc_assert (GET_MODE (dest) == QImode);
18738 
18739   ret = ix86_expand_compare (code, op0, op1);
18740   PUT_MODE (ret, QImode);
18741   emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18742 }
18743 
18744 /* Expand comparison setting or clearing carry flag.  Return true when
18745    successful and set pop for the operation.  */
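/* The point of insisting on an LTU/GEU shape here is that the result then
   lives in the carry flag, so callers such as ix86_expand_int_movcc can
   consume it with sbb-style sequences instead of a branch or setcc.  */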
18746 static bool
18747 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18748 {
18749   enum machine_mode mode =
18750     GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18751 
18752   /* Do not handle double-mode compares that go through special path.  */
18753   if (mode == (TARGET_64BIT ? TImode : DImode))
18754     return false;
18755 
18756   if (SCALAR_FLOAT_MODE_P (mode))
18757     {
18758       rtx compare_op, compare_seq;
18759 
18760       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18761 
18762       /* Shortcut:  following common codes never translate
18763 	 into carry flag compares.  */
18764       if (code == EQ || code == NE || code == UNEQ || code == LTGT
18765 	  || code == ORDERED || code == UNORDERED)
18766 	return false;
18767 
18768       /* These comparisons require the zero flag; swap operands so they don't.  */
18769       if ((code == GT || code == UNLE || code == LE || code == UNGT)
18770 	  && !TARGET_IEEE_FP)
18771 	{
18772 	  rtx tmp = op0;
18773 	  op0 = op1;
18774 	  op1 = tmp;
18775 	  code = swap_condition (code);
18776 	}
18777 
18778       /* Try to expand the comparison and verify that we end up with
18779 	 a carry flag based comparison.  This fails to be true only when
18780 	 we decide to expand the comparison using arithmetic, which is
18781 	 not a common scenario.  */
18782       start_sequence ();
18783       compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18784       compare_seq = get_insns ();
18785       end_sequence ();
18786 
18787       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18788 	  || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18789         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18790       else
18791 	code = GET_CODE (compare_op);
18792 
18793       if (code != LTU && code != GEU)
18794 	return false;
18795 
18796       emit_insn (compare_seq);
18797       *pop = compare_op;
18798       return true;
18799     }
18800 
18801   if (!INTEGRAL_MODE_P (mode))
18802     return false;
18803 
18804   switch (code)
18805     {
18806     case LTU:
18807     case GEU:
18808       break;
18809 
18810     /* Convert a==0 into (unsigned)a<1.  */
18811     case EQ:
18812     case NE:
18813       if (op1 != const0_rtx)
18814 	return false;
18815       op1 = const1_rtx;
18816       code = (code == EQ ? LTU : GEU);
18817       break;
18818 
18819     /* Convert a>b into b<a or a>=b+1.  */
18820     case GTU:
18821     case LEU:
18822       if (CONST_INT_P (op1))
18823 	{
18824 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18825 	  /* Bail out on overflow.  We still can swap operands but that
18826 	     would force loading of the constant into register.  */
18827 	  if (op1 == const0_rtx
18828 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18829 	    return false;
18830 	  code = (code == GTU ? GEU : LTU);
18831 	}
18832       else
18833 	{
18834 	  rtx tmp = op1;
18835 	  op1 = op0;
18836 	  op0 = tmp;
18837 	  code = (code == GTU ? LTU : GEU);
18838 	}
18839       break;
18840 
18841     /* Convert a>=0 into (unsigned)a<0x80000000.  */
18842     case LT:
18843     case GE:
18844       if (mode == DImode || op1 != const0_rtx)
18845 	return false;
18846       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18847       code = (code == LT ? GEU : LTU);
18848       break;
18849     case LE:
18850     case GT:
18851       if (mode == DImode || op1 != constm1_rtx)
18852 	return false;
18853       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18854       code = (code == LE ? GEU : LTU);
18855       break;
18856 
18857     default:
18858       return false;
18859     }
18860   /* Swapping operands may cause constant to appear as first operand.  */
18861   if (!nonimmediate_operand (op0, VOIDmode))
18862     {
18863       if (!can_create_pseudo_p ())
18864 	return false;
18865       op0 = force_reg (mode, op0);
18866     }
18867   *pop = ix86_expand_compare (code, op0, op1);
18868   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18869   return true;
18870 }
18871 
18872 bool
18873 ix86_expand_int_movcc (rtx operands[])
18874 {
18875   enum rtx_code code = GET_CODE (operands[1]), compare_code;
18876   rtx compare_seq, compare_op;
18877   enum machine_mode mode = GET_MODE (operands[0]);
18878   bool sign_bit_compare_p = false;
18879   rtx op0 = XEXP (operands[1], 0);
18880   rtx op1 = XEXP (operands[1], 1);
18881 
18882   start_sequence ();
18883   compare_op = ix86_expand_compare (code, op0, op1);
18884   compare_seq = get_insns ();
18885   end_sequence ();
18886 
18887   compare_code = GET_CODE (compare_op);
18888 
18889   if ((op1 == const0_rtx && (code == GE || code == LT))
18890       || (op1 == constm1_rtx && (code == GT || code == LE)))
18891     sign_bit_compare_p = true;
18892 
18893   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18894      HImode insns, we'd be swallowed in word prefix ops.  */
18895 
18896   if ((mode != HImode || TARGET_FAST_PREFIX)
18897       && (mode != (TARGET_64BIT ? TImode : DImode))
18898       && CONST_INT_P (operands[2])
18899       && CONST_INT_P (operands[3]))
18900     {
18901       rtx out = operands[0];
18902       HOST_WIDE_INT ct = INTVAL (operands[2]);
18903       HOST_WIDE_INT cf = INTVAL (operands[3]);
18904       HOST_WIDE_INT diff;
18905 
18906       diff = ct - cf;
18907       /* Sign bit compares are better done using shifts than by using
18908 	 sbb.  */
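      /* Recall the trick used below: once the comparison result is in the
	 carry flag, "sbb reg,reg" leaves -1 in the register when the carry
	 is set and 0 otherwise; the add/or/not/and steps shown in the
	 sequence comments further down then turn that all-ones/zero mask
	 into ct or cf.  */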
18909       if (sign_bit_compare_p
18910 	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18911 	{
18912 	  /* Detect overlap between destination and compare sources.  */
18913 	  rtx tmp = out;
18914 
18915           if (!sign_bit_compare_p)
18916 	    {
18917 	      rtx flags;
18918 	      bool fpcmp = false;
18919 
18920 	      compare_code = GET_CODE (compare_op);
18921 
18922 	      flags = XEXP (compare_op, 0);
18923 
18924 	      if (GET_MODE (flags) == CCFPmode
18925 		  || GET_MODE (flags) == CCFPUmode)
18926 		{
18927 		  fpcmp = true;
18928 		  compare_code
18929 		    = ix86_fp_compare_code_to_integer (compare_code);
18930 		}
18931 
18932 	      /* To simplify rest of code, restrict to the GEU case.  */
18933 	      if (compare_code == LTU)
18934 		{
18935 		  HOST_WIDE_INT tmp = ct;
18936 		  ct = cf;
18937 		  cf = tmp;
18938 		  compare_code = reverse_condition (compare_code);
18939 		  code = reverse_condition (code);
18940 		}
18941 	      else
18942 		{
18943 		  if (fpcmp)
18944 		    PUT_CODE (compare_op,
18945 			      reverse_condition_maybe_unordered
18946 			        (GET_CODE (compare_op)));
18947 		  else
18948 		    PUT_CODE (compare_op,
18949 			      reverse_condition (GET_CODE (compare_op)));
18950 		}
18951 	      diff = ct - cf;
18952 
18953 	      if (reg_overlap_mentioned_p (out, op0)
18954 		  || reg_overlap_mentioned_p (out, op1))
18955 		tmp = gen_reg_rtx (mode);
18956 
18957 	      if (mode == DImode)
18958 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18959 	      else
18960 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
18961 						 flags, compare_op));
18962 	    }
18963 	  else
18964 	    {
18965 	      if (code == GT || code == GE)
18966 		code = reverse_condition (code);
18967 	      else
18968 		{
18969 		  HOST_WIDE_INT tmp = ct;
18970 		  ct = cf;
18971 		  cf = tmp;
18972 		  diff = ct - cf;
18973 		}
18974 	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18975 	    }
18976 
18977 	  if (diff == 1)
18978 	    {
18979 	      /*
18980 	       * cmpl op0,op1
18981 	       * sbbl dest,dest
18982 	       * [addl dest, ct]
18983 	       *
18984 	       * Size 5 - 8.
18985 	       */
18986 	      if (ct)
18987 		tmp = expand_simple_binop (mode, PLUS,
18988 					   tmp, GEN_INT (ct),
18989 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
18990 	    }
18991 	  else if (cf == -1)
18992 	    {
18993 	      /*
18994 	       * cmpl op0,op1
18995 	       * sbbl dest,dest
18996 	       * orl $ct, dest
18997 	       *
18998 	       * Size 8.
18999 	       */
19000 	      tmp = expand_simple_binop (mode, IOR,
19001 					 tmp, GEN_INT (ct),
19002 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
19003 	    }
19004 	  else if (diff == -1 && ct)
19005 	    {
19006 	      /*
19007 	       * cmpl op0,op1
19008 	       * sbbl dest,dest
19009 	       * notl dest
19010 	       * [addl dest, cf]
19011 	       *
19012 	       * Size 8 - 11.
19013 	       */
19014 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19015 	      if (cf)
19016 		tmp = expand_simple_binop (mode, PLUS,
19017 					   copy_rtx (tmp), GEN_INT (cf),
19018 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
19019 	    }
19020 	  else
19021 	    {
19022 	      /*
19023 	       * cmpl op0,op1
19024 	       * sbbl dest,dest
19025 	       * [notl dest]
19026 	       * andl cf - ct, dest
19027 	       * [addl dest, ct]
19028 	       *
19029 	       * Size 8 - 11.
19030 	       */
19031 
19032 	      if (cf == 0)
19033 		{
19034 		  cf = ct;
19035 		  ct = 0;
19036 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19037 		}
19038 
19039 	      tmp = expand_simple_binop (mode, AND,
19040 					 copy_rtx (tmp),
19041 					 gen_int_mode (cf - ct, mode),
19042 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
19043 	      if (ct)
19044 		tmp = expand_simple_binop (mode, PLUS,
19045 					   copy_rtx (tmp), GEN_INT (ct),
19046 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
19047 	    }
19048 
19049 	  if (!rtx_equal_p (tmp, out))
19050 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19051 
19052 	  return true;
19053 	}
19054 
19055       if (diff < 0)
19056 	{
19057 	  enum machine_mode cmp_mode = GET_MODE (op0);
19058 
19059 	  HOST_WIDE_INT tmp;
19060 	  tmp = ct, ct = cf, cf = tmp;
19061 	  diff = -diff;
19062 
19063 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
19064 	    {
19065 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19066 
19067 	      /* We may be reversing an unordered compare to a normal compare,
19068 		 which is not valid in general (we may convert a non-trapping
19069 		 condition to a trapping one); however, on i386 we currently
19070 		 emit all comparisons unordered.  */
19071 	      compare_code = reverse_condition_maybe_unordered (compare_code);
19072 	      code = reverse_condition_maybe_unordered (code);
19073 	    }
19074 	  else
19075 	    {
19076 	      compare_code = reverse_condition (compare_code);
19077 	      code = reverse_condition (code);
19078 	    }
19079 	}
19080 
19081       compare_code = UNKNOWN;
19082       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19083 	  && CONST_INT_P (op1))
19084 	{
19085 	  if (op1 == const0_rtx
19086 	      && (code == LT || code == GE))
19087 	    compare_code = code;
19088 	  else if (op1 == constm1_rtx)
19089 	    {
19090 	      if (code == LE)
19091 		compare_code = LT;
19092 	      else if (code == GT)
19093 		compare_code = GE;
19094 	    }
19095 	}
19096 
19097       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
19098       if (compare_code != UNKNOWN
19099 	  && GET_MODE (op0) == GET_MODE (out)
19100 	  && (cf == -1 || ct == -1))
19101 	{
19102 	  /* If lea code below could be used, only optimize
19103 	     if it results in a 2 insn sequence.  */
19104 
19105 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19106 		 || diff == 3 || diff == 5 || diff == 9)
19107 	      || (compare_code == LT && ct == -1)
19108 	      || (compare_code == GE && cf == -1))
19109 	    {
19110 	      /*
19111 	       * notl op1	(if necessary)
19112 	       * sarl $31, op1
19113 	       * orl cf, op1
19114 	       */
19115 	      if (ct != -1)
19116 		{
19117 		  cf = ct;
19118 		  ct = -1;
19119 		  code = reverse_condition (code);
19120 		}
19121 
19122 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19123 
19124 	      out = expand_simple_binop (mode, IOR,
19125 					 out, GEN_INT (cf),
19126 					 out, 1, OPTAB_DIRECT);
19127 	      if (out != operands[0])
19128 		emit_move_insn (operands[0], out);
19129 
19130 	      return true;
19131 	    }
19132 	}
19133 
19134 
19135       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19136 	   || diff == 3 || diff == 5 || diff == 9)
19137 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19138 	  && (mode != DImode
19139 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19140 	{
19141 	  /*
19142 	   * xorl dest,dest
19143 	   * cmpl op1,op2
19144 	   * setcc dest
19145 	   * lea cf(dest*(ct-cf)),dest
19146 	   *
19147 	   * Size 14.
19148 	   *
19149 	   * This also catches the degenerate setcc-only case.
19150 	   */
19151 
19152 	  rtx tmp;
19153 	  int nops;
19154 
19155 	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19156 
19157 	  nops = 0;
19158 	  /* On x86_64 the lea instruction operates on Pmode, so we need
19159 	     to get the arithmetic done in the proper mode to match.  */
19160 	  if (diff == 1)
19161 	    tmp = copy_rtx (out);
19162 	  else
19163 	    {
19164 	      rtx out1;
19165 	      out1 = copy_rtx (out);
19166 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19167 	      nops++;
19168 	      if (diff & 1)
19169 		{
19170 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
19171 		  nops++;
19172 		}
19173 	    }
19174 	  if (cf != 0)
19175 	    {
19176 	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19177 	      nops++;
19178 	    }
19179 	  if (!rtx_equal_p (tmp, out))
19180 	    {
19181 	      if (nops == 1)
19182 		out = force_operand (tmp, copy_rtx (out));
19183 	      else
19184 		emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19185 	    }
19186 	  if (!rtx_equal_p (out, operands[0]))
19187 	    emit_move_insn (operands[0], copy_rtx (out));
19188 
19189 	  return true;
19190 	}
19191 
19192       /*
19193        * General case:			Jumpful:
19194        *   xorl dest,dest		cmpl op1, op2
19195        *   cmpl op1, op2		movl ct, dest
19196        *   setcc dest			jcc 1f
19197        *   decl dest			movl cf, dest
19198        *   andl (cf-ct),dest		1:
19199        *   addl ct,dest
19200        *
19201        * Size 20.			Size 14.
19202        *
19203        * This is reasonably steep, but branch mispredict costs are
19204        * high on modern cpus, so consider failing only if optimizing
19205        * for space.
19206        */
19207 
19208       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19209 	  && BRANCH_COST (optimize_insn_for_speed_p (),
19210 		  	  false) >= 2)
19211 	{
19212 	  if (cf == 0)
19213 	    {
19214 	      enum machine_mode cmp_mode = GET_MODE (op0);
19215 
19216 	      cf = ct;
19217 	      ct = 0;
19218 
19219 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
19220 		{
19221 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19222 
19223 		  /* We may be reversing an unordered compare to a normal compare,
19224 		     which is not valid in general (we may convert a non-trapping
19225 		     condition to a trapping one); however, on i386 we currently
19226 		     emit all comparisons unordered.  */
19227 		  code = reverse_condition_maybe_unordered (code);
19228 		}
19229 	      else
19230 		{
19231 		  code = reverse_condition (code);
19232 		  if (compare_code != UNKNOWN)
19233 		    compare_code = reverse_condition (compare_code);
19234 		}
19235 	    }
19236 
19237 	  if (compare_code != UNKNOWN)
19238 	    {
19239 	      /* notl op1	(if needed)
19240 		 sarl $31, op1
19241 		 andl (cf-ct), op1
19242 		 addl ct, op1
19243 
19244 		 For x < 0 (resp. x <= -1) there will be no notl,
19245 		 so if possible swap the constants to get rid of the
19246 		 complement.
19247 		 True/false will be -1/0 while code below (store flag
19248 		 followed by decrement) is 0/-1, so the constants need
19249 		 to be exchanged once more.  */
19250 
19251 	      if (compare_code == GE || !cf)
19252 		{
19253 		  code = reverse_condition (code);
19254 		  compare_code = LT;
19255 		}
19256 	      else
19257 		{
19258 		  HOST_WIDE_INT tmp = cf;
19259 		  cf = ct;
19260 		  ct = tmp;
19261 		}
19262 
19263 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19264 	    }
19265 	  else
19266 	    {
19267 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19268 
19269 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19270 					 constm1_rtx,
19271 					 copy_rtx (out), 1, OPTAB_DIRECT);
19272 	    }
19273 
19274 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
19275 				     gen_int_mode (cf - ct, mode),
19276 				     copy_rtx (out), 1, OPTAB_DIRECT);
19277 	  if (ct)
19278 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19279 				       copy_rtx (out), 1, OPTAB_DIRECT);
19280 	  if (!rtx_equal_p (out, operands[0]))
19281 	    emit_move_insn (operands[0], copy_rtx (out));
19282 
19283 	  return true;
19284 	}
19285     }
19286 
19287   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19288     {
19289       /* Try a few things more with specific constants and a variable.  */
19290 
19291       optab op;
19292       rtx var, orig_out, out, tmp;
19293 
19294       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19295 	return false;
19296 
19297       /* If one of the two operands is an interesting constant, load a
19298 	 constant with the above and mask it in with a logical operation.  */
19299 
19300       if (CONST_INT_P (operands[2]))
19301 	{
19302 	  var = operands[3];
19303 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19304 	    operands[3] = constm1_rtx, op = and_optab;
19305 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19306 	    operands[3] = const0_rtx, op = ior_optab;
19307 	  else
19308 	    return false;
19309 	}
19310       else if (CONST_INT_P (operands[3]))
19311 	{
19312 	  var = operands[2];
19313 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19314 	    operands[2] = constm1_rtx, op = and_optab;
19315 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19316 	    operands[2] = const0_rtx, op = ior_optab;
19317 	  else
19318 	    return false;
19319 	}
19320       else
19321         return false;
19322 
19323       orig_out = operands[0];
19324       tmp = gen_reg_rtx (mode);
19325       operands[0] = tmp;
19326 
19327       /* Recurse to get the constant loaded.  */
19328       if (ix86_expand_int_movcc (operands) == 0)
19329         return false;
19330 
19331       /* Mask in the interesting variable.  */
19332       out = expand_binop (mode, op, var, tmp, orig_out, 0,
19333 			  OPTAB_WIDEN);
19334       if (!rtx_equal_p (out, orig_out))
19335 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19336 
19337       return true;
19338     }
19339 
19340   /*
19341    * For comparison with above,
19342    *
19343    * movl cf,dest
19344    * movl ct,tmp
19345    * cmpl op1,op2
19346    * cmovcc tmp,dest
19347    *
19348    * Size 15.
19349    */
19350 
19351   if (! nonimmediate_operand (operands[2], mode))
19352     operands[2] = force_reg (mode, operands[2]);
19353   if (! nonimmediate_operand (operands[3], mode))
19354     operands[3] = force_reg (mode, operands[3]);
19355 
19356   if (! register_operand (operands[2], VOIDmode)
19357       && (mode == QImode
19358           || ! register_operand (operands[3], VOIDmode)))
19359     operands[2] = force_reg (mode, operands[2]);
19360 
19361   if (mode == QImode
19362       && ! register_operand (operands[3], VOIDmode))
19363     operands[3] = force_reg (mode, operands[3]);
19364 
19365   emit_insn (compare_seq);
19366   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19367 			  gen_rtx_IF_THEN_ELSE (mode,
19368 						compare_op, operands[2],
19369 						operands[3])));
19370   return true;
19371 }
19372 
19373 /* Swap, force into registers, or otherwise massage the two operands
19374    to an sse comparison with a mask result.  Thus we differ a bit from
19375    ix86_prepare_fp_compare_args which expects to produce a flags result.
19376 
19377    The DEST operand exists to help determine whether to commute commutative
19378    operators.  The POP0/POP1 operands are updated in place.  The new
19379    comparison code is returned, or UNKNOWN if not implementable.  */
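/* A sketch of the rationale for the swaps below: before AVX, cmpps/cmppd
   only encode the EQ/LT/LE/UNORD/NEQ/NLT/NLE/ORD predicates, so GE, GT,
   UNLE and UNLT have to be obtained by swapping the operands and using the
   mirrored predicate.  */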
19380 
19381 static enum rtx_code
19382 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19383 				  rtx *pop0, rtx *pop1)
19384 {
19385   rtx tmp;
19386 
19387   switch (code)
19388     {
19389     case LTGT:
19390     case UNEQ:
19391       /* AVX supports all the needed comparisons.  */
19392       if (TARGET_AVX)
19393 	break;
19394       /* We have no LTGT as an operator.  We could implement it with
19395 	 NE & ORDERED, but this requires an extra temporary.  It's
19396 	 not clear that it's worth it.  */
19397       return UNKNOWN;
19398 
19399     case LT:
19400     case LE:
19401     case UNGT:
19402     case UNGE:
19403       /* These are supported directly.  */
19404       break;
19405 
19406     case EQ:
19407     case NE:
19408     case UNORDERED:
19409     case ORDERED:
19410       /* AVX has 3 operand comparisons, no need to swap anything.  */
19411       if (TARGET_AVX)
19412 	break;
19413       /* For commutative operators, try to canonicalize the destination
19414 	 operand to be first in the comparison - this helps reload to
19415 	 avoid extra moves.  */
19416       if (!dest || !rtx_equal_p (dest, *pop1))
19417 	break;
19418       /* FALLTHRU */
19419 
19420     case GE:
19421     case GT:
19422     case UNLE:
19423     case UNLT:
19424       /* These are not supported directly before AVX, and furthermore
19425 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
19426 	 comparison operands to transform into something that is
19427 	 supported.  */
19428       tmp = *pop0;
19429       *pop0 = *pop1;
19430       *pop1 = tmp;
19431       code = swap_condition (code);
19432       break;
19433 
19434     default:
19435       gcc_unreachable ();
19436     }
19437 
19438   return code;
19439 }
19440 
19441 /* Detect conditional moves that exactly match min/max operational
19442    semantics.  Note that this is IEEE safe, as long as we don't
19443    interchange the operands.
19444 
19445    Returns FALSE if this conditional move doesn't match a MIN/MAX,
19446    and TRUE if the operation is successful and instructions are emitted.  */
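/* Roughly, the reason the operand order matters: the SSE min/max
   instructions behave like "x < y ? x : y" (resp. ">"), returning the
   second operand when the comparison is unordered, so they are not
   commutative in the presence of NaNs or of +0.0 vs -0.0.  */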
19447 
19448 static bool
19449 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19450 			   rtx cmp_op1, rtx if_true, rtx if_false)
19451 {
19452   enum machine_mode mode;
19453   bool is_min;
19454   rtx tmp;
19455 
19456   if (code == LT)
19457     ;
19458   else if (code == UNGE)
19459     {
19460       tmp = if_true;
19461       if_true = if_false;
19462       if_false = tmp;
19463     }
19464   else
19465     return false;
19466 
19467   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19468     is_min = true;
19469   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19470     is_min = false;
19471   else
19472     return false;
19473 
19474   mode = GET_MODE (dest);
19475 
19476   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19477      but MODE may be a vector mode and thus not appropriate.  */
19478   if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19479     {
19480       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19481       rtvec v;
19482 
19483       if_true = force_reg (mode, if_true);
19484       v = gen_rtvec (2, if_true, if_false);
19485       tmp = gen_rtx_UNSPEC (mode, v, u);
19486     }
19487   else
19488     {
19489       code = is_min ? SMIN : SMAX;
19490       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19491     }
19492 
19493   emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19494   return true;
19495 }
19496 
19497 /* Expand an sse vector comparison.  Return the register with the result.  */
19498 
19499 static rtx
19500 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19501 		     rtx op_true, rtx op_false)
19502 {
19503   enum machine_mode mode = GET_MODE (dest);
19504   enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19505   rtx x;
19506 
19507   cmp_op0 = force_reg (cmp_mode, cmp_op0);
19508   if (!nonimmediate_operand (cmp_op1, cmp_mode))
19509     cmp_op1 = force_reg (cmp_mode, cmp_op1);
19510 
19511   if (optimize
19512       || reg_overlap_mentioned_p (dest, op_true)
19513       || reg_overlap_mentioned_p (dest, op_false))
19514     dest = gen_reg_rtx (mode);
19515 
19516   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19517   if (cmp_mode != mode)
19518     {
19519       x = force_reg (cmp_mode, x);
19520       convert_move (dest, x, false);
19521     }
19522   else
19523     emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19524 
19525   return dest;
19526 }
19527 
19528 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19529    operations.  This is used for both scalar and vector conditional moves.  */
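/* In the general fallback below, CMP is assumed to be an all-ones or
   all-zeros mask per element, and the expansion is essentially

     dest = (cmp & op_true) | (~cmp & op_false)

   with the earlier special cases and the blendv paths being shortcuts for
   the same computation.  */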
19530 
19531 static void
19532 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19533 {
19534   enum machine_mode mode = GET_MODE (dest);
19535   rtx t2, t3, x;
19536 
19537   if (vector_all_ones_operand (op_true, mode)
19538       && rtx_equal_p (op_false, CONST0_RTX (mode)))
19539     {
19540       emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19541     }
19542   else if (op_false == CONST0_RTX (mode))
19543     {
19544       op_true = force_reg (mode, op_true);
19545       x = gen_rtx_AND (mode, cmp, op_true);
19546       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19547     }
19548   else if (op_true == CONST0_RTX (mode))
19549     {
19550       op_false = force_reg (mode, op_false);
19551       x = gen_rtx_NOT (mode, cmp);
19552       x = gen_rtx_AND (mode, x, op_false);
19553       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19554     }
19555   else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19556     {
19557       op_false = force_reg (mode, op_false);
19558       x = gen_rtx_IOR (mode, cmp, op_false);
19559       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19560     }
19561   else if (TARGET_XOP)
19562     {
19563       op_true = force_reg (mode, op_true);
19564 
19565       if (!nonimmediate_operand (op_false, mode))
19566 	op_false = force_reg (mode, op_false);
19567 
19568       emit_insn (gen_rtx_SET (mode, dest,
19569 			      gen_rtx_IF_THEN_ELSE (mode, cmp,
19570 						    op_true,
19571 						    op_false)));
19572     }
19573   else
19574     {
19575       rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19576 
19577       if (!nonimmediate_operand (op_true, mode))
19578 	op_true = force_reg (mode, op_true);
19579 
19580       op_false = force_reg (mode, op_false);
19581 
19582       switch (mode)
19583 	{
19584 	case V4SFmode:
19585 	  if (TARGET_SSE4_1)
19586 	    gen = gen_sse4_1_blendvps;
19587 	  break;
19588 	case V2DFmode:
19589 	  if (TARGET_SSE4_1)
19590 	    gen = gen_sse4_1_blendvpd;
19591 	  break;
19592 	case V16QImode:
19593 	case V8HImode:
19594 	case V4SImode:
19595 	case V2DImode:
19596 	  if (TARGET_SSE4_1)
19597 	    {
19598 	      gen = gen_sse4_1_pblendvb;
19599 	      dest = gen_lowpart (V16QImode, dest);
19600 	      op_false = gen_lowpart (V16QImode, op_false);
19601 	      op_true = gen_lowpart (V16QImode, op_true);
19602 	      cmp = gen_lowpart (V16QImode, cmp);
19603 	    }
19604 	  break;
19605 	case V8SFmode:
19606 	  if (TARGET_AVX)
19607 	    gen = gen_avx_blendvps256;
19608 	  break;
19609 	case V4DFmode:
19610 	  if (TARGET_AVX)
19611 	    gen = gen_avx_blendvpd256;
19612 	  break;
19613 	case V32QImode:
19614 	case V16HImode:
19615 	case V8SImode:
19616 	case V4DImode:
19617 	  if (TARGET_AVX2)
19618 	    {
19619 	      gen = gen_avx2_pblendvb;
19620 	      dest = gen_lowpart (V32QImode, dest);
19621 	      op_false = gen_lowpart (V32QImode, op_false);
19622 	      op_true = gen_lowpart (V32QImode, op_true);
19623 	      cmp = gen_lowpart (V32QImode, cmp);
19624 	    }
19625 	  break;
19626 	default:
19627 	  break;
19628 	}
19629 
19630       if (gen != NULL)
19631 	emit_insn (gen (dest, op_false, op_true, cmp));
19632       else
19633 	{
19634 	  op_true = force_reg (mode, op_true);
19635 
19636 	  t2 = gen_reg_rtx (mode);
19637 	  if (optimize)
19638 	    t3 = gen_reg_rtx (mode);
19639 	  else
19640 	    t3 = dest;
19641 
19642 	  x = gen_rtx_AND (mode, op_true, cmp);
19643 	  emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19644 
19645 	  x = gen_rtx_NOT (mode, cmp);
19646 	  x = gen_rtx_AND (mode, x, op_false);
19647 	  emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19648 
19649 	  x = gen_rtx_IOR (mode, t3, t2);
19650 	  emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19651 	}
19652     }
19653 }
19654 
19655 /* Expand a floating-point conditional move.  Return true if successful.  */
19656 
19657 bool
19658 ix86_expand_fp_movcc (rtx operands[])
19659 {
19660   enum machine_mode mode = GET_MODE (operands[0]);
19661   enum rtx_code code = GET_CODE (operands[1]);
19662   rtx tmp, compare_op;
19663   rtx op0 = XEXP (operands[1], 0);
19664   rtx op1 = XEXP (operands[1], 1);
19665 
19666   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19667     {
19668       enum machine_mode cmode;
19669 
19670       /* Since we've no cmove for sse registers, don't force bad register
19671 	 allocation just to gain access to it.  Deny movcc when the
19672 	 comparison mode doesn't match the move mode.  */
19673       cmode = GET_MODE (op0);
19674       if (cmode == VOIDmode)
19675 	cmode = GET_MODE (op1);
19676       if (cmode != mode)
19677 	return false;
19678 
19679       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19680       if (code == UNKNOWN)
19681 	return false;
19682 
19683       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19684 				     operands[2], operands[3]))
19685 	return true;
19686 
19687       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19688 				 operands[2], operands[3]);
19689       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19690       return true;
19691     }
19692 
19693   /* The floating point conditional move instructions don't directly
19694      support conditions resulting from a signed integer comparison.  */
19695 
19696   compare_op = ix86_expand_compare (code, op0, op1);
19697   if (!fcmov_comparison_operator (compare_op, VOIDmode))
19698     {
19699       tmp = gen_reg_rtx (QImode);
19700       ix86_expand_setcc (tmp, code, op0, op1);
19701 
19702       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19703     }
19704 
19705   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19706 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
19707 						operands[2], operands[3])));
19708 
19709   return true;
19710 }
19711 
19712 /* Expand a floating-point vector conditional move; a vcond operation
19713    rather than a movcc operation.  */
19714 
19715 bool
19716 ix86_expand_fp_vcond (rtx operands[])
19717 {
19718   enum rtx_code code = GET_CODE (operands[3]);
19719   rtx cmp;
19720 
19721   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19722 					   &operands[4], &operands[5]);
19723   if (code == UNKNOWN)
19724     {
19725       rtx temp;
19726       switch (GET_CODE (operands[3]))
19727 	{
19728 	case LTGT:
19729 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19730 				      operands[5], operands[0], operands[0]);
19731 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19732 				     operands[5], operands[1], operands[2]);
19733 	  code = AND;
19734 	  break;
19735 	case UNEQ:
19736 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19737 				      operands[5], operands[0], operands[0]);
19738 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19739 				     operands[5], operands[1], operands[2]);
19740 	  code = IOR;
19741 	  break;
19742 	default:
19743 	  gcc_unreachable ();
19744 	}
19745       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19746 				 OPTAB_DIRECT);
19747       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19748       return true;
19749     }
19750 
19751   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19752 				 operands[5], operands[1], operands[2]))
19753     return true;
19754 
19755   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19756 			     operands[1], operands[2]);
19757   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19758   return true;
19759 }
19760 
19761 /* Expand a signed/unsigned integral vector conditional move.  */
19762 
19763 bool
19764 ix86_expand_int_vcond (rtx operands[])
19765 {
19766   enum machine_mode data_mode = GET_MODE (operands[0]);
19767   enum machine_mode mode = GET_MODE (operands[4]);
19768   enum rtx_code code = GET_CODE (operands[3]);
19769   bool negate = false;
19770   rtx x, cop0, cop1;
19771 
19772   cop0 = operands[4];
19773   cop1 = operands[5];
19774 
19775   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19776      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
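  /* For example (illustration only), with V4SImode operands this turns
     "x < 0 ? -1 : 0" into an arithmetic shift (psrad $31) and
     "x < 0 ? 1 : 0" into a logical shift (psrld $31), replicating or
     isolating the sign bit of each element without any compare.  */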
19777   if ((code == LT || code == GE)
19778       && data_mode == mode
19779       && cop1 == CONST0_RTX (mode)
19780       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19781       && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19782       && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19783       && (GET_MODE_SIZE (data_mode) == 16
19784 	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19785     {
19786       rtx negop = operands[2 - (code == LT)];
19787       int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19788       if (negop == CONST1_RTX (data_mode))
19789 	{
19790 	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19791 					 operands[0], 1, OPTAB_DIRECT);
19792 	  if (res != operands[0])
19793 	    emit_move_insn (operands[0], res);
19794 	  return true;
19795 	}
19796       else if (GET_MODE_INNER (data_mode) != DImode
19797 	       && vector_all_ones_operand (negop, data_mode))
19798 	{
19799 	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19800 					 operands[0], 0, OPTAB_DIRECT);
19801 	  if (res != operands[0])
19802 	    emit_move_insn (operands[0], res);
19803 	  return true;
19804 	}
19805     }
19806 
19807   if (!nonimmediate_operand (cop1, mode))
19808     cop1 = force_reg (mode, cop1);
19809   if (!general_operand (operands[1], data_mode))
19810     operands[1] = force_reg (data_mode, operands[1]);
19811   if (!general_operand (operands[2], data_mode))
19812     operands[2] = force_reg (data_mode, operands[2]);
19813 
19814   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
19815   if (TARGET_XOP
19816       && (mode == V16QImode || mode == V8HImode
19817 	  || mode == V4SImode || mode == V2DImode))
19818     ;
19819   else
19820     {
19821       /* Canonicalize the comparison to EQ, GT, GTU.  */
19822       switch (code)
19823 	{
19824 	case EQ:
19825 	case GT:
19826 	case GTU:
19827 	  break;
19828 
19829 	case NE:
19830 	case LE:
19831 	case LEU:
19832 	  code = reverse_condition (code);
19833 	  negate = true;
19834 	  break;
19835 
19836 	case GE:
19837 	case GEU:
19838 	  code = reverse_condition (code);
19839 	  negate = true;
19840 	  /* FALLTHRU */
19841 
19842 	case LT:
19843 	case LTU:
19844 	  code = swap_condition (code);
19845 	  x = cop0, cop0 = cop1, cop1 = x;
19846 	  break;
19847 
19848 	default:
19849 	  gcc_unreachable ();
19850 	}
19851 
19852       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
19853       if (mode == V2DImode)
19854 	{
19855 	  switch (code)
19856 	    {
19857 	    case EQ:
19858 	      /* SSE4.1 supports EQ.  */
19859 	      if (!TARGET_SSE4_1)
19860 		return false;
19861 	      break;
19862 
19863 	    case GT:
19864 	    case GTU:
19865 	      /* SSE4.2 supports GT/GTU.  */
19866 	      if (!TARGET_SSE4_2)
19867 		return false;
19868 	      break;
19869 
19870 	    default:
19871 	      gcc_unreachable ();
19872 	    }
19873 	}
19874 
19875       /* Unsigned parallel compare is not supported by the hardware.
19876 	 Play some tricks to turn this into a signed comparison
19877 	 against 0.  */
19878       if (code == GTU)
19879 	{
19880 	  cop0 = force_reg (mode, cop0);
19881 
19882 	  switch (mode)
19883 	    {
19884 	    case V8SImode:
19885 	    case V4DImode:
19886 	    case V4SImode:
19887 	    case V2DImode:
19888 		{
19889 		  rtx t1, t2, mask;
19890 		  rtx (*gen_sub3) (rtx, rtx, rtx);
19891 
19892 		  switch (mode)
19893 		    {
19894 		    case V8SImode: gen_sub3 = gen_subv8si3; break;
19895 		    case V4DImode: gen_sub3 = gen_subv4di3; break;
19896 		    case V4SImode: gen_sub3 = gen_subv4si3; break;
19897 		    case V2DImode: gen_sub3 = gen_subv2di3; break;
19898 		    default:
19899 		      gcc_unreachable ();
19900 		    }
19901 		  /* Subtract (-(INT MAX) - 1) from both operands to make
19902 		     them signed.  */
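		  /* Worked example (illustration): with 32-bit elements
		     x >u y  <=>  (x - 0x80000000) >s (y - 0x80000000);
		     e.g. 0xffffffff >u 0x00000001 becomes
		     0x7fffffff >s 0x80000001, which is true as signed.  */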
19903 		  mask = ix86_build_signbit_mask (mode, true, false);
19904 		  t1 = gen_reg_rtx (mode);
19905 		  emit_insn (gen_sub3 (t1, cop0, mask));
19906 
19907 		  t2 = gen_reg_rtx (mode);
19908 		  emit_insn (gen_sub3 (t2, cop1, mask));
19909 
19910 		  cop0 = t1;
19911 		  cop1 = t2;
19912 		  code = GT;
19913 		}
19914 	      break;
19915 
19916 	    case V32QImode:
19917 	    case V16HImode:
19918 	    case V16QImode:
19919 	    case V8HImode:
19920 	      /* Perform a parallel unsigned saturating subtraction.  */
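	      /* Rationale (illustrative): for unsigned x, y we have
		 x >u y  <=>  (x -us y) != 0, where -us is saturating
		 subtraction; e.g. for bytes, 5 -us 9 = 0 (not greater)
		 while 9 -us 5 = 4 (greater).  The EQ test against zero
		 below, together with flipping NEGATE, implements this.  */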
19921 	      x = gen_reg_rtx (mode);
19922 	      emit_insn (gen_rtx_SET (VOIDmode, x,
19923 				      gen_rtx_US_MINUS (mode, cop0, cop1)));
19924 
19925 	      cop0 = x;
19926 	      cop1 = CONST0_RTX (mode);
19927 	      code = EQ;
19928 	      negate = !negate;
19929 	      break;
19930 
19931 	    default:
19932 	      gcc_unreachable ();
19933 	    }
19934 	}
19935     }
19936 
19937   /* Allow the comparison to be done in one mode, but the movcc to
19938      happen in another mode.  */
19939   if (data_mode == mode)
19940     {
19941       x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19942 			       operands[1+negate], operands[2-negate]);
19943     }
19944   else
19945     {
19946       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19947       x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19948 			       code, cop0, cop1,
19949 			       operands[1+negate], operands[2-negate]);
19950       x = gen_lowpart (data_mode, x);
19951     }
19952 
19953   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19954 			 operands[2-negate]);
19955   return true;
19956 }
19957 
19958 /* Expand a variable vector permutation.  */
19959 
19960 void
19961 ix86_expand_vec_perm (rtx operands[])
19962 {
19963   rtx target = operands[0];
19964   rtx op0 = operands[1];
19965   rtx op1 = operands[2];
19966   rtx mask = operands[3];
19967   rtx t1, t2, t3, t4, vt, vt2, vec[32];
19968   enum machine_mode mode = GET_MODE (op0);
19969   enum machine_mode maskmode = GET_MODE (mask);
19970   int w, e, i;
19971   bool one_operand_shuffle = rtx_equal_p (op0, op1);
19972 
19973   /* Number of elements in the vector.  */
19974   w = GET_MODE_NUNITS (mode);
19975   e = GET_MODE_UNIT_SIZE (mode);
19976   gcc_assert (w <= 32);
19977 
19978   if (TARGET_AVX2)
19979     {
19980       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19981 	{
19982 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19983 	     a constant shuffle operand.  With a tiny bit of effort we can
19984 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
19985 	     unfortunate but there's no avoiding it.
19986 	     Similarly for V16HImode we don't have instructions for variable
19987 	     shuffling, while for V32QImode, after preparing suitable masks,
19988 	     we can use vpshufb; vpshufb; vpermq; vpor.  */
19989 
19990 	  if (mode == V16HImode)
19991 	    {
19992 	      maskmode = mode = V32QImode;
19993 	      w = 32;
19994 	      e = 1;
19995 	    }
19996 	  else
19997 	    {
19998 	      maskmode = mode = V8SImode;
19999 	      w = 8;
20000 	      e = 4;
20001 	    }
20002 	  t1 = gen_reg_rtx (maskmode);
20003 
20004 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
20005 	       mask = { A B C D }
20006 	       t1 = { A A B B C C D D }.  */
20007 	  for (i = 0; i < w / 2; ++i)
20008 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20009 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20010 	  vt = force_reg (maskmode, vt);
20011 	  mask = gen_lowpart (maskmode, mask);
20012 	  if (maskmode == V8SImode)
20013 	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20014 	  else
20015 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20016 
20017 	  /* Multiply the shuffle indices by two.  */
20018 	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20019 				    OPTAB_DIRECT);
20020 
20021 	  /* Add one to the odd shuffle indices:
20022 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
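	  /* Concrete example (illustration): a V4DImode mask { 3 0 2 1 }
	     becomes t1 = { 6 7 0 1 4 5 2 3 } in V8SImode, i.e. each DImode
	     index i is replaced by the pair of SImode indices 2*i, 2*i+1
	     selecting its low and high halves.  */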
20023 	  for (i = 0; i < w / 2; ++i)
20024 	    {
20025 	      vec[i * 2] = const0_rtx;
20026 	      vec[i * 2 + 1] = const1_rtx;
20027 	    }
20028 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20029 	  vt = force_const_mem (maskmode, vt);
20030 	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20031 				    OPTAB_DIRECT);
20032 
20033 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
20034 	  operands[3] = mask = t1;
20035 	  target = gen_lowpart (mode, target);
20036 	  op0 = gen_lowpart (mode, op0);
20037 	  op1 = gen_lowpart (mode, op1);
20038 	}
20039 
20040       switch (mode)
20041 	{
20042 	case V8SImode:
20043 	  /* The VPERMD and VPERMPS instructions already properly ignore
20044 	     the high bits of the shuffle elements.  No need for us to
20045 	     perform an AND ourselves.  */
20046 	  if (one_operand_shuffle)
20047 	    emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20048 	  else
20049 	    {
20050 	      t1 = gen_reg_rtx (V8SImode);
20051 	      t2 = gen_reg_rtx (V8SImode);
20052 	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20053 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20054 	      goto merge_two;
20055 	    }
20056 	  return;
20057 
20058 	case V8SFmode:
20059 	  mask = gen_lowpart (V8SFmode, mask);
20060 	  if (one_operand_shuffle)
20061 	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20062 	  else
20063 	    {
20064 	      t1 = gen_reg_rtx (V8SFmode);
20065 	      t2 = gen_reg_rtx (V8SFmode);
20066 	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20067 	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20068 	      goto merge_two;
20069 	    }
20070 	  return;
20071 
20072         case V4SImode:
20073 	  /* By combining the two 128-bit input vectors into one 256-bit
20074 	     input vector, we can use VPERMD and VPERMPS for the full
20075 	     two-operand shuffle.  */
20076 	  t1 = gen_reg_rtx (V8SImode);
20077 	  t2 = gen_reg_rtx (V8SImode);
20078 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20079 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20080 	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20081 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20082 	  return;
20083 
20084         case V4SFmode:
20085 	  t1 = gen_reg_rtx (V8SFmode);
20086 	  t2 = gen_reg_rtx (V8SImode);
20087 	  mask = gen_lowpart (V4SImode, mask);
20088 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20089 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20090 	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20091 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20092 	  return;
20093 
20094 	case V32QImode:
20095 	  t1 = gen_reg_rtx (V32QImode);
20096 	  t2 = gen_reg_rtx (V32QImode);
20097 	  t3 = gen_reg_rtx (V32QImode);
20098 	  vt2 = GEN_INT (128);
20099 	  for (i = 0; i < 32; i++)
20100 	    vec[i] = vt2;
20101 	  vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20102 	  vt = force_reg (V32QImode, vt);
20103 	  for (i = 0; i < 32; i++)
20104 	    vec[i] = i < 16 ? vt2 : const0_rtx;
20105 	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20106 	  vt2 = force_reg (V32QImode, vt2);
20107 	  /* From mask create two adjusted masks, which contain the same
20108 	     bits as mask in the low 7 bits of each vector element.
20109 	     The first mask will have the most significant bit clear
20110 	     if it requests an element from the same 128-bit lane
20111 	     and the MSB set if it requests an element from the other 128-bit lane.
20112 	     The second mask will have the opposite values of the MSB,
20113 	     and additionally will have its 128-bit lanes swapped.
20114 	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20115 	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
20116 	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20117 	     stands for the other 12 bytes.  */
20118 	  /* The bit saying whether an element comes from the same lane or the
20119 	     other lane is bit 4, so shift it up by 3 to the MSB position.  */
20120 	  emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20121 				    gen_lowpart (V4DImode, mask),
20122 				    GEN_INT (3)));
20123 	  /* Clear MSB bits from the mask just in case it had them set.  */
20124 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20125 	  /* After this t1 will have MSB set for elements from other lane.  */
20126 	  emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20127 	  /* Clear bits other than MSB.  */
20128 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
20129 	  /* Or in the lower bits from mask into t3.  */
20130 	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
20131 	  /* And invert MSB bits in t1, so MSB is set for elements from the same
20132 	     lane.  */
20133 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
20134 	  /* Swap 128-bit lanes in t3.  */
20135 	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20136 					  gen_lowpart (V4DImode, t3),
20137 					  const2_rtx, GEN_INT (3),
20138 					  const0_rtx, const1_rtx));
20139 	  /* And or in the lower bits from mask into t1.  */
20140 	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
20141 	  if (one_operand_shuffle)
20142 	    {
20143 	      /* Each of these shuffles will put 0s in places where
20144 		 element from the other 128-bit lane is needed, otherwise
20145 		 will shuffle in the requested value.  */
20146 	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20147 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20148 	      /* For t3 the 128-bit lanes are swapped again.  */
20149 	      emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20150 					      gen_lowpart (V4DImode, t3),
20151 					      const2_rtx, GEN_INT (3),
20152 					      const0_rtx, const1_rtx));
20153 	      /* And oring both together leads to the result.  */
20154 	      emit_insn (gen_iorv32qi3 (target, t1, t3));
20155 	      return;
20156 	    }
20157 
20158 	  t4 = gen_reg_rtx (V32QImode);
20159 	  /* Similarly to the above one_operand_shuffle code,
20160 	     just repeated twice, once for each operand.  The merge_two:
20161 	     code will merge the two results together.  */
20162 	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20163 	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20164 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20165 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20166 	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20167 					  gen_lowpart (V4DImode, t4),
20168 					  const2_rtx, GEN_INT (3),
20169 					  const0_rtx, const1_rtx));
20170 	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20171 					  gen_lowpart (V4DImode, t3),
20172 					  const2_rtx, GEN_INT (3),
20173 					  const0_rtx, const1_rtx));
20174 	  emit_insn (gen_iorv32qi3 (t4, t2, t4));
20175 	  emit_insn (gen_iorv32qi3 (t3, t1, t3));
20176 	  t1 = t4;
20177 	  t2 = t3;
20178 	  goto merge_two;
20179 
20180 	default:
20181 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
20182 	  break;
20183 	}
20184     }
20185 
20186   if (TARGET_XOP)
20187     {
20188       /* The XOP VPPERM insn supports three inputs.  By ignoring the
20189 	 one_operand_shuffle special case, we avoid creating another
20190 	 set of constant vectors in memory.  */
20191       one_operand_shuffle = false;
20192 
20193       /* mask = mask & {2*w-1, ...} */
20194       vt = GEN_INT (2*w - 1);
20195     }
20196   else
20197     {
20198       /* mask = mask & {w-1, ...} */
20199       vt = GEN_INT (w - 1);
20200     }
20201 
20202   for (i = 0; i < w; i++)
20203     vec[i] = vt;
20204   vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20205   mask = expand_simple_binop (maskmode, AND, mask, vt,
20206 			      NULL_RTX, 0, OPTAB_DIRECT);
20207 
20208   /* For non-QImode operations, convert the word permutation control
20209      into a byte permutation control.  */
20210   if (mode != V16QImode)
20211     {
20212       mask = expand_simple_binop (maskmode, ASHIFT, mask,
20213 				  GEN_INT (exact_log2 (e)),
20214 				  NULL_RTX, 0, OPTAB_DIRECT);
20215 
20216       /* Convert mask to vector of chars.  */
20217       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20218 
20219       /* Replicate each of the input bytes into byte positions:
20220 	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20221 	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20222 	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
20223       for (i = 0; i < 16; ++i)
20224 	vec[i] = GEN_INT (i/e * e);
20225       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20226       vt = force_const_mem (V16QImode, vt);
20227       if (TARGET_XOP)
20228 	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20229       else
20230 	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20231 
20232       /* Convert it into the byte positions by doing
20233 	 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...}  */
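      /* Illustration: for V4SImode (e == 4) a word index k has by now been
	 replicated and scaled to { 4k, 4k, 4k, 4k }; adding { 0, 1, 2, 3 }
	 yields the byte indices { 4k, 4k+1, 4k+2, 4k+3 } that pshufb
	 expects.  */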
20234       for (i = 0; i < 16; ++i)
20235 	vec[i] = GEN_INT (i % e);
20236       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20237       vt = force_const_mem (V16QImode, vt);
20238       emit_insn (gen_addv16qi3 (mask, mask, vt));
20239     }
20240 
20241   /* The actual shuffle operations all operate on V16QImode.  */
20242   op0 = gen_lowpart (V16QImode, op0);
20243   op1 = gen_lowpart (V16QImode, op1);
20244   target = gen_lowpart (V16QImode, target);
20245 
20246   if (TARGET_XOP)
20247     {
20248       emit_insn (gen_xop_pperm (target, op0, op1, mask));
20249     }
20250   else if (one_operand_shuffle)
20251     {
20252       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20253     }
20254   else
20255     {
20256       rtx xops[6];
20257       bool ok;
20258 
20259       /* Shuffle the two input vectors independently.  */
20260       t1 = gen_reg_rtx (V16QImode);
20261       t2 = gen_reg_rtx (V16QImode);
20262       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20263       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20264 
20265  merge_two:
20266       /* Then merge them together.  The key is whether any given control
20267          element contained a bit set that indicates the second word.  */
20268       mask = operands[3];
20269       vt = GEN_INT (w);
20270       if (maskmode == V2DImode && !TARGET_SSE4_1)
20271 	{
20272 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
20273 	     more shuffle to convert the V2DI input mask into a V4SI
20274 	     input mask.  At that point the masking that ix86_expand_int_vcond
20275 	     performs will work as desired.  */
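	  /* Sketch of the conversion: a V2DImode mask { m0, m1 } viewed as
	     V4SImode is { m0.lo, m0.hi, m1.lo, m1.hi }; pshufd with the
	     selector (0,0,2,2) produces { m0.lo, m0.lo, m1.lo, m1.lo }, so
	     each original doubleword element is represented by two
	     identical SImode copies of its low half.  */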
20276 	  rtx t3 = gen_reg_rtx (V4SImode);
20277 	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20278 				        const0_rtx, const0_rtx,
20279 				        const2_rtx, const2_rtx));
20280 	  mask = t3;
20281 	  maskmode = V4SImode;
20282 	  e = w = 4;
20283 	}
20284 
20285       for (i = 0; i < w; i++)
20286 	vec[i] = vt;
20287       vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20288       vt = force_reg (maskmode, vt);
20289       mask = expand_simple_binop (maskmode, AND, mask, vt,
20290 				  NULL_RTX, 0, OPTAB_DIRECT);
20291 
20292       xops[0] = gen_lowpart (mode, operands[0]);
20293       xops[1] = gen_lowpart (mode, t2);
20294       xops[2] = gen_lowpart (mode, t1);
20295       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20296       xops[4] = mask;
20297       xops[5] = vt;
20298       ok = ix86_expand_int_vcond (xops);
20299       gcc_assert (ok);
20300     }
20301 }
20302 
20303 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
20304    true if we should do zero extension, else sign extension.  HIGH_P is
20305    true if we want the N/2 high elements, else the low elements.  */
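/* For instance (illustrative), unpacking the low half of a V8HImode vector
   with sign extension uses pmovsxwd on SSE4.1; without SSE4.1 the fallback
   below interleaves the input with a vector of per-element sign masks
   (computed as 0 > x, i.e. pcmpgtw), which produces the same V4SImode
   result.  */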
20306 
20307 void
20308 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20309 {
20310   enum machine_mode imode = GET_MODE (operands[1]);
20311   rtx tmp, dest;
20312 
20313   if (TARGET_SSE4_1)
20314     {
20315       rtx (*unpack)(rtx, rtx);
20316       rtx (*extract)(rtx, rtx) = NULL;
20317       enum machine_mode halfmode = BLKmode;
20318 
20319       switch (imode)
20320 	{
20321 	case V32QImode:
20322 	  if (unsigned_p)
20323 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
20324 	  else
20325 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
20326 	  halfmode = V16QImode;
20327 	  extract
20328 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20329 	  break;
20330 	case V16HImode:
20331 	  if (unsigned_p)
20332 	    unpack = gen_avx2_zero_extendv8hiv8si2;
20333 	  else
20334 	    unpack = gen_avx2_sign_extendv8hiv8si2;
20335 	  halfmode = V8HImode;
20336 	  extract
20337 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20338 	  break;
20339 	case V8SImode:
20340 	  if (unsigned_p)
20341 	    unpack = gen_avx2_zero_extendv4siv4di2;
20342 	  else
20343 	    unpack = gen_avx2_sign_extendv4siv4di2;
20344 	  halfmode = V4SImode;
20345 	  extract
20346 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20347 	  break;
20348 	case V16QImode:
20349 	  if (unsigned_p)
20350 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20351 	  else
20352 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20353 	  break;
20354 	case V8HImode:
20355 	  if (unsigned_p)
20356 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
20357 	  else
20358 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
20359 	  break;
20360 	case V4SImode:
20361 	  if (unsigned_p)
20362 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
20363 	  else
20364 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
20365 	  break;
20366 	default:
20367 	  gcc_unreachable ();
20368 	}
20369 
20370       if (GET_MODE_SIZE (imode) == 32)
20371 	{
20372 	  tmp = gen_reg_rtx (halfmode);
20373 	  emit_insn (extract (tmp, operands[1]));
20374 	}
20375       else if (high_p)
20376 	{
20377 	  /* Shift higher 8 bytes to lower 8 bytes.  */
20378 	  tmp = gen_reg_rtx (imode);
20379 	  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20380 					 gen_lowpart (V1TImode, operands[1]),
20381 					 GEN_INT (64)));
20382 	}
20383       else
20384 	tmp = operands[1];
20385 
20386       emit_insn (unpack (operands[0], tmp));
20387     }
20388   else
20389     {
20390       rtx (*unpack)(rtx, rtx, rtx);
20391 
20392       switch (imode)
20393 	{
20394 	case V16QImode:
20395 	  if (high_p)
20396 	    unpack = gen_vec_interleave_highv16qi;
20397 	  else
20398 	    unpack = gen_vec_interleave_lowv16qi;
20399 	  break;
20400 	case V8HImode:
20401 	  if (high_p)
20402 	    unpack = gen_vec_interleave_highv8hi;
20403 	  else
20404 	    unpack = gen_vec_interleave_lowv8hi;
20405 	  break;
20406 	case V4SImode:
20407 	  if (high_p)
20408 	    unpack = gen_vec_interleave_highv4si;
20409 	  else
20410 	    unpack = gen_vec_interleave_lowv4si;
20411 	  break;
20412 	default:
20413 	  gcc_unreachable ();
20414 	}
20415 
20416       dest = gen_lowpart (imode, operands[0]);
20417 
20418       if (unsigned_p)
20419 	tmp = force_reg (imode, CONST0_RTX (imode));
20420       else
20421 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20422 				   operands[1], pc_rtx, pc_rtx);
20423 
20424       emit_insn (unpack (dest, operands[1], tmp));
20425     }
20426 }
20427 
20428 /* Expand conditional increment or decrement using adc/sbb instructions.
20429    The default case using setcc followed by the conditional move can be
20430    done by generic code.  */
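/* Hypothetical example of the intended output: for unsigned operands,
   "r = (a < b) ? r + 1 : r" can be emitted as

	cmpl	%ebx, %eax	; sets CF when a < b (unsigned)
	adcl	$0, %ecx	; r += CF

   and the mirrored cases use sbb and/or -1 instead.  */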
20431 bool
20432 ix86_expand_int_addcc (rtx operands[])
20433 {
20434   enum rtx_code code = GET_CODE (operands[1]);
20435   rtx flags;
20436   rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20437   rtx compare_op;
20438   rtx val = const0_rtx;
20439   bool fpcmp = false;
20440   enum machine_mode mode;
20441   rtx op0 = XEXP (operands[1], 0);
20442   rtx op1 = XEXP (operands[1], 1);
20443 
20444   if (operands[3] != const1_rtx
20445       && operands[3] != constm1_rtx)
20446     return false;
20447   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20448      return false;
20449   code = GET_CODE (compare_op);
20450 
20451   flags = XEXP (compare_op, 0);
20452 
20453   if (GET_MODE (flags) == CCFPmode
20454       || GET_MODE (flags) == CCFPUmode)
20455     {
20456       fpcmp = true;
20457       code = ix86_fp_compare_code_to_integer (code);
20458     }
20459 
20460   if (code != LTU)
20461     {
20462       val = constm1_rtx;
20463       if (fpcmp)
20464 	PUT_CODE (compare_op,
20465 		  reverse_condition_maybe_unordered
20466 		    (GET_CODE (compare_op)));
20467       else
20468 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20469     }
20470 
20471   mode = GET_MODE (operands[0]);
20472 
20473   /* Construct either adc or sbb insn.  */
20474   if ((code == LTU) == (operands[3] == constm1_rtx))
20475     {
20476       switch (mode)
20477 	{
20478 	  case QImode:
20479 	    insn = gen_subqi3_carry;
20480 	    break;
20481 	  case HImode:
20482 	    insn = gen_subhi3_carry;
20483 	    break;
20484 	  case SImode:
20485 	    insn = gen_subsi3_carry;
20486 	    break;
20487 	  case DImode:
20488 	    insn = gen_subdi3_carry;
20489 	    break;
20490 	  default:
20491 	    gcc_unreachable ();
20492 	}
20493     }
20494   else
20495     {
20496       switch (mode)
20497 	{
20498 	  case QImode:
20499 	    insn = gen_addqi3_carry;
20500 	    break;
20501 	  case HImode:
20502 	    insn = gen_addhi3_carry;
20503 	    break;
20504 	  case SImode:
20505 	    insn = gen_addsi3_carry;
20506 	    break;
20507 	  case DImode:
20508 	    insn = gen_adddi3_carry;
20509 	    break;
20510 	  default:
20511 	    gcc_unreachable ();
20512 	}
20513     }
20514   emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20515 
20516   return true;
20517 }
20518 
20519 
20520 /* Split OPERAND into half-mode parts stored in PARTS.  Similar to
20521    split_double_mode, but works for floating point parameters and
20522    non-offsettable memories.  For pushes, it returns just stack offsets;
20523    the values will be saved in the right order.  Maximally four parts are generated; the number of parts is returned.  */
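/* For example (illustration): on a 32-bit target a DFmode value splits
   into two SImode parts, an XFmode value into three and a TFmode value
   into four, while on a 64-bit target XFmode splits into a DImode part
   plus an SImode upper part.  */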
20524 
20525 static int
20526 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20527 {
20528   int size;
20529 
20530   if (!TARGET_64BIT)
20531     size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20532   else
20533     size = (GET_MODE_SIZE (mode) + 4) / 8;
20534 
20535   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20536   gcc_assert (size >= 2 && size <= 4);
20537 
20538   /* Optimize constant pool reference to immediates.  This is used by fp
20539      moves, which force all constants to memory to allow combining.  */
20540   if (MEM_P (operand) && MEM_READONLY_P (operand))
20541     {
20542       rtx tmp = maybe_get_pool_constant (operand);
20543       if (tmp)
20544 	operand = tmp;
20545     }
20546 
20547   if (MEM_P (operand) && !offsettable_memref_p (operand))
20548     {
20549       /* The only non-offsettable memories we handle are pushes.  */
20550       int ok = push_operand (operand, VOIDmode);
20551 
20552       gcc_assert (ok);
20553 
20554       operand = copy_rtx (operand);
20555       PUT_MODE (operand, Pmode);
20556       parts[0] = parts[1] = parts[2] = parts[3] = operand;
20557       return size;
20558     }
20559 
20560   if (GET_CODE (operand) == CONST_VECTOR)
20561     {
20562       enum machine_mode imode = int_mode_for_mode (mode);
20563       /* Caution: if we looked through a constant pool memory above,
20564 	 the operand may actually have a different mode now.  That's
20565 	 ok, since we want to pun this all the way back to an integer.  */
20566       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20567       gcc_assert (operand != NULL);
20568       mode = imode;
20569     }
20570 
20571   if (!TARGET_64BIT)
20572     {
20573       if (mode == DImode)
20574 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20575       else
20576 	{
20577 	  int i;
20578 
20579 	  if (REG_P (operand))
20580 	    {
20581 	      gcc_assert (reload_completed);
20582 	      for (i = 0; i < size; i++)
20583 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20584 	    }
20585 	  else if (offsettable_memref_p (operand))
20586 	    {
20587 	      operand = adjust_address (operand, SImode, 0);
20588 	      parts[0] = operand;
20589 	      for (i = 1; i < size; i++)
20590 		parts[i] = adjust_address (operand, SImode, 4 * i);
20591 	    }
20592 	  else if (GET_CODE (operand) == CONST_DOUBLE)
20593 	    {
20594 	      REAL_VALUE_TYPE r;
20595 	      long l[4];
20596 
20597 	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20598 	      switch (mode)
20599 		{
20600 		case TFmode:
20601 		  real_to_target (l, &r, mode);
20602 		  parts[3] = gen_int_mode (l[3], SImode);
20603 		  parts[2] = gen_int_mode (l[2], SImode);
20604 		  break;
20605 		case XFmode:
20606 		  REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20607 		  parts[2] = gen_int_mode (l[2], SImode);
20608 		  break;
20609 		case DFmode:
20610 		  REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20611 		  break;
20612 		default:
20613 		  gcc_unreachable ();
20614 		}
20615 	      parts[1] = gen_int_mode (l[1], SImode);
20616 	      parts[0] = gen_int_mode (l[0], SImode);
20617 	    }
20618 	  else
20619 	    gcc_unreachable ();
20620 	}
20621     }
20622   else
20623     {
20624       if (mode == TImode)
20625 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20626       if (mode == XFmode || mode == TFmode)
20627 	{
20628 	  enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
20629 	  if (REG_P (operand))
20630 	    {
20631 	      gcc_assert (reload_completed);
20632 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20633 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20634 	    }
20635 	  else if (offsettable_memref_p (operand))
20636 	    {
20637 	      operand = adjust_address (operand, DImode, 0);
20638 	      parts[0] = operand;
20639 	      parts[1] = adjust_address (operand, upper_mode, 8);
20640 	    }
20641 	  else if (GET_CODE (operand) == CONST_DOUBLE)
20642 	    {
20643 	      REAL_VALUE_TYPE r;
20644 	      long l[4];
20645 
20646 	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20647 	      real_to_target (l, &r, mode);
20648 
20649 	      /* Do not use shift by 32 to avoid warning on 32bit systems.  */
20650 	      if (HOST_BITS_PER_WIDE_INT >= 64)
20651 	        parts[0]
20652 		  = gen_int_mode
20653 		      ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20654 		       + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20655 		       DImode);
20656 	      else
20657 	        parts[0] = immed_double_const (l[0], l[1], DImode);
20658 
20659 	      if (upper_mode == SImode)
20660 	        parts[1] = gen_int_mode (l[2], SImode);
20661 	      else if (HOST_BITS_PER_WIDE_INT >= 64)
20662 	        parts[1]
20663 		  = gen_int_mode
20664 		      ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20665 		       + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20666 		       DImode);
20667 	      else
20668 	        parts[1] = immed_double_const (l[2], l[3], DImode);
20669 	    }
20670 	  else
20671 	    gcc_unreachable ();
20672 	}
20673     }
20674 
20675   return size;
20676 }
20677 
20678 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20679    All required insns are emitted here.  Operands 2-5 are used internally
20680    to hold the destination parts and operands 6-9 the source parts, in
20681    the correct order.  */
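/* E.g. (sketch) a 32-bit DImode copy from memory into a register pair is
   emitted as two SImode moves, reordered when necessary so that the half
   overlapping the source address register is loaded last.  */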
20682 
20683 void
20684 ix86_split_long_move (rtx operands[])
20685 {
20686   rtx part[2][4];
20687   int nparts, i, j;
20688   int push = 0;
20689   int collisions = 0;
20690   enum machine_mode mode = GET_MODE (operands[0]);
20691   bool collisionparts[4];
20692 
20693   /* The DFmode expanders may ask us to move a double.
20694      For a 64-bit target this is a single move.  By hiding the fact
20695      here we simplify the i386.md splitters.  */
20696   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20697     {
20698       /* Optimize constant pool reference to immediates.  This is used by
20699 	 fp moves, which force all constants to memory to allow combining.  */
20700 
20701       if (MEM_P (operands[1])
20702 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20703 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20704 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
20705       if (push_operand (operands[0], VOIDmode))
20706 	{
20707 	  operands[0] = copy_rtx (operands[0]);
20708 	  PUT_MODE (operands[0], Pmode);
20709 	}
20710       else
20711         operands[0] = gen_lowpart (DImode, operands[0]);
20712       operands[1] = gen_lowpart (DImode, operands[1]);
20713       emit_move_insn (operands[0], operands[1]);
20714       return;
20715     }
20716 
20717   /* The only non-offsettable memory we handle is push.  */
20718   if (push_operand (operands[0], VOIDmode))
20719     push = 1;
20720   else
20721     gcc_assert (!MEM_P (operands[0])
20722 		|| offsettable_memref_p (operands[0]));
20723 
20724   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20725   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20726 
20727   /* When emitting push, take care for source operands on the stack.  */
20728   if (push && MEM_P (operands[1])
20729       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20730     {
20731       rtx src_base = XEXP (part[1][nparts - 1], 0);
20732 
20733       /* Compensate for the stack decrement by 4.  */
20734       if (!TARGET_64BIT && nparts == 3
20735 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20736 	src_base = plus_constant (src_base, 4);
20737 
20738       /* src_base refers to the stack pointer and is
20739 	 automatically decreased by emitted push.  */
20740       for (i = 0; i < nparts; i++)
20741 	part[1][i] = change_address (part[1][i],
20742 				     GET_MODE (part[1][i]), src_base);
20743     }
20744 
20745   /* We need to do the copy in the right order in case an address register
20746      of the source overlaps the destination.  */
20747   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20748     {
20749       rtx tmp;
20750 
20751       for (i = 0; i < nparts; i++)
20752 	{
20753 	  collisionparts[i]
20754 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20755 	  if (collisionparts[i])
20756 	    collisions++;
20757 	}
20758 
20759       /* Collision in the middle part can be handled by reordering.  */
20760       if (collisions == 1 && nparts == 3 && collisionparts [1])
20761 	{
20762 	  tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20763 	  tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20764 	}
20765       else if (collisions == 1
20766 	       && nparts == 4
20767 	       && (collisionparts [1] || collisionparts [2]))
20768 	{
20769 	  if (collisionparts [1])
20770 	    {
20771 	      tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20772 	      tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20773 	    }
20774 	  else
20775 	    {
20776 	      tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20777 	      tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20778 	    }
20779 	}
20780 
20781       /* If there are more collisions, we can't handle it by reordering.
20782 	 Do an lea to the last part and use only one colliding move.  */
20783       else if (collisions > 1)
20784 	{
20785 	  rtx base;
20786 
20787 	  collisions = 1;
20788 
20789 	  base = part[0][nparts - 1];
20790 
20791 	  /* Handle the case when the last part isn't valid for lea.
20792 	     Happens in 64-bit mode storing the 12-byte XFmode.  */
20793 	  if (GET_MODE (base) != Pmode)
20794 	    base = gen_rtx_REG (Pmode, REGNO (base));
20795 
20796 	  emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20797 	  part[1][0] = replace_equiv_address (part[1][0], base);
20798 	  for (i = 1; i < nparts; i++)
20799 	    {
20800 	      tmp = plus_constant (base, UNITS_PER_WORD * i);
20801 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
20802 	    }
20803 	}
20804     }
20805 
20806   if (push)
20807     {
20808       if (!TARGET_64BIT)
20809 	{
20810 	  if (nparts == 3)
20811 	    {
20812 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20813                 emit_insn (gen_addsi3 (stack_pointer_rtx,
20814 				       stack_pointer_rtx, GEN_INT (-4)));
20815 	      emit_move_insn (part[0][2], part[1][2]);
20816 	    }
20817 	  else if (nparts == 4)
20818 	    {
20819 	      emit_move_insn (part[0][3], part[1][3]);
20820 	      emit_move_insn (part[0][2], part[1][2]);
20821 	    }
20822 	}
20823       else
20824 	{
20825 	  /* In 64-bit mode we don't have a 32-bit push available.  If this is a
20826 	     register, that is OK - we will just use the larger counterpart.  We also
20827 	     retype memory - this comes from an attempt to avoid a REX prefix when
20828 	     moving the second half of a TFmode value.  */
20829 	  if (GET_MODE (part[1][1]) == SImode)
20830 	    {
20831 	      switch (GET_CODE (part[1][1]))
20832 		{
20833 		case MEM:
20834 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
20835 		  break;
20836 
20837 		case REG:
20838 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20839 		  break;
20840 
20841 		default:
20842 		  gcc_unreachable ();
20843 		}
20844 
20845 	      if (GET_MODE (part[1][0]) == SImode)
20846 		part[1][0] = part[1][1];
20847 	    }
20848 	}
20849       emit_move_insn (part[0][1], part[1][1]);
20850       emit_move_insn (part[0][0], part[1][0]);
20851       return;
20852     }
20853 
20854   /* Choose correct order to not overwrite the source before it is copied.  */
20855   if ((REG_P (part[0][0])
20856        && REG_P (part[1][1])
20857        && (REGNO (part[0][0]) == REGNO (part[1][1])
20858 	   || (nparts == 3
20859 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
20860 	   || (nparts == 4
20861 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
20862       || (collisions > 0
20863 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20864     {
20865       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20866 	{
20867 	  operands[2 + i] = part[0][j];
20868 	  operands[6 + i] = part[1][j];
20869 	}
20870     }
20871   else
20872     {
20873       for (i = 0; i < nparts; i++)
20874 	{
20875 	  operands[2 + i] = part[0][i];
20876 	  operands[6 + i] = part[1][i];
20877 	}
20878     }
20879 
20880   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
20881   if (optimize_insn_for_size_p ())
20882     {
20883       for (j = 0; j < nparts - 1; j++)
20884 	if (CONST_INT_P (operands[6 + j])
20885 	    && operands[6 + j] != const0_rtx
20886 	    && REG_P (operands[2 + j]))
20887 	  for (i = j; i < nparts - 1; i++)
20888 	    if (CONST_INT_P (operands[7 + i])
20889 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20890 	      operands[7 + i] = operands[2 + j];
20891     }
20892 
20893   for (i = 0; i < nparts; i++)
20894     emit_move_insn (operands[2 + i], operands[6 + i]);
20895 
20896   return;
20897 }
20898 
20899 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20900    left shift by a constant, either using a single shift or
20901    a sequence of add instructions.  */
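/* For instance (illustration), when adds are cheap enough "x <<= 2" can be
   emitted as two self-adds:

	addl	%eax, %eax
	addl	%eax, %eax

   otherwise a single "sall $2, %eax" is used; the test below compares
   count * add cost against the constant-shift cost.  */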
20902 
20903 static void
20904 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20905 {
20906   rtx (*insn)(rtx, rtx, rtx);
20907 
20908   if (count == 1
20909       || (count * ix86_cost->add <= ix86_cost->shift_const
20910 	  && !optimize_insn_for_size_p ()))
20911     {
20912       insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20913       while (count-- > 0)
20914 	emit_insn (insn (operand, operand, operand));
20915     }
20916   else
20917     {
20918       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20919       emit_insn (insn (operand, operand, GEN_INT (count)));
20920     }
20921 }
20922 
20923 void
20924 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20925 {
20926   rtx (*gen_ashl3)(rtx, rtx, rtx);
20927   rtx (*gen_shld)(rtx, rtx, rtx);
20928   int half_width = GET_MODE_BITSIZE (mode) >> 1;
20929 
20930   rtx low[2], high[2];
20931   int count;
20932 
20933   if (CONST_INT_P (operands[2]))
20934     {
20935       split_double_mode (mode, operands, 2, low, high);
20936       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20937 
20938       if (count >= half_width)
20939 	{
20940 	  emit_move_insn (high[0], low[1]);
20941 	  emit_move_insn (low[0], const0_rtx);
20942 
20943 	  if (count > half_width)
20944 	    ix86_expand_ashl_const (high[0], count - half_width, mode);
20945 	}
20946       else
20947 	{
20948 	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20949 
20950 	  if (!rtx_equal_p (operands[0], operands[1]))
20951 	    emit_move_insn (operands[0], operands[1]);
20952 
20953 	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20954 	  ix86_expand_ashl_const (low[0], count, mode);
20955 	}
20956       return;
20957     }
20958 
20959   split_double_mode (mode, operands, 1, low, high);
20960 
20961   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20962 
20963   if (operands[1] == const1_rtx)
20964     {
20965       /* Assuming we've chosen QImode-capable registers, 1 << N
20966 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
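      /* Sketch of the idea: clear both halves, then use the Z flag from
	 testing bit 5 (resp. 6) of the shift count to set exactly one of
	 them to 1 (the low half when the count is below half_width, the
	 high half otherwise), and finally shift both halves by the count;
	 the half-wide shifts only look at the low 5 (resp. 6) bits of the
	 count, so the half that is still 0 stays 0.  */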
20967       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20968 	{
20969 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20970 
20971 	  ix86_expand_clear (low[0]);
20972 	  ix86_expand_clear (high[0]);
20973 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20974 
20975 	  d = gen_lowpart (QImode, low[0]);
20976 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20977 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
20978 	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
20979 
20980 	  d = gen_lowpart (QImode, high[0]);
20981 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20982 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
20983 	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
20984 	}
20985 
20986       /* Otherwise, we can get the same results by manually performing
20987 	 a bit extract operation on bit 5/6, and then performing the two
20988 	 shifts.  The two methods of getting 0/1 into low/high are exactly
20989 	 the same size.  Avoiding the shift in the bit extract case helps
20990 	 pentium4 a bit; no one else seems to care much either way.  */
20991       else
20992 	{
20993 	  enum machine_mode half_mode;
20994 	  rtx (*gen_lshr3)(rtx, rtx, rtx);
20995 	  rtx (*gen_and3)(rtx, rtx, rtx);
20996 	  rtx (*gen_xor3)(rtx, rtx, rtx);
20997 	  HOST_WIDE_INT bits;
20998 	  rtx x;
20999 
21000 	  if (mode == DImode)
21001 	    {
21002 	      half_mode = SImode;
21003 	      gen_lshr3 = gen_lshrsi3;
21004 	      gen_and3 = gen_andsi3;
21005 	      gen_xor3 = gen_xorsi3;
21006 	      bits = 5;
21007 	    }
21008 	  else
21009 	    {
21010 	      half_mode = DImode;
21011 	      gen_lshr3 = gen_lshrdi3;
21012 	      gen_and3 = gen_anddi3;
21013 	      gen_xor3 = gen_xordi3;
21014 	      bits = 6;
21015 	    }
21016 
21017 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21018 	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21019 	  else
21020 	    x = gen_lowpart (half_mode, operands[2]);
21021 	  emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21022 
21023 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21024 	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21025 	  emit_move_insn (low[0], high[0]);
21026 	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21027 	}
21028 
21029       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21030       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21031       return;
21032     }
21033 
21034   if (operands[1] == constm1_rtx)
21035     {
21036       /* For -1 << N, we can avoid the shld instruction, because we
21037 	 know that we're shifting 0...31/63 ones into a -1.  */
21038       emit_move_insn (low[0], constm1_rtx);
21039       if (optimize_insn_for_size_p ())
21040 	emit_move_insn (high[0], low[0]);
21041       else
21042 	emit_move_insn (high[0], constm1_rtx);
21043     }
21044   else
21045     {
21046       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21047 
21048       if (!rtx_equal_p (operands[0], operands[1]))
21049 	emit_move_insn (operands[0], operands[1]);
21050 
21051       split_double_mode (mode, operands, 1, low, high);
21052       emit_insn (gen_shld (high[0], low[0], operands[2]));
21053     }
21054 
21055   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21056 
21057   if (TARGET_CMOVE && scratch)
21058     {
21059       rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21060 	= mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21061 
21062       ix86_expand_clear (scratch);
21063       emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21064     }
21065   else
21066     {
21067       rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21068 	= mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21069 
21070       emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21071     }
21072 }
21073 
21074 void
21075 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21076 {
21077   rtx (*gen_ashr3)(rtx, rtx, rtx)
21078     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21079   rtx (*gen_shrd)(rtx, rtx, rtx);
21080   int half_width = GET_MODE_BITSIZE (mode) >> 1;
21081 
21082   rtx low[2], high[2];
21083   int count;
21084 
21085   if (CONST_INT_P (operands[2]))
21086     {
21087       split_double_mode (mode, operands, 2, low, high);
21088       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21089 
21090       if (count == GET_MODE_BITSIZE (mode) - 1)
21091 	{
21092 	  emit_move_insn (high[0], high[1]);
21093 	  emit_insn (gen_ashr3 (high[0], high[0],
21094 				GEN_INT (half_width - 1)));
21095 	  emit_move_insn (low[0], high[0]);
21096 
21097 	}
21098       else if (count >= half_width)
21099 	{
21100 	  emit_move_insn (low[0], high[1]);
21101 	  emit_move_insn (high[0], low[0]);
21102 	  emit_insn (gen_ashr3 (high[0], high[0],
21103 				GEN_INT (half_width - 1)));
21104 
21105 	  if (count > half_width)
21106 	    emit_insn (gen_ashr3 (low[0], low[0],
21107 				  GEN_INT (count - half_width)));
21108 	}
21109       else
21110 	{
21111 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21112 
21113 	  if (!rtx_equal_p (operands[0], operands[1]))
21114 	    emit_move_insn (operands[0], operands[1]);
21115 
21116 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21117 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21118 	}
21119     }
21120   else
21121     {
21122       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21123 
21124       if (!rtx_equal_p (operands[0], operands[1]))
21125 	emit_move_insn (operands[0], operands[1]);
21126 
21127       split_double_mode (mode, operands, 1, low, high);
21128 
21129       emit_insn (gen_shrd (low[0], high[0], operands[2]));
21130       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21131 
21132       if (TARGET_CMOVE && scratch)
21133 	{
21134 	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21135 	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21136 
21137 	  emit_move_insn (scratch, high[0]);
21138 	  emit_insn (gen_ashr3 (scratch, scratch,
21139 				GEN_INT (half_width - 1)));
21140 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21141 					  scratch));
21142 	}
21143       else
21144 	{
21145 	  rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21146 	    = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21147 
21148 	  emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21149 	}
21150     }
21151 }
21152 
21153 void
21154 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21155 {
21156   rtx (*gen_lshr3)(rtx, rtx, rtx)
21157     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21158   rtx (*gen_shrd)(rtx, rtx, rtx);
21159   int half_width = GET_MODE_BITSIZE (mode) >> 1;
21160 
21161   rtx low[2], high[2];
21162   int count;
21163 
21164   if (CONST_INT_P (operands[2]))
21165     {
21166       split_double_mode (mode, operands, 2, low, high);
21167       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21168 
21169       if (count >= half_width)
21170 	{
21171 	  emit_move_insn (low[0], high[1]);
21172 	  ix86_expand_clear (high[0]);
21173 
21174 	  if (count > half_width)
21175 	    emit_insn (gen_lshr3 (low[0], low[0],
21176 				  GEN_INT (count - half_width)));
21177 	}
21178       else
21179 	{
21180 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21181 
21182 	  if (!rtx_equal_p (operands[0], operands[1]))
21183 	    emit_move_insn (operands[0], operands[1]);
21184 
21185 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21186 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21187 	}
21188     }
21189   else
21190     {
21191       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21192 
21193       if (!rtx_equal_p (operands[0], operands[1]))
21194 	emit_move_insn (operands[0], operands[1]);
21195 
21196       split_double_mode (mode, operands, 1, low, high);
21197 
21198       emit_insn (gen_shrd (low[0], high[0], operands[2]));
21199       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21200 
21201       if (TARGET_CMOVE && scratch)
21202 	{
21203 	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21204 	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21205 
21206 	  ix86_expand_clear (scratch);
21207 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21208 					  scratch));
21209 	}
21210       else
21211 	{
21212 	  rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21213 	    = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21214 
21215 	  emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21216 	}
21217     }
21218 }
21219 
21220 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
21221 static void
21222 predict_jump (int prob)
21223 {
21224   rtx insn = get_last_insn ();
21225   gcc_assert (JUMP_P (insn));
21226   add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21227 }
21228 
21229 /* Helper function for the string operations below.  Test VARIABLE whether
21230    it is aligned to VALUE bytes.  If true, jump to the label.  */
21231 static rtx
21232 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21233 {
21234   rtx label = gen_label_rtx ();
21235   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21236   if (GET_MODE (variable) == DImode)
21237     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21238   else
21239     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21240   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21241 			   1, label);
21242   if (epilogue)
21243     predict_jump (REG_BR_PROB_BASE * 50 / 100);
21244   else
21245     predict_jump (REG_BR_PROB_BASE * 90 / 100);
21246   return label;
21247 }
21248 
21249 /* Decrement COUNTREG by VALUE.  */
21250 static void
21251 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21252 {
21253   rtx (*gen_add)(rtx, rtx, rtx)
21254     = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21255 
21256   emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21257 }
21258 
21259 /* Zero-extend a possibly SImode EXP to a Pmode register.  */
21260 rtx
21261 ix86_zero_extend_to_Pmode (rtx exp)
21262 {
21263   rtx r;
21264   if (GET_MODE (exp) == VOIDmode)
21265     return force_reg (Pmode, exp);
21266   if (GET_MODE (exp) == Pmode)
21267     return copy_to_mode_reg (Pmode, exp);
21268   r = gen_reg_rtx (Pmode);
21269   emit_insn (gen_zero_extendsidi2 (r, exp));
21270   return r;
21271 }
21272 
21273 /* Divide COUNTREG by SCALE.  */
21274 static rtx
21275 scale_counter (rtx countreg, int scale)
21276 {
21277   rtx sc;
21278 
21279   if (scale == 1)
21280     return countreg;
21281   if (CONST_INT_P (countreg))
21282     return GEN_INT (INTVAL (countreg) / scale);
21283   gcc_assert (REG_P (countreg));
21284 
21285   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21286 			    GEN_INT (exact_log2 (scale)),
21287 			    NULL, 1, OPTAB_DIRECT);
21288   return sc;
21289 }
21290 
21291 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
21292    DImode for constant loop counts.  */
21293 
21294 static enum machine_mode
21295 counter_mode (rtx count_exp)
21296 {
21297   if (GET_MODE (count_exp) != VOIDmode)
21298     return GET_MODE (count_exp);
21299   if (!CONST_INT_P (count_exp))
21300     return Pmode;
21301   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21302     return DImode;
21303   return SImode;
21304 }
21305 
21306 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21307    to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times,
21308    with overall size COUNT specified in bytes.  When SRCPTR is NULL, output
21309    the equivalent loop to set memory to VALUE (supposed to be in MODE).
21310 
21311    The size is rounded down to a whole number of chunks moved at once.
21312    SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.  */
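/* Roughly (illustrative pseudo-C of the emitted control flow):

     size = count & ~(UNROLL * chunk_size - 1);
     iter = 0;
     do
       {
	 copy or set UNROLL chunks at dest + iter (and src + iter);
	 iter += UNROLL * chunk_size;
       }
     while (iter < size);
     dest += iter;  src += iter;
     out_label:

   where chunk_size is GET_MODE_SIZE (MODE).  */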
21313 
21314 
21315 static void
21316 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21317 			       rtx destptr, rtx srcptr, rtx value,
21318 			       rtx count, enum machine_mode mode, int unroll,
21319 			       int expected_size)
21320 {
21321   rtx out_label, top_label, iter, tmp;
21322   enum machine_mode iter_mode = counter_mode (count);
21323   rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21324   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21325   rtx size;
21326   rtx x_addr;
21327   rtx y_addr;
21328   int i;
21329 
21330   top_label = gen_label_rtx ();
21331   out_label = gen_label_rtx ();
21332   iter = gen_reg_rtx (iter_mode);
21333 
21334   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21335 			      NULL, 1, OPTAB_DIRECT);
21336   /* Those two should combine.  */
21337   if (piece_size == const1_rtx)
21338     {
21339       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21340 			       true, out_label);
21341       predict_jump (REG_BR_PROB_BASE * 10 / 100);
21342     }
21343   emit_move_insn (iter, const0_rtx);
21344 
21345   emit_label (top_label);
21346 
21347   tmp = convert_modes (Pmode, iter_mode, iter, true);
21348   x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21349   destmem = change_address (destmem, mode, x_addr);
21350 
21351   if (srcmem)
21352     {
21353       y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21354       srcmem = change_address (srcmem, mode, y_addr);
21355 
21356       /* When unrolling for chips that reorder memory reads and writes,
21357 	 we can save registers by using a single temporary.
21358 	 Also, using 4 temporaries is overkill in 32bit mode.  */
21359       if (!TARGET_64BIT && 0)
21360 	{
21361 	  for (i = 0; i < unroll; i++)
21362 	    {
21363 	      if (i)
21364 		{
21365 		  destmem =
21366 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21367 		  srcmem =
21368 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21369 		}
21370 	      emit_move_insn (destmem, srcmem);
21371 	    }
21372 	}
21373       else
21374 	{
21375 	  rtx tmpreg[4];
21376 	  gcc_assert (unroll <= 4);
21377 	  for (i = 0; i < unroll; i++)
21378 	    {
21379 	      tmpreg[i] = gen_reg_rtx (mode);
21380 	      if (i)
21381 		{
21382 		  srcmem =
21383 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21384 		}
21385 	      emit_move_insn (tmpreg[i], srcmem);
21386 	    }
21387 	  for (i = 0; i < unroll; i++)
21388 	    {
21389 	      if (i)
21390 		{
21391 		  destmem =
21392 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21393 		}
21394 	      emit_move_insn (destmem, tmpreg[i]);
21395 	    }
21396 	}
21397     }
21398   else
21399     for (i = 0; i < unroll; i++)
21400       {
21401 	if (i)
21402 	  destmem =
21403 	    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21404 	emit_move_insn (destmem, value);
21405       }
21406 
21407   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21408 			     true, OPTAB_LIB_WIDEN);
21409   if (tmp != iter)
21410     emit_move_insn (iter, tmp);
21411 
21412   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21413 			   true, top_label);
21414   if (expected_size != -1)
21415     {
21416       expected_size /= GET_MODE_SIZE (mode) * unroll;
21417       if (expected_size == 0)
21418 	predict_jump (0);
21419       else if (expected_size > REG_BR_PROB_BASE)
21420 	predict_jump (REG_BR_PROB_BASE - 1);
21421       else
21422         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21423     }
21424   else
21425     predict_jump (REG_BR_PROB_BASE * 80 / 100);
21426   iter = ix86_zero_extend_to_Pmode (iter);
21427   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21428 			     true, OPTAB_LIB_WIDEN);
21429   if (tmp != destptr)
21430     emit_move_insn (destptr, tmp);
21431   if (srcptr)
21432     {
21433       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21434 				 true, OPTAB_LIB_WIDEN);
21435       if (tmp != srcptr)
21436 	emit_move_insn (srcptr, tmp);
21437     }
21438   emit_label (out_label);
21439 }
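/* As a rough sketch (not the literal RTL), for a copy (SRCPTR non-NULL)
   the loop emitted above behaves like the following pseudo code, with
   CHUNK standing for GET_MODE_SIZE (mode) * unroll:

       size = count & ~(CHUNK - 1);
       for (iter = 0; iter < size; iter += CHUNK)
	 copy CHUNK bytes from srcptr + iter to destptr + iter;  (unrolled)
       destptr += iter;
       srcptr += iter;

   Any tail of COUNT that is not a multiple of CHUNK is left for the
   epilogue code emitted by the callers.  */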
21440 
21441 /* Output "rep; mov" instruction.
21442    Arguments have the same meaning as for the previous function.  */
21443 static void
21444 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21445 			   rtx destptr, rtx srcptr,
21446 			   rtx count,
21447 			   enum machine_mode mode)
21448 {
21449   rtx destexp;
21450   rtx srcexp;
21451   rtx countreg;
21452   HOST_WIDE_INT rounded_count;
21453 
21454   /* If the size is known and a multiple of 4, it is shorter to use 4-byte rep movs.  */
21455   if (mode == QImode && CONST_INT_P (count)
21456       && !(INTVAL (count) & 3))
21457     mode = SImode;
21458 
21459   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21460     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21461   if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21462     srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21463   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21464   if (mode != QImode)
21465     {
21466       destexp = gen_rtx_ASHIFT (Pmode, countreg,
21467 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21468       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21469       srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21470 			       GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21471       srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21472     }
21473   else
21474     {
21475       destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21476       srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21477     }
21478   if (CONST_INT_P (count))
21479     {
21480       rounded_count = (INTVAL (count)
21481 		       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21482       destmem = shallow_copy_rtx (destmem);
21483       srcmem = shallow_copy_rtx (srcmem);
21484       set_mem_size (destmem, rounded_count);
21485       set_mem_size (srcmem, rounded_count);
21486     }
21487   else
21488     {
21489       if (MEM_SIZE_KNOWN_P (destmem))
21490 	clear_mem_size (destmem);
21491       if (MEM_SIZE_KNOWN_P (srcmem))
21492 	clear_mem_size (srcmem);
21493     }
21494   emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21495 			  destexp, srcexp));
21496 }
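/* As an illustration (a sketch, not the exact RTL), for mode == SImode
   the expansion above corresponds roughly to

       countreg = count >> 2;
       rep movsl;

   where DESTEXP and SRCEXP describe the final pointer values
   (ptr + (countreg << 2)) for the benefit of the rep_mov pattern;
   for QImode the count is used unscaled.  */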
21497 
21498 /* Output "rep; stos" instruction.
21499    Arguments have the same meaning as for the previous function.  */
21500 static void
21501 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21502 			    rtx count, enum machine_mode mode,
21503 			    rtx orig_value)
21504 {
21505   rtx destexp;
21506   rtx countreg;
21507   HOST_WIDE_INT rounded_count;
21508 
21509   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21510     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21511   value = force_reg (mode, gen_lowpart (mode, value));
21512   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21513   if (mode != QImode)
21514     {
21515       destexp = gen_rtx_ASHIFT (Pmode, countreg,
21516 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21517       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21518     }
21519   else
21520     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21521   if (orig_value == const0_rtx && CONST_INT_P (count))
21522     {
21523       rounded_count = (INTVAL (count)
21524 		       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21525       destmem = shallow_copy_rtx (destmem);
21526       set_mem_size (destmem, rounded_count);
21527     }
21528   else if (MEM_SIZE_KNOWN_P (destmem))
21529     clear_mem_size (destmem);
21530   emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21531 }
21532 
21533 static void
21534 emit_strmov (rtx destmem, rtx srcmem,
21535 	     rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21536 {
21537   rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21538   rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21539   emit_insn (gen_strmov (destptr, dest, srcptr, src));
21540 }
21541 
21542 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
21543 static void
21544 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21545 			rtx destptr, rtx srcptr, rtx count, int max_size)
21546 {
21547   rtx src, dest;
21548   if (CONST_INT_P (count))
21549     {
21550       HOST_WIDE_INT countval = INTVAL (count);
21551       int offset = 0;
21552 
21553       if ((countval & 0x10) && max_size > 16)
21554 	{
21555 	  if (TARGET_64BIT)
21556 	    {
21557 	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21558 	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21559 	    }
21560 	  else
21561 	    gcc_unreachable ();
21562 	  offset += 16;
21563 	}
21564       if ((countval & 0x08) && max_size > 8)
21565 	{
21566 	  if (TARGET_64BIT)
21567 	    emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21568 	  else
21569 	    {
21570 	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21571 	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21572 	    }
21573 	  offset += 8;
21574 	}
21575       if ((countval & 0x04) && max_size > 4)
21576 	{
21577           emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21578 	  offset += 4;
21579 	}
21580       if ((countval & 0x02) && max_size > 2)
21581 	{
21582           emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21583 	  offset += 2;
21584 	}
21585       if ((countval & 0x01) && max_size > 1)
21586 	{
21587           emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21588 	  offset += 1;
21589 	}
21590       return;
21591     }
21592   if (max_size > 8)
21593     {
21594       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21595 				    count, 1, OPTAB_DIRECT);
21596       expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21597 				     count, QImode, 1, 4);
21598       return;
21599     }
21600 
21601   /* When single string operations are available, we can cheaply increase
21602      dest and src pointers.  Otherwise we save code size by maintaining an
21603      offset (zero is readily available from the preceding rep operation)
21604      and using x86 addressing modes.  */
21605   if (TARGET_SINGLE_STRINGOP)
21606     {
21607       if (max_size > 4)
21608 	{
21609 	  rtx label = ix86_expand_aligntest (count, 4, true);
21610 	  src = change_address (srcmem, SImode, srcptr);
21611 	  dest = change_address (destmem, SImode, destptr);
21612 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
21613 	  emit_label (label);
21614 	  LABEL_NUSES (label) = 1;
21615 	}
21616       if (max_size > 2)
21617 	{
21618 	  rtx label = ix86_expand_aligntest (count, 2, true);
21619 	  src = change_address (srcmem, HImode, srcptr);
21620 	  dest = change_address (destmem, HImode, destptr);
21621 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
21622 	  emit_label (label);
21623 	  LABEL_NUSES (label) = 1;
21624 	}
21625       if (max_size > 1)
21626 	{
21627 	  rtx label = ix86_expand_aligntest (count, 1, true);
21628 	  src = change_address (srcmem, QImode, srcptr);
21629 	  dest = change_address (destmem, QImode, destptr);
21630 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
21631 	  emit_label (label);
21632 	  LABEL_NUSES (label) = 1;
21633 	}
21634     }
21635   else
21636     {
21637       rtx offset = force_reg (Pmode, const0_rtx);
21638       rtx tmp;
21639 
21640       if (max_size > 4)
21641 	{
21642 	  rtx label = ix86_expand_aligntest (count, 4, true);
21643 	  src = change_address (srcmem, SImode, srcptr);
21644 	  dest = change_address (destmem, SImode, destptr);
21645 	  emit_move_insn (dest, src);
21646 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21647 				     true, OPTAB_LIB_WIDEN);
21648 	  if (tmp != offset)
21649 	    emit_move_insn (offset, tmp);
21650 	  emit_label (label);
21651 	  LABEL_NUSES (label) = 1;
21652 	}
21653       if (max_size > 2)
21654 	{
21655 	  rtx label = ix86_expand_aligntest (count, 2, true);
21656 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21657 	  src = change_address (srcmem, HImode, tmp);
21658 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21659 	  dest = change_address (destmem, HImode, tmp);
21660 	  emit_move_insn (dest, src);
21661 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21662 				     true, OPTAB_LIB_WIDEN);
21663 	  if (tmp != offset)
21664 	    emit_move_insn (offset, tmp);
21665 	  emit_label (label);
21666 	  LABEL_NUSES (label) = 1;
21667 	}
21668       if (max_size > 1)
21669 	{
21670 	  rtx label = ix86_expand_aligntest (count, 1, true);
21671 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21672 	  src = change_address (srcmem, QImode, tmp);
21673 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21674 	  dest = change_address (destmem, QImode, tmp);
21675 	  emit_move_insn (dest, src);
21676 	  emit_label (label);
21677 	  LABEL_NUSES (label) = 1;
21678 	}
21679     }
21680 }
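/* For a variable COUNT the code above tests the individual low bits of
   COUNT and conditionally copies that many bytes, roughly

       if (count & 4) copy 4 bytes;
       if (count & 2) copy 2 bytes;
       if (count & 1) copy 1 byte;

   so any residue smaller than MAX_SIZE is handled without a loop (a loop
   is used only when MAX_SIZE > 8).  */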
21681 
21682 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21683 static void
21684 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21685 				 rtx count, int max_size)
21686 {
21687   count =
21688     expand_simple_binop (counter_mode (count), AND, count,
21689 			 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21690   expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21691 				 gen_lowpart (QImode, value), count, QImode,
21692 				 1, max_size / 2);
21693 }
21694 
21695 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21696 static void
21697 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21698 {
21699   rtx dest;
21700 
21701   if (CONST_INT_P (count))
21702     {
21703       HOST_WIDE_INT countval = INTVAL (count);
21704       int offset = 0;
21705 
21706       if ((countval & 0x10) && max_size > 16)
21707 	{
21708 	  if (TARGET_64BIT)
21709 	    {
21710 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21711 	      emit_insn (gen_strset (destptr, dest, value));
21712 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21713 	      emit_insn (gen_strset (destptr, dest, value));
21714 	    }
21715 	  else
21716 	    gcc_unreachable ();
21717 	  offset += 16;
21718 	}
21719       if ((countval & 0x08) && max_size > 8)
21720 	{
21721 	  if (TARGET_64BIT)
21722 	    {
21723 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21724 	      emit_insn (gen_strset (destptr, dest, value));
21725 	    }
21726 	  else
21727 	    {
21728 	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21729 	      emit_insn (gen_strset (destptr, dest, value));
21730 	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21731 	      emit_insn (gen_strset (destptr, dest, value));
21732 	    }
21733 	  offset += 8;
21734 	}
21735       if ((countval & 0x04) && max_size > 4)
21736 	{
21737 	  dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21738 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21739 	  offset += 4;
21740 	}
21741       if ((countval & 0x02) && max_size > 2)
21742 	{
21743 	  dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21744 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21745 	  offset += 2;
21746 	}
21747       if ((countval & 0x01) && max_size > 1)
21748 	{
21749 	  dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21750 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21751 	  offset += 1;
21752 	}
21753       return;
21754     }
21755   if (max_size > 32)
21756     {
21757       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21758       return;
21759     }
21760   if (max_size > 16)
21761     {
21762       rtx label = ix86_expand_aligntest (count, 16, true);
21763       if (TARGET_64BIT)
21764 	{
21765 	  dest = change_address (destmem, DImode, destptr);
21766 	  emit_insn (gen_strset (destptr, dest, value));
21767 	  emit_insn (gen_strset (destptr, dest, value));
21768 	}
21769       else
21770 	{
21771 	  dest = change_address (destmem, SImode, destptr);
21772 	  emit_insn (gen_strset (destptr, dest, value));
21773 	  emit_insn (gen_strset (destptr, dest, value));
21774 	  emit_insn (gen_strset (destptr, dest, value));
21775 	  emit_insn (gen_strset (destptr, dest, value));
21776 	}
21777       emit_label (label);
21778       LABEL_NUSES (label) = 1;
21779     }
21780   if (max_size > 8)
21781     {
21782       rtx label = ix86_expand_aligntest (count, 8, true);
21783       if (TARGET_64BIT)
21784 	{
21785 	  dest = change_address (destmem, DImode, destptr);
21786 	  emit_insn (gen_strset (destptr, dest, value));
21787 	}
21788       else
21789 	{
21790 	  dest = change_address (destmem, SImode, destptr);
21791 	  emit_insn (gen_strset (destptr, dest, value));
21792 	  emit_insn (gen_strset (destptr, dest, value));
21793 	}
21794       emit_label (label);
21795       LABEL_NUSES (label) = 1;
21796     }
21797   if (max_size > 4)
21798     {
21799       rtx label = ix86_expand_aligntest (count, 4, true);
21800       dest = change_address (destmem, SImode, destptr);
21801       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21802       emit_label (label);
21803       LABEL_NUSES (label) = 1;
21804     }
21805   if (max_size > 2)
21806     {
21807       rtx label = ix86_expand_aligntest (count, 2, true);
21808       dest = change_address (destmem, HImode, destptr);
21809       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21810       emit_label (label);
21811       LABEL_NUSES (label) = 1;
21812     }
21813   if (max_size > 1)
21814     {
21815       rtx label = ix86_expand_aligntest (count, 1, true);
21816       dest = change_address (destmem, QImode, destptr);
21817       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21818       emit_label (label);
21819       LABEL_NUSES (label) = 1;
21820     }
21821 }
21822 
21823 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
21824    to DESIRED_ALIGNMENT.  */
21825 static void
21826 expand_movmem_prologue (rtx destmem, rtx srcmem,
21827 			rtx destptr, rtx srcptr, rtx count,
21828 			int align, int desired_alignment)
21829 {
21830   if (align <= 1 && desired_alignment > 1)
21831     {
21832       rtx label = ix86_expand_aligntest (destptr, 1, false);
21833       srcmem = change_address (srcmem, QImode, srcptr);
21834       destmem = change_address (destmem, QImode, destptr);
21835       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21836       ix86_adjust_counter (count, 1);
21837       emit_label (label);
21838       LABEL_NUSES (label) = 1;
21839     }
21840   if (align <= 2 && desired_alignment > 2)
21841     {
21842       rtx label = ix86_expand_aligntest (destptr, 2, false);
21843       srcmem = change_address (srcmem, HImode, srcptr);
21844       destmem = change_address (destmem, HImode, destptr);
21845       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21846       ix86_adjust_counter (count, 2);
21847       emit_label (label);
21848       LABEL_NUSES (label) = 1;
21849     }
21850   if (align <= 4 && desired_alignment > 4)
21851     {
21852       rtx label = ix86_expand_aligntest (destptr, 4, false);
21853       srcmem = change_address (srcmem, SImode, srcptr);
21854       destmem = change_address (destmem, SImode, destptr);
21855       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21856       ix86_adjust_counter (count, 4);
21857       emit_label (label);
21858       LABEL_NUSES (label) = 1;
21859     }
21860   gcc_assert (desired_alignment <= 8);
21861 }
21862 
21863 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21864    ALIGN_BYTES is how many bytes need to be copied.  */
21865 static rtx
21866 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21867 				 int desired_align, int align_bytes)
21868 {
21869   rtx src = *srcp;
21870   rtx orig_dst = dst;
21871   rtx orig_src = src;
21872   int off = 0;
21873   int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21874   if (src_align_bytes >= 0)
21875     src_align_bytes = desired_align - src_align_bytes;
21876   if (align_bytes & 1)
21877     {
21878       dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21879       src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21880       off = 1;
21881       emit_insn (gen_strmov (destreg, dst, srcreg, src));
21882     }
21883   if (align_bytes & 2)
21884     {
21885       dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21886       src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21887       if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21888 	set_mem_align (dst, 2 * BITS_PER_UNIT);
21889       if (src_align_bytes >= 0
21890 	  && (src_align_bytes & 1) == (align_bytes & 1)
21891 	  && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21892 	set_mem_align (src, 2 * BITS_PER_UNIT);
21893       off = 2;
21894       emit_insn (gen_strmov (destreg, dst, srcreg, src));
21895     }
21896   if (align_bytes & 4)
21897     {
21898       dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21899       src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21900       if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21901 	set_mem_align (dst, 4 * BITS_PER_UNIT);
21902       if (src_align_bytes >= 0)
21903 	{
21904 	  unsigned int src_align = 0;
21905 	  if ((src_align_bytes & 3) == (align_bytes & 3))
21906 	    src_align = 4;
21907 	  else if ((src_align_bytes & 1) == (align_bytes & 1))
21908 	    src_align = 2;
21909 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21910 	    set_mem_align (src, src_align * BITS_PER_UNIT);
21911 	}
21912       off = 4;
21913       emit_insn (gen_strmov (destreg, dst, srcreg, src));
21914     }
21915   dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21916   src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21917   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21918     set_mem_align (dst, desired_align * BITS_PER_UNIT);
21919   if (src_align_bytes >= 0)
21920     {
21921       unsigned int src_align = 0;
21922       if ((src_align_bytes & 7) == (align_bytes & 7))
21923 	src_align = 8;
21924       else if ((src_align_bytes & 3) == (align_bytes & 3))
21925 	src_align = 4;
21926       else if ((src_align_bytes & 1) == (align_bytes & 1))
21927 	src_align = 2;
21928       if (src_align > (unsigned int) desired_align)
21929 	src_align = desired_align;
21930       if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21931 	set_mem_align (src, src_align * BITS_PER_UNIT);
21932     }
21933   if (MEM_SIZE_KNOWN_P (orig_dst))
21934     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21935   if (MEM_SIZE_KNOWN_P (orig_src))
21936     set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21937   *srcp = src;
21938   return dst;
21939 }
21940 
21941 /* Set enough bytes at DEST to align DEST, known to be aligned by ALIGN,
21942    to DESIRED_ALIGNMENT.  */
21943 static void
21944 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21945 			int align, int desired_alignment)
21946 {
21947   if (align <= 1 && desired_alignment > 1)
21948     {
21949       rtx label = ix86_expand_aligntest (destptr, 1, false);
21950       destmem = change_address (destmem, QImode, destptr);
21951       emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21952       ix86_adjust_counter (count, 1);
21953       emit_label (label);
21954       LABEL_NUSES (label) = 1;
21955     }
21956   if (align <= 2 && desired_alignment > 2)
21957     {
21958       rtx label = ix86_expand_aligntest (destptr, 2, false);
21959       destmem = change_address (destmem, HImode, destptr);
21960       emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21961       ix86_adjust_counter (count, 2);
21962       emit_label (label);
21963       LABEL_NUSES (label) = 1;
21964     }
21965   if (align <= 4 && desired_alignment > 4)
21966     {
21967       rtx label = ix86_expand_aligntest (destptr, 4, false);
21968       destmem = change_address (destmem, SImode, destptr);
21969       emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21970       ix86_adjust_counter (count, 4);
21971       emit_label (label);
21972       LABEL_NUSES (label) = 1;
21973     }
21974   gcc_assert (desired_alignment <= 8);
21975 }
21976 
21977 /* Set enough bytes at DST to align DST, known to be aligned by ALIGN,
21978    to DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored.  */
21979 static rtx
21980 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21981 				 int desired_align, int align_bytes)
21982 {
21983   int off = 0;
21984   rtx orig_dst = dst;
21985   if (align_bytes & 1)
21986     {
21987       dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21988       off = 1;
21989       emit_insn (gen_strset (destreg, dst,
21990 			     gen_lowpart (QImode, value)));
21991     }
21992   if (align_bytes & 2)
21993     {
21994       dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21995       if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21996 	set_mem_align (dst, 2 * BITS_PER_UNIT);
21997       off = 2;
21998       emit_insn (gen_strset (destreg, dst,
21999 			     gen_lowpart (HImode, value)));
22000     }
22001   if (align_bytes & 4)
22002     {
22003       dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22004       if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22005 	set_mem_align (dst, 4 * BITS_PER_UNIT);
22006       off = 4;
22007       emit_insn (gen_strset (destreg, dst,
22008 			     gen_lowpart (SImode, value)));
22009     }
22010   dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22011   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22012     set_mem_align (dst, desired_align * BITS_PER_UNIT);
22013   if (MEM_SIZE_KNOWN_P (orig_dst))
22014     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22015   return dst;
22016 }
22017 
22018 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
22019 static enum stringop_alg
22020 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22021 	    int *dynamic_check)
22022 {
22023   const struct stringop_algs * algs;
22024   bool optimize_for_speed;
22025   /* Algorithms using the rep prefix want at least edi and ecx;
22026      additionally, memset wants eax and memcpy wants esi.  Don't
22027      consider such algorithms if the user has appropriated those
22028      registers for their own purposes.	*/
22029   bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22030                              || (memset
22031 				 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22032 
22033 #define ALG_USABLE_P(alg) (rep_prefix_usable			\
22034 			   || (alg != rep_prefix_1_byte		\
22035 			       && alg != rep_prefix_4_byte      \
22036 			       && alg != rep_prefix_8_byte))
22037   const struct processor_costs *cost;
22038 
22039   /* Even if the string operation call is cold, we still might spend a lot
22040      of time processing large blocks.  */
22041   if (optimize_function_for_size_p (cfun)
22042       || (optimize_insn_for_size_p ()
22043           && expected_size != -1 && expected_size < 256))
22044     optimize_for_speed = false;
22045   else
22046     optimize_for_speed = true;
22047 
22048   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22049 
22050   *dynamic_check = -1;
22051   if (memset)
22052     algs = &cost->memset[TARGET_64BIT != 0];
22053   else
22054     algs = &cost->memcpy[TARGET_64BIT != 0];
22055   if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22056     return ix86_stringop_alg;
22057   /* rep; movsq or rep; movsl is the smallest variant.  */
22058   else if (!optimize_for_speed)
22059     {
22060       if (!count || (count & 3))
22061 	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22062       else
22063 	return rep_prefix_usable ? rep_prefix_4_byte : loop;
22064     }
22065   /* Very tiny blocks are best handled via the loop; REP is expensive
22066      to set up.  */
22067   else if (expected_size != -1 && expected_size < 4)
22068     return loop_1_byte;
22069   else if (expected_size != -1)
22070     {
22071       unsigned int i;
22072       enum stringop_alg alg = libcall;
22073       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22074 	{
22075 	  /* We get here if the algorithms that were not libcall-based
22076 	     were rep-prefix based and we are unable to use rep prefixes
22077 	     based on global register usage.  Break out of the loop and
22078 	     use the heuristic below.  */
22079 	  if (algs->size[i].max == 0)
22080 	    break;
22081 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22082 	    {
22083 	      enum stringop_alg candidate = algs->size[i].alg;
22084 
22085 	      if (candidate != libcall && ALG_USABLE_P (candidate))
22086 		alg = candidate;
22087 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22088 		 last non-libcall inline algorithm.  */
22089 	      if (TARGET_INLINE_ALL_STRINGOPS)
22090 		{
22091 		  /* When the current size is best to be copied by a libcall,
22092 		     but we are still forced to inline, run the heuristic below
22093 		     that will pick code for medium sized blocks.  */
22094 		  if (alg != libcall)
22095 		    return alg;
22096 		  break;
22097 		}
22098 	      else if (ALG_USABLE_P (candidate))
22099 		return candidate;
22100 	    }
22101 	}
22102       gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22103     }
22104   /* When asked to inline the call anyway, try to pick a meaningful choice.
22105      We look for the maximal size of block that is faster to copy by hand
22106      and take blocks of at most that size, guessing that the average size
22107      will be roughly half of the block.
22108 
22109      If this turns out to be bad, we might simply specify the preferred
22110      choice in ix86_costs.  */
22111   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22112       && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22113     {
22114       int max = -1;
22115       enum stringop_alg alg;
22116       int i;
22117       bool any_alg_usable_p = true;
22118 
22119       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22120         {
22121           enum stringop_alg candidate = algs->size[i].alg;
22122           any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22123 
22124           if (candidate != libcall && candidate
22125               && ALG_USABLE_P (candidate))
22126               max = algs->size[i].max;
22127         }
22128       /* If there aren't any usable algorithms, then recursing on
22129          smaller sizes isn't going to find anything.  Just return the
22130          simple byte-at-a-time copy loop.  */
22131       if (!any_alg_usable_p)
22132         {
22133           /* Pick something reasonable.  */
22134           if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22135             *dynamic_check = 128;
22136           return loop_1_byte;
22137         }
22138       if (max == -1)
22139 	max = 4096;
22140       alg = decide_alg (count, max / 2, memset, dynamic_check);
22141       gcc_assert (*dynamic_check == -1);
22142       gcc_assert (alg != libcall);
22143       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22144 	*dynamic_check = max;
22145       return alg;
22146     }
22147   return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22148 #undef ALG_USABLE_P
22149 }
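/* As a sketch of how the per-target cost tables drive the choice above:
   a stringop_algs record pairs an algorithm for unknown sizes with a
   list of (max size, algorithm) entries.  A hypothetical

       {libcall, {{256, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}

   would pick the simple loop for expected sizes up to 256 bytes,
   rep-prefixed 4-byte moves up to 8192 bytes and a library call beyond
   that, subject to ALG_USABLE_P and the TARGET_INLINE_* overrides.  */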
22150 
22151 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
22152    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
22153 static int
22154 decide_alignment (int align,
22155 		  enum stringop_alg alg,
22156 		  int expected_size)
22157 {
22158   int desired_align = 0;
22159   switch (alg)
22160     {
22161       case no_stringop:
22162 	gcc_unreachable ();
22163       case loop:
22164       case unrolled_loop:
22165 	desired_align = GET_MODE_SIZE (Pmode);
22166 	break;
22167       case rep_prefix_8_byte:
22168 	desired_align = 8;
22169 	break;
22170       case rep_prefix_4_byte:
22171 	/* PentiumPro has special logic triggering for 8 byte aligned blocks,
22172 	   copying whole cachelines at once.  */
22173 	if (TARGET_PENTIUMPRO)
22174 	  desired_align = 8;
22175 	else
22176 	  desired_align = 4;
22177 	break;
22178       case rep_prefix_1_byte:
22179 	/* PentiumPro has special logic triggering for 8 byte aligned blocks,
22180 	   copying whole cachelines at once.  */
22181 	if (TARGET_PENTIUMPRO)
22182 	  desired_align = 8;
22183 	else
22184 	  desired_align = 1;
22185 	break;
22186       case loop_1_byte:
22187 	desired_align = 1;
22188 	break;
22189       case libcall:
22190 	return 0;
22191     }
22192 
22193   if (optimize_size)
22194     desired_align = 1;
22195   if (desired_align < align)
22196     desired_align = align;
22197   if (expected_size != -1 && expected_size < 4)
22198     desired_align = align;
22199   return desired_align;
22200 }
22201 
22202 /* Return the smallest power of 2 greater than VAL.  */
22203 static int
22204 smallest_pow2_greater_than (int val)
22205 {
22206   int ret = 1;
22207   while (ret <= val)
22208     ret <<= 1;
22209   return ret;
22210 }
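/* Note that the result is strictly greater than VAL, e.g.
   smallest_pow2_greater_than (7) == 8 and
   smallest_pow2_greater_than (8) == 16.  */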
22211 
22212 /* Expand string move (memcpy) operation.  Use i386 string operations
22213    when profitable.  ix86_expand_setmem contains similar code.  The code
22214    depends upon architecture, block size and alignment, but always has
22215    the same overall structure:
22216 
22217    1) Prologue guard: a conditional that jumps to the epilogue for small
22218       blocks that can be handled by the epilogue alone.  This is faster
22219       but also needed for correctness, since the prologue assumes the
22220       block is larger than the desired alignment.
22221 
22222       An optional dynamic check for size and a libcall for large
22223       blocks are emitted here too, with -minline-stringops-dynamically.
22224 
22225    2) Prologue: copy the first few bytes in order to get the destination
22226       aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
22227       than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22228       copied.  We emit either a jump tree on power of two sized
22229       blocks, or a byte loop.
22230 
22231    3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22232       with the specified algorithm.
22233 
22234    4) Epilogue: code copying the tail of the block that is too small to be
22235       handled by the main body (or up to the size guarded by the prologue guard); see the sketch below.  */
22236 
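/* As a rough illustration of the structure above, for a copy of unknown
   size using the unrolled_loop algorithm the emitted code looks roughly
   like the following pseudo code (details vary with target, alignment
   and options):

       if (count < epilogue_size_needed) goto epilogue;      1) guard
       while (dest is misaligned)                             2) prologue
	 copy 1/2/4 bytes and decrease count;
       copy in size_needed chunks while count allows;         3) main body
     epilogue:
       copy the remaining count & (epilogue_size_needed - 1) bytes.  4)  */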
22237 bool
22238 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22239 		    rtx expected_align_exp, rtx expected_size_exp)
22240 {
22241   rtx destreg;
22242   rtx srcreg;
22243   rtx label = NULL;
22244   rtx tmp;
22245   rtx jump_around_label = NULL;
22246   HOST_WIDE_INT align = 1;
22247   unsigned HOST_WIDE_INT count = 0;
22248   HOST_WIDE_INT expected_size = -1;
22249   int size_needed = 0, epilogue_size_needed;
22250   int desired_align = 0, align_bytes = 0;
22251   enum stringop_alg alg;
22252   int dynamic_check;
22253   bool need_zero_guard = false;
22254 
22255   if (CONST_INT_P (align_exp))
22256     align = INTVAL (align_exp);
22257   /* i386 can do misaligned access at a reasonably increased cost.  */
22258   if (CONST_INT_P (expected_align_exp)
22259       && INTVAL (expected_align_exp) > align)
22260     align = INTVAL (expected_align_exp);
22261   /* ALIGN is the minimum of destination and source alignment, but we care here
22262      just about destination alignment.  */
22263   else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22264     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22265 
22266   if (CONST_INT_P (count_exp))
22267     count = expected_size = INTVAL (count_exp);
22268   if (CONST_INT_P (expected_size_exp) && count == 0)
22269     expected_size = INTVAL (expected_size_exp);
22270 
22271   /* Make sure we don't need to care about overflow later on.  */
22272   if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22273     return false;
22274 
22275   /* Step 0: Decide on preferred algorithm, desired alignment and
22276      size of chunks to be copied by main loop.  */
22277 
22278   alg = decide_alg (count, expected_size, false, &dynamic_check);
22279   desired_align = decide_alignment (align, alg, expected_size);
22280 
22281   if (!TARGET_ALIGN_STRINGOPS)
22282     align = desired_align;
22283 
22284   if (alg == libcall)
22285     return false;
22286   gcc_assert (alg != no_stringop);
22287   if (!count)
22288     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22289   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22290   srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
22291   switch (alg)
22292     {
22293     case libcall:
22294     case no_stringop:
22295       gcc_unreachable ();
22296     case loop:
22297       need_zero_guard = true;
22298       size_needed = GET_MODE_SIZE (Pmode);
22299       break;
22300     case unrolled_loop:
22301       need_zero_guard = true;
22302       size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
22303       break;
22304     case rep_prefix_8_byte:
22305       size_needed = 8;
22306       break;
22307     case rep_prefix_4_byte:
22308       size_needed = 4;
22309       break;
22310     case rep_prefix_1_byte:
22311       size_needed = 1;
22312       break;
22313     case loop_1_byte:
22314       need_zero_guard = true;
22315       size_needed = 1;
22316       break;
22317     }
22318 
22319   epilogue_size_needed = size_needed;
22320 
22321   /* Step 1: Prologue guard.  */
22322 
22323   /* Alignment code needs count to be in register.  */
22324   if (CONST_INT_P (count_exp) && desired_align > align)
22325     {
22326       if (INTVAL (count_exp) > desired_align
22327 	  && INTVAL (count_exp) > size_needed)
22328 	{
22329 	  align_bytes
22330 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22331 	  if (align_bytes <= 0)
22332 	    align_bytes = 0;
22333 	  else
22334 	    align_bytes = desired_align - align_bytes;
22335 	}
22336       if (align_bytes == 0)
22337 	count_exp = force_reg (counter_mode (count_exp), count_exp);
22338     }
22339   gcc_assert (desired_align >= 1 && align >= 1);
22340 
22341   /* Ensure that alignment prologue won't copy past end of block.  */
22342   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22343     {
22344       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22345       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22346 	 Make sure it is a power of 2.  */
22347       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22348 
22349       if (count)
22350 	{
22351 	  if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22352 	    {
22353 	      /* If main algorithm works on QImode, no epilogue is needed.
22354 		 For small sizes just don't align anything.  */
22355 	      if (size_needed == 1)
22356 		desired_align = align;
22357 	      else
22358 		goto epilogue;
22359 	    }
22360 	}
22361       else
22362 	{
22363 	  label = gen_label_rtx ();
22364 	  emit_cmp_and_jump_insns (count_exp,
22365 				   GEN_INT (epilogue_size_needed),
22366 				   LTU, 0, counter_mode (count_exp), 1, label);
22367 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
22368 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22369 	  else
22370 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22371 	}
22372     }
22373 
22374   /* Emit code to decide on runtime whether library call or inline should be
22375      used.  */
22376   if (dynamic_check != -1)
22377     {
22378       if (CONST_INT_P (count_exp))
22379 	{
22380 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22381 	    {
22382 	      emit_block_move_via_libcall (dst, src, count_exp, false);
22383 	      count_exp = const0_rtx;
22384 	      goto epilogue;
22385 	    }
22386 	}
22387       else
22388 	{
22389 	  rtx hot_label = gen_label_rtx ();
22390 	  jump_around_label = gen_label_rtx ();
22391 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22392 				   LEU, 0, GET_MODE (count_exp), 1, hot_label);
22393 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
22394 	  emit_block_move_via_libcall (dst, src, count_exp, false);
22395 	  emit_jump (jump_around_label);
22396 	  emit_label (hot_label);
22397 	}
22398     }
22399 
22400   /* Step 2: Alignment prologue.  */
22401 
22402   if (desired_align > align)
22403     {
22404       if (align_bytes == 0)
22405 	{
22406 	  /* Except for the first move in the epilogue, we no longer know
22407 	     the constant offset in aliasing info.  It doesn't seem worth
22408 	     the pain to maintain it for the first move, so throw away
22409 	     the info early.  */
22410 	  src = change_address (src, BLKmode, srcreg);
22411 	  dst = change_address (dst, BLKmode, destreg);
22412 	  expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22413 				  desired_align);
22414 	}
22415       else
22416 	{
22417 	  /* If we know how many bytes need to be stored before dst is
22418 	     sufficiently aligned, maintain aliasing info accurately.  */
22419 	  dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22420 						 desired_align, align_bytes);
22421 	  count_exp = plus_constant (count_exp, -align_bytes);
22422 	  count -= align_bytes;
22423 	}
22424       if (need_zero_guard
22425 	  && (count < (unsigned HOST_WIDE_INT) size_needed
22426 	      || (align_bytes == 0
22427 		  && count < ((unsigned HOST_WIDE_INT) size_needed
22428 			      + desired_align - align))))
22429 	{
22430 	  /* It is possible that we copied enough so the main loop will not
22431 	     execute.  */
22432 	  gcc_assert (size_needed > 1);
22433 	  if (label == NULL_RTX)
22434 	    label = gen_label_rtx ();
22435 	  emit_cmp_and_jump_insns (count_exp,
22436 				   GEN_INT (size_needed),
22437 				   LTU, 0, counter_mode (count_exp), 1, label);
22438 	  if (expected_size == -1
22439 	      || expected_size < (desired_align - align) / 2 + size_needed)
22440 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22441 	  else
22442 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22443 	}
22444     }
22445   if (label && size_needed == 1)
22446     {
22447       emit_label (label);
22448       LABEL_NUSES (label) = 1;
22449       label = NULL;
22450       epilogue_size_needed = 1;
22451     }
22452   else if (label == NULL_RTX)
22453     epilogue_size_needed = size_needed;
22454 
22455   /* Step 3: Main loop.  */
22456 
22457   switch (alg)
22458     {
22459     case libcall:
22460     case no_stringop:
22461       gcc_unreachable ();
22462     case loop_1_byte:
22463       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22464 				     count_exp, QImode, 1, expected_size);
22465       break;
22466     case loop:
22467       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22468 				     count_exp, Pmode, 1, expected_size);
22469       break;
22470     case unrolled_loop:
22471       /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22472 	 registers for 4 temporaries anyway.  */
22473       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22474 				     count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22475 				     expected_size);
22476       break;
22477     case rep_prefix_8_byte:
22478       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22479 				 DImode);
22480       break;
22481     case rep_prefix_4_byte:
22482       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22483 				 SImode);
22484       break;
22485     case rep_prefix_1_byte:
22486       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22487 				 QImode);
22488       break;
22489     }
22490   /* Properly adjust the offset of src and dest memory for aliasing.  */
22491   if (CONST_INT_P (count_exp))
22492     {
22493       src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22494 					  (count / size_needed) * size_needed);
22495       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22496 					  (count / size_needed) * size_needed);
22497     }
22498   else
22499     {
22500       src = change_address (src, BLKmode, srcreg);
22501       dst = change_address (dst, BLKmode, destreg);
22502     }
22503 
22504   /* Step 4: Epilogue to copy the remaining bytes.  */
22505  epilogue:
22506   if (label)
22507     {
22508       /* When the main loop is done, COUNT_EXP might hold the original count,
22509 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22510 	 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22511 	 bytes.  Compensate if needed.  */
22512 
22513       if (size_needed < epilogue_size_needed)
22514 	{
22515 	  tmp =
22516 	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22517 				 GEN_INT (size_needed - 1), count_exp, 1,
22518 				 OPTAB_DIRECT);
22519 	  if (tmp != count_exp)
22520 	    emit_move_insn (count_exp, tmp);
22521 	}
22522       emit_label (label);
22523       LABEL_NUSES (label) = 1;
22524     }
22525 
22526   if (count_exp != const0_rtx && epilogue_size_needed > 1)
22527     expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22528 			    epilogue_size_needed);
22529   if (jump_around_label)
22530     emit_label (jump_around_label);
22531   return true;
22532 }
22533 
22534 /* Helper function for memset.  For a QImode value 0xXY produce
22535    0xXYXYXYXY of the width specified by MODE.  This is essentially
22536    a * 0x01010101, but we can do slightly better than
22537    synth_mult by unwinding the sequence by hand on CPUs with
22538    slow multiply.  */
22539 static rtx
22540 promote_duplicated_reg (enum machine_mode mode, rtx val)
22541 {
22542   enum machine_mode valmode = GET_MODE (val);
22543   rtx tmp;
22544   int nops = mode == DImode ? 3 : 2;
22545 
22546   gcc_assert (mode == SImode || mode == DImode);
22547   if (val == const0_rtx)
22548     return copy_to_mode_reg (mode, const0_rtx);
22549   if (CONST_INT_P (val))
22550     {
22551       HOST_WIDE_INT v = INTVAL (val) & 255;
22552 
22553       v |= v << 8;
22554       v |= v << 16;
22555       if (mode == DImode)
22556         v |= (v << 16) << 16;
22557       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22558     }
22559 
22560   if (valmode == VOIDmode)
22561     valmode = QImode;
22562   if (valmode != QImode)
22563     val = gen_lowpart (QImode, val);
22564   if (mode == QImode)
22565     return val;
22566   if (!TARGET_PARTIAL_REG_STALL)
22567     nops--;
22568   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22569       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22570       <= (ix86_cost->shift_const + ix86_cost->add) * nops
22571           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22572     {
22573       rtx reg = convert_modes (mode, QImode, val, true);
22574       tmp = promote_duplicated_reg (mode, const1_rtx);
22575       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22576 				  OPTAB_DIRECT);
22577     }
22578   else
22579     {
22580       rtx reg = convert_modes (mode, QImode, val, true);
22581 
22582       if (!TARGET_PARTIAL_REG_STALL)
22583 	if (mode == SImode)
22584 	  emit_insn (gen_movsi_insv_1 (reg, reg));
22585 	else
22586 	  emit_insn (gen_movdi_insv_1 (reg, reg));
22587       else
22588 	{
22589 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22590 				     NULL, 1, OPTAB_DIRECT);
22591 	  reg =
22592 	    expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22593 	}
22594       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22595 			         NULL, 1, OPTAB_DIRECT);
22596       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22597       if (mode == SImode)
22598 	return reg;
22599       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22600 				 NULL, 1, OPTAB_DIRECT);
22601       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22602       return reg;
22603     }
22604 }
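/* As an illustration, the shift-and-or fallback above replicates the
   byte with a sequence roughly equivalent to

       reg  = 0x000000XY;
       reg |= reg << 8;	     giving 0x0000XYXY
       reg |= reg << 16;     giving 0xXYXYXYXY
       reg |= reg << 32;     giving 0xXYXYXYXYXYXYXYXY  (DImode only)

   while the multiply variant computes the same value as
   0xXY * 0x01010101 (or 0xXY * 0x0101010101010101 for DImode).  */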
22605 
22606 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22607    be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22608    alignment from ALIGN to DESIRED_ALIGN.  */
22609 static rtx
22610 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22611 {
22612   rtx promoted_val;
22613 
22614   if (TARGET_64BIT
22615       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22616     promoted_val = promote_duplicated_reg (DImode, val);
22617   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22618     promoted_val = promote_duplicated_reg (SImode, val);
22619   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22620     promoted_val = promote_duplicated_reg (HImode, val);
22621   else
22622     promoted_val = val;
22623 
22624   return promoted_val;
22625 }
22626 
22627 /* Expand string clear operation (bzero).  Use i386 string operations when
22628    profitable.  See expand_movmem comment for explanation of individual
22629    steps performed.  */
22630 bool
22631 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22632 		    rtx expected_align_exp, rtx expected_size_exp)
22633 {
22634   rtx destreg;
22635   rtx label = NULL;
22636   rtx tmp;
22637   rtx jump_around_label = NULL;
22638   HOST_WIDE_INT align = 1;
22639   unsigned HOST_WIDE_INT count = 0;
22640   HOST_WIDE_INT expected_size = -1;
22641   int size_needed = 0, epilogue_size_needed;
22642   int desired_align = 0, align_bytes = 0;
22643   enum stringop_alg alg;
22644   rtx promoted_val = NULL;
22645   bool force_loopy_epilogue = false;
22646   int dynamic_check;
22647   bool need_zero_guard = false;
22648 
22649   if (CONST_INT_P (align_exp))
22650     align = INTVAL (align_exp);
22651   /* i386 can do misaligned access at a reasonably increased cost.  */
22652   if (CONST_INT_P (expected_align_exp)
22653       && INTVAL (expected_align_exp) > align)
22654     align = INTVAL (expected_align_exp);
22655   if (CONST_INT_P (count_exp))
22656     count = expected_size = INTVAL (count_exp);
22657   if (CONST_INT_P (expected_size_exp) && count == 0)
22658     expected_size = INTVAL (expected_size_exp);
22659 
22660   /* Make sure we don't need to care about overflow later on.  */
22661   if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22662     return false;
22663 
22664   /* Step 0: Decide on preferred algorithm, desired alignment and
22665      size of chunks to be copied by main loop.  */
22666 
22667   alg = decide_alg (count, expected_size, true, &dynamic_check);
22668   desired_align = decide_alignment (align, alg, expected_size);
22669 
22670   if (!TARGET_ALIGN_STRINGOPS)
22671     align = desired_align;
22672 
22673   if (alg == libcall)
22674     return false;
22675   gcc_assert (alg != no_stringop);
22676   if (!count)
22677     count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22678   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22679   switch (alg)
22680     {
22681     case libcall:
22682     case no_stringop:
22683       gcc_unreachable ();
22684     case loop:
22685       need_zero_guard = true;
22686       size_needed = GET_MODE_SIZE (Pmode);
22687       break;
22688     case unrolled_loop:
22689       need_zero_guard = true;
22690       size_needed = GET_MODE_SIZE (Pmode) * 4;
22691       break;
22692     case rep_prefix_8_byte:
22693       size_needed = 8;
22694       break;
22695     case rep_prefix_4_byte:
22696       size_needed = 4;
22697       break;
22698     case rep_prefix_1_byte:
22699       size_needed = 1;
22700       break;
22701     case loop_1_byte:
22702       need_zero_guard = true;
22703       size_needed = 1;
22704       break;
22705     }
22706   epilogue_size_needed = size_needed;
22707 
22708   /* Step 1: Prologue guard.  */
22709 
22710   /* Alignment code needs count to be in register.  */
22711   if (CONST_INT_P (count_exp) && desired_align > align)
22712     {
22713       if (INTVAL (count_exp) > desired_align
22714 	  && INTVAL (count_exp) > size_needed)
22715 	{
22716 	  align_bytes
22717 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22718 	  if (align_bytes <= 0)
22719 	    align_bytes = 0;
22720 	  else
22721 	    align_bytes = desired_align - align_bytes;
22722 	}
22723       if (align_bytes == 0)
22724 	{
22725 	  enum machine_mode mode = SImode;
22726 	  if (TARGET_64BIT && (count & ~0xffffffff))
22727 	    mode = DImode;
22728 	  count_exp = force_reg (mode, count_exp);
22729 	}
22730     }
22731   /* Do the cheap promotion to allow better CSE across the
22732      main loop and epilogue (i.e. one load of the big constant in
22733      front of all code).  */
22734   if (CONST_INT_P (val_exp))
22735     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22736 						   desired_align, align);
22737   /* Ensure that alignment prologue won't copy past end of block.  */
22738   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22739     {
22740       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22741       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22742 	 Make sure it is a power of 2.  */
22743       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22744 
22745       /* To improve performance of small blocks, we jump around the VAL
22746 	 promoting code.  This means that if the promoted VAL is not constant,
22747 	 we might not use it in the epilogue and have to use the byte
22748 	 loop variant.  */
22749       if (epilogue_size_needed > 2 && !promoted_val)
22750         force_loopy_epilogue = true;
22751       if (count)
22752 	{
22753 	  if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22754 	    {
22755 	      /* If main algorithm works on QImode, no epilogue is needed.
22756 		 For small sizes just don't align anything.  */
22757 	      if (size_needed == 1)
22758 		desired_align = align;
22759 	      else
22760 		goto epilogue;
22761 	    }
22762 	}
22763       else
22764 	{
22765 	  label = gen_label_rtx ();
22766 	  emit_cmp_and_jump_insns (count_exp,
22767 				   GEN_INT (epilogue_size_needed),
22768 				   LTU, 0, counter_mode (count_exp), 1, label);
22769 	  if (expected_size == -1 || expected_size <= epilogue_size_needed)
22770 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22771 	  else
22772 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22773 	}
22774     }
22775   if (dynamic_check != -1)
22776     {
22777       rtx hot_label = gen_label_rtx ();
22778       jump_around_label = gen_label_rtx ();
22779       emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22780 			       LEU, 0, counter_mode (count_exp), 1, hot_label);
22781       predict_jump (REG_BR_PROB_BASE * 90 / 100);
22782       set_storage_via_libcall (dst, count_exp, val_exp, false);
22783       emit_jump (jump_around_label);
22784       emit_label (hot_label);
22785     }
22786 
22787   /* Step 2: Alignment prologue.  */
22788 
22789   /* Do the expensive promotion once we branched off the small blocks.  */
22790   if (!promoted_val)
22791     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22792 						   desired_align, align);
22793   gcc_assert (desired_align >= 1 && align >= 1);
22794 
22795   if (desired_align > align)
22796     {
22797       if (align_bytes == 0)
22798 	{
22799 	  /* Except for the first move in the epilogue, we no longer know
22800 	     the constant offset in aliasing info.  It doesn't seem worth
22801 	     the pain to maintain it for the first move, so throw away
22802 	     the info early.  */
22803 	  dst = change_address (dst, BLKmode, destreg);
22804 	  expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22805 				  desired_align);
22806 	}
22807       else
22808 	{
22809 	  /* If we know how many bytes need to be stored before dst is
22810 	     sufficiently aligned, maintain aliasing info accurately.  */
22811 	  dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22812 						 desired_align, align_bytes);
22813 	  count_exp = plus_constant (count_exp, -align_bytes);
22814 	  count -= align_bytes;
22815 	}
22816       if (need_zero_guard
22817 	  && (count < (unsigned HOST_WIDE_INT) size_needed
22818 	      || (align_bytes == 0
22819 		  && count < ((unsigned HOST_WIDE_INT) size_needed
22820 			      + desired_align - align))))
22821 	{
22822 	  /* It is possible that we copied enough so the main loop will not
22823 	     execute.  */
22824 	  gcc_assert (size_needed > 1);
22825 	  if (label == NULL_RTX)
22826 	    label = gen_label_rtx ();
22827 	  emit_cmp_and_jump_insns (count_exp,
22828 				   GEN_INT (size_needed),
22829 				   LTU, 0, counter_mode (count_exp), 1, label);
22830 	  if (expected_size == -1
22831 	      || expected_size < (desired_align - align) / 2 + size_needed)
22832 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22833 	  else
22834 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22835 	}
22836     }
22837   if (label && size_needed == 1)
22838     {
22839       emit_label (label);
22840       LABEL_NUSES (label) = 1;
22841       label = NULL;
22842       promoted_val = val_exp;
22843       epilogue_size_needed = 1;
22844     }
22845   else if (label == NULL_RTX)
22846     epilogue_size_needed = size_needed;
22847 
22848   /* Step 3: Main loop.  */
22849 
22850   switch (alg)
22851     {
22852     case libcall:
22853     case no_stringop:
22854       gcc_unreachable ();
22855     case loop_1_byte:
22856       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22857 				     count_exp, QImode, 1, expected_size);
22858       break;
22859     case loop:
22860       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22861 				     count_exp, Pmode, 1, expected_size);
22862       break;
22863     case unrolled_loop:
22864       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22865 				     count_exp, Pmode, 4, expected_size);
22866       break;
22867     case rep_prefix_8_byte:
22868       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22869 				  DImode, val_exp);
22870       break;
22871     case rep_prefix_4_byte:
22872       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22873 				  SImode, val_exp);
22874       break;
22875     case rep_prefix_1_byte:
22876       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22877 				  QImode, val_exp);
22878       break;
22879     }
22880   /* Properly adjust the offset of the destination memory for aliasing.  */
22881   if (CONST_INT_P (count_exp))
22882     dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22883 					(count / size_needed) * size_needed);
22884   else
22885     dst = change_address (dst, BLKmode, destreg);
22886 
22887   /* Step 4: Epilogue to store the remaining bytes.  */
22888 
22889   if (label)
22890     {
22891       /* When the main loop is done, COUNT_EXP might hold the original count,
22892  	 while we want to store only COUNT_EXP % SIZE_NEEDED bytes.
22893 	 The epilogue code will actually store COUNT_EXP % EPILOGUE_SIZE_NEEDED
22894 	 bytes.  Compensate if needed.  */
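      /* As a rough illustration: if the main loop stores SIZE_NEEDED == 16
	 bytes per iteration, the AND below masks COUNT_EXP with 15, leaving
	 only the 0..15 residual bytes for the epilogue to handle.  */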
22895 
22896       if (size_needed < epilogue_size_needed)
22897 	{
22898 	  tmp =
22899 	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22900 				 GEN_INT (size_needed - 1), count_exp, 1,
22901 				 OPTAB_DIRECT);
22902 	  if (tmp != count_exp)
22903 	    emit_move_insn (count_exp, tmp);
22904 	}
22905       emit_label (label);
22906       LABEL_NUSES (label) = 1;
22907     }
22908  epilogue:
22909   if (count_exp != const0_rtx && epilogue_size_needed > 1)
22910     {
22911       if (force_loopy_epilogue)
22912 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22913 					 epilogue_size_needed);
22914       else
22915 	expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22916 				epilogue_size_needed);
22917     }
22918   if (jump_around_label)
22919     emit_label (jump_around_label);
22920   return true;
22921 }
22922 
22923 /* Expand the appropriate insns for doing strlen if not just doing
22924    repnz; scasb
22925 
22926    out = result, initialized with the start address
22927    align_rtx = alignment of the address.
22928    scratch = scratch register, initialized with the start address when
22929 	not aligned, otherwise undefined
22930 
22931    This is just the body. It needs the initializations mentioned above and
22932    some address computing at the end.  These things are done in i386.md.  */
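/* Overview of the code generated below: the expander first checks up to
   three unaligned bytes one at a time until OUT reaches a 4-byte boundary,
   then scans four bytes per iteration using the zero-byte test further
   down, and finally adjusts OUT so that it points at the terminating zero
   byte.  */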
22933 
22934 static void
22935 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22936 {
22937   int align;
22938   rtx tmp;
22939   rtx align_2_label = NULL_RTX;
22940   rtx align_3_label = NULL_RTX;
22941   rtx align_4_label = gen_label_rtx ();
22942   rtx end_0_label = gen_label_rtx ();
22943   rtx mem;
22944   rtx tmpreg = gen_reg_rtx (SImode);
22945   rtx scratch = gen_reg_rtx (SImode);
22946   rtx cmp;
22947 
22948   align = 0;
22949   if (CONST_INT_P (align_rtx))
22950     align = INTVAL (align_rtx);
22951 
22952   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
22953 
22954   /* Is there a known alignment and is it less than 4?  */
22955   if (align < 4)
22956     {
22957       rtx scratch1 = gen_reg_rtx (Pmode);
22958       emit_move_insn (scratch1, out);
22959       /* Is there a known alignment and is it not 2? */
22960       if (align != 2)
22961 	{
22962 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22963 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22964 
22965 	  /* Leave just the 3 lower bits.  */
22966 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22967 				    NULL_RTX, 0, OPTAB_WIDEN);
22968 
22969 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22970 				   Pmode, 1, align_4_label);
22971 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22972 				   Pmode, 1, align_2_label);
22973 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22974 				   Pmode, 1, align_3_label);
22975 	}
22976       else
22977         {
22978 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
22979 	     check whether it is already aligned to a 4-byte boundary.  */
22980 
22981 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22982 				    NULL_RTX, 0, OPTAB_WIDEN);
22983 
22984 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22985 				   Pmode, 1, align_4_label);
22986         }
22987 
22988       mem = change_address (src, QImode, out);
22989 
22990       /* Now compare the bytes.  */
22991 
22992       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
22993       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22994 			       QImode, 1, end_0_label);
22995 
22996       /* Increment the address.  */
22997       emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22998 
22999       /* Not needed with an alignment of 2.  */
23000       if (align != 2)
23001 	{
23002 	  emit_label (align_2_label);
23003 
23004 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23005 				   end_0_label);
23006 
23007 	  emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23008 
23009 	  emit_label (align_3_label);
23010 	}
23011 
23012       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23013 			       end_0_label);
23014 
23015       emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23016     }
23017 
23018   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
23019      align this loop; it only makes the program larger and does not help
23020      speed it up.  */
23021   emit_label (align_4_label);
23022 
23023   mem = change_address (src, SImode, out);
23024   emit_move_insn (scratch, mem);
23025   emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23026 
23027   /* This formula yields a nonzero result iff one of the bytes is zero.
23028      This saves three branches inside the loop and many cycles.  */
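  /* A worked example of the test emitted below: for SCRATCH = 0x12340078,
     which contains a zero byte, (SCRATCH - 0x01010101) = 0x1132ff77 and
     ~SCRATCH = 0xedcbff87; ANDing the two with 0x80808080 gives 0x00008000,
     i.e. nonzero.  For SCRATCH = 0x12345678, with no zero byte, the same
     computation yields 0.  */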
23029 
23030   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23031   emit_insn (gen_one_cmplsi2 (scratch, scratch));
23032   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23033   emit_insn (gen_andsi3 (tmpreg, tmpreg,
23034 			 gen_int_mode (0x80808080, SImode)));
23035   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23036 			   align_4_label);
23037 
23038   if (TARGET_CMOVE)
23039     {
23040        rtx reg = gen_reg_rtx (SImode);
23041        rtx reg2 = gen_reg_rtx (Pmode);
23042        emit_move_insn (reg, tmpreg);
23043        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23044 
23045        /* If zero is not in the first two bytes, move two bytes forward.  */
23046        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23047        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23048        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23049        emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23050 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
23051 						     reg,
23052 						     tmpreg)));
23053        /* Emit the lea manually to avoid clobbering the flags.  */
23054        emit_insn (gen_rtx_SET (SImode, reg2,
23055 			       gen_rtx_PLUS (Pmode, out, const2_rtx)));
23056 
23057        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23058        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23059        emit_insn (gen_rtx_SET (VOIDmode, out,
23060 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23061 						     reg2,
23062 						     out)));
23063     }
23064   else
23065     {
23066        rtx end_2_label = gen_label_rtx ();
23067        /* Is zero in the first two bytes? */
23068 
23069        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23070        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23071        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23072        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23073                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23074                             pc_rtx);
23075        tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23076        JUMP_LABEL (tmp) = end_2_label;
23077 
23078        /* Not in the first two.  Move two bytes forward.  */
23079        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23080        emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23081 
23082        emit_label (end_2_label);
23083 
23084     }
23085 
23086   /* Avoid a branch when adjusting OUT to point at the zero byte.  */
23087   tmpreg = gen_lowpart (QImode, tmpreg);
23088   emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23089   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23090   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23091   emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23092 
23093   emit_label (end_0_label);
23094 }
23095 
23096 /* Expand strlen.  */
23097 
23098 bool
23099 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23100 {
23101   rtx addr, scratch1, scratch2, scratch3, scratch4;
23102 
23103   /* The generic case of the strlen expander is long.  Avoid its
23104      expansion unless TARGET_INLINE_ALL_STRINGOPS.  */
23105 
23106   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23107       && !TARGET_INLINE_ALL_STRINGOPS
23108       && !optimize_insn_for_size_p ()
23109       && (!CONST_INT_P (align) || INTVAL (align) < 4))
23110     return false;
23111 
23112   addr = force_reg (Pmode, XEXP (src, 0));
23113   scratch1 = gen_reg_rtx (Pmode);
23114 
23115   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23116       && !optimize_insn_for_size_p ())
23117     {
23118       /* It seems that some optimizers do not combine a call like
23119          foo(strlen(bar), strlen(bar));
23120          when the move and the subtraction are done here.  They calculate
23121          the length just once when these instructions are done inside of
23122          output_strlen_unroll().  But since &bar[strlen(bar)] is often
23123          used, and this uses one fewer register for the lifetime of
23124          output_strlen_unroll(), this is better.  */
23125 
23126       emit_move_insn (out, addr);
23127 
23128       ix86_expand_strlensi_unroll_1 (out, src, align);
23129 
23130       /* strlensi_unroll_1 returns the address of the zero at the end of
23131          the string, like memchr(), so compute the length by subtracting
23132          the start address.  */
23133       emit_insn (ix86_gen_sub3 (out, out, addr));
23134     }
23135   else
23136     {
23137       rtx unspec;
23138 
23139       /* Can't use this if the user has appropriated eax, ecx, or edi.  */
23140       if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23141         return false;
23142 
23143       scratch2 = gen_reg_rtx (Pmode);
23144       scratch3 = gen_reg_rtx (Pmode);
23145       scratch4 = force_reg (Pmode, constm1_rtx);
23146 
23147       emit_move_insn (scratch3, addr);
23148       eoschar = force_reg (QImode, eoschar);
23149 
23150       src = replace_equiv_address_nv (src, scratch3);
23151 
23152       /* If .md starts supporting :P, this can be done in .md.  */
23153       unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23154 						 scratch4), UNSPEC_SCAS);
23155       emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23156       emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23157       emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23158     }
23159   return true;
23160 }
23161 
23162 /* For a given symbol (function), construct code to compute the address of
23163    its PLT entry in the large x86-64 PIC model.  */
23164 rtx
23165 construct_plt_address (rtx symbol)
23166 {
23167   rtx tmp = gen_reg_rtx (Pmode);
23168   rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23169 
23170   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23171   gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23172 
23173   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23174   emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
23175   return tmp;
23176 }
23177 
23178 rtx
23179 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23180 		  rtx callarg2,
23181 		  rtx pop, bool sibcall)
23182 {
23183   /* We need to represent that the SI and DI registers, as well as
23184      XMM6-XMM15, are clobbered by SYSV calls.  */
23185   static int clobbered_registers[] = {
23186 	XMM6_REG, XMM7_REG, XMM8_REG,
23187 	XMM9_REG, XMM10_REG, XMM11_REG,
23188 	XMM12_REG, XMM13_REG, XMM14_REG,
23189 	XMM15_REG, SI_REG, DI_REG
23190   };
23191   rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23192   rtx use = NULL, call;
23193   unsigned int vec_len;
23194 
23195   if (pop == const0_rtx)
23196     pop = NULL;
23197   gcc_assert (!TARGET_64BIT || !pop);
23198 
23199   if (TARGET_MACHO && !TARGET_64BIT)
23200     {
23201 #if TARGET_MACHO
23202       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23203 	fnaddr = machopic_indirect_call_target (fnaddr);
23204 #endif
23205     }
23206   else
23207     {
23208       /* Static functions and indirect calls don't need the pic register.  */
23209       if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23210 	  && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23211 	  && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23212 	use_reg (&use, pic_offset_table_rtx);
23213     }
23214 
23215   if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23216     {
23217       rtx al = gen_rtx_REG (QImode, AX_REG);
23218       emit_move_insn (al, callarg2);
23219       use_reg (&use, al);
23220     }
23221 
23222   if (ix86_cmodel == CM_LARGE_PIC
23223       && MEM_P (fnaddr)
23224       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23225       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23226     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23227   else if (sibcall
23228 	   ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
23229 	   : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
23230     {
23231       fnaddr = XEXP (fnaddr, 0);
23232       if (GET_MODE (fnaddr) != Pmode)
23233 	fnaddr = convert_to_mode (Pmode, fnaddr, 1);
23234       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
23235     }
23236 
23237   vec_len = 0;
23238   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23239   if (retval)
23240     call = gen_rtx_SET (VOIDmode, retval, call);
23241   vec[vec_len++] = call;
23242 
23243   if (pop)
23244     {
23245       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23246       pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23247       vec[vec_len++] = pop;
23248     }
23249 
23250   if (TARGET_64BIT_MS_ABI
23251       && (!callarg2 || INTVAL (callarg2) != -2))
23252     {
23253       unsigned i;
23254 
23255       vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23256 				       UNSPEC_MS_TO_SYSV_CALL);
23257 
23258       for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23259         vec[vec_len++]
23260 	  = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23261 			     ? TImode : DImode,
23262 			     gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23263 					  ? TImode : DImode,
23264 					  clobbered_registers[i]));
23265     }
23266 
23267   /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration.  */
23268   if (TARGET_VZEROUPPER)
23269     {
23270       int avx256;
23271       if (cfun->machine->callee_pass_avx256_p)
23272 	{
23273 	  if (cfun->machine->callee_return_avx256_p)
23274 	    avx256 = callee_return_pass_avx256;
23275 	  else
23276 	    avx256 = callee_pass_avx256;
23277 	}
23278       else if (cfun->machine->callee_return_avx256_p)
23279 	avx256 = callee_return_avx256;
23280       else
23281 	avx256 = call_no_avx256;
23282 
23283       if (reload_completed)
23284 	emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23285       else
23286 	vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23287 					 gen_rtvec (1, GEN_INT (avx256)),
23288 					 UNSPEC_CALL_NEEDS_VZEROUPPER);
23289     }
23290 
23291   if (vec_len > 1)
23292     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23293   call = emit_call_insn (call);
23294   if (use)
23295     CALL_INSN_FUNCTION_USAGE (call) = use;
23296 
23297   return call;
23298 }
23299 
23300 void
23301 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23302 {
23303   rtx pat = PATTERN (insn);
23304   rtvec vec = XVEC (pat, 0);
23305   int len = GET_NUM_ELEM (vec) - 1;
23306 
23307   /* Strip off the last entry of the parallel.  */
23308   gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23309   gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23310   if (len == 1)
23311     pat = RTVEC_ELT (vec, 0);
23312   else
23313     pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23314 
23315   emit_insn (gen_avx_vzeroupper (vzeroupper));
23316   emit_call_insn (pat);
23317 }
23318 
23319 /* Output the assembly for a call instruction.  */
23320 
23321 const char *
23322 ix86_output_call_insn (rtx insn, rtx call_op)
23323 {
23324   bool direct_p = constant_call_address_operand (call_op, Pmode);
23325   bool seh_nop_p = false;
23326   const char *xasm;
23327 
23328   if (SIBLING_CALL_P (insn))
23329     {
23330       if (direct_p)
23331 	xasm = "jmp\t%P0";
23332       /* SEH epilogue detection requires the indirect branch case
23333 	 to include REX.W.  */
23334       else if (TARGET_SEH)
23335 	xasm = "rex.W jmp %A0";
23336       else
23337 	xasm = "jmp\t%A0";
23338 
23339       output_asm_insn (xasm, &call_op);
23340       return "";
23341     }
23342 
23343   /* SEH unwinding can require an extra nop to be emitted in several
23344      circumstances.  Determine if we have one of those.  */
23345   if (TARGET_SEH)
23346     {
23347       rtx i;
23348 
23349       for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23350 	{
23351 	  /* If we get to another real insn, we don't need the nop.  */
23352 	  if (INSN_P (i))
23353 	    break;
23354 
23355 	  /* If we get to the epilogue note, prevent a catch region from
23356 	     being adjacent to the standard epilogue sequence.  With non-call
23357 	     exceptions, we'll have done this during epilogue emission.  */
23358 	  if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23359 	      && !flag_non_call_exceptions
23360 	      && !can_throw_internal (insn))
23361 	    {
23362 	      seh_nop_p = true;
23363 	      break;
23364 	    }
23365 	}
23366 
23367       /* If we didn't find a real insn following the call, prevent the
23368 	 unwinder from looking into the next function.  */
23369       if (i == NULL)
23370 	seh_nop_p = true;
23371     }
23372 
23373   if (direct_p)
23374     xasm = "call\t%P0";
23375   else
23376     xasm = "call\t%A0";
23377 
23378   output_asm_insn (xasm, &call_op);
23379 
23380   if (seh_nop_p)
23381     return "nop";
23382 
23383   return "";
23384 }
23385 
23386 /* Clear stack slot assignments remembered from previous functions.
23387    This is called from INIT_EXPANDERS once before RTL is emitted for each
23388    function.  */
23389 
23390 static struct machine_function *
23391 ix86_init_machine_status (void)
23392 {
23393   struct machine_function *f;
23394 
23395   f = ggc_alloc_cleared_machine_function ();
23396   f->use_fast_prologue_epilogue_nregs = -1;
23397   f->call_abi = ix86_abi;
23398 
23399   return f;
23400 }
23401 
23402 /* Return a MEM corresponding to a stack slot with mode MODE.
23403    Allocate a new slot if necessary.
23404 
23405    The RTL for a function can have several slots available: N is
23406    which slot to use.  */
23407 
23408 rtx
23409 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23410 {
23411   struct stack_local_entry *s;
23412 
23413   gcc_assert (n < MAX_386_STACK_LOCALS);
23414 
23415   for (s = ix86_stack_locals; s; s = s->next)
23416     if (s->mode == mode && s->n == n)
23417       return validize_mem (copy_rtx (s->rtl));
23418 
23419   s = ggc_alloc_stack_local_entry ();
23420   s->n = n;
23421   s->mode = mode;
23422   s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23423 
23424   s->next = ix86_stack_locals;
23425   ix86_stack_locals = s;
23426   return validize_mem (s->rtl);
23427 }
23428 
23429 static void
23430 ix86_instantiate_decls (void)
23431 {
23432   struct stack_local_entry *s;
23433 
23434   for (s = ix86_stack_locals; s; s = s->next)
23435     if (s->rtl != NULL_RTX)
23436       instantiate_decl_rtl (s->rtl);
23437 }
23438 
23439 /* Calculate the length of the memory address in the instruction encoding.
23440    This includes the addr32 prefix, but not the one-byte modrm, opcode,
23441    or other prefixes.  We never generate the addr32 prefix for LEA insns.  */
23442 
23443 int
23444 memory_address_length (rtx addr, bool lea)
23445 {
23446   struct ix86_address parts;
23447   rtx base, index, disp;
23448   int len;
23449   int ok;
23450 
23451   if (GET_CODE (addr) == PRE_DEC
23452       || GET_CODE (addr) == POST_INC
23453       || GET_CODE (addr) == PRE_MODIFY
23454       || GET_CODE (addr) == POST_MODIFY)
23455     return 0;
23456 
23457   ok = ix86_decompose_address (addr, &parts);
23458   gcc_assert (ok);
23459 
23460   len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23461 
23462   /* If this is not a LEA instruction, add the length of the addr32 prefix.  */
23463   if (TARGET_64BIT && !lea
23464       && (SImode_address_operand (addr, VOIDmode)
23465 	  || (parts.base && GET_MODE (parts.base) == SImode)
23466 	  || (parts.index && GET_MODE (parts.index) == SImode)))
23467     len++;
23468 
23469   base = parts.base;
23470   index = parts.index;
23471   disp = parts.disp;
23472 
23473   if (base && GET_CODE (base) == SUBREG)
23474     base = SUBREG_REG (base);
23475   if (index && GET_CODE (index) == SUBREG)
23476     index = SUBREG_REG (index);
23477 
23478   gcc_assert (base == NULL_RTX || REG_P (base));
23479   gcc_assert (index == NULL_RTX || REG_P (index));
23480 
23481   /* Rule of thumb:
23482        - esp as the base always wants an index,
23483        - ebp as the base always wants a displacement,
23484        - r12 as the base always wants an index,
23485        - r13 as the base always wants a displacement.  */
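  /* The encodings behind the rule of thumb: a plain (%eax) fits in a single
     modrm byte, while (%esp) needs an extra SIB byte and (%ebp) needs an
     extra disp8 byte even for a zero displacement.  r12 and r13 inherit the
     same quirks because they share the low three register-number bits with
     esp and ebp.  */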
23486 
23487   /* Register Indirect.  */
23488   if (base && !index && !disp)
23489     {
23490       /* esp (for its index) and ebp (for its displacement) need
23491 	 the two-byte modrm form.  Similarly for r12 and r13 in 64-bit
23492 	 code.  */
23493       if (base == arg_pointer_rtx
23494 	  || base == frame_pointer_rtx
23495 	  || REGNO (base) == SP_REG
23496 	  || REGNO (base) == BP_REG
23497 	  || REGNO (base) == R12_REG
23498 	  || REGNO (base) == R13_REG)
23499 	len++;
23500     }
23501 
23502   /* Direct Addressing.  In 64-bit mode, mod 00 r/m 5
23503      is not disp32 but disp32(%rip), so for a plain disp32
23504      a SIB byte is needed, unless print_operand_address
23505      optimizes it into disp32(%rip) or (%rip) is implied
23506      by an UNSPEC.  */
23507   else if (disp && !base && !index)
23508     {
23509       len += 4;
23510       if (TARGET_64BIT)
23511 	{
23512 	  rtx symbol = disp;
23513 
23514 	  if (GET_CODE (disp) == CONST)
23515 	    symbol = XEXP (disp, 0);
23516 	  if (GET_CODE (symbol) == PLUS
23517 	      && CONST_INT_P (XEXP (symbol, 1)))
23518 	    symbol = XEXP (symbol, 0);
23519 
23520 	  if (GET_CODE (symbol) != LABEL_REF
23521 	      && (GET_CODE (symbol) != SYMBOL_REF
23522 		  || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23523 	      && (GET_CODE (symbol) != UNSPEC
23524 		  || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23525 		      && XINT (symbol, 1) != UNSPEC_PCREL
23526 		      && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23527 	    len++;
23528 	}
23529     }
23530   else
23531     {
23532       /* Find the length of the displacement constant.  */
23533       if (disp)
23534 	{
23535 	  if (base && satisfies_constraint_K (disp))
23536 	    len += 1;
23537 	  else
23538 	    len += 4;
23539 	}
23540       /* ebp always wants a displacement.  Similarly r13.  */
23541       else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23542 	len++;
23543 
23544       /* An index requires the two-byte modrm form....  */
23545       if (index
23546 	  /* ...like esp (or r12), which always wants an index.  */
23547 	  || base == arg_pointer_rtx
23548 	  || base == frame_pointer_rtx
23549 	  || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23550 	len++;
23551     }
23552 
23553   return len;
23554 }
23555 
23556 /* Compute the default value for the "length_immediate" attribute.  When
23557    SHORTFORM is set, expect that the insn has an 8-bit immediate alternative.  */
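/* For example, with SHORTFORM set an insn such as "addl $3, %eax" can use
   the sign-extended imm8 alternative, so its immediate contributes 1 byte,
   whereas "addl $300, %eax" needs the full imm32 and contributes 4 bytes.  */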
23558 int
23559 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23560 {
23561   int len = 0;
23562   int i;
23563   extract_insn_cached (insn);
23564   for (i = recog_data.n_operands - 1; i >= 0; --i)
23565     if (CONSTANT_P (recog_data.operand[i]))
23566       {
23567         enum attr_mode mode = get_attr_mode (insn);
23568 
23569 	gcc_assert (!len);
23570 	if (shortform && CONST_INT_P (recog_data.operand[i]))
23571 	  {
23572 	    HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23573 	    switch (mode)
23574 	      {
23575 	      case MODE_QI:
23576 		len = 1;
23577 		continue;
23578 	      case MODE_HI:
23579 		ival = trunc_int_for_mode (ival, HImode);
23580 		break;
23581 	      case MODE_SI:
23582 		ival = trunc_int_for_mode (ival, SImode);
23583 		break;
23584 	      default:
23585 		break;
23586 	      }
23587 	    if (IN_RANGE (ival, -128, 127))
23588 	      {
23589 		len = 1;
23590 		continue;
23591 	      }
23592 	  }
23593 	switch (mode)
23594 	  {
23595 	  case MODE_QI:
23596 	    len = 1;
23597 	    break;
23598 	  case MODE_HI:
23599 	    len = 2;
23600 	    break;
23601 	  case MODE_SI:
23602 	    len = 4;
23603 	    break;
23604 	  /* Immediates for DImode instructions are encoded
23605 	     as 32-bit sign-extended values.  */
23606 	  case MODE_DI:
23607 	    len = 4;
23608 	    break;
23609 	  default:
23610 	    fatal_insn ("unknown insn mode", insn);
23611 	}
23612       }
23613   return len;
23614 }
23615 
23616 /* Compute default value for "length_address" attribute.  */
23617 int
23618 ix86_attr_length_address_default (rtx insn)
23619 {
23620   int i;
23621 
23622   if (get_attr_type (insn) == TYPE_LEA)
23623     {
23624       rtx set = PATTERN (insn), addr;
23625 
23626       if (GET_CODE (set) == PARALLEL)
23627 	set = XVECEXP (set, 0, 0);
23628 
23629       gcc_assert (GET_CODE (set) == SET);
23630 
23631       addr = SET_SRC (set);
23632 
23633       return memory_address_length (addr, true);
23634     }
23635 
23636   extract_insn_cached (insn);
23637   for (i = recog_data.n_operands - 1; i >= 0; --i)
23638     if (MEM_P (recog_data.operand[i]))
23639       {
23640         constrain_operands_cached (reload_completed);
23641         if (which_alternative != -1)
23642 	  {
23643 	    const char *constraints = recog_data.constraints[i];
23644 	    int alt = which_alternative;
23645 
23646 	    while (*constraints == '=' || *constraints == '+')
23647 	      constraints++;
23648 	    while (alt-- > 0)
23649 	      while (*constraints++ != ',')
23650 		;
23651 	    /* Skip ignored operands.  */
23652 	    if (*constraints == 'X')
23653 	      continue;
23654 	  }
23655 	return memory_address_length (XEXP (recog_data.operand[i], 0), false);
23656       }
23657   return 0;
23658 }
23659 
23660 /* Compute the default value for the "length_vex" attribute.  It includes
23661    the 2- or 3-byte VEX prefix and 1 opcode byte.  */
23662 
23663 int
23664 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23665 {
23666   int i;
23667 
23668   /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
23669      requires the 3-byte VEX prefix.  */
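  /* Background for the lengths used here: the 2-byte VEX form (C5) can
     express only the 0f opcode map and the R register-extension bit, so
     anything that needs VEX.W, VEX.X, VEX.B or another opcode map must use
     the 3-byte form (C4).  */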
23670   if (!has_0f_opcode || has_vex_w)
23671     return 3 + 1;
23672 
23673  /* We can always use the 2-byte VEX prefix in 32-bit mode.  */
23674   if (!TARGET_64BIT)
23675     return 2 + 1;
23676 
23677   extract_insn_cached (insn);
23678 
23679   for (i = recog_data.n_operands - 1; i >= 0; --i)
23680     if (REG_P (recog_data.operand[i]))
23681       {
23682 	/* REX.W bit uses 3 byte VEX prefix.  */
23683 	if (GET_MODE (recog_data.operand[i]) == DImode
23684 	    && GENERAL_REG_P (recog_data.operand[i]))
23685 	  return 3 + 1;
23686       }
23687     else
23688       {
23689 	/* REX.X or REX.B bits use 3 byte VEX prefix.  */
23690 	if (MEM_P (recog_data.operand[i])
23691 	    && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23692 	  return 3 + 1;
23693       }
23694 
23695   return 2 + 1;
23696 }
23697 
23698 /* Return the maximum number of instructions a cpu can issue.  */
23699 
23700 static int
23701 ix86_issue_rate (void)
23702 {
23703   switch (ix86_tune)
23704     {
23705     case PROCESSOR_PENTIUM:
23706     case PROCESSOR_ATOM:
23707     case PROCESSOR_K6:
23708       return 2;
23709 
23710     case PROCESSOR_PENTIUMPRO:
23711     case PROCESSOR_PENTIUM4:
23712     case PROCESSOR_CORE2_32:
23713     case PROCESSOR_CORE2_64:
23714     case PROCESSOR_COREI7_32:
23715     case PROCESSOR_COREI7_64:
23716     case PROCESSOR_ATHLON:
23717     case PROCESSOR_K8:
23718     case PROCESSOR_AMDFAM10:
23719     case PROCESSOR_NOCONA:
23720     case PROCESSOR_GENERIC32:
23721     case PROCESSOR_GENERIC64:
23722     case PROCESSOR_BDVER1:
23723     case PROCESSOR_BDVER2:
23724     case PROCESSOR_BTVER1:
23725       return 3;
23726 
23727     default:
23728       return 1;
23729     }
23730 }
23731 
23732 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
23733    set by DEP_INSN and nothing else set by DEP_INSN.  */
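/* For instance, a "cmp" followed by a "jcc" or "setcc" that consumes only
   the flags matches this test; the Pentium tuning below then treats the
   flag dependence as free (cost 0).  */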
23734 
23735 static bool
23736 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23737 {
23738   rtx set, set2;
23739 
23740   /* Simplify the test for uninteresting insns.  */
23741   if (insn_type != TYPE_SETCC
23742       && insn_type != TYPE_ICMOV
23743       && insn_type != TYPE_FCMOV
23744       && insn_type != TYPE_IBR)
23745     return false;
23746 
23747   if ((set = single_set (dep_insn)) != 0)
23748     {
23749       set = SET_DEST (set);
23750       set2 = NULL_RTX;
23751     }
23752   else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23753 	   && XVECLEN (PATTERN (dep_insn), 0) == 2
23754 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23755 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23756     {
23757       set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23758       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23759     }
23760   else
23761     return false;
23762 
23763   if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23764     return false;
23765 
23766   /* This test is true if the dependent insn reads the flags but
23767      not any other potentially set register.  */
23768   if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23769     return false;
23770 
23771   if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23772     return false;
23773 
23774   return true;
23775 }
23776 
23777 /* Return true iff USE_INSN has a memory address with operands set by
23778    SET_INSN.  */
23779 
23780 bool
23781 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23782 {
23783   int i;
23784   extract_insn_cached (use_insn);
23785   for (i = recog_data.n_operands - 1; i >= 0; --i)
23786     if (MEM_P (recog_data.operand[i]))
23787       {
23788 	rtx addr = XEXP (recog_data.operand[i], 0);
23789 	return modified_in_p (addr, set_insn) != 0;
23790       }
23791   return false;
23792 }
23793 
23794 static int
23795 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23796 {
23797   enum attr_type insn_type, dep_insn_type;
23798   enum attr_memory memory;
23799   rtx set, set2;
23800   int dep_insn_code_number;
23801 
23802   /* Anti and output dependencies have zero cost on all CPUs.  */
23803   if (REG_NOTE_KIND (link) != 0)
23804     return 0;
23805 
23806   dep_insn_code_number = recog_memoized (dep_insn);
23807 
23808   /* If we can't recognize the insns, we can't really do anything.  */
23809   if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23810     return cost;
23811 
23812   insn_type = get_attr_type (insn);
23813   dep_insn_type = get_attr_type (dep_insn);
23814 
23815   switch (ix86_tune)
23816     {
23817     case PROCESSOR_PENTIUM:
23818       /* Address Generation Interlock adds a cycle of latency.  */
23819       if (insn_type == TYPE_LEA)
23820 	{
23821 	  rtx addr = PATTERN (insn);
23822 
23823 	  if (GET_CODE (addr) == PARALLEL)
23824 	    addr = XVECEXP (addr, 0, 0);
23825 
23826 	  gcc_assert (GET_CODE (addr) == SET);
23827 
23828 	  addr = SET_SRC (addr);
23829 	  if (modified_in_p (addr, dep_insn))
23830 	    cost += 1;
23831 	}
23832       else if (ix86_agi_dependent (dep_insn, insn))
23833 	cost += 1;
23834 
23835       /* ??? Compares pair with jump/setcc.  */
23836       if (ix86_flags_dependent (insn, dep_insn, insn_type))
23837 	cost = 0;
23838 
23839       /* Floating point stores require the value to be ready one cycle earlier.  */
23840       if (insn_type == TYPE_FMOV
23841 	  && get_attr_memory (insn) == MEMORY_STORE
23842 	  && !ix86_agi_dependent (dep_insn, insn))
23843 	cost += 1;
23844       break;
23845 
23846     case PROCESSOR_PENTIUMPRO:
23847       memory = get_attr_memory (insn);
23848 
23849       /* INT->FP conversion is expensive.  */
23850       if (get_attr_fp_int_src (dep_insn))
23851 	cost += 5;
23852 
23853       /* There is one extra cycle of latency between an FP op and a store.  */
23854       if (insn_type == TYPE_FMOV
23855 	  && (set = single_set (dep_insn)) != NULL_RTX
23856 	  && (set2 = single_set (insn)) != NULL_RTX
23857 	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23858 	  && MEM_P (SET_DEST (set2)))
23859 	cost += 1;
23860 
23861       /* Show the ability of the reorder buffer to hide the latency of a load
23862 	 by executing it in parallel with the previous instruction, in case the
23863 	 previous instruction is not needed to compute the address.  */
23864       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23865 	  && !ix86_agi_dependent (dep_insn, insn))
23866 	{
23867 	  /* Claim moves to take one cycle, as the core can issue one load
23868 	     at a time and the next load can start a cycle later.  */
23869 	  if (dep_insn_type == TYPE_IMOV
23870 	      || dep_insn_type == TYPE_FMOV)
23871 	    cost = 1;
23872 	  else if (cost > 1)
23873 	    cost--;
23874 	}
23875       break;
23876 
23877     case PROCESSOR_K6:
23878       memory = get_attr_memory (insn);
23879 
23880       /* The esp dependency is resolved before the instruction is really
23881          finished.  */
23882       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23883 	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23884 	return 1;
23885 
23886       /* INT->FP conversion is expensive.  */
23887       if (get_attr_fp_int_src (dep_insn))
23888 	cost += 5;
23889 
23890       /* Show the ability of the reorder buffer to hide the latency of a load
23891 	 by executing it in parallel with the previous instruction, in case the
23892 	 previous instruction is not needed to compute the address.  */
23893       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23894 	  && !ix86_agi_dependent (dep_insn, insn))
23895 	{
23896 	  /* Claim moves to take one cycle, as the core can issue one load
23897 	     at a time and the next load can start a cycle later.  */
23898 	  if (dep_insn_type == TYPE_IMOV
23899 	      || dep_insn_type == TYPE_FMOV)
23900 	    cost = 1;
23901 	  else if (cost > 2)
23902 	    cost -= 2;
23903 	  else
23904 	    cost = 1;
23905 	}
23906       break;
23907 
23908     case PROCESSOR_ATHLON:
23909     case PROCESSOR_K8:
23910     case PROCESSOR_AMDFAM10:
23911     case PROCESSOR_BDVER1:
23912     case PROCESSOR_BDVER2:
23913     case PROCESSOR_BTVER1:
23914     case PROCESSOR_ATOM:
23915     case PROCESSOR_GENERIC32:
23916     case PROCESSOR_GENERIC64:
23917       memory = get_attr_memory (insn);
23918 
23919       /* Show the ability of the reorder buffer to hide the latency of a load
23920 	 by executing it in parallel with the previous instruction, in case the
23921 	 previous instruction is not needed to compute the address.  */
23922       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23923 	  && !ix86_agi_dependent (dep_insn, insn))
23924 	{
23925 	  enum attr_unit unit = get_attr_unit (insn);
23926 	  int loadcost = 3;
23927 
23928 	  /* Because of the difference between the length of integer and
23929 	     floating unit pipeline preparation stages, the memory operands
23930 	     for floating point are cheaper.
23931 
23932 	     ??? For Athlon the difference is most probably 2.  */
23933 	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23934 	    loadcost = 3;
23935 	  else
23936 	    loadcost = TARGET_ATHLON ? 2 : 0;
23937 
23938 	  if (cost >= loadcost)
23939 	    cost -= loadcost;
23940 	  else
23941 	    cost = 0;
23942 	}
23943 
23944     default:
23945       break;
23946     }
23947 
23948   return cost;
23949 }
23950 
23951 /* How many alternative schedules to try.  This should be as wide as the
23952    scheduling freedom in the DFA, but no wider.  Making this value too
23953    large results in extra work for the scheduler.  */
23954 
23955 static int
23956 ia32_multipass_dfa_lookahead (void)
23957 {
23958   switch (ix86_tune)
23959     {
23960     case PROCESSOR_PENTIUM:
23961       return 2;
23962 
23963     case PROCESSOR_PENTIUMPRO:
23964     case PROCESSOR_K6:
23965       return 1;
23966 
23967     case PROCESSOR_CORE2_32:
23968     case PROCESSOR_CORE2_64:
23969     case PROCESSOR_COREI7_32:
23970     case PROCESSOR_COREI7_64:
23971     case PROCESSOR_ATOM:
23972       /* Generally, we want haifa-sched:max_issue() to look ahead as far
23973 	 as the number of instructions that can be executed in one cycle,
23974 	 i.e., issue_rate.  I wonder why tuning for many CPUs does not do this.  */
23975       return ix86_issue_rate ();
23976 
23977     default:
23978       return 0;
23979     }
23980 }
23981 
23982 
23983 
23984 /* Model the decoder of Core 2/i7.
23985    The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23986    track the instruction fetch block boundaries and make sure that long
23987    (9+ byte) instructions are assigned to D0.  */
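/* Concretely, with the parameters set in ix86_sched_init_global below, at
   most 6 insns totalling at most 16 bytes are accepted per cycle, and an
   insn longer than 8 bytes is only allowed as the first insn of the cycle,
   i.e. on decoder D0.  */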
23988 
23989 /* Maximum length of an insn that can be handled by
23990    a secondary decoder unit.  '8' for Core 2/i7.  */
23991 static int core2i7_secondary_decoder_max_insn_size;
23992 
23993 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23994    '16' for Core 2/i7.  */
23995 static int core2i7_ifetch_block_size;
23996 
23997 /* Maximum number of instructions decoder can handle per cycle.
23998    '6' for Core 2/i7.  */
23999 static int core2i7_ifetch_block_max_insns;
24000 
24001 typedef struct ix86_first_cycle_multipass_data_ *
24002   ix86_first_cycle_multipass_data_t;
24003 typedef const struct ix86_first_cycle_multipass_data_ *
24004   const_ix86_first_cycle_multipass_data_t;
24005 
24006 /* A variable to store target state across calls to max_issue within
24007    one cycle.  */
24008 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24009   *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24010 
24011 /* Initialize DATA.  */
24012 static void
24013 core2i7_first_cycle_multipass_init (void *_data)
24014 {
24015   ix86_first_cycle_multipass_data_t data
24016     = (ix86_first_cycle_multipass_data_t) _data;
24017 
24018   data->ifetch_block_len = 0;
24019   data->ifetch_block_n_insns = 0;
24020   data->ready_try_change = NULL;
24021   data->ready_try_change_size = 0;
24022 }
24023 
24024 /* Advancing the cycle; reset ifetch block counts.  */
24025 static void
24026 core2i7_dfa_post_advance_cycle (void)
24027 {
24028   ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24029 
24030   gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24031 
24032   data->ifetch_block_len = 0;
24033   data->ifetch_block_n_insns = 0;
24034 }
24035 
24036 static int min_insn_size (rtx);
24037 
24038 /* Filter out insns from ready_try that the core will not be able to issue
24039    on the current cycle due to decoder restrictions.  */
24040 static void
24041 core2i7_first_cycle_multipass_filter_ready_try
24042 (const_ix86_first_cycle_multipass_data_t data,
24043  char *ready_try, int n_ready, bool first_cycle_insn_p)
24044 {
24045   while (n_ready--)
24046     {
24047       rtx insn;
24048       int insn_size;
24049 
24050       if (ready_try[n_ready])
24051 	continue;
24052 
24053       insn = get_ready_element (n_ready);
24054       insn_size = min_insn_size (insn);
24055 
24056       if (/* If this insn is too long for a secondary decoder ...  */
24057 	  (!first_cycle_insn_p
24058 	   && insn_size > core2i7_secondary_decoder_max_insn_size)
24059 	  /* ... or it would not fit into the ifetch block ...  */
24060 	  || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24061 	  /* ... or the decoder is full already ...  */
24062 	  || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24063 	/* ... mask the insn out.  */
24064 	{
24065 	  ready_try[n_ready] = 1;
24066 
24067 	  if (data->ready_try_change)
24068 	    SET_BIT (data->ready_try_change, n_ready);
24069 	}
24070     }
24071 }
24072 
24073 /* Prepare for a new round of multipass lookahead scheduling.  */
24074 static void
24075 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24076 				     bool first_cycle_insn_p)
24077 {
24078   ix86_first_cycle_multipass_data_t data
24079     = (ix86_first_cycle_multipass_data_t) _data;
24080   const_ix86_first_cycle_multipass_data_t prev_data
24081     = ix86_first_cycle_multipass_data;
24082 
24083   /* Restore the state from the end of the previous round.  */
24084   data->ifetch_block_len = prev_data->ifetch_block_len;
24085   data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24086 
24087   /* Filter instructions that cannot be issued on current cycle due to
24088      decoder restrictions.  */
24089   core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24090 						  first_cycle_insn_p);
24091 }
24092 
24093 /* INSN is being issued in current solution.  Account for its impact on
24094    the decoder model.  */
24095 static void
24096 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24097 				     rtx insn, const void *_prev_data)
24098 {
24099   ix86_first_cycle_multipass_data_t data
24100     = (ix86_first_cycle_multipass_data_t) _data;
24101   const_ix86_first_cycle_multipass_data_t prev_data
24102     = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24103 
24104   int insn_size = min_insn_size (insn);
24105 
24106   data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24107   data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24108   gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24109 	      && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24110 
24111   /* Allocate or resize the bitmap for storing INSN's effect on ready_try.  */
24112   if (!data->ready_try_change)
24113     {
24114       data->ready_try_change = sbitmap_alloc (n_ready);
24115       data->ready_try_change_size = n_ready;
24116     }
24117   else if (data->ready_try_change_size < n_ready)
24118     {
24119       data->ready_try_change = sbitmap_resize (data->ready_try_change,
24120 					       n_ready, 0);
24121       data->ready_try_change_size = n_ready;
24122     }
24123   sbitmap_zero (data->ready_try_change);
24124 
24125   /* Filter out insns from ready_try that the core will not be able to issue
24126      on the current cycle due to decoder restrictions.  */
24127   core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24128 						  false);
24129 }
24130 
24131 /* Revert the effect on ready_try.  */
24132 static void
24133 core2i7_first_cycle_multipass_backtrack (const void *_data,
24134 					 char *ready_try,
24135 					 int n_ready ATTRIBUTE_UNUSED)
24136 {
24137   const_ix86_first_cycle_multipass_data_t data
24138     = (const_ix86_first_cycle_multipass_data_t) _data;
24139   unsigned int i = 0;
24140   sbitmap_iterator sbi;
24141 
24142   gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24143   EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24144     {
24145       ready_try[i] = 0;
24146     }
24147 }
24148 
24149 /* Save the result of multipass lookahead scheduling for the next round.  */
24150 static void
24151 core2i7_first_cycle_multipass_end (const void *_data)
24152 {
24153   const_ix86_first_cycle_multipass_data_t data
24154     = (const_ix86_first_cycle_multipass_data_t) _data;
24155   ix86_first_cycle_multipass_data_t next_data
24156     = ix86_first_cycle_multipass_data;
24157 
24158   if (data != NULL)
24159     {
24160       next_data->ifetch_block_len = data->ifetch_block_len;
24161       next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24162     }
24163 }
24164 
24165 /* Deallocate target data.  */
24166 static void
24167 core2i7_first_cycle_multipass_fini (void *_data)
24168 {
24169   ix86_first_cycle_multipass_data_t data
24170     = (ix86_first_cycle_multipass_data_t) _data;
24171 
24172   if (data->ready_try_change)
24173     {
24174       sbitmap_free (data->ready_try_change);
24175       data->ready_try_change = NULL;
24176       data->ready_try_change_size = 0;
24177     }
24178 }
24179 
24180 /* Prepare for scheduling pass.  */
24181 static void
24182 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24183 			int verbose ATTRIBUTE_UNUSED,
24184 			int max_uid ATTRIBUTE_UNUSED)
24185 {
24186   /* Install scheduling hooks for current CPU.  Some of these hooks are used
24187      in time-critical parts of the scheduler, so we only set them up when
24188      they are actually used.  */
24189   switch (ix86_tune)
24190     {
24191     case PROCESSOR_CORE2_32:
24192     case PROCESSOR_CORE2_64:
24193     case PROCESSOR_COREI7_32:
24194     case PROCESSOR_COREI7_64:
24195       targetm.sched.dfa_post_advance_cycle
24196 	= core2i7_dfa_post_advance_cycle;
24197       targetm.sched.first_cycle_multipass_init
24198 	= core2i7_first_cycle_multipass_init;
24199       targetm.sched.first_cycle_multipass_begin
24200 	= core2i7_first_cycle_multipass_begin;
24201       targetm.sched.first_cycle_multipass_issue
24202 	= core2i7_first_cycle_multipass_issue;
24203       targetm.sched.first_cycle_multipass_backtrack
24204 	= core2i7_first_cycle_multipass_backtrack;
24205       targetm.sched.first_cycle_multipass_end
24206 	= core2i7_first_cycle_multipass_end;
24207       targetm.sched.first_cycle_multipass_fini
24208 	= core2i7_first_cycle_multipass_fini;
24209 
24210       /* Set decoder parameters.  */
24211       core2i7_secondary_decoder_max_insn_size = 8;
24212       core2i7_ifetch_block_size = 16;
24213       core2i7_ifetch_block_max_insns = 6;
24214       break;
24215 
24216     default:
24217       targetm.sched.dfa_post_advance_cycle = NULL;
24218       targetm.sched.first_cycle_multipass_init = NULL;
24219       targetm.sched.first_cycle_multipass_begin = NULL;
24220       targetm.sched.first_cycle_multipass_issue = NULL;
24221       targetm.sched.first_cycle_multipass_backtrack = NULL;
24222       targetm.sched.first_cycle_multipass_end = NULL;
24223       targetm.sched.first_cycle_multipass_fini = NULL;
24224       break;
24225     }
24226 }
24227 
24228 
24229 /* Compute the alignment given to a constant that is being placed in memory.
24230    EXP is the constant and ALIGN is the alignment that the object would
24231    ordinarily have.
24232    The value of this function is used instead of that alignment to align
24233    the object.  */
24234 
24235 int
24236 ix86_constant_alignment (tree exp, int align)
24237 {
24238   if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24239       || TREE_CODE (exp) == INTEGER_CST)
24240     {
24241       if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24242 	return 64;
24243       else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24244 	return 128;
24245     }
24246   else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24247 	   && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24248     return BITS_PER_WORD;
24249 
24250   return align;
24251 }
24252 
24253 /* Compute the alignment for a static variable.
24254    TYPE is the data type, and ALIGN is the alignment that
24255    the object would ordinarily have.  The value of this function is used
24256    instead of that alignment to align the object.  */
24257 
24258 int
24259 ix86_data_alignment (tree type, int align)
24260 {
24261   int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24262 
24263   if (AGGREGATE_TYPE_P (type)
24264       && TYPE_SIZE (type)
24265       && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24266       && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24267 	  || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24268       && align < max_align)
24269     align = max_align;
24270 
24271   /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24272      to a 16-byte boundary.  */
24273   if (TARGET_64BIT)
24274     {
24275       if (AGGREGATE_TYPE_P (type)
24276 	   && TYPE_SIZE (type)
24277 	   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24278 	   && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24279 	       || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24280 	return 128;
24281     }
24282 
24283   if (TREE_CODE (type) == ARRAY_TYPE)
24284     {
24285       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24286 	return 64;
24287       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24288 	return 128;
24289     }
24290   else if (TREE_CODE (type) == COMPLEX_TYPE)
24291     {
24292 
24293       if (TYPE_MODE (type) == DCmode && align < 64)
24294 	return 64;
24295       if ((TYPE_MODE (type) == XCmode
24296 	   || TYPE_MODE (type) == TCmode) && align < 128)
24297 	return 128;
24298     }
24299   else if ((TREE_CODE (type) == RECORD_TYPE
24300 	    || TREE_CODE (type) == UNION_TYPE
24301 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
24302 	   && TYPE_FIELDS (type))
24303     {
24304       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24305 	return 64;
24306       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24307 	return 128;
24308     }
24309   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24310 	   || TREE_CODE (type) == INTEGER_TYPE)
24311     {
24312       if (TYPE_MODE (type) == DFmode && align < 64)
24313 	return 64;
24314       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24315 	return 128;
24316     }
24317 
24318   return align;
24319 }
24320 
24321 /* Compute the alignment for a local variable or a stack slot.  EXP is
24322    the data type or decl itself, MODE is the widest mode available and
24323    ALIGN is the alignment that the object would ordinarily have.  The
24324    value of this macro is used instead of that alignment to align the
24325    object.  */
24326 
24327 unsigned int
24328 ix86_local_alignment (tree exp, enum machine_mode mode,
24329 		      unsigned int align)
24330 {
24331   tree type, decl;
24332 
24333   if (exp && DECL_P (exp))
24334     {
24335       type = TREE_TYPE (exp);
24336       decl = exp;
24337     }
24338   else
24339     {
24340       type = exp;
24341       decl = NULL;
24342     }
24343 
24344   /* Don't do dynamic stack realignment for long long objects with
24345      -mpreferred-stack-boundary=2.  */
24346   if (!TARGET_64BIT
24347       && align == 64
24348       && ix86_preferred_stack_boundary < 64
24349       && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24350       && (!type || !TYPE_USER_ALIGN (type))
24351       && (!decl || !DECL_USER_ALIGN (decl)))
24352     align = 32;
24353 
24354   /* If TYPE is NULL, we are allocating a stack slot for a caller-save
24355      register in MODE.  We will return the largest alignment of XF
24356      and DF.  */
24357   if (!type)
24358     {
24359       if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24360 	align = GET_MODE_ALIGNMENT (DFmode);
24361       return align;
24362     }
24363 
24364   /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24365      to a 16-byte boundary.  The exact wording is:
24366 
24367      An array uses the same alignment as its elements, except that a local or
24368      global array variable of length at least 16 bytes or
24369      a C99 variable-length array variable always has alignment of at least 16 bytes.
24370 
24371      This was added to allow the use of aligned SSE instructions on arrays.
24372      The rule is meant for static storage (where the compiler cannot do the
24373      analysis by itself).  We follow it for automatic variables only when it
24374      is convenient: we fully control everything in the function being compiled,
24375      and functions from other units cannot rely on the alignment.
24376 
24377      Exclude the va_list type.  It is the common case of a local array where
24378      we cannot benefit from the alignment.  */
24379   if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24380       && TARGET_SSE)
24381     {
24382       if (AGGREGATE_TYPE_P (type)
24383 	   && (va_list_type_node == NULL_TREE
24384 	       || (TYPE_MAIN_VARIANT (type)
24385 		   != TYPE_MAIN_VARIANT (va_list_type_node)))
24386 	   && TYPE_SIZE (type)
24387 	   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24388 	   && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24389 	       || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24390 	return 128;
24391     }
24392   if (TREE_CODE (type) == ARRAY_TYPE)
24393     {
24394       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24395 	return 64;
24396       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24397 	return 128;
24398     }
24399   else if (TREE_CODE (type) == COMPLEX_TYPE)
24400     {
24401       if (TYPE_MODE (type) == DCmode && align < 64)
24402 	return 64;
24403       if ((TYPE_MODE (type) == XCmode
24404 	   || TYPE_MODE (type) == TCmode) && align < 128)
24405 	return 128;
24406     }
24407   else if ((TREE_CODE (type) == RECORD_TYPE
24408 	    || TREE_CODE (type) == UNION_TYPE
24409 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
24410 	   && TYPE_FIELDS (type))
24411     {
24412       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24413 	return 64;
24414       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24415 	return 128;
24416     }
24417   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24418 	   || TREE_CODE (type) == INTEGER_TYPE)
24419     {
24420 
24421       if (TYPE_MODE (type) == DFmode && align < 64)
24422 	return 64;
24423       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24424 	return 128;
24425     }
24426   return align;
24427 }
24428 
24429 /* Compute the minimum required alignment for dynamic stack realignment
24430    purposes for a local variable, parameter or a stack slot.  EXP is
24431    the data type or decl itself, MODE is its mode and ALIGN is the
24432    alignment that the object would ordinarily have.  */
24433 
24434 unsigned int
24435 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24436 			unsigned int align)
24437 {
24438   tree type, decl;
24439 
24440   if (exp && DECL_P (exp))
24441     {
24442       type = TREE_TYPE (exp);
24443       decl = exp;
24444     }
24445   else
24446     {
24447       type = exp;
24448       decl = NULL;
24449     }
24450 
24451   if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24452     return align;
24453 
24454   /* Don't do dynamic stack realignment for long long objects with
24455      -mpreferred-stack-boundary=2.  */
24456   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24457       && (!type || !TYPE_USER_ALIGN (type))
24458       && (!decl || !DECL_USER_ALIGN (decl)))
24459     return 32;
24460 
24461   return align;
24462 }
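/* Illustrative example (editorial addition): with -m32
   -mpreferred-stack-boundary=2, a local "long long" (DImode) would
   ordinarily ask for 64-bit alignment and trigger dynamic stack
   realignment; the function above instead reports 32 bits for it,
   unless the type or the decl carries a user-specified alignment.  */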
24463 
24464 /* Find a location for the static chain incoming to a nested function.
24465    This is a register, unless all free registers are used by arguments.  */
24466 
24467 static rtx
24468 ix86_static_chain (const_tree fndecl, bool incoming_p)
24469 {
24470   unsigned regno;
24471 
24472   if (!DECL_STATIC_CHAIN (fndecl))
24473     return NULL;
24474 
24475   if (TARGET_64BIT)
24476     {
24477       /* We always use R10 in 64-bit mode.  */
24478       regno = R10_REG;
24479     }
24480   else
24481     {
24482       tree fntype;
24483       unsigned int ccvt;
24484 
24485       /* By default in 32-bit mode we use ECX to pass the static chain.  */
24486       regno = CX_REG;
24487 
24488       fntype = TREE_TYPE (fndecl);
24489       ccvt = ix86_get_callcvt (fntype);
24490       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
24491 	{
24492 	  /* Fastcall functions use ecx/edx for arguments, which leaves
24493 	     us with EAX for the static chain.
24494 	     Thiscall functions use ecx for arguments, which also
24495 	     leaves us with EAX for the static chain.  */
24496 	  regno = AX_REG;
24497 	}
24498       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
24499 	{
24500 	  /* Thiscall functions use ecx for arguments, which leaves
24501 	     us with EAX and EDX for the static chain.
24502 	     We use EAX for ABI compatibility.  */
24503 	  regno = AX_REG;
24504 	}
24505       else if (ix86_function_regparm (fntype, fndecl) == 3)
24506 	{
24507 	  /* For regparm 3, we have no free call-clobbered registers in
24508 	     which to store the static chain.  In order to implement this,
24509 	     we have the trampoline push the static chain to the stack.
24510 	     However, we can't push a value below the return address when
24511 	     we call the nested function directly, so we have to use an
24512 	     alternate entry point.  For this we use ESI, and have the
24513 	     alternate entry point push ESI, so that things appear the
24514 	     same once we're executing the nested function.  */
24515 	  if (incoming_p)
24516 	    {
24517 	      if (fndecl == current_function_decl)
24518 		ix86_static_chain_on_stack = true;
24519 	      return gen_frame_mem (SImode,
24520 				    plus_constant (arg_pointer_rtx, -8));
24521 	    }
24522 	  regno = SI_REG;
24523 	}
24524     }
24525 
24526   return gen_rtx_REG (Pmode, regno);
24527 }
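/* Static chain locations chosen above, for reference (editorial note):
     64-bit:                      R10
     32-bit, default conventions: ECX
     32-bit, fastcall/thiscall:   EAX
     32-bit, regparm(3):          no free register; the trampoline
                                  pushes the chain, and the incoming
                                  side reads the stack slot at
                                  arg_pointer - 8 (ESI is used by the
                                  alternate entry point).  */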
24528 
24529 /* Emit RTL insns to initialize the variable parts of a trampoline.
24530    FNDECL is the decl of the target address; M_TRAMP is a MEM for
24531    the trampoline, and CHAIN_VALUE is an RTX for the static chain
24532    to be passed to the target function.  */
24533 
24534 static void
24535 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24536 {
24537   rtx mem, fnaddr;
24538   int opcode;
24539   int offset = 0;
24540 
24541   fnaddr = XEXP (DECL_RTL (fndecl), 0);
24542 
24543   if (TARGET_64BIT)
24544     {
24545       int size;
24546 
24547       /* Load the function address into r11.  Try to load the address
24548 	 using the shorter movl instead of movabs.  We may want to support
24549 	 movq for kernel mode, but the kernel does not use trampolines at
24550 	 the moment.  */
24551       if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24552 	{
24553 	  fnaddr = copy_to_mode_reg (DImode, fnaddr);
24554 
24555 	  mem = adjust_address (m_tramp, HImode, offset);
24556 	  emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24557 
24558 	  mem = adjust_address (m_tramp, SImode, offset + 2);
24559 	  emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24560 	  offset += 6;
24561 	}
24562       else
24563 	{
24564 	  mem = adjust_address (m_tramp, HImode, offset);
24565 	  emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24566 
24567 	  mem = adjust_address (m_tramp, DImode, offset + 2);
24568 	  emit_move_insn (mem, fnaddr);
24569 	  offset += 10;
24570 	}
24571 
24572       /* Load the static chain into r10 using movabs.  Use the
24573 	 shorter movl instead of movabs for x32.  */
24574       if (TARGET_X32)
24575 	{
24576 	  opcode = 0xba41;
24577 	  size = 6;
24578 	}
24579       else
24580 	{
24581 	  opcode = 0xba49;
24582 	  size = 10;
24583 	}
24584 
24585       mem = adjust_address (m_tramp, HImode, offset);
24586       emit_move_insn (mem, gen_int_mode (opcode, HImode));
24587 
24588       mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24589       emit_move_insn (mem, chain_value);
24590       offset += size;
24591 
24592       /* Jump to r11; the last (unused) byte is a nop, only there to
24593 	 pad the write out to a single 32-bit store.  */
24594       mem = adjust_address (m_tramp, SImode, offset);
24595       emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24596       offset += 4;
24597     }
24598   else
24599     {
24600       rtx disp, chain;
24601 
24602       /* Depending on the static chain location, either load a register
24603 	 with a constant, or push the constant to the stack.  All of the
24604 	 instructions are the same size.  */
24605       chain = ix86_static_chain (fndecl, true);
24606       if (REG_P (chain))
24607 	{
24608 	  switch (REGNO (chain))
24609 	    {
24610 	    case AX_REG:
24611 	      opcode = 0xb8; break;
24612 	    case CX_REG:
24613 	      opcode = 0xb9; break;
24614 	    default:
24615 	      gcc_unreachable ();
24616 	    }
24617 	}
24618       else
24619 	opcode = 0x68;
24620 
24621       mem = adjust_address (m_tramp, QImode, offset);
24622       emit_move_insn (mem, gen_int_mode (opcode, QImode));
24623 
24624       mem = adjust_address (m_tramp, SImode, offset + 1);
24625       emit_move_insn (mem, chain_value);
24626       offset += 5;
24627 
24628       mem = adjust_address (m_tramp, QImode, offset);
24629       emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24630 
24631       mem = adjust_address (m_tramp, SImode, offset + 1);
24632 
24633       /* Compute the offset from the end of the jmp to the target function.
24634 	 When the trampoline stores the static chain on the stack, we need
24635 	 to skip the first insn, which pushes the (call-saved) register
24636 	 static chain; this push is 1 byte.  */
24637       offset += 5;
24638       disp = expand_binop (SImode, sub_optab, fnaddr,
24639 			   plus_constant (XEXP (m_tramp, 0),
24640 					  offset - (MEM_P (chain) ? 1 : 0)),
24641 			   NULL_RTX, 1, OPTAB_DIRECT);
24642       emit_move_insn (mem, disp);
24643     }
24644 
24645   gcc_assert (offset <= TRAMPOLINE_SIZE);
24646 
24647 #ifdef HAVE_ENABLE_EXECUTE_STACK
24648 #ifdef CHECK_EXECUTE_STACK_ENABLED
24649   if (CHECK_EXECUTE_STACK_ENABLED)
24650 #endif
24651   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24652 		     LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24653 #endif
24654 }
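/* Byte layout of the trampolines emitted above (editorial sketch;
   bytes are shown in memory order, immediates are little-endian):

   64-bit, full 64-bit target address:
     49 bb <fnaddr:8>   movabs $fnaddr, %r11
     49 ba <chain:8>    movabs $chain,  %r10
     49 ff e3 90        rex.WB jmp *%r11; nop       (24 bytes)

   64-bit, zero-extendable 32-bit target (and x32 for the chain):
     41 bb <fnaddr:4>   movl $fnaddr, %r11d
     41 ba <chain:4>    movl $chain,  %r10d
     49 ff e3 90        rex.WB jmp *%r11; nop       (16 bytes)

   32-bit:
     b8/b9/68 <chain:4> movl $chain, %eax/%ecx, or pushl $chain
     e9 <rel32:4>       jmp fnaddr                  (10 bytes)  */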
24655 
24656 /* The following file contains several enumerations and data structures
24657    built from the definitions in i386-builtin-types.def.  */
24658 
24659 #include "i386-builtin-types.inc"
24660 
24661 /* Table for the ix86 builtin non-function types.  */
24662 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24663 
24664 /* Retrieve an element from the above table, building some of
24665    the types lazily.  */
24666 
24667 static tree
24668 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24669 {
24670   unsigned int index;
24671   tree type, itype;
24672 
24673   gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24674 
24675   type = ix86_builtin_type_tab[(int) tcode];
24676   if (type != NULL)
24677     return type;
24678 
24679   gcc_assert (tcode > IX86_BT_LAST_PRIM);
24680   if (tcode <= IX86_BT_LAST_VECT)
24681     {
24682       enum machine_mode mode;
24683 
24684       index = tcode - IX86_BT_LAST_PRIM - 1;
24685       itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24686       mode = ix86_builtin_type_vect_mode[index];
24687 
24688       type = build_vector_type_for_mode (itype, mode);
24689     }
24690   else
24691     {
24692       int quals;
24693 
24694       index = tcode - IX86_BT_LAST_VECT - 1;
24695       if (tcode <= IX86_BT_LAST_PTR)
24696 	quals = TYPE_UNQUALIFIED;
24697       else
24698 	quals = TYPE_QUAL_CONST;
24699 
24700       itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24701       if (quals != TYPE_UNQUALIFIED)
24702 	itype = build_qualified_type (itype, quals);
24703 
24704       type = build_pointer_type (itype);
24705     }
24706 
24707   ix86_builtin_type_tab[(int) tcode] = type;
24708   return type;
24709 }
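/* Illustrative example (editorial addition): the first request for a
   vector type code such as IX86_BT_V4SF finds its element type
   (IX86_BT_FLOAT) and machine mode (V4SFmode) in the tables generated
   from i386-builtin-types.def, builds the tree with
   build_vector_type_for_mode, and caches the result in
   ix86_builtin_type_tab so later lookups return it directly.  */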
24710 
24711 /* Table for the ix86 builtin function types.  */
24712 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24713 
24714 /* Retrieve an element from the above table, building some of
24715    the types lazily.  */
24716 
24717 static tree
24718 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24719 {
24720   tree type;
24721 
24722   gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24723 
24724   type = ix86_builtin_func_type_tab[(int) tcode];
24725   if (type != NULL)
24726     return type;
24727 
24728   if (tcode <= IX86_BT_LAST_FUNC)
24729     {
24730       unsigned start = ix86_builtin_func_start[(int) tcode];
24731       unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24732       tree rtype, atype, args = void_list_node;
24733       unsigned i;
24734 
24735       rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24736       for (i = after - 1; i > start; --i)
24737 	{
24738 	  atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24739 	  args = tree_cons (NULL, atype, args);
24740 	}
24741 
24742       type = build_function_type (rtype, args);
24743     }
24744   else
24745     {
24746       unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24747       enum ix86_builtin_func_type icode;
24748 
24749       icode = ix86_builtin_func_alias_base[index];
24750       type = ix86_get_builtin_func_type (icode);
24751     }
24752 
24753   ix86_builtin_func_type_tab[(int) tcode] = type;
24754   return type;
24755 }
24756 
24757 
24758 /* Codes for all the SSE/MMX builtins.  */
24759 enum ix86_builtins
24760 {
24761   IX86_BUILTIN_ADDPS,
24762   IX86_BUILTIN_ADDSS,
24763   IX86_BUILTIN_DIVPS,
24764   IX86_BUILTIN_DIVSS,
24765   IX86_BUILTIN_MULPS,
24766   IX86_BUILTIN_MULSS,
24767   IX86_BUILTIN_SUBPS,
24768   IX86_BUILTIN_SUBSS,
24769 
24770   IX86_BUILTIN_CMPEQPS,
24771   IX86_BUILTIN_CMPLTPS,
24772   IX86_BUILTIN_CMPLEPS,
24773   IX86_BUILTIN_CMPGTPS,
24774   IX86_BUILTIN_CMPGEPS,
24775   IX86_BUILTIN_CMPNEQPS,
24776   IX86_BUILTIN_CMPNLTPS,
24777   IX86_BUILTIN_CMPNLEPS,
24778   IX86_BUILTIN_CMPNGTPS,
24779   IX86_BUILTIN_CMPNGEPS,
24780   IX86_BUILTIN_CMPORDPS,
24781   IX86_BUILTIN_CMPUNORDPS,
24782   IX86_BUILTIN_CMPEQSS,
24783   IX86_BUILTIN_CMPLTSS,
24784   IX86_BUILTIN_CMPLESS,
24785   IX86_BUILTIN_CMPNEQSS,
24786   IX86_BUILTIN_CMPNLTSS,
24787   IX86_BUILTIN_CMPNLESS,
24788   IX86_BUILTIN_CMPNGTSS,
24789   IX86_BUILTIN_CMPNGESS,
24790   IX86_BUILTIN_CMPORDSS,
24791   IX86_BUILTIN_CMPUNORDSS,
24792 
24793   IX86_BUILTIN_COMIEQSS,
24794   IX86_BUILTIN_COMILTSS,
24795   IX86_BUILTIN_COMILESS,
24796   IX86_BUILTIN_COMIGTSS,
24797   IX86_BUILTIN_COMIGESS,
24798   IX86_BUILTIN_COMINEQSS,
24799   IX86_BUILTIN_UCOMIEQSS,
24800   IX86_BUILTIN_UCOMILTSS,
24801   IX86_BUILTIN_UCOMILESS,
24802   IX86_BUILTIN_UCOMIGTSS,
24803   IX86_BUILTIN_UCOMIGESS,
24804   IX86_BUILTIN_UCOMINEQSS,
24805 
24806   IX86_BUILTIN_CVTPI2PS,
24807   IX86_BUILTIN_CVTPS2PI,
24808   IX86_BUILTIN_CVTSI2SS,
24809   IX86_BUILTIN_CVTSI642SS,
24810   IX86_BUILTIN_CVTSS2SI,
24811   IX86_BUILTIN_CVTSS2SI64,
24812   IX86_BUILTIN_CVTTPS2PI,
24813   IX86_BUILTIN_CVTTSS2SI,
24814   IX86_BUILTIN_CVTTSS2SI64,
24815 
24816   IX86_BUILTIN_MAXPS,
24817   IX86_BUILTIN_MAXSS,
24818   IX86_BUILTIN_MINPS,
24819   IX86_BUILTIN_MINSS,
24820 
24821   IX86_BUILTIN_LOADUPS,
24822   IX86_BUILTIN_STOREUPS,
24823   IX86_BUILTIN_MOVSS,
24824 
24825   IX86_BUILTIN_MOVHLPS,
24826   IX86_BUILTIN_MOVLHPS,
24827   IX86_BUILTIN_LOADHPS,
24828   IX86_BUILTIN_LOADLPS,
24829   IX86_BUILTIN_STOREHPS,
24830   IX86_BUILTIN_STORELPS,
24831 
24832   IX86_BUILTIN_MASKMOVQ,
24833   IX86_BUILTIN_MOVMSKPS,
24834   IX86_BUILTIN_PMOVMSKB,
24835 
24836   IX86_BUILTIN_MOVNTPS,
24837   IX86_BUILTIN_MOVNTQ,
24838 
24839   IX86_BUILTIN_LOADDQU,
24840   IX86_BUILTIN_STOREDQU,
24841 
24842   IX86_BUILTIN_PACKSSWB,
24843   IX86_BUILTIN_PACKSSDW,
24844   IX86_BUILTIN_PACKUSWB,
24845 
24846   IX86_BUILTIN_PADDB,
24847   IX86_BUILTIN_PADDW,
24848   IX86_BUILTIN_PADDD,
24849   IX86_BUILTIN_PADDQ,
24850   IX86_BUILTIN_PADDSB,
24851   IX86_BUILTIN_PADDSW,
24852   IX86_BUILTIN_PADDUSB,
24853   IX86_BUILTIN_PADDUSW,
24854   IX86_BUILTIN_PSUBB,
24855   IX86_BUILTIN_PSUBW,
24856   IX86_BUILTIN_PSUBD,
24857   IX86_BUILTIN_PSUBQ,
24858   IX86_BUILTIN_PSUBSB,
24859   IX86_BUILTIN_PSUBSW,
24860   IX86_BUILTIN_PSUBUSB,
24861   IX86_BUILTIN_PSUBUSW,
24862 
24863   IX86_BUILTIN_PAND,
24864   IX86_BUILTIN_PANDN,
24865   IX86_BUILTIN_POR,
24866   IX86_BUILTIN_PXOR,
24867 
24868   IX86_BUILTIN_PAVGB,
24869   IX86_BUILTIN_PAVGW,
24870 
24871   IX86_BUILTIN_PCMPEQB,
24872   IX86_BUILTIN_PCMPEQW,
24873   IX86_BUILTIN_PCMPEQD,
24874   IX86_BUILTIN_PCMPGTB,
24875   IX86_BUILTIN_PCMPGTW,
24876   IX86_BUILTIN_PCMPGTD,
24877 
24878   IX86_BUILTIN_PMADDWD,
24879 
24880   IX86_BUILTIN_PMAXSW,
24881   IX86_BUILTIN_PMAXUB,
24882   IX86_BUILTIN_PMINSW,
24883   IX86_BUILTIN_PMINUB,
24884 
24885   IX86_BUILTIN_PMULHUW,
24886   IX86_BUILTIN_PMULHW,
24887   IX86_BUILTIN_PMULLW,
24888 
24889   IX86_BUILTIN_PSADBW,
24890   IX86_BUILTIN_PSHUFW,
24891 
24892   IX86_BUILTIN_PSLLW,
24893   IX86_BUILTIN_PSLLD,
24894   IX86_BUILTIN_PSLLQ,
24895   IX86_BUILTIN_PSRAW,
24896   IX86_BUILTIN_PSRAD,
24897   IX86_BUILTIN_PSRLW,
24898   IX86_BUILTIN_PSRLD,
24899   IX86_BUILTIN_PSRLQ,
24900   IX86_BUILTIN_PSLLWI,
24901   IX86_BUILTIN_PSLLDI,
24902   IX86_BUILTIN_PSLLQI,
24903   IX86_BUILTIN_PSRAWI,
24904   IX86_BUILTIN_PSRADI,
24905   IX86_BUILTIN_PSRLWI,
24906   IX86_BUILTIN_PSRLDI,
24907   IX86_BUILTIN_PSRLQI,
24908 
24909   IX86_BUILTIN_PUNPCKHBW,
24910   IX86_BUILTIN_PUNPCKHWD,
24911   IX86_BUILTIN_PUNPCKHDQ,
24912   IX86_BUILTIN_PUNPCKLBW,
24913   IX86_BUILTIN_PUNPCKLWD,
24914   IX86_BUILTIN_PUNPCKLDQ,
24915 
24916   IX86_BUILTIN_SHUFPS,
24917 
24918   IX86_BUILTIN_RCPPS,
24919   IX86_BUILTIN_RCPSS,
24920   IX86_BUILTIN_RSQRTPS,
24921   IX86_BUILTIN_RSQRTPS_NR,
24922   IX86_BUILTIN_RSQRTSS,
24923   IX86_BUILTIN_RSQRTF,
24924   IX86_BUILTIN_SQRTPS,
24925   IX86_BUILTIN_SQRTPS_NR,
24926   IX86_BUILTIN_SQRTSS,
24927 
24928   IX86_BUILTIN_UNPCKHPS,
24929   IX86_BUILTIN_UNPCKLPS,
24930 
24931   IX86_BUILTIN_ANDPS,
24932   IX86_BUILTIN_ANDNPS,
24933   IX86_BUILTIN_ORPS,
24934   IX86_BUILTIN_XORPS,
24935 
24936   IX86_BUILTIN_EMMS,
24937   IX86_BUILTIN_LDMXCSR,
24938   IX86_BUILTIN_STMXCSR,
24939   IX86_BUILTIN_SFENCE,
24940 
24941   /* 3DNow! Original */
24942   IX86_BUILTIN_FEMMS,
24943   IX86_BUILTIN_PAVGUSB,
24944   IX86_BUILTIN_PF2ID,
24945   IX86_BUILTIN_PFACC,
24946   IX86_BUILTIN_PFADD,
24947   IX86_BUILTIN_PFCMPEQ,
24948   IX86_BUILTIN_PFCMPGE,
24949   IX86_BUILTIN_PFCMPGT,
24950   IX86_BUILTIN_PFMAX,
24951   IX86_BUILTIN_PFMIN,
24952   IX86_BUILTIN_PFMUL,
24953   IX86_BUILTIN_PFRCP,
24954   IX86_BUILTIN_PFRCPIT1,
24955   IX86_BUILTIN_PFRCPIT2,
24956   IX86_BUILTIN_PFRSQIT1,
24957   IX86_BUILTIN_PFRSQRT,
24958   IX86_BUILTIN_PFSUB,
24959   IX86_BUILTIN_PFSUBR,
24960   IX86_BUILTIN_PI2FD,
24961   IX86_BUILTIN_PMULHRW,
24962 
24963   /* 3DNow! Athlon Extensions */
24964   IX86_BUILTIN_PF2IW,
24965   IX86_BUILTIN_PFNACC,
24966   IX86_BUILTIN_PFPNACC,
24967   IX86_BUILTIN_PI2FW,
24968   IX86_BUILTIN_PSWAPDSI,
24969   IX86_BUILTIN_PSWAPDSF,
24970 
24971   /* SSE2 */
24972   IX86_BUILTIN_ADDPD,
24973   IX86_BUILTIN_ADDSD,
24974   IX86_BUILTIN_DIVPD,
24975   IX86_BUILTIN_DIVSD,
24976   IX86_BUILTIN_MULPD,
24977   IX86_BUILTIN_MULSD,
24978   IX86_BUILTIN_SUBPD,
24979   IX86_BUILTIN_SUBSD,
24980 
24981   IX86_BUILTIN_CMPEQPD,
24982   IX86_BUILTIN_CMPLTPD,
24983   IX86_BUILTIN_CMPLEPD,
24984   IX86_BUILTIN_CMPGTPD,
24985   IX86_BUILTIN_CMPGEPD,
24986   IX86_BUILTIN_CMPNEQPD,
24987   IX86_BUILTIN_CMPNLTPD,
24988   IX86_BUILTIN_CMPNLEPD,
24989   IX86_BUILTIN_CMPNGTPD,
24990   IX86_BUILTIN_CMPNGEPD,
24991   IX86_BUILTIN_CMPORDPD,
24992   IX86_BUILTIN_CMPUNORDPD,
24993   IX86_BUILTIN_CMPEQSD,
24994   IX86_BUILTIN_CMPLTSD,
24995   IX86_BUILTIN_CMPLESD,
24996   IX86_BUILTIN_CMPNEQSD,
24997   IX86_BUILTIN_CMPNLTSD,
24998   IX86_BUILTIN_CMPNLESD,
24999   IX86_BUILTIN_CMPORDSD,
25000   IX86_BUILTIN_CMPUNORDSD,
25001 
25002   IX86_BUILTIN_COMIEQSD,
25003   IX86_BUILTIN_COMILTSD,
25004   IX86_BUILTIN_COMILESD,
25005   IX86_BUILTIN_COMIGTSD,
25006   IX86_BUILTIN_COMIGESD,
25007   IX86_BUILTIN_COMINEQSD,
25008   IX86_BUILTIN_UCOMIEQSD,
25009   IX86_BUILTIN_UCOMILTSD,
25010   IX86_BUILTIN_UCOMILESD,
25011   IX86_BUILTIN_UCOMIGTSD,
25012   IX86_BUILTIN_UCOMIGESD,
25013   IX86_BUILTIN_UCOMINEQSD,
25014 
25015   IX86_BUILTIN_MAXPD,
25016   IX86_BUILTIN_MAXSD,
25017   IX86_BUILTIN_MINPD,
25018   IX86_BUILTIN_MINSD,
25019 
25020   IX86_BUILTIN_ANDPD,
25021   IX86_BUILTIN_ANDNPD,
25022   IX86_BUILTIN_ORPD,
25023   IX86_BUILTIN_XORPD,
25024 
25025   IX86_BUILTIN_SQRTPD,
25026   IX86_BUILTIN_SQRTSD,
25027 
25028   IX86_BUILTIN_UNPCKHPD,
25029   IX86_BUILTIN_UNPCKLPD,
25030 
25031   IX86_BUILTIN_SHUFPD,
25032 
25033   IX86_BUILTIN_LOADUPD,
25034   IX86_BUILTIN_STOREUPD,
25035   IX86_BUILTIN_MOVSD,
25036 
25037   IX86_BUILTIN_LOADHPD,
25038   IX86_BUILTIN_LOADLPD,
25039 
25040   IX86_BUILTIN_CVTDQ2PD,
25041   IX86_BUILTIN_CVTDQ2PS,
25042 
25043   IX86_BUILTIN_CVTPD2DQ,
25044   IX86_BUILTIN_CVTPD2PI,
25045   IX86_BUILTIN_CVTPD2PS,
25046   IX86_BUILTIN_CVTTPD2DQ,
25047   IX86_BUILTIN_CVTTPD2PI,
25048 
25049   IX86_BUILTIN_CVTPI2PD,
25050   IX86_BUILTIN_CVTSI2SD,
25051   IX86_BUILTIN_CVTSI642SD,
25052 
25053   IX86_BUILTIN_CVTSD2SI,
25054   IX86_BUILTIN_CVTSD2SI64,
25055   IX86_BUILTIN_CVTSD2SS,
25056   IX86_BUILTIN_CVTSS2SD,
25057   IX86_BUILTIN_CVTTSD2SI,
25058   IX86_BUILTIN_CVTTSD2SI64,
25059 
25060   IX86_BUILTIN_CVTPS2DQ,
25061   IX86_BUILTIN_CVTPS2PD,
25062   IX86_BUILTIN_CVTTPS2DQ,
25063 
25064   IX86_BUILTIN_MOVNTI,
25065   IX86_BUILTIN_MOVNTI64,
25066   IX86_BUILTIN_MOVNTPD,
25067   IX86_BUILTIN_MOVNTDQ,
25068 
25069   IX86_BUILTIN_MOVQ128,
25070 
25071   /* SSE2 MMX */
25072   IX86_BUILTIN_MASKMOVDQU,
25073   IX86_BUILTIN_MOVMSKPD,
25074   IX86_BUILTIN_PMOVMSKB128,
25075 
25076   IX86_BUILTIN_PACKSSWB128,
25077   IX86_BUILTIN_PACKSSDW128,
25078   IX86_BUILTIN_PACKUSWB128,
25079 
25080   IX86_BUILTIN_PADDB128,
25081   IX86_BUILTIN_PADDW128,
25082   IX86_BUILTIN_PADDD128,
25083   IX86_BUILTIN_PADDQ128,
25084   IX86_BUILTIN_PADDSB128,
25085   IX86_BUILTIN_PADDSW128,
25086   IX86_BUILTIN_PADDUSB128,
25087   IX86_BUILTIN_PADDUSW128,
25088   IX86_BUILTIN_PSUBB128,
25089   IX86_BUILTIN_PSUBW128,
25090   IX86_BUILTIN_PSUBD128,
25091   IX86_BUILTIN_PSUBQ128,
25092   IX86_BUILTIN_PSUBSB128,
25093   IX86_BUILTIN_PSUBSW128,
25094   IX86_BUILTIN_PSUBUSB128,
25095   IX86_BUILTIN_PSUBUSW128,
25096 
25097   IX86_BUILTIN_PAND128,
25098   IX86_BUILTIN_PANDN128,
25099   IX86_BUILTIN_POR128,
25100   IX86_BUILTIN_PXOR128,
25101 
25102   IX86_BUILTIN_PAVGB128,
25103   IX86_BUILTIN_PAVGW128,
25104 
25105   IX86_BUILTIN_PCMPEQB128,
25106   IX86_BUILTIN_PCMPEQW128,
25107   IX86_BUILTIN_PCMPEQD128,
25108   IX86_BUILTIN_PCMPGTB128,
25109   IX86_BUILTIN_PCMPGTW128,
25110   IX86_BUILTIN_PCMPGTD128,
25111 
25112   IX86_BUILTIN_PMADDWD128,
25113 
25114   IX86_BUILTIN_PMAXSW128,
25115   IX86_BUILTIN_PMAXUB128,
25116   IX86_BUILTIN_PMINSW128,
25117   IX86_BUILTIN_PMINUB128,
25118 
25119   IX86_BUILTIN_PMULUDQ,
25120   IX86_BUILTIN_PMULUDQ128,
25121   IX86_BUILTIN_PMULHUW128,
25122   IX86_BUILTIN_PMULHW128,
25123   IX86_BUILTIN_PMULLW128,
25124 
25125   IX86_BUILTIN_PSADBW128,
25126   IX86_BUILTIN_PSHUFHW,
25127   IX86_BUILTIN_PSHUFLW,
25128   IX86_BUILTIN_PSHUFD,
25129 
25130   IX86_BUILTIN_PSLLDQI128,
25131   IX86_BUILTIN_PSLLWI128,
25132   IX86_BUILTIN_PSLLDI128,
25133   IX86_BUILTIN_PSLLQI128,
25134   IX86_BUILTIN_PSRAWI128,
25135   IX86_BUILTIN_PSRADI128,
25136   IX86_BUILTIN_PSRLDQI128,
25137   IX86_BUILTIN_PSRLWI128,
25138   IX86_BUILTIN_PSRLDI128,
25139   IX86_BUILTIN_PSRLQI128,
25140 
25141   IX86_BUILTIN_PSLLDQ128,
25142   IX86_BUILTIN_PSLLW128,
25143   IX86_BUILTIN_PSLLD128,
25144   IX86_BUILTIN_PSLLQ128,
25145   IX86_BUILTIN_PSRAW128,
25146   IX86_BUILTIN_PSRAD128,
25147   IX86_BUILTIN_PSRLW128,
25148   IX86_BUILTIN_PSRLD128,
25149   IX86_BUILTIN_PSRLQ128,
25150 
25151   IX86_BUILTIN_PUNPCKHBW128,
25152   IX86_BUILTIN_PUNPCKHWD128,
25153   IX86_BUILTIN_PUNPCKHDQ128,
25154   IX86_BUILTIN_PUNPCKHQDQ128,
25155   IX86_BUILTIN_PUNPCKLBW128,
25156   IX86_BUILTIN_PUNPCKLWD128,
25157   IX86_BUILTIN_PUNPCKLDQ128,
25158   IX86_BUILTIN_PUNPCKLQDQ128,
25159 
25160   IX86_BUILTIN_CLFLUSH,
25161   IX86_BUILTIN_MFENCE,
25162   IX86_BUILTIN_LFENCE,
25163   IX86_BUILTIN_PAUSE,
25164 
25165   IX86_BUILTIN_BSRSI,
25166   IX86_BUILTIN_BSRDI,
25167   IX86_BUILTIN_RDPMC,
25168   IX86_BUILTIN_RDTSC,
25169   IX86_BUILTIN_RDTSCP,
25170   IX86_BUILTIN_ROLQI,
25171   IX86_BUILTIN_ROLHI,
25172   IX86_BUILTIN_RORQI,
25173   IX86_BUILTIN_RORHI,
25174 
25175   /* SSE3.  */
25176   IX86_BUILTIN_ADDSUBPS,
25177   IX86_BUILTIN_HADDPS,
25178   IX86_BUILTIN_HSUBPS,
25179   IX86_BUILTIN_MOVSHDUP,
25180   IX86_BUILTIN_MOVSLDUP,
25181   IX86_BUILTIN_ADDSUBPD,
25182   IX86_BUILTIN_HADDPD,
25183   IX86_BUILTIN_HSUBPD,
25184   IX86_BUILTIN_LDDQU,
25185 
25186   IX86_BUILTIN_MONITOR,
25187   IX86_BUILTIN_MWAIT,
25188 
25189   /* SSSE3.  */
25190   IX86_BUILTIN_PHADDW,
25191   IX86_BUILTIN_PHADDD,
25192   IX86_BUILTIN_PHADDSW,
25193   IX86_BUILTIN_PHSUBW,
25194   IX86_BUILTIN_PHSUBD,
25195   IX86_BUILTIN_PHSUBSW,
25196   IX86_BUILTIN_PMADDUBSW,
25197   IX86_BUILTIN_PMULHRSW,
25198   IX86_BUILTIN_PSHUFB,
25199   IX86_BUILTIN_PSIGNB,
25200   IX86_BUILTIN_PSIGNW,
25201   IX86_BUILTIN_PSIGND,
25202   IX86_BUILTIN_PALIGNR,
25203   IX86_BUILTIN_PABSB,
25204   IX86_BUILTIN_PABSW,
25205   IX86_BUILTIN_PABSD,
25206 
25207   IX86_BUILTIN_PHADDW128,
25208   IX86_BUILTIN_PHADDD128,
25209   IX86_BUILTIN_PHADDSW128,
25210   IX86_BUILTIN_PHSUBW128,
25211   IX86_BUILTIN_PHSUBD128,
25212   IX86_BUILTIN_PHSUBSW128,
25213   IX86_BUILTIN_PMADDUBSW128,
25214   IX86_BUILTIN_PMULHRSW128,
25215   IX86_BUILTIN_PSHUFB128,
25216   IX86_BUILTIN_PSIGNB128,
25217   IX86_BUILTIN_PSIGNW128,
25218   IX86_BUILTIN_PSIGND128,
25219   IX86_BUILTIN_PALIGNR128,
25220   IX86_BUILTIN_PABSB128,
25221   IX86_BUILTIN_PABSW128,
25222   IX86_BUILTIN_PABSD128,
25223 
25224   /* AMDFAM10 - SSE4A New Instructions.  */
25225   IX86_BUILTIN_MOVNTSD,
25226   IX86_BUILTIN_MOVNTSS,
25227   IX86_BUILTIN_EXTRQI,
25228   IX86_BUILTIN_EXTRQ,
25229   IX86_BUILTIN_INSERTQI,
25230   IX86_BUILTIN_INSERTQ,
25231 
25232   /* SSE4.1.  */
25233   IX86_BUILTIN_BLENDPD,
25234   IX86_BUILTIN_BLENDPS,
25235   IX86_BUILTIN_BLENDVPD,
25236   IX86_BUILTIN_BLENDVPS,
25237   IX86_BUILTIN_PBLENDVB128,
25238   IX86_BUILTIN_PBLENDW128,
25239 
25240   IX86_BUILTIN_DPPD,
25241   IX86_BUILTIN_DPPS,
25242 
25243   IX86_BUILTIN_INSERTPS128,
25244 
25245   IX86_BUILTIN_MOVNTDQA,
25246   IX86_BUILTIN_MPSADBW128,
25247   IX86_BUILTIN_PACKUSDW128,
25248   IX86_BUILTIN_PCMPEQQ,
25249   IX86_BUILTIN_PHMINPOSUW128,
25250 
25251   IX86_BUILTIN_PMAXSB128,
25252   IX86_BUILTIN_PMAXSD128,
25253   IX86_BUILTIN_PMAXUD128,
25254   IX86_BUILTIN_PMAXUW128,
25255 
25256   IX86_BUILTIN_PMINSB128,
25257   IX86_BUILTIN_PMINSD128,
25258   IX86_BUILTIN_PMINUD128,
25259   IX86_BUILTIN_PMINUW128,
25260 
25261   IX86_BUILTIN_PMOVSXBW128,
25262   IX86_BUILTIN_PMOVSXBD128,
25263   IX86_BUILTIN_PMOVSXBQ128,
25264   IX86_BUILTIN_PMOVSXWD128,
25265   IX86_BUILTIN_PMOVSXWQ128,
25266   IX86_BUILTIN_PMOVSXDQ128,
25267 
25268   IX86_BUILTIN_PMOVZXBW128,
25269   IX86_BUILTIN_PMOVZXBD128,
25270   IX86_BUILTIN_PMOVZXBQ128,
25271   IX86_BUILTIN_PMOVZXWD128,
25272   IX86_BUILTIN_PMOVZXWQ128,
25273   IX86_BUILTIN_PMOVZXDQ128,
25274 
25275   IX86_BUILTIN_PMULDQ128,
25276   IX86_BUILTIN_PMULLD128,
25277 
25278   IX86_BUILTIN_ROUNDSD,
25279   IX86_BUILTIN_ROUNDSS,
25280 
25281   IX86_BUILTIN_ROUNDPD,
25282   IX86_BUILTIN_ROUNDPS,
25283 
25284   IX86_BUILTIN_FLOORPD,
25285   IX86_BUILTIN_CEILPD,
25286   IX86_BUILTIN_TRUNCPD,
25287   IX86_BUILTIN_RINTPD,
25288   IX86_BUILTIN_ROUNDPD_AZ,
25289 
25290   IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25291   IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25292   IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25293 
25294   IX86_BUILTIN_FLOORPS,
25295   IX86_BUILTIN_CEILPS,
25296   IX86_BUILTIN_TRUNCPS,
25297   IX86_BUILTIN_RINTPS,
25298   IX86_BUILTIN_ROUNDPS_AZ,
25299 
25300   IX86_BUILTIN_FLOORPS_SFIX,
25301   IX86_BUILTIN_CEILPS_SFIX,
25302   IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25303 
25304   IX86_BUILTIN_PTESTZ,
25305   IX86_BUILTIN_PTESTC,
25306   IX86_BUILTIN_PTESTNZC,
25307 
25308   IX86_BUILTIN_VEC_INIT_V2SI,
25309   IX86_BUILTIN_VEC_INIT_V4HI,
25310   IX86_BUILTIN_VEC_INIT_V8QI,
25311   IX86_BUILTIN_VEC_EXT_V2DF,
25312   IX86_BUILTIN_VEC_EXT_V2DI,
25313   IX86_BUILTIN_VEC_EXT_V4SF,
25314   IX86_BUILTIN_VEC_EXT_V4SI,
25315   IX86_BUILTIN_VEC_EXT_V8HI,
25316   IX86_BUILTIN_VEC_EXT_V2SI,
25317   IX86_BUILTIN_VEC_EXT_V4HI,
25318   IX86_BUILTIN_VEC_EXT_V16QI,
25319   IX86_BUILTIN_VEC_SET_V2DI,
25320   IX86_BUILTIN_VEC_SET_V4SF,
25321   IX86_BUILTIN_VEC_SET_V4SI,
25322   IX86_BUILTIN_VEC_SET_V8HI,
25323   IX86_BUILTIN_VEC_SET_V4HI,
25324   IX86_BUILTIN_VEC_SET_V16QI,
25325 
25326   IX86_BUILTIN_VEC_PACK_SFIX,
25327   IX86_BUILTIN_VEC_PACK_SFIX256,
25328 
25329   /* SSE4.2.  */
25330   IX86_BUILTIN_CRC32QI,
25331   IX86_BUILTIN_CRC32HI,
25332   IX86_BUILTIN_CRC32SI,
25333   IX86_BUILTIN_CRC32DI,
25334 
25335   IX86_BUILTIN_PCMPESTRI128,
25336   IX86_BUILTIN_PCMPESTRM128,
25337   IX86_BUILTIN_PCMPESTRA128,
25338   IX86_BUILTIN_PCMPESTRC128,
25339   IX86_BUILTIN_PCMPESTRO128,
25340   IX86_BUILTIN_PCMPESTRS128,
25341   IX86_BUILTIN_PCMPESTRZ128,
25342   IX86_BUILTIN_PCMPISTRI128,
25343   IX86_BUILTIN_PCMPISTRM128,
25344   IX86_BUILTIN_PCMPISTRA128,
25345   IX86_BUILTIN_PCMPISTRC128,
25346   IX86_BUILTIN_PCMPISTRO128,
25347   IX86_BUILTIN_PCMPISTRS128,
25348   IX86_BUILTIN_PCMPISTRZ128,
25349 
25350   IX86_BUILTIN_PCMPGTQ,
25351 
25352   /* AES instructions */
25353   IX86_BUILTIN_AESENC128,
25354   IX86_BUILTIN_AESENCLAST128,
25355   IX86_BUILTIN_AESDEC128,
25356   IX86_BUILTIN_AESDECLAST128,
25357   IX86_BUILTIN_AESIMC128,
25358   IX86_BUILTIN_AESKEYGENASSIST128,
25359 
25360   /* PCLMUL instruction */
25361   IX86_BUILTIN_PCLMULQDQ128,
25362 
25363   /* AVX */
25364   IX86_BUILTIN_ADDPD256,
25365   IX86_BUILTIN_ADDPS256,
25366   IX86_BUILTIN_ADDSUBPD256,
25367   IX86_BUILTIN_ADDSUBPS256,
25368   IX86_BUILTIN_ANDPD256,
25369   IX86_BUILTIN_ANDPS256,
25370   IX86_BUILTIN_ANDNPD256,
25371   IX86_BUILTIN_ANDNPS256,
25372   IX86_BUILTIN_BLENDPD256,
25373   IX86_BUILTIN_BLENDPS256,
25374   IX86_BUILTIN_BLENDVPD256,
25375   IX86_BUILTIN_BLENDVPS256,
25376   IX86_BUILTIN_DIVPD256,
25377   IX86_BUILTIN_DIVPS256,
25378   IX86_BUILTIN_DPPS256,
25379   IX86_BUILTIN_HADDPD256,
25380   IX86_BUILTIN_HADDPS256,
25381   IX86_BUILTIN_HSUBPD256,
25382   IX86_BUILTIN_HSUBPS256,
25383   IX86_BUILTIN_MAXPD256,
25384   IX86_BUILTIN_MAXPS256,
25385   IX86_BUILTIN_MINPD256,
25386   IX86_BUILTIN_MINPS256,
25387   IX86_BUILTIN_MULPD256,
25388   IX86_BUILTIN_MULPS256,
25389   IX86_BUILTIN_ORPD256,
25390   IX86_BUILTIN_ORPS256,
25391   IX86_BUILTIN_SHUFPD256,
25392   IX86_BUILTIN_SHUFPS256,
25393   IX86_BUILTIN_SUBPD256,
25394   IX86_BUILTIN_SUBPS256,
25395   IX86_BUILTIN_XORPD256,
25396   IX86_BUILTIN_XORPS256,
25397   IX86_BUILTIN_CMPSD,
25398   IX86_BUILTIN_CMPSS,
25399   IX86_BUILTIN_CMPPD,
25400   IX86_BUILTIN_CMPPS,
25401   IX86_BUILTIN_CMPPD256,
25402   IX86_BUILTIN_CMPPS256,
25403   IX86_BUILTIN_CVTDQ2PD256,
25404   IX86_BUILTIN_CVTDQ2PS256,
25405   IX86_BUILTIN_CVTPD2PS256,
25406   IX86_BUILTIN_CVTPS2DQ256,
25407   IX86_BUILTIN_CVTPS2PD256,
25408   IX86_BUILTIN_CVTTPD2DQ256,
25409   IX86_BUILTIN_CVTPD2DQ256,
25410   IX86_BUILTIN_CVTTPS2DQ256,
25411   IX86_BUILTIN_EXTRACTF128PD256,
25412   IX86_BUILTIN_EXTRACTF128PS256,
25413   IX86_BUILTIN_EXTRACTF128SI256,
25414   IX86_BUILTIN_VZEROALL,
25415   IX86_BUILTIN_VZEROUPPER,
25416   IX86_BUILTIN_VPERMILVARPD,
25417   IX86_BUILTIN_VPERMILVARPS,
25418   IX86_BUILTIN_VPERMILVARPD256,
25419   IX86_BUILTIN_VPERMILVARPS256,
25420   IX86_BUILTIN_VPERMILPD,
25421   IX86_BUILTIN_VPERMILPS,
25422   IX86_BUILTIN_VPERMILPD256,
25423   IX86_BUILTIN_VPERMILPS256,
25424   IX86_BUILTIN_VPERMIL2PD,
25425   IX86_BUILTIN_VPERMIL2PS,
25426   IX86_BUILTIN_VPERMIL2PD256,
25427   IX86_BUILTIN_VPERMIL2PS256,
25428   IX86_BUILTIN_VPERM2F128PD256,
25429   IX86_BUILTIN_VPERM2F128PS256,
25430   IX86_BUILTIN_VPERM2F128SI256,
25431   IX86_BUILTIN_VBROADCASTSS,
25432   IX86_BUILTIN_VBROADCASTSD256,
25433   IX86_BUILTIN_VBROADCASTSS256,
25434   IX86_BUILTIN_VBROADCASTPD256,
25435   IX86_BUILTIN_VBROADCASTPS256,
25436   IX86_BUILTIN_VINSERTF128PD256,
25437   IX86_BUILTIN_VINSERTF128PS256,
25438   IX86_BUILTIN_VINSERTF128SI256,
25439   IX86_BUILTIN_LOADUPD256,
25440   IX86_BUILTIN_LOADUPS256,
25441   IX86_BUILTIN_STOREUPD256,
25442   IX86_BUILTIN_STOREUPS256,
25443   IX86_BUILTIN_LDDQU256,
25444   IX86_BUILTIN_MOVNTDQ256,
25445   IX86_BUILTIN_MOVNTPD256,
25446   IX86_BUILTIN_MOVNTPS256,
25447   IX86_BUILTIN_LOADDQU256,
25448   IX86_BUILTIN_STOREDQU256,
25449   IX86_BUILTIN_MASKLOADPD,
25450   IX86_BUILTIN_MASKLOADPS,
25451   IX86_BUILTIN_MASKSTOREPD,
25452   IX86_BUILTIN_MASKSTOREPS,
25453   IX86_BUILTIN_MASKLOADPD256,
25454   IX86_BUILTIN_MASKLOADPS256,
25455   IX86_BUILTIN_MASKSTOREPD256,
25456   IX86_BUILTIN_MASKSTOREPS256,
25457   IX86_BUILTIN_MOVSHDUP256,
25458   IX86_BUILTIN_MOVSLDUP256,
25459   IX86_BUILTIN_MOVDDUP256,
25460 
25461   IX86_BUILTIN_SQRTPD256,
25462   IX86_BUILTIN_SQRTPS256,
25463   IX86_BUILTIN_SQRTPS_NR256,
25464   IX86_BUILTIN_RSQRTPS256,
25465   IX86_BUILTIN_RSQRTPS_NR256,
25466 
25467   IX86_BUILTIN_RCPPS256,
25468 
25469   IX86_BUILTIN_ROUNDPD256,
25470   IX86_BUILTIN_ROUNDPS256,
25471 
25472   IX86_BUILTIN_FLOORPD256,
25473   IX86_BUILTIN_CEILPD256,
25474   IX86_BUILTIN_TRUNCPD256,
25475   IX86_BUILTIN_RINTPD256,
25476   IX86_BUILTIN_ROUNDPD_AZ256,
25477 
25478   IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25479   IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25480   IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25481 
25482   IX86_BUILTIN_FLOORPS256,
25483   IX86_BUILTIN_CEILPS256,
25484   IX86_BUILTIN_TRUNCPS256,
25485   IX86_BUILTIN_RINTPS256,
25486   IX86_BUILTIN_ROUNDPS_AZ256,
25487 
25488   IX86_BUILTIN_FLOORPS_SFIX256,
25489   IX86_BUILTIN_CEILPS_SFIX256,
25490   IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25491 
25492   IX86_BUILTIN_UNPCKHPD256,
25493   IX86_BUILTIN_UNPCKLPD256,
25494   IX86_BUILTIN_UNPCKHPS256,
25495   IX86_BUILTIN_UNPCKLPS256,
25496 
25497   IX86_BUILTIN_SI256_SI,
25498   IX86_BUILTIN_PS256_PS,
25499   IX86_BUILTIN_PD256_PD,
25500   IX86_BUILTIN_SI_SI256,
25501   IX86_BUILTIN_PS_PS256,
25502   IX86_BUILTIN_PD_PD256,
25503 
25504   IX86_BUILTIN_VTESTZPD,
25505   IX86_BUILTIN_VTESTCPD,
25506   IX86_BUILTIN_VTESTNZCPD,
25507   IX86_BUILTIN_VTESTZPS,
25508   IX86_BUILTIN_VTESTCPS,
25509   IX86_BUILTIN_VTESTNZCPS,
25510   IX86_BUILTIN_VTESTZPD256,
25511   IX86_BUILTIN_VTESTCPD256,
25512   IX86_BUILTIN_VTESTNZCPD256,
25513   IX86_BUILTIN_VTESTZPS256,
25514   IX86_BUILTIN_VTESTCPS256,
25515   IX86_BUILTIN_VTESTNZCPS256,
25516   IX86_BUILTIN_PTESTZ256,
25517   IX86_BUILTIN_PTESTC256,
25518   IX86_BUILTIN_PTESTNZC256,
25519 
25520   IX86_BUILTIN_MOVMSKPD256,
25521   IX86_BUILTIN_MOVMSKPS256,
25522 
25523   /* AVX2 */
25524   IX86_BUILTIN_MPSADBW256,
25525   IX86_BUILTIN_PABSB256,
25526   IX86_BUILTIN_PABSW256,
25527   IX86_BUILTIN_PABSD256,
25528   IX86_BUILTIN_PACKSSDW256,
25529   IX86_BUILTIN_PACKSSWB256,
25530   IX86_BUILTIN_PACKUSDW256,
25531   IX86_BUILTIN_PACKUSWB256,
25532   IX86_BUILTIN_PADDB256,
25533   IX86_BUILTIN_PADDW256,
25534   IX86_BUILTIN_PADDD256,
25535   IX86_BUILTIN_PADDQ256,
25536   IX86_BUILTIN_PADDSB256,
25537   IX86_BUILTIN_PADDSW256,
25538   IX86_BUILTIN_PADDUSB256,
25539   IX86_BUILTIN_PADDUSW256,
25540   IX86_BUILTIN_PALIGNR256,
25541   IX86_BUILTIN_AND256I,
25542   IX86_BUILTIN_ANDNOT256I,
25543   IX86_BUILTIN_PAVGB256,
25544   IX86_BUILTIN_PAVGW256,
25545   IX86_BUILTIN_PBLENDVB256,
25546   IX86_BUILTIN_PBLENDVW256,
25547   IX86_BUILTIN_PCMPEQB256,
25548   IX86_BUILTIN_PCMPEQW256,
25549   IX86_BUILTIN_PCMPEQD256,
25550   IX86_BUILTIN_PCMPEQQ256,
25551   IX86_BUILTIN_PCMPGTB256,
25552   IX86_BUILTIN_PCMPGTW256,
25553   IX86_BUILTIN_PCMPGTD256,
25554   IX86_BUILTIN_PCMPGTQ256,
25555   IX86_BUILTIN_PHADDW256,
25556   IX86_BUILTIN_PHADDD256,
25557   IX86_BUILTIN_PHADDSW256,
25558   IX86_BUILTIN_PHSUBW256,
25559   IX86_BUILTIN_PHSUBD256,
25560   IX86_BUILTIN_PHSUBSW256,
25561   IX86_BUILTIN_PMADDUBSW256,
25562   IX86_BUILTIN_PMADDWD256,
25563   IX86_BUILTIN_PMAXSB256,
25564   IX86_BUILTIN_PMAXSW256,
25565   IX86_BUILTIN_PMAXSD256,
25566   IX86_BUILTIN_PMAXUB256,
25567   IX86_BUILTIN_PMAXUW256,
25568   IX86_BUILTIN_PMAXUD256,
25569   IX86_BUILTIN_PMINSB256,
25570   IX86_BUILTIN_PMINSW256,
25571   IX86_BUILTIN_PMINSD256,
25572   IX86_BUILTIN_PMINUB256,
25573   IX86_BUILTIN_PMINUW256,
25574   IX86_BUILTIN_PMINUD256,
25575   IX86_BUILTIN_PMOVMSKB256,
25576   IX86_BUILTIN_PMOVSXBW256,
25577   IX86_BUILTIN_PMOVSXBD256,
25578   IX86_BUILTIN_PMOVSXBQ256,
25579   IX86_BUILTIN_PMOVSXWD256,
25580   IX86_BUILTIN_PMOVSXWQ256,
25581   IX86_BUILTIN_PMOVSXDQ256,
25582   IX86_BUILTIN_PMOVZXBW256,
25583   IX86_BUILTIN_PMOVZXBD256,
25584   IX86_BUILTIN_PMOVZXBQ256,
25585   IX86_BUILTIN_PMOVZXWD256,
25586   IX86_BUILTIN_PMOVZXWQ256,
25587   IX86_BUILTIN_PMOVZXDQ256,
25588   IX86_BUILTIN_PMULDQ256,
25589   IX86_BUILTIN_PMULHRSW256,
25590   IX86_BUILTIN_PMULHUW256,
25591   IX86_BUILTIN_PMULHW256,
25592   IX86_BUILTIN_PMULLW256,
25593   IX86_BUILTIN_PMULLD256,
25594   IX86_BUILTIN_PMULUDQ256,
25595   IX86_BUILTIN_POR256,
25596   IX86_BUILTIN_PSADBW256,
25597   IX86_BUILTIN_PSHUFB256,
25598   IX86_BUILTIN_PSHUFD256,
25599   IX86_BUILTIN_PSHUFHW256,
25600   IX86_BUILTIN_PSHUFLW256,
25601   IX86_BUILTIN_PSIGNB256,
25602   IX86_BUILTIN_PSIGNW256,
25603   IX86_BUILTIN_PSIGND256,
25604   IX86_BUILTIN_PSLLDQI256,
25605   IX86_BUILTIN_PSLLWI256,
25606   IX86_BUILTIN_PSLLW256,
25607   IX86_BUILTIN_PSLLDI256,
25608   IX86_BUILTIN_PSLLD256,
25609   IX86_BUILTIN_PSLLQI256,
25610   IX86_BUILTIN_PSLLQ256,
25611   IX86_BUILTIN_PSRAWI256,
25612   IX86_BUILTIN_PSRAW256,
25613   IX86_BUILTIN_PSRADI256,
25614   IX86_BUILTIN_PSRAD256,
25615   IX86_BUILTIN_PSRLDQI256,
25616   IX86_BUILTIN_PSRLWI256,
25617   IX86_BUILTIN_PSRLW256,
25618   IX86_BUILTIN_PSRLDI256,
25619   IX86_BUILTIN_PSRLD256,
25620   IX86_BUILTIN_PSRLQI256,
25621   IX86_BUILTIN_PSRLQ256,
25622   IX86_BUILTIN_PSUBB256,
25623   IX86_BUILTIN_PSUBW256,
25624   IX86_BUILTIN_PSUBD256,
25625   IX86_BUILTIN_PSUBQ256,
25626   IX86_BUILTIN_PSUBSB256,
25627   IX86_BUILTIN_PSUBSW256,
25628   IX86_BUILTIN_PSUBUSB256,
25629   IX86_BUILTIN_PSUBUSW256,
25630   IX86_BUILTIN_PUNPCKHBW256,
25631   IX86_BUILTIN_PUNPCKHWD256,
25632   IX86_BUILTIN_PUNPCKHDQ256,
25633   IX86_BUILTIN_PUNPCKHQDQ256,
25634   IX86_BUILTIN_PUNPCKLBW256,
25635   IX86_BUILTIN_PUNPCKLWD256,
25636   IX86_BUILTIN_PUNPCKLDQ256,
25637   IX86_BUILTIN_PUNPCKLQDQ256,
25638   IX86_BUILTIN_PXOR256,
25639   IX86_BUILTIN_MOVNTDQA256,
25640   IX86_BUILTIN_VBROADCASTSS_PS,
25641   IX86_BUILTIN_VBROADCASTSS_PS256,
25642   IX86_BUILTIN_VBROADCASTSD_PD256,
25643   IX86_BUILTIN_VBROADCASTSI256,
25644   IX86_BUILTIN_PBLENDD256,
25645   IX86_BUILTIN_PBLENDD128,
25646   IX86_BUILTIN_PBROADCASTB256,
25647   IX86_BUILTIN_PBROADCASTW256,
25648   IX86_BUILTIN_PBROADCASTD256,
25649   IX86_BUILTIN_PBROADCASTQ256,
25650   IX86_BUILTIN_PBROADCASTB128,
25651   IX86_BUILTIN_PBROADCASTW128,
25652   IX86_BUILTIN_PBROADCASTD128,
25653   IX86_BUILTIN_PBROADCASTQ128,
25654   IX86_BUILTIN_VPERMVARSI256,
25655   IX86_BUILTIN_VPERMDF256,
25656   IX86_BUILTIN_VPERMVARSF256,
25657   IX86_BUILTIN_VPERMDI256,
25658   IX86_BUILTIN_VPERMTI256,
25659   IX86_BUILTIN_VEXTRACT128I256,
25660   IX86_BUILTIN_VINSERT128I256,
25661   IX86_BUILTIN_MASKLOADD,
25662   IX86_BUILTIN_MASKLOADQ,
25663   IX86_BUILTIN_MASKLOADD256,
25664   IX86_BUILTIN_MASKLOADQ256,
25665   IX86_BUILTIN_MASKSTORED,
25666   IX86_BUILTIN_MASKSTOREQ,
25667   IX86_BUILTIN_MASKSTORED256,
25668   IX86_BUILTIN_MASKSTOREQ256,
25669   IX86_BUILTIN_PSLLVV4DI,
25670   IX86_BUILTIN_PSLLVV2DI,
25671   IX86_BUILTIN_PSLLVV8SI,
25672   IX86_BUILTIN_PSLLVV4SI,
25673   IX86_BUILTIN_PSRAVV8SI,
25674   IX86_BUILTIN_PSRAVV4SI,
25675   IX86_BUILTIN_PSRLVV4DI,
25676   IX86_BUILTIN_PSRLVV2DI,
25677   IX86_BUILTIN_PSRLVV8SI,
25678   IX86_BUILTIN_PSRLVV4SI,
25679 
25680   IX86_BUILTIN_GATHERSIV2DF,
25681   IX86_BUILTIN_GATHERSIV4DF,
25682   IX86_BUILTIN_GATHERDIV2DF,
25683   IX86_BUILTIN_GATHERDIV4DF,
25684   IX86_BUILTIN_GATHERSIV4SF,
25685   IX86_BUILTIN_GATHERSIV8SF,
25686   IX86_BUILTIN_GATHERDIV4SF,
25687   IX86_BUILTIN_GATHERDIV8SF,
25688   IX86_BUILTIN_GATHERSIV2DI,
25689   IX86_BUILTIN_GATHERSIV4DI,
25690   IX86_BUILTIN_GATHERDIV2DI,
25691   IX86_BUILTIN_GATHERDIV4DI,
25692   IX86_BUILTIN_GATHERSIV4SI,
25693   IX86_BUILTIN_GATHERSIV8SI,
25694   IX86_BUILTIN_GATHERDIV4SI,
25695   IX86_BUILTIN_GATHERDIV8SI,
25696 
25697   /* Alternate 4 element gather for the vectorizer where
25698      all operands are 32-byte wide.  */
25699   IX86_BUILTIN_GATHERALTSIV4DF,
25700   IX86_BUILTIN_GATHERALTDIV8SF,
25701   IX86_BUILTIN_GATHERALTSIV4DI,
25702   IX86_BUILTIN_GATHERALTDIV8SI,
25703 
25704   /* TFmode support builtins.  */
25705   IX86_BUILTIN_INFQ,
25706   IX86_BUILTIN_HUGE_VALQ,
25707   IX86_BUILTIN_FABSQ,
25708   IX86_BUILTIN_COPYSIGNQ,
25709 
25710   /* Vectorizer support builtins.  */
25711   IX86_BUILTIN_CPYSGNPS,
25712   IX86_BUILTIN_CPYSGNPD,
25713   IX86_BUILTIN_CPYSGNPS256,
25714   IX86_BUILTIN_CPYSGNPD256,
25715 
25716   /* FMA4 instructions.  */
25717   IX86_BUILTIN_VFMADDSS,
25718   IX86_BUILTIN_VFMADDSD,
25719   IX86_BUILTIN_VFMADDPS,
25720   IX86_BUILTIN_VFMADDPD,
25721   IX86_BUILTIN_VFMADDPS256,
25722   IX86_BUILTIN_VFMADDPD256,
25723   IX86_BUILTIN_VFMADDSUBPS,
25724   IX86_BUILTIN_VFMADDSUBPD,
25725   IX86_BUILTIN_VFMADDSUBPS256,
25726   IX86_BUILTIN_VFMADDSUBPD256,
25727 
25728   /* FMA3 instructions.  */
25729   IX86_BUILTIN_VFMADDSS3,
25730   IX86_BUILTIN_VFMADDSD3,
25731 
25732   /* XOP instructions.  */
25733   IX86_BUILTIN_VPCMOV,
25734   IX86_BUILTIN_VPCMOV_V2DI,
25735   IX86_BUILTIN_VPCMOV_V4SI,
25736   IX86_BUILTIN_VPCMOV_V8HI,
25737   IX86_BUILTIN_VPCMOV_V16QI,
25738   IX86_BUILTIN_VPCMOV_V4SF,
25739   IX86_BUILTIN_VPCMOV_V2DF,
25740   IX86_BUILTIN_VPCMOV256,
25741   IX86_BUILTIN_VPCMOV_V4DI256,
25742   IX86_BUILTIN_VPCMOV_V8SI256,
25743   IX86_BUILTIN_VPCMOV_V16HI256,
25744   IX86_BUILTIN_VPCMOV_V32QI256,
25745   IX86_BUILTIN_VPCMOV_V8SF256,
25746   IX86_BUILTIN_VPCMOV_V4DF256,
25747 
25748   IX86_BUILTIN_VPPERM,
25749 
25750   IX86_BUILTIN_VPMACSSWW,
25751   IX86_BUILTIN_VPMACSWW,
25752   IX86_BUILTIN_VPMACSSWD,
25753   IX86_BUILTIN_VPMACSWD,
25754   IX86_BUILTIN_VPMACSSDD,
25755   IX86_BUILTIN_VPMACSDD,
25756   IX86_BUILTIN_VPMACSSDQL,
25757   IX86_BUILTIN_VPMACSSDQH,
25758   IX86_BUILTIN_VPMACSDQL,
25759   IX86_BUILTIN_VPMACSDQH,
25760   IX86_BUILTIN_VPMADCSSWD,
25761   IX86_BUILTIN_VPMADCSWD,
25762 
25763   IX86_BUILTIN_VPHADDBW,
25764   IX86_BUILTIN_VPHADDBD,
25765   IX86_BUILTIN_VPHADDBQ,
25766   IX86_BUILTIN_VPHADDWD,
25767   IX86_BUILTIN_VPHADDWQ,
25768   IX86_BUILTIN_VPHADDDQ,
25769   IX86_BUILTIN_VPHADDUBW,
25770   IX86_BUILTIN_VPHADDUBD,
25771   IX86_BUILTIN_VPHADDUBQ,
25772   IX86_BUILTIN_VPHADDUWD,
25773   IX86_BUILTIN_VPHADDUWQ,
25774   IX86_BUILTIN_VPHADDUDQ,
25775   IX86_BUILTIN_VPHSUBBW,
25776   IX86_BUILTIN_VPHSUBWD,
25777   IX86_BUILTIN_VPHSUBDQ,
25778 
25779   IX86_BUILTIN_VPROTB,
25780   IX86_BUILTIN_VPROTW,
25781   IX86_BUILTIN_VPROTD,
25782   IX86_BUILTIN_VPROTQ,
25783   IX86_BUILTIN_VPROTB_IMM,
25784   IX86_BUILTIN_VPROTW_IMM,
25785   IX86_BUILTIN_VPROTD_IMM,
25786   IX86_BUILTIN_VPROTQ_IMM,
25787 
25788   IX86_BUILTIN_VPSHLB,
25789   IX86_BUILTIN_VPSHLW,
25790   IX86_BUILTIN_VPSHLD,
25791   IX86_BUILTIN_VPSHLQ,
25792   IX86_BUILTIN_VPSHAB,
25793   IX86_BUILTIN_VPSHAW,
25794   IX86_BUILTIN_VPSHAD,
25795   IX86_BUILTIN_VPSHAQ,
25796 
25797   IX86_BUILTIN_VFRCZSS,
25798   IX86_BUILTIN_VFRCZSD,
25799   IX86_BUILTIN_VFRCZPS,
25800   IX86_BUILTIN_VFRCZPD,
25801   IX86_BUILTIN_VFRCZPS256,
25802   IX86_BUILTIN_VFRCZPD256,
25803 
25804   IX86_BUILTIN_VPCOMEQUB,
25805   IX86_BUILTIN_VPCOMNEUB,
25806   IX86_BUILTIN_VPCOMLTUB,
25807   IX86_BUILTIN_VPCOMLEUB,
25808   IX86_BUILTIN_VPCOMGTUB,
25809   IX86_BUILTIN_VPCOMGEUB,
25810   IX86_BUILTIN_VPCOMFALSEUB,
25811   IX86_BUILTIN_VPCOMTRUEUB,
25812 
25813   IX86_BUILTIN_VPCOMEQUW,
25814   IX86_BUILTIN_VPCOMNEUW,
25815   IX86_BUILTIN_VPCOMLTUW,
25816   IX86_BUILTIN_VPCOMLEUW,
25817   IX86_BUILTIN_VPCOMGTUW,
25818   IX86_BUILTIN_VPCOMGEUW,
25819   IX86_BUILTIN_VPCOMFALSEUW,
25820   IX86_BUILTIN_VPCOMTRUEUW,
25821 
25822   IX86_BUILTIN_VPCOMEQUD,
25823   IX86_BUILTIN_VPCOMNEUD,
25824   IX86_BUILTIN_VPCOMLTUD,
25825   IX86_BUILTIN_VPCOMLEUD,
25826   IX86_BUILTIN_VPCOMGTUD,
25827   IX86_BUILTIN_VPCOMGEUD,
25828   IX86_BUILTIN_VPCOMFALSEUD,
25829   IX86_BUILTIN_VPCOMTRUEUD,
25830 
25831   IX86_BUILTIN_VPCOMEQUQ,
25832   IX86_BUILTIN_VPCOMNEUQ,
25833   IX86_BUILTIN_VPCOMLTUQ,
25834   IX86_BUILTIN_VPCOMLEUQ,
25835   IX86_BUILTIN_VPCOMGTUQ,
25836   IX86_BUILTIN_VPCOMGEUQ,
25837   IX86_BUILTIN_VPCOMFALSEUQ,
25838   IX86_BUILTIN_VPCOMTRUEUQ,
25839 
25840   IX86_BUILTIN_VPCOMEQB,
25841   IX86_BUILTIN_VPCOMNEB,
25842   IX86_BUILTIN_VPCOMLTB,
25843   IX86_BUILTIN_VPCOMLEB,
25844   IX86_BUILTIN_VPCOMGTB,
25845   IX86_BUILTIN_VPCOMGEB,
25846   IX86_BUILTIN_VPCOMFALSEB,
25847   IX86_BUILTIN_VPCOMTRUEB,
25848 
25849   IX86_BUILTIN_VPCOMEQW,
25850   IX86_BUILTIN_VPCOMNEW,
25851   IX86_BUILTIN_VPCOMLTW,
25852   IX86_BUILTIN_VPCOMLEW,
25853   IX86_BUILTIN_VPCOMGTW,
25854   IX86_BUILTIN_VPCOMGEW,
25855   IX86_BUILTIN_VPCOMFALSEW,
25856   IX86_BUILTIN_VPCOMTRUEW,
25857 
25858   IX86_BUILTIN_VPCOMEQD,
25859   IX86_BUILTIN_VPCOMNED,
25860   IX86_BUILTIN_VPCOMLTD,
25861   IX86_BUILTIN_VPCOMLED,
25862   IX86_BUILTIN_VPCOMGTD,
25863   IX86_BUILTIN_VPCOMGED,
25864   IX86_BUILTIN_VPCOMFALSED,
25865   IX86_BUILTIN_VPCOMTRUED,
25866 
25867   IX86_BUILTIN_VPCOMEQQ,
25868   IX86_BUILTIN_VPCOMNEQ,
25869   IX86_BUILTIN_VPCOMLTQ,
25870   IX86_BUILTIN_VPCOMLEQ,
25871   IX86_BUILTIN_VPCOMGTQ,
25872   IX86_BUILTIN_VPCOMGEQ,
25873   IX86_BUILTIN_VPCOMFALSEQ,
25874   IX86_BUILTIN_VPCOMTRUEQ,
25875 
25876   /* LWP instructions.  */
25877   IX86_BUILTIN_LLWPCB,
25878   IX86_BUILTIN_SLWPCB,
25879   IX86_BUILTIN_LWPVAL32,
25880   IX86_BUILTIN_LWPVAL64,
25881   IX86_BUILTIN_LWPINS32,
25882   IX86_BUILTIN_LWPINS64,
25883 
25884   IX86_BUILTIN_CLZS,
25885 
25886   /* BMI instructions.  */
25887   IX86_BUILTIN_BEXTR32,
25888   IX86_BUILTIN_BEXTR64,
25889   IX86_BUILTIN_CTZS,
25890 
25891   /* TBM instructions.  */
25892   IX86_BUILTIN_BEXTRI32,
25893   IX86_BUILTIN_BEXTRI64,
25894 
25895   /* BMI2 instructions. */
25896   IX86_BUILTIN_BZHI32,
25897   IX86_BUILTIN_BZHI64,
25898   IX86_BUILTIN_PDEP32,
25899   IX86_BUILTIN_PDEP64,
25900   IX86_BUILTIN_PEXT32,
25901   IX86_BUILTIN_PEXT64,
25902 
25903   /* FSGSBASE instructions.  */
25904   IX86_BUILTIN_RDFSBASE32,
25905   IX86_BUILTIN_RDFSBASE64,
25906   IX86_BUILTIN_RDGSBASE32,
25907   IX86_BUILTIN_RDGSBASE64,
25908   IX86_BUILTIN_WRFSBASE32,
25909   IX86_BUILTIN_WRFSBASE64,
25910   IX86_BUILTIN_WRGSBASE32,
25911   IX86_BUILTIN_WRGSBASE64,
25912 
25913   /* RDRND instructions.  */
25914   IX86_BUILTIN_RDRAND16_STEP,
25915   IX86_BUILTIN_RDRAND32_STEP,
25916   IX86_BUILTIN_RDRAND64_STEP,
25917 
25918   /* F16C instructions.  */
25919   IX86_BUILTIN_CVTPH2PS,
25920   IX86_BUILTIN_CVTPH2PS256,
25921   IX86_BUILTIN_CVTPS2PH,
25922   IX86_BUILTIN_CVTPS2PH256,
25923 
25924   /* CFString built-in for darwin */
25925   IX86_BUILTIN_CFSTRING,
25926 
25927   IX86_BUILTIN_MAX
25928 };
25929 
25930 /* Table for the ix86 builtin decls.  */
25931 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25932 
25933 /* Table of all of the builtin functions that are possible with different ISAs
25934    but are waiting to be built until a function is declared to use that
25935    ISA.  */
25936 struct builtin_isa {
25937   const char *name;		/* function name */
25938   enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25939   HOST_WIDE_INT isa;		/* isa_flags this builtin is defined for */
25940   bool const_p;			/* true if the declaration is constant */
25941   bool set_and_not_built_p;
25942 };
25943 
25944 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25945 
25946 
25947 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
25948    of isa_flags to use in the ix86_builtins_isa array.  Store the
25949    function decl in the ix86_builtins array.  Return the function decl, or
25950    NULL_TREE if the builtin was not added.
25951 
25952    If the front end has a special hook for builtin functions, delay adding
25953    builtin functions that aren't in the current ISA until the ISA is changed
25954    with function-specific optimization.  Doing so can save about 300K for the
25955    default compiler.  When the builtin is expanded, check at that time whether
25956    it is valid.
25957 
25958    If the front end doesn't have a special hook, record all builtins, even
25959    those not in the current ISA, in case the user uses function-specific
25960    options for a different ISA; that way we don't get scope errors if a
25961    builtin is added in the middle of a function scope.  */
25962 
25963 static inline tree
25964 def_builtin (HOST_WIDE_INT mask, const char *name,
25965 	     enum ix86_builtin_func_type tcode,
25966 	     enum ix86_builtins code)
25967 {
25968   tree decl = NULL_TREE;
25969 
25970   if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25971     {
25972       ix86_builtins_isa[(int) code].isa = mask;
25973 
25974       mask &= ~OPTION_MASK_ISA_64BIT;
25975       if (mask == 0
25976 	  || (mask & ix86_isa_flags) != 0
25977 	  || (lang_hooks.builtin_function
25978 	      == lang_hooks.builtin_function_ext_scope))
25979 
25980 	{
25981 	  tree type = ix86_get_builtin_func_type (tcode);
25982 	  decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25983 				       NULL, NULL_TREE);
25984 	  ix86_builtins[(int) code] = decl;
25985 	  ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25986 	}
25987       else
25988 	{
25989 	  ix86_builtins[(int) code] = NULL_TREE;
25990 	  ix86_builtins_isa[(int) code].tcode = tcode;
25991 	  ix86_builtins_isa[(int) code].name = name;
25992 	  ix86_builtins_isa[(int) code].const_p = false;
25993 	  ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25994 	}
25995     }
25996 
25997   return decl;
25998 }
25999 
26000 /* Like def_builtin, but also marks the function decl "const".  */
26001 
26002 static inline tree
26003 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26004 		   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26005 {
26006   tree decl = def_builtin (mask, name, tcode, code);
26007   if (decl)
26008     TREE_READONLY (decl) = 1;
26009   else
26010     ix86_builtins_isa[(int) code].const_p = true;
26011 
26012   return decl;
26013 }
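/* Typical use (illustrative; the real registrations happen later in
   this file, mostly via the bdesc_* tables): a purely arithmetic
   builtin is registered as const, e.g.

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
			V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   where the type code names a signature from i386-builtin-types.def.  */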
26014 
26015 /* Add any new builtin functions for a given ISA that have not yet been
26016    declared.  This saves a bit of space compared to adding all of the
26017    declarations to the tree up front, even when they are never used.  */
26018 
26019 static void
26020 ix86_add_new_builtins (HOST_WIDE_INT isa)
26021 {
26022   int i;
26023 
26024   for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26025     {
26026       if ((ix86_builtins_isa[i].isa & isa) != 0
26027 	  && ix86_builtins_isa[i].set_and_not_built_p)
26028 	{
26029 	  tree decl, type;
26030 
26031 	  /* Don't define the builtin again.  */
26032 	  ix86_builtins_isa[i].set_and_not_built_p = false;
26033 
26034 	  type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26035 	  decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26036 						 type, i, BUILT_IN_MD, NULL,
26037 						 NULL_TREE);
26038 
26039 	  ix86_builtins[i] = decl;
26040 	  if (ix86_builtins_isa[i].const_p)
26041 	    TREE_READONLY (decl) = 1;
26042 	}
26043     }
26044 }
26045 
26046 /* Bits for builtin_description.flag.  */
26047 
26048 /* Set when we don't support the comparison natively, and should
26049    swap the comparison operands in order to support it.  */
26050 #define BUILTIN_DESC_SWAP_OPERANDS	1
26051 
26052 struct builtin_description
26053 {
26054   const HOST_WIDE_INT mask;
26055   const enum insn_code icode;
26056   const char *const name;
26057   const enum ix86_builtins code;
26058   const enum rtx_code comparison;
26059   const int flag;
26060 };
26061 
26062 static const struct builtin_description bdesc_comi[] =
26063 {
26064   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26065   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26066   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26067   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26068   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26069   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26070   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26071   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26072   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26073   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26074   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26075   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26076   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26077   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26078   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26079   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26080   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26081   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26082   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26083   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26084   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26085   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26086   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26087   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26088 };
26089 
26090 static const struct builtin_description bdesc_pcmpestr[] =
26091 {
26092   /* SSE4.2 */
26093   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26094   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26095   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26096   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26097   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26098   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26099   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26100 };
26101 
26102 static const struct builtin_description bdesc_pcmpistr[] =
26103 {
26104   /* SSE4.2 */
26105   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26106   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26107   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26108   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26109   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26110   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26111   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26112 };
26113 
26114 /* Special builtins with a variable number of arguments.  */
26115 static const struct builtin_description bdesc_special_args[] =
26116 {
26117   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26118   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26119   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26120 
26121   /* MMX */
26122   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26123 
26124   /* 3DNow! */
26125   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26126 
26127   /* SSE */
26128   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26129   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26130   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26131 
26132   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26133   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26134   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26135   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26136 
26137   /* SSE or 3DNow!A  */
26138   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26139   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26140 
26141   /* SSE2 */
26142   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26143   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26144   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26145   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26146   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26147   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26148   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26149   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26150   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26151   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26152 
26153   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26154   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26155 
26156   /* SSE3 */
26157   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26158 
26159   /* SSE4.1 */
26160   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26161 
26162   /* SSE4A */
26163   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26164   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26165 
26166   /* AVX */
26167   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26168   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26169 
26170   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26171   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26172   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26173   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26174   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26175 
26176   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26177   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26178   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26179   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26180   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26181   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26182   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26183 
26184   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26185   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26186   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26187 
26188   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26189   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26190   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26191   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26192   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26193   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26194   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26195   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26196 
26197   /* AVX2 */
26198   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26199   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26200   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26201   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26202   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26203   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26204   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26205   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26206   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26207 
26208   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26209   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26210   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26211   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26212   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26213   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26214 
26215   /* FSGSBASE */
26216   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26217   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26218   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26219   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26220   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26221   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26222   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26223   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26224 };
26225 
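/* A minimal sketch, assuming the builtin-initialization code elsewhere in
   this file, of how a table such as bdesc_special_args is consumed: the
   registration loop walks the array and declares every named entry, along
   the lines of

     for (i = 0, d = bdesc_special_args;
	  i < ARRAY_SIZE (bdesc_special_args);
	  i++, d++)
       if (d->name)
	 def_builtin (d->mask, d->name,
		      (enum ix86_builtin_func_type) d->flag, d->code);

   The loop shown here is only illustrative; the authoritative version is
   the one in the ix86_init_* builtin routines.  */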
26226 /* Builtins with variable number of arguments.  */
26227 static const struct builtin_description bdesc_args[] =
26228 {
26229   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26230   { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26231   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26232   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26233   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26234   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26235   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26236 
26237   /* MMX */
26238   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26239   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26240   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26241   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26242   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26243   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26244 
26245   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26246   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26247   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26248   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26249   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26250   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26251   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26252   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26253 
26254   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26255   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26256 
26257   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26258   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26259   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26260   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26261 
26262   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26263   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26264   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26265   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26266   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26267   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26268 
26269   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26270   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26271   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26272   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26273   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26274   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26275 
26276   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26277   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26278   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26279 
26280   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26281 
26282   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26283   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26284   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26285   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26286   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26287   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26288 
26289   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26290   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26291   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26292   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26293   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26294   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26295 
26296   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26297   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26298   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26299   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26300 
26301   /* 3DNow! */
26302   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26303   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26304   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26305   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26306 
26307   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26308   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26309   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26310   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26311   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26312   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26313   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26314   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26315   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26316   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26317   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26318   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26319   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26320   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26321   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26322 
26323   /* 3DNow!A */
26324   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26325   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26326   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26327   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26328   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26329   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26330 
26331   /* SSE */
26332   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26333   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26334   { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26335   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26336   { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26337   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26338   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26339   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26340   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26341   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26342   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26343   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26344 
26345   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26346 
26347   { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26348   { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26349   { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26350   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26351   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3,  "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26352   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3,  "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26353   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3,  "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26354   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3,  "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26355 
26356   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26357   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26358   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26359   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26360   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26361   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26362   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26363   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26364   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26365   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26366   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26367   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26368   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26369   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26370   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26371   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26372   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26373   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26374   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26375   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26376   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26377   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26378 
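  /* Note on the comparison entries above (inferred from the table itself):
     the fifth field supplies the RTX comparison code fed to the
     mask-compare pattern, and the *_SWAP function types ask the expander
     to swap the two operands first.  cmpgtps, for instance, is expressed
     as LT with swapped operands, and cmpngeps as UNGT with swapped
     operands, so no dedicated "greater" patterns are needed.  */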
26379   { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26380   { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26381   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26382   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26383 
26384   { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26385   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3,  "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26386   { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26387   { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3,  "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26388 
26389   { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3,  "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26390 
26391   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26392   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26393   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26394   { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26395   { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26396 
26397   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26398   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26399   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26400 
26401   { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26402 
26403   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26404   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26405   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26406 
26407   /* SSE MMX or 3DNow!A */
26408   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26409   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26410   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26411 
26412   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26413   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26414   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26415   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26416 
26417   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26418   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26419 
26420   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26421 
26422   /* SSE2 */
26423   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26424 
26425   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF  },
26426   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26427   { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26428   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26429   { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26430 
26431   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26432   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26433   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26434   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26435   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26436 
26437   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26438 
26439   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26440   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26441   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26442   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26443 
26444   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26445   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26446   { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26447 
26448   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26449   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26450   { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26451   { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26452   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3,  "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26453   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3,  "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26454   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3,  "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26455   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3,  "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26456 
26457   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26458   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26459   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26460   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26461   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26462   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26463   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26464   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26465   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26466   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26467   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26468   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26469   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26470   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26471   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26472   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26473   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26474   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26475   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26476   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26477 
26478   { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26479   { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26480   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26481   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26482 
26483   { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26484   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3,  "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26485   { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26486   { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3,  "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26487 
26488   { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3,  "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26489 
26490   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26491   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26492   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26493 
26494   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26495 
26496   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26497   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26498   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26499   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26500   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26501   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26502   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26503   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26504 
26505   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26506   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26507   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26508   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26509   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26510   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26511   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26512   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26513 
26514   { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26515   { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26516 
26517   { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26518   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26519   { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26520   { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26521 
26522   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26523   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26524 
26525   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26526   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26527   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
26528   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26529   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26530   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
26531 
26532   { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26533   { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26534   { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26535   { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26536 
26537   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26538   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI  },
26539   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN,  (int) V4SI_FTYPE_V4SI_V4SI },
26540   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26541   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26542   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26543   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26544   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26545 
26546   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26547   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26548   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26549 
26550   { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26551   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26552 
26553   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26554   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26555 
26556   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26557 
26558   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26559   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26560   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26561   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26562 
26563   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26564   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26565   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26566   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26567   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26568   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26569   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26570 
26571   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26572   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26573   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26574   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26575   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26576   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26577   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26578 
26579   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26580   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26581   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26582   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26583 
26584   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26585   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26586   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26587 
26588   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26589 
26590   { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26591   { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26592 
26593   { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26594 
26595   /* SSE2 MMX */
26596   { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26597   { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26598 
26599   /* SSE3 */
26600   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26601   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26602 
26603   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26604   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26605   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26606   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26607   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26608   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26609 
26610   /* SSSE3 */
26611   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26612   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26613   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26614   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26615   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26616   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26617 
26618   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26619   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26620   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26621   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26622   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26623   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26624   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26625   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26626   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26627   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26628   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26629   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26630   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26631   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26632   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26633   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26634   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26635   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26636   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26637   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26638   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26639   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26640   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26641   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26642 
26643   /* SSSE3.  */
26644   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26645   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
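  /* The *_INT_CONVERT function types indicate that the insn pattern works
     on a single wider integer mode (the TImode-based pattern for the
     128-bit palignr, the DImode-based one for the MMX form), so the vector
     operands are reinterpreted in that mode at expansion time and the
     trailing immediate is the byte shift count.  */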
26646 
26647   /* SSE4.1 */
26648   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26649   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26650   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26651   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26652   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26653   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26654   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26655   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26656   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26657   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26658 
26659   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26660   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26661   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26662   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26663   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26664   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26665   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26666   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26667   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26668   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26669   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26670   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26671   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26672 
26673   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26674   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26675   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26676   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26677   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26678   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26679   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26680   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26681   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26682   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26683   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26684   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26685 
26686   /* SSE4.1 */
26687   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26688   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26689   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26690   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26691 
26692   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26693   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26694   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26695   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
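  /* The floor/ceil/trunc/rint variants reuse the generic round patterns;
     the rtx_code slot carries ROUND_FLOOR, ROUND_CEIL, ROUND_TRUNC or
     ROUND_MXCSR, and the *_ROUND function types make the expander supply
     that value as the rounding-mode immediate (see ix86_expand_sse_round
     for the exact handling).  */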
26696 
26697   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26698   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26699 
26700   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26701   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26702 
26703   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26704   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26705   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26706   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26707 
26708   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26709   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26710 
26711   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26712   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26713 
26714   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26715   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26716   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
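  /* For the ptest builtins the rtx_code slot selects which condition of
     the PTEST result is materialized into the integer return value:
     EQ for the ZF-based *z form, LTU for the CF-based *c form and GTU
     for the *nzc form.  */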
26717 
26718   /* SSE4.2 */
26719   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26720   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26721   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26722   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26723   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26724 
26725   /* SSE4A */
26726   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26727   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26728   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26729   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26730 
26731   /* AES */
26732   { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26733   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26734 
26735   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26736   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26737   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26738   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26739 
26740   /* PCLMUL */
26741   { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
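  /* The AES and PCLMUL rows have a null name: the table only supplies the
     expansion information, while the builtins themselves are registered
     separately under OPTION_MASK_ISA_AES / OPTION_MASK_ISA_PCLMUL so that
     the ISA check matches the instructions actually emitted.  */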
26742 
26743   /* AVX */
26744   { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26745   { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26746   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26747   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26748   { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26749   { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26750   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26751   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26752   { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26753   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26754   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26755   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26756   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26757   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26758   { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26759   { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26760   { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26761   { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26762   { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26763   { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26764   { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26765   { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26766   { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26767   { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26768   { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26769   { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26770 
26771   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26772   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26773   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26774   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26775 
26776   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26777   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26778   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26779   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26780   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26781   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26782   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26783   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26784   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26785   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26786   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26787   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26788   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26789   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26790   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26791   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26792   { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26793   { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26794   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26795   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26796   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26797   { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26798   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26799   { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26800   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26801   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26802   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26803   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26804   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26805   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26806   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26807   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26808   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26809   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26810 
26811   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26812   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26813   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26814 
26815   { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26816   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26817   { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26818   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26819   { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26820 
26821   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26822 
26823   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26824   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26825 
26826   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26827   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26828   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26829   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26830 
26831   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26832   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26833 
26834   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26835   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26836 
26837   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26838   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26839   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26840   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26841 
26842   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26843   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26844 
26845   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26846   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26847 
26848   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,  "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26849   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,  "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26850   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256,  "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26851   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256,  "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26852 
26853   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26854   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26855   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26856   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26857   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26858   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26859 
26860   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26861   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26862   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26863   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26864   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26865   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26866   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26867   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26868   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26869   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26870   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26871   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26872   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26873   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26874   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26875 
26876   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF  },
26877   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26878 
26879   { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3,  "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26880   { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3,  "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26881 
26882   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26883 
26884   /* AVX2 */
26885   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26886   { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26887   { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26888   { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26889   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256",  IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26890   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256",  IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26891   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256",  IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26892   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256",  IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26893   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26894   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26895   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26896   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26897   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26898   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26899   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26900   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26901   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26902   { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26903   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26904   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256",  IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26905   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256",  IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26906   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26907   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26908   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26909   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26910   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
26911   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
26912   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26913   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26914   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
26915   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
26916   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26917   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26918   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26919   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26920   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26921   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26922   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26923   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26924   { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26925   { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26926   { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26927   { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26928   { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26929   { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26930   { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26931   { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26932   { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26933   { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26934   { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26935   { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26936   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26937   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26938   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2  , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26939   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2  , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26940   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2  , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26941   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2  , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26942   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2  , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26943   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26944   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2  , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26945   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2  , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26946   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2  , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26947   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2  , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26948   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2  , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26949   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3  , "__builtin_ia32_pmuldq256"  , IX86_BUILTIN_PMULDQ256  , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26950   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26951   { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26952   { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256"  , IX86_BUILTIN_PMULHW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26953   { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256"  , IX86_BUILTIN_PMULLW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26954   { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256"  , IX86_BUILTIN_PMULLD256  , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26955   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3  , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26956   { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26957   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26958   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26959   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26960   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26961   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26962   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26963   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26964   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26965   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26966   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26967   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26968   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26969   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26970   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26971   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26972   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26973   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26974   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26975   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26976   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26977   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26978   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26979   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26980   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26981   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26982   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
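  /* The shift builtins come in pairs sharing one insn pattern: the
     *_SI_COUNT and *_INT_COUNT forms take the shift count as a scalar,
     while the *_V8HI_COUNT / *_V4SI_COUNT / *_V2DI_COUNT forms take it
     from an XMM operand (low quadword), matching the register forms of
     psllw/pslld/psllq and friends.  */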
26983   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26984   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26985   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26986   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26987   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26988   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26989   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26990   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26991   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26992   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI  },
26993   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN,  (int) V8SI_FTYPE_V8SI_V8SI },
26994   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26995   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26996   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26997   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26998   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26999   { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27000   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27001   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27002   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27003   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27004   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27005   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27006   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27007   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27008   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27009   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27010   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27011   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27012   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27013   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27014   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27015   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27016   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27017   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27018   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27019   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27020   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27021   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27022   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27023   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27024   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27025   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27026   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27027   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27028   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27029   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27030   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27031 
27032   { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt,   "__builtin_clzs",   IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16 },
27033 
27034   /* BMI */
27035   { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27036   { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27037   { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2,       "__builtin_ctzs",           IX86_BUILTIN_CTZS,    UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27038 
27039   /* TBM */
27040   { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27041   { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27042 
27043   /* F16C */
27044   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27045   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27046   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27047   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27048 
27049   /* BMI2 */
27050   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27051   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27052   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27053   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27054   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27055   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27056 };
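/* A minimal usage sketch (illustrative, not part of GCC): every entry in
   bdesc_args above becomes a const builtin whose signature is encoded in
   its *_FTYPE_* code, e.g. the BMI2 parallel bit extract, with -mbmi2:

       unsigned int
       low_even_bits (unsigned int x)
       {
         // UINT_FTYPE_UINT_UINT: both operands and the result are unsigned.
         return __builtin_ia32_pext_si (x, 0x55555555u);
       }

   The bmi2intrin.h wrapper _pext_u32 is a thin inline around this name.
   (low_even_bits is a hypothetical user function.)  */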
27057 
27058 /* FMA4 and XOP.  */
27059 #define MULTI_ARG_4_DF2_DI_I	V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27060 #define MULTI_ARG_4_DF2_DI_I1	V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27061 #define MULTI_ARG_4_SF2_SI_I	V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27062 #define MULTI_ARG_4_SF2_SI_I1	V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27063 #define MULTI_ARG_3_SF		V4SF_FTYPE_V4SF_V4SF_V4SF
27064 #define MULTI_ARG_3_DF		V2DF_FTYPE_V2DF_V2DF_V2DF
27065 #define MULTI_ARG_3_SF2		V8SF_FTYPE_V8SF_V8SF_V8SF
27066 #define MULTI_ARG_3_DF2		V4DF_FTYPE_V4DF_V4DF_V4DF
27067 #define MULTI_ARG_3_DI		V2DI_FTYPE_V2DI_V2DI_V2DI
27068 #define MULTI_ARG_3_SI		V4SI_FTYPE_V4SI_V4SI_V4SI
27069 #define MULTI_ARG_3_SI_DI	V4SI_FTYPE_V4SI_V4SI_V2DI
27070 #define MULTI_ARG_3_HI		V8HI_FTYPE_V8HI_V8HI_V8HI
27071 #define MULTI_ARG_3_HI_SI	V8HI_FTYPE_V8HI_V8HI_V4SI
27072 #define MULTI_ARG_3_QI		V16QI_FTYPE_V16QI_V16QI_V16QI
27073 #define MULTI_ARG_3_DI2		V4DI_FTYPE_V4DI_V4DI_V4DI
27074 #define MULTI_ARG_3_SI2		V8SI_FTYPE_V8SI_V8SI_V8SI
27075 #define MULTI_ARG_3_HI2		V16HI_FTYPE_V16HI_V16HI_V16HI
27076 #define MULTI_ARG_3_QI2		V32QI_FTYPE_V32QI_V32QI_V32QI
27077 #define MULTI_ARG_2_SF		V4SF_FTYPE_V4SF_V4SF
27078 #define MULTI_ARG_2_DF		V2DF_FTYPE_V2DF_V2DF
27079 #define MULTI_ARG_2_DI		V2DI_FTYPE_V2DI_V2DI
27080 #define MULTI_ARG_2_SI		V4SI_FTYPE_V4SI_V4SI
27081 #define MULTI_ARG_2_HI		V8HI_FTYPE_V8HI_V8HI
27082 #define MULTI_ARG_2_QI		V16QI_FTYPE_V16QI_V16QI
27083 #define MULTI_ARG_2_DI_IMM	V2DI_FTYPE_V2DI_SI
27084 #define MULTI_ARG_2_SI_IMM	V4SI_FTYPE_V4SI_SI
27085 #define MULTI_ARG_2_HI_IMM	V8HI_FTYPE_V8HI_SI
27086 #define MULTI_ARG_2_QI_IMM	V16QI_FTYPE_V16QI_SI
27087 #define MULTI_ARG_2_DI_CMP	V2DI_FTYPE_V2DI_V2DI_CMP
27088 #define MULTI_ARG_2_SI_CMP	V4SI_FTYPE_V4SI_V4SI_CMP
27089 #define MULTI_ARG_2_HI_CMP	V8HI_FTYPE_V8HI_V8HI_CMP
27090 #define MULTI_ARG_2_QI_CMP	V16QI_FTYPE_V16QI_V16QI_CMP
27091 #define MULTI_ARG_2_SF_TF	V4SF_FTYPE_V4SF_V4SF_TF
27092 #define MULTI_ARG_2_DF_TF	V2DF_FTYPE_V2DF_V2DF_TF
27093 #define MULTI_ARG_2_DI_TF	V2DI_FTYPE_V2DI_V2DI_TF
27094 #define MULTI_ARG_2_SI_TF	V4SI_FTYPE_V4SI_V4SI_TF
27095 #define MULTI_ARG_2_HI_TF	V8HI_FTYPE_V8HI_V8HI_TF
27096 #define MULTI_ARG_2_QI_TF	V16QI_FTYPE_V16QI_V16QI_TF
27097 #define MULTI_ARG_1_SF		V4SF_FTYPE_V4SF
27098 #define MULTI_ARG_1_DF		V2DF_FTYPE_V2DF
27099 #define MULTI_ARG_1_SF2		V8SF_FTYPE_V8SF
27100 #define MULTI_ARG_1_DF2		V4DF_FTYPE_V4DF
27101 #define MULTI_ARG_1_DI		V2DI_FTYPE_V2DI
27102 #define MULTI_ARG_1_SI		V4SI_FTYPE_V4SI
27103 #define MULTI_ARG_1_HI		V8HI_FTYPE_V8HI
27104 #define MULTI_ARG_1_QI		V16QI_FTYPE_V16QI
27105 #define MULTI_ARG_1_SI_DI	V2DI_FTYPE_V4SI
27106 #define MULTI_ARG_1_HI_DI	V2DI_FTYPE_V8HI
27107 #define MULTI_ARG_1_HI_SI	V4SI_FTYPE_V8HI
27108 #define MULTI_ARG_1_QI_DI	V2DI_FTYPE_V16QI
27109 #define MULTI_ARG_1_QI_SI	V4SI_FTYPE_V16QI
27110 #define MULTI_ARG_1_QI_HI	V8HI_FTYPE_V16QI
27111 
27112 static const struct builtin_description bdesc_multi_arg[] =
27113 {
27114   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27115     "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27116     UNKNOWN, (int)MULTI_ARG_3_SF },
27117   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27118     "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27119     UNKNOWN, (int)MULTI_ARG_3_DF },
27120 
27121   { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27122     "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27123     UNKNOWN, (int)MULTI_ARG_3_SF },
27124   { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27125     "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27126     UNKNOWN, (int)MULTI_ARG_3_DF },
27127 
27128   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27129     "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27130     UNKNOWN, (int)MULTI_ARG_3_SF },
27131   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27132     "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27133     UNKNOWN, (int)MULTI_ARG_3_DF },
27134   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27135     "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27136     UNKNOWN, (int)MULTI_ARG_3_SF2 },
27137   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27138     "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27139     UNKNOWN, (int)MULTI_ARG_3_DF2 },
27140 
27141   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27142     "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27143     UNKNOWN, (int)MULTI_ARG_3_SF },
27144   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27145     "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27146     UNKNOWN, (int)MULTI_ARG_3_DF },
27147   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27148     "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27149     UNKNOWN, (int)MULTI_ARG_3_SF2 },
27150   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27151     "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27152     UNKNOWN, (int)MULTI_ARG_3_DF2 },
27153 
27154   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov",      IX86_BUILTIN_VPCMOV,	 UNKNOWN,      (int)MULTI_ARG_3_DI },
27155   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN,      (int)MULTI_ARG_3_DI },
27156   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si,        "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN,      (int)MULTI_ARG_3_SI },
27157   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi,        "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN,      (int)MULTI_ARG_3_HI },
27158   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi,       "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN,      (int)MULTI_ARG_3_QI },
27159   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df,        "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN,      (int)MULTI_ARG_3_DF },
27160   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf,        "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN,      (int)MULTI_ARG_3_SF },
27161 
27162   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov256",       IX86_BUILTIN_VPCMOV256,       UNKNOWN,      (int)MULTI_ARG_3_DI2 },
27163   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov_v4di256",  IX86_BUILTIN_VPCMOV_V4DI256,  UNKNOWN,      (int)MULTI_ARG_3_DI2 },
27164   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256,        "__builtin_ia32_vpcmov_v8si256",  IX86_BUILTIN_VPCMOV_V8SI256,  UNKNOWN,      (int)MULTI_ARG_3_SI2 },
27165   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256,       "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN,      (int)MULTI_ARG_3_HI2 },
27166   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256,       "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN,      (int)MULTI_ARG_3_QI2 },
27167   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256,        "__builtin_ia32_vpcmov_v4df256",  IX86_BUILTIN_VPCMOV_V4DF256,  UNKNOWN,      (int)MULTI_ARG_3_DF2 },
27168   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256,        "__builtin_ia32_vpcmov_v8sf256",  IX86_BUILTIN_VPCMOV_V8SF256,  UNKNOWN,      (int)MULTI_ARG_3_SF2 },
27169 
27170   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm,             "__builtin_ia32_vpperm",      IX86_BUILTIN_VPPERM,      UNKNOWN,      (int)MULTI_ARG_3_QI },
27171 
27172   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww,          "__builtin_ia32_vpmacssww",   IX86_BUILTIN_VPMACSSWW,   UNKNOWN,      (int)MULTI_ARG_3_HI },
27173   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww,           "__builtin_ia32_vpmacsww",    IX86_BUILTIN_VPMACSWW,    UNKNOWN,      (int)MULTI_ARG_3_HI },
27174   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd,          "__builtin_ia32_vpmacsswd",   IX86_BUILTIN_VPMACSSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27175   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd,           "__builtin_ia32_vpmacswd",    IX86_BUILTIN_VPMACSWD,    UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27176   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd,          "__builtin_ia32_vpmacssdd",   IX86_BUILTIN_VPMACSSDD,   UNKNOWN,      (int)MULTI_ARG_3_SI },
27177   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd,           "__builtin_ia32_vpmacsdd",    IX86_BUILTIN_VPMACSDD,    UNKNOWN,      (int)MULTI_ARG_3_SI },
27178   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql,         "__builtin_ia32_vpmacssdql",  IX86_BUILTIN_VPMACSSDQL,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27179   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh,         "__builtin_ia32_vpmacssdqh",  IX86_BUILTIN_VPMACSSDQH,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27180   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql,          "__builtin_ia32_vpmacsdql",   IX86_BUILTIN_VPMACSDQL,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27181   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh,          "__builtin_ia32_vpmacsdqh",   IX86_BUILTIN_VPMACSDQH,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27182   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd,         "__builtin_ia32_vpmadcsswd",  IX86_BUILTIN_VPMADCSSWD,  UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27183   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd,          "__builtin_ia32_vpmadcswd",   IX86_BUILTIN_VPMADCSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27184 
27185   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3,        "__builtin_ia32_vprotq",      IX86_BUILTIN_VPROTQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
27186   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3,        "__builtin_ia32_vprotd",      IX86_BUILTIN_VPROTD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
27187   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3,        "__builtin_ia32_vprotw",      IX86_BUILTIN_VPROTW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
27188   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3,       "__builtin_ia32_vprotb",      IX86_BUILTIN_VPROTB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
27189   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3,         "__builtin_ia32_vprotqi",     IX86_BUILTIN_VPROTQ_IMM,  UNKNOWN,      (int)MULTI_ARG_2_DI_IMM },
27190   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3,         "__builtin_ia32_vprotdi",     IX86_BUILTIN_VPROTD_IMM,  UNKNOWN,      (int)MULTI_ARG_2_SI_IMM },
27191   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3,         "__builtin_ia32_vprotwi",     IX86_BUILTIN_VPROTW_IMM,  UNKNOWN,      (int)MULTI_ARG_2_HI_IMM },
27192   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3,        "__builtin_ia32_vprotbi",     IX86_BUILTIN_VPROTB_IMM,  UNKNOWN,      (int)MULTI_ARG_2_QI_IMM },
27193   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3,         "__builtin_ia32_vpshaq",      IX86_BUILTIN_VPSHAQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
27194   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3,         "__builtin_ia32_vpshad",      IX86_BUILTIN_VPSHAD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
27195   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3,         "__builtin_ia32_vpshaw",      IX86_BUILTIN_VPSHAW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
27196   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3,        "__builtin_ia32_vpshab",      IX86_BUILTIN_VPSHAB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
27197   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3,         "__builtin_ia32_vpshlq",      IX86_BUILTIN_VPSHLQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
27198   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3,         "__builtin_ia32_vpshld",      IX86_BUILTIN_VPSHLD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
27199   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3,         "__builtin_ia32_vpshlw",      IX86_BUILTIN_VPSHLW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
27200   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3,        "__builtin_ia32_vpshlb",      IX86_BUILTIN_VPSHLB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
27201 
27202   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2,       "__builtin_ia32_vfrczss",     IX86_BUILTIN_VFRCZSS,     UNKNOWN,      (int)MULTI_ARG_2_SF },
27203   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2,       "__builtin_ia32_vfrczsd",     IX86_BUILTIN_VFRCZSD,     UNKNOWN,      (int)MULTI_ARG_2_DF },
27204   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2,         "__builtin_ia32_vfrczps",     IX86_BUILTIN_VFRCZPS,     UNKNOWN,      (int)MULTI_ARG_1_SF },
27205   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2,         "__builtin_ia32_vfrczpd",     IX86_BUILTIN_VFRCZPD,     UNKNOWN,      (int)MULTI_ARG_1_DF },
27206   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2,         "__builtin_ia32_vfrczps256",  IX86_BUILTIN_VFRCZPS256,  UNKNOWN,      (int)MULTI_ARG_1_SF2 },
27207   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2,         "__builtin_ia32_vfrczpd256",  IX86_BUILTIN_VFRCZPD256,  UNKNOWN,      (int)MULTI_ARG_1_DF2 },
27208 
27209   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw,           "__builtin_ia32_vphaddbw",    IX86_BUILTIN_VPHADDBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
27210   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd,           "__builtin_ia32_vphaddbd",    IX86_BUILTIN_VPHADDBD,    UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
27211   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq,           "__builtin_ia32_vphaddbq",    IX86_BUILTIN_VPHADDBQ,    UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
27212   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd,           "__builtin_ia32_vphaddwd",    IX86_BUILTIN_VPHADDWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
27213   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq,           "__builtin_ia32_vphaddwq",    IX86_BUILTIN_VPHADDWQ,    UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
27214   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq,           "__builtin_ia32_vphadddq",    IX86_BUILTIN_VPHADDDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
27215   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw,          "__builtin_ia32_vphaddubw",   IX86_BUILTIN_VPHADDUBW,   UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
27216   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd,          "__builtin_ia32_vphaddubd",   IX86_BUILTIN_VPHADDUBD,   UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
27217   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq,          "__builtin_ia32_vphaddubq",   IX86_BUILTIN_VPHADDUBQ,   UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
27218   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd,          "__builtin_ia32_vphadduwd",   IX86_BUILTIN_VPHADDUWD,   UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
27219   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq,          "__builtin_ia32_vphadduwq",   IX86_BUILTIN_VPHADDUWQ,   UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
27220   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq,          "__builtin_ia32_vphaddudq",   IX86_BUILTIN_VPHADDUDQ,   UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
27221   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw,           "__builtin_ia32_vphsubbw",    IX86_BUILTIN_VPHSUBBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
27222   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd,           "__builtin_ia32_vphsubwd",    IX86_BUILTIN_VPHSUBWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
27223   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq,           "__builtin_ia32_vphsubdq",    IX86_BUILTIN_VPHSUBDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
27224 
27225   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomeqb",    IX86_BUILTIN_VPCOMEQB,    EQ,           (int)MULTI_ARG_2_QI_CMP },
27226   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneb",    IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
27227   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneqb",   IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
27228   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomltb",    IX86_BUILTIN_VPCOMLTB,    LT,           (int)MULTI_ARG_2_QI_CMP },
27229   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomleb",    IX86_BUILTIN_VPCOMLEB,    LE,           (int)MULTI_ARG_2_QI_CMP },
27230   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgtb",    IX86_BUILTIN_VPCOMGTB,    GT,           (int)MULTI_ARG_2_QI_CMP },
27231   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgeb",    IX86_BUILTIN_VPCOMGEB,    GE,           (int)MULTI_ARG_2_QI_CMP },
27232 
27233   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomeqw",    IX86_BUILTIN_VPCOMEQW,    EQ,           (int)MULTI_ARG_2_HI_CMP },
27234   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomnew",    IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
27235   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomneqw",   IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
27236   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomltw",    IX86_BUILTIN_VPCOMLTW,    LT,           (int)MULTI_ARG_2_HI_CMP },
27237   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomlew",    IX86_BUILTIN_VPCOMLEW,    LE,           (int)MULTI_ARG_2_HI_CMP },
27238   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgtw",    IX86_BUILTIN_VPCOMGTW,    GT,           (int)MULTI_ARG_2_HI_CMP },
27239   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgew",    IX86_BUILTIN_VPCOMGEW,    GE,           (int)MULTI_ARG_2_HI_CMP },
27240 
27241   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomeqd",    IX86_BUILTIN_VPCOMEQD,    EQ,           (int)MULTI_ARG_2_SI_CMP },
27242   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomned",    IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
27243   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomneqd",   IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
27244   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomltd",    IX86_BUILTIN_VPCOMLTD,    LT,           (int)MULTI_ARG_2_SI_CMP },
27245   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomled",    IX86_BUILTIN_VPCOMLED,    LE,           (int)MULTI_ARG_2_SI_CMP },
27246   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomgtd",    IX86_BUILTIN_VPCOMGTD,    GT,           (int)MULTI_ARG_2_SI_CMP },
27247   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomged",    IX86_BUILTIN_VPCOMGED,    GE,           (int)MULTI_ARG_2_SI_CMP },
27248 
27249   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomeqq",    IX86_BUILTIN_VPCOMEQQ,    EQ,           (int)MULTI_ARG_2_DI_CMP },
27250   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneq",    IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
27251   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneqq",   IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
27252   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomltq",    IX86_BUILTIN_VPCOMLTQ,    LT,           (int)MULTI_ARG_2_DI_CMP },
27253   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomleq",    IX86_BUILTIN_VPCOMLEQ,    LE,           (int)MULTI_ARG_2_DI_CMP },
27254   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgtq",    IX86_BUILTIN_VPCOMGTQ,    GT,           (int)MULTI_ARG_2_DI_CMP },
27255   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgeq",    IX86_BUILTIN_VPCOMGEQ,    GE,           (int)MULTI_ARG_2_DI_CMP },
27256 
27257   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb",   IX86_BUILTIN_VPCOMEQUB,   EQ,           (int)MULTI_ARG_2_QI_CMP },
27258   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub",   IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
27259   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb",  IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
27260   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub",   IX86_BUILTIN_VPCOMLTUB,   LTU,          (int)MULTI_ARG_2_QI_CMP },
27261   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub",   IX86_BUILTIN_VPCOMLEUB,   LEU,          (int)MULTI_ARG_2_QI_CMP },
27262   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub",   IX86_BUILTIN_VPCOMGTUB,   GTU,          (int)MULTI_ARG_2_QI_CMP },
27263   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub",   IX86_BUILTIN_VPCOMGEUB,   GEU,          (int)MULTI_ARG_2_QI_CMP },
27264 
27265   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw",   IX86_BUILTIN_VPCOMEQUW,   EQ,           (int)MULTI_ARG_2_HI_CMP },
27266   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw",   IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
27267   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw",  IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
27268   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomltuw",   IX86_BUILTIN_VPCOMLTUW,   LTU,          (int)MULTI_ARG_2_HI_CMP },
27269   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomleuw",   IX86_BUILTIN_VPCOMLEUW,   LEU,          (int)MULTI_ARG_2_HI_CMP },
27270   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgtuw",   IX86_BUILTIN_VPCOMGTUW,   GTU,          (int)MULTI_ARG_2_HI_CMP },
27271   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgeuw",   IX86_BUILTIN_VPCOMGEUW,   GEU,          (int)MULTI_ARG_2_HI_CMP },
27272 
27273   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd",   IX86_BUILTIN_VPCOMEQUD,   EQ,           (int)MULTI_ARG_2_SI_CMP },
27274   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud",   IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
27275   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd",  IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
27276   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomltud",   IX86_BUILTIN_VPCOMLTUD,   LTU,          (int)MULTI_ARG_2_SI_CMP },
27277   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomleud",   IX86_BUILTIN_VPCOMLEUD,   LEU,          (int)MULTI_ARG_2_SI_CMP },
27278   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgtud",   IX86_BUILTIN_VPCOMGTUD,   GTU,          (int)MULTI_ARG_2_SI_CMP },
27279   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgeud",   IX86_BUILTIN_VPCOMGEUD,   GEU,          (int)MULTI_ARG_2_SI_CMP },
27280 
27281   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq",   IX86_BUILTIN_VPCOMEQUQ,   EQ,           (int)MULTI_ARG_2_DI_CMP },
27282   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq",   IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
27283   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq",  IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
27284   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomltuq",   IX86_BUILTIN_VPCOMLTUQ,   LTU,          (int)MULTI_ARG_2_DI_CMP },
27285   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomleuq",   IX86_BUILTIN_VPCOMLEUQ,   LEU,          (int)MULTI_ARG_2_DI_CMP },
27286   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgtuq",   IX86_BUILTIN_VPCOMGTUQ,   GTU,          (int)MULTI_ARG_2_DI_CMP },
27287   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgeuq",   IX86_BUILTIN_VPCOMGEUQ,   GEU,          (int)MULTI_ARG_2_DI_CMP },
27288 
27289   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
27290   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
27291   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
27292   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
27293   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
27294   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
27295   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
27296   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
27297 
27298   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueb",  IX86_BUILTIN_VPCOMTRUEB,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
27299   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtruew",  IX86_BUILTIN_VPCOMTRUEW,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
27300   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrued",  IX86_BUILTIN_VPCOMTRUED,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
27301   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueq",  IX86_BUILTIN_VPCOMTRUEQ,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
27302   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
27303   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
27304   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
27305   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
27306 
27307   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3,     "__builtin_ia32_vpermil2pd",  IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27308   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3,     "__builtin_ia32_vpermil2ps",  IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27309   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3,     "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27310   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3,     "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27311 
27312 };
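/* A usage sketch (illustrative, not part of GCC): bdesc_multi_arg entries
   are registered by ix86_init_mmx_sse_builtins and expanded by
   ix86_expand_multi_arg_builtin, which derives the operand count from the
   MULTI_ARG_* code and, for the *_CMP/*_TF entries, supplies the rtx
   comparison code as an extra operand.  At the source level that means,
   when compiling with -mxop:

       typedef long long __v2di __attribute__ ((__vector_size__ (16)));

       __v2di
       select_bits (__v2di a, __v2di b, __v2di mask)
       {
         // MULTI_ARG_3_DI: three V2DI operands, V2DI result (XOP vpcmov).
         return __builtin_ia32_vpcmov (a, b, mask);
       }

   select_bits is a hypothetical user function; the xopintrin.h wrappers
   are thin layers over exactly these builtin names.  */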
27313 
27314 /* TM vector builtins.  */
27315 
27316 /* Reuse the existing x86-specific `struct builtin_description' because
27317    we're lazy.  Add casts to make them fit.  */
27318 static const struct builtin_description bdesc_tm[] =
27319 {
27320   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27321   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27322   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27323   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27324   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27325   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27326   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27327 
27328   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27329   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27330   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27331   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27332   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27333   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27334   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27335 
27336   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27337   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27338   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27339   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27340   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27341   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27342   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27343 
27344   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27345   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27346   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27347 };
27348 
27349 /* TM callbacks.  */
27350 
27351 /* Return the builtin decl needed to load a vector of TYPE.  */
27352 
27353 static tree
27354 ix86_builtin_tm_load (tree type)
27355 {
27356   if (TREE_CODE (type) == VECTOR_TYPE)
27357     {
27358       switch (tree_low_cst (TYPE_SIZE (type), 1))
27359 	{
27360 	case 64:
27361 	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27362 	case 128:
27363 	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27364 	case 256:
27365 	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27366 	}
27367     }
27368   return NULL_TREE;
27369 }
27370 
27371 /* Return the builtin decl needed to store a vector of TYPE.  */
27372 
27373 static tree
27374 ix86_builtin_tm_store (tree type)
27375 {
27376   if (TREE_CODE (type) == VECTOR_TYPE)
27377     {
27378       switch (tree_low_cst (TYPE_SIZE (type), 1))
27379 	{
27380 	case 64:
27381 	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27382 	case 128:
27383 	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27384 	case 256:
27385 	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27386 	}
27387     }
27388   return NULL_TREE;
27389 }
27390 
27391 /* Initialize the transactional memory vector load/store builtins.  */
27392 
27393 static void
27394 ix86_init_tm_builtins (void)
27395 {
27396   enum ix86_builtin_func_type ftype;
27397   const struct builtin_description *d;
27398   size_t i;
27399   tree decl;
27400   tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27401   tree attrs_log, attrs_type_log;
27402 
27403   if (!flag_tm)
27404     return;
27405 
27406   /* If there are no builtins defined, we must be compiling in a
27407      language without trans-mem support.  */
27408   if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27409     return;
27410 
27411   /* Use whatever attributes a normal TM load has.  */
27412   decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27413   attrs_load = DECL_ATTRIBUTES (decl);
27414   attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27415   /* Use whatever attributes a normal TM store has.  */
27416   decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27417   attrs_store = DECL_ATTRIBUTES (decl);
27418   attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27419   /* Use whatever attributes a normal TM log has.  */
27420   decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27421   attrs_log = DECL_ATTRIBUTES (decl);
27422   attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27423 
27424   for (i = 0, d = bdesc_tm;
27425        i < ARRAY_SIZE (bdesc_tm);
27426        i++, d++)
27427     {
27428       if ((d->mask & ix86_isa_flags) != 0
27429 	  || (lang_hooks.builtin_function
27430 	      == lang_hooks.builtin_function_ext_scope))
27431 	{
27432 	  tree type, attrs, attrs_type;
27433 	  enum built_in_function code = (enum built_in_function) d->code;
27434 
27435 	  ftype = (enum ix86_builtin_func_type) d->flag;
27436 	  type = ix86_get_builtin_func_type (ftype);
27437 
27438 	  if (BUILTIN_TM_LOAD_P (code))
27439 	    {
27440 	      attrs = attrs_load;
27441 	      attrs_type = attrs_type_load;
27442 	    }
27443 	  else if (BUILTIN_TM_STORE_P (code))
27444 	    {
27445 	      attrs = attrs_store;
27446 	      attrs_type = attrs_type_store;
27447 	    }
27448 	  else
27449 	    {
27450 	      attrs = attrs_log;
27451 	      attrs_type = attrs_type_log;
27452 	    }
27453 	  decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27454 				       /* The builtin without the prefix for
27455 					  calling it directly.  */
27456 				       d->name + strlen ("__builtin_"),
27457 				       attrs);
27458 	  /* add_builtin_function () will set the DECL_ATTRIBUTES; now
27459 	     set the TYPE_ATTRIBUTES.  */
27460 	  decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27461 
27462 	  set_builtin_decl (code, decl, false);
27463 	}
27464     }
27465 }
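/* Illustrative sketch (not part of GCC): with -fgnu-tm the trans-mem pass
   asks the target for a vector load/store barrier via the two hooks above,
   so a transactional store of a 128-bit vector such as

       typedef float __v4sf __attribute__ ((__vector_size__ (16)));
       __v4sf g;
       void f (__v4sf x) { __transaction_atomic { g = x; } }

   ends up as a call to _ITM_WM128 (&g, x), i.e. the builtin registered
   above without its __builtin_ prefix, which libitm resolves to its SSE
   write barrier.  (f and g are hypothetical user code.)  */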
27466 
27467 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27468    not in the current target ISA, so that the user can compile particular
27469    modules with target-specific options that differ from the command-line
27470    options.  */
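/* For example (an illustrative sketch, not part of GCC), this is what lets
   a single function opt in to a wider ISA via the target attribute while
   the rest of the file is built with, say, plain -msse2:

       typedef int __v8si __attribute__ ((__vector_size__ (32)));

       __attribute__ ((target ("avx2")))
       __v8si
       double_it (__v8si x)
       {
         return __builtin_ia32_paddd256 (x, x);   // AVX2-only builtin
       }

   double_it is a hypothetical user function.  */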
27471 static void
27472 ix86_init_mmx_sse_builtins (void)
27473 {
27474   const struct builtin_description * d;
27475   enum ix86_builtin_func_type ftype;
27476   size_t i;
27477 
27478   /* Add all special builtins with variable number of operands.  */
27479   for (i = 0, d = bdesc_special_args;
27480        i < ARRAY_SIZE (bdesc_special_args);
27481        i++, d++)
27482     {
27483       if (d->name == 0)
27484 	continue;
27485 
27486       ftype = (enum ix86_builtin_func_type) d->flag;
27487       def_builtin (d->mask, d->name, ftype, d->code);
27488     }
27489 
27490   /* Add all builtins with variable number of operands.  */
27491   for (i = 0, d = bdesc_args;
27492        i < ARRAY_SIZE (bdesc_args);
27493        i++, d++)
27494     {
27495       if (d->name == 0)
27496 	continue;
27497 
27498       ftype = (enum ix86_builtin_func_type) d->flag;
27499       def_builtin_const (d->mask, d->name, ftype, d->code);
27500     }
27501 
27502   /* pcmpestr[im] insns.  */
27503   for (i = 0, d = bdesc_pcmpestr;
27504        i < ARRAY_SIZE (bdesc_pcmpestr);
27505        i++, d++)
27506     {
27507       if (d->code == IX86_BUILTIN_PCMPESTRM128)
27508 	ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27509       else
27510 	ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27511       def_builtin_const (d->mask, d->name, ftype, d->code);
27512     }
27513 
27514   /* pcmpistr[im] insns.  */
27515   for (i = 0, d = bdesc_pcmpistr;
27516        i < ARRAY_SIZE (bdesc_pcmpistr);
27517        i++, d++)
27518     {
27519       if (d->code == IX86_BUILTIN_PCMPISTRM128)
27520 	ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27521       else
27522 	ftype = INT_FTYPE_V16QI_V16QI_INT;
27523       def_builtin_const (d->mask, d->name, ftype, d->code);
27524     }
27525 
27526   /* comi/ucomi insns.  */
27527   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27528     {
27529       if (d->mask == OPTION_MASK_ISA_SSE2)
27530 	ftype = INT_FTYPE_V2DF_V2DF;
27531       else
27532 	ftype = INT_FTYPE_V4SF_V4SF;
27533       def_builtin_const (d->mask, d->name, ftype, d->code);
27534     }
27535 
27536   /* SSE */
27537   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27538 	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27539   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27540 	       UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27541 
27542   /* SSE or 3DNow!A */
27543   def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27544 	       "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27545 	       IX86_BUILTIN_MASKMOVQ);
27546 
27547   /* SSE2 */
27548   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27549 	       VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27550 
27551   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27552 	       VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27553   x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27554 			    VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27555 
27556   /* SSE3.  */
27557   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27558 	       VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27559   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27560 	       VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27561 
27562   /* AES */
27563   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27564 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27565   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27566 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27567   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27568 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27569   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27570 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27571   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27572 		     V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27573   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27574 		     V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27575 
27576   /* PCLMUL */
27577   def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27578 		     V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27579 
27580   /* RDRND */
27581   def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27582 	       INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27583   def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27584 	       INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27585   def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27586 	       "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27587 	       IX86_BUILTIN_RDRAND64_STEP);
27588 
27589   /* AVX2 */
27590   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27591 	       V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27592 	       IX86_BUILTIN_GATHERSIV2DF);
27593 
27594   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27595 	       V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27596 	       IX86_BUILTIN_GATHERSIV4DF);
27597 
27598   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27599 	       V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27600 	       IX86_BUILTIN_GATHERDIV2DF);
27601 
27602   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27603 	       V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27604 	       IX86_BUILTIN_GATHERDIV4DF);
27605 
27606   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27607 	       V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27608 	       IX86_BUILTIN_GATHERSIV4SF);
27609 
27610   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27611 	       V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27612 	       IX86_BUILTIN_GATHERSIV8SF);
27613 
27614   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27615 	       V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27616 	       IX86_BUILTIN_GATHERDIV4SF);
27617 
27618   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27619 	       V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27620 	       IX86_BUILTIN_GATHERDIV8SF);
27621 
27622   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27623 	       V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27624 	       IX86_BUILTIN_GATHERSIV2DI);
27625 
27626   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27627 	       V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27628 	       IX86_BUILTIN_GATHERSIV4DI);
27629 
27630   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27631 	       V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27632 	       IX86_BUILTIN_GATHERDIV2DI);
27633 
27634   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27635 	       V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27636 	       IX86_BUILTIN_GATHERDIV4DI);
27637 
27638   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27639 	       V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27640 	       IX86_BUILTIN_GATHERSIV4SI);
27641 
27642   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27643 	       V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27644 	       IX86_BUILTIN_GATHERSIV8SI);
27645 
27646   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27647 	       V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27648 	       IX86_BUILTIN_GATHERDIV4SI);
27649 
27650   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27651 	       V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27652 	       IX86_BUILTIN_GATHERDIV8SI);
27653 
27654   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27655 	       V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27656 	       IX86_BUILTIN_GATHERALTSIV4DF);
27657 
27658   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27659 	       V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27660 	       IX86_BUILTIN_GATHERALTDIV8SF);
27661 
27662   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27663 	       V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27664 	       IX86_BUILTIN_GATHERALTSIV4DI);
27665 
27666   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27667 	       V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27668 	       IX86_BUILTIN_GATHERALTDIV8SI);
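  /* Usage sketch (illustrative, not part of GCC): the gather builtins take
     the pass-through source, the base pointer, the index vector, the mask
     and a literal scale, mirroring the hardware operand order.  With -mavx2:

         typedef float     __v4sf __attribute__ ((__vector_size__ (16)));
         typedef long long __v2di __attribute__ ((__vector_size__ (16)));

         __v4sf
         gather2 (const float *base, __v2di idx, __v4sf src, __v4sf mask)
         {
           // V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT; scale must be 1/2/4/8.
           return __builtin_ia32_gatherdiv4sf (src, base, idx, mask, 4);
         }

     gather2 is a hypothetical user function; the avx2intrin.h gather
     intrinsics wrap these same names.  */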
27669 
27670   /* MMX access to the vec_init patterns.  */
27671   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27672 		     V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27673 
27674   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27675 		     V4HI_FTYPE_HI_HI_HI_HI,
27676 		     IX86_BUILTIN_VEC_INIT_V4HI);
27677 
27678   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27679 		     V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27680 		     IX86_BUILTIN_VEC_INIT_V8QI);
27681 
27682   /* Access to the vec_extract patterns.  */
27683   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27684 		     DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27685   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27686 		     DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27687   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27688 		     FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27689   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27690 		     SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27691   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27692 		     HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27693 
27694   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27695 		     "__builtin_ia32_vec_ext_v4hi",
27696 		     HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27697 
27698   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27699 		     SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27700 
27701   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27702 		     QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27703 
27704   /* Access to the vec_set patterns.  */
27705   def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27706 		     "__builtin_ia32_vec_set_v2di",
27707 		     V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27708 
27709   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27710 		     V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27711 
27712   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27713 		     V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27714 
27715   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27716 		     V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27717 
27718   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27719 		     "__builtin_ia32_vec_set_v4hi",
27720 		     V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27721 
27722   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27723 		     V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27724 
27725   /* Add FMA4 and XOP multi-argument instructions.  */
27726   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27727     {
27728       if (d->name == 0)
27729 	continue;
27730 
27731       ftype = (enum ix86_builtin_func_type) d->flag;
27732       def_builtin_const (d->mask, d->name, ftype, d->code);
27733     }
27734 }
27735 
27736 /* Internal method for ix86_init_builtins.  */
27737 
27738 static void
27739 ix86_init_builtins_va_builtins_abi (void)
27740 {
27741   tree ms_va_ref, sysv_va_ref;
27742   tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27743   tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27744   tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27745   tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27746 
27747   if (!TARGET_64BIT)
27748     return;
27749   fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27750   fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27751   ms_va_ref = build_reference_type (ms_va_list_type_node);
27752   sysv_va_ref =
27753     build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27754 
27755   fnvoid_va_end_ms =
27756     build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27757   fnvoid_va_start_ms =
27758     build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27759   fnvoid_va_end_sysv =
27760     build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27761   fnvoid_va_start_sysv =
27762     build_varargs_function_type_list (void_type_node, sysv_va_ref,
27763     				       NULL_TREE);
27764   fnvoid_va_copy_ms =
27765     build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27766     			      NULL_TREE);
27767   fnvoid_va_copy_sysv =
27768     build_function_type_list (void_type_node, sysv_va_ref,
27769     			      sysv_va_ref, NULL_TREE);
27770 
27771   add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27772   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27773   add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27774   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27775   add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27776 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27777   add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27778   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27779   add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27780   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27781   add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27782 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27783 }
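/* Usage sketch (illustrative, not part of GCC; follows the documented
   __builtin_ms_va_list extension): on 64-bit targets the builtins above
   let SysV code implement an ms_abi varargs function such as

       int __attribute__ ((ms_abi))
       sum (int n, ...)
       {
         __builtin_ms_va_list ap;
         int i, s = 0;

         __builtin_ms_va_start (ap, n);
         for (i = 0; i < n; i++)
           s += __builtin_va_arg (ap, int);
         __builtin_ms_va_end (ap);
         return s;
       }

   sum is a hypothetical user function.  */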
27784 
27785 static void
27786 ix86_init_builtin_types (void)
27787 {
27788   tree float128_type_node, float80_type_node;
27789 
27790   /* The __float80 type.  */
27791   float80_type_node = long_double_type_node;
27792   if (TYPE_MODE (float80_type_node) != XFmode)
27793     {
27794       /* The __float80 type.  */
27795       float80_type_node = make_node (REAL_TYPE);
27796 
27797       TYPE_PRECISION (float80_type_node) = 80;
27798       layout_type (float80_type_node);
27799     }
27800   lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27801 
27802   /* The __float128 type.  */
27803   float128_type_node = make_node (REAL_TYPE);
27804   TYPE_PRECISION (float128_type_node) = 128;
27805   layout_type (float128_type_node);
27806   lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27807 
27808   /* This macro is built by i386-builtin-types.awk.  */
27809   DEFINE_BUILTIN_PRIMITIVE_TYPES;
27810 }
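/* Usage sketch (illustrative, not part of GCC): the two types registered
   above are directly visible to C code on x86 targets, together with the
   'w'/'q' constant suffixes:

       __float80  e = 1.5w;          // 80-bit x87 extended precision
       __float128 q = 1.0q / 3.0q;   // 128-bit IEEE quad, software emulated
*/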
27811 
27812 static void
27813 ix86_init_builtins (void)
27814 {
27815   tree t;
27816 
27817   ix86_init_builtin_types ();
27818 
27819   /* TFmode support builtins.  */
27820   def_builtin_const (0, "__builtin_infq",
27821 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27822   def_builtin_const (0, "__builtin_huge_valq",
27823 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27824 
27825   /* We will expand them to a normal call if SSE2 isn't available, since
27826      they are used by libgcc.  */
27827   t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27828   t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27829 			    BUILT_IN_MD, "__fabstf2", NULL_TREE);
27830   TREE_READONLY (t) = 1;
27831   ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27832 
27833   t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27834   t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27835 			    BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27836   TREE_READONLY (t) = 1;
27837   ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
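  /* Usage sketch (illustrative, not part of GCC): with SSE2 these expand
     to inline TFmode mask operations, otherwise to ordinary calls to the
     libgcc routines named above, e.g.

         __float128
         magnitude_with_sign (__float128 x, __float128 s)
         {
           __float128 m = __builtin_fabsq (x);   // |x|
           return __builtin_copysignq (m, s);    // |x| carrying s's sign
         }

     magnitude_with_sign is a hypothetical user function.  */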
27838 
27839   ix86_init_tm_builtins ();
27840   ix86_init_mmx_sse_builtins ();
27841 
27842   if (TARGET_LP64)
27843     ix86_init_builtins_va_builtins_abi ();
27844 
27845 #ifdef SUBTARGET_INIT_BUILTINS
27846   SUBTARGET_INIT_BUILTINS;
27847 #endif
27848 }
27849 
27850 /* Return the ix86 builtin for CODE.  */
27851 
27852 static tree
27853 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27854 {
27855   if (code >= IX86_BUILTIN_MAX)
27856     return error_mark_node;
27857 
27858   return ix86_builtins[code];
27859 }
27860 
27861 /* Errors in the source file can cause expand_expr to return const0_rtx
27862    where we expect a vector.  To avoid crashing, use one of the vector
27863    clear instructions.  */
27864 static rtx
27865 safe_vector_operand (rtx x, enum machine_mode mode)
27866 {
27867   if (x == const0_rtx)
27868     x = CONST0_RTX (mode);
27869   return x;
27870 }
27871 
27872 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
27873 
27874 static rtx
27875 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27876 {
27877   rtx pat;
27878   tree arg0 = CALL_EXPR_ARG (exp, 0);
27879   tree arg1 = CALL_EXPR_ARG (exp, 1);
27880   rtx op0 = expand_normal (arg0);
27881   rtx op1 = expand_normal (arg1);
27882   enum machine_mode tmode = insn_data[icode].operand[0].mode;
27883   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27884   enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27885 
27886   if (VECTOR_MODE_P (mode0))
27887     op0 = safe_vector_operand (op0, mode0);
27888   if (VECTOR_MODE_P (mode1))
27889     op1 = safe_vector_operand (op1, mode1);
27890 
27891   if (optimize || !target
27892       || GET_MODE (target) != tmode
27893       || !insn_data[icode].operand[0].predicate (target, tmode))
27894     target = gen_reg_rtx (tmode);
27895 
27896   if (GET_MODE (op1) == SImode && mode1 == TImode)
27897     {
27898       rtx x = gen_reg_rtx (V4SImode);
27899       emit_insn (gen_sse2_loadd (x, op1));
27900       op1 = gen_lowpart (TImode, x);
27901     }
27902 
27903   if (!insn_data[icode].operand[1].predicate (op0, mode0))
27904     op0 = copy_to_mode_reg (mode0, op0);
27905   if (!insn_data[icode].operand[2].predicate (op1, mode1))
27906     op1 = copy_to_mode_reg (mode1, op1);
27907 
27908   pat = GEN_FCN (icode) (target, op0, op1);
27909   if (! pat)
27910     return 0;
27911 
27912   emit_insn (pat);
27913 
27914   return target;
27915 }
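/* Mechanics sketch (illustrative, not part of GCC): for a builtin routed
   through this helper the expansion is essentially

       op0 = force operand 0 into something satisfying operand[1].predicate;
       op1 = force operand 1 into something satisfying operand[2].predicate;
       emit_insn (GEN_FCN (icode) (target, op0, op1));

   i.e. one builtin call maps to one named insn pattern.  The SImode/TImode
   special case above loads a 32-bit operand into an XMM register and reuses
   its low part for patterns whose second operand is TImode.  */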
27916 
27917 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
27918 
27919 static rtx
27920 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27921 			       enum ix86_builtin_func_type m_type,
27922 			       enum rtx_code sub_code)
27923 {
27924   rtx pat;
27925   int i;
27926   int nargs;
27927   bool comparison_p = false;
27928   bool tf_p = false;
27929   bool last_arg_constant = false;
27930   int num_memory = 0;
27931   struct {
27932     rtx op;
27933     enum machine_mode mode;
27934   } args[4];
27935 
27936   enum machine_mode tmode = insn_data[icode].operand[0].mode;
27937 
27938   switch (m_type)
27939     {
27940     case MULTI_ARG_4_DF2_DI_I:
27941     case MULTI_ARG_4_DF2_DI_I1:
27942     case MULTI_ARG_4_SF2_SI_I:
27943     case MULTI_ARG_4_SF2_SI_I1:
27944       nargs = 4;
27945       last_arg_constant = true;
27946       break;
27947 
27948     case MULTI_ARG_3_SF:
27949     case MULTI_ARG_3_DF:
27950     case MULTI_ARG_3_SF2:
27951     case MULTI_ARG_3_DF2:
27952     case MULTI_ARG_3_DI:
27953     case MULTI_ARG_3_SI:
27954     case MULTI_ARG_3_SI_DI:
27955     case MULTI_ARG_3_HI:
27956     case MULTI_ARG_3_HI_SI:
27957     case MULTI_ARG_3_QI:
27958     case MULTI_ARG_3_DI2:
27959     case MULTI_ARG_3_SI2:
27960     case MULTI_ARG_3_HI2:
27961     case MULTI_ARG_3_QI2:
27962       nargs = 3;
27963       break;
27964 
27965     case MULTI_ARG_2_SF:
27966     case MULTI_ARG_2_DF:
27967     case MULTI_ARG_2_DI:
27968     case MULTI_ARG_2_SI:
27969     case MULTI_ARG_2_HI:
27970     case MULTI_ARG_2_QI:
27971       nargs = 2;
27972       break;
27973 
27974     case MULTI_ARG_2_DI_IMM:
27975     case MULTI_ARG_2_SI_IMM:
27976     case MULTI_ARG_2_HI_IMM:
27977     case MULTI_ARG_2_QI_IMM:
27978       nargs = 2;
27979       last_arg_constant = true;
27980       break;
27981 
27982     case MULTI_ARG_1_SF:
27983     case MULTI_ARG_1_DF:
27984     case MULTI_ARG_1_SF2:
27985     case MULTI_ARG_1_DF2:
27986     case MULTI_ARG_1_DI:
27987     case MULTI_ARG_1_SI:
27988     case MULTI_ARG_1_HI:
27989     case MULTI_ARG_1_QI:
27990     case MULTI_ARG_1_SI_DI:
27991     case MULTI_ARG_1_HI_DI:
27992     case MULTI_ARG_1_HI_SI:
27993     case MULTI_ARG_1_QI_DI:
27994     case MULTI_ARG_1_QI_SI:
27995     case MULTI_ARG_1_QI_HI:
27996       nargs = 1;
27997       break;
27998 
27999     case MULTI_ARG_2_DI_CMP:
28000     case MULTI_ARG_2_SI_CMP:
28001     case MULTI_ARG_2_HI_CMP:
28002     case MULTI_ARG_2_QI_CMP:
28003       nargs = 2;
28004       comparison_p = true;
28005       break;
28006 
28007     case MULTI_ARG_2_SF_TF:
28008     case MULTI_ARG_2_DF_TF:
28009     case MULTI_ARG_2_DI_TF:
28010     case MULTI_ARG_2_SI_TF:
28011     case MULTI_ARG_2_HI_TF:
28012     case MULTI_ARG_2_QI_TF:
28013       nargs = 2;
28014       tf_p = true;
28015       break;
28016 
28017     default:
28018       gcc_unreachable ();
28019     }
28020 
28021   if (optimize || !target
28022       || GET_MODE (target) != tmode
28023       || !insn_data[icode].operand[0].predicate (target, tmode))
28024     target = gen_reg_rtx (tmode);
28025 
28026   gcc_assert (nargs <= 4);
28027 
28028   for (i = 0; i < nargs; i++)
28029     {
28030       tree arg = CALL_EXPR_ARG (exp, i);
28031       rtx op = expand_normal (arg);
28032       int adjust = (comparison_p) ? 1 : 0;
28033       enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28034 
28035       if (last_arg_constant && i == nargs - 1)
28036 	{
28037 	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28038 	    {
28039 	      enum insn_code new_icode = icode;
28040 	      switch (icode)
28041 		{
28042 		case CODE_FOR_xop_vpermil2v2df3:
28043 		case CODE_FOR_xop_vpermil2v4sf3:
28044 		case CODE_FOR_xop_vpermil2v4df3:
28045 		case CODE_FOR_xop_vpermil2v8sf3:
28046 		  error ("the last argument must be a 2-bit immediate");
28047 		  return gen_reg_rtx (tmode);
28048 		case CODE_FOR_xop_rotlv2di3:
28049 		  new_icode = CODE_FOR_rotlv2di3;
28050 		  goto xop_rotl;
28051 		case CODE_FOR_xop_rotlv4si3:
28052 		  new_icode = CODE_FOR_rotlv4si3;
28053 		  goto xop_rotl;
28054 		case CODE_FOR_xop_rotlv8hi3:
28055 		  new_icode = CODE_FOR_rotlv8hi3;
28056 		  goto xop_rotl;
28057 		case CODE_FOR_xop_rotlv16qi3:
28058 		  new_icode = CODE_FOR_rotlv16qi3;
28059 		xop_rotl:
28060 		  if (CONST_INT_P (op))
28061 		    {
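		      /* Reduce the constant rotate count modulo the element
			 width so that it satisfies the operand predicate.  */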
28062 		      int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28063 		      op = GEN_INT (INTVAL (op) & mask);
28064 		      gcc_checking_assert
28065 			(insn_data[icode].operand[i + 1].predicate (op, mode));
28066 		    }
28067 		  else
28068 		    {
28069 		      gcc_checking_assert
28070 			(nargs == 2
28071 			 && insn_data[new_icode].operand[0].mode == tmode
28072 			 && insn_data[new_icode].operand[1].mode == tmode
28073 			 && insn_data[new_icode].operand[2].mode == mode
28074 			 && insn_data[new_icode].operand[0].predicate
28075 			    == insn_data[icode].operand[0].predicate
28076 			 && insn_data[new_icode].operand[1].predicate
28077 			    == insn_data[icode].operand[1].predicate);
28078 		      icode = new_icode;
28079 		      goto non_constant;
28080 		    }
28081 		  break;
28082 		default:
28083 		  gcc_unreachable ();
28084 		}
28085 	    }
28086 	}
28087       else
28088 	{
28089 	non_constant:
28090 	  if (VECTOR_MODE_P (mode))
28091 	    op = safe_vector_operand (op, mode);
28092 
28093 	  /* If we aren't optimizing, only allow one memory operand to be
28094 	     generated.  */
28095 	  if (memory_operand (op, mode))
28096 	    num_memory++;
28097 
28098 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28099 
28100 	  if (optimize
28101 	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28102 	      || num_memory > 1)
28103 	    op = force_reg (mode, op);
28104 	}
28105 
28106       args[i].op = op;
28107       args[i].mode = mode;
28108     }
28109 
28110   switch (nargs)
28111     {
28112     case 1:
28113       pat = GEN_FCN (icode) (target, args[0].op);
28114       break;
28115 
28116     case 2:
28117       if (tf_p)
28118 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28119 			       GEN_INT ((int)sub_code));
28120       else if (! comparison_p)
28121 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28122       else
28123 	{
28124 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28125 				       args[0].op,
28126 				       args[1].op);
28127 
28128 	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28129 	}
28130       break;
28131 
28132     case 3:
28133       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28134       break;
28135 
28136     case 4:
28137       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28138       break;
28139 
28140     default:
28141       gcc_unreachable ();
28142     }
28143 
28144   if (! pat)
28145     return 0;
28146 
28147   emit_insn (pat);
28148   return target;
28149 }
28150 
28151 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28152    insns with vec_merge.  */
28153 
28154 static rtx
28155 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28156 				    rtx target)
28157 {
28158   rtx pat;
28159   tree arg0 = CALL_EXPR_ARG (exp, 0);
28160   rtx op1, op0 = expand_normal (arg0);
28161   enum machine_mode tmode = insn_data[icode].operand[0].mode;
28162   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28163 
28164   if (optimize || !target
28165       || GET_MODE (target) != tmode
28166       || !insn_data[icode].operand[0].predicate (target, tmode))
28167     target = gen_reg_rtx (tmode);
28168 
28169   if (VECTOR_MODE_P (mode0))
28170     op0 = safe_vector_operand (op0, mode0);
28171 
28172   if ((optimize && !register_operand (op0, mode0))
28173       || !insn_data[icode].operand[1].predicate (op0, mode0))
28174     op0 = copy_to_mode_reg (mode0, op0);
28175 
28176   op1 = op0;
28177   if (!insn_data[icode].operand[2].predicate (op1, mode0))
28178     op1 = copy_to_mode_reg (mode0, op1);
28179 
28180   pat = GEN_FCN (icode) (target, op0, op1);
28181   if (! pat)
28182     return 0;
28183   emit_insn (pat);
28184   return target;
28185 }
28186 
28187 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
28188 
28189 static rtx
28190 ix86_expand_sse_compare (const struct builtin_description *d,
28191 			 tree exp, rtx target, bool swap)
28192 {
28193   rtx pat;
28194   tree arg0 = CALL_EXPR_ARG (exp, 0);
28195   tree arg1 = CALL_EXPR_ARG (exp, 1);
28196   rtx op0 = expand_normal (arg0);
28197   rtx op1 = expand_normal (arg1);
28198   rtx op2;
28199   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28200   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28201   enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28202   enum rtx_code comparison = d->comparison;
28203 
28204   if (VECTOR_MODE_P (mode0))
28205     op0 = safe_vector_operand (op0, mode0);
28206   if (VECTOR_MODE_P (mode1))
28207     op1 = safe_vector_operand (op1, mode1);
28208 
28209   /* Swap operands if we have a comparison that isn't available in
28210      hardware.  */
28211   if (swap)
28212     {
28213       rtx tmp = gen_reg_rtx (mode1);
28214       emit_move_insn (tmp, op1);
28215       op1 = op0;
28216       op0 = tmp;
28217     }
28218 
28219   if (optimize || !target
28220       || GET_MODE (target) != tmode
28221       || !insn_data[d->icode].operand[0].predicate (target, tmode))
28222     target = gen_reg_rtx (tmode);
28223 
28224   if ((optimize && !register_operand (op0, mode0))
28225       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28226     op0 = copy_to_mode_reg (mode0, op0);
28227   if ((optimize && !register_operand (op1, mode1))
28228       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28229     op1 = copy_to_mode_reg (mode1, op1);
28230 
28231   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28232   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28233   if (! pat)
28234     return 0;
28235   emit_insn (pat);
28236   return target;
28237 }
28238 
28239 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
28240 
28241 static rtx
28242 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28243 		      rtx target)
28244 {
28245   rtx pat;
28246   tree arg0 = CALL_EXPR_ARG (exp, 0);
28247   tree arg1 = CALL_EXPR_ARG (exp, 1);
28248   rtx op0 = expand_normal (arg0);
28249   rtx op1 = expand_normal (arg1);
28250   enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28251   enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28252   enum rtx_code comparison = d->comparison;
28253 
28254   if (VECTOR_MODE_P (mode0))
28255     op0 = safe_vector_operand (op0, mode0);
28256   if (VECTOR_MODE_P (mode1))
28257     op1 = safe_vector_operand (op1, mode1);
28258 
28259   /* Swap operands if we have a comparison that isn't available in
28260      hardware.  */
28261   if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28262     {
28263       rtx tmp = op1;
28264       op1 = op0;
28265       op0 = tmp;
28266     }
28267 
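  /* The comi insn only sets the flags; build a zeroed SImode pseudo and
     set just its low byte from the flags comparison emitted below.  */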
28268   target = gen_reg_rtx (SImode);
28269   emit_move_insn (target, const0_rtx);
28270   target = gen_rtx_SUBREG (QImode, target, 0);
28271 
28272   if ((optimize && !register_operand (op0, mode0))
28273       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28274     op0 = copy_to_mode_reg (mode0, op0);
28275   if ((optimize && !register_operand (op1, mode1))
28276       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28277     op1 = copy_to_mode_reg (mode1, op1);
28278 
28279   pat = GEN_FCN (d->icode) (op0, op1);
28280   if (! pat)
28281     return 0;
28282   emit_insn (pat);
28283   emit_insn (gen_rtx_SET (VOIDmode,
28284 			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28285 			  gen_rtx_fmt_ee (comparison, QImode,
28286 					  SET_DEST (pat),
28287 					  const0_rtx)));
28288 
28289   return SUBREG_REG (target);
28290 }
28291 
28292 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
28293 
28294 static rtx
28295 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28296 		       rtx target)
28297 {
28298   rtx pat;
28299   tree arg0 = CALL_EXPR_ARG (exp, 0);
28300   rtx op1, op0 = expand_normal (arg0);
28301   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28302   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28303 
28304   if (optimize || target == 0
28305       || GET_MODE (target) != tmode
28306       || !insn_data[d->icode].operand[0].predicate (target, tmode))
28307     target = gen_reg_rtx (tmode);
28308 
28309   if (VECTOR_MODE_P (mode0))
28310     op0 = safe_vector_operand (op0, mode0);
28311 
28312   if ((optimize && !register_operand (op0, mode0))
28313       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28314     op0 = copy_to_mode_reg (mode0, op0);
28315 
28316   op1 = GEN_INT (d->comparison);
28317 
28318   pat = GEN_FCN (d->icode) (target, op0, op1);
28319   if (! pat)
28320     return 0;
28321   emit_insn (pat);
28322   return target;
28323 }
28324 
28325 static rtx
28326 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28327 				     tree exp, rtx target)
28328 {
28329   rtx pat;
28330   tree arg0 = CALL_EXPR_ARG (exp, 0);
28331   tree arg1 = CALL_EXPR_ARG (exp, 1);
28332   rtx op0 = expand_normal (arg0);
28333   rtx op1 = expand_normal (arg1);
28334   rtx op2;
28335   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28336   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28337   enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28338 
28339   if (optimize || target == 0
28340       || GET_MODE (target) != tmode
28341       || !insn_data[d->icode].operand[0].predicate (target, tmode))
28342     target = gen_reg_rtx (tmode);
28343 
28344   op0 = safe_vector_operand (op0, mode0);
28345   op1 = safe_vector_operand (op1, mode1);
28346 
28347   if ((optimize && !register_operand (op0, mode0))
28348       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28349     op0 = copy_to_mode_reg (mode0, op0);
28350   if ((optimize && !register_operand (op1, mode1))
28351       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28352     op1 = copy_to_mode_reg (mode1, op1);
28353 
28354   op2 = GEN_INT (d->comparison);
28355 
28356   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28357   if (! pat)
28358     return 0;
28359   emit_insn (pat);
28360   return target;
28361 }
28362 
28363 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
28364 
28365 static rtx
28366 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28367 		       rtx target)
28368 {
28369   rtx pat;
28370   tree arg0 = CALL_EXPR_ARG (exp, 0);
28371   tree arg1 = CALL_EXPR_ARG (exp, 1);
28372   rtx op0 = expand_normal (arg0);
28373   rtx op1 = expand_normal (arg1);
28374   enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28375   enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28376   enum rtx_code comparison = d->comparison;
28377 
28378   if (VECTOR_MODE_P (mode0))
28379     op0 = safe_vector_operand (op0, mode0);
28380   if (VECTOR_MODE_P (mode1))
28381     op1 = safe_vector_operand (op1, mode1);
28382 
28383   target = gen_reg_rtx (SImode);
28384   emit_move_insn (target, const0_rtx);
28385   target = gen_rtx_SUBREG (QImode, target, 0);
28386 
28387   if ((optimize && !register_operand (op0, mode0))
28388       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28389     op0 = copy_to_mode_reg (mode0, op0);
28390   if ((optimize && !register_operand (op1, mode1))
28391       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28392     op1 = copy_to_mode_reg (mode1, op1);
28393 
28394   pat = GEN_FCN (d->icode) (op0, op1);
28395   if (! pat)
28396     return 0;
28397   emit_insn (pat);
28398   emit_insn (gen_rtx_SET (VOIDmode,
28399 			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28400 			  gen_rtx_fmt_ee (comparison, QImode,
28401 					  SET_DEST (pat),
28402 					  const0_rtx)));
28403 
28404   return SUBREG_REG (target);
28405 }
28406 
28407 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
28408 
28409 static rtx
28410 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28411 			  tree exp, rtx target)
28412 {
28413   rtx pat;
28414   tree arg0 = CALL_EXPR_ARG (exp, 0);
28415   tree arg1 = CALL_EXPR_ARG (exp, 1);
28416   tree arg2 = CALL_EXPR_ARG (exp, 2);
28417   tree arg3 = CALL_EXPR_ARG (exp, 3);
28418   tree arg4 = CALL_EXPR_ARG (exp, 4);
28419   rtx scratch0, scratch1;
28420   rtx op0 = expand_normal (arg0);
28421   rtx op1 = expand_normal (arg1);
28422   rtx op2 = expand_normal (arg2);
28423   rtx op3 = expand_normal (arg3);
28424   rtx op4 = expand_normal (arg4);
28425   enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28426 
28427   tmode0 = insn_data[d->icode].operand[0].mode;
28428   tmode1 = insn_data[d->icode].operand[1].mode;
28429   modev2 = insn_data[d->icode].operand[2].mode;
28430   modei3 = insn_data[d->icode].operand[3].mode;
28431   modev4 = insn_data[d->icode].operand[4].mode;
28432   modei5 = insn_data[d->icode].operand[5].mode;
28433   modeimm = insn_data[d->icode].operand[6].mode;
28434 
28435   if (VECTOR_MODE_P (modev2))
28436     op0 = safe_vector_operand (op0, modev2);
28437   if (VECTOR_MODE_P (modev4))
28438     op2 = safe_vector_operand (op2, modev4);
28439 
28440   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28441     op0 = copy_to_mode_reg (modev2, op0);
28442   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28443     op1 = copy_to_mode_reg (modei3, op1);
28444   if ((optimize && !register_operand (op2, modev4))
28445       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28446     op2 = copy_to_mode_reg (modev4, op2);
28447   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28448     op3 = copy_to_mode_reg (modei5, op3);
28449 
28450   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28451     {
28452       error ("the fifth argument must be an 8-bit immediate");
28453       return const0_rtx;
28454     }
28455 
28456   if (d->code == IX86_BUILTIN_PCMPESTRI128)
28457     {
28458       if (optimize || !target
28459 	  || GET_MODE (target) != tmode0
28460 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28461 	target = gen_reg_rtx (tmode0);
28462 
28463       scratch1 = gen_reg_rtx (tmode1);
28464 
28465       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28466     }
28467   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28468     {
28469       if (optimize || !target
28470 	  || GET_MODE (target) != tmode1
28471 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28472 	target = gen_reg_rtx (tmode1);
28473 
28474       scratch0 = gen_reg_rtx (tmode0);
28475 
28476       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28477     }
28478   else
28479     {
28480       gcc_assert (d->flag);
28481 
28482       scratch0 = gen_reg_rtx (tmode0);
28483       scratch1 = gen_reg_rtx (tmode1);
28484 
28485       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28486     }
28487 
28488   if (! pat)
28489     return 0;
28490 
28491   emit_insn (pat);
28492 
28493   if (d->flag)
28494     {
28495       target = gen_reg_rtx (SImode);
28496       emit_move_insn (target, const0_rtx);
28497       target = gen_rtx_SUBREG (QImode, target, 0);
28498 
28499       emit_insn
28500 	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28501 		      gen_rtx_fmt_ee (EQ, QImode,
28502 				      gen_rtx_REG ((enum machine_mode) d->flag,
28503 						   FLAGS_REG),
28504 				      const0_rtx)));
28505       return SUBREG_REG (target);
28506     }
28507   else
28508     return target;
28509 }
28510 
28511 
28512 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
28513 
28514 static rtx
28515 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28516 			  tree exp, rtx target)
28517 {
28518   rtx pat;
28519   tree arg0 = CALL_EXPR_ARG (exp, 0);
28520   tree arg1 = CALL_EXPR_ARG (exp, 1);
28521   tree arg2 = CALL_EXPR_ARG (exp, 2);
28522   rtx scratch0, scratch1;
28523   rtx op0 = expand_normal (arg0);
28524   rtx op1 = expand_normal (arg1);
28525   rtx op2 = expand_normal (arg2);
28526   enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28527 
28528   tmode0 = insn_data[d->icode].operand[0].mode;
28529   tmode1 = insn_data[d->icode].operand[1].mode;
28530   modev2 = insn_data[d->icode].operand[2].mode;
28531   modev3 = insn_data[d->icode].operand[3].mode;
28532   modeimm = insn_data[d->icode].operand[4].mode;
28533 
28534   if (VECTOR_MODE_P (modev2))
28535     op0 = safe_vector_operand (op0, modev2);
28536   if (VECTOR_MODE_P (modev3))
28537     op1 = safe_vector_operand (op1, modev3);
28538 
28539   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28540     op0 = copy_to_mode_reg (modev2, op0);
28541   if ((optimize && !register_operand (op1, modev3))
28542       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28543     op1 = copy_to_mode_reg (modev3, op1);
28544 
28545   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28546     {
28547       error ("the third argument must be an 8-bit immediate");
28548       return const0_rtx;
28549     }
28550 
28551   if (d->code == IX86_BUILTIN_PCMPISTRI128)
28552     {
28553       if (optimize || !target
28554 	  || GET_MODE (target) != tmode0
28555 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28556 	target = gen_reg_rtx (tmode0);
28557 
28558       scratch1 = gen_reg_rtx (tmode1);
28559 
28560       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28561     }
28562   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28563     {
28564       if (optimize || !target
28565 	  || GET_MODE (target) != tmode1
28566 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28567 	target = gen_reg_rtx (tmode1);
28568 
28569       scratch0 = gen_reg_rtx (tmode0);
28570 
28571       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28572     }
28573   else
28574     {
28575       gcc_assert (d->flag);
28576 
28577       scratch0 = gen_reg_rtx (tmode0);
28578       scratch1 = gen_reg_rtx (tmode1);
28579 
28580       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28581     }
28582 
28583   if (! pat)
28584     return 0;
28585 
28586   emit_insn (pat);
28587 
28588   if (d->flag)
28589     {
28590       target = gen_reg_rtx (SImode);
28591       emit_move_insn (target, const0_rtx);
28592       target = gen_rtx_SUBREG (QImode, target, 0);
28593 
28594       emit_insn
28595 	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28596 		      gen_rtx_fmt_ee (EQ, QImode,
28597 				      gen_rtx_REG ((enum machine_mode) d->flag,
28598 						   FLAGS_REG),
28599 				      const0_rtx)));
28600       return SUBREG_REG (target);
28601     }
28602   else
28603     return target;
28604 }
28605 
28606 /* Subroutine of ix86_expand_builtin to take care of insns with
28607    variable number of operands.  */
28608 
28609 static rtx
28610 ix86_expand_args_builtin (const struct builtin_description *d,
28611 			  tree exp, rtx target)
28612 {
28613   rtx pat, real_target;
28614   unsigned int i, nargs;
28615   unsigned int nargs_constant = 0;
28616   int num_memory = 0;
28617   struct
28618     {
28619       rtx op;
28620       enum machine_mode mode;
28621     } args[4];
28622   bool last_arg_count = false;
28623   enum insn_code icode = d->icode;
28624   const struct insn_data_d *insn_p = &insn_data[icode];
28625   enum machine_mode tmode = insn_p->operand[0].mode;
28626   enum machine_mode rmode = VOIDmode;
28627   bool swap = false;
28628   enum rtx_code comparison = d->comparison;
28629 
28630   switch ((enum ix86_builtin_func_type) d->flag)
28631     {
28632     case V2DF_FTYPE_V2DF_ROUND:
28633     case V4DF_FTYPE_V4DF_ROUND:
28634     case V4SF_FTYPE_V4SF_ROUND:
28635     case V8SF_FTYPE_V8SF_ROUND:
28636     case V4SI_FTYPE_V4SF_ROUND:
28637     case V8SI_FTYPE_V8SF_ROUND:
28638       return ix86_expand_sse_round (d, exp, target);
28639     case V4SI_FTYPE_V2DF_V2DF_ROUND:
28640     case V8SI_FTYPE_V4DF_V4DF_ROUND:
28641       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28642     case INT_FTYPE_V8SF_V8SF_PTEST:
28643     case INT_FTYPE_V4DI_V4DI_PTEST:
28644     case INT_FTYPE_V4DF_V4DF_PTEST:
28645     case INT_FTYPE_V4SF_V4SF_PTEST:
28646     case INT_FTYPE_V2DI_V2DI_PTEST:
28647     case INT_FTYPE_V2DF_V2DF_PTEST:
28648       return ix86_expand_sse_ptest (d, exp, target);
28649     case FLOAT128_FTYPE_FLOAT128:
28650     case FLOAT_FTYPE_FLOAT:
28651     case INT_FTYPE_INT:
28652     case UINT64_FTYPE_INT:
28653     case UINT16_FTYPE_UINT16:
28654     case INT64_FTYPE_INT64:
28655     case INT64_FTYPE_V4SF:
28656     case INT64_FTYPE_V2DF:
28657     case INT_FTYPE_V16QI:
28658     case INT_FTYPE_V8QI:
28659     case INT_FTYPE_V8SF:
28660     case INT_FTYPE_V4DF:
28661     case INT_FTYPE_V4SF:
28662     case INT_FTYPE_V2DF:
28663     case INT_FTYPE_V32QI:
28664     case V16QI_FTYPE_V16QI:
28665     case V8SI_FTYPE_V8SF:
28666     case V8SI_FTYPE_V4SI:
28667     case V8HI_FTYPE_V8HI:
28668     case V8HI_FTYPE_V16QI:
28669     case V8QI_FTYPE_V8QI:
28670     case V8SF_FTYPE_V8SF:
28671     case V8SF_FTYPE_V8SI:
28672     case V8SF_FTYPE_V4SF:
28673     case V8SF_FTYPE_V8HI:
28674     case V4SI_FTYPE_V4SI:
28675     case V4SI_FTYPE_V16QI:
28676     case V4SI_FTYPE_V4SF:
28677     case V4SI_FTYPE_V8SI:
28678     case V4SI_FTYPE_V8HI:
28679     case V4SI_FTYPE_V4DF:
28680     case V4SI_FTYPE_V2DF:
28681     case V4HI_FTYPE_V4HI:
28682     case V4DF_FTYPE_V4DF:
28683     case V4DF_FTYPE_V4SI:
28684     case V4DF_FTYPE_V4SF:
28685     case V4DF_FTYPE_V2DF:
28686     case V4SF_FTYPE_V4SF:
28687     case V4SF_FTYPE_V4SI:
28688     case V4SF_FTYPE_V8SF:
28689     case V4SF_FTYPE_V4DF:
28690     case V4SF_FTYPE_V8HI:
28691     case V4SF_FTYPE_V2DF:
28692     case V2DI_FTYPE_V2DI:
28693     case V2DI_FTYPE_V16QI:
28694     case V2DI_FTYPE_V8HI:
28695     case V2DI_FTYPE_V4SI:
28696     case V2DF_FTYPE_V2DF:
28697     case V2DF_FTYPE_V4SI:
28698     case V2DF_FTYPE_V4DF:
28699     case V2DF_FTYPE_V4SF:
28700     case V2DF_FTYPE_V2SI:
28701     case V2SI_FTYPE_V2SI:
28702     case V2SI_FTYPE_V4SF:
28703     case V2SI_FTYPE_V2SF:
28704     case V2SI_FTYPE_V2DF:
28705     case V2SF_FTYPE_V2SF:
28706     case V2SF_FTYPE_V2SI:
28707     case V32QI_FTYPE_V32QI:
28708     case V32QI_FTYPE_V16QI:
28709     case V16HI_FTYPE_V16HI:
28710     case V16HI_FTYPE_V8HI:
28711     case V8SI_FTYPE_V8SI:
28712     case V16HI_FTYPE_V16QI:
28713     case V8SI_FTYPE_V16QI:
28714     case V4DI_FTYPE_V16QI:
28715     case V8SI_FTYPE_V8HI:
28716     case V4DI_FTYPE_V8HI:
28717     case V4DI_FTYPE_V4SI:
28718     case V4DI_FTYPE_V2DI:
28719       nargs = 1;
28720       break;
28721     case V4SF_FTYPE_V4SF_VEC_MERGE:
28722     case V2DF_FTYPE_V2DF_VEC_MERGE:
28723       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28724     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28725     case V16QI_FTYPE_V16QI_V16QI:
28726     case V16QI_FTYPE_V8HI_V8HI:
28727     case V8QI_FTYPE_V8QI_V8QI:
28728     case V8QI_FTYPE_V4HI_V4HI:
28729     case V8HI_FTYPE_V8HI_V8HI:
28730     case V8HI_FTYPE_V16QI_V16QI:
28731     case V8HI_FTYPE_V4SI_V4SI:
28732     case V8SF_FTYPE_V8SF_V8SF:
28733     case V8SF_FTYPE_V8SF_V8SI:
28734     case V4SI_FTYPE_V4SI_V4SI:
28735     case V4SI_FTYPE_V8HI_V8HI:
28736     case V4SI_FTYPE_V4SF_V4SF:
28737     case V4SI_FTYPE_V2DF_V2DF:
28738     case V4HI_FTYPE_V4HI_V4HI:
28739     case V4HI_FTYPE_V8QI_V8QI:
28740     case V4HI_FTYPE_V2SI_V2SI:
28741     case V4DF_FTYPE_V4DF_V4DF:
28742     case V4DF_FTYPE_V4DF_V4DI:
28743     case V4SF_FTYPE_V4SF_V4SF:
28744     case V4SF_FTYPE_V4SF_V4SI:
28745     case V4SF_FTYPE_V4SF_V2SI:
28746     case V4SF_FTYPE_V4SF_V2DF:
28747     case V4SF_FTYPE_V4SF_DI:
28748     case V4SF_FTYPE_V4SF_SI:
28749     case V2DI_FTYPE_V2DI_V2DI:
28750     case V2DI_FTYPE_V16QI_V16QI:
28751     case V2DI_FTYPE_V4SI_V4SI:
28752     case V2DI_FTYPE_V2DI_V16QI:
28753     case V2DI_FTYPE_V2DF_V2DF:
28754     case V2SI_FTYPE_V2SI_V2SI:
28755     case V2SI_FTYPE_V4HI_V4HI:
28756     case V2SI_FTYPE_V2SF_V2SF:
28757     case V2DF_FTYPE_V2DF_V2DF:
28758     case V2DF_FTYPE_V2DF_V4SF:
28759     case V2DF_FTYPE_V2DF_V2DI:
28760     case V2DF_FTYPE_V2DF_DI:
28761     case V2DF_FTYPE_V2DF_SI:
28762     case V2SF_FTYPE_V2SF_V2SF:
28763     case V1DI_FTYPE_V1DI_V1DI:
28764     case V1DI_FTYPE_V8QI_V8QI:
28765     case V1DI_FTYPE_V2SI_V2SI:
28766     case V32QI_FTYPE_V16HI_V16HI:
28767     case V16HI_FTYPE_V8SI_V8SI:
28768     case V32QI_FTYPE_V32QI_V32QI:
28769     case V16HI_FTYPE_V32QI_V32QI:
28770     case V16HI_FTYPE_V16HI_V16HI:
28771     case V8SI_FTYPE_V4DF_V4DF:
28772     case V8SI_FTYPE_V8SI_V8SI:
28773     case V8SI_FTYPE_V16HI_V16HI:
28774     case V4DI_FTYPE_V4DI_V4DI:
28775     case V4DI_FTYPE_V8SI_V8SI:
28776       if (comparison == UNKNOWN)
28777 	return ix86_expand_binop_builtin (icode, exp, target);
28778       nargs = 2;
28779       break;
28780     case V4SF_FTYPE_V4SF_V4SF_SWAP:
28781     case V2DF_FTYPE_V2DF_V2DF_SWAP:
28782       gcc_assert (comparison != UNKNOWN);
28783       nargs = 2;
28784       swap = true;
28785       break;
28786     case V16HI_FTYPE_V16HI_V8HI_COUNT:
28787     case V16HI_FTYPE_V16HI_SI_COUNT:
28788     case V8SI_FTYPE_V8SI_V4SI_COUNT:
28789     case V8SI_FTYPE_V8SI_SI_COUNT:
28790     case V4DI_FTYPE_V4DI_V2DI_COUNT:
28791     case V4DI_FTYPE_V4DI_INT_COUNT:
28792     case V8HI_FTYPE_V8HI_V8HI_COUNT:
28793     case V8HI_FTYPE_V8HI_SI_COUNT:
28794     case V4SI_FTYPE_V4SI_V4SI_COUNT:
28795     case V4SI_FTYPE_V4SI_SI_COUNT:
28796     case V4HI_FTYPE_V4HI_V4HI_COUNT:
28797     case V4HI_FTYPE_V4HI_SI_COUNT:
28798     case V2DI_FTYPE_V2DI_V2DI_COUNT:
28799     case V2DI_FTYPE_V2DI_SI_COUNT:
28800     case V2SI_FTYPE_V2SI_V2SI_COUNT:
28801     case V2SI_FTYPE_V2SI_SI_COUNT:
28802     case V1DI_FTYPE_V1DI_V1DI_COUNT:
28803     case V1DI_FTYPE_V1DI_SI_COUNT:
28804       nargs = 2;
28805       last_arg_count = true;
28806       break;
28807     case UINT64_FTYPE_UINT64_UINT64:
28808     case UINT_FTYPE_UINT_UINT:
28809     case UINT_FTYPE_UINT_USHORT:
28810     case UINT_FTYPE_UINT_UCHAR:
28811     case UINT16_FTYPE_UINT16_INT:
28812     case UINT8_FTYPE_UINT8_INT:
28813       nargs = 2;
28814       break;
28815     case V2DI_FTYPE_V2DI_INT_CONVERT:
28816       nargs = 2;
28817       rmode = V1TImode;
28818       nargs_constant = 1;
28819       break;
28820     case V4DI_FTYPE_V4DI_INT_CONVERT:
28821       nargs = 2;
28822       rmode = V2TImode;
28823       nargs_constant = 1;
28824       break;
28825     case V8HI_FTYPE_V8HI_INT:
28826     case V8HI_FTYPE_V8SF_INT:
28827     case V8HI_FTYPE_V4SF_INT:
28828     case V8SF_FTYPE_V8SF_INT:
28829     case V4SI_FTYPE_V4SI_INT:
28830     case V4SI_FTYPE_V8SI_INT:
28831     case V4HI_FTYPE_V4HI_INT:
28832     case V4DF_FTYPE_V4DF_INT:
28833     case V4SF_FTYPE_V4SF_INT:
28834     case V4SF_FTYPE_V8SF_INT:
28835     case V2DI_FTYPE_V2DI_INT:
28836     case V2DF_FTYPE_V2DF_INT:
28837     case V2DF_FTYPE_V4DF_INT:
28838     case V16HI_FTYPE_V16HI_INT:
28839     case V8SI_FTYPE_V8SI_INT:
28840     case V4DI_FTYPE_V4DI_INT:
28841     case V2DI_FTYPE_V4DI_INT:
28842       nargs = 2;
28843       nargs_constant = 1;
28844       break;
28845     case V16QI_FTYPE_V16QI_V16QI_V16QI:
28846     case V8SF_FTYPE_V8SF_V8SF_V8SF:
28847     case V4DF_FTYPE_V4DF_V4DF_V4DF:
28848     case V4SF_FTYPE_V4SF_V4SF_V4SF:
28849     case V2DF_FTYPE_V2DF_V2DF_V2DF:
28850     case V32QI_FTYPE_V32QI_V32QI_V32QI:
28851       nargs = 3;
28852       break;
28853     case V32QI_FTYPE_V32QI_V32QI_INT:
28854     case V16HI_FTYPE_V16HI_V16HI_INT:
28855     case V16QI_FTYPE_V16QI_V16QI_INT:
28856     case V4DI_FTYPE_V4DI_V4DI_INT:
28857     case V8HI_FTYPE_V8HI_V8HI_INT:
28858     case V8SI_FTYPE_V8SI_V8SI_INT:
28859     case V8SI_FTYPE_V8SI_V4SI_INT:
28860     case V8SF_FTYPE_V8SF_V8SF_INT:
28861     case V8SF_FTYPE_V8SF_V4SF_INT:
28862     case V4SI_FTYPE_V4SI_V4SI_INT:
28863     case V4DF_FTYPE_V4DF_V4DF_INT:
28864     case V4DF_FTYPE_V4DF_V2DF_INT:
28865     case V4SF_FTYPE_V4SF_V4SF_INT:
28866     case V2DI_FTYPE_V2DI_V2DI_INT:
28867     case V4DI_FTYPE_V4DI_V2DI_INT:
28868     case V2DF_FTYPE_V2DF_V2DF_INT:
28869       nargs = 3;
28870       nargs_constant = 1;
28871       break;
28872     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28873       nargs = 3;
28874       rmode = V4DImode;
28875       nargs_constant = 1;
28876       break;
28877     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28878       nargs = 3;
28879       rmode = V2DImode;
28880       nargs_constant = 1;
28881       break;
28882     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28883       nargs = 3;
28884       rmode = DImode;
28885       nargs_constant = 1;
28886       break;
28887     case V2DI_FTYPE_V2DI_UINT_UINT:
28888       nargs = 3;
28889       nargs_constant = 2;
28890       break;
28891     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28892     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28893     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28894     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28895       nargs = 4;
28896       nargs_constant = 1;
28897       break;
28898     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28899       nargs = 4;
28900       nargs_constant = 2;
28901       break;
28902     default:
28903       gcc_unreachable ();
28904     }
28905 
28906   gcc_assert (nargs <= ARRAY_SIZE (args));
28907 
28908   if (comparison != UNKNOWN)
28909     {
28910       gcc_assert (nargs == 2);
28911       return ix86_expand_sse_compare (d, exp, target, swap);
28912     }
28913 
28914   if (rmode == VOIDmode || rmode == tmode)
28915     {
28916       if (optimize
28917 	  || target == 0
28918 	  || GET_MODE (target) != tmode
28919 	  || !insn_p->operand[0].predicate (target, tmode))
28920 	target = gen_reg_rtx (tmode);
28921       real_target = target;
28922     }
28923   else
28924     {
28925       target = gen_reg_rtx (rmode);
28926       real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28927     }
28928 
28929   for (i = 0; i < nargs; i++)
28930     {
28931       tree arg = CALL_EXPR_ARG (exp, i);
28932       rtx op = expand_normal (arg);
28933       enum machine_mode mode = insn_p->operand[i + 1].mode;
28934       bool match = insn_p->operand[i + 1].predicate (op, mode);
28935 
28936       if (last_arg_count && (i + 1) == nargs)
28937 	{
28938 	  /* SIMD shift insns take either an 8-bit immediate or a
28939 	     register as the count.  But the builtin functions take an int
28940 	     as the count.  If the count doesn't match, put it in a register.  */
28941 	  if (!match)
28942 	    {
28943 	      op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28944 	      if (!insn_p->operand[i + 1].predicate (op, mode))
28945 		op = copy_to_reg (op);
28946 	    }
28947 	}
28948       else if ((nargs - i) <= nargs_constant)
28949 	{
28950 	  if (!match)
28951 	    switch (icode)
28952 	      {
28953 	      case CODE_FOR_avx2_inserti128:
28954 	      case CODE_FOR_avx2_extracti128:
28955 		error ("the last argument must be a 1-bit immediate");
28956 		return const0_rtx;
28957 
28958 	      case CODE_FOR_sse4_1_roundsd:
28959 	      case CODE_FOR_sse4_1_roundss:
28960 
28961 	      case CODE_FOR_sse4_1_roundpd:
28962 	      case CODE_FOR_sse4_1_roundps:
28963 	      case CODE_FOR_avx_roundpd256:
28964 	      case CODE_FOR_avx_roundps256:
28965 
28966 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28967 	      case CODE_FOR_sse4_1_roundps_sfix:
28968 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28969 	      case CODE_FOR_avx_roundps_sfix256:
28970 
28971 	      case CODE_FOR_sse4_1_blendps:
28972 	      case CODE_FOR_avx_blendpd256:
28973 	      case CODE_FOR_avx_vpermilv4df:
28974 		error ("the last argument must be a 4-bit immediate");
28975 		return const0_rtx;
28976 
28977 	      case CODE_FOR_sse4_1_blendpd:
28978 	      case CODE_FOR_avx_vpermilv2df:
28979 	      case CODE_FOR_xop_vpermil2v2df3:
28980 	      case CODE_FOR_xop_vpermil2v4sf3:
28981 	      case CODE_FOR_xop_vpermil2v4df3:
28982 	      case CODE_FOR_xop_vpermil2v8sf3:
28983 		error ("the last argument must be a 2-bit immediate");
28984 		return const0_rtx;
28985 
28986 	      case CODE_FOR_avx_vextractf128v4df:
28987 	      case CODE_FOR_avx_vextractf128v8sf:
28988 	      case CODE_FOR_avx_vextractf128v8si:
28989 	      case CODE_FOR_avx_vinsertf128v4df:
28990 	      case CODE_FOR_avx_vinsertf128v8sf:
28991 	      case CODE_FOR_avx_vinsertf128v8si:
28992 		error ("the last argument must be a 1-bit immediate");
28993 		return const0_rtx;
28994 
28995 	      case CODE_FOR_avx_vmcmpv2df3:
28996 	      case CODE_FOR_avx_vmcmpv4sf3:
28997 	      case CODE_FOR_avx_cmpv2df3:
28998 	      case CODE_FOR_avx_cmpv4sf3:
28999 	      case CODE_FOR_avx_cmpv4df3:
29000 	      case CODE_FOR_avx_cmpv8sf3:
29001 		error ("the last argument must be a 5-bit immediate");
29002 		return const0_rtx;
29003 
29004 	      default:
29005 		switch (nargs_constant)
29006 		  {
29007 		  case 2:
29008 		    if ((nargs - i) == nargs_constant)
29009 		      {
29010 			error ("the next to last argument must be an 8-bit immediate");
29011 			break;
29012 		      }
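		    /* FALLTHRU: otherwise I is the last argument, so use the
		       generic last-argument message below.  */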
29013 		  case 1:
29014 		    error ("the last argument must be an 8-bit immediate");
29015 		    break;
29016 		  default:
29017 		    gcc_unreachable ();
29018 		  }
29019 		return const0_rtx;
29020 	      }
29021 	}
29022       else
29023 	{
29024 	  if (VECTOR_MODE_P (mode))
29025 	    op = safe_vector_operand (op, mode);
29026 
29027 	  /* If we aren't optimizing, only allow one memory operand to
29028 	     be generated.  */
29029 	  if (memory_operand (op, mode))
29030 	    num_memory++;
29031 
29032 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29033 	    {
29034 	      if (optimize || !match || num_memory > 1)
29035 		op = copy_to_mode_reg (mode, op);
29036 	    }
29037 	  else
29038 	    {
29039 	      op = copy_to_reg (op);
29040 	      op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29041 	    }
29042 	}
29043 
29044       args[i].op = op;
29045       args[i].mode = mode;
29046     }
29047 
29048   switch (nargs)
29049     {
29050     case 1:
29051       pat = GEN_FCN (icode) (real_target, args[0].op);
29052       break;
29053     case 2:
29054       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29055       break;
29056     case 3:
29057       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29058 			     args[2].op);
29059       break;
29060     case 4:
29061       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29062 			     args[2].op, args[3].op);
29063       break;
29064     default:
29065       gcc_unreachable ();
29066     }
29067 
29068   if (! pat)
29069     return 0;
29070 
29071   emit_insn (pat);
29072   return target;
29073 }
29074 
29075 /* Subroutine of ix86_expand_builtin to take care of special insns
29076    with variable number of operands.  */
29077 
29078 static rtx
29079 ix86_expand_special_args_builtin (const struct builtin_description *d,
29080 				    tree exp, rtx target)
29081 {
29082   tree arg;
29083   rtx pat, op;
29084   unsigned int i, nargs, arg_adjust, memory;
29085   struct
29086     {
29087       rtx op;
29088       enum machine_mode mode;
29089     } args[3];
29090   enum insn_code icode = d->icode;
29091   bool last_arg_constant = false;
29092   const struct insn_data_d *insn_p = &insn_data[icode];
29093   enum machine_mode tmode = insn_p->operand[0].mode;
29094   enum { load, store } klass;
29095 
29096   switch ((enum ix86_builtin_func_type) d->flag)
29097     {
29098     case VOID_FTYPE_VOID:
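      /* The vzeroupper pattern takes a marker operand so that the
	 vzeroupper optimization code earlier in this file can distinguish
	 the user intrinsic from compiler-generated vzeroupper insns.  */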
29099       if (icode == CODE_FOR_avx_vzeroupper)
29100 	target = GEN_INT (vzeroupper_intrinsic);
29101       emit_insn (GEN_FCN (icode) (target));
29102       return 0;
29103     case VOID_FTYPE_UINT64:
29104     case VOID_FTYPE_UNSIGNED:
29105       nargs = 0;
29106       klass = store;
29107       memory = 0;
29108       break;
29109     case UINT64_FTYPE_VOID:
29110     case UNSIGNED_FTYPE_VOID:
29111       nargs = 0;
29112       klass = load;
29113       memory = 0;
29114       break;
29115     case UINT64_FTYPE_PUNSIGNED:
29116     case V2DI_FTYPE_PV2DI:
29117     case V4DI_FTYPE_PV4DI:
29118     case V32QI_FTYPE_PCCHAR:
29119     case V16QI_FTYPE_PCCHAR:
29120     case V8SF_FTYPE_PCV4SF:
29121     case V8SF_FTYPE_PCFLOAT:
29122     case V4SF_FTYPE_PCFLOAT:
29123     case V4DF_FTYPE_PCV2DF:
29124     case V4DF_FTYPE_PCDOUBLE:
29125     case V2DF_FTYPE_PCDOUBLE:
29126     case VOID_FTYPE_PVOID:
29127       nargs = 1;
29128       klass = load;
29129       memory = 0;
29130       break;
29131     case VOID_FTYPE_PV2SF_V4SF:
29132     case VOID_FTYPE_PV4DI_V4DI:
29133     case VOID_FTYPE_PV2DI_V2DI:
29134     case VOID_FTYPE_PCHAR_V32QI:
29135     case VOID_FTYPE_PCHAR_V16QI:
29136     case VOID_FTYPE_PFLOAT_V8SF:
29137     case VOID_FTYPE_PFLOAT_V4SF:
29138     case VOID_FTYPE_PDOUBLE_V4DF:
29139     case VOID_FTYPE_PDOUBLE_V2DF:
29140     case VOID_FTYPE_PLONGLONG_LONGLONG:
29141     case VOID_FTYPE_PULONGLONG_ULONGLONG:
29142     case VOID_FTYPE_PINT_INT:
29143       nargs = 1;
29144       klass = store;
29145       /* Reserve memory operand for target.  */
29146       memory = ARRAY_SIZE (args);
29147       break;
29148     case V4SF_FTYPE_V4SF_PCV2SF:
29149     case V2DF_FTYPE_V2DF_PCDOUBLE:
29150       nargs = 2;
29151       klass = load;
29152       memory = 1;
29153       break;
29154     case V8SF_FTYPE_PCV8SF_V8SI:
29155     case V4DF_FTYPE_PCV4DF_V4DI:
29156     case V4SF_FTYPE_PCV4SF_V4SI:
29157     case V2DF_FTYPE_PCV2DF_V2DI:
29158     case V8SI_FTYPE_PCV8SI_V8SI:
29159     case V4DI_FTYPE_PCV4DI_V4DI:
29160     case V4SI_FTYPE_PCV4SI_V4SI:
29161     case V2DI_FTYPE_PCV2DI_V2DI:
29162       nargs = 2;
29163       klass = load;
29164       memory = 0;
29165       break;
29166     case VOID_FTYPE_PV8SF_V8SI_V8SF:
29167     case VOID_FTYPE_PV4DF_V4DI_V4DF:
29168     case VOID_FTYPE_PV4SF_V4SI_V4SF:
29169     case VOID_FTYPE_PV2DF_V2DI_V2DF:
29170     case VOID_FTYPE_PV8SI_V8SI_V8SI:
29171     case VOID_FTYPE_PV4DI_V4DI_V4DI:
29172     case VOID_FTYPE_PV4SI_V4SI_V4SI:
29173     case VOID_FTYPE_PV2DI_V2DI_V2DI:
29174       nargs = 2;
29175       klass = store;
29176       /* Reserve memory operand for target.  */
29177       memory = ARRAY_SIZE (args);
29178       break;
29179     case VOID_FTYPE_UINT_UINT_UINT:
29180     case VOID_FTYPE_UINT64_UINT_UINT:
29181     case UCHAR_FTYPE_UINT_UINT_UINT:
29182     case UCHAR_FTYPE_UINT64_UINT_UINT:
29183       nargs = 3;
29184       klass = load;
29185       memory = ARRAY_SIZE (args);
29186       last_arg_constant = true;
29187       break;
29188     default:
29189       gcc_unreachable ();
29190     }
29191 
29192   gcc_assert (nargs <= ARRAY_SIZE (args));
29193 
29194   if (klass == store)
29195     {
29196       arg = CALL_EXPR_ARG (exp, 0);
29197       op = expand_normal (arg);
29198       gcc_assert (target == 0);
29199       if (memory)
29200 	{
29201 	  if (GET_MODE (op) != Pmode)
29202 	    op = convert_to_mode (Pmode, op, 1);
29203 	  target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29204 	}
29205       else
29206 	target = force_reg (tmode, op);
29207       arg_adjust = 1;
29208     }
29209   else
29210     {
29211       arg_adjust = 0;
29212       if (optimize
29213 	  || target == 0
29214 	  || !register_operand (target, tmode)
29215 	  || GET_MODE (target) != tmode)
29216 	target = gen_reg_rtx (tmode);
29217     }
29218 
29219   for (i = 0; i < nargs; i++)
29220     {
29221       enum machine_mode mode = insn_p->operand[i + 1].mode;
29222       bool match;
29223 
29224       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29225       op = expand_normal (arg);
29226       match = insn_p->operand[i + 1].predicate (op, mode);
29227 
29228       if (last_arg_constant && (i + 1) == nargs)
29229 	{
29230 	  if (!match)
29231 	    {
29232 	      if (icode == CODE_FOR_lwp_lwpvalsi3
29233 		  || icode == CODE_FOR_lwp_lwpinssi3
29234 		  || icode == CODE_FOR_lwp_lwpvaldi3
29235 		  || icode == CODE_FOR_lwp_lwpinsdi3)
29236 		error ("the last argument must be a 32-bit immediate");
29237 	      else
29238 		error ("the last argument must be an 8-bit immediate");
29239 	      return const0_rtx;
29240 	    }
29241 	}
29242       else
29243 	{
29244 	  if (i == memory)
29245 	    {
29246 	      /* This must be the memory operand.  */
29247 	      if (GET_MODE (op) != Pmode)
29248 		op = convert_to_mode (Pmode, op, 1);
29249 	      op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29250 	      gcc_assert (GET_MODE (op) == mode
29251 			  || GET_MODE (op) == VOIDmode);
29252 	    }
29253 	  else
29254 	    {
29255 	      /* This must be a register.  */
29256 	      if (VECTOR_MODE_P (mode))
29257 		op = safe_vector_operand (op, mode);
29258 
29259 	      gcc_assert (GET_MODE (op) == mode
29260 			  || GET_MODE (op) == VOIDmode);
29261 	      op = copy_to_mode_reg (mode, op);
29262 	    }
29263 	}
29264 
29265       args[i].op = op;
29266       args[i].mode = mode;
29267     }
29268 
29269   switch (nargs)
29270     {
29271     case 0:
29272       pat = GEN_FCN (icode) (target);
29273       break;
29274     case 1:
29275       pat = GEN_FCN (icode) (target, args[0].op);
29276       break;
29277     case 2:
29278       pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29279       break;
29280     case 3:
29281       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29282       break;
29283     default:
29284       gcc_unreachable ();
29285     }
29286 
29287   if (! pat)
29288     return 0;
29289   emit_insn (pat);
29290   return klass == store ? 0 : target;
29291 }
29292 
29293 /* Return the integer constant in ARG.  Constrain it to be in the range
29294    of the subparts of VEC_TYPE; issue an error if not.  */
29295 
29296 static int
29297 get_element_number (tree vec_type, tree arg)
29298 {
29299   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29300 
29301   if (!host_integerp (arg, 1)
29302       || (elt = tree_low_cst (arg, 1), elt > max))
29303     {
29304       error ("selector must be an integer constant in the range 0..%wi", max);
29305       return 0;
29306     }
29307 
29308   return elt;
29309 }
29310 
29311 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
29312    ix86_expand_vector_init.  We DO have language-level syntax for this, in
29313    the form of  (type){ init-list }.  Except that since we can't place emms
29314    instructions from inside the compiler, we can't allow the use of MMX
29315    registers unless the user explicitly asks for it.  So we do *not* define
29316    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
29317    we have builtins invoked by mmintrin.h that gives us license to emit
29318    these sorts of instructions.  */
29319 
29320 static rtx
29321 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29322 {
29323   enum machine_mode tmode = TYPE_MODE (type);
29324   enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29325   int i, n_elt = GET_MODE_NUNITS (tmode);
29326   rtvec v = rtvec_alloc (n_elt);
29327 
29328   gcc_assert (VECTOR_MODE_P (tmode));
29329   gcc_assert (call_expr_nargs (exp) == n_elt);
29330 
29331   for (i = 0; i < n_elt; ++i)
29332     {
29333       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29334       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29335     }
29336 
29337   if (!target || !register_operand (target, tmode))
29338     target = gen_reg_rtx (tmode);
29339 
29340   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29341   return target;
29342 }
29343 
29344 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
29345    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
29346    had a language-level syntax for referencing vector elements.  */
29347 
29348 static rtx
29349 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29350 {
29351   enum machine_mode tmode, mode0;
29352   tree arg0, arg1;
29353   int elt;
29354   rtx op0;
29355 
29356   arg0 = CALL_EXPR_ARG (exp, 0);
29357   arg1 = CALL_EXPR_ARG (exp, 1);
29358 
29359   op0 = expand_normal (arg0);
29360   elt = get_element_number (TREE_TYPE (arg0), arg1);
29361 
29362   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29363   mode0 = TYPE_MODE (TREE_TYPE (arg0));
29364   gcc_assert (VECTOR_MODE_P (mode0));
29365 
29366   op0 = force_reg (mode0, op0);
29367 
29368   if (optimize || !target || !register_operand (target, tmode))
29369     target = gen_reg_rtx (tmode);
29370 
29371   ix86_expand_vector_extract (true, target, op0, elt);
29372 
29373   return target;
29374 }
29375 
29376 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
29377    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
29378    a language-level syntax for referencing vector elements.  */
29379 
29380 static rtx
29381 ix86_expand_vec_set_builtin (tree exp)
29382 {
29383   enum machine_mode tmode, mode1;
29384   tree arg0, arg1, arg2;
29385   int elt;
29386   rtx op0, op1, target;
29387 
29388   arg0 = CALL_EXPR_ARG (exp, 0);
29389   arg1 = CALL_EXPR_ARG (exp, 1);
29390   arg2 = CALL_EXPR_ARG (exp, 2);
29391 
29392   tmode = TYPE_MODE (TREE_TYPE (arg0));
29393   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29394   gcc_assert (VECTOR_MODE_P (tmode));
29395 
29396   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29397   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29398   elt = get_element_number (TREE_TYPE (arg0), arg2);
29399 
29400   if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29401     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29402 
29403   op0 = force_reg (tmode, op0);
29404   op1 = force_reg (mode1, op1);
29405 
29406   /* OP0 is the source of these builtin functions and shouldn't be
29407      modified.  Create a copy, use it and return it as target.  */
29408   target = gen_reg_rtx (tmode);
29409   emit_move_insn (target, op0);
29410   ix86_expand_vector_set (true, target, op1, elt);
29411 
29412   return target;
29413 }
29414 
29415 /* Expand an expression EXP that calls a built-in function,
29416    with result going to TARGET if that's convenient
29417    (and in mode MODE if that's convenient).
29418    SUBTARGET may be used as the target for computing one of EXP's operands.
29419    IGNORE is nonzero if the value is to be ignored.  */
29420 
29421 static rtx
29422 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29423 		     enum machine_mode mode ATTRIBUTE_UNUSED,
29424 		     int ignore ATTRIBUTE_UNUSED)
29425 {
29426   const struct builtin_description *d;
29427   size_t i;
29428   enum insn_code icode;
29429   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29430   tree arg0, arg1, arg2, arg3, arg4;
29431   rtx op0, op1, op2, op3, op4, pat;
29432   enum machine_mode mode0, mode1, mode2, mode3, mode4;
29433   unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29434 
29435   /* Determine whether the builtin function is available under the current ISA.
29436      Originally the builtin was not created if it wasn't applicable to the
29437      current ISA based on the command line switches.  With function-specific
29438      options, we need to check whether it is supported in the context of the
29439      function making the call.  */
29440   if (ix86_builtins_isa[fcode].isa
29441       && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29442     {
29443       char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29444 				       NULL, (enum fpmath_unit) 0, false);
29445 
29446       if (!opts)
29447 	error ("%qE needs unknown isa option", fndecl);
29448       else
29449 	{
29450 	  gcc_assert (opts != NULL);
29451 	  error ("%qE needs isa option %s", fndecl, opts);
29452 	  free (opts);
29453 	}
29454       return const0_rtx;
29455     }
29456 
29457   switch (fcode)
29458     {
29459     case IX86_BUILTIN_MASKMOVQ:
29460     case IX86_BUILTIN_MASKMOVDQU:
29461       icode = (fcode == IX86_BUILTIN_MASKMOVQ
29462 	       ? CODE_FOR_mmx_maskmovq
29463 	       : CODE_FOR_sse2_maskmovdqu);
29464       /* Note the arg order is different from the operand order.  */
29465       arg1 = CALL_EXPR_ARG (exp, 0);
29466       arg2 = CALL_EXPR_ARG (exp, 1);
29467       arg0 = CALL_EXPR_ARG (exp, 2);
29468       op0 = expand_normal (arg0);
29469       op1 = expand_normal (arg1);
29470       op2 = expand_normal (arg2);
29471       mode0 = insn_data[icode].operand[0].mode;
29472       mode1 = insn_data[icode].operand[1].mode;
29473       mode2 = insn_data[icode].operand[2].mode;
29474 
29475       if (GET_MODE (op0) != Pmode)
29476 	op0 = convert_to_mode (Pmode, op0, 1);
29477       op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29478 
29479       if (!insn_data[icode].operand[0].predicate (op0, mode0))
29480 	op0 = copy_to_mode_reg (mode0, op0);
29481       if (!insn_data[icode].operand[1].predicate (op1, mode1))
29482 	op1 = copy_to_mode_reg (mode1, op1);
29483       if (!insn_data[icode].operand[2].predicate (op2, mode2))
29484 	op2 = copy_to_mode_reg (mode2, op2);
29485       pat = GEN_FCN (icode) (op0, op1, op2);
29486       if (! pat)
29487 	return 0;
29488       emit_insn (pat);
29489       return 0;
29490 
29491     case IX86_BUILTIN_LDMXCSR:
29492       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29493       target = assign_386_stack_local (SImode, SLOT_TEMP);
29494       emit_move_insn (target, op0);
29495       emit_insn (gen_sse_ldmxcsr (target));
29496       return 0;
29497 
29498     case IX86_BUILTIN_STMXCSR:
29499       target = assign_386_stack_local (SImode, SLOT_TEMP);
29500       emit_insn (gen_sse_stmxcsr (target));
29501       return copy_to_mode_reg (SImode, target);
29502 
29503     case IX86_BUILTIN_CLFLUSH:
29504 	arg0 = CALL_EXPR_ARG (exp, 0);
29505 	op0 = expand_normal (arg0);
29506 	icode = CODE_FOR_sse2_clflush;
29507 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29508 	  {
29509 	    if (GET_MODE (op0) != Pmode)
29510 	      op0 = convert_to_mode (Pmode, op0, 1);
29511 	    op0 = force_reg (Pmode, op0);
29512 	  }
29513 
29514 	emit_insn (gen_sse2_clflush (op0));
29515 	return 0;
29516 
29517     case IX86_BUILTIN_MONITOR:
29518       arg0 = CALL_EXPR_ARG (exp, 0);
29519       arg1 = CALL_EXPR_ARG (exp, 1);
29520       arg2 = CALL_EXPR_ARG (exp, 2);
29521       op0 = expand_normal (arg0);
29522       op1 = expand_normal (arg1);
29523       op2 = expand_normal (arg2);
29524       if (!REG_P (op0))
29525 	{
29526 	  if (GET_MODE (op0) != Pmode)
29527 	    op0 = convert_to_mode (Pmode, op0, 1);
29528 	  op0 = force_reg (Pmode, op0);
29529 	}
29530       if (!REG_P (op1))
29531 	op1 = copy_to_mode_reg (SImode, op1);
29532       if (!REG_P (op2))
29533 	op2 = copy_to_mode_reg (SImode, op2);
29534       emit_insn (ix86_gen_monitor (op0, op1, op2));
29535       return 0;
29536 
29537     case IX86_BUILTIN_MWAIT:
29538       arg0 = CALL_EXPR_ARG (exp, 0);
29539       arg1 = CALL_EXPR_ARG (exp, 1);
29540       op0 = expand_normal (arg0);
29541       op1 = expand_normal (arg1);
29542       if (!REG_P (op0))
29543 	op0 = copy_to_mode_reg (SImode, op0);
29544       if (!REG_P (op1))
29545 	op1 = copy_to_mode_reg (SImode, op1);
29546       emit_insn (gen_sse3_mwait (op0, op1));
29547       return 0;
29548 
29549     case IX86_BUILTIN_VEC_INIT_V2SI:
29550     case IX86_BUILTIN_VEC_INIT_V4HI:
29551     case IX86_BUILTIN_VEC_INIT_V8QI:
29552       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29553 
29554     case IX86_BUILTIN_VEC_EXT_V2DF:
29555     case IX86_BUILTIN_VEC_EXT_V2DI:
29556     case IX86_BUILTIN_VEC_EXT_V4SF:
29557     case IX86_BUILTIN_VEC_EXT_V4SI:
29558     case IX86_BUILTIN_VEC_EXT_V8HI:
29559     case IX86_BUILTIN_VEC_EXT_V2SI:
29560     case IX86_BUILTIN_VEC_EXT_V4HI:
29561     case IX86_BUILTIN_VEC_EXT_V16QI:
29562       return ix86_expand_vec_ext_builtin (exp, target);
29563 
29564     case IX86_BUILTIN_VEC_SET_V2DI:
29565     case IX86_BUILTIN_VEC_SET_V4SF:
29566     case IX86_BUILTIN_VEC_SET_V4SI:
29567     case IX86_BUILTIN_VEC_SET_V8HI:
29568     case IX86_BUILTIN_VEC_SET_V4HI:
29569     case IX86_BUILTIN_VEC_SET_V16QI:
29570       return ix86_expand_vec_set_builtin (exp);
29571 
29572     case IX86_BUILTIN_INFQ:
29573     case IX86_BUILTIN_HUGE_VALQ:
29574       {
29575 	REAL_VALUE_TYPE inf;
29576 	rtx tmp;
29577 
29578 	real_inf (&inf);
29579 	tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29580 
29581 	tmp = validize_mem (force_const_mem (mode, tmp));
29582 
29583 	if (target == 0)
29584 	  target = gen_reg_rtx (mode);
29585 
29586 	emit_move_insn (target, tmp);
29587 	return target;
29588       }
29589 
29590     case IX86_BUILTIN_LLWPCB:
29591       arg0 = CALL_EXPR_ARG (exp, 0);
29592       op0 = expand_normal (arg0);
29593       icode = CODE_FOR_lwp_llwpcb;
29594       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29595 	{
29596 	  if (GET_MODE (op0) != Pmode)
29597 	    op0 = convert_to_mode (Pmode, op0, 1);
29598 	  op0 = force_reg (Pmode, op0);
29599 	}
29600       emit_insn (gen_lwp_llwpcb (op0));
29601       return 0;
29602 
29603     case IX86_BUILTIN_SLWPCB:
29604       icode = CODE_FOR_lwp_slwpcb;
29605       if (!target
29606 	  || !insn_data[icode].operand[0].predicate (target, Pmode))
29607 	target = gen_reg_rtx (Pmode);
29608       emit_insn (gen_lwp_slwpcb (target));
29609       return target;
29610 
29611     case IX86_BUILTIN_BEXTRI32:
29612     case IX86_BUILTIN_BEXTRI64:
29613       arg0 = CALL_EXPR_ARG (exp, 0);
29614       arg1 = CALL_EXPR_ARG (exp, 1);
29615       op0 = expand_normal (arg0);
29616       op1 = expand_normal (arg1);
29617       icode = (fcode == IX86_BUILTIN_BEXTRI32
29618 	  ? CODE_FOR_tbm_bextri_si
29619 	  : CODE_FOR_tbm_bextri_di);
29620       if (!CONST_INT_P (op1))
29621         {
29622           error ("last argument must be an immediate");
29623           return const0_rtx;
29624         }
29625       else
29626         {
29627           unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29628           unsigned char lsb_index = INTVAL (op1) & 0xFF;
29629           op1 = GEN_INT (length);
29630           op2 = GEN_INT (lsb_index);
29631           pat = GEN_FCN (icode) (target, op0, op1, op2);
29632           if (pat)
29633             emit_insn (pat);
29634           return target;
29635         }
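      /* Editorial note (not from the original sources): the TBM BEXTRI
	 immediate packs the field length in bits [15:8] and the starting bit
	 index in bits [7:0].  For example, a call such as
	   __builtin_ia32_bextri_u32 (x, (8 << 8) | 4)
	 extracts an 8-bit field starting at bit 4, matching the length and
	 lsb_index decoding performed above.  */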
29636 
29637     case IX86_BUILTIN_RDRAND16_STEP:
29638       icode = CODE_FOR_rdrandhi_1;
29639       mode0 = HImode;
29640       goto rdrand_step;
29641 
29642     case IX86_BUILTIN_RDRAND32_STEP:
29643       icode = CODE_FOR_rdrandsi_1;
29644       mode0 = SImode;
29645       goto rdrand_step;
29646 
29647     case IX86_BUILTIN_RDRAND64_STEP:
29648       icode = CODE_FOR_rdranddi_1;
29649       mode0 = DImode;
29650 
29651 rdrand_step:
29652       op0 = gen_reg_rtx (mode0);
29653       emit_insn (GEN_FCN (icode) (op0));
29654 
29655       arg0 = CALL_EXPR_ARG (exp, 0);
29656       op1 = expand_normal (arg0);
29657       if (!address_operand (op1, VOIDmode))
29658 	{
29659 	  op1 = convert_memory_address (Pmode, op1);
29660 	  op1 = copy_addr_to_reg (op1);
29661 	}
29662       emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29663 
29664       op1 = gen_reg_rtx (SImode);
29665       emit_move_insn (op1, CONST1_RTX (SImode));
29666 
29667       /* Emit SImode conditional move.  */
29668       if (mode0 == HImode)
29669 	{
29670 	  op2 = gen_reg_rtx (SImode);
29671 	  emit_insn (gen_zero_extendhisi2 (op2, op0));
29672 	}
29673       else if (mode0 == SImode)
29674 	op2 = op0;
29675       else
29676 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
29677 
29678       if (target == 0)
29679 	target = gen_reg_rtx (SImode);
29680 
29681       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29682 			 const0_rtx);
29683       emit_insn (gen_rtx_SET (VOIDmode, target,
29684 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29685       return target;
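      /* Editorial sketch (not from the original sources) of what the
	 expansion above implements at the source level:

	     unsigned int r;
	     int ok = __builtin_ia32_rdrand32_step (&r);

	 RDRAND reports success in the carry flag; the conditional move turns
	 that flag into the 0/1 status value the *_step builtins return, while
	 the random value itself is stored through the pointer argument.  */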
29686 
29687     case IX86_BUILTIN_GATHERSIV2DF:
29688       icode = CODE_FOR_avx2_gathersiv2df;
29689       goto gather_gen;
29690     case IX86_BUILTIN_GATHERSIV4DF:
29691       icode = CODE_FOR_avx2_gathersiv4df;
29692       goto gather_gen;
29693     case IX86_BUILTIN_GATHERDIV2DF:
29694       icode = CODE_FOR_avx2_gatherdiv2df;
29695       goto gather_gen;
29696     case IX86_BUILTIN_GATHERDIV4DF:
29697       icode = CODE_FOR_avx2_gatherdiv4df;
29698       goto gather_gen;
29699     case IX86_BUILTIN_GATHERSIV4SF:
29700       icode = CODE_FOR_avx2_gathersiv4sf;
29701       goto gather_gen;
29702     case IX86_BUILTIN_GATHERSIV8SF:
29703       icode = CODE_FOR_avx2_gathersiv8sf;
29704       goto gather_gen;
29705     case IX86_BUILTIN_GATHERDIV4SF:
29706       icode = CODE_FOR_avx2_gatherdiv4sf;
29707       goto gather_gen;
29708     case IX86_BUILTIN_GATHERDIV8SF:
29709       icode = CODE_FOR_avx2_gatherdiv8sf;
29710       goto gather_gen;
29711     case IX86_BUILTIN_GATHERSIV2DI:
29712       icode = CODE_FOR_avx2_gathersiv2di;
29713       goto gather_gen;
29714     case IX86_BUILTIN_GATHERSIV4DI:
29715       icode = CODE_FOR_avx2_gathersiv4di;
29716       goto gather_gen;
29717     case IX86_BUILTIN_GATHERDIV2DI:
29718       icode = CODE_FOR_avx2_gatherdiv2di;
29719       goto gather_gen;
29720     case IX86_BUILTIN_GATHERDIV4DI:
29721       icode = CODE_FOR_avx2_gatherdiv4di;
29722       goto gather_gen;
29723     case IX86_BUILTIN_GATHERSIV4SI:
29724       icode = CODE_FOR_avx2_gathersiv4si;
29725       goto gather_gen;
29726     case IX86_BUILTIN_GATHERSIV8SI:
29727       icode = CODE_FOR_avx2_gathersiv8si;
29728       goto gather_gen;
29729     case IX86_BUILTIN_GATHERDIV4SI:
29730       icode = CODE_FOR_avx2_gatherdiv4si;
29731       goto gather_gen;
29732     case IX86_BUILTIN_GATHERDIV8SI:
29733       icode = CODE_FOR_avx2_gatherdiv8si;
29734       goto gather_gen;
29735     case IX86_BUILTIN_GATHERALTSIV4DF:
29736       icode = CODE_FOR_avx2_gathersiv4df;
29737       goto gather_gen;
29738     case IX86_BUILTIN_GATHERALTDIV8SF:
29739       icode = CODE_FOR_avx2_gatherdiv8sf;
29740       goto gather_gen;
29741     case IX86_BUILTIN_GATHERALTSIV4DI:
29742       icode = CODE_FOR_avx2_gathersiv4di;
29743       goto gather_gen;
29744     case IX86_BUILTIN_GATHERALTDIV8SI:
29745       icode = CODE_FOR_avx2_gatherdiv8si;
29746       goto gather_gen;
29747 
29748     gather_gen:
29749       arg0 = CALL_EXPR_ARG (exp, 0);
29750       arg1 = CALL_EXPR_ARG (exp, 1);
29751       arg2 = CALL_EXPR_ARG (exp, 2);
29752       arg3 = CALL_EXPR_ARG (exp, 3);
29753       arg4 = CALL_EXPR_ARG (exp, 4);
29754       op0 = expand_normal (arg0);
29755       op1 = expand_normal (arg1);
29756       op2 = expand_normal (arg2);
29757       op3 = expand_normal (arg3);
29758       op4 = expand_normal (arg4);
29759       /* Note the arg order is different from the operand order.  */
29760       mode0 = insn_data[icode].operand[1].mode;
29761       mode2 = insn_data[icode].operand[3].mode;
29762       mode3 = insn_data[icode].operand[4].mode;
29763       mode4 = insn_data[icode].operand[5].mode;
29764 
29765       if (target == NULL_RTX
29766 	  || GET_MODE (target) != insn_data[icode].operand[0].mode)
29767 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29768       else
29769 	subtarget = target;
29770 
29771       if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29772 	  || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29773 	{
29774 	  rtx half = gen_reg_rtx (V4SImode);
29775 	  if (!nonimmediate_operand (op2, V8SImode))
29776 	    op2 = copy_to_mode_reg (V8SImode, op2);
29777 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
29778 	  op2 = half;
29779 	}
29780       else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29781 	       || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29782 	{
29783 	  rtx (*gen) (rtx, rtx);
29784 	  rtx half = gen_reg_rtx (mode0);
29785 	  if (mode0 == V4SFmode)
29786 	    gen = gen_vec_extract_lo_v8sf;
29787 	  else
29788 	    gen = gen_vec_extract_lo_v8si;
29789 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
29790 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29791 	  emit_insn (gen (half, op0));
29792 	  op0 = half;
29793 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
29794 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29795 	  emit_insn (gen (half, op3));
29796 	  op3 = half;
29797 	}
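      /* Editorial note: for the GATHERALT variants the vector arguments do
	 not all have the same number of elements as the gather pattern
	 expects, so the code above extracts the low half of the wider
	 operand (the SImode index vector for the SIV4DF/SIV4DI forms, the
	 source and mask vectors for the DIV8SF/DIV8SI forms) before the
	 instruction is emitted.  */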
29798 
29799       /* Force the memory operand to use only a base register here.  But we
29800 	 don't want to do this to the memory operands of other builtin
29801 	 functions.  */
29802       if (GET_MODE (op1) != Pmode)
29803 	op1 = convert_to_mode (Pmode, op1, 1);
29804       op1 = force_reg (Pmode, op1);
29805 
29806       if (!insn_data[icode].operand[1].predicate (op0, mode0))
29807 	op0 = copy_to_mode_reg (mode0, op0);
29808       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29809 	op1 = copy_to_mode_reg (Pmode, op1);
29810       if (!insn_data[icode].operand[3].predicate (op2, mode2))
29811 	op2 = copy_to_mode_reg (mode2, op2);
29812       if (!insn_data[icode].operand[4].predicate (op3, mode3))
29813 	op3 = copy_to_mode_reg (mode3, op3);
29814       if (!insn_data[icode].operand[5].predicate (op4, mode4))
29815 	{
29816           error ("last argument must be scale 1, 2, 4, 8");
29817           return const0_rtx;
29818 	}
29819 
29820       /* Optimize.  If mask is known to have all high bits set,
29821 	 replace op0 with pc_rtx to signal that the instruction
29822 	 overwrites the whole destination and doesn't use its
29823 	 previous contents.  */
29824       if (optimize)
29825 	{
29826 	  if (TREE_CODE (arg3) == VECTOR_CST)
29827 	    {
29828 	      tree elt;
29829 	      unsigned int negative = 0;
29830 	      for (elt = TREE_VECTOR_CST_ELTS (arg3);
29831 		   elt; elt = TREE_CHAIN (elt))
29832 		{
29833 		  tree cst = TREE_VALUE (elt);
29834 		  if (TREE_CODE (cst) == INTEGER_CST
29835 		      && tree_int_cst_sign_bit (cst))
29836 		    negative++;
29837 		  else if (TREE_CODE (cst) == REAL_CST
29838 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29839 		    negative++;
29840 		}
29841 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29842 		op0 = pc_rtx;
29843 	    }
29844 	  else if (TREE_CODE (arg3) == SSA_NAME)
29845 	    {
29846 	      /* Also recognize when the mask is like:
29847 		 __v2df src = _mm_setzero_pd ();
29848 		 __v2df mask = _mm_cmpeq_pd (src, src);
29849 		 or
29850 		 __v8sf src = _mm256_setzero_ps ();
29851 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29852 		 as that is a cheaper way to load all ones into
29853 		 a register than having to load a constant from
29854 		 memory.  */
29855 	      gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29856 	      if (is_gimple_call (def_stmt))
29857 		{
29858 		  tree fndecl = gimple_call_fndecl (def_stmt);
29859 		  if (fndecl
29860 		      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29861 		    switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29862 		      {
29863 		      case IX86_BUILTIN_CMPPD:
29864 		      case IX86_BUILTIN_CMPPS:
29865 		      case IX86_BUILTIN_CMPPD256:
29866 		      case IX86_BUILTIN_CMPPS256:
29867 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29868 			  break;
29869 			/* FALLTHRU */
29870 		      case IX86_BUILTIN_CMPEQPD:
29871 		      case IX86_BUILTIN_CMPEQPS:
29872 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29873 			    && initializer_zerop (gimple_call_arg (def_stmt,
29874 								   1)))
29875 			  op0 = pc_rtx;
29876 			break;
29877 		      default:
29878 			break;
29879 		      }
29880 		}
29881 	    }
29882 	}
29883 
29884       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29885       if (! pat)
29886 	return const0_rtx;
29887       emit_insn (pat);
29888 
29889       if (fcode == IX86_BUILTIN_GATHERDIV8SF
29890 	  || fcode == IX86_BUILTIN_GATHERDIV8SI)
29891 	{
29892 	  enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29893 				    ? V4SFmode : V4SImode;
29894 	  if (target == NULL_RTX)
29895 	    target = gen_reg_rtx (tmode);
29896 	  if (tmode == V4SFmode)
29897 	    emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29898 	  else
29899 	    emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29900 	}
29901       else
29902 	target = subtarget;
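      /* Editorial note on the if/else just above: the DImode-indexed
	 SFmode/SImode gathers produce a 256-bit vector in which only the low
	 128 bits contain gathered elements, so the low half is extracted
	 into the value handed back to the caller.  */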
29903 
29904       return target;
29905 
29906     default:
29907       break;
29908     }
29909 
29910   for (i = 0, d = bdesc_special_args;
29911        i < ARRAY_SIZE (bdesc_special_args);
29912        i++, d++)
29913     if (d->code == fcode)
29914       return ix86_expand_special_args_builtin (d, exp, target);
29915 
29916   for (i = 0, d = bdesc_args;
29917        i < ARRAY_SIZE (bdesc_args);
29918        i++, d++)
29919     if (d->code == fcode)
29920       switch (fcode)
29921 	{
29922 	case IX86_BUILTIN_FABSQ:
29923 	case IX86_BUILTIN_COPYSIGNQ:
29924 	  if (!TARGET_SSE2)
29925 	    /* Emit a normal call if SSE2 isn't available.  */
29926 	    return expand_call (exp, target, ignore);
29927 	default:
29928 	  return ix86_expand_args_builtin (d, exp, target);
29929 	}
29930 
29931   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29932     if (d->code == fcode)
29933       return ix86_expand_sse_comi (d, exp, target);
29934 
29935   for (i = 0, d = bdesc_pcmpestr;
29936        i < ARRAY_SIZE (bdesc_pcmpestr);
29937        i++, d++)
29938     if (d->code == fcode)
29939       return ix86_expand_sse_pcmpestr (d, exp, target);
29940 
29941   for (i = 0, d = bdesc_pcmpistr;
29942        i < ARRAY_SIZE (bdesc_pcmpistr);
29943        i++, d++)
29944     if (d->code == fcode)
29945       return ix86_expand_sse_pcmpistr (d, exp, target);
29946 
29947   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29948     if (d->code == fcode)
29949       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29950 					    (enum ix86_builtin_func_type)
29951 					    d->flag, d->comparison);
29952 
29953   gcc_unreachable ();
29954 }
29955 
29956 /* Returns a function decl for a vectorized version of the builtin function
29957    FNDECL, with output vector type TYPE_OUT and input vector type TYPE_IN, or
29958    NULL_TREE if no such vectorized version is available.  */
29959 
29960 static tree
29961 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29962 				  tree type_in)
29963 {
29964   enum machine_mode in_mode, out_mode;
29965   int in_n, out_n;
29966   enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29967 
29968   if (TREE_CODE (type_out) != VECTOR_TYPE
29969       || TREE_CODE (type_in) != VECTOR_TYPE
29970       || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29971     return NULL_TREE;
29972 
29973   out_mode = TYPE_MODE (TREE_TYPE (type_out));
29974   out_n = TYPE_VECTOR_SUBPARTS (type_out);
29975   in_mode = TYPE_MODE (TREE_TYPE (type_in));
29976   in_n = TYPE_VECTOR_SUBPARTS (type_in);
29977 
29978   switch (fn)
29979     {
29980     case BUILT_IN_SQRT:
29981       if (out_mode == DFmode && in_mode == DFmode)
29982 	{
29983 	  if (out_n == 2 && in_n == 2)
29984 	    return ix86_builtins[IX86_BUILTIN_SQRTPD];
29985 	  else if (out_n == 4 && in_n == 4)
29986 	    return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29987 	}
29988       break;
29989 
29990     case BUILT_IN_SQRTF:
29991       if (out_mode == SFmode && in_mode == SFmode)
29992 	{
29993 	  if (out_n == 4 && in_n == 4)
29994 	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29995 	  else if (out_n == 8 && in_n == 8)
29996 	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29997 	}
29998       break;
29999 
30000     case BUILT_IN_IFLOOR:
30001     case BUILT_IN_LFLOOR:
30002     case BUILT_IN_LLFLOOR:
30003       /* The round insn does not trap on denormals.  */
30004       if (flag_trapping_math || !TARGET_ROUND)
30005 	break;
30006 
30007       if (out_mode == SImode && in_mode == DFmode)
30008 	{
30009 	  if (out_n == 4 && in_n == 2)
30010 	    return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30011 	  else if (out_n == 8 && in_n == 4)
30012 	    return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30013 	}
30014       break;
30015 
30016     case BUILT_IN_IFLOORF:
30017     case BUILT_IN_LFLOORF:
30018     case BUILT_IN_LLFLOORF:
30019       /* The round insn does not trap on denormals.  */
30020       if (flag_trapping_math || !TARGET_ROUND)
30021 	break;
30022 
30023       if (out_mode == SImode && in_mode == SFmode)
30024 	{
30025 	  if (out_n == 4 && in_n == 4)
30026 	    return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30027 	  else if (out_n == 8 && in_n == 8)
30028 	    return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30029 	}
30030       break;
30031 
30032     case BUILT_IN_ICEIL:
30033     case BUILT_IN_LCEIL:
30034     case BUILT_IN_LLCEIL:
30035       /* The round insn does not trap on denormals.  */
30036       if (flag_trapping_math || !TARGET_ROUND)
30037 	break;
30038 
30039       if (out_mode == SImode && in_mode == DFmode)
30040 	{
30041 	  if (out_n == 4 && in_n == 2)
30042 	    return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30043 	  else if (out_n == 8 && in_n == 4)
30044 	    return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30045 	}
30046       break;
30047 
30048     case BUILT_IN_ICEILF:
30049     case BUILT_IN_LCEILF:
30050     case BUILT_IN_LLCEILF:
30051       /* The round insn does not trap on denormals.  */
30052       if (flag_trapping_math || !TARGET_ROUND)
30053 	break;
30054 
30055       if (out_mode == SImode && in_mode == SFmode)
30056 	{
30057 	  if (out_n == 4 && in_n == 4)
30058 	    return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30059 	  else if (out_n == 8 && in_n == 8)
30060 	    return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30061 	}
30062       break;
30063 
30064     case BUILT_IN_IRINT:
30065     case BUILT_IN_LRINT:
30066     case BUILT_IN_LLRINT:
30067       if (out_mode == SImode && in_mode == DFmode)
30068 	{
30069 	  if (out_n == 4 && in_n == 2)
30070 	    return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30071 	  else if (out_n == 8 && in_n == 4)
30072 	    return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30073 	}
30074       break;
30075 
30076     case BUILT_IN_IRINTF:
30077     case BUILT_IN_LRINTF:
30078     case BUILT_IN_LLRINTF:
30079       if (out_mode == SImode && in_mode == SFmode)
30080 	{
30081 	  if (out_n == 4 && in_n == 4)
30082 	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30083 	  else if (out_n == 8 && in_n == 8)
30084 	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30085 	}
30086       break;
30087 
30088     case BUILT_IN_IROUND:
30089     case BUILT_IN_LROUND:
30090     case BUILT_IN_LLROUND:
30091       /* The round insn does not trap on denormals.  */
30092       if (flag_trapping_math || !TARGET_ROUND)
30093 	break;
30094 
30095       if (out_mode == SImode && in_mode == DFmode)
30096 	{
30097 	  if (out_n == 4 && in_n == 2)
30098 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30099 	  else if (out_n == 8 && in_n == 4)
30100 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30101 	}
30102       break;
30103 
30104     case BUILT_IN_IROUNDF:
30105     case BUILT_IN_LROUNDF:
30106     case BUILT_IN_LLROUNDF:
30107       /* The round insn does not trap on denormals.  */
30108       if (flag_trapping_math || !TARGET_ROUND)
30109 	break;
30110 
30111       if (out_mode == SImode && in_mode == SFmode)
30112 	{
30113 	  if (out_n == 4 && in_n == 4)
30114 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30115 	  else if (out_n == 8 && in_n == 8)
30116 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30117 	}
30118       break;
30119 
30120     case BUILT_IN_COPYSIGN:
30121       if (out_mode == DFmode && in_mode == DFmode)
30122 	{
30123 	  if (out_n == 2 && in_n == 2)
30124 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30125 	  else if (out_n == 4 && in_n == 4)
30126 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30127 	}
30128       break;
30129 
30130     case BUILT_IN_COPYSIGNF:
30131       if (out_mode == SFmode && in_mode == SFmode)
30132 	{
30133 	  if (out_n == 4 && in_n == 4)
30134 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30135 	  else if (out_n == 8 && in_n == 8)
30136 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30137 	}
30138       break;
30139 
30140     case BUILT_IN_FLOOR:
30141       /* The round insn does not trap on denormals.  */
30142       if (flag_trapping_math || !TARGET_ROUND)
30143 	break;
30144 
30145       if (out_mode == DFmode && in_mode == DFmode)
30146 	{
30147 	  if (out_n == 2 && in_n == 2)
30148 	    return ix86_builtins[IX86_BUILTIN_FLOORPD];
30149 	  else if (out_n == 4 && in_n == 4)
30150 	    return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30151 	}
30152       break;
30153 
30154     case BUILT_IN_FLOORF:
30155       /* The round insn does not trap on denormals.  */
30156       if (flag_trapping_math || !TARGET_ROUND)
30157 	break;
30158 
30159       if (out_mode == SFmode && in_mode == SFmode)
30160 	{
30161 	  if (out_n == 4 && in_n == 4)
30162 	    return ix86_builtins[IX86_BUILTIN_FLOORPS];
30163 	  else if (out_n == 8 && in_n == 8)
30164 	    return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30165 	}
30166       break;
30167 
30168     case BUILT_IN_CEIL:
30169       /* The round insn does not trap on denormals.  */
30170       if (flag_trapping_math || !TARGET_ROUND)
30171 	break;
30172 
30173       if (out_mode == DFmode && in_mode == DFmode)
30174 	{
30175 	  if (out_n == 2 && in_n == 2)
30176 	    return ix86_builtins[IX86_BUILTIN_CEILPD];
30177 	  else if (out_n == 4 && in_n == 4)
30178 	    return ix86_builtins[IX86_BUILTIN_CEILPD256];
30179 	}
30180       break;
30181 
30182     case BUILT_IN_CEILF:
30183       /* The round insn does not trap on denormals.  */
30184       if (flag_trapping_math || !TARGET_ROUND)
30185 	break;
30186 
30187       if (out_mode == SFmode && in_mode == SFmode)
30188 	{
30189 	  if (out_n == 4 && in_n == 4)
30190 	    return ix86_builtins[IX86_BUILTIN_CEILPS];
30191 	  else if (out_n == 8 && in_n == 8)
30192 	    return ix86_builtins[IX86_BUILTIN_CEILPS256];
30193 	}
30194       break;
30195 
30196     case BUILT_IN_TRUNC:
30197       /* The round insn does not trap on denormals.  */
30198       if (flag_trapping_math || !TARGET_ROUND)
30199 	break;
30200 
30201       if (out_mode == DFmode && in_mode == DFmode)
30202 	{
30203 	  if (out_n == 2 && in_n == 2)
30204 	    return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30205 	  else if (out_n == 4 && in_n == 4)
30206 	    return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30207 	}
30208       break;
30209 
30210     case BUILT_IN_TRUNCF:
30211       /* The round insn does not trap on denormals.  */
30212       if (flag_trapping_math || !TARGET_ROUND)
30213 	break;
30214 
30215       if (out_mode == SFmode && in_mode == SFmode)
30216 	{
30217 	  if (out_n == 4 && in_n == 4)
30218 	    return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30219 	  else if (out_n == 8 && in_n == 8)
30220 	    return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30221 	}
30222       break;
30223 
30224     case BUILT_IN_RINT:
30225       /* The round insn does not trap on denormals.  */
30226       if (flag_trapping_math || !TARGET_ROUND)
30227 	break;
30228 
30229       if (out_mode == DFmode && in_mode == DFmode)
30230 	{
30231 	  if (out_n == 2 && in_n == 2)
30232 	    return ix86_builtins[IX86_BUILTIN_RINTPD];
30233 	  else if (out_n == 4 && in_n == 4)
30234 	    return ix86_builtins[IX86_BUILTIN_RINTPD256];
30235 	}
30236       break;
30237 
30238     case BUILT_IN_RINTF:
30239       /* The round insn does not trap on denormals.  */
30240       if (flag_trapping_math || !TARGET_ROUND)
30241 	break;
30242 
30243       if (out_mode == SFmode && in_mode == SFmode)
30244 	{
30245 	  if (out_n == 4 && in_n == 4)
30246 	    return ix86_builtins[IX86_BUILTIN_RINTPS];
30247 	  else if (out_n == 8 && in_n == 8)
30248 	    return ix86_builtins[IX86_BUILTIN_RINTPS256];
30249 	}
30250       break;
30251 
30252     case BUILT_IN_ROUND:
30253       /* The round insn does not trap on denormals.  */
30254       if (flag_trapping_math || !TARGET_ROUND)
30255 	break;
30256 
30257       if (out_mode == DFmode && in_mode == DFmode)
30258 	{
30259 	  if (out_n == 2 && in_n == 2)
30260 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30261 	  else if (out_n == 4 && in_n == 4)
30262 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30263 	}
30264       break;
30265 
30266     case BUILT_IN_ROUNDF:
30267       /* The round insn does not trap on denormals.  */
30268       if (flag_trapping_math || !TARGET_ROUND)
30269 	break;
30270 
30271       if (out_mode == SFmode && in_mode == SFmode)
30272 	{
30273 	  if (out_n == 4 && in_n == 4)
30274 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30275 	  else if (out_n == 8 && in_n == 8)
30276 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30277 	}
30278       break;
30279 
30280     case BUILT_IN_FMA:
30281       if (out_mode == DFmode && in_mode == DFmode)
30282 	{
30283 	  if (out_n == 2 && in_n == 2)
30284 	    return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30285 	  if (out_n == 4 && in_n == 4)
30286 	    return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30287 	}
30288       break;
30289 
30290     case BUILT_IN_FMAF:
30291       if (out_mode == SFmode && in_mode == SFmode)
30292 	{
30293 	  if (out_n == 4 && in_n == 4)
30294 	    return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30295 	  if (out_n == 8 && in_n == 8)
30296 	    return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30297 	}
30298       break;
30299 
30300     default:
30301       break;
30302     }
30303 
30304   /* Dispatch to a handler for a vectorization library.  */
30305   if (ix86_veclib_handler)
30306     return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30307 				type_in);
30308 
30309   return NULL_TREE;
30310 }
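/* Editorial example, derived from the switch above: when the vectorizer asks
   for a V2DF -> V2DF variant of BUILT_IN_SQRT, this hook returns the decl
   recorded for IX86_BUILTIN_SQRTPD; requests not handled by the switch fall
   through to the optional vectorization library handler at the end of the
   function.  */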
30311 
30312 /* Handler for an SVML-style interface to
30313    a library with vectorized intrinsics.  */
30314 
30315 static tree
30316 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30317 {
30318   char name[20];
30319   tree fntype, new_fndecl, args;
30320   unsigned arity;
30321   const char *bname;
30322   enum machine_mode el_mode, in_mode;
30323   int n, in_n;
30324 
30325   /* The SVML library is suitable for unsafe math only.  */
30326   if (!flag_unsafe_math_optimizations)
30327     return NULL_TREE;
30328 
30329   el_mode = TYPE_MODE (TREE_TYPE (type_out));
30330   n = TYPE_VECTOR_SUBPARTS (type_out);
30331   in_mode = TYPE_MODE (TREE_TYPE (type_in));
30332   in_n = TYPE_VECTOR_SUBPARTS (type_in);
30333   if (el_mode != in_mode
30334       || n != in_n)
30335     return NULL_TREE;
30336 
30337   switch (fn)
30338     {
30339     case BUILT_IN_EXP:
30340     case BUILT_IN_LOG:
30341     case BUILT_IN_LOG10:
30342     case BUILT_IN_POW:
30343     case BUILT_IN_TANH:
30344     case BUILT_IN_TAN:
30345     case BUILT_IN_ATAN:
30346     case BUILT_IN_ATAN2:
30347     case BUILT_IN_ATANH:
30348     case BUILT_IN_CBRT:
30349     case BUILT_IN_SINH:
30350     case BUILT_IN_SIN:
30351     case BUILT_IN_ASINH:
30352     case BUILT_IN_ASIN:
30353     case BUILT_IN_COSH:
30354     case BUILT_IN_COS:
30355     case BUILT_IN_ACOSH:
30356     case BUILT_IN_ACOS:
30357       if (el_mode != DFmode || n != 2)
30358 	return NULL_TREE;
30359       break;
30360 
30361     case BUILT_IN_EXPF:
30362     case BUILT_IN_LOGF:
30363     case BUILT_IN_LOG10F:
30364     case BUILT_IN_POWF:
30365     case BUILT_IN_TANHF:
30366     case BUILT_IN_TANF:
30367     case BUILT_IN_ATANF:
30368     case BUILT_IN_ATAN2F:
30369     case BUILT_IN_ATANHF:
30370     case BUILT_IN_CBRTF:
30371     case BUILT_IN_SINHF:
30372     case BUILT_IN_SINF:
30373     case BUILT_IN_ASINHF:
30374     case BUILT_IN_ASINF:
30375     case BUILT_IN_COSHF:
30376     case BUILT_IN_COSF:
30377     case BUILT_IN_ACOSHF:
30378     case BUILT_IN_ACOSF:
30379       if (el_mode != SFmode || n != 4)
30380 	return NULL_TREE;
30381       break;
30382 
30383     default:
30384       return NULL_TREE;
30385     }
30386 
30387   bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30388 
30389   if (fn == BUILT_IN_LOGF)
30390     strcpy (name, "vmlsLn4");
30391   else if (fn == BUILT_IN_LOG)
30392     strcpy (name, "vmldLn2");
30393   else if (n == 4)
30394     {
30395       sprintf (name, "vmls%s", bname+10);
30396       name[strlen (name)-1] = '4';
30397     }
30398   else
30399     sprintf (name, "vmld%s2", bname+10);
30400 
30401   /* Convert to uppercase. */
30402   name[4] &= ~0x20;
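  /* Editorial examples, derived from the mangling above: BUILT_IN_SINF with
     four SFmode lanes yields "vmlsSin4", while BUILT_IN_SIN with two DFmode
     lanes yields "vmldSin2"; the log functions were special-cased to
     "vmlsLn4"/"vmldLn2" before this point.  */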
30403 
30404   arity = 0;
30405   for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30406        args;
30407        args = TREE_CHAIN (args))
30408     arity++;
30409 
30410   if (arity == 1)
30411     fntype = build_function_type_list (type_out, type_in, NULL);
30412   else
30413     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30414 
30415   /* Build a function declaration for the vectorized function.  */
30416   new_fndecl = build_decl (BUILTINS_LOCATION,
30417 			   FUNCTION_DECL, get_identifier (name), fntype);
30418   TREE_PUBLIC (new_fndecl) = 1;
30419   DECL_EXTERNAL (new_fndecl) = 1;
30420   DECL_IS_NOVOPS (new_fndecl) = 1;
30421   TREE_READONLY (new_fndecl) = 1;
30422 
30423   return new_fndecl;
30424 }
30425 
30426 /* Handler for an ACML-style interface to
30427    a library with vectorized intrinsics.  */
30428 
30429 static tree
30430 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30431 {
30432   char name[20] = "__vr.._";
30433   tree fntype, new_fndecl, args;
30434   unsigned arity;
30435   const char *bname;
30436   enum machine_mode el_mode, in_mode;
30437   int n, in_n;
30438 
30439   /* The ACML library is 64-bit only and suitable for unsafe math only,
30440      as it does not correctly support parts of IEEE (such as denormals)
30441      with the required precision.  */
30442   if (!TARGET_64BIT
30443       || !flag_unsafe_math_optimizations)
30444     return NULL_TREE;
30445 
30446   el_mode = TYPE_MODE (TREE_TYPE (type_out));
30447   n = TYPE_VECTOR_SUBPARTS (type_out);
30448   in_mode = TYPE_MODE (TREE_TYPE (type_in));
30449   in_n = TYPE_VECTOR_SUBPARTS (type_in);
30450   if (el_mode != in_mode
30451       || n != in_n)
30452     return NULL_TREE;
30453 
30454   switch (fn)
30455     {
30456     case BUILT_IN_SIN:
30457     case BUILT_IN_COS:
30458     case BUILT_IN_EXP:
30459     case BUILT_IN_LOG:
30460     case BUILT_IN_LOG2:
30461     case BUILT_IN_LOG10:
30462       name[4] = 'd';
30463       name[5] = '2';
30464       if (el_mode != DFmode
30465 	  || n != 2)
30466 	return NULL_TREE;
30467       break;
30468 
30469     case BUILT_IN_SINF:
30470     case BUILT_IN_COSF:
30471     case BUILT_IN_EXPF:
30472     case BUILT_IN_POWF:
30473     case BUILT_IN_LOGF:
30474     case BUILT_IN_LOG2F:
30475     case BUILT_IN_LOG10F:
30476       name[4] = 's';
30477       name[5] = '4';
30478       if (el_mode != SFmode
30479 	  || n != 4)
30480 	return NULL_TREE;
30481       break;
30482 
30483     default:
30484       return NULL_TREE;
30485     }
30486 
30487   bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30488   sprintf (name + 7, "%s", bname+10);
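  /* Editorial examples, derived from the code above: BUILT_IN_SINF produces
     the name "__vrs4_sinf" and BUILT_IN_SIN produces "__vrd2_sin", matching
     the ACML vector routine naming scheme.  */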
30489 
30490   arity = 0;
30491   for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30492        args;
30493        args = TREE_CHAIN (args))
30494     arity++;
30495 
30496   if (arity == 1)
30497     fntype = build_function_type_list (type_out, type_in, NULL);
30498   else
30499     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30500 
30501   /* Build a function declaration for the vectorized function.  */
30502   new_fndecl = build_decl (BUILTINS_LOCATION,
30503 			   FUNCTION_DECL, get_identifier (name), fntype);
30504   TREE_PUBLIC (new_fndecl) = 1;
30505   DECL_EXTERNAL (new_fndecl) = 1;
30506   DECL_IS_NOVOPS (new_fndecl) = 1;
30507   TREE_READONLY (new_fndecl) = 1;
30508 
30509   return new_fndecl;
30510 }
30511 
30512 /* Returns a decl of a function that implements a gather load with
30513    memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30514    Returns NULL_TREE if no such function is available.  */
30515 
30516 static tree
30517 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30518 			       const_tree index_type, int scale)
30519 {
30520   bool si;
30521   enum ix86_builtins code;
30522 
30523   if (! TARGET_AVX2)
30524     return NULL_TREE;
30525 
30526   if ((TREE_CODE (index_type) != INTEGER_TYPE
30527        && !POINTER_TYPE_P (index_type))
30528       || (TYPE_MODE (index_type) != SImode
30529 	  && TYPE_MODE (index_type) != DImode))
30530     return NULL_TREE;
30531 
30532   if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30533     return NULL_TREE;
30534 
30535   /* v*gather* insn sign extends index to pointer mode.  */
30536   if (TYPE_PRECISION (index_type) < POINTER_SIZE
30537       && TYPE_UNSIGNED (index_type))
30538     return NULL_TREE;
30539 
30540   if (scale <= 0
30541       || scale > 8
30542       || (scale & (scale - 1)) != 0)
30543     return NULL_TREE;
30544 
30545   si = TYPE_MODE (index_type) == SImode;
30546   switch (TYPE_MODE (mem_vectype))
30547     {
30548     case V2DFmode:
30549       code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30550       break;
30551     case V4DFmode:
30552       code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30553       break;
30554     case V2DImode:
30555       code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30556       break;
30557     case V4DImode:
30558       code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30559       break;
30560     case V4SFmode:
30561       code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30562       break;
30563     case V8SFmode:
30564       code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30565       break;
30566     case V4SImode:
30567       code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30568       break;
30569     case V8SImode:
30570       code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30571       break;
30572     default:
30573       return NULL_TREE;
30574     }
30575 
30576   return ix86_builtins[code];
30577 }
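/* Editorial example, derived from the switch above: gathering a V4DF vector
   with a 32-bit index type maps to IX86_BUILTIN_GATHERALTSIV4DF, while a
   64-bit index type maps to IX86_BUILTIN_GATHERDIV4DF.  */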
30578 
30579 /* Returns a decl for a target-specific builtin that implements the
30580    reciprocal of the function, or NULL_TREE if not available.  */
30581 
30582 static tree
30583 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30584 			 bool sqrt ATTRIBUTE_UNUSED)
30585 {
30586   if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30587 	 && flag_finite_math_only && !flag_trapping_math
30588 	 && flag_unsafe_math_optimizations))
30589     return NULL_TREE;
30590 
30591   if (md_fn)
30592     /* Machine dependent builtins.  */
30593     switch (fn)
30594       {
30595 	/* Vectorized version of sqrt to rsqrt conversion.  */
30596       case IX86_BUILTIN_SQRTPS_NR:
30597 	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30598 
30599       case IX86_BUILTIN_SQRTPS_NR256:
30600 	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30601 
30602       default:
30603 	return NULL_TREE;
30604       }
30605   else
30606     /* Normal builtins.  */
30607     switch (fn)
30608       {
30609 	/* Sqrt to rsqrt conversion.  */
30610       case BUILT_IN_SQRTF:
30611 	return ix86_builtins[IX86_BUILTIN_RSQRTF];
30612 
30613       default:
30614 	return NULL_TREE;
30615       }
30616 }
30617 
30618 /* Helper for avx_vpermilps256_operand et al.  This is also used by
30619    the expansion functions to turn the parallel back into a mask.
30620    The return value is 0 for no match and the imm8+1 for a match.  */
30621 
30622 int
30623 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30624 {
30625   unsigned i, nelt = GET_MODE_NUNITS (mode);
30626   unsigned mask = 0;
30627   unsigned char ipar[8];
30628 
30629   if (XVECLEN (par, 0) != (int) nelt)
30630     return 0;
30631 
30632   /* Validate that all of the elements are constants, and not totally
30633      out of range.  Copy the data into an integral array to make the
30634      subsequent checks easier.  */
30635   for (i = 0; i < nelt; ++i)
30636     {
30637       rtx er = XVECEXP (par, 0, i);
30638       unsigned HOST_WIDE_INT ei;
30639 
30640       if (!CONST_INT_P (er))
30641 	return 0;
30642       ei = INTVAL (er);
30643       if (ei >= nelt)
30644 	return 0;
30645       ipar[i] = ei;
30646     }
30647 
30648   switch (mode)
30649     {
30650     case V4DFmode:
30651       /* In the 256-bit DFmode case, we can only move elements within
30652          a 128-bit lane.  */
30653       for (i = 0; i < 2; ++i)
30654 	{
30655 	  if (ipar[i] >= 2)
30656 	    return 0;
30657 	  mask |= ipar[i] << i;
30658 	}
30659       for (i = 2; i < 4; ++i)
30660 	{
30661 	  if (ipar[i] < 2)
30662 	    return 0;
30663 	  mask |= (ipar[i] - 2) << i;
30664 	}
30665       break;
30666 
30667     case V8SFmode:
30668       /* In the 256-bit SFmode case, we have full freedom of movement
30669 	 within the low 128-bit lane, but the high 128-bit lane must
30670 	 mirror the exact same pattern.  */
30671       for (i = 0; i < 4; ++i)
30672 	if (ipar[i] + 4 != ipar[i + 4])
30673 	  return 0;
30674       nelt = 4;
30675       /* FALLTHRU */
30676 
30677     case V2DFmode:
30678     case V4SFmode:
30679       /* In the 128-bit case, we've full freedom in the placement of
30680 	 the elements from the source operand.  */
30681       for (i = 0; i < nelt; ++i)
30682 	mask |= ipar[i] << (i * (nelt / 2));
30683       break;
30684 
30685     default:
30686       gcc_unreachable ();
30687     }
30688 
30689   /* Make sure success has a non-zero value by adding one.  */
30690   return mask + 1;
30691 }
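/* Editorial example, derived from the V4SFmode case above: a parallel
   selecting elements (1 0 3 2) encodes as imm8 0xb1, so the function returns
   0xb2 (the mask plus one).  */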
30692 
30693 /* Helper for avx_vperm2f128_v4df_operand et al.  This is also used by
30694    the expansion functions to turn the parallel back into a mask.
30695    The return value is 0 for no match and the imm8+1 for a match.  */
30696 
30697 int
30698 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30699 {
30700   unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30701   unsigned mask = 0;
30702   unsigned char ipar[8];
30703 
30704   if (XVECLEN (par, 0) != (int) nelt)
30705     return 0;
30706 
30707   /* Validate that all of the elements are constants, and not totally
30708      out of range.  Copy the data into an integral array to make the
30709      subsequent checks easier.  */
30710   for (i = 0; i < nelt; ++i)
30711     {
30712       rtx er = XVECEXP (par, 0, i);
30713       unsigned HOST_WIDE_INT ei;
30714 
30715       if (!CONST_INT_P (er))
30716 	return 0;
30717       ei = INTVAL (er);
30718       if (ei >= 2 * nelt)
30719 	return 0;
30720       ipar[i] = ei;
30721     }
30722 
30723   /* Validate that each half of the permute is a run of consecutive elements.  */
30724   for (i = 0; i < nelt2 - 1; ++i)
30725     if (ipar[i] + 1 != ipar[i + 1])
30726       return 0;
30727   for (i = nelt2; i < nelt - 1; ++i)
30728     if (ipar[i] + 1 != ipar[i + 1])
30729       return 0;
30730 
30731   /* Reconstruct the mask.  */
30732   for (i = 0; i < 2; ++i)
30733     {
30734       unsigned e = ipar[i * nelt2];
30735       if (e % nelt2)
30736 	return 0;
30737       e /= nelt2;
30738       mask |= e << (i * 4);
30739     }
30740 
30741   /* Make sure success has a non-zero value by adding one.  */
30742   return mask + 1;
30743 }
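/* Editorial example, derived from the code above: for V8SFmode a parallel of
   (0 1 2 3 12 13 14 15) selects the low lane of the first operand and the
   high lane of the second, encoding as imm8 0x30; the function returns 0x31
   (the mask plus one).  */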
30744 
30745 /* Store OPERAND to memory after reload is completed.  This means
30746    that we can't easily use assign_stack_local.  */
30747 rtx
30748 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30749 {
30750   rtx result;
30751 
30752   gcc_assert (reload_completed);
30753   if (ix86_using_red_zone ())
30754     {
30755       result = gen_rtx_MEM (mode,
30756 			    gen_rtx_PLUS (Pmode,
30757 					  stack_pointer_rtx,
30758 					  GEN_INT (-RED_ZONE_SIZE)));
30759       emit_move_insn (result, operand);
30760     }
30761   else if (TARGET_64BIT)
30762     {
30763       switch (mode)
30764 	{
30765 	case HImode:
30766 	case SImode:
30767 	  operand = gen_lowpart (DImode, operand);
30768 	  /* FALLTHRU */
30769 	case DImode:
30770 	  emit_insn (
30771 		      gen_rtx_SET (VOIDmode,
30772 				   gen_rtx_MEM (DImode,
30773 						gen_rtx_PRE_DEC (DImode,
30774 							stack_pointer_rtx)),
30775 				   operand));
30776 	  break;
30777 	default:
30778 	  gcc_unreachable ();
30779 	}
30780       result = gen_rtx_MEM (mode, stack_pointer_rtx);
30781     }
30782   else
30783     {
30784       switch (mode)
30785 	{
30786 	case DImode:
30787 	  {
30788 	    rtx operands[2];
30789 	    split_double_mode (mode, &operand, 1, operands, operands + 1);
30790 	    emit_insn (
30791 			gen_rtx_SET (VOIDmode,
30792 				     gen_rtx_MEM (SImode,
30793 						  gen_rtx_PRE_DEC (Pmode,
30794 							stack_pointer_rtx)),
30795 				     operands[1]));
30796 	    emit_insn (
30797 			gen_rtx_SET (VOIDmode,
30798 				     gen_rtx_MEM (SImode,
30799 						  gen_rtx_PRE_DEC (Pmode,
30800 							stack_pointer_rtx)),
30801 				     operands[0]));
30802 	  }
30803 	  break;
30804 	case HImode:
30805 	  /* Store HImodes as SImodes.  */
30806 	  operand = gen_lowpart (SImode, operand);
30807 	  /* FALLTHRU */
30808 	case SImode:
30809 	  emit_insn (
30810 		      gen_rtx_SET (VOIDmode,
30811 				   gen_rtx_MEM (GET_MODE (operand),
30812 						gen_rtx_PRE_DEC (SImode,
30813 							stack_pointer_rtx)),
30814 				   operand));
30815 	  break;
30816 	default:
30817 	  gcc_unreachable ();
30818 	}
30819       result = gen_rtx_MEM (mode, stack_pointer_rtx);
30820     }
30821   return result;
30822 }
30823 
30824 /* Free the memory that ix86_force_to_memory used to store an operand.  */
30825 void
30826 ix86_free_from_memory (enum machine_mode mode)
30827 {
30828   if (!ix86_using_red_zone ())
30829     {
30830       int size;
30831 
30832       if (mode == DImode || TARGET_64BIT)
30833 	size = 8;
30834       else
30835 	size = 4;
30836       /* Use LEA to deallocate stack space.  In peephole2 it will be converted
30837          to a pop or add instruction if registers are available.  */
30838       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30839 			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30840 					    GEN_INT (size))));
30841     }
30842 }
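/* Editorial note: ix86_force_to_memory and ix86_free_from_memory are used as
   a pair after reload; a caller stores a value with the former, uses the
   returned MEM, and then releases the stack space with the latter for the
   same mode.  */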
30843 
30844 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30845 
30846    Put float CONST_DOUBLE in the constant pool instead of fp regs.
30847    QImode must go into class Q_REGS.
30848    Narrow ALL_REGS to GENERAL_REGS.  This supports allowing movsf and
30849    movdf to do mem-to-mem moves through integer regs.  */
30850 
30851 static reg_class_t
30852 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30853 {
30854   enum machine_mode mode = GET_MODE (x);
30855 
30856   /* We're only allowed to return a subclass of CLASS.  Many of the
30857      following checks fail for NO_REGS, so eliminate that early.  */
30858   if (regclass == NO_REGS)
30859     return NO_REGS;
30860 
30861   /* All classes can load zeros.  */
30862   if (x == CONST0_RTX (mode))
30863     return regclass;
30864 
30865   /* Force constants into memory if we are loading a (nonzero) constant into
30866      an MMX or SSE register.  This is because there are no MMX/SSE instructions
30867      to load from a constant.  */
30868   if (CONSTANT_P (x)
30869       && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30870     return NO_REGS;
30871 
30872   /* Prefer SSE regs only, if we can use them for math.  */
30873   if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30874     return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30875 
30876   /* Floating-point constants need more complex checks.  */
30877   if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30878     {
30879       /* General regs can load everything.  */
30880       if (reg_class_subset_p (regclass, GENERAL_REGS))
30881         return regclass;
30882 
30883       /* Floats can load 0 and 1 plus some others.  Note that we eliminated
30884 	 zero above.  We only want to wind up preferring 80387 registers if
30885 	 we plan on doing computation with them.  */
30886       if (TARGET_80387
30887 	  && standard_80387_constant_p (x) > 0)
30888 	{
30889 	  /* Limit class to non-sse.  */
30890 	  if (regclass == FLOAT_SSE_REGS)
30891 	    return FLOAT_REGS;
30892 	  if (regclass == FP_TOP_SSE_REGS)
30893 	    return FP_TOP_REG;
30894 	  if (regclass == FP_SECOND_SSE_REGS)
30895 	    return FP_SECOND_REG;
30896 	  if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30897 	    return regclass;
30898 	}
30899 
30900       return NO_REGS;
30901     }
30902 
30903   /* Generally when we see PLUS here, it's the function invariant
30904      (plus soft-fp const_int), which can only be computed into general
30905      regs.  */
30906   if (GET_CODE (x) == PLUS)
30907     return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30908 
30909   /* QImode constants are easy to load, but non-constant QImode data
30910      must go into Q_REGS.  */
30911   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30912     {
30913       if (reg_class_subset_p (regclass, Q_REGS))
30914 	return regclass;
30915       if (reg_class_subset_p (Q_REGS, regclass))
30916 	return Q_REGS;
30917       return NO_REGS;
30918     }
30919 
30920   return regclass;
30921 }
30922 
30923 /* Discourage putting floating-point values in SSE registers unless
30924    SSE math is being used, and likewise for the 387 registers.  */
30925 static reg_class_t
30926 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30927 {
30928   enum machine_mode mode = GET_MODE (x);
30929 
30930   /* Restrict the output reload class to the register bank that we are doing
30931      math on.  If we would like not to return a subset of CLASS, reject this
30932      alternative: if reload cannot do this, it will still use its choice.  */
30933   mode = GET_MODE (x);
30934   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30935     return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30936 
30937   if (X87_FLOAT_MODE_P (mode))
30938     {
30939       if (regclass == FP_TOP_SSE_REGS)
30940 	return FP_TOP_REG;
30941       else if (regclass == FP_SECOND_SSE_REGS)
30942 	return FP_SECOND_REG;
30943       else
30944 	return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30945     }
30946 
30947   return regclass;
30948 }
30949 
30950 static reg_class_t
30951 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30952 		       enum machine_mode mode, secondary_reload_info *sri)
30953 {
30954   /* Double-word spills from general registers to non-offsettable memory
30955      references (zero-extended addresses) require special handling.  */
30956   if (TARGET_64BIT
30957       && MEM_P (x)
30958       && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30959       && rclass == GENERAL_REGS
30960       && !offsettable_memref_p (x))
30961     {
30962       sri->icode = (in_p
30963 		    ? CODE_FOR_reload_noff_load
30964 		    : CODE_FOR_reload_noff_store);
30965       /* Add the cost of moving address to a temporary.  */
30966       sri->extra_cost = 1;
30967 
30968       return NO_REGS;
30969     }
30970 
30971   /* QImode spills from non-QI registers require
30972      an intermediate register on 32-bit targets.  */
30973   if (!TARGET_64BIT
30974       && !in_p && mode == QImode
30975       && (rclass == GENERAL_REGS
30976 	  || rclass == LEGACY_REGS
30977 	  || rclass == INDEX_REGS))
30978     {
30979       int regno;
30980 
30981       if (REG_P (x))
30982 	regno = REGNO (x);
30983       else
30984 	regno = -1;
30985 
30986       if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30987 	regno = true_regnum (x);
30988 
30989       /* Return Q_REGS if the operand is in memory.  */
30990       if (regno == -1)
30991 	return Q_REGS;
30992     }
30993 
30994   /* This condition handles corner case where an expression involving
30995      pointers gets vectorized.  We're trying to use the address of a
30996      stack slot as a vector initializer.
30997 
30998      (set (reg:V2DI 74 [ vect_cst_.2 ])
30999           (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31000 
31001      Eventually frame gets turned into sp+offset like this:
31002 
31003      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31004           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31005 	                               (const_int 392 [0x188]))))
31006 
31007      That later gets turned into:
31008 
31009      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31010           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31011 	    (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31012 
31013      We'll have the following reload recorded:
31014 
31015      Reload 0: reload_in (DI) =
31016            (plus:DI (reg/f:DI 7 sp)
31017             (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31018      reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31019      SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31020      reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31021      reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31022      reload_reg_rtx: (reg:V2DI 22 xmm1)
31023 
31024      Which isn't going to work since SSE instructions can't handle scalar
31025      additions.  Returning GENERAL_REGS forces the addition into integer
31026      additions.  Returning GENERAL_REGS forces the addition into an integer
31027      register, and reload can handle subsequent reloads without problems.  */
31028   if (in_p && GET_CODE (x) == PLUS
31029       && SSE_CLASS_P (rclass)
31030       && SCALAR_INT_MODE_P (mode))
31031     return GENERAL_REGS;
31032 
31033   return NO_REGS;
31034 }
31035 
31036 /* Implement TARGET_CLASS_LIKELY_SPILLED_P.  */
31037 
31038 static bool
31039 ix86_class_likely_spilled_p (reg_class_t rclass)
31040 {
31041   switch (rclass)
31042     {
31043       case AREG:
31044       case DREG:
31045       case CREG:
31046       case BREG:
31047       case AD_REGS:
31048       case SIREG:
31049       case DIREG:
31050       case SSE_FIRST_REG:
31051       case FP_TOP_REG:
31052       case FP_SECOND_REG:
31053 	return true;
31054 
31055       default:
31056 	break;
31057     }
31058 
31059   return false;
31060 }
31061 
31062 /* If we are copying between general and FP registers, we need a memory
31063    location. The same is true for SSE and MMX registers.
31064 
31065    To optimize register_move_cost performance, allow inline variant.
31066 
31067    The function can't work reliably when one of the CLASSES is a class
31068    containing registers from multiple units (SSE, MMX, integer).  We avoid this
31069    by never combining those units in a single alternative in the machine description.
31070    Ensure that this constraint holds to avoid unexpected surprises.
31071 
31072    When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31073    enforce these sanity checks.  */
31074 
31075 static inline bool
31076 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31077 				enum machine_mode mode, int strict)
31078 {
31079   if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31080       || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31081       || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31082       || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31083       || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31084       || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31085     {
31086       gcc_assert (!strict);
31087       return true;
31088     }
31089 
31090   if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31091     return true;
31092 
31093   /* ??? This is a lie.  We do have moves between mmx/general, and for
31094      mmx/sse2.  But by saying we need secondary memory we discourage the
31095      register allocator from using the mmx registers unless needed.  */
31096   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31097     return true;
31098 
31099   if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31100     {
31101       /* SSE1 doesn't have any direct moves from other classes.  */
31102       if (!TARGET_SSE2)
31103 	return true;
31104 
31105       /* If the target says that inter-unit moves are more expensive
31106 	 than moving through memory, then don't generate them.  */
31107       if (!TARGET_INTER_UNIT_MOVES)
31108 	return true;
31109 
31110       /* Between SSE and general, we have moves no larger than word size.  */
31111       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31112 	return true;
31113     }
31114 
31115   return false;
31116 }
31117 
31118 bool
31119 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31120 			      enum machine_mode mode, int strict)
31121 {
31122   return inline_secondary_memory_needed (class1, class2, mode, strict);
31123 }
31124 
31125 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31126 
31127    On the 80386, this is the size of MODE in words,
31128    except in the FP regs, where a single reg is always enough.  */
31129 
31130 static unsigned char
31131 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31132 {
31133   if (MAYBE_INTEGER_CLASS_P (rclass))
31134     {
31135       if (mode == XFmode)
31136 	return (TARGET_64BIT ? 2 : 3);
31137       else if (mode == XCmode)
31138 	return (TARGET_64BIT ? 4 : 6);
31139       else
31140 	return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31141     }
31142   else
31143     {
31144       if (COMPLEX_MODE_P (mode))
31145 	return 2;
31146       else
31147 	return 1;
31148     }
31149 }
31150 
31151 /* Return true if the registers in CLASS cannot represent the change from
31152    modes FROM to TO.  */
31153 
31154 bool
31155 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31156 			       enum reg_class regclass)
31157 {
31158   if (from == to)
31159     return false;
31160 
31161   /* x87 registers can't do subreg at all, as all values are reformatted
31162      to extended precision.  */
31163   if (MAYBE_FLOAT_CLASS_P (regclass))
31164     return true;
31165 
31166   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31167     {
31168       /* Vector registers do not support QI or HImode loads.  If we don't
31169 	 disallow a change to these modes, reload will assume it's ok to
31170 	 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
31171 	 the vec_dupv4hi pattern.  */
31172       if (GET_MODE_SIZE (from) < 4)
31173 	return true;
31174 
31175       /* Vector registers do not support subreg with nonzero offsets, which
31176 	 are otherwise valid for integer registers.  Since we can't see
31177 	 whether we have a nonzero offset from here, prohibit all
31178          nonparadoxical subregs changing size.  */
31179       if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31180 	return true;
31181     }
31182 
31183   return false;
31184 }
31185 
31186 /* Return the cost of moving data of mode M between a
31187    register and memory.  A value of 2 is the default; this cost is
31188    relative to those in `REGISTER_MOVE_COST'.
31189 
31190    This function is used extensively by register_move_cost, which is used to
31191    build tables at startup.  Make it inline in this case.
31192    When IN is 2, return the maximum of the in and out move costs.
31193 
31194    If moving between registers and memory is more expensive than
31195    between two registers, you should define this macro to express the
31196    relative cost.
31197 
31198    Also model the increased cost of moving QImode registers in non-Q_REGS
31199    classes.
31200  */
31201 static inline int
31202 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31203 			 int in)
31204 {
31205   int cost;
31206   if (FLOAT_CLASS_P (regclass))
31207     {
31208       int index;
31209       switch (mode)
31210 	{
31211 	  case SFmode:
31212 	    index = 0;
31213 	    break;
31214 	  case DFmode:
31215 	    index = 1;
31216 	    break;
31217 	  case XFmode:
31218 	    index = 2;
31219 	    break;
31220 	  default:
31221 	    return 100;
31222 	}
31223       if (in == 2)
31224         return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31225       return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31226     }
31227   if (SSE_CLASS_P (regclass))
31228     {
31229       int index;
31230       switch (GET_MODE_SIZE (mode))
31231 	{
31232 	  case 4:
31233 	    index = 0;
31234 	    break;
31235 	  case 8:
31236 	    index = 1;
31237 	    break;
31238 	  case 16:
31239 	    index = 2;
31240 	    break;
31241 	  default:
31242 	    return 100;
31243 	}
31244       if (in == 2)
31245         return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31246       return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31247     }
31248   if (MMX_CLASS_P (regclass))
31249     {
31250       int index;
31251       switch (GET_MODE_SIZE (mode))
31252 	{
31253 	  case 4:
31254 	    index = 0;
31255 	    break;
31256 	  case 8:
31257 	    index = 1;
31258 	    break;
31259 	  default:
31260 	    return 100;
31261 	}
31262       if (in == 2)
31263         return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31264       return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31265     }
31266   switch (GET_MODE_SIZE (mode))
31267     {
31268       case 1:
31269 	if (Q_CLASS_P (regclass) || TARGET_64BIT)
31270 	  {
31271 	    if (!in)
31272 	      return ix86_cost->int_store[0];
31273 	    if (TARGET_PARTIAL_REG_DEPENDENCY
31274 	        && optimize_function_for_speed_p (cfun))
31275 	      cost = ix86_cost->movzbl_load;
31276 	    else
31277 	      cost = ix86_cost->int_load[0];
31278 	    if (in == 2)
31279 	      return MAX (cost, ix86_cost->int_store[0]);
31280 	    return cost;
31281 	  }
31282 	else
31283 	  {
31284 	   if (in == 2)
31285 	     return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31286 	   if (in)
31287 	     return ix86_cost->movzbl_load;
31288 	   else
31289 	     return ix86_cost->int_store[0] + 4;
31290 	  }
31291 	break;
31292       case 2:
31293 	if (in == 2)
31294 	  return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31295 	return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31296       default:
31297 	/* Compute number of 32bit moves needed.  TFmode is moved as XFmode.  */
31298 	if (mode == TFmode)
31299 	  mode = XFmode;
31300 	if (in == 2)
31301 	  cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31302 	else if (in)
31303 	  cost = ix86_cost->int_load[2];
31304 	else
31305 	  cost = ix86_cost->int_store[2];
31306 	return (cost * (((int) GET_MODE_SIZE (mode)
31307 		        + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31308     }
31309 }
31310 
31311 static int
31312 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31313 		       bool in)
31314 {
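  /* The target hook passes a bool IN; map it to the helper's 0/1 encoding.
     (The helper's value 2, the maximum of both directions, is not requested
     here.)  */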
31315   return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31316 }
31317 
31318 
31319 /* Return the cost of moving data from a register in class CLASS1 to
31320    one in class CLASS2.
31321 
31322    It is not required that the cost always equal 2 when CLASS1 is the same as
31323    CLASS2; on some machines it is expensive to move between registers if they
31324    are not general registers.  */
31325 
31326 static int
31327 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31328 			 reg_class_t class2_i)
31329 {
31330   enum reg_class class1 = (enum reg_class) class1_i;
31331   enum reg_class class2 = (enum reg_class) class2_i;
31332 
31333   /* In case we require secondary memory, compute the cost of the store
31334      followed by a load.  To avoid bad register allocation choices, this
31335      needs to be *at least* as high as the symmetric MEMORY_MOVE_COST.  */
31336 
31337   if (inline_secondary_memory_needed (class1, class2, mode, 0))
31338     {
31339       int cost = 1;
31340 
31341       cost += inline_memory_move_cost (mode, class1, 2);
31342       cost += inline_memory_move_cost (mode, class2, 2);
31343 
31344       /* When copying from a general purpose register we may emit multiple
31345          stores followed by a single load, causing a memory size mismatch
31346          stall.  Count this as an arbitrarily high cost of 20.  */
31347       if (targetm.class_max_nregs (class1, mode)
31348 	  > targetm.class_max_nregs (class2, mode))
31349 	cost += 20;
31350 
31351       /* In the case of FP/MMX moves, the registers actually overlap, and we
31352 	 have to switch modes in order to treat them differently.  */
31353       if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31354           || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31355 	cost += 20;
31356 
31357       return cost;
31358     }
31359 
31360   /* Moves between SSE/MMX and integer unit are expensive.  */
31361   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31362       || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31363 
31364     /* ??? By keeping the returned value relatively high, we limit the number
31365        of moves between integer and MMX/SSE registers for all targets.
31366        Additionally, a high value prevents a problem with x86_modes_tieable_p(),
31367        where integer modes in MMX/SSE registers are not tieable
31368        because of missing QImode and HImode moves to, from or between
31369        MMX/SSE registers.  */
31370     return MAX (8, ix86_cost->mmxsse_to_integer);
31371 
31372   if (MAYBE_FLOAT_CLASS_P (class1))
31373     return ix86_cost->fp_move;
31374   if (MAYBE_SSE_CLASS_P (class1))
31375     return ix86_cost->sse_move;
31376   if (MAYBE_MMX_CLASS_P (class1))
31377     return ix86_cost->mmx_move;
31378   return 2;
31379 }
31380 
31381 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31382    MODE.  */
31383 
31384 bool
31385 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31386 {
31387   /* Flags, and only flags, can hold CCmode values.  */
31388   if (CC_REGNO_P (regno))
31389     return GET_MODE_CLASS (mode) == MODE_CC;
31390   if (GET_MODE_CLASS (mode) == MODE_CC
31391       || GET_MODE_CLASS (mode) == MODE_RANDOM
31392       || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31393     return false;
31394   if (FP_REGNO_P (regno))
31395     return VALID_FP_MODE_P (mode);
31396   if (SSE_REGNO_P (regno))
31397     {
31398       /* We implement the move patterns for all vector modes into and
31399 	 out of SSE registers, even when no operation instructions
31400 	 are available.  OImode move is available only when AVX is
31401 	 enabled.  */
31402       return ((TARGET_AVX && mode == OImode)
31403 	      || VALID_AVX256_REG_MODE (mode)
31404 	      || VALID_SSE_REG_MODE (mode)
31405 	      || VALID_SSE2_REG_MODE (mode)
31406 	      || VALID_MMX_REG_MODE (mode)
31407 	      || VALID_MMX_REG_MODE_3DNOW (mode));
31408     }
31409   if (MMX_REGNO_P (regno))
31410     {
31411       /* We implement the move patterns for 3DNOW modes even in MMX mode,
31412 	 so if the register is available at all, then we can move data of
31413 	 the given mode into or out of it.  */
31414       return (VALID_MMX_REG_MODE (mode)
31415 	      || VALID_MMX_REG_MODE_3DNOW (mode));
31416     }
31417 
31418   if (mode == QImode)
31419     {
31420       /* Take care with QImode values - they can live in non-QI regs,
31421 	 but then they cause partial register stalls.  */
31422       if (regno <= BX_REG || TARGET_64BIT)
31423 	return true;
31424       if (!TARGET_PARTIAL_REG_STALL)
31425 	return true;
31426       return !can_create_pseudo_p ();
31427     }
31428   /* We handle both integers and floats in the general purpose registers.  */
31429   else if (VALID_INT_MODE_P (mode))
31430     return true;
31431   else if (VALID_FP_MODE_P (mode))
31432     return true;
31433   else if (VALID_DFP_MODE_P (mode))
31434     return true;
31435   /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
31436      on to use that value in smaller contexts, this can easily force a
31437      pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
31438      supporting DImode, allow it.  */
31439   else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31440     return true;
31441 
31442   return false;
31443 }
31444 
31445 /* A subroutine of ix86_modes_tieable_p.  Return true if MODE is a
31446    tieable integer mode.  */
31447 
31448 static bool
31449 ix86_tieable_integer_mode_p (enum machine_mode mode)
31450 {
31451   switch (mode)
31452     {
31453     case HImode:
31454     case SImode:
31455       return true;
31456 
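    /* QImode is tieable with the wider integer modes in 64-bit mode, and
       otherwise only when partial register stalls are not a concern, since a
       tied QImode value shares its register with a wider value.  */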
31457     case QImode:
31458       return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31459 
31460     case DImode:
31461       return TARGET_64BIT;
31462 
31463     default:
31464       return false;
31465     }
31466 }
31467 
31468 /* Return true if MODE1 is accessible in a register that can hold MODE2
31469    without copying.  That is, all register classes that can hold MODE2
31470    can also hold MODE1.  */
31471 
31472 bool
31473 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31474 {
31475   if (mode1 == mode2)
31476     return true;
31477 
31478   if (ix86_tieable_integer_mode_p (mode1)
31479       && ix86_tieable_integer_mode_p (mode2))
31480     return true;
31481 
31482   /* MODE2 being XFmode implies fp stack or general regs, which means we
31483      can tie any smaller floating point modes to it.  Note that we do not
31484      tie this with TFmode.  */
31485   if (mode2 == XFmode)
31486     return mode1 == SFmode || mode1 == DFmode;
31487 
31488   /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31489      that we can tie it with SFmode.  */
31490   if (mode2 == DFmode)
31491     return mode1 == SFmode;
31492 
31493   /* If MODE2 is only appropriate for an SSE register, then tie with
31494      any other mode acceptable to SSE registers.  */
31495   if (GET_MODE_SIZE (mode2) == 16
31496       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31497     return (GET_MODE_SIZE (mode1) == 16
31498 	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31499 
31500   /* If MODE2 is appropriate for an MMX register, then tie
31501      with any other mode acceptable to MMX registers.  */
31502   if (GET_MODE_SIZE (mode2) == 8
31503       && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31504     return (GET_MODE_SIZE (mode1) == 8
31505 	    && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31506 
31507   return false;
31508 }
31509 
31510 /* Compute a (partial) cost for rtx X.  Return true if the complete
31511    cost has been computed, and false if subexpressions should be
31512    scanned.  In either case, *TOTAL contains the cost result.  */
31513 
31514 static bool
31515 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31516 		bool speed)
31517 {
31518   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31519   enum machine_mode mode = GET_MODE (x);
31520   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31521 
31522   switch (code)
31523     {
31524     case CONST_INT:
31525     case CONST:
31526     case LABEL_REF:
31527     case SYMBOL_REF:
31528       if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31529 	*total = 3;
31530       else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31531 	*total = 2;
31532       else if (flag_pic && SYMBOLIC_CONST (x)
31533 	       && (!TARGET_64BIT
31534 		   || (GET_CODE (x) != LABEL_REF
31535 		       && (GET_CODE (x) != SYMBOL_REF
31536 		           || !SYMBOL_REF_LOCAL_P (x)))))
31537 	*total = 1;
31538       else
31539 	*total = 0;
31540       return true;
31541 
31542     case CONST_DOUBLE:
31543       if (mode == VOIDmode)
31544 	*total = 0;
31545       else
31546 	switch (standard_80387_constant_p (x))
31547 	  {
31548 	  case 1: /* 0.0 */
31549 	    *total = 1;
31550 	    break;
31551 	  default: /* Other constants */
31552 	    *total = 2;
31553 	    break;
31554 	  case 0:
31555 	  case -1:
31556 	    /* Start with (MEM (SYMBOL_REF)), since that's where
31557 	       it'll probably end up.  Add a penalty for size.  */
31558 	    *total = (COSTS_N_INSNS (1)
31559 		      + (flag_pic != 0 && !TARGET_64BIT)
31560 		      + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31561 	    break;
31562 	  }
31563       return true;
31564 
31565     case ZERO_EXTEND:
31566       /* Zero extension is often completely free on x86_64, so make
31567 	 it as cheap as possible.  */
31568       if (TARGET_64BIT && mode == DImode
31569 	  && GET_MODE (XEXP (x, 0)) == SImode)
31570 	*total = 1;
31571       else if (TARGET_ZERO_EXTEND_WITH_AND)
31572 	*total = cost->add;
31573       else
31574 	*total = cost->movzx;
31575       return false;
31576 
31577     case SIGN_EXTEND:
31578       *total = cost->movsx;
31579       return false;
31580 
31581     case ASHIFT:
31582       if (CONST_INT_P (XEXP (x, 1))
31583 	  && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31584 	{
31585 	  HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31586 	  if (value == 1)
31587 	    {
31588 	      *total = cost->add;
31589 	      return false;
31590 	    }
31591 	  if ((value == 2 || value == 3)
31592 	      && cost->lea <= cost->shift_const)
31593 	    {
31594 	      *total = cost->lea;
31595 	      return false;
31596 	    }
31597 	}
31598       /* FALLTHRU */
31599 
31600     case ROTATE:
31601     case ASHIFTRT:
31602     case LSHIFTRT:
31603     case ROTATERT:
31604       if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31605 	{
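	  /* Without native 64-bit shifts, a DImode shift is a multi-insn
	     double-word sequence; a variable count is costlier still unless
	     it has already been masked by an AND.  */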
31606 	  if (CONST_INT_P (XEXP (x, 1)))
31607 	    {
31608 	      if (INTVAL (XEXP (x, 1)) > 32)
31609 		*total = cost->shift_const + COSTS_N_INSNS (2);
31610 	      else
31611 		*total = cost->shift_const * 2;
31612 	    }
31613 	  else
31614 	    {
31615 	      if (GET_CODE (XEXP (x, 1)) == AND)
31616 		*total = cost->shift_var * 2;
31617 	      else
31618 		*total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31619 	    }
31620 	}
31621       else
31622 	{
31623 	  if (CONST_INT_P (XEXP (x, 1)))
31624 	    *total = cost->shift_const;
31625 	  else if (GET_CODE (XEXP (x, 1)) == SUBREG
31626 		   && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
31627 	    {
31628 	      /* Return the cost after shift-count truncation; the AND is free.  */
31629 	      *total = cost->shift_var;
31630 	      return true;
31631 	    }
31632 	  else
31633 	    *total = cost->shift_var;
31634 	}
31635       return false;
31636 
31637     case FMA:
31638       {
31639 	rtx sub;
31640 
31641         gcc_assert (FLOAT_MODE_P (mode));
31642         gcc_assert (TARGET_FMA || TARGET_FMA4);
31643 
31644         /* ??? SSE scalar/vector cost should be used here.  */
31645         /* ??? Bald assumption that fma has the same cost as fmul.  */
31646         *total = cost->fmul;
31647 	*total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31648 
31649         /* Negate in op0 or op2 is free: FMS, FNMA, FNMS.  */
31650 	sub = XEXP (x, 0);
31651 	if (GET_CODE (sub) == NEG)
31652 	  sub = XEXP (sub, 0);
31653 	*total += rtx_cost (sub, FMA, 0, speed);
31654 
31655 	sub = XEXP (x, 2);
31656 	if (GET_CODE (sub) == NEG)
31657 	  sub = XEXP (sub, 0);
31658 	*total += rtx_cost (sub, FMA, 2, speed);
31659 	return true;
31660       }
31661 
31662     case MULT:
31663       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31664 	{
31665 	  /* ??? SSE scalar cost should be used here.  */
31666 	  *total = cost->fmul;
31667 	  return false;
31668 	}
31669       else if (X87_FLOAT_MODE_P (mode))
31670 	{
31671 	  *total = cost->fmul;
31672 	  return false;
31673 	}
31674       else if (FLOAT_MODE_P (mode))
31675 	{
31676 	  /* ??? SSE vector cost should be used here.  */
31677 	  *total = cost->fmul;
31678 	  return false;
31679 	}
31680       else
31681 	{
31682 	  rtx op0 = XEXP (x, 0);
31683 	  rtx op1 = XEXP (x, 1);
31684 	  int nbits;
31685 	  if (CONST_INT_P (XEXP (x, 1)))
31686 	    {
31687 	      unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31688 	      for (nbits = 0; value != 0; value &= value - 1)
31689 	        nbits++;
31690 	    }
31691 	  else
31692 	    /* This is arbitrary.  */
31693 	    nbits = 7;
31694 
31695 	  /* Compute costs correctly for widening multiplication.  */
31696 	  if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31697 	      && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31698 	         == GET_MODE_SIZE (mode))
31699 	    {
31700 	      int is_mulwiden = 0;
31701 	      enum machine_mode inner_mode = GET_MODE (op0);
31702 
31703 	      if (GET_CODE (op0) == GET_CODE (op1))
31704 		is_mulwiden = 1, op1 = XEXP (op1, 0);
31705 	      else if (CONST_INT_P (op1))
31706 		{
31707 		  if (GET_CODE (op0) == SIGN_EXTEND)
31708 		    is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31709 			          == INTVAL (op1);
31710 		  else
31711 		    is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31712 	        }
31713 
31714 	      if (is_mulwiden)
31715 	        op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31716 	    }
31717 
31718   	  *total = (cost->mult_init[MODE_INDEX (mode)]
31719 		    + nbits * cost->mult_bit
31720 	            + rtx_cost (op0, outer_code, opno, speed)
31721 		    + rtx_cost (op1, outer_code, opno, speed));
31722 
31723           return true;
31724 	}
31725 
31726     case DIV:
31727     case UDIV:
31728     case MOD:
31729     case UMOD:
31730       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31731 	/* ??? SSE cost should be used here.  */
31732 	*total = cost->fdiv;
31733       else if (X87_FLOAT_MODE_P (mode))
31734 	*total = cost->fdiv;
31735       else if (FLOAT_MODE_P (mode))
31736 	/* ??? SSE vector cost should be used here.  */
31737 	*total = cost->fdiv;
31738       else
31739 	*total = cost->divide[MODE_INDEX (mode)];
31740       return false;
31741 
31742     case PLUS:
31743       if (GET_MODE_CLASS (mode) == MODE_INT
31744 	       && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31745 	{
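	  /* Recognize address-like expressions that a single lea can compute:
	     base + index*scale + displacement.  */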
31746 	  if (GET_CODE (XEXP (x, 0)) == PLUS
31747 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31748 	      && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31749 	      && CONSTANT_P (XEXP (x, 1)))
31750 	    {
31751 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31752 	      if (val == 2 || val == 4 || val == 8)
31753 		{
31754 		  *total = cost->lea;
31755 		  *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31756 				      outer_code, opno, speed);
31757 		  *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31758 				      outer_code, opno, speed);
31759 		  *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31760 		  return true;
31761 		}
31762 	    }
31763 	  else if (GET_CODE (XEXP (x, 0)) == MULT
31764 		   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31765 	    {
31766 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31767 	      if (val == 2 || val == 4 || val == 8)
31768 		{
31769 		  *total = cost->lea;
31770 		  *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31771 				      outer_code, opno, speed);
31772 		  *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31773 		  return true;
31774 		}
31775 	    }
31776 	  else if (GET_CODE (XEXP (x, 0)) == PLUS)
31777 	    {
31778 	      *total = cost->lea;
31779 	      *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31780 				  outer_code, opno, speed);
31781 	      *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31782 				  outer_code, opno, speed);
31783 	      *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31784 	      return true;
31785 	    }
31786 	}
31787       /* FALLTHRU */
31788 
31789     case MINUS:
31790       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31791 	{
31792 	  /* ??? SSE cost should be used here.  */
31793 	  *total = cost->fadd;
31794 	  return false;
31795 	}
31796       else if (X87_FLOAT_MODE_P (mode))
31797 	{
31798 	  *total = cost->fadd;
31799 	  return false;
31800 	}
31801       else if (FLOAT_MODE_P (mode))
31802 	{
31803 	  /* ??? SSE vector cost should be used here.  */
31804 	  *total = cost->fadd;
31805 	  return false;
31806 	}
31807       /* FALLTHRU */
31808 
31809     case AND:
31810     case IOR:
31811     case XOR:
31812       if (!TARGET_64BIT && mode == DImode)
31813 	{
31814 	  *total = (cost->add * 2
31815 		    + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31816 		       << (GET_MODE (XEXP (x, 0)) != DImode))
31817 		    + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31818 	               << (GET_MODE (XEXP (x, 1)) != DImode)));
31819 	  return true;
31820 	}
31821       /* FALLTHRU */
31822 
31823     case NEG:
31824       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31825 	{
31826 	  /* ??? SSE cost should be used here.  */
31827 	  *total = cost->fchs;
31828 	  return false;
31829 	}
31830       else if (X87_FLOAT_MODE_P (mode))
31831 	{
31832 	  *total = cost->fchs;
31833 	  return false;
31834 	}
31835       else if (FLOAT_MODE_P (mode))
31836 	{
31837 	  /* ??? SSE vector cost should be used here.  */
31838 	  *total = cost->fchs;
31839 	  return false;
31840 	}
31841       /* FALLTHRU */
31842 
31843     case NOT:
31844       if (!TARGET_64BIT && mode == DImode)
31845 	*total = cost->add * 2;
31846       else
31847 	*total = cost->add;
31848       return false;
31849 
31850     case COMPARE:
31851       if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31852 	  && XEXP (XEXP (x, 0), 1) == const1_rtx
31853 	  && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31854 	  && XEXP (x, 1) == const0_rtx)
31855 	{
31856 	  /* This kind of construct is implemented using test[bwl].
31857 	     Treat it as if we had an AND.  */
31858 	  *total = (cost->add
31859 		    + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31860 		    + rtx_cost (const1_rtx, outer_code, opno, speed));
31861 	  return true;
31862 	}
31863       return false;
31864 
31865     case FLOAT_EXTEND:
31866       if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31867 	*total = 0;
31868       return false;
31869 
31870     case ABS:
31871       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31872 	/* ??? SSE cost should be used here.  */
31873 	*total = cost->fabs;
31874       else if (X87_FLOAT_MODE_P (mode))
31875 	*total = cost->fabs;
31876       else if (FLOAT_MODE_P (mode))
31877 	/* ??? SSE vector cost should be used here.  */
31878 	*total = cost->fabs;
31879       return false;
31880 
31881     case SQRT:
31882       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31883 	/* ??? SSE cost should be used here.  */
31884 	*total = cost->fsqrt;
31885       else if (X87_FLOAT_MODE_P (mode))
31886 	*total = cost->fsqrt;
31887       else if (FLOAT_MODE_P (mode))
31888 	/* ??? SSE vector cost should be used here.  */
31889 	*total = cost->fsqrt;
31890       return false;
31891 
31892     case UNSPEC:
31893       if (XINT (x, 1) == UNSPEC_TP)
31894 	*total = 0;
31895       return false;
31896 
31897     case VEC_SELECT:
31898     case VEC_CONCAT:
31899     case VEC_MERGE:
31900     case VEC_DUPLICATE:
31901       /* ??? Assume all of these vector manipulation patterns are
31902 	 recognizable, in which case they all pretty much have the
31903 	 same cost.  */
31904      *total = COSTS_N_INSNS (1);
31905      return true;
31906 
31907     default:
31908       return false;
31909     }
31910 }
31911 
31912 #if TARGET_MACHO
31913 
31914 static int current_machopic_label_num;
31915 
31916 /* Given a symbol name and its associated stub, write out the
31917    definition of the stub.  */
31918 
31919 void
31920 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31921 {
31922   unsigned int length;
31923   char *binder_name, *symbol_name, lazy_ptr_name[32];
31924   int label = ++current_machopic_label_num;
31925 
31926   /* For 64-bit we shouldn't get here.  */
31927   gcc_assert (!TARGET_64BIT);
31928 
31929   /* Lose our funky encoding stuff so it doesn't contaminate the stub.  */
31930   symb = targetm.strip_name_encoding (symb);
31931 
31932   length = strlen (stub);
31933   binder_name = XALLOCAVEC (char, length + 32);
31934   GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31935 
31936   length = strlen (symb);
31937   symbol_name = XALLOCAVEC (char, length + 32);
31938   GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31939 
31940   sprintf (lazy_ptr_name, "L%d$lz", label);
31941 
31942   if (MACHOPIC_ATT_STUB)
31943     switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31944   else if (MACHOPIC_PURE)
31945     switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31946   else
31947     switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31948 
31949   fprintf (file, "%s:\n", stub);
31950   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31951 
31952   if (MACHOPIC_ATT_STUB)
31953     {
31954       fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31955     }
31956   else if (MACHOPIC_PURE)
31957     {
31958       /* PIC stub.  */
31959       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
31960       rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31961       output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
31962       fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31963 	       label, lazy_ptr_name, label);
31964       fprintf (file, "\tjmp\t*%%ecx\n");
31965     }
31966   else
31967     fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31968 
31969   /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31970      it needs no stub-binding-helper.  */
31971   if (MACHOPIC_ATT_STUB)
31972     return;
31973 
31974   fprintf (file, "%s:\n", binder_name);
31975 
31976   if (MACHOPIC_PURE)
31977     {
31978       fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31979       fprintf (file, "\tpushl\t%%ecx\n");
31980     }
31981   else
31982     fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31983 
31984   fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31985 
31986   /* N.B. Keep the correspondence of these
31987      'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31988      old-pic/new-pic/non-pic stubs; altering this will break
31989      compatibility with existing dylibs.  */
31990   if (MACHOPIC_PURE)
31991     {
31992       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
31993       switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31994     }
31995   else
31996     /* 16-byte -mdynamic-no-pic stub.  */
31997     switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31998 
31999   fprintf (file, "%s:\n", lazy_ptr_name);
32000   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32001   fprintf (file, ASM_LONG "%s\n", binder_name);
32002 }
32003 #endif /* TARGET_MACHO */
32004 
32005 /* Order the registers for register allocator.  */
32006 
32007 void
32008 x86_order_regs_for_local_alloc (void)
32009 {
32010    int pos = 0;
32011    int i;
32012 
32013    /* First allocate the local general purpose registers.  */
32014    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32015      if (GENERAL_REGNO_P (i) && call_used_regs[i])
32016 	reg_alloc_order [pos++] = i;
32017 
32018    /* Global general purpose registers.  */
32019    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32020      if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32021 	reg_alloc_order [pos++] = i;
32022 
32023    /* x87 registers come first in case we are doing FP math
32024       using them.  */
32025    if (!TARGET_SSE_MATH)
32026      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32027        reg_alloc_order [pos++] = i;
32028 
32029    /* SSE registers.  */
32030    for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32031      reg_alloc_order [pos++] = i;
32032    for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32033      reg_alloc_order [pos++] = i;
32034 
32035    /* x87 registers.  */
32036    if (TARGET_SSE_MATH)
32037      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32038        reg_alloc_order [pos++] = i;
32039 
32040    for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32041      reg_alloc_order [pos++] = i;
32042 
32043    /* Initialize the rest of the array, as we do not allocate some
32044       registers at all.  */
32045    while (pos < FIRST_PSEUDO_REGISTER)
32046      reg_alloc_order [pos++] = 0;
32047 }
32048 
32049 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32050    in struct attribute_spec.handler.  */
32051 static tree
32052 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32053 					      tree args,
32054 					      int flags ATTRIBUTE_UNUSED,
32055 					      bool *no_add_attrs)
32056 {
32057   if (TREE_CODE (*node) != FUNCTION_TYPE
32058       && TREE_CODE (*node) != METHOD_TYPE
32059       && TREE_CODE (*node) != FIELD_DECL
32060       && TREE_CODE (*node) != TYPE_DECL)
32061     {
32062       warning (OPT_Wattributes, "%qE attribute only applies to functions",
32063 	       name);
32064       *no_add_attrs = true;
32065       return NULL_TREE;
32066     }
32067   if (TARGET_64BIT)
32068     {
32069       warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32070 	       name);
32071       *no_add_attrs = true;
32072       return NULL_TREE;
32073     }
32074   if (is_attribute_p ("callee_pop_aggregate_return", name))
32075     {
32076       tree cst;
32077 
32078       cst = TREE_VALUE (args);
32079       if (TREE_CODE (cst) != INTEGER_CST)
32080 	{
32081 	  warning (OPT_Wattributes,
32082 		   "%qE attribute requires an integer constant argument",
32083 		   name);
32084 	  *no_add_attrs = true;
32085 	}
32086       else if (compare_tree_int (cst, 0) != 0
32087 	       && compare_tree_int (cst, 1) != 0)
32088 	{
32089 	  warning (OPT_Wattributes,
32090 		   "argument to %qE attribute is neither zero, nor one",
32091 		   name);
32092 	  *no_add_attrs = true;
32093 	}
32094 
32095       return NULL_TREE;
32096     }
32097 
32098   return NULL_TREE;
32099 }
32100 
32101 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
32102    struct attribute_spec.handler.  */
32103 static tree
32104 ix86_handle_abi_attribute (tree *node, tree name,
32105 			      tree args ATTRIBUTE_UNUSED,
32106 			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32107 {
32108   if (TREE_CODE (*node) != FUNCTION_TYPE
32109       && TREE_CODE (*node) != METHOD_TYPE
32110       && TREE_CODE (*node) != FIELD_DECL
32111       && TREE_CODE (*node) != TYPE_DECL)
32112     {
32113       warning (OPT_Wattributes, "%qE attribute only applies to functions",
32114 	       name);
32115       *no_add_attrs = true;
32116       return NULL_TREE;
32117     }
32118 
32119   /* Can combine regparm with all attributes but fastcall.  */
32120   if (is_attribute_p ("ms_abi", name))
32121     {
32122       if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32123         {
32124 	  error ("ms_abi and sysv_abi attributes are not compatible");
32125 	}
32126 
32127       return NULL_TREE;
32128     }
32129   else if (is_attribute_p ("sysv_abi", name))
32130     {
32131       if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32132         {
32133 	  error ("ms_abi and sysv_abi attributes are not compatible");
32134 	}
32135 
32136       return NULL_TREE;
32137     }
32138 
32139   return NULL_TREE;
32140 }
32141 
32142 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32143    struct attribute_spec.handler.  */
32144 static tree
32145 ix86_handle_struct_attribute (tree *node, tree name,
32146 			      tree args ATTRIBUTE_UNUSED,
32147 			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32148 {
32149   tree *type = NULL;
32150   if (DECL_P (*node))
32151     {
32152       if (TREE_CODE (*node) == TYPE_DECL)
32153 	type = &TREE_TYPE (*node);
32154     }
32155   else
32156     type = node;
32157 
32158   if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
32159     {
32160       warning (OPT_Wattributes, "%qE attribute ignored",
32161 	       name);
32162       *no_add_attrs = true;
32163     }
32164 
32165   else if ((is_attribute_p ("ms_struct", name)
32166 	    && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32167 	   || ((is_attribute_p ("gcc_struct", name)
32168 		&& lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32169     {
32170       warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32171                name);
32172       *no_add_attrs = true;
32173     }
32174 
32175   return NULL_TREE;
32176 }
32177 
32178 static tree
32179 ix86_handle_fndecl_attribute (tree *node, tree name,
32180                               tree args ATTRIBUTE_UNUSED,
32181                               int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32182 {
32183   if (TREE_CODE (*node) != FUNCTION_DECL)
32184     {
32185       warning (OPT_Wattributes, "%qE attribute only applies to functions",
32186                name);
32187       *no_add_attrs = true;
32188     }
32189   return NULL_TREE;
32190 }
32191 
32192 static bool
32193 ix86_ms_bitfield_layout_p (const_tree record_type)
32194 {
32195   return ((TARGET_MS_BITFIELD_LAYOUT
32196 	   && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32197           || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32198 }
32199 
32200 /* Returns an expression indicating where the this parameter is
32201    located on entry to the FUNCTION.  */
32202 
32203 static rtx
32204 x86_this_parameter (tree function)
32205 {
32206   tree type = TREE_TYPE (function);
32207   bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32208   int nregs;
32209 
32210   if (TARGET_64BIT)
32211     {
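      /* THIS is passed in the first integer argument register unless a hidden
	 aggregate-return pointer occupies it, in which case THIS moves to the
	 second one.  */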
32212       const int *parm_regs;
32213 
32214       if (ix86_function_type_abi (type) == MS_ABI)
32215         parm_regs = x86_64_ms_abi_int_parameter_registers;
32216       else
32217         parm_regs = x86_64_int_parameter_registers;
32218       return gen_rtx_REG (DImode, parm_regs[aggr]);
32219     }
32220 
32221   nregs = ix86_function_regparm (type, function);
32222 
32223   if (nregs > 0 && !stdarg_p (type))
32224     {
32225       int regno;
32226       unsigned int ccvt = ix86_get_callcvt (type);
32227 
32228       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32229 	regno = aggr ? DX_REG : CX_REG;
32230       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32231         {
32232 	  regno = CX_REG;
32233 	  if (aggr)
32234 	    return gen_rtx_MEM (SImode,
32235 				plus_constant (stack_pointer_rtx, 4));
32236 	}
32237       else
32238         {
32239 	  regno = AX_REG;
32240 	  if (aggr)
32241 	    {
32242 	      regno = DX_REG;
32243 	      if (nregs == 1)
32244 		return gen_rtx_MEM (SImode,
32245 				    plus_constant (stack_pointer_rtx, 4));
32246 	    }
32247 	}
32248       return gen_rtx_REG (SImode, regno);
32249     }
32250 
32251   return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32252 }
32253 
32254 /* Determine whether x86_output_mi_thunk can succeed.  */
32255 
32256 static bool
32257 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32258 			 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32259 			 HOST_WIDE_INT vcall_offset, const_tree function)
32260 {
32261   /* 64-bit can handle anything.  */
32262   if (TARGET_64BIT)
32263     return true;
32264 
32265   /* For 32-bit, everything's fine if we have one free register.  */
32266   if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32267     return true;
32268 
32269   /* Need a free register for vcall_offset.  */
32270   if (vcall_offset)
32271     return false;
32272 
32273   /* Need a free register for GOT references.  */
32274   if (flag_pic && !targetm.binds_local_p (function))
32275     return false;
32276 
32277   /* Otherwise ok.  */
32278   return true;
32279 }
32280 
32281 /* Output the assembler code for a thunk function.  THUNK_DECL is the
32282    declaration for the thunk function itself, FUNCTION is the decl for
32283    the target function.  DELTA is an immediate constant offset to be
32284    added to THIS.  If VCALL_OFFSET is nonzero, the word at
32285    *(*this + vcall_offset) should be added to THIS.  */
32286 
32287 static void
32288 x86_output_mi_thunk (FILE *file,
32289 		     tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32290 		     HOST_WIDE_INT vcall_offset, tree function)
32291 {
32292   rtx this_param = x86_this_parameter (function);
32293   rtx this_reg, tmp, fnaddr;
32294   unsigned int tmp_regno;
32295 
32296   if (TARGET_64BIT)
32297     tmp_regno = R10_REG;
32298   else
32299     {
32300       unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32301       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32302 	tmp_regno = AX_REG;
32303       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32304 	tmp_regno = DX_REG;
32305       else
32306 	tmp_regno = CX_REG;
32307     }
32308 
32309   emit_note (NOTE_INSN_PROLOGUE_END);
32310 
32311   /* If VCALL_OFFSET, we'll need THIS in a register.  Might as well
32312      pull it in now and let DELTA benefit.  */
32313   if (REG_P (this_param))
32314     this_reg = this_param;
32315   else if (vcall_offset)
32316     {
32317       /* Put the this parameter into %eax.  */
32318       this_reg = gen_rtx_REG (Pmode, AX_REG);
32319       emit_move_insn (this_reg, this_param);
32320     }
32321   else
32322     this_reg = NULL_RTX;
32323 
32324   /* Adjust the this parameter by a fixed constant.  */
32325   if (delta)
32326     {
32327       rtx delta_rtx = GEN_INT (delta);
32328       rtx delta_dst = this_reg ? this_reg : this_param;
32329 
32330       if (TARGET_64BIT)
32331 	{
32332 	  if (!x86_64_general_operand (delta_rtx, Pmode))
32333 	    {
32334 	      tmp = gen_rtx_REG (Pmode, tmp_regno);
32335 	      emit_move_insn (tmp, delta_rtx);
32336 	      delta_rtx = tmp;
32337 	    }
32338 	}
32339 
32340       ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32341     }
32342 
32343   /* Adjust the this parameter by a value stored in the vtable.  */
32344   if (vcall_offset)
32345     {
32346       rtx vcall_addr, vcall_mem, this_mem;
32347 
32348       tmp = gen_rtx_REG (Pmode, tmp_regno);
32349 
32350       this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32351       if (Pmode != ptr_mode)
32352 	this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32353       emit_move_insn (tmp, this_mem);
32354 
32355       /* Adjust the this parameter.  */
32356       vcall_addr = plus_constant (tmp, vcall_offset);
32357       if (TARGET_64BIT
32358 	  && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32359 	{
32360 	  rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32361 	  emit_move_insn (tmp2, GEN_INT (vcall_offset));
32362 	  vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32363 	}
32364 
32365       vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32366       if (Pmode != ptr_mode)
32367 	emit_insn (gen_addsi_1_zext (this_reg,
32368 				     gen_rtx_REG (ptr_mode,
32369 						  REGNO (this_reg)),
32370 				     vcall_mem));
32371       else
32372 	ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32373     }
32374 
32375   /* If necessary, drop THIS back to its stack slot.  */
32376   if (this_reg && this_reg != this_param)
32377     emit_move_insn (this_param, this_reg);
32378 
32379   fnaddr = XEXP (DECL_RTL (function), 0);
32380   if (TARGET_64BIT)
32381     {
32382       if (!flag_pic || targetm.binds_local_p (function)
32383 	  || cfun->machine->call_abi == MS_ABI)
32384 	;
32385       else
32386 	{
32387 	  tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32388 	  tmp = gen_rtx_CONST (Pmode, tmp);
32389 	  fnaddr = gen_rtx_MEM (Pmode, tmp);
32390 	}
32391     }
32392   else
32393     {
32394       if (!flag_pic || targetm.binds_local_p (function))
32395 	;
32396 #if TARGET_MACHO
32397       else if (TARGET_MACHO)
32398 	{
32399 	  fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32400 	  fnaddr = XEXP (fnaddr, 0);
32401 	}
32402 #endif /* TARGET_MACHO */
32403       else
32404 	{
32405 	  tmp = gen_rtx_REG (Pmode, CX_REG);
32406 	  output_set_got (tmp, NULL_RTX);
32407 
32408 	  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32409 	  fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32410 	  fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32411 	}
32412     }
32413 
32414   /* Our sibling call patterns do not allow memories, because we have no
32415      predicate that can distinguish between frame and non-frame memory.
32416      For our purposes here, we can get away with (ab)using a jump pattern,
32417      because we're going to do no optimization.  */
32418   if (MEM_P (fnaddr))
32419     emit_jump_insn (gen_indirect_jump (fnaddr));
32420   else
32421     {
32422       if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
32423 	fnaddr = legitimize_pic_address (fnaddr,
32424 					 gen_rtx_REG (Pmode, tmp_regno));
32425 
32426       if (!sibcall_insn_operand (fnaddr, Pmode))
32427 	{
32428 	  tmp = gen_rtx_REG (Pmode, tmp_regno);
32429 	  if (GET_MODE (fnaddr) != Pmode)
32430 	    fnaddr = gen_rtx_ZERO_EXTEND (Pmode, fnaddr);
32431 	  emit_move_insn (tmp, fnaddr);
32432 	  fnaddr = tmp;
32433 	}
32434 
32435       tmp = gen_rtx_MEM (QImode, fnaddr);
32436       tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32437       tmp = emit_call_insn (tmp);
32438       SIBLING_CALL_P (tmp) = 1;
32439     }
32440   emit_barrier ();
32441 
32442   /* Emit just enough of rest_of_compilation to get the insns emitted.
32443      Note that use_thunk calls assemble_start_function et al.  */
32444   tmp = get_insns ();
32445   insn_locators_alloc ();
32446   shorten_branches (tmp);
32447   final_start_function (tmp, file, 1);
32448   final (tmp, file, 1);
32449   final_end_function ();
32450 }
32451 
32452 static void
32453 x86_file_start (void)
32454 {
32455   default_file_start ();
32456 #if TARGET_MACHO
32457   darwin_file_start ();
32458 #endif
32459   if (X86_FILE_START_VERSION_DIRECTIVE)
32460     fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32461   if (X86_FILE_START_FLTUSED)
32462     fputs ("\t.global\t__fltused\n", asm_out_file);
32463   if (ix86_asm_dialect == ASM_INTEL)
32464     fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32465 }
32466 
32467 int
32468 x86_field_alignment (tree field, int computed)
32469 {
32470   enum machine_mode mode;
32471   tree type = TREE_TYPE (field);
32472 
32473   if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32474     return computed;
32475   mode = TYPE_MODE (strip_array_types (type));
32476   if (mode == DFmode || mode == DCmode
32477       || GET_MODE_CLASS (mode) == MODE_INT
32478       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32479     return MIN (32, computed);
32480   return computed;
32481 }
32482 
32483 /* Output assembler code to FILE to increment profiler label # LABELNO
32484    for profiling a function entry.  */
32485 void
32486 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32487 {
32488   const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32489 					 : MCOUNT_NAME);
32490 
32491   if (TARGET_64BIT)
32492     {
32493 #ifndef NO_PROFILE_COUNTERS
32494       fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32495 #endif
32496 
32497       if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32498 	fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32499       else
32500 	fprintf (file, "\tcall\t%s\n", mcount_name);
32501     }
32502   else if (flag_pic)
32503     {
32504 #ifndef NO_PROFILE_COUNTERS
32505       fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32506 	       LPREFIX, labelno);
32507 #endif
32508       fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32509     }
32510   else
32511     {
32512 #ifndef NO_PROFILE_COUNTERS
32513       fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32514 	       LPREFIX, labelno);
32515 #endif
32516       fprintf (file, "\tcall\t%s\n", mcount_name);
32517     }
32518 }
32519 
32520 /* We don't have exact information about the insn sizes, but we may assume
32521    quite safely that we are informed about all 1-byte insns and memory
32522    address sizes.  This is enough to eliminate unnecessary padding in
32523    99% of cases.  */
32524 
32525 static int
32526 min_insn_size (rtx insn)
32527 {
32528   int l = 0, len;
32529 
32530   if (!INSN_P (insn) || !active_insn_p (insn))
32531     return 0;
32532 
32533   /* Discard the alignments we've emitted, as well as jump table data.  */
32534   if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32535       && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32536     return 0;
32537   if (JUMP_TABLE_DATA_P (insn))
32538     return 0;
32539 
32540   /* Important case - calls are always 5 bytes.
32541      It is common to have many calls in a row.  */
32542   if (CALL_P (insn)
32543       && symbolic_reference_mentioned_p (PATTERN (insn))
32544       && !SIBLING_CALL_P (insn))
32545     return 5;
32546   len = get_attr_length (insn);
32547   if (len <= 1)
32548     return 1;
32549 
32550   /* For normal instructions we rely on get_attr_length being exact,
32551      with a few exceptions.  */
32552   if (!JUMP_P (insn))
32553     {
32554       enum attr_type type = get_attr_type (insn);
32555 
32556       switch (type)
32557 	{
32558 	case TYPE_MULTI:
32559 	  if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32560 	      || asm_noperands (PATTERN (insn)) >= 0)
32561 	    return 0;
32562 	  break;
32563 	case TYPE_OTHER:
32564 	case TYPE_FCMP:
32565 	  break;
32566 	default:
32567 	  /* Otherwise trust get_attr_length.  */
32568 	  return len;
32569 	}
32570 
32571       l = get_attr_length_address (insn);
32572       if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32573 	l = 4;
32574     }
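  /* Assume one opcode byte plus the address bytes when we have an address
     length; otherwise assume a minimum of two bytes.  */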
32575   if (l)
32576     return 1+l;
32577   else
32578     return 2;
32579 }
32580 
32581 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32582 
32583 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
32584    16-byte window.  */
32585 
32586 static void
32587 ix86_avoid_jump_mispredicts (void)
32588 {
32589   rtx insn, start = get_insns ();
32590   int nbytes = 0, njumps = 0;
32591   int isjump = 0;
32592 
32593   /* Look for all minimal intervals of instructions containing 4 jumps.
32594      The intervals are bounded by START and INSN.  NBYTES is the total
32595      size of instructions in the interval including INSN and not including
32596      START.  When NBYTES is smaller than 16, it is possible that the end
32597      of START and INSN end up in the same 16-byte page.
32598 
32599      The smallest offset in the page at which INSN can start is the case
32600      where START ends at offset 0.  The offset of INSN is then
32601      NBYTES - sizeof (INSN).  We add a p2align to the 16-byte window with
32602      maxskip 15 - NBYTES + sizeof (INSN).  */
32603   for (insn = start; insn; insn = NEXT_INSN (insn))
32604     {
32605       int min_size;
32606 
32607       if (LABEL_P (insn))
32608 	{
32609 	  int align = label_to_alignment (insn);
32610 	  int max_skip = label_to_max_skip (insn);
32611 
32612 	  if (max_skip > 15)
32613 	    max_skip = 15;
32614 	  /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32615 	     already in the current 16 byte page, because otherwise
32616 	     ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32617 	     bytes to reach 16 byte boundary.  */
32618 	  if (align <= 0
32619 	      || (align <= 3 && max_skip != (1 << align) - 1))
32620 	    max_skip = 0;
32621 	  if (dump_file)
32622 	    fprintf (dump_file, "Label %i with max_skip %i\n",
32623 		     INSN_UID (insn), max_skip);
32624 	  if (max_skip)
32625 	    {
32626 	      while (nbytes + max_skip >= 16)
32627 		{
32628 		  start = NEXT_INSN (start);
32629 		  if ((JUMP_P (start)
32630 		       && GET_CODE (PATTERN (start)) != ADDR_VEC
32631 		       && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32632 		      || CALL_P (start))
32633 		    njumps--, isjump = 1;
32634 		  else
32635 		    isjump = 0;
32636 		  nbytes -= min_insn_size (start);
32637 		}
32638 	    }
32639 	  continue;
32640 	}
32641 
32642       min_size = min_insn_size (insn);
32643       nbytes += min_size;
32644       if (dump_file)
32645 	fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32646 		 INSN_UID (insn), min_size);
32647       if ((JUMP_P (insn)
32648 	   && GET_CODE (PATTERN (insn)) != ADDR_VEC
32649 	   && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32650 	  || CALL_P (insn))
32651 	njumps++;
32652       else
32653 	continue;
32654 
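      /* Too many jumps: advance START until the window again contains at most
	 three, remembering whether the last instruction dropped from the window
	 was itself a jump or call.  */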
32655       while (njumps > 3)
32656 	{
32657 	  start = NEXT_INSN (start);
32658 	  if ((JUMP_P (start)
32659 	       && GET_CODE (PATTERN (start)) != ADDR_VEC
32660 	       && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32661 	      || CALL_P (start))
32662 	    njumps--, isjump = 1;
32663 	  else
32664 	    isjump = 0;
32665 	  nbytes -= min_insn_size (start);
32666 	}
32667       gcc_assert (njumps >= 0);
32668       if (dump_file)
32669         fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32670 		 INSN_UID (start), INSN_UID (insn), nbytes);
32671 
32672       if (njumps == 3 && isjump && nbytes < 16)
32673 	{
32674 	  int padsize = 15 - nbytes + min_insn_size (insn);
32675 
32676 	  if (dump_file)
32677 	    fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32678 		     INSN_UID (insn), padsize);
32679           emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32680 	}
32681     }
32682 }
32683 #endif
32684 
32685 /* AMD Athlon works faster
32686    when RET is not the destination of a conditional jump and is not directly
32687    preceded by another jump instruction.  We avoid the penalty by inserting a
32688    NOP just before the RET instruction in such cases.  */
32689 static void
32690 ix86_pad_returns (void)
32691 {
32692   edge e;
32693   edge_iterator ei;
32694 
32695   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32696     {
32697       basic_block bb = e->src;
32698       rtx ret = BB_END (bb);
32699       rtx prev;
32700       bool replace = false;
32701 
32702       if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32703 	  || optimize_bb_for_size_p (bb))
32704 	continue;
32705       for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32706 	if (active_insn_p (prev) || LABEL_P (prev))
32707 	  break;
32708       if (prev && LABEL_P (prev))
32709 	{
32710 	  edge e;
32711 	  edge_iterator ei;
32712 
32713 	  FOR_EACH_EDGE (e, ei, bb->preds)
32714 	    if (EDGE_FREQUENCY (e) && e->src->index >= 0
32715 		&& !(e->flags & EDGE_FALLTHRU))
32716 	      replace = true;
32717 	}
32718       if (!replace)
32719 	{
32720 	  prev = prev_active_insn (ret);
32721 	  if (prev
32722 	      && ((JUMP_P (prev) && any_condjump_p (prev))
32723 		  || CALL_P (prev)))
32724 	    replace = true;
32725 	  /* Empty functions get branch mispredict even when
32726 	     the jump destination is not visible to us.  */
32727 	  if (!prev && !optimize_function_for_size_p (cfun))
32728 	    replace = true;
32729 	}
32730       if (replace)
32731 	{
32732 	  emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32733 	  delete_insn (ret);
32734 	}
32735     }
32736 }
32737 
32738 /* Count the minimum number of instructions in BB.  Return 4 if the
32739    number of instructions >= 4.  */
32740 
32741 static int
32742 ix86_count_insn_bb (basic_block bb)
32743 {
32744   rtx insn;
32745   int insn_count = 0;
32746 
32747   /* Count number of instructions in this block.  Return 4 if the number
32748      of instructions >= 4.  */
32749   FOR_BB_INSNS (bb, insn)
32750     {
32751       /* This only happens in exit blocks.  */
32752       if (JUMP_P (insn)
32753 	  && ANY_RETURN_P (PATTERN (insn)))
32754 	break;
32755 
32756       if (NONDEBUG_INSN_P (insn)
32757 	  && GET_CODE (PATTERN (insn)) != USE
32758 	  && GET_CODE (PATTERN (insn)) != CLOBBER)
32759 	{
32760 	  insn_count++;
32761 	  if (insn_count >= 4)
32762 	    return insn_count;
32763 	}
32764     }
32765 
32766   return insn_count;
32767 }
32768 
32769 
32770 /* Count the minimum number of instructions in a code path ending in BB.
32771    Return 4 if the number of instructions >= 4.  */
32772 
32773 static int
32774 ix86_count_insn (basic_block bb)
32775 {
32776   edge e;
32777   edge_iterator ei;
32778   int min_prev_count;
32779 
32780   /* Only bother counting instructions along paths with no
32781      more than 2 basic blocks between entry and exit.  Given
32782      that BB has an edge to exit, determine if a predecessor
32783      of BB has an edge from entry.  If so, compute the number
32784      of instructions in the predecessor block.  If there
32785      happen to be multiple such blocks, compute the minimum.  */
32786   min_prev_count = 4;
32787   FOR_EACH_EDGE (e, ei, bb->preds)
32788     {
32789       edge prev_e;
32790       edge_iterator prev_ei;
32791 
32792       if (e->src == ENTRY_BLOCK_PTR)
32793 	{
32794 	  min_prev_count = 0;
32795 	  break;
32796 	}
32797       FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32798 	{
32799 	  if (prev_e->src == ENTRY_BLOCK_PTR)
32800 	    {
32801 	      int count = ix86_count_insn_bb (e->src);
32802 	      if (count < min_prev_count)
32803 		min_prev_count = count;
32804 	      break;
32805 	    }
32806 	}
32807     }
32808 
32809   if (min_prev_count < 4)
32810     min_prev_count += ix86_count_insn_bb (bb);
32811 
32812   return min_prev_count;
32813 }
32814 
32815 /* Pad short functions to 4 instructions.  */
32816 
32817 static void
32818 ix86_pad_short_function (void)
32819 {
32820   edge e;
32821   edge_iterator ei;
32822 
32823   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32824     {
32825       rtx ret = BB_END (e->src);
32826       if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32827 	{
32828 	  int insn_count = ix86_count_insn (e->src);
32829 
32830 	  /* Pad short function.  */
32831 	  if (insn_count < 4)
32832 	    {
32833 	      rtx insn = ret;
32834 
32835 	      /* Find epilogue.  */
32836 	      while (insn
32837 		     && (!NOTE_P (insn)
32838 			 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32839 		insn = PREV_INSN (insn);
32840 
32841 	      if (!insn)
32842 		insn = ret;
32843 
32844 	      /* Two NOPs count as one instruction.  */
32845 	      insn_count = 2 * (4 - insn_count);
32846 	      emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32847 	    }
32848 	}
32849     }
32850 }
32851 
32852 /* Implement machine specific optimizations.  We implement padding of returns
32853    for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window.  */
32854 static void
32855 ix86_reorg (void)
32856 {
32857   /* We are freeing block_for_insn in the toplev to keep compatibility
32858      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
32859   compute_bb_for_insn ();
32860 
32861   /* Run the vzeroupper optimization if needed.  */
32862   if (TARGET_VZEROUPPER)
32863     move_or_delete_vzeroupper ();
32864 
32865   if (optimize && optimize_function_for_speed_p (cfun))
32866     {
32867       if (TARGET_PAD_SHORT_FUNCTION)
32868 	ix86_pad_short_function ();
32869       else if (TARGET_PAD_RETURNS)
32870 	ix86_pad_returns ();
32871 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32872       if (TARGET_FOUR_JUMP_LIMIT)
32873 	ix86_avoid_jump_mispredicts ();
32874 #endif
32875     }
32876 }
32877 
32878 /* Return nonzero when a QImode register that must be represented via a REX
32879    prefix is used.  */
32880 bool
32881 x86_extended_QIreg_mentioned_p (rtx insn)
32882 {
32883   int i;
32884   extract_insn_cached (insn);
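  /* %al, %bl, %cl and %dl are addressable without a REX prefix; the byte
     registers of %esi, %edi, %ebp, %esp and of %r8-%r15 are not.  */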
32885   for (i = 0; i < recog_data.n_operands; i++)
32886     if (REG_P (recog_data.operand[i])
32887 	&& REGNO (recog_data.operand[i]) > BX_REG)
32888        return true;
32889   return false;
32890 }
32891 
32892 /* Return nonzero when P points to a register encoded via a REX prefix.
32893    Called via for_each_rtx.  */
32894 static int
32895 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32896 {
32897    unsigned int regno;
32898    if (!REG_P (*p))
32899      return 0;
32900    regno = REGNO (*p);
32901    return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32902 }
32903 
32904 /* Return true when INSN mentions a register that must be encoded using a
32905    REX prefix.  */
32906 bool
32907 x86_extended_reg_mentioned_p (rtx insn)
32908 {
32909   return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32910 		       extended_reg_mentioned_1, NULL);
32911 }
32912 
32913 /* If profitable, negate (without causing overflow) integer constant
32914    of mode MODE at location LOC.  Return true in this case.  */
32915 bool
32916 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32917 {
32918   HOST_WIDE_INT val;
32919 
32920   if (!CONST_INT_P (*loc))
32921     return false;
32922 
32923   switch (mode)
32924     {
32925     case DImode:
32926       /* DImode x86_64 constants must fit in 32 bits.  */
32927       gcc_assert (x86_64_immediate_operand (*loc, mode));
32928 
32929       mode = SImode;
32930       break;
32931 
32932     case SImode:
32933     case HImode:
32934     case QImode:
32935       break;
32936 
32937     default:
32938       gcc_unreachable ();
32939     }
32940 
32941   /* Avoid overflows.  */
32942   if (mode_signbit_p (mode, *loc))
32943     return false;
32944 
32945   val = INTVAL (*loc);
32946 
32947   /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32948      Exceptions: -128 encodes smaller than 128, so swap sign and op.  */
32949   if ((val < 0 && val != -128)
32950       || val == 128)
32951     {
32952       *loc = GEN_INT (-val);
32953       return true;
32954     }
32955 
32956   return false;
32957 }
32958 
32959 /* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
32960    optabs would emit if we didn't have TFmode patterns.  */
32961 
32962 void
32963 x86_emit_floatuns (rtx operands[2])
32964 {
32965   rtx neglab, donelab, i0, i1, f0, in, out;
32966   enum machine_mode mode, inmode;
32967 
32968   inmode = GET_MODE (operands[1]);
32969   gcc_assert (inmode == SImode || inmode == DImode);
32970 
32971   out = operands[0];
32972   in = force_reg (inmode, operands[1]);
32973   mode = GET_MODE (out);
32974   neglab = gen_label_rtx ();
32975   donelab = gen_label_rtx ();
32976   f0 = gen_reg_rtx (mode);
32977 
32978   emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32979 
32980   expand_float (out, in, 0);
32981 
32982   emit_jump_insn (gen_jump (donelab));
32983   emit_barrier ();
32984 
32985   emit_label (neglab);
32986 
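  /* The input has its sign bit set, so a plain signed conversion would be
     wrong.  Halve the value while folding the low bit back in (which keeps
     rounding correct), convert, and double the result:
     out = 2 * (FP) ((in >> 1) | (in & 1)).  */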
32987   i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32988 			    1, OPTAB_DIRECT);
32989   i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32990 			    1, OPTAB_DIRECT);
32991   i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32992 
32993   expand_float (f0, i0, 0);
32994 
32995   emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32996 
32997   emit_label (donelab);
32998 }
32999 
33000 /* AVX2 supports 32-byte integer vector operations,
33001    thus the longest vector we are faced with is V32QImode.  */
33002 #define MAX_VECT_LEN	32
33003 
33004 struct expand_vec_perm_d
33005 {
33006   rtx target, op0, op1;
33007   unsigned char perm[MAX_VECT_LEN];
33008   enum machine_mode vmode;
33009   unsigned char nelt;
33010   bool testing_p;
33011 };
33012 
33013 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33014 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33015 
33016 /* Get a vector mode of the same size as the original but with elements
33017    twice as wide.  This is only guaranteed to apply to integral vectors.  */
33018 
33019 static inline enum machine_mode
33020 get_mode_wider_vector (enum machine_mode o)
33021 {
33022   /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
33023   enum machine_mode n = GET_MODE_WIDER_MODE (o);
33024   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33025   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33026   return n;
33027 }
33028 
33029 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
33030    with all elements equal to VAR.  Return true if successful.  */
33031 
33032 static bool
33033 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33034 				   rtx target, rtx val)
33035 {
33036   bool ok;
33037 
33038   switch (mode)
33039     {
33040     case V2SImode:
33041     case V2SFmode:
33042       if (!mmx_ok)
33043 	return false;
33044       /* FALLTHRU */
33045 
33046     case V4DFmode:
33047     case V4DImode:
33048     case V8SFmode:
33049     case V8SImode:
33050     case V2DFmode:
33051     case V2DImode:
33052     case V4SFmode:
33053     case V4SImode:
33054       {
33055 	rtx insn, dup;
33056 
33057 	/* First attempt to recognize VAL as-is.  */
33058 	dup = gen_rtx_VEC_DUPLICATE (mode, val);
33059 	insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33060 	if (recog_memoized (insn) < 0)
33061 	  {
33062 	    rtx seq;
33063 	    /* If that fails, force VAL into a register.  */
33064 
33065 	    start_sequence ();
33066 	    XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33067 	    seq = get_insns ();
33068 	    end_sequence ();
33069 	    if (seq)
33070 	      emit_insn_before (seq, insn);
33071 
33072 	    ok = recog_memoized (insn) >= 0;
33073 	    gcc_assert (ok);
33074 	  }
33075       }
33076       return true;
33077 
33078     case V4HImode:
33079       if (!mmx_ok)
33080 	return false;
33081       if (TARGET_SSE || TARGET_3DNOW_A)
33082 	{
33083 	  rtx x;
33084 
33085 	  val = gen_lowpart (SImode, val);
33086 	  x = gen_rtx_TRUNCATE (HImode, val);
33087 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
33088 	  emit_insn (gen_rtx_SET (VOIDmode, target, x));
33089 	  return true;
33090 	}
33091       goto widen;
33092 
33093     case V8QImode:
33094       if (!mmx_ok)
33095 	return false;
33096       goto widen;
33097 
33098     case V8HImode:
33099       if (TARGET_SSE2)
33100 	{
33101 	  struct expand_vec_perm_d dperm;
33102 	  rtx tmp1, tmp2;
33103 
33104 	permute:
33105 	  memset (&dperm, 0, sizeof (dperm));
33106 	  dperm.target = target;
33107 	  dperm.vmode = mode;
33108 	  dperm.nelt = GET_MODE_NUNITS (mode);
33109 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33110 
33111 	  /* Extend to SImode using a paradoxical SUBREG.  */
33112 	  tmp1 = gen_reg_rtx (SImode);
33113 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
33114 
33115 	  /* Insert the SImode value as low element of a V4SImode vector. */
33116 	  tmp2 = gen_lowpart (V4SImode, dperm.op0);
33117 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33118 
33119 	  ok = (expand_vec_perm_1 (&dperm)
33120 		|| expand_vec_perm_broadcast_1 (&dperm));
33121 	  gcc_assert (ok);
33122 	  return ok;
33123 	}
33124       goto widen;
33125 
33126     case V16QImode:
33127       if (TARGET_SSE2)
33128 	goto permute;
33129       goto widen;
33130 
33131     widen:
33132       /* Replicate the value once into the next wider mode and recurse.  */
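	/* E.g. a V8QImode splat of byte B becomes a V4HImode splat of the
	   16-bit value (B << 8) | B.  */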
33133       {
33134 	enum machine_mode smode, wsmode, wvmode;
33135 	rtx x;
33136 
33137 	smode = GET_MODE_INNER (mode);
33138 	wvmode = get_mode_wider_vector (mode);
33139 	wsmode = GET_MODE_INNER (wvmode);
33140 
33141 	val = convert_modes (wsmode, smode, val, true);
33142 	x = expand_simple_binop (wsmode, ASHIFT, val,
33143 				 GEN_INT (GET_MODE_BITSIZE (smode)),
33144 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33145 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33146 
33147 	x = gen_lowpart (wvmode, target);
33148 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33149 	gcc_assert (ok);
33150 	return ok;
33151       }
33152 
33153     case V16HImode:
33154     case V32QImode:
33155       {
33156 	enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33157 	rtx x = gen_reg_rtx (hvmode);
33158 
33159 	ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33160 	gcc_assert (ok);
33161 
33162 	x = gen_rtx_VEC_CONCAT (mode, x, x);
33163 	emit_insn (gen_rtx_SET (VOIDmode, target, x));
33164       }
33165       return true;
33166 
33167     default:
33168       return false;
33169     }
33170 }
33171 
33172 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
33173    whose ONE_VAR element is VAR, and other elements are zero.  Return true
33174    if successful.  */
33175 
33176 static bool
33177 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33178 				     rtx target, rtx var, int one_var)
33179 {
33180   enum machine_mode vsimode;
33181   rtx new_target;
33182   rtx x, tmp;
33183   bool use_vector_set = false;
33184 
33185   switch (mode)
33186     {
33187     case V2DImode:
33188       /* For SSE4.1, we normally use vector set.  But if the second
33189 	 element is zero and inter-unit moves are OK, we use movq
33190 	 instead.  */
33191       use_vector_set = (TARGET_64BIT
33192 			&& TARGET_SSE4_1
33193 			&& !(TARGET_INTER_UNIT_MOVES
33194 			     && one_var == 0));
33195       break;
33196     case V16QImode:
33197     case V4SImode:
33198     case V4SFmode:
33199       use_vector_set = TARGET_SSE4_1;
33200       break;
33201     case V8HImode:
33202       use_vector_set = TARGET_SSE2;
33203       break;
33204     case V4HImode:
33205       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33206       break;
33207     case V32QImode:
33208     case V16HImode:
33209     case V8SImode:
33210     case V8SFmode:
33211     case V4DFmode:
33212       use_vector_set = TARGET_AVX;
33213       break;
33214     case V4DImode:
33215       /* Use ix86_expand_vector_set in 64bit mode only.  */
33216       use_vector_set = TARGET_AVX && TARGET_64BIT;
33217       break;
33218     default:
33219       break;
33220     }
33221 
33222   if (use_vector_set)
33223     {
33224       emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33225       var = force_reg (GET_MODE_INNER (mode), var);
33226       ix86_expand_vector_set (mmx_ok, target, var, one_var);
33227       return true;
33228     }
33229 
33230   switch (mode)
33231     {
33232     case V2SFmode:
33233     case V2SImode:
33234       if (!mmx_ok)
33235 	return false;
33236       /* FALLTHRU */
33237 
33238     case V2DFmode:
33239     case V2DImode:
33240       if (one_var != 0)
33241 	return false;
33242       var = force_reg (GET_MODE_INNER (mode), var);
33243       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33244       emit_insn (gen_rtx_SET (VOIDmode, target, x));
33245       return true;
33246 
33247     case V4SFmode:
33248     case V4SImode:
33249       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33250 	new_target = gen_reg_rtx (mode);
33251       else
33252 	new_target = target;
33253       var = force_reg (GET_MODE_INNER (mode), var);
33254       x = gen_rtx_VEC_DUPLICATE (mode, var);
33255       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33256       emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33257       if (one_var != 0)
33258 	{
33259 	  /* We need to shuffle the value to the correct position, so
33260 	     create a new pseudo to store the intermediate result.  */
33261 
33262 	  /* With SSE2, we can use the integer shuffle insns.  */
33263 	  if (mode != V4SFmode && TARGET_SSE2)
33264 	    {
33265 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33266 					    const1_rtx,
33267 					    GEN_INT (one_var == 1 ? 0 : 1),
33268 					    GEN_INT (one_var == 2 ? 0 : 1),
33269 					    GEN_INT (one_var == 3 ? 0 : 1)));
33270 	      if (target != new_target)
33271 		emit_move_insn (target, new_target);
33272 	      return true;
33273 	    }
33274 
33275 	  /* Otherwise convert the intermediate result to V4SFmode and
33276 	     use the SSE1 shuffle instructions.  */
33277 	  if (mode != V4SFmode)
33278 	    {
33279 	      tmp = gen_reg_rtx (V4SFmode);
33280 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33281 	    }
33282 	  else
33283 	    tmp = new_target;
33284 
33285 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33286 				       const1_rtx,
33287 				       GEN_INT (one_var == 1 ? 0 : 1),
33288 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
33289 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33290 
33291 	  if (mode != V4SFmode)
33292 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33293 	  else if (tmp != target)
33294 	    emit_move_insn (target, tmp);
33295 	}
33296       else if (target != new_target)
33297 	emit_move_insn (target, new_target);
33298       return true;
33299 
33300     case V8HImode:
33301     case V16QImode:
33302       vsimode = V4SImode;
33303       goto widen;
33304     case V4HImode:
33305     case V8QImode:
33306       if (!mmx_ok)
33307 	return false;
33308       vsimode = V2SImode;
33309       goto widen;
33310     widen:
33311       if (one_var != 0)
33312 	return false;
33313 
33314       /* Zero extend the variable element to SImode and recurse.  */
33315       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33316 
33317       x = gen_reg_rtx (vsimode);
33318       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33319 						var, one_var))
33320 	gcc_unreachable ();
33321 
33322       emit_move_insn (target, gen_lowpart (mode, x));
33323       return true;
33324 
33325     default:
33326       return false;
33327     }
33328 }
33329 
33330 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
33331    consisting of the values in VALS.  It is known that all elements
33332    except ONE_VAR are constants.  Return true if successful.  */
33333 
33334 static bool
33335 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33336 				 rtx target, rtx vals, int one_var)
33337 {
33338   rtx var = XVECEXP (vals, 0, one_var);
33339   enum machine_mode wmode;
33340   rtx const_vec, x;
33341 
33342   const_vec = copy_rtx (vals);
33343   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33344   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33345 
33346   switch (mode)
33347     {
33348     case V2DFmode:
33349     case V2DImode:
33350     case V2SFmode:
33351     case V2SImode:
33352       /* For the two element vectors, it's just as easy to use
33353 	 the general case.  */
33354       return false;
33355 
33356     case V4DImode:
33357       /* Use ix86_expand_vector_set in 64bit mode only.  */
33358       if (!TARGET_64BIT)
33359 	return false;
33360     case V4DFmode:
33361     case V8SFmode:
33362     case V8SImode:
33363     case V16HImode:
33364     case V32QImode:
33365     case V4SFmode:
33366     case V4SImode:
33367     case V8HImode:
33368     case V4HImode:
33369       break;
33370 
33371     case V16QImode:
33372       if (TARGET_SSE4_1)
33373 	break;
33374       wmode = V8HImode;
33375       goto widen;
33376     case V8QImode:
33377       wmode = V4HImode;
33378       goto widen;
33379     widen:
33380       /* There's no way to set one QImode entry easily.  Combine
33381 	 the variable value with its adjacent constant value, and
33382 	 promote to an HImode set.  */
33383       x = XVECEXP (vals, 0, one_var ^ 1);
33384       if (one_var & 1)
33385 	{
33386 	  var = convert_modes (HImode, QImode, var, true);
33387 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33388 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
33389 	  x = GEN_INT (INTVAL (x) & 0xff);
33390 	}
33391       else
33392 	{
33393 	  var = convert_modes (HImode, QImode, var, true);
33394 	  x = gen_int_mode (INTVAL (x) << 8, HImode);
33395 	}
33396       if (x != const0_rtx)
33397 	var = expand_simple_binop (HImode, IOR, var, x, var,
33398 				   1, OPTAB_LIB_WIDEN);
33399 
33400       x = gen_reg_rtx (wmode);
33401       emit_move_insn (x, gen_lowpart (wmode, const_vec));
33402       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33403 
33404       emit_move_insn (target, gen_lowpart (mode, x));
33405       return true;
33406 
33407     default:
33408       return false;
33409     }
33410 
33411   emit_move_insn (target, const_vec);
33412   ix86_expand_vector_set (mmx_ok, target, var, one_var);
33413   return true;
33414 }
33415 
33416 /* A subroutine of ix86_expand_vector_init_general.  Use vector
33417    concatenate to handle the most general case: all values variable,
33418    and none identical.  */
33419 
33420 static void
33421 ix86_expand_vector_init_concat (enum machine_mode mode,
33422 				rtx target, rtx *ops, int n)
33423 {
33424   enum machine_mode cmode, hmode = VOIDmode;
33425   rtx first[8], second[4];
33426   rtvec v;
33427   int i, j;
33428 
33429   switch (n)
33430     {
33431     case 2:
33432       switch (mode)
33433 	{
33434 	case V8SImode:
33435 	  cmode = V4SImode;
33436 	  break;
33437 	case V8SFmode:
33438 	  cmode = V4SFmode;
33439 	  break;
33440 	case V4DImode:
33441 	  cmode = V2DImode;
33442 	  break;
33443 	case V4DFmode:
33444 	  cmode = V2DFmode;
33445 	  break;
33446 	case V4SImode:
33447 	  cmode = V2SImode;
33448 	  break;
33449 	case V4SFmode:
33450 	  cmode = V2SFmode;
33451 	  break;
33452 	case V2DImode:
33453 	  cmode = DImode;
33454 	  break;
33455 	case V2SImode:
33456 	  cmode = SImode;
33457 	  break;
33458 	case V2DFmode:
33459 	  cmode = DFmode;
33460 	  break;
33461 	case V2SFmode:
33462 	  cmode = SFmode;
33463 	  break;
33464 	default:
33465 	  gcc_unreachable ();
33466 	}
33467 
33468       if (!register_operand (ops[1], cmode))
33469 	ops[1] = force_reg (cmode, ops[1]);
33470       if (!register_operand (ops[0], cmode))
33471 	ops[0] = force_reg (cmode, ops[0]);
33472       emit_insn (gen_rtx_SET (VOIDmode, target,
33473 			      gen_rtx_VEC_CONCAT (mode, ops[0],
33474 						  ops[1])));
33475       break;
33476 
33477     case 4:
33478       switch (mode)
33479 	{
33480 	case V4DImode:
33481 	  cmode = V2DImode;
33482 	  break;
33483 	case V4DFmode:
33484 	  cmode = V2DFmode;
33485 	  break;
33486 	case V4SImode:
33487 	  cmode = V2SImode;
33488 	  break;
33489 	case V4SFmode:
33490 	  cmode = V2SFmode;
33491 	  break;
33492 	default:
33493 	  gcc_unreachable ();
33494 	}
33495       goto half;
33496 
33497     case 8:
33498       switch (mode)
33499 	{
33500 	case V8SImode:
33501 	  cmode = V2SImode;
33502 	  hmode = V4SImode;
33503 	  break;
33504 	case V8SFmode:
33505 	  cmode = V2SFmode;
33506 	  hmode = V4SFmode;
33507 	  break;
33508 	default:
33509 	  gcc_unreachable ();
33510 	}
33511       goto half;
33512 
33513 half:
33514       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
33515       i = n - 1;
33516       j = (n >> 1) - 1;
33517       for (; i > 0; i -= 2, j--)
33518 	{
33519 	  first[j] = gen_reg_rtx (cmode);
33520 	  v = gen_rtvec (2, ops[i - 1], ops[i]);
33521 	  ix86_expand_vector_init (false, first[j],
33522 				   gen_rtx_PARALLEL (cmode, v));
33523 	}
33524 
33525       n >>= 1;
33526       if (n > 2)
33527 	{
33528 	  gcc_assert (hmode != VOIDmode);
33529 	  for (i = j = 0; i < n; i += 2, j++)
33530 	    {
33531 	      second[j] = gen_reg_rtx (hmode);
33532 	      ix86_expand_vector_init_concat (hmode, second [j],
33533 					      &first [i], 2);
33534 	    }
33535 	  n >>= 1;
33536 	  ix86_expand_vector_init_concat (mode, target, second, n);
33537 	}
33538       else
33539 	ix86_expand_vector_init_concat (mode, target, first, n);
33540       break;
33541 
33542     default:
33543       gcc_unreachable ();
33544     }
33545 }
33546 
33547 /* A subroutine of ix86_expand_vector_init_general.  Use vector
33548    interleave to handle the most general case: all values variable,
33549    and none identical.  */
33550 
33551 static void
33552 ix86_expand_vector_init_interleave (enum machine_mode mode,
33553 				    rtx target, rtx *ops, int n)
33554 {
33555   enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33556   int i, j;
33557   rtx op0, op1;
33558   rtx (*gen_load_even) (rtx, rtx, rtx);
33559   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33560   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33561 
33562   switch (mode)
33563     {
33564     case V8HImode:
33565       gen_load_even = gen_vec_setv8hi;
33566       gen_interleave_first_low = gen_vec_interleave_lowv4si;
33567       gen_interleave_second_low = gen_vec_interleave_lowv2di;
33568       inner_mode = HImode;
33569       first_imode = V4SImode;
33570       second_imode = V2DImode;
33571       third_imode = VOIDmode;
33572       break;
33573     case V16QImode:
33574       gen_load_even = gen_vec_setv16qi;
33575       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33576       gen_interleave_second_low = gen_vec_interleave_lowv4si;
33577       inner_mode = QImode;
33578       first_imode = V8HImode;
33579       second_imode = V4SImode;
33580       third_imode = V2DImode;
33581       break;
33582     default:
33583       gcc_unreachable ();
33584     }
33585 
33586   for (i = 0; i < n; i++)
33587     {
33588       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
33589       op0 = gen_reg_rtx (SImode);
33590       emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33591 
33592       /* Insert the SImode value as the low element of a V4SImode vector.  */
33593       op1 = gen_reg_rtx (V4SImode);
33594       op0 = gen_rtx_VEC_MERGE (V4SImode,
33595 			       gen_rtx_VEC_DUPLICATE (V4SImode,
33596 						      op0),
33597 			       CONST0_RTX (V4SImode),
33598 			       const1_rtx);
33599       emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33600 
33601       /* Cast the V4SImode vector back to a vector in the original mode.  */
33602       op0 = gen_reg_rtx (mode);
33603       emit_move_insn (op0, gen_lowpart (mode, op1));
33604 
33605       /* Load even elements into the second position.  */
33606       emit_insn (gen_load_even (op0,
33607 				force_reg (inner_mode,
33608 					   ops [i + i + 1]),
33609 				const1_rtx));
33610 
33611       /* Cast vector to FIRST_IMODE vector.  */
33612       ops[i] = gen_reg_rtx (first_imode);
33613       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33614     }
33615 
33616   /* Interleave low FIRST_IMODE vectors.  */
33617   for (i = j = 0; i < n; i += 2, j++)
33618     {
33619       op0 = gen_reg_rtx (first_imode);
33620       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33621 
33622       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
33623       ops[j] = gen_reg_rtx (second_imode);
33624       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33625     }
33626 
33627   /* Interleave low SECOND_IMODE vectors.  */
33628   switch (second_imode)
33629     {
33630     case V4SImode:
33631       for (i = j = 0; i < n / 2; i += 2, j++)
33632 	{
33633 	  op0 = gen_reg_rtx (second_imode);
33634 	  emit_insn (gen_interleave_second_low (op0, ops[i],
33635 						ops[i + 1]));
33636 
33637 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33638 	     vector.  */
33639 	  ops[j] = gen_reg_rtx (third_imode);
33640 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33641 	}
33642       second_imode = V2DImode;
33643       gen_interleave_second_low = gen_vec_interleave_lowv2di;
33644       /* FALLTHRU */
33645 
33646     case V2DImode:
33647       op0 = gen_reg_rtx (second_imode);
33648       emit_insn (gen_interleave_second_low (op0, ops[0],
33649 					    ops[1]));
33650 
33651       /* Cast the SECOND_IMODE vector back to a vector in the original
33652 	 mode.  */
33653       emit_insn (gen_rtx_SET (VOIDmode, target,
33654 			      gen_lowpart (mode, op0)));
33655       break;
33656 
33657     default:
33658       gcc_unreachable ();
33659     }
33660 }
33661 
33662 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
33663    all values variable, and none identical.  */
33664 
33665 static void
33666 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33667 				 rtx target, rtx vals)
33668 {
33669   rtx ops[32], op0, op1;
33670   enum machine_mode half_mode = VOIDmode;
33671   int n, i;
33672 
33673   switch (mode)
33674     {
33675     case V2SFmode:
33676     case V2SImode:
33677       if (!mmx_ok && !TARGET_SSE)
33678 	break;
33679       /* FALLTHRU */
33680 
33681     case V8SFmode:
33682     case V8SImode:
33683     case V4DFmode:
33684     case V4DImode:
33685     case V4SFmode:
33686     case V4SImode:
33687     case V2DFmode:
33688     case V2DImode:
33689       n = GET_MODE_NUNITS (mode);
33690       for (i = 0; i < n; i++)
33691 	ops[i] = XVECEXP (vals, 0, i);
33692       ix86_expand_vector_init_concat (mode, target, ops, n);
33693       return;
33694 
33695     case V32QImode:
33696       half_mode = V16QImode;
33697       goto half;
33698 
33699     case V16HImode:
33700       half_mode = V8HImode;
33701       goto half;
33702 
33703 half:
33704       n = GET_MODE_NUNITS (mode);
33705       for (i = 0; i < n; i++)
33706 	ops[i] = XVECEXP (vals, 0, i);
33707       op0 = gen_reg_rtx (half_mode);
33708       op1 = gen_reg_rtx (half_mode);
33709       ix86_expand_vector_init_interleave (half_mode, op0, ops,
33710 					  n >> 2);
33711       ix86_expand_vector_init_interleave (half_mode, op1,
33712 					  &ops [n >> 1], n >> 2);
33713       emit_insn (gen_rtx_SET (VOIDmode, target,
33714 			      gen_rtx_VEC_CONCAT (mode, op0, op1)));
33715       return;
33716 
33717     case V16QImode:
33718       if (!TARGET_SSE4_1)
33719 	break;
33720       /* FALLTHRU */
33721 
33722     case V8HImode:
33723       if (!TARGET_SSE2)
33724 	break;
33725 
33726       /* Don't use ix86_expand_vector_init_interleave if we can't
33727 	 move from GPR to SSE register directly.  */
33728       if (!TARGET_INTER_UNIT_MOVES)
33729 	break;
33730 
33731       n = GET_MODE_NUNITS (mode);
33732       for (i = 0; i < n; i++)
33733 	ops[i] = XVECEXP (vals, 0, i);
33734       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33735       return;
33736 
33737     case V4HImode:
33738     case V8QImode:
33739       break;
33740 
33741     default:
33742       gcc_unreachable ();
33743     }
33744 
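    /* Fall back to assembling the vector in integer registers: build each
       word-sized chunk by shifting and OR-ing the elements together, then
       move the words into the vector.  */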
33745     {
33746       int i, j, n_elts, n_words, n_elt_per_word;
33747       enum machine_mode inner_mode;
33748       rtx words[4], shift;
33749 
33750       inner_mode = GET_MODE_INNER (mode);
33751       n_elts = GET_MODE_NUNITS (mode);
33752       n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33753       n_elt_per_word = n_elts / n_words;
33754       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33755 
33756       for (i = 0; i < n_words; ++i)
33757 	{
33758 	  rtx word = NULL_RTX;
33759 
33760 	  for (j = 0; j < n_elt_per_word; ++j)
33761 	    {
33762 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33763 	      elt = convert_modes (word_mode, inner_mode, elt, true);
33764 
33765 	      if (j == 0)
33766 		word = elt;
33767 	      else
33768 		{
33769 		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33770 					      word, 1, OPTAB_LIB_WIDEN);
33771 		  word = expand_simple_binop (word_mode, IOR, word, elt,
33772 					      word, 1, OPTAB_LIB_WIDEN);
33773 		}
33774 	    }
33775 
33776 	  words[i] = word;
33777 	}
33778 
33779       if (n_words == 1)
33780 	emit_move_insn (target, gen_lowpart (mode, words[0]));
33781       else if (n_words == 2)
33782 	{
33783 	  rtx tmp = gen_reg_rtx (mode);
33784 	  emit_clobber (tmp);
33785 	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33786 	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33787 	  emit_move_insn (target, tmp);
33788 	}
33789       else if (n_words == 4)
33790 	{
33791 	  rtx tmp = gen_reg_rtx (V4SImode);
33792 	  gcc_assert (word_mode == SImode);
33793 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33794 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33795 	  emit_move_insn (target, gen_lowpart (mode, tmp));
33796 	}
33797       else
33798 	gcc_unreachable ();
33799     }
33800 }
33801 
33802 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
33803    instructions unless MMX_OK is true.  */
33804 
33805 void
33806 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33807 {
33808   enum machine_mode mode = GET_MODE (target);
33809   enum machine_mode inner_mode = GET_MODE_INNER (mode);
33810   int n_elts = GET_MODE_NUNITS (mode);
33811   int n_var = 0, one_var = -1;
33812   bool all_same = true, all_const_zero = true;
33813   int i;
33814   rtx x;
33815 
33816   for (i = 0; i < n_elts; ++i)
33817     {
33818       x = XVECEXP (vals, 0, i);
33819       if (!(CONST_INT_P (x)
33820 	    || GET_CODE (x) == CONST_DOUBLE
33821 	    || GET_CODE (x) == CONST_FIXED))
33822 	n_var++, one_var = i;
33823       else if (x != CONST0_RTX (inner_mode))
33824 	all_const_zero = false;
33825       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33826 	all_same = false;
33827     }
33828 
33829   /* Constants are best loaded from the constant pool.  */
33830   if (n_var == 0)
33831     {
33832       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33833       return;
33834     }
33835 
33836   /* If all values are identical, broadcast the value.  */
33837   if (all_same
33838       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33839 					    XVECEXP (vals, 0, 0)))
33840     return;
33841 
33842   /* Values where only one field is non-constant are best loaded from
33843      the pool and overwritten via move later.  */
33844   if (n_var == 1)
33845     {
33846       if (all_const_zero
33847 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33848 						  XVECEXP (vals, 0, one_var),
33849 						  one_var))
33850 	return;
33851 
33852       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33853 	return;
33854     }
33855 
33856   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33857 }
33858 
33859 void
33860 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33861 {
33862   enum machine_mode mode = GET_MODE (target);
33863   enum machine_mode inner_mode = GET_MODE_INNER (mode);
33864   enum machine_mode half_mode;
33865   bool use_vec_merge = false;
33866   rtx tmp;
33867   static rtx (*gen_extract[6][2]) (rtx, rtx)
33868     = {
33869 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33870 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33871 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33872 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33873 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33874 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33875       };
33876   static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33877     = {
33878 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33879 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33880 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33881 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33882 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33883 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33884       };
33885   int i, j, n;
33886 
33887   switch (mode)
33888     {
33889     case V2SFmode:
33890     case V2SImode:
33891       if (mmx_ok)
33892 	{
33893 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33894 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33895 	  if (elt == 0)
33896 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33897 	  else
33898 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33899 	  emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33900 	  return;
33901 	}
33902       break;
33903 
33904     case V2DImode:
33905       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33906       if (use_vec_merge)
33907 	break;
33908 
33909       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33910       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33911       if (elt == 0)
33912 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33913       else
33914 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33915       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33916       return;
33917 
33918     case V2DFmode:
33919       {
33920 	rtx op0, op1;
33921 
33922 	/* For the two element vectors, we implement a VEC_CONCAT with
33923 	   the extraction of the other element.  */
33924 
33925 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33926 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33927 
33928 	if (elt == 0)
33929 	  op0 = val, op1 = tmp;
33930 	else
33931 	  op0 = tmp, op1 = val;
33932 
33933 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33934 	emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33935       }
33936       return;
33937 
33938     case V4SFmode:
33939       use_vec_merge = TARGET_SSE4_1;
33940       if (use_vec_merge)
33941 	break;
33942 
33943       switch (elt)
33944 	{
33945 	case 0:
33946 	  use_vec_merge = true;
33947 	  break;
33948 
33949 	case 1:
33950 	  /* tmp = target = A B C D */
33951 	  tmp = copy_to_reg (target);
33952 	  /* target = A A B B */
33953 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33954 	  /* target = X A B B */
33955 	  ix86_expand_vector_set (false, target, val, 0);
33956 	  /* target = A X C D  */
33957 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33958 					  const1_rtx, const0_rtx,
33959 					  GEN_INT (2+4), GEN_INT (3+4)));
33960 	  return;
33961 
33962 	case 2:
33963 	  /* tmp = target = A B C D */
33964 	  tmp = copy_to_reg (target);
33965 	  /* tmp = X B C D */
33966 	  ix86_expand_vector_set (false, tmp, val, 0);
33967 	  /* target = A B X D */
33968 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33969 					  const0_rtx, const1_rtx,
33970 					  GEN_INT (0+4), GEN_INT (3+4)));
33971 	  return;
33972 
33973 	case 3:
33974 	  /* tmp = target = A B C D */
33975 	  tmp = copy_to_reg (target);
33976 	  /* tmp = X B C D */
33977 	  ix86_expand_vector_set (false, tmp, val, 0);
33978 	  /* target = A B C X */
33979 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33980 					  const0_rtx, const1_rtx,
33981 					  GEN_INT (2+4), GEN_INT (0+4)));
33982 	  return;
33983 
33984 	default:
33985 	  gcc_unreachable ();
33986 	}
33987       break;
33988 
33989     case V4SImode:
33990       use_vec_merge = TARGET_SSE4_1;
33991       if (use_vec_merge)
33992 	break;
33993 
33994       /* Element 0 handled by vec_merge below.  */
33995       if (elt == 0)
33996 	{
33997 	  use_vec_merge = true;
33998 	  break;
33999 	}
34000 
34001       if (TARGET_SSE2)
34002 	{
34003 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
34004 	     store into element 0, then shuffle them back.  */
34005 
34006 	  rtx order[4];
34007 
34008 	  order[0] = GEN_INT (elt);
34009 	  order[1] = const1_rtx;
34010 	  order[2] = const2_rtx;
34011 	  order[3] = GEN_INT (3);
34012 	  order[elt] = const0_rtx;
34013 
34014 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34015 					order[1], order[2], order[3]));
34016 
34017 	  ix86_expand_vector_set (false, target, val, 0);
34018 
34019 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34020 					order[1], order[2], order[3]));
34021 	}
34022       else
34023 	{
34024 	  /* For SSE1, we have to reuse the V4SF code.  */
34025 	  ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34026 				  gen_lowpart (SFmode, val), elt);
34027 	}
34028       return;
34029 
34030     case V8HImode:
34031       use_vec_merge = TARGET_SSE2;
34032       break;
34033     case V4HImode:
34034       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34035       break;
34036 
34037     case V16QImode:
34038       use_vec_merge = TARGET_SSE4_1;
34039       break;
34040 
34041     case V8QImode:
34042       break;
34043 
34044     case V32QImode:
34045       half_mode = V16QImode;
34046       j = 0;
34047       n = 16;
34048       goto half;
34049 
34050     case V16HImode:
34051       half_mode = V8HImode;
34052       j = 1;
34053       n = 8;
34054       goto half;
34055 
34056     case V8SImode:
34057       half_mode = V4SImode;
34058       j = 2;
34059       n = 4;
34060       goto half;
34061 
34062     case V4DImode:
34063       half_mode = V2DImode;
34064       j = 3;
34065       n = 2;
34066       goto half;
34067 
34068     case V8SFmode:
34069       half_mode = V4SFmode;
34070       j = 4;
34071       n = 4;
34072       goto half;
34073 
34074     case V4DFmode:
34075       half_mode = V2DFmode;
34076       j = 5;
34077       n = 2;
34078       goto half;
34079 
34080 half:
34081       /* Compute offset.  */
34082       i = elt / n;
34083       elt %= n;
34084 
34085       gcc_assert (i <= 1);
34086 
34087       /* Extract the half.  */
34088       tmp = gen_reg_rtx (half_mode);
34089       emit_insn (gen_extract[j][i] (tmp, target));
34090 
34091       /* Put val in tmp at elt.  */
34092       ix86_expand_vector_set (false, tmp, val, elt);
34093 
34094       /* Put it back.  */
34095       emit_insn (gen_insert[j][i] (target, target, tmp));
34096       return;
34097 
34098     default:
34099       break;
34100     }
34101 
34102   if (use_vec_merge)
34103     {
34104       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34105       tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34106       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34107     }
34108   else
34109     {
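      /* No single-insn insertion is available; spill the vector to a stack
	 slot, overwrite the selected element in memory, and reload the
	 whole vector.  */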
34110       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34111 
34112       emit_move_insn (mem, target);
34113 
34114       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34115       emit_move_insn (tmp, val);
34116 
34117       emit_move_insn (target, mem);
34118     }
34119 }
34120 
34121 void
34122 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34123 {
34124   enum machine_mode mode = GET_MODE (vec);
34125   enum machine_mode inner_mode = GET_MODE_INNER (mode);
34126   bool use_vec_extr = false;
34127   rtx tmp;
34128 
34129   switch (mode)
34130     {
34131     case V2SImode:
34132     case V2SFmode:
34133       if (!mmx_ok)
34134 	break;
34135       /* FALLTHRU */
34136 
34137     case V2DFmode:
34138     case V2DImode:
34139       use_vec_extr = true;
34140       break;
34141 
34142     case V4SFmode:
34143       use_vec_extr = TARGET_SSE4_1;
34144       if (use_vec_extr)
34145 	break;
34146 
34147       switch (elt)
34148 	{
34149 	case 0:
34150 	  tmp = vec;
34151 	  break;
34152 
34153 	case 1:
34154 	case 3:
34155 	  tmp = gen_reg_rtx (mode);
34156 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34157 				       GEN_INT (elt), GEN_INT (elt),
34158 				       GEN_INT (elt+4), GEN_INT (elt+4)));
34159 	  break;
34160 
34161 	case 2:
34162 	  tmp = gen_reg_rtx (mode);
34163 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34164 	  break;
34165 
34166 	default:
34167 	  gcc_unreachable ();
34168 	}
34169       vec = tmp;
34170       use_vec_extr = true;
34171       elt = 0;
34172       break;
34173 
34174     case V4SImode:
34175       use_vec_extr = TARGET_SSE4_1;
34176       if (use_vec_extr)
34177 	break;
34178 
34179       if (TARGET_SSE2)
34180 	{
34181 	  switch (elt)
34182 	    {
34183 	    case 0:
34184 	      tmp = vec;
34185 	      break;
34186 
34187 	    case 1:
34188 	    case 3:
34189 	      tmp = gen_reg_rtx (mode);
34190 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34191 					    GEN_INT (elt), GEN_INT (elt),
34192 					    GEN_INT (elt), GEN_INT (elt)));
34193 	      break;
34194 
34195 	    case 2:
34196 	      tmp = gen_reg_rtx (mode);
34197 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34198 	      break;
34199 
34200 	    default:
34201 	      gcc_unreachable ();
34202 	    }
34203 	  vec = tmp;
34204 	  use_vec_extr = true;
34205 	  elt = 0;
34206 	}
34207       else
34208 	{
34209 	  /* For SSE1, we have to reuse the V4SF code.  */
34210 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34211 				      gen_lowpart (V4SFmode, vec), elt);
34212 	  return;
34213 	}
34214       break;
34215 
34216     case V8HImode:
34217       use_vec_extr = TARGET_SSE2;
34218       break;
34219     case V4HImode:
34220       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34221       break;
34222 
34223     case V16QImode:
34224       use_vec_extr = TARGET_SSE4_1;
34225       break;
34226 
34227     case V8SFmode:
34228       if (TARGET_AVX)
34229 	{
34230 	  tmp = gen_reg_rtx (V4SFmode);
34231 	  if (elt < 4)
34232 	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34233 	  else
34234 	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34235 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
34236 	  return;
34237 	}
34238       break;
34239 
34240     case V4DFmode:
34241       if (TARGET_AVX)
34242 	{
34243 	  tmp = gen_reg_rtx (V2DFmode);
34244 	  if (elt < 2)
34245 	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34246 	  else
34247 	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34248 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
34249 	  return;
34250 	}
34251       break;
34252 
34253     case V32QImode:
34254       if (TARGET_AVX)
34255 	{
34256 	  tmp = gen_reg_rtx (V16QImode);
34257 	  if (elt < 16)
34258 	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34259 	  else
34260 	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34261 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
34262 	  return;
34263 	}
34264       break;
34265 
34266     case V16HImode:
34267       if (TARGET_AVX)
34268 	{
34269 	  tmp = gen_reg_rtx (V8HImode);
34270 	  if (elt < 8)
34271 	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34272 	  else
34273 	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34274 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
34275 	  return;
34276 	}
34277       break;
34278 
34279     case V8SImode:
34280       if (TARGET_AVX)
34281 	{
34282 	  tmp = gen_reg_rtx (V4SImode);
34283 	  if (elt < 4)
34284 	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34285 	  else
34286 	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34287 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
34288 	  return;
34289 	}
34290       break;
34291 
34292     case V4DImode:
34293       if (TARGET_AVX)
34294 	{
34295 	  tmp = gen_reg_rtx (V2DImode);
34296 	  if (elt < 2)
34297 	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34298 	  else
34299 	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34300 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
34301 	  return;
34302 	}
34303       break;
34304 
34305     case V8QImode:
34306       /* ??? Could extract the appropriate HImode element and shift.  */
34307     default:
34308       break;
34309     }
34310 
34311   if (use_vec_extr)
34312     {
34313       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34314       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34315 
34316       /* Let the rtl optimizers know about the zero extension performed.  */
34317       if (inner_mode == QImode || inner_mode == HImode)
34318 	{
34319 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34320 	  target = gen_lowpart (SImode, target);
34321 	}
34322 
34323       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34324     }
34325   else
34326     {
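      /* No direct extraction pattern; spill the vector to a stack slot and
	 load the selected element back from memory.  */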
34327       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34328 
34329       emit_move_insn (mem, vec);
34330 
34331       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34332       emit_move_insn (target, tmp);
34333     }
34334 }
34335 
34336 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34337    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34338    The upper bits of DEST are undefined, though they shouldn't cause
34339    exceptions (some bits from src or all zeros are ok).  */
34340 
34341 static void
34342 emit_reduc_half (rtx dest, rtx src, int i)
34343 {
34344   rtx tem;
34345   switch (GET_MODE (src))
34346     {
34347     case V4SFmode:
34348       if (i == 128)
34349 	tem = gen_sse_movhlps (dest, src, src);
34350       else
34351 	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34352 				   GEN_INT (1 + 4), GEN_INT (1 + 4));
34353       break;
34354     case V2DFmode:
34355       tem = gen_vec_interleave_highv2df (dest, src, src);
34356       break;
34357     case V16QImode:
34358     case V8HImode:
34359     case V4SImode:
34360     case V2DImode:
34361       tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34362 				gen_lowpart (V1TImode, src),
34363 				GEN_INT (i / 2));
34364       break;
34365     case V8SFmode:
34366       if (i == 256)
34367 	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34368       else
34369 	tem = gen_avx_shufps256 (dest, src, src,
34370 				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34371       break;
34372     case V4DFmode:
34373       if (i == 256)
34374 	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34375       else
34376 	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34377       break;
34378     case V32QImode:
34379     case V16HImode:
34380     case V8SImode:
34381     case V4DImode:
34382       if (i == 256)
34383 	tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34384 				 gen_lowpart (V4DImode, src),
34385 				 gen_lowpart (V4DImode, src),
34386 				 const1_rtx);
34387       else
34388 	tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34389 				  gen_lowpart (V2TImode, src),
34390 				  GEN_INT (i / 2));
34391       break;
34392     default:
34393       gcc_unreachable ();
34394     }
34395   emit_insn (tem);
34396 }
34397 
34398 /* Expand a vector reduction.  FN is the binary pattern to reduce;
34399    DEST is the destination; IN is the input vector.  */
34400 
34401 void
34402 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34403 {
34404   rtx half, dst, vec = in;
34405   enum machine_mode mode = GET_MODE (in);
34406   int i;
34407 
34408   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
34409   if (TARGET_SSE4_1
34410       && mode == V8HImode
34411       && fn == gen_uminv8hi3)
34412     {
34413       emit_insn (gen_sse4_1_phminposuw (dest, in));
34414       return;
34415     }
34416 
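  /* Repeatedly fold the high half of the vector onto the low half and
     combine them with FN; each pass halves the number of meaningful bits
     until only the low element carries the reduction result.  */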
34417   for (i = GET_MODE_BITSIZE (mode);
34418        i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34419        i >>= 1)
34420     {
34421       half = gen_reg_rtx (mode);
34422       emit_reduc_half (half, vec, i);
34423       if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34424 	dst = dest;
34425       else
34426 	dst = gen_reg_rtx (mode);
34427       emit_insn (fn (dst, half, vec));
34428       vec = dst;
34429     }
34430 }
34431 
34432 /* Target hook for scalar_mode_supported_p.  */
34433 static bool
34434 ix86_scalar_mode_supported_p (enum machine_mode mode)
34435 {
34436   if (DECIMAL_FLOAT_MODE_P (mode))
34437     return default_decimal_float_supported_p ();
34438   else if (mode == TFmode)
34439     return true;
34440   else
34441     return default_scalar_mode_supported_p (mode);
34442 }
34443 
34444 /* Implements target hook vector_mode_supported_p.  */
34445 static bool
34446 ix86_vector_mode_supported_p (enum machine_mode mode)
34447 {
34448   if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34449     return true;
34450   if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34451     return true;
34452   if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34453     return true;
34454   if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34455     return true;
34456   if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34457     return true;
34458   return false;
34459 }
34460 
34461 /* Target hook for c_mode_for_suffix.  */
34462 static enum machine_mode
34463 ix86_c_mode_for_suffix (char suffix)
34464 {
34465   if (suffix == 'q')
34466     return TFmode;
34467   if (suffix == 'w')
34468     return XFmode;
34469 
34470   return VOIDmode;
34471 }
34472 
34473 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34474 
34475    We do this in the new i386 backend to maintain source compatibility
34476    with the old cc0-based compiler.  */
34477 
34478 static tree
34479 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34480 		      tree inputs ATTRIBUTE_UNUSED,
34481 		      tree clobbers)
34482 {
34483   clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34484 			clobbers);
34485   clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34486 			clobbers);
34487   return clobbers;
34488 }
34489 
34490 /* Implements target vector targetm.asm.encode_section_info.  */
34491 
34492 static void ATTRIBUTE_UNUSED
34493 ix86_encode_section_info (tree decl, rtx rtl, int first)
34494 {
34495   default_encode_section_info (decl, rtl, first);
34496 
34497   if (TREE_CODE (decl) == VAR_DECL
34498       && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34499       && ix86_in_large_data_p (decl))
34500     SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34501 }
34502 
34503 /* Worker function for REVERSE_CONDITION.  */
34504 
34505 enum rtx_code
34506 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34507 {
34508   return (mode != CCFPmode && mode != CCFPUmode
34509 	  ? reverse_condition (code)
34510 	  : reverse_condition_maybe_unordered (code));
34511 }
34512 
34513 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34514    to OPERANDS[0].  */
34515 
34516 const char *
34517 output_387_reg_move (rtx insn, rtx *operands)
34518 {
34519   if (REG_P (operands[0]))
34520     {
34521       if (REG_P (operands[1])
34522 	  && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34523 	{
34524 	  if (REGNO (operands[0]) == FIRST_STACK_REG)
34525 	    return output_387_ffreep (operands, 0);
34526 	  return "fstp\t%y0";
34527 	}
34528       if (STACK_TOP_P (operands[0]))
34529 	return "fld%Z1\t%y1";
34530       return "fst\t%y0";
34531     }
34532   else if (MEM_P (operands[0]))
34533     {
34534       gcc_assert (REG_P (operands[1]));
34535       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34536 	return "fstp%Z0\t%y0";
34537       else
34538 	{
34539 	  /* There is no non-popping store to memory for XFmode.
34540 	     So if we need one, follow the store with a load.  */
34541 	  if (GET_MODE (operands[0]) == XFmode)
34542 	    return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34543 	  else
34544 	    return "fst%Z0\t%y0";
34545 	}
34546     }
34547   else
34548     gcc_unreachable();
34549 }
34550 
34551 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
34552    the FP status register is set.  */
34553 
34554 void
34555 ix86_emit_fp_unordered_jump (rtx label)
34556 {
34557   rtx reg = gen_reg_rtx (HImode);
34558   rtx temp;
34559 
34560   emit_insn (gen_x86_fnstsw_1 (reg));
34561 
34562   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34563     {
34564       emit_insn (gen_x86_sahf_1 (reg));
34565 
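      /* SAHF copies C0/C2/C3 from AH into CF/PF/ZF, so a set C2 bit shows
	 up as PF, which is what the UNORDERED test below checks.  */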
34566       temp = gen_rtx_REG (CCmode, FLAGS_REG);
34567       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34568     }
34569   else
34570     {
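      /* Without SAHF, test C2 directly: it is bit 10 of the FP status
	 word, i.e. bit 2 (mask 0x04) of the high byte that fnstsw stored
	 in REG.  */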
34571       emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34572 
34573       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34574       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34575     }
34576 
34577   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34578 			      gen_rtx_LABEL_REF (VOIDmode, label),
34579 			      pc_rtx);
34580   temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34581 
34582   emit_jump_insn (temp);
34583   predict_jump (REG_BR_PROB_BASE * 10 / 100);
34584 }
34585 
34586 /* Output code to perform a log1p XFmode calculation.  */
34587 
34588 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34589 {
34590   rtx label1 = gen_label_rtx ();
34591   rtx label2 = gen_label_rtx ();
34592 
34593   rtx tmp = gen_reg_rtx (XFmode);
34594   rtx tmp2 = gen_reg_rtx (XFmode);
34595   rtx test;
34596 
34597   emit_insn (gen_absxf2 (tmp, op1));
34598   test = gen_rtx_GE (VOIDmode, tmp,
34599     CONST_DOUBLE_FROM_REAL_VALUE (
34600        REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34601        XFmode));
34602   emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34603 
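  /* Here |op1| < 1 - sqrt(2)/2 (about 0.2929), the input range for which
     fyl2xp1 is specified, so use it directly for better precision near
     zero.  */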
34604   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34605   emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34606   emit_jump (label2);
34607 
34608   emit_label (label1);
34609   emit_move_insn (tmp, CONST1_RTX (XFmode));
34610   emit_insn (gen_addxf3 (tmp, op1, tmp));
34611   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34612   emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34613 
34614   emit_label (label2);
34615 }
34616 
34617 /* Emit code for round calculation.  */
34618 void ix86_emit_i387_round (rtx op0, rtx op1)
34619 {
34620   enum machine_mode inmode = GET_MODE (op1);
34621   enum machine_mode outmode = GET_MODE (op0);
34622   rtx e1, e2, res, tmp, tmp1, half;
34623   rtx scratch = gen_reg_rtx (HImode);
34624   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34625   rtx jump_label = gen_label_rtx ();
34626   rtx insn;
34627   rtx (*gen_abs) (rtx, rtx);
34628   rtx (*gen_neg) (rtx, rtx);
34629 
34630   switch (inmode)
34631     {
34632     case SFmode:
34633       gen_abs = gen_abssf2;
34634       break;
34635     case DFmode:
34636       gen_abs = gen_absdf2;
34637       break;
34638     case XFmode:
34639       gen_abs = gen_absxf2;
34640       break;
34641     default:
34642       gcc_unreachable ();
34643     }
34644 
34645   switch (outmode)
34646     {
34647     case SFmode:
34648       gen_neg = gen_negsf2;
34649       break;
34650     case DFmode:
34651       gen_neg = gen_negdf2;
34652       break;
34653     case XFmode:
34654       gen_neg = gen_negxf2;
34655       break;
34656     case HImode:
34657       gen_neg = gen_neghi2;
34658       break;
34659     case SImode:
34660       gen_neg = gen_negsi2;
34661       break;
34662     case DImode:
34663       gen_neg = gen_negdi2;
34664       break;
34665     default:
34666       gcc_unreachable ();
34667     }
34668 
34669   e1 = gen_reg_rtx (inmode);
34670   e2 = gen_reg_rtx (inmode);
34671   res = gen_reg_rtx (outmode);
34672 
34673   half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34674 
34675   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
34676 
34677   /* scratch = fxam(op1) */
34678   emit_insn (gen_rtx_SET (VOIDmode, scratch,
34679 			  gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34680 					  UNSPEC_FXAM)));
34681   /* e1 = fabs(op1) */
34682   emit_insn (gen_abs (e1, op1));
34683 
34684   /* e2 = e1 + 0.5 */
34685   half = force_reg (inmode, half);
34686   emit_insn (gen_rtx_SET (VOIDmode, e2,
34687 			  gen_rtx_PLUS (inmode, e1, half)));
34688 
34689   /* res = floor(e2) */
34690   if (inmode != XFmode)
34691     {
34692       tmp1 = gen_reg_rtx (XFmode);
34693 
34694       emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34695 			      gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34696     }
34697   else
34698     tmp1 = e2;
34699 
34700   switch (outmode)
34701     {
34702     case SFmode:
34703     case DFmode:
34704       {
34705 	rtx tmp0 = gen_reg_rtx (XFmode);
34706 
34707 	emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34708 
34709 	emit_insn (gen_rtx_SET (VOIDmode, res,
34710 				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34711 						UNSPEC_TRUNC_NOOP)));
34712       }
34713       break;
34714     case XFmode:
34715       emit_insn (gen_frndintxf2_floor (res, tmp1));
34716       break;
34717     case HImode:
34718       emit_insn (gen_lfloorxfhi2 (res, tmp1));
34719       break;
34720     case SImode:
34721       emit_insn (gen_lfloorxfsi2 (res, tmp1));
34722       break;
34723     case DImode:
34724       emit_insn (gen_lfloorxfdi2 (res, tmp1));
34725       break;
34726     default:
34727       gcc_unreachable ();
34728     }
34729 
34730   /* flags = signbit(a) */
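  /* fxam stored the sign of op1 in C1, bit 9 of the FP status word, which
     is bit 1 (mask 0x02) of the high byte tested here.  */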
34731   emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34732 
34733   /* if (flags) then res = -res */
34734   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34735 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34736 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
34737 			      pc_rtx);
34738   insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34739   predict_jump (REG_BR_PROB_BASE * 50 / 100);
34740   JUMP_LABEL (insn) = jump_label;
34741 
34742   emit_insn (gen_neg (res, res));
34743 
34744   emit_label (jump_label);
34745   LABEL_NUSES (jump_label) = 1;
34746 
34747   emit_move_insn (op0, res);
34748 }
34749 
34750 /* Output code to perform a Newton-Raphson approximation of a single precision
34751    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
34752 
34753 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34754 {
34755   rtx x0, x1, e0, e1;
34756 
34757   x0 = gen_reg_rtx (mode);
34758   e0 = gen_reg_rtx (mode);
34759   e1 = gen_reg_rtx (mode);
34760   x1 = gen_reg_rtx (mode);
34761 
34762   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
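  /* This is one Newton-Raphson step for 1/b: with x0 = rcp(b), the
     refined reciprocal is x1 = x0 * (2 - b*x0) = (x0 + x0) - b*x0*x0,
     roughly doubling the accuracy of the rcpss estimate; the quotient is
     then a * x1.  */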
34763 
34764   b = force_reg (mode, b);
34765 
34766   /* x0 = rcp(b) estimate */
34767   emit_insn (gen_rtx_SET (VOIDmode, x0,
34768 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34769 					  UNSPEC_RCP)));
34770   /* e0 = x0 * b */
34771   emit_insn (gen_rtx_SET (VOIDmode, e0,
34772 			  gen_rtx_MULT (mode, x0, b)));
34773 
34774   /* e0 = x0 * e0 */
34775   emit_insn (gen_rtx_SET (VOIDmode, e0,
34776 			  gen_rtx_MULT (mode, x0, e0)));
34777 
34778   /* e1 = x0 + x0 */
34779   emit_insn (gen_rtx_SET (VOIDmode, e1,
34780 			  gen_rtx_PLUS (mode, x0, x0)));
34781 
34782   /* x1 = e1 - e0 */
34783   emit_insn (gen_rtx_SET (VOIDmode, x1,
34784 			  gen_rtx_MINUS (mode, e1, e0)));
34785 
34786   /* res = a * x1 */
34787   emit_insn (gen_rtx_SET (VOIDmode, res,
34788 			  gen_rtx_MULT (mode, a, x1)));
34789 }
34790 
34791 /* Output code to perform a Newton-Raphson approximation of a
34792    single precision floating point [reciprocal] square root.  */
34793 
34794 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34795 			 bool recip)
34796 {
34797   rtx x0, e0, e1, e2, e3, mthree, mhalf;
34798   REAL_VALUE_TYPE r;
34799 
34800   x0 = gen_reg_rtx (mode);
34801   e0 = gen_reg_rtx (mode);
34802   e1 = gen_reg_rtx (mode);
34803   e2 = gen_reg_rtx (mode);
34804   e3 = gen_reg_rtx (mode);
34805 
34806   real_from_integer (&r, VOIDmode, -3, -1, 0);
34807   mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34808 
34809   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34810   mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34811 
34812   if (VECTOR_MODE_P (mode))
34813     {
34814       mthree = ix86_build_const_vector (mode, true, mthree);
34815       mhalf = ix86_build_const_vector (mode, true, mhalf);
34816     }
34817 
34818   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34819      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
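  /* Both forms are one Newton-Raphson step for 1/sqrt(a): with
     x0 = rsqrt(a), the refined estimate is 0.5 * x0 * (3 - a*x0*x0),
     written above with the constants negated; a final multiply by a
     turns the reciprocal square root into sqrt(a).  */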
34820 
34821   a = force_reg (mode, a);
34822 
34823   /* x0 = rsqrt(a) estimate */
34824   emit_insn (gen_rtx_SET (VOIDmode, x0,
34825 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34826 					  UNSPEC_RSQRT)));
34827 
34828   /* For a == 0.0, mask out the infinite rsqrt so sqrt(0.0) is not NaN.  */
34829   if (!recip)
34830     {
34831       rtx zero, mask;
34832 
34833       zero = gen_reg_rtx (mode);
34834       mask = gen_reg_rtx (mode);
34835 
34836       zero = force_reg (mode, CONST0_RTX(mode));
34837       emit_insn (gen_rtx_SET (VOIDmode, mask,
34838 			      gen_rtx_NE (mode, zero, a)));
34839 
34840       emit_insn (gen_rtx_SET (VOIDmode, x0,
34841 			      gen_rtx_AND (mode, x0, mask)));
34842     }
34843 
34844   /* e0 = x0 * a */
34845   emit_insn (gen_rtx_SET (VOIDmode, e0,
34846 			  gen_rtx_MULT (mode, x0, a)));
34847   /* e1 = e0 * x0 */
34848   emit_insn (gen_rtx_SET (VOIDmode, e1,
34849 			  gen_rtx_MULT (mode, e0, x0)));
34850 
34851   /* e2 = e1 - 3. */
34852   mthree = force_reg (mode, mthree);
34853   emit_insn (gen_rtx_SET (VOIDmode, e2,
34854 			  gen_rtx_PLUS (mode, e1, mthree)));
34855 
34856   mhalf = force_reg (mode, mhalf);
34857   if (recip)
34858     /* e3 = -.5 * x0 */
34859     emit_insn (gen_rtx_SET (VOIDmode, e3,
34860 			    gen_rtx_MULT (mode, x0, mhalf)));
34861   else
34862     /* e3 = -.5 * e0 */
34863     emit_insn (gen_rtx_SET (VOIDmode, e3,
34864 			    gen_rtx_MULT (mode, e0, mhalf)));
34865   /* ret = e2 * e3 */
34866   emit_insn (gen_rtx_SET (VOIDmode, res,
34867 			  gen_rtx_MULT (mode, e2, e3)));
34868 }
34869 
34870 #ifdef TARGET_SOLARIS
34871 /* Solaris implementation of TARGET_ASM_NAMED_SECTION.  */
34872 
34873 static void
34874 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34875 				tree decl)
34876 {
34877   /* With Binutils 2.15, the "@unwind" marker must be specified on
34878      every occurrence of the ".eh_frame" section, not just the first
34879      one.  */
34880   if (TARGET_64BIT
34881       && strcmp (name, ".eh_frame") == 0)
34882     {
34883       fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34884 	       flags & SECTION_WRITE ? "aw" : "a");
34885       return;
34886     }
34887 
34888 #ifndef USE_GAS
34889   if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34890     {
34891       solaris_elf_asm_comdat_section (name, flags, decl);
34892       return;
34893     }
34894 #endif
34895 
34896   default_elf_asm_named_section (name, flags, decl);
34897 }
34898 #endif /* TARGET_SOLARIS */
34899 
34900 /* Return the mangling of TYPE if it is an extended fundamental type.  */
34901 
34902 static const char *
34903 ix86_mangle_type (const_tree type)
34904 {
34905   type = TYPE_MAIN_VARIANT (type);
34906 
34907   if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34908       && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34909     return NULL;
34910 
34911   switch (TYPE_MODE (type))
34912     {
34913     case TFmode:
34914       /* __float128 is "g".  */
34915       return "g";
34916     case XFmode:
34917       /* "long double" or __float80 is "e".  */
34918       return "e";
34919     default:
34920       return NULL;
34921     }
34922 }
34923 
34924 /* For 32-bit code we can save PIC register setup by using
34925    __stack_chk_fail_local hidden function instead of calling
34926    __stack_chk_fail directly.  64-bit code doesn't need to set up any PIC
34927    register, so it is better to call __stack_chk_fail directly.  */
34928 
34929 static tree ATTRIBUTE_UNUSED
34930 ix86_stack_protect_fail (void)
34931 {
34932 #if 0  /*  Still broken -- affects FreeBSD too  */
34933   return TARGET_64BIT
34934 	 ? default_external_stack_protect_fail ()
34935 	 : default_hidden_stack_protect_fail ();
34936 #else
34937   return default_external_stack_protect_fail ();
34938 #endif
34939 }
34940 
34941 /* Select a format to encode pointers in exception handling data.  CODE
34942    is 0 for data, 1 for code labels, 2 for function pointers.  GLOBAL is
34943    true if the symbol may be affected by dynamic relocations.
34944 
34945    ??? All x86 object file formats are capable of representing this.
34946    After all, the relocation needed is the same as for the call insn.
34947    Whether a particular assembler actually lets us emit such a
34948    relocation remains to be seen.  */
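
/* For example, 64-bit PIC code with the (default) small PIC code model
   gets DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4 for global
   data references and DW_EH_PE_pcrel | DW_EH_PE_sdata4 for local code
   labels, while non-PIC code with the small model uses DW_EH_PE_udata4.  */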
34949 int
34950 asm_preferred_eh_data_format (int code, int global)
34951 {
34952   if (flag_pic)
34953     {
34954       int type = DW_EH_PE_sdata8;
34955       if (!TARGET_64BIT
34956 	  || ix86_cmodel == CM_SMALL_PIC
34957 	  || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34958 	type = DW_EH_PE_sdata4;
34959       return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34960     }
34961   if (ix86_cmodel == CM_SMALL
34962       || (ix86_cmodel == CM_MEDIUM && code))
34963     return DW_EH_PE_udata4;
34964   return DW_EH_PE_absptr;
34965 }
34966 
34967 /* Copy the sign bit of SIGN onto the positive value ABS_VALUE, storing
34968    the result in RESULT.  If MASK is non-null, it must be the mask used
34969    to clear the sign bit (as returned by ix86_expand_sse_fabs).  */
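
/* In scalar terms the emitted AND/IOR pair computes, on the bit
   patterns of the values,

     result = abs_value | (sign & SIGNBIT),

   i.e. it copies the sign bit of SIGN onto the already non-negative
   ABS_VALUE.  A non-null MASK holds ~SIGNBIT, which is why it is
   inverted before the AND below.  */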
34970 static void
34971 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34972 {
34973   enum machine_mode mode = GET_MODE (sign);
34974   rtx sgn = gen_reg_rtx (mode);
34975   if (mask == NULL_RTX)
34976     {
34977       enum machine_mode vmode;
34978 
34979       if (mode == SFmode)
34980 	vmode = V4SFmode;
34981       else if (mode == DFmode)
34982 	vmode = V2DFmode;
34983       else
34984 	vmode = mode;
34985 
34986       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34987       if (!VECTOR_MODE_P (mode))
34988 	{
34989 	  /* We need to generate a scalar mode mask in this case.  */
34990 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34991 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34992 	  mask = gen_reg_rtx (mode);
34993 	  emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34994 	}
34995     }
34996   else
34997     mask = gen_rtx_NOT (mode, mask);
34998   emit_insn (gen_rtx_SET (VOIDmode, sgn,
34999 			  gen_rtx_AND (mode, mask, sign)));
35000   emit_insn (gen_rtx_SET (VOIDmode, result,
35001 			  gen_rtx_IOR (mode, abs_value, sgn)));
35002 }
35003 
35004 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
35005    mask used to clear the sign bit is stored in *SMASK if SMASK is
35006    non-null.  */
35007 static rtx
35008 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35009 {
35010   enum machine_mode vmode, mode = GET_MODE (op0);
35011   rtx xa, mask;
35012 
35013   xa = gen_reg_rtx (mode);
35014   if (mode == SFmode)
35015     vmode = V4SFmode;
35016   else if (mode == DFmode)
35017     vmode = V2DFmode;
35018   else
35019     vmode = mode;
35020   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35021   if (!VECTOR_MODE_P (mode))
35022     {
35023       /* We need to generate a scalar mode mask in this case.  */
35024       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35025       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35026       mask = gen_reg_rtx (mode);
35027       emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35028     }
35029   emit_insn (gen_rtx_SET (VOIDmode, xa,
35030 			  gen_rtx_AND (mode, op0, mask)));
35031 
35032   if (smask)
35033     *smask = mask;
35034 
35035   return xa;
35036 }
35037 
35038 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35039    swapping the operands if SWAP_OPERANDS is true.  The expanded
35040    code is a forward jump to a newly created label in case the
35041    comparison is true.  The generated label rtx is returned.  */
35042 static rtx
35043 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35044                                   bool swap_operands)
35045 {
35046   rtx label, tmp;
35047 
35048   if (swap_operands)
35049     {
35050       tmp = op0;
35051       op0 = op1;
35052       op1 = tmp;
35053     }
35054 
35055   label = gen_label_rtx ();
35056   tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35057   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35058 			  gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35059   tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35060   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35061 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35062   tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35063   JUMP_LABEL (tmp) = label;
35064 
35065   return label;
35066 }
35067 
35068 /* Expand a mask-generating SSE comparison instruction comparing OP0 with OP1
35069    using comparison code CODE.  Operands are swapped for the comparison if
35070    SWAP_OPERANDS is true.  Returns an rtx for the generated mask.  */
35071 static rtx
35072 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35073 			      bool swap_operands)
35074 {
35075   rtx (*insn)(rtx, rtx, rtx, rtx);
35076   enum machine_mode mode = GET_MODE (op0);
35077   rtx mask = gen_reg_rtx (mode);
35078 
35079   if (swap_operands)
35080     {
35081       rtx tmp = op0;
35082       op0 = op1;
35083       op1 = tmp;
35084     }
35085 
35086   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35087 
35088   emit_insn (insn (mask, op0, op1,
35089 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
35090   return mask;
35091 }
35092 
35093 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35094    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
35095 static rtx
35096 ix86_gen_TWO52 (enum machine_mode mode)
35097 {
35098   REAL_VALUE_TYPE TWO52r;
35099   rtx TWO52;
35100 
35101   real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35102   TWO52 = const_double_from_real_value (TWO52r, mode);
35103   TWO52 = force_reg (mode, TWO52);
35104 
35105   return TWO52;
35106 }
35107 
35108 /* Expand SSE sequence for computing lround from OP1 storing
35109    into OP0.  */
35110 void
35111 ix86_expand_lround (rtx op0, rtx op1)
35112 {
35113   /* C code for the stuff we're doing below:
35114        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35115        return (long)tmp;
35116    */
35117   enum machine_mode mode = GET_MODE (op1);
35118   const struct real_format *fmt;
35119   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35120   rtx adj;
35121 
35122   /* load nextafter (0.5, 0.0) */
35123   fmt = REAL_MODE_FORMAT (mode);
35124   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35125   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35126 
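
  /* pred_half is 0.5 - 2**(-p-1), the largest representable value below
     0.5 (p is the precision of MODE).  Adding it instead of 0.5 keeps
     values just below 0.5 from rounding up: in SFmode, 0.49999997f + 0.5f
     already rounds to 1.0f in the addition, while adding pred_half keeps
     the sum below 1.0f.  */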
35127   /* adj = copysign (0.5, op1) */
35128   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35129   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35130 
35131   /* adj = op1 + adj */
35132   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35133 
35134   /* op0 = (imode)adj */
35135   expand_fix (op0, adj, 0);
35136 }
35137 
35138 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
35139    into OP0, depending on DO_FLOOR.  */
35140 void
35141 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35142 {
35143   /* C code for the stuff we're doing below (for do_floor):
35144 	xi = (long)op1;
35145         xi -= (double)xi > op1 ? 1 : 0;
35146         return xi;
35147    */
35148   enum machine_mode fmode = GET_MODE (op1);
35149   enum machine_mode imode = GET_MODE (op0);
35150   rtx ireg, freg, label, tmp;
35151 
35152   /* reg = (long)op1 */
35153   ireg = gen_reg_rtx (imode);
35154   expand_fix (ireg, op1, 0);
35155 
35156   /* freg = (double)reg */
35157   freg = gen_reg_rtx (fmode);
35158   expand_float (freg, ireg, 0);
35159 
35160   /* ireg = (freg > op1) ? ireg - 1 : ireg */
35161   label = ix86_expand_sse_compare_and_jump (UNLE,
35162 					    freg, op1, !do_floor);
35163   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35164 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35165   emit_move_insn (ireg, tmp);
35166 
35167   emit_label (label);
35168   LABEL_NUSES (label) = 1;
35169 
35170   emit_move_insn (op0, ireg);
35171 }
35172 
35173 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35174    result in OPERAND0.  */
35175 void
35176 ix86_expand_rint (rtx operand0, rtx operand1)
35177 {
35178   /* C code for the stuff we're doing below:
35179 	xa = fabs (operand1);
35180         if (!isless (xa, 2**52))
35181 	  return operand1;
35182         xa = xa + 2**52 - 2**52;
35183         return copysign (xa, operand1);
35184    */
35185   enum machine_mode mode = GET_MODE (operand0);
35186   rtx res, xa, label, TWO52, mask;
35187 
35188   res = gen_reg_rtx (mode);
35189   emit_move_insn (res, operand1);
35190 
35191   /* xa = abs (operand1) */
35192   xa = ix86_expand_sse_fabs (res, &mask);
35193 
35194   /* if (!isless (xa, TWO52)) goto label; */
35195   TWO52 = ix86_gen_TWO52 (mode);
35196   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35197 
35198   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35199   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35200 
35201   ix86_sse_copysign_to_positive (res, xa, res, mask);
35202 
35203   emit_label (label);
35204   LABEL_NUSES (label) = 1;
35205 
35206   emit_move_insn (operand0, res);
35207 }
35208 
35209 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35210    into OPERAND0.  */
35211 void
35212 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35213 {
35214   /* C code for the stuff we expand below.
35215         double xa = fabs (x), x2;
35216         if (!isless (xa, TWO52))
35217           return x;
35218         xa = xa + TWO52 - TWO52;
35219         x2 = copysign (xa, x);
35220      Compensate.  Floor:
35221         if (x2 > x)
35222           x2 -= 1;
35223      Compensate.  Ceil:
35224         if (x2 < x)
35225           x2 -= -1;
35226         return x2;
35227    */
35228   enum machine_mode mode = GET_MODE (operand0);
35229   rtx xa, TWO52, tmp, label, one, res, mask;
35230 
35231   TWO52 = ix86_gen_TWO52 (mode);
35232 
35233   /* Temporary for holding the result, initialized to the input
35234      operand to ease control flow.  */
35235   res = gen_reg_rtx (mode);
35236   emit_move_insn (res, operand1);
35237 
35238   /* xa = abs (operand1) */
35239   xa = ix86_expand_sse_fabs (res, &mask);
35240 
35241   /* if (!isless (xa, TWO52)) goto label; */
35242   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35243 
35244   /* xa = xa + TWO52 - TWO52; */
35245   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35246   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35247 
35248   /* xa = copysign (xa, operand1) */
35249   ix86_sse_copysign_to_positive (xa, xa, res, mask);
35250 
35251   /* generate 1.0 or -1.0 */
35252   one = force_reg (mode,
35253 	           const_double_from_real_value (do_floor
35254 						 ? dconst1 : dconstm1, mode));
35255 
35256   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35257   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35258   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35259                           gen_rtx_AND (mode, one, tmp)));
35260   /* We always need to subtract here to preserve signed zero.  */
35261   tmp = expand_simple_binop (mode, MINUS,
35262 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35263   emit_move_insn (res, tmp);
35264 
35265   emit_label (label);
35266   LABEL_NUSES (label) = 1;
35267 
35268   emit_move_insn (operand0, res);
35269 }
35270 
35271 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35272    into OPERAND0.  */
35273 void
35274 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35275 {
35276   /* C code for the stuff we expand below.
35277 	double xa = fabs (x), x2;
35278         if (!isless (xa, TWO52))
35279           return x;
35280 	x2 = (double)(long)x;
35281      Compensate.  Floor:
35282 	if (x2 > x)
35283 	  x2 -= 1;
35284      Compensate.  Ceil:
35285 	if (x2 < x)
35286 	  x2 += 1;
35287 	if (HONOR_SIGNED_ZEROS (mode))
35288 	  return copysign (x2, x);
35289 	return x2;
35290    */
35291   enum machine_mode mode = GET_MODE (operand0);
35292   rtx xa, xi, TWO52, tmp, label, one, res, mask;
35293 
35294   TWO52 = ix86_gen_TWO52 (mode);
35295 
35296   /* Temporary for holding the result, initialized to the input
35297      operand to ease control flow.  */
35298   res = gen_reg_rtx (mode);
35299   emit_move_insn (res, operand1);
35300 
35301   /* xa = abs (operand1) */
35302   xa = ix86_expand_sse_fabs (res, &mask);
35303 
35304   /* if (!isless (xa, TWO52)) goto label; */
35305   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35306 
35307   /* xa = (double)(long)x */
35308   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35309   expand_fix (xi, res, 0);
35310   expand_float (xa, xi, 0);
35311 
35312   /* generate 1.0 */
35313   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35314 
35315   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35316   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35317   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35318                           gen_rtx_AND (mode, one, tmp)));
35319   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35320 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35321   emit_move_insn (res, tmp);
35322 
35323   if (HONOR_SIGNED_ZEROS (mode))
35324     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35325 
35326   emit_label (label);
35327   LABEL_NUSES (label) = 1;
35328 
35329   emit_move_insn (operand0, res);
35330 }
35331 
35332 /* Expand SSE sequence for computing round from OPERAND1 storing
35333    into OPERAND0.  This sequence avoids relying on DImode truncation
35334    via cvttsd2siq, which is only available on 64-bit targets.  */
35335 void
35336 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35337 {
35338   /* C code for the stuff we expand below.
35339         double xa = fabs (x), xa2, x2;
35340         if (!isless (xa, TWO52))
35341           return x;
35342      Using the absolute value and copying back sign makes
35343      -0.0 -> -0.0 correct.
35344         xa2 = xa + TWO52 - TWO52;
35345      Compensate.
35346 	dxa = xa2 - xa;
35347         if (dxa <= -0.5)
35348           xa2 += 1;
35349         else if (dxa > 0.5)
35350           xa2 -= 1;
35351         x2 = copysign (xa2, x);
35352         return x2;
35353    */
35354   enum machine_mode mode = GET_MODE (operand0);
35355   rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35356 
35357   TWO52 = ix86_gen_TWO52 (mode);
35358 
35359   /* Temporary for holding the result, initialized to the input
35360      operand to ease control flow.  */
35361   res = gen_reg_rtx (mode);
35362   emit_move_insn (res, operand1);
35363 
35364   /* xa = abs (operand1) */
35365   xa = ix86_expand_sse_fabs (res, &mask);
35366 
35367   /* if (!isless (xa, TWO52)) goto label; */
35368   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35369 
35370   /* xa2 = xa + TWO52 - TWO52; */
35371   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35372   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35373 
35374   /* dxa = xa2 - xa; */
35375   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35376 
35377   /* generate 0.5, 1.0 and -0.5 */
35378   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35379   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35380   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35381 			       0, OPTAB_DIRECT);
35382 
35383   /* Compensate.  */
35384   tmp = gen_reg_rtx (mode);
35385   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35386   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35387   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35388                           gen_rtx_AND (mode, one, tmp)));
35389   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35390   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35391   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35392   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35393                           gen_rtx_AND (mode, one, tmp)));
35394   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35395 
35396   /* res = copysign (xa2, operand1) */
35397   ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35398 
35399   emit_label (label);
35400   LABEL_NUSES (label) = 1;
35401 
35402   emit_move_insn (operand0, res);
35403 }
35404 
35405 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35406    into OPERAND0.  */
35407 void
35408 ix86_expand_trunc (rtx operand0, rtx operand1)
35409 {
35410   /* C code for SSE variant we expand below.
35411         double xa = fabs (x), x2;
35412         if (!isless (xa, TWO52))
35413           return x;
35414         x2 = (double)(long)x;
35415 	if (HONOR_SIGNED_ZEROS (mode))
35416 	  return copysign (x2, x);
35417 	return x2;
35418    */
35419   enum machine_mode mode = GET_MODE (operand0);
35420   rtx xa, xi, TWO52, label, res, mask;
35421 
35422   TWO52 = ix86_gen_TWO52 (mode);
35423 
35424   /* Temporary for holding the result, initialized to the input
35425      operand to ease control flow.  */
35426   res = gen_reg_rtx (mode);
35427   emit_move_insn (res, operand1);
35428 
35429   /* xa = abs (operand1) */
35430   xa = ix86_expand_sse_fabs (res, &mask);
35431 
35432   /* if (!isless (xa, TWO52)) goto label; */
35433   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35434 
35435   /* x = (double)(long)x */
35436   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35437   expand_fix (xi, res, 0);
35438   expand_float (res, xi, 0);
35439 
35440   if (HONOR_SIGNED_ZEROS (mode))
35441     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35442 
35443   emit_label (label);
35444   LABEL_NUSES (label) = 1;
35445 
35446   emit_move_insn (operand0, res);
35447 }
35448 
35449 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
35450    OPERAND0, avoiding the DImode truncation used by ix86_expand_trunc.  */
35451 void
35452 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35453 {
35454   enum machine_mode mode = GET_MODE (operand0);
35455   rtx xa, mask, TWO52, label, one, res, smask, tmp;
35456 
35457   /* C code for SSE variant we expand below.
35458         double xa = fabs (x), x2;
35459         if (!isless (xa, TWO52))
35460           return x;
35461         xa2 = xa + TWO52 - TWO52;
35462      Compensate:
35463         if (xa2 > xa)
35464           xa2 -= 1.0;
35465         x2 = copysign (xa2, x);
35466         return x2;
35467    */
35468 
35469   TWO52 = ix86_gen_TWO52 (mode);
35470 
35471   /* Temporary for holding the result, initialized to the input
35472      operand to ease control flow.  */
35473   res = gen_reg_rtx (mode);
35474   emit_move_insn (res, operand1);
35475 
35476   /* xa = abs (operand1) */
35477   xa = ix86_expand_sse_fabs (res, &smask);
35478 
35479   /* if (!isless (xa, TWO52)) goto label; */
35480   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35481 
35482   /* res = xa + TWO52 - TWO52; */
35483   tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35484   tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35485   emit_move_insn (res, tmp);
35486 
35487   /* generate 1.0 */
35488   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35489 
35490   /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
35491   mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35492   emit_insn (gen_rtx_SET (VOIDmode, mask,
35493                           gen_rtx_AND (mode, mask, one)));
35494   tmp = expand_simple_binop (mode, MINUS,
35495 			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35496   emit_move_insn (res, tmp);
35497 
35498   /* res = copysign (res, operand1) */
35499   ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35500 
35501   emit_label (label);
35502   LABEL_NUSES (label) = 1;
35503 
35504   emit_move_insn (operand0, res);
35505 }
35506 
35507 /* Expand SSE sequence for computing round from OPERAND1 storing
35508    into OPERAND0.  */
35509 void
35510 ix86_expand_round (rtx operand0, rtx operand1)
35511 {
35512   /* C code for the stuff we're doing below:
35513         double xa = fabs (x);
35514         if (!isless (xa, TWO52))
35515           return x;
35516         xa = (double)(long)(xa + nextafter (0.5, 0.0));
35517         return copysign (xa, x);
35518    */
35519   enum machine_mode mode = GET_MODE (operand0);
35520   rtx res, TWO52, xa, label, xi, half, mask;
35521   const struct real_format *fmt;
35522   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35523 
35524   /* Temporary for holding the result, initialized to the input
35525      operand to ease control flow.  */
35526   res = gen_reg_rtx (mode);
35527   emit_move_insn (res, operand1);
35528 
35529   TWO52 = ix86_gen_TWO52 (mode);
35530   xa = ix86_expand_sse_fabs (res, &mask);
35531   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35532 
35533   /* load nextafter (0.5, 0.0) */
35534   fmt = REAL_MODE_FORMAT (mode);
35535   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35536   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35537 
35538   /* xa = xa + 0.5 */
35539   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35540   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35541 
35542   /* xa = (double)(int64_t)xa */
35543   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35544   expand_fix (xi, xa, 0);
35545   expand_float (xa, xi, 0);
35546 
35547   /* res = copysign (xa, operand1) */
35548   ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35549 
35550   emit_label (label);
35551   LABEL_NUSES (label) = 1;
35552 
35553   emit_move_insn (operand0, res);
35554 }
35555 
35556 /* Expand SSE sequence for computing round
35557    from OP1 storing into OP0 using sse4 round insn.  */
35558 void
35559 ix86_expand_round_sse4 (rtx op0, rtx op1)
35560 {
35561   enum machine_mode mode = GET_MODE (op0);
35562   rtx e1, e2, res, half;
35563   const struct real_format *fmt;
35564   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35565   rtx (*gen_copysign) (rtx, rtx, rtx);
35566   rtx (*gen_round) (rtx, rtx, rtx);
35567 
35568   switch (mode)
35569     {
35570     case SFmode:
35571       gen_copysign = gen_copysignsf3;
35572       gen_round = gen_sse4_1_roundsf2;
35573       break;
35574     case DFmode:
35575       gen_copysign = gen_copysigndf3;
35576       gen_round = gen_sse4_1_rounddf2;
35577       break;
35578     default:
35579       gcc_unreachable ();
35580     }
35581 
35582   /* round (a) = trunc (a + copysign (0.5, a)) */
35583 
35584   /* load nextafter (0.5, 0.0) */
35585   fmt = REAL_MODE_FORMAT (mode);
35586   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35587   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35588   half = const_double_from_real_value (pred_half, mode);
35589 
35590   /* e1 = copysign (0.5, op1) */
35591   e1 = gen_reg_rtx (mode);
35592   emit_insn (gen_copysign (e1, half, op1));
35593 
35594   /* e2 = op1 + e1 */
35595   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35596 
35597   /* res = trunc (e2) */
35598   res = gen_reg_rtx (mode);
35599   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35600 
35601   emit_move_insn (op0, res);
35602 }
35603 
35604 
35605 /* Table of valid machine attributes.  */
35606 static const struct attribute_spec ix86_attribute_table[] =
35607 {
35608   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35609        affects_type_identity } */
35610   /* Stdcall attribute says callee is responsible for popping arguments
35611      if they are not variable.  */
35612   { "stdcall",   0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35613     true },
35614   /* Fastcall attribute says callee is responsible for popping arguments
35615      if they are not variable.  */
35616   { "fastcall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35617     true },
35618   /* Thiscall attribute says callee is responsible for popping arguments
35619      if they are not variable.  */
35620   { "thiscall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35621     true },
35622   /* Cdecl attribute says the callee is a normal C declaration */
35623   { "cdecl",     0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35624     true },
35625   /* Regparm attribute specifies how many integer arguments are to be
35626      passed in registers.  */
35627   { "regparm",   1, 1, false, true,  true,  ix86_handle_cconv_attribute,
35628     true },
35629   /* Sseregparm attribute says we are using x86_64 calling conventions
35630      for FP arguments.  */
35631   { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35632     true },
35633   /* The transactional memory builtins are implicitly regparm or fastcall
35634      depending on the ABI.  Override the generic do-nothing attribute that
35635      these builtins were declared with.  */
35636   { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35637     true },
35638   /* force_align_arg_pointer says this function realigns the stack at entry.  */
35639   { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35640     false, true,  true, ix86_handle_cconv_attribute, false },
35641 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35642   { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35643   { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35644   { "shared",    0, 0, true,  false, false, ix86_handle_shared_attribute,
35645     false },
35646 #endif
35647   { "ms_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute,
35648     false },
35649   { "gcc_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute,
35650     false },
35651 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35652   SUBTARGET_ATTRIBUTE_TABLE,
35653 #endif
35654   /* ms_abi and sysv_abi calling convention function attributes.  */
35655   { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35656   { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35657   { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35658     false },
35659   { "callee_pop_aggregate_return", 1, 1, false, true, true,
35660     ix86_handle_callee_pop_aggregate_return, true },
35661   /* End element.  */
35662   { NULL,        0, 0, false, false, false, NULL, false }
35663 };
35664 
35665 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
35666 static int
35667 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35668                                  tree vectype ATTRIBUTE_UNUSED,
35669                                  int misalign ATTRIBUTE_UNUSED)
35670 {
35671   switch (type_of_cost)
35672     {
35673       case scalar_stmt:
35674         return ix86_cost->scalar_stmt_cost;
35675 
35676       case scalar_load:
35677         return ix86_cost->scalar_load_cost;
35678 
35679       case scalar_store:
35680         return ix86_cost->scalar_store_cost;
35681 
35682       case vector_stmt:
35683         return ix86_cost->vec_stmt_cost;
35684 
35685       case vector_load:
35686         return ix86_cost->vec_align_load_cost;
35687 
35688       case vector_store:
35689         return ix86_cost->vec_store_cost;
35690 
35691       case vec_to_scalar:
35692         return ix86_cost->vec_to_scalar_cost;
35693 
35694       case scalar_to_vec:
35695         return ix86_cost->scalar_to_vec_cost;
35696 
35697       case unaligned_load:
35698       case unaligned_store:
35699         return ix86_cost->vec_unalign_load_cost;
35700 
35701       case cond_branch_taken:
35702         return ix86_cost->cond_taken_branch_cost;
35703 
35704       case cond_branch_not_taken:
35705         return ix86_cost->cond_not_taken_branch_cost;
35706 
35707       case vec_perm:
35708       case vec_promote_demote:
35709         return ix86_cost->vec_stmt_cost;
35710 
35711       default:
35712         gcc_unreachable ();
35713     }
35714 }
35715 
35716 /* Construct (set target (vec_select op0 (parallel perm))) and
35717    return true if that's a valid instruction in the active ISA.  */
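
/* For example, with TARGET and OP0 in V4SImode and PERM = { 2, 3, 0, 1 }
   this emits (set target (vec_select:V4SI op0 (parallel [2 3 0 1]))),
   which recog matches as a single pshufd.  If no pattern matches, the
   just-emitted insn is removed again and false is returned so that the
   caller can try another strategy.  */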
35718 
35719 static bool
35720 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35721 {
35722   rtx rperm[MAX_VECT_LEN], x;
35723   unsigned i;
35724 
35725   for (i = 0; i < nelt; ++i)
35726     rperm[i] = GEN_INT (perm[i]);
35727 
35728   x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35729   x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35730   x = gen_rtx_SET (VOIDmode, target, x);
35731 
35732   x = emit_insn (x);
35733   if (recog_memoized (x) < 0)
35734     {
35735       remove_insn (x);
35736       return false;
35737     }
35738   return true;
35739 }
35740 
35741 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
35742 
35743 static bool
35744 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35745 			const unsigned char *perm, unsigned nelt)
35746 {
35747   enum machine_mode v2mode;
35748   rtx x;
35749 
35750   v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35751   x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35752   return expand_vselect (target, x, perm, nelt);
35753 }
35754 
35755 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
35756    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
35757 
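
/* For instance, with D->vmode == V8HImode and
   D->perm == { 0, 9, 2, 11, 4, 13, 6, 15 }, every element stays in its
   own position and the odd elements come from op1, so the loop below
   computes mask == 0xaa and a single pblendw with that immediate
   implements the whole permutation.  */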
35758 static bool
35759 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35760 {
35761   enum machine_mode vmode = d->vmode;
35762   unsigned i, mask, nelt = d->nelt;
35763   rtx target, op0, op1, x;
35764   rtx rperm[32], vperm;
35765 
35766   if (d->op0 == d->op1)
35767     return false;
35768   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35769     ;
35770   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35771     ;
35772   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35773     ;
35774   else
35775     return false;
35776 
35777   /* This is a blend, not a permute.  Elements must stay in their
35778      respective lanes.  */
35779   for (i = 0; i < nelt; ++i)
35780     {
35781       unsigned e = d->perm[i];
35782       if (!(e == i || e == i + nelt))
35783 	return false;
35784     }
35785 
35786   if (d->testing_p)
35787     return true;
35788 
35789   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
35790      decision should be extracted elsewhere, so that we only try that
35791      sequence once all budget==3 options have been tried.  */
35792   target = d->target;
35793   op0 = d->op0;
35794   op1 = d->op1;
35795   mask = 0;
35796 
35797   switch (vmode)
35798     {
35799     case V4DFmode:
35800     case V8SFmode:
35801     case V2DFmode:
35802     case V4SFmode:
35803     case V8HImode:
35804     case V8SImode:
35805       for (i = 0; i < nelt; ++i)
35806 	mask |= (d->perm[i] >= nelt) << i;
35807       break;
35808 
35809     case V2DImode:
35810       for (i = 0; i < 2; ++i)
35811 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35812       vmode = V8HImode;
35813       goto do_subreg;
35814 
35815     case V4SImode:
35816       for (i = 0; i < 4; ++i)
35817 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35818       vmode = V8HImode;
35819       goto do_subreg;
35820 
35821     case V16QImode:
35822       /* See if bytes move in pairs so we can use pblendw with
35823 	 an immediate argument, rather than pblendvb with a vector
35824 	 argument.  */
35825       for (i = 0; i < 16; i += 2)
35826 	if (d->perm[i] + 1 != d->perm[i + 1])
35827 	  {
35828 	  use_pblendvb:
35829 	    for (i = 0; i < nelt; ++i)
35830 	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35831 
35832 	  finish_pblendvb:
35833 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35834 	    vperm = force_reg (vmode, vperm);
35835 
35836 	    if (GET_MODE_SIZE (vmode) == 16)
35837 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35838 	    else
35839 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35840 	    return true;
35841 	  }
35842 
35843       for (i = 0; i < 8; ++i)
35844 	mask |= (d->perm[i * 2] >= 16) << i;
35845       vmode = V8HImode;
35846       /* FALLTHRU */
35847 
35848     do_subreg:
35849       target = gen_lowpart (vmode, target);
35850       op0 = gen_lowpart (vmode, op0);
35851       op1 = gen_lowpart (vmode, op1);
35852       break;
35853 
35854     case V32QImode:
35855       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
35856       for (i = 0; i < 32; i += 2)
35857 	if (d->perm[i] + 1 != d->perm[i + 1])
35858 	  goto use_pblendvb;
35859       /* See if bytes move in quadruplets.  If yes, vpblendd
35860 	 with immediate can be used.  */
35861       for (i = 0; i < 32; i += 4)
35862 	if (d->perm[i] + 2 != d->perm[i + 2])
35863 	  break;
35864       if (i < 32)
35865 	{
35866 	  /* See if bytes move the same in both lanes.  If yes,
35867 	     vpblendw with immediate can be used.  */
35868 	  for (i = 0; i < 16; i += 2)
35869 	    if (d->perm[i] + 16 != d->perm[i + 16])
35870 	      goto use_pblendvb;
35871 
35872 	  /* Use vpblendw.  */
35873 	  for (i = 0; i < 16; ++i)
35874 	    mask |= (d->perm[i * 2] >= 32) << i;
35875 	  vmode = V16HImode;
35876 	  goto do_subreg;
35877 	}
35878 
35879       /* Use vpblendd.  */
35880       for (i = 0; i < 8; ++i)
35881 	mask |= (d->perm[i * 4] >= 32) << i;
35882       vmode = V8SImode;
35883       goto do_subreg;
35884 
35885     case V16HImode:
35886       /* See if words move in pairs.  If yes, vpblendd can be used.  */
35887       for (i = 0; i < 16; i += 2)
35888 	if (d->perm[i] + 1 != d->perm[i + 1])
35889 	  break;
35890       if (i < 16)
35891 	{
35892 	  /* See if words move the same in both lanes.  If not,
35893 	     vpblendvb must be used.  */
35894 	  for (i = 0; i < 8; i++)
35895 	    if (d->perm[i] + 8 != d->perm[i + 8])
35896 	      {
35897 		/* Use vpblendvb.  */
35898 		for (i = 0; i < 32; ++i)
35899 		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35900 
35901 		vmode = V32QImode;
35902 		nelt = 32;
35903 		target = gen_lowpart (vmode, target);
35904 		op0 = gen_lowpart (vmode, op0);
35905 		op1 = gen_lowpart (vmode, op1);
35906 		goto finish_pblendvb;
35907 	      }
35908 
35909 	  /* Use vpblendw.  */
35910 	  for (i = 0; i < 16; ++i)
35911 	    mask |= (d->perm[i] >= 16) << i;
35912 	  break;
35913 	}
35914 
35915       /* Use vpblendd.  */
35916       for (i = 0; i < 8; ++i)
35917 	mask |= (d->perm[i * 2] >= 16) << i;
35918       vmode = V8SImode;
35919       goto do_subreg;
35920 
35921     case V4DImode:
35922       /* Use vpblendd.  */
35923       for (i = 0; i < 4; ++i)
35924 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35925       vmode = V8SImode;
35926       goto do_subreg;
35927 
35928     default:
35929       gcc_unreachable ();
35930     }
35931 
35932   /* This matches five different patterns with the different modes.  */
35933   x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35934   x = gen_rtx_SET (VOIDmode, target, x);
35935   emit_insn (x);
35936 
35937   return true;
35938 }
35939 
35940 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
35941    in terms of the variable form of vpermilps.
35942 
35943    Note that we will have already failed the immediate input vpermilps,
35944    which requires that the high and low part shuffle be identical; the
35945    variable form doesn't require that.  */
35946 
35947 static bool
35948 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35949 {
35950   rtx rperm[8], vperm;
35951   unsigned i;
35952 
35953   if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35954     return false;
35955 
35956   /* We can only permute within the 128-bit lane.  */
35957   for (i = 0; i < 8; ++i)
35958     {
35959       unsigned e = d->perm[i];
35960       if (i < 4 ? e >= 4 : e < 4)
35961 	return false;
35962     }
35963 
35964   if (d->testing_p)
35965     return true;
35966 
35967   for (i = 0; i < 8; ++i)
35968     {
35969       unsigned e = d->perm[i];
35970 
35971       /* Within each 128-bit lane, the elements of op0 are numbered
35972 	 from 0 and the elements of op1 are numbered from 4.  */
35973       if (e >= 8 + 4)
35974 	e -= 8;
35975       else if (e >= 4)
35976 	e -= 4;
35977 
35978       rperm[i] = GEN_INT (e);
35979     }
35980 
35981   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35982   vperm = force_reg (V8SImode, vperm);
35983   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35984 
35985   return true;
35986 }
35987 
35988 /* Return true if permutation D can be performed as VMODE permutation
35989    instead.  */
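
/* For example, a V16QImode permutation that only moves bytes in aligned
   groups of four, such as
     { 4, 5, 6, 7,  0, 1, 2, 3,  12, 13, 14, 15,  8, 9, 10, 11 },
   is accepted for VMODE == V4SImode, since it is equivalent to the
   V4SImode permutation { 1, 0, 3, 2 }.  */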
35990 
35991 static bool
35992 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35993 {
35994   unsigned int i, j, chunk;
35995 
35996   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35997       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35998       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35999     return false;
36000 
36001   if (GET_MODE_NUNITS (vmode) >= d->nelt)
36002     return true;
36003 
36004   chunk = d->nelt / GET_MODE_NUNITS (vmode);
36005   for (i = 0; i < d->nelt; i += chunk)
36006     if (d->perm[i] & (chunk - 1))
36007       return false;
36008     else
36009       for (j = 1; j < chunk; ++j)
36010 	if (d->perm[i] + j != d->perm[i + j])
36011 	  return false;
36012 
36013   return true;
36014 }
36015 
36016 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
36017    in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128.  */
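
/* In the simplest case -- a single 16-byte operand with SSSE3 -- the
   pshufb control vector is just the permutation itself: control byte i
   selects source byte D->perm[i] & 15, so e.g. the byte reversal
   { 15, 14, ..., 1, 0 } becomes one pshufb with a constant control
   vector.  The two-operand and 32-byte cases below need more care.  */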
36018 
36019 static bool
36020 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36021 {
36022   unsigned i, nelt, eltsz, mask;
36023   unsigned char perm[32];
36024   enum machine_mode vmode = V16QImode;
36025   rtx rperm[32], vperm, target, op0, op1;
36026 
36027   nelt = d->nelt;
36028 
36029   if (d->op0 != d->op1)
36030     {
36031       if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36032 	{
36033 	  if (TARGET_AVX2
36034 	      && valid_perm_using_mode_p (V2TImode, d))
36035 	    {
36036 	      if (d->testing_p)
36037 		return true;
36038 
36039 	      /* Use vperm2i128 insn.  The pattern uses
36040 		 V4DImode instead of V2TImode.  */
36041 	      target = gen_lowpart (V4DImode, d->target);
36042 	      op0 = gen_lowpart (V4DImode, d->op0);
36043 	      op1 = gen_lowpart (V4DImode, d->op1);
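	      /* The vperm2i128 immediate selects one 128-bit chunk of
		 the concatenated sources for each half of the result:
		 bits 0-1 pick the low half, bits 4-5 the high half.  */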
36044 	      rperm[0]
36045 		= GEN_INT ((d->perm[0] / (nelt / 2))
36046 			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
36047 	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36048 	      return true;
36049 	    }
36050 	  return false;
36051 	}
36052     }
36053   else
36054     {
36055       if (GET_MODE_SIZE (d->vmode) == 16)
36056 	{
36057 	  if (!TARGET_SSSE3)
36058 	    return false;
36059 	}
36060       else if (GET_MODE_SIZE (d->vmode) == 32)
36061 	{
36062 	  if (!TARGET_AVX2)
36063 	    return false;
36064 
36065 	  /* V4DImode should be already handled through
36066 	     expand_vselect by vpermq instruction.  */
36067 	  gcc_assert (d->vmode != V4DImode);
36068 
36069 	  vmode = V32QImode;
36070 	  if (d->vmode == V8SImode
36071 	      || d->vmode == V16HImode
36072 	      || d->vmode == V32QImode)
36073 	    {
36074 	      /* First see if vpermq can be used for
36075 		 V8SImode/V16HImode/V32QImode.  */
36076 	      if (valid_perm_using_mode_p (V4DImode, d))
36077 		{
36078 		  for (i = 0; i < 4; i++)
36079 		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36080 		  if (d->testing_p)
36081 		    return true;
36082 		  return expand_vselect (gen_lowpart (V4DImode, d->target),
36083 					 gen_lowpart (V4DImode, d->op0),
36084 					 perm, 4);
36085 		}
36086 
36087 	      /* Next see if vpermd can be used.  */
36088 	      if (valid_perm_using_mode_p (V8SImode, d))
36089 		vmode = V8SImode;
36090 	    }
36091 
36092 	  if (vmode == V32QImode)
36093 	    {
36094 	      /* vpshufb only works within 128-bit lanes; it is not
36095 		 possible to shuffle bytes across lanes.  */
36096 	      for (i = 0; i < nelt; ++i)
36097 		if ((d->perm[i] ^ i) & (nelt / 2))
36098 		  return false;
36099 	    }
36100 	}
36101       else
36102 	return false;
36103     }
36104 
36105   if (d->testing_p)
36106     return true;
36107 
36108   if (vmode == V8SImode)
36109     for (i = 0; i < 8; ++i)
36110       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36111   else
36112     {
36113       eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36114       if (d->op0 != d->op1)
36115 	mask = 2 * nelt - 1;
36116       else if (vmode == V16QImode)
36117 	mask = nelt - 1;
36118       else
36119 	mask = nelt / 2 - 1;
36120 
36121       for (i = 0; i < nelt; ++i)
36122 	{
36123 	  unsigned j, e = d->perm[i] & mask;
36124 	  for (j = 0; j < eltsz; ++j)
36125 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36126 	}
36127     }
36128 
36129   vperm = gen_rtx_CONST_VECTOR (vmode,
36130 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36131   vperm = force_reg (vmode, vperm);
36132 
36133   target = gen_lowpart (vmode, d->target);
36134   op0 = gen_lowpart (vmode, d->op0);
36135   if (d->op0 == d->op1)
36136     {
36137       if (vmode == V16QImode)
36138 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36139       else if (vmode == V32QImode)
36140 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36141       else
36142 	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
36143     }
36144   else
36145     {
36146       op1 = gen_lowpart (vmode, d->op1);
36147       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36148     }
36149 
36150   return true;
36151 }
36152 
36153 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
36154    in a single instruction.  */
36155 
36156 static bool
36157 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36158 {
36159   unsigned i, nelt = d->nelt;
36160   unsigned char perm2[MAX_VECT_LEN];
36161 
36162   /* Check plain VEC_SELECT first, because AVX has instructions that could
36163      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36164      input where SEL+CONCAT may not.  */
36165   if (d->op0 == d->op1)
36166     {
36167       int mask = nelt - 1;
36168       bool identity_perm = true;
36169       bool broadcast_perm = true;
36170 
36171       for (i = 0; i < nelt; i++)
36172 	{
36173 	  perm2[i] = d->perm[i] & mask;
36174 	  if (perm2[i] != i)
36175 	    identity_perm = false;
36176 	  if (perm2[i])
36177 	    broadcast_perm = false;
36178 	}
36179 
36180       if (identity_perm)
36181 	{
36182 	  if (!d->testing_p)
36183 	    emit_move_insn (d->target, d->op0);
36184 	  return true;
36185 	}
36186       else if (broadcast_perm && TARGET_AVX2)
36187 	{
36188 	  /* Use vpbroadcast{b,w,d}.  */
36189 	  rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
36190 	  switch (d->vmode)
36191 	    {
36192 	    case V32QImode:
36193 	      op = gen_lowpart (V16QImode, op);
36194 	      gen = gen_avx2_pbroadcastv32qi;
36195 	      break;
36196 	    case V16HImode:
36197 	      op = gen_lowpart (V8HImode, op);
36198 	      gen = gen_avx2_pbroadcastv16hi;
36199 	      break;
36200 	    case V8SImode:
36201 	      op = gen_lowpart (V4SImode, op);
36202 	      gen = gen_avx2_pbroadcastv8si;
36203 	      break;
36204 	    case V16QImode:
36205 	      gen = gen_avx2_pbroadcastv16qi;
36206 	      break;
36207 	    case V8HImode:
36208 	      gen = gen_avx2_pbroadcastv8hi;
36209 	      break;
36210 	    /* For other modes prefer other shuffles this function creates.  */
36211 	    default: break;
36212 	    }
36213 	  if (gen != NULL)
36214 	    {
36215 	      if (!d->testing_p)
36216 		emit_insn (gen (d->target, op));
36217 	      return true;
36218 	    }
36219 	}
36220 
36221       if (expand_vselect (d->target, d->op0, perm2, nelt))
36222 	return true;
36223 
36224       /* There are plenty of patterns in sse.md that are written for
36225 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
36226 	 that should be changed, to avoid the nastiness here.  */
36227 
36228       /* Recognize interleave style patterns, which means incrementing
36229 	 every other permutation operand.  */
36230       for (i = 0; i < nelt; i += 2)
36231 	{
36232 	  perm2[i] = d->perm[i] & mask;
36233 	  perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36234 	}
36235       if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36236 	return true;
36237 
36238       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
36239       if (nelt >= 4)
36240 	{
36241 	  for (i = 0; i < nelt; i += 4)
36242 	    {
36243 	      perm2[i + 0] = d->perm[i + 0] & mask;
36244 	      perm2[i + 1] = d->perm[i + 1] & mask;
36245 	      perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36246 	      perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36247 	    }
36248 
36249 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36250 	    return true;
36251 	}
36252     }
36253 
36254   /* Finally, try the fully general two operand permute.  */
36255   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
36256     return true;
36257 
36258   /* Recognize interleave style patterns with reversed operands.  */
36259   if (d->op0 != d->op1)
36260     {
36261       for (i = 0; i < nelt; ++i)
36262 	{
36263 	  unsigned e = d->perm[i];
36264 	  if (e >= nelt)
36265 	    e -= nelt;
36266 	  else
36267 	    e += nelt;
36268 	  perm2[i] = e;
36269 	}
36270 
36271       if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
36272 	return true;
36273     }
36274 
36275   /* Try the SSE4.1 blend variable merge instructions.  */
36276   if (expand_vec_perm_blend (d))
36277     return true;
36278 
36279   /* Try one of the AVX vpermil variable permutations.  */
36280   if (expand_vec_perm_vpermil (d))
36281     return true;
36282 
36283   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36284      vpshufb, vpermd or vpermq variable permutation.  */
36285   if (expand_vec_perm_pshufb (d))
36286     return true;
36287 
36288   return false;
36289 }
36290 
36291 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
36292    in terms of a pair of pshuflw + pshufhw instructions.  */
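
/* For example, D->perm == { 3, 1, 2, 0, 5, 7, 6, 4 } keeps the low four
   words within the low 64 bits and the high four within the high 64
   bits, so it is emitted as a pshuflw that reorders the low half
   followed by a pshufhw that reorders the high half.  */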
36293 
36294 static bool
36295 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36296 {
36297   unsigned char perm2[MAX_VECT_LEN];
36298   unsigned i;
36299   bool ok;
36300 
36301   if (d->vmode != V8HImode || d->op0 != d->op1)
36302     return false;
36303 
36304   /* The two permutations only operate in 64-bit lanes.  */
36305   for (i = 0; i < 4; ++i)
36306     if (d->perm[i] >= 4)
36307       return false;
36308   for (i = 4; i < 8; ++i)
36309     if (d->perm[i] < 4)
36310       return false;
36311 
36312   if (d->testing_p)
36313     return true;
36314 
36315   /* Emit the pshuflw.  */
36316   memcpy (perm2, d->perm, 4);
36317   for (i = 4; i < 8; ++i)
36318     perm2[i] = i;
36319   ok = expand_vselect (d->target, d->op0, perm2, 8);
36320   gcc_assert (ok);
36321 
36322   /* Emit the pshufhw.  */
36323   memcpy (perm2 + 4, d->perm + 4, 4);
36324   for (i = 0; i < 4; ++i)
36325     perm2[i] = i;
36326   ok = expand_vselect (d->target, d->target, perm2, 8);
36327   gcc_assert (ok);
36328 
36329   return true;
36330 }
36331 
36332 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36333    the permutation using the SSSE3 palignr instruction.  This succeeds
36334    when all of the elements in PERM fit within one vector and we merely
36335    need to shift them down so that a single vector permutation has a
36336    chance to succeed.  */
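
/* For example, a V16QImode permutation selecting bytes 5 .. 20 of the
   concatenated operands (min == 5, max - min < 16) first emits a palignr
   that shifts the op1:op0 pair down by 5 bytes; the indices are then
   rebased to start at 0 and, unless the result is already the identity,
   handed to expand_vec_perm_1, which with SSSE3 can always finish the
   job (typically with a pshufb).  */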
36337 
36338 static bool
36339 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36340 {
36341   unsigned i, nelt = d->nelt;
36342   unsigned min, max;
36343   bool in_order, ok;
36344   rtx shift;
36345 
36346   /* Even with AVX, palignr only operates on 128-bit vectors.  */
36347   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36348     return false;
36349 
36350   min = nelt, max = 0;
36351   for (i = 0; i < nelt; ++i)
36352     {
36353       unsigned e = d->perm[i];
36354       if (e < min)
36355 	min = e;
36356       if (e > max)
36357 	max = e;
36358     }
36359   if (min == 0 || max - min >= nelt)
36360     return false;
36361 
36362   /* Given that we have SSSE3, we know we'll be able to implement the
36363      single operand permutation after the palignr with pshufb.  */
36364   if (d->testing_p)
36365     return true;
36366 
36367   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36368   emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36369 				  gen_lowpart (TImode, d->op1),
36370 				  gen_lowpart (TImode, d->op0), shift));
36371 
36372   d->op0 = d->op1 = d->target;
36373 
36374   in_order = true;
36375   for (i = 0; i < nelt; ++i)
36376     {
36377       unsigned e = d->perm[i] - min;
36378       if (e != i)
36379 	in_order = false;
36380       d->perm[i] = e;
36381     }
36382 
36383   /* Test for the degenerate case where the alignment by itself
36384      produces the desired permutation.  */
36385   if (in_order)
36386     return true;
36387 
36388   ok = expand_vec_perm_1 (d);
36389   gcc_assert (ok);
36390 
36391   return ok;
36392 }
36393 
36394 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36395 
36396 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36397    a two vector permutation into a single vector permutation by using
36398    an interleave operation to merge the vectors.  */
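
/* As a small V4SImode example, D->perm == { 0, 4, 1, 5 } uses only the
   low halves of the two inputs, so the code below emits punpckldq as the
   "dremap" step; remapping the original indices through that interleave
   leaves an identity permutation for the final step, so essentially one
   punpckldq does the whole job.  */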
36399 
36400 static bool
36401 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36402 {
36403   struct expand_vec_perm_d dremap, dfinal;
36404   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36405   unsigned HOST_WIDE_INT contents;
36406   unsigned char remap[2 * MAX_VECT_LEN];
36407   rtx seq;
36408   bool ok, same_halves = false;
36409 
36410   if (GET_MODE_SIZE (d->vmode) == 16)
36411     {
36412       if (d->op0 == d->op1)
36413 	return false;
36414     }
36415   else if (GET_MODE_SIZE (d->vmode) == 32)
36416     {
36417       if (!TARGET_AVX)
36418 	return false;
36419       /* For 32-byte modes allow even d->op0 == d->op1.
36420 	 The lack of cross-lane shuffling in some instructions
36421 	 might prevent a single insn shuffle.  */
36422       dfinal = *d;
36423       dfinal.testing_p = true;
36424       /* If expand_vec_perm_interleave3 can expand this into
36425       /* If expand_vec_perm_interleave3 can expand this into
36426 	 a 3-insn sequence, give up and let it be expanded that
36427 	 way.  While that is one insn longer, it doesn't need a
36428 	 memory operand, and in the common case where the interleave
36429 	 low and interleave high permutations of the same operands
36430 	 are adjacent, the pair needs only 4 insns after CSE.  */
36432 	return false;
36433     }
36434   else
36435     return false;
36436 
36437   /* Examine where the elements come from.  */
36438   contents = 0;
36439   for (i = 0; i < nelt; ++i)
36440     contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36441 
36442   memset (remap, 0xff, sizeof (remap));
36443   dremap = *d;
36444 
36445   if (GET_MODE_SIZE (d->vmode) == 16)
36446     {
36447       unsigned HOST_WIDE_INT h1, h2, h3, h4;
36448 
36449       /* Split the two input vectors into 4 halves.  */
36450       h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36451       h2 = h1 << nelt2;
36452       h3 = h2 << nelt2;
36453       h4 = h3 << nelt2;
36454 
36455       /* If all elements come from the low halves, use interleave low;
36456 	 similarly for interleave high.  If the elements come from mismatched
36457 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
36458       if ((contents & (h1 | h3)) == contents)
36459 	{
36460 	  /* punpckl* */
36461 	  for (i = 0; i < nelt2; ++i)
36462 	    {
36463 	      remap[i] = i * 2;
36464 	      remap[i + nelt] = i * 2 + 1;
36465 	      dremap.perm[i * 2] = i;
36466 	      dremap.perm[i * 2 + 1] = i + nelt;
36467 	    }
36468 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
36469 	    dremap.vmode = V4SFmode;
36470 	}
36471       else if ((contents & (h2 | h4)) == contents)
36472 	{
36473 	  /* punpckh* */
36474 	  for (i = 0; i < nelt2; ++i)
36475 	    {
36476 	      remap[i + nelt2] = i * 2;
36477 	      remap[i + nelt + nelt2] = i * 2 + 1;
36478 	      dremap.perm[i * 2] = i + nelt2;
36479 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36480 	    }
36481 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
36482 	    dremap.vmode = V4SFmode;
36483 	}
36484       else if ((contents & (h1 | h4)) == contents)
36485 	{
36486 	  /* shufps */
36487 	  for (i = 0; i < nelt2; ++i)
36488 	    {
36489 	      remap[i] = i;
36490 	      remap[i + nelt + nelt2] = i + nelt2;
36491 	      dremap.perm[i] = i;
36492 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
36493 	    }
36494 	  if (nelt != 4)
36495 	    {
36496 	      /* shufpd */
36497 	      dremap.vmode = V2DImode;
36498 	      dremap.nelt = 2;
36499 	      dremap.perm[0] = 0;
36500 	      dremap.perm[1] = 3;
36501 	    }
36502 	}
36503       else if ((contents & (h2 | h3)) == contents)
36504 	{
36505 	  /* shufps */
36506 	  for (i = 0; i < nelt2; ++i)
36507 	    {
36508 	      remap[i + nelt2] = i;
36509 	      remap[i + nelt] = i + nelt2;
36510 	      dremap.perm[i] = i + nelt2;
36511 	      dremap.perm[i + nelt2] = i + nelt;
36512 	    }
36513 	  if (nelt != 4)
36514 	    {
36515 	      /* shufpd */
36516 	      dremap.vmode = V2DImode;
36517 	      dremap.nelt = 2;
36518 	      dremap.perm[0] = 1;
36519 	      dremap.perm[1] = 2;
36520 	    }
36521 	}
36522       else
36523 	return false;
36524     }
36525   else
36526     {
36527       unsigned int nelt4 = nelt / 4, nzcnt = 0;
36528       unsigned HOST_WIDE_INT q[8];
36529       unsigned int nonzero_halves[4];
36530 
36531       /* Split the two input vectors into 8 quarters.  */
36532       q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36533       for (i = 1; i < 8; ++i)
36534 	q[i] = q[0] << (nelt4 * i);
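      /* E.g. for V8SImode (nelt == 8, nelt4 == 2) q[0] == 0x0003 and
	 q[7] == 0xc000; q[0..3] cover the four 64-bit quarters of op0
	 and q[4..7] those of op1.  */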
36535       for (i = 0; i < 4; ++i)
36536 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36537 	  {
36538 	    nonzero_halves[nzcnt] = i;
36539 	    ++nzcnt;
36540 	  }
36541 
36542       if (nzcnt == 1)
36543 	{
36544 	  gcc_assert (d->op0 == d->op1);
36545 	  nonzero_halves[1] = nonzero_halves[0];
36546 	  same_halves = true;
36547 	}
36548       else if (d->op0 == d->op1)
36549 	{
36550 	  gcc_assert (nonzero_halves[0] == 0);
36551 	  gcc_assert (nonzero_halves[1] == 1);
36552 	}
36553 
36554       if (nzcnt <= 2)
36555 	{
36556 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
36557 	    {
36558 	      /* Attempt to increase the likelihood that the dfinal
36559 		 shuffle will be intra-lane.  */
36560 	      char tmph = nonzero_halves[0];
36561 	      nonzero_halves[0] = nonzero_halves[1];
36562 	      nonzero_halves[1] = tmph;
36563 	    }
36564 
36565 	  /* vperm2f128 or vperm2i128.  */
36566 	  for (i = 0; i < nelt2; ++i)
36567 	    {
36568 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36569 	      remap[i + nonzero_halves[0] * nelt2] = i;
36570 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36571 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36572 	    }
36573 
36574 	  if (d->vmode != V8SFmode
36575 	      && d->vmode != V4DFmode
36576 	      && d->vmode != V8SImode)
36577 	    {
36578 	      dremap.vmode = V8SImode;
36579 	      dremap.nelt = 8;
36580 	      for (i = 0; i < 4; ++i)
36581 		{
36582 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
36583 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36584 		}
36585 	    }
36586 	}
36587       else if (d->op0 == d->op1)
36588 	return false;
36589       else if (TARGET_AVX2
36590 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36591 	{
36592 	  /* vpunpckl* */
36593 	  for (i = 0; i < nelt4; ++i)
36594 	    {
36595 	      remap[i] = i * 2;
36596 	      remap[i + nelt] = i * 2 + 1;
36597 	      remap[i + nelt2] = i * 2 + nelt2;
36598 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36599 	      dremap.perm[i * 2] = i;
36600 	      dremap.perm[i * 2 + 1] = i + nelt;
36601 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
36602 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36603 	    }
36604 	}
36605       else if (TARGET_AVX2
36606 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36607 	{
36608 	  /* vpunpckh* */
36609 	  for (i = 0; i < nelt4; ++i)
36610 	    {
36611 	      remap[i + nelt4] = i * 2;
36612 	      remap[i + nelt + nelt4] = i * 2 + 1;
36613 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36614 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36615 	      dremap.perm[i * 2] = i + nelt4;
36616 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36617 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36618 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36619 	    }
36620 	}
36621       else
36622 	return false;
36623     }
36624 
36625   /* Use the remapping array set up above to move the elements from their
36626      swizzled locations into their final destinations.  */
36627   dfinal = *d;
36628   for (i = 0; i < nelt; ++i)
36629     {
36630       unsigned e = remap[d->perm[i]];
36631       gcc_assert (e < nelt);
36632       /* If same_halves is true, both halves of the remapped vector are the
36633 	 same.  Avoid cross-lane accesses if possible.  */
36634       if (same_halves && i >= nelt2)
36635 	{
36636 	  gcc_assert (e < nelt2);
36637 	  dfinal.perm[i] = e + nelt2;
36638 	}
36639       else
36640 	dfinal.perm[i] = e;
36641     }
36642   dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36643   dfinal.op1 = dfinal.op0;
36644   dremap.target = dfinal.op0;
36645 
36646   /* Test if the final remap can be done with a single insn.  For V4SFmode or
36647      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
36648   start_sequence ();
36649   ok = expand_vec_perm_1 (&dfinal);
36650   seq = get_insns ();
36651   end_sequence ();
36652 
36653   if (!ok)
36654     return false;
36655 
36656   if (d->testing_p)
36657     return true;
36658 
36659   if (dremap.vmode != dfinal.vmode)
36660     {
36661       dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36662       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36663       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36664     }
36665 
36666   ok = expand_vec_perm_1 (&dremap);
36667   gcc_assert (ok);
36668 
36669   emit_insn (seq);
36670   return true;
36671 }
36672 
36673 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36674    a single vector cross-lane permutation into vpermq followed
36675    by any of the single insn permutations.  */
36676 
36677 static bool
36678 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36679 {
36680   struct expand_vec_perm_d dremap, dfinal;
36681   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36682   unsigned contents[2];
36683   bool ok;
36684 
36685   if (!(TARGET_AVX2
36686 	&& (d->vmode == V32QImode || d->vmode == V16HImode)
36687 	&& d->op0 == d->op1))
36688     return false;
36689 
36690   contents[0] = 0;
36691   contents[1] = 0;
36692   for (i = 0; i < nelt2; ++i)
36693     {
36694       contents[0] |= 1u << (d->perm[i] / nelt4);
36695       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36696     }
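  /* contents[i] now has bit j set iff half i of the result uses the
     j-th 64-bit quarter of the input; the vpermq built below can supply
     at most two distinct quarters per 128-bit half, so give up when a
     half needs more than that.  */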
36697 
36698   for (i = 0; i < 2; ++i)
36699     {
36700       unsigned int cnt = 0;
36701       for (j = 0; j < 4; ++j)
36702 	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36703 	  return false;
36704     }
36705 
36706   if (d->testing_p)
36707     return true;
36708 
36709   dremap = *d;
36710   dremap.vmode = V4DImode;
36711   dremap.nelt = 4;
36712   dremap.target = gen_reg_rtx (V4DImode);
36713   dremap.op0 = gen_lowpart (V4DImode, d->op0);
36714   dremap.op1 = dremap.op0;
36715   for (i = 0; i < 2; ++i)
36716     {
36717       unsigned int cnt = 0;
36718       for (j = 0; j < 4; ++j)
36719 	if ((contents[i] & (1u << j)) != 0)
36720 	  dremap.perm[2 * i + cnt++] = j;
36721       for (; cnt < 2; ++cnt)
36722 	dremap.perm[2 * i + cnt] = 0;
36723     }
36724 
36725   dfinal = *d;
36726   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36727   dfinal.op1 = dfinal.op0;
36728   for (i = 0, j = 0; i < nelt; ++i)
36729     {
36730       if (i == nelt2)
36731 	j = 2;
36732       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36733       if ((d->perm[i] / nelt4) == dremap.perm[j])
36734 	;
36735       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36736 	dfinal.perm[i] |= nelt4;
36737       else
36738 	gcc_unreachable ();
36739     }
36740 
36741   ok = expand_vec_perm_1 (&dremap);
36742   gcc_assert (ok);
36743 
36744   ok = expand_vec_perm_1 (&dfinal);
36745   gcc_assert (ok);
36746 
36747   return true;
36748 }
36749 
36750 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36751    a two vector permutation using 2 intra-lane interleave insns
36752    and cross-lane shuffle for 32-byte vectors.  */
36753 
36754 static bool
36755 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36756 {
36757   unsigned i, nelt;
36758   rtx (*gen) (rtx, rtx, rtx);
36759 
36760   if (d->op0 == d->op1)
36761     return false;
36762   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36763     ;
36764   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36765     ;
36766   else
36767     return false;
36768 
36769   nelt = d->nelt;
36770   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36771     return false;
36772   for (i = 0; i < nelt; i += 2)
36773     if (d->perm[i] != d->perm[0] + i / 2
36774 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36775       return false;
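  /* At this point the permutation is a full interleave of op0 and op1,
     e.g. { 0 8 1 9 2 10 3 11 } (low) or { 4 12 5 13 6 14 7 15 } (high)
     for V8SImode.  */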
36776 
36777   if (d->testing_p)
36778     return true;
36779 
36780   switch (d->vmode)
36781     {
36782     case V32QImode:
36783       if (d->perm[0])
36784 	gen = gen_vec_interleave_highv32qi;
36785       else
36786 	gen = gen_vec_interleave_lowv32qi;
36787       break;
36788     case V16HImode:
36789       if (d->perm[0])
36790 	gen = gen_vec_interleave_highv16hi;
36791       else
36792 	gen = gen_vec_interleave_lowv16hi;
36793       break;
36794     case V8SImode:
36795       if (d->perm[0])
36796 	gen = gen_vec_interleave_highv8si;
36797       else
36798 	gen = gen_vec_interleave_lowv8si;
36799       break;
36800     case V4DImode:
36801       if (d->perm[0])
36802 	gen = gen_vec_interleave_highv4di;
36803       else
36804 	gen = gen_vec_interleave_lowv4di;
36805       break;
36806     case V8SFmode:
36807       if (d->perm[0])
36808 	gen = gen_vec_interleave_highv8sf;
36809       else
36810 	gen = gen_vec_interleave_lowv8sf;
36811       break;
36812     case V4DFmode:
36813       if (d->perm[0])
36814 	gen = gen_vec_interleave_highv4df;
36815       else
36816 	gen = gen_vec_interleave_lowv4df;
36817       break;
36818     default:
36819       gcc_unreachable ();
36820     }
36821 
36822   emit_insn (gen (d->target, d->op0, d->op1));
36823   return true;
36824 }
36825 
36826 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
36827    permutation with two pshufb insns and an ior.  We should have already
36828    failed all two instruction sequences.  */
36829 
36830 static bool
36831 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36832 {
36833   rtx rperm[2][16], vperm, l, h, op, m128;
36834   unsigned int i, nelt, eltsz;
36835 
36836   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36837     return false;
36838   gcc_assert (d->op0 != d->op1);
36839 
36840   nelt = d->nelt;
36841   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36842 
36843   /* Generate two permutation masks.  If the required element is within
36844      the given vector it is shuffled into the proper lane.  If the required
36845      element is in the other vector, force a zero into the lane by setting
36846      bit 7 in the permutation mask.  */
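  /* E.g. for a V16QImode extract-even permutation { 0 2 4 ... 30 } the
     two masks become { 0 2 ... 14 -128 ... -128 } for op0 and
     { -128 ... -128 0 2 ... 14 } for op1, and the ior below merges the
     two pshufb results.  */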
36847   m128 = GEN_INT (-128);
36848   for (i = 0; i < nelt; ++i)
36849     {
36850       unsigned j, e = d->perm[i];
36851       unsigned which = (e >= nelt);
36852       if (e >= nelt)
36853 	e -= nelt;
36854 
36855       for (j = 0; j < eltsz; ++j)
36856 	{
36857 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36858 	  rperm[1-which][i*eltsz + j] = m128;
36859 	}
36860     }
36861 
36862   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36863   vperm = force_reg (V16QImode, vperm);
36864 
36865   l = gen_reg_rtx (V16QImode);
36866   op = gen_lowpart (V16QImode, d->op0);
36867   emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36868 
36869   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36870   vperm = force_reg (V16QImode, vperm);
36871 
36872   h = gen_reg_rtx (V16QImode);
36873   op = gen_lowpart (V16QImode, d->op1);
36874   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36875 
36876   op = gen_lowpart (V16QImode, d->target);
36877   emit_insn (gen_iorv16qi3 (op, l, h));
36878 
36879   return true;
36880 }
36881 
36882 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36883    with two vpshufb insns, vpermq and vpor.  We should have already failed
36884    all two or three instruction sequences.  */
36885 
36886 static bool
36887 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36888 {
36889   rtx rperm[2][32], vperm, l, h, hp, op, m128;
36890   unsigned int i, nelt, eltsz;
36891 
36892   if (!TARGET_AVX2
36893       || d->op0 != d->op1
36894       || (d->vmode != V32QImode && d->vmode != V16HImode))
36895     return false;
36896 
36897   if (d->testing_p)
36898     return true;
36899 
36900   nelt = d->nelt;
36901   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36902 
36903   /* Generate two permutation masks.  If the required element is within
36904      the same lane, it is shuffled in.  If the required element is from
36905      the other lane, force a zero by setting bit 7 in the permutation
36906      mask.  The other mask has non-negative elements where an element is
36907      requested from the other lane, but those elements are also moved to
36908      the other lane, so that the result of vpshufb can have its two
36909      V2TImode halves swapped.  */
36910   m128 = GEN_INT (-128);
36911   for (i = 0; i < nelt; ++i)
36912     {
36913       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36914       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36915 
36916       for (j = 0; j < eltsz; ++j)
36917 	{
36918 	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36919 	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36920 	}
36921     }
36922 
36923   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36924   vperm = force_reg (V32QImode, vperm);
36925 
36926   h = gen_reg_rtx (V32QImode);
36927   op = gen_lowpart (V32QImode, d->op0);
36928   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36929 
36930   /* Swap the 128-bit lanes of h into hp.  */
36931   hp = gen_reg_rtx (V4DImode);
36932   op = gen_lowpart (V4DImode, h);
36933   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36934 				  const1_rtx));
36935 
36936   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36937   vperm = force_reg (V32QImode, vperm);
36938 
36939   l = gen_reg_rtx (V32QImode);
36940   op = gen_lowpart (V32QImode, d->op0);
36941   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36942 
36943   op = gen_lowpart (V32QImode, d->target);
36944   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36945 
36946   return true;
36947 }
36948 
36949 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
36950    and extract-odd permutations of two V32QImode or V16HImode operands
36951    with two vpshufb insns, vpor and vpermq.  We should have already
36952    failed all two or three instruction sequences.  */
36953 
36954 static bool
36955 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36956 {
36957   rtx rperm[2][32], vperm, l, h, ior, op, m128;
36958   unsigned int i, nelt, eltsz;
36959 
36960   if (!TARGET_AVX2
36961       || d->op0 == d->op1
36962       || (d->vmode != V32QImode && d->vmode != V16HImode))
36963     return false;
36964 
36965   for (i = 0; i < d->nelt; ++i)
36966     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36967       return false;
36968 
36969   if (d->testing_p)
36970     return true;
36971 
36972   nelt = d->nelt;
36973   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36974 
36975   /* Generate two permutation masks.  In the first permutation mask
36976      the first quarter will contain indexes for the first half
36977      of the op0, the second quarter will contain bit 7 set, third quarter
36978      will contain indexes for the second half of the op0 and the
36979      last quarter bit 7 set.  In the second permutation mask
36980      the first quarter will contain bit 7 set, the second quarter
36981      indexes for the first half of the op1, the third quarter bit 7 set
36982      and last quarter indexes for the second half of the op1.
36983      I.e. the first mask e.g. for V32QImode extract even will be:
36984      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36985      (all values masked with 0xf except for -128) and second mask
36986      for extract even will be
36987      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
36988   m128 = GEN_INT (-128);
36989   for (i = 0; i < nelt; ++i)
36990     {
36991       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36992       unsigned which = d->perm[i] >= nelt;
36993       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36994 
36995       for (j = 0; j < eltsz; ++j)
36996 	{
36997 	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36998 	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36999 	}
37000     }
37001 
37002   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37003   vperm = force_reg (V32QImode, vperm);
37004 
37005   l = gen_reg_rtx (V32QImode);
37006   op = gen_lowpart (V32QImode, d->op0);
37007   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37008 
37009   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37010   vperm = force_reg (V32QImode, vperm);
37011 
37012   h = gen_reg_rtx (V32QImode);
37013   op = gen_lowpart (V32QImode, d->op1);
37014   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37015 
37016   ior = gen_reg_rtx (V32QImode);
37017   emit_insn (gen_iorv32qi3 (ior, l, h));
37018 
37019   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
37020   op = gen_lowpart (V4DImode, d->target);
37021   ior = gen_lowpart (V4DImode, ior);
37022   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37023 				  const1_rtx, GEN_INT (3)));
37024 
37025   return true;
37026 }
37027 
37028 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
37029    and extract-odd permutations.  */
37030 
37031 static bool
37032 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37033 {
37034   rtx t1, t2, t3;
37035 
37036   switch (d->vmode)
37037     {
37038     case V4DFmode:
37039       t1 = gen_reg_rtx (V4DFmode);
37040       t2 = gen_reg_rtx (V4DFmode);
37041 
37042       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
37043       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37044       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
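      /* Immediate 0x20 selects the low 128-bit lane of each operand,
	 0x31 the high lane of each.  */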
37045 
37046       /* Now an unpck[lh]pd will produce the result required.  */
37047       if (odd)
37048 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37049       else
37050 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37051       emit_insn (t3);
37052       break;
37053 
37054     case V8SFmode:
37055       {
37056 	int mask = odd ? 0xdd : 0x88;
37057 
37058 	t1 = gen_reg_rtx (V8SFmode);
37059 	t2 = gen_reg_rtx (V8SFmode);
37060 	t3 = gen_reg_rtx (V8SFmode);
37061 
37062 	/* Shuffle within the 128-bit lanes to produce:
37063 	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
37064 	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37065 				      GEN_INT (mask)));
37066 
37067 	/* Shuffle the lanes around to produce:
37068 	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
37069 	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37070 					    GEN_INT (0x3)));
37071 
37072 	/* Shuffle within the 128-bit lanes to produce:
37073 	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
37074 	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37075 
37076 	/* Shuffle within the 128-bit lanes to produce:
37077 	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
37078 	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37079 
37080 	/* Shuffle the lanes around to produce:
37081 	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
37082 	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37083 					    GEN_INT (0x20)));
37084       }
37085       break;
37086 
37087     case V2DFmode:
37088     case V4SFmode:
37089     case V2DImode:
37090     case V4SImode:
37091       /* These are always directly implementable by expand_vec_perm_1.  */
37092       gcc_unreachable ();
37093 
37094     case V8HImode:
37095       if (TARGET_SSSE3)
37096 	return expand_vec_perm_pshufb2 (d);
37097       else
37098 	{
37099 	  /* We need 2*log2(N)-1 operations to achieve odd/even
37100 	     with interleave. */
37101 	  t1 = gen_reg_rtx (V8HImode);
37102 	  t2 = gen_reg_rtx (V8HImode);
37103 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37104 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37105 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37106 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37107 	  if (odd)
37108 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37109 	  else
37110 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37111 	  emit_insn (t3);
37112 	}
37113       break;
37114 
37115     case V16QImode:
37116       if (TARGET_SSSE3)
37117 	return expand_vec_perm_pshufb2 (d);
37118       else
37119 	{
37120 	  t1 = gen_reg_rtx (V16QImode);
37121 	  t2 = gen_reg_rtx (V16QImode);
37122 	  t3 = gen_reg_rtx (V16QImode);
37123 	  emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37124 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37125 	  emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37126 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37127 	  emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37128 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37129 	  if (odd)
37130 	    t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37131 	  else
37132 	    t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37133 	  emit_insn (t3);
37134 	}
37135       break;
37136 
37137     case V16HImode:
37138     case V32QImode:
37139       return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37140 
37141     case V4DImode:
37142       if (!TARGET_AVX2)
37143 	{
37144 	  struct expand_vec_perm_d d_copy = *d;
37145 	  d_copy.vmode = V4DFmode;
37146 	  d_copy.target = gen_lowpart (V4DFmode, d->target);
37147 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37148 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37149 	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
37150 	}
37151 
37152       t1 = gen_reg_rtx (V4DImode);
37153       t2 = gen_reg_rtx (V4DImode);
37154 
37155       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
37156       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37157       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37158 
37159       /* Now a vpunpck[lh]qdq will produce the result required.  */
37160       if (odd)
37161 	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37162       else
37163 	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37164       emit_insn (t3);
37165       break;
37166 
37167     case V8SImode:
37168       if (!TARGET_AVX2)
37169 	{
37170 	  struct expand_vec_perm_d d_copy = *d;
37171 	  d_copy.vmode = V8SFmode;
37172 	  d_copy.target = gen_lowpart (V8SFmode, d->target);
37173 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37174 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37175 	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
37176 	}
37177 
37178       t1 = gen_reg_rtx (V8SImode);
37179       t2 = gen_reg_rtx (V8SImode);
37180 
37181       /* Shuffle the lanes around into
37182 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
37183       emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37184 				    gen_lowpart (V4DImode, d->op0),
37185 				    gen_lowpart (V4DImode, d->op1),
37186 				    GEN_INT (0x20)));
37187       emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37188 				    gen_lowpart (V4DImode, d->op0),
37189 				    gen_lowpart (V4DImode, d->op1),
37190 				    GEN_INT (0x31)));
37191 
37192       /* Swap the 2nd and 3rd position in each lane into
37193 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
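      /* The pshufd immediate 2*4 + 1*16 + 3*64 == 0xd8 selects
	 { 0 2 1 3 } within each 128-bit lane.  */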
37194       emit_insn (gen_avx2_pshufdv3 (t1, t1,
37195 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37196       emit_insn (gen_avx2_pshufdv3 (t2, t2,
37197 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37198 
37199       /* Now a vpunpck[lh]qdq will produce
37200 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
37201       if (odd)
37202 	t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37203 					   gen_lowpart (V4DImode, t1),
37204 					   gen_lowpart (V4DImode, t2));
37205       else
37206 	t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37207 					  gen_lowpart (V4DImode, t1),
37208 					  gen_lowpart (V4DImode, t2));
37209       emit_insn (t3);
37210       break;
37211 
37212     default:
37213       gcc_unreachable ();
37214     }
37215 
37216   return true;
37217 }
37218 
37219 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
37220    extract-even and extract-odd permutations.  */
37221 
37222 static bool
37223 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37224 {
37225   unsigned i, odd, nelt = d->nelt;
37226 
37227   odd = d->perm[0];
37228   if (odd != 0 && odd != 1)
37229     return false;
37230 
37231   for (i = 1; i < nelt; ++i)
37232     if (d->perm[i] != 2 * i + odd)
37233       return false;
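  /* I.e. accept only the { 0 2 4 ... } (extract even) and
     { 1 3 5 ... } (extract odd) masks.  */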
37234 
37235   return expand_vec_perm_even_odd_1 (d, odd);
37236 }
37237 
37238 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
37239    permutations.  We assume that expand_vec_perm_1 has already failed.  */
37240 
37241 static bool
37242 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37243 {
37244   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37245   enum machine_mode vmode = d->vmode;
37246   unsigned char perm2[4];
37247   rtx op0 = d->op0;
37248   bool ok;
37249 
37250   switch (vmode)
37251     {
37252     case V4DFmode:
37253     case V8SFmode:
37254       /* These are special-cased in sse.md so that we can optionally
37255 	 use the vbroadcast instruction.  They expand to two insns
37256 	 if the input happens to be in a register.  */
37257       gcc_unreachable ();
37258 
37259     case V2DFmode:
37260     case V2DImode:
37261     case V4SFmode:
37262     case V4SImode:
37263       /* These are always implementable using standard shuffle patterns.  */
37264       gcc_unreachable ();
37265 
37266     case V8HImode:
37267     case V16QImode:
37268       /* These can be implemented via interleave.  We save one insn by
37269 	 stopping once we have promoted to V4SImode and then use pshufd.  */
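      /* E.g. broadcasting element 5 of a V8HImode vector: one
	 interleave-high gives { 4 4 5 5 6 6 7 7 }, whose V4SImode
	 element 1 holds two copies of element 5, and the final pshufd
	 replicates that 32-bit element.  */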
37270       do
37271 	{
37272 	  rtx dest;
37273 	  rtx (*gen) (rtx, rtx, rtx)
37274 	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37275 				 : gen_vec_interleave_lowv8hi;
37276 
37277 	  if (elt >= nelt2)
37278 	    {
37279 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37280 				       : gen_vec_interleave_highv8hi;
37281 	      elt -= nelt2;
37282 	    }
37283 	  nelt2 /= 2;
37284 
37285 	  dest = gen_reg_rtx (vmode);
37286 	  emit_insn (gen (dest, op0, op0));
37287 	  vmode = get_mode_wider_vector (vmode);
37288 	  op0 = gen_lowpart (vmode, dest);
37289 	}
37290       while (vmode != V4SImode);
37291 
37292       memset (perm2, elt, 4);
37293       ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
37294       gcc_assert (ok);
37295       return true;
37296 
37297     case V32QImode:
37298     case V16HImode:
37299     case V8SImode:
37300     case V4DImode:
37301       /* For AVX2, broadcasts of the first element should have been
37302 	 handled by expand_vec_perm_1 via vpbroadcast* or vpermq.  */
37303       gcc_assert (!TARGET_AVX2 || d->perm[0]);
37304       return false;
37305 
37306     default:
37307       gcc_unreachable ();
37308     }
37309 }
37310 
37311 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
37312    broadcast permutations.  */
37313 
37314 static bool
37315 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37316 {
37317   unsigned i, elt, nelt = d->nelt;
37318 
37319   if (d->op0 != d->op1)
37320     return false;
37321 
37322   elt = d->perm[0];
37323   for (i = 1; i < nelt; ++i)
37324     if (d->perm[i] != elt)
37325       return false;
37326 
37327   return expand_vec_perm_broadcast_1 (d);
37328 }
37329 
37330 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
37331    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
37332    all the shorter instruction sequences.  */
37333 
37334 static bool
37335 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37336 {
37337   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37338   unsigned int i, nelt, eltsz;
37339   bool used[4];
37340 
37341   if (!TARGET_AVX2
37342       || d->op0 == d->op1
37343       || (d->vmode != V32QImode && d->vmode != V16HImode))
37344     return false;
37345 
37346   if (d->testing_p)
37347     return true;
37348 
37349   nelt = d->nelt;
37350   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37351 
37352   /* Generate 4 permutation masks.  If the required element is within
37353      the same lane, it is shuffled in.  If the required element is from
37354      the other lane, force a zero by setting bit 7 in the permutation
37355      mask.  The other masks have non-negative elements where an element
37356      is requested from the other lane, but those elements are also moved
37357      to the other lane, so that the result of vpshufb can have its two
37358      V2TImode halves swapped.  */
37359   m128 = GEN_INT (-128);
37360   for (i = 0; i < 32; ++i)
37361     {
37362       rperm[0][i] = m128;
37363       rperm[1][i] = m128;
37364       rperm[2][i] = m128;
37365       rperm[3][i] = m128;
37366     }
37367   used[0] = false;
37368   used[1] = false;
37369   used[2] = false;
37370   used[3] = false;
37371   for (i = 0; i < nelt; ++i)
37372     {
37373       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37374       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37375       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37376 
37377       for (j = 0; j < eltsz; ++j)
37378 	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37379       used[which] = true;
37380     }
37381 
37382   for (i = 0; i < 2; ++i)
37383     {
37384       if (!used[2 * i + 1])
37385 	{
37386 	  h[i] = NULL_RTX;
37387 	  continue;
37388 	}
37389       vperm = gen_rtx_CONST_VECTOR (V32QImode,
37390 				    gen_rtvec_v (32, rperm[2 * i + 1]));
37391       vperm = force_reg (V32QImode, vperm);
37392       h[i] = gen_reg_rtx (V32QImode);
37393       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37394       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37395     }
37396 
37397   /* Swap the 128-bit lanes of h[X].  */
37398   for (i = 0; i < 2; ++i)
37399    {
37400      if (h[i] == NULL_RTX)
37401        continue;
37402      op = gen_reg_rtx (V4DImode);
37403      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37404 				     const2_rtx, GEN_INT (3), const0_rtx,
37405 				     const1_rtx));
37406      h[i] = gen_lowpart (V32QImode, op);
37407    }
37408 
37409   for (i = 0; i < 2; ++i)
37410     {
37411       if (!used[2 * i])
37412 	{
37413 	  l[i] = NULL_RTX;
37414 	  continue;
37415 	}
37416       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37417       vperm = force_reg (V32QImode, vperm);
37418       l[i] = gen_reg_rtx (V32QImode);
37419       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37420       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37421     }
37422 
37423   for (i = 0; i < 2; ++i)
37424     {
37425       if (h[i] && l[i])
37426 	{
37427 	  op = gen_reg_rtx (V32QImode);
37428 	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37429 	  l[i] = op;
37430 	}
37431       else if (h[i])
37432 	l[i] = h[i];
37433     }
37434 
37435   gcc_assert (l[0] && l[1]);
37436   op = gen_lowpart (V32QImode, d->target);
37437   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37438   return true;
37439 }
37440 
37441 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37442    With all of the interface bits taken care of, perform the expansion
37443    in D and return true on success.  */
37444 
37445 static bool
37446 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37447 {
37448   /* Try a single instruction expansion.  */
37449   if (expand_vec_perm_1 (d))
37450     return true;
37451 
37452   /* Try sequences of two instructions.  */
37453 
37454   if (expand_vec_perm_pshuflw_pshufhw (d))
37455     return true;
37456 
37457   if (expand_vec_perm_palignr (d))
37458     return true;
37459 
37460   if (expand_vec_perm_interleave2 (d))
37461     return true;
37462 
37463   if (expand_vec_perm_broadcast (d))
37464     return true;
37465 
37466   if (expand_vec_perm_vpermq_perm_1 (d))
37467     return true;
37468 
37469   /* Try sequences of three instructions.  */
37470 
37471   if (expand_vec_perm_pshufb2 (d))
37472     return true;
37473 
37474   if (expand_vec_perm_interleave3 (d))
37475     return true;
37476 
37477   /* Try sequences of four instructions.  */
37478 
37479   if (expand_vec_perm_vpshufb2_vpermq (d))
37480     return true;
37481 
37482   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37483     return true;
37484 
37485   /* ??? Look for narrow permutations whose element orderings would
37486      allow the promotion to a wider mode.  */
37487 
37488   /* ??? Look for sequences of interleave or a wider permute that place
37489      the data into the correct lanes for a half-vector shuffle like
37490      pshuf[lh]w or vpermilps.  */
37491 
37492   /* ??? Look for sequences of interleave that produce the desired results.
37493      The combinatorics of punpck[lh] get pretty ugly... */
37494 
37495   if (expand_vec_perm_even_odd (d))
37496     return true;
37497 
37498   /* Even longer sequences.  */
37499   if (expand_vec_perm_vpshufb4_vpermq2 (d))
37500     return true;
37501 
37502   return false;
37503 }
37504 
37505 bool
37506 ix86_expand_vec_perm_const (rtx operands[4])
37507 {
37508   struct expand_vec_perm_d d;
37509   unsigned char perm[MAX_VECT_LEN];
37510   int i, nelt, which;
37511   rtx sel;
37512 
37513   d.target = operands[0];
37514   d.op0 = operands[1];
37515   d.op1 = operands[2];
37516   sel = operands[3];
37517 
37518   d.vmode = GET_MODE (d.target);
37519   gcc_assert (VECTOR_MODE_P (d.vmode));
37520   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37521   d.testing_p = false;
37522 
37523   gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37524   gcc_assert (XVECLEN (sel, 0) == nelt);
37525   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37526 
37527   for (i = which = 0; i < nelt; ++i)
37528     {
37529       rtx e = XVECEXP (sel, 0, i);
37530       int ei = INTVAL (e) & (2 * nelt - 1);
37531 
37532       which |= (ei < nelt ? 1 : 2);
37533       d.perm[i] = ei;
37534       perm[i] = ei;
37535     }
37536 
37537   switch (which)
37538     {
37539     default:
37540       gcc_unreachable();
37541 
37542     case 3:
37543       if (!rtx_equal_p (d.op0, d.op1))
37544 	break;
37545 
37546       /* The elements of PERM do not suggest that only the first operand
37547 	 is used, but both operands are identical.  Allow easier matching
37548 	 of the permutation by folding the permutation into the single
37549 	 input vector.  */
37550       for (i = 0; i < nelt; ++i)
37551 	if (d.perm[i] >= nelt)
37552 	  d.perm[i] -= nelt;
37553       /* FALLTHRU */
37554 
37555     case 1:
37556       d.op1 = d.op0;
37557       break;
37558 
37559     case 2:
37560       for (i = 0; i < nelt; ++i)
37561         d.perm[i] -= nelt;
37562       d.op0 = d.op1;
37563       break;
37564     }
37565 
37566   if (ix86_expand_vec_perm_const_1 (&d))
37567     return true;
37568 
37569   /* If the mask says both arguments are needed, but they are the same,
37570      the above tried to expand with d.op0 == d.op1.  If that didn't work,
37571      retry with d.op0 != d.op1 as that is what testing has been done with.  */
37572   if (which == 3 && d.op0 == d.op1)
37573     {
37574       rtx seq;
37575       bool ok;
37576 
37577       memcpy (d.perm, perm, sizeof (perm));
37578       d.op1 = gen_reg_rtx (d.vmode);
37579       start_sequence ();
37580       ok = ix86_expand_vec_perm_const_1 (&d);
37581       seq = get_insns ();
37582       end_sequence ();
37583       if (ok)
37584 	{
37585 	  emit_move_insn (d.op1, d.op0);
37586 	  emit_insn (seq);
37587 	  return true;
37588 	}
37589     }
37590 
37591   return false;
37592 }
37593 
37594 /* Implement targetm.vectorize.vec_perm_const_ok.  */
37595 
37596 static bool
37597 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37598 				  const unsigned char *sel)
37599 {
37600   struct expand_vec_perm_d d;
37601   unsigned int i, nelt, which;
37602   bool ret, one_vec;
37603 
37604   d.vmode = vmode;
37605   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37606   d.testing_p = true;
37607 
37608   /* Given sufficient ISA support we can just return true here
37609      for selected vector modes.  */
37610   if (GET_MODE_SIZE (d.vmode) == 16)
37611     {
37612       /* All implementable with a single vpperm insn.  */
37613       if (TARGET_XOP)
37614 	return true;
37615       /* All implementable with 2 pshufb + 1 ior.  */
37616       if (TARGET_SSSE3)
37617 	return true;
37618       /* All implementable with shufpd or unpck[lh]pd.  */
37619       if (d.nelt == 2)
37620 	return true;
37621     }
37622 
37623   /* Extract the values from the vector CST into the permutation
37624      array in D.  */
37625   memcpy (d.perm, sel, nelt);
37626   for (i = which = 0; i < nelt; ++i)
37627     {
37628       unsigned char e = d.perm[i];
37629       gcc_assert (e < 2 * nelt);
37630       which |= (e < nelt ? 1 : 2);
37631     }
37632 
37633   /* For all elements from second vector, fold the elements to first.  */
37634   if (which == 2)
37635     for (i = 0; i < nelt; ++i)
37636       d.perm[i] -= nelt;
37637 
37638   /* Check whether the mask can be applied to the vector type.  */
37639   one_vec = (which != 3);
37640 
37641   /* Implementable with shufps or pshufd.  */
37642   if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37643     return true;
37644 
37645   /* Otherwise we have to go through the motions and see if we can
37646      figure out how to generate the requested permutation.  */
37647   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37648   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37649   if (!one_vec)
37650     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37651 
37652   start_sequence ();
37653   ret = ix86_expand_vec_perm_const_1 (&d);
37654   end_sequence ();
37655 
37656   return ret;
37657 }
37658 
37659 void
37660 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37661 {
37662   struct expand_vec_perm_d d;
37663   unsigned i, nelt;
37664 
37665   d.target = targ;
37666   d.op0 = op0;
37667   d.op1 = op1;
37668   d.vmode = GET_MODE (targ);
37669   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37670   d.testing_p = false;
37671 
37672   for (i = 0; i < nelt; ++i)
37673     d.perm[i] = i * 2 + odd;
37674 
37675   /* We'll either be able to implement the permutation directly...  */
37676   if (expand_vec_perm_1 (&d))
37677     return;
37678 
37679   /* ... or we use the special-case patterns.  */
37680   expand_vec_perm_even_odd_1 (&d, odd);
37681 }
37682 
37683 /* Expand an insert into a vector register through pinsr insn.
37684    Return true if successful.  */
37685 
37686 bool
37687 ix86_expand_pinsr (rtx *operands)
37688 {
37689   rtx dst = operands[0];
37690   rtx src = operands[3];
37691 
37692   unsigned int size = INTVAL (operands[1]);
37693   unsigned int pos = INTVAL (operands[2]);
37694 
37695   if (GET_CODE (dst) == SUBREG)
37696     {
37697       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37698       dst = SUBREG_REG (dst);
37699     }
37700 
37701   if (GET_CODE (src) == SUBREG)
37702     src = SUBREG_REG (src);
37703 
37704   switch (GET_MODE (dst))
37705     {
37706     case V16QImode:
37707     case V8HImode:
37708     case V4SImode:
37709     case V2DImode:
37710       {
37711 	enum machine_mode srcmode, dstmode;
37712 	rtx (*pinsr)(rtx, rtx, rtx, rtx);
37713 
37714 	srcmode = mode_for_size (size, MODE_INT, 0);
37715 
37716 	switch (srcmode)
37717 	  {
37718 	  case QImode:
37719 	    if (!TARGET_SSE4_1)
37720 	      return false;
37721 	    dstmode = V16QImode;
37722 	    pinsr = gen_sse4_1_pinsrb;
37723 	    break;
37724 
37725 	  case HImode:
37726 	    if (!TARGET_SSE2)
37727 	      return false;
37728 	    dstmode = V8HImode;
37729 	    pinsr = gen_sse2_pinsrw;
37730 	    break;
37731 
37732 	  case SImode:
37733 	    if (!TARGET_SSE4_1)
37734 	      return false;
37735 	    dstmode = V4SImode;
37736 	    pinsr = gen_sse4_1_pinsrd;
37737 	    break;
37738 
37739 	  case DImode:
37740 	    gcc_assert (TARGET_64BIT);
37741 	    if (!TARGET_SSE4_1)
37742 	      return false;
37743 	    dstmode = V2DImode;
37744 	    pinsr = gen_sse4_1_pinsrq;
37745 	    break;
37746 
37747 	  default:
37748 	    return false;
37749 	  }
37750 
37751 	dst = gen_lowpart (dstmode, dst);
37752 	src = gen_lowpart (srcmode, src);
37753 
37754 	pos /= size;
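	/* The pinsr patterns encode the element position as a one-hot
	   vec_merge mask, hence the (1 << pos) immediate below.  */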
37755 
37756 	emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37757 	return true;
37758       }
37759 
37760     default:
37761       return false;
37762     }
37763 }
37764 
37765 /* This function returns the calling-ABI-specific va_list type node.
37766    It returns the FNDECL-specific va_list type.  */
37767 
37768 static tree
37769 ix86_fn_abi_va_list (tree fndecl)
37770 {
37771   if (!TARGET_64BIT)
37772     return va_list_type_node;
37773   gcc_assert (fndecl != NULL_TREE);
37774 
37775   if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37776     return ms_va_list_type_node;
37777   else
37778     return sysv_va_list_type_node;
37779 }
37780 
37781 /* Returns the canonical va_list type specified by TYPE.  If there
37782    is no valid TYPE provided, it returns NULL_TREE.  */
37783 
37784 static tree
37785 ix86_canonical_va_list_type (tree type)
37786 {
37787   tree wtype, htype;
37788 
37789   /* Resolve references and pointers to va_list type.  */
37790   if (TREE_CODE (type) == MEM_REF)
37791     type = TREE_TYPE (type);
37792   else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37793     type = TREE_TYPE (type);
37794   else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37795     type = TREE_TYPE (type);
37796 
37797   if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37798     {
37799       wtype = va_list_type_node;
37800       gcc_assert (wtype != NULL_TREE);
37801       htype = type;
37802       if (TREE_CODE (wtype) == ARRAY_TYPE)
37803 	{
37804 	  /* If va_list is an array type, the argument may have decayed
37805 	     to a pointer type, e.g. by being passed to another function.
37806 	     In that case, unwrap both types so that we can compare the
37807 	     underlying records.  */
37808 	  if (TREE_CODE (htype) == ARRAY_TYPE
37809 	      || POINTER_TYPE_P (htype))
37810 	    {
37811 	      wtype = TREE_TYPE (wtype);
37812 	      htype = TREE_TYPE (htype);
37813 	    }
37814 	}
37815       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37816 	return va_list_type_node;
37817       wtype = sysv_va_list_type_node;
37818       gcc_assert (wtype != NULL_TREE);
37819       htype = type;
37820       if (TREE_CODE (wtype) == ARRAY_TYPE)
37821 	{
37822 	  /* If va_list is an array type, the argument may have decayed
37823 	     to a pointer type, e.g. by being passed to another function.
37824 	     In that case, unwrap both types so that we can compare the
37825 	     underlying records.  */
37826 	  if (TREE_CODE (htype) == ARRAY_TYPE
37827 	      || POINTER_TYPE_P (htype))
37828 	    {
37829 	      wtype = TREE_TYPE (wtype);
37830 	      htype = TREE_TYPE (htype);
37831 	    }
37832 	}
37833       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37834 	return sysv_va_list_type_node;
37835       wtype = ms_va_list_type_node;
37836       gcc_assert (wtype != NULL_TREE);
37837       htype = type;
37838       if (TREE_CODE (wtype) == ARRAY_TYPE)
37839 	{
37840 	  /* If va_list is an array type, the argument may have decayed
37841 	     to a pointer type, e.g. by being passed to another function.
37842 	     In that case, unwrap both types so that we can compare the
37843 	     underlying records.  */
37844 	  if (TREE_CODE (htype) == ARRAY_TYPE
37845 	      || POINTER_TYPE_P (htype))
37846 	    {
37847 	      wtype = TREE_TYPE (wtype);
37848 	      htype = TREE_TYPE (htype);
37849 	    }
37850 	}
37851       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37852 	return ms_va_list_type_node;
37853       return NULL_TREE;
37854     }
37855   return std_canonical_va_list_type (type);
37856 }
37857 
37858 /* Iterate through the target-specific builtin types for va_list.
37859    IDX denotes the iterator, *PTREE is set to the result type of
37860    the va_list builtin, and *PNAME to its internal type.
37861    Returns zero if there is no element for this index, otherwise
37862    IDX should be increased upon the next call.
37863    Note, do not iterate a base builtin's name like __builtin_va_list.
37864    Used from c_common_nodes_and_builtins.  */
37865 
37866 static int
37867 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37868 {
37869   if (TARGET_64BIT)
37870     {
37871       switch (idx)
37872 	{
37873 	default:
37874 	  break;
37875 
37876 	case 0:
37877 	  *ptree = ms_va_list_type_node;
37878 	  *pname = "__builtin_ms_va_list";
37879 	  return 1;
37880 
37881 	case 1:
37882 	  *ptree = sysv_va_list_type_node;
37883 	  *pname = "__builtin_sysv_va_list";
37884 	  return 1;
37885 	}
37886     }
37887 
37888   return 0;
37889 }
37890 
37891 #undef TARGET_SCHED_DISPATCH
37892 #define TARGET_SCHED_DISPATCH has_dispatch
37893 #undef TARGET_SCHED_DISPATCH_DO
37894 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37895 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37896 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37897 
37898 /* The size of the dispatch window is the total number of bytes of
37899    object code allowed in a window.  */
37900 #define DISPATCH_WINDOW_SIZE 16
37901 
37902 /* Number of dispatch windows considered for scheduling.  */
37903 #define MAX_DISPATCH_WINDOWS 3
37904 
37905 /* Maximum number of instructions in a window.  */
37906 #define MAX_INSN 4
37907 
37908 /* Maximum number of immediate operands in a window.  */
37909 #define MAX_IMM 4
37910 
37911 /* Maximum number of immediate bits allowed in a window.  */
37912 #define MAX_IMM_SIZE 128
37913 
37914 /* Maximum number of 32 bit immediates allowed in a window.  */
37915 #define MAX_IMM_32 4
37916 
37917 /* Maximum number of 64 bit immediates allowed in a window.  */
37918 #define MAX_IMM_64 2
37919 
37920 /* Maximum total of loads or prefetches allowed in a window.  */
37921 #define MAX_LOAD 2
37922 
37923 /* Maximum total of stores allowed in a window.  */
37924 #define MAX_STORE 1
37925 
37926 #undef BIG
37927 #define BIG 100
37928 
37929 
37930 /* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
37931 enum dispatch_group {
37932   disp_no_group = 0,
37933   disp_load,
37934   disp_store,
37935   disp_load_store,
37936   disp_prefetch,
37937   disp_imm,
37938   disp_imm_32,
37939   disp_imm_64,
37940   disp_branch,
37941   disp_cmp,
37942   disp_jcc,
37943   disp_last
37944 };
37945 
37946 /* Number of allowable groups in a dispatch window.  It is an array
37947    indexed by the dispatch_group enum.  100 is used as a big number,
37948    because the number of these kinds of operations does not have any
37949    effect on the dispatch window, but we need entries for them in the
37950    table for other reasons.  */
37951 static unsigned int num_allowable_groups[disp_last] = {
37952   0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37953 };
37954 
37955 char group_name[disp_last + 1][16] = {
37956   "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37957   "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37958   "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37959 };
37960 
37961 /* Instruction path.  */
37962 enum insn_path {
37963   no_path = 0,
37964   path_single, /* Single micro op.  */
37965   path_double, /* Double micro op.  */
37966   path_multi,  /* Instructions with more than 2 micro ops.  */
37967   last_path
37968 };
37969 
37970 /* sched_insn_info defines a window to the instructions scheduled in
37971    the basic block.  It contains a pointer to the insn_info table and
37972    the instruction scheduled.
37973 
37974    Windows are allocated for each basic block and are linked
37975    together.  */
37976 typedef struct sched_insn_info_s {
37977   rtx insn;
37978   enum dispatch_group group;
37979   enum insn_path path;
37980   int byte_len;
37981   int imm_bytes;
37982 } sched_insn_info;
37983 
37984 /* Linked list of dispatch windows.  This is a two way list of
37985    dispatch windows of a basic block.  It contains information about
37986    the number of uops in the window and the total number of
37987    instructions and of bytes in the object code for this dispatch
37988    window.  */
37989 typedef struct dispatch_windows_s {
37990   int num_insn;            /* Number of insns in the window.  */
37991   int num_uops;            /* Number of uops in the window.  */
37992   int window_size;         /* Number of bytes in the window.  */
37993   int window_num;          /* Window number, 0 or 1.  */
37994   int num_imm;             /* Number of immediates in the window.  */
37995   int num_imm_32;          /* Number of 32 bit immediates in the window.  */
37996   int num_imm_64;          /* Number of 64 bit immediates in the window.  */
37997   int imm_size;            /* Total immediates in the window.  */
37998   int num_loads;           /* Total memory loads in the window.  */
37999   int num_stores;          /* Total memory stores in the window.  */
38000   int violation;           /* Violation exists in window.  */
38001   sched_insn_info *window; /* Pointer to the window.  */
38002   struct dispatch_windows_s *next;
38003   struct dispatch_windows_s *prev;
38004 } dispatch_windows;
38005 
38006 /* Immediate values used in an insn.  */
38007 typedef struct imm_info_s
38008   {
38009     int imm;
38010     int imm32;
38011     int imm64;
38012   } imm_info;
38013 
38014 static dispatch_windows *dispatch_window_list;
38015 static dispatch_windows *dispatch_window_list1;
38016 
38017 /* Get dispatch group of insn.  */
38018 
38019 static enum dispatch_group
38020 get_mem_group (rtx insn)
38021 {
38022   enum attr_memory memory;
38023 
38024   if (INSN_CODE (insn) < 0)
38025     return disp_no_group;
38026   memory = get_attr_memory (insn);
38027   if (memory == MEMORY_STORE)
38028     return disp_store;
38029 
38030   if (memory == MEMORY_LOAD)
38031     return disp_load;
38032 
38033   if (memory == MEMORY_BOTH)
38034     return disp_load_store;
38035 
38036   return disp_no_group;
38037 }
38038 
38039 /* Return true if insn is a compare instruction.  */
38040 
38041 static bool
38042 is_cmp (rtx insn)
38043 {
38044   enum attr_type type;
38045 
38046   type = get_attr_type (insn);
38047   return (type == TYPE_TEST
38048 	  || type == TYPE_ICMP
38049 	  || type == TYPE_FCMP
38050 	  || GET_CODE (PATTERN (insn)) == COMPARE);
38051 }
38052 
38053 /* Return true if a dispatch violation was encountered.  */
38054 
38055 static bool
38056 dispatch_violation (void)
38057 {
38058   if (dispatch_window_list->next)
38059     return dispatch_window_list->next->violation;
38060   return dispatch_window_list->violation;
38061 }
38062 
38063 /* Return true if insn is a branch instruction.  */
38064 
38065 static bool
38066 is_branch (rtx insn)
38067 {
38068   return (CALL_P (insn) || JUMP_P (insn));
38069 }
38070 
38071 /* Return true if insn is a prefetch instruction.  */
38072 
38073 static bool
38074 is_prefetch (rtx insn)
38075 {
38076   return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
38077 }
38078 
38079 /* This function initializes a dispatch window and the list container holding a
38080    pointer to the window.  */
38081 
38082 static void
38083 init_window (int window_num)
38084 {
38085   int i;
38086   dispatch_windows *new_list;
38087 
38088   if (window_num == 0)
38089     new_list = dispatch_window_list;
38090   else
38091     new_list = dispatch_window_list1;
38092 
38093   new_list->num_insn = 0;
38094   new_list->num_uops = 0;
38095   new_list->window_size = 0;
38096   new_list->next = NULL;
38097   new_list->prev = NULL;
38098   new_list->window_num = window_num;
38099   new_list->num_imm = 0;
38100   new_list->num_imm_32 = 0;
38101   new_list->num_imm_64 = 0;
38102   new_list->imm_size = 0;
38103   new_list->num_loads = 0;
38104   new_list->num_stores = 0;
38105   new_list->violation = false;
38106 
38107   for (i = 0; i < MAX_INSN; i++)
38108     {
38109       new_list->window[i].insn = NULL;
38110       new_list->window[i].group = disp_no_group;
38111       new_list->window[i].path = no_path;
38112       new_list->window[i].byte_len = 0;
38113       new_list->window[i].imm_bytes = 0;
38114     }
38115   return;
38116 }
38117 
38118 /* This function allocates and initializes a dispatch window and the
38119    list container holding a pointer to the window.  */
38120 
38121 static dispatch_windows *
38122 allocate_window (void)
38123 {
38124   dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38125   new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38126 
38127   return new_list;
38128 }
38129 
38130 /* This routine initializes the dispatch scheduling information.  It
38131    initiates building dispatch scheduler tables and constructs the
38132    first dispatch window.  */
38133 
38134 static void
38135 init_dispatch_sched (void)
38136 {
38137   /* Allocate a dispatch list and a window.  */
38138   dispatch_window_list = allocate_window ();
38139   dispatch_window_list1 = allocate_window ();
38140   init_window (0);
38141   init_window (1);
38142 }
38143 
38144 /* This function returns true if a branch is detected.  The end of a basic
38145    block does not have to be a branch, but here we assume only branches
38146    end a window.  */
38147 
38148 static bool
38149 is_end_basic_block (enum dispatch_group group)
38150 {
38151   return group == disp_branch;
38152 }
38153 
38154 /* This function is called when the end of a window processing is reached.  */
38155 
38156 static void
38157 process_end_window (void)
38158 {
38159   gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38160   if (dispatch_window_list->next)
38161     {
38162       gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38163       gcc_assert (dispatch_window_list->window_size
38164 		  + dispatch_window_list1->window_size <= 48);
38165       init_window (1);
38166     }
38167   init_window (0);
38168 }
38169 
38170 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
38171    WINDOW_NUM is either 0 or 1.  A maximum of two windows are generated
38172    for 48 bytes of instructions.  Note that these windows are not dispatch
38173    windows whose sizes are DISPATCH_WINDOW_SIZE.  */
38174 
38175 static dispatch_windows *
38176 allocate_next_window (int window_num)
38177 {
38178   if (window_num == 0)
38179     {
38180       if (dispatch_window_list->next)
38181 	  init_window (1);
38182       init_window (0);
38183       return dispatch_window_list;
38184     }
38185 
38186   dispatch_window_list->next = dispatch_window_list1;
38187   dispatch_window_list1->prev = dispatch_window_list;
38188 
38189   return dispatch_window_list1;
38190 }
38191 
38192 /* Increment the number of immediate operands of an instruction.  */
38193 
38194 static int
38195 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38196 {
38197   if (*in_rtx == 0)
38198     return 0;
38199 
38200     switch (GET_CODE (*in_rtx))
38201     {
38202     case CONST:
38203     case SYMBOL_REF:
38204     case CONST_INT:
38205       (imm_values->imm)++;
38206       if (x86_64_immediate_operand (*in_rtx, SImode))
38207 	(imm_values->imm32)++;
38208       else
38209 	(imm_values->imm64)++;
38210       break;
38211 
38212     case CONST_DOUBLE:
38213       (imm_values->imm)++;
38214       (imm_values->imm64)++;
38215       break;
38216 
38217     case CODE_LABEL:
38218       if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38219 	{
38220 	  (imm_values->imm)++;
38221 	  (imm_values->imm32)++;
38222 	}
38223       break;
38224 
38225     default:
38226       break;
38227     }
38228 
38229   return 0;
38230 }
38231 
38232 /* Compute number of immediate operands of an instruction.  */
38233 
38234 static void
38235 find_constant (rtx in_rtx, imm_info *imm_values)
38236 {
38237   for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38238 		(rtx_function) find_constant_1, (void *) imm_values);
38239 }
38240 
38241 /* Return the total size of the immediate operands of an instruction along
38242    with the number of each kind of immediate.  The counters are zeroed
38243    before FIND_CONSTANT is called.
38244    INSN is the input instruction.  IMM is the total number of immediates.
38245    IMM32 is the number of 32 bit immediates.  IMM64 is the number of 64
38246    bit immediates.  */
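/* For example, given the size computation below, an insn carrying one 32 bit
   and one 64 bit immediate yields *IMM = 2, *IMM32 = 1, *IMM64 = 1 and a
   return value of 1*4 + 1*8 = 12 bytes.  */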
38247 
38248 static int
38249 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38250 {
38251   imm_info imm_values = {0, 0, 0};
38252 
38253   find_constant (insn, &imm_values);
38254   *imm = imm_values.imm;
38255   *imm32 = imm_values.imm32;
38256   *imm64 = imm_values.imm64;
38257   return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38258 }
38259 
38260 /* This function indicates whether an instruction has any immediate
38261    operands.  */
38262 
38263 static bool
38264 has_immediate (rtx insn)
38265 {
38266   int num_imm_operand;
38267   int num_imm32_operand;
38268   int num_imm64_operand;
38269 
38270   if (insn)
38271     return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38272 			       &num_imm64_operand);
38273   return false;
38274 }
38275 
38276 /* Return the decode path (single, double or multi) of INSN.  */
38277 
38278 static enum insn_path
38279 get_insn_path (rtx insn)
38280 {
38281   enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38282 
38283   if ((int)path == 0)
38284     return path_single;
38285 
38286   if ((int)path == 1)
38287     return path_double;
38288 
38289   return path_multi;
38290 }
38291 
38292 /* Return insn dispatch group.  */
38293 
38294 static enum dispatch_group
38295 get_insn_group (rtx insn)
38296 {
38297   enum dispatch_group group = get_mem_group (insn);
38298   if (group)
38299     return group;
38300 
38301   if (is_branch (insn))
38302     return disp_branch;
38303 
38304   if (is_cmp (insn))
38305     return disp_cmp;
38306 
38307   if (has_immediate (insn))
38308     return disp_imm;
38309 
38310   if (is_prefetch (insn))
38311     return disp_prefetch;
38312 
38313   return disp_no_group;
38314 }
38315 
38316 /* Count number of GROUP restricted instructions in a dispatch
38317    window WINDOW_LIST.  */
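/* The returned value is compared against num_allowable_groups by the caller:
   it is 0 for an instruction in no group, 1 when INSN fits the per-window
   limits of its group, and BIG when adding INSN would exceed a limit
   (immediate count or size, or the number of loads and stores).  */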
38318 
38319 static int
38320 count_num_restricted (rtx insn, dispatch_windows *window_list)
38321 {
38322   enum dispatch_group group = get_insn_group (insn);
38323   int imm_size;
38324   int num_imm_operand;
38325   int num_imm32_operand;
38326   int num_imm64_operand;
38327 
38328   if (group == disp_no_group)
38329     return 0;
38330 
38331   if (group == disp_imm)
38332     {
38333       imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38334 			      &num_imm64_operand);
38335       if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38336 	  || num_imm_operand + window_list->num_imm > MAX_IMM
38337 	  || (num_imm32_operand > 0
38338 	      && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38339 		  || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38340 	  || (num_imm64_operand > 0
38341 	      && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38342 		  || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38343 	  || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38344 	      && num_imm64_operand > 0
38345 	      && ((window_list->num_imm_64 > 0
38346 		   && window_list->num_insn >= 2)
38347 		  || window_list->num_insn >= 3)))
38348 	return BIG;
38349 
38350       return 1;
38351     }
38352 
38353   if ((group == disp_load_store
38354        && (window_list->num_loads >= MAX_LOAD
38355 	   || window_list->num_stores >= MAX_STORE))
38356       || ((group == disp_load
38357 	   || group == disp_prefetch)
38358 	  && window_list->num_loads >= MAX_LOAD)
38359       || (group == disp_store
38360 	  && window_list->num_stores >= MAX_STORE))
38361     return BIG;
38362 
38363   return 1;
38364 }
38365 
38366 /* This function returns true if insn satisfies dispatch rules on the
38367    last window scheduled.  */
38368 
38369 static bool
38370 fits_dispatch_window (rtx insn)
38371 {
38372   dispatch_windows *window_list = dispatch_window_list;
38373   dispatch_windows *window_list_next = dispatch_window_list->next;
38374   unsigned int num_restrict;
38375   enum dispatch_group group = get_insn_group (insn);
38376   enum insn_path path = get_insn_path (insn);
38377   int sum;
38378 
38379   /* Make disp_cmp and disp_jcc get scheduled at the latest.  These
38380      instructions should be given the lowest priority in the
38381      scheduling process in the Haifa scheduler to make sure they will be
38382      scheduled in the same dispatch window as the reference to them.  */
38383   if (group == disp_jcc || group == disp_cmp)
38384     return false;
38385 
38386   /* Check nonrestricted.  */
38387   if (group == disp_no_group || group == disp_branch)
38388     return true;
38389 
38390   /* Get last dispatch window.  */
38391   if (window_list_next)
38392     window_list = window_list_next;
38393 
38394   if (window_list->window_num == 1)
38395     {
38396       sum = window_list->prev->window_size + window_list->window_size;
38397 
38398       if (sum == 32
38399 	  || (min_insn_size (insn) + sum) >= 48)
38400 	/* Window 1 is full.  Go for next window.  */
38401 	return true;
38402     }
38403 
38404   num_restrict = count_num_restricted (insn, window_list);
38405 
38406   if (num_restrict > num_allowable_groups[group])
38407     return false;
38408 
38409   /* See if it fits in the first window.  */
38410   if (window_list->window_num == 0)
38411     {
38412       /* The first window should have only single and double path
38413 	 uops.  */
38414       if (path == path_double
38415 	  && (window_list->num_uops + 2) > MAX_INSN)
38416 	return false;
38417       else if (path != path_single)
38418         return false;
38419     }
38420   return true;
38421 }
38422 
38423 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38424    dispatch window WINDOW_LIST.  */
38425 
38426 static void
38427 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38428 {
38429   int byte_len = min_insn_size (insn);
38430   int num_insn = window_list->num_insn;
38431   int imm_size;
38432   sched_insn_info *window = window_list->window;
38433   enum dispatch_group group = get_insn_group (insn);
38434   enum insn_path path = get_insn_path (insn);
38435   int num_imm_operand;
38436   int num_imm32_operand;
38437   int num_imm64_operand;
38438 
38439   if (!window_list->violation && group != disp_cmp
38440       && !fits_dispatch_window (insn))
38441     window_list->violation = true;
38442 
38443   imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38444 				 &num_imm64_operand);
38445 
38446   /* Initialize window with new instruction.  */
38447   window[num_insn].insn = insn;
38448   window[num_insn].byte_len = byte_len;
38449   window[num_insn].group = group;
38450   window[num_insn].path = path;
38451   window[num_insn].imm_bytes = imm_size;
38452 
38453   window_list->window_size += byte_len;
38454   window_list->num_insn = num_insn + 1;
38455   window_list->num_uops = window_list->num_uops + num_uops;
38456   window_list->imm_size += imm_size;
38457   window_list->num_imm += num_imm_operand;
38458   window_list->num_imm_32 += num_imm32_operand;
38459   window_list->num_imm_64 += num_imm64_operand;
38460 
38461   if (group == disp_store)
38462     window_list->num_stores += 1;
38463   else if (group == disp_load
38464 	   || group == disp_prefetch)
38465     window_list->num_loads += 1;
38466   else if (group == disp_load_store)
38467     {
38468       window_list->num_stores += 1;
38469       window_list->num_loads += 1;
38470     }
38471 }
38472 
38473 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38474    If the total bytes of instructions or the number of instructions in
38475    the window exceed the allowable limits, a new window is allocated.  */
38476 
38477 static void
38478 add_to_dispatch_window (rtx insn)
38479 {
38480   int byte_len;
38481   dispatch_windows *window_list;
38482   dispatch_windows *next_list;
38483   dispatch_windows *window0_list;
38484   enum insn_path path;
38485   enum dispatch_group insn_group;
38486   bool insn_fits;
38487   int num_insn;
38488   int num_uops;
38489   int window_num;
38490   int insn_num_uops;
38491   int sum;
38492 
38493   if (INSN_CODE (insn) < 0)
38494     return;
38495 
38496   byte_len = min_insn_size (insn);
38497   window_list = dispatch_window_list;
38498   next_list = window_list->next;
38499   path = get_insn_path (insn);
38500   insn_group = get_insn_group (insn);
38501 
38502   /* Get the last dispatch window.  */
38503   if (next_list)
38504       window_list = dispatch_window_list->next;
38505 
38506   if (path == path_single)
38507     insn_num_uops = 1;
38508   else if (path == path_double)
38509     insn_num_uops = 2;
38510   else
38511     insn_num_uops = (int) path;
38512 
38513   /* If the current window is full, get a new window.
38514      Window number zero is full if MAX_INSN uops are scheduled in it.
38515      Window number one is full if the bytes in window zero plus the bytes
38516      in window one equal 32, if adding the new instruction's bytes would
38517      bring the total to 48 or more, or if it already holds MAX_INSN
38518      instructions.  */
38519   num_insn = window_list->num_insn;
38520   num_uops = window_list->num_uops;
38521   window_num = window_list->window_num;
38522   insn_fits = fits_dispatch_window (insn);
38523 
38524   if (num_insn >= MAX_INSN
38525       || num_uops + insn_num_uops > MAX_INSN
38526       || !(insn_fits))
38527     {
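      /* Switch to the other window (0 <-> 1) and start filling it.  */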
38528       window_num = ~window_num & 1;
38529       window_list = allocate_next_window (window_num);
38530     }
38531 
38532   if (window_num == 0)
38533     {
38534       add_insn_window (insn, window_list, insn_num_uops);
38535       if (window_list->num_insn >= MAX_INSN
38536 	  && insn_group == disp_branch)
38537 	{
38538 	  process_end_window ();
38539 	  return;
38540 	}
38541     }
38542   else if (window_num == 1)
38543     {
38544       window0_list = window_list->prev;
38545       sum = window0_list->window_size + window_list->window_size;
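      /* The two windows together are full when they already hold 32 bytes, or
         when adding this insn would reach 48 bytes or more; close out both
         windows and continue from window zero.  */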
38546       if (sum == 32
38547 	  || (byte_len + sum) >= 48)
38548 	{
38549 	  process_end_window ();
38550 	  window_list = dispatch_window_list;
38551 	}
38552 
38553       add_insn_window (insn, window_list, insn_num_uops);
38554     }
38555   else
38556     gcc_unreachable ();
38557 
38558   if (is_end_basic_block (insn_group))
38559     {
38560       /* End of basic block is reached; do end-basic-block processing.  */
38561       process_end_window ();
38562       return;
38563     }
38564 }
38565 
38566 /* Print the dispatch window, WINDOW_NUM, to FILE.  */
38567 
38568 DEBUG_FUNCTION static void
38569 debug_dispatch_window_file (FILE *file, int window_num)
38570 {
38571   dispatch_windows *list;
38572   int i;
38573 
38574   if (window_num == 0)
38575     list = dispatch_window_list;
38576   else
38577     list = dispatch_window_list1;
38578 
38579   fprintf (file, "Window #%d:\n", list->window_num);
38580   fprintf (file, "  num_insn = %d, num_uops = %d, window_size = %d\n",
38581 	  list->num_insn, list->num_uops, list->window_size);
38582   fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38583 	   list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38584 
38585   fprintf (file, "  num_loads = %d, num_stores = %d\n", list->num_loads,
38586 	  list->num_stores);
38587   fprintf (file, " insn info:\n");
38588 
38589   for (i = 0; i < MAX_INSN; i++)
38590     {
38591       if (!list->window[i].insn)
38592 	break;
38593       fprintf (file, "    group[%d] = %s, insn[%d] = %p, path[%d] = %d, byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38594 	      i, group_name[list->window[i].group],
38595 	      i, (void *)list->window[i].insn,
38596 	      i, list->window[i].path,
38597 	      i, list->window[i].byte_len,
38598 	      i, list->window[i].imm_bytes);
38599     }
38600 }
38601 
38602 /* Print to stdout a dispatch window.  */
38603 
38604 DEBUG_FUNCTION void
38605 debug_dispatch_window (int window_num)
38606 {
38607   debug_dispatch_window_file (stdout, window_num);
38608 }
38609 
38610 /* Print INSN dispatch information to FILE.  */
38611 
38612 DEBUG_FUNCTION static void
38613 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38614 {
38615   int byte_len;
38616   enum insn_path path;
38617   enum dispatch_group group;
38618   int imm_size;
38619   int num_imm_operand;
38620   int num_imm32_operand;
38621   int num_imm64_operand;
38622 
38623   if (INSN_CODE (insn) < 0)
38624     return;
38625 
38626   byte_len = min_insn_size (insn);
38627   path = get_insn_path (insn);
38628   group = get_insn_group (insn);
38629   imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38630 				 &num_imm64_operand);
38631 
38632   fprintf (file, " insn info:\n");
38633   fprintf (file, "  group = %s, path = %d, byte_len = %d\n",
38634 	   group_name[group], path, byte_len);
38635   fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38636 	   num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38637 }
38638 
38639 /* Print to stdout the status of the ready list with respect to
38640    dispatch windows.  */
38641 
38642 DEBUG_FUNCTION void
38643 debug_ready_dispatch (void)
38644 {
38645   int i;
38646   int no_ready = number_in_ready ();
38647 
38648   fprintf (stdout, "Number of ready: %d\n", no_ready);
38649 
38650   for (i = 0; i < no_ready; i++)
38651     debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38652 }
38653 
38654 /* This routine is the driver of the dispatch scheduler.  */
38655 
38656 static void
38657 do_dispatch (rtx insn, int mode)
38658 {
38659   if (mode == DISPATCH_INIT)
38660     init_dispatch_sched ();
38661   else if (mode == ADD_TO_DISPATCH_WINDOW)
38662     add_to_dispatch_window (insn);
38663 }
38664 
38665 /* If dispatch scheduling is supported, answer the query ACTION for INSN.  */
38666 
38667 static bool
38668 has_dispatch (rtx insn, int action)
38669 {
38670   if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38671       && flag_dispatch_scheduler)
38672     switch (action)
38673       {
38674       default:
38675 	return false;
38676 
38677       case IS_DISPATCH_ON:
38678 	return true;
38679 	return true;
38681       case IS_CMP:
38682 	return is_cmp (insn);
38683 
38684       case DISPATCH_VIOLATION:
38685 	return dispatch_violation ();
38686 
38687       case FITS_DISPATCH_WINDOW:
38688 	return fits_dispatch_window (insn);
38689       }
38690 
38691   return false;
38692 }
38693 
38694 /* Implementation of reassociation_width target hook used by
38695    reassoc phase to identify parallelism level in reassociated
38696    tree.  The statement's tree_code is passed in OPC.  The type of the
38697    arguments is passed in MODE.
38698 
38699    Currently parallel reassociation is enabled for Atom
38700    processors only and we set reassociation width to be 2
38701    because Atom may issue up to 2 instructions per cycle.
38702 
38703    The return value should be adjusted if parallel reassociation is
38704    enabled for other processors.  */
38705 
38706 static int
38707 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38708 			  enum machine_mode mode)
38709 {
38710   int res = 1;
38711 
38712   if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38713     res = 2;
38714   else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38715     res = 2;
38716 
38717   return res;
38718 }
38719 
38720 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38721    place emms and femms instructions.  */
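/* Returning word_mode (a scalar mode) signals that there is no preferred
   SIMD mode for the given element mode, which effectively disables
   vectorization for it.  */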
38722 
38723 static enum machine_mode
38724 ix86_preferred_simd_mode (enum machine_mode mode)
38725 {
38726   if (!TARGET_SSE)
38727     return word_mode;
38728 
38729   switch (mode)
38730     {
38731     case QImode:
38732       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38733     case HImode:
38734       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38735     case SImode:
38736       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38737     case DImode:
38738       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38739 
38740     case SFmode:
38741       if (TARGET_AVX && !TARGET_PREFER_AVX128)
38742 	return V8SFmode;
38743       else
38744 	return V4SFmode;
38745 
38746     case DFmode:
38747       if (!TARGET_VECTORIZE_DOUBLE)
38748 	return word_mode;
38749       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38750 	return V4DFmode;
38751       else if (TARGET_SSE2)
38752 	return V2DFmode;
38753       /* FALLTHRU */
38754 
38755     default:
38756       return word_mode;
38757     }
38758 }
38759 
38760 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38761    vectors.  */
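/* The result is a bitmask of vector sizes in bytes to try (32 and 16 here);
   a zero result means only the preferred SIMD mode size is considered.  */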
38762 
38763 static unsigned int
38764 ix86_autovectorize_vector_sizes (void)
38765 {
38766   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38767 }
38768 
38769 /* Initialize the GCC target structure.  */
38770 #undef TARGET_RETURN_IN_MEMORY
38771 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38772 
38773 #undef TARGET_LEGITIMIZE_ADDRESS
38774 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38775 
38776 #undef TARGET_ATTRIBUTE_TABLE
38777 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38778 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38779 #  undef TARGET_MERGE_DECL_ATTRIBUTES
38780 #  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38781 #endif
38782 
38783 #undef TARGET_COMP_TYPE_ATTRIBUTES
38784 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38785 
38786 #undef TARGET_INIT_BUILTINS
38787 #define TARGET_INIT_BUILTINS ix86_init_builtins
38788 #undef TARGET_BUILTIN_DECL
38789 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38790 #undef TARGET_EXPAND_BUILTIN
38791 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38792 
38793 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38794 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38795   ix86_builtin_vectorized_function
38796 
38797 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38798 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38799 
38800 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38801 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38802 
38803 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38804 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38805 
38806 #undef TARGET_BUILTIN_RECIPROCAL
38807 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38808 
38809 #undef TARGET_ASM_FUNCTION_EPILOGUE
38810 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38811 
38812 #undef TARGET_ENCODE_SECTION_INFO
38813 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38814 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38815 #else
38816 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38817 #endif
38818 
38819 #undef TARGET_ASM_OPEN_PAREN
38820 #define TARGET_ASM_OPEN_PAREN ""
38821 #undef TARGET_ASM_CLOSE_PAREN
38822 #define TARGET_ASM_CLOSE_PAREN ""
38823 
38824 #undef TARGET_ASM_BYTE_OP
38825 #define TARGET_ASM_BYTE_OP ASM_BYTE
38826 
38827 #undef TARGET_ASM_ALIGNED_HI_OP
38828 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38829 #undef TARGET_ASM_ALIGNED_SI_OP
38830 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38831 #ifdef ASM_QUAD
38832 #undef TARGET_ASM_ALIGNED_DI_OP
38833 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38834 #endif
38835 
38836 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38837 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38838 
38839 #undef TARGET_ASM_UNALIGNED_HI_OP
38840 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38841 #undef TARGET_ASM_UNALIGNED_SI_OP
38842 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38843 #undef TARGET_ASM_UNALIGNED_DI_OP
38844 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38845 
38846 #undef TARGET_PRINT_OPERAND
38847 #define TARGET_PRINT_OPERAND ix86_print_operand
38848 #undef TARGET_PRINT_OPERAND_ADDRESS
38849 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38850 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38851 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38852 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38853 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38854 
38855 #undef TARGET_SCHED_INIT_GLOBAL
38856 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38857 #undef TARGET_SCHED_ADJUST_COST
38858 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38859 #undef TARGET_SCHED_ISSUE_RATE
38860 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38861 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38862 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38863   ia32_multipass_dfa_lookahead
38864 
38865 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38866 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38867 
38868 #ifdef HAVE_AS_TLS
38869 #undef TARGET_HAVE_TLS
38870 #define TARGET_HAVE_TLS true
38871 #endif
38872 #undef TARGET_CANNOT_FORCE_CONST_MEM
38873 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38874 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38875 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38876 
38877 #undef TARGET_DELEGITIMIZE_ADDRESS
38878 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38879 
38880 #undef TARGET_MS_BITFIELD_LAYOUT_P
38881 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38882 
38883 #if TARGET_MACHO
38884 #undef TARGET_BINDS_LOCAL_P
38885 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38886 #endif
38887 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38888 #undef TARGET_BINDS_LOCAL_P
38889 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38890 #endif
38891 
38892 #undef TARGET_ASM_OUTPUT_MI_THUNK
38893 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38894 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38895 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38896 
38897 #undef TARGET_ASM_FILE_START
38898 #define TARGET_ASM_FILE_START x86_file_start
38899 
38900 #undef TARGET_OPTION_OVERRIDE
38901 #define TARGET_OPTION_OVERRIDE ix86_option_override
38902 
38903 #undef TARGET_REGISTER_MOVE_COST
38904 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38905 #undef TARGET_MEMORY_MOVE_COST
38906 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38907 #undef TARGET_RTX_COSTS
38908 #define TARGET_RTX_COSTS ix86_rtx_costs
38909 #undef TARGET_ADDRESS_COST
38910 #define TARGET_ADDRESS_COST ix86_address_cost
38911 
38912 #undef TARGET_FIXED_CONDITION_CODE_REGS
38913 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38914 #undef TARGET_CC_MODES_COMPATIBLE
38915 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38916 
38917 #undef TARGET_MACHINE_DEPENDENT_REORG
38918 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38919 
38920 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38921 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38922 
38923 #undef TARGET_BUILD_BUILTIN_VA_LIST
38924 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38925 
38926 #undef TARGET_ENUM_VA_LIST_P
38927 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38928 
38929 #undef TARGET_FN_ABI_VA_LIST
38930 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38931 
38932 #undef TARGET_CANONICAL_VA_LIST_TYPE
38933 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38934 
38935 #undef TARGET_EXPAND_BUILTIN_VA_START
38936 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38937 
38938 #undef TARGET_MD_ASM_CLOBBERS
38939 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38940 
38941 #undef TARGET_PROMOTE_PROTOTYPES
38942 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38943 #undef TARGET_STRUCT_VALUE_RTX
38944 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38945 #undef TARGET_SETUP_INCOMING_VARARGS
38946 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38947 #undef TARGET_MUST_PASS_IN_STACK
38948 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38949 #undef TARGET_FUNCTION_ARG_ADVANCE
38950 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38951 #undef TARGET_FUNCTION_ARG
38952 #define TARGET_FUNCTION_ARG ix86_function_arg
38953 #undef TARGET_FUNCTION_ARG_BOUNDARY
38954 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38955 #undef TARGET_PASS_BY_REFERENCE
38956 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38957 #undef TARGET_INTERNAL_ARG_POINTER
38958 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38959 #undef TARGET_UPDATE_STACK_BOUNDARY
38960 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38961 #undef TARGET_GET_DRAP_RTX
38962 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38963 #undef TARGET_STRICT_ARGUMENT_NAMING
38964 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38965 #undef TARGET_STATIC_CHAIN
38966 #define TARGET_STATIC_CHAIN ix86_static_chain
38967 #undef TARGET_TRAMPOLINE_INIT
38968 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38969 #undef TARGET_RETURN_POPS_ARGS
38970 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38971 
38972 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38973 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38974 
38975 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38976 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38977 
38978 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38979 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38980 
38981 #undef TARGET_C_MODE_FOR_SUFFIX
38982 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38983 
38984 #ifdef HAVE_AS_TLS
38985 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38986 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38987 #endif
38988 
38989 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38990 #undef TARGET_INSERT_ATTRIBUTES
38991 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38992 #endif
38993 
38994 #undef TARGET_MANGLE_TYPE
38995 #define TARGET_MANGLE_TYPE ix86_mangle_type
38996 
38997 #if !TARGET_MACHO
38998 #undef TARGET_STACK_PROTECT_FAIL
38999 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
39000 #endif
39001 
39002 #undef TARGET_FUNCTION_VALUE
39003 #define TARGET_FUNCTION_VALUE ix86_function_value
39004 
39005 #undef TARGET_FUNCTION_VALUE_REGNO_P
39006 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
39007 
39008 #undef TARGET_PROMOTE_FUNCTION_MODE
39009 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
39010 
39011 #undef TARGET_INSTANTIATE_DECLS
39012 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
39013 
39014 #undef TARGET_SECONDARY_RELOAD
39015 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
39016 
39017 #undef TARGET_CLASS_MAX_NREGS
39018 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
39019 
39020 #undef TARGET_PREFERRED_RELOAD_CLASS
39021 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
39022 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
39023 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
39024 #undef TARGET_CLASS_LIKELY_SPILLED_P
39025 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
39026 
39027 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
39028 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
39029   ix86_builtin_vectorization_cost
39030 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
39031 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
39032   ix86_vectorize_vec_perm_const_ok
39033 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
39034 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
39035   ix86_preferred_simd_mode
39036 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
39037 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
39038   ix86_autovectorize_vector_sizes
39039 
39040 #undef TARGET_SET_CURRENT_FUNCTION
39041 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
39042 
39043 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
39044 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
39045 
39046 #undef TARGET_OPTION_SAVE
39047 #define TARGET_OPTION_SAVE ix86_function_specific_save
39048 
39049 #undef TARGET_OPTION_RESTORE
39050 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
39051 
39052 #undef TARGET_OPTION_PRINT
39053 #define TARGET_OPTION_PRINT ix86_function_specific_print
39054 
39055 #undef TARGET_CAN_INLINE_P
39056 #define TARGET_CAN_INLINE_P ix86_can_inline_p
39057 
39058 #undef TARGET_EXPAND_TO_RTL_HOOK
39059 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
39060 
39061 #undef TARGET_LEGITIMATE_ADDRESS_P
39062 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
39063 
39064 #undef TARGET_LEGITIMATE_CONSTANT_P
39065 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
39066 
39067 #undef TARGET_FRAME_POINTER_REQUIRED
39068 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
39069 
39070 #undef TARGET_CAN_ELIMINATE
39071 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
39072 
39073 #undef TARGET_EXTRA_LIVE_ON_ENTRY
39074 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
39075 
39076 #undef TARGET_ASM_CODE_END
39077 #define TARGET_ASM_CODE_END ix86_code_end
39078 
39079 #undef TARGET_CONDITIONAL_REGISTER_USAGE
39080 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
39081 
39082 #if TARGET_MACHO
39083 #undef TARGET_INIT_LIBFUNCS
39084 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
39085 #endif
39086 
39087 struct gcc_target targetm = TARGET_INITIALIZER;
39088 
39089 #include "gt-i386.h"
39090