xref: /dragonfly/contrib/gcc-4.7/gcc/config/i386/i386.c (revision 6ca88057)
1 /* Subroutines used for code generation on IA-32.
2    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
3    2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013
4    Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12 
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 
65 enum upper_128bits_state
66 {
67   unknown = 0,
68   unused,
69   used
70 };
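/* Roughly speaking, these three values form the small dataflow lattice used
   by the vzeroupper pass below: "unknown" before a block has been analyzed,
   "unused" while the upper 128bits of the AVX registers are known to be
   clear, and "used" once a 256bit AVX access has been seen.  */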
71 
72 typedef struct block_info_def
73 {
74   /* State of the upper 128bits of AVX registers at exit.  */
75   enum upper_128bits_state state;
76   /* TRUE if state of the upper 128bits of AVX registers is unchanged
77      in this block.  */
78   bool unchanged;
79   /* TRUE if block has been processed.  */
80   bool processed;
81   /* TRUE if block has been scanned.  */
82   bool scanned;
83   /* Previous state of the upper 128bits of AVX registers at entry.  */
84   enum upper_128bits_state prev;
85 } *block_info;
86 
87 #define BLOCK_INFO(B)   ((block_info) (B)->aux)
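/* A minimal usage sketch: once move_or_delete_vzeroupper below has called
   alloc_aux_for_blocks (sizeof (struct block_info_def)), every basic block
   carries one of these records in its AUX field, so per-block state is read
   and written as, e.g.,

     if (!BLOCK_INFO (bb)->scanned)
       BLOCK_INFO (bb)->state = unknown;

   where BB stands for any basic_block in the current function.  */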
88 
89 enum call_avx256_state
90 {
91   /* Callee returns 256bit AVX register.  */
92   callee_return_avx256 = -1,
93   /* Callee returns and passes 256bit AVX register.  */
94   callee_return_pass_avx256,
95   /* Callee passes 256bit AVX register.  */
96   callee_pass_avx256,
97   /* Callee doesn't return nor pass a 256bit AVX register, or no
98      256bit AVX register in the function return.  */
99   call_no_avx256,
100   /* vzeroupper intrinsic.  */
101   vzeroupper_intrinsic
102 };
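/* These values are carried as the constant operand of the vzeroupper
   UNSPEC_VOLATILE pattern; move_or_delete_vzeroupper_2 below recovers them
   with, roughly, avx256 = INTVAL (XVECEXP (pat, 0, 0)).  */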
103 
104 /* Check if a 256bit AVX register is referenced in stores.   */
105 
106 static void
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
108 {
109   if ((REG_P (dest)
110        && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111       || (GET_CODE (set) == SET
112 	  && REG_P (SET_SRC (set))
113 	  && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114     {
115       enum upper_128bits_state *state
116 	= (enum upper_128bits_state *) data;
117       *state = used;
118     }
119 }
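/* check_avx256_stores is meant to run as a note_stores callback; a minimal
   sketch of the call site (see move_or_delete_vzeroupper_2 below):

     enum upper_128bits_state state = unused;
     note_stores (PATTERN (insn), check_avx256_stores, &state);
     if (state == used)
       ... the upper 128bits are live at this point ...  */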
120 
121 /* Helper function for move_or_delete_vzeroupper_1.  Look for vzeroupper
122    in basic block BB.  Delete it if upper 128bit AVX registers are
123    unused.  If it isn't deleted, move it to just before a jump insn.
124 
125    STATE is state of the upper 128bits of AVX registers at entry.  */
126 
127 static void
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 			     enum upper_128bits_state state)
130 {
131   rtx insn, bb_end;
132   rtx vzeroupper_insn = NULL_RTX;
133   rtx pat;
134   int avx256;
135   bool unchanged;
136 
137   if (BLOCK_INFO (bb)->unchanged)
138     {
139       if (dump_file)
140 	fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
141 		 bb->index, state);
142 
143       BLOCK_INFO (bb)->state = state;
144       return;
145     }
146 
147   if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148     {
149       if (dump_file)
150 	fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 		 bb->index, BLOCK_INFO (bb)->state);
152       return;
153     }
154 
155   BLOCK_INFO (bb)->prev = state;
156 
157   if (dump_file)
158     fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
159 	     bb->index, state);
160 
161   unchanged = true;
162 
163   /* BB_END changes when it is deleted.  */
164   bb_end = BB_END (bb);
165   insn = BB_HEAD (bb);
166   while (insn != bb_end)
167     {
168       insn = NEXT_INSN (insn);
169 
170       if (!NONDEBUG_INSN_P (insn))
171 	continue;
172 
173       /* Move vzeroupper before jump/call.  */
174       if (JUMP_P (insn) || CALL_P (insn))
175 	{
176 	  if (!vzeroupper_insn)
177 	    continue;
178 
179 	  if (PREV_INSN (insn) != vzeroupper_insn)
180 	    {
181 	      if (dump_file)
182 		{
183 		  fprintf (dump_file, "Move vzeroupper after:\n");
184 		  print_rtl_single (dump_file, PREV_INSN (insn));
185 		  fprintf (dump_file, "before:\n");
186 		  print_rtl_single (dump_file, insn);
187 		}
188 	      reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
189 				  PREV_INSN (insn));
190 	    }
191 	  vzeroupper_insn = NULL_RTX;
192 	  continue;
193 	}
194 
195       pat = PATTERN (insn);
196 
197       /* Check insn for vzeroupper intrinsic.  */
198       if (GET_CODE (pat) == UNSPEC_VOLATILE
199 	  && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 	{
201 	  if (dump_file)
202 	    {
203 	      /* Found vzeroupper intrinsic.  */
204 	      fprintf (dump_file, "Found vzeroupper:\n");
205 	      print_rtl_single (dump_file, insn);
206 	    }
207 	}
208       else
209 	{
210 	  /* Check insn for vzeroall intrinsic.  */
211 	  if (GET_CODE (pat) == PARALLEL
212 	      && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 	      && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 	    {
215 	      state = unused;
216 	      unchanged = false;
217 
218 	      /* Delete pending vzeroupper insertion.  */
219 	      if (vzeroupper_insn)
220 		{
221 		  delete_insn (vzeroupper_insn);
222 		  vzeroupper_insn = NULL_RTX;
223 		}
224 	    }
225 	  else if (state != used)
226 	    {
227 	      note_stores (pat, check_avx256_stores, &state);
228 	      if (state == used)
229 		unchanged = false;
230 	    }
231 	  continue;
232 	}
233 
234       /* Process vzeroupper intrinsic.  */
235       avx256 = INTVAL (XVECEXP (pat, 0, 0));
236 
237       if (state == unused)
238 	{
239 	  /* Since the upper 128bits are cleared, callee must not pass
240 	     256bit AVX register.  We only need to check if callee
241 	     returns 256bit AVX register.  */
242 	  if (avx256 == callee_return_avx256)
243 	    {
244 	      state = used;
245 	      unchanged = false;
246 	    }
247 
248 	  /* Remove unnecessary vzeroupper since upper 128bits are
249 	     cleared.  */
250 	  if (dump_file)
251 	    {
252 	      fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 	      print_rtl_single (dump_file, insn);
254 	    }
255 	  delete_insn (insn);
256 	}
257       else
258 	{
259 	  /* Set state to UNUSED if callee doesn't return 256bit AVX
260 	     register.  */
261 	  if (avx256 != callee_return_pass_avx256)
262 	    state = unused;
263 
264 	  if (avx256 == callee_return_pass_avx256
265 	      || avx256 == callee_pass_avx256)
266 	    {
267 	      /* Must remove vzeroupper since arguments are passed to the
268 		 callee in a 256bit AVX register.  */
269 	      if (dump_file)
270 		{
271 		  fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 		  print_rtl_single (dump_file, insn);
273 		}
274 	      delete_insn (insn);
275 	    }
276 	  else
277 	    {
278 	      vzeroupper_insn = insn;
279 	      unchanged = false;
280 	    }
281 	}
282     }
283 
284   BLOCK_INFO (bb)->state = state;
285   BLOCK_INFO (bb)->unchanged = unchanged;
286   BLOCK_INFO (bb)->scanned = true;
287 
288   if (dump_file)
289     fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 	     bb->index, unchanged ? "unchanged" : "changed",
291 	     state);
292 }
293 
294 /* Helper function for move_or_delete_vzeroupper.  Process vzeroupper
295    in BLOCK and check its predecessor blocks.  Treat UNKNOWN state
296    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
297    state is changed.  */
298 
299 static bool
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 {
302   edge e;
303   edge_iterator ei;
304   enum upper_128bits_state state, old_state, new_state;
305   bool seen_unknown;
306 
307   if (dump_file)
308     fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 	     block->index, BLOCK_INFO (block)->processed);
310 
311   if (BLOCK_INFO (block)->processed)
312     return false;
313 
314   state = unused;
315 
316   /* Check all predecessor edges of this block.  */
317   seen_unknown = false;
318   FOR_EACH_EDGE (e, ei, block->preds)
319     {
320       if (e->src == block)
321 	continue;
322       switch (BLOCK_INFO (e->src)->state)
323 	{
324 	case unknown:
325 	  if (!unknown_is_unused)
326 	    seen_unknown = true;
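	  /* Fall through: with UNKNOWN_IS_UNUSED set, an unknown
	     predecessor is treated like an unused one.  */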
327 	case unused:
328 	  break;
329 	case used:
330 	  state = used;
331 	  goto done;
332 	}
333     }
334 
335   if (seen_unknown)
336     state = unknown;
337 
338 done:
339   old_state = BLOCK_INFO (block)->state;
340   move_or_delete_vzeroupper_2 (block, state);
341   new_state = BLOCK_INFO (block)->state;
342 
343   if (state != unknown || new_state == used)
344     BLOCK_INFO (block)->processed = true;
345 
346   /* Need to rescan if the upper 128bits of AVX registers are changed
347      to USED at exit.  */
348   if (new_state != old_state)
349     {
350       if (new_state == used)
351 	cfun->machine->rescan_vzeroupper_p = 1;
352       return true;
353     }
354   else
355     return false;
356 }
357 
358 /* Go through the instruction stream looking for vzeroupper.  Delete
359    it if upper 128bit AVX registers are unused.  If it isn't deleted,
360    move it to just before a jump insn.  */
361 
362 static void
363 move_or_delete_vzeroupper (void)
364 {
365   edge e;
366   edge_iterator ei;
367   basic_block bb;
368   fibheap_t worklist, pending, fibheap_swap;
369   sbitmap visited, in_worklist, in_pending, sbitmap_swap;
370   int *bb_order;
371   int *rc_order;
372   int i;
373 
374   /* Set up block info for each basic block.  */
375   alloc_aux_for_blocks (sizeof (struct block_info_def));
376 
377   /* Process outgoing edges of entry point.  */
378   if (dump_file)
379     fprintf (dump_file, "Process outgoing edges of entry point\n");
380 
381   FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382     {
383       move_or_delete_vzeroupper_2 (e->dest,
384 				   cfun->machine->caller_pass_avx256_p
385 				   ? used : unused);
386       BLOCK_INFO (e->dest)->processed = true;
387     }
388 
389   /* Compute reverse completion order of depth first search of the CFG
390      so that the data-flow runs faster.  */
391   rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392   bb_order = XNEWVEC (int, last_basic_block);
393   pre_and_rev_post_order_compute (NULL, rc_order, false);
394   for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395     bb_order[rc_order[i]] = i;
396   free (rc_order);
397 
398   worklist = fibheap_new ();
399   pending = fibheap_new ();
400   visited = sbitmap_alloc (last_basic_block);
401   in_worklist = sbitmap_alloc (last_basic_block);
402   in_pending = sbitmap_alloc (last_basic_block);
403   sbitmap_zero (in_worklist);
404 
405   /* Don't check outgoing edges of entry point.  */
406   sbitmap_ones (in_pending);
407   FOR_EACH_BB (bb)
408     if (BLOCK_INFO (bb)->processed)
409       RESET_BIT (in_pending, bb->index);
410     else
411       {
412 	move_or_delete_vzeroupper_1 (bb, false);
413 	fibheap_insert (pending, bb_order[bb->index], bb);
414       }
415 
416   if (dump_file)
417     fprintf (dump_file, "Check remaining basic blocks\n");
418 
419   while (!fibheap_empty (pending))
420     {
421       fibheap_swap = pending;
422       pending = worklist;
423       worklist = fibheap_swap;
424       sbitmap_swap = in_pending;
425       in_pending = in_worklist;
426       in_worklist = sbitmap_swap;
427 
428       sbitmap_zero (visited);
429 
430       cfun->machine->rescan_vzeroupper_p = 0;
431 
432       while (!fibheap_empty (worklist))
433 	{
434 	  bb = (basic_block) fibheap_extract_min (worklist);
435 	  RESET_BIT (in_worklist, bb->index);
436 	  gcc_assert (!TEST_BIT (visited, bb->index));
437 	  if (!TEST_BIT (visited, bb->index))
438 	    {
439 	      edge_iterator ei;
440 
441 	      SET_BIT (visited, bb->index);
442 
443 	      if (move_or_delete_vzeroupper_1 (bb, false))
444 		FOR_EACH_EDGE (e, ei, bb->succs)
445 		  {
446 		    if (e->dest == EXIT_BLOCK_PTR
447 			|| BLOCK_INFO (e->dest)->processed)
448 		      continue;
449 
450 		    if (TEST_BIT (visited, e->dest->index))
451 		      {
452 			if (!TEST_BIT (in_pending, e->dest->index))
453 			  {
454 			    /* Send E->DEST to next round.  */
455 			    SET_BIT (in_pending, e->dest->index);
456 			    fibheap_insert (pending,
457 					    bb_order[e->dest->index],
458 					    e->dest);
459 			  }
460 		      }
461 		    else if (!TEST_BIT (in_worklist, e->dest->index))
462 		      {
463 			/* Add E->DEST to current round.  */
464 			SET_BIT (in_worklist, e->dest->index);
465 			fibheap_insert (worklist, bb_order[e->dest->index],
466 					e->dest);
467 		      }
468 		  }
469 	    }
470 	}
471 
472       if (!cfun->machine->rescan_vzeroupper_p)
473 	break;
474     }
475 
476   free (bb_order);
477   fibheap_delete (worklist);
478   fibheap_delete (pending);
479   sbitmap_free (visited);
480   sbitmap_free (in_worklist);
481   sbitmap_free (in_pending);
482 
483   if (dump_file)
484     fprintf (dump_file, "Process remaining basic blocks\n");
485 
486   FOR_EACH_BB (bb)
487     move_or_delete_vzeroupper_1 (bb, true);
488 
489   free_aux_for_blocks ();
490 }
491 
492 static rtx legitimize_dllimport_symbol (rtx, bool);
493 
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
496 #endif
497 
498 /* Return index of given mode in mult and division cost tables.  */
499 #define MODE_INDEX(mode)					\
500   ((mode) == QImode ? 0						\
501    : (mode) == HImode ? 1					\
502    : (mode) == SImode ? 2					\
503    : (mode) == DImode ? 3					\
504    : 4)
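/* For example, MODE_INDEX (SImode) is 2 and selects the SI entry of the
   five-element mult and divide cost arrays below, while any mode that is
   not QI/HI/SI/DImode falls through to index 4, the "other" slot.  */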
505 
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
508 #define COSTS_N_BYTES(N) ((N) * 2)
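/* Under that assumption COSTS_N_BYTES (2) equals COSTS_N_INSNS (1), so the
   2-byte add is charged the same number in the size table below as a single
   average instruction is in the speed-oriented tables.  */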
509 
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
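/* Each cost table below supplies two stringop_algs descriptor pairs, one for
   memcpy and one for memset, with a 32bit and a 64bit variant in each pair;
   DUMMY_STRINGOP_ALGS appears to act as the filler for a variant that is
   never consulted (e.g. the 64bit slot of the 32bit-only i386_cost).  */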
511 
512 const
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514   COSTS_N_BYTES (2),			/* cost of an add instruction */
515   COSTS_N_BYTES (3),			/* cost of a lea instruction */
516   COSTS_N_BYTES (2),			/* variable shift costs */
517   COSTS_N_BYTES (3),			/* constant shift costs */
518   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
519    COSTS_N_BYTES (3),			/*				 HI */
520    COSTS_N_BYTES (3),			/*				 SI */
521    COSTS_N_BYTES (3),			/*				 DI */
522    COSTS_N_BYTES (5)},			/*			      other */
523   0,					/* cost of multiply per each bit set */
524   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
525    COSTS_N_BYTES (3),			/*			    HI */
526    COSTS_N_BYTES (3),			/*			    SI */
527    COSTS_N_BYTES (3),			/*			    DI */
528    COSTS_N_BYTES (5)},			/*			    other */
529   COSTS_N_BYTES (3),			/* cost of movsx */
530   COSTS_N_BYTES (3),			/* cost of movzx */
531   0,					/* "large" insn */
532   2,					/* MOVE_RATIO */
533   2,				     /* cost for loading QImode using movzbl */
534   {2, 2, 2},				/* cost of loading integer registers
535 					   in QImode, HImode and SImode.
536 					   Relative to reg-reg move (2).  */
537   {2, 2, 2},				/* cost of storing integer registers */
538   2,					/* cost of reg,reg fld/fst */
539   {2, 2, 2},				/* cost of loading fp registers
540 					   in SFmode, DFmode and XFmode */
541   {2, 2, 2},				/* cost of storing fp registers
542 					   in SFmode, DFmode and XFmode */
543   3,					/* cost of moving MMX register */
544   {3, 3},				/* cost of loading MMX registers
545 					   in SImode and DImode */
546   {3, 3},				/* cost of storing MMX registers
547 					   in SImode and DImode */
548   3,					/* cost of moving SSE register */
549   {3, 3, 3},				/* cost of loading SSE registers
550 					   in SImode, DImode and TImode */
551   {3, 3, 3},				/* cost of storing SSE registers
552 					   in SImode, DImode and TImode */
553   3,					/* MMX or SSE register to integer */
554   0,					/* size of l1 cache  */
555   0,					/* size of l2 cache  */
556   0,					/* size of prefetch block */
557   0,					/* number of parallel prefetches */
558   2,					/* Branch cost */
559   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
560   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
561   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
562   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
563   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
564   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
565   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566    {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568    {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569   1,					/* scalar_stmt_cost.  */
570   1,					/* scalar load_cost.  */
571   1,					/* scalar_store_cost.  */
572   1,					/* vec_stmt_cost.  */
573   1,					/* vec_to_scalar_cost.  */
574   1,					/* scalar_to_vec_cost.  */
575   1,					/* vec_align_load_cost.  */
576   1,					/* vec_unalign_load_cost.  */
577   1,					/* vec_store_cost.  */
578   1,					/* cond_taken_branch_cost.  */
579   1,					/* cond_not_taken_branch_cost.  */
580 };
581 
582 /* Processor costs (relative to an add) */
583 static const
584 struct processor_costs i386_cost = {	/* 386 specific costs */
585   COSTS_N_INSNS (1),			/* cost of an add instruction */
586   COSTS_N_INSNS (1),			/* cost of a lea instruction */
587   COSTS_N_INSNS (3),			/* variable shift costs */
588   COSTS_N_INSNS (2),			/* constant shift costs */
589   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
590    COSTS_N_INSNS (6),			/*				 HI */
591    COSTS_N_INSNS (6),			/*				 SI */
592    COSTS_N_INSNS (6),			/*				 DI */
593    COSTS_N_INSNS (6)},			/*			      other */
594   COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
595   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
596    COSTS_N_INSNS (23),			/*			    HI */
597    COSTS_N_INSNS (23),			/*			    SI */
598    COSTS_N_INSNS (23),			/*			    DI */
599    COSTS_N_INSNS (23)},			/*			    other */
600   COSTS_N_INSNS (3),			/* cost of movsx */
601   COSTS_N_INSNS (2),			/* cost of movzx */
602   15,					/* "large" insn */
603   3,					/* MOVE_RATIO */
604   4,				     /* cost for loading QImode using movzbl */
605   {2, 4, 2},				/* cost of loading integer registers
606 					   in QImode, HImode and SImode.
607 					   Relative to reg-reg move (2).  */
608   {2, 4, 2},				/* cost of storing integer registers */
609   2,					/* cost of reg,reg fld/fst */
610   {8, 8, 8},				/* cost of loading fp registers
611 					   in SFmode, DFmode and XFmode */
612   {8, 8, 8},				/* cost of storing fp registers
613 					   in SFmode, DFmode and XFmode */
614   2,					/* cost of moving MMX register */
615   {4, 8},				/* cost of loading MMX registers
616 					   in SImode and DImode */
617   {4, 8},				/* cost of storing MMX registers
618 					   in SImode and DImode */
619   2,					/* cost of moving SSE register */
620   {4, 8, 16},				/* cost of loading SSE registers
621 					   in SImode, DImode and TImode */
622   {4, 8, 16},				/* cost of storing SSE registers
623 					   in SImode, DImode and TImode */
624   3,					/* MMX or SSE register to integer */
625   0,					/* size of l1 cache  */
626   0,					/* size of l2 cache  */
627   0,					/* size of prefetch block */
628   0,					/* number of parallel prefetches */
629   1,					/* Branch cost */
630   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
631   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
632   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
633   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
634   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
635   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
636   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637    DUMMY_STRINGOP_ALGS},
638   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639    DUMMY_STRINGOP_ALGS},
640   1,					/* scalar_stmt_cost.  */
641   1,					/* scalar load_cost.  */
642   1,					/* scalar_store_cost.  */
643   1,					/* vec_stmt_cost.  */
644   1,					/* vec_to_scalar_cost.  */
645   1,					/* scalar_to_vec_cost.  */
646   1,					/* vec_align_load_cost.  */
647   2,					/* vec_unalign_load_cost.  */
648   1,					/* vec_store_cost.  */
649   3,					/* cond_taken_branch_cost.  */
650   1,					/* cond_not_taken_branch_cost.  */
651 };
652 
653 static const
654 struct processor_costs i486_cost = {	/* 486 specific costs */
655   COSTS_N_INSNS (1),			/* cost of an add instruction */
656   COSTS_N_INSNS (1),			/* cost of a lea instruction */
657   COSTS_N_INSNS (3),			/* variable shift costs */
658   COSTS_N_INSNS (2),			/* constant shift costs */
659   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
660    COSTS_N_INSNS (12),			/*				 HI */
661    COSTS_N_INSNS (12),			/*				 SI */
662    COSTS_N_INSNS (12),			/*				 DI */
663    COSTS_N_INSNS (12)},			/*			      other */
664   1,					/* cost of multiply per each bit set */
665   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
666    COSTS_N_INSNS (40),			/*			    HI */
667    COSTS_N_INSNS (40),			/*			    SI */
668    COSTS_N_INSNS (40),			/*			    DI */
669    COSTS_N_INSNS (40)},			/*			    other */
670   COSTS_N_INSNS (3),			/* cost of movsx */
671   COSTS_N_INSNS (2),			/* cost of movzx */
672   15,					/* "large" insn */
673   3,					/* MOVE_RATIO */
674   4,				     /* cost for loading QImode using movzbl */
675   {2, 4, 2},				/* cost of loading integer registers
676 					   in QImode, HImode and SImode.
677 					   Relative to reg-reg move (2).  */
678   {2, 4, 2},				/* cost of storing integer registers */
679   2,					/* cost of reg,reg fld/fst */
680   {8, 8, 8},				/* cost of loading fp registers
681 					   in SFmode, DFmode and XFmode */
682   {8, 8, 8},				/* cost of storing fp registers
683 					   in SFmode, DFmode and XFmode */
684   2,					/* cost of moving MMX register */
685   {4, 8},				/* cost of loading MMX registers
686 					   in SImode and DImode */
687   {4, 8},				/* cost of storing MMX registers
688 					   in SImode and DImode */
689   2,					/* cost of moving SSE register */
690   {4, 8, 16},				/* cost of loading SSE registers
691 					   in SImode, DImode and TImode */
692   {4, 8, 16},				/* cost of storing SSE registers
693 					   in SImode, DImode and TImode */
694   3,					/* MMX or SSE register to integer */
695   4,					/* size of l1 cache.  486 has 8kB cache
696 					   shared for code and data, so 4kB is
697 					   not really precise.  */
698   4,					/* size of l2 cache  */
699   0,					/* size of prefetch block */
700   0,					/* number of parallel prefetches */
701   1,					/* Branch cost */
702   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
703   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
704   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
705   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
706   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
707   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
708   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709    DUMMY_STRINGOP_ALGS},
710   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711    DUMMY_STRINGOP_ALGS},
712   1,					/* scalar_stmt_cost.  */
713   1,					/* scalar load_cost.  */
714   1,					/* scalar_store_cost.  */
715   1,					/* vec_stmt_cost.  */
716   1,					/* vec_to_scalar_cost.  */
717   1,					/* scalar_to_vec_cost.  */
718   1,					/* vec_align_load_cost.  */
719   2,					/* vec_unalign_load_cost.  */
720   1,					/* vec_store_cost.  */
721   3,					/* cond_taken_branch_cost.  */
722   1,					/* cond_not_taken_branch_cost.  */
723 };
724 
725 static const
726 struct processor_costs pentium_cost = {
727   COSTS_N_INSNS (1),			/* cost of an add instruction */
728   COSTS_N_INSNS (1),			/* cost of a lea instruction */
729   COSTS_N_INSNS (4),			/* variable shift costs */
730   COSTS_N_INSNS (1),			/* constant shift costs */
731   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
732    COSTS_N_INSNS (11),			/*				 HI */
733    COSTS_N_INSNS (11),			/*				 SI */
734    COSTS_N_INSNS (11),			/*				 DI */
735    COSTS_N_INSNS (11)},			/*			      other */
736   0,					/* cost of multiply per each bit set */
737   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
738    COSTS_N_INSNS (25),			/*			    HI */
739    COSTS_N_INSNS (25),			/*			    SI */
740    COSTS_N_INSNS (25),			/*			    DI */
741    COSTS_N_INSNS (25)},			/*			    other */
742   COSTS_N_INSNS (3),			/* cost of movsx */
743   COSTS_N_INSNS (2),			/* cost of movzx */
744   8,					/* "large" insn */
745   6,					/* MOVE_RATIO */
746   6,				     /* cost for loading QImode using movzbl */
747   {2, 4, 2},				/* cost of loading integer registers
748 					   in QImode, HImode and SImode.
749 					   Relative to reg-reg move (2).  */
750   {2, 4, 2},				/* cost of storing integer registers */
751   2,					/* cost of reg,reg fld/fst */
752   {2, 2, 6},				/* cost of loading fp registers
753 					   in SFmode, DFmode and XFmode */
754   {4, 4, 6},				/* cost of storing fp registers
755 					   in SFmode, DFmode and XFmode */
756   8,					/* cost of moving MMX register */
757   {8, 8},				/* cost of loading MMX registers
758 					   in SImode and DImode */
759   {8, 8},				/* cost of storing MMX registers
760 					   in SImode and DImode */
761   2,					/* cost of moving SSE register */
762   {4, 8, 16},				/* cost of loading SSE registers
763 					   in SImode, DImode and TImode */
764   {4, 8, 16},				/* cost of storing SSE registers
765 					   in SImode, DImode and TImode */
766   3,					/* MMX or SSE register to integer */
767   8,					/* size of l1 cache.  */
768   8,					/* size of l2 cache  */
769   0,					/* size of prefetch block */
770   0,					/* number of parallel prefetches */
771   2,					/* Branch cost */
772   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
773   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
774   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
775   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
776   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
777   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
778   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779    DUMMY_STRINGOP_ALGS},
780   {{libcall, {{-1, rep_prefix_4_byte}}},
781    DUMMY_STRINGOP_ALGS},
782   1,					/* scalar_stmt_cost.  */
783   1,					/* scalar load_cost.  */
784   1,					/* scalar_store_cost.  */
785   1,					/* vec_stmt_cost.  */
786   1,					/* vec_to_scalar_cost.  */
787   1,					/* scalar_to_vec_cost.  */
788   1,					/* vec_align_load_cost.  */
789   2,					/* vec_unalign_load_cost.  */
790   1,					/* vec_store_cost.  */
791   3,					/* cond_taken_branch_cost.  */
792   1,					/* cond_not_taken_branch_cost.  */
793 };
794 
795 static const
796 struct processor_costs pentiumpro_cost = {
797   COSTS_N_INSNS (1),			/* cost of an add instruction */
798   COSTS_N_INSNS (1),			/* cost of a lea instruction */
799   COSTS_N_INSNS (1),			/* variable shift costs */
800   COSTS_N_INSNS (1),			/* constant shift costs */
801   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
802    COSTS_N_INSNS (4),			/*				 HI */
803    COSTS_N_INSNS (4),			/*				 SI */
804    COSTS_N_INSNS (4),			/*				 DI */
805    COSTS_N_INSNS (4)},			/*			      other */
806   0,					/* cost of multiply per each bit set */
807   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
808    COSTS_N_INSNS (17),			/*			    HI */
809    COSTS_N_INSNS (17),			/*			    SI */
810    COSTS_N_INSNS (17),			/*			    DI */
811    COSTS_N_INSNS (17)},			/*			    other */
812   COSTS_N_INSNS (1),			/* cost of movsx */
813   COSTS_N_INSNS (1),			/* cost of movzx */
814   8,					/* "large" insn */
815   6,					/* MOVE_RATIO */
816   2,				     /* cost for loading QImode using movzbl */
817   {4, 4, 4},				/* cost of loading integer registers
818 					   in QImode, HImode and SImode.
819 					   Relative to reg-reg move (2).  */
820   {2, 2, 2},				/* cost of storing integer registers */
821   2,					/* cost of reg,reg fld/fst */
822   {2, 2, 6},				/* cost of loading fp registers
823 					   in SFmode, DFmode and XFmode */
824   {4, 4, 6},				/* cost of storing fp registers
825 					   in SFmode, DFmode and XFmode */
826   2,					/* cost of moving MMX register */
827   {2, 2},				/* cost of loading MMX registers
828 					   in SImode and DImode */
829   {2, 2},				/* cost of storing MMX registers
830 					   in SImode and DImode */
831   2,					/* cost of moving SSE register */
832   {2, 2, 8},				/* cost of loading SSE registers
833 					   in SImode, DImode and TImode */
834   {2, 2, 8},				/* cost of storing SSE registers
835 					   in SImode, DImode and TImode */
836   3,					/* MMX or SSE register to integer */
837   8,					/* size of l1 cache.  */
838   256,					/* size of l2 cache  */
839   32,					/* size of prefetch block */
840   6,					/* number of parallel prefetches */
841   2,					/* Branch cost */
842   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
843   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
844   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
845   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
846   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
847   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
848   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
849      (we ensure the alignment).  For small blocks an inline loop is still a
850      noticeable win, for bigger blocks either rep movsl or rep movsb is
851      the way to go.  Rep movsb apparently has a more expensive startup time
852      in the CPU, but after 4K the difference is down in the noise.  */
853   {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855    DUMMY_STRINGOP_ALGS},
856   {{rep_prefix_4_byte, {{1024, unrolled_loop},
857   			{8192, rep_prefix_4_byte}, {-1, libcall}}},
858    DUMMY_STRINGOP_ALGS},
859   1,					/* scalar_stmt_cost.  */
860   1,					/* scalar load_cost.  */
861   1,					/* scalar_store_cost.  */
862   1,					/* vec_stmt_cost.  */
863   1,					/* vec_to_scalar_cost.  */
864   1,					/* scalar_to_vec_cost.  */
865   1,					/* vec_align_load_cost.  */
866   2,					/* vec_unalign_load_cost.  */
867   1,					/* vec_store_cost.  */
868   3,					/* cond_taken_branch_cost.  */
869   1,					/* cond_not_taken_branch_cost.  */
870 };
871 
872 static const
873 struct processor_costs geode_cost = {
874   COSTS_N_INSNS (1),			/* cost of an add instruction */
875   COSTS_N_INSNS (1),			/* cost of a lea instruction */
876   COSTS_N_INSNS (2),			/* variable shift costs */
877   COSTS_N_INSNS (1),			/* constant shift costs */
878   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
879    COSTS_N_INSNS (4),			/*				 HI */
880    COSTS_N_INSNS (7),			/*				 SI */
881    COSTS_N_INSNS (7),			/*				 DI */
882    COSTS_N_INSNS (7)},			/*			      other */
883   0,					/* cost of multiply per each bit set */
884   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
885    COSTS_N_INSNS (23),			/*			    HI */
886    COSTS_N_INSNS (39),			/*			    SI */
887    COSTS_N_INSNS (39),			/*			    DI */
888    COSTS_N_INSNS (39)},			/*			    other */
889   COSTS_N_INSNS (1),			/* cost of movsx */
890   COSTS_N_INSNS (1),			/* cost of movzx */
891   8,					/* "large" insn */
892   4,					/* MOVE_RATIO */
893   1,				     /* cost for loading QImode using movzbl */
894   {1, 1, 1},				/* cost of loading integer registers
895 					   in QImode, HImode and SImode.
896 					   Relative to reg-reg move (2).  */
897   {1, 1, 1},				/* cost of storing integer registers */
898   1,					/* cost of reg,reg fld/fst */
899   {1, 1, 1},				/* cost of loading fp registers
900 					   in SFmode, DFmode and XFmode */
901   {4, 6, 6},				/* cost of storing fp registers
902 					   in SFmode, DFmode and XFmode */
903 
904   1,					/* cost of moving MMX register */
905   {1, 1},				/* cost of loading MMX registers
906 					   in SImode and DImode */
907   {1, 1},				/* cost of storing MMX registers
908 					   in SImode and DImode */
909   1,					/* cost of moving SSE register */
910   {1, 1, 1},				/* cost of loading SSE registers
911 					   in SImode, DImode and TImode */
912   {1, 1, 1},				/* cost of storing SSE registers
913 					   in SImode, DImode and TImode */
914   1,					/* MMX or SSE register to integer */
915   64,					/* size of l1 cache.  */
916   128,					/* size of l2 cache.  */
917   32,					/* size of prefetch block */
918   1,					/* number of parallel prefetches */
919   1,					/* Branch cost */
920   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
921   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
922   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
923   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
924   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
925   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
926   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927    DUMMY_STRINGOP_ALGS},
928   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929    DUMMY_STRINGOP_ALGS},
930   1,					/* scalar_stmt_cost.  */
931   1,					/* scalar load_cost.  */
932   1,					/* scalar_store_cost.  */
933   1,					/* vec_stmt_cost.  */
934   1,					/* vec_to_scalar_cost.  */
935   1,					/* scalar_to_vec_cost.  */
936   1,					/* vec_align_load_cost.  */
937   2,					/* vec_unalign_load_cost.  */
938   1,					/* vec_store_cost.  */
939   3,					/* cond_taken_branch_cost.  */
940   1,					/* cond_not_taken_branch_cost.  */
941 };
942 
943 static const
944 struct processor_costs k6_cost = {
945   COSTS_N_INSNS (1),			/* cost of an add instruction */
946   COSTS_N_INSNS (2),			/* cost of a lea instruction */
947   COSTS_N_INSNS (1),			/* variable shift costs */
948   COSTS_N_INSNS (1),			/* constant shift costs */
949   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
950    COSTS_N_INSNS (3),			/*				 HI */
951    COSTS_N_INSNS (3),			/*				 SI */
952    COSTS_N_INSNS (3),			/*				 DI */
953    COSTS_N_INSNS (3)},			/*			      other */
954   0,					/* cost of multiply per each bit set */
955   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
956    COSTS_N_INSNS (18),			/*			    HI */
957    COSTS_N_INSNS (18),			/*			    SI */
958    COSTS_N_INSNS (18),			/*			    DI */
959    COSTS_N_INSNS (18)},			/*			    other */
960   COSTS_N_INSNS (2),			/* cost of movsx */
961   COSTS_N_INSNS (2),			/* cost of movzx */
962   8,					/* "large" insn */
963   4,					/* MOVE_RATIO */
964   3,				     /* cost for loading QImode using movzbl */
965   {4, 5, 4},				/* cost of loading integer registers
966 					   in QImode, HImode and SImode.
967 					   Relative to reg-reg move (2).  */
968   {2, 3, 2},				/* cost of storing integer registers */
969   4,					/* cost of reg,reg fld/fst */
970   {6, 6, 6},				/* cost of loading fp registers
971 					   in SFmode, DFmode and XFmode */
972   {4, 4, 4},				/* cost of storing fp registers
973 					   in SFmode, DFmode and XFmode */
974   2,					/* cost of moving MMX register */
975   {2, 2},				/* cost of loading MMX registers
976 					   in SImode and DImode */
977   {2, 2},				/* cost of storing MMX registers
978 					   in SImode and DImode */
979   2,					/* cost of moving SSE register */
980   {2, 2, 8},				/* cost of loading SSE registers
981 					   in SImode, DImode and TImode */
982   {2, 2, 8},				/* cost of storing SSE registers
983 					   in SImode, DImode and TImode */
984   6,					/* MMX or SSE register to integer */
985   32,					/* size of l1 cache.  */
986   32,					/* size of l2 cache.  Some models
987 					   have integrated l2 cache, but
988 					   optimizing for k6 is not important
989 					   enough to worry about that.  */
990   32,					/* size of prefetch block */
991   1,					/* number of parallel prefetches */
992   1,					/* Branch cost */
993   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
994   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
995   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
996   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
997   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
998   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
999   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000    DUMMY_STRINGOP_ALGS},
1001   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002    DUMMY_STRINGOP_ALGS},
1003   1,					/* scalar_stmt_cost.  */
1004   1,					/* scalar load_cost.  */
1005   1,					/* scalar_store_cost.  */
1006   1,					/* vec_stmt_cost.  */
1007   1,					/* vec_to_scalar_cost.  */
1008   1,					/* scalar_to_vec_cost.  */
1009   1,					/* vec_align_load_cost.  */
1010   2,					/* vec_unalign_load_cost.  */
1011   1,					/* vec_store_cost.  */
1012   3,					/* cond_taken_branch_cost.  */
1013   1,					/* cond_not_taken_branch_cost.  */
1014 };
1015 
1016 static const
1017 struct processor_costs athlon_cost = {
1018   COSTS_N_INSNS (1),			/* cost of an add instruction */
1019   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1020   COSTS_N_INSNS (1),			/* variable shift costs */
1021   COSTS_N_INSNS (1),			/* constant shift costs */
1022   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
1023    COSTS_N_INSNS (5),			/*				 HI */
1024    COSTS_N_INSNS (5),			/*				 SI */
1025    COSTS_N_INSNS (5),			/*				 DI */
1026    COSTS_N_INSNS (5)},			/*			      other */
1027   0,					/* cost of multiply per each bit set */
1028   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1029    COSTS_N_INSNS (26),			/*			    HI */
1030    COSTS_N_INSNS (42),			/*			    SI */
1031    COSTS_N_INSNS (74),			/*			    DI */
1032    COSTS_N_INSNS (74)},			/*			    other */
1033   COSTS_N_INSNS (1),			/* cost of movsx */
1034   COSTS_N_INSNS (1),			/* cost of movzx */
1035   8,					/* "large" insn */
1036   9,					/* MOVE_RATIO */
1037   4,				     /* cost for loading QImode using movzbl */
1038   {3, 4, 3},				/* cost of loading integer registers
1039 					   in QImode, HImode and SImode.
1040 					   Relative to reg-reg move (2).  */
1041   {3, 4, 3},				/* cost of storing integer registers */
1042   4,					/* cost of reg,reg fld/fst */
1043   {4, 4, 12},				/* cost of loading fp registers
1044 					   in SFmode, DFmode and XFmode */
1045   {6, 6, 8},				/* cost of storing fp registers
1046 					   in SFmode, DFmode and XFmode */
1047   2,					/* cost of moving MMX register */
1048   {4, 4},				/* cost of loading MMX registers
1049 					   in SImode and DImode */
1050   {4, 4},				/* cost of storing MMX registers
1051 					   in SImode and DImode */
1052   2,					/* cost of moving SSE register */
1053   {4, 4, 6},				/* cost of loading SSE registers
1054 					   in SImode, DImode and TImode */
1055   {4, 4, 5},				/* cost of storing SSE registers
1056 					   in SImode, DImode and TImode */
1057   5,					/* MMX or SSE register to integer */
1058   64,					/* size of l1 cache.  */
1059   256,					/* size of l2 cache.  */
1060   64,					/* size of prefetch block */
1061   6,					/* number of parallel prefetches */
1062   5,					/* Branch cost */
1063   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1064   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1065   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
1066   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1067   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1068   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1069   /* For some reason, Athlon deals better with REP prefix (relative to loops)
1070      compared to K8. Alignment becomes important after 8 bytes for memcpy and
1071      128 bytes for memset.  */
1072   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073    DUMMY_STRINGOP_ALGS},
1074   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075    DUMMY_STRINGOP_ALGS},
1076   1,					/* scalar_stmt_cost.  */
1077   1,					/* scalar load_cost.  */
1078   1,					/* scalar_store_cost.  */
1079   1,					/* vec_stmt_cost.  */
1080   1,					/* vec_to_scalar_cost.  */
1081   1,					/* scalar_to_vec_cost.  */
1082   1,					/* vec_align_load_cost.  */
1083   2,					/* vec_unalign_load_cost.  */
1084   1,					/* vec_store_cost.  */
1085   3,					/* cond_taken_branch_cost.  */
1086   1,					/* cond_not_taken_branch_cost.  */
1087 };
1088 
1089 static const
1090 struct processor_costs k8_cost = {
1091   COSTS_N_INSNS (1),			/* cost of an add instruction */
1092   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1093   COSTS_N_INSNS (1),			/* variable shift costs */
1094   COSTS_N_INSNS (1),			/* constant shift costs */
1095   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1096    COSTS_N_INSNS (4),			/*				 HI */
1097    COSTS_N_INSNS (3),			/*				 SI */
1098    COSTS_N_INSNS (4),			/*				 DI */
1099    COSTS_N_INSNS (5)},			/*			      other */
1100   0,					/* cost of multiply per each bit set */
1101   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1102    COSTS_N_INSNS (26),			/*			    HI */
1103    COSTS_N_INSNS (42),			/*			    SI */
1104    COSTS_N_INSNS (74),			/*			    DI */
1105    COSTS_N_INSNS (74)},			/*			    other */
1106   COSTS_N_INSNS (1),			/* cost of movsx */
1107   COSTS_N_INSNS (1),			/* cost of movzx */
1108   8,					/* "large" insn */
1109   9,					/* MOVE_RATIO */
1110   4,				     /* cost for loading QImode using movzbl */
1111   {3, 4, 3},				/* cost of loading integer registers
1112 					   in QImode, HImode and SImode.
1113 					   Relative to reg-reg move (2).  */
1114   {3, 4, 3},				/* cost of storing integer registers */
1115   4,					/* cost of reg,reg fld/fst */
1116   {4, 4, 12},				/* cost of loading fp registers
1117 					   in SFmode, DFmode and XFmode */
1118   {6, 6, 8},				/* cost of storing fp registers
1119 					   in SFmode, DFmode and XFmode */
1120   2,					/* cost of moving MMX register */
1121   {3, 3},				/* cost of loading MMX registers
1122 					   in SImode and DImode */
1123   {4, 4},				/* cost of storing MMX registers
1124 					   in SImode and DImode */
1125   2,					/* cost of moving SSE register */
1126   {4, 3, 6},				/* cost of loading SSE registers
1127 					   in SImode, DImode and TImode */
1128   {4, 4, 5},				/* cost of storing SSE registers
1129 					   in SImode, DImode and TImode */
1130   5,					/* MMX or SSE register to integer */
1131   64,					/* size of l1 cache.  */
1132   512,					/* size of l2 cache.  */
1133   64,					/* size of prefetch block */
1134   /* New AMD processors never drop prefetches; if they cannot be performed
1135      immediately, they are queued.  We set number of simultaneous prefetches
1136      to a large constant to reflect this (it probably is not a good idea not
1137      to limit number of prefetches at all, as their execution also takes some
1138      time).  */
1139   100,					/* number of parallel prefetches */
1140   3,					/* Branch cost */
1141   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1142   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1143   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1144   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1145   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1146   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1147   /* K8 has an optimized REP instruction for medium sized blocks, but for
1148      very small blocks it is better to use a loop.  For large blocks, a
1149      libcall can do nontemporal accesses and beat inline code considerably.  */
1150   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152   {{libcall, {{8, loop}, {24, unrolled_loop},
1153 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155   4,					/* scalar_stmt_cost.  */
1156   2,					/* scalar load_cost.  */
1157   2,					/* scalar_store_cost.  */
1158   5,					/* vec_stmt_cost.  */
1159   0,					/* vec_to_scalar_cost.  */
1160   2,					/* scalar_to_vec_cost.  */
1161   2,					/* vec_align_load_cost.  */
1162   3,					/* vec_unalign_load_cost.  */
1163   3,					/* vec_store_cost.  */
1164   3,					/* cond_taken_branch_cost.  */
1165   2,					/* cond_not_taken_branch_cost.  */
1166 };
1167 
1168 struct processor_costs amdfam10_cost = {
1169   COSTS_N_INSNS (1),			/* cost of an add instruction */
1170   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1171   COSTS_N_INSNS (1),			/* variable shift costs */
1172   COSTS_N_INSNS (1),			/* constant shift costs */
1173   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1174    COSTS_N_INSNS (4),			/*				 HI */
1175    COSTS_N_INSNS (3),			/*				 SI */
1176    COSTS_N_INSNS (4),			/*				 DI */
1177    COSTS_N_INSNS (5)},			/*			      other */
1178   0,					/* cost of multiply per each bit set */
1179   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1180    COSTS_N_INSNS (35),			/*			    HI */
1181    COSTS_N_INSNS (51),			/*			    SI */
1182    COSTS_N_INSNS (83),			/*			    DI */
1183    COSTS_N_INSNS (83)},			/*			    other */
1184   COSTS_N_INSNS (1),			/* cost of movsx */
1185   COSTS_N_INSNS (1),			/* cost of movzx */
1186   8,					/* "large" insn */
1187   9,					/* MOVE_RATIO */
1188   4,				     /* cost for loading QImode using movzbl */
1189   {3, 4, 3},				/* cost of loading integer registers
1190 					   in QImode, HImode and SImode.
1191 					   Relative to reg-reg move (2).  */
1192   {3, 4, 3},				/* cost of storing integer registers */
1193   4,					/* cost of reg,reg fld/fst */
1194   {4, 4, 12},				/* cost of loading fp registers
1195 		   			   in SFmode, DFmode and XFmode */
1196   {6, 6, 8},				/* cost of storing fp registers
1197  		   			   in SFmode, DFmode and XFmode */
1198   2,					/* cost of moving MMX register */
1199   {3, 3},				/* cost of loading MMX registers
1200 					   in SImode and DImode */
1201   {4, 4},				/* cost of storing MMX registers
1202 					   in SImode and DImode */
1203   2,					/* cost of moving SSE register */
1204   {4, 4, 3},				/* cost of loading SSE registers
1205 					   in SImode, DImode and TImode */
1206   {4, 4, 5},				/* cost of storing SSE registers
1207 					   in SImode, DImode and TImode */
1208   3,					/* MMX or SSE register to integer */
1209   					/* On K8:
1210   					    MOVD reg64, xmmreg Double FSTORE 4
1211 					    MOVD reg32, xmmreg Double FSTORE 4
1212 					   On AMDFAM10:
1213 					    MOVD reg64, xmmreg Double FADD 3
1214 							       1/1  1/1
1215 					    MOVD reg32, xmmreg Double FADD 3
1216 							       1/1  1/1 */
1217   64,					/* size of l1 cache.  */
1218   512,					/* size of l2 cache.  */
1219   64,					/* size of prefetch block */
1220   /* New AMD processors never drop prefetches; if they cannot be performed
1221      immediately, they are queued.  We set number of simultaneous prefetches
1222      to a large constant to reflect this (it probably is not a good idea not
1223      to limit number of prefetches at all, as their execution also takes some
1224      time).  */
1225   100,					/* number of parallel prefetches */
1226   2,					/* Branch cost */
1227   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1228   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1229   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1230   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1231   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1232   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1233 
1234   /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but
1235      for very small blocks it is better to use a loop.  For large blocks, a
1236      libcall can do nontemporal accesses and beat inline code considerably.  */
1237   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239   {{libcall, {{8, loop}, {24, unrolled_loop},
1240 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242   4,					/* scalar_stmt_cost.  */
1243   2,					/* scalar load_cost.  */
1244   2,					/* scalar_store_cost.  */
1245   6,					/* vec_stmt_cost.  */
1246   0,					/* vec_to_scalar_cost.  */
1247   2,					/* scalar_to_vec_cost.  */
1248   2,					/* vec_align_load_cost.  */
1249   2,					/* vec_unalign_load_cost.  */
1250   2,					/* vec_store_cost.  */
1251   2,					/* cond_taken_branch_cost.  */
1252   1,					/* cond_not_taken_branch_cost.  */
1253 };
1254 
1255 struct processor_costs bdver1_cost = {
1256   COSTS_N_INSNS (1),			/* cost of an add instruction */
1257   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1258   COSTS_N_INSNS (1),			/* variable shift costs */
1259   COSTS_N_INSNS (1),			/* constant shift costs */
1260   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1261    COSTS_N_INSNS (4),			/*				 HI */
1262    COSTS_N_INSNS (4),			/*				 SI */
1263    COSTS_N_INSNS (6),			/*				 DI */
1264    COSTS_N_INSNS (6)},			/*			      other */
1265   0,					/* cost of multiply per each bit set */
1266   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1267    COSTS_N_INSNS (35),			/*			    HI */
1268    COSTS_N_INSNS (51),			/*			    SI */
1269    COSTS_N_INSNS (83),			/*			    DI */
1270    COSTS_N_INSNS (83)},			/*			    other */
1271   COSTS_N_INSNS (1),			/* cost of movsx */
1272   COSTS_N_INSNS (1),			/* cost of movzx */
1273   8,					/* "large" insn */
1274   9,					/* MOVE_RATIO */
1275   4,				     /* cost for loading QImode using movzbl */
1276   {5, 5, 4},				/* cost of loading integer registers
1277 					   in QImode, HImode and SImode.
1278 					   Relative to reg-reg move (2).  */
1279   {4, 4, 4},				/* cost of storing integer registers */
1280   2,					/* cost of reg,reg fld/fst */
1281   {5, 5, 12},				/* cost of loading fp registers
1282 		   			   in SFmode, DFmode and XFmode */
1283   {4, 4, 8},				/* cost of storing fp registers
1284  		   			   in SFmode, DFmode and XFmode */
1285   2,					/* cost of moving MMX register */
1286   {4, 4},				/* cost of loading MMX registers
1287 					   in SImode and DImode */
1288   {4, 4},				/* cost of storing MMX registers
1289 					   in SImode and DImode */
1290   2,					/* cost of moving SSE register */
1291   {4, 4, 4},				/* cost of loading SSE registers
1292 					   in SImode, DImode and TImode */
1293   {4, 4, 4},				/* cost of storing SSE registers
1294 					   in SImode, DImode and TImode */
1295   2,					/* MMX or SSE register to integer */
1296   					/* On K8:
1297 					    MOVD reg64, xmmreg Double FSTORE 4
1298 					    MOVD reg32, xmmreg Double FSTORE 4
1299 					   On AMDFAM10:
1300 					    MOVD reg64, xmmreg Double FADD 3
1301 							       1/1  1/1
1302 					    MOVD reg32, xmmreg Double FADD 3
1303 							       1/1  1/1 */
1304   16,					/* size of l1 cache.  */
1305   2048,					/* size of l2 cache.  */
1306   64,					/* size of prefetch block */
1307   /* New AMD processors never drop prefetches; if they cannot be performed
1308      immediately, they are queued.  We set number of simultaneous prefetches
1309      to a large constant to reflect this (it probably is not a good idea not
1310      to limit number of prefetches at all, as their execution also takes some
1311      time).  */
1312   100,					/* number of parallel prefetches */
1313   2,					/* Branch cost */
1314   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1315   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1316   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1317   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1318   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1319   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1320 
1321   /*  BDVER1 has an optimized REP instruction for medium sized blocks, but
1322       for very small blocks it is better to use a loop.  For large blocks, a
1323       libcall can do nontemporal accesses and beat inline code considerably.  */
1324   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326   {{libcall, {{8, loop}, {24, unrolled_loop},
1327 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329   6,					/* scalar_stmt_cost.  */
1330   4,					/* scalar load_cost.  */
1331   4,					/* scalar_store_cost.  */
1332   6,					/* vec_stmt_cost.  */
1333   0,					/* vec_to_scalar_cost.  */
1334   2,					/* scalar_to_vec_cost.  */
1335   4,					/* vec_align_load_cost.  */
1336   4,					/* vec_unalign_load_cost.  */
1337   4,					/* vec_store_cost.  */
1338   2,					/* cond_taken_branch_cost.  */
1339   1,					/* cond_not_taken_branch_cost.  */
1340 };
1341 
1342 struct processor_costs bdver2_cost = {
1343   COSTS_N_INSNS (1),			/* cost of an add instruction */
1344   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1345   COSTS_N_INSNS (1),			/* variable shift costs */
1346   COSTS_N_INSNS (1),			/* constant shift costs */
1347   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1348    COSTS_N_INSNS (4),			/*				 HI */
1349    COSTS_N_INSNS (4),			/*				 SI */
1350    COSTS_N_INSNS (6),			/*				 DI */
1351    COSTS_N_INSNS (6)},			/*			      other */
1352   0,					/* cost of multiply per each bit set */
1353   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1354    COSTS_N_INSNS (35),			/*			    HI */
1355    COSTS_N_INSNS (51),			/*			    SI */
1356    COSTS_N_INSNS (83),			/*			    DI */
1357    COSTS_N_INSNS (83)},			/*			    other */
1358   COSTS_N_INSNS (1),			/* cost of movsx */
1359   COSTS_N_INSNS (1),			/* cost of movzx */
1360   8,					/* "large" insn */
1361   9,					/* MOVE_RATIO */
1362   4,				     /* cost for loading QImode using movzbl */
1363   {5, 5, 4},				/* cost of loading integer registers
1364 					   in QImode, HImode and SImode.
1365 					   Relative to reg-reg move (2).  */
1366   {4, 4, 4},				/* cost of storing integer registers */
1367   2,					/* cost of reg,reg fld/fst */
1368   {5, 5, 12},				/* cost of loading fp registers
1369 		   			   in SFmode, DFmode and XFmode */
1370   {4, 4, 8},				/* cost of storing fp registers
1371  		   			   in SFmode, DFmode and XFmode */
1372   2,					/* cost of moving MMX register */
1373   {4, 4},				/* cost of loading MMX registers
1374 					   in SImode and DImode */
1375   {4, 4},				/* cost of storing MMX registers
1376 					   in SImode and DImode */
1377   2,					/* cost of moving SSE register */
1378   {4, 4, 4},				/* cost of loading SSE registers
1379 					   in SImode, DImode and TImode */
1380   {4, 4, 4},				/* cost of storing SSE registers
1381 					   in SImode, DImode and TImode */
1382   2,					/* MMX or SSE register to integer */
1383   					/* On K8:
1384 					    MOVD reg64, xmmreg Double FSTORE 4
1385 					    MOVD reg32, xmmreg Double FSTORE 4
1386 					   On AMDFAM10:
1387 					    MOVD reg64, xmmreg Double FADD 3
1388 							       1/1  1/1
1389 					    MOVD reg32, xmmreg Double FADD 3
1390 							       1/1  1/1 */
1391   16,					/* size of l1 cache.  */
1392   2048,					/* size of l2 cache.  */
1393   64,					/* size of prefetch block */
1394   /* New AMD processors never drop prefetches; if they cannot be performed
1395      immediately, they are queued.  We set the number of simultaneous prefetches
1396      to a large constant to reflect this (it is probably not a good idea to leave
1397      the number of prefetches completely unlimited, as their execution also takes
1398      some time).  */
1399   100,					/* number of parallel prefetches */
1400   2,					/* Branch cost */
1401   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1402   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1403   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1404   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1405   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1406   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1407 
1408   /*  BDVER2 has an optimized REP instruction for medium sized blocks, but for
1409       very small blocks it is better to use a loop.  For large blocks, libcall
1410       can do nontemporal accesses and beat inline considerably.  */
1411   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413   {{libcall, {{8, loop}, {24, unrolled_loop},
1414 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416   6,					/* scalar_stmt_cost.  */
1417   4,					/* scalar load_cost.  */
1418   4,					/* scalar_store_cost.  */
1419   6,					/* vec_stmt_cost.  */
1420   0,					/* vec_to_scalar_cost.  */
1421   2,					/* scalar_to_vec_cost.  */
1422   4,					/* vec_align_load_cost.  */
1423   4,					/* vec_unalign_load_cost.  */
1424   4,					/* vec_store_cost.  */
1425   2,					/* cond_taken_branch_cost.  */
1426   1,					/* cond_not_taken_branch_cost.  */
1427 };
1428 
1429 struct processor_costs btver1_cost = {
1430   COSTS_N_INSNS (1),			/* cost of an add instruction */
1431   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1432   COSTS_N_INSNS (1),			/* variable shift costs */
1433   COSTS_N_INSNS (1),			/* constant shift costs */
1434   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1435    COSTS_N_INSNS (4),			/*				 HI */
1436    COSTS_N_INSNS (3),			/*				 SI */
1437    COSTS_N_INSNS (4),			/*				 DI */
1438    COSTS_N_INSNS (5)},			/*			      other */
1439   0,					/* cost of multiply per each bit set */
1440   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1441    COSTS_N_INSNS (35),			/*			    HI */
1442    COSTS_N_INSNS (51),			/*			    SI */
1443    COSTS_N_INSNS (83),			/*			    DI */
1444    COSTS_N_INSNS (83)},			/*			    other */
1445   COSTS_N_INSNS (1),			/* cost of movsx */
1446   COSTS_N_INSNS (1),			/* cost of movzx */
1447   8,					/* "large" insn */
1448   9,					/* MOVE_RATIO */
1449   4,				     /* cost for loading QImode using movzbl */
1450   {3, 4, 3},				/* cost of loading integer registers
1451 					   in QImode, HImode and SImode.
1452 					   Relative to reg-reg move (2).  */
1453   {3, 4, 3},				/* cost of storing integer registers */
1454   4,					/* cost of reg,reg fld/fst */
1455   {4, 4, 12},				/* cost of loading fp registers
1456 					   in SFmode, DFmode and XFmode */
1457   {6, 6, 8},				/* cost of storing fp registers
1458 					   in SFmode, DFmode and XFmode */
1459   2,					/* cost of moving MMX register */
1460   {3, 3},				/* cost of loading MMX registers
1461 					   in SImode and DImode */
1462   {4, 4},				/* cost of storing MMX registers
1463 					   in SImode and DImode */
1464   2,					/* cost of moving SSE register */
1465   {4, 4, 3},				/* cost of loading SSE registers
1466 					   in SImode, DImode and TImode */
1467   {4, 4, 5},				/* cost of storing SSE registers
1468 					   in SImode, DImode and TImode */
1469   3,					/* MMX or SSE register to integer */
1470 					/* On K8:
1471 					   MOVD reg64, xmmreg Double FSTORE 4
1472 					   MOVD reg32, xmmreg Double FSTORE 4
1473 					   On AMDFAM10:
1474 					   MOVD reg64, xmmreg Double FADD 3
1475 							       1/1  1/1
1476 					    MOVD reg32, xmmreg Double FADD 3
1477 							       1/1  1/1 */
1478   32,					/* size of l1 cache.  */
1479   512,					/* size of l2 cache.  */
1480   64,					/* size of prefetch block */
1481   100,					/* number of parallel prefetches */
1482   2,					/* Branch cost */
1483   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1484   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1485   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1486   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1487   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1488   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1489 
1490   /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
1491      very small blocks it is better to use a loop.  For large blocks, libcall can
1492      do nontemporal accesses and beat inline considerably.  */
1493   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495   {{libcall, {{8, loop}, {24, unrolled_loop},
1496 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498   4,					/* scalar_stmt_cost.  */
1499   2,					/* scalar load_cost.  */
1500   2,					/* scalar_store_cost.  */
1501   6,					/* vec_stmt_cost.  */
1502   0,					/* vec_to_scalar_cost.  */
1503   2,					/* scalar_to_vec_cost.  */
1504   2,					/* vec_align_load_cost.  */
1505   2,					/* vec_unalign_load_cost.  */
1506   2,					/* vec_store_cost.  */
1507   2,					/* cond_taken_branch_cost.  */
1508   1,					/* cond_not_taken_branch_cost.  */
1509 };
1510 
1511 static const
1512 struct processor_costs pentium4_cost = {
1513   COSTS_N_INSNS (1),			/* cost of an add instruction */
1514   COSTS_N_INSNS (3),			/* cost of a lea instruction */
1515   COSTS_N_INSNS (4),			/* variable shift costs */
1516   COSTS_N_INSNS (4),			/* constant shift costs */
1517   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
1518    COSTS_N_INSNS (15),			/*				 HI */
1519    COSTS_N_INSNS (15),			/*				 SI */
1520    COSTS_N_INSNS (15),			/*				 DI */
1521    COSTS_N_INSNS (15)},			/*			      other */
1522   0,					/* cost of multiply per each bit set */
1523   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
1524    COSTS_N_INSNS (56),			/*			    HI */
1525    COSTS_N_INSNS (56),			/*			    SI */
1526    COSTS_N_INSNS (56),			/*			    DI */
1527    COSTS_N_INSNS (56)},			/*			    other */
1528   COSTS_N_INSNS (1),			/* cost of movsx */
1529   COSTS_N_INSNS (1),			/* cost of movzx */
1530   16,					/* "large" insn */
1531   6,					/* MOVE_RATIO */
1532   2,				     /* cost for loading QImode using movzbl */
1533   {4, 5, 4},				/* cost of loading integer registers
1534 					   in QImode, HImode and SImode.
1535 					   Relative to reg-reg move (2).  */
1536   {2, 3, 2},				/* cost of storing integer registers */
1537   2,					/* cost of reg,reg fld/fst */
1538   {2, 2, 6},				/* cost of loading fp registers
1539 					   in SFmode, DFmode and XFmode */
1540   {4, 4, 6},				/* cost of storing fp registers
1541 					   in SFmode, DFmode and XFmode */
1542   2,					/* cost of moving MMX register */
1543   {2, 2},				/* cost of loading MMX registers
1544 					   in SImode and DImode */
1545   {2, 2},				/* cost of storing MMX registers
1546 					   in SImode and DImode */
1547   12,					/* cost of moving SSE register */
1548   {12, 12, 12},				/* cost of loading SSE registers
1549 					   in SImode, DImode and TImode */
1550   {2, 2, 8},				/* cost of storing SSE registers
1551 					   in SImode, DImode and TImode */
1552   10,					/* MMX or SSE register to integer */
1553   8,					/* size of l1 cache.  */
1554   256,					/* size of l2 cache.  */
1555   64,					/* size of prefetch block */
1556   6,					/* number of parallel prefetches */
1557   2,					/* Branch cost */
1558   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1559   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
1560   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
1561   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1562   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1563   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
1564   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565    DUMMY_STRINGOP_ALGS},
1566   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567    {-1, libcall}}},
1568    DUMMY_STRINGOP_ALGS},
1569   1,					/* scalar_stmt_cost.  */
1570   1,					/* scalar load_cost.  */
1571   1,					/* scalar_store_cost.  */
1572   1,					/* vec_stmt_cost.  */
1573   1,					/* vec_to_scalar_cost.  */
1574   1,					/* scalar_to_vec_cost.  */
1575   1,					/* vec_align_load_cost.  */
1576   2,					/* vec_unalign_load_cost.  */
1577   1,					/* vec_store_cost.  */
1578   3,					/* cond_taken_branch_cost.  */
1579   1,					/* cond_not_taken_branch_cost.  */
1580 };
1581 
1582 static const
1583 struct processor_costs nocona_cost = {
1584   COSTS_N_INSNS (1),			/* cost of an add instruction */
1585   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1586   COSTS_N_INSNS (1),			/* variable shift costs */
1587   COSTS_N_INSNS (1),			/* constant shift costs */
1588   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
1589    COSTS_N_INSNS (10),			/*				 HI */
1590    COSTS_N_INSNS (10),			/*				 SI */
1591    COSTS_N_INSNS (10),			/*				 DI */
1592    COSTS_N_INSNS (10)},			/*			      other */
1593   0,					/* cost of multiply per each bit set */
1594   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
1595    COSTS_N_INSNS (66),			/*			    HI */
1596    COSTS_N_INSNS (66),			/*			    SI */
1597    COSTS_N_INSNS (66),			/*			    DI */
1598    COSTS_N_INSNS (66)},			/*			    other */
1599   COSTS_N_INSNS (1),			/* cost of movsx */
1600   COSTS_N_INSNS (1),			/* cost of movzx */
1601   16,					/* "large" insn */
1602   17,					/* MOVE_RATIO */
1603   4,				     /* cost for loading QImode using movzbl */
1604   {4, 4, 4},				/* cost of loading integer registers
1605 					   in QImode, HImode and SImode.
1606 					   Relative to reg-reg move (2).  */
1607   {4, 4, 4},				/* cost of storing integer registers */
1608   3,					/* cost of reg,reg fld/fst */
1609   {12, 12, 12},				/* cost of loading fp registers
1610 					   in SFmode, DFmode and XFmode */
1611   {4, 4, 4},				/* cost of storing fp registers
1612 					   in SFmode, DFmode and XFmode */
1613   6,					/* cost of moving MMX register */
1614   {12, 12},				/* cost of loading MMX registers
1615 					   in SImode and DImode */
1616   {12, 12},				/* cost of storing MMX registers
1617 					   in SImode and DImode */
1618   6,					/* cost of moving SSE register */
1619   {12, 12, 12},				/* cost of loading SSE registers
1620 					   in SImode, DImode and TImode */
1621   {12, 12, 12},				/* cost of storing SSE registers
1622 					   in SImode, DImode and TImode */
1623   8,					/* MMX or SSE register to integer */
1624   8,					/* size of l1 cache.  */
1625   1024,					/* size of l2 cache.  */
1626   64,					/* size of prefetch block */
1627   8,					/* number of parallel prefetches */
1628   1,					/* Branch cost */
1629   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1630   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1631   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
1632   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
1633   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
1634   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
1635   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636    {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 	      {100000, unrolled_loop}, {-1, libcall}}}},
1638   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639    {-1, libcall}}},
1640    {libcall, {{24, loop}, {64, unrolled_loop},
1641 	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642   1,					/* scalar_stmt_cost.  */
1643   1,					/* scalar load_cost.  */
1644   1,					/* scalar_store_cost.  */
1645   1,					/* vec_stmt_cost.  */
1646   1,					/* vec_to_scalar_cost.  */
1647   1,					/* scalar_to_vec_cost.  */
1648   1,					/* vec_align_load_cost.  */
1649   2,					/* vec_unalign_load_cost.  */
1650   1,					/* vec_store_cost.  */
1651   3,					/* cond_taken_branch_cost.  */
1652   1,					/* cond_not_taken_branch_cost.  */
1653 };
1654 
1655 static const
1656 struct processor_costs atom_cost = {
1657   COSTS_N_INSNS (1),			/* cost of an add instruction */
1658   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1659   COSTS_N_INSNS (1),			/* variable shift costs */
1660   COSTS_N_INSNS (1),			/* constant shift costs */
1661   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1662    COSTS_N_INSNS (4),			/*				 HI */
1663    COSTS_N_INSNS (3),			/*				 SI */
1664    COSTS_N_INSNS (4),			/*				 DI */
1665    COSTS_N_INSNS (2)},			/*			      other */
1666   0,					/* cost of multiply per each bit set */
1667   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1668    COSTS_N_INSNS (26),			/*			    HI */
1669    COSTS_N_INSNS (42),			/*			    SI */
1670    COSTS_N_INSNS (74),			/*			    DI */
1671    COSTS_N_INSNS (74)},			/*			    other */
1672   COSTS_N_INSNS (1),			/* cost of movsx */
1673   COSTS_N_INSNS (1),			/* cost of movzx */
1674   8,					/* "large" insn */
1675   17,					/* MOVE_RATIO */
1676   4,					/* cost for loading QImode using movzbl */
1677   {4, 4, 4},				/* cost of loading integer registers
1678 					   in QImode, HImode and SImode.
1679 					   Relative to reg-reg move (2).  */
1680   {4, 4, 4},				/* cost of storing integer registers */
1681   4,					/* cost of reg,reg fld/fst */
1682   {12, 12, 12},				/* cost of loading fp registers
1683 					   in SFmode, DFmode and XFmode */
1684   {6, 6, 8},				/* cost of storing fp registers
1685 					   in SFmode, DFmode and XFmode */
1686   2,					/* cost of moving MMX register */
1687   {8, 8},				/* cost of loading MMX registers
1688 					   in SImode and DImode */
1689   {8, 8},				/* cost of storing MMX registers
1690 					   in SImode and DImode */
1691   2,					/* cost of moving SSE register */
1692   {8, 8, 8},				/* cost of loading SSE registers
1693 					   in SImode, DImode and TImode */
1694   {8, 8, 8},				/* cost of storing SSE registers
1695 					   in SImode, DImode and TImode */
1696   5,					/* MMX or SSE register to integer */
1697   32,					/* size of l1 cache.  */
1698   256,					/* size of l2 cache.  */
1699   64,					/* size of prefetch block */
1700   6,					/* number of parallel prefetches */
1701   3,					/* Branch cost */
1702   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1703   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1704   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1705   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1706   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1707   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1708   {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709    {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 	  {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711   {{libcall, {{8, loop}, {15, unrolled_loop},
1712 	  {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713    {libcall, {{24, loop}, {32, unrolled_loop},
1714 	  {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715   1,					/* scalar_stmt_cost.  */
1716   1,					/* scalar load_cost.  */
1717   1,					/* scalar_store_cost.  */
1718   1,					/* vec_stmt_cost.  */
1719   1,					/* vec_to_scalar_cost.  */
1720   1,					/* scalar_to_vec_cost.  */
1721   1,					/* vec_align_load_cost.  */
1722   2,					/* vec_unalign_load_cost.  */
1723   1,					/* vec_store_cost.  */
1724   3,					/* cond_taken_branch_cost.  */
1725   1,					/* cond_not_taken_branch_cost.  */
1726 };
1727 
1728 /* Generic64 should produce code tuned for Nocona and K8.  */
1729 static const
1730 struct processor_costs generic64_cost = {
1731   COSTS_N_INSNS (1),			/* cost of an add instruction */
1732   /* On all chips taken into consideration, lea is 2 cycles or more.  With
1733      this cost, however, our current implementation of synth_mult results in
1734      the use of unnecessary temporary registers, causing regressions on several
1735      SPECfp benchmarks.  */
1736   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1737   COSTS_N_INSNS (1),			/* variable shift costs */
1738   COSTS_N_INSNS (1),			/* constant shift costs */
1739   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1740    COSTS_N_INSNS (4),			/*				 HI */
1741    COSTS_N_INSNS (3),			/*				 SI */
1742    COSTS_N_INSNS (4),			/*				 DI */
1743    COSTS_N_INSNS (2)},			/*			      other */
1744   0,					/* cost of multiply per each bit set */
1745   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1746    COSTS_N_INSNS (26),			/*			    HI */
1747    COSTS_N_INSNS (42),			/*			    SI */
1748    COSTS_N_INSNS (74),			/*			    DI */
1749    COSTS_N_INSNS (74)},			/*			    other */
1750   COSTS_N_INSNS (1),			/* cost of movsx */
1751   COSTS_N_INSNS (1),			/* cost of movzx */
1752   8,					/* "large" insn */
1753   17,					/* MOVE_RATIO */
1754   4,				     /* cost for loading QImode using movzbl */
1755   {4, 4, 4},				/* cost of loading integer registers
1756 					   in QImode, HImode and SImode.
1757 					   Relative to reg-reg move (2).  */
1758   {4, 4, 4},				/* cost of storing integer registers */
1759   4,					/* cost of reg,reg fld/fst */
1760   {12, 12, 12},				/* cost of loading fp registers
1761 					   in SFmode, DFmode and XFmode */
1762   {6, 6, 8},				/* cost of storing fp registers
1763 					   in SFmode, DFmode and XFmode */
1764   2,					/* cost of moving MMX register */
1765   {8, 8},				/* cost of loading MMX registers
1766 					   in SImode and DImode */
1767   {8, 8},				/* cost of storing MMX registers
1768 					   in SImode and DImode */
1769   2,					/* cost of moving SSE register */
1770   {8, 8, 8},				/* cost of loading SSE registers
1771 					   in SImode, DImode and TImode */
1772   {8, 8, 8},				/* cost of storing SSE registers
1773 					   in SImode, DImode and TImode */
1774   5,					/* MMX or SSE register to integer */
1775   32,					/* size of l1 cache.  */
1776   512,					/* size of l2 cache.  */
1777   64,					/* size of prefetch block */
1778   6,					/* number of parallel prefetches */
1779   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1780      value is increased to the perhaps more appropriate value of 5.  */
1781   3,					/* Branch cost */
1782   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1783   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1784   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1785   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1786   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1787   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1788   {DUMMY_STRINGOP_ALGS,
1789    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790   {DUMMY_STRINGOP_ALGS,
1791    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792   1,					/* scalar_stmt_cost.  */
1793   1,					/* scalar load_cost.  */
1794   1,					/* scalar_store_cost.  */
1795   1,					/* vec_stmt_cost.  */
1796   1,					/* vec_to_scalar_cost.  */
1797   1,					/* scalar_to_vec_cost.  */
1798   1,					/* vec_align_load_cost.  */
1799   2,					/* vec_unalign_load_cost.  */
1800   1,					/* vec_store_cost.  */
1801   3,					/* cond_taken_branch_cost.  */
1802   1,					/* cond_not_taken_branch_cost.  */
1803 };
1804 
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806    Athlon and K8.  */
1807 static const
1808 struct processor_costs generic32_cost = {
1809   COSTS_N_INSNS (1),			/* cost of an add instruction */
1810   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1811   COSTS_N_INSNS (1),			/* variable shift costs */
1812   COSTS_N_INSNS (1),			/* constant shift costs */
1813   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1814    COSTS_N_INSNS (4),			/*				 HI */
1815    COSTS_N_INSNS (3),			/*				 SI */
1816    COSTS_N_INSNS (4),			/*				 DI */
1817    COSTS_N_INSNS (2)},			/*			      other */
1818   0,					/* cost of multiply per each bit set */
1819   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1820    COSTS_N_INSNS (26),			/*			    HI */
1821    COSTS_N_INSNS (42),			/*			    SI */
1822    COSTS_N_INSNS (74),			/*			    DI */
1823    COSTS_N_INSNS (74)},			/*			    other */
1824   COSTS_N_INSNS (1),			/* cost of movsx */
1825   COSTS_N_INSNS (1),			/* cost of movzx */
1826   8,					/* "large" insn */
1827   17,					/* MOVE_RATIO */
1828   4,				     /* cost for loading QImode using movzbl */
1829   {4, 4, 4},				/* cost of loading integer registers
1830 					   in QImode, HImode and SImode.
1831 					   Relative to reg-reg move (2).  */
1832   {4, 4, 4},				/* cost of storing integer registers */
1833   4,					/* cost of reg,reg fld/fst */
1834   {12, 12, 12},				/* cost of loading fp registers
1835 					   in SFmode, DFmode and XFmode */
1836   {6, 6, 8},				/* cost of storing fp registers
1837 					   in SFmode, DFmode and XFmode */
1838   2,					/* cost of moving MMX register */
1839   {8, 8},				/* cost of loading MMX registers
1840 					   in SImode and DImode */
1841   {8, 8},				/* cost of storing MMX registers
1842 					   in SImode and DImode */
1843   2,					/* cost of moving SSE register */
1844   {8, 8, 8},				/* cost of loading SSE registers
1845 					   in SImode, DImode and TImode */
1846   {8, 8, 8},				/* cost of storing SSE registers
1847 					   in SImode, DImode and TImode */
1848   5,					/* MMX or SSE register to integer */
1849   32,					/* size of l1 cache.  */
1850   256,					/* size of l2 cache.  */
1851   64,					/* size of prefetch block */
1852   6,					/* number of parallel prefetches */
1853   3,					/* Branch cost */
1854   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1855   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1856   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1857   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1858   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1859   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1860   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861    DUMMY_STRINGOP_ALGS},
1862   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863    DUMMY_STRINGOP_ALGS},
1864   1,					/* scalar_stmt_cost.  */
1865   1,					/* scalar load_cost.  */
1866   1,					/* scalar_store_cost.  */
1867   1,					/* vec_stmt_cost.  */
1868   1,					/* vec_to_scalar_cost.  */
1869   1,					/* scalar_to_vec_cost.  */
1870   1,					/* vec_align_load_cost.  */
1871   2,					/* vec_unalign_load_cost.  */
1872   1,					/* vec_store_cost.  */
1873   3,					/* cond_taken_branch_cost.  */
1874   1,					/* cond_not_taken_branch_cost.  */
1875 };
1876 
1877 const struct processor_costs *ix86_cost = &pentium_cost;
1878 
1879 /* Processor feature/optimization bitmasks.  */
1880 #define m_386 (1<<PROCESSOR_I386)
1881 #define m_486 (1<<PROCESSOR_I486)
1882 #define m_PENT (1<<PROCESSOR_PENTIUM)
1883 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1884 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1885 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1886 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1887 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1888 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1889 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1890 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1891 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1892 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1893 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1894 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1895 #define m_ATOM (1<<PROCESSOR_ATOM)
1896 
1897 #define m_GEODE (1<<PROCESSOR_GEODE)
1898 #define m_K6 (1<<PROCESSOR_K6)
1899 #define m_K6_GEODE (m_K6 | m_GEODE)
1900 #define m_K8 (1<<PROCESSOR_K8)
1901 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1902 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1903 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1904 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1905 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1906 #define m_BDVER	(m_BDVER1 | m_BDVER2)
1907 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1908 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909 
1910 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1911 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912 
1913 /* Generic instruction choice should be common subset of supported CPUs
1914    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
1915 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916 
1917 /* Feature tests against the various tunings.  */
1918 unsigned char ix86_tune_features[X86_TUNE_LAST];
1919 
1920 /* Feature tests against the various tunings used to create ix86_tune_features
1921    based on the processor mask.  */
1922 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1923   /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1924      negatively, so enabling it for Generic64 seems like a good code size
1925      tradeoff.  We can't enable it for 32bit generic because it does not
1926      work well with PPro based chips.  */
1927   m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928 
1929   /* X86_TUNE_PUSH_MEMORY */
1930   m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931 
1932   /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1933   m_486 | m_PENT,
1934 
1935   /* X86_TUNE_UNROLL_STRLEN */
1936   m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1937 
1938   /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1939      on simulation results.  But after P4 was made, no performance benefit
1940      was observed with branch hints.  They also increase the code size.
1941      As a result, icc never generates branch hints.  */
1942   0,
1943 
1944   /* X86_TUNE_DOUBLE_WITH_ADD */
1945   ~m_386,
1946 
1947   /* X86_TUNE_USE_SAHF */
1948   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1949 
1950   /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1951      partial dependencies.  */
1952   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1953 
1954   /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1955      register stalls on the Generic32 compilation setting as well.  However,
1956      in the current implementation the partial register stalls are not eliminated
1957      very well - they can be introduced via subregs synthesized by combine
1958      and can happen in caller/callee saving sequences.  Because this option
1959      pays back little on PPro based chips and is in conflict with the partial reg
1960      dependencies used by Athlon/P4 based chips, it is better to leave it off
1961      for generic32 for now.  */
1962   m_PPRO,
1963 
1964   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1965   m_CORE2I7 | m_GENERIC,
1966 
1967   /* X86_TUNE_USE_HIMODE_FIOP */
1968   m_386 | m_486 | m_K6_GEODE,
1969 
1970   /* X86_TUNE_USE_SIMODE_FIOP */
1971   ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1972 
1973   /* X86_TUNE_USE_MOV0 */
1974   m_K6,
1975 
1976   /* X86_TUNE_USE_CLTD */
1977   ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1978 
1979   /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
1980   m_PENT4,
1981 
1982   /* X86_TUNE_SPLIT_LONG_MOVES */
1983   m_PPRO,
1984 
1985   /* X86_TUNE_READ_MODIFY_WRITE */
1986   ~m_PENT,
1987 
1988   /* X86_TUNE_READ_MODIFY */
1989   ~(m_PENT | m_PPRO),
1990 
1991   /* X86_TUNE_PROMOTE_QIMODE */
1992   m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1993 
1994   /* X86_TUNE_FAST_PREFIX */
1995   ~(m_386 | m_486 | m_PENT),
1996 
1997   /* X86_TUNE_SINGLE_STRINGOP */
1998   m_386 | m_P4_NOCONA,
1999 
2000   /* X86_TUNE_QIMODE_MATH */
2001   ~0,
2002 
2003   /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2004      register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL this option
2005      might be considered for Generic32 if our scheme for avoiding partial
2006      stalls was more effective.  */
2007   ~m_PPRO,
2008 
2009   /* X86_TUNE_PROMOTE_QI_REGS */
2010   0,
2011 
2012   /* X86_TUNE_PROMOTE_HI_REGS */
2013   m_PPRO,
2014 
2015   /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2016      over esp addition.  */
2017   m_386 | m_486 | m_PENT | m_PPRO,
2018 
2019   /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2020      over esp addition.  */
2021   m_PENT,
2022 
2023   /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2024      over esp subtraction.  */
2025   m_386 | m_486 | m_PENT | m_K6_GEODE,
2026 
2027   /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2028      over esp subtraction.  */
2029   m_PENT | m_K6_GEODE,
2030 
2031   /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2032      for DFmode copies */
2033   ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2034 
2035   /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2036   m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2037 
2038   /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2039      conflict here between PPro/Pentium4 based chips that treat 128bit
2040      SSE registers as single units and K8 based chips that divide SSE
2041      registers into two 64bit halves.  This knob promotes all store destinations
2042      to be 128bit to allow register renaming on 128bit SSE units, but usually
2043      results in one extra microop on 64bit SSE units.  Experimental results
2044      show that disabling this option on P4 brings over 20% SPECfp regression,
2045      while enabling it on K8 brings roughly 2.4% regression that can be partly
2046      masked by careful scheduling of moves.  */
2047   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2048 
2049   /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2050   m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2051 
2052   /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2053   m_COREI7 | m_BDVER,
2054 
2055   /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2056   m_BDVER,
2057 
2058   /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2059      are resolved on SSE register parts instead of whole registers, so we may
2060      maintain just the lower part of scalar values in proper format, leaving the
2061      upper part undefined.  */
2062   m_ATHLON_K8,
2063 
2064   /* X86_TUNE_SSE_TYPELESS_STORES */
2065   m_AMD_MULTIPLE,
2066 
2067   /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2068   m_PPRO | m_P4_NOCONA,
2069 
2070   /* X86_TUNE_MEMORY_MISMATCH_STALL */
2071   m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2072 
2073   /* X86_TUNE_PROLOGUE_USING_MOVE */
2074   m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2075 
2076   /* X86_TUNE_EPILOGUE_USING_MOVE */
2077   m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2078 
2079   /* X86_TUNE_SHIFT1 */
2080   ~m_486,
2081 
2082   /* X86_TUNE_USE_FFREEP */
2083   m_AMD_MULTIPLE,
2084 
2085   /* X86_TUNE_INTER_UNIT_MOVES */
2086   ~(m_AMD_MULTIPLE | m_GENERIC),
2087 
2088   /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2089   ~(m_AMDFAM10 | m_BDVER),
2090 
2091   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2092      than 4 branch instructions in the 16 byte window.  */
2093   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2094 
2095   /* X86_TUNE_SCHEDULE */
2096   m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2097 
2098   /* X86_TUNE_USE_BT */
2099   m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2100 
2101   /* X86_TUNE_USE_INCDEC */
2102   ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2103 
2104   /* X86_TUNE_PAD_RETURNS */
2105   m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2106 
2107   /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions.  */
2108   m_ATOM,
2109 
2110   /* X86_TUNE_EXT_80387_CONSTANTS */
2111   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2112 
2113   /* X86_TUNE_SHORTEN_X87_SSE */
2114   ~m_K8,
2115 
2116   /* X86_TUNE_AVOID_VECTOR_DECODE */
2117   m_CORE2I7_64 | m_K8 | m_GENERIC64,
2118 
2119   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2120      and SImode multiply, but the 386 and 486 do HImode multiply faster.  */
2121   ~(m_386 | m_486),
2122 
2123   /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2124      vector path on AMD machines.  */
2125   m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2126 
2127   /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2128      machines.  */
2129   m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2130 
2131   /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2132      than a MOV.  */
2133   m_PENT,
2134 
2135   /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2136      but one byte longer.  */
2137   m_PENT,
2138 
2139   /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2140      operand that cannot be represented using a modRM byte.  The XOR
2141      replacement is long decoded, so this split helps here as well.  */
2142   m_K6,
2143 
2144   /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2145      from FP to FP. */
2146   m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2147 
2148   /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2149      from integer to FP. */
2150   m_AMDFAM10,
2151 
2152   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2153      with a subsequent conditional jump instruction into a single
2154      compare-and-branch uop.  */
2155   m_BDVER,
2156 
2157   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2158      will impact LEA instruction selection. */
2159   m_ATOM,
2160 
2161   /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2162      instructions.  */
2163   ~m_ATOM,
2164 
2165   /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2166      at -O3.  For the moment, the prefetching seems badly tuned for Intel
2167      chips.  */
2168   m_K6_GEODE | m_AMD_MULTIPLE,
2169 
2170   /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2171      the auto-vectorizer.  */
2172   m_BDVER,
2173 
2174   /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2175      during reassociation of integer computation.  */
2176   m_ATOM,
2177 
2178   /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2179      during reassociation of fp computation.  */
2180   m_ATOM
2181 };
2182 
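/* Illustrative sketch only (the real expansion is assumed to happen in
   ix86_option_override_internal below): the table above holds one CPU
   bitmask per X86_TUNE_* knob, and it is turned into the boolean
   ix86_tune_features[] array by testing the bit of the CPU selected with
   -mtune, roughly as follows.  */
#if 0
static void
example_expand_tune_features (enum processor_type tune)
{
  unsigned int tune_mask = 1u << tune;	/* e.g. 1 << PROCESSOR_BDVER2 */
  unsigned int i;

  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & tune_mask);
}
#endif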
2183 /* Feature tests against the various architecture variations.  */
2184 unsigned char ix86_arch_features[X86_ARCH_LAST];
2185 
2186 /* Feature tests against the various architecture variations, used to create
2187    ix86_arch_features based on the processor mask.  */
2188 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2189   /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
2190   ~(m_386 | m_486 | m_PENT | m_K6),
2191 
2192   /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
2193   ~m_386,
2194 
2195   /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2196   ~(m_386 | m_486),
2197 
2198   /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
2199   ~m_386,
2200 
2201   /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
2202   ~m_386,
2203 };
2204 
2205 static const unsigned int x86_accumulate_outgoing_args
2206   = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2207 
2208 static const unsigned int x86_arch_always_fancy_math_387
2209   = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2210 
2211 static const unsigned int x86_avx256_split_unaligned_load
2212   = m_COREI7 | m_GENERIC;
2213 
2214 static const unsigned int x86_avx256_split_unaligned_store
2215   = m_COREI7 | m_BDVER | m_GENERIC;
2216 
2217 /* In case the average insn count for a single function invocation is
2218    lower than this constant, emit fast (but longer) prologue and
2219    epilogue code.  */
2220 #define FAST_PROLOGUE_INSN_COUNT 20
2221 
2222 /* Names for 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
2223 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2224 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2225 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2226 
2227 /* Array of the smallest class containing reg number REGNO, indexed by
2228    REGNO.  Used by REGNO_REG_CLASS in i386.h.  */
2229 
2230 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2231 {
2232   /* ax, dx, cx, bx */
2233   AREG, DREG, CREG, BREG,
2234   /* si, di, bp, sp */
2235   SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2236   /* FP registers */
2237   FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2238   FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2239   /* arg pointer */
2240   NON_Q_REGS,
2241   /* flags, fpsr, fpcr, frame */
2242   NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2243   /* SSE registers */
2244   SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2245   SSE_REGS, SSE_REGS,
2246   /* MMX registers */
2247   MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2248   MMX_REGS, MMX_REGS,
2249   /* REX registers */
2250   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2252   /* SSE REX registers */
2253   SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2254   SSE_REGS, SSE_REGS,
2255 };
2256 
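/* Example of how the map above is used: REGNO_REG_CLASS (in i386.h) is
   assumed to index straight into it, so regclass_map[0] is AREG (%eax gets
   its own single-register class), while regclass_map[7] is NON_Q_REGS
   because %esp has no addressable low byte.  */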
2257 /* The "default" register map used in 32bit mode.  */
2258 
2259 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2260 {
2261   0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
2262   12, 13, 14, 15, 16, 17, 18, 19,	/* fp regs */
2263   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
2264   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE */
2265   29, 30, 31, 32, 33, 34, 35, 36,       /* MMX */
2266   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
2267   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
2268 };
2269 
2270 /* The "default" register map used in 64bit mode.  */
2271 
2272 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2273 {
2274   0, 1, 2, 3, 4, 5, 6, 7,		/* general regs */
2275   33, 34, 35, 36, 37, 38, 39, 40,	/* fp regs */
2276   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
2277   17, 18, 19, 20, 21, 22, 23, 24,	/* SSE */
2278   41, 42, 43, 44, 45, 46, 47, 48,       /* MMX */
2279   8,9,10,11,12,13,14,15,		/* extended integer registers */
2280   25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
2281 };
2282 
2283 /* Define the register numbers to be used in Dwarf debugging information.
2284    The SVR4 reference port C compiler uses the following register numbers
2285    in its Dwarf output code:
2286 	0 for %eax (gcc regno = 0)
2287 	1 for %ecx (gcc regno = 2)
2288 	2 for %edx (gcc regno = 1)
2289 	3 for %ebx (gcc regno = 3)
2290 	4 for %esp (gcc regno = 7)
2291 	5 for %ebp (gcc regno = 6)
2292 	6 for %esi (gcc regno = 4)
2293 	7 for %edi (gcc regno = 5)
2294    The following three DWARF register numbers are never generated by
2295    the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2296    believes these numbers have these meanings.
2297 	8  for %eip    (no gcc equivalent)
2298 	9  for %eflags (gcc regno = 17)
2299 	10 for %trapno (no gcc equivalent)
2300    It is not at all clear how we should number the FP stack registers
2301    for the x86 architecture.  If the version of SDB on x86/svr4 were
2302    a bit less brain dead with respect to floating-point then we would
2303    have a precedent to follow with respect to DWARF register numbers
2304    for x86 FP registers, but the SDB on x86/svr4 is so completely
2305    broken with respect to FP registers that it is hardly worth thinking
2306    of it as something to strive for compatibility with.
2307    The version of x86/svr4 SDB I have at the moment does (partially)
2308    seem to believe that DWARF register number 11 is associated with
2309    the x86 register %st(0), but that's about all.  Higher DWARF
2310    register numbers don't seem to be associated with anything in
2311    particular, and even for DWARF regno 11, SDB only seems to under-
2312    stand that it should say that a variable lives in %st(0) (when
2313    asked via an `=' command) if we said it was in DWARF regno 11,
2314    but SDB still prints garbage when asked for the value of the
2315    variable in question (via a `/' command).
2316    (Also note that the labels SDB prints for various FP stack regs
2317    when doing an `x' command are all wrong.)
2318    Note that these problems generally don't affect the native SVR4
2319    C compiler because it doesn't allow the use of -O with -g and
2320    because when it is *not* optimizing, it allocates a memory
2321    location for each floating-point variable, and the memory
2322    location is what gets described in the DWARF AT_location
2323    attribute for the variable in question.
2324    Regardless of the severe mental illness of the x86/svr4 SDB, we
2325    do something sensible here and we use the following DWARF
2326    register numbers.  Note that these are all stack-top-relative
2327    numbers.
2328 	11 for %st(0) (gcc regno = 8)
2329 	12 for %st(1) (gcc regno = 9)
2330 	13 for %st(2) (gcc regno = 10)
2331 	14 for %st(3) (gcc regno = 11)
2332 	15 for %st(4) (gcc regno = 12)
2333 	16 for %st(5) (gcc regno = 13)
2334 	17 for %st(6) (gcc regno = 14)
2335 	18 for %st(7) (gcc regno = 15)
2336 */
2337 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2338 {
2339   0, 2, 1, 3, 6, 7, 5, 4,		/* general regs */
2340   11, 12, 13, 14, 15, 16, 17, 18,	/* fp regs */
2341   -1, 9, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
2342   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE registers */
2343   29, 30, 31, 32, 33, 34, 35, 36,	/* MMX registers */
2344   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
2345   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
2346 };
2347 
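/* Worked example for the map above: gcc regno 7 is %esp and, per the SVR4
   numbering described in the comment, DWARF wants 4 for %esp, so
   svr4_dbx_register_map[7] == 4.  Likewise %eflags (gcc regno 17) maps to
   DWARF regno 9, and the FP stack registers map to the stack-top-relative
   numbers 11 through 18.  */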
2348 /* Define parameter passing and return registers.  */
2349 
2350 static int const x86_64_int_parameter_registers[6] =
2351 {
2352   DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2353 };
2354 
2355 static int const x86_64_ms_abi_int_parameter_registers[4] =
2356 {
2357   CX_REG, DX_REG, R8_REG, R9_REG
2358 };
2359 
2360 static int const x86_64_int_return_registers[4] =
2361 {
2362   AX_REG, DX_REG, DI_REG, SI_REG
2363 };
2364 
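/* Example of the conventions encoded above: for a SysV x86-64 call such as
   f (a, b, c) with integer arguments, a is passed in %rdi, b in %rsi and c
   in %rdx (DI_REG, SI_REG, DX_REG), whereas the MS ABI variant starts at
   %rcx.  A 128-bit integer return value comes back in %rax and %rdx
   (AX_REG, DX_REG).  */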
2365 /* Define the structure for the machine field in struct function.  */
2366 
2367 struct GTY(()) stack_local_entry {
2368   unsigned short mode;
2369   unsigned short n;
2370   rtx rtl;
2371   struct stack_local_entry *next;
2372 };
2373 
2374 /* Structure describing stack frame layout.
2375    Stack grows downward:
2376 
2377    [arguments]
2378 					<- ARG_POINTER
2379    saved pc
2380 
2381    saved static chain			if ix86_static_chain_on_stack
2382 
2383    saved frame pointer			if frame_pointer_needed
2384 					<- HARD_FRAME_POINTER
2385    [saved regs]
2386 					<- regs_save_offset
2387    [padding0]
2388 
2389    [saved SSE regs]
2390 					<- sse_regs_save_offset
2391    [padding1]          |
2392 		       |		<- FRAME_POINTER
2393    [va_arg registers]  |
2394 		       |
2395    [frame]	       |
2396 		       |
2397    [padding2]	       | = to_allocate
2398 					<- STACK_POINTER
2399   */
2400 struct ix86_frame
2401 {
2402   int nsseregs;
2403   int nregs;
2404   int va_arg_size;
2405   int red_zone_size;
2406   int outgoing_arguments_size;
2407   HOST_WIDE_INT frame;
2408 
2409   /* The offsets relative to ARG_POINTER.  */
2410   HOST_WIDE_INT frame_pointer_offset;
2411   HOST_WIDE_INT hard_frame_pointer_offset;
2412   HOST_WIDE_INT stack_pointer_offset;
2413   HOST_WIDE_INT hfp_save_offset;
2414   HOST_WIDE_INT reg_save_offset;
2415   HOST_WIDE_INT sse_reg_save_offset;
2416 
2417   /* When save_regs_using_mov is set, emit prologue using
2418      move instead of push instructions.  */
2419   bool save_regs_using_mov;
2420 };
2421 
2422 /* Which cpu are we scheduling for.  */
2423 enum attr_cpu ix86_schedule;
2424 
2425 /* Which cpu are we optimizing for.  */
2426 enum processor_type ix86_tune;
2427 
2428 /* Which instruction set architecture to use.  */
2429 enum processor_type ix86_arch;
2430 
2431 /* True if processor has SSE prefetch instruction.  */
2432 int x86_prefetch_sse;
2433 
2434 /* True if processor has prefetchw instruction.  */
2435 int x86_prefetchw;
2436 
2437 /* -mstackrealign option */
2438 static const char ix86_force_align_arg_pointer_string[]
2439   = "force_align_arg_pointer";
2440 
2441 static rtx (*ix86_gen_leave) (void);
2442 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2445 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2446 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2449 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2450 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2451 
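/* These generator hooks keep the rest of the backend word-size agnostic:
   option handling is assumed to point each of them at the SImode or DImode
   insn generator depending on TARGET_64BIT, roughly

     ix86_gen_add3 = TARGET_64BIT ? gen_adddi3 : gen_addsi3;

   so that, e.g., prologue/epilogue emission can simply call
   ix86_gen_add3 (dest, src1, src2) without checking the target word mode.  */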
2452 /* Preferred alignment for stack boundary in bits.  */
2453 unsigned int ix86_preferred_stack_boundary;
2454 
2455 /* Alignment for incoming stack boundary in bits specified at
2456    command line.  */
2457 static unsigned int ix86_user_incoming_stack_boundary;
2458 
2459 /* Default alignment for incoming stack boundary in bits.  */
2460 static unsigned int ix86_default_incoming_stack_boundary;
2461 
2462 /* Alignment for incoming stack boundary in bits.  */
2463 unsigned int ix86_incoming_stack_boundary;
2464 
2465 /* Calling abi specific va_list type nodes.  */
2466 static GTY(()) tree sysv_va_list_type_node;
2467 static GTY(()) tree ms_va_list_type_node;
2468 
2469 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL.  */
2470 char internal_label_prefix[16];
2471 int internal_label_prefix_len;
2472 
2473 /* Fence to use after loop using movnt.  */
2474 tree x86_mfence;
2475 
2476 /* Register class used for passing a given 64bit part of the argument.
2477    These represent classes as documented by the PS ABI, with the exception
2478    of the SSESF and SSEDF classes, which are basically the SSE class; gcc will
2479    just use an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2480 
2481    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2482    whenever possible (the upper half does contain padding).  */
2483 enum x86_64_reg_class
2484   {
2485     X86_64_NO_CLASS,
2486     X86_64_INTEGER_CLASS,
2487     X86_64_INTEGERSI_CLASS,
2488     X86_64_SSE_CLASS,
2489     X86_64_SSESF_CLASS,
2490     X86_64_SSEDF_CLASS,
2491     X86_64_SSEUP_CLASS,
2492     X86_64_X87_CLASS,
2493     X86_64_X87UP_CLASS,
2494     X86_64_COMPLEX_X87_CLASS,
2495     X86_64_MEMORY_CLASS
2496   };
2497 
2498 #define MAX_CLASSES 4
2499 
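/* Worked example for the classes above (a sketch of the psABI rules as the
   classification code below is assumed to apply them): a plain 'double'
   argument occupies one eightbyte of class X86_64_SSEDF_CLASS and is passed
   in an SSE register with a DFmode move; an 'int' is X86_64_INTEGERSI_CLASS
   and travels in the low half of a GPR with an SImode move; aggregates that
   do not fit the register classes end up as X86_64_MEMORY_CLASS and are
   passed on the stack.  */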
2500 /* Table of constants used by fldpi, fldln2, etc....  */
2501 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2502 static bool ext_80387_constants_init = 0;
2503 
2504 
2505 static struct machine_function * ix86_init_machine_status (void);
2506 static rtx ix86_function_value (const_tree, const_tree, bool);
2507 static bool ix86_function_value_regno_p (const unsigned int);
2508 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2509 						const_tree);
2510 static rtx ix86_static_chain (const_tree, bool);
2511 static int ix86_function_regparm (const_tree, const_tree);
2512 static void ix86_compute_frame_layout (struct ix86_frame *);
2513 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2514 						 rtx, rtx, int);
2515 static void ix86_add_new_builtins (HOST_WIDE_INT);
2516 static tree ix86_canonical_va_list_type (tree);
2517 static void predict_jump (int);
2518 static unsigned int split_stack_prologue_scratch_regno (void);
2519 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2520 
2521 enum ix86_function_specific_strings
2522 {
2523   IX86_FUNCTION_SPECIFIC_ARCH,
2524   IX86_FUNCTION_SPECIFIC_TUNE,
2525   IX86_FUNCTION_SPECIFIC_MAX
2526 };
2527 
2528 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2529 				 const char *, enum fpmath_unit, bool);
2530 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2531 static void ix86_function_specific_save (struct cl_target_option *);
2532 static void ix86_function_specific_restore (struct cl_target_option *);
2533 static void ix86_function_specific_print (FILE *, int,
2534 					  struct cl_target_option *);
2535 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2536 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2537 						 struct gcc_options *);
2538 static bool ix86_can_inline_p (tree, tree);
2539 static void ix86_set_current_function (tree);
2540 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2541 
2542 static enum calling_abi ix86_function_abi (const_tree);
2543 
2544 
2545 #ifndef SUBTARGET32_DEFAULT_CPU
2546 #define SUBTARGET32_DEFAULT_CPU "i386"
2547 #endif
2548 
2549 /* The svr4 ABI for the i386 says that records and unions are returned
2550    in memory.  */
2551 #ifndef DEFAULT_PCC_STRUCT_RETURN
2552 #define DEFAULT_PCC_STRUCT_RETURN 1
2553 #endif
2554 
2555 /* Whether -mtune= or -march= were specified */
2556 static int ix86_tune_defaulted;
2557 static int ix86_arch_specified;
2558 
2559 /* Vectorization library interface and handlers.  */
2560 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2561 
2562 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2563 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2564 
2565 /* Processor target table, indexed by processor number */
2566 struct ptt
2567 {
2568   const struct processor_costs *cost;		/* Processor costs */
2569   const int align_loop;				/* Default alignments.  */
2570   const int align_loop_max_skip;
2571   const int align_jump;
2572   const int align_jump_max_skip;
2573   const int align_func;
2574 };
2575 
2576 static const struct ptt processor_target_table[PROCESSOR_max] =
2577 {
2578   {&i386_cost, 4, 3, 4, 3, 4},
2579   {&i486_cost, 16, 15, 16, 15, 16},
2580   {&pentium_cost, 16, 7, 16, 7, 16},
2581   {&pentiumpro_cost, 16, 15, 16, 10, 16},
2582   {&geode_cost, 0, 0, 0, 0, 0},
2583   {&k6_cost, 32, 7, 32, 7, 32},
2584   {&athlon_cost, 16, 7, 16, 7, 16},
2585   {&pentium4_cost, 0, 0, 0, 0, 0},
2586   {&k8_cost, 16, 7, 16, 7, 16},
2587   {&nocona_cost, 0, 0, 0, 0, 0},
2588   /* Core 2 32-bit.  */
2589   {&generic32_cost, 16, 10, 16, 10, 16},
2590   /* Core 2 64-bit.  */
2591   {&generic64_cost, 16, 10, 16, 10, 16},
2592   /* Core i7 32-bit.  */
2593   {&generic32_cost, 16, 10, 16, 10, 16},
2594   /* Core i7 64-bit.  */
2595   {&generic64_cost, 16, 10, 16, 10, 16},
2596   {&generic32_cost, 16, 7, 16, 7, 16},
2597   {&generic64_cost, 16, 10, 16, 10, 16},
2598   {&amdfam10_cost, 32, 24, 32, 7, 32},
2599   {&bdver1_cost, 32, 24, 32, 7, 32},
2600   {&bdver2_cost, 32, 24, 32, 7, 32},
2601   {&btver1_cost, 32, 24, 32, 7, 32},
2602   {&atom_cost, 16, 15, 16, 7, 16}
2603 };
2604 
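/* Illustrative sketch only (the real selection is assumed to happen in
   ix86_option_override_internal below): the table above is indexed by
   PROCESSOR_* values, so picking the active cost table and the default
   alignments for the -mtune CPU looks roughly like this.  */
#if 0
static void
example_pick_target_costs (enum processor_type tune)
{
  const struct ptt *entry = &processor_target_table[tune];

  ix86_cost = entry->cost;		/* e.g. &bdver2_cost for -mtune=bdver2 */
  if (align_loops == 0)			/* only if no -falign-loops was given */
    align_loops = entry->align_loop;
}
#endif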
2605 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2606 {
2607   "generic",
2608   "i386",
2609   "i486",
2610   "pentium",
2611   "pentium-mmx",
2612   "pentiumpro",
2613   "pentium2",
2614   "pentium3",
2615   "pentium4",
2616   "pentium-m",
2617   "prescott",
2618   "nocona",
2619   "core2",
2620   "corei7",
2621   "atom",
2622   "geode",
2623   "k6",
2624   "k6-2",
2625   "k6-3",
2626   "athlon",
2627   "athlon-4",
2628   "k8",
2629   "amdfam10",
2630   "bdver1",
2631   "bdver2",
2632   "btver1"
2633 };
2634 
2635 /* Return true if a red-zone is in use.  */
2636 
2637 static inline bool
2638 ix86_using_red_zone (void)
2639 {
2640   return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2641 }
2642 
2643 /* Return a string that documents the current -m options.  The caller is
2644    responsible for freeing the string.  */
2645 
2646 static char *
2647 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2648 		    const char *tune, enum fpmath_unit fpmath,
2649 		    bool add_nl_p)
2650 {
2651   struct ix86_target_opts
2652   {
2653     const char *option;		/* option string */
2654     HOST_WIDE_INT mask;		/* isa mask options */
2655   };
2656 
2657   /* This table is ordered so that options like -msse4.2 that imply
2658      preceding options are matched first.  */
2659   static struct ix86_target_opts isa_opts[] =
2660   {
2661     { "-m64",		OPTION_MASK_ISA_64BIT },
2662     { "-mfma4",		OPTION_MASK_ISA_FMA4 },
2663     { "-mfma",		OPTION_MASK_ISA_FMA },
2664     { "-mxop",		OPTION_MASK_ISA_XOP },
2665     { "-mlwp",		OPTION_MASK_ISA_LWP },
2666     { "-msse4a",	OPTION_MASK_ISA_SSE4A },
2667     { "-msse4.2",	OPTION_MASK_ISA_SSE4_2 },
2668     { "-msse4.1",	OPTION_MASK_ISA_SSE4_1 },
2669     { "-mssse3",	OPTION_MASK_ISA_SSSE3 },
2670     { "-msse3",		OPTION_MASK_ISA_SSE3 },
2671     { "-msse2",		OPTION_MASK_ISA_SSE2 },
2672     { "-msse",		OPTION_MASK_ISA_SSE },
2673     { "-m3dnow",	OPTION_MASK_ISA_3DNOW },
2674     { "-m3dnowa",	OPTION_MASK_ISA_3DNOW_A },
2675     { "-mmmx",		OPTION_MASK_ISA_MMX },
2676     { "-mabm",		OPTION_MASK_ISA_ABM },
2677     { "-mbmi",		OPTION_MASK_ISA_BMI },
2678     { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
2679     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
2680     { "-mtbm",		OPTION_MASK_ISA_TBM },
2681     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
2682     { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
2683     { "-mcrc32",	OPTION_MASK_ISA_CRC32 },
2684     { "-maes",		OPTION_MASK_ISA_AES },
2685     { "-mpclmul",	OPTION_MASK_ISA_PCLMUL },
2686     { "-mfsgsbase",	OPTION_MASK_ISA_FSGSBASE },
2687     { "-mrdrnd",	OPTION_MASK_ISA_RDRND },
2688     { "-mf16c",		OPTION_MASK_ISA_F16C },
2689   };
2690 
2691   /* Flag options.  */
2692   static struct ix86_target_opts flag_opts[] =
2693   {
2694     { "-m128bit-long-double",		MASK_128BIT_LONG_DOUBLE },
2695     { "-m80387",			MASK_80387 },
2696     { "-maccumulate-outgoing-args",	MASK_ACCUMULATE_OUTGOING_ARGS },
2697     { "-malign-double",			MASK_ALIGN_DOUBLE },
2698     { "-mcld",				MASK_CLD },
2699     { "-mfp-ret-in-387",		MASK_FLOAT_RETURNS },
2700     { "-mieee-fp",			MASK_IEEE_FP },
2701     { "-minline-all-stringops",		MASK_INLINE_ALL_STRINGOPS },
2702     { "-minline-stringops-dynamically",	MASK_INLINE_STRINGOPS_DYNAMICALLY },
2703     { "-mms-bitfields",			MASK_MS_BITFIELD_LAYOUT },
2704     { "-mno-align-stringops",		MASK_NO_ALIGN_STRINGOPS },
2705     { "-mno-fancy-math-387",		MASK_NO_FANCY_MATH_387 },
2706     { "-mno-push-args",			MASK_NO_PUSH_ARGS },
2707     { "-mno-red-zone",			MASK_NO_RED_ZONE },
2708     { "-momit-leaf-frame-pointer",	MASK_OMIT_LEAF_FRAME_POINTER },
2709     { "-mrecip",			MASK_RECIP },
2710     { "-mrtd",				MASK_RTD },
2711     { "-msseregparm",			MASK_SSEREGPARM },
2712     { "-mstack-arg-probe",		MASK_STACK_PROBE },
2713     { "-mtls-direct-seg-refs",		MASK_TLS_DIRECT_SEG_REFS },
2714     { "-mvect8-ret-in-mem",		MASK_VECT8_RETURNS },
2715     { "-m8bit-idiv",			MASK_USE_8BIT_IDIV },
2716     { "-mvzeroupper",			MASK_VZEROUPPER },
2717     { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2718     { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE},
2719     { "-mprefer-avx128",		MASK_PREFER_AVX128},
2720   };
2721 
2722   const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2723 
2724   char isa_other[40];
2725   char target_other[40];
2726   unsigned num = 0;
2727   unsigned i, j;
2728   char *ret;
2729   char *ptr;
2730   size_t len;
2731   size_t line_len;
2732   size_t sep_len;
2733 
2734   memset (opts, '\0', sizeof (opts));
2735 
2736   /* Add -march= option.  */
2737   if (arch)
2738     {
2739       opts[num][0] = "-march=";
2740       opts[num++][1] = arch;
2741     }
2742 
2743   /* Add -mtune= option.  */
2744   if (tune)
2745     {
2746       opts[num][0] = "-mtune=";
2747       opts[num++][1] = tune;
2748     }
2749 
2750   /* Pick out the options in isa options.  */
2751   for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2752     {
2753       if ((isa & isa_opts[i].mask) != 0)
2754 	{
2755 	  opts[num++][0] = isa_opts[i].option;
2756 	  isa &= ~ isa_opts[i].mask;
2757 	}
2758     }
2759 
2760   if (isa && add_nl_p)
2761     {
2762       opts[num++][0] = isa_other;
2763       sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2764 	       isa);
2765     }
2766 
2767   /* Add flag options.  */
2768   for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2769     {
2770       if ((flags & flag_opts[i].mask) != 0)
2771 	{
2772 	  opts[num++][0] = flag_opts[i].option;
2773 	  flags &= ~ flag_opts[i].mask;
2774 	}
2775     }
2776 
2777   if (flags && add_nl_p)
2778     {
2779       opts[num++][0] = target_other;
2780       sprintf (target_other, "(other flags: %#x)", flags);
2781     }
2782 
2783   /* Add -fpmath= option.  */
2784   if (fpmath)
2785     {
2786       opts[num][0] = "-mfpmath=";
2787       switch ((int) fpmath)
2788 	{
2789 	case FPMATH_387:
2790 	  opts[num++][1] = "387";
2791 	  break;
2792 
2793 	case FPMATH_SSE:
2794 	  opts[num++][1] = "sse";
2795 	  break;
2796 
2797 	case FPMATH_387 | FPMATH_SSE:
2798 	  opts[num++][1] = "sse+387";
2799 	  break;
2800 
2801 	default:
2802 	  gcc_unreachable ();
2803 	}
2804     }
2805 
2806   /* Any options?  */
2807   if (num == 0)
2808     return NULL;
2809 
2810   gcc_assert (num < ARRAY_SIZE (opts));
2811 
2812   /* Size the string.  */
2813   len = 0;
2814   sep_len = (add_nl_p) ? 3 : 1;
2815   for (i = 0; i < num; i++)
2816     {
2817       len += sep_len;
2818       for (j = 0; j < 2; j++)
2819 	if (opts[i][j])
2820 	  len += strlen (opts[i][j]);
2821     }
2822 
2823   /* Build the string.  */
2824   ret = ptr = (char *) xmalloc (len);
2825   line_len = 0;
2826 
2827   for (i = 0; i < num; i++)
2828     {
2829       size_t len2[2];
2830 
2831       for (j = 0; j < 2; j++)
2832 	len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2833 
2834       if (i != 0)
2835 	{
2836 	  *ptr++ = ' ';
2837 	  line_len++;
2838 
2839 	  if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2840 	    {
2841 	      *ptr++ = '\\';
2842 	      *ptr++ = '\n';
2843 	      line_len = 0;
2844 	    }
2845 	}
2846 
2847       for (j = 0; j < 2; j++)
2848 	if (opts[i][j])
2849 	  {
2850 	    memcpy (ptr, opts[i][j], len2[j]);
2851 	    ptr += len2[j];
2852 	    line_len += len2[j];
2853 	  }
2854     }
2855 
2856   *ptr = '\0';
2857   gcc_assert (ret + len >= ptr);
2858 
2859   return ret;
2860 }
2861 
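/* Editorial sketch (not part of the original file): the string built by
   ix86_target_string above lists -march=/-mtune= first, then the enabled
   ISA options, then the flag options, and finally -mfpmath=.  A purely
   illustrative result might look like

       -march=corei7 -mtune=generic64 -msse4.2 -msse2 -mfpmath=sse

   with backslash-newline separators inserted when ADD_NL_P is true and a
   line would exceed roughly 70 columns.  The caller owns the returned
   xmalloc'd buffer and must free it.  */
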
2862 /* Return true if profiling code should be emitted before the
2863    prologue, and false otherwise.
2864    Note: for x86 with "hotfix", unsupported combinations are reported with sorry ().  */
2865 static bool
2866 ix86_profile_before_prologue (void)
2867 {
2868   return flag_fentry != 0;
2869 }
2870 
2871 /* Function that is callable from the debugger to print the current
2872    options.  */
2873 void
2874 ix86_debug_options (void)
2875 {
2876   char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2877 				   ix86_arch_string, ix86_tune_string,
2878 				   ix86_fpmath, true);
2879 
2880   if (opts)
2881     {
2882       fprintf (stderr, "%s\n\n", opts);
2883       free (opts);
2884     }
2885   else
2886     fputs ("<no options>\n\n", stderr);
2887 
2888   return;
2889 }
2890 
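/* Editorial sketch (not part of the original file): since ix86_debug_options
   takes no arguments and prints to stderr, it can be invoked directly from a
   debugger session attached to cc1, e.g.

       (gdb) call ix86_debug_options ()

   which dumps the currently effective -march/-mtune/ISA/flag settings.  */
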
2891 /* Override various settings based on options.  If MAIN_ARGS_P, the
2892    options are from the command line, otherwise they are from
2893    attributes.  */
2894 
2895 static void
2896 ix86_option_override_internal (bool main_args_p)
2897 {
2898   int i;
2899   unsigned int ix86_arch_mask, ix86_tune_mask;
2900   const bool ix86_tune_specified = (ix86_tune_string != NULL);
2901   const char *prefix;
2902   const char *suffix;
2903   const char *sw;
2904 
2905 #define PTA_3DNOW	 	(HOST_WIDE_INT_1 << 0)
2906 #define PTA_3DNOW_A	 	(HOST_WIDE_INT_1 << 1)
2907 #define PTA_64BIT		(HOST_WIDE_INT_1 << 2)
2908 #define PTA_ABM			(HOST_WIDE_INT_1 << 3)
2909 #define PTA_AES		 	(HOST_WIDE_INT_1 << 4)
2910 #define PTA_AVX			(HOST_WIDE_INT_1 << 5)
2911 #define PTA_BMI		 	(HOST_WIDE_INT_1 << 6)
2912 #define PTA_CX16		(HOST_WIDE_INT_1 << 7)
2913 #define PTA_F16C		(HOST_WIDE_INT_1 << 8)
2914 #define PTA_FMA			(HOST_WIDE_INT_1 << 9)
2915 #define PTA_FMA4	 	(HOST_WIDE_INT_1 << 10)
2916 #define PTA_FSGSBASE		(HOST_WIDE_INT_1 << 11)
2917 #define PTA_LWP		 	(HOST_WIDE_INT_1 << 12)
2918 #define PTA_LZCNT	 	(HOST_WIDE_INT_1 << 13)
2919 #define PTA_MMX			(HOST_WIDE_INT_1 << 14)
2920 #define PTA_MOVBE		(HOST_WIDE_INT_1 << 15)
2921 #define PTA_NO_SAHF		(HOST_WIDE_INT_1 << 16)
2922 #define PTA_PCLMUL		(HOST_WIDE_INT_1 << 17)
2923 #define PTA_POPCNT		(HOST_WIDE_INT_1 << 18)
2924 #define PTA_PREFETCH_SSE	(HOST_WIDE_INT_1 << 19)
2925 #define PTA_RDRND	 	(HOST_WIDE_INT_1 << 20)
2926 #define PTA_SSE			(HOST_WIDE_INT_1 << 21)
2927 #define PTA_SSE2		(HOST_WIDE_INT_1 << 22)
2928 #define PTA_SSE3		(HOST_WIDE_INT_1 << 23)
2929 #define PTA_SSE4_1	 	(HOST_WIDE_INT_1 << 24)
2930 #define PTA_SSE4_2	 	(HOST_WIDE_INT_1 << 25)
2931 #define PTA_SSE4A		(HOST_WIDE_INT_1 << 26)
2932 #define PTA_SSSE3		(HOST_WIDE_INT_1 << 27)
2933 #define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
2934 #define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
2935 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
2936 #define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
2937 #define PTA_PREFETCHW		(HOST_WIDE_INT_1 << 32)
2938 
2939 /* if this reaches 64, need to widen struct pta flags below */
2940 /* If this reaches 64, the flags field of struct pta below needs to be widened.  */
2941   static struct pta
2942     {
2943       const char *const name;		/* processor name or nickname.  */
2944       const enum processor_type processor;
2945       const enum attr_cpu schedule;
2946       const unsigned HOST_WIDE_INT flags;
2947     }
2948   const processor_alias_table[] =
2949     {
2950       {"i386", PROCESSOR_I386, CPU_NONE, 0},
2951       {"i486", PROCESSOR_I486, CPU_NONE, 0},
2952       {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2953       {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2954       {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2955       {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2956       {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2957       {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2958       {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2959       {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2960       {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2961       {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2962       {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2963 	PTA_MMX | PTA_SSE},
2964       {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2965 	PTA_MMX | PTA_SSE},
2966       {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2967 	PTA_MMX | PTA_SSE | PTA_SSE2},
2968       {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2969 	PTA_MMX |PTA_SSE | PTA_SSE2},
2970       {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2971 	PTA_MMX | PTA_SSE | PTA_SSE2},
2972       {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2973 	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2974       {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2975 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 	| PTA_CX16 | PTA_NO_SAHF},
2977       {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2978 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 	| PTA_SSSE3 | PTA_CX16},
2980       {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2981 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2982 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT},
2983       {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2984 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2985 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2986 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2987       {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2988 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2989 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2990 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2991 	| PTA_RDRND | PTA_F16C},
2992       {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2993 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2995 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2996 	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2997 	| PTA_FMA | PTA_MOVBE},
2998       {"atom", PROCESSOR_ATOM, CPU_ATOM,
2999 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3000 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3001       {"geode", PROCESSOR_GEODE, CPU_GEODE,
3002 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3003       {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3004       {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3005       {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3006       {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3007 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3008       {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3009 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3010       {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3011 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3012       {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3013 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3014       {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3015 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3016       {"x86-64", PROCESSOR_K8, CPU_K8,
3017 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3018       {"k8", PROCESSOR_K8, CPU_K8,
3019 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 	| PTA_SSE2 | PTA_NO_SAHF},
3021       {"k8-sse3", PROCESSOR_K8, CPU_K8,
3022 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024       {"opteron", PROCESSOR_K8, CPU_K8,
3025 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 	| PTA_SSE2 | PTA_NO_SAHF},
3027       {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3028 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030       {"athlon64", PROCESSOR_K8, CPU_K8,
3031 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 	| PTA_SSE2 | PTA_NO_SAHF},
3033       {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3034 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3036       {"athlon-fx", PROCESSOR_K8, CPU_K8,
3037 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 	| PTA_SSE2 | PTA_NO_SAHF},
3039       {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3040 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3041 	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3042       {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3043 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3044 	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3045       {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3046 	PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3047 	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3048 	| PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3049 	| PTA_FMA4 | PTA_XOP | PTA_LWP},
3050       {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3051 	PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3052 	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3053 	| PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3054 	| PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3055 	| PTA_FMA},
3056       {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3057 	PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3058 	| PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3059       {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3060 	0 /* flags are only used for -march switch.  */ },
3061       {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3062 	PTA_64BIT /* flags are only used for -march switch.  */ },
3063     };
3064 
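  /* Editorial sketch (not part of the original file): a new -march alias for
     an already-supported processor only needs one more initializer in the
     table above; the alias name used here is hypothetical.

         {"my-k8-alias", PROCESSOR_K8, CPU_K8,
          PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},

     A genuinely new processor would additionally need new PROCESSOR_* and
     CPU_* enumerators plus tuning tables defined elsewhere.  */
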
3065   /* -mrecip options.  */
3066   static struct
3067     {
3068       const char *string;           /* option name */
3069       unsigned int mask;            /* mask bits to set */
3070     }
3071   const recip_options[] =
3072     {
3073       { "all",       RECIP_MASK_ALL },
3074       { "none",      RECIP_MASK_NONE },
3075       { "div",       RECIP_MASK_DIV },
3076       { "sqrt",      RECIP_MASK_SQRT },
3077       { "vec-div",   RECIP_MASK_VEC_DIV },
3078       { "vec-sqrt",  RECIP_MASK_VEC_SQRT },
3079     };
3080 
3081   int const pta_size = ARRAY_SIZE (processor_alias_table);
3082 
3083   /* Set up prefix/suffix so the error messages refer to either the command
3084      line argument, or the attribute(target).  */
3085   if (main_args_p)
3086     {
3087       prefix = "-m";
3088       suffix = "";
3089       sw = "switch";
3090     }
3091   else
3092     {
3093       prefix = "option(\"";
3094       suffix = "\")";
3095       sw = "attribute";
3096     }
3097 
3098 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3099   SUBTARGET_OVERRIDE_OPTIONS;
3100 #endif
3101 
3102 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3103   SUBSUBTARGET_OVERRIDE_OPTIONS;
3104 #endif
3105 
3106   if (TARGET_X32)
3107     ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3108 
3109   /* -fPIC is the default for x86_64.  */
3110   if (TARGET_MACHO && TARGET_64BIT)
3111     flag_pic = 2;
3112 
3113   /* Need to check -mtune=generic first.  */
3114   if (ix86_tune_string)
3115     {
3116       if (!strcmp (ix86_tune_string, "generic")
3117 	  || !strcmp (ix86_tune_string, "i686")
3118 	  /* As special support for cross compilers we read -mtune=native
3119 	     as -mtune=generic.  With native compilers we won't see the
3120 	     -mtune=native, as it was changed by the driver.  */
3121 	  || !strcmp (ix86_tune_string, "native"))
3122 	{
3123 	  if (TARGET_64BIT)
3124 	    ix86_tune_string = "generic64";
3125 	  else
3126 	    ix86_tune_string = "generic32";
3127 	}
3128       /* If this call is for setting the option attribute, allow the
3129 	 generic32/generic64 that was previously set.  */
3130       else if (!main_args_p
3131 	       && (!strcmp (ix86_tune_string, "generic32")
3132 		   || !strcmp (ix86_tune_string, "generic64")))
3133 	;
3134       else if (!strncmp (ix86_tune_string, "generic", 7))
3135         error ("bad value (%s) for %stune=%s %s",
3136 	       ix86_tune_string, prefix, suffix, sw);
3137       else if (!strcmp (ix86_tune_string, "x86-64"))
3138         warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3139                  "%stune=k8%s or %stune=generic%s instead as appropriate",
3140                  prefix, suffix, prefix, suffix, prefix, suffix);
3141     }
3142   else
3143     {
3144       if (ix86_arch_string)
3145 	ix86_tune_string = ix86_arch_string;
3146       if (!ix86_tune_string)
3147 	{
3148 	  ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3149 	  ix86_tune_defaulted = 1;
3150 	}
3151 
3152       /* ix86_tune_string is set to ix86_arch_string or defaulted.  We
3153 	 need to use a sensible tune option.  */
3154       if (!strcmp (ix86_tune_string, "generic")
3155 	  || !strcmp (ix86_tune_string, "x86-64")
3156 	  || !strcmp (ix86_tune_string, "i686"))
3157 	{
3158 	  if (TARGET_64BIT)
3159 	    ix86_tune_string = "generic64";
3160 	  else
3161 	    ix86_tune_string = "generic32";
3162 	}
3163     }
3164 
3165   if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3166     {
3167       /* rep; movq isn't available in 32-bit code.  */
3168       error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3169       ix86_stringop_alg = no_stringop;
3170     }
3171 
3172   if (!ix86_arch_string)
3173     ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3174   else
3175     ix86_arch_specified = 1;
3176 
3177   if (!global_options_set.x_ix86_abi)
3178     ix86_abi = DEFAULT_ABI;
3179 
3180   if (global_options_set.x_ix86_cmodel)
3181     {
3182       switch (ix86_cmodel)
3183 	{
3184 	case CM_SMALL:
3185 	case CM_SMALL_PIC:
3186 	  if (flag_pic)
3187 	    ix86_cmodel = CM_SMALL_PIC;
3188 	  if (!TARGET_64BIT)
3189 	    error ("code model %qs not supported in the %s bit mode",
3190 		   "small", "32");
3191 	  break;
3192 
3193 	case CM_MEDIUM:
3194 	case CM_MEDIUM_PIC:
3195 	  if (flag_pic)
3196 	    ix86_cmodel = CM_MEDIUM_PIC;
3197 	  if (!TARGET_64BIT)
3198 	    error ("code model %qs not supported in the %s bit mode",
3199 		   "medium", "32");
3200 	  else if (TARGET_X32)
3201 	    error ("code model %qs not supported in x32 mode",
3202 		   "medium");
3203 	  break;
3204 
3205 	case CM_LARGE:
3206 	case CM_LARGE_PIC:
3207 	  if (flag_pic)
3208 	    ix86_cmodel = CM_LARGE_PIC;
3209 	  if (!TARGET_64BIT)
3210 	    error ("code model %qs not supported in the %s bit mode",
3211 		   "large", "32");
3212 	  else if (TARGET_X32)
3213 	    error ("code model %qs not supported in x32 mode",
3214 		   "large");
3215 	  break;
3216 
3217 	case CM_32:
3218 	  if (flag_pic)
3219 	    error ("code model %s does not support PIC mode", "32");
3220 	  if (TARGET_64BIT)
3221 	    error ("code model %qs not supported in the %s bit mode",
3222 		   "32", "64");
3223 	  break;
3224 
3225 	case CM_KERNEL:
3226 	  if (flag_pic)
3227 	    {
3228 	      error ("code model %s does not support PIC mode", "kernel");
3229 	      ix86_cmodel = CM_32;
3230 	    }
3231 	  if (!TARGET_64BIT)
3232 	    error ("code model %qs not supported in the %s bit mode",
3233 		   "kernel", "32");
3234 	  break;
3235 
3236 	default:
3237 	  gcc_unreachable ();
3238 	}
3239     }
3240   else
3241     {
3242       /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3243 	 use of rip-relative addressing.  This eliminates fixups that
3244 	 would otherwise be needed if this object is to be placed in a
3245 	 DLL, and is essentially just as efficient as direct addressing.  */
3246       if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3247 	ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3248       else if (TARGET_64BIT)
3249 	ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3250       else
3251         ix86_cmodel = CM_32;
3252     }
3253   if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3254     {
3255       error ("-masm=intel not supported in this configuration");
3256       ix86_asm_dialect = ASM_ATT;
3257     }
3258   if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3259     sorry ("%i-bit mode not compiled in",
3260 	   (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3261 
3262   for (i = 0; i < pta_size; i++)
3263     if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3264       {
3265 	ix86_schedule = processor_alias_table[i].schedule;
3266 	ix86_arch = processor_alias_table[i].processor;
3267 	/* Default cpu tuning to the architecture.  */
3268 	ix86_tune = ix86_arch;
3269 
3270 	if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3271 	  error ("CPU you selected does not support x86-64 "
3272 		 "instruction set");
3273 
3274 	if (processor_alias_table[i].flags & PTA_MMX
3275 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3276 	  ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3277 	if (processor_alias_table[i].flags & PTA_3DNOW
3278 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3279 	  ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3280 	if (processor_alias_table[i].flags & PTA_3DNOW_A
3281 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3282 	  ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3283 	if (processor_alias_table[i].flags & PTA_SSE
3284 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3285 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3286 	if (processor_alias_table[i].flags & PTA_SSE2
3287 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3288 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3289 	if (processor_alias_table[i].flags & PTA_SSE3
3290 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3291 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3292 	if (processor_alias_table[i].flags & PTA_SSSE3
3293 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3294 	  ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3295 	if (processor_alias_table[i].flags & PTA_SSE4_1
3296 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3297 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3298 	if (processor_alias_table[i].flags & PTA_SSE4_2
3299 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3300 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3301 	if (processor_alias_table[i].flags & PTA_AVX
3302 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3303 	  ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3304 	if (processor_alias_table[i].flags & PTA_AVX2
3305 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3306 	  ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3307 	if (processor_alias_table[i].flags & PTA_FMA
3308 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3309 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3310 	if (processor_alias_table[i].flags & PTA_SSE4A
3311 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3312 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3313 	if (processor_alias_table[i].flags & PTA_FMA4
3314 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3315 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3316 	if (processor_alias_table[i].flags & PTA_XOP
3317 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3318 	  ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3319 	if (processor_alias_table[i].flags & PTA_LWP
3320 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3321 	  ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3322 	if (processor_alias_table[i].flags & PTA_ABM
3323 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3324 	  ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3325 	if (processor_alias_table[i].flags & PTA_BMI
3326 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3327 	  ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3328 	if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3329 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3330 	  ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3331 	if (processor_alias_table[i].flags & PTA_TBM
3332 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3333 	  ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3334 	if (processor_alias_table[i].flags & PTA_BMI2
3335 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3336 	  ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3337 	if (processor_alias_table[i].flags & PTA_CX16
3338 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3339 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3340 	if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3341 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3342 	  ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3343 	if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3344 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3345 	  ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3346 	if (processor_alias_table[i].flags & PTA_MOVBE
3347 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3348 	  ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3349 	if (processor_alias_table[i].flags & PTA_AES
3350 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3351 	  ix86_isa_flags |= OPTION_MASK_ISA_AES;
3352 	if (processor_alias_table[i].flags & PTA_PCLMUL
3353 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3354 	  ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3355 	if (processor_alias_table[i].flags & PTA_FSGSBASE
3356 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3357 	  ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3358 	if (processor_alias_table[i].flags & PTA_RDRND
3359 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3360 	  ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3361 	if (processor_alias_table[i].flags & PTA_F16C
3362 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3363 	  ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3364 	if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3365 	  x86_prefetch_sse = true;
3366 	if (processor_alias_table[i].flags & PTA_PREFETCHW)
3367 	  x86_prefetchw = true;
3368 
3369 	break;
3370       }
3371 
3372   if (!strcmp (ix86_arch_string, "generic"))
3373     error ("generic CPU can be used only for %stune=%s %s",
3374 	   prefix, suffix, sw);
3375   else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3376     error ("bad value (%s) for %sarch=%s %s",
3377 	   ix86_arch_string, prefix, suffix, sw);
3378 
3379   ix86_arch_mask = 1u << ix86_arch;
3380   for (i = 0; i < X86_ARCH_LAST; ++i)
3381     ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3382 
3383   for (i = 0; i < pta_size; i++)
3384     if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3385       {
3386 	ix86_schedule = processor_alias_table[i].schedule;
3387 	ix86_tune = processor_alias_table[i].processor;
3388 	if (TARGET_64BIT)
3389 	  {
3390 	    if (!(processor_alias_table[i].flags & PTA_64BIT))
3391 	      {
3392 		if (ix86_tune_defaulted)
3393 		  {
3394 		    ix86_tune_string = "x86-64";
3395 		    for (i = 0; i < pta_size; i++)
3396 		      if (! strcmp (ix86_tune_string,
3397 				    processor_alias_table[i].name))
3398 			break;
3399 		    ix86_schedule = processor_alias_table[i].schedule;
3400 		    ix86_tune = processor_alias_table[i].processor;
3401 		  }
3402 		else
3403 		  error ("CPU you selected does not support x86-64 "
3404 			 "instruction set");
3405 	      }
3406 	  }
3407 	else
3408 	  {
3409 	    /* Adjust tuning when compiling for 32-bit ABI.  */
3410 	    switch (ix86_tune)
3411 	      {
3412 	      case PROCESSOR_GENERIC64:
3413 		ix86_tune = PROCESSOR_GENERIC32;
3414 		ix86_schedule = CPU_PENTIUMPRO;
3415 		break;
3416 
3417 	      case PROCESSOR_CORE2_64:
3418 		ix86_tune = PROCESSOR_CORE2_32;
3419 		break;
3420 
3421 	      case PROCESSOR_COREI7_64:
3422 		ix86_tune = PROCESSOR_COREI7_32;
3423 		break;
3424 
3425 	      default:
3426 		break;
3427 	      }
3428 	  }
3429 	/* Intel CPUs have always interpreted SSE prefetch instructions as
3430 	   NOPs; so, we can enable SSE prefetch instructions even when
3431 	   -mtune (rather than -march) points us to a processor that has them.
3432 	   However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3433 	   higher processors.  */
3434 	if (TARGET_CMOV
3435 	    && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3436 	  x86_prefetch_sse = true;
3437 	break;
3438       }
3439 
3440   if (ix86_tune_specified && i == pta_size)
3441     error ("bad value (%s) for %stune=%s %s",
3442 	   ix86_tune_string, prefix, suffix, sw);
3443 
3444   ix86_tune_mask = 1u << ix86_tune;
3445   for (i = 0; i < X86_TUNE_LAST; ++i)
3446     ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3447 
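  /* Editorial note (not part of the original file): both feature arrays are
     filled the same way.  Each initial_ix86_*_features[i] is a bitmask with
     one bit per processor, so with ix86_tune == PROCESSOR_K8, for example,
     the assignment above reduces to

         ix86_tune_features[i] = (initial_ix86_tune_features[i]
                                  & (1u << PROCESSOR_K8)) != 0;

     i.e. feature I is enabled exactly when its bit for that processor is set.  */
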
3448 #ifndef USE_IX86_FRAME_POINTER
3449 #define USE_IX86_FRAME_POINTER 0
3450 #endif
3451 
3452 #ifndef USE_X86_64_FRAME_POINTER
3453 #define USE_X86_64_FRAME_POINTER 0
3454 #endif
3455 
3456   /* Set the default values for switches whose default depends on TARGET_64BIT
3457      in case they weren't overwritten by command line options.  */
3458   if (TARGET_64BIT)
3459     {
3460       if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3461 	flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3462       if (flag_asynchronous_unwind_tables == 2)
3463 	flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3464       if (flag_pcc_struct_return == 2)
3465 	flag_pcc_struct_return = 0;
3466     }
3467   else
3468     {
3469       if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3470 	flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3471       if (flag_asynchronous_unwind_tables == 2)
3472 	flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3473       if (flag_pcc_struct_return == 2)
3474 	flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3475     }
3476 
3477   if (optimize_size)
3478     ix86_cost = &ix86_size_cost;
3479   else
3480     ix86_cost = processor_target_table[ix86_tune].cost;
3481 
3482   /* Arrange to set up i386_stack_locals for all functions.  */
3483   init_machine_status = ix86_init_machine_status;
3484 
3485   /* Validate -mregparm= value.  */
3486   if (global_options_set.x_ix86_regparm)
3487     {
3488       if (TARGET_64BIT)
3489 	warning (0, "-mregparm is ignored in 64-bit mode");
3490       if (ix86_regparm > REGPARM_MAX)
3491 	{
3492 	  error ("-mregparm=%d is not between 0 and %d",
3493 		 ix86_regparm, REGPARM_MAX);
3494 	  ix86_regparm = 0;
3495 	}
3496     }
3497   if (TARGET_64BIT)
3498     ix86_regparm = REGPARM_MAX;
3499 
3500   /* Default align_* from the processor table.  */
3501   if (align_loops == 0)
3502     {
3503       align_loops = processor_target_table[ix86_tune].align_loop;
3504       align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3505     }
3506   if (align_jumps == 0)
3507     {
3508       align_jumps = processor_target_table[ix86_tune].align_jump;
3509       align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3510     }
3511   if (align_functions == 0)
3512     {
3513       align_functions = processor_target_table[ix86_tune].align_func;
3514     }
3515 
3516   /* Provide default for -mbranch-cost= value.  */
3517   if (!global_options_set.x_ix86_branch_cost)
3518     ix86_branch_cost = ix86_cost->branch_cost;
3519 
3520   if (TARGET_64BIT)
3521     {
3522       target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3523 
3524       /* Enable by default the SSE and MMX builtins.  Do allow the user to
3525 	 explicitly disable any of these.  In particular, disabling SSE and
3526 	 MMX for kernel code is extremely useful.  */
3527       if (!ix86_arch_specified)
3528 	ix86_isa_flags
3529 	  |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3530 	       | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3531 
3532       if (TARGET_RTD)
3533 	warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3534     }
3535   else
3536     {
3537       target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3538 
3539       if (!ix86_arch_specified)
3540 	ix86_isa_flags
3541 	  |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3542 
3543       /* The i386 ABI does not specify a red zone.  It still makes sense to use
3544          it when the programmer takes care to keep the stack from being destroyed.  */
3545       if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3546         target_flags |= MASK_NO_RED_ZONE;
3547     }
3548 
3549   /* Keep nonleaf frame pointers.  */
3550   if (flag_omit_frame_pointer)
3551     target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3552   else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3553     flag_omit_frame_pointer = 1;
3554 
3555   /* If we're doing fast math, we don't care about comparison order
3556      wrt NaNs.  This lets us use a shorter comparison sequence.  */
3557   if (flag_finite_math_only)
3558     target_flags &= ~MASK_IEEE_FP;
3559 
3560   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3561      since the insns won't need emulation.  */
3562   if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3563     target_flags &= ~MASK_NO_FANCY_MATH_387;
3564 
3565   /* Likewise, if the target doesn't have a 387, or we've specified
3566      software floating point, don't use 387 inline intrinsics.  */
3567   if (!TARGET_80387)
3568     target_flags |= MASK_NO_FANCY_MATH_387;
3569 
3570   /* Turn on MMX builtins for -msse.  */
3571   if (TARGET_SSE)
3572     {
3573       ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3574       x86_prefetch_sse = true;
3575     }
3576 
3577   /* Turn on popcnt instruction for -msse4.2 or -mabm.  */
3578   if (TARGET_SSE4_2 || TARGET_ABM)
3579     ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3580 
3581   /* Turn on lzcnt instruction for -mabm.  */
3582   if (TARGET_ABM)
3583     ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3584 
3585   /* Validate -mpreferred-stack-boundary= value or default it to
3586      PREFERRED_STACK_BOUNDARY_DEFAULT.  */
3587   ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3588   if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3589     {
3590       int min = (TARGET_64BIT ? 4 : 2);
3591       int max = (TARGET_SEH ? 4 : 12);
3592 
3593       if (ix86_preferred_stack_boundary_arg < min
3594 	  || ix86_preferred_stack_boundary_arg > max)
3595 	{
3596 	  if (min == max)
3597 	    error ("-mpreferred-stack-boundary is not supported "
3598 		   "for this target");
3599 	  else
3600 	    error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3601 		   ix86_preferred_stack_boundary_arg, min, max);
3602 	}
3603       else
3604 	ix86_preferred_stack_boundary
3605 	  = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3606     }
3607 
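  /* Editorial note (not part of the original file): the argument is the log2
     of the boundary in bytes, and BITS_PER_UNIT is 8 here, so for example
     -mpreferred-stack-boundary=4 yields

         ix86_preferred_stack_boundary = (1 << 4) * 8;    (128 bits, 16 bytes)

     which matches the usual 16-byte stack alignment of the 64-bit ABI.  */
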
3608   /* Set the default value for -mstackrealign.  */
3609   if (ix86_force_align_arg_pointer == -1)
3610     ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3611 
3612   ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3613 
3614   /* Validate -mincoming-stack-boundary= value or default it to
3615      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
3616   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3617   if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3618     {
3619       if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3620 	  || ix86_incoming_stack_boundary_arg > 12)
3621 	error ("-mincoming-stack-boundary=%d is not between %d and 12",
3622 	       ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3623       else
3624 	{
3625 	  ix86_user_incoming_stack_boundary
3626 	    = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3627 	  ix86_incoming_stack_boundary
3628 	    = ix86_user_incoming_stack_boundary;
3629 	}
3630     }
3631 
3632   /* Accept -msseregparm only if at least SSE support is enabled.  */
3633   if (TARGET_SSEREGPARM
3634       && ! TARGET_SSE)
3635     error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3636 
3637   if (global_options_set.x_ix86_fpmath)
3638     {
3639       if (ix86_fpmath & FPMATH_SSE)
3640 	{
3641 	  if (!TARGET_SSE)
3642 	    {
3643 	      warning (0, "SSE instruction set disabled, using 387 arithmetics");
3644 	      ix86_fpmath = FPMATH_387;
3645 	    }
3646 	  else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3647 	    {
3648 	      warning (0, "387 instruction set disabled, using SSE arithmetics");
3649 	      ix86_fpmath = FPMATH_SSE;
3650 	    }
3651 	}
3652     }
3653   else
3654     ix86_fpmath = TARGET_FPMATH_DEFAULT;
3655 
3656   /* If the i387 is disabled, then do not return values in it. */
3657   if (!TARGET_80387)
3658     target_flags &= ~MASK_FLOAT_RETURNS;
3659 
3660   /* Use an external vectorized library for vectorizing intrinsics.  */
3661   if (global_options_set.x_ix86_veclibabi_type)
3662     switch (ix86_veclibabi_type)
3663       {
3664       case ix86_veclibabi_type_svml:
3665 	ix86_veclib_handler = ix86_veclibabi_svml;
3666 	break;
3667 
3668       case ix86_veclibabi_type_acml:
3669 	ix86_veclib_handler = ix86_veclibabi_acml;
3670 	break;
3671 
3672       default:
3673 	gcc_unreachable ();
3674       }
3675 
3676   if ((!USE_IX86_FRAME_POINTER
3677        || (x86_accumulate_outgoing_args & ix86_tune_mask))
3678       && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3679       && !optimize_size)
3680     target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3681 
3682   /* ??? Unwind info is not correct around the CFG unless either a frame
3683      pointer is present or M_A_O_A is set.  Fixing this requires rewriting
3684      unwind info generation to be aware of the CFG and propagating states
3685      around edges.  */
3686   if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3687        || flag_exceptions || flag_non_call_exceptions)
3688       && flag_omit_frame_pointer
3689       && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3690     {
3691       if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3692 	warning (0, "unwind tables currently require either a frame pointer "
3693 		 "or %saccumulate-outgoing-args%s for correctness",
3694 		 prefix, suffix);
3695       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3696     }
3697 
3698   /* If stack probes are required, the space used for large function
3699      arguments on the stack must also be probed, so enable
3700      -maccumulate-outgoing-args so this happens in the prologue.  */
3701   if (TARGET_STACK_PROBE
3702       && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3703     {
3704       if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3705 	warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3706 		 "for correctness", prefix, suffix);
3707       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3708     }
3709 
3710   /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
3711   {
3712     char *p;
3713     ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3714     p = strchr (internal_label_prefix, 'X');
3715     internal_label_prefix_len = p - internal_label_prefix;
3716     *p = '\0';
3717   }
3718 
3719   /* When a scheduling description is not available, disable the scheduler
3720      pass so it won't slow down compilation and make x87 code slower.  */
3721   if (!TARGET_SCHEDULE)
3722     flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3723 
3724   maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3725 			 ix86_cost->simultaneous_prefetches,
3726 			 global_options.x_param_values,
3727 			 global_options_set.x_param_values);
3728   maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3729 			 global_options.x_param_values,
3730 			 global_options_set.x_param_values);
3731   maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3732 			 global_options.x_param_values,
3733 			 global_options_set.x_param_values);
3734   maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3735 			 global_options.x_param_values,
3736 			 global_options_set.x_param_values);
3737 
3738   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3739   if (flag_prefetch_loop_arrays < 0
3740       && HAVE_prefetch
3741       && optimize >= 3
3742       && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3743     flag_prefetch_loop_arrays = 1;
3744 
3745   /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3746      can be optimized to ap = __builtin_next_arg (0).  */
3747   if (!TARGET_64BIT && !flag_split_stack)
3748     targetm.expand_builtin_va_start = NULL;
3749 
3750   if (TARGET_64BIT)
3751     {
3752       ix86_gen_leave = gen_leave_rex64;
3753       ix86_gen_add3 = gen_adddi3;
3754       ix86_gen_sub3 = gen_subdi3;
3755       ix86_gen_sub3_carry = gen_subdi3_carry;
3756       ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3757       ix86_gen_monitor = gen_sse3_monitor64;
3758       ix86_gen_andsp = gen_anddi3;
3759       ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760       ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761       ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762     }
3763   else
3764     {
3765       ix86_gen_leave = gen_leave;
3766       ix86_gen_add3 = gen_addsi3;
3767       ix86_gen_sub3 = gen_subsi3;
3768       ix86_gen_sub3_carry = gen_subsi3_carry;
3769       ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770       ix86_gen_monitor = gen_sse3_monitor;
3771       ix86_gen_andsp = gen_andsi3;
3772       ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3773       ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3774       ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3775     }
3776 
3777 #ifdef USE_IX86_CLD
3778   /* Use -mcld by default for 32-bit code if configured with --enable-cld.  */
3779   if (!TARGET_64BIT)
3780     target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3782 
3783   if (!TARGET_64BIT && flag_pic)
3784     {
3785       if (flag_fentry > 0)
3786         sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 	       "with -fpic");
3788       flag_fentry = 0;
3789     }
3790   else if (TARGET_SEH)
3791     {
3792       if (flag_fentry == 0)
3793 	sorry ("-mno-fentry isn%'t compatible with SEH");
3794       flag_fentry = 1;
3795     }
3796   else if (flag_fentry < 0)
3797    {
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799      flag_fentry = 1;
3800 #else
3801      flag_fentry = 0;
3802 #endif
3803    }
3804 
3805   if (TARGET_AVX)
3806     {
3807       /* When not optimizing for size, enable the vzeroupper optimization for
3808 	 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
3809 	 AVX unaligned loads/stores.  */
3810       if (!optimize_size)
3811 	{
3812 	  if (flag_expensive_optimizations
3813 	      && !(target_flags_explicit & MASK_VZEROUPPER))
3814 	    target_flags |= MASK_VZEROUPPER;
3815 	  if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 	      && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 	  if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 	      && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 	  /* Enable 128-bit AVX instruction generation for the auto-vectorizer.  */
3822 	  if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3823 	    target_flags |= MASK_PREFER_AVX128;
3824 	}
3825     }
3826   else
3827     {
3828       /* Disable vzeroupper pass if TARGET_AVX is disabled.  */
3829       target_flags &= ~MASK_VZEROUPPER;
3830     }
3831 
3832   if (ix86_recip_name)
3833     {
3834       char *p = ASTRDUP (ix86_recip_name);
3835       char *q;
3836       unsigned int mask, i;
3837       bool invert;
3838 
3839       while ((q = strtok (p, ",")) != NULL)
3840 	{
3841 	  p = NULL;
3842 	  if (*q == '!')
3843 	    {
3844 	      invert = true;
3845 	      q++;
3846 	    }
3847 	  else
3848 	    invert = false;
3849 
3850 	  if (!strcmp (q, "default"))
3851 	    mask = RECIP_MASK_ALL;
3852 	  else
3853 	    {
3854 	      for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3855 		if (!strcmp (q, recip_options[i].string))
3856 		  {
3857 		    mask = recip_options[i].mask;
3858 		    break;
3859 		  }
3860 
3861 	      if (i == ARRAY_SIZE (recip_options))
3862 		{
3863 		  error ("unknown option for -mrecip=%s", q);
3864 		  invert = false;
3865 		  mask = RECIP_MASK_NONE;
3866 		}
3867 	    }
3868 
3869 	  recip_mask_explicit |= mask;
3870 	  if (invert)
3871 	    recip_mask &= ~mask;
3872 	  else
3873 	    recip_mask |= mask;
3874 	}
3875     }
3876 
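  /* Editorial note (not part of the original file): the loop above accepts a
     comma-separated list in which a leading '!' clears bits instead of
     setting them, so for example

         -mrecip=all,!sqrt,!vec-sqrt

     first sets RECIP_MASK_ALL and then clears RECIP_MASK_SQRT and
     RECIP_MASK_VEC_SQRT in recip_mask, while recording all three masks in
     recip_mask_explicit.  */
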
3877   if (TARGET_RECIP)
3878     recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3879   else if (target_flags_explicit & MASK_RECIP)
3880     recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3881 
3882   /* Save the initial options in case the user does function specific
3883   /* Save the initial options in case the user specifies function-specific
3884      options later.  */
3885     target_option_default_node = target_option_current_node
3886       = build_target_option_node ();
3887 }
3888 
3889 /* Return TRUE if VAL is passed in a register in a 256bit AVX mode.  */
3890 
3891 static bool
3892 function_pass_avx256_p (const_rtx val)
3893 {
3894   if (!val)
3895     return false;
3896 
3897   if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3898     return true;
3899 
3900   if (GET_CODE (val) == PARALLEL)
3901     {
3902       int i;
3903       rtx r;
3904 
3905       for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3906 	{
3907 	  r = XVECEXP (val, 0, i);
3908 	  if (GET_CODE (r) == EXPR_LIST
3909 	      && XEXP (r, 0)
3910 	      && REG_P (XEXP (r, 0))
3911 	      && (GET_MODE (XEXP (r, 0)) == OImode
3912 		  || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3913 	    return true;
3914 	}
3915     }
3916 
3917   return false;
3918 }
3919 
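/* Editorial note (not part of the original file): a value satisfies the
   predicate above either when it is a single hard register in a 256-bit
   vector mode (for example an __m256 value in V8SFmode) or when it is a
   PARALLEL whose EXPR_LIST entries contain such a register, or one in
   OImode.  */
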
3920 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
3921 
3922 static void
3923 ix86_option_override (void)
3924 {
3925   ix86_option_override_internal (true);
3926 }
3927 
3928 /* Update register usage after having seen the compiler flags.  */
3929 
3930 static void
3931 ix86_conditional_register_usage (void)
3932 {
3933   int i;
3934   unsigned int j;
3935 
3936   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3937     {
3938       if (fixed_regs[i] > 1)
3939 	fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3940       if (call_used_regs[i] > 1)
3941 	call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3942     }
3943 
3944   /* The PIC register, if it exists, is fixed.  */
3945   j = PIC_OFFSET_TABLE_REGNUM;
3946   if (j != INVALID_REGNUM)
3947     fixed_regs[j] = call_used_regs[j] = 1;
3948 
3949   /* The 64-bit MS_ABI changes the set of call-used registers.  */
3950   if (TARGET_64BIT_MS_ABI)
3951     {
3952       call_used_regs[SI_REG] = 0;
3953       call_used_regs[DI_REG] = 0;
3954       call_used_regs[XMM6_REG] = 0;
3955       call_used_regs[XMM7_REG] = 0;
3956       for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 	call_used_regs[i] = 0;
3958     }
3959 
3960   /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3961      other call-clobbered regs for 64-bit.  */
3962   if (TARGET_64BIT)
3963     {
3964       CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3965 
3966       for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3967 	if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3968 	    && call_used_regs[i])
3969 	  SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3970     }
3971 
3972   /* If MMX is disabled, squash the registers.  */
3973   if (! TARGET_MMX)
3974     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975       if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3976 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3977 
3978   /* If SSE is disabled, squash the registers.  */
3979   if (! TARGET_SSE)
3980     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3981       if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3982 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983 
3984   /* If the FPU is disabled, squash the registers.  */
3985   if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3986     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3987       if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3988 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3989 
3990   /* If 32-bit, squash the 64-bit registers.  */
3991   if (! TARGET_64BIT)
3992     {
3993       for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3994 	reg_names[i] = "";
3995       for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3996 	reg_names[i] = "";
3997     }
3998 }
3999 
4000 
4001 /* Save the current options */
4002 
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006   ptr->arch = ix86_arch;
4007   ptr->schedule = ix86_schedule;
4008   ptr->tune = ix86_tune;
4009   ptr->branch_cost = ix86_branch_cost;
4010   ptr->tune_defaulted = ix86_tune_defaulted;
4011   ptr->arch_specified = ix86_arch_specified;
4012   ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013   ptr->ix86_target_flags_explicit = target_flags_explicit;
4014   ptr->x_recip_mask_explicit = recip_mask_explicit;
4015 
4016   /* The fields are char but the variables are not; make sure the
4017      values fit in the fields.  */
4018   gcc_assert (ptr->arch == ix86_arch);
4019   gcc_assert (ptr->schedule == ix86_schedule);
4020   gcc_assert (ptr->tune == ix86_tune);
4021   gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023 
4024 /* Restore the current options */
4025 
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029   enum processor_type old_tune = ix86_tune;
4030   enum processor_type old_arch = ix86_arch;
4031   unsigned int ix86_arch_mask, ix86_tune_mask;
4032   int i;
4033 
4034   ix86_arch = (enum processor_type) ptr->arch;
4035   ix86_schedule = (enum attr_cpu) ptr->schedule;
4036   ix86_tune = (enum processor_type) ptr->tune;
4037   ix86_branch_cost = ptr->branch_cost;
4038   ix86_tune_defaulted = ptr->tune_defaulted;
4039   ix86_arch_specified = ptr->arch_specified;
4040   ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041   target_flags_explicit = ptr->ix86_target_flags_explicit;
4042   recip_mask_explicit = ptr->x_recip_mask_explicit;
4043 
4044   /* Recreate the arch feature tests if the arch changed */
4045   if (old_arch != ix86_arch)
4046     {
4047       ix86_arch_mask = 1u << ix86_arch;
4048       for (i = 0; i < X86_ARCH_LAST; ++i)
4049 	ix86_arch_features[i]
4050 	  = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051     }
4052 
4053   /* Recreate the tune optimization tests */
4054   if (old_tune != ix86_tune)
4055     {
4056       ix86_tune_mask = 1u << ix86_tune;
4057       for (i = 0; i < X86_TUNE_LAST; ++i)
4058 	ix86_tune_features[i]
4059 	  = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060     }
4061 }
4062 
4063 /* Print the current options */
4064 
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 			      struct cl_target_option *ptr)
4068 {
4069   char *target_string
4070     = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 			  NULL, NULL, ptr->x_ix86_fpmath, false);
4072 
4073   fprintf (file, "%*sarch = %d (%s)\n",
4074 	   indent, "",
4075 	   ptr->arch,
4076 	   ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 	    ? cpu_names[ptr->arch]
4078 	    : "<unknown>"));
4079 
4080   fprintf (file, "%*stune = %d (%s)\n",
4081 	   indent, "",
4082 	   ptr->tune,
4083 	   ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 	    ? cpu_names[ptr->tune]
4085 	    : "<unknown>"));
4086 
4087   fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088 
4089   if (target_string)
4090     {
4091       fprintf (file, "%*s%s\n", indent, "", target_string);
4092       free (target_string);
4093     }
4094 }
4095 
4096 
4097 /* Inner function to process the attribute((target(...))): take an argument
4098    and set the current options from it.  If we have a list, recursively go
4099    over the list.  */
4100 
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 				     struct gcc_options *enum_opts_set)
4104 {
4105   char *next_optstr;
4106   bool ret = true;
4107 
4108 #define IX86_ATTR_ISA(S,O)   { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O)   { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
4113 
4114   enum ix86_opt_type
4115   {
4116     ix86_opt_unknown,
4117     ix86_opt_yes,
4118     ix86_opt_no,
4119     ix86_opt_str,
4120     ix86_opt_enum,
4121     ix86_opt_isa
4122   };
4123 
4124   static const struct
4125   {
4126     const char *string;
4127     size_t len;
4128     enum ix86_opt_type type;
4129     int opt;
4130     int mask;
4131   } attrs[] = {
4132     /* isa options */
4133     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
4134     IX86_ATTR_ISA ("abm",	OPT_mabm),
4135     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
4136     IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
4137     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
4138     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
4139     IX86_ATTR_ISA ("aes",	OPT_maes),
4140     IX86_ATTR_ISA ("avx",	OPT_mavx),
4141     IX86_ATTR_ISA ("avx2",	OPT_mavx2),
4142     IX86_ATTR_ISA ("mmx",	OPT_mmmx),
4143     IX86_ATTR_ISA ("pclmul",	OPT_mpclmul),
4144     IX86_ATTR_ISA ("popcnt",	OPT_mpopcnt),
4145     IX86_ATTR_ISA ("sse",	OPT_msse),
4146     IX86_ATTR_ISA ("sse2",	OPT_msse2),
4147     IX86_ATTR_ISA ("sse3",	OPT_msse3),
4148     IX86_ATTR_ISA ("sse4",	OPT_msse4),
4149     IX86_ATTR_ISA ("sse4.1",	OPT_msse4_1),
4150     IX86_ATTR_ISA ("sse4.2",	OPT_msse4_2),
4151     IX86_ATTR_ISA ("sse4a",	OPT_msse4a),
4152     IX86_ATTR_ISA ("ssse3",	OPT_mssse3),
4153     IX86_ATTR_ISA ("fma4",	OPT_mfma4),
4154     IX86_ATTR_ISA ("fma",	OPT_mfma),
4155     IX86_ATTR_ISA ("xop",	OPT_mxop),
4156     IX86_ATTR_ISA ("lwp",	OPT_mlwp),
4157     IX86_ATTR_ISA ("fsgsbase",	OPT_mfsgsbase),
4158     IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
4159     IX86_ATTR_ISA ("f16c",	OPT_mf16c),
4160 
4161     /* enum options */
4162     IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
4163 
4164     /* string options */
4165     IX86_ATTR_STR ("arch=",	IX86_FUNCTION_SPECIFIC_ARCH),
4166     IX86_ATTR_STR ("tune=",	IX86_FUNCTION_SPECIFIC_TUNE),
4167 
4168     /* flag options */
4169     IX86_ATTR_YES ("cld",
4170 		   OPT_mcld,
4171 		   MASK_CLD),
4172 
4173     IX86_ATTR_NO ("fancy-math-387",
4174 		  OPT_mfancy_math_387,
4175 		  MASK_NO_FANCY_MATH_387),
4176 
4177     IX86_ATTR_YES ("ieee-fp",
4178 		   OPT_mieee_fp,
4179 		   MASK_IEEE_FP),
4180 
4181     IX86_ATTR_YES ("inline-all-stringops",
4182 		   OPT_minline_all_stringops,
4183 		   MASK_INLINE_ALL_STRINGOPS),
4184 
4185     IX86_ATTR_YES ("inline-stringops-dynamically",
4186 		   OPT_minline_stringops_dynamically,
4187 		   MASK_INLINE_STRINGOPS_DYNAMICALLY),
4188 
4189     IX86_ATTR_NO ("align-stringops",
4190 		  OPT_mno_align_stringops,
4191 		  MASK_NO_ALIGN_STRINGOPS),
4192 
4193     IX86_ATTR_YES ("recip",
4194 		   OPT_mrecip,
4195 		   MASK_RECIP),
4196 
4197   };
4198 
4199   /* If this is a list, recurse to get the options.  */
4200   if (TREE_CODE (args) == TREE_LIST)
4201     {
4202       bool ret = true;
4203 
4204       for (; args; args = TREE_CHAIN (args))
4205 	if (TREE_VALUE (args)
4206 	    && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4207 						     p_strings, enum_opts_set))
4208 	  ret = false;
4209 
4210       return ret;
4211     }
4212 
4213   else if (TREE_CODE (args) != STRING_CST)
4214     gcc_unreachable ();
4215 
4216   /* Handle multiple arguments separated by commas.  */
4217   next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4218 
4219   while (next_optstr && *next_optstr != '\0')
4220     {
4221       char *p = next_optstr;
4222       char *orig_p = p;
4223       char *comma = strchr (next_optstr, ',');
4224       const char *opt_string;
4225       size_t len, opt_len;
4226       int opt;
4227       bool opt_set_p;
4228       char ch;
4229       unsigned i;
4230       enum ix86_opt_type type = ix86_opt_unknown;
4231       int mask = 0;
4232 
4233       if (comma)
4234 	{
4235 	  *comma = '\0';
4236 	  len = comma - next_optstr;
4237 	  next_optstr = comma + 1;
4238 	}
4239       else
4240 	{
4241 	  len = strlen (p);
4242 	  next_optstr = NULL;
4243 	}
4244 
4245       /* Recognize no-xxx.  */
4246       if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4247 	{
4248 	  opt_set_p = false;
4249 	  p += 3;
4250 	  len -= 3;
4251 	}
4252       else
4253 	opt_set_p = true;
4254 
4255       /* Find the option.  */
4256       ch = *p;
4257       opt = N_OPTS;
4258       for (i = 0; i < ARRAY_SIZE (attrs); i++)
4259 	{
4260 	  type = attrs[i].type;
4261 	  opt_len = attrs[i].len;
4262 	  if (ch == attrs[i].string[0]
4263 	      && ((type != ix86_opt_str && type != ix86_opt_enum)
4264 		  ? len == opt_len
4265 		  : len > opt_len)
4266 	      && memcmp (p, attrs[i].string, opt_len) == 0)
4267 	    {
4268 	      opt = attrs[i].opt;
4269 	      mask = attrs[i].mask;
4270 	      opt_string = attrs[i].string;
4271 	      break;
4272 	    }
4273 	}
4274 
4275       /* Process the option.  */
4276       if (opt == N_OPTS)
4277 	{
4278 	  error ("attribute(target(\"%s\")) is unknown", orig_p);
4279 	  ret = false;
4280 	}
4281 
4282       else if (type == ix86_opt_isa)
4283 	{
4284 	  struct cl_decoded_option decoded;
4285 
4286 	  generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4287 	  ix86_handle_option (&global_options, &global_options_set,
4288 			      &decoded, input_location);
4289 	}
4290 
4291       else if (type == ix86_opt_yes || type == ix86_opt_no)
4292 	{
4293 	  if (type == ix86_opt_no)
4294 	    opt_set_p = !opt_set_p;
4295 
4296 	  if (opt_set_p)
4297 	    target_flags |= mask;
4298 	  else
4299 	    target_flags &= ~mask;
4300 	}
4301 
4302       else if (type == ix86_opt_str)
4303 	{
4304 	  if (p_strings[opt])
4305 	    {
4306 	      error ("option(\"%s\") was already specified", opt_string);
4307 	      ret = false;
4308 	    }
4309 	  else
4310 	    p_strings[opt] = xstrdup (p + opt_len);
4311 	}
4312 
4313       else if (type == ix86_opt_enum)
4314 	{
4315 	  bool arg_ok;
4316 	  int value;
4317 
4318 	  arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4319 	  if (arg_ok)
4320 	    set_option (&global_options, enum_opts_set, opt, value,
4321 			p + opt_len, DK_UNSPECIFIED, input_location,
4322 			global_dc);
4323 	  else
4324 	    {
4325 	      error ("attribute(target(\"%s\")) is unknown", orig_p);
4326 	      ret = false;
4327 	    }
4328 	}
4329 
4330       else
4331 	gcc_unreachable ();
4332     }
4333 
4334   return ret;
4335 }
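
/* For illustration only (a sketch of user-level usage, not code from
   this file): the strings parsed above come from declarations such as

     int foo (int) __attribute__ ((target ("arch=core2,sse4.2,no-recip")));

   Each comma-separated item is looked up in the attrs[] table, and a
   leading "no-" inverts the option, so "no-recip" clears MASK_RECIP just
   as -mno-recip would on the command line.  */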
4336 
4337 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL.  */
4338 
4339 tree
4340 ix86_valid_target_attribute_tree (tree args)
4341 {
4342   const char *orig_arch_string = ix86_arch_string;
4343   const char *orig_tune_string = ix86_tune_string;
4344   enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4345   int orig_tune_defaulted = ix86_tune_defaulted;
4346   int orig_arch_specified = ix86_arch_specified;
4347   char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4348   tree t = NULL_TREE;
4349   int i;
4350   struct cl_target_option *def
4351     = TREE_TARGET_OPTION (target_option_default_node);
4352   struct gcc_options enum_opts_set;
4353 
4354   memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4355 
4356   /* Process each of the options on the chain.  */
4357   if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4358 					     &enum_opts_set))
4359     return NULL_TREE;
4360 
4361   /* If the changed options are different from the default, rerun
4362      ix86_option_override_internal, and then save the options away.
4363      The string options are attribute options, and will be undone
4364      when we copy the save structure.  */
4365   if (ix86_isa_flags != def->x_ix86_isa_flags
4366       || target_flags != def->x_target_flags
4367       || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4368       || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4369       || enum_opts_set.x_ix86_fpmath)
4370     {
4371       /* If we are using the default tune= or arch=, undo the string assigned,
4372 	 and use the default.  */
4373       if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4374 	ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4375       else if (!orig_arch_specified)
4376 	ix86_arch_string = NULL;
4377 
4378       if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4379 	ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4380       else if (orig_tune_defaulted)
4381 	ix86_tune_string = NULL;
4382 
4383       /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
4384       if (enum_opts_set.x_ix86_fpmath)
4385 	global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4386       else if (!TARGET_64BIT && TARGET_SSE)
4387 	{
4388 	  ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4389 	  global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4390 	}
4391 
4392       /* Do any overrides, such as arch=xxx, or tune=xxx support.  */
4393       ix86_option_override_internal (false);
4394 
4395       /* Add any builtin functions with the new isa if any.  */
4396       ix86_add_new_builtins (ix86_isa_flags);
4397 
4398       /* Save the current options unless we are validating options for
4399 	 #pragma.  */
4400       t = build_target_option_node ();
4401 
4402       ix86_arch_string = orig_arch_string;
4403       ix86_tune_string = orig_tune_string;
4404       global_options_set.x_ix86_fpmath = orig_fpmath_set;
4405 
4406       /* Free up memory allocated to hold the strings */
4407       for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4408 	free (option_strings[i]);
4409     }
4410 
4411   return t;
4412 }
4413 
4414 /* Hook to validate attribute((target("string"))).  */
4415 
4416 static bool
4417 ix86_valid_target_attribute_p (tree fndecl,
4418 			       tree ARG_UNUSED (name),
4419 			       tree args,
4420 			       int ARG_UNUSED (flags))
4421 {
4422   struct cl_target_option cur_target;
4423   bool ret = true;
4424   tree old_optimize = build_optimization_node ();
4425   tree new_target, new_optimize;
4426   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4427 
4428   /* If the function changed the optimization levels as well as setting target
4429      options, start with the optimizations specified.  */
4430   if (func_optimize && func_optimize != old_optimize)
4431     cl_optimization_restore (&global_options,
4432 			     TREE_OPTIMIZATION (func_optimize));
4433 
4434   /* The target attributes may also change some optimization flags, so update
4435      the optimization options if necessary.  */
4436   cl_target_option_save (&cur_target, &global_options);
4437   new_target = ix86_valid_target_attribute_tree (args);
4438   new_optimize = build_optimization_node ();
4439 
4440   if (!new_target)
4441     ret = false;
4442 
4443   else if (fndecl)
4444     {
4445       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4446 
4447       if (old_optimize != new_optimize)
4448 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4449     }
4450 
4451   cl_target_option_restore (&global_options, &cur_target);
4452 
4453   if (old_optimize != new_optimize)
4454     cl_optimization_restore (&global_options,
4455 			     TREE_OPTIMIZATION (old_optimize));
4456 
4457   return ret;
4458 }
4459 
4460 
4461 /* Hook to determine if one function can safely inline another.  */
4462 
4463 static bool
4464 ix86_can_inline_p (tree caller, tree callee)
4465 {
4466   bool ret = false;
4467   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4468   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4469 
4470   /* If callee has no option attributes, then it is ok to inline.  */
4471   if (!callee_tree)
4472     ret = true;
4473 
4474   /* If caller has no option attributes, but callee does then it is not ok to
4475      inline.  */
4476   else if (!caller_tree)
4477     ret = false;
4478 
4479   else
4480     {
4481       struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4482       struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4483 
4484       /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4485 	 function can inline an SSE2 function but an SSE2 function can't inline
4486 	 an SSE4 function.  */
4487       if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4488 	  != callee_opts->x_ix86_isa_flags)
4489 	ret = false;
4490 
4491       /* See if we have the same non-isa options.  */
4492       else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4493 	ret = false;
4494 
4495       /* See if arch, tune, etc. are the same.  */
4496       else if (caller_opts->arch != callee_opts->arch)
4497 	ret = false;
4498 
4499       else if (caller_opts->tune != callee_opts->tune)
4500 	ret = false;
4501 
4502       else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4503 	ret = false;
4504 
4505       else if (caller_opts->branch_cost != callee_opts->branch_cost)
4506 	ret = false;
4507 
4508       else
4509 	ret = true;
4510     }
4511 
4512   return ret;
4513 }
4514 
4515 
4516 /* Remember the last target of ix86_set_current_function.  */
4517 static GTY(()) tree ix86_previous_fndecl;
4518 
4519 /* Establish appropriate back-end context for processing the function
4520    FNDECL.  The argument might be NULL to indicate processing at top
4521    level, outside of any function scope.  */
4522 static void
4523 ix86_set_current_function (tree fndecl)
4524 {
4525   /* Only change the context if the function changes.  This hook is called
4526      several times in the course of compiling a function, and we don't want to
4527      slow things down too much or call target_reinit when it isn't safe.  */
4528   if (fndecl && fndecl != ix86_previous_fndecl)
4529     {
4530       tree old_tree = (ix86_previous_fndecl
4531 		       ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4532 		       : NULL_TREE);
4533 
4534       tree new_tree = (fndecl
4535 		       ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4536 		       : NULL_TREE);
4537 
4538       ix86_previous_fndecl = fndecl;
4539       if (old_tree == new_tree)
4540 	;
4541 
4542       else if (new_tree)
4543 	{
4544 	  cl_target_option_restore (&global_options,
4545 				    TREE_TARGET_OPTION (new_tree));
4546 	  target_reinit ();
4547 	}
4548 
4549       else if (old_tree)
4550 	{
4551 	  struct cl_target_option *def
4552 	    = TREE_TARGET_OPTION (target_option_current_node);
4553 
4554 	  cl_target_option_restore (&global_options, def);
4555 	  target_reinit ();
4556 	}
4557     }
4558 }
4559 
4560 
4561 /* Return true if this goes in large data/bss.  */
4562 
4563 static bool
4564 ix86_in_large_data_p (tree exp)
4565 {
4566   if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4567     return false;
4568 
4569   /* Functions are never large data.  */
4570   if (TREE_CODE (exp) == FUNCTION_DECL)
4571     return false;
4572 
4573   if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4574     {
4575       const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4576       if (strcmp (section, ".ldata") == 0
4577 	  || strcmp (section, ".lbss") == 0)
4578 	return true;
4579       return false;
4580     }
4581   else
4582     {
4583       HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4584 
4585       /* If this is an incomplete type with size 0, then we can't put it
4586 	 in data because it might be too big when completed.  */
4587       if (!size || size > ix86_section_threshold)
4588 	return true;
4589     }
4590 
4591   return false;
4592 }
4593 
4594 /* Switch to the appropriate section for output of DECL.
4595    DECL is either a `VAR_DECL' node or a constant of some sort.
4596    RELOC indicates whether forming the initial value of DECL requires
4597    link-time relocations.  */
4598 
4599 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4600 	ATTRIBUTE_UNUSED;
4601 
4602 static section *
4603 x86_64_elf_select_section (tree decl, int reloc,
4604 			   unsigned HOST_WIDE_INT align)
4605 {
4606   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4607       && ix86_in_large_data_p (decl))
4608     {
4609       const char *sname = NULL;
4610       unsigned int flags = SECTION_WRITE;
4611       switch (categorize_decl_for_section (decl, reloc))
4612 	{
4613 	case SECCAT_DATA:
4614 	  sname = ".ldata";
4615 	  break;
4616 	case SECCAT_DATA_REL:
4617 	  sname = ".ldata.rel";
4618 	  break;
4619 	case SECCAT_DATA_REL_LOCAL:
4620 	  sname = ".ldata.rel.local";
4621 	  break;
4622 	case SECCAT_DATA_REL_RO:
4623 	  sname = ".ldata.rel.ro";
4624 	  break;
4625 	case SECCAT_DATA_REL_RO_LOCAL:
4626 	  sname = ".ldata.rel.ro.local";
4627 	  break;
4628 	case SECCAT_BSS:
4629 	  sname = ".lbss";
4630 	  flags |= SECTION_BSS;
4631 	  break;
4632 	case SECCAT_RODATA:
4633 	case SECCAT_RODATA_MERGE_STR:
4634 	case SECCAT_RODATA_MERGE_STR_INIT:
4635 	case SECCAT_RODATA_MERGE_CONST:
4636 	  sname = ".lrodata";
4637 	  flags = 0;
4638 	  break;
4639 	case SECCAT_SRODATA:
4640 	case SECCAT_SDATA:
4641 	case SECCAT_SBSS:
4642 	  gcc_unreachable ();
4643 	case SECCAT_TEXT:
4644 	case SECCAT_TDATA:
4645 	case SECCAT_TBSS:
4646 	  /* We don't split these for the medium model.  Place them into
4647 	     default sections and hope for the best.  */
4648 	  break;
4649 	}
4650       if (sname)
4651 	{
4652 	  /* We might get called with string constants, but get_named_section
4653 	     doesn't like them as they are not DECLs.  Also, we need to set
4654 	     flags in that case.  */
4655 	  if (!DECL_P (decl))
4656 	    return get_section (sname, flags, NULL);
4657 	  return get_named_section (decl, sname, reloc);
4658 	}
4659     }
4660   return default_elf_select_section (decl, reloc, align);
4661 }
4662 
4663 /* Build up a unique section name, expressed as a
4664    STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4665    RELOC indicates whether the initial value of EXP requires
4666    link-time relocations.  */
4667 
4668 static void ATTRIBUTE_UNUSED
4669 x86_64_elf_unique_section (tree decl, int reloc)
4670 {
4671   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4672       && ix86_in_large_data_p (decl))
4673     {
4674       const char *prefix = NULL;
4675       /* We only need to use .gnu.linkonce if we don't have COMDAT groups.  */
4676       bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4677 
4678       switch (categorize_decl_for_section (decl, reloc))
4679 	{
4680 	case SECCAT_DATA:
4681 	case SECCAT_DATA_REL:
4682 	case SECCAT_DATA_REL_LOCAL:
4683 	case SECCAT_DATA_REL_RO:
4684 	case SECCAT_DATA_REL_RO_LOCAL:
4685           prefix = one_only ? ".ld" : ".ldata";
4686 	  break;
4687 	case SECCAT_BSS:
4688           prefix = one_only ? ".lb" : ".lbss";
4689 	  break;
4690 	case SECCAT_RODATA:
4691 	case SECCAT_RODATA_MERGE_STR:
4692 	case SECCAT_RODATA_MERGE_STR_INIT:
4693 	case SECCAT_RODATA_MERGE_CONST:
4694           prefix = one_only ? ".lr" : ".lrodata";
4695 	  break;
4696 	case SECCAT_SRODATA:
4697 	case SECCAT_SDATA:
4698 	case SECCAT_SBSS:
4699 	  gcc_unreachable ();
4700 	case SECCAT_TEXT:
4701 	case SECCAT_TDATA:
4702 	case SECCAT_TBSS:
4703 	  /* We don't split these for the medium model.  Place them into
4704 	     default sections and hope for the best.  */
4705 	  break;
4706 	}
4707       if (prefix)
4708 	{
4709 	  const char *name, *linkonce;
4710 	  char *string;
4711 
4712 	  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4713 	  name = targetm.strip_name_encoding (name);
4714 
4715 	  /* If we're using one_only, then there needs to be a .gnu.linkonce
4716      	     prefix to the section name.  */
4717 	  linkonce = one_only ? ".gnu.linkonce" : "";
4718 
4719 	  string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4720 
4721 	  DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4722 	  return;
4723 	}
4724     }
4725   default_unique_section (decl, reloc);
4726 }
4727 
4728 #ifdef COMMON_ASM_OP
4729 /* This says how to output assembler code to declare an
4730    uninitialized external linkage data object.
4731 
4732    For medium model x86-64 we need to use .largecomm opcode for
4733    For medium-model x86-64 we need to use the .largecomm directive for
4734 void
4735 x86_elf_aligned_common (FILE *file,
4736 			const char *name, unsigned HOST_WIDE_INT size,
4737 			int align)
4738 {
4739   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4740       && size > (unsigned int)ix86_section_threshold)
4741     fputs (".largecomm\t", file);
4742   else
4743     fputs (COMMON_ASM_OP, file);
4744   assemble_name (file, name);
4745   fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4746 	   size, align / BITS_PER_UNIT);
4747 }
4748 #endif
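
/* Example of the resulting assembly (illustrative): with -mcmodel=medium,
   an uninitialized object larger than the -mlarge-data-threshold limit
   might be emitted as

     .largecomm	buf,1048576,32

   instead of the usual .comm directive, so that it is placed with the
   other large data.  */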
4749 
4750 /* Utility function for targets to use in implementing
4751    ASM_OUTPUT_ALIGNED_BSS.  */
4752 
4753 void
4754 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4755 			const char *name, unsigned HOST_WIDE_INT size,
4756 			int align)
4757 {
4758   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4759       && size > (unsigned int)ix86_section_threshold)
4760     switch_to_section (get_named_section (decl, ".lbss", 0));
4761   else
4762     switch_to_section (bss_section);
4763   ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4764 #ifdef ASM_DECLARE_OBJECT_NAME
4765   last_assemble_variable_decl = decl;
4766   ASM_DECLARE_OBJECT_NAME (file, name, decl);
4767 #else
4768   /* The standard thing is just to output a label for the object.  */
4769   ASM_OUTPUT_LABEL (file, name);
4770 #endif /* ASM_DECLARE_OBJECT_NAME */
4771   ASM_OUTPUT_SKIP (file, size ? size : 1);
4772 }
4773 
4774 /* Decide whether we must probe the stack before any space allocation
4775    on this target.  It's essentially TARGET_STACK_PROBE except when
4776    -fstack-check causes the stack to be already probed differently.  */
4777 
4778 bool
4779 ix86_target_stack_probe (void)
4780 {
4781   /* Do not probe the stack twice if static stack checking is enabled.  */
4782   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4783     return false;
4784 
4785   return TARGET_STACK_PROBE;
4786 }
4787 
4788 /* Decide whether we can make a sibling call to a function.  DECL is the
4789    declaration of the function being targeted by the call and EXP is the
4790    CALL_EXPR representing the call.  */
4791 
4792 static bool
4793 ix86_function_ok_for_sibcall (tree decl, tree exp)
4794 {
4795   tree type, decl_or_type;
4796   rtx a, b;
4797 
4798   /* If we are generating position-independent code, we cannot sibcall
4799      optimize any indirect call, or a direct call to a global function,
4800      as the PLT requires %ebx be live. (Darwin does not have a PLT.)  */
4801   if (!TARGET_MACHO
4802       && !TARGET_64BIT
4803       && flag_pic
4804       && (!decl || !targetm.binds_local_p (decl)))
4805     return false;
4806 
4807   /* If we need to align the outgoing stack, then sibcalling would
4808      unalign the stack, which may break the called function.  */
4809   if (ix86_minimum_incoming_stack_boundary (true)
4810       < PREFERRED_STACK_BOUNDARY)
4811     return false;
4812 
4813   if (decl)
4814     {
4815       decl_or_type = decl;
4816       type = TREE_TYPE (decl);
4817     }
4818   else
4819     {
4820       /* We're looking at the CALL_EXPR, we need the type of the function.  */
4821       type = CALL_EXPR_FN (exp);		/* pointer expression */
4822       type = TREE_TYPE (type);			/* pointer type */
4823       type = TREE_TYPE (type);			/* function type */
4824       decl_or_type = type;
4825     }
4826 
4827   /* Check that the return value locations are the same.  For example,
4828      if we are returning floats on the 80387 register stack, we cannot
4829      make a sibcall from a function that doesn't return a float to a
4830      function that does or, conversely, from a function that does return
4831      a float to a function that doesn't; the necessary stack adjustment
4832      would not be executed.  This is also the place we notice
4833      differences in the return value ABI.  Note that it is ok for one
4834      of the functions to have void return type as long as the return
4835      value of the other is passed in a register.  */
4836   a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4837   b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4838 			   cfun->decl, false);
4839   if (STACK_REG_P (a) || STACK_REG_P (b))
4840     {
4841       if (!rtx_equal_p (a, b))
4842 	return false;
4843     }
4844   else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4845     {
4846       /* Disable sibcall if we need to generate vzeroupper after
4847 	 callee returns.  */
4848       if (TARGET_VZEROUPPER
4849 	  && cfun->machine->callee_return_avx256_p
4850 	  && !cfun->machine->caller_return_avx256_p)
4851 	return false;
4852     }
4853   else if (!rtx_equal_p (a, b))
4854     return false;
4855 
4856   if (TARGET_64BIT)
4857     {
4858       /* The SYSV ABI has more call-clobbered registers;
4859 	 disallow sibcalls from MS to SYSV.  */
4860       if (cfun->machine->call_abi == MS_ABI
4861 	  && ix86_function_type_abi (type) == SYSV_ABI)
4862 	return false;
4863     }
4864   else
4865     {
4866       /* If this call is indirect, we'll need to be able to use a
4867 	 call-clobbered register for the address of the target function.
4868 	 Make sure that all such registers are not used for passing
4869 	 parameters.  Note that DLLIMPORT functions are indirect.  */
4870       if (!decl
4871 	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4872 	{
4873 	  if (ix86_function_regparm (type, NULL) >= 3)
4874 	    {
4875 	      /* ??? Need to count the actual number of registers to be used,
4876 		 not the possible number of registers.  Fix later.  */
4877 	      return false;
4878 	    }
4879 	}
4880     }
4881 
4882   /* Otherwise okay.  That also includes certain types of indirect calls.  */
4883   return true;
4884 }
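
/* Illustrative case for the PIC restriction above: with -m32 -fpic,

     extern int bar (int);
     int foo (int x) { return bar (x); }

   the call to bar goes through the PLT and needs %ebx, so it is not
   turned into a sibling call; if bar binds locally (e.g. it is static),
   the tail call can still be made.  */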
4885 
4886 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4887    and "sseregparm" calling convention attributes;
4888    arguments as in struct attribute_spec.handler.  */
4889 
4890 static tree
4891 ix86_handle_cconv_attribute (tree *node, tree name,
4892 				   tree args,
4893 				   int flags ATTRIBUTE_UNUSED,
4894 				   bool *no_add_attrs)
4895 {
4896   if (TREE_CODE (*node) != FUNCTION_TYPE
4897       && TREE_CODE (*node) != METHOD_TYPE
4898       && TREE_CODE (*node) != FIELD_DECL
4899       && TREE_CODE (*node) != TYPE_DECL)
4900     {
4901       warning (OPT_Wattributes, "%qE attribute only applies to functions",
4902 	       name);
4903       *no_add_attrs = true;
4904       return NULL_TREE;
4905     }
4906 
4907   /* Can combine regparm with all attributes but fastcall and thiscall.  */
4908   if (is_attribute_p ("regparm", name))
4909     {
4910       tree cst;
4911 
4912       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913         {
4914 	  error ("fastcall and regparm attributes are not compatible");
4915 	}
4916 
4917       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4918 	{
4919 	  error ("regparm and thiscall attributes are not compatible");
4920 	}
4921 
4922       cst = TREE_VALUE (args);
4923       if (TREE_CODE (cst) != INTEGER_CST)
4924 	{
4925 	  warning (OPT_Wattributes,
4926 		   "%qE attribute requires an integer constant argument",
4927 		   name);
4928 	  *no_add_attrs = true;
4929 	}
4930       else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4931 	{
4932 	  warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4933 		   name, REGPARM_MAX);
4934 	  *no_add_attrs = true;
4935 	}
4936 
4937       return NULL_TREE;
4938     }
4939 
4940   if (TARGET_64BIT)
4941     {
4942       /* Do not warn when emulating the MS ABI.  */
4943       if ((TREE_CODE (*node) != FUNCTION_TYPE
4944 	   && TREE_CODE (*node) != METHOD_TYPE)
4945 	  || ix86_function_type_abi (*node) != MS_ABI)
4946 	warning (OPT_Wattributes, "%qE attribute ignored",
4947 	         name);
4948       *no_add_attrs = true;
4949       return NULL_TREE;
4950     }
4951 
4952   /* Can combine fastcall with stdcall (redundant) and sseregparm.  */
4953   if (is_attribute_p ("fastcall", name))
4954     {
4955       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4956         {
4957 	  error ("fastcall and cdecl attributes are not compatible");
4958 	}
4959       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4960         {
4961 	  error ("fastcall and stdcall attributes are not compatible");
4962 	}
4963       if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4964         {
4965 	  error ("fastcall and regparm attributes are not compatible");
4966 	}
4967       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4968 	{
4969 	  error ("fastcall and thiscall attributes are not compatible");
4970 	}
4971     }
4972 
4973   /* Can combine stdcall with fastcall (redundant), regparm and
4974      sseregparm.  */
4975   else if (is_attribute_p ("stdcall", name))
4976     {
4977       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4978         {
4979 	  error ("stdcall and cdecl attributes are not compatible");
4980 	}
4981       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4982         {
4983 	  error ("stdcall and fastcall attributes are not compatible");
4984 	}
4985       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4986 	{
4987 	  error ("stdcall and thiscall attributes are not compatible");
4988 	}
4989     }
4990 
4991   /* Can combine cdecl with regparm and sseregparm.  */
4992   else if (is_attribute_p ("cdecl", name))
4993     {
4994       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4995         {
4996 	  error ("stdcall and cdecl attributes are not compatible");
4997 	}
4998       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4999         {
5000 	  error ("fastcall and cdecl attributes are not compatible");
5001 	}
5002       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5003 	{
5004 	  error ("cdecl and thiscall attributes are not compatible");
5005 	}
5006     }
5007   else if (is_attribute_p ("thiscall", name))
5008     {
5009       if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5010 	warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5011 	         name);
5012       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5013 	{
5014 	  error ("stdcall and thiscall attributes are not compatible");
5015 	}
5016       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5017 	{
5018 	  error ("fastcall and thiscall attributes are not compatible");
5019 	}
5020       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5021 	{
5022 	  error ("cdecl and thiscall attributes are not compatible");
5023 	}
5024     }
5025 
5026   /* Can combine sseregparm with all attributes.  */
5027 
5028   return NULL_TREE;
5029 }
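
/* A brief illustration (not from the original source) of how these checks
   interact in 32-bit user code:

     int __attribute__ ((stdcall, regparm (2))) ok (int, int);
       accepted: stdcall may be combined with regparm

     int __attribute__ ((fastcall, regparm (2))) bad (int, int);
       rejected with "fastcall and regparm attributes are not compatible"

   On 64-bit targets these attributes are simply ignored, as handled
   above.  */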
5030 
5031 /* The transactional memory builtins are implicitly regparm or fastcall
5032    depending on the ABI.  Override the generic do-nothing attribute that
5033    these builtins were declared with, and replace it with one of the two
5034    attributes that we expect elsewhere.  */
5035 
5036 static tree
5037 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5038     				  tree args ATTRIBUTE_UNUSED,
5039 				  int flags ATTRIBUTE_UNUSED,
5040 				  bool *no_add_attrs)
5041 {
5042   tree alt;
5043 
5044   /* In no case do we want to add the placeholder attribute.  */
5045   *no_add_attrs = true;
5046 
5047   /* The 64-bit ABI is unchanged for transactional memory.  */
5048   if (TARGET_64BIT)
5049     return NULL_TREE;
5050 
5051   /* ??? Is there a better way to validate 32-bit Windows?  We have
5052      cfun->machine->call_abi, but that seems to be set only for 64-bit.  */
5053   if (CHECK_STACK_LIMIT > 0)
5054     alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5055   else
5056     {
5057       alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5058       alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5059     }
5060   decl_attributes (node, alt, flags);
5061 
5062   return NULL_TREE;
5063 }
5064 
5065 /* This function determines from TYPE the calling-convention.  */
5066 /* This function determines the calling convention from TYPE.  */
5067 unsigned int
5068 ix86_get_callcvt (const_tree type)
5069 {
5070   unsigned int ret = 0;
5071   bool is_stdarg;
5072   tree attrs;
5073 
5074   if (TARGET_64BIT)
5075     return IX86_CALLCVT_CDECL;
5076 
5077   attrs = TYPE_ATTRIBUTES (type);
5078   if (attrs != NULL_TREE)
5079     {
5080       if (lookup_attribute ("cdecl", attrs))
5081 	ret |= IX86_CALLCVT_CDECL;
5082       else if (lookup_attribute ("stdcall", attrs))
5083 	ret |= IX86_CALLCVT_STDCALL;
5084       else if (lookup_attribute ("fastcall", attrs))
5085 	ret |= IX86_CALLCVT_FASTCALL;
5086       else if (lookup_attribute ("thiscall", attrs))
5087 	ret |= IX86_CALLCVT_THISCALL;
5088 
5089       /* Regparm isn't allowed for thiscall and fastcall.  */
5090       if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5091 	{
5092 	  if (lookup_attribute ("regparm", attrs))
5093 	    ret |= IX86_CALLCVT_REGPARM;
5094 	  if (lookup_attribute ("sseregparm", attrs))
5095 	    ret |= IX86_CALLCVT_SSEREGPARM;
5096 	}
5097 
5098       if (IX86_BASE_CALLCVT(ret) != 0)
5099 	return ret;
5100     }
5101 
5102   is_stdarg = stdarg_p (type);
5103   if (TARGET_RTD && !is_stdarg)
5104     return IX86_CALLCVT_STDCALL | ret;
5105 
5106   if (ret != 0
5107       || is_stdarg
5108       || TREE_CODE (type) != METHOD_TYPE
5109       || ix86_function_type_abi (type) != MS_ABI)
5110     return IX86_CALLCVT_CDECL | ret;
5111 
5112   return IX86_CALLCVT_THISCALL;
5113 }
5114 
5115 /* Return 0 if the attributes for two types are incompatible, 1 if they
5116    are compatible, and 2 if they are nearly compatible (which causes a
5117    warning to be generated).  */
5118 
5119 static int
5120 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5121 {
5122   unsigned int ccvt1, ccvt2;
5123 
5124   if (TREE_CODE (type1) != FUNCTION_TYPE
5125       && TREE_CODE (type1) != METHOD_TYPE)
5126     return 1;
5127 
5128   ccvt1 = ix86_get_callcvt (type1);
5129   ccvt2 = ix86_get_callcvt (type2);
5130   if (ccvt1 != ccvt2)
5131     return 0;
5132   if (ix86_function_regparm (type1, NULL)
5133       != ix86_function_regparm (type2, NULL))
5134     return 0;
5135 
5136   return 1;
5137 }
5138 
5139 /* Return the regparm value for a function with the indicated TYPE and DECL.
5140    DECL may be NULL when calling function indirectly
5141    or considering a libcall.  */
5142 
5143 static int
5144 ix86_function_regparm (const_tree type, const_tree decl)
5145 {
5146   tree attr;
5147   int regparm;
5148   unsigned int ccvt;
5149 
5150   if (TARGET_64BIT)
5151     return (ix86_function_type_abi (type) == SYSV_ABI
5152 	    ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5153   ccvt = ix86_get_callcvt (type);
5154   regparm = ix86_regparm;
5155 
5156   if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5157     {
5158       attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5159       if (attr)
5160 	{
5161 	  regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5162 	  return regparm;
5163 	}
5164     }
5165   else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5166     return 2;
5167   else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5168     return 1;
5169 
5170   /* Use register calling convention for local functions when possible.  */
5171   if (decl
5172       && TREE_CODE (decl) == FUNCTION_DECL
5173       && optimize
5174       && !(profile_flag && !flag_fentry))
5175     {
5176       /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
5177       struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5178       if (i && i->local && i->can_change_signature)
5179 	{
5180 	  int local_regparm, globals = 0, regno;
5181 
5182 	  /* Make sure no regparm register is taken by a
5183 	     fixed register variable.  */
5184 	  for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5185 	    if (fixed_regs[local_regparm])
5186 	      break;
5187 
5188 	  /* We don't want to use regparm(3) for nested functions as
5189 	     these use a static chain pointer in the third argument.  */
5190 	  if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5191 	    local_regparm = 2;
5192 
5193 	  /* In 32-bit mode save a register for the split stack.  */
5194 	  if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5195 	    local_regparm = 2;
5196 
5197 	  /* Each fixed register usage increases register pressure,
5198 	     so fewer registers should be used for argument passing.
5199 	     This functionality can be overridden by an explicit
5200 	     regparm value.  */
5201 	  for (regno = 0; regno <= DI_REG; regno++)
5202 	    if (fixed_regs[regno])
5203 	      globals++;
5204 
5205 	  local_regparm
5206 	    = globals < local_regparm ? local_regparm - globals : 0;
5207 
5208 	  if (local_regparm > regparm)
5209 	    regparm = local_regparm;
5210 	}
5211     }
5212 
5213   return regparm;
5214 }
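
/* For instance (illustrative), in 32-bit code

     int __attribute__ ((regparm (3))) f (int a, int b, int c);

   passes a, b and c in %eax, %edx and %ecx, while fastcall (the value 2
   returned above) uses %ecx and %edx, and thiscall (the value 1)
   reserves %ecx for the this pointer.  */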
5215 
5216 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5217    DFmode (2) arguments in SSE registers for a function with the
5218    indicated TYPE and DECL.  DECL may be NULL when calling function
5219    indirectly or considering a libcall.  Otherwise return 0.  */
5220 
5221 static int
5222 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5223 {
5224   gcc_assert (!TARGET_64BIT);
5225 
5226   /* Use SSE registers to pass SFmode and DFmode arguments if requested
5227      by the sseregparm attribute.  */
5228   if (TARGET_SSEREGPARM
5229       || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5230     {
5231       if (!TARGET_SSE)
5232 	{
5233 	  if (warn)
5234 	    {
5235 	      if (decl)
5236 		error ("calling %qD with attribute sseregparm without "
5237 		       "SSE/SSE2 enabled", decl);
5238 	      else
5239 		error ("calling %qT with attribute sseregparm without "
5240 		       "SSE/SSE2 enabled", type);
5241 	    }
5242 	  return 0;
5243 	}
5244 
5245       return 2;
5246     }
5247 
5248   /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5249      (and DFmode for SSE2) arguments in SSE registers.  */
5250   if (decl && TARGET_SSE_MATH && optimize
5251       && !(profile_flag && !flag_fentry))
5252     {
5253       /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
5254       struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5255       if (i && i->local && i->can_change_signature)
5256 	return TARGET_SSE2 ? 2 : 1;
5257     }
5258 
5259   return 0;
5260 }
5261 
5262 /* Return true if EAX is live at the start of the function.  Used by
5263    ix86_expand_prologue to determine if we need special help before
5264    calling allocate_stack_worker.  */
5265 
5266 static bool
5267 ix86_eax_live_at_start_p (void)
5268 {
5269   /* Cheat.  Don't bother working forward from ix86_function_regparm
5270      to the function type to whether an actual argument is located in
5271      eax.  Instead just look at cfg info, which is still close enough
5272      to correct at this point.  This gives false positives for broken
5273      functions that might use uninitialized data that happens to be
5274      allocated in eax, but who cares?  */
5275   return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5276 }
5277 
5278 static bool
5279 ix86_keep_aggregate_return_pointer (tree fntype)
5280 {
5281   tree attr;
5282 
5283   if (!TARGET_64BIT)
5284     {
5285       attr = lookup_attribute ("callee_pop_aggregate_return",
5286 			       TYPE_ATTRIBUTES (fntype));
5287       if (attr)
5288 	return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5289 
5290       /* For the 32-bit MS ABI the default is to keep the aggregate
5291          return pointer.  */
5292       if (ix86_function_type_abi (fntype) == MS_ABI)
5293 	return true;
5294     }
5295   return KEEP_AGGREGATE_RETURN_POINTER != 0;
5296 }
5297 
5298 /* Value is the number of bytes of arguments automatically
5299    popped when returning from a subroutine call.
5300    FUNDECL is the declaration node of the function (as a tree),
5301    FUNTYPE is the data type of the function (as a tree),
5302    or for a library call it is an identifier node for the subroutine name.
5303    SIZE is the number of bytes of arguments passed on the stack.
5304 
5305    On the 80386, the RTD insn may be used to pop them if the number
5306      of args is fixed, but if the number is variable then the caller
5307      must pop them all.  RTD can't be used for library calls now
5308      because the library is compiled with the Unix compiler.
5309    Use of RTD is a selectable option, since it is incompatible with
5310    standard Unix calling sequences.  If the option is not selected,
5311    the caller must always pop the args.
5312 
5313    The attribute stdcall is equivalent to RTD on a per module basis.  */
5314 
5315 static int
5316 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5317 {
5318   unsigned int ccvt;
5319 
5320   /* None of the 64-bit ABIs pop arguments.  */
5321   if (TARGET_64BIT)
5322     return 0;
5323 
5324   ccvt = ix86_get_callcvt (funtype);
5325 
5326   if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5327 	       | IX86_CALLCVT_THISCALL)) != 0
5328       && ! stdarg_p (funtype))
5329     return size;
5330 
5331   /* Lose any fake structure return argument if it is passed on the stack.  */
5332   if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5333       && !ix86_keep_aggregate_return_pointer (funtype))
5334     {
5335       int nregs = ix86_function_regparm (funtype, fundecl);
5336       if (nregs == 0)
5337 	return GET_MODE_SIZE (Pmode);
5338     }
5339 
5340   return 0;
5341 }
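
/* Illustrative effect of the value computed above: a 32-bit stdcall
   function such as

     void __attribute__ ((stdcall)) f (int a, int b);

   returns with "ret $8", popping its 8 bytes of stack arguments itself,
   whereas the default cdecl convention returns with a plain "ret" and
   leaves the arguments for the caller to pop.  */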
5342 
5343 /* Argument support functions.  */
5344 
5345 /* Return true when register may be used to pass function parameters.  */
5346 bool
5347 ix86_function_arg_regno_p (int regno)
5348 {
5349   int i;
5350   const int *parm_regs;
5351 
5352   if (!TARGET_64BIT)
5353     {
5354       if (TARGET_MACHO)
5355         return (regno < REGPARM_MAX
5356                 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5357       else
5358         return (regno < REGPARM_MAX
5359 	        || (TARGET_MMX && MMX_REGNO_P (regno)
5360 	  	    && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5361 	        || (TARGET_SSE && SSE_REGNO_P (regno)
5362 		    && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5363     }
5364 
5365   if (TARGET_MACHO)
5366     {
5367       if (SSE_REGNO_P (regno) && TARGET_SSE)
5368         return true;
5369     }
5370   else
5371     {
5372       if (TARGET_SSE && SSE_REGNO_P (regno)
5373           && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5374         return true;
5375     }
5376 
5377   /* TODO: The function should depend on current function ABI but
5378      builtins.c would need updating then. Therefore we use the
5379      default ABI.  */
5380 
5381   /* RAX is used as hidden argument to va_arg functions.  */
5382   if (ix86_abi == SYSV_ABI && regno == AX_REG)
5383     return true;
5384 
5385   if (ix86_abi == MS_ABI)
5386     parm_regs = x86_64_ms_abi_int_parameter_registers;
5387   else
5388     parm_regs = x86_64_int_parameter_registers;
5389   for (i = 0; i < (ix86_abi == MS_ABI
5390 		   ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5391     if (regno == parm_regs[i])
5392       return true;
5393   return false;
5394 }
5395 
5396 /* Return true if we do not know how to pass TYPE solely in registers.  */
5397 
5398 static bool
5399 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5400 {
5401   if (must_pass_in_stack_var_size_or_pad (mode, type))
5402     return true;
5403 
5404   /* For 32-bit, we want TImode aggregates to go on the stack.  But watch out!
5405      The layout_type routine is crafty and tries to trick us into passing
5406      currently unsupported vector types on the stack by using TImode.  */
5407   return (!TARGET_64BIT && mode == TImode
5408 	  && type && TREE_CODE (type) != VECTOR_TYPE);
5409 }
5410 
5411 /* Return the size, in bytes, of the area reserved for arguments passed
5412    in registers for the function represented by FNDECL, depending on the
5413    ABI format used.  */
5414 int
5415 ix86_reg_parm_stack_space (const_tree fndecl)
5416 {
5417   enum calling_abi call_abi = SYSV_ABI;
5418   if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5419     call_abi = ix86_function_abi (fndecl);
5420   else
5421     call_abi = ix86_function_type_abi (fndecl);
5422   if (TARGET_64BIT && call_abi == MS_ABI)
5423     return 32;
5424   return 0;
5425 }
5426 
5427 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5428    call ABI used.  */
5429 enum calling_abi
5430 ix86_function_type_abi (const_tree fntype)
5431 {
5432   if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5433     {
5434       enum calling_abi abi = ix86_abi;
5435       if (abi == SYSV_ABI)
5436 	{
5437 	  if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5438 	    abi = MS_ABI;
5439 	}
5440       else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5441 	abi = SYSV_ABI;
5442       return abi;
5443     }
5444   return ix86_abi;
5445 }
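
/* Illustration (not from the original source): on a SYSV-default x86-64
   target,

     int __attribute__ ((ms_abi)) callback (int, int, int, int);

   is called with the Microsoft convention (arguments in %rcx, %rdx, %r8
   and %r9, with ix86_reg_parm_stack_space above reserving the 32-byte
   shadow area), while the sysv_abi attribute selects the System V
   convention when the default ABI is MS_ABI.  */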
5446 
5447 static bool
5448 ix86_function_ms_hook_prologue (const_tree fn)
5449 {
5450   if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5451     {
5452       if (decl_function_context (fn) != NULL_TREE)
5453 	error_at (DECL_SOURCE_LOCATION (fn),
5454 		  "ms_hook_prologue is not compatible with nested function");
5455       else
5456         return true;
5457     }
5458   return false;
5459 }
5460 
5461 static enum calling_abi
5462 ix86_function_abi (const_tree fndecl)
5463 {
5464   if (! fndecl)
5465     return ix86_abi;
5466   return ix86_function_type_abi (TREE_TYPE (fndecl));
5467 }
5468 
5469 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5470    call ABI used.  */
5471 enum calling_abi
5472 ix86_cfun_abi (void)
5473 {
5474   if (! cfun)
5475     return ix86_abi;
5476   return cfun->machine->call_abi;
5477 }
5478 
5479 /* Write the extra assembler code needed to declare a function properly.  */
5480 
5481 void
5482 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5483 				tree decl)
5484 {
5485   bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5486 
5487   if (is_ms_hook)
5488     {
5489       int i, filler_count = (TARGET_64BIT ? 32 : 16);
5490       unsigned int filler_cc = 0xcccccccc;
5491 
5492       for (i = 0; i < filler_count; i += 4)
5493         fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5494     }
5495 
5496 #ifdef SUBTARGET_ASM_UNWIND_INIT
5497   SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5498 #endif
5499 
5500   ASM_OUTPUT_LABEL (asm_out_file, fname);
5501 
5502   /* Output magic byte marker, if hot-patch attribute is set.  */
5503   if (is_ms_hook)
5504     {
5505       if (TARGET_64BIT)
5506 	{
5507 	  /* leaq [%rsp + 0], %rsp  */
5508 	  asm_fprintf (asm_out_file, ASM_BYTE
5509 		       "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5510 	}
5511       else
5512 	{
5513           /* movl.s %edi, %edi
5514 	     push   %ebp
5515 	     movl.s %esp, %ebp */
5516 	  asm_fprintf (asm_out_file, ASM_BYTE
5517 		       "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5518 	}
5519     }
5520 }
5521 
5522 /* regclass.c  */
5523 extern void init_regs (void);
5524 
5525 /* Implementation of the call ABI switching target hook.  The call register
5526    sets specific to FNDECL are selected.  See also
5527    ix86_conditional_register_usage for more details.  */
5528 void
5529 ix86_call_abi_override (const_tree fndecl)
5530 {
5531   if (fndecl == NULL_TREE)
5532     cfun->machine->call_abi = ix86_abi;
5533   else
5534     cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5535 }
5536 
5537 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5538    Avoid the expensive re-initialization of init_regs each time we switch
5539    function context, since it is needed only during RTL expansion.  */
5540 static void
5541 ix86_maybe_switch_abi (void)
5542 {
5543   if (TARGET_64BIT &&
5544       call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5545     reinit_regs ();
5546 }
5547 
5548 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5549    for a call to a function whose data type is FNTYPE.
5550    For a library call, FNTYPE is 0.  */
5551 
5552 void
5553 init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
5554 		      tree fntype,	/* tree ptr for function decl */
5555 		      rtx libname,	/* SYMBOL_REF of library name or 0 */
5556 		      tree fndecl,
5557 		      int caller)
5558 {
5559   struct cgraph_local_info *i;
5560   tree fnret_type;
5561 
5562   memset (cum, 0, sizeof (*cum));
5563 
5564   /* Initialize for the current callee.  */
5565   if (caller)
5566     {
5567       cfun->machine->callee_pass_avx256_p = false;
5568       cfun->machine->callee_return_avx256_p = false;
5569     }
5570 
5571   if (fndecl)
5572     {
5573       i = cgraph_local_info (fndecl);
5574       cum->call_abi = ix86_function_abi (fndecl);
5575       fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5576     }
5577   else
5578     {
5579       i = NULL;
5580       cum->call_abi = ix86_function_type_abi (fntype);
5581       if (fntype)
5582 	fnret_type = TREE_TYPE (fntype);
5583       else
5584 	fnret_type = NULL;
5585     }
5586 
5587   if (TARGET_VZEROUPPER && fnret_type)
5588     {
5589       rtx fnret_value = ix86_function_value (fnret_type, fntype,
5590 					     false);
5591       if (function_pass_avx256_p (fnret_value))
5592 	{
5593 	  /* The return value of this function uses 256bit AVX modes.  */
5594 	  if (caller)
5595 	    {
5596 	      cfun->machine->callee_return_avx256_p = true;
5597 	      cum->callee_return_avx256_p = true;
5598 	    }
5599 	  else
5600 	    cfun->machine->caller_return_avx256_p = true;
5601 	}
5602     }
5603 
5604   cum->caller = caller;
5605 
5606   /* Set up the number of registers to use for passing arguments.  */
5607 
5608   if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5609     sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5610 	   "or subtarget optimization implying it");
5611   cum->nregs = ix86_regparm;
5612   if (TARGET_64BIT)
5613     {
5614       cum->nregs = (cum->call_abi == SYSV_ABI
5615                    ? X86_64_REGPARM_MAX
5616                    : X86_64_MS_REGPARM_MAX);
5617     }
5618   if (TARGET_SSE)
5619     {
5620       cum->sse_nregs = SSE_REGPARM_MAX;
5621       if (TARGET_64BIT)
5622         {
5623           cum->sse_nregs = (cum->call_abi == SYSV_ABI
5624                            ? X86_64_SSE_REGPARM_MAX
5625                            : X86_64_MS_SSE_REGPARM_MAX);
5626         }
5627     }
5628   if (TARGET_MMX)
5629     cum->mmx_nregs = MMX_REGPARM_MAX;
5630   cum->warn_avx = true;
5631   cum->warn_sse = true;
5632   cum->warn_mmx = true;
5633 
5634   /* Because types might mismatch between caller and callee, we need to
5635      use the actual type of the function for local calls.
5636      FIXME: cgraph_analyze can be told to actually record if a function uses
5637      va_start, so for local functions maybe_vaarg can be made aggressive,
5638      helping K&R code.
5639      FIXME: once the type system is fixed, we won't need this code anymore.  */
5640   if (i && i->local && i->can_change_signature)
5641     fntype = TREE_TYPE (fndecl);
5642   cum->maybe_vaarg = (fntype
5643 		      ? (!prototype_p (fntype) || stdarg_p (fntype))
5644 		      : !libname);
5645 
5646   if (!TARGET_64BIT)
5647     {
5648       /* If there are variable arguments, then we won't pass anything
5649          in registers in 32-bit mode. */
5650       if (stdarg_p (fntype))
5651 	{
5652 	  cum->nregs = 0;
5653 	  cum->sse_nregs = 0;
5654 	  cum->mmx_nregs = 0;
5655 	  cum->warn_avx = 0;
5656 	  cum->warn_sse = 0;
5657 	  cum->warn_mmx = 0;
5658 	  return;
5659 	}
5660 
5661       /* Use ecx and edx registers if function has fastcall attribute,
5662 	 else look for regparm information.  */
5663       if (fntype)
5664 	{
5665 	  unsigned int ccvt = ix86_get_callcvt (fntype);
5666 	  if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5667 	    {
5668 	      cum->nregs = 1;
5669 	      cum->fastcall = 1; /* Same first register as in fastcall.  */
5670 	    }
5671 	  else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5672 	    {
5673 	      cum->nregs = 2;
5674 	      cum->fastcall = 1;
5675 	    }
5676 	  else
5677 	    cum->nregs = ix86_function_regparm (fntype, fndecl);
5678 	}
5679 
5680       /* Set up the number of SSE registers used for passing SFmode
5681 	 and DFmode arguments.  Warn for mismatching ABI.  */
5682       cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5683     }
5684 }
5685 
5686 /* Return the "natural" mode for TYPE.  In most cases, this is just TYPE_MODE.
5687    But in the case of vector types, it is some vector mode.
5688 
5689    When we have only some of our vector isa extensions enabled, then there
5690    are some modes for which vector_mode_supported_p is false.  For these
5691    modes, the generic vector support in gcc will choose some non-vector mode
5692    in order to implement the type.  By computing the natural mode, we'll
5693    select the proper ABI location for the operand and not depend on whatever
5694    the middle-end decides to do with these vector types.
5695 
5696    The middle-end can't deal with vector types > 16 bytes.  In this
5697    case, we return the original mode and warn about the ABI change if CUM
5698    isn't NULL.  */
5699 
5700 static enum machine_mode
5701 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5702 {
5703   enum machine_mode mode = TYPE_MODE (type);
5704 
5705   if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5706     {
5707       HOST_WIDE_INT size = int_size_in_bytes (type);
5708       if ((size == 8 || size == 16 || size == 32)
5709 	  /* ??? Generic code allows us to create width 1 vectors.  Ignore.  */
5710 	  && TYPE_VECTOR_SUBPARTS (type) > 1)
5711 	{
5712 	  enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5713 
5714 	  if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5715 	    mode = MIN_MODE_VECTOR_FLOAT;
5716 	  else
5717 	    mode = MIN_MODE_VECTOR_INT;
5718 
5719 	  /* Get the mode which has this inner mode and number of units.  */
5720 	  for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5721 	    if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5722 		&& GET_MODE_INNER (mode) == innermode)
5723 	      {
5724 		if (size == 32 && !TARGET_AVX)
5725 		  {
5726 		    static bool warnedavx;
5727 
5728 		    if (cum
5729 			&& !warnedavx
5730 			&& cum->warn_avx)
5731 		      {
5732 			warnedavx = true;
5733 			warning (0, "AVX vector argument without AVX "
5734 				 "enabled changes the ABI");
5735 		      }
5736 		    return TYPE_MODE (type);
5737 		  }
5738 		else
5739 		  return mode;
5740 	      }
5741 
5742 	  gcc_unreachable ();
5743 	}
5744     }
5745 
5746   return mode;
5747 }
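
/* Example (illustrative): with SSE disabled, the middle-end represents

     typedef int v4si __attribute__ ((vector_size (16)));

   in a non-vector mode, but this routine still computes V4SImode so the
   argument is given its proper ABI location.  A 32-byte vector type
   compiled without AVX instead keeps its original mode and triggers the
   "AVX vector argument" warning above.  */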
5748 
5749 /* We want to pass a value in REGNO whose "natural" mode is MODE.  However,
5750    this may not agree with the mode that the type system has chosen for the
5751    register, which is ORIG_MODE.  If ORIG_MODE is not BLKmode, then we can
5752    go ahead and use it.  Otherwise we have to build a PARALLEL instead.  */
5753 
5754 static rtx
5755 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5756 		     unsigned int regno)
5757 {
5758   rtx tmp;
5759 
5760   if (orig_mode != BLKmode)
5761     tmp = gen_rtx_REG (orig_mode, regno);
5762   else
5763     {
5764       tmp = gen_rtx_REG (mode, regno);
5765       tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5766       tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5767     }
5768 
5769   return tmp;
5770 }
5771 
5772 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
5773    The goal of this code is to classify each 8 bytes of an incoming argument
5774    by register class and assign registers accordingly.  */
5775 
5776 /* Return the union class of CLASS1 and CLASS2.
5777    See the x86-64 PS ABI for details.  */
5778 
5779 static enum x86_64_reg_class
5780 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5781 {
5782   /* Rule #1: If both classes are equal, this is the resulting class.  */
5783   if (class1 == class2)
5784     return class1;
5785 
5786   /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5787      the other class.  */
5788   if (class1 == X86_64_NO_CLASS)
5789     return class2;
5790   if (class2 == X86_64_NO_CLASS)
5791     return class1;
5792 
5793   /* Rule #3: If one of the classes is MEMORY, the result is MEMORY.  */
5794   if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5795     return X86_64_MEMORY_CLASS;
5796 
5797   /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
5798   if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5799       || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5800     return X86_64_INTEGERSI_CLASS;
5801   if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5802       || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5803     return X86_64_INTEGER_CLASS;
5804 
5805   /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5806      MEMORY is used.  */
5807   if (class1 == X86_64_X87_CLASS
5808       || class1 == X86_64_X87UP_CLASS
5809       || class1 == X86_64_COMPLEX_X87_CLASS
5810       || class2 == X86_64_X87_CLASS
5811       || class2 == X86_64_X87UP_CLASS
5812       || class2 == X86_64_COMPLEX_X87_CLASS)
5813     return X86_64_MEMORY_CLASS;
5814 
5815   /* Rule #6: Otherwise class SSE is used.  */
5816   return X86_64_SSE_CLASS;
5817 }
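
/* A worked example of the merge rules (illustrative): for

     union u { double d; long l; };

   the single eightbyte is classified as an SSE class for the double and
   INTEGER for the long; rule #4 merges them to INTEGER, so the union is
   passed in a general-purpose register rather than an SSE register.  */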
5818 
5819 /* Classify the argument of type TYPE and mode MODE.
5820    CLASSES will be filled by the register class used to pass each word
5821    of the operand.  The number of words is returned.  In case the parameter
5822    should be passed in memory, 0 is returned. As a special case for zero
5823    sized containers, classes[0] will be NO_CLASS and 1 is returned.
5824 
5825    BIT_OFFSET is used internally for handling records and specifies the
5826    offset in bits, modulo 256, to avoid overflow cases.
5827 
5828    See the x86-64 PS ABI for details.
5829 */
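
/* A worked example (illustrative): on x86-64,

     struct s { double d; long l; };

   occupies two eightbytes; the first is given an SSE class (the double)
   and the second X86_64_INTEGER_CLASS (the long), so the struct is
   passed in one SSE register and one general-purpose register, and the
   function below returns 2.  */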
5830 
5831 static int
5832 classify_argument (enum machine_mode mode, const_tree type,
5833 		   enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5834 {
5835   HOST_WIDE_INT bytes =
5836     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5837   int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5838 
5839   /* Variable sized entities are always passed/returned in memory.  */
5840   if (bytes < 0)
5841     return 0;
5842 
5843   if (mode != VOIDmode
5844       && targetm.calls.must_pass_in_stack (mode, type))
5845     return 0;
5846 
5847   if (type && AGGREGATE_TYPE_P (type))
5848     {
5849       int i;
5850       tree field;
5851       enum x86_64_reg_class subclasses[MAX_CLASSES];
5852 
5853       /* On x86-64 we pass structures larger than 32 bytes on the stack.  */
5854       if (bytes > 32)
5855 	return 0;
5856 
5857       for (i = 0; i < words; i++)
5858 	classes[i] = X86_64_NO_CLASS;
5859 
5860       /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
5861 	 signal the memory class, so handle it as a special case.  */
5862       if (!words)
5863 	{
5864 	  classes[0] = X86_64_NO_CLASS;
5865 	  return 1;
5866 	}
5867 
5868       /* Classify each field of record and merge classes.  */
5869       switch (TREE_CODE (type))
5870 	{
5871 	case RECORD_TYPE:
5872 	  /* And now merge the fields of the structure.  */
5873 	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5874 	    {
5875 	      if (TREE_CODE (field) == FIELD_DECL)
5876 		{
5877 		  int num;
5878 
5879 		  if (TREE_TYPE (field) == error_mark_node)
5880 		    continue;
5881 
5882 		  /* Bitfields are always classified as integer.  Handle them
5883 		     early, since later code would consider them to be
5884 		     misaligned integers.  */
5885 		  if (DECL_BIT_FIELD (field))
5886 		    {
5887 		      for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5888 			   i < ((int_bit_position (field) + (bit_offset % 64))
5889 			        + tree_low_cst (DECL_SIZE (field), 0)
5890 				+ 63) / 8 / 8; i++)
5891 			classes[i] =
5892 			  merge_classes (X86_64_INTEGER_CLASS,
5893 					 classes[i]);
5894 		    }
5895 		  else
5896 		    {
5897 		      int pos;
5898 
5899 		      type = TREE_TYPE (field);
5900 
5901 		      /* Flexible array member is ignored.  */
5902 		      if (TYPE_MODE (type) == BLKmode
5903 			  && TREE_CODE (type) == ARRAY_TYPE
5904 			  && TYPE_SIZE (type) == NULL_TREE
5905 			  && TYPE_DOMAIN (type) != NULL_TREE
5906 			  && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5907 			      == NULL_TREE))
5908 			{
5909 			  static bool warned;
5910 
5911 			  if (!warned && warn_psabi)
5912 			    {
5913 			      warned = true;
5914 			      inform (input_location,
5915 				      "the ABI of passing struct with"
5916 				      " a flexible array member has"
5917 				      " changed in GCC 4.4");
5918 			    }
5919 			  continue;
5920 			}
5921 		      num = classify_argument (TYPE_MODE (type), type,
5922 					       subclasses,
5923 					       (int_bit_position (field)
5924 						+ bit_offset) % 256);
5925 		      if (!num)
5926 			return 0;
5927 		      pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5928 		      for (i = 0; i < num && (i + pos) < words; i++)
5929 			classes[i + pos] =
5930 			  merge_classes (subclasses[i], classes[i + pos]);
5931 		    }
5932 		}
5933 	    }
5934 	  break;
5935 
5936 	case ARRAY_TYPE:
5937 	  /* Arrays are handled as small records.  */
5938 	  {
5939 	    int num;
5940 	    num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5941 				     TREE_TYPE (type), subclasses, bit_offset);
5942 	    if (!num)
5943 	      return 0;
5944 
5945 	    /* The partial classes are now full classes.  */
5946 	    if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5947 	      subclasses[0] = X86_64_SSE_CLASS;
5948 	    if (subclasses[0] == X86_64_INTEGERSI_CLASS
5949 		&& !((bit_offset % 64) == 0 && bytes == 4))
5950 	      subclasses[0] = X86_64_INTEGER_CLASS;
5951 
5952 	    for (i = 0; i < words; i++)
5953 	      classes[i] = subclasses[i % num];
5954 
5955 	    break;
5956 	  }
5957 	case UNION_TYPE:
5958 	case QUAL_UNION_TYPE:
5959 	  /* Unions are similar to RECORD_TYPE, except that the offset
5960 	     is always 0.  */
5961 	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5962 	    {
5963 	      if (TREE_CODE (field) == FIELD_DECL)
5964 		{
5965 		  int num;
5966 
5967 		  if (TREE_TYPE (field) == error_mark_node)
5968 		    continue;
5969 
5970 		  num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5971 					   TREE_TYPE (field), subclasses,
5972 					   bit_offset);
5973 		  if (!num)
5974 		    return 0;
5975 		  for (i = 0; i < num; i++)
5976 		    classes[i] = merge_classes (subclasses[i], classes[i]);
5977 		}
5978 	    }
5979 	  break;
5980 
5981 	default:
5982 	  gcc_unreachable ();
5983 	}
5984 
5985       if (words > 2)
5986 	{
5987 	  /* When the size exceeds 16 bytes, the whole argument is
5988 	     passed in memory unless the first eightbyte is
5989 	     X86_64_SSE_CLASS and every remaining eightbyte is
5990 	     X86_64_SSEUP_CLASS.  */
5991 	  if (classes[0] != X86_64_SSE_CLASS)
5992 	      return 0;
5993 
5994 	  for (i = 1; i < words; i++)
5995 	    if (classes[i] != X86_64_SSEUP_CLASS)
5996 	      return 0;
5997 	}
5998 
5999       /* Final merger cleanup.  */
6000       for (i = 0; i < words; i++)
6001 	{
6002 	  /* If one class is MEMORY, everything should be passed in
6003 	     memory.  */
6004 	  if (classes[i] == X86_64_MEMORY_CLASS)
6005 	    return 0;
6006 
6007 	  /* X86_64_SSEUP_CLASS should always be preceded by
6008 	     X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
6009 	  if (classes[i] == X86_64_SSEUP_CLASS
6010 	      && classes[i - 1] != X86_64_SSE_CLASS
6011 	      && classes[i - 1] != X86_64_SSEUP_CLASS)
6012 	    {
6013 	      /* The first one should never be X86_64_SSEUP_CLASS.  */
6014 	      gcc_assert (i != 0);
6015 	      classes[i] = X86_64_SSE_CLASS;
6016 	    }
6017 
6018 	  /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6019 	     everything should be passed in memory.  */
6020 	  if (classes[i] == X86_64_X87UP_CLASS
6021 	      && (classes[i - 1] != X86_64_X87_CLASS))
6022 	    {
6023 	      static bool warned;
6024 
6025 	      /* The first one should never be X86_64_X87UP_CLASS.  */
6026 	      gcc_assert (i != 0);
6027 	      if (!warned && warn_psabi)
6028 		{
6029 		  warned = true;
6030 		  inform (input_location,
6031 			  "the ABI of passing union with long double"
6032 			  " has changed in GCC 4.4");
6033 		}
6034 	      return 0;
6035 	    }
6036 	}
6037       return words;
6038     }
6039 
6040   /* Compute the alignment needed.  We align all types to their natural
6041      boundaries, except XFmode, which is aligned to 64 bits.  */
6042   if (mode != VOIDmode && mode != BLKmode)
6043     {
6044       int mode_alignment = GET_MODE_BITSIZE (mode);
6045 
6046       if (mode == XFmode)
6047 	mode_alignment = 128;
6048       else if (mode == XCmode)
6049 	mode_alignment = 256;
6050       if (COMPLEX_MODE_P (mode))
6051 	mode_alignment /= 2;
6052       /* Misaligned fields are always returned in memory.  */
6053       if (bit_offset % mode_alignment)
6054 	return 0;
6055     }
6056 
6057   /* For V1xx modes, just use the base mode.  */
6058   if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6059       && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6060     mode = GET_MODE_INNER (mode);
6061 
6062   /* Classification of atomic types.  */
6063   switch (mode)
6064     {
6065     case SDmode:
6066     case DDmode:
6067       classes[0] = X86_64_SSE_CLASS;
6068       return 1;
6069     case TDmode:
6070       classes[0] = X86_64_SSE_CLASS;
6071       classes[1] = X86_64_SSEUP_CLASS;
6072       return 2;
6073     case DImode:
6074     case SImode:
6075     case HImode:
6076     case QImode:
6077     case CSImode:
6078     case CHImode:
6079     case CQImode:
6080       {
6081 	int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
6082 
6083 	/* Analyze last 128 bits only.  */
6084 	size = (size - 1) & 0x7f;
6085 
6086 	if (size < 32)
6087 	  {
6088 	    classes[0] = X86_64_INTEGERSI_CLASS;
6089 	    return 1;
6090 	  }
6091 	else if (size < 64)
6092 	  {
6093 	    classes[0] = X86_64_INTEGER_CLASS;
6094 	    return 1;
6095 	  }
6096 	else if (size < 64+32)
6097 	  {
6098 	    classes[0] = X86_64_INTEGER_CLASS;
6099 	    classes[1] = X86_64_INTEGERSI_CLASS;
6100 	    return 2;
6101 	  }
6102 	else if (size < 64+64)
6103 	  {
6104 	    classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6105 	    return 2;
6106 	  }
6107 	else
6108 	  gcc_unreachable ();
6109       }
6110     case CDImode:
6111     case TImode:
6112       classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6113       return 2;
6114     case COImode:
6115     case OImode:
6116       /* OImode shouldn't be used directly.  */
6117       gcc_unreachable ();
6118     case CTImode:
6119       return 0;
6120     case SFmode:
6121       if (!(bit_offset % 64))
6122 	classes[0] = X86_64_SSESF_CLASS;
6123       else
6124 	classes[0] = X86_64_SSE_CLASS;
6125       return 1;
6126     case DFmode:
6127       classes[0] = X86_64_SSEDF_CLASS;
6128       return 1;
6129     case XFmode:
6130       classes[0] = X86_64_X87_CLASS;
6131       classes[1] = X86_64_X87UP_CLASS;
6132       return 2;
6133     case TFmode:
6134       classes[0] = X86_64_SSE_CLASS;
6135       classes[1] = X86_64_SSEUP_CLASS;
6136       return 2;
6137     case SCmode:
6138       classes[0] = X86_64_SSE_CLASS;
6139       if (!(bit_offset % 64))
6140 	return 1;
6141       else
6142 	{
6143 	  static bool warned;
6144 
6145 	  if (!warned && warn_psabi)
6146 	    {
6147 	      warned = true;
6148 	      inform (input_location,
6149 		      "the ABI of passing structure with complex float"
6150 		      " member has changed in GCC 4.4");
6151 	    }
6152 	  classes[1] = X86_64_SSESF_CLASS;
6153 	  return 2;
6154 	}
6155     case DCmode:
6156       classes[0] = X86_64_SSEDF_CLASS;
6157       classes[1] = X86_64_SSEDF_CLASS;
6158       return 2;
6159     case XCmode:
6160       classes[0] = X86_64_COMPLEX_X87_CLASS;
6161       return 1;
6162     case TCmode:
6163       /* This mode is larger than 16 bytes.  */
6164       return 0;
6165     case V8SFmode:
6166     case V8SImode:
6167     case V32QImode:
6168     case V16HImode:
6169     case V4DFmode:
6170     case V4DImode:
6171       classes[0] = X86_64_SSE_CLASS;
6172       classes[1] = X86_64_SSEUP_CLASS;
6173       classes[2] = X86_64_SSEUP_CLASS;
6174       classes[3] = X86_64_SSEUP_CLASS;
6175       return 4;
6176     case V4SFmode:
6177     case V4SImode:
6178     case V16QImode:
6179     case V8HImode:
6180     case V2DFmode:
6181     case V2DImode:
6182       classes[0] = X86_64_SSE_CLASS;
6183       classes[1] = X86_64_SSEUP_CLASS;
6184       return 2;
6185     case V1TImode:
6186     case V1DImode:
6187     case V2SFmode:
6188     case V2SImode:
6189     case V4HImode:
6190     case V8QImode:
6191       classes[0] = X86_64_SSE_CLASS;
6192       return 1;
6193     case BLKmode:
6194     case VOIDmode:
6195       return 0;
6196     default:
6197       gcc_assert (VECTOR_MODE_P (mode));
6198 
6199       if (bytes > 16)
6200 	return 0;
6201 
6202       gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6203 
6204       if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6205 	classes[0] = X86_64_INTEGERSI_CLASS;
6206       else
6207 	classes[0] = X86_64_INTEGER_CLASS;
6208       classes[1] = X86_64_INTEGER_CLASS;
6209       return 1 + (bytes > 8);
6210     }
6211 }
6212 
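/* Illustrative sketch of what the classification above yields for a few
   common aggregates; a minimal example, assuming the usual SysV x86-64
   defaults and that each type is the first argument of a call.  */
#if 0
#include <stdio.h>

struct int_pair { int a, b; };       /* 8 bytes: one INTEGER eightbyte,
                                        passed in a single GPR (%rdi).   */
struct mixed { double d; long l; };  /* 16 bytes: SSEDF + INTEGER,
                                        passed in %xmm0 and %rdi.        */
struct too_big { long a[5]; };       /* 40 bytes: larger than 32 bytes,
                                        always passed in memory.         */

int
main (void)
{
  printf ("%zu %zu %zu\n", sizeof (struct int_pair),
          sizeof (struct mixed), sizeof (struct too_big));
  return 0;
}
#endif
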
6213 /* Examine the argument and set the number of registers required in each
6214    class.  Return 0 iff the parameter should be passed in memory.  */
6215 static int
6216 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6217 		  int *int_nregs, int *sse_nregs)
6218 {
6219   enum x86_64_reg_class regclass[MAX_CLASSES];
6220   int n = classify_argument (mode, type, regclass, 0);
6221 
6222   *int_nregs = 0;
6223   *sse_nregs = 0;
6224   if (!n)
6225     return 0;
6226   for (n--; n >= 0; n--)
6227     switch (regclass[n])
6228       {
6229       case X86_64_INTEGER_CLASS:
6230       case X86_64_INTEGERSI_CLASS:
6231 	(*int_nregs)++;
6232 	break;
6233       case X86_64_SSE_CLASS:
6234       case X86_64_SSESF_CLASS:
6235       case X86_64_SSEDF_CLASS:
6236 	(*sse_nregs)++;
6237 	break;
6238       case X86_64_NO_CLASS:
6239       case X86_64_SSEUP_CLASS:
6240 	break;
6241       case X86_64_X87_CLASS:
6242       case X86_64_X87UP_CLASS:
6243 	if (!in_return)
6244 	  return 0;
6245 	break;
6246       case X86_64_COMPLEX_X87_CLASS:
6247 	return in_return ? 2 : 0;
6248       case X86_64_MEMORY_CLASS:
6249 	gcc_unreachable ();
6250       }
6251   return 1;
6252 }
6253 
6254 /* Construct container for the argument used by GCC interface.  See
6255    FUNCTION_ARG for the detailed description.  */
6256 
6257 static rtx
6258 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6259 		     const_tree type, int in_return, int nintregs, int nsseregs,
6260 		     const int *intreg, int sse_regno)
6261 {
6262   /* The following variables hold the static issued_error state.  */
6263   static bool issued_sse_arg_error;
6264   static bool issued_sse_ret_error;
6265   static bool issued_x87_ret_error;
6266 
6267   enum machine_mode tmpmode;
6268   int bytes =
6269     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6270   enum x86_64_reg_class regclass[MAX_CLASSES];
6271   int n;
6272   int i;
6273   int nexps = 0;
6274   int needed_sseregs, needed_intregs;
6275   rtx exp[MAX_CLASSES];
6276   rtx ret;
6277 
6278   n = classify_argument (mode, type, regclass, 0);
6279   if (!n)
6280     return NULL;
6281   if (!examine_argument (mode, type, in_return, &needed_intregs,
6282 			 &needed_sseregs))
6283     return NULL;
6284   if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6285     return NULL;
6286 
6287   /* We allowed the user to turn off SSE for kernel mode.  Don't crash if
6288      some less clueful developer tries to use floating-point anyway.  */
6289   if (needed_sseregs && !TARGET_SSE)
6290     {
6291       if (in_return)
6292 	{
6293 	  if (!issued_sse_ret_error)
6294 	    {
6295 	      error ("SSE register return with SSE disabled");
6296 	      issued_sse_ret_error = true;
6297 	    }
6298 	}
6299       else if (!issued_sse_arg_error)
6300 	{
6301 	  error ("SSE register argument with SSE disabled");
6302 	  issued_sse_arg_error = true;
6303 	}
6304       return NULL;
6305     }
6306 
6307   /* Likewise, error if the ABI requires us to return values in the
6308      x87 registers and the user specified -mno-80387.  */
6309   if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6310     for (i = 0; i < n; i++)
6311       if (regclass[i] == X86_64_X87_CLASS
6312 	  || regclass[i] == X86_64_X87UP_CLASS
6313 	  || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6314 	{
6315 	  if (!issued_x87_ret_error)
6316 	    {
6317 	      error ("x87 register return with x87 disabled");
6318 	      issued_x87_ret_error = true;
6319 	    }
6320 	  return NULL;
6321 	}
6322 
6323   /* First construct the simple cases.  Avoid SCmode, since we want to use
6324      a single register to pass this type.  */
6325   if (n == 1 && mode != SCmode)
6326     switch (regclass[0])
6327       {
6328       case X86_64_INTEGER_CLASS:
6329       case X86_64_INTEGERSI_CLASS:
6330 	return gen_rtx_REG (mode, intreg[0]);
6331       case X86_64_SSE_CLASS:
6332       case X86_64_SSESF_CLASS:
6333       case X86_64_SSEDF_CLASS:
6334 	if (mode != BLKmode)
6335 	  return gen_reg_or_parallel (mode, orig_mode,
6336 				      SSE_REGNO (sse_regno));
6337 	break;
6338       case X86_64_X87_CLASS:
6339       case X86_64_COMPLEX_X87_CLASS:
6340 	return gen_rtx_REG (mode, FIRST_STACK_REG);
6341       case X86_64_NO_CLASS:
6342 	/* Zero sized array, struct or class.  */
6343 	return NULL;
6344       default:
6345 	gcc_unreachable ();
6346       }
6347   if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6348       && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6349     return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6350   if (n == 4
6351       && regclass[0] == X86_64_SSE_CLASS
6352       && regclass[1] == X86_64_SSEUP_CLASS
6353       && regclass[2] == X86_64_SSEUP_CLASS
6354       && regclass[3] == X86_64_SSEUP_CLASS
6355       && mode != BLKmode)
6356     return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6357 
6358   if (n == 2
6359       && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6360     return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6361   if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6362       && regclass[1] == X86_64_INTEGER_CLASS
6363       && (mode == CDImode || mode == TImode)
6364       && intreg[0] + 1 == intreg[1])
6365     return gen_rtx_REG (mode, intreg[0]);
6366 
6367   /* Otherwise figure out the entries of the PARALLEL.  */
6368   for (i = 0; i < n; i++)
6369     {
6370       int pos;
6371 
6372       switch (regclass[i])
6373         {
6374 	  case X86_64_NO_CLASS:
6375 	    break;
6376 	  case X86_64_INTEGER_CLASS:
6377 	  case X86_64_INTEGERSI_CLASS:
6378 	    /* Merge TImodes on aligned occasions here too.  */
6379 	    if (i * 8 + 8 > bytes)
6380 	      tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6381 	    else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6382 	      tmpmode = SImode;
6383 	    else
6384 	      tmpmode = DImode;
6385 	    /* We've requested 24 bytes for which we don't have a mode.  Use DImode.  */
6386 	    if (tmpmode == BLKmode)
6387 	      tmpmode = DImode;
6388 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6389 					       gen_rtx_REG (tmpmode, *intreg),
6390 					       GEN_INT (i*8));
6391 	    intreg++;
6392 	    break;
6393 	  case X86_64_SSESF_CLASS:
6394 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6395 					       gen_rtx_REG (SFmode,
6396 							    SSE_REGNO (sse_regno)),
6397 					       GEN_INT (i*8));
6398 	    sse_regno++;
6399 	    break;
6400 	  case X86_64_SSEDF_CLASS:
6401 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6402 					       gen_rtx_REG (DFmode,
6403 							    SSE_REGNO (sse_regno)),
6404 					       GEN_INT (i*8));
6405 	    sse_regno++;
6406 	    break;
6407 	  case X86_64_SSE_CLASS:
6408 	    pos = i;
6409 	    switch (n)
6410 	      {
6411 	      case 1:
6412 		tmpmode = DImode;
6413 		break;
6414 	      case 2:
6415 		if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6416 		  {
6417 		    tmpmode = TImode;
6418 		    i++;
6419 		  }
6420 		else
6421 		  tmpmode = DImode;
6422 		break;
6423 	      case 4:
6424 		gcc_assert (i == 0
6425 			    && regclass[1] == X86_64_SSEUP_CLASS
6426 			    && regclass[2] == X86_64_SSEUP_CLASS
6427 			    && regclass[3] == X86_64_SSEUP_CLASS);
6428 		tmpmode = OImode;
6429 		i += 3;
6430 		break;
6431 	      default:
6432 		gcc_unreachable ();
6433 	      }
6434 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6435 					       gen_rtx_REG (tmpmode,
6436 							    SSE_REGNO (sse_regno)),
6437 					       GEN_INT (pos*8));
6438 	    sse_regno++;
6439 	    break;
6440 	  default:
6441 	    gcc_unreachable ();
6442 	}
6443     }
6444 
6445   /* Empty aligned struct, union or class.  */
6446   if (nexps == 0)
6447     return NULL;
6448 
6449   ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6450   for (i = 0; i < nexps; i++)
6451     XVECEXP (ret, 0, i) = exp [i];
6452   return ret;
6453 }
6454 
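/* As an illustration of the containers built above (a sketch, not exact
   RTL dump syntax): for an argument of type struct { double d; long l; },
   classified as SSEDF + INTEGER, the result is a two-element PARALLEL,
   roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte travels in an SSE register and the second in
   an integer register, each tagged with its byte offset into the
   aggregate.  The concrete register numbers depend on how many registers
   the preceding arguments have already consumed.  */
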
6455 /* Update the data in CUM to advance over an argument of mode MODE
6456    and data type TYPE.  (TYPE is null for libcalls where that information
6457    may not be available.)  */
6458 
6459 static void
6460 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6461 			 const_tree type, HOST_WIDE_INT bytes,
6462 			 HOST_WIDE_INT words)
6463 {
6464   switch (mode)
6465     {
6466     default:
6467       break;
6468 
6469     case BLKmode:
6470       if (bytes < 0)
6471 	break;
6472       /* FALLTHRU */
6473 
6474     case DImode:
6475     case SImode:
6476     case HImode:
6477     case QImode:
6478       cum->words += words;
6479       cum->nregs -= words;
6480       cum->regno += words;
6481 
6482       if (cum->nregs <= 0)
6483 	{
6484 	  cum->nregs = 0;
6485 	  cum->regno = 0;
6486 	}
6487       break;
6488 
6489     case OImode:
6490       /* OImode shouldn't be used directly.  */
6491       gcc_unreachable ();
6492 
6493     case DFmode:
6494       if (cum->float_in_sse < 2)
6495 	break;
6496     case SFmode:
6497       if (cum->float_in_sse < 1)
6498 	break;
6499       /* FALLTHRU */
6500 
6501     case V8SFmode:
6502     case V8SImode:
6503     case V32QImode:
6504     case V16HImode:
6505     case V4DFmode:
6506     case V4DImode:
6507     case TImode:
6508     case V16QImode:
6509     case V8HImode:
6510     case V4SImode:
6511     case V2DImode:
6512     case V4SFmode:
6513     case V2DFmode:
6514       if (!type || !AGGREGATE_TYPE_P (type))
6515 	{
6516 	  cum->sse_words += words;
6517 	  cum->sse_nregs -= 1;
6518 	  cum->sse_regno += 1;
6519 	  if (cum->sse_nregs <= 0)
6520 	    {
6521 	      cum->sse_nregs = 0;
6522 	      cum->sse_regno = 0;
6523 	    }
6524 	}
6525       break;
6526 
6527     case V8QImode:
6528     case V4HImode:
6529     case V2SImode:
6530     case V2SFmode:
6531     case V1TImode:
6532     case V1DImode:
6533       if (!type || !AGGREGATE_TYPE_P (type))
6534 	{
6535 	  cum->mmx_words += words;
6536 	  cum->mmx_nregs -= 1;
6537 	  cum->mmx_regno += 1;
6538 	  if (cum->mmx_nregs <= 0)
6539 	    {
6540 	      cum->mmx_nregs = 0;
6541 	      cum->mmx_regno = 0;
6542 	    }
6543 	}
6544       break;
6545     }
6546 }
6547 
6548 static void
6549 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6550 			 const_tree type, HOST_WIDE_INT words, bool named)
6551 {
6552   int int_nregs, sse_nregs;
6553 
6554   /* Unnamed 256bit vector mode parameters are passed on stack.  */
6555   if (!named && VALID_AVX256_REG_MODE (mode))
6556     return;
6557 
6558   if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6559       && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6560     {
6561       cum->nregs -= int_nregs;
6562       cum->sse_nregs -= sse_nregs;
6563       cum->regno += int_nregs;
6564       cum->sse_regno += sse_nregs;
6565     }
6566   else
6567     {
6568       int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6569       cum->words = (cum->words + align - 1) & ~(align - 1);
6570       cum->words += words;
6571     }
6572 }
6573 
6574 static void
6575 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6576 			    HOST_WIDE_INT words)
6577 {
6578   /* Otherwise, this should be passed indirectly.  */
6579   gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6580 
6581   cum->words += words;
6582   if (cum->nregs > 0)
6583     {
6584       cum->nregs -= 1;
6585       cum->regno += 1;
6586     }
6587 }
6588 
6589 /* Update the data in CUM to advance over an argument of mode MODE and
6590    data type TYPE.  (TYPE is null for libcalls where that information
6591    may not be available.)  */
6592 
6593 static void
6594 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6595 			   const_tree type, bool named)
6596 {
6597   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6598   HOST_WIDE_INT bytes, words;
6599 
6600   if (mode == BLKmode)
6601     bytes = int_size_in_bytes (type);
6602   else
6603     bytes = GET_MODE_SIZE (mode);
6604   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6605 
6606   if (type)
6607     mode = type_natural_mode (type, NULL);
6608 
6609   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6610     function_arg_advance_ms_64 (cum, bytes, words);
6611   else if (TARGET_64BIT)
6612     function_arg_advance_64 (cum, mode, type, words, named);
6613   else
6614     function_arg_advance_32 (cum, mode, type, bytes, words);
6615 }
6616 
6617 /* Define where to put the arguments to a function.
6618    Value is zero to push the argument on the stack,
6619    or a hard register in which to store the argument.
6620 
6621    MODE is the argument's machine mode.
6622    TYPE is the data type of the argument (as a tree).
6623     This is null for libcalls where that information may
6624     not be available.
6625    CUM is a variable of type CUMULATIVE_ARGS which gives info about
6626     the preceding args and about the function being called.
6627    NAMED is nonzero if this argument is a named parameter
6628     (otherwise it is an extra parameter matching an ellipsis).  */
6629 
6630 static rtx
6631 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6632 		 enum machine_mode orig_mode, const_tree type,
6633 		 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6634 {
6635   static bool warnedsse, warnedmmx;
6636 
6637   /* Avoid the AL settings for the Unix64 ABI.  */
6638   if (mode == VOIDmode)
6639     return constm1_rtx;
6640 
6641   switch (mode)
6642     {
6643     default:
6644       break;
6645 
6646     case BLKmode:
6647       if (bytes < 0)
6648 	break;
6649       /* FALLTHRU */
6650     case DImode:
6651     case SImode:
6652     case HImode:
6653     case QImode:
6654       if (words <= cum->nregs)
6655 	{
6656 	  int regno = cum->regno;
6657 
6658 	  /* Fastcall allocates the first two DWORD (SImode) or smaller
6659 	     arguments to ECX and EDX if they are not aggregate
6660 	     types.  */
6661 	  if (cum->fastcall)
6662 	    {
6663 	      if (mode == BLKmode
6664 		  || mode == DImode
6665 		  || (type && AGGREGATE_TYPE_P (type)))
6666 	        break;
6667 
6668 	      /* ECX, not EAX, is the first allocated register.  */
6669 	      if (regno == AX_REG)
6670 		regno = CX_REG;
6671 	    }
6672 	  return gen_rtx_REG (mode, regno);
6673 	}
6674       break;
6675 
6676     case DFmode:
6677       if (cum->float_in_sse < 2)
6678 	break;
6679     case SFmode:
6680       if (cum->float_in_sse < 1)
6681 	break;
6682       /* FALLTHRU */
6683     case TImode:
6684       /* In 32bit, we pass TImode in xmm registers.  */
6685     case V16QImode:
6686     case V8HImode:
6687     case V4SImode:
6688     case V2DImode:
6689     case V4SFmode:
6690     case V2DFmode:
6691       if (!type || !AGGREGATE_TYPE_P (type))
6692 	{
6693 	  if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6694 	    {
6695 	      warnedsse = true;
6696 	      warning (0, "SSE vector argument without SSE enabled "
6697 		       "changes the ABI");
6698 	    }
6699 	  if (cum->sse_nregs)
6700 	    return gen_reg_or_parallel (mode, orig_mode,
6701 				        cum->sse_regno + FIRST_SSE_REG);
6702 	}
6703       break;
6704 
6705     case OImode:
6706       /* OImode shouldn't be used directly.  */
6707       gcc_unreachable ();
6708 
6709     case V8SFmode:
6710     case V8SImode:
6711     case V32QImode:
6712     case V16HImode:
6713     case V4DFmode:
6714     case V4DImode:
6715       if (!type || !AGGREGATE_TYPE_P (type))
6716 	{
6717 	  if (cum->sse_nregs)
6718 	    return gen_reg_or_parallel (mode, orig_mode,
6719 				        cum->sse_regno + FIRST_SSE_REG);
6720 	}
6721       break;
6722 
6723     case V8QImode:
6724     case V4HImode:
6725     case V2SImode:
6726     case V2SFmode:
6727     case V1TImode:
6728     case V1DImode:
6729       if (!type || !AGGREGATE_TYPE_P (type))
6730 	{
6731 	  if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6732 	    {
6733 	      warnedmmx = true;
6734 	      warning (0, "MMX vector argument without MMX enabled "
6735 		       "changes the ABI");
6736 	    }
6737 	  if (cum->mmx_nregs)
6738 	    return gen_reg_or_parallel (mode, orig_mode,
6739 				        cum->mmx_regno + FIRST_MMX_REG);
6740 	}
6741       break;
6742     }
6743 
6744   return NULL_RTX;
6745 }
6746 
6747 static rtx
6748 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6749 		 enum machine_mode orig_mode, const_tree type, bool named)
6750 {
6751   /* Handle a hidden AL argument containing number of registers
6752      for varargs x86-64 functions.  */
6753   if (mode == VOIDmode)
6754     return GEN_INT (cum->maybe_vaarg
6755 		    ? (cum->sse_nregs < 0
6756 		       ? X86_64_SSE_REGPARM_MAX
6757 		       : cum->sse_regno)
6758 		    : -1);
6759 
6760   switch (mode)
6761     {
6762     default:
6763       break;
6764 
6765     case V8SFmode:
6766     case V8SImode:
6767     case V32QImode:
6768     case V16HImode:
6769     case V4DFmode:
6770     case V4DImode:
6771       /* Unnamed 256bit vector mode parameters are passed on the stack.  */
6772       if (!named)
6773 	return NULL;
6774       break;
6775     }
6776 
6777   return construct_container (mode, orig_mode, type, 0, cum->nregs,
6778 			      cum->sse_nregs,
6779 			      &x86_64_int_parameter_registers [cum->regno],
6780 			      cum->sse_regno);
6781 }
6782 
6783 static rtx
6784 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6785 		    enum machine_mode orig_mode, bool named,
6786 		    HOST_WIDE_INT bytes)
6787 {
6788   unsigned int regno;
6789 
6790   /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6791      We use the value -2 to specify that the current call is MSABI.  */
6792   if (mode == VOIDmode)
6793     return GEN_INT (-2);
6794 
6795   /* If we've run out of registers, it goes on the stack.  */
6796   if (cum->nregs == 0)
6797     return NULL_RTX;
6798 
6799   regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6800 
6801   /* Only floating point modes are passed in anything but integer regs.  */
6802   if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6803     {
6804       if (named)
6805 	regno = cum->regno + FIRST_SSE_REG;
6806       else
6807 	{
6808 	  rtx t1, t2;
6809 
6810 	  /* Unnamed floating parameters are passed in both the
6811 	     SSE and integer registers.  */
6812 	  t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6813 	  t2 = gen_rtx_REG (mode, regno);
6814 	  t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6815 	  t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6816 	  return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6817 	}
6818     }
6819   /* Handle aggregate types passed in registers.  */
6820   if (orig_mode == BLKmode)
6821     {
6822       if (bytes > 0 && bytes <= 8)
6823         mode = (bytes > 4 ? DImode : SImode);
6824       if (mode == BLKmode)
6825         mode = DImode;
6826     }
6827 
6828   return gen_reg_or_parallel (mode, orig_mode, regno);
6829 }
6830 
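/* A small sketch of how the MS ABI handling above plays out, assuming
   the usual Win64 convention where each of the first four parameter
   slots is tied to a fixed register pair (RCX/XMM0, RDX/XMM1, R8/XMM2,
   R9/XMM3).  */
#if 0
void callee (int a, double b, void *c, float d);
/* a -> %rcx, b -> %xmm1, c -> %r8, d -> %xmm3; the register choice is
   positional, so using an integer slot also skips its XMM twin.
   For an unnamed (variadic) double, the value is passed in both the
   XMM register and the GPR of its slot, matching the PARALLEL built
   above.  */
#endif
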
6831 /* Return where to put the arguments to a function.
6832    Return zero to push the argument on the stack, or a hard register
6833    in which to store the argument.
6834    MODE is the argument's machine mode.  TYPE is the data type of the
6835    argument.  It is null for libcalls where that information may not be
6836    available.  CUM gives information about the preceding args and about
6837    the function being called.  NAMED is nonzero if this argument is a
6838    named parameter (otherwise it is an extra parameter matching an
6839    ellipsis).  */
6840 
6841 static rtx
6842 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6843 		   const_tree type, bool named)
6844 {
6845   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6846   enum machine_mode mode = omode;
6847   HOST_WIDE_INT bytes, words;
6848   rtx arg;
6849 
6850   if (mode == BLKmode)
6851     bytes = int_size_in_bytes (type);
6852   else
6853     bytes = GET_MODE_SIZE (mode);
6854   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6855 
6856   /* To simplify the code below, represent vector types with a vector mode
6857      even if MMX/SSE are not active.  */
6858   if (type && TREE_CODE (type) == VECTOR_TYPE)
6859     mode = type_natural_mode (type, cum);
6860 
6861   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6862     arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6863   else if (TARGET_64BIT)
6864     arg = function_arg_64 (cum, mode, omode, type, named);
6865   else
6866     arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6867 
6868   if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6869     {
6870       /* This argument uses 256bit AVX modes.  */
6871       if (cum->caller)
6872 	cum->callee_pass_avx256_p = true;
6873       else
6874 	cfun->machine->caller_pass_avx256_p = true;
6875     }
6876 
6877   if (cum->caller && mode == VOIDmode)
6878     {
6879       /* This function is called with MODE == VOIDmode immediately
6880 	 before the call instruction is emitted.  We copy callee 256bit
6881 	 AVX info from the current CUM here.  */
6882       cfun->machine->callee_return_avx256_p = cum->callee_return_avx256_p;
6883       cfun->machine->callee_pass_avx256_p = cum->callee_pass_avx256_p;
6884     }
6885 
6886   return arg;
6887 }
6888 
6889 /* A C expression that indicates when an argument must be passed by
6890    reference.  If nonzero for an argument, a copy of that argument is
6891    made in memory and a pointer to the argument is passed instead of
6892    the argument itself.  The pointer is passed in whatever way is
6893    appropriate for passing a pointer to that type.  */
6894 
6895 static bool
6896 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6897 			enum machine_mode mode ATTRIBUTE_UNUSED,
6898 			const_tree type, bool named ATTRIBUTE_UNUSED)
6899 {
6900   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6901 
6902   /* See Windows x64 Software Convention.  */
6903   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6904     {
6905       int msize = (int) GET_MODE_SIZE (mode);
6906       if (type)
6907 	{
6908 	  /* Arrays are passed by reference.  */
6909 	  if (TREE_CODE (type) == ARRAY_TYPE)
6910 	    return true;
6911 
6912 	  if (AGGREGATE_TYPE_P (type))
6913 	    {
6914 	      /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6915 	         are passed by reference.  */
6916 	      msize = int_size_in_bytes (type);
6917 	    }
6918 	}
6919 
6920       /* __m128 is passed by reference.  */
6921       switch (msize) {
6922       case 1: case 2: case 4: case 8:
6923         break;
6924       default:
6925         return true;
6926       }
6927     }
6928   else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6929     return 1;
6930 
6931   return 0;
6932 }
6933 
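/* Examples of the Win64 by-reference rule above (a sketch, assuming the
   MS_ABI path is taken): only objects of size 1, 2, 4 or 8 bytes travel
   by value; arrays and everything else get a hidden copy in memory and a
   pointer is passed instead.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));

struct s3  { char c[3]; };  /* size 3  -> passed by reference */
struct s8  { long l; };     /* size 8  -> passed by value     */
struct s16 { long a, b; };  /* size 16 -> passed by reference */
void f (struct s3 x, struct s8 y, struct s16 z, v4sf v);
/* x, z and v are passed by reference; y is passed by value.  */
#endif
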
6934 /* Return true when TYPE should be 128bit aligned for 32bit argument
6935    passing ABI.  XXX: This function is obsolete and is only used for
6936    checking psABI compatibility with previous versions of GCC.  */
6937 
6938 static bool
6939 ix86_compat_aligned_value_p (const_tree type)
6940 {
6941   enum machine_mode mode = TYPE_MODE (type);
6942   if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6943        || mode == TDmode
6944        || mode == TFmode
6945        || mode == TCmode)
6946       && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6947     return true;
6948   if (TYPE_ALIGN (type) < 128)
6949     return false;
6950 
6951   if (AGGREGATE_TYPE_P (type))
6952     {
6953       /* Walk the aggregates recursively.  */
6954       switch (TREE_CODE (type))
6955 	{
6956 	case RECORD_TYPE:
6957 	case UNION_TYPE:
6958 	case QUAL_UNION_TYPE:
6959 	  {
6960 	    tree field;
6961 
6962 	    /* Walk all the structure fields.  */
6963 	    for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6964 	      {
6965 		if (TREE_CODE (field) == FIELD_DECL
6966 		    && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6967 		  return true;
6968 	      }
6969 	    break;
6970 	  }
6971 
6972 	case ARRAY_TYPE:
6973 	  /* Just for use if some languages pass arrays by value.  */
6974 	  if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6975 	    return true;
6976 	  break;
6977 
6978 	default:
6979 	  gcc_unreachable ();
6980 	}
6981     }
6982   return false;
6983 }
6984 
6985 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6986    XXX: This function is obsolete and is only used for checking psABI
6987    compatibility with previous versions of GCC.  */
6988 
6989 static unsigned int
6990 ix86_compat_function_arg_boundary (enum machine_mode mode,
6991 				   const_tree type, unsigned int align)
6992 {
6993   /* In 32bit, only _Decimal128 and __float128 are aligned to their
6994      natural boundaries.  */
6995   if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6996     {
6997       /* i386 ABI defines all arguments to be 4 byte aligned.  We have to
6998 	 make an exception for SSE modes since these require 128bit
6999 	 alignment.
7000 
7001 	 The handling here differs from field_alignment.  ICC aligns MMX
7002 	 arguments to 4 byte boundaries, while structure fields are aligned
7003 	 to 8 byte boundaries.  */
7004       if (!type)
7005 	{
7006 	  if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7007 	    align = PARM_BOUNDARY;
7008 	}
7009       else
7010 	{
7011 	  if (!ix86_compat_aligned_value_p (type))
7012 	    align = PARM_BOUNDARY;
7013 	}
7014     }
7015   if (align > BIGGEST_ALIGNMENT)
7016     align = BIGGEST_ALIGNMENT;
7017   return align;
7018 }
7019 
7020 /* Return true when TYPE should be 128bit aligned for 32bit argument
7021    passing ABI.  */
7022 
7023 static bool
7024 ix86_contains_aligned_value_p (const_tree type)
7025 {
7026   enum machine_mode mode = TYPE_MODE (type);
7027 
7028   if (mode == XFmode || mode == XCmode)
7029     return false;
7030 
7031   if (TYPE_ALIGN (type) < 128)
7032     return false;
7033 
7034   if (AGGREGATE_TYPE_P (type))
7035     {
7036       /* Walk the aggregates recursively.  */
7037       switch (TREE_CODE (type))
7038 	{
7039 	case RECORD_TYPE:
7040 	case UNION_TYPE:
7041 	case QUAL_UNION_TYPE:
7042 	  {
7043 	    tree field;
7044 
7045 	    /* Walk all the structure fields.  */
7046 	    for (field = TYPE_FIELDS (type);
7047 		 field;
7048 		 field = DECL_CHAIN (field))
7049 	      {
7050 		if (TREE_CODE (field) == FIELD_DECL
7051 		    && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7052 		  return true;
7053 	      }
7054 	    break;
7055 	  }
7056 
7057 	case ARRAY_TYPE:
7058 	  /* Just for use if some languages pass arrays by value.  */
7059 	  if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7060 	    return true;
7061 	  break;
7062 
7063 	default:
7064 	  gcc_unreachable ();
7065 	}
7066     }
7067   else
7068     return TYPE_ALIGN (type) >= 128;
7069 
7070   return false;
7071 }
7072 
7073 /* Gives the alignment boundary, in bits, of an argument with the
7074    specified mode and type.  */
7075 
7076 static unsigned int
7077 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7078 {
7079   unsigned int align;
7080   if (type)
7081     {
7082       /* Since the main variant type is used for the call, convert TYPE
7083 	 to its main variant.  */
7084       type = TYPE_MAIN_VARIANT (type);
7085       align = TYPE_ALIGN (type);
7086     }
7087   else
7088     align = GET_MODE_ALIGNMENT (mode);
7089   if (align < PARM_BOUNDARY)
7090     align = PARM_BOUNDARY;
7091   else
7092     {
7093       static bool warned;
7094       unsigned int saved_align = align;
7095 
7096       if (!TARGET_64BIT)
7097 	{
7098 	  /* i386 ABI defines XFmode arguments to be 4 byte aligned.  */
7099 	  if (!type)
7100 	    {
7101 	      if (mode == XFmode || mode == XCmode)
7102 		align = PARM_BOUNDARY;
7103 	    }
7104 	  else if (!ix86_contains_aligned_value_p (type))
7105 	    align = PARM_BOUNDARY;
7106 
7107 	  if (align < 128)
7108 	    align = PARM_BOUNDARY;
7109 	}
7110 
7111       if (warn_psabi
7112 	  && !warned
7113 	  && align != ix86_compat_function_arg_boundary (mode, type,
7114 							 saved_align))
7115 	{
7116 	  warned = true;
7117 	  inform (input_location,
7118 		  "The ABI for passing parameters with %d-byte"
7119 		  " alignment has changed in GCC 4.6",
7120 		  align / BITS_PER_UNIT);
7121 	}
7122     }
7123 
7124   return align;
7125 }
7126 
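/* Rough examples of the boundaries computed above for the 32-bit ABI
   (a sketch, assuming default options): ordinary scalars and long double
   stay on 4-byte slots, while types that are, or contain, a 128-bit
   aligned value keep 16-byte alignment.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));

void f (long long x);   /* 32-bit boundary (PARM_BOUNDARY)        */
void g (long double y); /* 32-bit boundary (XFmode exception)     */
void h (v4sf v);        /* 128-bit boundary, kept for SSE vectors */
#endif
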
7127 /* Return true if N is a possible register number of function value.  */
7128 
7129 static bool
7130 ix86_function_value_regno_p (const unsigned int regno)
7131 {
7132   switch (regno)
7133     {
7134     case AX_REG:
7135     case DX_REG:
7136       return true;
7137     case DI_REG:
7138     case SI_REG:
7139       return TARGET_64BIT && ix86_abi != MS_ABI;
7140 
7141       /* Complex values are returned in %st(0)/%st(1) pair.  */
7142     case ST0_REG:
7143     case ST1_REG:
7144       /* TODO: The function should depend on current function ABI but
7145        builtins.c would need updating then. Therefore we use the
7146        default ABI.  */
7147       if (TARGET_64BIT && ix86_abi == MS_ABI)
7148 	return false;
7149       return TARGET_FLOAT_RETURNS_IN_80387;
7150 
7151       /* Complex values are returned in %xmm0/%xmm1 pair.  */
7152     case XMM0_REG:
7153     case XMM1_REG:
7154       return TARGET_SSE;
7155 
7156     case MM0_REG:
7157       if (TARGET_MACHO || TARGET_64BIT)
7158 	return false;
7159       return TARGET_MMX;
7160     }
7161 
7162   return false;
7163 }
7164 
7165 /* Define how to find the value returned by a function.
7166    VALTYPE is the data type of the value (as a tree).
7167    If the precise function being called is known, FUNC is its FUNCTION_DECL;
7168    otherwise, FUNC is 0.  */
7169 
7170 static rtx
7171 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7172 		   const_tree fntype, const_tree fn)
7173 {
7174   unsigned int regno;
7175 
7176   /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7177      we normally prevent this case when mmx is not available.  However
7178      some ABIs may require the result to be returned like DImode.  */
7179   if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7180     regno = FIRST_MMX_REG;
7181 
7182   /* 16-byte vector modes in %xmm0.  See ix86_return_in_memory for where
7183      we prevent this case when sse is not available.  However some ABIs
7184      may require the result to be returned like integer TImode.  */
7185   else if (mode == TImode
7186 	   || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7187     regno = FIRST_SSE_REG;
7188 
7189   /* 32-byte vector modes in %ymm0.   */
7190   else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7191     regno = FIRST_SSE_REG;
7192 
7193   /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387).  */
7194   else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7195     regno = FIRST_FLOAT_REG;
7196   else
7197     /* Most things go in %eax.  */
7198     regno = AX_REG;
7199 
7200   /* Override FP return register with %xmm0 for local functions when
7201      SSE math is enabled or for functions with sseregparm attribute.  */
7202   if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7203     {
7204       int sse_level = ix86_function_sseregparm (fntype, fn, false);
7205       if ((sse_level >= 1 && mode == SFmode)
7206 	  || (sse_level == 2 && mode == DFmode))
7207 	regno = FIRST_SSE_REG;
7208     }
7209 
7210   /* OImode shouldn't be used directly.  */
7211   gcc_assert (mode != OImode);
7212 
7213   return gen_rtx_REG (orig_mode, regno);
7214 }
7215 
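/* A brief sketch of the 32-bit return registers chosen above, assuming
   default options (387 returns enabled, SSE available) and no
   sseregparm or local-function override.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));

int f_int (void);     /* returned in %eax    */
double f_dbl (void);  /* returned in %st(0)  */
v4sf f_vec (void);    /* returned in %xmm0   */
#endif
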
7216 static rtx
7217 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7218 		   const_tree valtype)
7219 {
7220   rtx ret;
7221 
7222   /* Handle libcalls, which don't provide a type node.  */
7223   if (valtype == NULL)
7224     {
7225       unsigned int regno;
7226 
7227       switch (mode)
7228 	{
7229 	case SFmode:
7230 	case SCmode:
7231 	case DFmode:
7232 	case DCmode:
7233 	case TFmode:
7234 	case SDmode:
7235 	case DDmode:
7236 	case TDmode:
7237 	  regno = FIRST_SSE_REG;
7238 	  break;
7239 	case XFmode:
7240 	case XCmode:
7241 	  regno = FIRST_FLOAT_REG;
7242 	  break;
7243 	case TCmode:
7244 	  return NULL;
7245 	default:
7246 	  regno = AX_REG;
7247 	}
7248 
7249       return gen_rtx_REG (mode, regno);
7250     }
7251   else if (POINTER_TYPE_P (valtype))
7252     {
7253       /* Pointers are always returned in Pmode. */
7254       mode = Pmode;
7255     }
7256 
7257   ret = construct_container (mode, orig_mode, valtype, 1,
7258 			     X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7259 			     x86_64_int_return_registers, 0);
7260 
7261   /* For zero sized structures, construct_container returns NULL, but we
7262      need to keep the rest of the compiler happy by returning a meaningful value.  */
7263   if (!ret)
7264     ret = gen_rtx_REG (orig_mode, AX_REG);
7265 
7266   return ret;
7267 }
7268 
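/* A few examples of the 64-bit SysV return locations produced above
   (a sketch, assuming the defaults).  */
#if 0
int ret_int (void);          /* %rax            */
double ret_dbl (void);       /* %xmm0           */
long double ret_ld (void);   /* %st(0)          */
__int128 ret_i128 (void);    /* %rax:%rdx pair  */
struct dl { double d; long l; };
struct dl ret_dl (void);     /* %xmm0 and %rax  */
#endif
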
7269 static rtx
7270 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7271 {
7272   unsigned int regno = AX_REG;
7273 
7274   if (TARGET_SSE)
7275     {
7276       switch (GET_MODE_SIZE (mode))
7277         {
7278         case 16:
7279           if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7280 	     && !COMPLEX_MODE_P (mode))
7281 	    regno = FIRST_SSE_REG;
7282 	  break;
7283 	case 8:
7284 	case 4:
7285 	  if (mode == SFmode || mode == DFmode)
7286 	    regno = FIRST_SSE_REG;
7287 	  break;
7288 	default:
7289 	  break;
7290         }
7291     }
7292   return gen_rtx_REG (orig_mode, regno);
7293 }
7294 
7295 static rtx
7296 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7297 		       enum machine_mode orig_mode, enum machine_mode mode)
7298 {
7299   const_tree fn, fntype;
7300 
7301   fn = NULL_TREE;
7302   if (fntype_or_decl && DECL_P (fntype_or_decl))
7303     fn = fntype_or_decl;
7304   fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7305 
7306   if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7307     return function_value_ms_64 (orig_mode, mode);
7308   else if (TARGET_64BIT)
7309     return function_value_64 (orig_mode, mode, valtype);
7310   else
7311     return function_value_32 (orig_mode, mode, fntype, fn);
7312 }
7313 
7314 static rtx
7315 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7316 		     bool outgoing ATTRIBUTE_UNUSED)
7317 {
7318   enum machine_mode mode, orig_mode;
7319 
7320   orig_mode = TYPE_MODE (valtype);
7321   mode = type_natural_mode (valtype, NULL);
7322   return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7323 }
7324 
7325 /* Pointer function arguments and return values are promoted to Pmode.  */
7326 
7327 static enum machine_mode
7328 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7329 			    int *punsignedp, const_tree fntype,
7330 			    int for_return)
7331 {
7332   if (type != NULL_TREE && POINTER_TYPE_P (type))
7333     {
7334       *punsignedp = POINTERS_EXTEND_UNSIGNED;
7335       return Pmode;
7336     }
7337   return default_promote_function_mode (type, mode, punsignedp, fntype,
7338 					for_return);
7339 }
7340 
7341 rtx
7342 ix86_libcall_value (enum machine_mode mode)
7343 {
7344   return ix86_function_value_1 (NULL, NULL, mode, mode);
7345 }
7346 
7347 /* Return true iff type is returned in memory.  */
7348 
7349 static bool ATTRIBUTE_UNUSED
7350 return_in_memory_32 (const_tree type, enum machine_mode mode)
7351 {
7352   HOST_WIDE_INT size;
7353 
7354   if (mode == BLKmode)
7355     return true;
7356 
7357   size = int_size_in_bytes (type);
7358 
7359   if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7360     return false;
7361 
7362   if (VECTOR_MODE_P (mode) || mode == TImode)
7363     {
7364       /* User-created vectors small enough to fit in EAX.  */
7365       if (size < 8)
7366 	return false;
7367 
7368       /* MMX/3dNow values are returned in MM0,
7369 	 except when it doesn't exist or the ABI prescribes otherwise.  */
7370       if (size == 8)
7371 	return !TARGET_MMX || TARGET_VECT8_RETURNS;
7372 
7373       /* SSE values are returned in XMM0, except when it doesn't exist.  */
7374       if (size == 16)
7375 	return !TARGET_SSE;
7376 
7377       /* AVX values are returned in YMM0, except when it doesn't exist.  */
7378       if (size == 32)
7379 	return !TARGET_AVX;
7380     }
7381 
7382   if (mode == XFmode)
7383     return false;
7384 
7385   if (size > 12)
7386     return true;
7387 
7388   /* OImode shouldn't be used directly.  */
7389   gcc_assert (mode != OImode);
7390 
7391   return false;
7392 }
7393 
7394 static bool ATTRIBUTE_UNUSED
7395 return_in_memory_64 (const_tree type, enum machine_mode mode)
7396 {
7397   int needed_intregs, needed_sseregs;
7398   return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7399 }
7400 
7401 static bool ATTRIBUTE_UNUSED
7402 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7403 {
7404   HOST_WIDE_INT size = int_size_in_bytes (type);
7405 
7406   /* __m128 is returned in xmm0.  */
7407   if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7408       && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7409     return false;
7410 
7411   /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
7412   return size != 1 && size != 2 && size != 4 && size != 8;
7413 }
7414 
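/* Examples for the Win64 return rule above (a sketch, assuming MS_ABI):
   16-byte vectors come back in %xmm0, sizes 1, 2, 4 and 8 come back in
   the AX register, and anything else is returned through a hidden
   pointer.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));

struct s4  { int i; };       /* 4 bytes  -> returned in %eax             */
struct s12 { int a, b, c; }; /* 12 bytes -> returned via hidden pointer  */
v4sf ret_vec (void);         /* 16 bytes -> returned in %xmm0            */
#endif
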
7415 static bool
7416 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7417 {
7418 #ifdef SUBTARGET_RETURN_IN_MEMORY
7419   return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7420 #else
7421   const enum machine_mode mode = type_natural_mode (type, NULL);
7422 
7423   if (TARGET_64BIT)
7424     {
7425       if (ix86_function_type_abi (fntype) == MS_ABI)
7426 	return return_in_memory_ms_64 (type, mode);
7427       else
7428 	return return_in_memory_64 (type, mode);
7429     }
7430   else
7431     return return_in_memory_32 (type, mode);
7432 #endif
7433 }
7434 
7435 /* When returning SSE vector types, we have a choice of either
7436      (1) being abi incompatible with a -march switch, or
7437      (2) generating an error.
7438    Given no good solution, I think the safest thing is one warning.
7439    The user won't be able to use -Werror, but....
7440 
7441    Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7442    called in response to actually generating a caller or callee that
7443    uses such a type.  As opposed to TARGET_RETURN_IN_MEMORY, which is called
7444    via aggregate_value_p for general type probing from tree-ssa.  */
7445 
7446 static rtx
7447 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7448 {
7449   static bool warnedsse, warnedmmx;
7450 
7451   if (!TARGET_64BIT && type)
7452     {
7453       /* Look at the return type of the function, not the function type.  */
7454       enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7455 
7456       if (!TARGET_SSE && !warnedsse)
7457 	{
7458 	  if (mode == TImode
7459 	      || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7460 	    {
7461 	      warnedsse = true;
7462 	      warning (0, "SSE vector return without SSE enabled "
7463 		       "changes the ABI");
7464 	    }
7465 	}
7466 
7467       if (!TARGET_MMX && !warnedmmx)
7468 	{
7469 	  if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7470 	    {
7471 	      warnedmmx = true;
7472 	      warning (0, "MMX vector return without MMX enabled "
7473 		       "changes the ABI");
7474 	    }
7475 	}
7476     }
7477 
7478   return NULL;
7479 }
7480 
7481 
7482 /* Create the va_list data type.  */
7483 
7484 /* Returns the calling convention specific va_list data type.
7485    The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI.  */
7486 
7487 static tree
7488 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7489 {
7490   tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7491 
7492   /* For i386 we use a plain pointer to the argument area.  */
7493   if (!TARGET_64BIT || abi == MS_ABI)
7494     return build_pointer_type (char_type_node);
7495 
7496   record = lang_hooks.types.make_type (RECORD_TYPE);
7497   type_decl = build_decl (BUILTINS_LOCATION,
7498 			  TYPE_DECL, get_identifier ("__va_list_tag"), record);
7499 
7500   f_gpr = build_decl (BUILTINS_LOCATION,
7501 		      FIELD_DECL, get_identifier ("gp_offset"),
7502 		      unsigned_type_node);
7503   f_fpr = build_decl (BUILTINS_LOCATION,
7504 		      FIELD_DECL, get_identifier ("fp_offset"),
7505 		      unsigned_type_node);
7506   f_ovf = build_decl (BUILTINS_LOCATION,
7507 		      FIELD_DECL, get_identifier ("overflow_arg_area"),
7508 		      ptr_type_node);
7509   f_sav = build_decl (BUILTINS_LOCATION,
7510 		      FIELD_DECL, get_identifier ("reg_save_area"),
7511 		      ptr_type_node);
7512 
7513   va_list_gpr_counter_field = f_gpr;
7514   va_list_fpr_counter_field = f_fpr;
7515 
7516   DECL_FIELD_CONTEXT (f_gpr) = record;
7517   DECL_FIELD_CONTEXT (f_fpr) = record;
7518   DECL_FIELD_CONTEXT (f_ovf) = record;
7519   DECL_FIELD_CONTEXT (f_sav) = record;
7520 
7521   TYPE_STUB_DECL (record) = type_decl;
7522   TYPE_NAME (record) = type_decl;
7523   TYPE_FIELDS (record) = f_gpr;
7524   DECL_CHAIN (f_gpr) = f_fpr;
7525   DECL_CHAIN (f_fpr) = f_ovf;
7526   DECL_CHAIN (f_ovf) = f_sav;
7527 
7528   layout_type (record);
7529 
7530   /* The correct type is an array type of one element.  */
7531   return build_array_type (record, build_index_type (size_zero_node));
7532 }
7533 
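/* The record built above corresponds, roughly, to the user-visible SysV
   x86-64 va_list; a sketch of the equivalent C declaration (field names
   match the FIELD_DECLs created above, the typedef names are made up).  */
#if 0
typedef struct sysv_va_list_tag
{
  unsigned int gp_offset;    /* byte offset of the next GPR slot in
                                reg_save_area                         */
  unsigned int fp_offset;    /* byte offset of the next SSE slot in
                                reg_save_area                         */
  void *overflow_arg_area;   /* next argument passed on the stack     */
  void *reg_save_area;       /* base of the register save area        */
} sysv_va_list_tag;

typedef sysv_va_list_tag sysv_va_list[1];  /* array of one element, as
                                              built above             */
#endif
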
7534 /* Set up the builtin va_list data type and, for 64-bit, the additional
7535    calling convention specific va_list data types.  */
7536 
7537 static tree
7538 ix86_build_builtin_va_list (void)
7539 {
7540   tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7541 
7542   /* Initialize abi specific va_list builtin types.  */
7543   if (TARGET_64BIT)
7544     {
7545       tree t;
7546       if (ix86_abi == MS_ABI)
7547         {
7548           t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7549           if (TREE_CODE (t) != RECORD_TYPE)
7550             t = build_variant_type_copy (t);
7551           sysv_va_list_type_node = t;
7552         }
7553       else
7554         {
7555           t = ret;
7556           if (TREE_CODE (t) != RECORD_TYPE)
7557             t = build_variant_type_copy (t);
7558           sysv_va_list_type_node = t;
7559         }
7560       if (ix86_abi != MS_ABI)
7561         {
7562           t = ix86_build_builtin_va_list_abi (MS_ABI);
7563           if (TREE_CODE (t) != RECORD_TYPE)
7564             t = build_variant_type_copy (t);
7565           ms_va_list_type_node = t;
7566         }
7567       else
7568         {
7569           t = ret;
7570           if (TREE_CODE (t) != RECORD_TYPE)
7571             t = build_variant_type_copy (t);
7572           ms_va_list_type_node = t;
7573         }
7574     }
7575 
7576   return ret;
7577 }
7578 
7579 /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
7580 
7581 static void
7582 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7583 {
7584   rtx save_area, mem;
7585   alias_set_type set;
7586   int i, max;
7587 
7588   /* GPR size of varargs save area.  */
7589   if (cfun->va_list_gpr_size)
7590     ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7591   else
7592     ix86_varargs_gpr_size = 0;
7593 
7594   /* FPR size of varargs save area.  We don't need it if we don't pass
7595      anything in SSE registers.  */
7596   if (TARGET_SSE && cfun->va_list_fpr_size)
7597     ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7598   else
7599     ix86_varargs_fpr_size = 0;
7600 
7601   if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7602     return;
7603 
7604   save_area = frame_pointer_rtx;
7605   set = get_varargs_alias_set ();
7606 
7607   max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7608   if (max > X86_64_REGPARM_MAX)
7609     max = X86_64_REGPARM_MAX;
7610 
7611   for (i = cum->regno; i < max; i++)
7612     {
7613       mem = gen_rtx_MEM (Pmode,
7614 			 plus_constant (save_area, i * UNITS_PER_WORD));
7615       MEM_NOTRAP_P (mem) = 1;
7616       set_mem_alias_set (mem, set);
7617       emit_move_insn (mem, gen_rtx_REG (Pmode,
7618 					x86_64_int_parameter_registers[i]));
7619     }
7620 
7621   if (ix86_varargs_fpr_size)
7622     {
7623       enum machine_mode smode;
7624       rtx label, test;
7625 
7626       /* Now emit code to save SSE registers.  The AX parameter contains number
7627 	 of SSE parameter registers used to call this function, though all we
7628 	 actually check here is the zero/non-zero status.  */
7629 
7630       label = gen_label_rtx ();
7631       test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7632       emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7633 				      label));
7634 
7635       /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7636 	 we used movdqa (i.e. TImode) instead?  Perhaps even better would
7637 	 be if we could determine the real mode of the data, via a hook
7638 	 into pass_stdarg.  Ignore all that for now.  */
7639       smode = V4SFmode;
7640       if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7641 	crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7642 
7643       max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7644       if (max > X86_64_SSE_REGPARM_MAX)
7645 	max = X86_64_SSE_REGPARM_MAX;
7646 
7647       for (i = cum->sse_regno; i < max; ++i)
7648 	{
7649 	  mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7650 	  mem = gen_rtx_MEM (smode, mem);
7651 	  MEM_NOTRAP_P (mem) = 1;
7652 	  set_mem_alias_set (mem, set);
7653 	  set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7654 
7655 	  emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7656 	}
7657 
7658       emit_label (label);
7659     }
7660 }
7661 
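/* A sketch of the register save area that the code above fills in,
   assuming the whole area is needed and the usual register counts
   (X86_64_REGPARM_MAX == 6 integer and X86_64_SSE_REGPARM_MAX == 8 SSE
   registers); the struct below only mirrors the layout.  */
#if 0
struct reg_save_area_sketch
{
  unsigned long gp[6];      /* %rdi %rsi %rdx %rcx %r8 %r9, offsets 0-47  */
  unsigned char xmm[8][16]; /* %xmm0-%xmm7, offsets 48-175, 16-byte slots */
};
/* gp_offset and fp_offset in the va_list index into this block, which is
   why ix86_va_start below initializes fp_offset to
   n_fpr * 16 + 8 * X86_64_REGPARM_MAX.  */
#endif
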
7662 static void
7663 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7664 {
7665   alias_set_type set = get_varargs_alias_set ();
7666   int i;
7667 
7668   /* Reset to zero, as a sysv va_arg might have been used
7669      before.  */
7670   ix86_varargs_gpr_size = 0;
7671   ix86_varargs_fpr_size = 0;
7672 
7673   for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7674     {
7675       rtx reg, mem;
7676 
7677       mem = gen_rtx_MEM (Pmode,
7678 			 plus_constant (virtual_incoming_args_rtx,
7679 					i * UNITS_PER_WORD));
7680       MEM_NOTRAP_P (mem) = 1;
7681       set_mem_alias_set (mem, set);
7682 
7683       reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7684       emit_move_insn (mem, reg);
7685     }
7686 }
7687 
7688 static void
7689 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7690 			     tree type, int *pretend_size ATTRIBUTE_UNUSED,
7691 			     int no_rtl)
7692 {
7693   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7694   CUMULATIVE_ARGS next_cum;
7695   tree fntype;
7696 
7697   /* This argument doesn't appear to be used anymore, which is good,
7698      because the old code here didn't suppress rtl generation.  */
7699   gcc_assert (!no_rtl);
7700 
7701   if (!TARGET_64BIT)
7702     return;
7703 
7704   fntype = TREE_TYPE (current_function_decl);
7705 
7706   /* For varargs, we do not want to skip the dummy va_dcl argument.
7707      For stdargs, we do want to skip the last named argument.  */
7708   next_cum = *cum;
7709   if (stdarg_p (fntype))
7710     ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7711 			       true);
7712 
7713   if (cum->call_abi == MS_ABI)
7714     setup_incoming_varargs_ms_64 (&next_cum);
7715   else
7716     setup_incoming_varargs_64 (&next_cum);
7717 }
7718 
7719 /* Checks if TYPE is of kind va_list char *.  */
7720 
7721 static bool
7722 is_va_list_char_pointer (tree type)
7723 {
7724   tree canonic;
7725 
7726   /* For 32-bit it is always true.  */
7727   if (!TARGET_64BIT)
7728     return true;
7729   canonic = ix86_canonical_va_list_type (type);
7730   return (canonic == ms_va_list_type_node
7731           || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7732 }
7733 
7734 /* Implement va_start.  */
7735 
7736 static void
7737 ix86_va_start (tree valist, rtx nextarg)
7738 {
7739   HOST_WIDE_INT words, n_gpr, n_fpr;
7740   tree f_gpr, f_fpr, f_ovf, f_sav;
7741   tree gpr, fpr, ovf, sav, t;
7742   tree type;
7743   rtx ovf_rtx;
7744 
7745   if (flag_split_stack
7746       && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7747     {
7748       unsigned int scratch_regno;
7749 
7750       /* When we are splitting the stack, we can't refer to the stack
7751 	 arguments using internal_arg_pointer, because they may be on
7752 	 the old stack.  The split stack prologue will arrange to
7753 	 leave a pointer to the old stack arguments in a scratch
7754 	 register, which we here copy to a pseudo-register.  The split
7755 	 stack prologue can't set the pseudo-register directly because
7756 	 it (the prologue) runs before any registers have been saved.  */
7757 
7758       scratch_regno = split_stack_prologue_scratch_regno ();
7759       if (scratch_regno != INVALID_REGNUM)
7760 	{
7761 	  rtx reg, seq;
7762 
7763 	  reg = gen_reg_rtx (Pmode);
7764 	  cfun->machine->split_stack_varargs_pointer = reg;
7765 
7766 	  start_sequence ();
7767 	  emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7768 	  seq = get_insns ();
7769 	  end_sequence ();
7770 
7771 	  push_topmost_sequence ();
7772 	  emit_insn_after (seq, entry_of_function ());
7773 	  pop_topmost_sequence ();
7774 	}
7775     }
7776 
  /* Only 64-bit targets need something special.  */
7778   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7779     {
7780       if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7781 	std_expand_builtin_va_start (valist, nextarg);
7782       else
7783 	{
7784 	  rtx va_r, next;
7785 
7786 	  va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7787 	  next = expand_binop (ptr_mode, add_optab,
7788 			       cfun->machine->split_stack_varargs_pointer,
7789 			       crtl->args.arg_offset_rtx,
7790 			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
7791 	  convert_move (va_r, next, 0);
7792 	}
7793       return;
7794     }
7795 
7796   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7797   f_fpr = DECL_CHAIN (f_gpr);
7798   f_ovf = DECL_CHAIN (f_fpr);
7799   f_sav = DECL_CHAIN (f_ovf);
7800 
7801   valist = build_simple_mem_ref (valist);
7802   TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7803   /* The following should be folded into the MEM_REF offset.  */
7804   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7805 		f_gpr, NULL_TREE);
7806   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7807 		f_fpr, NULL_TREE);
7808   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7809 		f_ovf, NULL_TREE);
7810   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7811 		f_sav, NULL_TREE);
7812 
7813   /* Count number of gp and fp argument registers used.  */
7814   words = crtl->args.info.words;
7815   n_gpr = crtl->args.info.regno;
7816   n_fpr = crtl->args.info.sse_regno;
7817 
7818   if (cfun->va_list_gpr_size)
7819     {
7820       type = TREE_TYPE (gpr);
7821       t = build2 (MODIFY_EXPR, type,
7822 		  gpr, build_int_cst (type, n_gpr * 8));
7823       TREE_SIDE_EFFECTS (t) = 1;
7824       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7825     }
7826 
7827   if (TARGET_SSE && cfun->va_list_fpr_size)
7828     {
7829       type = TREE_TYPE (fpr);
7830       t = build2 (MODIFY_EXPR, type, fpr,
7831 		  build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7832       TREE_SIDE_EFFECTS (t) = 1;
7833       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7834     }
7835 
7836   /* Find the overflow area.  */
7837   type = TREE_TYPE (ovf);
7838   if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7839     ovf_rtx = crtl->args.internal_arg_pointer;
7840   else
7841     ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7842   t = make_tree (type, ovf_rtx);
7843   if (words != 0)
7844     t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7845   t = build2 (MODIFY_EXPR, type, ovf, t);
7846   TREE_SIDE_EFFECTS (t) = 1;
7847   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7848 
7849   if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7850     {
7851       /* Find the register save area.
	 The function prologue saves it right above the stack frame.  */
7853       type = TREE_TYPE (sav);
7854       t = make_tree (type, frame_pointer_rtx);
7855       if (!ix86_varargs_gpr_size)
7856 	t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7857       t = build2 (MODIFY_EXPR, type, sav, t);
7858       TREE_SIDE_EFFECTS (t) = 1;
7859       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7860     }
7861 }
7862 
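/* A sketch of the register save area addressed through the SAV field above;
   the offsets match the n_gpr * 8 and n_fpr * 16 + 8 * X86_64_REGPARM_MAX
   computations in ix86_va_start:

	sav +   0 ... sav +  47   up to 6 integer registers, 8 bytes each
	sav +  48 ... sav + 175   up to 8 SSE registers, 16 bytes each

   gp_offset and fp_offset index into this block; once they run past the end
   of their sub-area, va_arg falls back to the overflow area on the stack.  */
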
7863 /* Implement va_arg.  */
7864 
7865 static tree
7866 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7867 		      gimple_seq *post_p)
7868 {
7869   static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7870   tree f_gpr, f_fpr, f_ovf, f_sav;
7871   tree gpr, fpr, ovf, sav, t;
7872   int size, rsize;
7873   tree lab_false, lab_over = NULL_TREE;
7874   tree addr, t2;
7875   rtx container;
7876   int indirect_p = 0;
7877   tree ptrtype;
7878   enum machine_mode nat_mode;
7879   unsigned int arg_boundary;
7880 
  /* Only 64-bit targets need something special.  */
7882   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7883     return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7884 
7885   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7886   f_fpr = DECL_CHAIN (f_gpr);
7887   f_ovf = DECL_CHAIN (f_fpr);
7888   f_sav = DECL_CHAIN (f_ovf);
7889 
7890   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7891 		build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7892   valist = build_va_arg_indirect_ref (valist);
7893   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7894   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7895   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7896 
7897   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7898   if (indirect_p)
7899     type = build_pointer_type (type);
7900   size = int_size_in_bytes (type);
7901   rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7902 
7903   nat_mode = type_natural_mode (type, NULL);
7904   switch (nat_mode)
7905     {
7906     case V8SFmode:
7907     case V8SImode:
7908     case V32QImode:
7909     case V16HImode:
7910     case V4DFmode:
7911     case V4DImode:
7912       /* Unnamed 256bit vector mode parameters are passed on stack.  */
7913       if (!TARGET_64BIT_MS_ABI)
7914 	{
7915 	  container = NULL;
7916 	  break;
7917 	}
7918 
7919     default:
7920       container = construct_container (nat_mode, TYPE_MODE (type),
7921 				       type, 0, X86_64_REGPARM_MAX,
7922 				       X86_64_SSE_REGPARM_MAX, intreg,
7923 				       0);
7924       break;
7925     }
7926 
7927   /* Pull the value out of the saved registers.  */
7928 
7929   addr = create_tmp_var (ptr_type_node, "addr");
7930 
7931   if (container)
7932     {
7933       int needed_intregs, needed_sseregs;
7934       bool need_temp;
7935       tree int_addr, sse_addr;
7936 
7937       lab_false = create_artificial_label (UNKNOWN_LOCATION);
7938       lab_over = create_artificial_label (UNKNOWN_LOCATION);
7939 
7940       examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7941 
7942       need_temp = (!REG_P (container)
7943 		   && ((needed_intregs && TYPE_ALIGN (type) > 64)
7944 		       || TYPE_ALIGN (type) > 128));
7945 
      /* If we are passing a structure, verify that it occupies a consecutive
         block of the register save area.  If not, we need to do moves.  */
7948       if (!need_temp && !REG_P (container))
7949 	{
	  /* Verify that all registers are strictly consecutive.  */
7951 	  if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7952 	    {
7953 	      int i;
7954 
7955 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7956 		{
7957 		  rtx slot = XVECEXP (container, 0, i);
7958 		  if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7959 		      || INTVAL (XEXP (slot, 1)) != i * 16)
7960 		    need_temp = 1;
7961 		}
7962 	    }
7963 	  else
7964 	    {
7965 	      int i;
7966 
7967 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7968 		{
7969 		  rtx slot = XVECEXP (container, 0, i);
7970 		  if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7971 		      || INTVAL (XEXP (slot, 1)) != i * 8)
7972 		    need_temp = 1;
7973 		}
7974 	    }
7975 	}
7976       if (!need_temp)
7977 	{
7978 	  int_addr = addr;
7979 	  sse_addr = addr;
7980 	}
7981       else
7982 	{
7983 	  int_addr = create_tmp_var (ptr_type_node, "int_addr");
7984 	  sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7985 	}
7986 
7987       /* First ensure that we fit completely in registers.  */
7988       if (needed_intregs)
7989 	{
7990 	  t = build_int_cst (TREE_TYPE (gpr),
7991 			     (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7992 	  t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7993 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7994 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7995 	  gimplify_and_add (t, pre_p);
7996 	}
7997       if (needed_sseregs)
7998 	{
7999 	  t = build_int_cst (TREE_TYPE (fpr),
8000 			     (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8001 			     + X86_64_REGPARM_MAX * 8);
8002 	  t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8003 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8004 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8005 	  gimplify_and_add (t, pre_p);
8006 	}
8007 
8008       /* Compute index to start of area used for integer regs.  */
8009       if (needed_intregs)
8010 	{
8011 	  /* int_addr = gpr + sav; */
8012 	  t = fold_build_pointer_plus (sav, gpr);
8013 	  gimplify_assign (int_addr, t, pre_p);
8014 	}
8015       if (needed_sseregs)
8016 	{
8017 	  /* sse_addr = fpr + sav; */
8018 	  t = fold_build_pointer_plus (sav, fpr);
8019 	  gimplify_assign (sse_addr, t, pre_p);
8020 	}
8021       if (need_temp)
8022 	{
8023 	  int i, prev_size = 0;
8024 	  tree temp = create_tmp_var (type, "va_arg_tmp");
8025 
8026 	  /* addr = &temp; */
8027 	  t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8028 	  gimplify_assign (addr, t, pre_p);
8029 
8030 	  for (i = 0; i < XVECLEN (container, 0); i++)
8031 	    {
8032 	      rtx slot = XVECEXP (container, 0, i);
8033 	      rtx reg = XEXP (slot, 0);
8034 	      enum machine_mode mode = GET_MODE (reg);
8035 	      tree piece_type;
8036 	      tree addr_type;
8037 	      tree daddr_type;
8038 	      tree src_addr, src;
8039 	      int src_offset;
8040 	      tree dest_addr, dest;
8041 	      int cur_size = GET_MODE_SIZE (mode);
8042 
8043 	      gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8044 	      prev_size = INTVAL (XEXP (slot, 1));
8045 	      if (prev_size + cur_size > size)
8046 		{
8047 		  cur_size = size - prev_size;
8048 		  mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8049 		  if (mode == BLKmode)
8050 		    mode = QImode;
8051 		}
8052 	      piece_type = lang_hooks.types.type_for_mode (mode, 1);
8053 	      if (mode == GET_MODE (reg))
8054 		addr_type = build_pointer_type (piece_type);
8055 	      else
8056 		addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8057 							 true);
8058 	      daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8059 							true);
8060 
8061 	      if (SSE_REGNO_P (REGNO (reg)))
8062 		{
8063 		  src_addr = sse_addr;
8064 		  src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8065 		}
8066 	      else
8067 		{
8068 		  src_addr = int_addr;
8069 		  src_offset = REGNO (reg) * 8;
8070 		}
8071 	      src_addr = fold_convert (addr_type, src_addr);
8072 	      src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8073 
8074 	      dest_addr = fold_convert (daddr_type, addr);
8075 	      dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8076 	      if (cur_size == GET_MODE_SIZE (mode))
8077 		{
8078 		  src = build_va_arg_indirect_ref (src_addr);
8079 		  dest = build_va_arg_indirect_ref (dest_addr);
8080 
8081 		  gimplify_assign (dest, src, pre_p);
8082 		}
8083 	      else
8084 		{
8085 		  tree copy
8086 		    = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8087 				       3, dest_addr, src_addr,
8088 				       size_int (cur_size));
8089 		  gimplify_and_add (copy, pre_p);
8090 		}
8091 	      prev_size += cur_size;
8092 	    }
8093 	}
8094 
8095       if (needed_intregs)
8096 	{
8097 	  t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8098 		      build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8099 	  gimplify_assign (gpr, t, pre_p);
8100 	}
8101 
8102       if (needed_sseregs)
8103 	{
8104 	  t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8105 		      build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8106 	  gimplify_assign (fpr, t, pre_p);
8107 	}
8108 
8109       gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8110 
8111       gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8112     }
8113 
8114   /* ... otherwise out of the overflow area.  */
8115 
  /* When the caller aligns a parameter on the stack, any alignment
     beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
     MAX_SUPPORTED_STACK_ALIGNMENT.  Match the caller's behavior here
     in the callee.  */
8120   arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8121   if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8122     arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8123 
8124   /* Care for on-stack alignment if needed.  */
8125   if (arg_boundary <= 64 || size == 0)
8126     t = ovf;
8127  else
8128     {
8129       HOST_WIDE_INT align = arg_boundary / 8;
8130       t = fold_build_pointer_plus_hwi (ovf, align - 1);
8131       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8132 		  build_int_cst (TREE_TYPE (t), -align));
8133     }
8134 
8135   gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8136   gimplify_assign (addr, t, pre_p);
8137 
8138   t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8139   gimplify_assign (unshare_expr (ovf), t, pre_p);
8140 
8141   if (container)
8142     gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8143 
8144   ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8145   addr = fold_convert (ptrtype, addr);
8146 
8147   if (indirect_p)
8148     addr = build_va_arg_indirect_ref (addr);
8149   return build_va_arg_indirect_ref (addr);
8150 }
8151 
8152 /* Return true if OPNUM's MEM should be matched
8153    in movabs* patterns.  */
8154 
8155 bool
8156 ix86_check_movabs (rtx insn, int opnum)
8157 {
8158   rtx set, mem;
8159 
8160   set = PATTERN (insn);
8161   if (GET_CODE (set) == PARALLEL)
8162     set = XVECEXP (set, 0, 0);
8163   gcc_assert (GET_CODE (set) == SET);
8164   mem = XEXP (set, opnum);
8165   while (GET_CODE (mem) == SUBREG)
8166     mem = SUBREG_REG (mem);
8167   gcc_assert (MEM_P (mem));
8168   return volatile_ok || !MEM_VOLATILE_P (mem);
8169 }
8170 
8171 /* Initialize the table of extra 80387 mathematical constants.  */
8172 
8173 static void
8174 init_ext_80387_constants (void)
8175 {
8176   static const char * cst[5] =
8177   {
8178     "0.3010299956639811952256464283594894482",  /* 0: fldlg2  */
8179     "0.6931471805599453094286904741849753009",  /* 1: fldln2  */
8180     "1.4426950408889634073876517827983434472",  /* 2: fldl2e  */
8181     "3.3219280948873623478083405569094566090",  /* 3: fldl2t  */
8182     "3.1415926535897932385128089594061862044",  /* 4: fldpi   */
8183   };
8184   int i;
8185 
8186   for (i = 0; i < 5; i++)
8187     {
8188       real_from_string (&ext_80387_constants_table[i], cst[i]);
8189       /* Ensure each constant is rounded to XFmode precision.  */
8190       real_convert (&ext_80387_constants_table[i],
8191 		    XFmode, &ext_80387_constants_table[i]);
8192     }
8193 
8194   ext_80387_constants_init = 1;
8195 }
8196 
/* Return a non-zero code if the constant X can be loaded with a special
   80387 instruction, 0 if not, or -1 if X is not an 80387 constant.  */
8199 
8200 int
8201 standard_80387_constant_p (rtx x)
8202 {
8203   enum machine_mode mode = GET_MODE (x);
8204 
8205   REAL_VALUE_TYPE r;
8206 
8207   if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8208     return -1;
8209 
8210   if (x == CONST0_RTX (mode))
8211     return 1;
8212   if (x == CONST1_RTX (mode))
8213     return 2;
8214 
8215   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8216 
8217   /* For XFmode constants, try to find a special 80387 instruction when
8218      optimizing for size or on those CPUs that benefit from them.  */
8219   if (mode == XFmode
8220       && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8221     {
8222       int i;
8223 
8224       if (! ext_80387_constants_init)
8225 	init_ext_80387_constants ();
8226 
8227       for (i = 0; i < 5; i++)
8228         if (real_identical (&r, &ext_80387_constants_table[i]))
8229 	  return i + 3;
8230     }
8231 
  /* A load of the constant -0.0 or -1.0 will be split into an
     fldz;fchs or fld1;fchs sequence.  */
8234   if (real_isnegzero (&r))
8235     return 8;
8236   if (real_identical (&r, &dconstm1))
8237     return 9;
8238 
8239   return 0;
8240 }
8241 
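/* For reference, the encoding returned by standard_80387_constant_p (and
   consumed by the two helpers below) is:

	-1  not an 80387 floating-point CONST_DOUBLE
	 0  no special load instruction available
	 1  +0.0 (fldz)          2  +1.0 (fld1)
	 3  log10(2) (fldlg2)    4  ln(2) (fldln2)
	 5  log2(e) (fldl2e)     6  log2(10) (fldl2t)
	 7  pi (fldpi)
	 8  -0.0 (split as fldz; fchs)
	 9  -1.0 (split as fld1; fchs)  */
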
8242 /* Return the opcode of the special instruction to be used to load
8243    the constant X.  */
8244 
8245 const char *
8246 standard_80387_constant_opcode (rtx x)
8247 {
8248   switch (standard_80387_constant_p (x))
8249     {
8250     case 1:
8251       return "fldz";
8252     case 2:
8253       return "fld1";
8254     case 3:
8255       return "fldlg2";
8256     case 4:
8257       return "fldln2";
8258     case 5:
8259       return "fldl2e";
8260     case 6:
8261       return "fldl2t";
8262     case 7:
8263       return "fldpi";
8264     case 8:
8265     case 9:
8266       return "#";
8267     default:
8268       gcc_unreachable ();
8269     }
8270 }
8271 
8272 /* Return the CONST_DOUBLE representing the 80387 constant that is
8273    loaded by the specified special instruction.  The argument IDX
8274    matches the return value from standard_80387_constant_p.  */
8275 
8276 rtx
8277 standard_80387_constant_rtx (int idx)
8278 {
8279   int i;
8280 
8281   if (! ext_80387_constants_init)
8282     init_ext_80387_constants ();
8283 
8284   switch (idx)
8285     {
8286     case 3:
8287     case 4:
8288     case 5:
8289     case 6:
8290     case 7:
8291       i = idx - 3;
8292       break;
8293 
8294     default:
8295       gcc_unreachable ();
8296     }
8297 
8298   return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8299 				       XFmode);
8300 }
8301 
/* Return 1 if X is all 0s and 2 if X is all 1s
   in a supported SSE/AVX vector mode.  */
8304 
8305 int
8306 standard_sse_constant_p (rtx x)
8307 {
8308   enum machine_mode mode = GET_MODE (x);
8309 
8310   if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8311     return 1;
8312   if (vector_all_ones_operand (x, mode))
8313     switch (mode)
8314       {
8315       case V16QImode:
8316       case V8HImode:
8317       case V4SImode:
8318       case V2DImode:
8319 	if (TARGET_SSE2)
8320 	  return 2;
8321       case V32QImode:
8322       case V16HImode:
8323       case V8SImode:
8324       case V4DImode:
8325 	if (TARGET_AVX2)
8326 	  return 2;
8327       default:
8328 	break;
8329       }
8330 
8331   return 0;
8332 }
8333 
8334 /* Return the opcode of the special instruction to be used to load
8335    the constant X.  */
8336 
8337 const char *
8338 standard_sse_constant_opcode (rtx insn, rtx x)
8339 {
8340   switch (standard_sse_constant_p (x))
8341     {
8342     case 1:
8343       switch (get_attr_mode (insn))
8344 	{
8345 	case MODE_TI:
8346 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8347 	    return "%vpxor\t%0, %d0";
8348 	case MODE_V2DF:
8349 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8350 	    return "%vxorpd\t%0, %d0";
8351 	case MODE_V4SF:
8352 	  return "%vxorps\t%0, %d0";
8353 
8354 	case MODE_OI:
8355 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8356 	    return "vpxor\t%x0, %x0, %x0";
8357 	case MODE_V4DF:
8358 	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8359 	    return "vxorpd\t%x0, %x0, %x0";
8360 	case MODE_V8SF:
8361 	  return "vxorps\t%x0, %x0, %x0";
8362 
8363 	default:
8364 	  break;
8365 	}
8366 
8367     case 2:
8368       if (TARGET_AVX)
8369 	return "vpcmpeqd\t%0, %0, %0";
8370       else
8371 	return "pcmpeqd\t%0, %0";
8372 
8373     default:
8374       break;
8375     }
8376   gcc_unreachable ();
8377 }
8378 
/* Return true if OP contains a symbol reference.  */
8380 
8381 bool
8382 symbolic_reference_mentioned_p (rtx op)
8383 {
8384   const char *fmt;
8385   int i;
8386 
8387   if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8388     return true;
8389 
8390   fmt = GET_RTX_FORMAT (GET_CODE (op));
8391   for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8392     {
8393       if (fmt[i] == 'E')
8394 	{
8395 	  int j;
8396 
8397 	  for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8398 	    if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8399 	      return true;
8400 	}
8401 
8402       else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8403 	return true;
8404     }
8405 
8406   return false;
8407 }
8408 
8409 /* Return true if it is appropriate to emit `ret' instructions in the
8410    body of a function.  Do this only if the epilogue is simple, needing a
8411    couple of insns.  Prior to reloading, we can't tell how many registers
8412    must be saved, so return false then.  Return false if there is no frame
8413    marker to de-allocate.  */
8414 
8415 bool
8416 ix86_can_use_return_insn_p (void)
8417 {
8418   struct ix86_frame frame;
8419 
8420   if (! reload_completed || frame_pointer_needed)
8421     return 0;
8422 
8423   /* Don't allow more than 32k pop, since that's all we can do
8424      with one instruction.  */
8425   if (crtl->args.pops_args && crtl->args.size >= 32768)
8426     return 0;
8427 
8428   ix86_compute_frame_layout (&frame);
8429   return (frame.stack_pointer_offset == UNITS_PER_WORD
8430 	  && (frame.nregs + frame.nsseregs) == 0);
8431 }
8432 
8433 /* Value should be nonzero if functions must have frame pointers.
8434    Zero means the frame pointer need not be set up (and parms may
8435    be accessed via the stack pointer) in functions that seem suitable.  */
8436 
8437 static bool
8438 ix86_frame_pointer_required (void)
8439 {
8440   /* If we accessed previous frames, then the generated code expects
8441      to be able to access the saved ebp value in our frame.  */
8442   if (cfun->machine->accesses_prev_frame)
8443     return true;
8444 
  /* Several x86 OSes need a frame pointer for other reasons,
     usually pertaining to setjmp.  */
8447   if (SUBTARGET_FRAME_POINTER_REQUIRED)
8448     return true;
8449 
  /* For older 32-bit runtimes setjmp requires a valid frame pointer.  */
8451   if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8452     return true;
8453 
  /* For Win64 SEH, very large frames need a frame pointer, since the
     maximum stack allocation is 4GB.  */
8456   if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8457     return true;
8458 
8459   /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8460      turns off the frame pointer by default.  Turn it back on now if
8461      we've not got a leaf function.  */
8462   if (TARGET_OMIT_LEAF_FRAME_POINTER
8463       && (!current_function_is_leaf
8464 	  || ix86_current_function_calls_tls_descriptor))
8465     return true;
8466 
8467   if (crtl->profile && !flag_fentry)
8468     return true;
8469 
8470   return false;
8471 }
8472 
8473 /* Record that the current function accesses previous call frames.  */
8474 
8475 void
8476 ix86_setup_frame_addresses (void)
8477 {
8478   cfun->machine->accesses_prev_frame = 1;
8479 }
8480 
8481 #ifndef USE_HIDDEN_LINKONCE
8482 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8483 #  define USE_HIDDEN_LINKONCE 1
8484 # else
8485 #  define USE_HIDDEN_LINKONCE 0
8486 # endif
8487 #endif
8488 
8489 static int pic_labels_used;
8490 
/* Fill in NAME with the label that should be used for the pc thunk for
   the given register REGNO.  */
8493 
8494 static void
8495 get_pc_thunk_name (char name[32], unsigned int regno)
8496 {
8497   gcc_assert (!TARGET_64BIT);
8498 
8499   if (USE_HIDDEN_LINKONCE)
8500     sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8501   else
8502     ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8503 }
8504 
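/* In the USE_HIDDEN_LINKONCE case, the thunk emitted by ix86_code_end
   below looks roughly like this (a sketch, for regno == BX_REG):

	__x86.get_pc_thunk.bx:
		movl	(%esp), %ebx
		ret

   i.e. it copies its own return address (the caller's PC) into the
   requested register.  */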
8505 
/* This function generates the out-of-line pc thunks used for -fpic; each
   one loads its register with the return address of the caller and returns.  */
8508 
8509 static void
8510 ix86_code_end (void)
8511 {
8512   rtx xops[2];
8513   int regno;
8514 
8515   for (regno = AX_REG; regno <= SP_REG; regno++)
8516     {
8517       char name[32];
8518       tree decl;
8519 
8520       if (!(pic_labels_used & (1 << regno)))
8521 	continue;
8522 
8523       get_pc_thunk_name (name, regno);
8524 
8525       decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8526 			 get_identifier (name),
8527 			 build_function_type_list (void_type_node, NULL_TREE));
8528       DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8529 				       NULL_TREE, void_type_node);
8530       TREE_PUBLIC (decl) = 1;
8531       TREE_STATIC (decl) = 1;
8532 
8533 #if TARGET_MACHO
8534       if (TARGET_MACHO)
8535 	{
8536 	  switch_to_section (darwin_sections[text_coal_section]);
8537 	  fputs ("\t.weak_definition\t", asm_out_file);
8538 	  assemble_name (asm_out_file, name);
8539 	  fputs ("\n\t.private_extern\t", asm_out_file);
8540 	  assemble_name (asm_out_file, name);
8541 	  putc ('\n', asm_out_file);
8542 	  ASM_OUTPUT_LABEL (asm_out_file, name);
8543 	  DECL_WEAK (decl) = 1;
8544 	}
8545       else
8546 #endif
8547       if (USE_HIDDEN_LINKONCE)
8548 	{
8549 	  DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8550 
8551 	  targetm.asm_out.unique_section (decl, 0);
8552 	  switch_to_section (get_named_section (decl, NULL, 0));
8553 
8554 	  targetm.asm_out.globalize_label (asm_out_file, name);
8555 	  fputs ("\t.hidden\t", asm_out_file);
8556 	  assemble_name (asm_out_file, name);
8557 	  putc ('\n', asm_out_file);
8558 	  ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8559 	}
8560       else
8561 	{
8562 	  switch_to_section (text_section);
8563 	  ASM_OUTPUT_LABEL (asm_out_file, name);
8564 	}
8565 
8566       DECL_INITIAL (decl) = make_node (BLOCK);
8567       current_function_decl = decl;
8568       init_function_start (decl);
8569       first_function_block_is_cold = false;
8570       /* Make sure unwind info is emitted for the thunk if needed.  */
8571       final_start_function (emit_barrier (), asm_out_file, 1);
8572 
8573       /* Pad stack IP move with 4 instructions (two NOPs count
8574 	 as one instruction).  */
8575       if (TARGET_PAD_SHORT_FUNCTION)
8576 	{
8577 	  int i = 8;
8578 
8579 	  while (i--)
8580 	    fputs ("\tnop\n", asm_out_file);
8581 	}
8582 
8583       xops[0] = gen_rtx_REG (Pmode, regno);
8584       xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8585       output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8586       fputs ("\tret\n", asm_out_file);
8587       final_end_function ();
8588       init_insn_lengths ();
8589       free_after_compilation (cfun);
8590       set_cfun (NULL);
8591       current_function_decl = NULL;
8592     }
8593 
8594   if (flag_split_stack)
8595     file_end_indicate_split_stack ();
8596 }
8597 
8598 /* Emit code for the SET_GOT patterns.  */
8599 
8600 const char *
8601 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8602 {
8603   rtx xops[3];
8604 
8605   xops[0] = dest;
8606 
8607   if (TARGET_VXWORKS_RTP && flag_pic)
8608     {
8609       /* Load (*VXWORKS_GOTT_BASE) into the PIC register.  */
8610       xops[2] = gen_rtx_MEM (Pmode,
8611 			     gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8612       output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8613 
8614       /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8615 	 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8616 	 an unadorned address.  */
8617       xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8618       SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8619       output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8620       return "";
8621     }
8622 
8623   xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8624 
8625   if (!flag_pic)
8626     {
8627       if (TARGET_MACHO)
8628 	/* We don't need a pic base, we're not producing pic.  */
8629 	gcc_unreachable ();
8630 
8631       xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8632       output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8633       targetm.asm_out.internal_label (asm_out_file, "L",
8634 				      CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8635     }
8636   else
8637     {
8638       char name[32];
8639       get_pc_thunk_name (name, REGNO (dest));
8640       pic_labels_used |= 1 << REGNO (dest);
8641 
8642       xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8643       xops[2] = gen_rtx_MEM (QImode, xops[2]);
8644       output_asm_insn ("call\t%X2", xops);
8645 
8646 #if TARGET_MACHO
8647       /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8648          This is what will be referenced by the Mach-O PIC subsystem.  */
8649       if (machopic_should_output_picbase_label () || !label)
8650 	ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8651 
8652       /* When we are restoring the pic base at the site of a nonlocal label,
8653          and we decided to emit the pic base above, we will still output a
8654          local label used for calculating the correction offset (even though
8655          the offset will be 0 in that case).  */
8656       if (label)
8657         targetm.asm_out.internal_label (asm_out_file, "L",
8658 					   CODE_LABEL_NUMBER (label));
8659 #endif
8660     }
8661 
8662   if (!TARGET_MACHO)
8663     output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8664 
8665   return "";
8666 }
8667 
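/* In the usual 32-bit -fpic case (not Mach-O, not VxWorks RTP) the sequence
   emitted by output_set_got above is therefore roughly (a sketch, for
   DEST == %ebx):

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   The thunk supplies the address of the instruction after the call, and
   the add then rebases that address to the GOT.  */
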
/* Generate a "push" pattern for input ARG.  */
8669 
8670 static rtx
8671 gen_push (rtx arg)
8672 {
8673   struct machine_function *m = cfun->machine;
8674 
8675   if (m->fs.cfa_reg == stack_pointer_rtx)
8676     m->fs.cfa_offset += UNITS_PER_WORD;
8677   m->fs.sp_offset += UNITS_PER_WORD;
8678 
8679   return gen_rtx_SET (VOIDmode,
8680 		      gen_rtx_MEM (Pmode,
8681 				   gen_rtx_PRE_DEC (Pmode,
8682 						    stack_pointer_rtx)),
8683 		      arg);
8684 }
8685 
/* Generate a "pop" pattern for input ARG.  */
8687 
8688 static rtx
8689 gen_pop (rtx arg)
8690 {
8691   return gen_rtx_SET (VOIDmode,
8692 		      arg,
8693 		      gen_rtx_MEM (Pmode,
8694 				   gen_rtx_POST_INC (Pmode,
8695 						     stack_pointer_rtx)));
8696 }
8697 
/* Return the number of an unused call-clobbered register that is available
   for the entire function, or INVALID_REGNUM if there is none.  */
8700 
8701 static unsigned int
8702 ix86_select_alt_pic_regnum (void)
8703 {
8704   if (current_function_is_leaf
8705       && !crtl->profile
8706       && !ix86_current_function_calls_tls_descriptor)
8707     {
8708       int i, drap;
8709       /* Can't use the same register for both PIC and DRAP.  */
8710       if (crtl->drap_reg)
8711 	drap = REGNO (crtl->drap_reg);
8712       else
8713 	drap = -1;
8714       for (i = 2; i >= 0; --i)
8715         if (i != drap && !df_regs_ever_live_p (i))
8716 	  return i;
8717     }
8718 
8719   return INVALID_REGNUM;
8720 }
8721 
8722 /* Return TRUE if we need to save REGNO.  */
8723 
8724 static bool
8725 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8726 {
8727   if (pic_offset_table_rtx
8728       && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8729       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8730 	  || crtl->profile
8731 	  || crtl->calls_eh_return
8732 	  || crtl->uses_const_pool
8733 	  || cfun->has_nonlocal_label))
8734     return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8735 
8736   if (crtl->calls_eh_return && maybe_eh_return)
8737     {
8738       unsigned i;
8739       for (i = 0; ; i++)
8740 	{
8741 	  unsigned test = EH_RETURN_DATA_REGNO (i);
8742 	  if (test == INVALID_REGNUM)
8743 	    break;
8744 	  if (test == regno)
8745 	    return true;
8746 	}
8747     }
8748 
8749   if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8750     return true;
8751 
8752   return (df_regs_ever_live_p (regno)
8753 	  && !call_used_regs[regno]
8754 	  && !fixed_regs[regno]
8755 	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8756 }
8757 
/* Return the number of saved general purpose registers.  */
8759 
8760 static int
8761 ix86_nsaved_regs (void)
8762 {
8763   int nregs = 0;
8764   int regno;
8765 
8766   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8767     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8768       nregs ++;
8769   return nregs;
8770 }
8771 
/* Return the number of saved SSE registers.  */
8773 
8774 static int
8775 ix86_nsaved_sseregs (void)
8776 {
8777   int nregs = 0;
8778   int regno;
8779 
8780   if (!TARGET_64BIT_MS_ABI)
8781     return 0;
8782   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8783     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8784       nregs ++;
8785   return nregs;
8786 }
8787 
8788 /* Given FROM and TO register numbers, say whether this elimination is
8789    allowed.  If stack alignment is needed, we can only replace argument
8790    pointer with hard frame pointer, or replace frame pointer with stack
8791    pointer.  Otherwise, frame pointer elimination is automatically
8792    handled and all other eliminations are valid.  */
8793 
8794 static bool
8795 ix86_can_eliminate (const int from, const int to)
8796 {
8797   if (stack_realign_fp)
8798     return ((from == ARG_POINTER_REGNUM
8799 	     && to == HARD_FRAME_POINTER_REGNUM)
8800 	    || (from == FRAME_POINTER_REGNUM
8801 		&& to == STACK_POINTER_REGNUM));
8802   else
8803     return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8804 }
8805 
8806 /* Return the offset between two registers, one to be eliminated, and the other
8807    its replacement, at the start of a routine.  */
8808 
8809 HOST_WIDE_INT
8810 ix86_initial_elimination_offset (int from, int to)
8811 {
8812   struct ix86_frame frame;
8813   ix86_compute_frame_layout (&frame);
8814 
8815   if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8816     return frame.hard_frame_pointer_offset;
8817   else if (from == FRAME_POINTER_REGNUM
8818 	   && to == HARD_FRAME_POINTER_REGNUM)
8819     return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8820   else
8821     {
8822       gcc_assert (to == STACK_POINTER_REGNUM);
8823 
8824       if (from == ARG_POINTER_REGNUM)
8825 	return frame.stack_pointer_offset;
8826 
8827       gcc_assert (from == FRAME_POINTER_REGNUM);
8828       return frame.stack_pointer_offset - frame.frame_pointer_offset;
8829     }
8830 }
8831 
8832 /* In a dynamically-aligned function, we can't know the offset from
8833    stack pointer to frame pointer, so we must ensure that setjmp
8834    eliminates fp against the hard fp (%ebp) rather than trying to
8835    index from %esp up to the top of the frame across a gap that is
8836    of unknown (at compile-time) size.  */
8837 static rtx
8838 ix86_builtin_setjmp_frame_value (void)
8839 {
8840   return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8841 }
8842 
8843 /* When using -fsplit-stack, the allocation routines set a field in
8844    the TCB to the bottom of the stack plus this much space, measured
8845    in bytes.  */
8846 
8847 #define SPLIT_STACK_AVAILABLE 256
8848 
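/* A rough sketch of the frame layout computed below, from higher toward
   lower addresses.  The *_offset fields are distances measured downward
   from the top of the frame (SEH and stack realignment adjust some of
   these):

	return address
	[pushed static chain]
	[saved frame pointer]
	  <- hard_frame_pointer_offset
	GP register save area
	  <- reg_save_offset
	SSE register save area (16-byte aligned)
	  <- sse_reg_save_offset
	va_arg register save area
	  <- frame_pointer_offset
	local variables
	outgoing argument area
	  <- stack_pointer_offset  */
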
/* Fill in FRAME, the ix86_frame structure describing the current function's frame.  */
8850 
8851 static void
8852 ix86_compute_frame_layout (struct ix86_frame *frame)
8853 {
8854   unsigned int stack_alignment_needed;
8855   HOST_WIDE_INT offset;
8856   unsigned int preferred_alignment;
8857   HOST_WIDE_INT size = get_frame_size ();
8858   HOST_WIDE_INT to_allocate;
8859 
8860   frame->nregs = ix86_nsaved_regs ();
8861   frame->nsseregs = ix86_nsaved_sseregs ();
8862 
8863   stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8864   preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8865 
  /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
     except in function prologues and leaf functions.  */
8868   if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8869       && (!current_function_is_leaf || cfun->calls_alloca != 0
8870           || ix86_current_function_calls_tls_descriptor))
8871     {
8872       preferred_alignment = 16;
8873       stack_alignment_needed = 16;
8874       crtl->preferred_stack_boundary = 128;
8875       crtl->stack_alignment_needed = 128;
8876     }
8877 
8878   gcc_assert (!size || stack_alignment_needed);
8879   gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8880   gcc_assert (preferred_alignment <= stack_alignment_needed);
8881 
8882   /* For SEH we have to limit the amount of code movement into the prologue.
8883      At present we do this via a BLOCKAGE, at which point there's very little
8884      scheduling that can be done, which means that there's very little point
8885      in doing anything except PUSHs.  */
8886   if (TARGET_SEH)
8887     cfun->machine->use_fast_prologue_epilogue = false;
8888 
  /* During the reload iteration the number of registers saved can change.
     Recompute the value as needed.  Do not recompute when the number of
     registers didn't change, as reload makes multiple calls to this function
     and does not expect the decision to change within a single iteration.  */
8893   else if (!optimize_function_for_size_p (cfun)
8894            && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8895     {
8896       int count = frame->nregs;
8897       struct cgraph_node *node = cgraph_get_node (current_function_decl);
8898 
8899       cfun->machine->use_fast_prologue_epilogue_nregs = count;
8900 
      /* The fast prologue uses move instead of push to save registers.  This
         is significantly longer, but also executes faster, as modern hardware
         can execute the moves in parallel but can't do that for push/pop.

	 Be careful about choosing which prologue to emit: when the function
	 takes many instructions to execute we may use the slow version, as we
	 may when the function is known to be outside a hot spot (this is known
	 with profile feedback only).  Weight the size of the function by the
	 number of registers to save, as it is cheap to use one or two push
	 instructions but very slow to use many of them.  */
8911       if (count)
8912 	count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8913       if (node->frequency < NODE_FREQUENCY_NORMAL
8914 	  || (flag_branch_probabilities
8915 	      && node->frequency < NODE_FREQUENCY_HOT))
8916         cfun->machine->use_fast_prologue_epilogue = false;
8917       else
8918         cfun->machine->use_fast_prologue_epilogue
8919 	   = !expensive_function_p (count);
8920     }
8921 
8922   frame->save_regs_using_mov
8923     = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8924        /* If static stack checking is enabled and done with probes,
8925 	  the registers need to be saved before allocating the frame.  */
8926        && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8927 
8928   /* Skip return address.  */
8929   offset = UNITS_PER_WORD;
8930 
8931   /* Skip pushed static chain.  */
8932   if (ix86_static_chain_on_stack)
8933     offset += UNITS_PER_WORD;
8934 
8935   /* Skip saved base pointer.  */
8936   if (frame_pointer_needed)
8937     offset += UNITS_PER_WORD;
8938   frame->hfp_save_offset = offset;
8939 
8940   /* The traditional frame pointer location is at the top of the frame.  */
8941   frame->hard_frame_pointer_offset = offset;
8942 
8943   /* Register save area */
8944   offset += frame->nregs * UNITS_PER_WORD;
8945   frame->reg_save_offset = offset;
8946 
8947   /* On SEH target, registers are pushed just before the frame pointer
8948      location.  */
8949   if (TARGET_SEH)
8950     frame->hard_frame_pointer_offset = offset;
8951 
8952   /* Align and set SSE register save area.  */
8953   if (frame->nsseregs)
8954     {
8955       /* The only ABI that has saved SSE registers (Win64) also has a
8956          16-byte aligned default stack, and thus we don't need to be
8957 	 within the re-aligned local stack frame to save them.  */
8958       gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8959       offset = (offset + 16 - 1) & -16;
8960       offset += frame->nsseregs * 16;
8961     }
8962   frame->sse_reg_save_offset = offset;
8963 
8964   /* The re-aligned stack starts here.  Values before this point are not
8965      directly comparable with values below this point.  In order to make
8966      sure that no value happens to be the same before and after, force
8967      the alignment computation below to add a non-zero value.  */
8968   if (stack_realign_fp)
8969     offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8970 
8971   /* Va-arg area */
8972   frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8973   offset += frame->va_arg_size;
8974 
8975   /* Align start of frame for local function.  */
8976   if (stack_realign_fp
8977       || offset != frame->sse_reg_save_offset
8978       || size != 0
8979       || !current_function_is_leaf
8980       || cfun->calls_alloca
8981       || ix86_current_function_calls_tls_descriptor)
8982     offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8983 
8984   /* Frame pointer points here.  */
8985   frame->frame_pointer_offset = offset;
8986 
8987   offset += size;
8988 
  /* Add the outgoing arguments area.  It can be skipped if we eliminated
     all the function calls as dead code.
     Skipping is however impossible when the function calls alloca, as the
     alloca expander assumes that the last crtl->outgoing_args_size bytes
     of the stack frame are unused.  */
8994   if (ACCUMULATE_OUTGOING_ARGS
8995       && (!current_function_is_leaf || cfun->calls_alloca
8996 	  || ix86_current_function_calls_tls_descriptor))
8997     {
8998       offset += crtl->outgoing_args_size;
8999       frame->outgoing_arguments_size = crtl->outgoing_args_size;
9000     }
9001   else
9002     frame->outgoing_arguments_size = 0;
9003 
9004   /* Align stack boundary.  Only needed if we're calling another function
9005      or using alloca.  */
9006   if (!current_function_is_leaf || cfun->calls_alloca
9007       || ix86_current_function_calls_tls_descriptor)
9008     offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9009 
9010   /* We've reached end of stack frame.  */
9011   frame->stack_pointer_offset = offset;
9012 
9013   /* Size prologue needs to allocate.  */
9014   to_allocate = offset - frame->sse_reg_save_offset;
9015 
9016   if ((!to_allocate && frame->nregs <= 1)
9017       || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9018     frame->save_regs_using_mov = false;
9019 
9020   if (ix86_using_red_zone ()
9021       && current_function_sp_is_unchanging
9022       && current_function_is_leaf
9023       && !ix86_current_function_calls_tls_descriptor)
9024     {
9025       frame->red_zone_size = to_allocate;
9026       if (frame->save_regs_using_mov)
9027 	frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9028       if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9029 	frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9030     }
9031   else
9032     frame->red_zone_size = 0;
9033   frame->stack_pointer_offset -= frame->red_zone_size;
9034 
9035   /* The SEH frame pointer location is near the bottom of the frame.
9036      This is enforced by the fact that the difference between the
9037      stack pointer and the frame pointer is limited to 240 bytes in
9038      the unwind data structure.  */
9039   if (TARGET_SEH)
9040     {
9041       HOST_WIDE_INT diff;
9042 
      /* If we can leave the frame pointer where it is, do so.  This also
	 yields the establisher frame for __builtin_frame_address (0).  */
9045       diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9046       if (diff <= SEH_MAX_FRAME_SIZE
9047 	  && (diff > 240 || (diff & 15) != 0)
9048 	  && !crtl->accesses_prior_frames)
9049 	{
9050 	  /* Ideally we'd determine what portion of the local stack frame
9051 	     (within the constraint of the lowest 240) is most heavily used.
9052 	     But without that complication, simply bias the frame pointer
9053 	     by 128 bytes so as to maximize the amount of the local stack
9054 	     frame that is addressable with 8-bit offsets.  */
9055 	  frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9056 	}
9057     }
9058 }
9059 
9060 /* This is semi-inlined memory_address_length, but simplified
9061    since we know that we're always dealing with reg+offset, and
9062    to avoid having to create and discard all that rtl.  */
9063 
9064 static inline int
9065 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9066 {
9067   int len = 4;
9068 
9069   if (offset == 0)
9070     {
9071       /* EBP and R13 cannot be encoded without an offset.  */
9072       len = (regno == BP_REG || regno == R13_REG);
9073     }
9074   else if (IN_RANGE (offset, -128, 127))
9075     len = 1;
9076 
9077   /* ESP and R12 must be encoded with a SIB byte.  */
9078   if (regno == SP_REG || regno == R12_REG)
9079     len++;
9080 
9081   return len;
9082 }
9083 
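/* For example (a sketch of the resulting encoding lengths): 0(%rbx) needs
   neither a displacement byte nor a SIB byte, so the length is 0; 0(%rbp)
   and 0(%r13) still need a disp8, giving 1; 8(%rsp) needs a disp8 plus a
   SIB byte, giving 2; and 0x1000(%rax) needs a 4-byte displacement,
   giving 4.  */
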
9084 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9085    The valid base registers are taken from CFUN->MACHINE->FS.  */
9086 
9087 static rtx
9088 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9089 {
9090   const struct machine_function *m = cfun->machine;
9091   rtx base_reg = NULL;
9092   HOST_WIDE_INT base_offset = 0;
9093 
9094   if (m->use_fast_prologue_epilogue)
9095     {
9096       /* Choose the base register most likely to allow the most scheduling
         opportunities.  Generally FP is valid throughout the function,
9098          while DRAP must be reloaded within the epilogue.  But choose either
9099          over the SP due to increased encoding size.  */
9100 
9101       if (m->fs.fp_valid)
9102 	{
9103 	  base_reg = hard_frame_pointer_rtx;
9104 	  base_offset = m->fs.fp_offset - cfa_offset;
9105 	}
9106       else if (m->fs.drap_valid)
9107 	{
9108 	  base_reg = crtl->drap_reg;
9109 	  base_offset = 0 - cfa_offset;
9110 	}
9111       else if (m->fs.sp_valid)
9112 	{
9113 	  base_reg = stack_pointer_rtx;
9114 	  base_offset = m->fs.sp_offset - cfa_offset;
9115 	}
9116     }
9117   else
9118     {
9119       HOST_WIDE_INT toffset;
9120       int len = 16, tlen;
9121 
9122       /* Choose the base register with the smallest address encoding.
9123          With a tie, choose FP > DRAP > SP.  */
9124       if (m->fs.sp_valid)
9125 	{
9126 	  base_reg = stack_pointer_rtx;
9127 	  base_offset = m->fs.sp_offset - cfa_offset;
9128           len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9129 	}
9130       if (m->fs.drap_valid)
9131 	{
9132 	  toffset = 0 - cfa_offset;
9133 	  tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9134 	  if (tlen <= len)
9135 	    {
9136 	      base_reg = crtl->drap_reg;
9137 	      base_offset = toffset;
9138 	      len = tlen;
9139 	    }
9140 	}
9141       if (m->fs.fp_valid)
9142 	{
9143 	  toffset = m->fs.fp_offset - cfa_offset;
9144 	  tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9145 	  if (tlen <= len)
9146 	    {
9147 	      base_reg = hard_frame_pointer_rtx;
9148 	      base_offset = toffset;
9149 	      len = tlen;
9150 	    }
9151 	}
9152     }
9153   gcc_assert (base_reg != NULL);
9154 
9155   return plus_constant (base_reg, base_offset);
9156 }
9157 
9158 /* Emit code to save registers in the prologue.  */
9159 
9160 static void
9161 ix86_emit_save_regs (void)
9162 {
9163   unsigned int regno;
9164   rtx insn;
9165 
9166   for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9167     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9168       {
9169 	insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9170 	RTX_FRAME_RELATED_P (insn) = 1;
9171       }
9172 }
9173 
9174 /* Emit a single register save at CFA - CFA_OFFSET.  */
9175 
9176 static void
9177 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9178 			      HOST_WIDE_INT cfa_offset)
9179 {
9180   struct machine_function *m = cfun->machine;
9181   rtx reg = gen_rtx_REG (mode, regno);
9182   rtx mem, addr, base, insn;
9183 
9184   addr = choose_baseaddr (cfa_offset);
9185   mem = gen_frame_mem (mode, addr);
9186 
9187   /* For SSE saves, we need to indicate the 128-bit alignment.  */
9188   set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9189 
9190   insn = emit_move_insn (mem, reg);
9191   RTX_FRAME_RELATED_P (insn) = 1;
9192 
9193   base = addr;
9194   if (GET_CODE (base) == PLUS)
9195     base = XEXP (base, 0);
9196   gcc_checking_assert (REG_P (base));
9197 
9198   /* When saving registers into a re-aligned local stack frame, avoid
9199      any tricky guessing by dwarf2out.  */
9200   if (m->fs.realigned)
9201     {
9202       gcc_checking_assert (stack_realign_drap);
9203 
9204       if (regno == REGNO (crtl->drap_reg))
9205 	{
9206 	  /* A bit of a hack.  We force the DRAP register to be saved in
9207 	     the re-aligned stack frame, which provides us with a copy
9208 	     of the CFA that will last past the prologue.  Install it.  */
9209 	  gcc_checking_assert (cfun->machine->fs.fp_valid);
9210 	  addr = plus_constant (hard_frame_pointer_rtx,
9211 				cfun->machine->fs.fp_offset - cfa_offset);
9212 	  mem = gen_rtx_MEM (mode, addr);
9213 	  add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9214 	}
9215       else
9216 	{
9217 	  /* The frame pointer is a stable reference within the
9218 	     aligned frame.  Use it.  */
9219 	  gcc_checking_assert (cfun->machine->fs.fp_valid);
9220 	  addr = plus_constant (hard_frame_pointer_rtx,
9221 				cfun->machine->fs.fp_offset - cfa_offset);
9222 	  mem = gen_rtx_MEM (mode, addr);
9223 	  add_reg_note (insn, REG_CFA_EXPRESSION,
9224 			gen_rtx_SET (VOIDmode, mem, reg));
9225 	}
9226     }
9227 
9228   /* The memory may not be relative to the current CFA register,
9229      which means that we may need to generate a new pattern for
9230      use by the unwind info.  */
9231   else if (base != m->fs.cfa_reg)
9232     {
9233       addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9234       mem = gen_rtx_MEM (mode, addr);
9235       add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9236     }
9237 }
9238 
9239 /* Emit code to save registers using MOV insns.
9240    First register is stored at CFA - CFA_OFFSET.  */
9241 static void
9242 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9243 {
9244   unsigned int regno;
9245 
9246   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9247     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9248       {
9249         ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9250 	cfa_offset -= UNITS_PER_WORD;
9251       }
9252 }
9253 
9254 /* Emit code to save SSE registers using MOV insns.
9255    First register is stored at CFA - CFA_OFFSET.  */
9256 static void
9257 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9258 {
9259   unsigned int regno;
9260 
9261   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9262     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9263       {
9264 	ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9265 	cfa_offset -= 16;
9266       }
9267 }
9268 
9269 static GTY(()) rtx queued_cfa_restores;
9270 
/* Add a REG_CFA_RESTORE note for REG to INSN, or queue it until the next
   stack manipulation insn.  The value is on the stack at CFA - CFA_OFFSET.
   Don't add the note if the previously saved value will be left untouched
   within the stack red zone until return, as unwinders can find the same
   value in the register and on the stack.  */
9276 
9277 static void
9278 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9279 {
9280   if (!crtl->shrink_wrapped
9281       && cfa_offset <= cfun->machine->fs.red_zone_offset)
9282     return;
9283 
9284   if (insn)
9285     {
9286       add_reg_note (insn, REG_CFA_RESTORE, reg);
9287       RTX_FRAME_RELATED_P (insn) = 1;
9288     }
9289   else
9290     queued_cfa_restores
9291       = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9292 }
9293 
9294 /* Add queued REG_CFA_RESTORE notes if any to INSN.  */
9295 
9296 static void
9297 ix86_add_queued_cfa_restore_notes (rtx insn)
9298 {
9299   rtx last;
9300   if (!queued_cfa_restores)
9301     return;
9302   for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9303     ;
9304   XEXP (last, 1) = REG_NOTES (insn);
9305   REG_NOTES (insn) = queued_cfa_restores;
9306   queued_cfa_restores = NULL_RTX;
9307   RTX_FRAME_RELATED_P (insn) = 1;
9308 }
9309 
9310 /* Expand prologue or epilogue stack adjustment.
   The pattern exists to put a dependency on all ebp-based memory accesses.
   STYLE should be negative if instructions should be marked as frame related,
   zero if the %r11 register is live and cannot be freely used, and positive
9314    otherwise.  */
9315 
9316 static void
9317 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9318 			   int style, bool set_cfa)
9319 {
9320   struct machine_function *m = cfun->machine;
9321   rtx insn;
9322   bool add_frame_related_expr = false;
9323 
9324   if (! TARGET_64BIT)
9325     insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9326   else if (x86_64_immediate_operand (offset, DImode))
9327     insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9328   else
9329     {
9330       rtx tmp;
9331       /* r11 is used by indirect sibcall return as well, set before the
9332 	 epilogue and used after the epilogue.  */
9333       if (style)
9334         tmp = gen_rtx_REG (DImode, R11_REG);
9335       else
9336 	{
9337 	  gcc_assert (src != hard_frame_pointer_rtx
9338 		      && dest != hard_frame_pointer_rtx);
9339 	  tmp = hard_frame_pointer_rtx;
9340 	}
9341       insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9342       if (style < 0)
9343 	add_frame_related_expr = true;
9344 
9345       insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9346     }
9347 
9348   insn = emit_insn (insn);
9349   if (style >= 0)
9350     ix86_add_queued_cfa_restore_notes (insn);
9351 
9352   if (set_cfa)
9353     {
9354       rtx r;
9355 
9356       gcc_assert (m->fs.cfa_reg == src);
9357       m->fs.cfa_offset += INTVAL (offset);
9358       m->fs.cfa_reg = dest;
9359 
9360       r = gen_rtx_PLUS (Pmode, src, offset);
9361       r = gen_rtx_SET (VOIDmode, dest, r);
9362       add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9363       RTX_FRAME_RELATED_P (insn) = 1;
9364     }
9365   else if (style < 0)
9366     {
9367       RTX_FRAME_RELATED_P (insn) = 1;
9368       if (add_frame_related_expr)
9369 	{
9370 	  rtx r = gen_rtx_PLUS (Pmode, src, offset);
9371 	  r = gen_rtx_SET (VOIDmode, dest, r);
9372 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9373 	}
9374     }
9375 
9376   if (dest == stack_pointer_rtx)
9377     {
9378       HOST_WIDE_INT ooffset = m->fs.sp_offset;
9379       bool valid = m->fs.sp_valid;
9380 
9381       if (src == hard_frame_pointer_rtx)
9382 	{
9383 	  valid = m->fs.fp_valid;
9384 	  ooffset = m->fs.fp_offset;
9385 	}
9386       else if (src == crtl->drap_reg)
9387 	{
9388 	  valid = m->fs.drap_valid;
9389 	  ooffset = 0;
9390 	}
9391       else
9392 	{
	  /* Else there are two possibilities: SP itself, which we set
	     up as the default above, or EH_RETURN_STACKADJ_RTX, which is
	     taken care of by hand along the eh_return path.  */
9396 	  gcc_checking_assert (src == stack_pointer_rtx
9397 			       || offset == const0_rtx);
9398 	}
9399 
9400       m->fs.sp_offset = ooffset - INTVAL (offset);
9401       m->fs.sp_valid = valid;
9402     }
9403 }
9404 
/* Find an available register to be used as the dynamic realign argument
   pointer register.  Such a register will be written in the prologue and
   used at the beginning of the body, so it must not be
	1. a parameter passing register.
	2. the GOT pointer.
   We reuse the static-chain register if it is available.  Otherwise we
   use DI for i386, and R13 or R10 for x86-64 (see the code below for
   which is chosen when).

   Return: the regno of the chosen register.  */
9415 
9416 static unsigned int
9417 find_drap_reg (void)
9418 {
9419   tree decl = cfun->decl;
9420 
9421   if (TARGET_64BIT)
9422     {
      /* Use R13 for a nested function or a function that needs a static chain.
	 Since a function with a tail call may use any caller-saved
	 register in the epilogue, the DRAP must not use a caller-saved
	 register in that case.  */
9427       if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9428 	return R13_REG;
9429 
9430       return R10_REG;
9431     }
9432   else
9433     {
      /* Use DI for a nested function or a function that needs a static chain.
	 Since a function with a tail call may use any caller-saved
	 register in the epilogue, the DRAP must not use a caller-saved
	 register in that case.  */
9438       if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9439 	return DI_REG;
9440 
9441       /* Reuse static chain register if it isn't used for parameter
9442          passing.  */
9443       if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9444 	{
9445 	  unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9446 	  if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9447 	    return CX_REG;
9448 	}
9449       return DI_REG;
9450     }
9451 }
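
/* Illustration of the choice made by find_drap_reg (a summary of the code
   above, not additional behaviour): on x86-64 an ordinary function gets
   R10, while a nested function, or one that emits tail calls, gets the
   callee-saved R13.  On i386 the usual pick is CX, falling back to DI
   when a static chain or tail call is involved, or when CX may be needed
   for argument passing (regparm > 2, fastcall or thiscall).  */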
9452 
9453 /* Return minimum incoming stack alignment.  */
9454 
9455 static unsigned int
9456 ix86_minimum_incoming_stack_boundary (bool sibcall)
9457 {
9458   unsigned int incoming_stack_boundary;
9459 
9460   /* Prefer the one specified on the command line.  */
9461   if (ix86_user_incoming_stack_boundary)
9462     incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9463   /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9464      if -mstackrealign is used, this is not the sibcall check, and the
9465      estimated stack alignment is 128 bits.  */
9466   else if (!sibcall
9467 	   && !TARGET_64BIT
9468 	   && ix86_force_align_arg_pointer
9469 	   && crtl->stack_alignment_estimated == 128)
9470     incoming_stack_boundary = MIN_STACK_BOUNDARY;
9471   else
9472     incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9473 
9474   /* The incoming stack alignment can be changed on individual functions
9475      via the force_align_arg_pointer attribute.  We use the smallest
9476      incoming stack boundary.  */
9477   if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9478       && lookup_attribute (ix86_force_align_arg_pointer_string,
9479 			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9480     incoming_stack_boundary = MIN_STACK_BOUNDARY;
9481 
9482   /* The incoming stack frame has to be aligned at least at
9483      parm_stack_boundary.  */
9484   if (incoming_stack_boundary < crtl->parm_stack_boundary)
9485     incoming_stack_boundary = crtl->parm_stack_boundary;
9486 
9487   /* The stack at the entry of main is aligned by the runtime.  We use
9488      the smallest incoming stack boundary.  */
9489   if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9490       && DECL_NAME (current_function_decl)
9491       && MAIN_NAME_P (DECL_NAME (current_function_decl))
9492       && DECL_FILE_SCOPE_P (current_function_decl))
9493     incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9494 
9495   return incoming_stack_boundary;
9496 }
9497 
9498 /* Update incoming stack boundary and estimated stack alignment.  */
9499 
9500 static void
9501 ix86_update_stack_boundary (void)
9502 {
9503   ix86_incoming_stack_boundary
9504     = ix86_minimum_incoming_stack_boundary (false);
9505 
9506   /* x86_64 varargs need 16-byte stack alignment for the register save
9507      area.  */
9508   if (TARGET_64BIT
9509       && cfun->stdarg
9510       && crtl->stack_alignment_estimated < 128)
9511     crtl->stack_alignment_estimated = 128;
9512 }
9513 
9514 /* Handle the TARGET_GET_DRAP_RTX hook.  Return NULL if no DRAP is
9515    needed or an rtx for DRAP otherwise.  */
9516 
9517 static rtx
9518 ix86_get_drap_rtx (void)
9519 {
9520   if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9521     crtl->need_drap = true;
9522 
9523   if (stack_realign_drap)
9524     {
9525       /* Assign DRAP to vDRAP and return vDRAP.  */
9526       unsigned int regno = find_drap_reg ();
9527       rtx drap_vreg;
9528       rtx arg_ptr;
9529       rtx seq, insn;
9530 
9531       arg_ptr = gen_rtx_REG (Pmode, regno);
9532       crtl->drap_reg = arg_ptr;
9533 
9534       start_sequence ();
9535       drap_vreg = copy_to_reg (arg_ptr);
9536       seq = get_insns ();
9537       end_sequence ();
9538 
9539       insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9540       if (!optimize)
9541 	{
9542 	  add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9543 	  RTX_FRAME_RELATED_P (insn) = 1;
9544 	}
9545       return drap_vreg;
9546     }
9547   else
9548     return NULL;
9549 }
9550 
9551 /* Handle the TARGET_INTERNAL_ARG_POINTER hook.  */
9552 
9553 static rtx
9554 ix86_internal_arg_pointer (void)
9555 {
9556   return virtual_incoming_args_rtx;
9557 }
9558 
9559 struct scratch_reg {
9560   rtx reg;
9561   bool saved;
9562 };
9563 
9564 /* Return a short-lived scratch register for use on function entry.
9565    In 32-bit mode, it is valid only after the registers are saved
9566    in the prologue.  This register must be released by means of
9567    release_scratch_register_on_entry once it is dead.  */
9568 
9569 static void
9570 get_scratch_register_on_entry (struct scratch_reg *sr)
9571 {
9572   int regno;
9573 
9574   sr->saved = false;
9575 
9576   if (TARGET_64BIT)
9577     {
9578       /* We always use R11 in 64-bit mode.  */
9579       regno = R11_REG;
9580     }
9581   else
9582     {
9583       tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9584       bool fastcall_p
9585 	= lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9586       bool thiscall_p
9587 	= lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9588       bool static_chain_p = DECL_STATIC_CHAIN (decl);
9589       int regparm = ix86_function_regparm (fntype, decl);
9590       int drap_regno
9591 	= crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9592 
9593       /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9594 	  for the static chain register.  */
9595       if ((regparm < 1 || (fastcall_p && !static_chain_p))
9596 	  && drap_regno != AX_REG)
9597 	regno = AX_REG;
9598       /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9599 	  for the static chain register.  */
9600       else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9601         regno = AX_REG;
9602       else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9603 	regno = DX_REG;
9604       /* ecx is the static chain register.  */
9605       else if (regparm < 3 && !fastcall_p && !thiscall_p
9606 	       && !static_chain_p
9607 	       && drap_regno != CX_REG)
9608 	regno = CX_REG;
9609       else if (ix86_save_reg (BX_REG, true))
9610 	regno = BX_REG;
9611       /* esi is the static chain register.  */
9612       else if (!(regparm == 3 && static_chain_p)
9613 	       && ix86_save_reg (SI_REG, true))
9614 	regno = SI_REG;
9615       else if (ix86_save_reg (DI_REG, true))
9616 	regno = DI_REG;
9617       else
9618 	{
9619 	  regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9620 	  sr->saved = true;
9621 	}
9622     }
9623 
9624   sr->reg = gen_rtx_REG (Pmode, regno);
9625   if (sr->saved)
9626     {
9627       rtx insn = emit_insn (gen_push (sr->reg));
9628       RTX_FRAME_RELATED_P (insn) = 1;
9629     }
9630 }
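
/* For orientation, a sketch of the selection above (no behaviour beyond
   what the code already implements): a plain 32-bit cdecl function with
   no DRAP simply uses %eax; a regparm(3) function whose argument
   registers are all live falls back to a call-saved register that the
   prologue saves anyway (%ebx, %esi or %edi); only if none of those is
   available do we take the sr->saved path and push/pop a scratch
   register explicitly, matched by release_scratch_register_on_entry.  */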
9631 
9632 /* Release a scratch register obtained from the preceding function.  */
9633 
9634 static void
9635 release_scratch_register_on_entry (struct scratch_reg *sr)
9636 {
9637   if (sr->saved)
9638     {
9639       struct machine_function *m = cfun->machine;
9640       rtx x, insn = emit_insn (gen_pop (sr->reg));
9641 
9642       /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop.  */
9643       RTX_FRAME_RELATED_P (insn) = 1;
9644       x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9645       x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9646       add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9647       m->fs.sp_offset -= UNITS_PER_WORD;
9648     }
9649 }
9650 
9651 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
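
/* Note: with the usual default of STACK_CHECK_PROBE_INTERVAL_EXP (12),
   PROBE_INTERVAL works out to 4096 bytes, i.e. one probe per page;
   targets may override the exponent.  */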
9652 
9653 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.  */
9654 
9655 static void
9656 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9657 {
9658   /* We skip the probe for the first interval + a small dope of 4 words and
9659      probe that many bytes past the specified size to maintain a protection
9660      area at the bottom of the stack.  */
9661   const int dope = 4 * UNITS_PER_WORD;
9662   rtx size_rtx = GEN_INT (size), last;
9663 
9664   /* See if we have a constant small number of probes to generate.  If so,
9665      that's the easy case.  The run-time loop is made up of 11 insns in the
9666      generic case while the compile-time loop is made up of 3+2*(n-1) insns
9667      for n # of intervals.  */
9668   if (size <= 5 * PROBE_INTERVAL)
9669     {
9670       HOST_WIDE_INT i, adjust;
9671       bool first_probe = true;
9672 
9673       /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9674 	 values of N from 1 until it exceeds SIZE.  If only one probe is
9675 	 needed, this will not generate any code.  Then adjust and probe
9676 	 to PROBE_INTERVAL + SIZE.  */
9677       for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9678 	{
9679 	  if (first_probe)
9680 	    {
9681 	      adjust = 2 * PROBE_INTERVAL + dope;
9682 	      first_probe = false;
9683 	    }
9684 	  else
9685 	    adjust = PROBE_INTERVAL;
9686 
9687 	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9688 				  plus_constant (stack_pointer_rtx, -adjust)));
9689 	  emit_stack_probe (stack_pointer_rtx);
9690 	}
9691 
9692       if (first_probe)
9693 	adjust = size + PROBE_INTERVAL + dope;
9694       else
9695         adjust = size + PROBE_INTERVAL - i;
9696 
9697       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9698 			      plus_constant (stack_pointer_rtx, -adjust)));
9699       emit_stack_probe (stack_pointer_rtx);
9700 
9701       /* Adjust back to account for the additional first interval.  */
9702       last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9703 				     plus_constant (stack_pointer_rtx,
9704 						    PROBE_INTERVAL + dope)));
9705     }
9706 
9707   /* Otherwise, do the same as above, but in a loop.  Note that we must be
9708      extra careful with variables wrapping around because we might be at
9709      the very top (or the very bottom) of the address space and we have
9710      to be able to handle this case properly; in particular, we use an
9711      equality test for the loop condition.  */
9712   else
9713     {
9714       HOST_WIDE_INT rounded_size;
9715       struct scratch_reg sr;
9716 
9717       get_scratch_register_on_entry (&sr);
9718 
9719 
9720       /* Step 1: round SIZE to the previous multiple of the interval.  */
9721 
9722       rounded_size = size & -PROBE_INTERVAL;
9723 
9724 
9725       /* Step 2: compute initial and final value of the loop counter.  */
9726 
9727       /* SP = SP_0 + PROBE_INTERVAL.  */
9728       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9729 			      plus_constant (stack_pointer_rtx,
9730 					     - (PROBE_INTERVAL + dope))));
9731 
9732       /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE.  */
9733       emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9734       emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9735 			      gen_rtx_PLUS (Pmode, sr.reg,
9736 					    stack_pointer_rtx)));
9737 
9738 
9739       /* Step 3: the loop
9740 
9741 	 while (SP != LAST_ADDR)
9742 	   {
9743 	     SP = SP + PROBE_INTERVAL
9744 	     probe at SP
9745 	   }
9746 
9747 	 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9748 	 values of N from 1 until it is equal to ROUNDED_SIZE.  */
9749 
9750       emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9751 
9752 
9753       /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9754 	 assert at compile-time that SIZE is equal to ROUNDED_SIZE.  */
9755 
9756       if (size != rounded_size)
9757 	{
9758 	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9759 			          plus_constant (stack_pointer_rtx,
9760 						 rounded_size - size)));
9761 	  emit_stack_probe (stack_pointer_rtx);
9762 	}
9763 
9764       /* Adjust back to account for the additional first interval.  */
9765       last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9766 				     plus_constant (stack_pointer_rtx,
9767 						    PROBE_INTERVAL + dope)));
9768 
9769       release_scratch_register_on_entry (&sr);
9770     }
9771 
9772   gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9773 
9774   /* Even if the stack pointer isn't the CFA register, we need to correctly
9775      describe the adjustments made to it, in particular differentiate the
9776      frame-related ones from the frame-unrelated ones.  */
9777   if (size > 0)
9778     {
9779       rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9780       XVECEXP (expr, 0, 0)
9781 	= gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9782 		       plus_constant (stack_pointer_rtx, -size));
9783       XVECEXP (expr, 0, 1)
9784 	= gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9785 		       plus_constant (stack_pointer_rtx,
9786 				      PROBE_INTERVAL + dope + size));
9787       add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9788       RTX_FRAME_RELATED_P (last) = 1;
9789 
9790       cfun->machine->fs.sp_offset += size;
9791     }
9792 
9793   /* Make sure nothing is scheduled before we are done.  */
9794   emit_insn (gen_blockage ());
9795 }
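
/* A worked example of the constant-size path above (illustrative only;
   PI abbreviates PROBE_INTERVAL and dope = 4 words as defined in the
   function): for SIZE = 3 * PI the loop lowers SP by 2 * PI + dope and
   probes, then lowers it by PI and probes; the tail code lowers it by a
   further PI and probes, and the final adjustment raises SP by
   PI + dope.  The net effect is SP -= SIZE, with the deepest probe
   landing PI + dope bytes below the final SP, preserving the protection
   area described above.  */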
9796 
9797 /* Adjust the stack pointer up to REG while probing it.  */
9798 
9799 const char *
9800 output_adjust_stack_and_probe (rtx reg)
9801 {
9802   static int labelno = 0;
9803   char loop_lab[32], end_lab[32];
9804   rtx xops[2];
9805 
9806   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9807   ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9808 
9809   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9810 
9811   /* Jump to END_LAB if SP == LAST_ADDR.  */
9812   xops[0] = stack_pointer_rtx;
9813   xops[1] = reg;
9814   output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9815   fputs ("\tje\t", asm_out_file);
9816   assemble_name_raw (asm_out_file, end_lab);
9817   fputc ('\n', asm_out_file);
9818 
9819   /* SP = SP + PROBE_INTERVAL.  */
9820   xops[1] = GEN_INT (PROBE_INTERVAL);
9821   output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9822 
9823   /* Probe at SP.  */
9824   xops[1] = const0_rtx;
9825   output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9826 
9827   fprintf (asm_out_file, "\tjmp\t");
9828   assemble_name_raw (asm_out_file, loop_lab);
9829   fputc ('\n', asm_out_file);
9830 
9831   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9832 
9833   return "";
9834 }
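
/* For reference, the loop emitted by output_adjust_stack_and_probe looks
   roughly like this (AT&T syntax, 32-bit register names; the operand
   size and %rsp are used in 64-bit mode):

	.LPSRLn:	cmpl	%REG, %esp
			je	.LPSREn
			subl	$PROBE_INTERVAL, %esp
			orl	$0, (%esp)
			jmp	.LPSRLn
	.LPSREn:
*/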
9835 
9836 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9837    inclusive.  These are offsets from the current stack pointer.  */
9838 
9839 static void
9840 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9841 {
9842   /* See if we have a constant small number of probes to generate.  If so,
9843      that's the easy case.  The run-time loop is made up of 7 insns in the
9844      generic case while the compile-time loop is made up of n insns for n #
9845      of intervals.  */
9846   if (size <= 7 * PROBE_INTERVAL)
9847     {
9848       HOST_WIDE_INT i;
9849 
9850       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9851 	 it exceeds SIZE.  If only one probe is needed, this will not
9852 	 generate any code.  Then probe at FIRST + SIZE.  */
9853       for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9854 	emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9855 
9856       emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9857     }
9858 
9859   /* Otherwise, do the same as above, but in a loop.  Note that we must be
9860      extra careful with variables wrapping around because we might be at
9861      the very top (or the very bottom) of the address space and we have
9862      to be able to handle this case properly; in particular, we use an
9863      equality test for the loop condition.  */
9864   else
9865     {
9866       HOST_WIDE_INT rounded_size, last;
9867       struct scratch_reg sr;
9868 
9869       get_scratch_register_on_entry (&sr);
9870 
9871 
9872       /* Step 1: round SIZE to the previous multiple of the interval.  */
9873 
9874       rounded_size = size & -PROBE_INTERVAL;
9875 
9876 
9877       /* Step 2: compute initial and final value of the loop counter.  */
9878 
9879       /* TEST_OFFSET = FIRST.  */
9880       emit_move_insn (sr.reg, GEN_INT (-first));
9881 
9882       /* LAST_OFFSET = FIRST + ROUNDED_SIZE.  */
9883       last = first + rounded_size;
9884 
9885 
9886       /* Step 3: the loop
9887 
9888 	 while (TEST_ADDR != LAST_ADDR)
9889 	   {
9890 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9891 	     probe at TEST_ADDR
9892 	   }
9893 
9894          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9895          until it is equal to ROUNDED_SIZE.  */
9896 
9897       emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9898 
9899 
9900       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9901 	 that SIZE is equal to ROUNDED_SIZE.  */
9902 
9903       if (size != rounded_size)
9904 	emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9905 						       stack_pointer_rtx,
9906 						       sr.reg),
9907 					 rounded_size - size));
9908 
9909       release_scratch_register_on_entry (&sr);
9910     }
9911 
9912   /* Make sure nothing is scheduled before we are done.  */
9913   emit_insn (gen_blockage ());
9914 }
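
/* For instance (a sketch of the constant-size path above, as called from
   ix86_expand_prologue with FIRST = STACK_CHECK_PROTECT): when SIZE is
   no larger than 7 * PROBE_INTERVAL, the code simply emits probes at
   SP - (FIRST + PROBE_INTERVAL), SP - (FIRST + 2 * PROBE_INTERVAL), ...
   and finally at SP - (FIRST + SIZE), without touching the stack
   pointer itself.  */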
9915 
9916 /* Probe a range of stack addresses from REG to END, inclusive.  These are
9917    offsets from the current stack pointer.  */
9918 
9919 const char *
9920 output_probe_stack_range (rtx reg, rtx end)
9921 {
9922   static int labelno = 0;
9923   char loop_lab[32], end_lab[32];
9924   rtx xops[3];
9925 
9926   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9927   ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9928 
9929   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9930 
9931   /* Jump to END_LAB if TEST_ADDR == LAST_ADDR.  */
9932   xops[0] = reg;
9933   xops[1] = end;
9934   output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9935   fputs ("\tje\t", asm_out_file);
9936   assemble_name_raw (asm_out_file, end_lab);
9937   fputc ('\n', asm_out_file);
9938 
9939   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
9940   xops[1] = GEN_INT (PROBE_INTERVAL);
9941   output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9942 
9943   /* Probe at TEST_ADDR.  */
9944   xops[0] = stack_pointer_rtx;
9945   xops[1] = reg;
9946   xops[2] = const0_rtx;
9947   output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9948 
9949   fprintf (asm_out_file, "\tjmp\t");
9950   assemble_name_raw (asm_out_file, loop_lab);
9951   fputc ('\n', asm_out_file);
9952 
9953   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9954 
9955   return "";
9956 }
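
/* The loop emitted by output_probe_stack_range is analogous to the one
   shown after output_adjust_stack_and_probe, except that the stack
   pointer is left alone: the scratch register holds a negated offset
   that is decremented by PROBE_INTERVAL on each iteration, and the
   probe is an "orl $0, (%esp,%REG)" at the corresponding address.  */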
9957 
9958 /* Finalize the stack_realign_needed flag, which guides generation of the
9959    prologue/epilogue in the correct form.  */
9960 static void
9961 ix86_finalize_stack_realign_flags (void)
9962 {
9963   /* Check if stack realignment is really needed after reload, and
9964      store the result in cfun.  */
9965   unsigned int incoming_stack_boundary
9966     = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9967        ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9968   unsigned int stack_realign = (incoming_stack_boundary
9969 				< (current_function_is_leaf
9970 				   ? crtl->max_used_stack_slot_alignment
9971 				   : crtl->stack_alignment_needed));
9972 
9973   if (crtl->stack_realign_finalized)
9974     {
9975       /* After stack_realign_needed is finalized, we can no longer
9976 	 change it.  */
9977       gcc_assert (crtl->stack_realign_needed == stack_realign);
9978       return;
9979     }
9980 
9981   /* If the only reason for frame_pointer_needed is that we conservatively
9982      assumed stack realignment might be needed, but in the end nothing that
9983      needed the stack alignment had been spilled, clear frame_pointer_needed
9984      and say we don't need stack realignment.  */
9985   if (stack_realign
9986       && !crtl->need_drap
9987       && frame_pointer_needed
9988       && current_function_is_leaf
9989       && flag_omit_frame_pointer
9990       && current_function_sp_is_unchanging
9991       && !ix86_current_function_calls_tls_descriptor
9992       && !crtl->accesses_prior_frames
9993       && !cfun->calls_alloca
9994       && !crtl->calls_eh_return
9995       && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9996       && !ix86_frame_pointer_required ()
9997       && get_frame_size () == 0
9998       && ix86_nsaved_sseregs () == 0
9999       && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10000     {
10001       HARD_REG_SET set_up_by_prologue, prologue_used;
10002       basic_block bb;
10003 
10004       CLEAR_HARD_REG_SET (prologue_used);
10005       CLEAR_HARD_REG_SET (set_up_by_prologue);
10006       add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10007       add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10008       add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10009 			   HARD_FRAME_POINTER_REGNUM);
10010       FOR_EACH_BB (bb)
10011         {
10012           rtx insn;
10013 	  FOR_BB_INSNS (bb, insn)
10014 	    if (NONDEBUG_INSN_P (insn)
10015 		&& requires_stack_frame_p (insn, prologue_used,
10016 					   set_up_by_prologue))
10017 	      {
10018 		crtl->stack_realign_needed = stack_realign;
10019 		crtl->stack_realign_finalized = true;
10020 		return;
10021 	      }
10022 	}
10023 
10024       frame_pointer_needed = false;
10025       stack_realign = false;
10026       crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10027       crtl->stack_alignment_needed = incoming_stack_boundary;
10028       crtl->stack_alignment_estimated = incoming_stack_boundary;
10029       if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10030 	crtl->preferred_stack_boundary = incoming_stack_boundary;
10031       df_finish_pass (true);
10032       df_scan_alloc (NULL);
10033       df_scan_blocks ();
10034       df_compute_regs_ever_live (true);
10035       df_analyze ();
10036     }
10037 
10038   crtl->stack_realign_needed = stack_realign;
10039   crtl->stack_realign_finalized = true;
10040 }
10041 
10042 /* Expand the prologue into a bunch of separate insns.  */
10043 
10044 void
10045 ix86_expand_prologue (void)
10046 {
10047   struct machine_function *m = cfun->machine;
10048   rtx insn, t;
10049   bool pic_reg_used;
10050   struct ix86_frame frame;
10051   HOST_WIDE_INT allocate;
10052   bool int_registers_saved;
10053   bool sse_registers_saved;
10054 
10055   ix86_finalize_stack_realign_flags ();
10056 
10057   /* DRAP should not coexist with stack_realign_fp */
10058   gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10059 
10060   memset (&m->fs, 0, sizeof (m->fs));
10061 
10062   /* Initialize CFA state for before the prologue.  */
10063   m->fs.cfa_reg = stack_pointer_rtx;
10064   m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10065 
10066   /* Track SP offset to the CFA.  We continue tracking this after we've
10067      swapped the CFA register away from SP.  In the case of re-alignment
10068      this is fudged; we're interested in offsets within the local frame.  */
10069   m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10070   m->fs.sp_valid = true;
10071 
10072   ix86_compute_frame_layout (&frame);
10073 
10074   if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10075     {
10076       /* We should have already generated an error for any use of
10077          ms_hook on a nested function.  */
10078       gcc_checking_assert (!ix86_static_chain_on_stack);
10079 
10080       /* Check if profiling is active and we shall use the
10081          profiling-before-prologue variant.  If so, sorry.  */
10082       if (crtl->profile && flag_fentry != 0)
10083         sorry ("ms_hook_prologue attribute isn%'t compatible "
10084 	       "with -mfentry for 32-bit");
10085 
10086       /* In ix86_asm_output_function_label we emitted:
10087 	 8b ff     movl.s %edi,%edi
10088 	 55        push   %ebp
10089 	 8b ec     movl.s %esp,%ebp
10090 
10091 	 This matches the hookable function prologue in Win32 API
10092 	 functions in Microsoft Windows XP Service Pack 2 and newer.
10093 	 Wine uses this to enable Windows apps to hook the Win32 API
10094 	 functions provided by Wine.
10095 
10096 	 What that means is that we've already set up the frame pointer.  */
10097 
10098       if (frame_pointer_needed
10099 	  && !(crtl->drap_reg && crtl->stack_realign_needed))
10100 	{
10101 	  rtx push, mov;
10102 
10103 	  /* We've decided to use the frame pointer already set up.
10104 	     Describe this to the unwinder by pretending that both
10105 	     push and mov insns happen right here.
10106 
10107 	     Putting the unwind info here at the end of the ms_hook
10108 	     is done so that we can make absolutely certain we get
10109 	     the required byte sequence at the start of the function,
10110 	     rather than relying on an assembler that can produce
10111 	     the exact encoding required.
10112 
10113 	     However it does mean (in the unpatched case) that we have
10114 	     a 1 insn window where the asynchronous unwind info is
10115 	     incorrect.  However, if we placed the unwind info at
10116 	     its correct location we would have incorrect unwind info
10117 	     in the patched case.  Which is probably all moot since
10118 	     I don't expect Wine generates dwarf2 unwind info for the
10119 	     system libraries that use this feature.  */
10120 
10121 	  insn = emit_insn (gen_blockage ());
10122 
10123 	  push = gen_push (hard_frame_pointer_rtx);
10124 	  mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10125 			     stack_pointer_rtx);
10126 	  RTX_FRAME_RELATED_P (push) = 1;
10127 	  RTX_FRAME_RELATED_P (mov) = 1;
10128 
10129 	  RTX_FRAME_RELATED_P (insn) = 1;
10130 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10131 			gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10132 
10133 	  /* Note that gen_push incremented m->fs.cfa_offset, even
10134 	     though we didn't emit the push insn here.  */
10135 	  m->fs.cfa_reg = hard_frame_pointer_rtx;
10136 	  m->fs.fp_offset = m->fs.cfa_offset;
10137 	  m->fs.fp_valid = true;
10138 	}
10139       else
10140 	{
10141 	  /* The frame pointer is not needed so pop %ebp again.
10142 	     This leaves us with a pristine state.  */
10143 	  emit_insn (gen_pop (hard_frame_pointer_rtx));
10144 	}
10145     }
10146 
10147   /* The first insn of a function that accepts its static chain on the
10148      stack is to push the register that would be filled in by a direct
10149      call.  This insn will be skipped by the trampoline.  */
10150   else if (ix86_static_chain_on_stack)
10151     {
10152       insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10153       emit_insn (gen_blockage ());
10154 
10155       /* We don't want to interpret this push insn as a register save,
10156 	 only as a stack adjustment.  The real copy of the register as
10157 	 a save will be done later, if needed.  */
10158       t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10159       t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10160       add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10161       RTX_FRAME_RELATED_P (insn) = 1;
10162     }
10163 
10164   /* Emit prologue code to adjust stack alignment and set up the DRAP, in
10165      case DRAP is needed and stack realignment is really needed after reload.  */
10166   if (stack_realign_drap)
10167     {
10168       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10169 
10170       /* Only need to push parameter pointer reg if it is caller saved.  */
10171       if (!call_used_regs[REGNO (crtl->drap_reg)])
10172 	{
10173 	  /* Push arg pointer reg */
10174 	  insn = emit_insn (gen_push (crtl->drap_reg));
10175 	  RTX_FRAME_RELATED_P (insn) = 1;
10176 	}
10177 
10178       /* Grab the argument pointer.  */
10179       t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10180       insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10181       RTX_FRAME_RELATED_P (insn) = 1;
10182       m->fs.cfa_reg = crtl->drap_reg;
10183       m->fs.cfa_offset = 0;
10184 
10185       /* Align the stack.  */
10186       insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10187 					stack_pointer_rtx,
10188 					GEN_INT (-align_bytes)));
10189       RTX_FRAME_RELATED_P (insn) = 1;
10190 
10191       /* Replicate the return address on the stack so that the return
10192 	 address can be reached via the (argp - 1) slot.  This is needed
10193 	 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10194 	 expand_builtin_return_addr etc.  */
10195       t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10196       t = gen_frame_mem (Pmode, t);
10197       insn = emit_insn (gen_push (t));
10198       RTX_FRAME_RELATED_P (insn) = 1;
10199 
10200       /* For the purposes of frame and register save area addressing,
10201 	 we've started over with a new frame.  */
10202       m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10203       m->fs.realigned = true;
10204     }
10205 
10206   int_registers_saved = (frame.nregs == 0);
10207   sse_registers_saved = (frame.nsseregs == 0);
10208 
10209   if (frame_pointer_needed && !m->fs.fp_valid)
10210     {
10211       /* Note: AT&T enter does NOT have reversed args.  Enter is probably
10212          slower on all targets.  Also sdb doesn't like it.  */
10213       insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10214       RTX_FRAME_RELATED_P (insn) = 1;
10215 
10216       /* Push registers now, before setting the frame pointer
10217 	 on SEH target.  */
10218       if (!int_registers_saved
10219 	  && TARGET_SEH
10220 	  && !frame.save_regs_using_mov)
10221 	{
10222 	  ix86_emit_save_regs ();
10223 	  int_registers_saved = true;
10224 	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10225 	}
10226 
10227       if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10228 	{
10229 	  insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10230 	  RTX_FRAME_RELATED_P (insn) = 1;
10231 
10232 	  if (m->fs.cfa_reg == stack_pointer_rtx)
10233 	    m->fs.cfa_reg = hard_frame_pointer_rtx;
10234 	  m->fs.fp_offset = m->fs.sp_offset;
10235 	  m->fs.fp_valid = true;
10236 	}
10237     }
10238 
10239   if (!int_registers_saved)
10240     {
10241       /* If saving registers via PUSH, do so now.  */
10242       if (!frame.save_regs_using_mov)
10243 	{
10244 	  ix86_emit_save_regs ();
10245 	  int_registers_saved = true;
10246 	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10247 	}
10248 
10249       /* When using the red zone we may start register saving before allocating
10250 	 the stack frame, saving one cycle of the prologue.  However, avoid
10251 	 doing this if we have to probe the stack; at least on x86_64 the
10252 	 stack probe can turn into a call that clobbers a red zone location. */
10253       else if (ix86_using_red_zone ()
10254 	       && (! TARGET_STACK_PROBE
10255 		   || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10256 	{
10257 	  ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10258 	  int_registers_saved = true;
10259 	}
10260     }
10261 
10262   if (stack_realign_fp)
10263     {
10264       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10265       gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10266 
10267       /* The computation of the size of the re-aligned stack frame means
10268 	 that we must allocate the size of the register save area before
10269 	 performing the actual alignment.  Otherwise we cannot guarantee
10270 	 that there's enough storage above the realignment point.  */
10271       if (m->fs.sp_offset != frame.sse_reg_save_offset)
10272         pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10273 				   GEN_INT (m->fs.sp_offset
10274 					    - frame.sse_reg_save_offset),
10275 				   -1, false);
10276 
10277       /* Align the stack.  */
10278       insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10279 					stack_pointer_rtx,
10280 					GEN_INT (-align_bytes)));
10281 
10282       /* For the purposes of register save area addressing, the stack
10283          pointer is no longer valid.  As for the value of sp_offset,
10284 	 see ix86_compute_frame_layout, which we need to match in order
10285 	 to pass verification of stack_pointer_offset at the end.  */
10286       m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10287       m->fs.sp_valid = false;
10288     }
10289 
10290   allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10291 
10292   if (flag_stack_usage_info)
10293     {
10294       /* We start to count from ARG_POINTER.  */
10295       HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10296 
10297       /* If it was realigned, take into account the fake frame.  */
10298       if (stack_realign_drap)
10299 	{
10300 	  if (ix86_static_chain_on_stack)
10301 	    stack_size += UNITS_PER_WORD;
10302 
10303 	  if (!call_used_regs[REGNO (crtl->drap_reg)])
10304 	    stack_size += UNITS_PER_WORD;
10305 
10306 	  /* This over-estimates by 1 minimal-stack-alignment-unit but
10307 	     mitigates that by counting in the new return address slot.  */
10308 	  current_function_dynamic_stack_size
10309 	    += crtl->stack_alignment_needed / BITS_PER_UNIT;
10310 	}
10311 
10312       current_function_static_stack_size = stack_size;
10313     }
10314 
10315   /* On SEH target with very large frame size, allocate an area to save
10316      SSE registers (as the very large allocation won't be described).  */
10317   if (TARGET_SEH
10318       && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10319       && !sse_registers_saved)
10320     {
10321       HOST_WIDE_INT sse_size =
10322 	frame.sse_reg_save_offset - frame.reg_save_offset;
10323 
10324       gcc_assert (int_registers_saved);
10325 
10326       /* No need to do stack checking as the area will be immediately
10327 	 written.  */
10328       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10329 			         GEN_INT (-sse_size), -1,
10330 				 m->fs.cfa_reg == stack_pointer_rtx);
10331       allocate -= sse_size;
10332       ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10333       sse_registers_saved = true;
10334     }
10335 
10336   /* The stack has already been decremented by the instruction calling us
10337      so probe if the size is non-negative to preserve the protection area.  */
10338   if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10339     {
10340       /* We expect the registers to be saved when probes are used.  */
10341       gcc_assert (int_registers_saved);
10342 
10343       if (STACK_CHECK_MOVING_SP)
10344 	{
10345 	  ix86_adjust_stack_and_probe (allocate);
10346 	  allocate = 0;
10347 	}
10348       else
10349 	{
10350 	  HOST_WIDE_INT size = allocate;
10351 
10352 	  if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10353 	    size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10354 
10355 	  if (TARGET_STACK_PROBE)
10356 	    ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10357 	  else
10358 	    ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10359 	}
10360     }
10361 
10362   if (allocate == 0)
10363     ;
10364   else if (!ix86_target_stack_probe ()
10365 	   || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10366     {
10367       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10368 			         GEN_INT (-allocate), -1,
10369 			         m->fs.cfa_reg == stack_pointer_rtx);
10370     }
10371   else
10372     {
10373       rtx eax = gen_rtx_REG (Pmode, AX_REG);
10374       rtx r10 = NULL;
10375       rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10376       const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10377       bool eax_live = false;
10378       bool r10_live = false;
10379 
10380       if (TARGET_64BIT)
10381         r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10382       if (!TARGET_64BIT_MS_ABI)
10383         eax_live = ix86_eax_live_at_start_p ();
10384 
10385       /* Note that SEH directives need to continue tracking the stack
10386 	 pointer even after the frame pointer has been set up.  */
10387       if (eax_live)
10388 	{
10389 	  insn = emit_insn (gen_push (eax));
10390 	  allocate -= UNITS_PER_WORD;
10391 	  if (sp_is_cfa_reg || TARGET_SEH)
10392 	    {
10393 	      if (sp_is_cfa_reg)
10394 		m->fs.cfa_offset += UNITS_PER_WORD;
10395 	      RTX_FRAME_RELATED_P (insn) = 1;
10396 	    }
10397 	}
10398 
10399       if (r10_live)
10400 	{
10401 	  r10 = gen_rtx_REG (Pmode, R10_REG);
10402 	  insn = emit_insn (gen_push (r10));
10403 	  allocate -= UNITS_PER_WORD;
10404 	  if (sp_is_cfa_reg || TARGET_SEH)
10405 	    {
10406 	      if (sp_is_cfa_reg)
10407 		m->fs.cfa_offset += UNITS_PER_WORD;
10408 	      RTX_FRAME_RELATED_P (insn) = 1;
10409 	    }
10410 	}
10411 
10412       emit_move_insn (eax, GEN_INT (allocate));
10413       emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10414 
10415       /* Use the fact that AX still contains ALLOCATE.  */
10416       adjust_stack_insn = (TARGET_64BIT
10417 			   ? gen_pro_epilogue_adjust_stack_di_sub
10418 			   : gen_pro_epilogue_adjust_stack_si_sub);
10419 
10420       insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10421 					   stack_pointer_rtx, eax));
10422 
10423       if (sp_is_cfa_reg || TARGET_SEH)
10424 	{
10425 	  if (sp_is_cfa_reg)
10426 	    m->fs.cfa_offset += allocate;
10427 	  RTX_FRAME_RELATED_P (insn) = 1;
10428 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10429 			gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10430 				     plus_constant (stack_pointer_rtx,
10431 						    -allocate)));
10432 	}
10433       m->fs.sp_offset += allocate;
10434 
10435       if (r10_live && eax_live)
10436         {
10437 	  t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
10438 	  emit_move_insn (r10, gen_frame_mem (Pmode, t));
10439 	  t = plus_constant (t, UNITS_PER_WORD);
10440 	  emit_move_insn (eax, gen_frame_mem (Pmode, t));
10441 	}
10442       else if (eax_live || r10_live)
10443 	{
10444 	  t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
10445 	  emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10446 	}
10447     }
10448   gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10449 
10450   /* If we haven't already set up the frame pointer, do so now.  */
10451   if (frame_pointer_needed && !m->fs.fp_valid)
10452     {
10453       insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10454 			    GEN_INT (frame.stack_pointer_offset
10455 				     - frame.hard_frame_pointer_offset));
10456       insn = emit_insn (insn);
10457       RTX_FRAME_RELATED_P (insn) = 1;
10458       add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10459 
10460       if (m->fs.cfa_reg == stack_pointer_rtx)
10461 	m->fs.cfa_reg = hard_frame_pointer_rtx;
10462       m->fs.fp_offset = frame.hard_frame_pointer_offset;
10463       m->fs.fp_valid = true;
10464     }
10465 
10466   if (!int_registers_saved)
10467     ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10468   if (!sse_registers_saved)
10469     ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10470 
10471   pic_reg_used = false;
10472   if (pic_offset_table_rtx
10473       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10474 	  || crtl->profile))
10475     {
10476       unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10477 
10478       if (alt_pic_reg_used != INVALID_REGNUM)
10479 	SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10480 
10481       pic_reg_used = true;
10482     }
10483 
10484   if (pic_reg_used)
10485     {
10486       if (TARGET_64BIT)
10487 	{
10488 	  if (ix86_cmodel == CM_LARGE_PIC)
10489 	    {
10490               rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10491 	      rtx label = gen_label_rtx ();
10492 	      emit_label (label);
10493 	      LABEL_PRESERVE_P (label) = 1;
10494 	      gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10495 	      insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10496 	      insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10497 	      insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10498 					    pic_offset_table_rtx, tmp_reg));
10499 	    }
10500 	  else
10501             insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10502 	}
10503       else
10504 	{
10505           insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10506 	  RTX_FRAME_RELATED_P (insn) = 1;
10507 	  add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10508 	}
10509     }
10510 
10511   /* In the pic_reg_used case, make sure that the got load isn't deleted
10512      when mcount needs it.  Blockage to avoid call movement across mcount
10513      call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10514      note.  */
10515   if (crtl->profile && !flag_fentry && pic_reg_used)
10516     emit_insn (gen_prologue_use (pic_offset_table_rtx));
10517 
10518   if (crtl->drap_reg && !crtl->stack_realign_needed)
10519     {
10520       /* vDRAP is set up, but after reload it turns out stack realignment
10521          isn't necessary; here we emit prologue code to set up the DRAP
10522          without the stack realignment adjustment.  */
10523       t = choose_baseaddr (0);
10524       emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10525     }
10526 
10527   /* Prevent instructions from being scheduled into register save push
10528      sequence when access to the redzone area is done through frame pointer.
10529      The offset between the frame pointer and the stack pointer is calculated
10530      relative to the value of the stack pointer at the end of the function
10531      prologue, and moving instructions that access redzone area via frame
10532      pointer inside push sequence violates this assumption.  */
10533   if (frame_pointer_needed && frame.red_zone_size)
10534     emit_insn (gen_memory_blockage ());
10535 
10536   /* Emit cld instruction if stringops are used in the function.  */
10537   if (TARGET_CLD && ix86_current_function_needs_cld)
10538     emit_insn (gen_cld ());
10539 
10540   /* SEH requires that the prologue end within 256 bytes of the start of
10541      the function.  Prevent instruction schedules that would extend that.
10542      Further, prevent alloca modifications to the stack pointer from being
10543      combined with prologue modifications.  */
10544   if (TARGET_SEH)
10545     emit_insn (gen_prologue_use (stack_pointer_rtx));
10546 }
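
/* Rough order of the prologue emitted above, for orientation (not every
   step applies to every function): ms_hook or static-chain handling,
   DRAP setup and stack re-alignment if needed, the frame pointer push
   and integer register pushes, the fp-based (-mstackrealign)
   re-alignment, the main stack allocation (with probing when stack
   checking is enabled), any remaining integer/SSE register saves via
   moves, and finally PIC register setup and the trailing
   red-zone/CLD/SEH bookkeeping.  */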
10547 
10548 /* Emit code to restore REG using a POP insn.  */
10549 
10550 static void
10551 ix86_emit_restore_reg_using_pop (rtx reg)
10552 {
10553   struct machine_function *m = cfun->machine;
10554   rtx insn = emit_insn (gen_pop (reg));
10555 
10556   ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10557   m->fs.sp_offset -= UNITS_PER_WORD;
10558 
10559   if (m->fs.cfa_reg == crtl->drap_reg
10560       && REGNO (reg) == REGNO (crtl->drap_reg))
10561     {
10562       /* Previously we'd represented the CFA as an expression
10563 	 like *(%ebp - 8).  We've just popped that value from
10564 	 the stack, which means we need to reset the CFA to
10565 	 the drap register.  This will remain until we restore
10566 	 the stack pointer.  */
10567       add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10568       RTX_FRAME_RELATED_P (insn) = 1;
10569 
10570       /* This means that the DRAP register is valid for addressing too.  */
10571       m->fs.drap_valid = true;
10572       return;
10573     }
10574 
10575   if (m->fs.cfa_reg == stack_pointer_rtx)
10576     {
10577       rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10578       x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10579       add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10580       RTX_FRAME_RELATED_P (insn) = 1;
10581 
10582       m->fs.cfa_offset -= UNITS_PER_WORD;
10583     }
10584 
10585   /* When the frame pointer is the CFA, and we pop it, we are
10586      swapping back to the stack pointer as the CFA.  This happens
10587      for stack frames that don't allocate other data, so we assume
10588      the stack pointer is now pointing at the return address, i.e.
10589      the function entry state, which makes the offset be 1 word.  */
10590   if (reg == hard_frame_pointer_rtx)
10591     {
10592       m->fs.fp_valid = false;
10593       if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10594 	{
10595 	  m->fs.cfa_reg = stack_pointer_rtx;
10596 	  m->fs.cfa_offset -= UNITS_PER_WORD;
10597 
10598 	  add_reg_note (insn, REG_CFA_DEF_CFA,
10599 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10600 				      GEN_INT (m->fs.cfa_offset)));
10601 	  RTX_FRAME_RELATED_P (insn) = 1;
10602 	}
10603     }
10604 }
10605 
10606 /* Emit code to restore saved registers using POP insns.  */
10607 
10608 static void
10609 ix86_emit_restore_regs_using_pop (void)
10610 {
10611   unsigned int regno;
10612 
10613   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10614     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10615       ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10616 }
10617 
10618 /* Emit code and notes for the LEAVE instruction.  */
10619 
10620 static void
10621 ix86_emit_leave (void)
10622 {
10623   struct machine_function *m = cfun->machine;
10624   rtx insn = emit_insn (ix86_gen_leave ());
10625 
10626   ix86_add_queued_cfa_restore_notes (insn);
10627 
10628   gcc_assert (m->fs.fp_valid);
10629   m->fs.sp_valid = true;
10630   m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10631   m->fs.fp_valid = false;
10632 
10633   if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10634     {
10635       m->fs.cfa_reg = stack_pointer_rtx;
10636       m->fs.cfa_offset = m->fs.sp_offset;
10637 
10638       add_reg_note (insn, REG_CFA_DEF_CFA,
10639 		    plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10640       RTX_FRAME_RELATED_P (insn) = 1;
10641     }
10642   ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10643 			     m->fs.fp_offset);
10644 }
10645 
10646 /* Emit code to restore saved registers using MOV insns.
10647    First register is restored from CFA - CFA_OFFSET.  */
10648 static void
10649 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10650 				  bool maybe_eh_return)
10651 {
10652   struct machine_function *m = cfun->machine;
10653   unsigned int regno;
10654 
10655   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10656     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10657       {
10658 	rtx reg = gen_rtx_REG (Pmode, regno);
10659 	rtx insn, mem;
10660 
10661 	mem = choose_baseaddr (cfa_offset);
10662 	mem = gen_frame_mem (Pmode, mem);
10663 	insn = emit_move_insn (reg, mem);
10664 
10665         if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10666 	  {
10667 	    /* Previously we'd represented the CFA as an expression
10668 	       like *(%ebp - 8).  We've just loaded that value from
10669 	       the stack, which means we need to reset the CFA to
10670 	       the drap register.  This will remain until we restore
10671 	       the stack pointer.  */
10672 	    add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10673 	    RTX_FRAME_RELATED_P (insn) = 1;
10674 
10675 	    /* This means that the DRAP register is valid for addressing.  */
10676 	    m->fs.drap_valid = true;
10677 	  }
10678 	else
10679 	  ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10680 
10681 	cfa_offset -= UNITS_PER_WORD;
10682       }
10683 }
10684 
10685 /* Emit code to restore saved registers using MOV insns.
10686    First register is restored from CFA - CFA_OFFSET.  */
10687 static void
10688 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10689 				      bool maybe_eh_return)
10690 {
10691   unsigned int regno;
10692 
10693   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10694     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10695       {
10696 	rtx reg = gen_rtx_REG (V4SFmode, regno);
10697 	rtx mem;
10698 
10699 	mem = choose_baseaddr (cfa_offset);
10700 	mem = gen_rtx_MEM (V4SFmode, mem);
10701 	set_mem_align (mem, 128);
10702 	emit_move_insn (reg, mem);
10703 
10704 	ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10705 
10706 	cfa_offset -= 16;
10707       }
10708 }
10709 
10710 /* Emit vzeroupper if needed.  */
10711 
10712 void
10713 ix86_maybe_emit_epilogue_vzeroupper (void)
10714 {
10715   if (TARGET_VZEROUPPER
10716       && !TREE_THIS_VOLATILE (cfun->decl)
10717       && !cfun->machine->caller_return_avx256_p)
10718     emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10719 }
10720 
10721 /* Restore function stack, frame, and registers.  */
10722 
10723 void
10724 ix86_expand_epilogue (int style)
10725 {
10726   struct machine_function *m = cfun->machine;
10727   struct machine_frame_state frame_state_save = m->fs;
10728   struct ix86_frame frame;
10729   bool restore_regs_via_mov;
10730   bool using_drap;
10731 
10732   ix86_finalize_stack_realign_flags ();
10733   ix86_compute_frame_layout (&frame);
10734 
10735   m->fs.sp_valid = (!frame_pointer_needed
10736 		    || (current_function_sp_is_unchanging
10737 			&& !stack_realign_fp));
10738   gcc_assert (!m->fs.sp_valid
10739 	      || m->fs.sp_offset == frame.stack_pointer_offset);
10740 
10741   /* The FP must be valid if the frame pointer is present.  */
10742   gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10743   gcc_assert (!m->fs.fp_valid
10744 	      || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10745 
10746   /* We must have *some* valid pointer to the stack frame.  */
10747   gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10748 
10749   /* The DRAP is never valid at this point.  */
10750   gcc_assert (!m->fs.drap_valid);
10751 
10752   /* See the comment about red zone and frame
10753      pointer usage in ix86_expand_prologue.  */
10754   if (frame_pointer_needed && frame.red_zone_size)
10755     emit_insn (gen_memory_blockage ());
10756 
10757   using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10758   gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10759 
10760   /* Determine the CFA offset of the end of the red-zone.  */
10761   m->fs.red_zone_offset = 0;
10762   if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10763     {
10764       /* The red-zone begins below the return address.  */
10765       m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10766 
10767       /* When the register save area is in the aligned portion of
10768          the stack, determine the maximum runtime displacement that
10769 	 matches up with the aligned frame.  */
10770       if (stack_realign_drap)
10771 	m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10772 				  + UNITS_PER_WORD);
10773     }
10774 
10775   /* Special care must be taken for the normal return case of a function
10776      using eh_return: the eax and edx registers are marked as saved, but
10777      not restored along this path.  Adjust the save location to match.  */
10778   if (crtl->calls_eh_return && style != 2)
10779     frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10780 
10781   /* EH_RETURN requires the use of moves to function properly.  */
10782   if (crtl->calls_eh_return)
10783     restore_regs_via_mov = true;
10784   /* SEH requires the use of pops to identify the epilogue.  */
10785   else if (TARGET_SEH)
10786     restore_regs_via_mov = false;
10787   /* If we're only restoring one register and sp is not valid then
10788      use a move instruction to restore the register, since it's
10789      less work than reloading sp and popping the register.  */
10790   else if (!m->fs.sp_valid && frame.nregs <= 1)
10791     restore_regs_via_mov = true;
10792   else if (TARGET_EPILOGUE_USING_MOVE
10793 	   && cfun->machine->use_fast_prologue_epilogue
10794 	   && (frame.nregs > 1
10795 	       || m->fs.sp_offset != frame.reg_save_offset))
10796     restore_regs_via_mov = true;
10797   else if (frame_pointer_needed
10798 	   && !frame.nregs
10799 	   && m->fs.sp_offset != frame.reg_save_offset)
10800     restore_regs_via_mov = true;
10801   else if (frame_pointer_needed
10802 	   && TARGET_USE_LEAVE
10803 	   && cfun->machine->use_fast_prologue_epilogue
10804 	   && frame.nregs == 1)
10805     restore_regs_via_mov = true;
10806   else
10807     restore_regs_via_mov = false;
10808 
10809   if (restore_regs_via_mov || frame.nsseregs)
10810     {
10811       /* Ensure that the entire register save area is addressable via
10812 	 the stack pointer, if we will restore via sp.  */
10813       if (TARGET_64BIT
10814 	  && m->fs.sp_offset > 0x7fffffff
10815 	  && !(m->fs.fp_valid || m->fs.drap_valid)
10816 	  && (frame.nsseregs + frame.nregs) != 0)
10817 	{
10818 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10819 				     GEN_INT (m->fs.sp_offset
10820 					      - frame.sse_reg_save_offset),
10821 				     style,
10822 				     m->fs.cfa_reg == stack_pointer_rtx);
10823 	}
10824     }
10825 
10826   /* If there are any SSE registers to restore, then we have to do it
10827      via moves, since there's obviously no pop for SSE regs.  */
10828   if (frame.nsseregs)
10829     ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10830 					  style == 2);
10831 
10832   if (restore_regs_via_mov)
10833     {
10834       rtx t;
10835 
10836       if (frame.nregs)
10837 	ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10838 
10839       /* eh_return epilogues need %ecx added to the stack pointer.  */
10840       if (style == 2)
10841 	{
10842 	  rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10843 
10844 	  /* Stack align doesn't work with eh_return.  */
10845 	  gcc_assert (!stack_realign_drap);
10846 	  /* Neither do regparm nested functions.  */
10847 	  gcc_assert (!ix86_static_chain_on_stack);
10848 
10849 	  if (frame_pointer_needed)
10850 	    {
10851 	      t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10852 	      t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10853 	      emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10854 
10855 	      t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10856 	      insn = emit_move_insn (hard_frame_pointer_rtx, t);
10857 
10858 	      /* Note that we use SA as a temporary CFA, as the return
10859 		 address is at the proper place relative to it.  We
10860 		 pretend this happens at the FP restore insn because
10861 		 prior to this insn the FP would be stored at the wrong
10862 		 offset relative to SA, and after this insn we have no
10863 		 other reasonable register to use for the CFA.  We don't
10864 		 bother resetting the CFA to the SP for the duration of
10865 		 the return insn.  */
10866 	      add_reg_note (insn, REG_CFA_DEF_CFA,
10867 			    plus_constant (sa, UNITS_PER_WORD));
10868 	      ix86_add_queued_cfa_restore_notes (insn);
10869 	      add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10870 	      RTX_FRAME_RELATED_P (insn) = 1;
10871 
10872 	      m->fs.cfa_reg = sa;
10873 	      m->fs.cfa_offset = UNITS_PER_WORD;
10874 	      m->fs.fp_valid = false;
10875 
10876 	      pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10877 					 const0_rtx, style, false);
10878 	    }
10879 	  else
10880 	    {
10881 	      t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10882 	      t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10883 	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10884 	      ix86_add_queued_cfa_restore_notes (insn);
10885 
10886 	      gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10887 	      if (m->fs.cfa_offset != UNITS_PER_WORD)
10888 		{
10889 		  m->fs.cfa_offset = UNITS_PER_WORD;
10890 		  add_reg_note (insn, REG_CFA_DEF_CFA,
10891 				plus_constant (stack_pointer_rtx,
10892 					       UNITS_PER_WORD));
10893 		  RTX_FRAME_RELATED_P (insn) = 1;
10894 		}
10895 	    }
10896 	  m->fs.sp_offset = UNITS_PER_WORD;
10897 	  m->fs.sp_valid = true;
10898 	}
10899     }
10900   else
10901     {
10902       /* SEH requires that the function end with (1) a stack adjustment
10903 	 if necessary, (2) a sequence of pops, and (3) a return or
10904 	 jump instruction.  Prevent insns from the function body from
10905 	 being scheduled into this sequence.  */
10906       if (TARGET_SEH)
10907 	{
10908 	  /* Prevent a catch region from being adjacent to the standard
10909 	     epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda
10910 	     nor several other flags that would be interesting to test are
10911 	     set up yet.  */
10912 	  if (flag_non_call_exceptions)
10913 	    emit_insn (gen_nops (const1_rtx));
10914 	  else
10915 	    emit_insn (gen_blockage ());
10916 	}
10917 
10918       /* The first step is to deallocate the stack frame so that we can
10919 	 pop the registers.  Also do this on SEH targets for very large
10920 	 frames, as the emitted instructions aren't allowed by the ABI in
10921 	 epilogues.  */
10922       if (!m->fs.sp_valid
10923  	  || (TARGET_SEH
10924 	      && (m->fs.sp_offset - frame.reg_save_offset
10925 		  >= SEH_MAX_FRAME_SIZE)))
10926 	{
10927 	  pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10928 				     GEN_INT (m->fs.fp_offset
10929 					      - frame.reg_save_offset),
10930 				     style, false);
10931 	}
10932       else if (m->fs.sp_offset != frame.reg_save_offset)
10933 	{
10934 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10935 				     GEN_INT (m->fs.sp_offset
10936 					      - frame.reg_save_offset),
10937 				     style,
10938 				     m->fs.cfa_reg == stack_pointer_rtx);
10939 	}
10940 
10941       ix86_emit_restore_regs_using_pop ();
10942     }
10943 
10944   /* If we used a frame pointer and haven't already got rid of it,
10945      then do so now.  */
10946   if (m->fs.fp_valid)
10947     {
10948       /* If the stack pointer is valid and pointing at the frame
10949 	 pointer store address, then we only need a pop.  */
10950       if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10951 	ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10952       /* The "leave" instruction results in shorter dependency chains
10953 	 on CPUs that are able to grok it fast.  */
10954       else if (TARGET_USE_LEAVE
10955 	       || optimize_function_for_size_p (cfun)
10956 	       || !cfun->machine->use_fast_prologue_epilogue)
10957 	ix86_emit_leave ();
10958       else
10959         {
10960 	  pro_epilogue_adjust_stack (stack_pointer_rtx,
10961 				     hard_frame_pointer_rtx,
10962 				     const0_rtx, style, !using_drap);
10963 	  ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10964         }
10965     }
10966 
10967   if (using_drap)
10968     {
10969       int param_ptr_offset = UNITS_PER_WORD;
10970       rtx insn;
10971 
10972       gcc_assert (stack_realign_drap);
10973 
10974       if (ix86_static_chain_on_stack)
10975 	param_ptr_offset += UNITS_PER_WORD;
10976       if (!call_used_regs[REGNO (crtl->drap_reg)])
10977 	param_ptr_offset += UNITS_PER_WORD;
10978 
10979       insn = emit_insn (gen_rtx_SET
10980 			(VOIDmode, stack_pointer_rtx,
10981 			 gen_rtx_PLUS (Pmode,
10982 				       crtl->drap_reg,
10983 				       GEN_INT (-param_ptr_offset))));
10984       m->fs.cfa_reg = stack_pointer_rtx;
10985       m->fs.cfa_offset = param_ptr_offset;
10986       m->fs.sp_offset = param_ptr_offset;
10987       m->fs.realigned = false;
10988 
10989       add_reg_note (insn, REG_CFA_DEF_CFA,
10990 		    gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10991 				  GEN_INT (param_ptr_offset)));
10992       RTX_FRAME_RELATED_P (insn) = 1;
10993 
10994       if (!call_used_regs[REGNO (crtl->drap_reg)])
10995 	ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10996     }
10997 
10998   /* At this point the stack pointer must be valid, and we must have
10999      restored all of the registers.  We may not have deallocated the
11000      entire stack frame.  We've delayed this until now because it may
11001      be possible to merge the local stack deallocation with the
11002      deallocation forced by ix86_static_chain_on_stack.   */
11003   gcc_assert (m->fs.sp_valid);
11004   gcc_assert (!m->fs.fp_valid);
11005   gcc_assert (!m->fs.realigned);
11006   if (m->fs.sp_offset != UNITS_PER_WORD)
11007     {
11008       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11009 				 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11010 				 style, true);
11011     }
11012   else
11013     ix86_add_queued_cfa_restore_notes (get_last_insn ());
11014 
11015   /* Sibcall epilogues don't want a return instruction.  */
11016   if (style == 0)
11017     {
11018       m->fs = frame_state_save;
11019       return;
11020     }
11021 
11022   /* Emit vzeroupper if needed.  */
11023   ix86_maybe_emit_epilogue_vzeroupper ();
11024 
11025   if (crtl->args.pops_args && crtl->args.size)
11026     {
11027       rtx popc = GEN_INT (crtl->args.pops_args);
11028 
11029       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
11030 	 address, do an explicit add, and jump indirectly to the caller.  */
11031 
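      /* A sketch of the sequence emitted for the >= 64K case, with N
	 standing for crtl->args.pops_args:
		popl	%ecx
		addl	$N, %esp
		jmp	*%ecx  */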
11032       if (crtl->args.pops_args >= 65536)
11033 	{
11034 	  rtx ecx = gen_rtx_REG (SImode, CX_REG);
11035 	  rtx insn;
11036 
11037 	  /* There is no "pascal" calling convention in any 64bit ABI.  */
11038 	  gcc_assert (!TARGET_64BIT);
11039 
11040 	  insn = emit_insn (gen_pop (ecx));
11041 	  m->fs.cfa_offset -= UNITS_PER_WORD;
11042 	  m->fs.sp_offset -= UNITS_PER_WORD;
11043 
11044 	  add_reg_note (insn, REG_CFA_ADJUST_CFA,
11045 			copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11046 	  add_reg_note (insn, REG_CFA_REGISTER,
11047 			gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11048 	  RTX_FRAME_RELATED_P (insn) = 1;
11049 
11050 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11051 				     popc, -1, true);
11052 	  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11053 	}
11054       else
11055 	emit_jump_insn (gen_simple_return_pop_internal (popc));
11056     }
11057   else
11058     emit_jump_insn (gen_simple_return_internal ());
11059 
11060   /* Restore the state back to the state from the prologue,
11061      so that it's correct for the next epilogue.  */
11062   m->fs = frame_state_save;
11063 }
11064 
11065 /* Reset from the function's potential modifications.  */
11066 
11067 static void
11068 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11069 			       HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11070 {
11071   if (pic_offset_table_rtx)
11072     SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11073 #if TARGET_MACHO
11074   /* Mach-O doesn't support labels at the end of objects, so if
11075      it looks like we might want one, insert a NOP.  */
11076   {
11077     rtx insn = get_last_insn ();
11078     rtx deleted_debug_label = NULL_RTX;
11079     while (insn
11080 	   && NOTE_P (insn)
11081 	   && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11082       {
11083 	/* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11084 	   notes only, instead set their CODE_LABEL_NUMBER to -1,
11085 	   otherwise there would be code generation differences
11086 	   in between -g and -g0.  */
11087 	if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11088 	  deleted_debug_label = insn;
11089 	insn = PREV_INSN (insn);
11090       }
11091     if (insn
11092 	&& (LABEL_P (insn)
11093 	    || (NOTE_P (insn)
11094 		&& NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11095       fputs ("\tnop\n", file);
11096     else if (deleted_debug_label)
11097       for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11098 	if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11099 	  CODE_LABEL_NUMBER (insn) = -1;
11100   }
11101 #endif
11102 
11103 }
11104 
11105 /* Return a scratch register to use in the split stack prologue.  The
11106    split stack prologue is used for -fsplit-stack.  It is the first
11107    split stack prologue is used for -fsplit-stack.  It consists of the
11108    first instructions in the function, emitted even before the regular prologue.
11109    used for parameters or for the static chain.  */
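/* For example (a sketch of the choices made below): 64-bit targets
   always use %r11; an ordinary 32-bit function with no static chain
   gets %ecx; a fastcall function gets %eax; a thiscall function with
   no static chain gets %edx.  */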
11110 
11111 static unsigned int
11112 split_stack_prologue_scratch_regno (void)
11113 {
11114   if (TARGET_64BIT)
11115     return R11_REG;
11116   else
11117     {
11118       bool is_fastcall, is_thiscall;
11119       int regparm;
11120 
11121       is_fastcall = (lookup_attribute ("fastcall",
11122 				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11123 		     != NULL);
11124       is_thiscall = (lookup_attribute ("thiscall",
11125 				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11126 		     != NULL);
11127       regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11128 
11129       if (is_fastcall)
11130 	{
11131 	  if (DECL_STATIC_CHAIN (cfun->decl))
11132 	    {
11133 	      sorry ("-fsplit-stack does not support fastcall with "
11134 		     "nested function");
11135 	      return INVALID_REGNUM;
11136 	    }
11137 	  return AX_REG;
11138 	}
11139       else if (is_thiscall)
11140         {
11141 	  if (!DECL_STATIC_CHAIN (cfun->decl))
11142 	    return DX_REG;
11143 	  return AX_REG;
11144 	}
11145       else if (regparm < 3)
11146 	{
11147 	  if (!DECL_STATIC_CHAIN (cfun->decl))
11148 	    return CX_REG;
11149 	  else
11150 	    {
11151 	      if (regparm >= 2)
11152 		{
11153 		  sorry ("-fsplit-stack does not support 2 register "
11154 			 "parameters for a nested function");
11155 		  return INVALID_REGNUM;
11156 		}
11157 	      return DX_REG;
11158 	    }
11159 	}
11160       else
11161 	{
11162 	  /* FIXME: We could make this work by pushing a register
11163 	     around the addition and comparison.  */
11164 	  sorry ("-fsplit-stack does not support 3 register parameters");
11165 	  return INVALID_REGNUM;
11166 	}
11167     }
11168 }
11169 
11170 /* A SYMBOL_REF for the function which allocates new stack space for
11171    -fsplit-stack.  */
11172 
11173 static GTY(()) rtx split_stack_fn;
11174 
11175 /* A SYMBOL_REF for the function which allocates new stack space when
11176    using the large code model.  */
11177 
11178 static GTY(()) rtx split_stack_fn_large;
11179 
11180 /* Handle -fsplit-stack.  These are the first instructions in the
11181    function, even before the regular prologue.  */
11182 
11183 void
11184 ix86_expand_split_stack_prologue (void)
11185 {
11186   struct ix86_frame frame;
11187   HOST_WIDE_INT allocate;
11188   unsigned HOST_WIDE_INT args_size;
11189   rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11190   rtx scratch_reg = NULL_RTX;
11191   rtx varargs_label = NULL_RTX;
11192   rtx fn;
11193 
11194   gcc_assert (flag_split_stack && reload_completed);
11195 
11196   ix86_finalize_stack_realign_flags ();
11197   ix86_compute_frame_layout (&frame);
11198   allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11199 
11200   /* This is the label we will branch to if we have enough stack
11201      space.  We expect the basic block reordering pass to reverse this
11202      branch if optimizing, so that we branch in the unlikely case.  */
11203   label = gen_label_rtx ();
11204 
11205   /* We need to compare the stack pointer minus the frame size with
11206      the stack boundary in the TCB.  The stack boundary always gives
11207      us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11208      can compare directly.  Otherwise we need to do an addition.  */
11209 
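  /* Conceptually (a sketch, not the emitted RTL):

	 if (%sp - allocate >= stack limit from the TCB)
	   goto label;
	 else
	   call __morestack;

     The explicit subtraction is skipped when ALLOCATE is below
     SPLIT_STACK_AVAILABLE, since the recorded boundary already
     guarantees at least that much headroom.  */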
11210   limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11211 			  UNSPEC_STACK_CHECK);
11212   limit = gen_rtx_CONST (Pmode, limit);
11213   limit = gen_rtx_MEM (Pmode, limit);
11214   if (allocate < SPLIT_STACK_AVAILABLE)
11215     current = stack_pointer_rtx;
11216   else
11217     {
11218       unsigned int scratch_regno;
11219       rtx offset;
11220 
11221       /* We need a scratch register to hold the stack pointer minus
11222 	 the required frame size.  Since this is the very start of the
11223 	 function, the scratch register can be any caller-saved
11224 	 register which is not used for parameters.  */
11225       offset = GEN_INT (- allocate);
11226       scratch_regno = split_stack_prologue_scratch_regno ();
11227       if (scratch_regno == INVALID_REGNUM)
11228 	return;
11229       scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11230       if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11231 	{
11232 	  /* We don't use ix86_gen_add3 in this case because it will
11233 	     want to split to lea, but when not optimizing the insn
11234 	     will not be split after this point.  */
11235 	  emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11236 				  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11237 						offset)));
11238 	}
11239       else
11240 	{
11241 	  emit_move_insn (scratch_reg, offset);
11242 	  emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11243 				 stack_pointer_rtx));
11244 	}
11245       current = scratch_reg;
11246     }
11247 
11248   ix86_expand_branch (GEU, current, limit, label);
11249   jump_insn = get_last_insn ();
11250   JUMP_LABEL (jump_insn) = label;
11251 
11252   /* Mark the jump as very likely to be taken.  */
11253   add_reg_note (jump_insn, REG_BR_PROB,
11254 		GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11255 
11256   if (split_stack_fn == NULL_RTX)
11257     split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11258   fn = split_stack_fn;
11259 
11260   /* Get more stack space.  We pass in the desired stack space and the
11261      size of the arguments to copy to the new stack.  In 32-bit mode
11262      we push the parameters; __morestack will return on a new stack
11263      anyhow.  In 64-bit mode we pass the parameters in r10 and
11264      r11.  */
11265   allocate_rtx = GEN_INT (allocate);
11266   args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11267   call_fusage = NULL_RTX;
11268   if (TARGET_64BIT)
11269     {
11270       rtx reg10, reg11;
11271 
11272       reg10 = gen_rtx_REG (Pmode, R10_REG);
11273       reg11 = gen_rtx_REG (Pmode, R11_REG);
11274 
11275       /* If this function uses a static chain, it will be in %r10.
11276 	 Preserve it across the call to __morestack.  */
11277       if (DECL_STATIC_CHAIN (cfun->decl))
11278 	{
11279 	  rtx rax;
11280 
11281 	  rax = gen_rtx_REG (Pmode, AX_REG);
11282 	  emit_move_insn (rax, reg10);
11283 	  use_reg (&call_fusage, rax);
11284 	}
11285 
11286       if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11287 	{
11288 	  HOST_WIDE_INT argval;
11289 
11290 	  /* When using the large model we need to load the address
11291 	     into a register, and we've run out of registers.  So we
11292 	     switch to a different calling convention, and we call a
11293 	     different function: __morestack_large_model.  We pass the
11294 	     argument size in the upper 32 bits of r10 and pass the
11295 	     frame size in the lower 32 bits.  */
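	  /* For example (a sketch): args_size == 0x20 and
	     allocate == 0x1000 end up as r10 == 0x0000002000001000.  */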
11296 	  gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11297 	  gcc_assert ((args_size & 0xffffffff) == args_size);
11298 
11299 	  if (split_stack_fn_large == NULL_RTX)
11300 	    split_stack_fn_large =
11301 	      gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11302 
11303 	  if (ix86_cmodel == CM_LARGE_PIC)
11304 	    {
11305 	      rtx label, x;
11306 
11307 	      label = gen_label_rtx ();
11308 	      emit_label (label);
11309 	      LABEL_PRESERVE_P (label) = 1;
11310 	      emit_insn (gen_set_rip_rex64 (reg10, label));
11311 	      emit_insn (gen_set_got_offset_rex64 (reg11, label));
11312 	      emit_insn (gen_adddi3 (reg10, reg10, reg11));
11313 	      x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11314 				  UNSPEC_GOT);
11315 	      x = gen_rtx_CONST (Pmode, x);
11316 	      emit_move_insn (reg11, x);
11317 	      x = gen_rtx_PLUS (Pmode, reg10, reg11);
11318 	      x = gen_const_mem (Pmode, x);
11319 	      emit_move_insn (reg11, x);
11320 	    }
11321 	  else
11322 	    emit_move_insn (reg11, split_stack_fn_large);
11323 
11324 	  fn = reg11;
11325 
11326 	  argval = ((args_size << 16) << 16) + allocate;
11327 	  emit_move_insn (reg10, GEN_INT (argval));
11328 	}
11329       else
11330 	{
11331 	  emit_move_insn (reg10, allocate_rtx);
11332 	  emit_move_insn (reg11, GEN_INT (args_size));
11333 	  use_reg (&call_fusage, reg11);
11334 	}
11335 
11336       use_reg (&call_fusage, reg10);
11337     }
11338   else
11339     {
11340       emit_insn (gen_push (GEN_INT (args_size)));
11341       emit_insn (gen_push (allocate_rtx));
11342     }
11343   call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11344 				GEN_INT (UNITS_PER_WORD), constm1_rtx,
11345 				NULL_RTX, false);
11346   add_function_usage_to (call_insn, call_fusage);
11347 
11348   /* In order to make call/return prediction work right, we now need
11349      to execute a return instruction.  See
11350      libgcc/config/i386/morestack.S for the details on how this works.
11351 
11352      For flow purposes gcc must not see this as a return
11353      instruction--we need control flow to continue at the subsequent
11354      label.  Therefore, we use an unspec.  */
11355   gcc_assert (crtl->args.pops_args < 65536);
11356   emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11357 
11358   /* If we are in 64-bit mode and this function uses a static chain,
11359      we saved %r10 in %rax before calling __morestack.  */
11360   if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11361     emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11362 		    gen_rtx_REG (Pmode, AX_REG));
11363 
11364   /* If this function calls va_start, we need to store a pointer to
11365      the arguments on the old stack, because they may not have been
11366      all copied to the new stack.  At this point the old stack can be
11367      found at the frame pointer value used by __morestack, because
11368      __morestack has set that up before calling back to us.  Here we
11369      store that pointer in a scratch register, and in
11370      ix86_expand_prologue we store the scratch register in a stack
11371      slot.  */
11372   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11373     {
11374       unsigned int scratch_regno;
11375       rtx frame_reg;
11376       int words;
11377 
11378       scratch_regno = split_stack_prologue_scratch_regno ();
11379       scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11380       frame_reg = gen_rtx_REG (Pmode, BP_REG);
11381 
11382       /* 64-bit:
11383 	 fp -> old fp value
11384 	       return address within this function
11385 	       return address of caller of this function
11386 	       stack arguments
11387 	 So we add three words to get to the stack arguments.
11388 
11389 	 32-bit:
11390 	 fp -> old fp value
11391 	       return address within this function
11392                first argument to __morestack
11393                second argument to __morestack
11394                return address of caller of this function
11395                stack arguments
11396          So we add five words to get to the stack arguments.
11397       */
11398       words = TARGET_64BIT ? 3 : 5;
11399       emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11400 			      gen_rtx_PLUS (Pmode, frame_reg,
11401 					    GEN_INT (words * UNITS_PER_WORD))));
11402 
11403       varargs_label = gen_label_rtx ();
11404       emit_jump_insn (gen_jump (varargs_label));
11405       JUMP_LABEL (get_last_insn ()) = varargs_label;
11406 
11407       emit_barrier ();
11408     }
11409 
11410   emit_label (label);
11411   LABEL_NUSES (label) = 1;
11412 
11413   /* If this function calls va_start, we now have to set the scratch
11414      register for the case where we do not call __morestack.  In this
11415      case we need to set it based on the stack pointer.  */
11416   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11417     {
11418       emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11419 			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11420 					    GEN_INT (UNITS_PER_WORD))));
11421 
11422       emit_label (varargs_label);
11423       LABEL_NUSES (varargs_label) = 1;
11424     }
11425 }
11426 
11427 /* We may have to tell the dataflow pass that the split stack prologue
11428    is initializing a scratch register.  */
11429 
11430 static void
11431 ix86_live_on_entry (bitmap regs)
11432 {
11433   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11434     {
11435       gcc_assert (flag_split_stack);
11436       bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11437     }
11438 }
11439 
11440 /* Extract the parts of an RTL expression that is a valid memory address
11441    for an instruction.  Return 0 if the structure of the address is
11442    grossly off.  Return -1 if the address contains ASHIFT, so it is not
11443    strictly valid but is still used for computing the length of a lea instruction.  */
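/* The general form decomposed below is

	base + index*scale + disp

   optionally qualified by a segment override, e.g. the memory operand
   of movl %fs:16(%ebx,%esi,4), %eax (a sketch; the individual parts
   are returned in struct ix86_address).  */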
11444 
11445 int
11446 ix86_decompose_address (rtx addr, struct ix86_address *out)
11447 {
11448   rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11449   rtx base_reg, index_reg;
11450   HOST_WIDE_INT scale = 1;
11451   rtx scale_rtx = NULL_RTX;
11452   rtx tmp;
11453   int retval = 1;
11454   enum ix86_address_seg seg = SEG_DEFAULT;
11455 
11456   /* Allow zero-extended SImode addresses;
11457      they will be emitted with the addr32 prefix.  */
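  /* For instance (a sketch), (zero_extend:DI (reg:SI %eax)) used as a
     memory address ends up as an addr32-prefixed access such as
     "addr32 movl (%eax), %edx".  */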
11458   if (TARGET_64BIT && GET_MODE (addr) == DImode)
11459     {
11460       if (GET_CODE (addr) == ZERO_EXTEND
11461 	  && GET_MODE (XEXP (addr, 0)) == SImode)
11462 	{
11463 	  addr = XEXP (addr, 0);
11464 	  if (CONST_INT_P (addr))
11465 	    return 0;
11466 	}
11467       else if (GET_CODE (addr) == AND
11468 	       && const_32bit_mask (XEXP (addr, 1), DImode))
11469 	{
11470 	  addr = XEXP (addr, 0);
11471 
11472 	  /* Adjust SUBREGs.  */
11473 	  if (GET_CODE (addr) == SUBREG
11474 	      && GET_MODE (SUBREG_REG (addr)) == SImode)
11475 	    {
11476 	      addr = SUBREG_REG (addr);
11477 	      if (CONST_INT_P (addr))
11478 		return 0;
11479 	    }
11480 	  else if (GET_MODE (addr) == DImode)
11481 	    addr = gen_rtx_SUBREG (SImode, addr, 0);
11482 	  else if (GET_MODE (addr) != VOIDmode)
11483 	    return 0;
11484 	}
11485     }
11486 
11487   /* Allow SImode subregs of DImode addresses;
11488      they will be emitted with the addr32 prefix.  */
11489   if (TARGET_64BIT && GET_MODE (addr) == SImode)
11490     {
11491       if (GET_CODE (addr) == SUBREG
11492 	  && GET_MODE (SUBREG_REG (addr)) == DImode)
11493 	{
11494 	  addr = SUBREG_REG (addr);
11495 	  if (CONST_INT_P (addr))
11496 	    return 0;
11497 	}
11498     }
11499 
11500   if (REG_P (addr))
11501     base = addr;
11502   else if (GET_CODE (addr) == SUBREG)
11503     {
11504       if (REG_P (SUBREG_REG (addr)))
11505 	base = addr;
11506       else
11507 	return 0;
11508     }
11509   else if (GET_CODE (addr) == PLUS)
11510     {
11511       rtx addends[4], op;
11512       int n = 0, i;
11513 
11514       op = addr;
11515       do
11516 	{
11517 	  if (n >= 4)
11518 	    return 0;
11519 	  addends[n++] = XEXP (op, 1);
11520 	  op = XEXP (op, 0);
11521 	}
11522       while (GET_CODE (op) == PLUS);
11523       if (n >= 4)
11524 	return 0;
11525       addends[n] = op;
11526 
11527       for (i = n; i >= 0; --i)
11528 	{
11529 	  op = addends[i];
11530 	  switch (GET_CODE (op))
11531 	    {
11532 	    case MULT:
11533 	      if (index)
11534 		return 0;
11535 	      index = XEXP (op, 0);
11536 	      scale_rtx = XEXP (op, 1);
11537 	      break;
11538 
11539 	    case ASHIFT:
11540 	      if (index)
11541 		return 0;
11542 	      index = XEXP (op, 0);
11543 	      tmp = XEXP (op, 1);
11544 	      if (!CONST_INT_P (tmp))
11545 		return 0;
11546 	      scale = INTVAL (tmp);
11547 	      if ((unsigned HOST_WIDE_INT) scale > 3)
11548 		return 0;
11549 	      scale = 1 << scale;
11550 	      break;
11551 
11552 	    case UNSPEC:
11553 	      if (XINT (op, 1) == UNSPEC_TP
11554 	          && TARGET_TLS_DIRECT_SEG_REFS
11555 	          && seg == SEG_DEFAULT)
11556 		seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11557 	      else
11558 		return 0;
11559 	      break;
11560 
11561 	    case SUBREG:
11562 	      if (!REG_P (SUBREG_REG (op)))
11563 		return 0;
11564 	      /* FALLTHRU */
11565 
11566 	    case REG:
11567 	      if (!base)
11568 		base = op;
11569 	      else if (!index)
11570 		index = op;
11571 	      else
11572 		return 0;
11573 	      break;
11574 
11575 	    case CONST:
11576 	    case CONST_INT:
11577 	    case SYMBOL_REF:
11578 	    case LABEL_REF:
11579 	      if (disp)
11580 		return 0;
11581 	      disp = op;
11582 	      break;
11583 
11584 	    default:
11585 	      return 0;
11586 	    }
11587 	}
11588     }
11589   else if (GET_CODE (addr) == MULT)
11590     {
11591       index = XEXP (addr, 0);		/* index*scale */
11592       scale_rtx = XEXP (addr, 1);
11593     }
11594   else if (GET_CODE (addr) == ASHIFT)
11595     {
11596       /* We're called for lea too, which implements ashift on occasion.  */
11597       index = XEXP (addr, 0);
11598       tmp = XEXP (addr, 1);
11599       if (!CONST_INT_P (tmp))
11600 	return 0;
11601       scale = INTVAL (tmp);
11602       if ((unsigned HOST_WIDE_INT) scale > 3)
11603 	return 0;
11604       scale = 1 << scale;
11605       retval = -1;
11606     }
11607   else
11608     disp = addr;			/* displacement */
11609 
11610   if (index)
11611     {
11612       if (REG_P (index))
11613 	;
11614       else if (GET_CODE (index) == SUBREG
11615 	       && REG_P (SUBREG_REG (index)))
11616 	;
11617       else
11618 	return 0;
11619     }
11620 
11621   /* Extract the integral value of scale.  */
11622   if (scale_rtx)
11623     {
11624       if (!CONST_INT_P (scale_rtx))
11625 	return 0;
11626       scale = INTVAL (scale_rtx);
11627     }
11628 
11629   base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11630   index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11631 
11632   /* Avoid useless 0 displacement.  */
11633   if (disp == const0_rtx && (base || index))
11634     disp = NULL_RTX;
11635 
11636   /* Allow the arg pointer and stack pointer as index if there is no scaling.  */
11637   if (base_reg && index_reg && scale == 1
11638       && (index_reg == arg_pointer_rtx
11639 	  || index_reg == frame_pointer_rtx
11640 	  || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11641     {
11642       rtx tmp;
11643       tmp = base, base = index, index = tmp;
11644       tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11645     }
11646 
11647   /* Special case: %ebp cannot be encoded as a base without a displacement.
11648      Similarly %r13.  */
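  /* (The mod == 00 encodings that would name %ebp/%r13 as a base are
     reused for other addressing forms (disp32 / RIP-relative), so
     (%ebp) has to be emitted as 0(%ebp).)  */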
11649   if (!disp
11650       && base_reg
11651       && (base_reg == hard_frame_pointer_rtx
11652 	  || base_reg == frame_pointer_rtx
11653 	  || base_reg == arg_pointer_rtx
11654 	  || (REG_P (base_reg)
11655 	      && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11656 		  || REGNO (base_reg) == R13_REG))))
11657     disp = const0_rtx;
11658 
11659   /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11660      Avoid this by transforming it to [%esi+0].
11661      Reload calls address legitimization without cfun defined, so we need
11662      to test cfun for being non-NULL.  */
11663   if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11664       && base_reg && !index_reg && !disp
11665       && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11666     disp = const0_rtx;
11667 
11668   /* Special case: encode reg+reg instead of reg*2.  */
11669   if (!base && index && scale == 2)
11670     base = index, base_reg = index_reg, scale = 1;
11671 
11672   /* Special case: scaling cannot be encoded without base or displacement.  */
11673   if (!base && !disp && index && scale != 1)
11674     disp = const0_rtx;
11675 
11676   out->base = base;
11677   out->index = index;
11678   out->disp = disp;
11679   out->scale = scale;
11680   out->seg = seg;
11681 
11682   return retval;
11683 }
11684 
11685 /* Return cost of the memory address x.
11686    For i386, it is better to use a complex address than let gcc copy
11687    the address into a reg and make a new pseudo.  But not if the address
11688    requires two regs - that would mean more pseudos with longer
11689    lifetimes.  */
11690 static int
11691 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11692 {
11693   struct ix86_address parts;
11694   int cost = 1;
11695   int ok = ix86_decompose_address (x, &parts);
11696 
11697   gcc_assert (ok);
11698 
11699   if (parts.base && GET_CODE (parts.base) == SUBREG)
11700     parts.base = SUBREG_REG (parts.base);
11701   if (parts.index && GET_CODE (parts.index) == SUBREG)
11702     parts.index = SUBREG_REG (parts.index);
11703 
11704   /* Attempt to minimize number of registers in the address.  */
11705   if ((parts.base
11706        && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11707       || (parts.index
11708 	  && (!REG_P (parts.index)
11709 	      || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11710     cost++;
11711 
11712   if (parts.base
11713       && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11714       && parts.index
11715       && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11716       && parts.base != parts.index)
11717     cost++;
11718 
11719   /* The AMD-K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11720      since its predecode logic can't detect the length of such instructions
11721      and decoding degenerates to vector decoded.  Increase the cost of such
11722      addresses here.  The penalty is minimally 2 cycles.  It may be worthwhile
11723      to split such addresses or even to refuse them entirely.
11724 
11725      The following addressing modes are affected:
11726       [base+scale*index]
11727       [scale*index+disp]
11728       [base+index]
11729 
11730      The first and last cases may be avoidable by explicitly coding the zero in
11731      the memory address, but I don't have an AMD-K6 machine handy to check this
11732      theory.  */
11733 
11734   if (TARGET_K6
11735       && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11736 	  || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11737 	  || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11738     cost += 10;
11739 
11740   return cost;
11741 }
11742 
11743 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11744    this is used to form addresses to local data when -fPIC is in
11745    use.  */
11746 
11747 static bool
11748 darwin_local_data_pic (rtx disp)
11749 {
11750   return (GET_CODE (disp) == UNSPEC
11751 	  && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11752 }
11753 
11754 /* Determine if a given RTX is a valid constant.  We already know this
11755    satisfies CONSTANT_P.  */
11756 
11757 static bool
11758 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11759 {
11760   switch (GET_CODE (x))
11761     {
11762     case CONST:
11763       x = XEXP (x, 0);
11764 
11765       if (GET_CODE (x) == PLUS)
11766 	{
11767 	  if (!CONST_INT_P (XEXP (x, 1)))
11768 	    return false;
11769 	  x = XEXP (x, 0);
11770 	}
11771 
11772       if (TARGET_MACHO && darwin_local_data_pic (x))
11773 	return true;
11774 
11775       /* Only some unspecs are valid as "constants".  */
11776       if (GET_CODE (x) == UNSPEC)
11777 	switch (XINT (x, 1))
11778 	  {
11779 	  case UNSPEC_GOT:
11780 	  case UNSPEC_GOTOFF:
11781 	  case UNSPEC_PLTOFF:
11782 	    return TARGET_64BIT;
11783 	  case UNSPEC_TPOFF:
11784 	  case UNSPEC_NTPOFF:
11785 	    x = XVECEXP (x, 0, 0);
11786 	    return (GET_CODE (x) == SYMBOL_REF
11787 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11788 	  case UNSPEC_DTPOFF:
11789 	    x = XVECEXP (x, 0, 0);
11790 	    return (GET_CODE (x) == SYMBOL_REF
11791 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11792 	  default:
11793 	    return false;
11794 	  }
11795 
11796       /* We must have drilled down to a symbol.  */
11797       if (GET_CODE (x) == LABEL_REF)
11798 	return true;
11799       if (GET_CODE (x) != SYMBOL_REF)
11800 	return false;
11801       /* FALLTHRU */
11802 
11803     case SYMBOL_REF:
11804       /* TLS symbols are never valid.  */
11805       if (SYMBOL_REF_TLS_MODEL (x))
11806 	return false;
11807 
11808       /* DLLIMPORT symbols are never valid.  */
11809       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11810 	  && SYMBOL_REF_DLLIMPORT_P (x))
11811 	return false;
11812 
11813 #if TARGET_MACHO
11814       /* mdynamic-no-pic */
11815       if (MACHO_DYNAMIC_NO_PIC_P)
11816 	return machopic_symbol_defined_p (x);
11817 #endif
11818       break;
11819 
11820     case CONST_DOUBLE:
11821       if (GET_MODE (x) == TImode
11822 	  && x != CONST0_RTX (TImode)
11823           && !TARGET_64BIT)
11824 	return false;
11825       break;
11826 
11827     case CONST_VECTOR:
11828       if (!standard_sse_constant_p (x))
11829 	return false;
11830 
11831     default:
11832       break;
11833     }
11834 
11835   /* Otherwise we handle everything else in the move patterns.  */
11836   return true;
11837 }
11838 
11839 /* Determine if it's legal to put X into the constant pool.  This
11840    is not possible for the address of thread-local symbols, which
11841    is checked above.  */
11842 
11843 static bool
11844 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11845 {
11846   /* We can always put integral constants and vectors in memory.  */
11847   switch (GET_CODE (x))
11848     {
11849     case CONST_INT:
11850     case CONST_DOUBLE:
11851     case CONST_VECTOR:
11852       return false;
11853 
11854     default:
11855       break;
11856     }
11857   return !ix86_legitimate_constant_p (mode, x);
11858 }
11859 
11860 
11861 /* Nonzero if the constant value X is a legitimate general operand
11862    when generating PIC code.  It is given that flag_pic is on and
11863    that X satisfies CONSTANT_P or is a CONST_DOUBLE.  */
11864 
11865 bool
11866 legitimate_pic_operand_p (rtx x)
11867 {
11868   rtx inner;
11869 
11870   switch (GET_CODE (x))
11871     {
11872     case CONST:
11873       inner = XEXP (x, 0);
11874       if (GET_CODE (inner) == PLUS
11875 	  && CONST_INT_P (XEXP (inner, 1)))
11876 	inner = XEXP (inner, 0);
11877 
11878       /* Only some unspecs are valid as "constants".  */
11879       if (GET_CODE (inner) == UNSPEC)
11880 	switch (XINT (inner, 1))
11881 	  {
11882 	  case UNSPEC_GOT:
11883 	  case UNSPEC_GOTOFF:
11884 	  case UNSPEC_PLTOFF:
11885 	    return TARGET_64BIT;
11886 	  case UNSPEC_TPOFF:
11887 	    x = XVECEXP (inner, 0, 0);
11888 	    return (GET_CODE (x) == SYMBOL_REF
11889 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11890 	  case UNSPEC_MACHOPIC_OFFSET:
11891 	    return legitimate_pic_address_disp_p (x);
11892 	  default:
11893 	    return false;
11894 	  }
11895       /* FALLTHRU */
11896 
11897     case SYMBOL_REF:
11898     case LABEL_REF:
11899       return legitimate_pic_address_disp_p (x);
11900 
11901     default:
11902       return true;
11903     }
11904 }
11905 
11906 /* Determine if a given CONST RTX is a valid memory displacement
11907    in PIC mode.  */
11908 
11909 bool
11910 legitimate_pic_address_disp_p (rtx disp)
11911 {
11912   bool saw_plus;
11913 
11914   /* In 64bit mode we can allow direct addresses of symbols and labels
11915      when they are not dynamic symbols.  */
11916   if (TARGET_64BIT)
11917     {
11918       rtx op0 = disp, op1;
11919 
11920       switch (GET_CODE (disp))
11921 	{
11922 	case LABEL_REF:
11923 	  return true;
11924 
11925 	case CONST:
11926 	  if (GET_CODE (XEXP (disp, 0)) != PLUS)
11927 	    break;
11928 	  op0 = XEXP (XEXP (disp, 0), 0);
11929 	  op1 = XEXP (XEXP (disp, 0), 1);
11930 	  if (!CONST_INT_P (op1)
11931 	      || INTVAL (op1) >= 16*1024*1024
11932 	      || INTVAL (op1) < -16*1024*1024)
11933             break;
11934 	  if (GET_CODE (op0) == LABEL_REF)
11935 	    return true;
11936 	  if (GET_CODE (op0) == CONST
11937 	      && GET_CODE (XEXP (op0, 0)) == UNSPEC
11938 	      && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11939 	    return true;
11940 	  if (GET_CODE (op0) == UNSPEC
11941 	      && XINT (op0, 1) == UNSPEC_PCREL)
11942 	    return true;
11943 	  if (GET_CODE (op0) != SYMBOL_REF)
11944 	    break;
11945 	  /* FALLTHRU */
11946 
11947 	case SYMBOL_REF:
11948 	  /* TLS references should always be enclosed in UNSPEC.  */
11949 	  if (SYMBOL_REF_TLS_MODEL (op0))
11950 	    return false;
11951 	  if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11952 	      && ix86_cmodel != CM_LARGE_PIC)
11953 	    return true;
11954 	  break;
11955 
11956 	default:
11957 	  break;
11958 	}
11959     }
11960   if (GET_CODE (disp) != CONST)
11961     return false;
11962   disp = XEXP (disp, 0);
11963 
11964   if (TARGET_64BIT)
11965     {
11966       /* It is unsafe to allow PLUS expressions here.  This limits the allowed
11967          distance of GOT tables.  We should not need these anyway.  */
11968       if (GET_CODE (disp) != UNSPEC
11969 	  || (XINT (disp, 1) != UNSPEC_GOTPCREL
11970 	      && XINT (disp, 1) != UNSPEC_GOTOFF
11971 	      && XINT (disp, 1) != UNSPEC_PCREL
11972 	      && XINT (disp, 1) != UNSPEC_PLTOFF))
11973 	return false;
11974 
11975       if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11976 	  && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11977 	return false;
11978       return true;
11979     }
11980 
11981   saw_plus = false;
11982   if (GET_CODE (disp) == PLUS)
11983     {
11984       if (!CONST_INT_P (XEXP (disp, 1)))
11985 	return false;
11986       disp = XEXP (disp, 0);
11987       saw_plus = true;
11988     }
11989 
11990   if (TARGET_MACHO && darwin_local_data_pic (disp))
11991     return true;
11992 
11993   if (GET_CODE (disp) != UNSPEC)
11994     return false;
11995 
11996   switch (XINT (disp, 1))
11997     {
11998     case UNSPEC_GOT:
11999       if (saw_plus)
12000 	return false;
12001       /* We need to check for both symbols and labels because VxWorks loads
12002 	 text labels with @GOT rather than @GOTOFF.  See gotoff_operand for
12003 	 details.  */
12004       return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12005 	      || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12006     case UNSPEC_GOTOFF:
12007       /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12008 	 While the ABI also specifies a 32bit relocation, we don't produce it in
12009 	 the small PIC model at all.  */
12010       if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12011 	   || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12012 	  && !TARGET_64BIT)
12013         return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12014       return false;
12015     case UNSPEC_GOTTPOFF:
12016     case UNSPEC_GOTNTPOFF:
12017     case UNSPEC_INDNTPOFF:
12018       if (saw_plus)
12019 	return false;
12020       disp = XVECEXP (disp, 0, 0);
12021       return (GET_CODE (disp) == SYMBOL_REF
12022 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12023     case UNSPEC_NTPOFF:
12024       disp = XVECEXP (disp, 0, 0);
12025       return (GET_CODE (disp) == SYMBOL_REF
12026 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12027     case UNSPEC_DTPOFF:
12028       disp = XVECEXP (disp, 0, 0);
12029       return (GET_CODE (disp) == SYMBOL_REF
12030 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12031     }
12032 
12033   return false;
12034 }
12035 
12036 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS.  Returns true if
12037    X was (partially) reloaded, in which case the calling macro should
12038    goto WIN, and false if no replacement is called for and X should
12039    be used unchanged.  */
12040 
12041 bool
12042 ix86_legitimize_reload_address (rtx x,
12043 				enum machine_mode mode ATTRIBUTE_UNUSED,
12044 				int opnum, int type,
12045 				int ind_levels ATTRIBUTE_UNUSED)
12046 {
12047   /* Reload can generate:
12048 
12049      (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12050 		       (reg:DI 97))
12051 	      (reg:DI 2 cx))
12052 
12053      This RTX is rejected from ix86_legitimate_address_p due to
12054      non-strictness of base register 97.  Following this rejection,
12055      reload pushes all three components into separate registers,
12056      creating invalid memory address RTX.
12057 
12058      Following code reloads only the invalid part of the
12059      memory address RTX.  */
12060 
12061   if (GET_CODE (x) == PLUS
12062       && REG_P (XEXP (x, 1))
12063       && GET_CODE (XEXP (x, 0)) == PLUS
12064       && REG_P (XEXP (XEXP (x, 0), 1)))
12065     {
12066       rtx base, index;
12067       bool something_reloaded = false;
12068 
12069       base = XEXP (XEXP (x, 0), 1);
12070       if (!REG_OK_FOR_BASE_STRICT_P (base))
12071 	{
12072 	  push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12073 		       BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12074 		       opnum, (enum reload_type)type);
12075 	  something_reloaded = true;
12076 	}
12077 
12078       index = XEXP (x, 1);
12079       if (!REG_OK_FOR_INDEX_STRICT_P (index))
12080 	{
12081 	  push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12082 		       INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12083 		       opnum, (enum reload_type)type);
12084 	  something_reloaded = true;
12085 	}
12086 
12087       gcc_assert (something_reloaded);
12088       return true;
12089     }
12090 
12091   return false;
12092 }
12093 
12094 /* Determine if op is a suitable RTX for an address register.
12095    Return naked register if a register or a register subreg is
12096    found, otherwise return NULL_RTX.  */
12097 
12098 static rtx
12099 ix86_validate_address_register (rtx op)
12100 {
12101   enum machine_mode mode = GET_MODE (op);
12102 
12103   /* Only SImode or DImode registers can form the address.  */
12104   if (mode != SImode && mode != DImode)
12105     return NULL_RTX;
12106 
12107   if (REG_P (op))
12108     return op;
12109   else if (GET_CODE (op) == SUBREG)
12110     {
12111       rtx reg = SUBREG_REG (op);
12112 
12113       if (!REG_P (reg))
12114 	return NULL_RTX;
12115 
12116       mode = GET_MODE (reg);
12117 
12118       /* Don't allow SUBREGs that span more than a word.  It can
12119 	 lead to spill failures when the register is one word out
12120 	 of a two word structure.  */
12121       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
12122 	return NULL_RTX;
12123 
12124       /* Allow only SUBREGs of non-eliminable hard registers.  */
12125       if (register_no_elim_operand (reg, mode))
12126 	return reg;
12127     }
12128 
12129   /* Op is not a register.  */
12130   return NULL_RTX;
12131 }
12132 
12133 /* Recognizes RTL expressions that are valid memory addresses for an
12134    instruction.  The MODE argument is the machine mode for the MEM
12135    expression that wants to use this address.
12136 
12137    It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
12138    convert common non-canonical forms to canonical form so that they will
12139    be recognized.  */
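/* For example (a sketch): an address like
      (plus:SI (reg:SI %ebx) (mult:SI (reg:SI %esi) (const_int 4)))
   is recognized here, whereas a bare (ashift:SI (reg:SI %esi)
   (const_int 2)) only gets a -1 from ix86_decompose_address (for lea
   length computation) and is therefore rejected as an address.  */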
12140 
12141 static bool
12142 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12143 		           rtx addr, bool strict)
12144 {
12145   struct ix86_address parts;
12146   rtx base, index, disp;
12147   HOST_WIDE_INT scale;
12148   enum ix86_address_seg seg;
12149 
12150   if (ix86_decompose_address (addr, &parts) <= 0)
12151     /* Decomposition failed.  */
12152     return false;
12153 
12154   base = parts.base;
12155   index = parts.index;
12156   disp = parts.disp;
12157   scale = parts.scale;
12158   seg = parts.seg;
12159 
12160   /* Validate base register.  */
12161   if (base)
12162     {
12163       rtx reg = ix86_validate_address_register (base);
12164 
12165       if (reg == NULL_RTX)
12166 	return false;
12167 
12168       if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12169 	  || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12170 	/* Base is not valid.  */
12171 	return false;
12172     }
12173 
12174   /* Validate index register.  */
12175   if (index)
12176     {
12177       rtx reg = ix86_validate_address_register (index);
12178 
12179       if (reg == NULL_RTX)
12180 	return false;
12181 
12182       if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12183 	  || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12184 	/* Index is not valid.  */
12185 	return false;
12186     }
12187 
12188   /* Index and base should have the same mode.  */
12189   if (base && index
12190       && GET_MODE (base) != GET_MODE (index))
12191     return false;
12192 
12193   /* Address override works only on the (%reg) part of %fs:(%reg).  */
12194   if (seg != SEG_DEFAULT
12195       && ((base && GET_MODE (base) != word_mode)
12196 	  || (index && GET_MODE (index) != word_mode)))
12197     return false;
12198 
12199   /* Validate scale factor.  */
12200   if (scale != 1)
12201     {
12202       if (!index)
12203 	/* Scale without index.  */
12204 	return false;
12205 
12206       if (scale != 2 && scale != 4 && scale != 8)
12207 	/* Scale is not a valid multiplier.  */
12208 	return false;
12209     }
12210 
12211   /* Validate displacement.  */
12212   if (disp)
12213     {
12214       if (GET_CODE (disp) == CONST
12215 	  && GET_CODE (XEXP (disp, 0)) == UNSPEC
12216 	  && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12217 	switch (XINT (XEXP (disp, 0), 1))
12218 	  {
12219 	  /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12220 	     used.  While the ABI also specifies 32bit relocations, we don't produce
12221 	     them at all and use IP-relative addressing instead.  */
12222 	  case UNSPEC_GOT:
12223 	  case UNSPEC_GOTOFF:
12224 	    gcc_assert (flag_pic);
12225 	    if (!TARGET_64BIT)
12226 	      goto is_legitimate_pic;
12227 
12228 	    /* 64bit address unspec.  */
12229 	    return false;
12230 
12231 	  case UNSPEC_GOTPCREL:
12232 	  case UNSPEC_PCREL:
12233 	    gcc_assert (flag_pic);
12234 	    goto is_legitimate_pic;
12235 
12236 	  case UNSPEC_GOTTPOFF:
12237 	  case UNSPEC_GOTNTPOFF:
12238 	  case UNSPEC_INDNTPOFF:
12239 	  case UNSPEC_NTPOFF:
12240 	  case UNSPEC_DTPOFF:
12241 	    break;
12242 
12243 	  case UNSPEC_STACK_CHECK:
12244 	    gcc_assert (flag_split_stack);
12245 	    break;
12246 
12247 	  default:
12248 	    /* Invalid address unspec.  */
12249 	    return false;
12250 	  }
12251 
12252       else if (SYMBOLIC_CONST (disp)
12253 	       && (flag_pic
12254 		   || (TARGET_MACHO
12255 #if TARGET_MACHO
12256 		       && MACHOPIC_INDIRECT
12257 		       && !machopic_operand_p (disp)
12258 #endif
12259 	       )))
12260 	{
12261 
12262 	is_legitimate_pic:
12263 	  if (TARGET_64BIT && (index || base))
12264 	    {
12265 	      /* foo@dtpoff(%rX) is ok.  */
12266 	      if (GET_CODE (disp) != CONST
12267 		  || GET_CODE (XEXP (disp, 0)) != PLUS
12268 		  || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12269 		  || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12270 		  || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12271 		      && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12272 		/* Non-constant pic memory reference.  */
12273 		return false;
12274 	    }
12275 	  else if ((!TARGET_MACHO || flag_pic)
12276 		    && ! legitimate_pic_address_disp_p (disp))
12277 	    /* Displacement is an invalid pic construct.  */
12278 	    return false;
12279 #if TARGET_MACHO
12280 	  else if (MACHO_DYNAMIC_NO_PIC_P
12281 		   && !ix86_legitimate_constant_p (Pmode, disp))
12282 	    /* Displacement must be referenced via non_lazy_pointer.  */
12283 	    return false;
12284 #endif
12285 
12286           /* This code used to verify that a symbolic pic displacement
12287 	     includes the pic_offset_table_rtx register.
12288 
12289 	     While this is a good idea, unfortunately these constructs may
12290 	     be created by the "adds using lea" optimization for incorrect
12291 	     code like:
12292 
12293 	     int a;
12294 	     int foo(int i)
12295 	       {
12296 	         return *(&a+i);
12297 	       }
12298 
12299 	     This code is nonsensical, but results in addressing the
12300 	     GOT table with pic_offset_table_rtx as the base.  We can't
12301 	     just refuse it easily, since it gets matched by the
12302 	     "addsi3" pattern, which is later split to lea when the
12303 	     output register differs from the input.  While this
12304 	     could be handled by a separate addsi pattern for this case
12305 	     that never results in lea, disabling this test seems to be
12306 	     the easier and correct fix for the crash.  */
12307 	}
12308       else if (GET_CODE (disp) != LABEL_REF
12309 	       && !CONST_INT_P (disp)
12310 	       && (GET_CODE (disp) != CONST
12311 		   || !ix86_legitimate_constant_p (Pmode, disp))
12312 	       && (GET_CODE (disp) != SYMBOL_REF
12313 		   || !ix86_legitimate_constant_p (Pmode, disp)))
12314 	/* Displacement is not constant.  */
12315 	return false;
12316       else if (TARGET_64BIT
12317 	       && !x86_64_immediate_operand (disp, VOIDmode))
12318 	/* Displacement is out of range.  */
12319 	return false;
12320       /* In x32 mode, constant addresses are sign extended to 64bit, so
12321 	 we have to prevent addresses from 0x80000000 to 0xffffffff.  */
12322       else if (TARGET_X32 && !(index || base)
12323 	       && CONST_INT_P (disp)
12324 	       && val_signbit_known_set_p (SImode, INTVAL (disp)))
12325 	return false;
12326     }
12327 
12328   /* Everything looks valid.  */
12329   return true;
12330 }
12331 
12332 /* Determine if a given RTX is a valid constant address.  */
12333 
12334 bool
12335 constant_address_p (rtx x)
12336 {
12337   return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12338 }
12339 
12340 /* Return a unique alias set for the GOT.  */
12341 
12342 static alias_set_type
12343 ix86_GOT_alias_set (void)
12344 {
12345   static alias_set_type set = -1;
12346   if (set == -1)
12347     set = new_alias_set ();
12348   return set;
12349 }
12350 
12351 /* Return a legitimate reference for ORIG (an address) using the
12352    register REG.  If REG is 0, a new pseudo is generated.
12353 
12354    There are two types of references that must be handled:
12355 
12356    1. Global data references must load the address from the GOT, via
12357       the PIC reg.  An insn is emitted to do this load, and the reg is
12358       returned.
12359 
12360    2. Static data references, constant pool addresses, and code labels
12361       compute the address as an offset from the GOT, whose base is in
12362       the PIC reg.  Static data objects have SYMBOL_FLAG_LOCAL set to
12363       differentiate them from global data objects.  The returned
12364       address is the PIC reg + an unspec constant.
12365 
12366    TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12367    reg also appears in the address.  */
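/* For instance (a sketch of the usual 32-bit ELF sequences): a global
   symbol is loaded through the GOT,
	movl	foo@GOT(%ebx), %reg
   while a local symbol is addressed as an offset from the GOT base,
	leal	bar@GOTOFF(%ebx), %reg
   The exact sequences depend on the target and code model.  */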
12368 
12369 static rtx
12370 legitimize_pic_address (rtx orig, rtx reg)
12371 {
12372   rtx addr = orig;
12373   rtx new_rtx = orig;
12374 
12375 #if TARGET_MACHO
12376   if (TARGET_MACHO && !TARGET_64BIT)
12377     {
12378       if (reg == 0)
12379 	reg = gen_reg_rtx (Pmode);
12380       /* Use the generic Mach-O PIC machinery.  */
12381       return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12382     }
12383 #endif
12384 
12385   if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12386     new_rtx = addr;
12387   else if (TARGET_64BIT
12388 	   && ix86_cmodel != CM_SMALL_PIC
12389 	   && gotoff_operand (addr, Pmode))
12390     {
12391       rtx tmpreg;
12392       /* This symbol may be referenced via a displacement from the PIC
12393 	 base address (@GOTOFF).  */
12394 
12395       if (reload_in_progress)
12396 	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12397       if (GET_CODE (addr) == CONST)
12398 	addr = XEXP (addr, 0);
12399       if (GET_CODE (addr) == PLUS)
12400 	  {
12401             new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12402 				      UNSPEC_GOTOFF);
12403 	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12404 	  }
12405 	else
12406           new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12407       new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12408       if (!reg)
12409         tmpreg = gen_reg_rtx (Pmode);
12410       else
12411 	tmpreg = reg;
12412       emit_move_insn (tmpreg, new_rtx);
12413 
12414       if (reg != 0)
12415 	{
12416 	  new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12417 					 tmpreg, 1, OPTAB_DIRECT);
12418 	  new_rtx = reg;
12419 	}
12420       else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12421     }
12422   else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12423     {
12424       /* This symbol may be referenced via a displacement from the PIC
12425 	 base address (@GOTOFF).  */
12426 
12427       if (reload_in_progress)
12428 	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12429       if (GET_CODE (addr) == CONST)
12430 	addr = XEXP (addr, 0);
12431       if (GET_CODE (addr) == PLUS)
12432 	  {
12433             new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12434 				      UNSPEC_GOTOFF);
12435 	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12436 	  }
12437 	else
12438           new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12439       new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12440       new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12441 
12442       if (reg != 0)
12443 	{
12444 	  emit_move_insn (reg, new_rtx);
12445 	  new_rtx = reg;
12446 	}
12447     }
12448   else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12449 	   /* We can't use @GOTOFF for text labels on VxWorks;
12450 	      see gotoff_operand.  */
12451 	   || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12452     {
12453       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12454         {
12455           if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12456             return legitimize_dllimport_symbol (addr, true);
12457           if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12458               && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12459               && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12460             {
12461               rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12462               return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12463             }
12464         }
12465 
12466       /* For x64 PE-COFF there is no GOT table, so we use the address
12467          directly.  */
12468       if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12469       {
12470 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12471 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12472 
12473 	  if (reg == 0)
12474 	    reg = gen_reg_rtx (Pmode);
12475   	  emit_move_insn (reg, new_rtx);
12476 	  new_rtx = reg;
12477       }
12478       else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12479 	{
12480 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12481 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12482 	  new_rtx = gen_const_mem (Pmode, new_rtx);
12483 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12484 
12485 	  if (reg == 0)
12486 	    reg = gen_reg_rtx (Pmode);
12487 	  /* Use gen_movsi directly, otherwise the address is loaded
12488 	     into a register for CSE.  We don't want to CSE these addresses;
12489 	     instead we CSE addresses from the GOT table, so skip this.  */
12490 	  emit_insn (gen_movsi (reg, new_rtx));
12491 	  new_rtx = reg;
12492 	}
12493       else
12494 	{
12495 	  /* This symbol must be referenced via a load from the
12496 	     Global Offset Table (@GOT).  */
12497 
12498 	  if (reload_in_progress)
12499 	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12500 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12501 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12502 	  if (TARGET_64BIT)
12503 	    new_rtx = force_reg (Pmode, new_rtx);
12504 	  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12505 	  new_rtx = gen_const_mem (Pmode, new_rtx);
12506 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12507 
12508 	  if (reg == 0)
12509 	    reg = gen_reg_rtx (Pmode);
12510 	  emit_move_insn (reg, new_rtx);
12511 	  new_rtx = reg;
12512 	}
12513     }
12514   else
12515     {
12516       if (CONST_INT_P (addr)
12517 	  && !x86_64_immediate_operand (addr, VOIDmode))
12518 	{
12519 	  if (reg)
12520 	    {
12521 	      emit_move_insn (reg, addr);
12522 	      new_rtx = reg;
12523 	    }
12524 	  else
12525 	    new_rtx = force_reg (Pmode, addr);
12526 	}
12527       else if (GET_CODE (addr) == CONST)
12528 	{
12529 	  addr = XEXP (addr, 0);
12530 
12531 	  /* We must match stuff we generate before.  Assume the only
12532 	     unspecs that can get here are ours.  Not that we could do
12533 	     anything with them anyway....  */
12534 	  if (GET_CODE (addr) == UNSPEC
12535 	      || (GET_CODE (addr) == PLUS
12536 		  && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12537 	    return orig;
12538 	  gcc_assert (GET_CODE (addr) == PLUS);
12539 	}
12540       if (GET_CODE (addr) == PLUS)
12541 	{
12542 	  rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12543 
12544 	  /* Check first to see if this is a constant offset from a @GOTOFF
12545 	     symbol reference.  */
12546 	  if (gotoff_operand (op0, Pmode)
12547 	      && CONST_INT_P (op1))
12548 	    {
12549 	      if (!TARGET_64BIT)
12550 		{
12551 		  if (reload_in_progress)
12552 		    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12553 		  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12554 					    UNSPEC_GOTOFF);
12555 		  new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12556 		  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12557 		  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12558 
12559 		  if (reg != 0)
12560 		    {
12561 		      emit_move_insn (reg, new_rtx);
12562 		      new_rtx = reg;
12563 		    }
12564 		}
12565 	      else
12566 		{
12567 		  if (INTVAL (op1) < -16*1024*1024
12568 		      || INTVAL (op1) >= 16*1024*1024)
12569 		    {
12570 		      if (!x86_64_immediate_operand (op1, Pmode))
12571 			op1 = force_reg (Pmode, op1);
12572 		      new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12573 		    }
12574 		}
12575 	    }
12576 	  else
12577 	    {
12578 	      rtx base = legitimize_pic_address (op0, reg);
12579 	      enum machine_mode mode = GET_MODE (base);
12580 	      new_rtx
12581 	        = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12582 
12583 	      if (CONST_INT_P (new_rtx))
12584 		{
12585 		  if (INTVAL (new_rtx) < -16*1024*1024
12586 		      || INTVAL (new_rtx) >= 16*1024*1024)
12587 		    {
12588 		      if (!x86_64_immediate_operand (new_rtx, mode))
12589 			new_rtx = force_reg (mode, new_rtx);
12590 		      new_rtx
12591 		        = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12592 		    }
12593 		  else
12594 		    new_rtx = plus_constant (base, INTVAL (new_rtx));
12595 		}
12596 	      else
12597 		{
12598 		  if (GET_CODE (new_rtx) == PLUS
12599 		      && CONSTANT_P (XEXP (new_rtx, 1)))
12600 		    {
12601 		      base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12602 		      new_rtx = XEXP (new_rtx, 1);
12603 		    }
12604 		  new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12605 		}
12606 	    }
12607 	}
12608     }
12609   return new_rtx;
12610 }
12611 
12612 /* Load the thread pointer.  If TO_REG is true, force it into a register.  */
12613 
12614 static rtx
12615 get_thread_pointer (bool to_reg)
12616 {
12617   rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12618 
12619   if (GET_MODE (tp) != Pmode)
12620     tp = convert_to_mode (Pmode, tp, 1);
12621 
12622   if (to_reg)
12623     tp = copy_addr_to_reg (tp);
12624 
12625   return tp;
12626 }
12627 
12628 /* Construct the SYMBOL_REF for the tls_get_addr function.  */
12629 
12630 static GTY(()) rtx ix86_tls_symbol;
12631 
12632 static rtx
12633 ix86_tls_get_addr (void)
12634 {
12635   if (!ix86_tls_symbol)
12636     {
12637       const char *sym
12638 	= ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12639 	   ? "___tls_get_addr" : "__tls_get_addr");
12640 
12641       ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12642     }
12643 
12644   return ix86_tls_symbol;
12645 }
12646 
12647 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol.  */
12648 
12649 static GTY(()) rtx ix86_tls_module_base_symbol;
12650 
12651 rtx
12652 ix86_tls_module_base (void)
12653 {
12654   if (!ix86_tls_module_base_symbol)
12655     {
12656       ix86_tls_module_base_symbol
12657 	= gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12658 
12659       SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12660 	|= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12661     }
12662 
12663   return ix86_tls_module_base_symbol;
12664 }
12665 
12666 /* A subroutine of ix86_legitimize_address and ix86_expand_move.  FOR_MOV is
12667    false if we expect this to be used for a memory address and true if
12668    we expect to load the address into a register.  */
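
/* As a rough, illustrative sketch only (the authoritative expansions are
   the tls_* patterns in i386.md and the code below), the models boil
   down to:
     global dynamic:  call __tls_get_addr (or use a GNU2 TLS descriptor)
		      for the symbol itself;
     local dynamic:   call __tls_get_addr once for the module base and
		      add the symbol's @dtpoff offset;
     initial exec:    load the @gottpoff/@gotntpoff slot from the GOT and
		      combine it with the thread pointer;
     local exec:      add a link-time constant @tpoff/@ntpoff offset to
		      the thread pointer.  */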
12669 
12670 static rtx
12671 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12672 {
12673   rtx dest, base, off;
12674   rtx pic = NULL_RTX, tp = NULL_RTX;
12675   int type;
12676 
12677   switch (model)
12678     {
12679     case TLS_MODEL_GLOBAL_DYNAMIC:
12680       dest = gen_reg_rtx (Pmode);
12681 
12682       if (!TARGET_64BIT)
12683 	{
12684 	  if (flag_pic)
12685 	    pic = pic_offset_table_rtx;
12686 	  else
12687 	    {
12688 	      pic = gen_reg_rtx (Pmode);
12689 	      emit_insn (gen_set_got (pic));
12690 	    }
12691 	}
12692 
12693       if (TARGET_GNU2_TLS)
12694 	{
12695 	  if (TARGET_64BIT)
12696 	    emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12697 	  else
12698 	    emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12699 
12700 	  tp = get_thread_pointer (true);
12701 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12702 
12703 	  if (GET_MODE (x) != Pmode)
12704 	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
12705 
12706 	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12707 	}
12708       else
12709 	{
12710 	  rtx caddr = ix86_tls_get_addr ();
12711 
12712 	  if (TARGET_64BIT)
12713 	    {
12714 	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
12715 	      rtx insns;
12716 
12717 	      start_sequence ();
12718 	      emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12719 	      insns = get_insns ();
12720 	      end_sequence ();
12721 
12722 	      if (GET_MODE (x) != Pmode)
12723 		x = gen_rtx_ZERO_EXTEND (Pmode, x);
12724 
12725 	      RTL_CONST_CALL_P (insns) = 1;
12726 	      emit_libcall_block (insns, dest, rax, x);
12727 	    }
12728 	  else
12729 	    emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12730 	}
12731       break;
12732 
12733     case TLS_MODEL_LOCAL_DYNAMIC:
12734       base = gen_reg_rtx (Pmode);
12735 
12736       if (!TARGET_64BIT)
12737 	{
12738 	  if (flag_pic)
12739 	    pic = pic_offset_table_rtx;
12740 	  else
12741 	    {
12742 	      pic = gen_reg_rtx (Pmode);
12743 	      emit_insn (gen_set_got (pic));
12744 	    }
12745 	}
12746 
12747       if (TARGET_GNU2_TLS)
12748 	{
12749 	  rtx tmp = ix86_tls_module_base ();
12750 
12751 	  if (TARGET_64BIT)
12752 	    emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12753 	  else
12754 	    emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12755 
12756 	  tp = get_thread_pointer (true);
12757 	  set_unique_reg_note (get_last_insn (), REG_EQUAL,
12758 			       gen_rtx_MINUS (Pmode, tmp, tp));
12759 	}
12760       else
12761 	{
12762 	  rtx caddr = ix86_tls_get_addr ();
12763 
12764 	  if (TARGET_64BIT)
12765 	    {
12766 	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
12767 	      rtx insns, eqv;
12768 
12769 	      start_sequence ();
12770 	      emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12771 	      insns = get_insns ();
12772 	      end_sequence ();
12773 
12774 	      /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12775 		 share the LD_BASE result with other LD model accesses.  */
12776 	      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12777 				    UNSPEC_TLS_LD_BASE);
12778 
12779 	      RTL_CONST_CALL_P (insns) = 1;
12780 	      emit_libcall_block (insns, base, rax, eqv);
12781 	    }
12782 	  else
12783 	    emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12784 	}
12785 
12786       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12787       off = gen_rtx_CONST (Pmode, off);
12788 
12789       dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12790 
12791       if (TARGET_GNU2_TLS)
12792 	{
12793 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12794 
12795 	  if (GET_MODE (x) != Pmode)
12796 	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
12797 
12798 	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12799 	}
12800       break;
12801 
12802     case TLS_MODEL_INITIAL_EXEC:
12803       if (TARGET_64BIT)
12804 	{
12805 	  if (TARGET_SUN_TLS)
12806 	    {
12807 	      /* The Sun linker took the AMD64 TLS spec literally
12808 		 and can only handle %rax as the destination of the
12809 		 initial executable code sequence.  */
12810 
12811 	      dest = gen_reg_rtx (Pmode);
12812 	      emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12813 	      return dest;
12814 	    }
12815 
12816 	  pic = NULL;
12817 	  type = UNSPEC_GOTNTPOFF;
12818 	}
12819       else if (flag_pic)
12820 	{
12821 	  if (reload_in_progress)
12822 	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12823 	  pic = pic_offset_table_rtx;
12824 	  type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12825 	}
12826       else if (!TARGET_ANY_GNU_TLS)
12827 	{
12828 	  pic = gen_reg_rtx (Pmode);
12829 	  emit_insn (gen_set_got (pic));
12830 	  type = UNSPEC_GOTTPOFF;
12831 	}
12832       else
12833 	{
12834 	  pic = NULL;
12835 	  type = UNSPEC_INDNTPOFF;
12836 	}
12837 
12838       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12839       off = gen_rtx_CONST (Pmode, off);
12840       if (pic)
12841 	off = gen_rtx_PLUS (Pmode, pic, off);
12842       off = gen_const_mem (Pmode, off);
12843       set_mem_alias_set (off, ix86_GOT_alias_set ());
12844 
12845       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12846 	{
12847           base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12848 	  off = force_reg (Pmode, off);
12849 	  return gen_rtx_PLUS (Pmode, base, off);
12850 	}
12851       else
12852 	{
12853 	  base = get_thread_pointer (true);
12854 	  dest = gen_reg_rtx (Pmode);
12855 	  emit_insn (gen_subsi3 (dest, base, off));
12856 	}
12857       break;
12858 
12859     case TLS_MODEL_LOCAL_EXEC:
12860       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12861 			    (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12862 			    ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12863       off = gen_rtx_CONST (Pmode, off);
12864 
12865       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12866 	{
12867 	  base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12868 	  return gen_rtx_PLUS (Pmode, base, off);
12869 	}
12870       else
12871 	{
12872 	  base = get_thread_pointer (true);
12873 	  dest = gen_reg_rtx (Pmode);
12874 	  emit_insn (gen_subsi3 (dest, base, off));
12875 	}
12876       break;
12877 
12878     default:
12879       gcc_unreachable ();
12880     }
12881 
12882   return dest;
12883 }
12884 
12885 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12886    to symbol DECL.  */
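
/* Roughly, for illustration: a dllimported decl `foo' is mapped to an
   artificial read-only VAR_DECL whose DECL_RTL is a memory reference
   through the import-table symbol `*__imp__foo' (or `*__imp_foo' for
   fastcall names or targets without a user label prefix).  */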
12887 
12888 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12889   htab_t dllimport_map;
12890 
12891 static tree
12892 get_dllimport_decl (tree decl)
12893 {
12894   struct tree_map *h, in;
12895   void **loc;
12896   const char *name;
12897   const char *prefix;
12898   size_t namelen, prefixlen;
12899   char *imp_name;
12900   tree to;
12901   rtx rtl;
12902 
12903   if (!dllimport_map)
12904     dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12905 
12906   in.hash = htab_hash_pointer (decl);
12907   in.base.from = decl;
12908   loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12909   h = (struct tree_map *) *loc;
12910   if (h)
12911     return h->to;
12912 
12913   *loc = h = ggc_alloc_tree_map ();
12914   h->hash = in.hash;
12915   h->base.from = decl;
12916   h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12917 			   VAR_DECL, NULL, ptr_type_node);
12918   DECL_ARTIFICIAL (to) = 1;
12919   DECL_IGNORED_P (to) = 1;
12920   DECL_EXTERNAL (to) = 1;
12921   TREE_READONLY (to) = 1;
12922 
12923   name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12924   name = targetm.strip_name_encoding (name);
12925   prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12926     ? "*__imp_" : "*__imp__";
12927   namelen = strlen (name);
12928   prefixlen = strlen (prefix);
12929   imp_name = (char *) alloca (namelen + prefixlen + 1);
12930   memcpy (imp_name, prefix, prefixlen);
12931   memcpy (imp_name + prefixlen, name, namelen + 1);
12932 
12933   name = ggc_alloc_string (imp_name, namelen + prefixlen);
12934   rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12935   SET_SYMBOL_REF_DECL (rtl, to);
12936   SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12937 
12938   rtl = gen_const_mem (Pmode, rtl);
12939   set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12940 
12941   SET_DECL_RTL (to, rtl);
12942   SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12943 
12944   return to;
12945 }
12946 
12947 /* Expand SYMBOL into its corresponding dllimport symbol.  WANT_REG is
12948    true if we require the result be a register.  */
12949 
12950 static rtx
12951 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12952 {
12953   tree imp_decl;
12954   rtx x;
12955 
12956   gcc_assert (SYMBOL_REF_DECL (symbol));
12957   imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12958 
12959   x = DECL_RTL (imp_decl);
12960   if (want_reg)
12961     x = force_reg (Pmode, x);
12962   return x;
12963 }
12964 
12965 /* Try machine-dependent ways of modifying an illegitimate address
12966    to be legitimate.  If we find one, return the new, valid address.
12967    This macro is used in only one place: `memory_address' in explow.c.
12968 
12969    OLDX is the address as it was before break_out_memory_refs was called.
12970    In some cases it is useful to look at this to decide what needs to be done.
12971 
12972    It is always safe for this macro to do nothing.  It exists to recognize
12973    opportunities to optimize the output.
12974 
12975    For the 80386, we handle X+REG by loading X into a register R and
12976    using R+REG.  R will go in a general reg and indexing will be used.
12977    However, if REG is a broken-out memory address or multiplication,
12978    nothing needs to be done because REG can certainly go in a general reg.
12979 
12980    When -fpic is used, special handling is needed for symbolic references.
12981    See comments by legitimize_pic_address in i386.c for details.  */
12982 
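
/* A small illustrative example (not exhaustive): an address such as
   (plus (ashift (reg) (const_int 2)) (reg)) is first rewritten below
   into (plus (mult (reg) (const_int 4)) (reg)), which matches the
   scaled-index forms that ix86_legitimate_address_p accepts.  */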
12983 static rtx
12984 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12985 			 enum machine_mode mode)
12986 {
12987   int changed = 0;
12988   unsigned log;
12989 
12990   log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12991   if (log)
12992     return legitimize_tls_address (x, (enum tls_model) log, false);
12993   if (GET_CODE (x) == CONST
12994       && GET_CODE (XEXP (x, 0)) == PLUS
12995       && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12996       && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12997     {
12998       rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12999 				      (enum tls_model) log, false);
13000       return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13001     }
13002 
13003   if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13004     {
13005       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13006 	return legitimize_dllimport_symbol (x, true);
13007       if (GET_CODE (x) == CONST
13008 	  && GET_CODE (XEXP (x, 0)) == PLUS
13009 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13010 	  && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13011 	{
13012 	  rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13013 	  return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13014 	}
13015     }
13016 
13017   if (flag_pic && SYMBOLIC_CONST (x))
13018     return legitimize_pic_address (x, 0);
13019 
13020 #if TARGET_MACHO
13021   if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13022     return machopic_indirect_data_reference (x, 0);
13023 #endif
13024 
13025   /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13026   if (GET_CODE (x) == ASHIFT
13027       && CONST_INT_P (XEXP (x, 1))
13028       && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13029     {
13030       changed = 1;
13031       log = INTVAL (XEXP (x, 1));
13032       x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13033 			GEN_INT (1 << log));
13034     }
13035 
13036   if (GET_CODE (x) == PLUS)
13037     {
13038       /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
13039 
13040       if (GET_CODE (XEXP (x, 0)) == ASHIFT
13041 	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13042 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13043 	{
13044 	  changed = 1;
13045 	  log = INTVAL (XEXP (XEXP (x, 0), 1));
13046 	  XEXP (x, 0) = gen_rtx_MULT (Pmode,
13047 				      force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13048 				      GEN_INT (1 << log));
13049 	}
13050 
13051       if (GET_CODE (XEXP (x, 1)) == ASHIFT
13052 	  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13053 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13054 	{
13055 	  changed = 1;
13056 	  log = INTVAL (XEXP (XEXP (x, 1), 1));
13057 	  XEXP (x, 1) = gen_rtx_MULT (Pmode,
13058 				      force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13059 				      GEN_INT (1 << log));
13060 	}
13061 
13062       /* Put multiply first if it isn't already.  */
13063       if (GET_CODE (XEXP (x, 1)) == MULT)
13064 	{
13065 	  rtx tmp = XEXP (x, 0);
13066 	  XEXP (x, 0) = XEXP (x, 1);
13067 	  XEXP (x, 1) = tmp;
13068 	  changed = 1;
13069 	}
13070 
13071       /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13072 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  This can be
13073 	 created by virtual register instantiation, register elimination, and
13074 	 similar optimizations.  */
13075       if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13076 	{
13077 	  changed = 1;
13078 	  x = gen_rtx_PLUS (Pmode,
13079 			    gen_rtx_PLUS (Pmode, XEXP (x, 0),
13080 					  XEXP (XEXP (x, 1), 0)),
13081 			    XEXP (XEXP (x, 1), 1));
13082 	}
13083 
13084       /* Canonicalize
13085 	 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13086 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  */
13087       else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13088 	       && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13089 	       && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13090 	       && CONSTANT_P (XEXP (x, 1)))
13091 	{
13092 	  rtx constant;
13093 	  rtx other = NULL_RTX;
13094 
13095 	  if (CONST_INT_P (XEXP (x, 1)))
13096 	    {
13097 	      constant = XEXP (x, 1);
13098 	      other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13099 	    }
13100 	  else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13101 	    {
13102 	      constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13103 	      other = XEXP (x, 1);
13104 	    }
13105 	  else
13106 	    constant = 0;
13107 
13108 	  if (constant)
13109 	    {
13110 	      changed = 1;
13111 	      x = gen_rtx_PLUS (Pmode,
13112 				gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13113 					      XEXP (XEXP (XEXP (x, 0), 1), 0)),
13114 				plus_constant (other, INTVAL (constant)));
13115 	    }
13116 	}
13117 
13118       if (changed && ix86_legitimate_address_p (mode, x, false))
13119 	return x;
13120 
13121       if (GET_CODE (XEXP (x, 0)) == MULT)
13122 	{
13123 	  changed = 1;
13124 	  XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13125 	}
13126 
13127       if (GET_CODE (XEXP (x, 1)) == MULT)
13128 	{
13129 	  changed = 1;
13130 	  XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13131 	}
13132 
13133       if (changed
13134 	  && REG_P (XEXP (x, 1))
13135 	  && REG_P (XEXP (x, 0)))
13136 	return x;
13137 
13138       if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13139 	{
13140 	  changed = 1;
13141 	  x = legitimize_pic_address (x, 0);
13142 	}
13143 
13144       if (changed && ix86_legitimate_address_p (mode, x, false))
13145 	return x;
13146 
13147       if (REG_P (XEXP (x, 0)))
13148 	{
13149 	  rtx temp = gen_reg_rtx (Pmode);
13150 	  rtx val  = force_operand (XEXP (x, 1), temp);
13151 	  if (val != temp)
13152 	    {
13153 	      if (GET_MODE (val) != Pmode)
13154 		val = convert_to_mode (Pmode, val, 1);
13155 	      emit_move_insn (temp, val);
13156 	    }
13157 
13158 	  XEXP (x, 1) = temp;
13159 	  return x;
13160 	}
13161 
13162       else if (REG_P (XEXP (x, 1)))
13163 	{
13164 	  rtx temp = gen_reg_rtx (Pmode);
13165 	  rtx val  = force_operand (XEXP (x, 0), temp);
13166 	  if (val != temp)
13167 	    {
13168 	      if (GET_MODE (val) != Pmode)
13169 		val = convert_to_mode (Pmode, val, 1);
13170 	      emit_move_insn (temp, val);
13171 	    }
13172 
13173 	  XEXP (x, 0) = temp;
13174 	  return x;
13175 	}
13176     }
13177 
13178   return x;
13179 }
13180 
13181 /* Print an integer constant expression in assembler syntax.  Addition
13182    and subtraction are the only arithmetic that may appear in these
13183    expressions.  FILE is the stdio stream to write to, X is the rtx, and
13184    CODE is the operand print code from the output string.  */
13185 
13186 static void
13187 output_pic_addr_const (FILE *file, rtx x, int code)
13188 {
13189   char buf[256];
13190 
13191   switch (GET_CODE (x))
13192     {
13193     case PC:
13194       gcc_assert (flag_pic);
13195       putc ('.', file);
13196       break;
13197 
13198     case SYMBOL_REF:
13199       if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13200 	output_addr_const (file, x);
13201       else
13202 	{
13203 	  const char *name = XSTR (x, 0);
13204 
13205 	  /* Mark the decl as referenced so that cgraph will
13206 	     output the function.  */
13207 	  if (SYMBOL_REF_DECL (x))
13208 	    mark_decl_referenced (SYMBOL_REF_DECL (x));
13209 
13210 #if TARGET_MACHO
13211 	  if (MACHOPIC_INDIRECT
13212 	      && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13213 	    name = machopic_indirection_name (x, /*stub_p=*/true);
13214 #endif
13215 	  assemble_name (file, name);
13216 	}
13217       if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13218 	  && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13219 	fputs ("@PLT", file);
13220       break;
13221 
13222     case LABEL_REF:
13223       x = XEXP (x, 0);
13224       /* FALLTHRU */
13225     case CODE_LABEL:
13226       ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13227       assemble_name (asm_out_file, buf);
13228       break;
13229 
13230     case CONST_INT:
13231       fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13232       break;
13233 
13234     case CONST:
13235       /* This used to output parentheses around the expression,
13236 	 but that does not work on the 386 (either ATT or BSD assembler).  */
13237       output_pic_addr_const (file, XEXP (x, 0), code);
13238       break;
13239 
13240     case CONST_DOUBLE:
13241       if (GET_MODE (x) == VOIDmode)
13242 	{
13243 	  /* We can use %d if the number is <32 bits and positive.  */
13244 	  if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13245 	    fprintf (file, "0x%lx%08lx",
13246 		     (unsigned long) CONST_DOUBLE_HIGH (x),
13247 		     (unsigned long) CONST_DOUBLE_LOW (x));
13248 	  else
13249 	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13250 	}
13251       else
13252 	/* We can't handle floating point constants;
13253 	   TARGET_PRINT_OPERAND must handle them.  */
13254 	output_operand_lossage ("floating constant misused");
13255       break;
13256 
13257     case PLUS:
13258       /* Some assemblers need integer constants to appear first.  */
13259       if (CONST_INT_P (XEXP (x, 0)))
13260 	{
13261 	  output_pic_addr_const (file, XEXP (x, 0), code);
13262 	  putc ('+', file);
13263 	  output_pic_addr_const (file, XEXP (x, 1), code);
13264 	}
13265       else
13266 	{
13267 	  gcc_assert (CONST_INT_P (XEXP (x, 1)));
13268 	  output_pic_addr_const (file, XEXP (x, 1), code);
13269 	  putc ('+', file);
13270 	  output_pic_addr_const (file, XEXP (x, 0), code);
13271 	}
13272       break;
13273 
13274     case MINUS:
13275       if (!TARGET_MACHO)
13276 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13277       output_pic_addr_const (file, XEXP (x, 0), code);
13278       putc ('-', file);
13279       output_pic_addr_const (file, XEXP (x, 1), code);
13280       if (!TARGET_MACHO)
13281 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13282       break;
13283 
13284      case UNSPEC:
13285        if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13286 	 {
13287 	   bool f = i386_asm_output_addr_const_extra (file, x);
13288 	   gcc_assert (f);
13289 	   break;
13290 	 }
13291 
13292        gcc_assert (XVECLEN (x, 0) == 1);
13293        output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13294        switch (XINT (x, 1))
13295 	{
13296 	case UNSPEC_GOT:
13297 	  fputs ("@GOT", file);
13298 	  break;
13299 	case UNSPEC_GOTOFF:
13300 	  fputs ("@GOTOFF", file);
13301 	  break;
13302 	case UNSPEC_PLTOFF:
13303 	  fputs ("@PLTOFF", file);
13304 	  break;
13305 	case UNSPEC_PCREL:
13306 	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13307 		 "(%rip)" : "[rip]", file);
13308 	  break;
13309 	case UNSPEC_GOTPCREL:
13310 	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13311 		 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13312 	  break;
13313 	case UNSPEC_GOTTPOFF:
13314 	  /* FIXME: This might be @TPOFF in Sun ld too.  */
13315 	  fputs ("@gottpoff", file);
13316 	  break;
13317 	case UNSPEC_TPOFF:
13318 	  fputs ("@tpoff", file);
13319 	  break;
13320 	case UNSPEC_NTPOFF:
13321 	  if (TARGET_64BIT)
13322 	    fputs ("@tpoff", file);
13323 	  else
13324 	    fputs ("@ntpoff", file);
13325 	  break;
13326 	case UNSPEC_DTPOFF:
13327 	  fputs ("@dtpoff", file);
13328 	  break;
13329 	case UNSPEC_GOTNTPOFF:
13330 	  if (TARGET_64BIT)
13331 	    fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13332 		   "@gottpoff(%rip)": "@gottpoff[rip]", file);
13333 	  else
13334 	    fputs ("@gotntpoff", file);
13335 	  break;
13336 	case UNSPEC_INDNTPOFF:
13337 	  fputs ("@indntpoff", file);
13338 	  break;
13339 #if TARGET_MACHO
13340 	case UNSPEC_MACHOPIC_OFFSET:
13341 	  putc ('-', file);
13342 	  machopic_output_function_base_name (file);
13343 	  break;
13344 #endif
13345 	default:
13346 	  output_operand_lossage ("invalid UNSPEC as operand");
13347 	  break;
13348 	}
13349        break;
13350 
13351     default:
13352       output_operand_lossage ("invalid expression as operand");
13353     }
13354 }
13355 
13356 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13357    We need to emit DTP-relative relocations.  */
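
/* For example, with the usual ASM_LONG definition this emits something
   like ".long foo@dtpoff" for SIZE 4 and ".long foo@dtpoff, 0" for
   SIZE 8 (illustrative; the exact directive comes from ASM_LONG).  */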
13358 
13359 static void ATTRIBUTE_UNUSED
13360 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13361 {
13362   fputs (ASM_LONG, file);
13363   output_addr_const (file, x);
13364   fputs ("@dtpoff", file);
13365   switch (size)
13366     {
13367     case 4:
13368       break;
13369     case 8:
13370       fputs (", 0", file);
13371       break;
13372     default:
13373       gcc_unreachable ();
13374    }
13375 }
13376 
13377 /* Return true if X is a representation of the PIC register.  This copes
13378    with calls from ix86_find_base_term, where the register might have
13379    been replaced by a cselib value.  */
13380 
13381 static bool
13382 ix86_pic_register_p (rtx x)
13383 {
13384   if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13385     return (pic_offset_table_rtx
13386 	    && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13387   else
13388     return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13389 }
13390 
13391 /* Helper function for ix86_delegitimize_address.
13392    Attempt to delegitimize TLS local-exec accesses.  */
13393 
13394 static rtx
13395 ix86_delegitimize_tls_address (rtx orig_x)
13396 {
13397   rtx x = orig_x, unspec;
13398   struct ix86_address addr;
13399 
13400   if (!TARGET_TLS_DIRECT_SEG_REFS)
13401     return orig_x;
13402   if (MEM_P (x))
13403     x = XEXP (x, 0);
13404   if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13405     return orig_x;
13406   if (ix86_decompose_address (x, &addr) == 0
13407       || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13408       || addr.disp == NULL_RTX
13409       || GET_CODE (addr.disp) != CONST)
13410     return orig_x;
13411   unspec = XEXP (addr.disp, 0);
13412   if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13413     unspec = XEXP (unspec, 0);
13414   if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13415     return orig_x;
13416   x = XVECEXP (unspec, 0, 0);
13417   gcc_assert (GET_CODE (x) == SYMBOL_REF);
13418   if (unspec != XEXP (addr.disp, 0))
13419     x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13420   if (addr.index)
13421     {
13422       rtx idx = addr.index;
13423       if (addr.scale != 1)
13424 	idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13425       x = gen_rtx_PLUS (Pmode, idx, x);
13426     }
13427   if (addr.base)
13428     x = gen_rtx_PLUS (Pmode, addr.base, x);
13429   if (MEM_P (orig_x))
13430     x = replace_equiv_address_nv (orig_x, x);
13431   return x;
13432 }
13433 
13434 /* In the name of slightly smaller debug output, and to cater to
13435    general assembler lossage, recognize PIC+GOTOFF and turn it back
13436    into a direct symbol reference.
13437 
13438    On Darwin, this is necessary to avoid a crash, because Darwin
13439    has a different PIC label for each routine but the DWARF debugging
13440    information is not associated with any particular routine, so it's
13441    necessary to remove references to the PIC label from RTL stored by
13442    the DWARF output code.  */
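
/* Illustrative example: for 32-bit PIC, an address of the form
   (plus %ebx (const (unspec [foo] UNSPEC_GOTOFF))) is turned back into
   the plain SYMBOL_REF `foo', plus any constant or register addend.  */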
13443 
13444 static rtx
13445 ix86_delegitimize_address (rtx x)
13446 {
13447   rtx orig_x = delegitimize_mem_from_attrs (x);
13448   /* addend is NULL or some rtx if x is something+GOTOFF where
13449      something doesn't include the PIC register.  */
13450   rtx addend = NULL_RTX;
13451   /* reg_addend is NULL or a multiple of some register.  */
13452   rtx reg_addend = NULL_RTX;
13453   /* const_addend is NULL or a const_int.  */
13454   rtx const_addend = NULL_RTX;
13455   /* This is the result, or NULL.  */
13456   rtx result = NULL_RTX;
13457 
13458   x = orig_x;
13459 
13460   if (MEM_P (x))
13461     x = XEXP (x, 0);
13462 
13463   if (TARGET_64BIT)
13464     {
13465       if (GET_CODE (x) == CONST
13466           && GET_CODE (XEXP (x, 0)) == PLUS
13467           && GET_MODE (XEXP (x, 0)) == Pmode
13468           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13469           && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13470           && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13471         {
13472 	  rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13473 	  x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13474 	  if (MEM_P (orig_x))
13475 	    x = replace_equiv_address_nv (orig_x, x);
13476 	  return x;
13477 	}
13478       if (GET_CODE (x) != CONST
13479 	  || GET_CODE (XEXP (x, 0)) != UNSPEC
13480 	  || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13481 	      && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13482 	  || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13483 	return ix86_delegitimize_tls_address (orig_x);
13484       x = XVECEXP (XEXP (x, 0), 0, 0);
13485       if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13486 	{
13487 	  x = simplify_gen_subreg (GET_MODE (orig_x), x,
13488 				   GET_MODE (x), 0);
13489 	  if (x == NULL_RTX)
13490 	    return orig_x;
13491 	}
13492       return x;
13493     }
13494 
13495   if (GET_CODE (x) != PLUS
13496       || GET_CODE (XEXP (x, 1)) != CONST)
13497     return ix86_delegitimize_tls_address (orig_x);
13498 
13499   if (ix86_pic_register_p (XEXP (x, 0)))
13500     /* %ebx + GOT/GOTOFF */
13501     ;
13502   else if (GET_CODE (XEXP (x, 0)) == PLUS)
13503     {
13504       /* %ebx + %reg * scale + GOT/GOTOFF */
13505       reg_addend = XEXP (x, 0);
13506       if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13507 	reg_addend = XEXP (reg_addend, 1);
13508       else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13509 	reg_addend = XEXP (reg_addend, 0);
13510       else
13511 	{
13512 	  reg_addend = NULL_RTX;
13513 	  addend = XEXP (x, 0);
13514 	}
13515     }
13516   else
13517     addend = XEXP (x, 0);
13518 
13519   x = XEXP (XEXP (x, 1), 0);
13520   if (GET_CODE (x) == PLUS
13521       && CONST_INT_P (XEXP (x, 1)))
13522     {
13523       const_addend = XEXP (x, 1);
13524       x = XEXP (x, 0);
13525     }
13526 
13527   if (GET_CODE (x) == UNSPEC
13528       && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13529 	  || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13530     result = XVECEXP (x, 0, 0);
13531 
13532   if (TARGET_MACHO && darwin_local_data_pic (x)
13533       && !MEM_P (orig_x))
13534     result = XVECEXP (x, 0, 0);
13535 
13536   if (! result)
13537     return ix86_delegitimize_tls_address (orig_x);
13538 
13539   if (const_addend)
13540     result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13541   if (reg_addend)
13542     result = gen_rtx_PLUS (Pmode, reg_addend, result);
13543   if (addend)
13544     {
13545       /* If the rest of original X doesn't involve the PIC register, add
13546 	 addend and subtract pic_offset_table_rtx.  This can happen e.g.
13547 	 for code like:
13548 	 leal (%ebx, %ecx, 4), %ecx
13549 	 ...
13550 	 movl foo@GOTOFF(%ecx), %edx
13551 	 in which case we return (%ecx - %ebx) + foo.  */
13552       if (pic_offset_table_rtx)
13553         result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13554 						     pic_offset_table_rtx),
13555 			       result);
13556       else
13557 	return orig_x;
13558     }
13559   if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13560     {
13561       result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13562       if (result == NULL_RTX)
13563 	return orig_x;
13564     }
13565   return result;
13566 }
13567 
13568 /* If X is a machine specific address (i.e. a symbol or label being
13569    referenced as a displacement from the GOT implemented using an
13570    UNSPEC), then return the base term.  Otherwise return X.  */
13571 
13572 rtx
13573 ix86_find_base_term (rtx x)
13574 {
13575   rtx term;
13576 
13577   if (TARGET_64BIT)
13578     {
13579       if (GET_CODE (x) != CONST)
13580 	return x;
13581       term = XEXP (x, 0);
13582       if (GET_CODE (term) == PLUS
13583 	  && (CONST_INT_P (XEXP (term, 1))
13584 	      || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13585 	term = XEXP (term, 0);
13586       if (GET_CODE (term) != UNSPEC
13587 	  || (XINT (term, 1) != UNSPEC_GOTPCREL
13588 	      && XINT (term, 1) != UNSPEC_PCREL))
13589 	return x;
13590 
13591       return XVECEXP (term, 0, 0);
13592     }
13593 
13594   return ix86_delegitimize_address (x);
13595 }
13596 
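/* Print to FILE the condition-code suffix (e.g. "e", "ne", "l", "a")
   corresponding to comparison CODE in mode MODE.  If REVERSE is
   nonzero, the condition is reversed first.  FP nonzero selects the
   spellings needed for fcmov on some assemblers, e.g. "nbe" rather
   than "a" and "u" rather than "p".  */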
13597 static void
13598 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13599 		    int fp, FILE *file)
13600 {
13601   const char *suffix;
13602 
13603   if (mode == CCFPmode || mode == CCFPUmode)
13604     {
13605       code = ix86_fp_compare_code_to_integer (code);
13606       mode = CCmode;
13607     }
13608   if (reverse)
13609     code = reverse_condition (code);
13610 
13611   switch (code)
13612     {
13613     case EQ:
13614       switch (mode)
13615 	{
13616 	case CCAmode:
13617 	  suffix = "a";
13618 	  break;
13619 
13620 	case CCCmode:
13621 	  suffix = "c";
13622 	  break;
13623 
13624 	case CCOmode:
13625 	  suffix = "o";
13626 	  break;
13627 
13628 	case CCSmode:
13629 	  suffix = "s";
13630 	  break;
13631 
13632 	default:
13633 	  suffix = "e";
13634 	}
13635       break;
13636     case NE:
13637       switch (mode)
13638 	{
13639 	case CCAmode:
13640 	  suffix = "na";
13641 	  break;
13642 
13643 	case CCCmode:
13644 	  suffix = "nc";
13645 	  break;
13646 
13647 	case CCOmode:
13648 	  suffix = "no";
13649 	  break;
13650 
13651 	case CCSmode:
13652 	  suffix = "ns";
13653 	  break;
13654 
13655 	default:
13656 	  suffix = "ne";
13657 	}
13658       break;
13659     case GT:
13660       gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13661       suffix = "g";
13662       break;
13663     case GTU:
13664       /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13665 	 Those same assemblers have the same but opposite lossage on cmov.  */
13666       if (mode == CCmode)
13667 	suffix = fp ? "nbe" : "a";
13668       else
13669 	gcc_unreachable ();
13670       break;
13671     case LT:
13672       switch (mode)
13673 	{
13674 	case CCNOmode:
13675 	case CCGOCmode:
13676 	  suffix = "s";
13677 	  break;
13678 
13679 	case CCmode:
13680 	case CCGCmode:
13681 	  suffix = "l";
13682 	  break;
13683 
13684 	default:
13685 	  gcc_unreachable ();
13686 	}
13687       break;
13688     case LTU:
13689       if (mode == CCmode)
13690 	suffix = "b";
13691       else if (mode == CCCmode)
13692 	suffix = "c";
13693       else
13694 	gcc_unreachable ();
13695       break;
13696     case GE:
13697       switch (mode)
13698 	{
13699 	case CCNOmode:
13700 	case CCGOCmode:
13701 	  suffix = "ns";
13702 	  break;
13703 
13704 	case CCmode:
13705 	case CCGCmode:
13706 	  suffix = "ge";
13707 	  break;
13708 
13709 	default:
13710 	  gcc_unreachable ();
13711 	}
13712       break;
13713     case GEU:
13714       if (mode == CCmode)
13715 	suffix = fp ? "nb" : "ae";
13716       else if (mode == CCCmode)
13717 	suffix = "nc";
13718       else
13719 	gcc_unreachable ();
13720       break;
13721     case LE:
13722       gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13723       suffix = "le";
13724       break;
13725     case LEU:
13726       if (mode == CCmode)
13727 	suffix = "be";
13728       else
13729 	gcc_unreachable ();
13730       break;
13731     case UNORDERED:
13732       suffix = fp ? "u" : "p";
13733       break;
13734     case ORDERED:
13735       suffix = fp ? "nu" : "np";
13736       break;
13737     default:
13738       gcc_unreachable ();
13739     }
13740   fputs (suffix, file);
13741 }
13742 
13743 /* Print the name of register X to FILE based on its machine mode and number.
13744    If CODE is 'w', pretend the mode is HImode.
13745    If CODE is 'b', pretend the mode is QImode.
13746    If CODE is 'k', pretend the mode is SImode.
13747    If CODE is 'q', pretend the mode is DImode.
13748    If CODE is 'x', pretend the mode is V4SFmode.
13749    If CODE is 't', pretend the mode is V8SFmode.
13750    If CODE is 'h', pretend the reg is the 'high' byte register.
13751    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13752    If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13753    If CODE is 'd', duplicate the operand for an AVX instruction.
13754 
13755 void
13756 print_reg (rtx x, int code, FILE *file)
13757 {
13758   const char *reg;
13759   unsigned int regno;
13760   bool duplicated = code == 'd' && TARGET_AVX;
13761 
13762   if (ASSEMBLER_DIALECT == ASM_ATT)
13763     putc ('%', file);
13764 
13765   if (x == pc_rtx)
13766     {
13767       gcc_assert (TARGET_64BIT);
13768       fputs ("rip", file);
13769       return;
13770     }
13771 
13772   regno = true_regnum (x);
13773   gcc_assert (regno != ARG_POINTER_REGNUM
13774 	      && regno != FRAME_POINTER_REGNUM
13775 	      && regno != FLAGS_REG
13776 	      && regno != FPSR_REG
13777 	      && regno != FPCR_REG);
13778 
13779   if (code == 'w' || MMX_REG_P (x))
13780     code = 2;
13781   else if (code == 'b')
13782     code = 1;
13783   else if (code == 'k')
13784     code = 4;
13785   else if (code == 'q')
13786     code = 8;
13787   else if (code == 'y')
13788     code = 3;
13789   else if (code == 'h')
13790     code = 0;
13791   else if (code == 'x')
13792     code = 16;
13793   else if (code == 't')
13794     code = 32;
13795   else
13796     code = GET_MODE_SIZE (GET_MODE (x));
13797 
13798   /* Irritatingly, AMD extended registers use different naming convention
13799   /* Irritatingly, the AMD extended registers use a different naming
13800      convention from the normal registers: "r%d[bwd]".  */
13801     {
13802       gcc_assert (TARGET_64BIT);
13803       putc ('r', file);
13804       fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13805       switch (code)
13806 	{
13807 	  case 0:
13808 	    error ("extended registers have no high halves");
13809 	    break;
13810 	  case 1:
13811 	    putc ('b', file);
13812 	    break;
13813 	  case 2:
13814 	    putc ('w', file);
13815 	    break;
13816 	  case 4:
13817 	    putc ('d', file);
13818 	    break;
13819 	  case 8:
13820 	    /* no suffix */
13821 	    break;
13822 	  default:
13823 	    error ("unsupported operand size for extended register");
13824 	    break;
13825 	}
13826       return;
13827     }
13828 
13829   reg = NULL;
13830   switch (code)
13831     {
13832     case 3:
13833       if (STACK_TOP_P (x))
13834 	{
13835 	  reg = "st(0)";
13836 	  break;
13837 	}
13838       /* FALLTHRU */
13839     case 8:
13840     case 4:
13841     case 12:
13842       if (! ANY_FP_REG_P (x))
13843 	putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13844       /* FALLTHRU */
13845     case 16:
13846     case 2:
13847     normal:
13848       reg = hi_reg_name[regno];
13849       break;
13850     case 1:
13851       if (regno >= ARRAY_SIZE (qi_reg_name))
13852 	goto normal;
13853       reg = qi_reg_name[regno];
13854       break;
13855     case 0:
13856       if (regno >= ARRAY_SIZE (qi_high_reg_name))
13857 	goto normal;
13858       reg = qi_high_reg_name[regno];
13859       break;
13860     case 32:
13861       if (SSE_REG_P (x))
13862 	{
13863 	  gcc_assert (!duplicated);
13864 	  putc ('y', file);
13865 	  fputs (hi_reg_name[regno] + 1, file);
13866 	  return;
13867 	}
13868       break;
13869     default:
13870       gcc_unreachable ();
13871     }
13872 
13873   fputs (reg, file);
13874   if (duplicated)
13875     {
13876       if (ASSEMBLER_DIALECT == ASM_ATT)
13877 	fprintf (file, ", %%%s", reg);
13878       else
13879 	fprintf (file, ", %s", reg);
13880     }
13881 }
13882 
13883 /* Locate some local-dynamic symbol still in use by this function
13884    so that we can print its name in some tls_local_dynamic_base
13885    pattern.  */
13886 
13887 static int
13888 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13889 {
13890   rtx x = *px;
13891 
13892   if (GET_CODE (x) == SYMBOL_REF
13893       && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13894     {
13895       cfun->machine->some_ld_name = XSTR (x, 0);
13896       return 1;
13897     }
13898 
13899   return 0;
13900 }
13901 
13902 static const char *
13903 get_some_local_dynamic_name (void)
13904 {
13905   rtx insn;
13906 
13907   if (cfun->machine->some_ld_name)
13908     return cfun->machine->some_ld_name;
13909 
13910   for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13911     if (NONDEBUG_INSN_P (insn)
13912 	&& for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13913       return cfun->machine->some_ld_name;
13914 
13915   return NULL;
13916 }
13917 
13918 /* Meaning of CODE:
13919    L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13920    C -- print opcode suffix for set/cmov insn.
13921    c -- like C, but print reversed condition
13922    F,f -- likewise, but for floating-point.
13923    O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13924         otherwise nothing
13925    R -- print the prefix for register names.
13926    z -- print the opcode suffix for the size of the current operand.
13927    Z -- likewise, with special suffixes for x87 instructions.
13928    * -- print a star (in certain assembler syntax)
13929    A -- print an absolute memory reference.
13930    E -- print address with DImode register names if TARGET_64BIT.
13931    w -- print the operand as if it's a "word" (HImode) even if it isn't.
13932    s -- print a shift double count, followed by the assemblers argument
13933    s -- print a shift double count, followed by the assembler's argument
13934    b -- print the QImode name of the register for the indicated operand.
13935 	%b0 would print %al if operands[0] is reg 0.
13936    w --  likewise, print the HImode name of the register.
13937    k --  likewise, print the SImode name of the register.
13938    q --  likewise, print the DImode name of the register.
13939    x --  likewise, print the V4SFmode name of the register.
13940    t --  likewise, print the V8SFmode name of the register.
13941    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13942    y -- print "st(0)" instead of "st" as a register.
13943    d -- print duplicated register operand for AVX instruction.
13944    D -- print condition for SSE cmp instruction.
13945    P -- if PIC, print an @PLT suffix.
13946    p -- print raw symbol name.
13947    X -- don't print any sort of PIC '@' suffix for a symbol.
13948    & -- print some in-use local-dynamic symbol name.
13949    H -- print a memory address offset by 8; used for sse high-parts
13950    Y -- print condition for XOP pcom* instruction.
13951    + -- print a branch hint as 'cs' or 'ds' prefix
13952    ; -- print a semicolon (after prefixes due to bug in older gas).
13953    ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13954    @ -- print a segment register of thread base pointer load
13955  */
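
/* For example, if operands[0] is register 0 (the ax register), then in
   an insn template "%k0" prints %eax, "%b0" prints %al and "%q0" prints
   %rax (AT&T dialect; the Intel dialect omits the '%').  */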
13956 
13957 void
13958 ix86_print_operand (FILE *file, rtx x, int code)
13959 {
13960   if (code)
13961     {
13962       switch (code)
13963 	{
13964 	case '*':
13965 	  if (ASSEMBLER_DIALECT == ASM_ATT)
13966 	    putc ('*', file);
13967 	  return;
13968 
13969 	case '&':
13970 	  {
13971 	    const char *name = get_some_local_dynamic_name ();
13972 	    if (name == NULL)
13973 	      output_operand_lossage ("'%%&' used without any "
13974 				      "local dynamic TLS references");
13975 	    else
13976 	      assemble_name (file, name);
13977 	    return;
13978 	  }
13979 
13980 	case 'A':
13981 	  switch (ASSEMBLER_DIALECT)
13982 	    {
13983 	    case ASM_ATT:
13984 	      putc ('*', file);
13985 	      break;
13986 
13987 	    case ASM_INTEL:
13988 	      /* Intel syntax. For absolute addresses, registers should not
13989 	      /* Intel syntax.  For absolute addresses, registers should not
13990 		 be surrounded by brackets.  */
13991 		{
13992 		  putc ('[', file);
13993 		  ix86_print_operand (file, x, 0);
13994 		  putc (']', file);
13995 		  return;
13996 		}
13997 	      break;
13998 
13999 	    default:
14000 	      gcc_unreachable ();
14001 	    }
14002 
14003 	  ix86_print_operand (file, x, 0);
14004 	  return;
14005 
14006 	case 'E':
14007 	  /* Wrap address in an UNSPEC to declare special handling.  */
14008 	  if (TARGET_64BIT)
14009 	    x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14010 
14011 	  output_address (x);
14012 	  return;
14013 
14014 	case 'L':
14015 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14016 	    putc ('l', file);
14017 	  return;
14018 
14019 	case 'W':
14020 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14021 	    putc ('w', file);
14022 	  return;
14023 
14024 	case 'B':
14025 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14026 	    putc ('b', file);
14027 	  return;
14028 
14029 	case 'Q':
14030 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14031 	    putc ('l', file);
14032 	  return;
14033 
14034 	case 'S':
14035 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14036 	    putc ('s', file);
14037 	  return;
14038 
14039 	case 'T':
14040 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14041 	    putc ('t', file);
14042 	  return;
14043 
14044 	case 'z':
14045 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14046 	    {
14047 	      /* Opcodes don't get size suffixes when using Intel syntax.  */
14048 	      if (ASSEMBLER_DIALECT == ASM_INTEL)
14049 		return;
14050 
14051 	      switch (GET_MODE_SIZE (GET_MODE (x)))
14052 		{
14053 		case 1:
14054 		  putc ('b', file);
14055 		  return;
14056 
14057 		case 2:
14058 		  putc ('w', file);
14059 		  return;
14060 
14061 		case 4:
14062 		  putc ('l', file);
14063 		  return;
14064 
14065 		case 8:
14066 		  putc ('q', file);
14067 		  return;
14068 
14069 		default:
14070 		  output_operand_lossage
14071 		    ("invalid operand size for operand code '%c'", code);
14072 		  return;
14073 		}
14074 	    }
14075 
14076 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14077 	    warning
14078 	      (0, "non-integer operand used with operand code '%c'", code);
14079 	  /* FALLTHRU */
14080 
14081 	case 'Z':
14082 	  /* 387 opcodes don't get size suffixes when using Intel syntax.  */
14083 	  if (ASSEMBLER_DIALECT == ASM_INTEL)
14084 	    return;
14085 
14086 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14087 	    {
14088 	      switch (GET_MODE_SIZE (GET_MODE (x)))
14089 		{
14090 		case 2:
14091 #ifdef HAVE_AS_IX86_FILDS
14092 		  putc ('s', file);
14093 #endif
14094 		  return;
14095 
14096 		case 4:
14097 		  putc ('l', file);
14098 		  return;
14099 
14100 		case 8:
14101 #ifdef HAVE_AS_IX86_FILDQ
14102 		  putc ('q', file);
14103 #else
14104 		  fputs ("ll", file);
14105 #endif
14106 		  return;
14107 
14108 		default:
14109 		  break;
14110 		}
14111 	    }
14112 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14113 	    {
14114 	      /* 387 opcodes don't get size suffixes
14115 		 if the operands are registers.  */
14116 	      if (STACK_REG_P (x))
14117 		return;
14118 
14119 	      switch (GET_MODE_SIZE (GET_MODE (x)))
14120 		{
14121 		case 4:
14122 		  putc ('s', file);
14123 		  return;
14124 
14125 		case 8:
14126 		  putc ('l', file);
14127 		  return;
14128 
14129 		case 12:
14130 		case 16:
14131 		  putc ('t', file);
14132 		  return;
14133 
14134 		default:
14135 		  break;
14136 		}
14137 	    }
14138 	  else
14139 	    {
14140 	      output_operand_lossage
14141 		("invalid operand type used with operand code '%c'", code);
14142 	      return;
14143 	    }
14144 
14145 	  output_operand_lossage
14146 	    ("invalid operand size for operand code '%c'", code);
14147 	  return;
14148 
14149 	case 'd':
14150 	case 'b':
14151 	case 'w':
14152 	case 'k':
14153 	case 'q':
14154 	case 'h':
14155 	case 't':
14156 	case 'y':
14157 	case 'x':
14158 	case 'X':
14159 	case 'P':
14160 	case 'p':
14161 	  break;
14162 
14163 	case 's':
14164 	  if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14165 	    {
14166 	      ix86_print_operand (file, x, 0);
14167 	      fputs (", ", file);
14168 	    }
14169 	  return;
14170 
14171 	case 'D':
14172 	  /* Little bit of braindamage here.  The SSE compare instructions
14173 	     use completely different names for the comparisons than the
14174 	     fp conditional moves do.  */
14175 	  if (TARGET_AVX)
14176 	    {
14177 	      switch (GET_CODE (x))
14178 		{
14179 		case EQ:
14180 		  fputs ("eq", file);
14181 		  break;
14182 		case UNEQ:
14183 		  fputs ("eq_us", file);
14184 		  break;
14185 		case LT:
14186 		  fputs ("lt", file);
14187 		  break;
14188 		case UNLT:
14189 		  fputs ("nge", file);
14190 		  break;
14191 		case LE:
14192 		  fputs ("le", file);
14193 		  break;
14194 		case UNLE:
14195 		  fputs ("ngt", file);
14196 		  break;
14197 		case UNORDERED:
14198 		  fputs ("unord", file);
14199 		  break;
14200 		case NE:
14201 		  fputs ("neq", file);
14202 		  break;
14203 		case LTGT:
14204 		  fputs ("neq_oq", file);
14205 		  break;
14206 		case GE:
14207 		  fputs ("ge", file);
14208 		  break;
14209 		case UNGE:
14210 		  fputs ("nlt", file);
14211 		  break;
14212 		case GT:
14213 		  fputs ("gt", file);
14214 		  break;
14215 		case UNGT:
14216 		  fputs ("nle", file);
14217 		  break;
14218 		case ORDERED:
14219 		  fputs ("ord", file);
14220 		  break;
14221 		default:
14222 		  output_operand_lossage ("operand is not a condition code, "
14223 					  "invalid operand code 'D'");
14224 		  return;
14225 		}
14226 	    }
14227 	  else
14228 	    {
14229 	      switch (GET_CODE (x))
14230 		{
14231 		case EQ:
14232 		case UNEQ:
14233 		  fputs ("eq", file);
14234 		  break;
14235 		case LT:
14236 		case UNLT:
14237 		  fputs ("lt", file);
14238 		  break;
14239 		case LE:
14240 		case UNLE:
14241 		  fputs ("le", file);
14242 		  break;
14243 		case UNORDERED:
14244 		  fputs ("unord", file);
14245 		  break;
14246 		case NE:
14247 		case LTGT:
14248 		  fputs ("neq", file);
14249 		  break;
14250 		case UNGE:
14251 		case GE:
14252 		  fputs ("nlt", file);
14253 		  break;
14254 		case UNGT:
14255 		case GT:
14256 		  fputs ("nle", file);
14257 		  break;
14258 		case ORDERED:
14259 		  fputs ("ord", file);
14260 		  break;
14261 		default:
14262 		  output_operand_lossage ("operand is not a condition code, "
14263 					  "invalid operand code 'D'");
14264 		  return;
14265 		}
14266 	    }
14267 	  return;
14268 	case 'O':
14269 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14270 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14271 	    {
14272 	      switch (GET_MODE (x))
14273 		{
14274 		case HImode: putc ('w', file); break;
14275 		case SImode:
14276 		case SFmode: putc ('l', file); break;
14277 		case DImode:
14278 		case DFmode: putc ('q', file); break;
14279 		default: gcc_unreachable ();
14280 		}
14281 	      putc ('.', file);
14282 	    }
14283 #endif
14284 	  return;
14285 	case 'C':
14286 	  if (!COMPARISON_P (x))
14287 	    {
14288 	      output_operand_lossage ("operand is neither a constant nor a "
14289 				      "condition code, invalid operand code "
14290 				      "'C'");
14291 	      return;
14292 	    }
14293 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14294 	  return;
14295 	case 'F':
14296 	  if (!COMPARISON_P (x))
14297 	    {
14298 	      output_operand_lossage ("operand is neither a constant nor a "
14299 				      "condition code, invalid operand code "
14300 				      "'F'");
14301 	      return;
14302 	    }
14303 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14304 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14305 	    putc ('.', file);
14306 #endif
14307 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14308 	  return;
14309 
14310 	  /* Like above, but reverse condition */
14311 	case 'c':
14312 	  /* Check to see if argument to %c is really a constant
14313 	     and not a condition code which needs to be reversed.  */
14314 	  if (!COMPARISON_P (x))
14315 	    {
14316 	      output_operand_lossage ("operand is neither a constant nor a "
14317 				      "condition code, invalid operand "
14318 				      "code 'c'");
14319 	      return;
14320 	    }
14321 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14322 	  return;
14323 	case 'f':
14324 	  if (!COMPARISON_P (x))
14325 	    {
14326 	      output_operand_lossage ("operand is neither a constant nor a "
14327 				      "condition code, invalid operand "
14328 				      "code 'f'");
14329 	      return;
14330 	    }
14331 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14332 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14333 	    putc ('.', file);
14334 #endif
14335 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14336 	  return;
14337 
14338 	case 'H':
14339 	  if (!offsettable_memref_p (x))
14340 	    {
14341 	      output_operand_lossage ("operand is not an offsettable memory "
14342 				      "reference, invalid operand "
14343 				      "code 'H'");
14344 	      return;
14345 	    }
14346 	  /* It doesn't actually matter what mode we use here, as we're
14347 	     only going to use this for printing.  */
14348 	  x = adjust_address_nv (x, DImode, 8);
14349 	  break;
14350 
14351 	case '+':
14352 	  {
14353 	    rtx x;
14354 
14355 	    if (!optimize
14356 	        || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
14357 	      return;
14358 
14359 	    x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14360 	    if (x)
14361 	      {
14362 		int pred_val = INTVAL (XEXP (x, 0));
14363 
14364 		if (pred_val < REG_BR_PROB_BASE * 45 / 100
14365 		    || pred_val > REG_BR_PROB_BASE * 55 / 100)
14366 		  {
14367 		    int taken = pred_val > REG_BR_PROB_BASE / 2;
14368 		    int cputaken = final_forward_branch_p (current_output_insn) == 0;
14369 
14370 		    /* Emit hints only in the case default branch prediction
14371 		    /* Emit hints only in the case where the default branch
14372 		       prediction heuristics would fail.  */
14373 		      {
14374 			/* We use 3e (DS) prefix for taken branches and
14375 			   2e (CS) prefix for not taken branches.  */
14376 			if (taken)
14377 			  fputs ("ds ; ", file);
14378 			else
14379 			  fputs ("cs ; ", file);
14380 		      }
14381 		  }
14382 	      }
14383 	    return;
14384 	  }
14385 
14386 	case 'Y':
14387 	  switch (GET_CODE (x))
14388 	    {
14389 	    case NE:
14390 	      fputs ("neq", file);
14391 	      break;
14392 	    case EQ:
14393 	      fputs ("eq", file);
14394 	      break;
14395 	    case GE:
14396 	    case GEU:
14397 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14398 	      break;
14399 	    case GT:
14400 	    case GTU:
14401 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14402 	      break;
14403 	    case LE:
14404 	    case LEU:
14405 	      fputs ("le", file);
14406 	      break;
14407 	    case LT:
14408 	    case LTU:
14409 	      fputs ("lt", file);
14410 	      break;
14411 	    case UNORDERED:
14412 	      fputs ("unord", file);
14413 	      break;
14414 	    case ORDERED:
14415 	      fputs ("ord", file);
14416 	      break;
14417 	    case UNEQ:
14418 	      fputs ("ueq", file);
14419 	      break;
14420 	    case UNGE:
14421 	      fputs ("nlt", file);
14422 	      break;
14423 	    case UNGT:
14424 	      fputs ("nle", file);
14425 	      break;
14426 	    case UNLE:
14427 	      fputs ("ule", file);
14428 	      break;
14429 	    case UNLT:
14430 	      fputs ("ult", file);
14431 	      break;
14432 	    case LTGT:
14433 	      fputs ("une", file);
14434 	      break;
14435 	    default:
14436 	      output_operand_lossage ("operand is not a condition code, "
14437 				      "invalid operand code 'Y'");
14438 	      return;
14439 	    }
14440 	  return;
14441 
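	  /* The ';' code prints a separating semicolon after a rep/lock
	     prefix for assemblers that cannot handle it as a true prefix
	     (no HAVE_AS_IX86_REP_LOCK_PREFIX).  */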
14442 	case ';':
14443 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14444 	  putc (';', file);
14445 #endif
14446 	  return;
14447 
14448 	case '@':
14449 	  if (ASSEMBLER_DIALECT == ASM_ATT)
14450 	    putc ('%', file);
14451 
14452 	  /* The kernel uses a different segment register for performance
14453 	     reasons; a system call would not have to trash the userspace
14454 	     segment register, which would be expensive.  */
14455 	  if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14456 	    fputs ("fs", file);
14457 	  else
14458 	    fputs ("gs", file);
14459 	  return;
14460 
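	  /* '~' prints 'i' when AVX2 is available and 'f' otherwise, used
	     to pick the integer or floating point variant of an AVX
	     mnemonic.  */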
14461 	case '~':
14462 	  putc (TARGET_AVX2 ? 'i' : 'f', file);
14463 	  return;
14464 
14465 	default:
14466 	    output_operand_lossage ("invalid operand code '%c'", code);
14467 	}
14468     }
14469 
14470   if (REG_P (x))
14471     print_reg (x, code, file);
14472 
14473   else if (MEM_P (x))
14474     {
14475       /* No `byte ptr' prefix for call instructions or BLKmode operands.  */
14476       if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14477 	  && GET_MODE (x) != BLKmode)
14478 	{
14479 	  const char * size;
14480 	  switch (GET_MODE_SIZE (GET_MODE (x)))
14481 	    {
14482 	    case 1: size = "BYTE"; break;
14483 	    case 2: size = "WORD"; break;
14484 	    case 4: size = "DWORD"; break;
14485 	    case 8: size = "QWORD"; break;
14486 	    case 12: size = "TBYTE"; break;
14487 	    case 16:
14488 	      if (GET_MODE (x) == XFmode)
14489 		size = "TBYTE";
14490               else
14491 		size = "XMMWORD";
14492               break;
14493 	    case 32: size = "YMMWORD"; break;
14494 	    default:
14495 	      gcc_unreachable ();
14496 	    }
14497 
14498 	  /* Check for explicit size override (codes 'b', 'w', 'k',
14499 	     'q' and 'x')  */
14500 	  if (code == 'b')
14501 	    size = "BYTE";
14502 	  else if (code == 'w')
14503 	    size = "WORD";
14504 	  else if (code == 'k')
14505 	    size = "DWORD";
14506 	  else if (code == 'q')
14507 	    size = "QWORD";
14508 	  else if (code == 'x')
14509 	    size = "XMMWORD";
14510 
14511 	  fputs (size, file);
14512 	  fputs (" PTR ", file);
14513 	}
14514 
14515       x = XEXP (x, 0);
14516       /* Avoid (%rip) for call operands.  */
14517       if (CONSTANT_ADDRESS_P (x) && code == 'P'
14518 	  && !CONST_INT_P (x))
14519 	output_addr_const (file, x);
14520       else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14521 	output_operand_lossage ("invalid constraints for operand");
14522       else
14523 	output_address (x);
14524     }
14525 
14526   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14527     {
14528       REAL_VALUE_TYPE r;
14529       long l;
14530 
14531       REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14532       REAL_VALUE_TO_TARGET_SINGLE (r, l);
14533 
14534       if (ASSEMBLER_DIALECT == ASM_ATT)
14535 	putc ('$', file);
14536       /* Sign extend 32bit SFmode immediate to 8 bytes.  */
14537       if (code == 'q')
14538 	fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14539 		 (unsigned long long) (int) l);
14540       else
14541 	fprintf (file, "0x%08x", (unsigned int) l);
14542     }
14543 
14544   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14545     {
14546       REAL_VALUE_TYPE r;
14547       long l[2];
14548 
14549       REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14550       REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14551 
14552       if (ASSEMBLER_DIALECT == ASM_ATT)
14553 	putc ('$', file);
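      /* Print the 64-bit image with the high 32 bits first.  */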
14554       fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14555     }
14556 
14557   /* These float cases don't actually occur as immediate operands.  */
14558   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14559     {
14560       char dstr[30];
14561 
14562       real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14563       fputs (dstr, file);
14564     }
14565 
14566   else
14567     {
14568       /* We have patterns that allow zero sets of memory, for instance.
14569 	 In 64-bit mode, we should probably support all 8-byte vectors,
14570 	 since we can in fact encode that into an immediate.  */
14571       if (GET_CODE (x) == CONST_VECTOR)
14572 	{
14573 	  gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14574 	  x = const0_rtx;
14575 	}
14576 
14577       if (code != 'P' && code != 'p')
14578 	{
14579 	  if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14580 	    {
14581 	      if (ASSEMBLER_DIALECT == ASM_ATT)
14582 		putc ('$', file);
14583 	    }
14584 	  else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14585 		   || GET_CODE (x) == LABEL_REF)
14586 	    {
14587 	      if (ASSEMBLER_DIALECT == ASM_ATT)
14588 		putc ('$', file);
14589 	      else
14590 		fputs ("OFFSET FLAT:", file);
14591 	    }
14592 	}
14593       if (CONST_INT_P (x))
14594 	fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14595       else if (flag_pic || MACHOPIC_INDIRECT)
14596 	output_pic_addr_const (file, x, code);
14597       else
14598 	output_addr_const (file, x);
14599     }
14600 }
14601 
14602 static bool
14603 ix86_print_operand_punct_valid_p (unsigned char code)
14604 {
14605   return (code == '@' || code == '*' || code == '+'
14606 	  || code == '&' || code == ';' || code == '~');
14607 }
14608 
14609 /* Print a memory operand whose address is ADDR.  */
14610 
14611 static void
14612 ix86_print_operand_address (FILE *file, rtx addr)
14613 {
14614   struct ix86_address parts;
14615   rtx base, index, disp;
14616   int scale;
14617   int ok;
14618   bool vsib = false;
14619   int code = 0;
14620 
14621   if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14622     {
14623       ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14624       gcc_assert (parts.index == NULL_RTX);
14625       parts.index = XVECEXP (addr, 0, 1);
14626       parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14627       addr = XVECEXP (addr, 0, 0);
14628       vsib = true;
14629     }
14630   else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14631     {
14632       gcc_assert (TARGET_64BIT);
14633       ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14634       code = 'q';
14635     }
14636   else
14637     ok = ix86_decompose_address (addr, &parts);
14638 
14639   gcc_assert (ok);
14640 
14641   base = parts.base;
14642   index = parts.index;
14643   disp = parts.disp;
14644   scale = parts.scale;
14645 
14646   switch (parts.seg)
14647     {
14648     case SEG_DEFAULT:
14649       break;
14650     case SEG_FS:
14651     case SEG_GS:
14652       if (ASSEMBLER_DIALECT == ASM_ATT)
14653 	putc ('%', file);
14654       fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14655       break;
14656     default:
14657       gcc_unreachable ();
14658     }
14659 
14660   /* Use one byte shorter RIP relative addressing for 64bit mode.  */
14661   if (TARGET_64BIT && !base && !index)
14662     {
14663       rtx symbol = disp;
14664 
14665       if (GET_CODE (disp) == CONST
14666 	  && GET_CODE (XEXP (disp, 0)) == PLUS
14667 	  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14668 	symbol = XEXP (XEXP (disp, 0), 0);
14669 
14670       if (GET_CODE (symbol) == LABEL_REF
14671 	  || (GET_CODE (symbol) == SYMBOL_REF
14672 	      && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14673 	base = pc_rtx;
14674     }
14675   if (!base && !index)
14676     {
14677       /* Displacement only requires special attention.  */
14678       /* A displacement-only address requires special attention.  */
14679       if (CONST_INT_P (disp))
14680 	{
14681 	  if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14682 	    fputs ("ds:", file);
14683 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14684 	}
14685       else if (flag_pic)
14686 	output_pic_addr_const (file, disp, 0);
14687       else
14688 	output_addr_const (file, disp);
14689     }
14690   else
14691     {
14692       /* Print SImode register names to force addr32 prefix.  */
14693       if (SImode_address_operand (addr, VOIDmode))
14694 	{
14695 #ifdef ENABLE_CHECKING
14696 	  gcc_assert (TARGET_64BIT);
14697 	  switch (GET_CODE (addr))
14698 	    {
14699 	    case SUBREG:
14700 	      gcc_assert (GET_MODE (addr) == SImode);
14701 	      gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14702 	      break;
14703 	    case ZERO_EXTEND:
14704 	    case AND:
14705 	      gcc_assert (GET_MODE (addr) == DImode);
14706 	      break;
14707 	    default:
14708 	      gcc_unreachable ();
14709 	    }
14710 #endif
14711 	  gcc_assert (!code);
14712 	  code = 'k';
14713 	}
14714       else if (code == 0
14715 	       && TARGET_X32
14716 	       && disp
14717 	       && CONST_INT_P (disp)
14718 	       && INTVAL (disp) < -16*1024*1024)
14719 	{
14720 	  /* X32 runs in 64-bit mode, where displacement, DISP, in
14721 	     address DISP(%r64), is encoded as 32-bit immediate sign-
14722 	     extended from 32-bit to 64-bit.  For -0x40000300(%r64),
14723 	     address is %r64 + 0xffffffffbffffd00.  When %r64 <
14724 	     0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14725 	     which is invalid for x32.  The correct address is %r64
14726 	     - 0x40000300 == 0xf7ffdd64.  To properly encode
14727 	     -0x40000300(%r64) for x32, we zero-extend negative
14728 	     displacement by forcing addr32 prefix which truncates
14729 	     0xfffffffff7ffdd64 to 0xf7ffdd64.  In theory, we should
14730 	     zero-extend all negative displacements, including -1(%rsp).
14731 	     However, for small negative displacements, sign-extension
14732 	     won't cause overflow.  We only zero-extend negative
14733 	     displacements if they are < -16*1024*1024, which is also used
14734 	     to check legitimate address displacements for PIC.  */
14735 	  code = 'k';
14736 	}
14737 
14738       if (ASSEMBLER_DIALECT == ASM_ATT)
14739 	{
14740 	  if (disp)
14741 	    {
14742 	      if (flag_pic)
14743 		output_pic_addr_const (file, disp, 0);
14744 	      else if (GET_CODE (disp) == LABEL_REF)
14745 		output_asm_label (disp);
14746 	      else
14747 		output_addr_const (file, disp);
14748 	    }
14749 
14750 	  putc ('(', file);
14751 	  if (base)
14752 	    print_reg (base, code, file);
14753 	  if (index)
14754 	    {
14755 	      putc (',', file);
14756 	      print_reg (index, vsib ? 0 : code, file);
14757 	      if (scale != 1 || vsib)
14758 		fprintf (file, ",%d", scale);
14759 	    }
14760 	  putc (')', file);
14761 	}
14762       else
14763 	{
14764 	  rtx offset = NULL_RTX;
14765 
14766 	  if (disp)
14767 	    {
14768 	      /* Pull out the offset of a symbol; print any symbol itself.  */
14769 	      if (GET_CODE (disp) == CONST
14770 		  && GET_CODE (XEXP (disp, 0)) == PLUS
14771 		  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14772 		{
14773 		  offset = XEXP (XEXP (disp, 0), 1);
14774 		  disp = gen_rtx_CONST (VOIDmode,
14775 					XEXP (XEXP (disp, 0), 0));
14776 		}
14777 
14778 	      if (flag_pic)
14779 		output_pic_addr_const (file, disp, 0);
14780 	      else if (GET_CODE (disp) == LABEL_REF)
14781 		output_asm_label (disp);
14782 	      else if (CONST_INT_P (disp))
14783 		offset = disp;
14784 	      else
14785 		output_addr_const (file, disp);
14786 	    }
14787 
14788 	  putc ('[', file);
14789 	  if (base)
14790 	    {
14791 	      print_reg (base, code, file);
14792 	      if (offset)
14793 		{
14794 		  if (INTVAL (offset) >= 0)
14795 		    putc ('+', file);
14796 		  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14797 		}
14798 	    }
14799 	  else if (offset)
14800 	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14801 	  else
14802 	    putc ('0', file);
14803 
14804 	  if (index)
14805 	    {
14806 	      putc ('+', file);
14807 	      print_reg (index, vsib ? 0 : code, file);
14808 	      if (scale != 1 || vsib)
14809 		fprintf (file, "*%d", scale);
14810 	    }
14811 	  putc (']', file);
14812 	}
14813     }
14814 }
14815 
14816 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
14817 
14818 static bool
14819 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14820 {
14821   rtx op;
14822 
14823   if (GET_CODE (x) != UNSPEC)
14824     return false;
14825 
14826   op = XVECEXP (x, 0, 0);
14827   switch (XINT (x, 1))
14828     {
14829     case UNSPEC_GOTTPOFF:
14830       output_addr_const (file, op);
14831       /* FIXME: This might be @TPOFF in Sun ld.  */
14832       fputs ("@gottpoff", file);
14833       break;
14834     case UNSPEC_TPOFF:
14835       output_addr_const (file, op);
14836       fputs ("@tpoff", file);
14837       break;
14838     case UNSPEC_NTPOFF:
14839       output_addr_const (file, op);
14840       if (TARGET_64BIT)
14841 	fputs ("@tpoff", file);
14842       else
14843 	fputs ("@ntpoff", file);
14844       break;
14845     case UNSPEC_DTPOFF:
14846       output_addr_const (file, op);
14847       fputs ("@dtpoff", file);
14848       break;
14849     case UNSPEC_GOTNTPOFF:
14850       output_addr_const (file, op);
14851       if (TARGET_64BIT)
14852 	fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14853 	       "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14854       else
14855 	fputs ("@gotntpoff", file);
14856       break;
14857     case UNSPEC_INDNTPOFF:
14858       output_addr_const (file, op);
14859       fputs ("@indntpoff", file);
14860       break;
14861 #if TARGET_MACHO
14862     case UNSPEC_MACHOPIC_OFFSET:
14863       output_addr_const (file, op);
14864       putc ('-', file);
14865       machopic_output_function_base_name (file);
14866       break;
14867 #endif
14868 
14869     case UNSPEC_STACK_CHECK:
14870       {
14871 	int offset;
14872 
14873 	gcc_assert (flag_split_stack);
14874 
14875 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14876 	offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14877 #else
14878 	gcc_unreachable ();
14879 #endif
14880 
14881 	fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14882       }
14883       break;
14884 
14885     default:
14886       return false;
14887     }
14888 
14889   return true;
14890 }
14891 
14892 /* Split one or more double-mode RTL references into pairs of half-mode
14893    references.  The RTL can be REG, offsettable MEM, integer constant, or
14894    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
14895    split and "num" is its length.  lo_half and hi_half are output arrays
14896    that parallel "operands".  */
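/* For example, splitting a DImode MEM yields two SImode MEMs at offsets
   0 and 4, while a TImode REG yields its two DImode subregs.  */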
14897 
14898 void
14899 split_double_mode (enum machine_mode mode, rtx operands[],
14900 		   int num, rtx lo_half[], rtx hi_half[])
14901 {
14902   enum machine_mode half_mode;
14903   unsigned int byte;
14904 
14905   switch (mode)
14906     {
14907     case TImode:
14908       half_mode = DImode;
14909       break;
14910     case DImode:
14911       half_mode = SImode;
14912       break;
14913     default:
14914       gcc_unreachable ();
14915     }
14916 
14917   byte = GET_MODE_SIZE (half_mode);
14918 
14919   while (num--)
14920     {
14921       rtx op = operands[num];
14922 
14923       /* simplify_subreg refuses to split volatile memory addresses,
14924          but we still have to handle them.  */
14925       if (MEM_P (op))
14926 	{
14927 	  lo_half[num] = adjust_address (op, half_mode, 0);
14928 	  hi_half[num] = adjust_address (op, half_mode, byte);
14929 	}
14930       else
14931 	{
14932 	  lo_half[num] = simplify_gen_subreg (half_mode, op,
14933 					      GET_MODE (op) == VOIDmode
14934 					      ? mode : GET_MODE (op), 0);
14935 	  hi_half[num] = simplify_gen_subreg (half_mode, op,
14936 					      GET_MODE (op) == VOIDmode
14937 					      ? mode : GET_MODE (op), byte);
14938 	}
14939     }
14940 }
14941 
14942 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14943    MINUS, MULT or DIV.  OPERANDS are the insn operands, where operands[3]
14944    is the expression of the binary operation.  The output may either be
14945    emitted here, or returned to the caller, like all output_* functions.
14946 
14947    There is no guarantee that the operands are the same mode, as they
14948    might be within FLOAT or FLOAT_EXTEND expressions.  */
14949 
14950 #ifndef SYSV386_COMPAT
14951 /* Set to 1 for compatibility with brain-damaged assemblers.  No-one
14952    wants to fix the assemblers because that causes incompatibility
14953    with gcc.  No-one wants to fix gcc because that causes
14954    incompatibility with assemblers...  You can use the option of
14955    -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way.  */
14956 #define SYSV386_COMPAT 1
14957 #endif
14958 
14959 const char *
14960 output_387_binary_op (rtx insn, rtx *operands)
14961 {
14962   static char buf[40];
14963   const char *p;
14964   const char *ssep;
14965   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14966 
14967 #ifdef ENABLE_CHECKING
14968   /* Even if we do not want to check the inputs, this documents the input
14969      constraints, which helps in understanding the following code.  */
14970   if (STACK_REG_P (operands[0])
14971       && ((REG_P (operands[1])
14972 	   && REGNO (operands[0]) == REGNO (operands[1])
14973 	   && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14974 	  || (REG_P (operands[2])
14975 	      && REGNO (operands[0]) == REGNO (operands[2])
14976 	      && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14977       && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14978     ; /* ok */
14979   else
14980     gcc_assert (is_sse);
14981 #endif
14982 
14983   switch (GET_CODE (operands[3]))
14984     {
14985     case PLUS:
14986       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14987 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14988 	p = "fiadd";
14989       else
14990 	p = "fadd";
14991       ssep = "vadd";
14992       break;
14993 
14994     case MINUS:
14995       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14996 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14997 	p = "fisub";
14998       else
14999 	p = "fsub";
15000       ssep = "vsub";
15001       break;
15002 
15003     case MULT:
15004       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15005 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15006 	p = "fimul";
15007       else
15008 	p = "fmul";
15009       ssep = "vmul";
15010       break;
15011 
15012     case DIV:
15013       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15014 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15015 	p = "fidiv";
15016       else
15017 	p = "fdiv";
15018       ssep = "vdiv";
15019       break;
15020 
15021     default:
15022       gcc_unreachable ();
15023     }
15024 
15025   if (is_sse)
15026    {
15027      if (TARGET_AVX)
15028        {
15029 	 strcpy (buf, ssep);
15030 	 if (GET_MODE (operands[0]) == SFmode)
15031 	   strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15032 	 else
15033 	   strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15034        }
15035      else
15036        {
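	 /* Without AVX, drop the leading 'v' from the mnemonic.  */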
15037 	 strcpy (buf, ssep + 1);
15038 	 if (GET_MODE (operands[0]) == SFmode)
15039 	   strcat (buf, "ss\t{%2, %0|%0, %2}");
15040 	 else
15041 	   strcat (buf, "sd\t{%2, %0|%0, %2}");
15042        }
15043       return buf;
15044    }
15045   strcpy (buf, p);
15046 
15047   switch (GET_CODE (operands[3]))
15048     {
15049     case MULT:
15050     case PLUS:
15051       if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15052 	{
15053 	  rtx temp = operands[2];
15054 	  operands[2] = operands[1];
15055 	  operands[1] = temp;
15056 	}
15057 
15058       /* We know operands[0] == operands[1].  */
15059 
15060       if (MEM_P (operands[2]))
15061 	{
15062 	  p = "%Z2\t%2";
15063 	  break;
15064 	}
15065 
15066       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15067 	{
15068 	  if (STACK_TOP_P (operands[0]))
15069 	    /* How is it that we are storing to a dead operand[2]?
15070 	       Well, presumably operands[1] is dead too.  We can't
15071 	       store the result to st(0) as st(0) gets popped on this
15072 	       instruction.  Instead store to operands[2] (which I
15073 	       think has to be st(1)).  st(1) will be popped later.
15074 	       gcc <= 2.8.1 didn't have this check and generated
15075 	       assembly code that the Unixware assembler rejected.  */
15076 	    p = "p\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
15077 	  else
15078 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
15079 	  break;
15080 	}
15081 
15082       if (STACK_TOP_P (operands[0]))
15083 	p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
15084       else
15085 	p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
15086       break;
15087 
15088     case MINUS:
15089     case DIV:
15090       if (MEM_P (operands[1]))
15091 	{
15092 	  p = "r%Z1\t%1";
15093 	  break;
15094 	}
15095 
15096       if (MEM_P (operands[2]))
15097 	{
15098 	  p = "%Z2\t%2";
15099 	  break;
15100 	}
15101 
15102       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15103 	{
15104 #if SYSV386_COMPAT
15105 	  /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15106 	     derived assemblers, confusingly reverse the direction of
15107 	     the operation for fsub{r} and fdiv{r} when the
15108 	     destination register is not st(0).  The Intel assembler
15109 	     doesn't have this brain damage.  Read !SYSV386_COMPAT to
15110 	     figure out what the hardware really does.  */
15111 	  if (STACK_TOP_P (operands[0]))
15112 	    p = "{p\t%0, %2|rp\t%2, %0}";
15113 	  else
15114 	    p = "{rp\t%2, %0|p\t%0, %2}";
15115 #else
15116 	  if (STACK_TOP_P (operands[0]))
15117 	    /* As above for fmul/fadd, we can't store to st(0).  */
15118 	    p = "rp\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
15119 	  else
15120 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
15121 #endif
15122 	  break;
15123 	}
15124 
15125       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15126 	{
15127 #if SYSV386_COMPAT
15128 	  if (STACK_TOP_P (operands[0]))
15129 	    p = "{rp\t%0, %1|p\t%1, %0}";
15130 	  else
15131 	    p = "{p\t%1, %0|rp\t%0, %1}";
15132 #else
15133 	  if (STACK_TOP_P (operands[0]))
15134 	    p = "p\t{%0, %1|%1, %0}";	/* st(1) = st(1) op st(0); pop */
15135 	  else
15136 	    p = "rp\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2); pop */
15137 #endif
15138 	  break;
15139 	}
15140 
15141       if (STACK_TOP_P (operands[0]))
15142 	{
15143 	  if (STACK_TOP_P (operands[1]))
15144 	    p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
15145 	  else
15146 	    p = "r\t{%y1, %0|%0, %y1}";	/* st(0) = st(r1) op st(0) */
15147 	  break;
15148 	}
15149       else if (STACK_TOP_P (operands[1]))
15150 	{
15151 #if SYSV386_COMPAT
15152 	  p = "{\t%1, %0|r\t%0, %1}";
15153 #else
15154 	  p = "r\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2) */
15155 #endif
15156 	}
15157       else
15158 	{
15159 #if SYSV386_COMPAT
15160 	  p = "{r\t%2, %0|\t%0, %2}";
15161 #else
15162 	  p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
15163 #endif
15164 	}
15165       break;
15166 
15167     default:
15168       gcc_unreachable ();
15169     }
15170 
15171   strcat (buf, p);
15172   return buf;
15173 }
15174 
15175 /* Return needed mode for entity in optimize_mode_switching pass.  */
15176 
15177 int
15178 ix86_mode_needed (int entity, rtx insn)
15179 {
15180   enum attr_i387_cw mode;
15181 
15182   /* The mode UNINITIALIZED is used to store the control word after a
15183      function call or ASM pattern.  The mode ANY specifies that the function
15184      has no requirements on the control word and makes no changes to the
15185      bits we are interested in.  */
15186 
15187   if (CALL_P (insn)
15188       || (NONJUMP_INSN_P (insn)
15189 	  && (asm_noperands (PATTERN (insn)) >= 0
15190 	      || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15191     return I387_CW_UNINITIALIZED;
15192 
15193   if (recog_memoized (insn) < 0)
15194     return I387_CW_ANY;
15195 
15196   mode = get_attr_i387_cw (insn);
15197 
15198   switch (entity)
15199     {
15200     case I387_TRUNC:
15201       if (mode == I387_CW_TRUNC)
15202 	return mode;
15203       break;
15204 
15205     case I387_FLOOR:
15206       if (mode == I387_CW_FLOOR)
15207 	return mode;
15208       break;
15209 
15210     case I387_CEIL:
15211       if (mode == I387_CW_CEIL)
15212 	return mode;
15213       break;
15214 
15215     case I387_MASK_PM:
15216       if (mode == I387_CW_MASK_PM)
15217 	return mode;
15218       break;
15219 
15220     default:
15221       gcc_unreachable ();
15222     }
15223 
15224   return I387_CW_ANY;
15225 }
15226 
15227 /* Output code to initialize control word copies used by trunc?f?i and
15228    rounding patterns.  MODE selects the rounding mode to set up; the current
15229    control word is saved and a modified copy is stored in the slot for MODE.  */
15230 
15231 void
15232 emit_i387_cw_initialization (int mode)
15233 {
15234   rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15235   rtx new_mode;
15236 
15237   enum ix86_stack_slot slot;
15238 
15239   rtx reg = gen_reg_rtx (HImode);
15240 
15241   emit_insn (gen_x86_fnstcw_1 (stored_mode));
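  /* Save the current control word into SLOT_CW_STORED and copy it into a
     scratch register for modification.  */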
15242   emit_move_insn (reg, copy_rtx (stored_mode));
15243 
15244   if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15245       || optimize_function_for_size_p (cfun))
15246     {
15247       switch (mode)
15248 	{
15249 	case I387_CW_TRUNC:
15250 	  /* round toward zero (truncate) */
15251 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15252 	  slot = SLOT_CW_TRUNC;
15253 	  break;
15254 
15255 	case I387_CW_FLOOR:
15256 	  /* round down toward -oo */
15257 	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15258 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15259 	  slot = SLOT_CW_FLOOR;
15260 	  break;
15261 
15262 	case I387_CW_CEIL:
15263 	  /* round up toward +oo */
15264 	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15265 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15266 	  slot = SLOT_CW_CEIL;
15267 	  break;
15268 
15269 	case I387_CW_MASK_PM:
15270 	  /* mask precision exception for nearbyint() */
15271 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15272 	  slot = SLOT_CW_MASK_PM;
15273 	  break;
15274 
15275 	default:
15276 	  gcc_unreachable ();
15277 	}
15278     }
15279   else
15280     {
15281       switch (mode)
15282 	{
15283 	case I387_CW_TRUNC:
15284 	  /* round toward zero (truncate) */
15285 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15286 	  slot = SLOT_CW_TRUNC;
15287 	  break;
15288 
15289 	case I387_CW_FLOOR:
15290 	  /* round down toward -oo */
15291 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15292 	  slot = SLOT_CW_FLOOR;
15293 	  break;
15294 
15295 	case I387_CW_CEIL:
15296 	  /* round up toward +oo */
15297 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15298 	  slot = SLOT_CW_CEIL;
15299 	  break;
15300 
15301 	case I387_CW_MASK_PM:
15302 	  /* mask precision exception for nearbyint() */
15303 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15304 	  slot = SLOT_CW_MASK_PM;
15305 	  break;
15306 
15307 	default:
15308 	  gcc_unreachable ();
15309 	}
15310     }
15311 
15312   gcc_assert (slot < MAX_386_STACK_LOCALS);
15313 
15314   new_mode = assign_386_stack_local (HImode, slot);
15315   emit_move_insn (new_mode, reg);
15316 }
15317 
15318 /* Output code for INSN to convert a float to a signed int.  OPERANDS
15319    are the insn operands.  The output may be [HSD]Imode and the input
15320    operand may be [SDX]Fmode.  */
15321 
15322 const char *
15323 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15324 {
15325   int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15326   int dimode_p = GET_MODE (operands[0]) == DImode;
15327   int round_mode = get_attr_i387_cw (insn);
15328 
15329   /* Jump through a hoop or two for DImode, since the hardware has no
15330      non-popping instruction.  We used to do this a different way, but
15331      that was somewhat fragile and broke with post-reload splitters.  */
15332   if ((dimode_p || fisttp) && !stack_top_dies)
15333     output_asm_insn ("fld\t%y1", operands);
15334 
15335   gcc_assert (STACK_TOP_P (operands[1]));
15336   gcc_assert (MEM_P (operands[0]));
15337   gcc_assert (GET_MODE (operands[1]) != TFmode);
15338 
15339   if (fisttp)
15340       output_asm_insn ("fisttp%Z0\t%0", operands);
15341   else
15342     {
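      /* Switch to the requested rounding mode (if any), emit the store,
	 then restore the previous control word.  */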
15343       if (round_mode != I387_CW_ANY)
15344 	output_asm_insn ("fldcw\t%3", operands);
15345       if (stack_top_dies || dimode_p)
15346 	output_asm_insn ("fistp%Z0\t%0", operands);
15347       else
15348 	output_asm_insn ("fist%Z0\t%0", operands);
15349       if (round_mode != I387_CW_ANY)
15350 	output_asm_insn ("fldcw\t%2", operands);
15351     }
15352 
15353   return "";
15354 }
15355 
15356 /* Output code for x87 ffreep insn.  The OPNO argument, which may only
15357    have the values zero or one, indicates the ffreep insn's operand
15358    from the OPERANDS array.  */
15359 
15360 static const char *
15361 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15362 {
15363   if (TARGET_USE_FFREEP)
15364 #ifdef HAVE_AS_IX86_FFREEP
15365     return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15366 #else
15367     {
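      /* The assembler has no ffreep mnemonic, so emit the raw opcode
	 bytes (0xdf, 0xc0 + regno) as a short.  */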
15368       static char retval[32];
15369       int regno = REGNO (operands[opno]);
15370 
15371       gcc_assert (FP_REGNO_P (regno));
15372 
15373       regno -= FIRST_STACK_REG;
15374 
15375       snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15376       return retval;
15377     }
15378 #endif
15379 
15380   return opno ? "fstp\t%y1" : "fstp\t%y0";
15381 }
15382 
15383 
15384 /* Output code for INSN to compare OPERANDS.  EFLAGS_P is 1 when fcomi
15385    should be used.  UNORDERED_P is true when fucom should be used.  */
15386 
15387 const char *
15388 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15389 {
15390   int stack_top_dies;
15391   rtx cmp_op0, cmp_op1;
15392   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15393 
15394   if (eflags_p)
15395     {
15396       cmp_op0 = operands[0];
15397       cmp_op1 = operands[1];
15398     }
15399   else
15400     {
15401       cmp_op0 = operands[1];
15402       cmp_op1 = operands[2];
15403     }
15404 
15405   if (is_sse)
15406     {
15407       if (GET_MODE (operands[0]) == SFmode)
15408 	if (unordered_p)
15409 	  return "%vucomiss\t{%1, %0|%0, %1}";
15410 	else
15411 	  return "%vcomiss\t{%1, %0|%0, %1}";
15412       else
15413 	if (unordered_p)
15414 	  return "%vucomisd\t{%1, %0|%0, %1}";
15415 	else
15416 	  return "%vcomisd\t{%1, %0|%0, %1}";
15417     }
15418 
15419   gcc_assert (STACK_TOP_P (cmp_op0));
15420 
15421   stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15422 
15423   if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15424     {
15425       if (stack_top_dies)
15426 	{
15427 	  output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15428 	  return output_387_ffreep (operands, 1);
15429 	}
15430       else
15431 	return "ftst\n\tfnstsw\t%0";
15432     }
15433 
15434   if (STACK_REG_P (cmp_op1)
15435       && stack_top_dies
15436       && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15437       && REGNO (cmp_op1) != FIRST_STACK_REG)
15438     {
15439       /* If both the top of the 387 stack and the other operand (also a
15440 	 stack register) die, then this must be a `fcompp' float
15441 	 compare.  */
15442 
15443       if (eflags_p)
15444 	{
15445 	  /* There is no double popping fcomi variant.  Fortunately,
15446 	     eflags is immune from the fstp's cc clobbering.  */
15447 	  if (unordered_p)
15448 	    output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15449 	  else
15450 	    output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15451 	  return output_387_ffreep (operands, 0);
15452 	}
15453       else
15454 	{
15455 	  if (unordered_p)
15456 	    return "fucompp\n\tfnstsw\t%0";
15457 	  else
15458 	    return "fcompp\n\tfnstsw\t%0";
15459 	}
15460     }
15461   else
15462     {
15463       /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies.  */
15464 
15465       static const char * const alt[16] =
15466       {
15467 	"fcom%Z2\t%y2\n\tfnstsw\t%0",
15468 	"fcomp%Z2\t%y2\n\tfnstsw\t%0",
15469 	"fucom%Z2\t%y2\n\tfnstsw\t%0",
15470 	"fucomp%Z2\t%y2\n\tfnstsw\t%0",
15471 
15472 	"ficom%Z2\t%y2\n\tfnstsw\t%0",
15473 	"ficomp%Z2\t%y2\n\tfnstsw\t%0",
15474 	NULL,
15475 	NULL,
15476 
15477 	"fcomi\t{%y1, %0|%0, %y1}",
15478 	"fcomip\t{%y1, %0|%0, %y1}",
15479 	"fucomi\t{%y1, %0|%0, %y1}",
15480 	"fucomip\t{%y1, %0|%0, %y1}",
15481 
15482 	NULL,
15483 	NULL,
15484 	NULL,
15485 	NULL
15486       };
15487 
15488       int mask;
15489       const char *ret;
15490 
15491       mask  = eflags_p << 3;
15492       mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15493       mask |= unordered_p << 1;
15494       mask |= stack_top_dies;
15495 
15496       gcc_assert (mask < 16);
15497       ret = alt[mask];
15498       gcc_assert (ret);
15499 
15500       return ret;
15501     }
15502 }
15503 
15504 void
15505 ix86_output_addr_vec_elt (FILE *file, int value)
15506 {
15507   const char *directive = ASM_LONG;
15508 
15509 #ifdef ASM_QUAD
15510   if (TARGET_LP64)
15511     directive = ASM_QUAD;
15512 #else
15513   gcc_assert (!TARGET_64BIT);
15514 #endif
15515 
15516   fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15517 }
15518 
15519 void
15520 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15521 {
15522   const char *directive = ASM_LONG;
15523 
15524 #ifdef ASM_QUAD
15525   if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15526     directive = ASM_QUAD;
15527 #else
15528   gcc_assert (!TARGET_64BIT);
15529 #endif
15530   /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand.  */
15531   if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15532     fprintf (file, "%s%s%d-%s%d\n",
15533 	     directive, LPREFIX, value, LPREFIX, rel);
15534   else if (HAVE_AS_GOTOFF_IN_DATA)
15535     fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15536 #if TARGET_MACHO
15537   else if (TARGET_MACHO)
15538     {
15539       fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15540       machopic_output_function_base_name (file);
15541       putc ('\n', file);
15542     }
15543 #endif
15544   else
15545     asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15546 		 GOT_SYMBOL_NAME, LPREFIX, value);
15547 }
15548 
15549 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15550    for the target.  */
15551 
15552 void
15553 ix86_expand_clear (rtx dest)
15554 {
15555   rtx tmp;
15556 
15557   /* We play register width games, which are only valid after reload.  */
15558   gcc_assert (reload_completed);
15559 
15560   /* Avoid HImode and its attendant prefix byte.  */
15561   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15562     dest = gen_rtx_REG (SImode, REGNO (dest));
15563   tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15564 
15565   /* This predicate should match that for movsi_xor and movdi_xor_rex64.  */
15566   if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15567     {
15568       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15569       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15570     }
15571 
15572   emit_insn (tmp);
15573 }
15574 
15575 /* X is an unchanging MEM.  If it is a constant pool reference, return
15576    the constant pool rtx, else NULL.  */
15577 
15578 rtx
15579 maybe_get_pool_constant (rtx x)
15580 {
15581   x = ix86_delegitimize_address (XEXP (x, 0));
15582 
15583   if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15584     return get_pool_constant (x);
15585 
15586   return NULL_RTX;
15587 }
15588 
15589 void
15590 ix86_expand_move (enum machine_mode mode, rtx operands[])
15591 {
15592   rtx op0, op1;
15593   enum tls_model model;
15594 
15595   op0 = operands[0];
15596   op1 = operands[1];
15597 
15598   if (GET_CODE (op1) == SYMBOL_REF)
15599     {
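      /* Legitimize TLS and dllimport symbol references before moving.  */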
15600       model = SYMBOL_REF_TLS_MODEL (op1);
15601       if (model)
15602 	{
15603 	  op1 = legitimize_tls_address (op1, model, true);
15604 	  op1 = force_operand (op1, op0);
15605 	  if (op1 == op0)
15606 	    return;
15607 	  if (GET_MODE (op1) != mode)
15608 	    op1 = convert_to_mode (mode, op1, 1);
15609 	}
15610       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15611 	       && SYMBOL_REF_DLLIMPORT_P (op1))
15612 	op1 = legitimize_dllimport_symbol (op1, false);
15613     }
15614   else if (GET_CODE (op1) == CONST
15615 	   && GET_CODE (XEXP (op1, 0)) == PLUS
15616 	   && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15617     {
15618       rtx addend = XEXP (XEXP (op1, 0), 1);
15619       rtx symbol = XEXP (XEXP (op1, 0), 0);
15620       rtx tmp = NULL;
15621 
15622       model = SYMBOL_REF_TLS_MODEL (symbol);
15623       if (model)
15624 	tmp = legitimize_tls_address (symbol, model, true);
15625       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15626 	       && SYMBOL_REF_DLLIMPORT_P (symbol))
15627 	tmp = legitimize_dllimport_symbol (symbol, true);
15628 
15629       if (tmp)
15630 	{
15631 	  tmp = force_operand (tmp, NULL);
15632 	  tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15633 				     op0, 1, OPTAB_DIRECT);
15634 	  if (tmp == op0)
15635 	    return;
15636 	  op1 = convert_to_mode (mode, tmp, 1);
15637 	}
15638     }
15639 
15640   if ((flag_pic || MACHOPIC_INDIRECT)
15641       && symbolic_operand (op1, mode))
15642     {
15643       if (TARGET_MACHO && !TARGET_64BIT)
15644 	{
15645 #if TARGET_MACHO
15646 	  /* dynamic-no-pic */
15647 	  if (MACHOPIC_INDIRECT)
15648 	    {
15649 	      rtx temp = ((reload_in_progress
15650 			   || ((op0 && REG_P (op0))
15651 			       && mode == Pmode))
15652 			  ? op0 : gen_reg_rtx (Pmode));
15653 	      op1 = machopic_indirect_data_reference (op1, temp);
15654 	      if (MACHOPIC_PURE)
15655 		op1 = machopic_legitimize_pic_address (op1, mode,
15656 						       temp == op1 ? 0 : temp);
15657 	    }
15658 	  if (op0 != op1 && GET_CODE (op0) != MEM)
15659 	    {
15660 	      rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15661 	      emit_insn (insn);
15662 	      return;
15663 	    }
15664 	  if (GET_CODE (op0) == MEM)
15665 	    op1 = force_reg (Pmode, op1);
15666 	  else
15667 	    {
15668 	      rtx temp = op0;
15669 	      if (GET_CODE (temp) != REG)
15670 		temp = gen_reg_rtx (Pmode);
15671 	      temp = legitimize_pic_address (op1, temp);
15672 	      if (temp == op0)
15673 	    return;
15674 	      op1 = temp;
15675 	    }
15676       /* dynamic-no-pic */
15677 #endif
15678 	}
15679       else
15680 	{
15681 	  if (MEM_P (op0))
15682 	    op1 = force_reg (mode, op1);
15683 	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15684 	    {
15685 	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15686 	      op1 = legitimize_pic_address (op1, reg);
15687 	      if (op0 == op1)
15688 		return;
15689 	      if (GET_MODE (op1) != mode)
15690 		op1 = convert_to_mode (mode, op1, 1);
15691 	    }
15692 	}
15693     }
15694   else
15695     {
15696       if (MEM_P (op0)
15697 	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15698 	      || !push_operand (op0, mode))
15699 	  && MEM_P (op1))
15700 	op1 = force_reg (mode, op1);
15701 
15702       if (push_operand (op0, mode)
15703 	  && ! general_no_elim_operand (op1, mode))
15704 	op1 = copy_to_mode_reg (mode, op1);
15705 
15706       /* Force large constants in 64-bit compilation into a register
15707 	 so they get CSEd.  */
15708       if (can_create_pseudo_p ()
15709 	  && (mode == DImode) && TARGET_64BIT
15710 	  && immediate_operand (op1, mode)
15711 	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
15712 	  && !register_operand (op0, mode)
15713 	  && optimize)
15714 	op1 = copy_to_mode_reg (mode, op1);
15715 
15716       if (can_create_pseudo_p ()
15717 	  && FLOAT_MODE_P (mode)
15718 	  && GET_CODE (op1) == CONST_DOUBLE)
15719 	{
15720 	  /* If we are loading a floating point constant to a register,
15721 	     force the value to memory now, since we'll get better code
15722 	     out the back end.  */
15723 
15724 	  op1 = validize_mem (force_const_mem (mode, op1));
15725 	  if (!register_operand (op0, mode))
15726 	    {
15727 	      rtx temp = gen_reg_rtx (mode);
15728 	      emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15729 	      emit_move_insn (op0, temp);
15730 	      return;
15731 	    }
15732 	}
15733     }
15734 
15735   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15736 }
15737 
15738 void
15739 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15740 {
15741   rtx op0 = operands[0], op1 = operands[1];
15742   unsigned int align = GET_MODE_ALIGNMENT (mode);
15743 
15744   /* Force constants other than zero into memory.  We do not know how
15745      the instructions used to build constants modify the upper 64 bits
15746      of the register; once we have that information we may be able
15747      to handle some of them more efficiently.  */
15748   if (can_create_pseudo_p ()
15749       && register_operand (op0, mode)
15750       && (CONSTANT_P (op1)
15751 	  || (GET_CODE (op1) == SUBREG
15752 	      && CONSTANT_P (SUBREG_REG (op1))))
15753       && !standard_sse_constant_p (op1))
15754     op1 = validize_mem (force_const_mem (mode, op1));
15755 
15756   /* We need to check memory alignment for SSE modes since attributes
15757      can make operands unaligned.  */
15758   if (can_create_pseudo_p ()
15759       && SSE_REG_MODE_P (mode)
15760       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15761 	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15762     {
15763       rtx tmp[2];
15764 
15765       /* ix86_expand_vector_move_misalign() does not like constants ... */
15766       if (CONSTANT_P (op1)
15767 	  || (GET_CODE (op1) == SUBREG
15768 	      && CONSTANT_P (SUBREG_REG (op1))))
15769 	op1 = validize_mem (force_const_mem (mode, op1));
15770 
15771       /* ... nor both arguments in memory.  */
15772       if (!register_operand (op0, mode)
15773 	  && !register_operand (op1, mode))
15774 	op1 = force_reg (mode, op1);
15775 
15776       tmp[0] = op0; tmp[1] = op1;
15777       ix86_expand_vector_move_misalign (mode, tmp);
15778       return;
15779     }
15780 
15781   /* If neither operand is already a register, force operand1 into one.  */
15782   if (can_create_pseudo_p ()
15783       && !register_operand (op0, mode)
15784       && !register_operand (op1, mode))
15785     {
15786       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15787       return;
15788     }
15789 
15790   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15791 }
15792 
15793 /* Split 32-byte AVX unaligned load and store if needed.  */
15794 
15795 static void
15796 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15797 {
15798   rtx m;
15799   rtx (*extract) (rtx, rtx, rtx);
15800   rtx (*load_unaligned) (rtx, rtx);
15801   rtx (*store_unaligned) (rtx, rtx);
15802   enum machine_mode mode;
15803 
15804   switch (GET_MODE (op0))
15805     {
15806     default:
15807       gcc_unreachable ();
15808     case V32QImode:
15809       extract = gen_avx_vextractf128v32qi;
15810       load_unaligned = gen_avx_loaddqu256;
15811       store_unaligned = gen_avx_storedqu256;
15812       mode = V16QImode;
15813       break;
15814     case V8SFmode:
15815       extract = gen_avx_vextractf128v8sf;
15816       load_unaligned = gen_avx_loadups256;
15817       store_unaligned = gen_avx_storeups256;
15818       mode = V4SFmode;
15819       break;
15820     case V4DFmode:
15821       extract = gen_avx_vextractf128v4df;
15822       load_unaligned = gen_avx_loadupd256;
15823       store_unaligned = gen_avx_storeupd256;
15824       mode = V2DFmode;
15825       break;
15826     }
15827 
15828   if (MEM_P (op1))
15829     {
15830       if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15831 	{
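	  /* Load the two 16-byte halves separately and concatenate them
	     into the 256-bit destination.  */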
15832 	  rtx r = gen_reg_rtx (mode);
15833 	  m = adjust_address (op1, mode, 0);
15834 	  emit_move_insn (r, m);
15835 	  m = adjust_address (op1, mode, 16);
15836 	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15837 	  emit_move_insn (op0, r);
15838 	}
15839       else
15840 	emit_insn (load_unaligned (op0, op1));
15841     }
15842   else if (MEM_P (op0))
15843     {
15844       if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15845 	{
15846 	  m = adjust_address (op0, mode, 0);
15847 	  emit_insn (extract (m, op1, const0_rtx));
15848 	  m = adjust_address (op0, mode, 16);
15849 	  emit_insn (extract (m, op1, const1_rtx));
15850 	}
15851       else
15852 	emit_insn (store_unaligned (op0, op1));
15853     }
15854   else
15855     gcc_unreachable ();
15856 }
15857 
15858 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
15859    straight to ix86_expand_vector_move.  */
15860 /* Code generation for scalar reg-reg moves of single and double precision data:
15861      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15862        movaps reg, reg
15863      else
15864        movss reg, reg
15865      if (x86_sse_partial_reg_dependency == true)
15866        movapd reg, reg
15867      else
15868        movsd reg, reg
15869 
15870    Code generation for scalar loads of double precision data:
15871      if (x86_sse_split_regs == true)
15872        movlpd mem, reg      (gas syntax)
15873      else
15874        movsd mem, reg
15875 
15876    Code generation for unaligned packed loads of single precision data
15877    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15878      if (x86_sse_unaligned_move_optimal)
15879        movups mem, reg
15880 
15881      if (x86_sse_partial_reg_dependency == true)
15882        {
15883          xorps  reg, reg
15884          movlps mem, reg
15885          movhps mem+8, reg
15886        }
15887      else
15888        {
15889          movlps mem, reg
15890          movhps mem+8, reg
15891        }
15892 
15893    Code generation for unaligned packed loads of double precision data
15894    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15895      if (x86_sse_unaligned_move_optimal)
15896        movupd mem, reg
15897 
15898      if (x86_sse_split_regs == true)
15899        {
15900          movlpd mem, reg
15901          movhpd mem+8, reg
15902        }
15903      else
15904        {
15905          movsd  mem, reg
15906          movhpd mem+8, reg
15907        }
15908  */
15909 
15910 void
15911 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15912 {
15913   rtx op0, op1, m;
15914   rtx (*move_unaligned) (rtx, rtx);
15915 
15916   op0 = operands[0];
15917   op1 = operands[1];
15918 
15919   if (TARGET_AVX)
15920     {
15921       switch (GET_MODE_CLASS (mode))
15922 	{
15923 	case MODE_VECTOR_INT:
15924 	case MODE_INT:
15925 	  switch (GET_MODE_SIZE (mode))
15926 	    {
15927 	    case 16:
15928 	      /*  If we're optimizing for size, movups is the smallest.  */
15929 	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15930 		{
15931 		  if (MEM_P (op1))
15932 		    move_unaligned = gen_sse_loadups;
15933 		  else if (MEM_P (op0))
15934 		    move_unaligned = gen_sse_storeups;
15935 		  else
15936 		    gcc_unreachable ();
15937 
15938 		  op0 = gen_lowpart (V4SFmode, op0);
15939 		  op1 = gen_lowpart (V4SFmode, op1);
15940 		  emit_insn (move_unaligned (op0, op1));
15941 		  return;
15942 		}
15943 	      if (MEM_P (op1))
15944 		move_unaligned = gen_sse2_loaddqu;
15945 	      else if (MEM_P (op0))
15946 		move_unaligned = gen_sse2_storedqu;
15947 	      else
15948 		gcc_unreachable ();
15949 
15950 	      op0 = gen_lowpart (V16QImode, op0);
15951 	      op1 = gen_lowpart (V16QImode, op1);
15952 	      emit_insn (move_unaligned (op0, op1));
15953 	      break;
15954 	    case 32:
15955 	      op0 = gen_lowpart (V32QImode, op0);
15956 	      op1 = gen_lowpart (V32QImode, op1);
15957 	      ix86_avx256_split_vector_move_misalign (op0, op1);
15958 	      break;
15959 	    default:
15960 	      gcc_unreachable ();
15961 	    }
15962 	  break;
15963 	case MODE_VECTOR_FLOAT:
15964 	  op0 = gen_lowpart (mode, op0);
15965 	  op1 = gen_lowpart (mode, op1);
15966 
15967 	  switch (mode)
15968 	    {
15969 	    case V4SFmode:
15970 	      if (MEM_P (op1))
15971 		move_unaligned = gen_sse_loadups;
15972 	      else if (MEM_P (op0))
15973 		move_unaligned = gen_sse_storeups;
15974 	      else
15975 		gcc_unreachable ();
15976 
15977 	      emit_insn (move_unaligned (op0, op1));
15978 	      break;
15979 	    case V8SFmode:
15980 	      ix86_avx256_split_vector_move_misalign (op0, op1);
15981 	      break;
15982 	    case V2DFmode:
15983 	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15984 		{
15985 		  if (MEM_P (op1))
15986 		    move_unaligned = gen_sse_loadups;
15987 		  else if (MEM_P (op0))
15988 		    move_unaligned = gen_sse_storeups;
15989 		  else
15990 		    gcc_unreachable ();
15991 
15992 		  op0 = gen_lowpart (V4SFmode, op0);
15993 		  op1 = gen_lowpart (V4SFmode, op1);
15994 		  emit_insn (move_unaligned (op0, op1));
15995 		  return;
15996 		}
15997 	      if (MEM_P (op1))
15998 		move_unaligned = gen_sse2_loadupd;
15999 	      else if (MEM_P (op0))
16000 		move_unaligned = gen_sse2_storeupd;
16001 	      else
16002 		gcc_unreachable ();
16003 
16004 	      emit_insn (move_unaligned (op0, op1));
16005 	      break;
16006 	    case V4DFmode:
16007 	      ix86_avx256_split_vector_move_misalign (op0, op1);
16008 	      break;
16009 	    default:
16010 	      gcc_unreachable ();
16011 	    }
16012 	  break;
16013 
16014 	default:
16015 	  gcc_unreachable ();
16016 	}
16017 
16018       return;
16019     }
16020 
16021   if (MEM_P (op1))
16022     {
16023       /* If we're optimizing for size, movups is the smallest.  */
16024       if (optimize_insn_for_size_p ()
16025 	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
16026 	{
16027 	  op0 = gen_lowpart (V4SFmode, op0);
16028 	  op1 = gen_lowpart (V4SFmode, op1);
16029 	  emit_insn (gen_sse_loadups (op0, op1));
16030 	  return;
16031 	}
16032 
16033       /* ??? If we have typed data, then it would appear that using
16034 	 movdqu is the only way to get unaligned data loaded with
16035 	 integer type.  */
16036       if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16037 	{
16038 	  op0 = gen_lowpart (V16QImode, op0);
16039 	  op1 = gen_lowpart (V16QImode, op1);
16040 	  emit_insn (gen_sse2_loaddqu (op0, op1));
16041 	  return;
16042 	}
16043 
16044       if (TARGET_SSE2 && mode == V2DFmode)
16045         {
16046           rtx zero;
16047 
16048 	  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
16049 	    {
16050 	      op0 = gen_lowpart (V2DFmode, op0);
16051 	      op1 = gen_lowpart (V2DFmode, op1);
16052 	      emit_insn (gen_sse2_loadupd (op0, op1));
16053 	      return;
16054 	    }
16055 
16056 	  /* When SSE registers are split into halves, we can avoid
16057 	     writing to the top half twice.  */
16058 	  if (TARGET_SSE_SPLIT_REGS)
16059 	    {
16060 	      emit_clobber (op0);
16061 	      zero = op0;
16062 	    }
16063 	  else
16064 	    {
16065 	      /* ??? Not sure about the best option for the Intel chips.
16066 		 The following would seem to satisfy; the register is
16067 		 entirely cleared, breaking the dependency chain.  We
16068 		 then store to the upper half, with a dependency depth
16069 		 of one.  A rumor has it that Intel recommends two movsd
16070 		 followed by an unpacklpd, but this is unconfirmed.  And
16071 		 given that the dependency depth of the unpacklpd would
16072 		 still be one, I'm not sure why this would be better.  */
16073 	      zero = CONST0_RTX (V2DFmode);
16074 	    }
16075 
16076 	  m = adjust_address (op1, DFmode, 0);
16077 	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
16078 	  m = adjust_address (op1, DFmode, 8);
16079 	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
16080 	}
16081       else
16082         {
16083 	  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
16084 	    {
16085 	      op0 = gen_lowpart (V4SFmode, op0);
16086 	      op1 = gen_lowpart (V4SFmode, op1);
16087 	      emit_insn (gen_sse_loadups (op0, op1));
16088 	      return;
16089             }
16090 
16091 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16092 	    emit_move_insn (op0, CONST0_RTX (mode));
16093 	  else
16094 	    emit_clobber (op0);
16095 
16096 	  if (mode != V4SFmode)
16097 	    op0 = gen_lowpart (V4SFmode, op0);
16098 	  m = adjust_address (op1, V2SFmode, 0);
16099 	  emit_insn (gen_sse_loadlps (op0, op0, m));
16100 	  m = adjust_address (op1, V2SFmode, 8);
16101 	  emit_insn (gen_sse_loadhps (op0, op0, m));
16102 	}
16103     }
16104   else if (MEM_P (op0))
16105     {
16106       /* If we're optimizing for size, movups is the smallest.  */
16107       if (optimize_insn_for_size_p ()
16108 	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
16109 	{
16110 	  op0 = gen_lowpart (V4SFmode, op0);
16111 	  op1 = gen_lowpart (V4SFmode, op1);
16112 	  emit_insn (gen_sse_storeups (op0, op1));
16113 	  return;
16114 	}
16115 
16116       /* ??? Similar to the load case above, only less clear because of
16117 	 "typeless" stores.  */
16118       if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
16119 	  && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16120         {
16121 	  op0 = gen_lowpart (V16QImode, op0);
16122 	  op1 = gen_lowpart (V16QImode, op1);
16123 	  emit_insn (gen_sse2_storedqu (op0, op1));
16124 	  return;
16125 	}
16126 
16127       if (TARGET_SSE2 && mode == V2DFmode)
16128 	{
16129 	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
16130 	    {
16131 	      op0 = gen_lowpart (V2DFmode, op0);
16132 	      op1 = gen_lowpart (V2DFmode, op1);
16133 	      emit_insn (gen_sse2_storeupd (op0, op1));
16134 	    }
16135 	  else
16136 	    {
16137 	      m = adjust_address (op0, DFmode, 0);
16138 	      emit_insn (gen_sse2_storelpd (m, op1));
16139 	      m = adjust_address (op0, DFmode, 8);
16140 	      emit_insn (gen_sse2_storehpd (m, op1));
16141 	    }
16142 	}
16143       else
16144 	{
16145 	  if (mode != V4SFmode)
16146 	    op1 = gen_lowpart (V4SFmode, op1);
16147 
16148 	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
16149 	    {
16150 	      op0 = gen_lowpart (V4SFmode, op0);
16151 	      emit_insn (gen_sse_storeups (op0, op1));
16152 	    }
16153 	  else
16154 	    {
16155 	      m = adjust_address (op0, V2SFmode, 0);
16156 	      emit_insn (gen_sse_storelps (m, op1));
16157 	      m = adjust_address (op0, V2SFmode, 8);
16158 	      emit_insn (gen_sse_storehps (m, op1));
16159 	    }
16160 	}
16161     }
16162   else
16163     gcc_unreachable ();
16164 }
16165 
16166 /* Expand a push in MODE.  This is some mode for which we do not support
16167    proper push instructions, at least from the registers that we expect
16168    the value to live in.  */
16169 
16170 void
16171 ix86_expand_push (enum machine_mode mode, rtx x)
16172 {
16173   rtx tmp;
16174 
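  /* Decrement the stack pointer by the size of MODE by hand, since we
     cannot use a real push instruction for this mode.  */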
16175   tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16176 			     GEN_INT (-GET_MODE_SIZE (mode)),
16177 			     stack_pointer_rtx, 1, OPTAB_DIRECT);
16178   if (tmp != stack_pointer_rtx)
16179     emit_move_insn (stack_pointer_rtx, tmp);
16180 
16181   tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16182 
16183   /* When we push an operand onto stack, it has to be aligned at least
16184      at the function argument boundary.  However since we don't have
16185      the argument type, we can't determine the actual argument
16186      boundary.  */
16187   emit_move_insn (tmp, x);
16188 }
16189 
16190 /* Helper function of ix86_fixup_binary_operands to canonicalize
16191    operand order.  Returns true if the operands should be swapped.  */
16192 
16193 static bool
16194 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16195 			     rtx operands[])
16196 {
16197   rtx dst = operands[0];
16198   rtx src1 = operands[1];
16199   rtx src2 = operands[2];
16200 
16201   /* If the operation is not commutative, we can't do anything.  */
16202   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16203     return false;
16204 
16205   /* Highest priority is that src1 should match dst.  */
16206   if (rtx_equal_p (dst, src1))
16207     return false;
16208   if (rtx_equal_p (dst, src2))
16209     return true;
16210 
16211   /* Next highest priority is that immediate constants come second.  */
16212   if (immediate_operand (src2, mode))
16213     return false;
16214   if (immediate_operand (src1, mode))
16215     return true;
16216 
16217   /* Lowest priority is that memory references should come second.  */
16218   if (MEM_P (src2))
16219     return false;
16220   if (MEM_P (src1))
16221     return true;
16222 
16223   return false;
16224 }
16225 
16226 
16227 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
16228    destination to use for the operation.  If different from the true
16229    destination in operands[0], a copy operation will be required.  */
16230 
16231 rtx
16232 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16233 			    rtx operands[])
16234 {
16235   rtx dst = operands[0];
16236   rtx src1 = operands[1];
16237   rtx src2 = operands[2];
16238 
16239   /* Canonicalize operand order.  */
16240   if (ix86_swap_binary_operands_p (code, mode, operands))
16241     {
16242       rtx temp;
16243 
16244       /* It is invalid to swap operands of different modes.  */
16245       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16246 
16247       temp = src1;
16248       src1 = src2;
16249       src2 = temp;
16250     }
16251 
16252   /* Both source operands cannot be in memory.  */
16253   if (MEM_P (src1) && MEM_P (src2))
16254     {
16255       /* Optimization: Only read from memory once.  */
16256       if (rtx_equal_p (src1, src2))
16257 	{
16258 	  src2 = force_reg (mode, src2);
16259 	  src1 = src2;
16260 	}
16261       else
16262 	src2 = force_reg (mode, src2);
16263     }
16264 
16265   /* If the destination is memory, and we do not have matching source
16266      operands, do things in registers.  */
16267   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16268     dst = gen_reg_rtx (mode);
16269 
16270   /* Source 1 cannot be a constant.  */
16271   if (CONSTANT_P (src1))
16272     src1 = force_reg (mode, src1);
16273 
16274   /* Source 1 cannot be a non-matching memory.  */
16275   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16276     src1 = force_reg (mode, src1);
16277 
16278   /* Improve address combine.  */
16279   if (code == PLUS
16280       && GET_MODE_CLASS (mode) == MODE_INT
16281       && MEM_P (src2))
16282     src2 = force_reg (mode, src2);
16283 
16284   operands[1] = src1;
16285   operands[2] = src2;
16286   return dst;
16287 }
16288 
16289 /* Similarly, but assume that the destination has already been
16290    set up properly.  */
16291 
16292 void
16293 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16294 				    enum machine_mode mode, rtx operands[])
16295 {
16296   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16297   gcc_assert (dst == operands[0]);
16298 }
16299 
16300 /* Attempt to expand a binary operator.  Make the expansion closer to the
16301    actual machine than just general_operand, which would allow 3 separate
16302    memory references (one output, two input) in a single insn.  */
16303 
16304 void
16305 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16306 			     rtx operands[])
16307 {
16308   rtx src1, src2, dst, op, clob;
16309 
16310   dst = ix86_fixup_binary_operands (code, mode, operands);
16311   src1 = operands[1];
16312   src2 = operands[2];
16313 
16314   /* Emit the instruction.  */
16315 
16316   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16317   if (reload_in_progress)
16318     {
16319       /* Reload doesn't know about the flags register, and doesn't know that
16320          it doesn't want to clobber it.  We can only do this with PLUS.  */
16321       gcc_assert (code == PLUS);
16322       emit_insn (op);
16323     }
16324   else if (reload_completed
16325 	   && code == PLUS
16326 	   && !rtx_equal_p (dst, src1))
16327     {
16328       /* This is going to be an LEA; avoid splitting it later.  */
16329       emit_insn (op);
16330     }
16331   else
16332     {
16333       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16334       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16335     }
16336 
16337   /* Fix up the destination if needed.  */
16338   if (dst != operands[0])
16339     emit_move_insn (operands[0], dst);
16340 }
16341 
16342 /* Return TRUE or FALSE depending on whether the binary operator meets the
16343    appropriate constraints.  */
16344 
16345 bool
16346 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16347 			 rtx operands[3])
16348 {
16349   rtx dst = operands[0];
16350   rtx src1 = operands[1];
16351   rtx src2 = operands[2];
16352 
16353   /* Both source operands cannot be in memory.  */
16354   if (MEM_P (src1) && MEM_P (src2))
16355     return false;
16356 
16357   /* Canonicalize operand order for commutative operators.  */
16358   if (ix86_swap_binary_operands_p (code, mode, operands))
16359     {
16360       rtx temp = src1;
16361       src1 = src2;
16362       src2 = temp;
16363     }
16364 
16365   /* If the destination is memory, we must have a matching source operand.  */
16366   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16367       return false;
16368 
16369   /* Source 1 cannot be a constant.  */
16370   if (CONSTANT_P (src1))
16371     return false;
16372 
16373   /* Source 1 cannot be a non-matching memory.  */
16374   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16375     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
16376     return (code == AND
16377 	    && (mode == HImode
16378 		|| mode == SImode
16379 		|| (TARGET_64BIT && mode == DImode))
16380 	    && satisfies_constraint_L (src2));
16381 
16382   return true;
16383 }
16384 
16385 /* Attempt to expand a unary operator.  Make the expansion closer to the
16386    actual machine than just general_operand, which would allow 2 separate
16387    memory references (one output, one input) in a single insn.  */
16388 
16389 void
16390 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16391 			    rtx operands[])
16392 {
16393   int matching_memory;
16394   rtx src, dst, op, clob;
16395 
16396   dst = operands[0];
16397   src = operands[1];
16398 
16399   /* If the destination is memory, and we do not have matching source
16400      operands, do things in registers.  */
16401   matching_memory = 0;
16402   if (MEM_P (dst))
16403     {
16404       if (rtx_equal_p (dst, src))
16405 	matching_memory = 1;
16406       else
16407 	dst = gen_reg_rtx (mode);
16408     }
16409 
16410   /* When source operand is memory, destination must match.  */
16411   if (MEM_P (src) && !matching_memory)
16412     src = force_reg (mode, src);
16413 
16414   /* Emit the instruction.  */
16415 
16416   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16417   if (reload_in_progress || code == NOT)
16418     {
16419       /* Reload doesn't know about the flags register, and doesn't know that
16420          it doesn't want to clobber it.  */
16421       gcc_assert (code == NOT);
16422       emit_insn (op);
16423     }
16424   else
16425     {
16426       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16427       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16428     }
16429 
16430   /* Fix up the destination if needed.  */
16431   if (dst != operands[0])
16432     emit_move_insn (operands[0], dst);
16433 }
16434 
16435 /* Split a 32bit/64bit divmod, using an 8bit unsigned divmod when the
16436    dividend and the divisor are both within the range [0, 255].  */
16437 
16438 void
16439 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16440 		    bool signed_p)
16441 {
16442   rtx end_label, qimode_label;
16443   rtx insn, div, mod;
16444   rtx scratch, tmp0, tmp1, tmp2;
16445   rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16446   rtx (*gen_zero_extend) (rtx, rtx);
16447   rtx (*gen_test_ccno_1) (rtx, rtx);
16448 
16449   switch (mode)
16450     {
16451     case SImode:
16452       gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16453       gen_test_ccno_1 = gen_testsi_ccno_1;
16454       gen_zero_extend = gen_zero_extendqisi2;
16455       break;
16456     case DImode:
16457       gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16458       gen_test_ccno_1 = gen_testdi_ccno_1;
16459       gen_zero_extend = gen_zero_extendqidi2;
16460       break;
16461     default:
16462       gcc_unreachable ();
16463     }
16464 
16465   end_label = gen_label_rtx ();
16466   qimode_label = gen_label_rtx ();
16467 
16468   scratch = gen_reg_rtx (mode);
16469 
16470   /* Use 8bit unsigned divmod if the dividend and divisor are within
16471      the range [0, 255].  */
16472   emit_move_insn (scratch, operands[2]);
16473   scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16474 				 scratch, 1, OPTAB_DIRECT);
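  /* The IOR of the dividend and the divisor has bits set above bit 7
     iff either value is outside [0, 255]; testing it against -0x100
     (~0xFF) checks exactly those bits, so the EQ branch below takes
     the 8bit path only when both values fit in a byte.  */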
16475   emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16476   tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16477   tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16478   tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16479 			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16480 			       pc_rtx);
16481   insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16482   predict_jump (REG_BR_PROB_BASE * 50 / 100);
16483   JUMP_LABEL (insn) = qimode_label;
16484 
16485   /* Generate the original signed/unsigned divmod.  */
16486   div = gen_divmod4_1 (operands[0], operands[1],
16487 		       operands[2], operands[3]);
16488   emit_insn (div);
16489 
16490   /* Branch to the end.  */
16491   emit_jump_insn (gen_jump (end_label));
16492   emit_barrier ();
16493 
16494   /* Generate 8bit unsigned divide.  */
16495   emit_label (qimode_label);
16496   /* Don't use operands[0] for result of 8bit divide since not all
16497      registers support QImode ZERO_EXTRACT.  */
16498   tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16499   tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16500   tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16501   emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
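  /* TMP0 now holds the 8bit quotient in its low byte and the 8bit
     remainder in its high byte, mirroring the hardware DIV instruction,
     which leaves the quotient in AL and the remainder in AH.  */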
16502 
16503   if (signed_p)
16504     {
16505       div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16506       mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16507     }
16508   else
16509     {
16510       div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16511       mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16512     }
16513 
16514   /* Extract remainder from AH.  */
16515   tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16516   if (REG_P (operands[1]))
16517     insn = emit_move_insn (operands[1], tmp1);
16518   else
16519     {
16520       /* Need a new scratch register since the old one has result
16521 	 of 8bit divide.  */
16522       scratch = gen_reg_rtx (mode);
16523       emit_move_insn (scratch, tmp1);
16524       insn = emit_move_insn (operands[1], scratch);
16525     }
16526   set_unique_reg_note (insn, REG_EQUAL, mod);
16527 
16528   /* Zero extend quotient from AL.  */
16529   tmp1 = gen_lowpart (QImode, tmp0);
16530   insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16531   set_unique_reg_note (insn, REG_EQUAL, div);
16532 
16533   emit_label (end_label);
16534 }
16535 
16536 #define LEA_MAX_STALL (3)
16537 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
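/* The distances computed below are measured in half-cycles, so the
   search threshold of LEA_MAX_STALL << 1 half-cycles corresponds to
   LEA_MAX_STALL full cycles.  */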
16538 
16539 /* Increase the given DISTANCE in half-cycles according to
16540    dependencies between the PREV and NEXT instructions.
16541    Add 1 half-cycle if there is no dependency, and advance
16542    to the next full cycle if there is a dependency.  */
16543 
16544 static unsigned int
16545 increase_distance (rtx prev, rtx next, unsigned int distance)
16546 {
16547   df_ref *use_rec;
16548   df_ref *def_rec;
16549 
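  /* DISTANCE + (DISTANCE & 1) + 2 rounds DISTANCE up to a whole cycle
     (distances are counted in half-cycles) and then adds one more full
     cycle; DISTANCE + 1 simply adds another half-cycle.  */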
16550   if (!prev || !next)
16551     return distance + (distance & 1) + 2;
16552 
16553   if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16554     return distance + 1;
16555 
16556   for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16557     for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16558       if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16559 	  && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16560 	return distance + (distance & 1) + 2;
16561 
16562   return distance + 1;
16563 }
16564 
16565 /* Return true if instruction INSN defines register number
16566    REGNO1 or REGNO2.  */
16567 
16568 static bool
16569 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16570 		  rtx insn)
16571 {
16572   df_ref *def_rec;
16573 
16574   for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16575     if (DF_REF_REG_DEF_P (*def_rec)
16576 	&& !DF_REF_IS_ARTIFICIAL (*def_rec)
16577 	&& (regno1 == DF_REF_REGNO (*def_rec)
16578 	    || regno2 == DF_REF_REGNO (*def_rec)))
16579       {
16580 	return true;
16581       }
16582 
16583   return false;
16584 }
16585 
16586 /* Return true if instruction INSN uses register number
16587    REGNO as part of an address expression.  */
16588 
16589 static bool
16590 insn_uses_reg_mem (unsigned int regno, rtx insn)
16591 {
16592   df_ref *use_rec;
16593 
16594   for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16595     if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16596       return true;
16597 
16598   return false;
16599 }
16600 
16601 /* Search backward for a non-agu definition of register number REGNO1
16602    or register number REGNO2 in the basic block, starting from instruction
16603    START up to the head of the basic block or up to instruction INSN.
16604 
16605    Set *FOUND to true if a definition was found and to false
16606    otherwise.
16607 
16608    The distance in half-cycles between START and the found instruction
16609    or the head of the BB is added to DISTANCE and returned.  */
16610 
16611 static int
16612 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16613 			       rtx insn, int distance,
16614 			       rtx start, bool *found)
16615 {
16616   basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16617   rtx prev = start;
16618   rtx next = NULL;
16619 
16620   *found = false;
16621 
16622   while (prev
16623 	 && prev != insn
16624 	 && distance < LEA_SEARCH_THRESHOLD)
16625     {
16626       if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16627 	{
16628 	  distance = increase_distance (prev, next, distance);
16629 	  if (insn_defines_reg (regno1, regno2, prev))
16630 	    {
16631 	      if (recog_memoized (prev) < 0
16632 		  || get_attr_type (prev) != TYPE_LEA)
16633 		{
16634 		  *found = true;
16635 		  return distance;
16636 		}
16637 	    }
16638 
16639 	  next = prev;
16640 	}
16641       if (prev == BB_HEAD (bb))
16642 	break;
16643 
16644       prev = PREV_INSN (prev);
16645     }
16646 
16647   return distance;
16648 }
16649 
16650 /* Search backward for a non-agu definition of register number REGNO1
16651    or register number REGNO2 in INSN's basic block until we
16652    1. pass LEA_SEARCH_THRESHOLD instructions, or
16653    2. reach the boundary of a neighbouring BB, or
16654    3. reach an agu definition.
16655    Returns the distance between the non-agu definition point and INSN.
16656    If there is no definition point, returns -1.  */
16657 
16658 static int
16659 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16660 			 rtx insn)
16661 {
16662   basic_block bb = BLOCK_FOR_INSN (insn);
16663   int distance = 0;
16664   bool found = false;
16665 
16666   if (insn != BB_HEAD (bb))
16667     distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16668 					      distance, PREV_INSN (insn),
16669 					      &found);
16670 
16671   if (!found && distance < LEA_SEARCH_THRESHOLD)
16672     {
16673       edge e;
16674       edge_iterator ei;
16675       bool simple_loop = false;
16676 
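      /* A predecessor edge from the block to itself means INSN sits in
	 a single-block loop; in that case continue the backward search
	 from the end of the same block.  */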
16677       FOR_EACH_EDGE (e, ei, bb->preds)
16678 	if (e->src == bb)
16679 	  {
16680 	    simple_loop = true;
16681 	    break;
16682 	  }
16683 
16684       if (simple_loop)
16685 	distance = distance_non_agu_define_in_bb (regno1, regno2,
16686 						  insn, distance,
16687 						  BB_END (bb), &found);
16688       else
16689 	{
16690 	  int shortest_dist = -1;
16691 	  bool found_in_bb = false;
16692 
16693 	  FOR_EACH_EDGE (e, ei, bb->preds)
16694 	    {
16695 	      int bb_dist
16696 		= distance_non_agu_define_in_bb (regno1, regno2,
16697 						 insn, distance,
16698 						 BB_END (e->src),
16699 						 &found_in_bb);
16700 	      if (found_in_bb)
16701 		{
16702 		  if (shortest_dist < 0)
16703 		    shortest_dist = bb_dist;
16704 		  else if (bb_dist > 0)
16705 		    shortest_dist = MIN (bb_dist, shortest_dist);
16706 
16707 		  found = true;
16708 		}
16709 	    }
16710 
16711 	  distance = shortest_dist;
16712 	}
16713     }
16714 
16715   /* get_attr_type may modify recog data.  We want to make sure
16716      that recog data is valid for instruction INSN, on which
16717      distance_non_agu_define is called.  INSN is unchanged here.  */
16718   extract_insn_cached (insn);
16719 
16720   if (!found)
16721     return -1;
16722 
16723   return distance >> 1;
16724 }
16725 
16726 /* Return the distance in half-cycles, added to DISTANCE, between INSN
16727    and the next insn that uses register number REGNO in a memory
16728    address.  Return -1 if REGNO is set.
16729 
16730    Set *FOUND to true if a register use was found and to false
16731    otherwise.
16732    Set *REDEFINED to true if a register redefinition was found and
16733    to false otherwise.  */
16734 
16735 static int
16736 distance_agu_use_in_bb (unsigned int regno,
16737 			rtx insn, int distance, rtx start,
16738 			bool *found, bool *redefined)
16739 {
16740   basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16741   rtx next = start;
16742   rtx prev = NULL;
16743 
16744   *found = false;
16745   *redefined = false;
16746 
16747   while (next
16748 	 && next != insn
16749 	 && distance < LEA_SEARCH_THRESHOLD)
16750     {
16751       if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16752 	{
16753 	  distance = increase_distance(prev, next, distance);
16754 	  if (insn_uses_reg_mem (regno, next))
16755 	    {
16756 	      /* Return DISTANCE if OP0 is used in memory
16757 		 address in NEXT.  */
16758 	      *found = true;
16759 	      return distance;
16760 	    }
16761 
16762 	  if (insn_defines_reg (regno, INVALID_REGNUM, next))
16763 	    {
16764 	      /* Return -1 if OP0 is set in NEXT.  */
16765 	      *redefined = true;
16766 	      return -1;
16767 	    }
16768 
16769 	  prev = next;
16770 	}
16771 
16772       if (next == BB_END (bb))
16773 	break;
16774 
16775       next = NEXT_INSN (next);
16776     }
16777 
16778   return distance;
16779 }
16780 
16781 /* Return the distance between INSN and the next insn that uses
16782    register number REGNO0 in a memory address.  Return -1 if no such
16783    use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set.  */
16784 
16785 static int
16786 distance_agu_use (unsigned int regno0, rtx insn)
16787 {
16788   basic_block bb = BLOCK_FOR_INSN (insn);
16789   int distance = 0;
16790   bool found = false;
16791   bool redefined = false;
16792 
16793   if (insn != BB_END (bb))
16794     distance = distance_agu_use_in_bb (regno0, insn, distance,
16795 				       NEXT_INSN (insn),
16796 				       &found, &redefined);
16797 
16798   if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16799     {
16800       edge e;
16801       edge_iterator ei;
16802       bool simple_loop = false;
16803 
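      /* A successor edge from the block back to itself means INSN sits
	 in a single-block loop; in that case continue the forward search
	 from the head of the same block.  */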
16804       FOR_EACH_EDGE (e, ei, bb->succs)
16805         if (e->dest == bb)
16806 	  {
16807 	    simple_loop = true;
16808 	    break;
16809 	  }
16810 
16811       if (simple_loop)
16812 	distance = distance_agu_use_in_bb (regno0, insn,
16813 					   distance, BB_HEAD (bb),
16814 					   &found, &redefined);
16815       else
16816 	{
16817 	  int shortest_dist = -1;
16818 	  bool found_in_bb = false;
16819 	  bool redefined_in_bb = false;
16820 
16821 	  FOR_EACH_EDGE (e, ei, bb->succs)
16822 	    {
16823 	      int bb_dist
16824 		= distance_agu_use_in_bb (regno0, insn,
16825 					  distance, BB_HEAD (e->dest),
16826 					  &found_in_bb, &redefined_in_bb);
16827 	      if (found_in_bb)
16828 		{
16829 		  if (shortest_dist < 0)
16830 		    shortest_dist = bb_dist;
16831 		  else if (bb_dist > 0)
16832 		    shortest_dist = MIN (bb_dist, shortest_dist);
16833 
16834 		  found = true;
16835 		}
16836 	    }
16837 
16838 	  distance = shortest_dist;
16839 	}
16840     }
16841 
16842   if (!found || redefined)
16843     return -1;
16844 
16845   return distance >> 1;
16846 }
16847 
16848 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16849    there is a dilemma between choosing LEA or ADD.
16850    Negative value: ADD is preferred over LEA.
16851    Zero: Neutral.
16852    Positive value: LEA is preferred over ADD.  */
16853 #define IX86_LEA_PRIORITY 0
16854 
16855 /* Return true if using the lea INSN has a performance advantage
16856    over a sequence of instructions.  The instruction sequence has
16857    SPLIT_COST cycles higher latency than the lea latency.  */
16858 
16859 static bool
16860 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16861 		      unsigned int regno2, int split_cost)
16862 {
16863   int dist_define, dist_use;
16864 
16865   dist_define = distance_non_agu_define (regno1, regno2, insn);
16866   dist_use = distance_agu_use (regno0, insn);
16867 
16868   if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16869     {
16870       /* If there is no non-AGU operand definition, no AGU
16871 	 operand use and the split cost is 0, then both the lea
16872 	 and non-lea variants have the same priority.  Currently
16873 	 we prefer lea for 64-bit code and non-lea for 32-bit
16874 	 code.  */
16875       if (dist_use < 0 && split_cost == 0)
16876 	return TARGET_64BIT || IX86_LEA_PRIORITY;
16877       else
16878 	return true;
16879     }
16880 
16881   /* With a longer definition distance, lea is preferable.
16882      Here we adjust the distance to take into account the splitting
16883      cost and the lea priority.  */
16884   dist_define += split_cost + IX86_LEA_PRIORITY;
16885 
16886   /* If there is no use in a memory address then we just check
16887      that the split cost does not exceed the AGU stall.  */
16888   if (dist_use < 0)
16889     return dist_define >= LEA_MAX_STALL;
16890 
16891   /* If this insn has both a backward non-agu dependence and a forward
16892      agu dependence, the one with the shorter distance takes effect.  */
16893   return dist_define >= dist_use;
16894 }
16895 
16896 /* Return true if it is legal to clobber flags by INSN and
16897    false otherwise.  */
16898 
16899 static bool
16900 ix86_ok_to_clobber_flags (rtx insn)
16901 {
16902   basic_block bb = BLOCK_FOR_INSN (insn);
16903   df_ref *use;
16904   bitmap live;
16905 
16906   while (insn)
16907     {
16908       if (NONDEBUG_INSN_P (insn))
16909 	{
16910 	  for (use = DF_INSN_USES (insn); *use; use++)
16911 	    if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16912 	      return false;
16913 
16914 	  if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16915 	    return true;
16916 	}
16917 
16918       if (insn == BB_END (bb))
16919 	break;
16920 
16921       insn = NEXT_INSN (insn);
16922     }
16923 
16924   live = df_get_live_out(bb);
16925   return !REGNO_REG_SET_P (live, FLAGS_REG);
16926 }
16927 
16928 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16929    move and add to avoid AGU stalls.  */
16930 
16931 bool
16932 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16933 {
16934   unsigned int regno0 = true_regnum (operands[0]);
16935   unsigned int regno1 = true_regnum (operands[1]);
16936   unsigned int regno2 = true_regnum (operands[2]);
16937 
16938   /* Check if we need to optimize.  */
16939   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16940     return false;
16941 
16942   /* Check it is correct to split here.  */
16943   if (!ix86_ok_to_clobber_flags(insn))
16944     return false;
16945 
16946   /* We only need to split adds with a non-destructive
16947      destination operand.  */
16948   if (regno0 == regno1 || regno0 == regno2)
16949     return false;
16950   else
16951     return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16952 }
16953 
16954 /* Return true if we should emit lea instruction instead of mov
16955    instruction.  */
16956 
16957 bool
16958 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16959 {
16960   unsigned int regno0;
16961   unsigned int regno1;
16962 
16963   /* Check if we need to optimize.  */
16964   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16965     return false;
16966 
16967   /* Use lea for reg to reg moves only.  */
16968   if (!REG_P (operands[0]) || !REG_P (operands[1]))
16969     return false;
16970 
16971   regno0 = true_regnum (operands[0]);
16972   regno1 = true_regnum (operands[1]);
16973 
16974   return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
16975 }
16976 
16977 /* Return true if we need to split lea into a sequence of
16978    instructions to avoid AGU stalls. */
16979 
16980 bool
16981 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16982 {
16983   unsigned int regno0 = true_regnum (operands[0]) ;
16984   unsigned int regno1 = INVALID_REGNUM;
16985   unsigned int regno2 = INVALID_REGNUM;
16986   int split_cost = 0;
16987   struct ix86_address parts;
16988   int ok;
16989 
16990   /* FIXME: Handle zero-extended addresses.  */
16991   if (SImode_address_operand (operands[1], VOIDmode))
16992     return false;
16993 
16994   /* Check if we need to optimize.  */
16995   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16996     return false;
16997 
16998   /* The "at least two components" test below might not catch simple
16999      move insns if parts.base is non-NULL and parts.disp is const0_rtx
17000      as the only components in the address, e.g. if the register is
17001      %rbp or %r13.  As this test is much cheaper and moves are the
17002      common case, do this check first.  */
17003   if (REG_P (operands[1]))
17004     return false;
17005 
17006   /* Check if it is OK to split here.  */
17007   if (!ix86_ok_to_clobber_flags (insn))
17008     return false;
17009 
17010   ok = ix86_decompose_address (operands[1], &parts);
17011   gcc_assert (ok);
17012 
17013   /* There should be at least two components in the address.  */
17014   if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17015       + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17016     return false;
17017 
17018   /* We should not split into an add if a non-legitimate PIC
17019      operand is used as the displacement.  */
17020   if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17021     return false;
17022 
17023   if (parts.base)
17024     regno1 = true_regnum (parts.base);
17025   if (parts.index)
17026     regno2 = true_regnum (parts.index);
17027 
17028   /* Compute how many cycles we will add to the execution time
17029      if we split the lea into a sequence of instructions.  */
17030   if (parts.base || parts.index)
17031     {
17032       /* Have to use a mov instruction if the non-destructive
17033 	 destination form is used.  */
17034       if (regno1 != regno0 && regno2 != regno0)
17035 	split_cost += 1;
17036 
17037       /* Have to add index to base if both exist.  */
17038       if (parts.base && parts.index)
17039 	split_cost += 1;
17040 
17041       /* Have to use shift and adds if scale is 2 or greater.  */
17042       if (parts.scale > 1)
17043 	{
17044 	  if (regno0 != regno1)
17045 	    split_cost += 1;
17046 	  else if (regno2 == regno0)
17047 	    split_cost += 4;
17048 	  else
17049 	    split_cost += parts.scale;
17050 	}
17051 
17052       /* Have to use an add instruction with an immediate if
17053 	 disp is nonzero.  */
17054       if (parts.disp && parts.disp != const0_rtx)
17055 	split_cost += 1;
17056 
17057       /* Subtract the price of lea.  */
17058       split_cost -= 1;
17059     }
17060 
17061   return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17062 }
17063 
17064 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17065    matches the destination.  The RTX includes a clobber of FLAGS_REG.  */
17066 
17067 static void
17068 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17069 		 rtx dst, rtx src)
17070 {
17071   rtx op, clob;
17072 
17073   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17074   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17075 
17076   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17077 }
17078 
17079 /* Split lea instructions into a sequence of instructions
17080    which are executed on the ALU to avoid AGU stalls.
17081    It is assumed that it is allowed to clobber the flags register
17082    at the position of the lea.  */
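
/* For example, splitting "lea 4(%ebx,%ecx,2), %eax" along these lines
   would emit approximately
	movl	%ecx, %eax
	sall	$1, %eax
	addl	%ebx, %eax
	addl	$4, %eax
   where the exact sequence depends on which address components are
   present and which registers already match the destination; the
   operands above are purely illustrative.  */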
17083 
17084 extern void
17085 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
17086 {
17087   unsigned int regno0 = true_regnum (operands[0]) ;
17088   unsigned int regno1 = INVALID_REGNUM;
17089   unsigned int regno2 = INVALID_REGNUM;
17090   struct ix86_address parts;
17091   rtx tmp;
17092   int ok, adds;
17093 
17094   ok = ix86_decompose_address (operands[1], &parts);
17095   gcc_assert (ok);
17096 
17097   if (parts.base)
17098     {
17099       if (GET_MODE (parts.base) != mode)
17100 	parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
17101       regno1 = true_regnum (parts.base);
17102     }
17103 
17104   if (parts.index)
17105     {
17106       if (GET_MODE (parts.index) != mode)
17107 	parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
17108       regno2 = true_regnum (parts.index);
17109     }
17110 
17111   if (parts.scale > 1)
17112     {
17113       /* Case r1 = r1 + ...  */
17114       if (regno1 == regno0)
17115 	{
17116 	  /* If we have the case r1 = r1 + C * r1 then we
17117 	     would have to use multiplication, which is very
17118 	     expensive.  Assume the cost model is wrong if we
17119 	     have such a case here.  */
17120 	  gcc_assert (regno2 != regno0);
17121 
17122 	  for (adds = parts.scale; adds > 0; adds--)
17123 	    ix86_emit_binop (PLUS, mode, operands[0], parts.index);
17124 	}
17125       else
17126 	{
17127 	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
17128 	  if (regno0 != regno2)
17129 	    emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17130 
17131 	  /* Use shift for scaling.  */
17132 	  ix86_emit_binop (ASHIFT, mode, operands[0],
17133 			   GEN_INT (exact_log2 (parts.scale)));
17134 
17135 	  if (parts.base)
17136 	    ix86_emit_binop (PLUS, mode, operands[0], parts.base);
17137 
17138 	  if (parts.disp && parts.disp != const0_rtx)
17139 	    ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17140 	}
17141     }
17142   else if (!parts.base && !parts.index)
17143     {
17144       gcc_assert(parts.disp);
17145       emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
17146     }
17147   else
17148     {
17149       if (!parts.base)
17150 	{
17151 	  if (regno0 != regno2)
17152 	    emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17153 	}
17154       else if (!parts.index)
17155 	{
17156 	  if (regno0 != regno1)
17157 	    emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17158 	}
17159       else
17160 	{
17161 	  if (regno0 == regno1)
17162 	    tmp = parts.index;
17163 	  else if (regno0 == regno2)
17164 	    tmp = parts.base;
17165 	  else
17166 	    {
17167 	      emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17168 	      tmp = parts.index;
17169 	    }
17170 
17171 	  ix86_emit_binop (PLUS, mode, operands[0], tmp);
17172 	}
17173 
17174       if (parts.disp && parts.disp != const0_rtx)
17175 	ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17176     }
17177 }
17178 
17179 /* Return true if it is ok to optimize an ADD operation to an LEA
17180    operation to avoid flag register consumption.  For most processors,
17181    ADD is faster than LEA.  For processors like Atom, if the
17182    destination register of the LEA holds an actual address which will be
17183    used soon, LEA is better, and otherwise ADD is better.  */
17184 
17185 bool
17186 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17187 {
17188   unsigned int regno0 = true_regnum (operands[0]);
17189   unsigned int regno1 = true_regnum (operands[1]);
17190   unsigned int regno2 = true_regnum (operands[2]);
17191 
17192   /* If a = b + c, (a != b && a != c), we must use the lea form.  */
17193   if (regno0 != regno1 && regno0 != regno2)
17194     return true;
17195 
17196   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17197     return false;
17198 
17199   return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17200 }
17201 
17202 /* Return true if destination reg of SET_BODY is shift count of
17203    USE_BODY.  */
17204 
17205 static bool
17206 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17207 {
17208   rtx set_dest;
17209   rtx shift_rtx;
17210   int i;
17211 
17212   /* Retrieve destination of SET_BODY.  */
17213   switch (GET_CODE (set_body))
17214     {
17215     case SET:
17216       set_dest = SET_DEST (set_body);
17217       if (!set_dest || !REG_P (set_dest))
17218 	return false;
17219       break;
17220     case PARALLEL:
17221       for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17222 	if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17223 					  use_body))
17224 	  return true;
17225     default:
17226       return false;
17227       break;
17228     }
17229 
17230   /* Retrieve shift count of USE_BODY.  */
17231   switch (GET_CODE (use_body))
17232     {
17233     case SET:
17234       shift_rtx = XEXP (use_body, 1);
17235       break;
17236     case PARALLEL:
17237       for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17238 	if (ix86_dep_by_shift_count_body (set_body,
17239 					  XVECEXP (use_body, 0, i)))
17240 	  return true;
17241     default:
17242       return false;
17243       break;
17244     }
17245 
17246   if (shift_rtx
17247       && (GET_CODE (shift_rtx) == ASHIFT
17248 	  || GET_CODE (shift_rtx) == LSHIFTRT
17249 	  || GET_CODE (shift_rtx) == ASHIFTRT
17250 	  || GET_CODE (shift_rtx) == ROTATE
17251 	  || GET_CODE (shift_rtx) == ROTATERT))
17252     {
17253       rtx shift_count = XEXP (shift_rtx, 1);
17254 
17255       /* Return true if shift count is dest of SET_BODY.  */
17256       if (REG_P (shift_count)
17257 	  && true_regnum (set_dest) == true_regnum (shift_count))
17258 	return true;
17259     }
17260 
17261   return false;
17262 }
17263 
17264 /* Return true if destination reg of SET_INSN is shift count of
17265    USE_INSN.  */
17266 
17267 bool
17268 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17269 {
17270   return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17271 				       PATTERN (use_insn));
17272 }
17273 
17274 /* Return TRUE or FALSE depending on whether the unary operator meets the
17275    appropriate constraints.  */
17276 
17277 bool
17278 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17279 			enum machine_mode mode ATTRIBUTE_UNUSED,
17280 			rtx operands[2] ATTRIBUTE_UNUSED)
17281 {
17282   /* If one of operands is memory, source and destination must match.  */
17283   if ((MEM_P (operands[0])
17284        || MEM_P (operands[1]))
17285       && ! rtx_equal_p (operands[0], operands[1]))
17286     return false;
17287   return true;
17288 }
17289 
17290 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17291    are ok, keeping in mind the possible movddup alternative.  */
17292 
17293 bool
17294 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17295 {
17296   if (MEM_P (operands[0]))
17297     return rtx_equal_p (operands[0], operands[1 + high]);
17298   if (MEM_P (operands[1]) && MEM_P (operands[2]))
17299     return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17300   return true;
17301 }
17302 
17303 /* Post-reload splitter for converting an SF or DFmode value in an
17304    SSE register into an unsigned SImode.  */
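
/* In outline, the code below computes
     mask = (value >= 0x1p31) ? ~0 : 0;
     result = (int) (value - (mask & 0x1p31)) ^ (mask & 0x80000000);
   i.e. inputs of 0x1p31 and above are reduced into the signed range
   before the truncating conversion, and the sign bit is then patched
   back in with an XOR.  */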
17305 
17306 void
17307 ix86_split_convert_uns_si_sse (rtx operands[])
17308 {
17309   enum machine_mode vecmode;
17310   rtx value, large, zero_or_two31, input, two31, x;
17311 
17312   large = operands[1];
17313   zero_or_two31 = operands[2];
17314   input = operands[3];
17315   two31 = operands[4];
17316   vecmode = GET_MODE (large);
17317   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17318 
17319   /* Load up the value into the low element.  We must ensure that the other
17320      elements are valid floats -- zero is the easiest such value.  */
17321   if (MEM_P (input))
17322     {
17323       if (vecmode == V4SFmode)
17324 	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17325       else
17326 	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17327     }
17328   else
17329     {
17330       input = gen_rtx_REG (vecmode, REGNO (input));
17331       emit_move_insn (value, CONST0_RTX (vecmode));
17332       if (vecmode == V4SFmode)
17333 	emit_insn (gen_sse_movss (value, value, input));
17334       else
17335 	emit_insn (gen_sse2_movsd (value, value, input));
17336     }
17337 
17338   emit_move_insn (large, two31);
17339   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17340 
17341   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17342   emit_insn (gen_rtx_SET (VOIDmode, large, x));
17343 
17344   x = gen_rtx_AND (vecmode, zero_or_two31, large);
17345   emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17346 
17347   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17348   emit_insn (gen_rtx_SET (VOIDmode, value, x));
17349 
17350   large = gen_rtx_REG (V4SImode, REGNO (large));
17351   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17352 
17353   x = gen_rtx_REG (V4SImode, REGNO (value));
17354   if (vecmode == V4SFmode)
17355     emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17356   else
17357     emit_insn (gen_sse2_cvttpd2dq (x, value));
17358   value = x;
17359 
17360   emit_insn (gen_xorv4si3 (value, value, large));
17361 }
17362 
17363 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17364    Expects the 64-bit DImode to be supplied in a pair of integral
17365    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
17366    -mfpmath=sse, !optimize_size only.  */
17367 
17368 void
17369 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17370 {
17371   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17372   rtx int_xmm, fp_xmm;
17373   rtx biases, exponents;
17374   rtx x;
17375 
17376   int_xmm = gen_reg_rtx (V4SImode);
17377   if (TARGET_INTER_UNIT_MOVES)
17378     emit_insn (gen_movdi_to_sse (int_xmm, input));
17379   else if (TARGET_SSE_SPLIT_REGS)
17380     {
17381       emit_clobber (int_xmm);
17382       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17383     }
17384   else
17385     {
17386       x = gen_reg_rtx (V2DImode);
17387       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17388       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17389     }
17390 
17391   x = gen_rtx_CONST_VECTOR (V4SImode,
17392 			    gen_rtvec (4, GEN_INT (0x43300000UL),
17393 				       GEN_INT (0x45300000UL),
17394 				       const0_rtx, const0_rtx));
17395   exponents = validize_mem (force_const_mem (V4SImode, x));
17396 
17397   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17398   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17399 
17400   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17401      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17402      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17403      (0x1.0p84 + double(fp_value_hi_xmm)).
17404      Note these exponents differ by 32.  */
17405 
17406   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17407 
17408   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17409      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
17410   real_ldexp (&bias_lo_rvt, &dconst1, 52);
17411   real_ldexp (&bias_hi_rvt, &dconst1, 84);
17412   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17413   x = const_double_from_real_value (bias_hi_rvt, DFmode);
17414   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17415   biases = validize_mem (force_const_mem (V2DFmode, biases));
17416   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17417 
17418   /* Add the upper and lower DFmode values together.  */
17419   if (TARGET_SSE3)
17420     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17421   else
17422     {
17423       x = copy_to_mode_reg (V2DFmode, fp_xmm);
17424       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17425       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17426     }
17427 
17428   ix86_expand_vector_extract (false, target, fp_xmm, 0);
17429 }
17430 
17431 /* Not used, but eases macroization of patterns.  */
17432 void
17433 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17434 				  rtx input ATTRIBUTE_UNUSED)
17435 {
17436   gcc_unreachable ();
17437 }
17438 
17439 /* Convert an unsigned SImode value into a DFmode.  Only currently used
17440    for SSE, but applicable anywhere.  */
17441 
17442 void
17443 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17444 {
17445   REAL_VALUE_TYPE TWO31r;
17446   rtx x, fp;
17447 
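  /* Adding -0x80000000 with wraparound maps the unsigned range
     [0, 0x1p32) onto the signed range [-0x1p31, 0x1p31), which
     floatsidf2 can convert; the 0x1p31 is added back in DFmode below.  */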
17448   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17449 			   NULL, 1, OPTAB_DIRECT);
17450 
17451   fp = gen_reg_rtx (DFmode);
17452   emit_insn (gen_floatsidf2 (fp, x));
17453 
17454   real_ldexp (&TWO31r, &dconst1, 31);
17455   x = const_double_from_real_value (TWO31r, DFmode);
17456 
17457   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17458   if (x != target)
17459     emit_move_insn (target, x);
17460 }
17461 
17462 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
17463    32-bit mode; otherwise we have a direct convert instruction.  */
17464 
17465 void
17466 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17467 {
17468   REAL_VALUE_TYPE TWO32r;
17469   rtx fp_lo, fp_hi, x;
17470 
17471   fp_lo = gen_reg_rtx (DFmode);
17472   fp_hi = gen_reg_rtx (DFmode);
17473 
17474   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17475 
17476   real_ldexp (&TWO32r, &dconst1, 32);
17477   x = const_double_from_real_value (TWO32r, DFmode);
17478   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17479 
17480   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17481 
17482   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17483 			   0, OPTAB_DIRECT);
17484   if (x != target)
17485     emit_move_insn (target, x);
17486 }
17487 
17488 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17489    For x86_32, -mfpmath=sse, !optimize_size only.  */
17490 void
17491 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17492 {
17493   REAL_VALUE_TYPE ONE16r;
17494   rtx fp_hi, fp_lo, int_hi, int_lo, x;
17495 
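  /* Split the 32bit input into 16bit halves; each half is exactly
     representable in SFmode, and the result is recombined below as
     fp_hi * 0x1p16 + fp_lo.  */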
17496   real_ldexp (&ONE16r, &dconst1, 16);
17497   x = const_double_from_real_value (ONE16r, SFmode);
17498   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17499 				      NULL, 0, OPTAB_DIRECT);
17500   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17501 				      NULL, 0, OPTAB_DIRECT);
17502   fp_hi = gen_reg_rtx (SFmode);
17503   fp_lo = gen_reg_rtx (SFmode);
17504   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17505   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17506   fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17507 			       0, OPTAB_DIRECT);
17508   fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17509 			       0, OPTAB_DIRECT);
17510   if (!rtx_equal_p (target, fp_hi))
17511     emit_move_insn (target, fp_hi);
17512 }
17513 
17514 /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
17515    a vector of unsigned ints VAL to vector of floats TARGET.  */
17516 
17517 void
17518 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17519 {
17520   rtx tmp[8];
17521   REAL_VALUE_TYPE TWO16r;
17522   enum machine_mode intmode = GET_MODE (val);
17523   enum machine_mode fltmode = GET_MODE (target);
17524   rtx (*cvt) (rtx, rtx);
17525 
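  /* As in the scalar case, convert the low and high 16bit halves of
     each element separately and recombine them as hi * 0x1p16 + lo,
     since each half is exactly representable in SFmode.  */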
17526   if (intmode == V4SImode)
17527     cvt = gen_floatv4siv4sf2;
17528   else
17529     cvt = gen_floatv8siv8sf2;
17530   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17531   tmp[0] = force_reg (intmode, tmp[0]);
17532   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17533 				OPTAB_DIRECT);
17534   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17535 				NULL_RTX, 1, OPTAB_DIRECT);
17536   tmp[3] = gen_reg_rtx (fltmode);
17537   emit_insn (cvt (tmp[3], tmp[1]));
17538   tmp[4] = gen_reg_rtx (fltmode);
17539   emit_insn (cvt (tmp[4], tmp[2]));
17540   real_ldexp (&TWO16r, &dconst1, 16);
17541   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17542   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17543   tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17544 				OPTAB_DIRECT);
17545   tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17546 				OPTAB_DIRECT);
17547   if (tmp[7] != target)
17548     emit_move_insn (target, tmp[7]);
17549 }
17550 
17551 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17552    pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17553    This is done by doing just signed conversion if < 0x1p31, and otherwise by
17554    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
17555 
17556 rtx
17557 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17558 {
17559   REAL_VALUE_TYPE TWO31r;
17560   rtx two31r, tmp[4];
17561   enum machine_mode mode = GET_MODE (val);
17562   enum machine_mode scalarmode = GET_MODE_INNER (mode);
17563   enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17564   rtx (*cmp) (rtx, rtx, rtx, rtx);
17565   int i;
17566 
17567   for (i = 0; i < 3; i++)
17568     tmp[i] = gen_reg_rtx (mode);
17569   real_ldexp (&TWO31r, &dconst1, 31);
17570   two31r = const_double_from_real_value (TWO31r, scalarmode);
17571   two31r = ix86_build_const_vector (mode, 1, two31r);
17572   two31r = force_reg (mode, two31r);
17573   switch (mode)
17574     {
17575     case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17576     case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17577     case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17578     case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17579     default: gcc_unreachable ();
17580     }
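  /* tmp[0] becomes an all-ones mask in the lanes where VAL >= 0x1p31;
     ANDing it with 0x1p31 gives the per-lane bias to subtract, and the
     same mask yields the 0x80000000 correction returned via *XORP.  */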
17581   tmp[3] = gen_rtx_LE (mode, two31r, val);
17582   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17583   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17584 				0, OPTAB_DIRECT);
17585   if (intmode == V4SImode || TARGET_AVX2)
17586     *xorp = expand_simple_binop (intmode, ASHIFT,
17587 				 gen_lowpart (intmode, tmp[0]),
17588 				 GEN_INT (31), NULL_RTX, 0,
17589 				 OPTAB_DIRECT);
17590   else
17591     {
17592       rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17593       two31 = ix86_build_const_vector (intmode, 1, two31);
17594       *xorp = expand_simple_binop (intmode, AND,
17595 				   gen_lowpart (intmode, tmp[0]),
17596 				   two31, NULL_RTX, 0,
17597 				   OPTAB_DIRECT);
17598     }
17599   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17600 			      0, OPTAB_DIRECT);
17601 }
17602 
17603 /* A subroutine of ix86_build_signbit_mask.  If VECT is true,
17604    then replicate the value for all elements of the vector
17605    register.  */
17606 
17607 rtx
17608 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17609 {
17610   int i, n_elt;
17611   rtvec v;
17612   enum machine_mode scalar_mode;
17613 
17614   switch (mode)
17615     {
17616     case V32QImode:
17617     case V16QImode:
17618     case V16HImode:
17619     case V8HImode:
17620     case V8SImode:
17621     case V4SImode:
17622     case V4DImode:
17623     case V2DImode:
17624       gcc_assert (vect);
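      /* FALLTHRU */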
17625     case V8SFmode:
17626     case V4SFmode:
17627     case V4DFmode:
17628     case V2DFmode:
17629       n_elt = GET_MODE_NUNITS (mode);
17630       v = rtvec_alloc (n_elt);
17631       scalar_mode = GET_MODE_INNER (mode);
17632 
17633       RTVEC_ELT (v, 0) = value;
17634 
17635       for (i = 1; i < n_elt; ++i)
17636 	RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17637 
17638       return gen_rtx_CONST_VECTOR (mode, v);
17639 
17640     default:
17641       gcc_unreachable ();
17642     }
17643 }
17644 
17645 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17646    and ix86_expand_int_vcond.  Create a mask for the sign bit in MODE
17647    for an SSE register.  If VECT is true, then replicate the mask for
17648    all elements of the vector register.  If INVERT is true, then create
17649    a mask excluding the sign bit.  */
17650 
17651 rtx
17652 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17653 {
17654   enum machine_mode vec_mode, imode;
17655   HOST_WIDE_INT hi, lo;
17656   int shift = 63;
17657   rtx v;
17658   rtx mask;
17659 
17660   /* Find the sign bit, sign extended to 2*HWI.  */
17661   switch (mode)
17662     {
17663     case V8SImode:
17664     case V4SImode:
17665     case V8SFmode:
17666     case V4SFmode:
17667       vec_mode = mode;
17668       mode = GET_MODE_INNER (mode);
17669       imode = SImode;
17670       lo = 0x80000000, hi = lo < 0;
17671       break;
17672 
17673     case V4DImode:
17674     case V2DImode:
17675     case V4DFmode:
17676     case V2DFmode:
17677       vec_mode = mode;
17678       mode = GET_MODE_INNER (mode);
17679       imode = DImode;
17680       if (HOST_BITS_PER_WIDE_INT >= 64)
17681 	lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17682       else
17683 	lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17684       break;
17685 
17686     case TImode:
17687     case TFmode:
17688       vec_mode = VOIDmode;
17689       if (HOST_BITS_PER_WIDE_INT >= 64)
17690 	{
17691 	  imode = TImode;
17692 	  lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17693 	}
17694       else
17695 	{
17696 	  rtvec vec;
17697 
17698 	  imode = DImode;
17699 	  lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17700 
17701 	  if (invert)
17702 	    {
17703 	      lo = ~lo, hi = ~hi;
17704 	      v = constm1_rtx;
17705 	    }
17706 	  else
17707 	    v = const0_rtx;
17708 
17709 	  mask = immed_double_const (lo, hi, imode);
17710 
17711 	  vec = gen_rtvec (2, v, mask);
17712 	  v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17713 	  v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17714 
17715 	  return v;
17716 	}
17717      break;
17718 
17719     default:
17720       gcc_unreachable ();
17721     }
17722 
17723   if (invert)
17724     lo = ~lo, hi = ~hi;
17725 
17726   /* Force this value into the low part of a fp vector constant.  */
17727   mask = immed_double_const (lo, hi, imode);
17728   mask = gen_lowpart (mode, mask);
17729 
17730   if (vec_mode == VOIDmode)
17731     return force_reg (mode, mask);
17732 
17733   v = ix86_build_const_vector (vec_mode, vect, mask);
17734   return force_reg (vec_mode, v);
17735 }
17736 
17737 /* Generate code for floating point ABS or NEG.  */
17738 
17739 void
17740 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17741 				rtx operands[])
17742 {
17743   rtx mask, set, dst, src;
17744   bool use_sse = false;
17745   bool vector_mode = VECTOR_MODE_P (mode);
17746   enum machine_mode vmode = mode;
17747 
17748   if (vector_mode)
17749     use_sse = true;
17750   else if (mode == TFmode)
17751     use_sse = true;
17752   else if (TARGET_SSE_MATH)
17753     {
17754       use_sse = SSE_FLOAT_MODE_P (mode);
17755       if (mode == SFmode)
17756 	vmode = V4SFmode;
17757       else if (mode == DFmode)
17758 	vmode = V2DFmode;
17759     }
17760 
17761   /* NEG and ABS performed with SSE use bitwise mask operations.
17762      Create the appropriate mask now.  */
17763   if (use_sse)
17764     mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17765   else
17766     mask = NULL_RTX;
17767 
17768   dst = operands[0];
17769   src = operands[1];
17770 
17771   set = gen_rtx_fmt_e (code, mode, src);
17772   set = gen_rtx_SET (VOIDmode, dst, set);
17773 
17774   if (mask)
17775     {
17776       rtx use, clob;
17777       rtvec par;
17778 
17779       use = gen_rtx_USE (VOIDmode, mask);
17780       if (vector_mode)
17781 	par = gen_rtvec (2, set, use);
17782       else
17783 	{
17784           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17785 	  par = gen_rtvec (3, set, use, clob);
17786         }
17787       emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17788     }
17789   else
17790     emit_insn (set);
17791 }
17792 
17793 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
17794 
17795 void
17796 ix86_expand_copysign (rtx operands[])
17797 {
17798   enum machine_mode mode, vmode;
17799   rtx dest, op0, op1, mask, nmask;
17800 
17801   dest = operands[0];
17802   op0 = operands[1];
17803   op1 = operands[2];
17804 
17805   mode = GET_MODE (dest);
17806 
17807   if (mode == SFmode)
17808     vmode = V4SFmode;
17809   else if (mode == DFmode)
17810     vmode = V2DFmode;
17811   else
17812     vmode = mode;
17813 
17814   if (GET_CODE (op0) == CONST_DOUBLE)
17815     {
17816       rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17817 
17818       if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17819 	op0 = simplify_unary_operation (ABS, mode, op0, mode);
17820 
17821       if (mode == SFmode || mode == DFmode)
17822 	{
17823 	  if (op0 == CONST0_RTX (mode))
17824 	    op0 = CONST0_RTX (vmode);
17825 	  else
17826 	    {
17827 	      rtx v = ix86_build_const_vector (vmode, false, op0);
17828 
17829 	      op0 = force_reg (vmode, v);
17830 	    }
17831 	}
17832       else if (op0 != CONST0_RTX (mode))
17833 	op0 = force_reg (mode, op0);
17834 
17835       mask = ix86_build_signbit_mask (vmode, 0, 0);
17836 
17837       if (mode == SFmode)
17838 	copysign_insn = gen_copysignsf3_const;
17839       else if (mode == DFmode)
17840 	copysign_insn = gen_copysigndf3_const;
17841       else
17842 	copysign_insn = gen_copysigntf3_const;
17843 
17844       emit_insn (copysign_insn (dest, op0, op1, mask));
17845     }
17846   else
17847     {
17848       rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17849 
17850       nmask = ix86_build_signbit_mask (vmode, 0, 1);
17851       mask = ix86_build_signbit_mask (vmode, 0, 0);
17852 
17853       if (mode == SFmode)
17854 	copysign_insn = gen_copysignsf3_var;
17855       else if (mode == DFmode)
17856 	copysign_insn = gen_copysigndf3_var;
17857       else
17858 	copysign_insn = gen_copysigntf3_var;
17859 
17860       emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17861     }
17862 }
17863 
17864 /* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
17865    be a constant, and so has already been expanded into a vector constant.  */
17866 
17867 void
17868 ix86_split_copysign_const (rtx operands[])
17869 {
17870   enum machine_mode mode, vmode;
17871   rtx dest, op0, mask, x;
17872 
17873   dest = operands[0];
17874   op0 = operands[1];
17875   mask = operands[3];
17876 
17877   mode = GET_MODE (dest);
17878   vmode = GET_MODE (mask);
17879 
17880   dest = simplify_gen_subreg (vmode, dest, mode, 0);
17881   x = gen_rtx_AND (vmode, dest, mask);
17882   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17883 
17884   if (op0 != CONST0_RTX (vmode))
17885     {
17886       x = gen_rtx_IOR (vmode, dest, op0);
17887       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17888     }
17889 }
17890 
17891 /* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
17892    so we have to do two masks.  */
17893 
17894 void
17895 ix86_split_copysign_var (rtx operands[])
17896 {
17897   enum machine_mode mode, vmode;
17898   rtx dest, scratch, op0, op1, mask, nmask, x;
17899 
17900   dest = operands[0];
17901   scratch = operands[1];
17902   op0 = operands[2];
17903   op1 = operands[3];
17904   nmask = operands[4];
17905   mask = operands[5];
17906 
17907   mode = GET_MODE (dest);
17908   vmode = GET_MODE (mask);
17909 
17910   if (rtx_equal_p (op0, op1))
17911     {
17912       /* Shouldn't happen often (it's useless, obviously), but when it does
17913 	 we'd generate incorrect code if we continue below.  */
17914       emit_move_insn (dest, op0);
17915       return;
17916     }
17917 
17918   if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
17919     {
17920       gcc_assert (REGNO (op1) == REGNO (scratch));
17921 
17922       x = gen_rtx_AND (vmode, scratch, mask);
17923       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17924 
17925       dest = mask;
17926       op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17927       x = gen_rtx_NOT (vmode, dest);
17928       x = gen_rtx_AND (vmode, x, op0);
17929       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17930     }
17931   else
17932     {
17933       if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
17934 	{
17935 	  x = gen_rtx_AND (vmode, scratch, mask);
17936 	}
17937       else						/* alternative 2,4 */
17938 	{
17939           gcc_assert (REGNO (mask) == REGNO (scratch));
17940           op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17941 	  x = gen_rtx_AND (vmode, scratch, op1);
17942 	}
17943       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17944 
17945       if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
17946 	{
17947 	  dest = simplify_gen_subreg (vmode, op0, mode, 0);
17948 	  x = gen_rtx_AND (vmode, dest, nmask);
17949 	}
17950       else						/* alternative 3,4 */
17951 	{
17952           gcc_assert (REGNO (nmask) == REGNO (dest));
17953 	  dest = nmask;
17954 	  op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17955 	  x = gen_rtx_AND (vmode, dest, op0);
17956 	}
17957       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17958     }
17959 
17960   x = gen_rtx_IOR (vmode, dest, scratch);
17961   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17962 }
17963 
17964 /* Return TRUE or FALSE depending on whether the first SET in INSN
17965    has source and destination with matching CC modes and whether the
17966    CC mode is at least as constrained as REQ_MODE.  */
17967 
17968 bool
17969 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17970 {
17971   rtx set;
17972   enum machine_mode set_mode;
17973 
17974   set = PATTERN (insn);
17975   if (GET_CODE (set) == PARALLEL)
17976     set = XVECEXP (set, 0, 0);
17977   gcc_assert (GET_CODE (set) == SET);
17978   gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17979 
17980   set_mode = GET_MODE (SET_DEST (set));
17981   switch (set_mode)
17982     {
17983     case CCNOmode:
17984       if (req_mode != CCNOmode
17985 	  && (req_mode != CCmode
17986 	      || XEXP (SET_SRC (set), 1) != const0_rtx))
17987 	return false;
17988       break;
17989     case CCmode:
17990       if (req_mode == CCGCmode)
17991 	return false;
17992       /* FALLTHRU */
17993     case CCGCmode:
17994       if (req_mode == CCGOCmode || req_mode == CCNOmode)
17995 	return false;
17996       /* FALLTHRU */
17997     case CCGOCmode:
17998       if (req_mode == CCZmode)
17999 	return false;
18000       /* FALLTHRU */
18001     case CCZmode:
18002       break;
18003 
18004     case CCAmode:
18005     case CCCmode:
18006     case CCOmode:
18007     case CCSmode:
18008       if (set_mode != req_mode)
18009 	return false;
18010       break;
18011 
18012     default:
18013       gcc_unreachable ();
18014     }
18015 
18016   return GET_MODE (SET_SRC (set)) == set_mode;
18017 }
18018 
18019 /* Generate insn patterns to do an integer compare of OPERANDS.  */
18020 
18021 static rtx
18022 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18023 {
18024   enum machine_mode cmpmode;
18025   rtx tmp, flags;
18026 
18027   cmpmode = SELECT_CC_MODE (code, op0, op1);
18028   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18029 
18030   /* This is very simple, but making the interface the same as in the
18031      FP case makes the rest of the code easier.  */
18032   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18033   emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18034 
18035   /* Return the test that should be put into the flags user, i.e.
18036      the bcc, scc, or cmov instruction.  */
18037   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18038 }
18039 
18040 /* Figure out whether to use ordered or unordered fp comparisons.
18041    Return the appropriate mode to use.  */
18042 
18043 enum machine_mode
18044 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18045 {
18046   /* ??? In order to make all comparisons reversible, we do all comparisons
18047      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
18048      all forms of trapping and nontrapping comparisons, we can make inequality
18049      comparisons trapping again, since it results in better code when using
18050      FCOM based compares.  */
18051   return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18052 }
18053 
18054 enum machine_mode
18055 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18056 {
18057   enum machine_mode mode = GET_MODE (op0);
18058 
18059   if (SCALAR_FLOAT_MODE_P (mode))
18060     {
18061       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18062       return ix86_fp_compare_mode (code);
18063     }
18064 
18065   switch (code)
18066     {
18067       /* Only zero flag is needed.  */
18068     case EQ:			/* ZF=0 */
18069     case NE:			/* ZF!=0 */
18070       return CCZmode;
18071       /* Codes needing carry flag.  */
18072     case GEU:			/* CF=0 */
18073     case LTU:			/* CF=1 */
18074       /* Detect overflow checks.  They need just the carry flag.  */
18075       if (GET_CODE (op0) == PLUS
18076 	  && rtx_equal_p (op1, XEXP (op0, 0)))
18077 	return CCCmode;
18078       else
18079 	return CCmode;
18080     case GTU:			/* CF=0 & ZF=0 */
18081     case LEU:			/* CF=1 | ZF=1 */
18082       return CCmode;
18083       /* Codes possibly doable only with sign flag when
18084          comparing against zero.  */
18085     case GE:			/* SF=OF   or   SF=0 */
18086     case LT:			/* SF<>OF  or   SF=1 */
18087       if (op1 == const0_rtx)
18088 	return CCGOCmode;
18089       else
18090 	/* For other cases the carry flag is not required.  */
18091 	return CCGCmode;
18092       /* Codes doable only with sign flag when comparing
18093          against zero, but we lack a jump instruction for it,
18094          so we need to use relational tests against overflow,
18095          which thus needs to be zero.  */
18096     case GT:			/* ZF=0 & SF=OF */
18097     case LE:			/* ZF=1 | SF<>OF */
18098       if (op1 == const0_rtx)
18099 	return CCNOmode;
18100       else
18101 	return CCGCmode;
18102       /* The strcmp pattern does (use flags) and combine may ask us for
18103 	 the proper mode.  */
18104     case USE:
18105       return CCmode;
18106     default:
18107       gcc_unreachable ();
18108     }
18109 }
18110 
18111 /* Return the fixed registers used for condition codes.  */
18112 
18113 static bool
18114 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18115 {
18116   *p1 = FLAGS_REG;
18117   *p2 = FPSR_REG;
18118   return true;
18119 }
18120 
18121 /* If two condition code modes are compatible, return a condition code
18122    mode which is compatible with both.  Otherwise, return
18123    VOIDmode.  */
18124 
18125 static enum machine_mode
18126 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18127 {
18128   if (m1 == m2)
18129     return m1;
18130 
18131   if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18132     return VOIDmode;
18133 
18134   if ((m1 == CCGCmode && m2 == CCGOCmode)
18135       || (m1 == CCGOCmode && m2 == CCGCmode))
18136     return CCGCmode;
18137 
18138   switch (m1)
18139     {
18140     default:
18141       gcc_unreachable ();
18142 
18143     case CCmode:
18144     case CCGCmode:
18145     case CCGOCmode:
18146     case CCNOmode:
18147     case CCAmode:
18148     case CCCmode:
18149     case CCOmode:
18150     case CCSmode:
18151     case CCZmode:
18152       switch (m2)
18153 	{
18154 	default:
18155 	  return VOIDmode;
18156 
18157 	case CCmode:
18158 	case CCGCmode:
18159 	case CCGOCmode:
18160 	case CCNOmode:
18161 	case CCAmode:
18162 	case CCCmode:
18163 	case CCOmode:
18164 	case CCSmode:
18165 	case CCZmode:
18166 	  return CCmode;
18167 	}
18168 
18169     case CCFPmode:
18170     case CCFPUmode:
18171       /* These are only compatible with themselves, which we already
18172 	 checked above.  */
18173       return VOIDmode;
18174     }
18175 }
18176 
18177 
18178 /* Return a comparison we can do that is equivalent to
18179    swap_condition (code), apart possibly from orderedness.
18180    But, never change orderedness if TARGET_IEEE_FP, returning
18181    UNKNOWN in that case if necessary.  */
18182 
18183 static enum rtx_code
18184 ix86_fp_swap_condition (enum rtx_code code)
18185 {
18186   switch (code)
18187     {
18188     case GT:                   /* GTU - CF=0 & ZF=0 */
18189       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18190     case GE:                   /* GEU - CF=0 */
18191       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18192     case UNLT:                 /* LTU - CF=1 */
18193       return TARGET_IEEE_FP ? UNKNOWN : GT;
18194     case UNLE:                 /* LEU - CF=1 | ZF=1 */
18195       return TARGET_IEEE_FP ? UNKNOWN : GE;
18196     default:
18197       return swap_condition (code);
18198     }
18199 }
18200 
18201 /* Return cost of comparison CODE using the best strategy for performance.
18202    All the following functions use the number of instructions as a cost metric.
18203    In the future this should be tweaked to compute bytes for optimize_size and
18204    take into account the performance of various instructions on various CPUs.  */
18205 
18206 static int
18207 ix86_fp_comparison_cost (enum rtx_code code)
18208 {
18209   int arith_cost;
18210 
18211   /* The cost of code using bit-twiddling on %ah.  */
18212   switch (code)
18213     {
18214     case UNLE:
18215     case UNLT:
18216     case LTGT:
18217     case GT:
18218     case GE:
18219     case UNORDERED:
18220     case ORDERED:
18221     case UNEQ:
18222       arith_cost = 4;
18223       break;
18224     case LT:
18225     case NE:
18226     case EQ:
18227     case UNGE:
18228       arith_cost = TARGET_IEEE_FP ? 5 : 4;
18229       break;
18230     case LE:
18231     case UNGT:
18232       arith_cost = TARGET_IEEE_FP ? 6 : 4;
18233       break;
18234     default:
18235       gcc_unreachable ();
18236     }
18237 
18238   switch (ix86_fp_comparison_strategy (code))
18239     {
18240     case IX86_FPCMP_COMI:
18241       return arith_cost > 4 ? 3 : 2;
18242     case IX86_FPCMP_SAHF:
18243       return arith_cost > 4 ? 4 : 3;
18244     default:
18245       return arith_cost;
18246     }
18247 }
18248 
18249 /* Return the strategy to use for a floating-point comparison.  We assume
18250    that fcomi is always preferable where available, since that is also true
18251    for size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
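/* Roughly, the three strategies correspond to these instruction sequences
   (a sketch, not the exact RTL that is emitted):

     IX86_FPCMP_COMI:   fcomi(p)                  sets ZF/PF/CF directly
     IX86_FPCMP_SAHF:   fcom; fnstsw %ax; sahf    copies C3/C2/C0 into flags
     IX86_FPCMP_ARITH:  fcom; fnstsw %ax; then test/and on %ah
                        (see ix86_expand_fp_compare below).  */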
18252 
18253 enum ix86_fpcmp_strategy
18254 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18255 {
18256   /* Do fcomi/sahf based test when profitable.  */
18257 
18258   if (TARGET_CMOVE)
18259     return IX86_FPCMP_COMI;
18260 
18261   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18262     return IX86_FPCMP_SAHF;
18263 
18264   return IX86_FPCMP_ARITH;
18265 }
18266 
18267 /* Swap, force into registers, or otherwise massage the two operands
18268    to a fp comparison.  The operands are updated in place; the new
18269    comparison code is returned.  */
18270 
18271 static enum rtx_code
18272 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18273 {
18274   enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18275   rtx op0 = *pop0, op1 = *pop1;
18276   enum machine_mode op_mode = GET_MODE (op0);
18277   int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18278 
18279   /* All of the unordered compare instructions only work on registers.
18280      The same is true of the fcomi compare instructions.  The XFmode
18281      compare instructions require registers except when comparing
18282      against zero or when converting operand 1 from fixed point to
18283      floating point.  */
18284 
18285   if (!is_sse
18286       && (fpcmp_mode == CCFPUmode
18287 	  || (op_mode == XFmode
18288 	      && ! (standard_80387_constant_p (op0) == 1
18289 		    || standard_80387_constant_p (op1) == 1)
18290 	      && GET_CODE (op1) != FLOAT)
18291 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18292     {
18293       op0 = force_reg (op_mode, op0);
18294       op1 = force_reg (op_mode, op1);
18295     }
18296   else
18297     {
18298       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
18299 	 things around if that appears profitable, otherwise force op0
18300 	 into a register.  */
18301 
18302       if (standard_80387_constant_p (op0) == 0
18303 	  || (MEM_P (op0)
18304 	      && ! (standard_80387_constant_p (op1) == 0
18305 		    || MEM_P (op1))))
18306 	{
18307 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
18308 	  if (new_code != UNKNOWN)
18309 	    {
18310 	      rtx tmp;
18311 	      tmp = op0, op0 = op1, op1 = tmp;
18312 	      code = new_code;
18313 	    }
18314 	}
18315 
18316       if (!REG_P (op0))
18317 	op0 = force_reg (op_mode, op0);
18318 
18319       if (CONSTANT_P (op1))
18320 	{
18321 	  int tmp = standard_80387_constant_p (op1);
18322 	  if (tmp == 0)
18323 	    op1 = validize_mem (force_const_mem (op_mode, op1));
18324 	  else if (tmp == 1)
18325 	    {
18326 	      if (TARGET_CMOVE)
18327 		op1 = force_reg (op_mode, op1);
18328 	    }
18329 	  else
18330 	    op1 = force_reg (op_mode, op1);
18331 	}
18332     }
18333 
18334   /* Try to rearrange the comparison to make it cheaper.  */
18335   if (ix86_fp_comparison_cost (code)
18336       > ix86_fp_comparison_cost (swap_condition (code))
18337       && (REG_P (op1) || can_create_pseudo_p ()))
18338     {
18339       rtx tmp;
18340       tmp = op0, op0 = op1, op1 = tmp;
18341       code = swap_condition (code);
18342       if (!REG_P (op0))
18343 	op0 = force_reg (op_mode, op0);
18344     }
18345 
18346   *pop0 = op0;
18347   *pop1 = op1;
18348   return code;
18349 }
18350 
18351 /* Convert comparison codes we use to represent FP comparison to integer
18352    code that will result in a proper branch.  Return UNKNOWN if no such code
18353    is available.  */
18354 
18355 enum rtx_code
18356 ix86_fp_compare_code_to_integer (enum rtx_code code)
18357 {
18358   switch (code)
18359     {
18360     case GT:
18361       return GTU;
18362     case GE:
18363       return GEU;
18364     case ORDERED:
18365     case UNORDERED:
18366       return code;
18367       break;
18368     case UNEQ:
18369       return EQ;
18370       break;
18371     case UNLT:
18372       return LTU;
18373       break;
18374     case UNLE:
18375       return LEU;
18376       break;
18377     case LTGT:
18378       return NE;
18379       break;
18380     default:
18381       return UNKNOWN;
18382     }
18383 }
18384 
18385 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
18386 
18387 static rtx
18388 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18389 {
18390   enum machine_mode fpcmp_mode, intcmp_mode;
18391   rtx tmp, tmp2;
18392 
18393   fpcmp_mode = ix86_fp_compare_mode (code);
18394   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18395 
18396   /* Do fcomi/sahf based test when profitable.  */
18397   switch (ix86_fp_comparison_strategy (code))
18398     {
18399     case IX86_FPCMP_COMI:
18400       intcmp_mode = fpcmp_mode;
18401       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18402       tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18403 			 tmp);
18404       emit_insn (tmp);
18405       break;
18406 
18407     case IX86_FPCMP_SAHF:
18408       intcmp_mode = fpcmp_mode;
18409       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18410       tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18411 			 tmp);
18412 
18413       if (!scratch)
18414 	scratch = gen_reg_rtx (HImode);
18415       tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18416       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18417       break;
18418 
18419     case IX86_FPCMP_ARITH:
18420       /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first.  */
18421       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18422       tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18423       if (!scratch)
18424 	scratch = gen_reg_rtx (HImode);
18425       emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18426 
18427       /* In the unordered case, we have to check C2 for NaN's, which
18428 	 doesn't happen to work out to anything nice combination-wise.
18429 	 So do some bit twiddling on the value we've got in AH to come
18430 	 up with an appropriate set of condition codes.  */
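      /* After the fnstsw above, AH holds bits 8-15 of the FPU status word:
	 C0 is bit 0 (0x01), C2 is bit 2 (0x04) and C3 is bit 6 (0x40).  The
	 masks used below (0x01, 0x04, 0x05, 0x40, 0x44, 0x45) are simply
	 combinations of those three condition-code bits.  */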
18431 
18432       intcmp_mode = CCNOmode;
18433       switch (code)
18434 	{
18435 	case GT:
18436 	case UNGT:
18437 	  if (code == GT || !TARGET_IEEE_FP)
18438 	    {
18439 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18440 	      code = EQ;
18441 	    }
18442 	  else
18443 	    {
18444 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18445 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18446 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18447 	      intcmp_mode = CCmode;
18448 	      code = GEU;
18449 	    }
18450 	  break;
18451 	case LT:
18452 	case UNLT:
18453 	  if (code == LT && TARGET_IEEE_FP)
18454 	    {
18455 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18456 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18457 	      intcmp_mode = CCmode;
18458 	      code = EQ;
18459 	    }
18460 	  else
18461 	    {
18462 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18463 	      code = NE;
18464 	    }
18465 	  break;
18466 	case GE:
18467 	case UNGE:
18468 	  if (code == GE || !TARGET_IEEE_FP)
18469 	    {
18470 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18471 	      code = EQ;
18472 	    }
18473 	  else
18474 	    {
18475 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18476 	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18477 	      code = NE;
18478 	    }
18479 	  break;
18480 	case LE:
18481 	case UNLE:
18482 	  if (code == LE && TARGET_IEEE_FP)
18483 	    {
18484 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18485 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18486 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18487 	      intcmp_mode = CCmode;
18488 	      code = LTU;
18489 	    }
18490 	  else
18491 	    {
18492 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18493 	      code = NE;
18494 	    }
18495 	  break;
18496 	case EQ:
18497 	case UNEQ:
18498 	  if (code == EQ && TARGET_IEEE_FP)
18499 	    {
18500 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18501 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18502 	      intcmp_mode = CCmode;
18503 	      code = EQ;
18504 	    }
18505 	  else
18506 	    {
18507 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18508 	      code = NE;
18509 	    }
18510 	  break;
18511 	case NE:
18512 	case LTGT:
18513 	  if (code == NE && TARGET_IEEE_FP)
18514 	    {
18515 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18516 	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18517 					     GEN_INT (0x40)));
18518 	      code = NE;
18519 	    }
18520 	  else
18521 	    {
18522 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18523 	      code = EQ;
18524 	    }
18525 	  break;
18526 
18527 	case UNORDERED:
18528 	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18529 	  code = NE;
18530 	  break;
18531 	case ORDERED:
18532 	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18533 	  code = EQ;
18534 	  break;
18535 
18536 	default:
18537 	  gcc_unreachable ();
18538 	}
18539       break;
18540 
18541     default:
18542       gcc_unreachable ();
18543     }
18544 
18545   /* Return the test that should be put into the flags user, i.e.
18546      the bcc, scc, or cmov instruction.  */
18547   return gen_rtx_fmt_ee (code, VOIDmode,
18548 			 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18549 			 const0_rtx);
18550 }
18551 
18552 static rtx
18553 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18554 {
18555   rtx ret;
18556 
18557   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18558     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18559 
18560   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18561     {
18562       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18563       ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18564     }
18565   else
18566     ret = ix86_expand_int_compare (code, op0, op1);
18567 
18568   return ret;
18569 }
18570 
18571 void
18572 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18573 {
18574   enum machine_mode mode = GET_MODE (op0);
18575   rtx tmp;
18576 
18577   switch (mode)
18578     {
18579     case SFmode:
18580     case DFmode:
18581     case XFmode:
18582     case QImode:
18583     case HImode:
18584     case SImode:
18585       simple:
18586       tmp = ix86_expand_compare (code, op0, op1);
18587       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18588 				  gen_rtx_LABEL_REF (VOIDmode, label),
18589 				  pc_rtx);
18590       emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18591       return;
18592 
18593     case DImode:
18594       if (TARGET_64BIT)
18595 	goto simple;
18596     case TImode:
18597       /* Expand a DImode/TImode branch into multiple compare+branch.  */
18598       {
18599 	rtx lo[2], hi[2], label2;
18600 	enum rtx_code code1, code2, code3;
18601 	enum machine_mode submode;
18602 
18603 	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18604 	  {
18605 	    tmp = op0, op0 = op1, op1 = tmp;
18606 	    code = swap_condition (code);
18607 	  }
18608 
18609 	split_double_mode (mode, &op0, 1, lo+0, hi+0);
18610 	split_double_mode (mode, &op1, 1, lo+1, hi+1);
18611 
18612 	submode = mode == DImode ? SImode : DImode;
18613 
18614 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18615 	   avoid two branches.  This costs one extra insn, so disable when
18616 	   optimizing for size.  */
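	/* E.g. for a DImode "a == b" on a 32-bit target this emits, roughly:

	       t = (hi(a) ^ hi(b)) | (lo(a) ^ lo(b));

	   and then branches on T compared against zero with the original
	   EQ/NE code (an XOR against a constant-zero half is simply dropped).  */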
18617 
18618 	if ((code == EQ || code == NE)
18619 	    && (!optimize_insn_for_size_p ()
18620 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
18621 	  {
18622 	    rtx xor0, xor1;
18623 
18624 	    xor1 = hi[0];
18625 	    if (hi[1] != const0_rtx)
18626 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18627 				   NULL_RTX, 0, OPTAB_WIDEN);
18628 
18629 	    xor0 = lo[0];
18630 	    if (lo[1] != const0_rtx)
18631 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18632 				   NULL_RTX, 0, OPTAB_WIDEN);
18633 
18634 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
18635 				NULL_RTX, 0, OPTAB_WIDEN);
18636 
18637 	    ix86_expand_branch (code, tmp, const0_rtx, label);
18638 	    return;
18639 	  }
18640 
18641 	/* Otherwise, if we are doing less-than or greater-or-equal-than,
18642 	   op1 is a constant, and its low word is zero, then we can just
18643 	   examine the high word.  Similarly for low word -1 and
18644 	   less-or-equal-than or greater-than.  */
18645 
18646 	if (CONST_INT_P (hi[1]))
18647 	  switch (code)
18648 	    {
18649 	    case LT: case LTU: case GE: case GEU:
18650 	      if (lo[1] == const0_rtx)
18651 		{
18652 		  ix86_expand_branch (code, hi[0], hi[1], label);
18653 		  return;
18654 		}
18655 	      break;
18656 	    case LE: case LEU: case GT: case GTU:
18657 	      if (lo[1] == constm1_rtx)
18658 		{
18659 		  ix86_expand_branch (code, hi[0], hi[1], label);
18660 		  return;
18661 		}
18662 	      break;
18663 	    default:
18664 	      break;
18665 	    }
18666 
18667 	/* Otherwise, we need two or three jumps.  */
18668 
18669 	label2 = gen_label_rtx ();
18670 
18671 	code1 = code;
18672 	code2 = swap_condition (code);
18673 	code3 = unsigned_condition (code);
18674 
18675 	switch (code)
18676 	  {
18677 	  case LT: case GT: case LTU: case GTU:
18678 	    break;
18679 
18680 	  case LE:   code1 = LT;  code2 = GT;  break;
18681 	  case GE:   code1 = GT;  code2 = LT;  break;
18682 	  case LEU:  code1 = LTU; code2 = GTU; break;
18683 	  case GEU:  code1 = GTU; code2 = LTU; break;
18684 
18685 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
18686 	  case NE:   code2 = UNKNOWN; break;
18687 
18688 	  default:
18689 	    gcc_unreachable ();
18690 	  }
18691 
18692 	/*
18693 	 * a < b =>
18694 	 *    if (hi(a) < hi(b)) goto true;
18695 	 *    if (hi(a) > hi(b)) goto false;
18696 	 *    if (lo(a) < lo(b)) goto true;
18697 	 *  false:
18698 	 */
18699 
18700 	if (code1 != UNKNOWN)
18701 	  ix86_expand_branch (code1, hi[0], hi[1], label);
18702 	if (code2 != UNKNOWN)
18703 	  ix86_expand_branch (code2, hi[0], hi[1], label2);
18704 
18705 	ix86_expand_branch (code3, lo[0], lo[1], label);
18706 
18707 	if (code2 != UNKNOWN)
18708 	  emit_label (label2);
18709 	return;
18710       }
18711 
18712     default:
18713       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18714       goto simple;
18715     }
18716 }
18717 
18718 /* Split a branch based on a floating point condition.  */
18719 void
18720 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18721 		      rtx target1, rtx target2, rtx tmp, rtx pushed)
18722 {
18723   rtx condition;
18724   rtx i;
18725 
18726   if (target2 != pc_rtx)
18727     {
18728       rtx tmp = target2;
18729       code = reverse_condition_maybe_unordered (code);
18730       target2 = target1;
18731       target1 = tmp;
18732     }
18733 
18734   condition = ix86_expand_fp_compare (code, op1, op2,
18735 				      tmp);
18736 
18737   /* Remove pushed operand from stack.  */
18738   if (pushed)
18739     ix86_free_from_memory (GET_MODE (pushed));
18740 
18741   i = emit_jump_insn (gen_rtx_SET
18742 		      (VOIDmode, pc_rtx,
18743 		       gen_rtx_IF_THEN_ELSE (VOIDmode,
18744 					     condition, target1, target2)));
18745   if (split_branch_probability >= 0)
18746     add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18747 }
18748 
18749 void
18750 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18751 {
18752   rtx ret;
18753 
18754   gcc_assert (GET_MODE (dest) == QImode);
18755 
18756   ret = ix86_expand_compare (code, op0, op1);
18757   PUT_MODE (ret, QImode);
18758   emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18759 }
18760 
18761 /* Expand a comparison setting or clearing the carry flag.  Return true when
18762    successful, storing the resulting comparison in *POP.  */
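/* (When successful, the result is always an LTU or GEU test of the carry
   flag; callers such as ix86_expand_int_movcc below rely on that to build
   sbb-based 0/-1 sequences.)  */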
18763 static bool
18764 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18765 {
18766   enum machine_mode mode =
18767     GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18768 
18769   /* Do not handle double-mode compares that go through special path.  */
18770   if (mode == (TARGET_64BIT ? TImode : DImode))
18771     return false;
18772 
18773   if (SCALAR_FLOAT_MODE_P (mode))
18774     {
18775       rtx compare_op, compare_seq;
18776 
18777       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18778 
18779       /* Shortcut:  the following common codes never translate
18780 	 into carry flag compares.  */
18781       if (code == EQ || code == NE || code == UNEQ || code == LTGT
18782 	  || code == ORDERED || code == UNORDERED)
18783 	return false;
18784 
18785       /* These comparisons require the zero flag; swap operands so they don't.  */
18786       if ((code == GT || code == UNLE || code == LE || code == UNGT)
18787 	  && !TARGET_IEEE_FP)
18788 	{
18789 	  rtx tmp = op0;
18790 	  op0 = op1;
18791 	  op1 = tmp;
18792 	  code = swap_condition (code);
18793 	}
18794 
18795       /* Try to expand the comparison and verify that we end up with
18796 	 a carry flag based comparison.  This fails only when we decide
18797 	 to expand the comparison using arithmetic, which is not a
18798 	 common scenario.  */
18799       start_sequence ();
18800       compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18801       compare_seq = get_insns ();
18802       end_sequence ();
18803 
18804       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18805 	  || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18806         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18807       else
18808 	code = GET_CODE (compare_op);
18809 
18810       if (code != LTU && code != GEU)
18811 	return false;
18812 
18813       emit_insn (compare_seq);
18814       *pop = compare_op;
18815       return true;
18816     }
18817 
18818   if (!INTEGRAL_MODE_P (mode))
18819     return false;
18820 
18821   switch (code)
18822     {
18823     case LTU:
18824     case GEU:
18825       break;
18826 
18827     /* Convert a==0 into (unsigned)a<1.  */
18828     case EQ:
18829     case NE:
18830       if (op1 != const0_rtx)
18831 	return false;
18832       op1 = const1_rtx;
18833       code = (code == EQ ? LTU : GEU);
18834       break;
18835 
18836     /* Convert a>b into b<a or a>=b+1.  */
18837     case GTU:
18838     case LEU:
18839       if (CONST_INT_P (op1))
18840 	{
18841 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18842 	  /* Bail out on overflow.  We could still swap the operands, but that
18843 	     would force loading the constant into a register.  */
18844 	  if (op1 == const0_rtx
18845 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18846 	    return false;
18847 	  code = (code == GTU ? GEU : LTU);
18848 	}
18849       else
18850 	{
18851 	  rtx tmp = op1;
18852 	  op1 = op0;
18853 	  op0 = tmp;
18854 	  code = (code == GTU ? LTU : GEU);
18855 	}
18856       break;
18857 
18858     /* Convert a>=0 into (unsigned)a<0x80000000.  */
18859     case LT:
18860     case GE:
18861       if (mode == DImode || op1 != const0_rtx)
18862 	return false;
18863       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18864       code = (code == LT ? GEU : LTU);
18865       break;
18866     case LE:
18867     case GT:
18868       if (mode == DImode || op1 != constm1_rtx)
18869 	return false;
18870       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18871       code = (code == LE ? GEU : LTU);
18872       break;
18873 
18874     default:
18875       return false;
18876     }
18877   /* Swapping operands may cause a constant to appear as the first operand.  */
18878   if (!nonimmediate_operand (op0, VOIDmode))
18879     {
18880       if (!can_create_pseudo_p ())
18881 	return false;
18882       op0 = force_reg (mode, op0);
18883     }
18884   *pop = ix86_expand_compare (code, op0, op1);
18885   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18886   return true;
18887 }
18888 
18889 bool
18890 ix86_expand_int_movcc (rtx operands[])
18891 {
18892   enum rtx_code code = GET_CODE (operands[1]), compare_code;
18893   rtx compare_seq, compare_op;
18894   enum machine_mode mode = GET_MODE (operands[0]);
18895   bool sign_bit_compare_p = false;
18896   rtx op0 = XEXP (operands[1], 0);
18897   rtx op1 = XEXP (operands[1], 1);
18898 
18899   start_sequence ();
18900   compare_op = ix86_expand_compare (code, op0, op1);
18901   compare_seq = get_insns ();
18902   end_sequence ();
18903 
18904   compare_code = GET_CODE (compare_op);
18905 
18906   if ((op1 == const0_rtx && (code == GE || code == LT))
18907       || (op1 == constm1_rtx && (code == GT || code == LE)))
18908     sign_bit_compare_p = true;
18909 
18910   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18911      HImode insns, we'd be swallowed in word prefix ops.  */
18912 
18913   if ((mode != HImode || TARGET_FAST_PREFIX)
18914       && (mode != (TARGET_64BIT ? TImode : DImode))
18915       && CONST_INT_P (operands[2])
18916       && CONST_INT_P (operands[3]))
18917     {
18918       rtx out = operands[0];
18919       HOST_WIDE_INT ct = INTVAL (operands[2]);
18920       HOST_WIDE_INT cf = INTVAL (operands[3]);
18921       HOST_WIDE_INT diff;
18922 
18923       diff = ct - cf;
18924       /* Sign bit compares are better done using shifts than by using
18925 	 sbb.  */
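      /* (For the sign bit case, emit_store_flag with a -1 "normalize" value
	 typically expands to an arithmetic right shift of the sign bit, as in
	 the notl/sarl sequences sketched in the comments further below.)  */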
18926       if (sign_bit_compare_p
18927 	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18928 	{
18929 	  /* Detect overlap between destination and compare sources.  */
18930 	  rtx tmp = out;
18931 
18932           if (!sign_bit_compare_p)
18933 	    {
18934 	      rtx flags;
18935 	      bool fpcmp = false;
18936 
18937 	      compare_code = GET_CODE (compare_op);
18938 
18939 	      flags = XEXP (compare_op, 0);
18940 
18941 	      if (GET_MODE (flags) == CCFPmode
18942 		  || GET_MODE (flags) == CCFPUmode)
18943 		{
18944 		  fpcmp = true;
18945 		  compare_code
18946 		    = ix86_fp_compare_code_to_integer (compare_code);
18947 		}
18948 
18949 	      /* To simplify rest of code, restrict to the GEU case.  */
18950 	      if (compare_code == LTU)
18951 		{
18952 		  HOST_WIDE_INT tmp = ct;
18953 		  ct = cf;
18954 		  cf = tmp;
18955 		  compare_code = reverse_condition (compare_code);
18956 		  code = reverse_condition (code);
18957 		}
18958 	      else
18959 		{
18960 		  if (fpcmp)
18961 		    PUT_CODE (compare_op,
18962 			      reverse_condition_maybe_unordered
18963 			        (GET_CODE (compare_op)));
18964 		  else
18965 		    PUT_CODE (compare_op,
18966 			      reverse_condition (GET_CODE (compare_op)));
18967 		}
18968 	      diff = ct - cf;
18969 
18970 	      if (reg_overlap_mentioned_p (out, op0)
18971 		  || reg_overlap_mentioned_p (out, op1))
18972 		tmp = gen_reg_rtx (mode);
18973 
18974 	      if (mode == DImode)
18975 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18976 	      else
18977 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
18978 						 flags, compare_op));
18979 	    }
18980 	  else
18981 	    {
18982 	      if (code == GT || code == GE)
18983 		code = reverse_condition (code);
18984 	      else
18985 		{
18986 		  HOST_WIDE_INT tmp = ct;
18987 		  ct = cf;
18988 		  cf = tmp;
18989 		  diff = ct - cf;
18990 		}
18991 	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18992 	    }
18993 
18994 	  if (diff == 1)
18995 	    {
18996 	      /*
18997 	       * cmpl op0,op1
18998 	       * sbbl dest,dest
18999 	       * [addl dest, ct]
19000 	       *
19001 	       * Size 5 - 8.
19002 	       */
19003 	      if (ct)
19004 		tmp = expand_simple_binop (mode, PLUS,
19005 					   tmp, GEN_INT (ct),
19006 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
19007 	    }
19008 	  else if (cf == -1)
19009 	    {
19010 	      /*
19011 	       * cmpl op0,op1
19012 	       * sbbl dest,dest
19013 	       * orl $ct, dest
19014 	       *
19015 	       * Size 8.
19016 	       */
19017 	      tmp = expand_simple_binop (mode, IOR,
19018 					 tmp, GEN_INT (ct),
19019 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
19020 	    }
19021 	  else if (diff == -1 && ct)
19022 	    {
19023 	      /*
19024 	       * cmpl op0,op1
19025 	       * sbbl dest,dest
19026 	       * notl dest
19027 	       * [addl dest, cf]
19028 	       *
19029 	       * Size 8 - 11.
19030 	       */
19031 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19032 	      if (cf)
19033 		tmp = expand_simple_binop (mode, PLUS,
19034 					   copy_rtx (tmp), GEN_INT (cf),
19035 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
19036 	    }
19037 	  else
19038 	    {
19039 	      /*
19040 	       * cmpl op0,op1
19041 	       * sbbl dest,dest
19042 	       * [notl dest]
19043 	       * andl cf - ct, dest
19044 	       * [addl dest, ct]
19045 	       *
19046 	       * Size 8 - 11.
19047 	       */
19048 
19049 	      if (cf == 0)
19050 		{
19051 		  cf = ct;
19052 		  ct = 0;
19053 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19054 		}
19055 
19056 	      tmp = expand_simple_binop (mode, AND,
19057 					 copy_rtx (tmp),
19058 					 gen_int_mode (cf - ct, mode),
19059 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
19060 	      if (ct)
19061 		tmp = expand_simple_binop (mode, PLUS,
19062 					   copy_rtx (tmp), GEN_INT (ct),
19063 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
19064 	    }
19065 
19066 	  if (!rtx_equal_p (tmp, out))
19067 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19068 
19069 	  return true;
19070 	}
19071 
19072       if (diff < 0)
19073 	{
19074 	  enum machine_mode cmp_mode = GET_MODE (op0);
19075 
19076 	  HOST_WIDE_INT tmp;
19077 	  tmp = ct, ct = cf, cf = tmp;
19078 	  diff = -diff;
19079 
19080 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
19081 	    {
19082 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19083 
19084 	      /* We may be reversing an unordered compare to a normal compare,
19085 		 which is not valid in general (we may convert a non-trapping
19086 		 condition to a trapping one); however, on i386 we currently
19087 		 emit all comparisons unordered.  */
19088 	      compare_code = reverse_condition_maybe_unordered (compare_code);
19089 	      code = reverse_condition_maybe_unordered (code);
19090 	    }
19091 	  else
19092 	    {
19093 	      compare_code = reverse_condition (compare_code);
19094 	      code = reverse_condition (code);
19095 	    }
19096 	}
19097 
19098       compare_code = UNKNOWN;
19099       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19100 	  && CONST_INT_P (op1))
19101 	{
19102 	  if (op1 == const0_rtx
19103 	      && (code == LT || code == GE))
19104 	    compare_code = code;
19105 	  else if (op1 == constm1_rtx)
19106 	    {
19107 	      if (code == LE)
19108 		compare_code = LT;
19109 	      else if (code == GT)
19110 		compare_code = GE;
19111 	    }
19112 	}
19113 
19114       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
19115       if (compare_code != UNKNOWN
19116 	  && GET_MODE (op0) == GET_MODE (out)
19117 	  && (cf == -1 || ct == -1))
19118 	{
19119 	  /* If the lea code below could be used, only optimize
19120 	     if it results in a 2-insn sequence.  */
19121 
19122 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19123 		 || diff == 3 || diff == 5 || diff == 9)
19124 	      || (compare_code == LT && ct == -1)
19125 	      || (compare_code == GE && cf == -1))
19126 	    {
19127 	      /*
19128 	       * notl op1	(if necessary)
19129 	       * sarl $31, op1
19130 	       * orl cf, op1
19131 	       */
19132 	      if (ct != -1)
19133 		{
19134 		  cf = ct;
19135 		  ct = -1;
19136 		  code = reverse_condition (code);
19137 		}
19138 
19139 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19140 
19141 	      out = expand_simple_binop (mode, IOR,
19142 					 out, GEN_INT (cf),
19143 					 out, 1, OPTAB_DIRECT);
19144 	      if (out != operands[0])
19145 		emit_move_insn (operands[0], out);
19146 
19147 	      return true;
19148 	    }
19149 	}
19150 
19151 
19152       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19153 	   || diff == 3 || diff == 5 || diff == 9)
19154 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19155 	  && (mode != DImode
19156 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19157 	{
19158 	  /*
19159 	   * xorl dest,dest
19160 	   * cmpl op1,op2
19161 	   * setcc dest
19162 	   * lea cf(dest*(ct-cf)),dest
19163 	   *
19164 	   * Size 14.
19165 	   *
19166 	   * This also catches the degenerate setcc-only case.
19167 	   */
19168 
19169 	  rtx tmp;
19170 	  int nops;
19171 
19172 	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19173 
19174 	  nops = 0;
19175 	  /* On x86_64 the lea instruction operates on Pmode, so we need
19176 	     to get the arithmetic done in the proper mode to match.  */
19177 	  if (diff == 1)
19178 	    tmp = copy_rtx (out);
19179 	  else
19180 	    {
19181 	      rtx out1;
19182 	      out1 = copy_rtx (out);
19183 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19184 	      nops++;
19185 	      if (diff & 1)
19186 		{
19187 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
19188 		  nops++;
19189 		}
19190 	    }
19191 	  if (cf != 0)
19192 	    {
19193 	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19194 	      nops++;
19195 	    }
19196 	  if (!rtx_equal_p (tmp, out))
19197 	    {
19198 	      if (nops == 1)
19199 		out = force_operand (tmp, copy_rtx (out));
19200 	      else
19201 		emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19202 	    }
19203 	  if (!rtx_equal_p (out, operands[0]))
19204 	    emit_move_insn (operands[0], copy_rtx (out));
19205 
19206 	  return true;
19207 	}
19208 
19209       /*
19210        * General case:			Jumpful:
19211        *   xorl dest,dest		cmpl op1, op2
19212        *   cmpl op1, op2		movl ct, dest
19213        *   setcc dest			jcc 1f
19214        *   decl dest			movl cf, dest
19215        *   andl (cf-ct),dest		1:
19216        *   addl ct,dest
19217        *
19218        * Size 20.			Size 14.
19219        *
19220        * This is reasonably steep, but branch mispredict costs are
19221        * high on modern cpus, so consider failing only if optimizing
19222        * for space.
19223        */
19224 
19225       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19226 	  && BRANCH_COST (optimize_insn_for_speed_p (),
19227 		  	  false) >= 2)
19228 	{
19229 	  if (cf == 0)
19230 	    {
19231 	      enum machine_mode cmp_mode = GET_MODE (op0);
19232 
19233 	      cf = ct;
19234 	      ct = 0;
19235 
19236 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
19237 		{
19238 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19239 
19240 		  /* We may be reversing an unordered compare to a normal
19241 		     compare, which is not valid in general (we may convert a
19242 		     non-trapping condition to a trapping one); however, on
19243 		     i386 we currently emit all comparisons unordered.  */
19244 		  code = reverse_condition_maybe_unordered (code);
19245 		}
19246 	      else
19247 		{
19248 		  code = reverse_condition (code);
19249 		  if (compare_code != UNKNOWN)
19250 		    compare_code = reverse_condition (compare_code);
19251 		}
19252 	    }
19253 
19254 	  if (compare_code != UNKNOWN)
19255 	    {
19256 	      /* notl op1	(if needed)
19257 		 sarl $31, op1
19258 		 andl (cf-ct), op1
19259 		 addl ct, op1
19260 
19261 		 For x < 0 (resp. x <= -1) there will be no notl,
19262 		 so if possible swap the constants to get rid of the
19263 		 complement.
19264 		 True/false will be -1/0 while code below (store flag
19265 		 followed by decrement) is 0/-1, so the constants need
19266 		 to be exchanged once more.  */
19267 
19268 	      if (compare_code == GE || !cf)
19269 		{
19270 		  code = reverse_condition (code);
19271 		  compare_code = LT;
19272 		}
19273 	      else
19274 		{
19275 		  HOST_WIDE_INT tmp = cf;
19276 		  cf = ct;
19277 		  ct = tmp;
19278 		}
19279 
19280 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19281 	    }
19282 	  else
19283 	    {
19284 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19285 
19286 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19287 					 constm1_rtx,
19288 					 copy_rtx (out), 1, OPTAB_DIRECT);
19289 	    }
19290 
19291 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
19292 				     gen_int_mode (cf - ct, mode),
19293 				     copy_rtx (out), 1, OPTAB_DIRECT);
19294 	  if (ct)
19295 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19296 				       copy_rtx (out), 1, OPTAB_DIRECT);
19297 	  if (!rtx_equal_p (out, operands[0]))
19298 	    emit_move_insn (operands[0], copy_rtx (out));
19299 
19300 	  return true;
19301 	}
19302     }
19303 
19304   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19305     {
19306       /* Try a few things more with specific constants and a variable.  */
19307 
19308       optab op;
19309       rtx var, orig_out, out, tmp;
19310 
19311       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19312 	return false;
19313 
19314       /* If one of the two operands is an interesting constant, compute a 0/-1
19315 	 mask by recursing and mask the variable in with a logical operation.  */
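      /* E.g. (cond ? 0 : var) is computed as ((cond ? 0 : -1) & var) and
	 (cond ? -1 : var) as ((cond ? -1 : 0) | var); the recursive call
	 below loads the 0/-1 value and expand_binop masks VAR in.  */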
19316 
19317       if (CONST_INT_P (operands[2]))
19318 	{
19319 	  var = operands[3];
19320 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19321 	    operands[3] = constm1_rtx, op = and_optab;
19322 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19323 	    operands[3] = const0_rtx, op = ior_optab;
19324 	  else
19325 	    return false;
19326 	}
19327       else if (CONST_INT_P (operands[3]))
19328 	{
19329 	  var = operands[2];
19330 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19331 	    operands[2] = constm1_rtx, op = and_optab;
19332 	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19333 	    operands[2] = const0_rtx, op = ior_optab;
19334 	  else
19335 	    return false;
19336 	}
19337       else
19338         return false;
19339 
19340       orig_out = operands[0];
19341       tmp = gen_reg_rtx (mode);
19342       operands[0] = tmp;
19343 
19344       /* Recurse to get the constant loaded.  */
19345       if (ix86_expand_int_movcc (operands) == 0)
19346         return false;
19347 
19348       /* Mask in the interesting variable.  */
19349       out = expand_binop (mode, op, var, tmp, orig_out, 0,
19350 			  OPTAB_WIDEN);
19351       if (!rtx_equal_p (out, orig_out))
19352 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19353 
19354       return true;
19355     }
19356 
19357   /*
19358    * For comparison with above,
19359    *
19360    * movl cf,dest
19361    * movl ct,tmp
19362    * cmpl op1,op2
19363    * cmovcc tmp,dest
19364    *
19365    * Size 15.
19366    */
19367 
19368   if (! nonimmediate_operand (operands[2], mode))
19369     operands[2] = force_reg (mode, operands[2]);
19370   if (! nonimmediate_operand (operands[3], mode))
19371     operands[3] = force_reg (mode, operands[3]);
19372 
19373   if (! register_operand (operands[2], VOIDmode)
19374       && (mode == QImode
19375           || ! register_operand (operands[3], VOIDmode)))
19376     operands[2] = force_reg (mode, operands[2]);
19377 
19378   if (mode == QImode
19379       && ! register_operand (operands[3], VOIDmode))
19380     operands[3] = force_reg (mode, operands[3]);
19381 
19382   emit_insn (compare_seq);
19383   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19384 			  gen_rtx_IF_THEN_ELSE (mode,
19385 						compare_op, operands[2],
19386 						operands[3])));
19387   return true;
19388 }
19389 
19390 /* Swap, force into registers, or otherwise massage the two operands
19391    to an sse comparison with a mask result.  Thus we differ a bit from
19392    ix86_prepare_fp_compare_args which expects to produce a flags result.
19393 
19394    The DEST operand exists to help determine whether to commute commutative
19395    operators.  The POP0/POP1 operands are updated in place.  The new
19396    comparison code is returned, or UNKNOWN if not implementable.  */
19397 
19398 static enum rtx_code
19399 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19400 				  rtx *pop0, rtx *pop1)
19401 {
19402   rtx tmp;
19403 
19404   switch (code)
19405     {
19406     case LTGT:
19407     case UNEQ:
19408       /* AVX supports all the needed comparisons.  */
19409       if (TARGET_AVX)
19410 	break;
19411       /* We have no LTGT as an operator.  We could implement it with
19412 	 NE & ORDERED, but this requires an extra temporary.  It's
19413 	 not clear that it's worth it.  */
19414       return UNKNOWN;
19415 
19416     case LT:
19417     case LE:
19418     case UNGT:
19419     case UNGE:
19420       /* These are supported directly.  */
19421       break;
19422 
19423     case EQ:
19424     case NE:
19425     case UNORDERED:
19426     case ORDERED:
19427       /* AVX has 3 operand comparisons, no need to swap anything.  */
19428       if (TARGET_AVX)
19429 	break;
19430       /* For commutative operators, try to canonicalize the destination
19431 	 operand to be first in the comparison - this helps reload to
19432 	 avoid extra moves.  */
19433       if (!dest || !rtx_equal_p (dest, *pop1))
19434 	break;
19435       /* FALLTHRU */
19436 
19437     case GE:
19438     case GT:
19439     case UNLE:
19440     case UNLT:
19441       /* These are not supported directly before AVX, and furthermore
19442 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
19443 	 comparison operands to transform into something that is
19444 	 supported.  */
19445       tmp = *pop0;
19446       *pop0 = *pop1;
19447       *pop1 = tmp;
19448       code = swap_condition (code);
19449       break;
19450 
19451     default:
19452       gcc_unreachable ();
19453     }
19454 
19455   return code;
19456 }
19457 
19458 /* Detect conditional moves that exactly match min/max operational
19459    semantics.  Note that this is IEEE safe, as long as we don't
19460    interchange the operands.
19461 
19462    Returns FALSE if this conditional move doesn't match a MIN/MAX,
19463    and TRUE if the operation is successful and instructions are emitted.  */
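/* (Only the shapes "a < b ? a : b" and "a < b ? b : a" are recognized here,
   with UNGE handled by first swapping the two arms; anything else returns
   FALSE and the caller falls back to the general compare + movcc path.)  */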
19464 
19465 static bool
19466 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19467 			   rtx cmp_op1, rtx if_true, rtx if_false)
19468 {
19469   enum machine_mode mode;
19470   bool is_min;
19471   rtx tmp;
19472 
19473   if (code == LT)
19474     ;
19475   else if (code == UNGE)
19476     {
19477       tmp = if_true;
19478       if_true = if_false;
19479       if_false = tmp;
19480     }
19481   else
19482     return false;
19483 
19484   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19485     is_min = true;
19486   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19487     is_min = false;
19488   else
19489     return false;
19490 
19491   mode = GET_MODE (dest);
19492 
19493   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19494      but MODE may be a vector mode and thus not appropriate.  */
19495   if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19496     {
19497       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19498       rtvec v;
19499 
19500       if_true = force_reg (mode, if_true);
19501       v = gen_rtvec (2, if_true, if_false);
19502       tmp = gen_rtx_UNSPEC (mode, v, u);
19503     }
19504   else
19505     {
19506       code = is_min ? SMIN : SMAX;
19507       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19508     }
19509 
19510   emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19511   return true;
19512 }
19513 
19514 /* Expand an sse vector comparison.  Return the register with the result.  */
19515 
19516 static rtx
19517 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19518 		     rtx op_true, rtx op_false)
19519 {
19520   enum machine_mode mode = GET_MODE (dest);
19521   enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19522   rtx x;
19523 
19524   cmp_op0 = force_reg (cmp_mode, cmp_op0);
19525   if (!nonimmediate_operand (cmp_op1, cmp_mode))
19526     cmp_op1 = force_reg (cmp_mode, cmp_op1);
19527 
19528   if (optimize
19529       || reg_overlap_mentioned_p (dest, op_true)
19530       || reg_overlap_mentioned_p (dest, op_false))
19531     dest = gen_reg_rtx (mode);
19532 
19533   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19534   if (cmp_mode != mode)
19535     {
19536       x = force_reg (cmp_mode, x);
19537       convert_move (dest, x, false);
19538     }
19539   else
19540     emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19541 
19542   return dest;
19543 }
19544 
19545 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19546    operations.  This is used for both scalar and vector conditional moves.  */
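/* In the general fallback below (when no blend instruction applies) the
   expansion is, in effect:

       dest = (cmp & op_true) | (~cmp & op_false)

   assuming CMP is the usual per-element all-ones / all-zeros mask produced
   by ix86_expand_sse_cmp; the earlier special cases simply drop whichever
   term is known to be zero or all-ones.  */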
19547 
19548 static void
19549 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19550 {
19551   enum machine_mode mode = GET_MODE (dest);
19552   rtx t2, t3, x;
19553 
19554   if (vector_all_ones_operand (op_true, mode)
19555       && rtx_equal_p (op_false, CONST0_RTX (mode)))
19556     {
19557       emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19558     }
19559   else if (op_false == CONST0_RTX (mode))
19560     {
19561       op_true = force_reg (mode, op_true);
19562       x = gen_rtx_AND (mode, cmp, op_true);
19563       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19564     }
19565   else if (op_true == CONST0_RTX (mode))
19566     {
19567       op_false = force_reg (mode, op_false);
19568       x = gen_rtx_NOT (mode, cmp);
19569       x = gen_rtx_AND (mode, x, op_false);
19570       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19571     }
19572   else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19573     {
19574       op_false = force_reg (mode, op_false);
19575       x = gen_rtx_IOR (mode, cmp, op_false);
19576       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19577     }
19578   else if (TARGET_XOP)
19579     {
19580       op_true = force_reg (mode, op_true);
19581 
19582       if (!nonimmediate_operand (op_false, mode))
19583 	op_false = force_reg (mode, op_false);
19584 
19585       emit_insn (gen_rtx_SET (mode, dest,
19586 			      gen_rtx_IF_THEN_ELSE (mode, cmp,
19587 						    op_true,
19588 						    op_false)));
19589     }
19590   else
19591     {
19592       rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19593 
19594       if (!nonimmediate_operand (op_true, mode))
19595 	op_true = force_reg (mode, op_true);
19596 
19597       op_false = force_reg (mode, op_false);
19598 
19599       switch (mode)
19600 	{
19601 	case V4SFmode:
19602 	  if (TARGET_SSE4_1)
19603 	    gen = gen_sse4_1_blendvps;
19604 	  break;
19605 	case V2DFmode:
19606 	  if (TARGET_SSE4_1)
19607 	    gen = gen_sse4_1_blendvpd;
19608 	  break;
19609 	case V16QImode:
19610 	case V8HImode:
19611 	case V4SImode:
19612 	case V2DImode:
19613 	  if (TARGET_SSE4_1)
19614 	    {
19615 	      gen = gen_sse4_1_pblendvb;
19616 	      dest = gen_lowpart (V16QImode, dest);
19617 	      op_false = gen_lowpart (V16QImode, op_false);
19618 	      op_true = gen_lowpart (V16QImode, op_true);
19619 	      cmp = gen_lowpart (V16QImode, cmp);
19620 	    }
19621 	  break;
19622 	case V8SFmode:
19623 	  if (TARGET_AVX)
19624 	    gen = gen_avx_blendvps256;
19625 	  break;
19626 	case V4DFmode:
19627 	  if (TARGET_AVX)
19628 	    gen = gen_avx_blendvpd256;
19629 	  break;
19630 	case V32QImode:
19631 	case V16HImode:
19632 	case V8SImode:
19633 	case V4DImode:
19634 	  if (TARGET_AVX2)
19635 	    {
19636 	      gen = gen_avx2_pblendvb;
19637 	      dest = gen_lowpart (V32QImode, dest);
19638 	      op_false = gen_lowpart (V32QImode, op_false);
19639 	      op_true = gen_lowpart (V32QImode, op_true);
19640 	      cmp = gen_lowpart (V32QImode, cmp);
19641 	    }
19642 	  break;
19643 	default:
19644 	  break;
19645 	}
19646 
19647       if (gen != NULL)
19648 	emit_insn (gen (dest, op_false, op_true, cmp));
19649       else
19650 	{
19651 	  op_true = force_reg (mode, op_true);
19652 
19653 	  t2 = gen_reg_rtx (mode);
19654 	  if (optimize)
19655 	    t3 = gen_reg_rtx (mode);
19656 	  else
19657 	    t3 = dest;
19658 
19659 	  x = gen_rtx_AND (mode, op_true, cmp);
19660 	  emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19661 
19662 	  x = gen_rtx_NOT (mode, cmp);
19663 	  x = gen_rtx_AND (mode, x, op_false);
19664 	  emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19665 
19666 	  x = gen_rtx_IOR (mode, t3, t2);
19667 	  emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19668 	}
19669     }
19670 }
19671 
19672 /* Expand a floating-point conditional move.  Return true if successful.  */
19673 
19674 bool
19675 ix86_expand_fp_movcc (rtx operands[])
19676 {
19677   enum machine_mode mode = GET_MODE (operands[0]);
19678   enum rtx_code code = GET_CODE (operands[1]);
19679   rtx tmp, compare_op;
19680   rtx op0 = XEXP (operands[1], 0);
19681   rtx op1 = XEXP (operands[1], 1);
19682 
19683   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19684     {
19685       enum machine_mode cmode;
19686 
19687       /* Since we have no cmove for sse registers, don't force bad register
19688 	 allocation just to gain access to it.  Deny movcc when the
19689 	 comparison mode doesn't match the move mode.  */
19690       cmode = GET_MODE (op0);
19691       if (cmode == VOIDmode)
19692 	cmode = GET_MODE (op1);
19693       if (cmode != mode)
19694 	return false;
19695 
19696       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19697       if (code == UNKNOWN)
19698 	return false;
19699 
19700       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19701 				     operands[2], operands[3]))
19702 	return true;
19703 
19704       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19705 				 operands[2], operands[3]);
19706       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19707       return true;
19708     }
19709 
19710   /* The floating point conditional move instructions don't directly
19711      support conditions resulting from a signed integer comparison.  */
19712 
19713   compare_op = ix86_expand_compare (code, op0, op1);
19714   if (!fcmov_comparison_operator (compare_op, VOIDmode))
19715     {
19716       tmp = gen_reg_rtx (QImode);
19717       ix86_expand_setcc (tmp, code, op0, op1);
19718 
19719       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19720     }
19721 
19722   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19723 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
19724 						operands[2], operands[3])));
19725 
19726   return true;
19727 }
19728 
19729 /* Expand a floating-point vector conditional move; a vcond operation
19730    rather than a movcc operation.  */
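/* Note that the UNKNOWN cases handled below decompose the predicates
   that have no direct SSE encoding into two compares whose masks are
   then combined, e.g. LTGT(a,b) = ORDERED(a,b) & NE(a,b) and
   UNEQ(a,b) = UNORDERED(a,b) | EQ(a,b).  */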
19731 
19732 bool
19733 ix86_expand_fp_vcond (rtx operands[])
19734 {
19735   enum rtx_code code = GET_CODE (operands[3]);
19736   rtx cmp;
19737 
19738   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19739 					   &operands[4], &operands[5]);
19740   if (code == UNKNOWN)
19741     {
19742       rtx temp;
19743       switch (GET_CODE (operands[3]))
19744 	{
19745 	case LTGT:
19746 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19747 				      operands[5], operands[0], operands[0]);
19748 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19749 				     operands[5], operands[1], operands[2]);
19750 	  code = AND;
19751 	  break;
19752 	case UNEQ:
19753 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19754 				      operands[5], operands[0], operands[0]);
19755 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19756 				     operands[5], operands[1], operands[2]);
19757 	  code = IOR;
19758 	  break;
19759 	default:
19760 	  gcc_unreachable ();
19761 	}
19762       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19763 				 OPTAB_DIRECT);
19764       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19765       return true;
19766     }
19767 
19768   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19769 				 operands[5], operands[1], operands[2]))
19770     return true;
19771 
19772   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19773 			     operands[1], operands[2]);
19774   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19775   return true;
19776 }
19777 
19778 /* Expand a signed/unsigned integral vector conditional move.  */
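/* Note: only EQ, GT and GTU map directly to pcmpeq/pcmpgt (outside of
   XOP).  The canonicalization below therefore rewrites e.g. LE as GT
   with the two value arms exchanged (the NEGATE flag), and GE first as
   LT (again exchanging the arms) and then as GT by swapping the
   comparison operands.  */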
19779 
19780 bool
19781 ix86_expand_int_vcond (rtx operands[])
19782 {
19783   enum machine_mode data_mode = GET_MODE (operands[0]);
19784   enum machine_mode mode = GET_MODE (operands[4]);
19785   enum rtx_code code = GET_CODE (operands[3]);
19786   bool negate = false;
19787   rtx x, cop0, cop1;
19788 
19789   cop0 = operands[4];
19790   cop1 = operands[5];
19791 
19792   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19793      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
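  /* E.g. for V4SImode elements the shift count is 31, so "x < 0 ? -1 : 0"
     is just the arithmetic shift x >> 31 (all-ones or all-zeros per
     element) and "x < 0 ? 1 : 0" is the logical shift x >> 31, with no
     compare instruction needed at all.  */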
19794   if ((code == LT || code == GE)
19795       && data_mode == mode
19796       && cop1 == CONST0_RTX (mode)
19797       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19798       && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19799       && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19800       && (GET_MODE_SIZE (data_mode) == 16
19801 	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19802     {
19803       rtx negop = operands[2 - (code == LT)];
19804       int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19805       if (negop == CONST1_RTX (data_mode))
19806 	{
19807 	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19808 					 operands[0], 1, OPTAB_DIRECT);
19809 	  if (res != operands[0])
19810 	    emit_move_insn (operands[0], res);
19811 	  return true;
19812 	}
19813       else if (GET_MODE_INNER (data_mode) != DImode
19814 	       && vector_all_ones_operand (negop, data_mode))
19815 	{
19816 	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19817 					 operands[0], 0, OPTAB_DIRECT);
19818 	  if (res != operands[0])
19819 	    emit_move_insn (operands[0], res);
19820 	  return true;
19821 	}
19822     }
19823 
19824   if (!nonimmediate_operand (cop1, mode))
19825     cop1 = force_reg (mode, cop1);
19826   if (!general_operand (operands[1], data_mode))
19827     operands[1] = force_reg (data_mode, operands[1]);
19828   if (!general_operand (operands[2], data_mode))
19829     operands[2] = force_reg (data_mode, operands[2]);
19830 
19831   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
19832   if (TARGET_XOP
19833       && (mode == V16QImode || mode == V8HImode
19834 	  || mode == V4SImode || mode == V2DImode))
19835     ;
19836   else
19837     {
19838       /* Canonicalize the comparison to EQ, GT, GTU.  */
19839       switch (code)
19840 	{
19841 	case EQ:
19842 	case GT:
19843 	case GTU:
19844 	  break;
19845 
19846 	case NE:
19847 	case LE:
19848 	case LEU:
19849 	  code = reverse_condition (code);
19850 	  negate = true;
19851 	  break;
19852 
19853 	case GE:
19854 	case GEU:
19855 	  code = reverse_condition (code);
19856 	  negate = true;
19857 	  /* FALLTHRU */
19858 
19859 	case LT:
19860 	case LTU:
19861 	  code = swap_condition (code);
19862 	  x = cop0, cop0 = cop1, cop1 = x;
19863 	  break;
19864 
19865 	default:
19866 	  gcc_unreachable ();
19867 	}
19868 
19869       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
19870       if (mode == V2DImode)
19871 	{
19872 	  switch (code)
19873 	    {
19874 	    case EQ:
19875 	      /* SSE4.1 supports EQ.  */
19876 	      if (!TARGET_SSE4_1)
19877 		return false;
19878 	      break;
19879 
19880 	    case GT:
19881 	    case GTU:
19882 	      /* SSE4.2 supports GT/GTU.  */
19883 	      if (!TARGET_SSE4_2)
19884 		return false;
19885 	      break;
19886 
19887 	    default:
19888 	      gcc_unreachable ();
19889 	    }
19890 	}
19891 
19892       /* Unsigned parallel compare is not supported by the hardware.
19893 	 Play some tricks to turn this into a signed comparison
19894 	 against 0.  */
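      /* E.g. for V4SImode, x GTU y is computed as
	 (x - 0x80000000) GT (y - 0x80000000) using the signed compare,
	 while for V16QImode/V8HImode it is computed as (x -us y) != 0,
	 i.e. an EQ test against zero whose result is then negated.  */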
19895       if (code == GTU)
19896 	{
19897 	  cop0 = force_reg (mode, cop0);
19898 
19899 	  switch (mode)
19900 	    {
19901 	    case V8SImode:
19902 	    case V4DImode:
19903 	    case V4SImode:
19904 	    case V2DImode:
19905 		{
19906 		  rtx t1, t2, mask;
19907 		  rtx (*gen_sub3) (rtx, rtx, rtx);
19908 
19909 		  switch (mode)
19910 		    {
19911 		    case V8SImode: gen_sub3 = gen_subv8si3; break;
19912 		    case V4DImode: gen_sub3 = gen_subv4di3; break;
19913 		    case V4SImode: gen_sub3 = gen_subv4si3; break;
19914 		    case V2DImode: gen_sub3 = gen_subv2di3; break;
19915 		    default:
19916 		      gcc_unreachable ();
19917 		    }
19918 		  /* Subtract (-(INT MAX) - 1) from both operands to make
19919 		     them signed.  */
19920 		  mask = ix86_build_signbit_mask (mode, true, false);
19921 		  t1 = gen_reg_rtx (mode);
19922 		  emit_insn (gen_sub3 (t1, cop0, mask));
19923 
19924 		  t2 = gen_reg_rtx (mode);
19925 		  emit_insn (gen_sub3 (t2, cop1, mask));
19926 
19927 		  cop0 = t1;
19928 		  cop1 = t2;
19929 		  code = GT;
19930 		}
19931 	      break;
19932 
19933 	    case V32QImode:
19934 	    case V16HImode:
19935 	    case V16QImode:
19936 	    case V8HImode:
19937 	      /* Perform a parallel unsigned saturating subtraction.  */
19938 	      x = gen_reg_rtx (mode);
19939 	      emit_insn (gen_rtx_SET (VOIDmode, x,
19940 				      gen_rtx_US_MINUS (mode, cop0, cop1)));
19941 
19942 	      cop0 = x;
19943 	      cop1 = CONST0_RTX (mode);
19944 	      code = EQ;
19945 	      negate = !negate;
19946 	      break;
19947 
19948 	    default:
19949 	      gcc_unreachable ();
19950 	    }
19951 	}
19952     }
19953 
19954   /* Allow the comparison to be done in one mode, but the movcc to
19955      happen in another mode.  */
19956   if (data_mode == mode)
19957     {
19958       x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19959 			       operands[1+negate], operands[2-negate]);
19960     }
19961   else
19962     {
19963       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19964       x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19965 			       code, cop0, cop1,
19966 			       operands[1+negate], operands[2-negate]);
19967       x = gen_lowpart (data_mode, x);
19968     }
19969 
19970   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19971 			 operands[2-negate]);
19972   return true;
19973 }
19974 
19975 /* Expand a variable vector permutation.  */
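/* Roughly: operands[0] is the target, operands[1] and operands[2] the
   two input vectors and operands[3] a variable selector; element I of
   the result is element SEL[I] of the concatenation { op1, op2 }, with
   the selector taken modulo 2*w (or modulo w when both inputs are the
   same register).  */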
19976 
19977 void
19978 ix86_expand_vec_perm (rtx operands[])
19979 {
19980   rtx target = operands[0];
19981   rtx op0 = operands[1];
19982   rtx op1 = operands[2];
19983   rtx mask = operands[3];
19984   rtx t1, t2, t3, t4, vt, vt2, vec[32];
19985   enum machine_mode mode = GET_MODE (op0);
19986   enum machine_mode maskmode = GET_MODE (mask);
19987   int w, e, i;
19988   bool one_operand_shuffle = rtx_equal_p (op0, op1);
19989 
19990   /* Number of elements in the vector.  */
19991   w = GET_MODE_NUNITS (mode);
19992   e = GET_MODE_UNIT_SIZE (mode);
19993   gcc_assert (w <= 32);
19994 
19995   if (TARGET_AVX2)
19996     {
19997       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19998 	{
19999 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20000 	     a constant shuffle operand.  With a tiny bit of effort we can
20001 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
20002 	     unfortunate but there's no avoiding it.
20003 	     Similarly for V16HImode we don't have instructions for variable
20004 	     shuffling, while for V32QImode, after preparing suitable masks,
20005 	     we can use vpshufb; vpshufb; vpermq; vpor.  */
20006 
20007 	  if (mode == V16HImode)
20008 	    {
20009 	      maskmode = mode = V32QImode;
20010 	      w = 32;
20011 	      e = 1;
20012 	    }
20013 	  else
20014 	    {
20015 	      maskmode = mode = V8SImode;
20016 	      w = 8;
20017 	      e = 4;
20018 	    }
20019 	  t1 = gen_reg_rtx (maskmode);
20020 
20021 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
20022 	       mask = { A B C D }
20023 	       t1 = { A A B B C C D D }.  */
20024 	  for (i = 0; i < w / 2; ++i)
20025 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20026 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20027 	  vt = force_reg (maskmode, vt);
20028 	  mask = gen_lowpart (maskmode, mask);
20029 	  if (maskmode == V8SImode)
20030 	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20031 	  else
20032 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20033 
20034 	  /* Multiply the shuffle indices by two.  */
20035 	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20036 				    OPTAB_DIRECT);
20037 
20038 	  /* Add one to the odd shuffle indices:
20039 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
20040 	  for (i = 0; i < w / 2; ++i)
20041 	    {
20042 	      vec[i * 2] = const0_rtx;
20043 	      vec[i * 2 + 1] = const1_rtx;
20044 	    }
20045 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20046 	  vt = validize_mem (force_const_mem (maskmode, vt));
20047 	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20048 				    OPTAB_DIRECT);
20049 
20050 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
20051 	  operands[3] = mask = t1;
20052 	  target = gen_lowpart (mode, target);
20053 	  op0 = gen_lowpart (mode, op0);
20054 	  op1 = gen_lowpart (mode, op1);
20055 	}
20056 
20057       switch (mode)
20058 	{
20059 	case V8SImode:
20060 	  /* The VPERMD and VPERMPS instructions already properly ignore
20061 	     the high bits of the shuffle elements.  No need for us to
20062 	     perform an AND ourselves.  */
20063 	  if (one_operand_shuffle)
20064 	    emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20065 	  else
20066 	    {
20067 	      t1 = gen_reg_rtx (V8SImode);
20068 	      t2 = gen_reg_rtx (V8SImode);
20069 	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20070 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20071 	      goto merge_two;
20072 	    }
20073 	  return;
20074 
20075 	case V8SFmode:
20076 	  mask = gen_lowpart (V8SImode, mask);
20077 	  if (one_operand_shuffle)
20078 	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20079 	  else
20080 	    {
20081 	      t1 = gen_reg_rtx (V8SFmode);
20082 	      t2 = gen_reg_rtx (V8SFmode);
20083 	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20084 	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20085 	      goto merge_two;
20086 	    }
20087 	  return;
20088 
20089         case V4SImode:
20090 	  /* By combining the two 128-bit input vectors into one 256-bit
20091 	     input vector, we can use VPERMD and VPERMPS for the full
20092 	     two-operand shuffle.  */
20093 	  t1 = gen_reg_rtx (V8SImode);
20094 	  t2 = gen_reg_rtx (V8SImode);
20095 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20096 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20097 	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20098 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20099 	  return;
20100 
20101         case V4SFmode:
20102 	  t1 = gen_reg_rtx (V8SFmode);
20103 	  t2 = gen_reg_rtx (V8SImode);
20104 	  mask = gen_lowpart (V4SImode, mask);
20105 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20106 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20107 	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20108 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20109 	  return;
20110 
20111 	case V32QImode:
20112 	  t1 = gen_reg_rtx (V32QImode);
20113 	  t2 = gen_reg_rtx (V32QImode);
20114 	  t3 = gen_reg_rtx (V32QImode);
20115 	  vt2 = GEN_INT (128);
20116 	  for (i = 0; i < 32; i++)
20117 	    vec[i] = vt2;
20118 	  vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20119 	  vt = force_reg (V32QImode, vt);
20120 	  for (i = 0; i < 32; i++)
20121 	    vec[i] = i < 16 ? vt2 : const0_rtx;
20122 	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20123 	  vt2 = force_reg (V32QImode, vt2);
20124 	  /* From mask create two adjusted masks, which contain the same
20125 	     bits as mask in the low 7 bits of each vector element.
20126 	     The first mask will have the most significant bit clear
20127 	     if it requests element from the same 128-bit lane
20128 	     and MSB set if it requests element from the other 128-bit lane.
20129 	     The second mask will have the opposite values of the MSB,
20130 	     and additionally will have its 128-bit lanes swapped.
20131 	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20132 	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
20133 	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20134 	     stands for the other 12 bytes.  */
20135 	  /* The bit that tells whether an element is from the same lane or the
20136 	     other lane is bit 4, so shift it up by 3 to the MSB position.  */
20137 	  emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20138 				    gen_lowpart (V4DImode, mask),
20139 				    GEN_INT (3)));
20140 	  /* Clear MSB bits from the mask just in case it had them set.  */
20141 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20142 	  /* After this t1 will have the MSB set for elements from the other lane.  */
20143 	  emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20144 	  /* Clear bits other than MSB.  */
20145 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
20146 	  /* Or in the lower bits from mask into t3.  */
20147 	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
20148 	  /* And invert MSB bits in t1, so MSB is set for elements from the same
20149 	     lane.  */
20150 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
20151 	  /* Swap 128-bit lanes in t3.  */
20152 	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20153 					  gen_lowpart (V4DImode, t3),
20154 					  const2_rtx, GEN_INT (3),
20155 					  const0_rtx, const1_rtx));
20156 	  /* And or in the lower bits from mask into t1.  */
20157 	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
20158 	  if (one_operand_shuffle)
20159 	    {
20160 	      /* Each of these shuffles will put 0s in places where
20161 		 an element from the other 128-bit lane is needed, and otherwise
20162 		 will shuffle in the requested value.  */
20163 	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20164 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20165 	      /* For t3 the 128-bit lanes are swapped again.  */
20166 	      emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20167 					      gen_lowpart (V4DImode, t3),
20168 					      const2_rtx, GEN_INT (3),
20169 					      const0_rtx, const1_rtx));
20170 	      /* And oring both together leads to the result.  */
20171 	      emit_insn (gen_iorv32qi3 (target, t1, t3));
20172 	      return;
20173 	    }
20174 
20175 	  t4 = gen_reg_rtx (V32QImode);
20176 	  /* Similarly to the above one_operand_shuffle code,
20177 	     just for repeated twice for each operand.  merge_two:
20178 	     just repeated twice, once for each operand.  The merge_two:
20179 	     code will merge the two results together.  */
20180 	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20181 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20182 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20183 	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20184 					  gen_lowpart (V4DImode, t4),
20185 					  const2_rtx, GEN_INT (3),
20186 					  const0_rtx, const1_rtx));
20187 	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20188 					  gen_lowpart (V4DImode, t3),
20189 					  const2_rtx, GEN_INT (3),
20190 					  const0_rtx, const1_rtx));
20191 	  emit_insn (gen_iorv32qi3 (t4, t2, t4));
20192 	  emit_insn (gen_iorv32qi3 (t3, t1, t3));
20193 	  t1 = t4;
20194 	  t2 = t3;
20195 	  goto merge_two;
20196 
20197 	default:
20198 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
20199 	  break;
20200 	}
20201     }
20202 
20203   if (TARGET_XOP)
20204     {
20205       /* The XOP VPPERM insn supports three inputs.  By ignoring the
20206 	 one_operand_shuffle special case, we avoid creating another
20207 	 set of constant vectors in memory.  */
20208       one_operand_shuffle = false;
20209 
20210       /* mask = mask & {2*w-1, ...} */
20211       vt = GEN_INT (2*w - 1);
20212     }
20213   else
20214     {
20215       /* mask = mask & {w-1, ...} */
20216       vt = GEN_INT (w - 1);
20217     }
20218 
20219   for (i = 0; i < w; i++)
20220     vec[i] = vt;
20221   vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20222   mask = expand_simple_binop (maskmode, AND, mask, vt,
20223 			      NULL_RTX, 0, OPTAB_DIRECT);
20224 
20225   /* For non-QImode operations, convert the word permutation control
20226      into a byte permutation control.  */
20227   if (mode != V16QImode)
20228     {
20229       mask = expand_simple_binop (maskmode, ASHIFT, mask,
20230 				  GEN_INT (exact_log2 (e)),
20231 				  NULL_RTX, 0, OPTAB_DIRECT);
20232 
20233       /* Convert mask to vector of chars.  */
20234       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20235 
20236       /* Replicate each of the input bytes into byte positions:
20237 	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20238 	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20239 	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
20240       for (i = 0; i < 16; ++i)
20241 	vec[i] = GEN_INT (i/e * e);
20242       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20243       vt = validize_mem (force_const_mem (V16QImode, vt));
20244       if (TARGET_XOP)
20245 	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20246       else
20247 	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20248 
20249       /* Convert it into the byte positions by doing
20250 	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
20251       for (i = 0; i < 16; ++i)
20252 	vec[i] = GEN_INT (i % e);
20253       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20254       vt = validize_mem (force_const_mem (V16QImode, vt));
20255       emit_insn (gen_addv16qi3 (mask, mask, vt));
20256     }
20257 
20258   /* The actual shuffle operations all operate on V16QImode.  */
20259   op0 = gen_lowpart (V16QImode, op0);
20260   op1 = gen_lowpart (V16QImode, op1);
20261   target = gen_lowpart (V16QImode, target);
20262 
20263   if (TARGET_XOP)
20264     {
20265       emit_insn (gen_xop_pperm (target, op0, op1, mask));
20266     }
20267   else if (one_operand_shuffle)
20268     {
20269       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20270     }
20271   else
20272     {
20273       rtx xops[6];
20274       bool ok;
20275 
20276       /* Shuffle the two input vectors independently.  */
20277       t1 = gen_reg_rtx (V16QImode);
20278       t2 = gen_reg_rtx (V16QImode);
20279       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20280       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20281 
20282  merge_two:
20283       /* Then merge them together.  The key is whether any given control
20284          element contained a bit set that indicates the second word.  */
20285       mask = operands[3];
20286       vt = GEN_INT (w);
20287       if (maskmode == V2DImode && !TARGET_SSE4_1)
20288 	{
20289 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
20290 	     more shuffle to convert the V2DI input mask into a V4SI
20291 	     input mask.  At that point the masking that expand_int_vcond
20292 	     performs will work as desired.  */
20293 	  rtx t3 = gen_reg_rtx (V4SImode);
20294 	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20295 				        const0_rtx, const0_rtx,
20296 				        const2_rtx, const2_rtx));
20297 	  mask = t3;
20298 	  maskmode = V4SImode;
20299 	  e = w = 4;
20300 	}
20301 
20302       for (i = 0; i < w; i++)
20303 	vec[i] = vt;
20304       vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20305       vt = force_reg (maskmode, vt);
20306       mask = expand_simple_binop (maskmode, AND, mask, vt,
20307 				  NULL_RTX, 0, OPTAB_DIRECT);
20308 
20309       xops[0] = gen_lowpart (mode, operands[0]);
20310       xops[1] = gen_lowpart (mode, t2);
20311       xops[2] = gen_lowpart (mode, t1);
20312       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20313       xops[4] = mask;
20314       xops[5] = vt;
20315       ok = ix86_expand_int_vcond (xops);
20316       gcc_assert (ok);
20317     }
20318 }
20319 
20320 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
20321    true if we should do zero extension, else sign extension.  HIGH_P is
20322    true if we want the N/2 high elements, else the low elements.  */
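/* For instance, with OP[1] = V8HImode {a0,...,a7}, UNSIGNED_P and
   !HIGH_P, the result is the V4SImode vector {a0,a1,a2,a3} with each
   element zero-extended; with HIGH_P it is {a4,a5,a6,a7} instead.  */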
20323 
20324 void
20325 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20326 {
20327   enum machine_mode imode = GET_MODE (operands[1]);
20328   rtx tmp, dest;
20329 
20330   if (TARGET_SSE4_1)
20331     {
20332       rtx (*unpack)(rtx, rtx);
20333       rtx (*extract)(rtx, rtx) = NULL;
20334       enum machine_mode halfmode = BLKmode;
20335 
20336       switch (imode)
20337 	{
20338 	case V32QImode:
20339 	  if (unsigned_p)
20340 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
20341 	  else
20342 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
20343 	  halfmode = V16QImode;
20344 	  extract
20345 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20346 	  break;
20347 	case V16HImode:
20348 	  if (unsigned_p)
20349 	    unpack = gen_avx2_zero_extendv8hiv8si2;
20350 	  else
20351 	    unpack = gen_avx2_sign_extendv8hiv8si2;
20352 	  halfmode = V8HImode;
20353 	  extract
20354 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20355 	  break;
20356 	case V8SImode:
20357 	  if (unsigned_p)
20358 	    unpack = gen_avx2_zero_extendv4siv4di2;
20359 	  else
20360 	    unpack = gen_avx2_sign_extendv4siv4di2;
20361 	  halfmode = V4SImode;
20362 	  extract
20363 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20364 	  break;
20365 	case V16QImode:
20366 	  if (unsigned_p)
20367 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20368 	  else
20369 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20370 	  break;
20371 	case V8HImode:
20372 	  if (unsigned_p)
20373 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
20374 	  else
20375 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
20376 	  break;
20377 	case V4SImode:
20378 	  if (unsigned_p)
20379 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
20380 	  else
20381 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
20382 	  break;
20383 	default:
20384 	  gcc_unreachable ();
20385 	}
20386 
20387       if (GET_MODE_SIZE (imode) == 32)
20388 	{
20389 	  tmp = gen_reg_rtx (halfmode);
20390 	  emit_insn (extract (tmp, operands[1]));
20391 	}
20392       else if (high_p)
20393 	{
20394 	  /* Shift higher 8 bytes to lower 8 bytes.  */
20395 	  tmp = gen_reg_rtx (imode);
20396 	  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20397 					 gen_lowpart (V1TImode, operands[1]),
20398 					 GEN_INT (64)));
20399 	}
20400       else
20401 	tmp = operands[1];
20402 
20403       emit_insn (unpack (operands[0], tmp));
20404     }
20405   else
20406     {
20407       rtx (*unpack)(rtx, rtx, rtx);
20408 
20409       switch (imode)
20410 	{
20411 	case V16QImode:
20412 	  if (high_p)
20413 	    unpack = gen_vec_interleave_highv16qi;
20414 	  else
20415 	    unpack = gen_vec_interleave_lowv16qi;
20416 	  break;
20417 	case V8HImode:
20418 	  if (high_p)
20419 	    unpack = gen_vec_interleave_highv8hi;
20420 	  else
20421 	    unpack = gen_vec_interleave_lowv8hi;
20422 	  break;
20423 	case V4SImode:
20424 	  if (high_p)
20425 	    unpack = gen_vec_interleave_highv4si;
20426 	  else
20427 	    unpack = gen_vec_interleave_lowv4si;
20428 	  break;
20429 	default:
20430 	  gcc_unreachable ();
20431 	}
20432 
20433       dest = gen_lowpart (imode, operands[0]);
20434 
20435       if (unsigned_p)
20436 	tmp = force_reg (imode, CONST0_RTX (imode));
20437       else
20438 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20439 				   operands[1], pc_rtx, pc_rtx);
20440 
20441       emit_insn (unpack (dest, operands[1], tmp));
20442     }
20443 }
20444 
20445 /* Expand conditional increment or decrement using adc/sbb instructions.
20446    The default case using setcc followed by the conditional move can be
20447    done by generic code.  */
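/* E.g. something like "x += (a < b)" with an unsigned comparison can be
   expanded here as "cmp a, b" followed by "adc x, 0", using the carry
   flag produced by the compare instead of a setcc/cmov sequence.  */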
20448 bool
20449 ix86_expand_int_addcc (rtx operands[])
20450 {
20451   enum rtx_code code = GET_CODE (operands[1]);
20452   rtx flags;
20453   rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20454   rtx compare_op;
20455   rtx val = const0_rtx;
20456   bool fpcmp = false;
20457   enum machine_mode mode;
20458   rtx op0 = XEXP (operands[1], 0);
20459   rtx op1 = XEXP (operands[1], 1);
20460 
20461   if (operands[3] != const1_rtx
20462       && operands[3] != constm1_rtx)
20463     return false;
20464   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20465      return false;
20466   code = GET_CODE (compare_op);
20467 
20468   flags = XEXP (compare_op, 0);
20469 
20470   if (GET_MODE (flags) == CCFPmode
20471       || GET_MODE (flags) == CCFPUmode)
20472     {
20473       fpcmp = true;
20474       code = ix86_fp_compare_code_to_integer (code);
20475     }
20476 
20477   if (code != LTU)
20478     {
20479       val = constm1_rtx;
20480       if (fpcmp)
20481 	PUT_CODE (compare_op,
20482 		  reverse_condition_maybe_unordered
20483 		    (GET_CODE (compare_op)));
20484       else
20485 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20486     }
20487 
20488   mode = GET_MODE (operands[0]);
20489 
20490   /* Construct either adc or sbb insn.  */
20491   if ((code == LTU) == (operands[3] == constm1_rtx))
20492     {
20493       switch (mode)
20494 	{
20495 	  case QImode:
20496 	    insn = gen_subqi3_carry;
20497 	    break;
20498 	  case HImode:
20499 	    insn = gen_subhi3_carry;
20500 	    break;
20501 	  case SImode:
20502 	    insn = gen_subsi3_carry;
20503 	    break;
20504 	  case DImode:
20505 	    insn = gen_subdi3_carry;
20506 	    break;
20507 	  default:
20508 	    gcc_unreachable ();
20509 	}
20510     }
20511   else
20512     {
20513       switch (mode)
20514 	{
20515 	  case QImode:
20516 	    insn = gen_addqi3_carry;
20517 	    break;
20518 	  case HImode:
20519 	    insn = gen_addhi3_carry;
20520 	    break;
20521 	  case SImode:
20522 	    insn = gen_addsi3_carry;
20523 	    break;
20524 	  case DImode:
20525 	    insn = gen_adddi3_carry;
20526 	    break;
20527 	  default:
20528 	    gcc_unreachable ();
20529 	}
20530     }
20531   emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20532 
20533   return true;
20534 }
20535 
20536 
20537 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
20538    but works for floating point parameters and nonoffsettable memories.
20539    For pushes, it returns just stack offsets; the values will be saved
20540    in the right order.  At most four parts are generated.  */
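/* For example, DImode and DFmode are split into two SImode parts on a
   32-bit target, XFmode into three and TFmode into four; on a 64-bit
   target XFmode becomes a DImode part plus an SImode part, and TFmode
   two DImode parts.  */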
20541 
20542 static int
20543 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20544 {
20545   int size;
20546 
20547   if (!TARGET_64BIT)
20548     size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20549   else
20550     size = (GET_MODE_SIZE (mode) + 4) / 8;
20551 
20552   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20553   gcc_assert (size >= 2 && size <= 4);
20554 
20555   /* Optimize constant pool reference to immediates.  This is used by fp
20556      moves, that force all constants to memory to allow combining.  */
20557   if (MEM_P (operand) && MEM_READONLY_P (operand))
20558     {
20559       rtx tmp = maybe_get_pool_constant (operand);
20560       if (tmp)
20561 	operand = tmp;
20562     }
20563 
20564   if (MEM_P (operand) && !offsettable_memref_p (operand))
20565     {
20566       /* The only non-offsettable memories we handle are pushes.  */
20567       int ok = push_operand (operand, VOIDmode);
20568 
20569       gcc_assert (ok);
20570 
20571       operand = copy_rtx (operand);
20572       PUT_MODE (operand, Pmode);
20573       parts[0] = parts[1] = parts[2] = parts[3] = operand;
20574       return size;
20575     }
20576 
20577   if (GET_CODE (operand) == CONST_VECTOR)
20578     {
20579       enum machine_mode imode = int_mode_for_mode (mode);
20580       /* Caution: if we looked through a constant pool memory above,
20581 	 the operand may actually have a different mode now.  That's
20582 	 ok, since we want to pun this all the way back to an integer.  */
20583       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20584       gcc_assert (operand != NULL);
20585       mode = imode;
20586     }
20587 
20588   if (!TARGET_64BIT)
20589     {
20590       if (mode == DImode)
20591 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20592       else
20593 	{
20594 	  int i;
20595 
20596 	  if (REG_P (operand))
20597 	    {
20598 	      gcc_assert (reload_completed);
20599 	      for (i = 0; i < size; i++)
20600 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20601 	    }
20602 	  else if (offsettable_memref_p (operand))
20603 	    {
20604 	      operand = adjust_address (operand, SImode, 0);
20605 	      parts[0] = operand;
20606 	      for (i = 1; i < size; i++)
20607 		parts[i] = adjust_address (operand, SImode, 4 * i);
20608 	    }
20609 	  else if (GET_CODE (operand) == CONST_DOUBLE)
20610 	    {
20611 	      REAL_VALUE_TYPE r;
20612 	      long l[4];
20613 
20614 	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20615 	      switch (mode)
20616 		{
20617 		case TFmode:
20618 		  real_to_target (l, &r, mode);
20619 		  parts[3] = gen_int_mode (l[3], SImode);
20620 		  parts[2] = gen_int_mode (l[2], SImode);
20621 		  break;
20622 		case XFmode:
20623 		  REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20624 		  parts[2] = gen_int_mode (l[2], SImode);
20625 		  break;
20626 		case DFmode:
20627 		  REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20628 		  break;
20629 		default:
20630 		  gcc_unreachable ();
20631 		}
20632 	      parts[1] = gen_int_mode (l[1], SImode);
20633 	      parts[0] = gen_int_mode (l[0], SImode);
20634 	    }
20635 	  else
20636 	    gcc_unreachable ();
20637 	}
20638     }
20639   else
20640     {
20641       if (mode == TImode)
20642 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20643       if (mode == XFmode || mode == TFmode)
20644 	{
20645 	  enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
20646 	  if (REG_P (operand))
20647 	    {
20648 	      gcc_assert (reload_completed);
20649 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20650 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20651 	    }
20652 	  else if (offsettable_memref_p (operand))
20653 	    {
20654 	      operand = adjust_address (operand, DImode, 0);
20655 	      parts[0] = operand;
20656 	      parts[1] = adjust_address (operand, upper_mode, 8);
20657 	    }
20658 	  else if (GET_CODE (operand) == CONST_DOUBLE)
20659 	    {
20660 	      REAL_VALUE_TYPE r;
20661 	      long l[4];
20662 
20663 	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20664 	      real_to_target (l, &r, mode);
20665 
20666 	      /* Do not use shift by 32 to avoid warning on 32bit systems.  */
20667 	      if (HOST_BITS_PER_WIDE_INT >= 64)
20668 	        parts[0]
20669 		  = gen_int_mode
20670 		      ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20671 		       + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20672 		       DImode);
20673 	      else
20674 	        parts[0] = immed_double_const (l[0], l[1], DImode);
20675 
20676 	      if (upper_mode == SImode)
20677 	        parts[1] = gen_int_mode (l[2], SImode);
20678 	      else if (HOST_BITS_PER_WIDE_INT >= 64)
20679 	        parts[1]
20680 		  = gen_int_mode
20681 		      ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20682 		       + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20683 		       DImode);
20684 	      else
20685 	        parts[1] = immed_double_const (l[2], l[3], DImode);
20686 	    }
20687 	  else
20688 	    gcc_unreachable ();
20689 	}
20690     }
20691 
20692   return size;
20693 }
20694 
20695 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20696    All required insns are emitted here.  Operands 2 onward receive the
20697    destination parts in the correct order; operands 6 onward receive
20698    the corresponding source parts.  */
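/* For instance, a 32-bit DImode move whose destination low word is also
   the register addressing the memory source must copy the high word
   first; the collision handling below reorders the parts (or rewrites
   the address via lea) so the source address is never clobbered too
   early.  */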
20699 
20700 void
20701 ix86_split_long_move (rtx operands[])
20702 {
20703   rtx part[2][4];
20704   int nparts, i, j;
20705   int push = 0;
20706   int collisions = 0;
20707   enum machine_mode mode = GET_MODE (operands[0]);
20708   bool collisionparts[4];
20709 
20710   /* The DFmode expanders may ask us to move double.
20711      For 64bit targets this is a single move.  By hiding the fact
20712      here we simplify i386.md splitters.  */
20713   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20714     {
20715       /* Optimize constant pool reference to immediates.  This is used by
20716 	 fp moves, that force all constants to memory to allow combining.  */
20717 
20718       if (MEM_P (operands[1])
20719 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20720 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20721 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
20722       if (push_operand (operands[0], VOIDmode))
20723 	{
20724 	  operands[0] = copy_rtx (operands[0]);
20725 	  PUT_MODE (operands[0], Pmode);
20726 	}
20727       else
20728         operands[0] = gen_lowpart (DImode, operands[0]);
20729       operands[1] = gen_lowpart (DImode, operands[1]);
20730       emit_move_insn (operands[0], operands[1]);
20731       return;
20732     }
20733 
20734   /* The only non-offsettable memory we handle is push.  */
20735   if (push_operand (operands[0], VOIDmode))
20736     push = 1;
20737   else
20738     gcc_assert (!MEM_P (operands[0])
20739 		|| offsettable_memref_p (operands[0]));
20740 
20741   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20742   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20743 
20744   /* When emitting push, take care for source operands on the stack.  */
20745   if (push && MEM_P (operands[1])
20746       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20747     {
20748       rtx src_base = XEXP (part[1][nparts - 1], 0);
20749 
20750       /* Compensate for the stack decrement by 4.  */
20751       if (!TARGET_64BIT && nparts == 3
20752 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20753 	src_base = plus_constant (src_base, 4);
20754 
20755       /* src_base refers to the stack pointer and is
20756 	 automatically decreased by emitted push.  */
20757       for (i = 0; i < nparts; i++)
20758 	part[1][i] = change_address (part[1][i],
20759 				     GET_MODE (part[1][i]), src_base);
20760     }
20761 
20762   /* We need to do copy in the right order in case an address register
20763      of the source overlaps the destination.  */
20764   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20765     {
20766       rtx tmp;
20767 
20768       for (i = 0; i < nparts; i++)
20769 	{
20770 	  collisionparts[i]
20771 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20772 	  if (collisionparts[i])
20773 	    collisions++;
20774 	}
20775 
20776       /* Collision in the middle part can be handled by reordering.  */
20777       if (collisions == 1 && nparts == 3 && collisionparts [1])
20778 	{
20779 	  tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20780 	  tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20781 	}
20782       else if (collisions == 1
20783 	       && nparts == 4
20784 	       && (collisionparts [1] || collisionparts [2]))
20785 	{
20786 	  if (collisionparts [1])
20787 	    {
20788 	      tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20789 	      tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20790 	    }
20791 	  else
20792 	    {
20793 	      tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20794 	      tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20795 	    }
20796 	}
20797 
20798       /* If there are more collisions, we can't handle it by reordering.
20799 	 Do an lea to the last part and use only one colliding move.  */
20800       else if (collisions > 1)
20801 	{
20802 	  rtx base;
20803 
20804 	  collisions = 1;
20805 
20806 	  base = part[0][nparts - 1];
20807 
20808 	  /* Handle the case when the last part isn't valid for lea.
20809 	     Happens in 64-bit mode storing the 12-byte XFmode.  */
20810 	  if (GET_MODE (base) != Pmode)
20811 	    base = gen_rtx_REG (Pmode, REGNO (base));
20812 
20813 	  emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20814 	  part[1][0] = replace_equiv_address (part[1][0], base);
20815 	  for (i = 1; i < nparts; i++)
20816 	    {
20817 	      tmp = plus_constant (base, UNITS_PER_WORD * i);
20818 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
20819 	    }
20820 	}
20821     }
20822 
20823   if (push)
20824     {
20825       if (!TARGET_64BIT)
20826 	{
20827 	  if (nparts == 3)
20828 	    {
20829 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20830                 emit_insn (gen_addsi3 (stack_pointer_rtx,
20831 				       stack_pointer_rtx, GEN_INT (-4)));
20832 	      emit_move_insn (part[0][2], part[1][2]);
20833 	    }
20834 	  else if (nparts == 4)
20835 	    {
20836 	      emit_move_insn (part[0][3], part[1][3]);
20837 	      emit_move_insn (part[0][2], part[1][2]);
20838 	    }
20839 	}
20840       else
20841 	{
20842 	  /* In 64bit mode we don't have a 32bit push available.  In case this is
20843 	     a register, it is OK - we will just use the larger counterpart.  We also
20844 	     retype memory - this comes from an attempt to avoid a REX prefix on
20845 	     moving the second half of a TFmode value.  */
20846 	  if (GET_MODE (part[1][1]) == SImode)
20847 	    {
20848 	      switch (GET_CODE (part[1][1]))
20849 		{
20850 		case MEM:
20851 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
20852 		  break;
20853 
20854 		case REG:
20855 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20856 		  break;
20857 
20858 		default:
20859 		  gcc_unreachable ();
20860 		}
20861 
20862 	      if (GET_MODE (part[1][0]) == SImode)
20863 		part[1][0] = part[1][1];
20864 	    }
20865 	}
20866       emit_move_insn (part[0][1], part[1][1]);
20867       emit_move_insn (part[0][0], part[1][0]);
20868       return;
20869     }
20870 
20871   /* Choose correct order to not overwrite the source before it is copied.  */
20872   if ((REG_P (part[0][0])
20873        && REG_P (part[1][1])
20874        && (REGNO (part[0][0]) == REGNO (part[1][1])
20875 	   || (nparts == 3
20876 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
20877 	   || (nparts == 4
20878 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
20879       || (collisions > 0
20880 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20881     {
20882       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20883 	{
20884 	  operands[2 + i] = part[0][j];
20885 	  operands[6 + i] = part[1][j];
20886 	}
20887     }
20888   else
20889     {
20890       for (i = 0; i < nparts; i++)
20891 	{
20892 	  operands[2 + i] = part[0][i];
20893 	  operands[6 + i] = part[1][i];
20894 	}
20895     }
20896 
20897   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
20898   if (optimize_insn_for_size_p ())
20899     {
20900       for (j = 0; j < nparts - 1; j++)
20901 	if (CONST_INT_P (operands[6 + j])
20902 	    && operands[6 + j] != const0_rtx
20903 	    && REG_P (operands[2 + j]))
20904 	  for (i = j; i < nparts - 1; i++)
20905 	    if (CONST_INT_P (operands[7 + i])
20906 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20907 	      operands[7 + i] = operands[2 + j];
20908     }
20909 
20910   for (i = 0; i < nparts; i++)
20911     emit_move_insn (operands[2 + i], operands[6 + i]);
20912 
20913   return;
20914 }
20915 
20916 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20917    left shift by a constant, either using a single shift or
20918    a sequence of add instructions.  */
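/* E.g. a left shift of one 32-bit half by 2 is emitted as two
   "add reg,reg" instructions when 2 * add cost <= shift-by-constant
   cost (and we are not optimizing for size), otherwise as a single
   shift instruction.  */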
20919 
20920 static void
20921 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20922 {
20923   rtx (*insn)(rtx, rtx, rtx);
20924 
20925   if (count == 1
20926       || (count * ix86_cost->add <= ix86_cost->shift_const
20927 	  && !optimize_insn_for_size_p ()))
20928     {
20929       insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20930       while (count-- > 0)
20931 	emit_insn (insn (operand, operand, operand));
20932     }
20933   else
20934     {
20935       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20936       emit_insn (insn (operand, operand, GEN_INT (count)));
20937     }
20938 }
20939 
20940 void
20941 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20942 {
20943   rtx (*gen_ashl3)(rtx, rtx, rtx);
20944   rtx (*gen_shld)(rtx, rtx, rtx);
20945   int half_width = GET_MODE_BITSIZE (mode) >> 1;
20946 
20947   rtx low[2], high[2];
20948   int count;
20949 
20950   if (CONST_INT_P (operands[2]))
20951     {
20952       split_double_mode (mode, operands, 2, low, high);
20953       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20954 
20955       if (count >= half_width)
20956 	{
20957 	  emit_move_insn (high[0], low[1]);
20958 	  emit_move_insn (low[0], const0_rtx);
20959 
20960 	  if (count > half_width)
20961 	    ix86_expand_ashl_const (high[0], count - half_width, mode);
20962 	}
20963       else
20964 	{
20965 	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20966 
20967 	  if (!rtx_equal_p (operands[0], operands[1]))
20968 	    emit_move_insn (operands[0], operands[1]);
20969 
20970 	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20971 	  ix86_expand_ashl_const (low[0], count, mode);
20972 	}
20973       return;
20974     }
20975 
20976   split_double_mode (mode, operands, 1, low, high);
20977 
20978   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20979 
20980   if (operands[1] == const1_rtx)
20981     {
20982       /* Assuming we've chosen QImode-capable registers, then 1 << N
20983 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
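      /* Illustration for DImode on 32 bits: low = (N < 32), high = (N >= 32),
	 and then both halves are shifted left by N (the hardware masks the
	 count to 5 bits), giving { 1<<N, 0 } for N < 32 and
	 { 0, 1<<(N-32) } otherwise.  */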
20984       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20985 	{
20986 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20987 
20988 	  ix86_expand_clear (low[0]);
20989 	  ix86_expand_clear (high[0]);
20990 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20991 
20992 	  d = gen_lowpart (QImode, low[0]);
20993 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20994 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
20995 	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
20996 
20997 	  d = gen_lowpart (QImode, high[0]);
20998 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20999 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
21000 	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
21001 	}
21002 
21003       /* Otherwise, we can get the same results by manually performing
21004 	 a bit extract operation on bit 5/6, and then performing the two
21005 	 shifts.  The two methods of getting 0/1 into low/high are exactly
21006 	 the same size.  Avoiding the shift in the bit extract case helps
21007 	 pentium4 a bit; no one else seems to care much either way.  */
21008       else
21009 	{
21010 	  enum machine_mode half_mode;
21011 	  rtx (*gen_lshr3)(rtx, rtx, rtx);
21012 	  rtx (*gen_and3)(rtx, rtx, rtx);
21013 	  rtx (*gen_xor3)(rtx, rtx, rtx);
21014 	  HOST_WIDE_INT bits;
21015 	  rtx x;
21016 
21017 	  if (mode == DImode)
21018 	    {
21019 	      half_mode = SImode;
21020 	      gen_lshr3 = gen_lshrsi3;
21021 	      gen_and3 = gen_andsi3;
21022 	      gen_xor3 = gen_xorsi3;
21023 	      bits = 5;
21024 	    }
21025 	  else
21026 	    {
21027 	      half_mode = DImode;
21028 	      gen_lshr3 = gen_lshrdi3;
21029 	      gen_and3 = gen_anddi3;
21030 	      gen_xor3 = gen_xordi3;
21031 	      bits = 6;
21032 	    }
21033 
21034 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21035 	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21036 	  else
21037 	    x = gen_lowpart (half_mode, operands[2]);
21038 	  emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21039 
21040 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21041 	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21042 	  emit_move_insn (low[0], high[0]);
21043 	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21044 	}
21045 
21046       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21047       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21048       return;
21049     }
21050 
21051   if (operands[1] == constm1_rtx)
21052     {
21053       /* For -1 << N, we can avoid the shld instruction, because we
21054 	 know that we're shifting 0...31/63 ones into a -1.  */
21055       emit_move_insn (low[0], constm1_rtx);
21056       if (optimize_insn_for_size_p ())
21057 	emit_move_insn (high[0], low[0]);
21058       else
21059 	emit_move_insn (high[0], constm1_rtx);
21060     }
21061   else
21062     {
21063       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21064 
21065       if (!rtx_equal_p (operands[0], operands[1]))
21066 	emit_move_insn (operands[0], operands[1]);
21067 
21068       split_double_mode (mode, operands, 1, low, high);
21069       emit_insn (gen_shld (high[0], low[0], operands[2]));
21070     }
21071 
21072   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21073 
21074   if (TARGET_CMOVE && scratch)
21075     {
21076       rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21077 	= mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21078 
21079       ix86_expand_clear (scratch);
21080       emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21081     }
21082   else
21083     {
21084       rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21085 	= mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21086 
21087       emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21088     }
21089 }
21090 
21091 void
21092 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21093 {
21094   rtx (*gen_ashr3)(rtx, rtx, rtx)
21095     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21096   rtx (*gen_shrd)(rtx, rtx, rtx);
21097   int half_width = GET_MODE_BITSIZE (mode) >> 1;
21098 
21099   rtx low[2], high[2];
21100   int count;
21101 
21102   if (CONST_INT_P (operands[2]))
21103     {
21104       split_double_mode (mode, operands, 2, low, high);
21105       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21106 
21107       if (count == GET_MODE_BITSIZE (mode) - 1)
21108 	{
21109 	  emit_move_insn (high[0], high[1]);
21110 	  emit_insn (gen_ashr3 (high[0], high[0],
21111 				GEN_INT (half_width - 1)));
21112 	  emit_move_insn (low[0], high[0]);
21113 
21114 	}
21115       else if (count >= half_width)
21116 	{
21117 	  emit_move_insn (low[0], high[1]);
21118 	  emit_move_insn (high[0], low[0]);
21119 	  emit_insn (gen_ashr3 (high[0], high[0],
21120 				GEN_INT (half_width - 1)));
21121 
21122 	  if (count > half_width)
21123 	    emit_insn (gen_ashr3 (low[0], low[0],
21124 				  GEN_INT (count - half_width)));
21125 	}
21126       else
21127 	{
21128 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21129 
21130 	  if (!rtx_equal_p (operands[0], operands[1]))
21131 	    emit_move_insn (operands[0], operands[1]);
21132 
21133 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21134 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21135 	}
21136     }
21137   else
21138     {
21139       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21140 
21141       if (!rtx_equal_p (operands[0], operands[1]))
21142 	emit_move_insn (operands[0], operands[1]);
21143 
21144       split_double_mode (mode, operands, 1, low, high);
21145 
21146       emit_insn (gen_shrd (low[0], high[0], operands[2]));
21147       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21148 
21149       if (TARGET_CMOVE && scratch)
21150 	{
21151 	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21152 	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21153 
21154 	  emit_move_insn (scratch, high[0]);
21155 	  emit_insn (gen_ashr3 (scratch, scratch,
21156 				GEN_INT (half_width - 1)));
21157 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21158 					  scratch));
21159 	}
21160       else
21161 	{
21162 	  rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21163 	    = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21164 
21165 	  emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21166 	}
21167     }
21168 }
21169 
21170 void
21171 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21172 {
21173   rtx (*gen_lshr3)(rtx, rtx, rtx)
21174     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21175   rtx (*gen_shrd)(rtx, rtx, rtx);
21176   int half_width = GET_MODE_BITSIZE (mode) >> 1;
21177 
21178   rtx low[2], high[2];
21179   int count;
21180 
21181   if (CONST_INT_P (operands[2]))
21182     {
21183       split_double_mode (mode, operands, 2, low, high);
21184       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21185 
21186       if (count >= half_width)
21187 	{
21188 	  emit_move_insn (low[0], high[1]);
21189 	  ix86_expand_clear (high[0]);
21190 
21191 	  if (count > half_width)
21192 	    emit_insn (gen_lshr3 (low[0], low[0],
21193 				  GEN_INT (count - half_width)));
21194 	}
21195       else
21196 	{
21197 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21198 
21199 	  if (!rtx_equal_p (operands[0], operands[1]))
21200 	    emit_move_insn (operands[0], operands[1]);
21201 
21202 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21203 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21204 	}
21205     }
21206   else
21207     {
21208       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21209 
21210       if (!rtx_equal_p (operands[0], operands[1]))
21211 	emit_move_insn (operands[0], operands[1]);
21212 
21213       split_double_mode (mode, operands, 1, low, high);
21214 
21215       emit_insn (gen_shrd (low[0], high[0], operands[2]));
21216       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21217 
21218       if (TARGET_CMOVE && scratch)
21219 	{
21220 	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21221 	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21222 
21223 	  ix86_expand_clear (scratch);
21224 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21225 					  scratch));
21226 	}
21227       else
21228 	{
21229 	  rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21230 	    = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21231 
21232 	  emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21233 	}
21234     }
21235 }
21236 
21237 /* Predict just emitted jump instruction to be taken with probability PROB.  */
21238 static void
21239 predict_jump (int prob)
21240 {
21241   rtx insn = get_last_insn ();
21242   gcc_assert (JUMP_P (insn));
21243   add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21244 }
21245 
21246 /* Helper function for the string operations below.  Test VARIABLE for
21247    whether it is aligned to VALUE bytes.  If so, jump to the returned label.  */
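/* E.g. a call like ix86_expand_aligntest (destptr, 2, false) emits
   roughly "tmp = destptr & 2; if (tmp == 0) goto label;" and returns
   LABEL, so the caller can emit the code handling the misaligned case
   and then emit the label after it.  */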
21248 static rtx
21249 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21250 {
21251   rtx label = gen_label_rtx ();
21252   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21253   if (GET_MODE (variable) == DImode)
21254     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21255   else
21256     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21257   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21258 			   1, label);
21259   if (epilogue)
21260     predict_jump (REG_BR_PROB_BASE * 50 / 100);
21261   else
21262     predict_jump (REG_BR_PROB_BASE * 90 / 100);
21263   return label;
21264 }
21265 
21266 /* Adjust COUNTER by the VALUE.  */
21267 static void
21268 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21269 {
21270   rtx (*gen_add)(rtx, rtx, rtx)
21271     = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21272 
21273   emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21274 }
21275 
21276 /* Zero extend possibly SImode EXP to Pmode register.  */
21277 rtx
21278 ix86_zero_extend_to_Pmode (rtx exp)
21279 {
21280   rtx r;
21281   if (GET_MODE (exp) == VOIDmode)
21282     return force_reg (Pmode, exp);
21283   if (GET_MODE (exp) == Pmode)
21284     return copy_to_mode_reg (Pmode, exp);
21285   r = gen_reg_rtx (Pmode);
21286   emit_insn (gen_zero_extendsidi2 (r, exp));
21287   return r;
21288 }
21289 
21290 /* Divide COUNTREG by SCALE.  */
21291 static rtx
21292 scale_counter (rtx countreg, int scale)
21293 {
21294   rtx sc;
21295 
21296   if (scale == 1)
21297     return countreg;
21298   if (CONST_INT_P (countreg))
21299     return GEN_INT (INTVAL (countreg) / scale);
21300   gcc_assert (REG_P (countreg));
21301 
21302   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21303 			    GEN_INT (exact_log2 (scale)),
21304 			    NULL, 1, OPTAB_DIRECT);
21305   return sc;
21306 }
21307 
21308 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
21309    DImode for constant loop counts.  */
21310 
21311 static enum machine_mode
21312 counter_mode (rtx count_exp)
21313 {
21314   if (GET_MODE (count_exp) != VOIDmode)
21315     return GET_MODE (count_exp);
21316   if (!CONST_INT_P (count_exp))
21317     return Pmode;
21318   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21319     return DImode;
21320   return SImode;
21321 }
21322 
21323 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21324    to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21325    the overall size is COUNT, specified in bytes.  When SRCPTR is NULL,
21326    output the equivalent loop to set memory to VALUE (assumed to be in MODE).
21327 
21328    The size is rounded down to a whole number of chunks moved at once.
21329    SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.  */
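/* Rough shape of the emitted code for MODE = SImode, UNROLL = 4
   (a 16 byte chunk per iteration):

       size = count & ~15;  iter = 0;
     top:
       copy (or store VALUE into) 16 bytes at destptr + iter;
       iter += 16;
       if (iter < size) goto top;

   and afterwards DESTPTR (and SRCPTR) are advanced by ITER.  */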
21330 
21331 
21332 static void
21333 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21334 			       rtx destptr, rtx srcptr, rtx value,
21335 			       rtx count, enum machine_mode mode, int unroll,
21336 			       int expected_size)
21337 {
21338   rtx out_label, top_label, iter, tmp;
21339   enum machine_mode iter_mode = counter_mode (count);
21340   rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21341   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21342   rtx size;
21343   rtx x_addr;
21344   rtx y_addr;
21345   int i;
21346 
21347   top_label = gen_label_rtx ();
21348   out_label = gen_label_rtx ();
21349   iter = gen_reg_rtx (iter_mode);
21350 
21351   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21352 			      NULL, 1, OPTAB_DIRECT);
21353   /* Those two should combine.  */
21354   if (piece_size == const1_rtx)
21355     {
21356       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21357 			       true, out_label);
21358       predict_jump (REG_BR_PROB_BASE * 10 / 100);
21359     }
21360   emit_move_insn (iter, const0_rtx);
21361 
21362   emit_label (top_label);
21363 
21364   tmp = convert_modes (Pmode, iter_mode, iter, true);
21365   x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21366   destmem = change_address (destmem, mode, x_addr);
21367 
21368   if (srcmem)
21369     {
21370       y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21371       srcmem = change_address (srcmem, mode, y_addr);
21372 
21373       /* When unrolling for chips that reorder memory reads and writes,
21374 	 we can save registers by using a single temporary.
21375 	 Also, using 4 temporaries is overkill in 32bit mode.  */
21376       if (!TARGET_64BIT && 0)
21377 	{
21378 	  for (i = 0; i < unroll; i++)
21379 	    {
21380 	      if (i)
21381 		{
21382 		  destmem =
21383 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21384 		  srcmem =
21385 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21386 		}
21387 	      emit_move_insn (destmem, srcmem);
21388 	    }
21389 	}
21390       else
21391 	{
21392 	  rtx tmpreg[4];
21393 	  gcc_assert (unroll <= 4);
21394 	  for (i = 0; i < unroll; i++)
21395 	    {
21396 	      tmpreg[i] = gen_reg_rtx (mode);
21397 	      if (i)
21398 		{
21399 		  srcmem =
21400 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21401 		}
21402 	      emit_move_insn (tmpreg[i], srcmem);
21403 	    }
21404 	  for (i = 0; i < unroll; i++)
21405 	    {
21406 	      if (i)
21407 		{
21408 		  destmem =
21409 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21410 		}
21411 	      emit_move_insn (destmem, tmpreg[i]);
21412 	    }
21413 	}
21414     }
21415   else
21416     for (i = 0; i < unroll; i++)
21417       {
21418 	if (i)
21419 	  destmem =
21420 	    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21421 	emit_move_insn (destmem, value);
21422       }
21423 
21424   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21425 			     true, OPTAB_LIB_WIDEN);
21426   if (tmp != iter)
21427     emit_move_insn (iter, tmp);
21428 
21429   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21430 			   true, top_label);
21431   if (expected_size != -1)
21432     {
21433       expected_size /= GET_MODE_SIZE (mode) * unroll;
21434       if (expected_size == 0)
21435 	predict_jump (0);
21436       else if (expected_size > REG_BR_PROB_BASE)
21437 	predict_jump (REG_BR_PROB_BASE - 1);
21438       else
21439         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21440     }
21441   else
21442     predict_jump (REG_BR_PROB_BASE * 80 / 100);
21443   iter = ix86_zero_extend_to_Pmode (iter);
21444   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21445 			     true, OPTAB_LIB_WIDEN);
21446   if (tmp != destptr)
21447     emit_move_insn (destptr, tmp);
21448   if (srcptr)
21449     {
21450       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21451 				 true, OPTAB_LIB_WIDEN);
21452       if (tmp != srcptr)
21453 	emit_move_insn (srcptr, tmp);
21454     }
21455   emit_label (out_label);
21456 }
21457 
21458 /* Output "rep; mov" instruction.
21459    Arguments have the same meaning as for the previous function.  */
21460 static void
21461 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21462 			   rtx destptr, rtx srcptr,
21463 			   rtx count,
21464 			   enum machine_mode mode)
21465 {
21466   rtx destexp;
21467   rtx srcexp;
21468   rtx countreg;
21469   HOST_WIDE_INT rounded_count;
21470 
21471   /* If the size is known to be a multiple of 4, it is shorter to use rep movs in SImode.  */
21472   if (mode == QImode && CONST_INT_P (count)
21473       && !(INTVAL (count) & 3))
21474     mode = SImode;
21475 
21476   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21477     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21478   if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21479     srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21480   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
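  /* DESTEXP and SRCEXP built below describe the final values of the
     destination and source pointers after the "rep; mov": the original
     pointer plus COUNTREG scaled by the chunk size.  They are passed to
     the rep_mov pattern so the pointer updates are exposed in the RTL.  */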
21481   if (mode != QImode)
21482     {
21483       destexp = gen_rtx_ASHIFT (Pmode, countreg,
21484 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21485       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21486       srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21487 			       GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21488       srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21489     }
21490   else
21491     {
21492       destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21493       srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21494     }
21495   if (CONST_INT_P (count))
21496     {
21497       rounded_count = (INTVAL (count)
21498 		       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21499       destmem = shallow_copy_rtx (destmem);
21500       srcmem = shallow_copy_rtx (srcmem);
21501       set_mem_size (destmem, rounded_count);
21502       set_mem_size (srcmem, rounded_count);
21503     }
21504   else
21505     {
21506       if (MEM_SIZE_KNOWN_P (destmem))
21507 	clear_mem_size (destmem);
21508       if (MEM_SIZE_KNOWN_P (srcmem))
21509 	clear_mem_size (srcmem);
21510     }
21511   emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21512 			  destexp, srcexp));
21513 }
21514 
21515 /* Output "rep; stos" instruction.
21516    Arguments have the same meaning as for the previous function.  */
21517 static void
21518 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21519 			    rtx count, enum machine_mode mode,
21520 			    rtx orig_value)
21521 {
21522   rtx destexp;
21523   rtx countreg;
21524   HOST_WIDE_INT rounded_count;
21525 
21526   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21527     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21528   value = force_reg (mode, gen_lowpart (mode, value));
21529   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21530   if (mode != QImode)
21531     {
21532       destexp = gen_rtx_ASHIFT (Pmode, countreg,
21533 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21534       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21535     }
21536   else
21537     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21538   if (orig_value == const0_rtx && CONST_INT_P (count))
21539     {
21540       rounded_count = (INTVAL (count)
21541 		       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21542       destmem = shallow_copy_rtx (destmem);
21543       set_mem_size (destmem, rounded_count);
21544     }
21545   else if (MEM_SIZE_KNOWN_P (destmem))
21546     clear_mem_size (destmem);
21547   emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21548 }
21549 
21550 static void
21551 emit_strmov (rtx destmem, rtx srcmem,
21552 	     rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21553 {
21554   rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21555   rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21556   emit_insn (gen_strmov (destptr, dest, srcptr, src));
21557 }
21558 
21559 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
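/* For a constant COUNT this expands to a straight-line sequence keyed off
   the low bits of COUNT (16/8/4/2/1 byte moves); for a variable COUNT it is
   either a small byte loop (when MAX_SIZE > 8) or a chain of alignment
   tests, each guarding a single move.  */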
21560 static void
21561 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21562 			rtx destptr, rtx srcptr, rtx count, int max_size)
21563 {
21564   rtx src, dest;
21565   if (CONST_INT_P (count))
21566     {
21567       HOST_WIDE_INT countval = INTVAL (count);
21568       int offset = 0;
21569 
21570       if ((countval & 0x10) && max_size > 16)
21571 	{
21572 	  if (TARGET_64BIT)
21573 	    {
21574 	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21575 	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21576 	    }
21577 	  else
21578 	    gcc_unreachable ();
21579 	  offset += 16;
21580 	}
21581       if ((countval & 0x08) && max_size > 8)
21582 	{
21583 	  if (TARGET_64BIT)
21584 	    emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21585 	  else
21586 	    {
21587 	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21588 	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21589 	    }
21590 	  offset += 8;
21591 	}
21592       if ((countval & 0x04) && max_size > 4)
21593 	{
21594           emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21595 	  offset += 4;
21596 	}
21597       if ((countval & 0x02) && max_size > 2)
21598 	{
21599           emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21600 	  offset += 2;
21601 	}
21602       if ((countval & 0x01) && max_size > 1)
21603 	{
21604           emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21605 	  offset += 1;
21606 	}
21607       return;
21608     }
21609   if (max_size > 8)
21610     {
21611       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21612 				    count, 1, OPTAB_DIRECT);
21613       expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21614 				     count, QImode, 1, 4);
21615       return;
21616     }
21617 
21618   /* With single-instruction stringops (movs), the moves themselves cheaply
21619      advance the dest and src pointers.  Otherwise we save code size by
21620      maintaining an offset (zero is readily available from the preceding rep
21621      operation) and using x86 addressing modes.  */
21622   if (TARGET_SINGLE_STRINGOP)
21623     {
21624       if (max_size > 4)
21625 	{
21626 	  rtx label = ix86_expand_aligntest (count, 4, true);
21627 	  src = change_address (srcmem, SImode, srcptr);
21628 	  dest = change_address (destmem, SImode, destptr);
21629 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
21630 	  emit_label (label);
21631 	  LABEL_NUSES (label) = 1;
21632 	}
21633       if (max_size > 2)
21634 	{
21635 	  rtx label = ix86_expand_aligntest (count, 2, true);
21636 	  src = change_address (srcmem, HImode, srcptr);
21637 	  dest = change_address (destmem, HImode, destptr);
21638 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
21639 	  emit_label (label);
21640 	  LABEL_NUSES (label) = 1;
21641 	}
21642       if (max_size > 1)
21643 	{
21644 	  rtx label = ix86_expand_aligntest (count, 1, true);
21645 	  src = change_address (srcmem, QImode, srcptr);
21646 	  dest = change_address (destmem, QImode, destptr);
21647 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
21648 	  emit_label (label);
21649 	  LABEL_NUSES (label) = 1;
21650 	}
21651     }
21652   else
21653     {
21654       rtx offset = force_reg (Pmode, const0_rtx);
21655       rtx tmp;
21656 
21657       if (max_size > 4)
21658 	{
21659 	  rtx label = ix86_expand_aligntest (count, 4, true);
21660 	  src = change_address (srcmem, SImode, srcptr);
21661 	  dest = change_address (destmem, SImode, destptr);
21662 	  emit_move_insn (dest, src);
21663 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21664 				     true, OPTAB_LIB_WIDEN);
21665 	  if (tmp != offset)
21666 	    emit_move_insn (offset, tmp);
21667 	  emit_label (label);
21668 	  LABEL_NUSES (label) = 1;
21669 	}
21670       if (max_size > 2)
21671 	{
21672 	  rtx label = ix86_expand_aligntest (count, 2, true);
21673 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21674 	  src = change_address (srcmem, HImode, tmp);
21675 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21676 	  dest = change_address (destmem, HImode, tmp);
21677 	  emit_move_insn (dest, src);
21678 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21679 				     true, OPTAB_LIB_WIDEN);
21680 	  if (tmp != offset)
21681 	    emit_move_insn (offset, tmp);
21682 	  emit_label (label);
21683 	  LABEL_NUSES (label) = 1;
21684 	}
21685       if (max_size > 1)
21686 	{
21687 	  rtx label = ix86_expand_aligntest (count, 1, true);
21688 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21689 	  src = change_address (srcmem, QImode, tmp);
21690 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21691 	  dest = change_address (destmem, QImode, tmp);
21692 	  emit_move_insn (dest, src);
21693 	  emit_label (label);
21694 	  LABEL_NUSES (label) = 1;
21695 	}
21696     }
21697 }
21698 
21699 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21700 static void
21701 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21702 				 rtx count, int max_size)
21703 {
21704   count =
21705     expand_simple_binop (counter_mode (count), AND, count,
21706 			 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21707   expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21708 				 gen_lowpart (QImode, value), count, QImode,
21709 				 1, max_size / 2);
21710 }
21711 
21712 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21713 static void
21714 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21715 {
21716   rtx dest;
21717 
21718   if (CONST_INT_P (count))
21719     {
21720       HOST_WIDE_INT countval = INTVAL (count);
21721       int offset = 0;
21722 
21723       if ((countval & 0x10) && max_size > 16)
21724 	{
21725 	  if (TARGET_64BIT)
21726 	    {
21727 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21728 	      emit_insn (gen_strset (destptr, dest, value));
21729 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21730 	      emit_insn (gen_strset (destptr, dest, value));
21731 	    }
21732 	  else
21733 	    gcc_unreachable ();
21734 	  offset += 16;
21735 	}
21736       if ((countval & 0x08) && max_size > 8)
21737 	{
21738 	  if (TARGET_64BIT)
21739 	    {
21740 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21741 	      emit_insn (gen_strset (destptr, dest, value));
21742 	    }
21743 	  else
21744 	    {
21745 	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21746 	      emit_insn (gen_strset (destptr, dest, value));
21747 	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21748 	      emit_insn (gen_strset (destptr, dest, value));
21749 	    }
21750 	  offset += 8;
21751 	}
21752       if ((countval & 0x04) && max_size > 4)
21753 	{
21754 	  dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21755 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21756 	  offset += 4;
21757 	}
21758       if ((countval & 0x02) && max_size > 2)
21759 	{
21760 	  dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21761 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21762 	  offset += 2;
21763 	}
21764       if ((countval & 0x01) && max_size > 1)
21765 	{
21766 	  dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21767 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21768 	  offset += 1;
21769 	}
21770       return;
21771     }
21772   if (max_size > 32)
21773     {
21774       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21775       return;
21776     }
21777   if (max_size > 16)
21778     {
21779       rtx label = ix86_expand_aligntest (count, 16, true);
21780       if (TARGET_64BIT)
21781 	{
21782 	  dest = change_address (destmem, DImode, destptr);
21783 	  emit_insn (gen_strset (destptr, dest, value));
21784 	  emit_insn (gen_strset (destptr, dest, value));
21785 	}
21786       else
21787 	{
21788 	  dest = change_address (destmem, SImode, destptr);
21789 	  emit_insn (gen_strset (destptr, dest, value));
21790 	  emit_insn (gen_strset (destptr, dest, value));
21791 	  emit_insn (gen_strset (destptr, dest, value));
21792 	  emit_insn (gen_strset (destptr, dest, value));
21793 	}
21794       emit_label (label);
21795       LABEL_NUSES (label) = 1;
21796     }
21797   if (max_size > 8)
21798     {
21799       rtx label = ix86_expand_aligntest (count, 8, true);
21800       if (TARGET_64BIT)
21801 	{
21802 	  dest = change_address (destmem, DImode, destptr);
21803 	  emit_insn (gen_strset (destptr, dest, value));
21804 	}
21805       else
21806 	{
21807 	  dest = change_address (destmem, SImode, destptr);
21808 	  emit_insn (gen_strset (destptr, dest, value));
21809 	  emit_insn (gen_strset (destptr, dest, value));
21810 	}
21811       emit_label (label);
21812       LABEL_NUSES (label) = 1;
21813     }
21814   if (max_size > 4)
21815     {
21816       rtx label = ix86_expand_aligntest (count, 4, true);
21817       dest = change_address (destmem, SImode, destptr);
21818       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21819       emit_label (label);
21820       LABEL_NUSES (label) = 1;
21821     }
21822   if (max_size > 2)
21823     {
21824       rtx label = ix86_expand_aligntest (count, 2, true);
21825       dest = change_address (destmem, HImode, destptr);
21826       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21827       emit_label (label);
21828       LABEL_NUSES (label) = 1;
21829     }
21830   if (max_size > 1)
21831     {
21832       rtx label = ix86_expand_aligntest (count, 1, true);
21833       dest = change_address (destmem, QImode, destptr);
21834       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21835       emit_label (label);
21836       LABEL_NUSES (label) = 1;
21837     }
21838 }
21839 
21840 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
21841    to ALIGN, to DESIRED_ALIGNMENT.  */
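/* The prologue is a chain of runtime alignment tests on DESTPTR: for each
   power of two between ALIGN and DESIRED_ALIGNMENT, one QImode/HImode/SImode
   chunk is conditionally copied and COUNT is decremented so the main loop
   sees only the remaining size.  */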
21842 static void
21843 expand_movmem_prologue (rtx destmem, rtx srcmem,
21844 			rtx destptr, rtx srcptr, rtx count,
21845 			int align, int desired_alignment)
21846 {
21847   if (align <= 1 && desired_alignment > 1)
21848     {
21849       rtx label = ix86_expand_aligntest (destptr, 1, false);
21850       srcmem = change_address (srcmem, QImode, srcptr);
21851       destmem = change_address (destmem, QImode, destptr);
21852       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21853       ix86_adjust_counter (count, 1);
21854       emit_label (label);
21855       LABEL_NUSES (label) = 1;
21856     }
21857   if (align <= 2 && desired_alignment > 2)
21858     {
21859       rtx label = ix86_expand_aligntest (destptr, 2, false);
21860       srcmem = change_address (srcmem, HImode, srcptr);
21861       destmem = change_address (destmem, HImode, destptr);
21862       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21863       ix86_adjust_counter (count, 2);
21864       emit_label (label);
21865       LABEL_NUSES (label) = 1;
21866     }
21867   if (align <= 4 && desired_alignment > 4)
21868     {
21869       rtx label = ix86_expand_aligntest (destptr, 4, false);
21870       srcmem = change_address (srcmem, SImode, srcptr);
21871       destmem = change_address (destmem, SImode, destptr);
21872       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21873       ix86_adjust_counter (count, 4);
21874       emit_label (label);
21875       LABEL_NUSES (label) = 1;
21876     }
21877   gcc_assert (desired_alignment <= 8);
21878 }
21879 
21880 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21881    ALIGN_BYTES is how many bytes need to be copied.  */
21882 static rtx
21883 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21884 				 int desired_align, int align_bytes)
21885 {
21886   rtx src = *srcp;
21887   rtx orig_dst = dst;
21888   rtx orig_src = src;
21889   int off = 0;
21890   int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21891   if (src_align_bytes >= 0)
21892     src_align_bytes = desired_align - src_align_bytes;
21893   if (align_bytes & 1)
21894     {
21895       dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21896       src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21897       off = 1;
21898       emit_insn (gen_strmov (destreg, dst, srcreg, src));
21899     }
21900   if (align_bytes & 2)
21901     {
21902       dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21903       src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21904       if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21905 	set_mem_align (dst, 2 * BITS_PER_UNIT);
21906       if (src_align_bytes >= 0
21907 	  && (src_align_bytes & 1) == (align_bytes & 1)
21908 	  && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21909 	set_mem_align (src, 2 * BITS_PER_UNIT);
21910       off = 2;
21911       emit_insn (gen_strmov (destreg, dst, srcreg, src));
21912     }
21913   if (align_bytes & 4)
21914     {
21915       dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21916       src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21917       if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21918 	set_mem_align (dst, 4 * BITS_PER_UNIT);
21919       if (src_align_bytes >= 0)
21920 	{
21921 	  unsigned int src_align = 0;
21922 	  if ((src_align_bytes & 3) == (align_bytes & 3))
21923 	    src_align = 4;
21924 	  else if ((src_align_bytes & 1) == (align_bytes & 1))
21925 	    src_align = 2;
21926 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21927 	    set_mem_align (src, src_align * BITS_PER_UNIT);
21928 	}
21929       off = 4;
21930       emit_insn (gen_strmov (destreg, dst, srcreg, src));
21931     }
21932   dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21933   src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21934   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21935     set_mem_align (dst, desired_align * BITS_PER_UNIT);
21936   if (src_align_bytes >= 0)
21937     {
21938       unsigned int src_align = 0;
21939       if ((src_align_bytes & 7) == (align_bytes & 7))
21940 	src_align = 8;
21941       else if ((src_align_bytes & 3) == (align_bytes & 3))
21942 	src_align = 4;
21943       else if ((src_align_bytes & 1) == (align_bytes & 1))
21944 	src_align = 2;
21945       if (src_align > (unsigned int) desired_align)
21946 	src_align = desired_align;
21947       if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21948 	set_mem_align (src, src_align * BITS_PER_UNIT);
21949     }
21950   if (MEM_SIZE_KNOWN_P (orig_dst))
21951     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21952   if (MEM_SIZE_KNOWN_P (orig_src))
21953     set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21954   *srcp = src;
21955   return dst;
21956 }
21957 
21958 /* Store enough into DEST to align DEST, known to be aligned to ALIGN,
21959    to DESIRED_ALIGNMENT.  */
21960 static void
21961 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21962 			int align, int desired_alignment)
21963 {
21964   if (align <= 1 && desired_alignment > 1)
21965     {
21966       rtx label = ix86_expand_aligntest (destptr, 1, false);
21967       destmem = change_address (destmem, QImode, destptr);
21968       emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21969       ix86_adjust_counter (count, 1);
21970       emit_label (label);
21971       LABEL_NUSES (label) = 1;
21972     }
21973   if (align <= 2 && desired_alignment > 2)
21974     {
21975       rtx label = ix86_expand_aligntest (destptr, 2, false);
21976       destmem = change_address (destmem, HImode, destptr);
21977       emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21978       ix86_adjust_counter (count, 2);
21979       emit_label (label);
21980       LABEL_NUSES (label) = 1;
21981     }
21982   if (align <= 4 && desired_alignment > 4)
21983     {
21984       rtx label = ix86_expand_aligntest (destptr, 4, false);
21985       destmem = change_address (destmem, SImode, destptr);
21986       emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21987       ix86_adjust_counter (count, 4);
21988       emit_label (label);
21989       LABEL_NUSES (label) = 1;
21990     }
21991   gcc_assert (desired_alignment <= 8);
21992 }
21993 
21994 /* Store enough into DST to align DST to DESIRED_ALIGN.
21995    ALIGN_BYTES is how many bytes need to be stored.  */
21996 static rtx
21997 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21998 				 int desired_align, int align_bytes)
21999 {
22000   int off = 0;
22001   rtx orig_dst = dst;
22002   if (align_bytes & 1)
22003     {
22004       dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22005       off = 1;
22006       emit_insn (gen_strset (destreg, dst,
22007 			     gen_lowpart (QImode, value)));
22008     }
22009   if (align_bytes & 2)
22010     {
22011       dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22012       if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22013 	set_mem_align (dst, 2 * BITS_PER_UNIT);
22014       off = 2;
22015       emit_insn (gen_strset (destreg, dst,
22016 			     gen_lowpart (HImode, value)));
22017     }
22018   if (align_bytes & 4)
22019     {
22020       dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22021       if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22022 	set_mem_align (dst, 4 * BITS_PER_UNIT);
22023       off = 4;
22024       emit_insn (gen_strset (destreg, dst,
22025 			     gen_lowpart (SImode, value)));
22026     }
22027   dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22028   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22029     set_mem_align (dst, desired_align * BITS_PER_UNIT);
22030   if (MEM_SIZE_KNOWN_P (orig_dst))
22031     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22032   return dst;
22033 }
22034 
22035 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
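/* Roughly, the choice below is made in this order: an explicitly requested
   algorithm (ix86_stringop_alg) wins if usable; when optimizing for size a
   rep-prefixed variant or a plain loop is chosen; otherwise the per-CPU cost
   tables are consulted for EXPECTED_SIZE, falling back to the entry for
   unknown sizes or, when inlining is forced, to a recursive guess on a
   smaller size.  */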
22036 static enum stringop_alg
22037 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22038 	    int *dynamic_check)
22039 {
22040   const struct stringop_algs * algs;
22041   bool optimize_for_speed;
22042   /* Algorithms using the rep prefix want at least edi and ecx;
22043      additionally, memset wants eax and memcpy wants esi.  Don't
22044      consider such algorithms if the user has appropriated those
22045      registers for their own purposes.	*/
22046   bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22047                              || (memset
22048 				 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22049 
22050 #define ALG_USABLE_P(alg) (rep_prefix_usable			\
22051 			   || (alg != rep_prefix_1_byte		\
22052 			       && alg != rep_prefix_4_byte      \
22053 			       && alg != rep_prefix_8_byte))
22054   const struct processor_costs *cost;
22055 
22056   /* Even if the string operation call is cold, we still might spend a lot
22057      of time processing large blocks.  */
22058   if (optimize_function_for_size_p (cfun)
22059       || (optimize_insn_for_size_p ()
22060           && expected_size != -1 && expected_size < 256))
22061     optimize_for_speed = false;
22062   else
22063     optimize_for_speed = true;
22064 
22065   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22066 
22067   *dynamic_check = -1;
22068   if (memset)
22069     algs = &cost->memset[TARGET_64BIT != 0];
22070   else
22071     algs = &cost->memcpy[TARGET_64BIT != 0];
22072   if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22073     return ix86_stringop_alg;
22074   /* rep; movq or rep; movl is the smallest variant.  */
22075   else if (!optimize_for_speed)
22076     {
22077       if (!count || (count & 3))
22078 	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22079       else
22080 	return rep_prefix_usable ? rep_prefix_4_byte : loop;
22081     }
22082   /* Very tiny blocks are best handled via the loop; REP is expensive to
22083      set up.  */
22084   else if (expected_size != -1 && expected_size < 4)
22085     return loop_1_byte;
22086   else if (expected_size != -1)
22087     {
22088       unsigned int i;
22089       enum stringop_alg alg = libcall;
22090       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22091 	{
22092 	  /* We get here if the algorithms that were not libcall-based
22093 	     were rep-prefix based and we are unable to use rep prefixes
22094 	     based on global register usage.  Break out of the loop and
22095 	     use the heuristic below.  */
22096 	  if (algs->size[i].max == 0)
22097 	    break;
22098 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22099 	    {
22100 	      enum stringop_alg candidate = algs->size[i].alg;
22101 
22102 	      if (candidate != libcall && ALG_USABLE_P (candidate))
22103 		alg = candidate;
22104 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22105 		 last non-libcall inline algorithm.  */
22106 	      if (TARGET_INLINE_ALL_STRINGOPS)
22107 		{
22108 		  /* When the current size is best to be copied by a libcall,
22109 		     but we are still forced to inline, run the heuristic below
22110 		     that will pick code for medium sized blocks.  */
22111 		  if (alg != libcall)
22112 		    return alg;
22113 		  break;
22114 		}
22115 	      else if (ALG_USABLE_P (candidate))
22116 		return candidate;
22117 	    }
22118 	}
22119       gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22120     }
22121   /* When asked to inline the call anyway, try to pick a meaningful choice.
22122      We look for the maximal size of block that is faster to copy by hand
22123      and take blocks of at most that size, guessing that the average size
22124      will be roughly half of the block.
22125 
22126      If this turns out to be bad, we might simply specify the preferred
22127      choice in ix86_costs.  */
22128   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22129       && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22130     {
22131       int max = -1;
22132       enum stringop_alg alg;
22133       int i;
22134       bool any_alg_usable_p = true;
22135 
22136       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22137         {
22138           enum stringop_alg candidate = algs->size[i].alg;
22139           any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22140 
22141           if (candidate != libcall && candidate
22142               && ALG_USABLE_P (candidate))
22143               max = algs->size[i].max;
22144         }
22145       /* If there aren't any usable algorithms, then recursing on
22146          smaller sizes isn't going to find anything.  Just return the
22147          simple byte-at-a-time copy loop.  */
22148       if (!any_alg_usable_p)
22149         {
22150           /* Pick something reasonable.  */
22151           if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22152             *dynamic_check = 128;
22153           return loop_1_byte;
22154         }
22155       if (max == -1)
22156 	max = 4096;
22157       alg = decide_alg (count, max / 2, memset, dynamic_check);
22158       gcc_assert (*dynamic_check == -1);
22159       gcc_assert (alg != libcall);
22160       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22161 	*dynamic_check = max;
22162       return alg;
22163     }
22164   return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22165 #undef ALG_USABLE_P
22166 }
22167 
22168 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
22169    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
22170 static int
22171 decide_alignment (int align,
22172 		  enum stringop_alg alg,
22173 		  int expected_size)
22174 {
22175   int desired_align = 0;
22176   switch (alg)
22177     {
22178       case no_stringop:
22179 	gcc_unreachable ();
22180       case loop:
22181       case unrolled_loop:
22182 	desired_align = GET_MODE_SIZE (Pmode);
22183 	break;
22184       case rep_prefix_8_byte:
22185 	desired_align = 8;
22186 	break;
22187       case rep_prefix_4_byte:
22188 	/* PentiumPro has special logic triggering for 8 byte aligned blocks,
22189 	   copying whole cachelines at once.  */
22190 	if (TARGET_PENTIUMPRO)
22191 	  desired_align = 8;
22192 	else
22193 	  desired_align = 4;
22194 	break;
22195       case rep_prefix_1_byte:
22196 	/* PentiumPro has special logic triggering for 8 byte aligned blocks,
22197 	   copying whole cachelines at once.  */
22198 	if (TARGET_PENTIUMPRO)
22199 	  desired_align = 8;
22200 	else
22201 	  desired_align = 1;
22202 	break;
22203       case loop_1_byte:
22204 	desired_align = 1;
22205 	break;
22206       case libcall:
22207 	return 0;
22208     }
22209 
22210   if (optimize_size)
22211     desired_align = 1;
22212   if (desired_align < align)
22213     desired_align = align;
22214   if (expected_size != -1 && expected_size < 4)
22215     desired_align = align;
22216   return desired_align;
22217 }
22218 
22219 /* Return the smallest power of 2 greater than VAL.  */
22220 static int
22221 smallest_pow2_greater_than (int val)
22222 {
22223   int ret = 1;
22224   while (ret <= val)
22225     ret <<= 1;
22226   return ret;
22227 }
22228 
22229 /* Expand string move (memcpy) operation.  Use i386 string operations
22230    when profitable.  expand_setmem contains similar code.  The code
22231    depends upon architecture, block size and alignment, but always has
22232    the same overall structure:
22233 
22234    1) Prologue guard: Conditional that jumps up to epilogues for small
22235       blocks that can be handled by the epilogue alone.  This is faster,
22236       but also needed for correctness, since the prologue assumes the block
22237       is larger than the desired alignment.
22238 
22239       Optional dynamic check for size and libcall for large
22240       blocks is emitted here too, with -minline-stringops-dynamically.
22241 
22242    2) Prologue: copy first few bytes in order to get destination
22243       aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
22244       than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22245       copied.  We emit either a jump tree on power of two sized
22246       blocks, or a byte loop.
22247 
22248    3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22249       with specified algorithm.
22250 
22251    4) Epilogue: code copying tail of the block that is too small to be
22252       handled by main body (or up to size guarded by prologue guard).  */
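/* As a rough illustration (not the exact emitted sequence), a copy expanded
   with rep_prefix_4_byte and DESIRED_ALIGN == 4 on a 1-byte-aligned
   destination looks like:

	if (count < 4) goto epilogue;		(1: prologue guard)
	copy 1 and/or 2 bytes until dst is
	  4-byte aligned, adjusting count	(2: alignment prologue)
	rep movsl with count / 4		(3: main body)
     epilogue:
	copy the remaining count & 3 bytes	(4: epilogue)  */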
22253 
22254 bool
22255 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22256 		    rtx expected_align_exp, rtx expected_size_exp)
22257 {
22258   rtx destreg;
22259   rtx srcreg;
22260   rtx label = NULL;
22261   rtx tmp;
22262   rtx jump_around_label = NULL;
22263   HOST_WIDE_INT align = 1;
22264   unsigned HOST_WIDE_INT count = 0;
22265   HOST_WIDE_INT expected_size = -1;
22266   int size_needed = 0, epilogue_size_needed;
22267   int desired_align = 0, align_bytes = 0;
22268   enum stringop_alg alg;
22269   int dynamic_check;
22270   bool need_zero_guard = false;
22271 
22272   if (CONST_INT_P (align_exp))
22273     align = INTVAL (align_exp);
22274   /* i386 can do misaligned access at reasonably increased cost.  */
22275   if (CONST_INT_P (expected_align_exp)
22276       && INTVAL (expected_align_exp) > align)
22277     align = INTVAL (expected_align_exp);
22278   /* ALIGN is the minimum of destination and source alignment, but we care here
22279      just about destination alignment.  */
22280   else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22281     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22282 
22283   if (CONST_INT_P (count_exp))
22284     count = expected_size = INTVAL (count_exp);
22285   if (CONST_INT_P (expected_size_exp) && count == 0)
22286     expected_size = INTVAL (expected_size_exp);
22287 
22288   /* Make sure we don't need to care about overflow later on.  */
22289   if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22290     return false;
22291 
22292   /* Step 0: Decide on preferred algorithm, desired alignment and
22293      size of chunks to be copied by main loop.  */
22294 
22295   alg = decide_alg (count, expected_size, false, &dynamic_check);
22296   desired_align = decide_alignment (align, alg, expected_size);
22297 
22298   if (!TARGET_ALIGN_STRINGOPS)
22299     align = desired_align;
22300 
22301   if (alg == libcall)
22302     return false;
22303   gcc_assert (alg != no_stringop);
22304   if (!count)
22305     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22306   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22307   srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
22308   switch (alg)
22309     {
22310     case libcall:
22311     case no_stringop:
22312       gcc_unreachable ();
22313     case loop:
22314       need_zero_guard = true;
22315       size_needed = GET_MODE_SIZE (Pmode);
22316       break;
22317     case unrolled_loop:
22318       need_zero_guard = true;
22319       size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
22320       break;
22321     case rep_prefix_8_byte:
22322       size_needed = 8;
22323       break;
22324     case rep_prefix_4_byte:
22325       size_needed = 4;
22326       break;
22327     case rep_prefix_1_byte:
22328       size_needed = 1;
22329       break;
22330     case loop_1_byte:
22331       need_zero_guard = true;
22332       size_needed = 1;
22333       break;
22334     }
22335 
22336   epilogue_size_needed = size_needed;
22337 
22338   /* Step 1: Prologue guard.  */
22339 
22340   /* Alignment code needs count to be in register.  */
22341   if (CONST_INT_P (count_exp) && desired_align > align)
22342     {
22343       if (INTVAL (count_exp) > desired_align
22344 	  && INTVAL (count_exp) > size_needed)
22345 	{
22346 	  align_bytes
22347 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22348 	  if (align_bytes <= 0)
22349 	    align_bytes = 0;
22350 	  else
22351 	    align_bytes = desired_align - align_bytes;
22352 	}
22353       if (align_bytes == 0)
22354 	count_exp = force_reg (counter_mode (count_exp), count_exp);
22355     }
22356   gcc_assert (desired_align >= 1 && align >= 1);
22357 
22358   /* Ensure that alignment prologue won't copy past end of block.  */
22359   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22360     {
22361       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22362       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22363 	 Make sure it is a power of 2.  */
22364       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22365 
22366       if (count)
22367 	{
22368 	  if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22369 	    {
22370 	      /* If main algorithm works on QImode, no epilogue is needed.
22371 		 For small sizes just don't align anything.  */
22372 	      if (size_needed == 1)
22373 		desired_align = align;
22374 	      else
22375 		goto epilogue;
22376 	    }
22377 	}
22378       else
22379 	{
22380 	  label = gen_label_rtx ();
22381 	  emit_cmp_and_jump_insns (count_exp,
22382 				   GEN_INT (epilogue_size_needed),
22383 				   LTU, 0, counter_mode (count_exp), 1, label);
22384 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
22385 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22386 	  else
22387 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22388 	}
22389     }
22390 
22391   /* Emit code to decide at runtime whether a library call or inline code
22392      should be used.  */
22393   if (dynamic_check != -1)
22394     {
22395       if (CONST_INT_P (count_exp))
22396 	{
22397 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22398 	    {
22399 	      emit_block_move_via_libcall (dst, src, count_exp, false);
22400 	      count_exp = const0_rtx;
22401 	      goto epilogue;
22402 	    }
22403 	}
22404       else
22405 	{
22406 	  rtx hot_label = gen_label_rtx ();
22407 	  jump_around_label = gen_label_rtx ();
22408 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22409 				   LEU, 0, GET_MODE (count_exp), 1, hot_label);
22410 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
22411 	  emit_block_move_via_libcall (dst, src, count_exp, false);
22412 	  emit_jump (jump_around_label);
22413 	  emit_label (hot_label);
22414 	}
22415     }
22416 
22417   /* Step 2: Alignment prologue.  */
22418 
22419   if (desired_align > align)
22420     {
22421       if (align_bytes == 0)
22422 	{
22423 	  /* Except for the first move in the epilogue, we no longer know
22424 	     the constant offset in aliasing info.  It doesn't seem worth
22425 	     the pain to maintain it for the first move, so throw away
22426 	     the info early.  */
22427 	  src = change_address (src, BLKmode, srcreg);
22428 	  dst = change_address (dst, BLKmode, destreg);
22429 	  expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22430 				  desired_align);
22431 	}
22432       else
22433 	{
22434 	  /* If we know how many bytes need to be stored before dst is
22435 	     sufficiently aligned, maintain aliasing info accurately.  */
22436 	  dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22437 						 desired_align, align_bytes);
22438 	  count_exp = plus_constant (count_exp, -align_bytes);
22439 	  count -= align_bytes;
22440 	}
22441       if (need_zero_guard
22442 	  && (count < (unsigned HOST_WIDE_INT) size_needed
22443 	      || (align_bytes == 0
22444 		  && count < ((unsigned HOST_WIDE_INT) size_needed
22445 			      + desired_align - align))))
22446 	{
22447 	  /* It is possible that we copied enough so the main loop will not
22448 	     execute.  */
22449 	  gcc_assert (size_needed > 1);
22450 	  if (label == NULL_RTX)
22451 	    label = gen_label_rtx ();
22452 	  emit_cmp_and_jump_insns (count_exp,
22453 				   GEN_INT (size_needed),
22454 				   LTU, 0, counter_mode (count_exp), 1, label);
22455 	  if (expected_size == -1
22456 	      || expected_size < (desired_align - align) / 2 + size_needed)
22457 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22458 	  else
22459 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22460 	}
22461     }
22462   if (label && size_needed == 1)
22463     {
22464       emit_label (label);
22465       LABEL_NUSES (label) = 1;
22466       label = NULL;
22467       epilogue_size_needed = 1;
22468     }
22469   else if (label == NULL_RTX)
22470     epilogue_size_needed = size_needed;
22471 
22472   /* Step 3: Main loop.  */
22473 
22474   switch (alg)
22475     {
22476     case libcall:
22477     case no_stringop:
22478       gcc_unreachable ();
22479     case loop_1_byte:
22480       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22481 				     count_exp, QImode, 1, expected_size);
22482       break;
22483     case loop:
22484       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22485 				     count_exp, Pmode, 1, expected_size);
22486       break;
22487     case unrolled_loop:
22488       /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22489 	 registers for 4 temporaries anyway.  */
22490       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22491 				     count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22492 				     expected_size);
22493       break;
22494     case rep_prefix_8_byte:
22495       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22496 				 DImode);
22497       break;
22498     case rep_prefix_4_byte:
22499       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22500 				 SImode);
22501       break;
22502     case rep_prefix_1_byte:
22503       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22504 				 QImode);
22505       break;
22506     }
22507   /* Properly adjust the offsets of the src and dest memory for aliasing.  */
22508   if (CONST_INT_P (count_exp))
22509     {
22510       src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22511 					  (count / size_needed) * size_needed);
22512       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22513 					  (count / size_needed) * size_needed);
22514     }
22515   else
22516     {
22517       src = change_address (src, BLKmode, srcreg);
22518       dst = change_address (dst, BLKmode, destreg);
22519     }
22520 
22521   /* Step 4: Epilogue to copy the remaining bytes.  */
22522  epilogue:
22523   if (label)
22524     {
22525       /* When the main loop is done, COUNT_EXP might hold the original count,
22526 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22527 	 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22528 	 bytes.  Compensate if needed.  */
22529 
22530       if (size_needed < epilogue_size_needed)
22531 	{
22532 	  tmp =
22533 	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22534 				 GEN_INT (size_needed - 1), count_exp, 1,
22535 				 OPTAB_DIRECT);
22536 	  if (tmp != count_exp)
22537 	    emit_move_insn (count_exp, tmp);
22538 	}
22539       emit_label (label);
22540       LABEL_NUSES (label) = 1;
22541     }
22542 
22543   if (count_exp != const0_rtx && epilogue_size_needed > 1)
22544     expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22545 			    epilogue_size_needed);
22546   if (jump_around_label)
22547     emit_label (jump_around_label);
22548   return true;
22549 }
22550 
22551 /* Helper function for memset.  For the QImode value 0xXY produce
22552    0xXYXYXYXY of the width specified by MODE.  This is essentially
22553    a * 0x10101010, but we can do slightly better than
22554    synth_mult by unwinding the sequence by hand on CPUs with
22555    slow multiply.  */
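/* For example, VAL == 0xAB with MODE == SImode yields a register holding
   0xABABABAB.  The hand-unwound sequence below is essentially
   reg |= reg << 8; reg |= reg << 16; (plus reg |= reg << 32 for DImode),
   or a multiply by a promoted 0x01010101 when that is cheaper.  */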
22556 static rtx
22557 promote_duplicated_reg (enum machine_mode mode, rtx val)
22558 {
22559   enum machine_mode valmode = GET_MODE (val);
22560   rtx tmp;
22561   int nops = mode == DImode ? 3 : 2;
22562 
22563   gcc_assert (mode == SImode || mode == DImode);
22564   if (val == const0_rtx)
22565     return copy_to_mode_reg (mode, const0_rtx);
22566   if (CONST_INT_P (val))
22567     {
22568       HOST_WIDE_INT v = INTVAL (val) & 255;
22569 
22570       v |= v << 8;
22571       v |= v << 16;
22572       if (mode == DImode)
22573         v |= (v << 16) << 16;
22574       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22575     }
22576 
22577   if (valmode == VOIDmode)
22578     valmode = QImode;
22579   if (valmode != QImode)
22580     val = gen_lowpart (QImode, val);
22581   if (mode == QImode)
22582     return val;
22583   if (!TARGET_PARTIAL_REG_STALL)
22584     nops--;
22585   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22586       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22587       <= (ix86_cost->shift_const + ix86_cost->add) * nops
22588           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22589     {
22590       rtx reg = convert_modes (mode, QImode, val, true);
22591       tmp = promote_duplicated_reg (mode, const1_rtx);
22592       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22593 				  OPTAB_DIRECT);
22594     }
22595   else
22596     {
22597       rtx reg = convert_modes (mode, QImode, val, true);
22598 
22599       if (!TARGET_PARTIAL_REG_STALL)
22600 	if (mode == SImode)
22601 	  emit_insn (gen_movsi_insv_1 (reg, reg));
22602 	else
22603 	  emit_insn (gen_movdi_insv_1 (reg, reg));
22604       else
22605 	{
22606 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22607 				     NULL, 1, OPTAB_DIRECT);
22608 	  reg =
22609 	    expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22610 	}
22611       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22612 			         NULL, 1, OPTAB_DIRECT);
22613       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22614       if (mode == SImode)
22615 	return reg;
22616       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22617 				 NULL, 1, OPTAB_DIRECT);
22618       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22619       return reg;
22620     }
22621 }
22622 
22623 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
22624    that will be needed by the main loop copying SIZE_NEEDED chunks and by the
22625    prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
22626 static rtx
22627 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22628 {
22629   rtx promoted_val;
22630 
22631   if (TARGET_64BIT
22632       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22633     promoted_val = promote_duplicated_reg (DImode, val);
22634   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22635     promoted_val = promote_duplicated_reg (SImode, val);
22636   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22637     promoted_val = promote_duplicated_reg (HImode, val);
22638   else
22639     promoted_val = val;
22640 
22641   return promoted_val;
22642 }
22643 
22644 /* Expand string set operation (memset/bzero).  Use i386 string operations when
22645    profitable.  See expand_movmem comment for explanation of individual
22646    steps performed.  */
22647 bool
22648 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22649 		    rtx expected_align_exp, rtx expected_size_exp)
22650 {
22651   rtx destreg;
22652   rtx label = NULL;
22653   rtx tmp;
22654   rtx jump_around_label = NULL;
22655   HOST_WIDE_INT align = 1;
22656   unsigned HOST_WIDE_INT count = 0;
22657   HOST_WIDE_INT expected_size = -1;
22658   int size_needed = 0, epilogue_size_needed;
22659   int desired_align = 0, align_bytes = 0;
22660   enum stringop_alg alg;
22661   rtx promoted_val = NULL;
22662   bool force_loopy_epilogue = false;
22663   int dynamic_check;
22664   bool need_zero_guard = false;
22665 
22666   if (CONST_INT_P (align_exp))
22667     align = INTVAL (align_exp);
22668   /* i386 can do misaligned access at reasonably increased cost.  */
22669   if (CONST_INT_P (expected_align_exp)
22670       && INTVAL (expected_align_exp) > align)
22671     align = INTVAL (expected_align_exp);
22672   if (CONST_INT_P (count_exp))
22673     count = expected_size = INTVAL (count_exp);
22674   if (CONST_INT_P (expected_size_exp) && count == 0)
22675     expected_size = INTVAL (expected_size_exp);
22676 
22677   /* Make sure we don't need to care about overflow later on.  */
22678   if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22679     return false;
22680 
22681   /* Step 0: Decide on preferred algorithm, desired alignment and
22682      size of chunks to be copied by main loop.  */
22683 
22684   alg = decide_alg (count, expected_size, true, &dynamic_check);
22685   desired_align = decide_alignment (align, alg, expected_size);
22686 
22687   if (!TARGET_ALIGN_STRINGOPS)
22688     align = desired_align;
22689 
22690   if (alg == libcall)
22691     return false;
22692   gcc_assert (alg != no_stringop);
22693   if (!count)
22694     count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22695   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22696   switch (alg)
22697     {
22698     case libcall:
22699     case no_stringop:
22700       gcc_unreachable ();
22701     case loop:
22702       need_zero_guard = true;
22703       size_needed = GET_MODE_SIZE (Pmode);
22704       break;
22705     case unrolled_loop:
22706       need_zero_guard = true;
22707       size_needed = GET_MODE_SIZE (Pmode) * 4;
22708       break;
22709     case rep_prefix_8_byte:
22710       size_needed = 8;
22711       break;
22712     case rep_prefix_4_byte:
22713       size_needed = 4;
22714       break;
22715     case rep_prefix_1_byte:
22716       size_needed = 1;
22717       break;
22718     case loop_1_byte:
22719       need_zero_guard = true;
22720       size_needed = 1;
22721       break;
22722     }
22723   epilogue_size_needed = size_needed;
22724 
22725   /* Step 1: Prologue guard.  */
22726 
22727   /* Alignment code needs count to be in register.  */
22728   if (CONST_INT_P (count_exp) && desired_align > align)
22729     {
22730       if (INTVAL (count_exp) > desired_align
22731 	  && INTVAL (count_exp) > size_needed)
22732 	{
22733 	  align_bytes
22734 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22735 	  if (align_bytes <= 0)
22736 	    align_bytes = 0;
22737 	  else
22738 	    align_bytes = desired_align - align_bytes;
22739 	}
22740       if (align_bytes == 0)
22741 	{
22742 	  enum machine_mode mode = SImode;
22743 	  if (TARGET_64BIT && (count & ~0xffffffff))
22744 	    mode = DImode;
22745 	  count_exp = force_reg (mode, count_exp);
22746 	}
22747     }
22748   /* Do the cheap promotion to allow better CSE across the
22749      main loop and epilogue (i.e. one load of the big constant in
22750      front of all code).  */
22751   if (CONST_INT_P (val_exp))
22752     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22753 						   desired_align, align);
22754   /* Ensure that alignment prologue won't copy past end of block.  */
22755   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22756     {
22757       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22758       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22759 	 Make sure it is a power of 2.  */
22760       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22761 
22762       /* To improve performance of small blocks, we jump around the VAL
22763 	 promoting code.  This means that if the promoted VAL is not constant,
22764 	 we might not use it in the epilogue and have to use the byte
22765 	 loop variant.  */
22766       if (epilogue_size_needed > 2 && !promoted_val)
22767         force_loopy_epilogue = true;
22768       if (count)
22769 	{
22770 	  if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22771 	    {
22772 	      /* If main algorithm works on QImode, no epilogue is needed.
22773 		 For small sizes just don't align anything.  */
22774 	      if (size_needed == 1)
22775 		desired_align = align;
22776 	      else
22777 		goto epilogue;
22778 	    }
22779 	}
22780       else
22781 	{
22782 	  label = gen_label_rtx ();
22783 	  emit_cmp_and_jump_insns (count_exp,
22784 				   GEN_INT (epilogue_size_needed),
22785 				   LTU, 0, counter_mode (count_exp), 1, label);
22786 	  if (expected_size == -1 || expected_size <= epilogue_size_needed)
22787 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22788 	  else
22789 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22790 	}
22791     }
22792   if (dynamic_check != -1)
22793     {
22794       rtx hot_label = gen_label_rtx ();
22795       jump_around_label = gen_label_rtx ();
22796       emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22797 			       LEU, 0, counter_mode (count_exp), 1, hot_label);
22798       predict_jump (REG_BR_PROB_BASE * 90 / 100);
22799       set_storage_via_libcall (dst, count_exp, val_exp, false);
22800       emit_jump (jump_around_label);
22801       emit_label (hot_label);
22802     }
22803 
22804   /* Step 2: Alignment prologue.  */
22805 
22806   /* Do the expensive promotion once we branched off the small blocks.  */
22807   if (!promoted_val)
22808     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22809 						   desired_align, align);
22810   gcc_assert (desired_align >= 1 && align >= 1);
22811 
22812   if (desired_align > align)
22813     {
22814       if (align_bytes == 0)
22815 	{
22816 	  /* Except for the first move in the epilogue, we no longer know
22817 	     the constant offset in aliasing info.  It doesn't seem worth
22818 	     the pain to maintain it for the first move, so throw away
22819 	     the info early.  */
22820 	  dst = change_address (dst, BLKmode, destreg);
22821 	  expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22822 				  desired_align);
22823 	}
22824       else
22825 	{
22826 	  /* If we know how many bytes need to be stored before dst is
22827 	     sufficiently aligned, maintain aliasing info accurately.  */
22828 	  dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22829 						 desired_align, align_bytes);
22830 	  count_exp = plus_constant (count_exp, -align_bytes);
22831 	  count -= align_bytes;
22832 	}
22833       if (need_zero_guard
22834 	  && (count < (unsigned HOST_WIDE_INT) size_needed
22835 	      || (align_bytes == 0
22836 		  && count < ((unsigned HOST_WIDE_INT) size_needed
22837 			      + desired_align - align))))
22838 	{
22839 	  /* It is possible that we copied enough so the main loop will not
22840 	     execute.  */
22841 	  gcc_assert (size_needed > 1);
22842 	  if (label == NULL_RTX)
22843 	    label = gen_label_rtx ();
22844 	  emit_cmp_and_jump_insns (count_exp,
22845 				   GEN_INT (size_needed),
22846 				   LTU, 0, counter_mode (count_exp), 1, label);
22847 	  if (expected_size == -1
22848 	      || expected_size < (desired_align - align) / 2 + size_needed)
22849 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
22850 	  else
22851 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
22852 	}
22853     }
22854   if (label && size_needed == 1)
22855     {
22856       emit_label (label);
22857       LABEL_NUSES (label) = 1;
22858       label = NULL;
22859       promoted_val = val_exp;
22860       epilogue_size_needed = 1;
22861     }
22862   else if (label == NULL_RTX)
22863     epilogue_size_needed = size_needed;
22864 
22865   /* Step 3: Main loop.  */
22866 
22867   switch (alg)
22868     {
22869     case libcall:
22870     case no_stringop:
22871       gcc_unreachable ();
22872     case loop_1_byte:
22873       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22874 				     count_exp, QImode, 1, expected_size);
22875       break;
22876     case loop:
22877       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22878 				     count_exp, Pmode, 1, expected_size);
22879       break;
22880     case unrolled_loop:
22881       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22882 				     count_exp, Pmode, 4, expected_size);
22883       break;
22884     case rep_prefix_8_byte:
22885       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22886 				  DImode, val_exp);
22887       break;
22888     case rep_prefix_4_byte:
22889       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22890 				  SImode, val_exp);
22891       break;
22892     case rep_prefix_1_byte:
22893       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22894 				  QImode, val_exp);
22895       break;
22896     }
22897   /* Adjust properly the offset of src and dest memory for aliasing.  */
22898   if (CONST_INT_P (count_exp))
22899     dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22900 					(count / size_needed) * size_needed);
22901   else
22902     dst = change_address (dst, BLKmode, destreg);
22903 
22904   /* Step 4: Epilogue to copy the remaining bytes.  */
22905 
22906   if (label)
22907     {
22908       /* When the main loop is done, COUNT_EXP might hold the original count,
22909 	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
22910 	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
22911 	 bytes.  Compensate if needed.  */
22912 
22913       if (size_needed < epilogue_size_needed)
22914 	{
22915 	  tmp =
22916 	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22917 				 GEN_INT (size_needed - 1), count_exp, 1,
22918 				 OPTAB_DIRECT);
22919 	  if (tmp != count_exp)
22920 	    emit_move_insn (count_exp, tmp);
22921 	}
22922       emit_label (label);
22923       LABEL_NUSES (label) = 1;
22924     }
22925  epilogue:
22926   if (count_exp != const0_rtx && epilogue_size_needed > 1)
22927     {
22928       if (force_loopy_epilogue)
22929 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22930 					 epilogue_size_needed);
22931       else
22932 	expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22933 				epilogue_size_needed);
22934     }
22935   if (jump_around_label)
22936     emit_label (jump_around_label);
22937   return true;
22938 }
22939 
22940 /* Expand the appropriate insns for doing strlen if not just doing
22941    repnz; scasb
22942 
22943    out = result, initialized with the start address
22944    align_rtx = alignment of the address.
22945    scratch = scratch register, initialized with the start address when
22946 	not aligned, otherwise undefined
22947 
22948    This is just the body.  It needs the initializations mentioned above and
22949    some address computation at the end.  These things are done in i386.md.  */
22950 
22951 static void
22952 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22953 {
22954   int align;
22955   rtx tmp;
22956   rtx align_2_label = NULL_RTX;
22957   rtx align_3_label = NULL_RTX;
22958   rtx align_4_label = gen_label_rtx ();
22959   rtx end_0_label = gen_label_rtx ();
22960   rtx mem;
22961   rtx tmpreg = gen_reg_rtx (SImode);
22962   rtx scratch = gen_reg_rtx (SImode);
22963   rtx cmp;
22964 
22965   align = 0;
22966   if (CONST_INT_P (align_rtx))
22967     align = INTVAL (align_rtx);
22968 
22969   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
22970 
22971   /* Is there a known alignment and is it less than 4?  */
22972   if (align < 4)
22973     {
22974       rtx scratch1 = gen_reg_rtx (Pmode);
22975       emit_move_insn (scratch1, out);
22976       /* Is there a known alignment and is it not 2? */
22977       if (align != 2)
22978 	{
22979 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22980 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22981 
22982 	  /* Leave just the 3 lower bits.  */
22983 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22984 				    NULL_RTX, 0, OPTAB_WIDEN);
22985 
22986 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22987 				   Pmode, 1, align_4_label);
22988 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22989 				   Pmode, 1, align_2_label);
22990 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22991 				   Pmode, 1, align_3_label);
22992 	}
22993       else
22994         {
22995 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
22996 	     check whether it is aligned to a 4-byte boundary.  */
22997 
22998 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22999 				    NULL_RTX, 0, OPTAB_WIDEN);
23000 
23001 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23002 				   Pmode, 1, align_4_label);
23003         }
23004 
23005       mem = change_address (src, QImode, out);
23006 
23007       /* Now compare the bytes.  */
23008 
23009       /* Compare the first n unaligned bytes one byte at a time.  */
23010       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23011 			       QImode, 1, end_0_label);
23012 
23013       /* Increment the address.  */
23014       emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23015 
23016       /* Not needed with an alignment of 2 */
23017       if (align != 2)
23018 	{
23019 	  emit_label (align_2_label);
23020 
23021 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23022 				   end_0_label);
23023 
23024 	  emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23025 
23026 	  emit_label (align_3_label);
23027 	}
23028 
23029       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23030 			       end_0_label);
23031 
23032       emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23033     }
23034 
23035   /* Generate a loop to check 4 bytes at a time.  Aligning this loop is
23036      not a good idea: it only makes the program larger and does not make
23037      it any faster.  */
23038   emit_label (align_4_label);
23039 
23040   mem = change_address (src, SImode, out);
23041   emit_move_insn (scratch, mem);
23042   emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23043 
23044   /* This formula yields a nonzero result iff one of the bytes is zero.
23045      This saves three branches inside the loop and many cycles.  */
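  /* Worked example (added for clarity): for the word 0x11003344, which
     contains a zero byte, (w - 0x01010101) == 0x0FFF3243 and ~w == 0xEEFFCCBB;
     their AND masked with 0x80808080 is 0x00800000, i.e. nonzero.  For
     0x01020304, which has no zero byte, the same computation yields 0.  */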
23046 
23047   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23048   emit_insn (gen_one_cmplsi2 (scratch, scratch));
23049   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23050   emit_insn (gen_andsi3 (tmpreg, tmpreg,
23051 			 gen_int_mode (0x80808080, SImode)));
23052   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23053 			   align_4_label);
23054 
23055   if (TARGET_CMOVE)
23056     {
23057        rtx reg = gen_reg_rtx (SImode);
23058        rtx reg2 = gen_reg_rtx (Pmode);
23059        emit_move_insn (reg, tmpreg);
23060        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23061 
23062        /* If zero is not in the first two bytes, move two bytes forward.  */
23063        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23064        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23065        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23066        emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23067 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
23068 						     reg,
23069 						     tmpreg)));
23070        /* Emit lea manually to avoid clobbering of flags.  */
23071        emit_insn (gen_rtx_SET (SImode, reg2,
23072 			       gen_rtx_PLUS (Pmode, out, const2_rtx)));
23073 
23074        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23075        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23076        emit_insn (gen_rtx_SET (VOIDmode, out,
23077 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23078 						     reg2,
23079 						     out)));
23080     }
23081   else
23082     {
23083        rtx end_2_label = gen_label_rtx ();
23084        /* Is zero in the first two bytes? */
23085 
23086        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23087        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23088        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23089        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23090                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23091                             pc_rtx);
23092        tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23093        JUMP_LABEL (tmp) = end_2_label;
23094 
23095        /* Not in the first two.  Move two bytes forward.  */
23096        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23097        emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23098 
23099        emit_label (end_2_label);
23100 
23101     }
23102 
23103   /* Avoid branch in fixing the byte.  */
23104   tmpreg = gen_lowpart (QImode, tmpreg);
23105   emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23106   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23107   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23108   emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23109 
23110   emit_label (end_0_label);
23111 }
23112 
23113 /* Expand strlen.  */
23114 
23115 bool
23116 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23117 {
23118   rtx addr, scratch1, scratch2, scratch3, scratch4;
23119 
23120   /* The generic case of the strlen expander is long.  Avoid its
23121      expansion unless TARGET_INLINE_ALL_STRINGOPS.  */
23122 
23123   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23124       && !TARGET_INLINE_ALL_STRINGOPS
23125       && !optimize_insn_for_size_p ()
23126       && (!CONST_INT_P (align) || INTVAL (align) < 4))
23127     return false;
23128 
23129   addr = force_reg (Pmode, XEXP (src, 0));
23130   scratch1 = gen_reg_rtx (Pmode);
23131 
23132   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23133       && !optimize_insn_for_size_p ())
23134     {
23135       /* Well, it seems that some optimizer does not combine a call like
23136          foo(strlen(bar), strlen(bar));
23137          when the move and the subtraction are done here.  It does calculate
23138          the length just once when these instructions are done inside
23139          output_strlen_unroll().  But I think that since &bar[strlen(bar)] is
23140          often used, and this uses one fewer register for the lifetime of
23141          output_strlen_unroll(), this is better.  */
23142 
23143       emit_move_insn (out, addr);
23144 
23145       ix86_expand_strlensi_unroll_1 (out, src, align);
23146 
23147       /* strlensi_unroll_1 returns the address of the zero at the end of
23148          the string, like memchr(), so compute the length by subtracting
23149          the start address.  */
23150       emit_insn (ix86_gen_sub3 (out, out, addr));
23151     }
23152   else
23153     {
23154       rtx unspec;
23155 
23156       /* Can't use this if the user has appropriated eax, ecx, or edi.  */
23157       if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23158         return false;
23159 
23160       scratch2 = gen_reg_rtx (Pmode);
23161       scratch3 = gen_reg_rtx (Pmode);
23162       scratch4 = force_reg (Pmode, constm1_rtx);
23163 
23164       emit_move_insn (scratch3, addr);
23165       eoschar = force_reg (QImode, eoschar);
23166 
23167       src = replace_equiv_address_nv (src, scratch3);
23168 
23169       /* If .md starts supporting :P, this can be done in .md.  */
23170       unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23171 						 scratch4), UNSPEC_SCAS);
23172       emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23173       emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23174       emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23175     }
23176   return true;
23177 }
23178 
23179 /* For a given symbol (function), construct code to compute the address of
23180    its PLT entry in the large x86-64 PIC model.  */
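/* As an illustrative sketch (not part of the original comment), the RTL
   emitted below loads symbol@PLTOFF into a fresh register and adds the GOT
   base held in pic_offset_table_rtx, roughly:
       tmp = symbol@PLTOFF
       tmp = tmp + <GOT base>
   The actual hard registers are chosen later by register allocation.  */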
23181 rtx
23182 construct_plt_address (rtx symbol)
23183 {
23184   rtx tmp = gen_reg_rtx (Pmode);
23185   rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23186 
23187   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23188   gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23189 
23190   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23191   emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
23192   return tmp;
23193 }
23194 
23195 rtx
23196 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23197 		  rtx callarg2,
23198 		  rtx pop, bool sibcall)
23199 {
23200   /* We need to represent that SI and DI registers are clobbered
23201      by SYSV calls.  */
23202   static int clobbered_registers[] = {
23203 	XMM6_REG, XMM7_REG, XMM8_REG,
23204 	XMM9_REG, XMM10_REG, XMM11_REG,
23205 	XMM12_REG, XMM13_REG, XMM14_REG,
23206 	XMM15_REG, SI_REG, DI_REG
23207   };
23208   rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23209   rtx use = NULL, call;
23210   unsigned int vec_len;
23211 
23212   if (pop == const0_rtx)
23213     pop = NULL;
23214   gcc_assert (!TARGET_64BIT || !pop);
23215 
23216   if (TARGET_MACHO && !TARGET_64BIT)
23217     {
23218 #if TARGET_MACHO
23219       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23220 	fnaddr = machopic_indirect_call_target (fnaddr);
23221 #endif
23222     }
23223   else
23224     {
23225       /* Static functions and indirect calls don't need the pic register.  */
23226       if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23227 	  && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23228 	  && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23229 	use_reg (&use, pic_offset_table_rtx);
23230     }
23231 
23232   if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23233     {
23234       rtx al = gen_rtx_REG (QImode, AX_REG);
23235       emit_move_insn (al, callarg2);
23236       use_reg (&use, al);
23237     }
23238 
23239   if (ix86_cmodel == CM_LARGE_PIC
23240       && MEM_P (fnaddr)
23241       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23242       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23243     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23244   else if (sibcall
23245 	   ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
23246 	   : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
23247     {
23248       fnaddr = XEXP (fnaddr, 0);
23249       if (GET_MODE (fnaddr) != Pmode)
23250 	fnaddr = convert_to_mode (Pmode, fnaddr, 1);
23251       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
23252     }
23253 
23254   vec_len = 0;
23255   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23256   if (retval)
23257     call = gen_rtx_SET (VOIDmode, retval, call);
23258   vec[vec_len++] = call;
23259 
23260   if (pop)
23261     {
23262       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23263       pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23264       vec[vec_len++] = pop;
23265     }
23266 
23267   if (TARGET_64BIT_MS_ABI
23268       && (!callarg2 || INTVAL (callarg2) != -2))
23269     {
23270       unsigned i;
23271 
23272       vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23273 				       UNSPEC_MS_TO_SYSV_CALL);
23274 
23275       for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23276         vec[vec_len++]
23277 	  = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23278 			     ? TImode : DImode,
23279 			     gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23280 					  ? TImode : DImode,
23281 					  clobbered_registers[i]));
23282     }
23283 
23284   /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration.  */
23285   if (TARGET_VZEROUPPER)
23286     {
23287       int avx256;
23288       if (cfun->machine->callee_pass_avx256_p)
23289 	{
23290 	  if (cfun->machine->callee_return_avx256_p)
23291 	    avx256 = callee_return_pass_avx256;
23292 	  else
23293 	    avx256 = callee_pass_avx256;
23294 	}
23295       else if (cfun->machine->callee_return_avx256_p)
23296 	avx256 = callee_return_avx256;
23297       else
23298 	avx256 = call_no_avx256;
23299 
23300       if (reload_completed)
23301 	emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23302       else
23303 	vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23304 					 gen_rtvec (1, GEN_INT (avx256)),
23305 					 UNSPEC_CALL_NEEDS_VZEROUPPER);
23306     }
23307 
23308   if (vec_len > 1)
23309     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23310   call = emit_call_insn (call);
23311   if (use)
23312     CALL_INSN_FUNCTION_USAGE (call) = use;
23313 
23314   return call;
23315 }
23316 
23317 void
23318 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23319 {
23320   rtx pat = PATTERN (insn);
23321   rtvec vec = XVEC (pat, 0);
23322   int len = GET_NUM_ELEM (vec) - 1;
23323 
23324   /* Strip off the last entry of the parallel.  */
23325   gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23326   gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23327   if (len == 1)
23328     pat = RTVEC_ELT (vec, 0);
23329   else
23330     pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23331 
23332   emit_insn (gen_avx_vzeroupper (vzeroupper));
23333   emit_call_insn (pat);
23334 }
23335 
23336 /* Output the assembly for a call instruction.  */
23337 
23338 const char *
23339 ix86_output_call_insn (rtx insn, rtx call_op)
23340 {
23341   bool direct_p = constant_call_address_operand (call_op, Pmode);
23342   bool seh_nop_p = false;
23343   const char *xasm;
23344 
23345   if (SIBLING_CALL_P (insn))
23346     {
23347       if (direct_p)
23348 	xasm = "jmp\t%P0";
23349       /* SEH epilogue detection requires the indirect branch case
23350 	 to include REX.W.  */
23351       else if (TARGET_SEH)
23352 	xasm = "rex.W jmp %A0";
23353       else
23354 	xasm = "jmp\t%A0";
23355 
23356       output_asm_insn (xasm, &call_op);
23357       return "";
23358     }
23359 
23360   /* SEH unwinding can require an extra nop to be emitted in several
23361      circumstances.  Determine if we have one of those.  */
23362   if (TARGET_SEH)
23363     {
23364       rtx i;
23365 
23366       for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23367 	{
23368 	  /* If we get to another real insn, we don't need the nop.  */
23369 	  if (INSN_P (i))
23370 	    break;
23371 
23372 	  /* If we get to the epilogue note, prevent a catch region from
23373 	     being adjacent to the standard epilogue sequence.  With non-call
23374 	     exceptions, we'll have done this during epilogue emission.  */
23375 	  if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23376 	      && !flag_non_call_exceptions
23377 	      && !can_throw_internal (insn))
23378 	    {
23379 	      seh_nop_p = true;
23380 	      break;
23381 	    }
23382 	}
23383 
23384       /* If we didn't find a real insn following the call, prevent the
23385 	 unwinder from looking into the next function.  */
23386       if (i == NULL)
23387 	seh_nop_p = true;
23388     }
23389 
23390   if (direct_p)
23391     xasm = "call\t%P0";
23392   else
23393     xasm = "call\t%A0";
23394 
23395   output_asm_insn (xasm, &call_op);
23396 
23397   if (seh_nop_p)
23398     return "nop";
23399 
23400   return "";
23401 }
23402 
23403 /* Clear stack slot assignments remembered from previous functions.
23404    This is called from INIT_EXPANDERS once before RTL is emitted for each
23405    function.  */
23406 
23407 static struct machine_function *
23408 ix86_init_machine_status (void)
23409 {
23410   struct machine_function *f;
23411 
23412   f = ggc_alloc_cleared_machine_function ();
23413   f->use_fast_prologue_epilogue_nregs = -1;
23414   f->call_abi = ix86_abi;
23415 
23416   return f;
23417 }
23418 
23419 /* Return a MEM corresponding to a stack slot with mode MODE.
23420    Allocate a new slot if necessary.
23421 
23422    The RTL for a function can have several slots available: N is
23423    which slot to use.  */
23424 
23425 rtx
23426 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23427 {
23428   struct stack_local_entry *s;
23429 
23430   gcc_assert (n < MAX_386_STACK_LOCALS);
23431 
23432   for (s = ix86_stack_locals; s; s = s->next)
23433     if (s->mode == mode && s->n == n)
23434       return validize_mem (copy_rtx (s->rtl));
23435 
23436   s = ggc_alloc_stack_local_entry ();
23437   s->n = n;
23438   s->mode = mode;
23439   s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23440 
23441   s->next = ix86_stack_locals;
23442   ix86_stack_locals = s;
23443   return validize_mem (s->rtl);
23444 }
23445 
23446 static void
23447 ix86_instantiate_decls (void)
23448 {
23449   struct stack_local_entry *s;
23450 
23451   for (s = ix86_stack_locals; s; s = s->next)
23452     if (s->rtl != NULL_RTX)
23453       instantiate_decl_rtl (s->rtl);
23454 }
23455 
23456 /* Calculate the length of the memory address in the instruction encoding.
23457    This includes the addr32 prefix but not the one-byte modrm, opcode,
23458    or other prefixes.  We never generate the addr32 prefix for an LEA insn.  */
23459 
23460 int
23461 memory_address_length (rtx addr, bool lea)
23462 {
23463   struct ix86_address parts;
23464   rtx base, index, disp;
23465   int len;
23466   int ok;
23467 
23468   if (GET_CODE (addr) == PRE_DEC
23469       || GET_CODE (addr) == POST_INC
23470       || GET_CODE (addr) == PRE_MODIFY
23471       || GET_CODE (addr) == POST_MODIFY)
23472     return 0;
23473 
23474   ok = ix86_decompose_address (addr, &parts);
23475   gcc_assert (ok);
23476 
23477   len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23478 
23479   /* If this is not an LEA instruction, add the length of the addr32 prefix.  */
23480   if (TARGET_64BIT && !lea
23481       && (SImode_address_operand (addr, VOIDmode)
23482 	  || (parts.base && GET_MODE (parts.base) == SImode)
23483 	  || (parts.index && GET_MODE (parts.index) == SImode)))
23484     len++;
23485 
23486   base = parts.base;
23487   index = parts.index;
23488   disp = parts.disp;
23489 
23490   if (base && GET_CODE (base) == SUBREG)
23491     base = SUBREG_REG (base);
23492   if (index && GET_CODE (index) == SUBREG)
23493     index = SUBREG_REG (index);
23494 
23495   gcc_assert (base == NULL_RTX || REG_P (base));
23496   gcc_assert (index == NULL_RTX || REG_P (index));
23497 
23498   /* Rule of thumb:
23499        - esp as the base always wants an index,
23500        - ebp as the base always wants a displacement,
23501        - r12 as the base always wants an index,
23502        - r13 as the base always wants a displacement.  */
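  /* Illustrative examples (added for clarity) of the extra bytes counted
     below, beyond the segment and addr32 prefixes handled above:
	(%eax)        plain register indirect       -> +0
	(%esp)        needs a SIB byte              -> +1
	(%ebp)        needs a zero disp8            -> +1
	16(%eax)      disp fits in a signed byte    -> +1
	1024(%eax)    disp needs four bytes         -> +4
	(%eax,%ecx)   index forces the SIB byte     -> +1  */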
23503 
23504   /* Register Indirect.  */
23505   if (base && !index && !disp)
23506     {
23507       /* esp (for its index) and ebp (for its displacement) need
23508 	 the two-byte modrm form.  Similarly for r12 and r13 in 64-bit
23509 	 code.  */
23510       if (base == arg_pointer_rtx
23511 	  || base == frame_pointer_rtx
23512 	  || REGNO (base) == SP_REG
23513 	  || REGNO (base) == BP_REG
23514 	  || REGNO (base) == R12_REG
23515 	  || REGNO (base) == R13_REG)
23516 	len++;
23517     }
23518 
23519   /* Direct Addressing.  In 64-bit mode mod 00 r/m 5
23520      is not disp32, but disp32(%rip), so for plain disp32 a
23521      SIB byte is needed, unless print_operand_address
23522      optimizes it into disp32(%rip) or (%rip) is implied
23523      by UNSPEC.  */
23524   else if (disp && !base && !index)
23525     {
23526       len += 4;
23527       if (TARGET_64BIT)
23528 	{
23529 	  rtx symbol = disp;
23530 
23531 	  if (GET_CODE (disp) == CONST)
23532 	    symbol = XEXP (disp, 0);
23533 	  if (GET_CODE (symbol) == PLUS
23534 	      && CONST_INT_P (XEXP (symbol, 1)))
23535 	    symbol = XEXP (symbol, 0);
23536 
23537 	  if (GET_CODE (symbol) != LABEL_REF
23538 	      && (GET_CODE (symbol) != SYMBOL_REF
23539 		  || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23540 	      && (GET_CODE (symbol) != UNSPEC
23541 		  || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23542 		      && XINT (symbol, 1) != UNSPEC_PCREL
23543 		      && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23544 	    len++;
23545 	}
23546     }
23547   else
23548     {
23549       /* Find the length of the displacement constant.  */
23550       if (disp)
23551 	{
23552 	  if (base && satisfies_constraint_K (disp))
23553 	    len += 1;
23554 	  else
23555 	    len += 4;
23556 	}
23557       /* ebp always wants a displacement.  Similarly r13.  */
23558       else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23559 	len++;
23560 
23561       /* An index requires the two-byte modrm form....  */
23562       if (index
23563 	  /* ...like esp (or r12), which always wants an index.  */
23564 	  || base == arg_pointer_rtx
23565 	  || base == frame_pointer_rtx
23566 	  || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23567 	len++;
23568     }
23569 
23570   return len;
23571 }
23572 
23573 /* Compute default value for "length_immediate" attribute.  When SHORTFORM
23574    is set, expect that the insn has an 8-bit immediate alternative.  */
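/* For example (illustrative): "add $5, %eax" has an imm8 alternative, so
   with SHORTFORM the immediate contributes 1 byte, while "add $300, %eax"
   does not fit in a signed byte and needs a 4-byte immediate.  DImode
   immediates are likewise limited to 32-bit sign-extended values.  */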
23575 int
23576 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23577 {
23578   int len = 0;
23579   int i;
23580   extract_insn_cached (insn);
23581   for (i = recog_data.n_operands - 1; i >= 0; --i)
23582     if (CONSTANT_P (recog_data.operand[i]))
23583       {
23584         enum attr_mode mode = get_attr_mode (insn);
23585 
23586 	gcc_assert (!len);
23587 	if (shortform && CONST_INT_P (recog_data.operand[i]))
23588 	  {
23589 	    HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23590 	    switch (mode)
23591 	      {
23592 	      case MODE_QI:
23593 		len = 1;
23594 		continue;
23595 	      case MODE_HI:
23596 		ival = trunc_int_for_mode (ival, HImode);
23597 		break;
23598 	      case MODE_SI:
23599 		ival = trunc_int_for_mode (ival, SImode);
23600 		break;
23601 	      default:
23602 		break;
23603 	      }
23604 	    if (IN_RANGE (ival, -128, 127))
23605 	      {
23606 		len = 1;
23607 		continue;
23608 	      }
23609 	  }
23610 	switch (mode)
23611 	  {
23612 	  case MODE_QI:
23613 	    len = 1;
23614 	    break;
23615 	  case MODE_HI:
23616 	    len = 2;
23617 	    break;
23618 	  case MODE_SI:
23619 	    len = 4;
23620 	    break;
23621 	  /* Immediates for DImode instructions are encoded
23622 	     as 32bit sign extended values.  */
23623 	  case MODE_DI:
23624 	    len = 4;
23625 	    break;
23626 	  default:
23627 	    fatal_insn ("unknown insn mode", insn);
23628 	}
23629       }
23630   return len;
23631 }
23632 
23633 /* Compute default value for "length_address" attribute.  */
23634 int
23635 ix86_attr_length_address_default (rtx insn)
23636 {
23637   int i;
23638 
23639   if (get_attr_type (insn) == TYPE_LEA)
23640     {
23641       rtx set = PATTERN (insn), addr;
23642 
23643       if (GET_CODE (set) == PARALLEL)
23644 	set = XVECEXP (set, 0, 0);
23645 
23646       gcc_assert (GET_CODE (set) == SET);
23647 
23648       addr = SET_SRC (set);
23649 
23650       return memory_address_length (addr, true);
23651     }
23652 
23653   extract_insn_cached (insn);
23654   for (i = recog_data.n_operands - 1; i >= 0; --i)
23655     if (MEM_P (recog_data.operand[i]))
23656       {
23657         constrain_operands_cached (reload_completed);
23658         if (which_alternative != -1)
23659 	  {
23660 	    const char *constraints = recog_data.constraints[i];
23661 	    int alt = which_alternative;
23662 
23663 	    while (*constraints == '=' || *constraints == '+')
23664 	      constraints++;
23665 	    while (alt-- > 0)
23666 	      while (*constraints++ != ',')
23667 		;
23668 	    /* Skip ignored operands.  */
23669 	    if (*constraints == 'X')
23670 	      continue;
23671 	  }
23672 	return memory_address_length (XEXP (recog_data.operand[i], 0), false);
23673       }
23674   return 0;
23675 }
23676 
23677 /* Compute default value for "length_vex" attribute. It includes
23678    2 or 3 byte VEX prefix and 1 opcode byte.  */
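/* For instance (illustrative, not from the original source): in 64-bit code
   "vaddps %xmm1, %xmm2, %xmm3" fits the 2-byte VEX prefix, giving 2 + 1 = 3,
   while an insn that needs VEX.W, or whose memory operand uses %r8-%r15 as
   base or index (REX.X/REX.B), needs the 3-byte prefix, giving 3 + 1 = 4.  */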
23679 
23680 int
23681 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23682 {
23683   int i;
23684 
23685   /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
23686      requires the 3-byte VEX prefix.  */
23687   if (!has_0f_opcode || has_vex_w)
23688     return 3 + 1;
23689 
23690   /* We can always use the 2-byte VEX prefix in 32-bit mode.  */
23691   if (!TARGET_64BIT)
23692     return 2 + 1;
23693 
23694   extract_insn_cached (insn);
23695 
23696   for (i = recog_data.n_operands - 1; i >= 0; --i)
23697     if (REG_P (recog_data.operand[i]))
23698       {
23699 	/* REX.W bit uses 3 byte VEX prefix.  */
23700 	if (GET_MODE (recog_data.operand[i]) == DImode
23701 	    && GENERAL_REG_P (recog_data.operand[i]))
23702 	  return 3 + 1;
23703       }
23704     else
23705       {
23706 	/* REX.X or REX.B bits use 3 byte VEX prefix.  */
23707 	if (MEM_P (recog_data.operand[i])
23708 	    && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23709 	  return 3 + 1;
23710       }
23711 
23712   return 2 + 1;
23713 }
23714 
23715 /* Return the maximum number of instructions a cpu can issue.  */
23716 
23717 static int
23718 ix86_issue_rate (void)
23719 {
23720   switch (ix86_tune)
23721     {
23722     case PROCESSOR_PENTIUM:
23723     case PROCESSOR_ATOM:
23724     case PROCESSOR_K6:
23725       return 2;
23726 
23727     case PROCESSOR_PENTIUMPRO:
23728     case PROCESSOR_PENTIUM4:
23729     case PROCESSOR_CORE2_32:
23730     case PROCESSOR_CORE2_64:
23731     case PROCESSOR_COREI7_32:
23732     case PROCESSOR_COREI7_64:
23733     case PROCESSOR_ATHLON:
23734     case PROCESSOR_K8:
23735     case PROCESSOR_AMDFAM10:
23736     case PROCESSOR_NOCONA:
23737     case PROCESSOR_GENERIC32:
23738     case PROCESSOR_GENERIC64:
23739     case PROCESSOR_BDVER1:
23740     case PROCESSOR_BDVER2:
23741     case PROCESSOR_BTVER1:
23742       return 3;
23743 
23744     default:
23745       return 1;
23746     }
23747 }
23748 
23749 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
23750    set by DEP_INSN and nothing else that DEP_INSN sets.  */
23751 
23752 static bool
23753 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23754 {
23755   rtx set, set2;
23756 
23757   /* Simplify the test for uninteresting insns.  */
23758   if (insn_type != TYPE_SETCC
23759       && insn_type != TYPE_ICMOV
23760       && insn_type != TYPE_FCMOV
23761       && insn_type != TYPE_IBR)
23762     return false;
23763 
23764   if ((set = single_set (dep_insn)) != 0)
23765     {
23766       set = SET_DEST (set);
23767       set2 = NULL_RTX;
23768     }
23769   else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23770 	   && XVECLEN (PATTERN (dep_insn), 0) == 2
23771 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23772 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23773     {
23774       set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23775       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23776     }
23777   else
23778     return false;
23779 
23780   if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23781     return false;
23782 
23783   /* This test is true if the dependent insn reads the flags but
23784      not any other potentially set register.  */
23785   if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23786     return false;
23787 
23788   if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23789     return false;
23790 
23791   return true;
23792 }
23793 
23794 /* Return true iff USE_INSN has a memory address with operands set by
23795    SET_INSN.  */
23796 
23797 bool
23798 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23799 {
23800   int i;
23801   extract_insn_cached (use_insn);
23802   for (i = recog_data.n_operands - 1; i >= 0; --i)
23803     if (MEM_P (recog_data.operand[i]))
23804       {
23805 	rtx addr = XEXP (recog_data.operand[i], 0);
23806 	return modified_in_p (addr, set_insn) != 0;
23807       }
23808   return false;
23809 }
23810 
23811 static int
23812 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23813 {
23814   enum attr_type insn_type, dep_insn_type;
23815   enum attr_memory memory;
23816   rtx set, set2;
23817   int dep_insn_code_number;
23818 
23819   /* Anti and output dependencies have zero cost on all CPUs.  */
23820   if (REG_NOTE_KIND (link) != 0)
23821     return 0;
23822 
23823   dep_insn_code_number = recog_memoized (dep_insn);
23824 
23825   /* If we can't recognize the insns, we can't really do anything.  */
23826   if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23827     return cost;
23828 
23829   insn_type = get_attr_type (insn);
23830   dep_insn_type = get_attr_type (dep_insn);
23831 
23832   switch (ix86_tune)
23833     {
23834     case PROCESSOR_PENTIUM:
23835       /* Address Generation Interlock adds a cycle of latency.  */
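      /* Illustrative example (added for clarity): on Pentium, a load such as
	 "mov (%ebx), %eax" issued right after an insn that writes %ebx pays
	 an extra cycle while the address is formed; the cost bumps below
	 model that stall.  */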
23836       if (insn_type == TYPE_LEA)
23837 	{
23838 	  rtx addr = PATTERN (insn);
23839 
23840 	  if (GET_CODE (addr) == PARALLEL)
23841 	    addr = XVECEXP (addr, 0, 0);
23842 
23843 	  gcc_assert (GET_CODE (addr) == SET);
23844 
23845 	  addr = SET_SRC (addr);
23846 	  if (modified_in_p (addr, dep_insn))
23847 	    cost += 1;
23848 	}
23849       else if (ix86_agi_dependent (dep_insn, insn))
23850 	cost += 1;
23851 
23852       /* ??? Compares pair with jump/setcc.  */
23853       if (ix86_flags_dependent (insn, dep_insn, insn_type))
23854 	cost = 0;
23855 
23856       /* Floating point stores require value to be ready one cycle earlier.  */
23857       if (insn_type == TYPE_FMOV
23858 	  && get_attr_memory (insn) == MEMORY_STORE
23859 	  && !ix86_agi_dependent (dep_insn, insn))
23860 	cost += 1;
23861       break;
23862 
23863     case PROCESSOR_PENTIUMPRO:
23864       memory = get_attr_memory (insn);
23865 
23866       /* INT->FP conversion is expensive.  */
23867       if (get_attr_fp_int_src (dep_insn))
23868 	cost += 5;
23869 
23870       /* There is one cycle extra latency between an FP op and a store.  */
23871       if (insn_type == TYPE_FMOV
23872 	  && (set = single_set (dep_insn)) != NULL_RTX
23873 	  && (set2 = single_set (insn)) != NULL_RTX
23874 	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23875 	  && MEM_P (SET_DEST (set2)))
23876 	cost += 1;
23877 
23878       /* Show ability of reorder buffer to hide latency of load by executing
23879 	 in parallel with previous instruction in case
23880 	 previous instruction is not needed to compute the address.  */
23881       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23882 	  && !ix86_agi_dependent (dep_insn, insn))
23883 	{
23884 	  /* Claim moves to take one cycle, as the core can issue one load
23885 	     at a time and the next load can start a cycle later.  */
23886 	  if (dep_insn_type == TYPE_IMOV
23887 	      || dep_insn_type == TYPE_FMOV)
23888 	    cost = 1;
23889 	  else if (cost > 1)
23890 	    cost--;
23891 	}
23892       break;
23893 
23894     case PROCESSOR_K6:
23895       memory = get_attr_memory (insn);
23896 
23897       /* The esp dependency is resolved before the instruction is really
23898          finished.  */
23899       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23900 	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23901 	return 1;
23902 
23903       /* INT->FP conversion is expensive.  */
23904       if (get_attr_fp_int_src (dep_insn))
23905 	cost += 5;
23906 
23907       /* Show ability of reorder buffer to hide latency of load by executing
23908 	 in parallel with previous instruction in case
23909 	 previous instruction is not needed to compute the address.  */
23910       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23911 	  && !ix86_agi_dependent (dep_insn, insn))
23912 	{
23913 	  /* Claim moves to take one cycle, as the core can issue one load
23914 	     at a time and the next load can start a cycle later.  */
23915 	  if (dep_insn_type == TYPE_IMOV
23916 	      || dep_insn_type == TYPE_FMOV)
23917 	    cost = 1;
23918 	  else if (cost > 2)
23919 	    cost -= 2;
23920 	  else
23921 	    cost = 1;
23922 	}
23923       break;
23924 
23925     case PROCESSOR_ATHLON:
23926     case PROCESSOR_K8:
23927     case PROCESSOR_AMDFAM10:
23928     case PROCESSOR_BDVER1:
23929     case PROCESSOR_BDVER2:
23930     case PROCESSOR_BTVER1:
23931     case PROCESSOR_ATOM:
23932     case PROCESSOR_GENERIC32:
23933     case PROCESSOR_GENERIC64:
23934       memory = get_attr_memory (insn);
23935 
23936       /* Show ability of reorder buffer to hide latency of load by executing
23937 	 in parallel with previous instruction in case
23938 	 previous instruction is not needed to compute the address.  */
23939       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23940 	  && !ix86_agi_dependent (dep_insn, insn))
23941 	{
23942 	  enum attr_unit unit = get_attr_unit (insn);
23943 	  int loadcost = 3;
23944 
23945 	  /* Because of the difference between the length of integer and
23946 	     floating unit pipeline preparation stages, the memory operands
23947 	     for floating point are cheaper.
23948 
23949 	     ??? For Athlon the difference is most probably 2.  */
23950 	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23951 	    loadcost = 3;
23952 	  else
23953 	    loadcost = TARGET_ATHLON ? 2 : 0;
23954 
23955 	  if (cost >= loadcost)
23956 	    cost -= loadcost;
23957 	  else
23958 	    cost = 0;
23959 	}
23960 
23961     default:
23962       break;
23963     }
23964 
23965   return cost;
23966 }
23967 
23968 /* How many alternative schedules to try.  This should be as wide as the
23969    scheduling freedom in the DFA, but no wider.  Making this value too
23970    large results in extra work for the scheduler.  */
23971 
23972 static int
23973 ia32_multipass_dfa_lookahead (void)
23974 {
23975   switch (ix86_tune)
23976     {
23977     case PROCESSOR_PENTIUM:
23978       return 2;
23979 
23980     case PROCESSOR_PENTIUMPRO:
23981     case PROCESSOR_K6:
23982       return 1;
23983 
23984     case PROCESSOR_CORE2_32:
23985     case PROCESSOR_CORE2_64:
23986     case PROCESSOR_COREI7_32:
23987     case PROCESSOR_COREI7_64:
23988     case PROCESSOR_ATOM:
23989       /* Generally, we want haifa-sched:max_issue() to look ahead as far
23990 	 as the number of instructions that can be executed in one cycle,
23991 	 i.e., issue_rate.  I wonder why tuning for many CPUs does not do this.  */
23992       return ix86_issue_rate ();
23993 
23994     default:
23995       return 0;
23996     }
23997 }
23998 
23999 
24000 
24001 /* Model decoder of Core 2/i7.
24002    Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24003    track the instruction fetch block boundaries and make sure that long
24004    (9+ bytes) instructions are assigned to D0.  */
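/* For example (illustrative): with the Core 2/i7 parameters set up below, a
   10-byte instruction can only be taken by the first decoder, and once the
   instructions picked for a cycle reach 16 bytes or six instructions, the
   remaining candidates are masked out of ready_try for that cycle.  */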
24005 
24006 /* Maximum length of an insn that can be handled by
24007    a secondary decoder unit.  '8' for Core 2/i7.  */
24008 static int core2i7_secondary_decoder_max_insn_size;
24009 
24010 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24011    '16' for Core 2/i7.  */
24012 static int core2i7_ifetch_block_size;
24013 
24014 /* Maximum number of instructions decoder can handle per cycle.
24015    '6' for Core 2/i7.  */
24016 static int core2i7_ifetch_block_max_insns;
24017 
24018 typedef struct ix86_first_cycle_multipass_data_ *
24019   ix86_first_cycle_multipass_data_t;
24020 typedef const struct ix86_first_cycle_multipass_data_ *
24021   const_ix86_first_cycle_multipass_data_t;
24022 
24023 /* A variable to store target state across calls to max_issue within
24024    one cycle.  */
24025 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24026   *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24027 
24028 /* Initialize DATA.  */
24029 static void
24030 core2i7_first_cycle_multipass_init (void *_data)
24031 {
24032   ix86_first_cycle_multipass_data_t data
24033     = (ix86_first_cycle_multipass_data_t) _data;
24034 
24035   data->ifetch_block_len = 0;
24036   data->ifetch_block_n_insns = 0;
24037   data->ready_try_change = NULL;
24038   data->ready_try_change_size = 0;
24039 }
24040 
24041 /* Advancing the cycle; reset ifetch block counts.  */
24042 static void
24043 core2i7_dfa_post_advance_cycle (void)
24044 {
24045   ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24046 
24047   gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24048 
24049   data->ifetch_block_len = 0;
24050   data->ifetch_block_n_insns = 0;
24051 }
24052 
24053 static int min_insn_size (rtx);
24054 
24055 /* Filter out insns from ready_try that the core will not be able to issue
24056    on current cycle due to decoder.  */
24057 static void
24058 core2i7_first_cycle_multipass_filter_ready_try
24059 (const_ix86_first_cycle_multipass_data_t data,
24060  char *ready_try, int n_ready, bool first_cycle_insn_p)
24061 {
24062   while (n_ready--)
24063     {
24064       rtx insn;
24065       int insn_size;
24066 
24067       if (ready_try[n_ready])
24068 	continue;
24069 
24070       insn = get_ready_element (n_ready);
24071       insn_size = min_insn_size (insn);
24072 
24073       if (/* If this is too long an insn for a secondary decoder ...  */
24074 	  (!first_cycle_insn_p
24075 	   && insn_size > core2i7_secondary_decoder_max_insn_size)
24076 	  /* ... or it would not fit into the ifetch block ...  */
24077 	  || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24078 	  /* ... or the decoder is full already ...  */
24079 	  || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24080 	/* ... mask the insn out.  */
24081 	{
24082 	  ready_try[n_ready] = 1;
24083 
24084 	  if (data->ready_try_change)
24085 	    SET_BIT (data->ready_try_change, n_ready);
24086 	}
24087     }
24088 }
24089 
24090 /* Prepare for a new round of multipass lookahead scheduling.  */
24091 static void
24092 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24093 				     bool first_cycle_insn_p)
24094 {
24095   ix86_first_cycle_multipass_data_t data
24096     = (ix86_first_cycle_multipass_data_t) _data;
24097   const_ix86_first_cycle_multipass_data_t prev_data
24098     = ix86_first_cycle_multipass_data;
24099 
24100   /* Restore the state from the end of the previous round.  */
24101   data->ifetch_block_len = prev_data->ifetch_block_len;
24102   data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24103 
24104   /* Filter instructions that cannot be issued on current cycle due to
24105      decoder restrictions.  */
24106   core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24107 						  first_cycle_insn_p);
24108 }
24109 
24110 /* INSN is being issued in current solution.  Account for its impact on
24111    the decoder model.  */
24112 static void
24113 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24114 				     rtx insn, const void *_prev_data)
24115 {
24116   ix86_first_cycle_multipass_data_t data
24117     = (ix86_first_cycle_multipass_data_t) _data;
24118   const_ix86_first_cycle_multipass_data_t prev_data
24119     = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24120 
24121   int insn_size = min_insn_size (insn);
24122 
24123   data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24124   data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24125   gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24126 	      && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24127 
24128   /* Allocate or resize the bitmap for storing INSN's effect on ready_try.  */
24129   if (!data->ready_try_change)
24130     {
24131       data->ready_try_change = sbitmap_alloc (n_ready);
24132       data->ready_try_change_size = n_ready;
24133     }
24134   else if (data->ready_try_change_size < n_ready)
24135     {
24136       data->ready_try_change = sbitmap_resize (data->ready_try_change,
24137 					       n_ready, 0);
24138       data->ready_try_change_size = n_ready;
24139     }
24140   sbitmap_zero (data->ready_try_change);
24141 
24142   /* Filter out insns from ready_try that the core will not be able to issue
24143      on current cycle due to decoder.  */
24144   core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24145 						  false);
24146 }
24147 
24148 /* Revert the effect on ready_try.  */
24149 static void
24150 core2i7_first_cycle_multipass_backtrack (const void *_data,
24151 					 char *ready_try,
24152 					 int n_ready ATTRIBUTE_UNUSED)
24153 {
24154   const_ix86_first_cycle_multipass_data_t data
24155     = (const_ix86_first_cycle_multipass_data_t) _data;
24156   unsigned int i = 0;
24157   sbitmap_iterator sbi;
24158 
24159   gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24160   EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24161     {
24162       ready_try[i] = 0;
24163     }
24164 }
24165 
24166 /* Save the result of multipass lookahead scheduling for the next round.  */
24167 static void
24168 core2i7_first_cycle_multipass_end (const void *_data)
24169 {
24170   const_ix86_first_cycle_multipass_data_t data
24171     = (const_ix86_first_cycle_multipass_data_t) _data;
24172   ix86_first_cycle_multipass_data_t next_data
24173     = ix86_first_cycle_multipass_data;
24174 
24175   if (data != NULL)
24176     {
24177       next_data->ifetch_block_len = data->ifetch_block_len;
24178       next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24179     }
24180 }
24181 
24182 /* Deallocate target data.  */
24183 static void
24184 core2i7_first_cycle_multipass_fini (void *_data)
24185 {
24186   ix86_first_cycle_multipass_data_t data
24187     = (ix86_first_cycle_multipass_data_t) _data;
24188 
24189   if (data->ready_try_change)
24190     {
24191       sbitmap_free (data->ready_try_change);
24192       data->ready_try_change = NULL;
24193       data->ready_try_change_size = 0;
24194     }
24195 }
24196 
24197 /* Prepare for scheduling pass.  */
24198 static void
24199 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24200 			int verbose ATTRIBUTE_UNUSED,
24201 			int max_uid ATTRIBUTE_UNUSED)
24202 {
24203   /* Install scheduling hooks for current CPU.  Some of these hooks are used
24204      in time-critical parts of the scheduler, so we only set them up when
24205      they are actually used.  */
24206   switch (ix86_tune)
24207     {
24208     case PROCESSOR_CORE2_32:
24209     case PROCESSOR_CORE2_64:
24210     case PROCESSOR_COREI7_32:
24211     case PROCESSOR_COREI7_64:
24212       targetm.sched.dfa_post_advance_cycle
24213 	= core2i7_dfa_post_advance_cycle;
24214       targetm.sched.first_cycle_multipass_init
24215 	= core2i7_first_cycle_multipass_init;
24216       targetm.sched.first_cycle_multipass_begin
24217 	= core2i7_first_cycle_multipass_begin;
24218       targetm.sched.first_cycle_multipass_issue
24219 	= core2i7_first_cycle_multipass_issue;
24220       targetm.sched.first_cycle_multipass_backtrack
24221 	= core2i7_first_cycle_multipass_backtrack;
24222       targetm.sched.first_cycle_multipass_end
24223 	= core2i7_first_cycle_multipass_end;
24224       targetm.sched.first_cycle_multipass_fini
24225 	= core2i7_first_cycle_multipass_fini;
24226 
24227       /* Set decoder parameters.  */
24228       core2i7_secondary_decoder_max_insn_size = 8;
24229       core2i7_ifetch_block_size = 16;
24230       core2i7_ifetch_block_max_insns = 6;
24231       break;
24232 
24233     default:
24234       targetm.sched.dfa_post_advance_cycle = NULL;
24235       targetm.sched.first_cycle_multipass_init = NULL;
24236       targetm.sched.first_cycle_multipass_begin = NULL;
24237       targetm.sched.first_cycle_multipass_issue = NULL;
24238       targetm.sched.first_cycle_multipass_backtrack = NULL;
24239       targetm.sched.first_cycle_multipass_end = NULL;
24240       targetm.sched.first_cycle_multipass_fini = NULL;
24241       break;
24242     }
24243 }
24244 
24245 
24246 /* Compute the alignment given to a constant that is being placed in memory.
24247    EXP is the constant and ALIGN is the alignment that the object would
24248    ordinarily have.
24249    The value of this function is used instead of that alignment to align
24250    the object.  */
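/* For instance (added for clarity): per the checks below, a DFmode (double)
   constant that would ordinarily get 32-bit alignment is bumped to 64 bits,
   and a 128-bit vector constant to 128 bits.  */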
24251 
24252 int
24253 ix86_constant_alignment (tree exp, int align)
24254 {
24255   if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24256       || TREE_CODE (exp) == INTEGER_CST)
24257     {
24258       if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24259 	return 64;
24260       else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24261 	return 128;
24262     }
24263   else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24264 	   && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24265     return BITS_PER_WORD;
24266 
24267   return align;
24268 }
24269 
24270 /* Compute the alignment for a static variable.
24271    TYPE is the data type, and ALIGN is the alignment that
24272    the object would ordinarily have.  The value of this function is used
24273    instead of that alignment to align the object.  */
24274 
24275 int
24276 ix86_data_alignment (tree type, int align)
24277 {
24278   int max_align
24279     = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24280 
24281   if (AGGREGATE_TYPE_P (type)
24282       && TYPE_SIZE (type)
24283       && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24284       && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24285 	  || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24286       && align < max_align)
24287     align = max_align;
24288 
24289   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24290      to a 16-byte boundary.  */
24291   if (TARGET_64BIT)
24292     {
24293       if (AGGREGATE_TYPE_P (type)
24294 	   && TYPE_SIZE (type)
24295 	   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24296 	   && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24297 	       || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24298 	return 128;
24299     }
24300 
24301   if (TREE_CODE (type) == ARRAY_TYPE)
24302     {
24303       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24304 	return 64;
24305       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24306 	return 128;
24307     }
24308   else if (TREE_CODE (type) == COMPLEX_TYPE)
24309     {
24310 
24311       if (TYPE_MODE (type) == DCmode && align < 64)
24312 	return 64;
24313       if ((TYPE_MODE (type) == XCmode
24314 	   || TYPE_MODE (type) == TCmode) && align < 128)
24315 	return 128;
24316     }
24317   else if ((TREE_CODE (type) == RECORD_TYPE
24318 	    || TREE_CODE (type) == UNION_TYPE
24319 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
24320 	   && TYPE_FIELDS (type))
24321     {
24322       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24323 	return 64;
24324       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24325 	return 128;
24326     }
24327   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24328 	   || TREE_CODE (type) == INTEGER_TYPE)
24329     {
24330       if (TYPE_MODE (type) == DFmode && align < 64)
24331 	return 64;
24332       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24333 	return 128;
24334     }
24335 
24336   return align;
24337 }
24338 
24339 /* Compute the alignment for a local variable or a stack slot.  EXP is
24340    the data type or decl itself, MODE is the widest mode available and
24341    ALIGN is the alignment that the object would ordinarily have.  The
24342    value of this macro is used instead of that alignment to align the
24343    object.  */
24344 
24345 unsigned int
24346 ix86_local_alignment (tree exp, enum machine_mode mode,
24347 		      unsigned int align)
24348 {
24349   tree type, decl;
24350 
24351   if (exp && DECL_P (exp))
24352     {
24353       type = TREE_TYPE (exp);
24354       decl = exp;
24355     }
24356   else
24357     {
24358       type = exp;
24359       decl = NULL;
24360     }
24361 
24362   /* Don't do dynamic stack realignment for long long objects with
24363      -mpreferred-stack-boundary=2.  */
24364   if (!TARGET_64BIT
24365       && align == 64
24366       && ix86_preferred_stack_boundary < 64
24367       && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24368       && (!type || !TYPE_USER_ALIGN (type))
24369       && (!decl || !DECL_USER_ALIGN (decl)))
24370     align = 32;
24371 
24372   /* If TYPE is NULL, we are allocating a stack slot for a caller-save
24373      register in MODE.  We will return the larger of the XFmode and
24374      DFmode alignments.  */
24375   if (!type)
24376     {
24377       if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24378 	align = GET_MODE_ALIGNMENT (DFmode);
24379       return align;
24380     }
24381 
24382   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24383      to a 16-byte boundary.  The exact wording is:
24384 
24385      An array uses the same alignment as its elements, except that a local or
24386      global array variable of length at least 16 bytes or
24387      a C99 variable-length array variable always has alignment of at least 16 bytes.
24388 
24389      This was added to allow the use of aligned SSE instructions on arrays.
24390      The rule is meant for static storage (where the compiler cannot do the
24391      analysis by itself).  We follow it for automatic variables only when
24392      convenient.  We fully control everything in the function being compiled,
24393      and functions from other units cannot rely on the alignment.
24394 
24395      Exclude the va_list type.  It is the common case of a local array where
24396      we cannot benefit from the alignment.  */
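  /* Example (illustrative): a local "double buf[4]" is 32 bytes, so on
     x86-64 with SSE enabled and when optimizing for speed it is given
     128-bit alignment here, allowing aligned 16-byte vector accesses.  */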
24397   if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24398       && TARGET_SSE)
24399     {
24400       if (AGGREGATE_TYPE_P (type)
24401 	   && (va_list_type_node == NULL_TREE
24402 	       || (TYPE_MAIN_VARIANT (type)
24403 		   != TYPE_MAIN_VARIANT (va_list_type_node)))
24404 	   && TYPE_SIZE (type)
24405 	   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24406 	   && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24407 	       || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24408 	return 128;
24409     }
24410   if (TREE_CODE (type) == ARRAY_TYPE)
24411     {
24412       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24413 	return 64;
24414       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24415 	return 128;
24416     }
24417   else if (TREE_CODE (type) == COMPLEX_TYPE)
24418     {
24419       if (TYPE_MODE (type) == DCmode && align < 64)
24420 	return 64;
24421       if ((TYPE_MODE (type) == XCmode
24422 	   || TYPE_MODE (type) == TCmode) && align < 128)
24423 	return 128;
24424     }
24425   else if ((TREE_CODE (type) == RECORD_TYPE
24426 	    || TREE_CODE (type) == UNION_TYPE
24427 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
24428 	   && TYPE_FIELDS (type))
24429     {
24430       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24431 	return 64;
24432       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24433 	return 128;
24434     }
24435   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24436 	   || TREE_CODE (type) == INTEGER_TYPE)
24437     {
24438 
24439       if (TYPE_MODE (type) == DFmode && align < 64)
24440 	return 64;
24441       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24442 	return 128;
24443     }
24444   return align;
24445 }
24446 
24447 /* Compute the minimum required alignment for dynamic stack realignment
24448    purposes for a local variable, parameter or a stack slot.  EXP is
24449    the data type or decl itself, MODE is its mode and ALIGN is the
24450    alignment that the object would ordinarily have.  */
24451 
24452 unsigned int
24453 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24454 			unsigned int align)
24455 {
24456   tree type, decl;
24457 
24458   if (exp && DECL_P (exp))
24459     {
24460       type = TREE_TYPE (exp);
24461       decl = exp;
24462     }
24463   else
24464     {
24465       type = exp;
24466       decl = NULL;
24467     }
24468 
24469   if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24470     return align;
24471 
24472   /* Don't do dynamic stack realignment for long long objects with
24473      -mpreferred-stack-boundary=2.  */
24474   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24475       && (!type || !TYPE_USER_ALIGN (type))
24476       && (!decl || !DECL_USER_ALIGN (decl)))
24477     return 32;
24478 
24479   return align;
24480 }
24481 
24482 /* Find a location for the static chain incoming to a nested function.
24483    This is a register, unless all free registers are used by arguments.  */
24484 
24485 static rtx
24486 ix86_static_chain (const_tree fndecl, bool incoming_p)
24487 {
24488   unsigned regno;
24489 
24490   if (!DECL_STATIC_CHAIN (fndecl))
24491     return NULL;
24492 
24493   if (TARGET_64BIT)
24494     {
24495       /* We always use R10 in 64-bit mode.  */
24496       regno = R10_REG;
24497     }
24498   else
24499     {
24500       tree fntype;
24501       unsigned int ccvt;
24502 
24503       /* By default in 32-bit mode we use ECX to pass the static chain.  */
24504       regno = CX_REG;
24505 
24506       fntype = TREE_TYPE (fndecl);
24507       ccvt = ix86_get_callcvt (fntype);
24508       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
24509 	{
24510 	  /* Fastcall functions use ecx/edx for arguments, which leaves
24511 	     us with EAX for the static chain.
24512 	     Thiscall functions use ecx for arguments, which also
24513 	     leaves us with EAX for the static chain.  */
24514 	  regno = AX_REG;
24515 	}
24516       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
24517 	{
24518 	  /* Thiscall functions use ecx for arguments, which leaves
24519 	     us with EAX and EDX for the static chain.
24520 	     We use EAX for ABI compatibility.  */
24521 	  regno = AX_REG;
24522 	}
24523       else if (ix86_function_regparm (fntype, fndecl) == 3)
24524 	{
24525 	  /* For regparm 3, we have no free call-clobbered registers in
24526 	     which to store the static chain.  In order to implement this,
24527 	     we have the trampoline push the static chain to the stack.
24528 	     However, we can't push a value below the return address when
24529 	     we call the nested function directly, so we have to use an
24530 	     alternate entry point.  For this we use ESI, and have the
24531 	     alternate entry point push ESI, so that things appear the
24532 	     same once we're executing the nested function.  */
24533 	  if (incoming_p)
24534 	    {
24535 	      if (fndecl == current_function_decl)
24536 		ix86_static_chain_on_stack = true;
24537 	      return gen_frame_mem (SImode,
24538 				    plus_constant (arg_pointer_rtx, -8));
24539 	    }
24540 	  regno = SI_REG;
24541 	}
24542     }
24543 
24544   return gen_rtx_REG (Pmode, regno);
24545 }
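
/* Illustrative sketch, not part of the original source: for a nested
   function taking three register arguments (e.g. regparm(3) with
   "int inner (int a, int b, int c)"), EAX, EDX and ECX are all consumed
   by A, B and C, so the static chain cannot live in a call-clobbered
   register.  The trampoline pushes it on the stack instead, while direct
   calls from the enclosing function pass it in ESI and enter through the
   alternate entry point that pushes ESI, as described above.  */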
24546 
24547 /* Emit RTL insns to initialize the variable parts of a trampoline.
24548    FNDECL is the decl of the target address; M_TRAMP is a MEM for
24549    the trampoline, and CHAIN_VALUE is an RTX for the static chain
24550    to be passed to the target function.  */
24551 
24552 static void
24553 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24554 {
24555   rtx mem, fnaddr;
24556   int opcode;
24557   int offset = 0;
24558 
24559   fnaddr = XEXP (DECL_RTL (fndecl), 0);
24560 
24561   if (TARGET_64BIT)
24562     {
24563       int size;
24564 
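      /* A sketch, for orientation only, of what the bytes emitted below
	 look like in memory order (whether the short movl or the movabs
	 form is chosen depends on the checks below):

	    41 bb <imm32> / 49 bb <imm64>    mov  $fnaddr, %r11
	    41 ba <imm32> / 49 ba <imm64>    mov  $chain,  %r10
	    49 ff e3                         jmp  *%r11
	    90                               nop (pad to 4 bytes)  */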
24565       /* Load the function address into r11.  Try to load the address
24566 	 using the shorter movl instead of movabs.  We may want to support
24567 	 movq for kernel mode, but the kernel does not use trampolines at
24568 	 the moment.  */
24569       if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24570 	{
24571 	  fnaddr = copy_to_mode_reg (DImode, fnaddr);
24572 
24573 	  mem = adjust_address (m_tramp, HImode, offset);
24574 	  emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24575 
24576 	  mem = adjust_address (m_tramp, SImode, offset + 2);
24577 	  emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24578 	  offset += 6;
24579 	}
24580       else
24581 	{
24582 	  mem = adjust_address (m_tramp, HImode, offset);
24583 	  emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24584 
24585 	  mem = adjust_address (m_tramp, DImode, offset + 2);
24586 	  emit_move_insn (mem, fnaddr);
24587 	  offset += 10;
24588 	}
24589 
24590       /* Load the static chain into r10 using movabs.  Use the
24591 	 shorter movl instead of movabs for x32.  */
24592       if (TARGET_X32)
24593 	{
24594 	  opcode = 0xba41;
24595 	  size = 6;
24596 	}
24597       else
24598 	{
24599 	  opcode = 0xba49;
24600 	  size = 10;
24601 	}
24602 
24603       mem = adjust_address (m_tramp, HImode, offset);
24604       emit_move_insn (mem, gen_int_mode (opcode, HImode));
24605 
24606       mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24607       emit_move_insn (mem, chain_value);
24608       offset += size;
24609 
24610       /* Jump to r11; the last (unused) byte is a nop, only there to
24611 	 pad the write out to a single 32-bit store.  */
24612       mem = adjust_address (m_tramp, SImode, offset);
24613       emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24614       offset += 4;
24615     }
24616   else
24617     {
24618       rtx disp, chain;
24619 
24620       /* Depending on the static chain location, either load a register
24621 	 with a constant, or push the constant to the stack.  All of the
24622 	 instructions are the same size.  */
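      /* A sketch, for orientation only, of the ten bytes emitted below,
	 in memory order:

	    b8 <imm32>   mov  $chain, %eax
	      (or b9 ... for %ecx, or 68 <imm32> push $chain when the
	       chain goes on the stack)
	    e9 <rel32>   jmp  <fnaddr>  */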
24623       chain = ix86_static_chain (fndecl, true);
24624       if (REG_P (chain))
24625 	{
24626 	  switch (REGNO (chain))
24627 	    {
24628 	    case AX_REG:
24629 	      opcode = 0xb8; break;
24630 	    case CX_REG:
24631 	      opcode = 0xb9; break;
24632 	    default:
24633 	      gcc_unreachable ();
24634 	    }
24635 	}
24636       else
24637 	opcode = 0x68;
24638 
24639       mem = adjust_address (m_tramp, QImode, offset);
24640       emit_move_insn (mem, gen_int_mode (opcode, QImode));
24641 
24642       mem = adjust_address (m_tramp, SImode, offset + 1);
24643       emit_move_insn (mem, chain_value);
24644       offset += 5;
24645 
24646       mem = adjust_address (m_tramp, QImode, offset);
24647       emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24648 
24649       mem = adjust_address (m_tramp, SImode, offset + 1);
24650 
24651       /* Compute offset from the end of the jmp to the target function.
24652 	 In the case in which the trampoline stores the static chain on
24653 	 the stack, we need to skip the first insn which pushes the
24654 	 (call-saved) register static chain; this push is 1 byte.  */
24655       offset += 5;
24656       disp = expand_binop (SImode, sub_optab, fnaddr,
24657 			   plus_constant (XEXP (m_tramp, 0),
24658 					  offset - (MEM_P (chain) ? 1 : 0)),
24659 			   NULL_RTX, 1, OPTAB_DIRECT);
24660       emit_move_insn (mem, disp);
24661     }
24662 
24663   gcc_assert (offset <= TRAMPOLINE_SIZE);
24664 
24665 #ifdef HAVE_ENABLE_EXECUTE_STACK
24666 #ifdef CHECK_EXECUTE_STACK_ENABLED
24667   if (CHECK_EXECUTE_STACK_ENABLED)
24668 #endif
24669   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24670 		     LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24671 #endif
24672 }
24673 
24674 /* The following file contains several enumerations and data structures
24675    built from the definitions in i386-builtin-types.def.  */
24676 
24677 #include "i386-builtin-types.inc"
24678 
24679 /* Table for the ix86 builtin non-function types.  */
24680 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24681 
24682 /* Retrieve an element from the above table, building some of
24683    the types lazily.  */
24684 
24685 static tree
24686 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24687 {
24688   unsigned int index;
24689   tree type, itype;
24690 
24691   gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24692 
24693   type = ix86_builtin_type_tab[(int) tcode];
24694   if (type != NULL)
24695     return type;
24696 
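  /* Type codes are laid out as primitive types, then vector types, then
     pointer and const-pointer types, so the index arithmetic below maps
     TCODE onto the vect_base/vect_mode and ptr_base tables generated
     from i386-builtin-types.def.  */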
24697   gcc_assert (tcode > IX86_BT_LAST_PRIM);
24698   if (tcode <= IX86_BT_LAST_VECT)
24699     {
24700       enum machine_mode mode;
24701 
24702       index = tcode - IX86_BT_LAST_PRIM - 1;
24703       itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24704       mode = ix86_builtin_type_vect_mode[index];
24705 
24706       type = build_vector_type_for_mode (itype, mode);
24707     }
24708   else
24709     {
24710       int quals;
24711 
24712       index = tcode - IX86_BT_LAST_VECT - 1;
24713       if (tcode <= IX86_BT_LAST_PTR)
24714 	quals = TYPE_UNQUALIFIED;
24715       else
24716 	quals = TYPE_QUAL_CONST;
24717 
24718       itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24719       if (quals != TYPE_UNQUALIFIED)
24720 	itype = build_qualified_type (itype, quals);
24721 
24722       type = build_pointer_type (itype);
24723     }
24724 
24725   ix86_builtin_type_tab[(int) tcode] = type;
24726   return type;
24727 }
24728 
24729 /* Table for the ix86 builtin function types.  */
24730 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24731 
24732 /* Retrieve an element from the above table, building some of
24733    the types lazily.  */
24734 
24735 static tree
24736 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24737 {
24738   tree type;
24739 
24740   gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24741 
24742   type = ix86_builtin_func_type_tab[(int) tcode];
24743   if (type != NULL)
24744     return type;
24745 
24746   if (tcode <= IX86_BT_LAST_FUNC)
24747     {
24748       unsigned start = ix86_builtin_func_start[(int) tcode];
24749       unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24750       tree rtype, atype, args = void_list_node;
24751       unsigned i;
24752 
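      /* Note: ix86_builtin_func_args[START] holds the return type; the
	 entries up to AFTER are the argument types.  Walking them in
	 reverse while consing keeps the resulting TREE_LIST in source
	 order.  */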
24753       rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24754       for (i = after - 1; i > start; --i)
24755 	{
24756 	  atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24757 	  args = tree_cons (NULL, atype, args);
24758 	}
24759 
24760       type = build_function_type (rtype, args);
24761     }
24762   else
24763     {
24764       unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24765       enum ix86_builtin_func_type icode;
24766 
24767       icode = ix86_builtin_func_alias_base[index];
24768       type = ix86_get_builtin_func_type (icode);
24769     }
24770 
24771   ix86_builtin_func_type_tab[(int) tcode] = type;
24772   return type;
24773 }
24774 
24775 
24776 /* Codes for all the SSE/MMX builtins.  */
24777 enum ix86_builtins
24778 {
24779   IX86_BUILTIN_ADDPS,
24780   IX86_BUILTIN_ADDSS,
24781   IX86_BUILTIN_DIVPS,
24782   IX86_BUILTIN_DIVSS,
24783   IX86_BUILTIN_MULPS,
24784   IX86_BUILTIN_MULSS,
24785   IX86_BUILTIN_SUBPS,
24786   IX86_BUILTIN_SUBSS,
24787 
24788   IX86_BUILTIN_CMPEQPS,
24789   IX86_BUILTIN_CMPLTPS,
24790   IX86_BUILTIN_CMPLEPS,
24791   IX86_BUILTIN_CMPGTPS,
24792   IX86_BUILTIN_CMPGEPS,
24793   IX86_BUILTIN_CMPNEQPS,
24794   IX86_BUILTIN_CMPNLTPS,
24795   IX86_BUILTIN_CMPNLEPS,
24796   IX86_BUILTIN_CMPNGTPS,
24797   IX86_BUILTIN_CMPNGEPS,
24798   IX86_BUILTIN_CMPORDPS,
24799   IX86_BUILTIN_CMPUNORDPS,
24800   IX86_BUILTIN_CMPEQSS,
24801   IX86_BUILTIN_CMPLTSS,
24802   IX86_BUILTIN_CMPLESS,
24803   IX86_BUILTIN_CMPNEQSS,
24804   IX86_BUILTIN_CMPNLTSS,
24805   IX86_BUILTIN_CMPNLESS,
24806   IX86_BUILTIN_CMPNGTSS,
24807   IX86_BUILTIN_CMPNGESS,
24808   IX86_BUILTIN_CMPORDSS,
24809   IX86_BUILTIN_CMPUNORDSS,
24810 
24811   IX86_BUILTIN_COMIEQSS,
24812   IX86_BUILTIN_COMILTSS,
24813   IX86_BUILTIN_COMILESS,
24814   IX86_BUILTIN_COMIGTSS,
24815   IX86_BUILTIN_COMIGESS,
24816   IX86_BUILTIN_COMINEQSS,
24817   IX86_BUILTIN_UCOMIEQSS,
24818   IX86_BUILTIN_UCOMILTSS,
24819   IX86_BUILTIN_UCOMILESS,
24820   IX86_BUILTIN_UCOMIGTSS,
24821   IX86_BUILTIN_UCOMIGESS,
24822   IX86_BUILTIN_UCOMINEQSS,
24823 
24824   IX86_BUILTIN_CVTPI2PS,
24825   IX86_BUILTIN_CVTPS2PI,
24826   IX86_BUILTIN_CVTSI2SS,
24827   IX86_BUILTIN_CVTSI642SS,
24828   IX86_BUILTIN_CVTSS2SI,
24829   IX86_BUILTIN_CVTSS2SI64,
24830   IX86_BUILTIN_CVTTPS2PI,
24831   IX86_BUILTIN_CVTTSS2SI,
24832   IX86_BUILTIN_CVTTSS2SI64,
24833 
24834   IX86_BUILTIN_MAXPS,
24835   IX86_BUILTIN_MAXSS,
24836   IX86_BUILTIN_MINPS,
24837   IX86_BUILTIN_MINSS,
24838 
24839   IX86_BUILTIN_LOADUPS,
24840   IX86_BUILTIN_STOREUPS,
24841   IX86_BUILTIN_MOVSS,
24842 
24843   IX86_BUILTIN_MOVHLPS,
24844   IX86_BUILTIN_MOVLHPS,
24845   IX86_BUILTIN_LOADHPS,
24846   IX86_BUILTIN_LOADLPS,
24847   IX86_BUILTIN_STOREHPS,
24848   IX86_BUILTIN_STORELPS,
24849 
24850   IX86_BUILTIN_MASKMOVQ,
24851   IX86_BUILTIN_MOVMSKPS,
24852   IX86_BUILTIN_PMOVMSKB,
24853 
24854   IX86_BUILTIN_MOVNTPS,
24855   IX86_BUILTIN_MOVNTQ,
24856 
24857   IX86_BUILTIN_LOADDQU,
24858   IX86_BUILTIN_STOREDQU,
24859 
24860   IX86_BUILTIN_PACKSSWB,
24861   IX86_BUILTIN_PACKSSDW,
24862   IX86_BUILTIN_PACKUSWB,
24863 
24864   IX86_BUILTIN_PADDB,
24865   IX86_BUILTIN_PADDW,
24866   IX86_BUILTIN_PADDD,
24867   IX86_BUILTIN_PADDQ,
24868   IX86_BUILTIN_PADDSB,
24869   IX86_BUILTIN_PADDSW,
24870   IX86_BUILTIN_PADDUSB,
24871   IX86_BUILTIN_PADDUSW,
24872   IX86_BUILTIN_PSUBB,
24873   IX86_BUILTIN_PSUBW,
24874   IX86_BUILTIN_PSUBD,
24875   IX86_BUILTIN_PSUBQ,
24876   IX86_BUILTIN_PSUBSB,
24877   IX86_BUILTIN_PSUBSW,
24878   IX86_BUILTIN_PSUBUSB,
24879   IX86_BUILTIN_PSUBUSW,
24880 
24881   IX86_BUILTIN_PAND,
24882   IX86_BUILTIN_PANDN,
24883   IX86_BUILTIN_POR,
24884   IX86_BUILTIN_PXOR,
24885 
24886   IX86_BUILTIN_PAVGB,
24887   IX86_BUILTIN_PAVGW,
24888 
24889   IX86_BUILTIN_PCMPEQB,
24890   IX86_BUILTIN_PCMPEQW,
24891   IX86_BUILTIN_PCMPEQD,
24892   IX86_BUILTIN_PCMPGTB,
24893   IX86_BUILTIN_PCMPGTW,
24894   IX86_BUILTIN_PCMPGTD,
24895 
24896   IX86_BUILTIN_PMADDWD,
24897 
24898   IX86_BUILTIN_PMAXSW,
24899   IX86_BUILTIN_PMAXUB,
24900   IX86_BUILTIN_PMINSW,
24901   IX86_BUILTIN_PMINUB,
24902 
24903   IX86_BUILTIN_PMULHUW,
24904   IX86_BUILTIN_PMULHW,
24905   IX86_BUILTIN_PMULLW,
24906 
24907   IX86_BUILTIN_PSADBW,
24908   IX86_BUILTIN_PSHUFW,
24909 
24910   IX86_BUILTIN_PSLLW,
24911   IX86_BUILTIN_PSLLD,
24912   IX86_BUILTIN_PSLLQ,
24913   IX86_BUILTIN_PSRAW,
24914   IX86_BUILTIN_PSRAD,
24915   IX86_BUILTIN_PSRLW,
24916   IX86_BUILTIN_PSRLD,
24917   IX86_BUILTIN_PSRLQ,
24918   IX86_BUILTIN_PSLLWI,
24919   IX86_BUILTIN_PSLLDI,
24920   IX86_BUILTIN_PSLLQI,
24921   IX86_BUILTIN_PSRAWI,
24922   IX86_BUILTIN_PSRADI,
24923   IX86_BUILTIN_PSRLWI,
24924   IX86_BUILTIN_PSRLDI,
24925   IX86_BUILTIN_PSRLQI,
24926 
24927   IX86_BUILTIN_PUNPCKHBW,
24928   IX86_BUILTIN_PUNPCKHWD,
24929   IX86_BUILTIN_PUNPCKHDQ,
24930   IX86_BUILTIN_PUNPCKLBW,
24931   IX86_BUILTIN_PUNPCKLWD,
24932   IX86_BUILTIN_PUNPCKLDQ,
24933 
24934   IX86_BUILTIN_SHUFPS,
24935 
24936   IX86_BUILTIN_RCPPS,
24937   IX86_BUILTIN_RCPSS,
24938   IX86_BUILTIN_RSQRTPS,
24939   IX86_BUILTIN_RSQRTPS_NR,
24940   IX86_BUILTIN_RSQRTSS,
24941   IX86_BUILTIN_RSQRTF,
24942   IX86_BUILTIN_SQRTPS,
24943   IX86_BUILTIN_SQRTPS_NR,
24944   IX86_BUILTIN_SQRTSS,
24945 
24946   IX86_BUILTIN_UNPCKHPS,
24947   IX86_BUILTIN_UNPCKLPS,
24948 
24949   IX86_BUILTIN_ANDPS,
24950   IX86_BUILTIN_ANDNPS,
24951   IX86_BUILTIN_ORPS,
24952   IX86_BUILTIN_XORPS,
24953 
24954   IX86_BUILTIN_EMMS,
24955   IX86_BUILTIN_LDMXCSR,
24956   IX86_BUILTIN_STMXCSR,
24957   IX86_BUILTIN_SFENCE,
24958 
24959   /* 3DNow! Original */
24960   IX86_BUILTIN_FEMMS,
24961   IX86_BUILTIN_PAVGUSB,
24962   IX86_BUILTIN_PF2ID,
24963   IX86_BUILTIN_PFACC,
24964   IX86_BUILTIN_PFADD,
24965   IX86_BUILTIN_PFCMPEQ,
24966   IX86_BUILTIN_PFCMPGE,
24967   IX86_BUILTIN_PFCMPGT,
24968   IX86_BUILTIN_PFMAX,
24969   IX86_BUILTIN_PFMIN,
24970   IX86_BUILTIN_PFMUL,
24971   IX86_BUILTIN_PFRCP,
24972   IX86_BUILTIN_PFRCPIT1,
24973   IX86_BUILTIN_PFRCPIT2,
24974   IX86_BUILTIN_PFRSQIT1,
24975   IX86_BUILTIN_PFRSQRT,
24976   IX86_BUILTIN_PFSUB,
24977   IX86_BUILTIN_PFSUBR,
24978   IX86_BUILTIN_PI2FD,
24979   IX86_BUILTIN_PMULHRW,
24980 
24981   /* 3DNow! Athlon Extensions */
24982   IX86_BUILTIN_PF2IW,
24983   IX86_BUILTIN_PFNACC,
24984   IX86_BUILTIN_PFPNACC,
24985   IX86_BUILTIN_PI2FW,
24986   IX86_BUILTIN_PSWAPDSI,
24987   IX86_BUILTIN_PSWAPDSF,
24988 
24989   /* SSE2 */
24990   IX86_BUILTIN_ADDPD,
24991   IX86_BUILTIN_ADDSD,
24992   IX86_BUILTIN_DIVPD,
24993   IX86_BUILTIN_DIVSD,
24994   IX86_BUILTIN_MULPD,
24995   IX86_BUILTIN_MULSD,
24996   IX86_BUILTIN_SUBPD,
24997   IX86_BUILTIN_SUBSD,
24998 
24999   IX86_BUILTIN_CMPEQPD,
25000   IX86_BUILTIN_CMPLTPD,
25001   IX86_BUILTIN_CMPLEPD,
25002   IX86_BUILTIN_CMPGTPD,
25003   IX86_BUILTIN_CMPGEPD,
25004   IX86_BUILTIN_CMPNEQPD,
25005   IX86_BUILTIN_CMPNLTPD,
25006   IX86_BUILTIN_CMPNLEPD,
25007   IX86_BUILTIN_CMPNGTPD,
25008   IX86_BUILTIN_CMPNGEPD,
25009   IX86_BUILTIN_CMPORDPD,
25010   IX86_BUILTIN_CMPUNORDPD,
25011   IX86_BUILTIN_CMPEQSD,
25012   IX86_BUILTIN_CMPLTSD,
25013   IX86_BUILTIN_CMPLESD,
25014   IX86_BUILTIN_CMPNEQSD,
25015   IX86_BUILTIN_CMPNLTSD,
25016   IX86_BUILTIN_CMPNLESD,
25017   IX86_BUILTIN_CMPORDSD,
25018   IX86_BUILTIN_CMPUNORDSD,
25019 
25020   IX86_BUILTIN_COMIEQSD,
25021   IX86_BUILTIN_COMILTSD,
25022   IX86_BUILTIN_COMILESD,
25023   IX86_BUILTIN_COMIGTSD,
25024   IX86_BUILTIN_COMIGESD,
25025   IX86_BUILTIN_COMINEQSD,
25026   IX86_BUILTIN_UCOMIEQSD,
25027   IX86_BUILTIN_UCOMILTSD,
25028   IX86_BUILTIN_UCOMILESD,
25029   IX86_BUILTIN_UCOMIGTSD,
25030   IX86_BUILTIN_UCOMIGESD,
25031   IX86_BUILTIN_UCOMINEQSD,
25032 
25033   IX86_BUILTIN_MAXPD,
25034   IX86_BUILTIN_MAXSD,
25035   IX86_BUILTIN_MINPD,
25036   IX86_BUILTIN_MINSD,
25037 
25038   IX86_BUILTIN_ANDPD,
25039   IX86_BUILTIN_ANDNPD,
25040   IX86_BUILTIN_ORPD,
25041   IX86_BUILTIN_XORPD,
25042 
25043   IX86_BUILTIN_SQRTPD,
25044   IX86_BUILTIN_SQRTSD,
25045 
25046   IX86_BUILTIN_UNPCKHPD,
25047   IX86_BUILTIN_UNPCKLPD,
25048 
25049   IX86_BUILTIN_SHUFPD,
25050 
25051   IX86_BUILTIN_LOADUPD,
25052   IX86_BUILTIN_STOREUPD,
25053   IX86_BUILTIN_MOVSD,
25054 
25055   IX86_BUILTIN_LOADHPD,
25056   IX86_BUILTIN_LOADLPD,
25057 
25058   IX86_BUILTIN_CVTDQ2PD,
25059   IX86_BUILTIN_CVTDQ2PS,
25060 
25061   IX86_BUILTIN_CVTPD2DQ,
25062   IX86_BUILTIN_CVTPD2PI,
25063   IX86_BUILTIN_CVTPD2PS,
25064   IX86_BUILTIN_CVTTPD2DQ,
25065   IX86_BUILTIN_CVTTPD2PI,
25066 
25067   IX86_BUILTIN_CVTPI2PD,
25068   IX86_BUILTIN_CVTSI2SD,
25069   IX86_BUILTIN_CVTSI642SD,
25070 
25071   IX86_BUILTIN_CVTSD2SI,
25072   IX86_BUILTIN_CVTSD2SI64,
25073   IX86_BUILTIN_CVTSD2SS,
25074   IX86_BUILTIN_CVTSS2SD,
25075   IX86_BUILTIN_CVTTSD2SI,
25076   IX86_BUILTIN_CVTTSD2SI64,
25077 
25078   IX86_BUILTIN_CVTPS2DQ,
25079   IX86_BUILTIN_CVTPS2PD,
25080   IX86_BUILTIN_CVTTPS2DQ,
25081 
25082   IX86_BUILTIN_MOVNTI,
25083   IX86_BUILTIN_MOVNTI64,
25084   IX86_BUILTIN_MOVNTPD,
25085   IX86_BUILTIN_MOVNTDQ,
25086 
25087   IX86_BUILTIN_MOVQ128,
25088 
25089   /* SSE2 MMX */
25090   IX86_BUILTIN_MASKMOVDQU,
25091   IX86_BUILTIN_MOVMSKPD,
25092   IX86_BUILTIN_PMOVMSKB128,
25093 
25094   IX86_BUILTIN_PACKSSWB128,
25095   IX86_BUILTIN_PACKSSDW128,
25096   IX86_BUILTIN_PACKUSWB128,
25097 
25098   IX86_BUILTIN_PADDB128,
25099   IX86_BUILTIN_PADDW128,
25100   IX86_BUILTIN_PADDD128,
25101   IX86_BUILTIN_PADDQ128,
25102   IX86_BUILTIN_PADDSB128,
25103   IX86_BUILTIN_PADDSW128,
25104   IX86_BUILTIN_PADDUSB128,
25105   IX86_BUILTIN_PADDUSW128,
25106   IX86_BUILTIN_PSUBB128,
25107   IX86_BUILTIN_PSUBW128,
25108   IX86_BUILTIN_PSUBD128,
25109   IX86_BUILTIN_PSUBQ128,
25110   IX86_BUILTIN_PSUBSB128,
25111   IX86_BUILTIN_PSUBSW128,
25112   IX86_BUILTIN_PSUBUSB128,
25113   IX86_BUILTIN_PSUBUSW128,
25114 
25115   IX86_BUILTIN_PAND128,
25116   IX86_BUILTIN_PANDN128,
25117   IX86_BUILTIN_POR128,
25118   IX86_BUILTIN_PXOR128,
25119 
25120   IX86_BUILTIN_PAVGB128,
25121   IX86_BUILTIN_PAVGW128,
25122 
25123   IX86_BUILTIN_PCMPEQB128,
25124   IX86_BUILTIN_PCMPEQW128,
25125   IX86_BUILTIN_PCMPEQD128,
25126   IX86_BUILTIN_PCMPGTB128,
25127   IX86_BUILTIN_PCMPGTW128,
25128   IX86_BUILTIN_PCMPGTD128,
25129 
25130   IX86_BUILTIN_PMADDWD128,
25131 
25132   IX86_BUILTIN_PMAXSW128,
25133   IX86_BUILTIN_PMAXUB128,
25134   IX86_BUILTIN_PMINSW128,
25135   IX86_BUILTIN_PMINUB128,
25136 
25137   IX86_BUILTIN_PMULUDQ,
25138   IX86_BUILTIN_PMULUDQ128,
25139   IX86_BUILTIN_PMULHUW128,
25140   IX86_BUILTIN_PMULHW128,
25141   IX86_BUILTIN_PMULLW128,
25142 
25143   IX86_BUILTIN_PSADBW128,
25144   IX86_BUILTIN_PSHUFHW,
25145   IX86_BUILTIN_PSHUFLW,
25146   IX86_BUILTIN_PSHUFD,
25147 
25148   IX86_BUILTIN_PSLLDQI128,
25149   IX86_BUILTIN_PSLLWI128,
25150   IX86_BUILTIN_PSLLDI128,
25151   IX86_BUILTIN_PSLLQI128,
25152   IX86_BUILTIN_PSRAWI128,
25153   IX86_BUILTIN_PSRADI128,
25154   IX86_BUILTIN_PSRLDQI128,
25155   IX86_BUILTIN_PSRLWI128,
25156   IX86_BUILTIN_PSRLDI128,
25157   IX86_BUILTIN_PSRLQI128,
25158 
25159   IX86_BUILTIN_PSLLDQ128,
25160   IX86_BUILTIN_PSLLW128,
25161   IX86_BUILTIN_PSLLD128,
25162   IX86_BUILTIN_PSLLQ128,
25163   IX86_BUILTIN_PSRAW128,
25164   IX86_BUILTIN_PSRAD128,
25165   IX86_BUILTIN_PSRLW128,
25166   IX86_BUILTIN_PSRLD128,
25167   IX86_BUILTIN_PSRLQ128,
25168 
25169   IX86_BUILTIN_PUNPCKHBW128,
25170   IX86_BUILTIN_PUNPCKHWD128,
25171   IX86_BUILTIN_PUNPCKHDQ128,
25172   IX86_BUILTIN_PUNPCKHQDQ128,
25173   IX86_BUILTIN_PUNPCKLBW128,
25174   IX86_BUILTIN_PUNPCKLWD128,
25175   IX86_BUILTIN_PUNPCKLDQ128,
25176   IX86_BUILTIN_PUNPCKLQDQ128,
25177 
25178   IX86_BUILTIN_CLFLUSH,
25179   IX86_BUILTIN_MFENCE,
25180   IX86_BUILTIN_LFENCE,
25181   IX86_BUILTIN_PAUSE,
25182 
25183   IX86_BUILTIN_BSRSI,
25184   IX86_BUILTIN_BSRDI,
25185   IX86_BUILTIN_RDPMC,
25186   IX86_BUILTIN_RDTSC,
25187   IX86_BUILTIN_RDTSCP,
25188   IX86_BUILTIN_ROLQI,
25189   IX86_BUILTIN_ROLHI,
25190   IX86_BUILTIN_RORQI,
25191   IX86_BUILTIN_RORHI,
25192 
25193   /* SSE3.  */
25194   IX86_BUILTIN_ADDSUBPS,
25195   IX86_BUILTIN_HADDPS,
25196   IX86_BUILTIN_HSUBPS,
25197   IX86_BUILTIN_MOVSHDUP,
25198   IX86_BUILTIN_MOVSLDUP,
25199   IX86_BUILTIN_ADDSUBPD,
25200   IX86_BUILTIN_HADDPD,
25201   IX86_BUILTIN_HSUBPD,
25202   IX86_BUILTIN_LDDQU,
25203 
25204   IX86_BUILTIN_MONITOR,
25205   IX86_BUILTIN_MWAIT,
25206 
25207   /* SSSE3.  */
25208   IX86_BUILTIN_PHADDW,
25209   IX86_BUILTIN_PHADDD,
25210   IX86_BUILTIN_PHADDSW,
25211   IX86_BUILTIN_PHSUBW,
25212   IX86_BUILTIN_PHSUBD,
25213   IX86_BUILTIN_PHSUBSW,
25214   IX86_BUILTIN_PMADDUBSW,
25215   IX86_BUILTIN_PMULHRSW,
25216   IX86_BUILTIN_PSHUFB,
25217   IX86_BUILTIN_PSIGNB,
25218   IX86_BUILTIN_PSIGNW,
25219   IX86_BUILTIN_PSIGND,
25220   IX86_BUILTIN_PALIGNR,
25221   IX86_BUILTIN_PABSB,
25222   IX86_BUILTIN_PABSW,
25223   IX86_BUILTIN_PABSD,
25224 
25225   IX86_BUILTIN_PHADDW128,
25226   IX86_BUILTIN_PHADDD128,
25227   IX86_BUILTIN_PHADDSW128,
25228   IX86_BUILTIN_PHSUBW128,
25229   IX86_BUILTIN_PHSUBD128,
25230   IX86_BUILTIN_PHSUBSW128,
25231   IX86_BUILTIN_PMADDUBSW128,
25232   IX86_BUILTIN_PMULHRSW128,
25233   IX86_BUILTIN_PSHUFB128,
25234   IX86_BUILTIN_PSIGNB128,
25235   IX86_BUILTIN_PSIGNW128,
25236   IX86_BUILTIN_PSIGND128,
25237   IX86_BUILTIN_PALIGNR128,
25238   IX86_BUILTIN_PABSB128,
25239   IX86_BUILTIN_PABSW128,
25240   IX86_BUILTIN_PABSD128,
25241 
25242   /* AMDFAM10 - SSE4A New Instructions.  */
25243   IX86_BUILTIN_MOVNTSD,
25244   IX86_BUILTIN_MOVNTSS,
25245   IX86_BUILTIN_EXTRQI,
25246   IX86_BUILTIN_EXTRQ,
25247   IX86_BUILTIN_INSERTQI,
25248   IX86_BUILTIN_INSERTQ,
25249 
25250   /* SSE4.1.  */
25251   IX86_BUILTIN_BLENDPD,
25252   IX86_BUILTIN_BLENDPS,
25253   IX86_BUILTIN_BLENDVPD,
25254   IX86_BUILTIN_BLENDVPS,
25255   IX86_BUILTIN_PBLENDVB128,
25256   IX86_BUILTIN_PBLENDW128,
25257 
25258   IX86_BUILTIN_DPPD,
25259   IX86_BUILTIN_DPPS,
25260 
25261   IX86_BUILTIN_INSERTPS128,
25262 
25263   IX86_BUILTIN_MOVNTDQA,
25264   IX86_BUILTIN_MPSADBW128,
25265   IX86_BUILTIN_PACKUSDW128,
25266   IX86_BUILTIN_PCMPEQQ,
25267   IX86_BUILTIN_PHMINPOSUW128,
25268 
25269   IX86_BUILTIN_PMAXSB128,
25270   IX86_BUILTIN_PMAXSD128,
25271   IX86_BUILTIN_PMAXUD128,
25272   IX86_BUILTIN_PMAXUW128,
25273 
25274   IX86_BUILTIN_PMINSB128,
25275   IX86_BUILTIN_PMINSD128,
25276   IX86_BUILTIN_PMINUD128,
25277   IX86_BUILTIN_PMINUW128,
25278 
25279   IX86_BUILTIN_PMOVSXBW128,
25280   IX86_BUILTIN_PMOVSXBD128,
25281   IX86_BUILTIN_PMOVSXBQ128,
25282   IX86_BUILTIN_PMOVSXWD128,
25283   IX86_BUILTIN_PMOVSXWQ128,
25284   IX86_BUILTIN_PMOVSXDQ128,
25285 
25286   IX86_BUILTIN_PMOVZXBW128,
25287   IX86_BUILTIN_PMOVZXBD128,
25288   IX86_BUILTIN_PMOVZXBQ128,
25289   IX86_BUILTIN_PMOVZXWD128,
25290   IX86_BUILTIN_PMOVZXWQ128,
25291   IX86_BUILTIN_PMOVZXDQ128,
25292 
25293   IX86_BUILTIN_PMULDQ128,
25294   IX86_BUILTIN_PMULLD128,
25295 
25296   IX86_BUILTIN_ROUNDSD,
25297   IX86_BUILTIN_ROUNDSS,
25298 
25299   IX86_BUILTIN_ROUNDPD,
25300   IX86_BUILTIN_ROUNDPS,
25301 
25302   IX86_BUILTIN_FLOORPD,
25303   IX86_BUILTIN_CEILPD,
25304   IX86_BUILTIN_TRUNCPD,
25305   IX86_BUILTIN_RINTPD,
25306   IX86_BUILTIN_ROUNDPD_AZ,
25307 
25308   IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25309   IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25310   IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25311 
25312   IX86_BUILTIN_FLOORPS,
25313   IX86_BUILTIN_CEILPS,
25314   IX86_BUILTIN_TRUNCPS,
25315   IX86_BUILTIN_RINTPS,
25316   IX86_BUILTIN_ROUNDPS_AZ,
25317 
25318   IX86_BUILTIN_FLOORPS_SFIX,
25319   IX86_BUILTIN_CEILPS_SFIX,
25320   IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25321 
25322   IX86_BUILTIN_PTESTZ,
25323   IX86_BUILTIN_PTESTC,
25324   IX86_BUILTIN_PTESTNZC,
25325 
25326   IX86_BUILTIN_VEC_INIT_V2SI,
25327   IX86_BUILTIN_VEC_INIT_V4HI,
25328   IX86_BUILTIN_VEC_INIT_V8QI,
25329   IX86_BUILTIN_VEC_EXT_V2DF,
25330   IX86_BUILTIN_VEC_EXT_V2DI,
25331   IX86_BUILTIN_VEC_EXT_V4SF,
25332   IX86_BUILTIN_VEC_EXT_V4SI,
25333   IX86_BUILTIN_VEC_EXT_V8HI,
25334   IX86_BUILTIN_VEC_EXT_V2SI,
25335   IX86_BUILTIN_VEC_EXT_V4HI,
25336   IX86_BUILTIN_VEC_EXT_V16QI,
25337   IX86_BUILTIN_VEC_SET_V2DI,
25338   IX86_BUILTIN_VEC_SET_V4SF,
25339   IX86_BUILTIN_VEC_SET_V4SI,
25340   IX86_BUILTIN_VEC_SET_V8HI,
25341   IX86_BUILTIN_VEC_SET_V4HI,
25342   IX86_BUILTIN_VEC_SET_V16QI,
25343 
25344   IX86_BUILTIN_VEC_PACK_SFIX,
25345   IX86_BUILTIN_VEC_PACK_SFIX256,
25346 
25347   /* SSE4.2.  */
25348   IX86_BUILTIN_CRC32QI,
25349   IX86_BUILTIN_CRC32HI,
25350   IX86_BUILTIN_CRC32SI,
25351   IX86_BUILTIN_CRC32DI,
25352 
25353   IX86_BUILTIN_PCMPESTRI128,
25354   IX86_BUILTIN_PCMPESTRM128,
25355   IX86_BUILTIN_PCMPESTRA128,
25356   IX86_BUILTIN_PCMPESTRC128,
25357   IX86_BUILTIN_PCMPESTRO128,
25358   IX86_BUILTIN_PCMPESTRS128,
25359   IX86_BUILTIN_PCMPESTRZ128,
25360   IX86_BUILTIN_PCMPISTRI128,
25361   IX86_BUILTIN_PCMPISTRM128,
25362   IX86_BUILTIN_PCMPISTRA128,
25363   IX86_BUILTIN_PCMPISTRC128,
25364   IX86_BUILTIN_PCMPISTRO128,
25365   IX86_BUILTIN_PCMPISTRS128,
25366   IX86_BUILTIN_PCMPISTRZ128,
25367 
25368   IX86_BUILTIN_PCMPGTQ,
25369 
25370   /* AES instructions */
25371   IX86_BUILTIN_AESENC128,
25372   IX86_BUILTIN_AESENCLAST128,
25373   IX86_BUILTIN_AESDEC128,
25374   IX86_BUILTIN_AESDECLAST128,
25375   IX86_BUILTIN_AESIMC128,
25376   IX86_BUILTIN_AESKEYGENASSIST128,
25377 
25378   /* PCLMUL instruction */
25379   IX86_BUILTIN_PCLMULQDQ128,
25380 
25381   /* AVX */
25382   IX86_BUILTIN_ADDPD256,
25383   IX86_BUILTIN_ADDPS256,
25384   IX86_BUILTIN_ADDSUBPD256,
25385   IX86_BUILTIN_ADDSUBPS256,
25386   IX86_BUILTIN_ANDPD256,
25387   IX86_BUILTIN_ANDPS256,
25388   IX86_BUILTIN_ANDNPD256,
25389   IX86_BUILTIN_ANDNPS256,
25390   IX86_BUILTIN_BLENDPD256,
25391   IX86_BUILTIN_BLENDPS256,
25392   IX86_BUILTIN_BLENDVPD256,
25393   IX86_BUILTIN_BLENDVPS256,
25394   IX86_BUILTIN_DIVPD256,
25395   IX86_BUILTIN_DIVPS256,
25396   IX86_BUILTIN_DPPS256,
25397   IX86_BUILTIN_HADDPD256,
25398   IX86_BUILTIN_HADDPS256,
25399   IX86_BUILTIN_HSUBPD256,
25400   IX86_BUILTIN_HSUBPS256,
25401   IX86_BUILTIN_MAXPD256,
25402   IX86_BUILTIN_MAXPS256,
25403   IX86_BUILTIN_MINPD256,
25404   IX86_BUILTIN_MINPS256,
25405   IX86_BUILTIN_MULPD256,
25406   IX86_BUILTIN_MULPS256,
25407   IX86_BUILTIN_ORPD256,
25408   IX86_BUILTIN_ORPS256,
25409   IX86_BUILTIN_SHUFPD256,
25410   IX86_BUILTIN_SHUFPS256,
25411   IX86_BUILTIN_SUBPD256,
25412   IX86_BUILTIN_SUBPS256,
25413   IX86_BUILTIN_XORPD256,
25414   IX86_BUILTIN_XORPS256,
25415   IX86_BUILTIN_CMPSD,
25416   IX86_BUILTIN_CMPSS,
25417   IX86_BUILTIN_CMPPD,
25418   IX86_BUILTIN_CMPPS,
25419   IX86_BUILTIN_CMPPD256,
25420   IX86_BUILTIN_CMPPS256,
25421   IX86_BUILTIN_CVTDQ2PD256,
25422   IX86_BUILTIN_CVTDQ2PS256,
25423   IX86_BUILTIN_CVTPD2PS256,
25424   IX86_BUILTIN_CVTPS2DQ256,
25425   IX86_BUILTIN_CVTPS2PD256,
25426   IX86_BUILTIN_CVTTPD2DQ256,
25427   IX86_BUILTIN_CVTPD2DQ256,
25428   IX86_BUILTIN_CVTTPS2DQ256,
25429   IX86_BUILTIN_EXTRACTF128PD256,
25430   IX86_BUILTIN_EXTRACTF128PS256,
25431   IX86_BUILTIN_EXTRACTF128SI256,
25432   IX86_BUILTIN_VZEROALL,
25433   IX86_BUILTIN_VZEROUPPER,
25434   IX86_BUILTIN_VPERMILVARPD,
25435   IX86_BUILTIN_VPERMILVARPS,
25436   IX86_BUILTIN_VPERMILVARPD256,
25437   IX86_BUILTIN_VPERMILVARPS256,
25438   IX86_BUILTIN_VPERMILPD,
25439   IX86_BUILTIN_VPERMILPS,
25440   IX86_BUILTIN_VPERMILPD256,
25441   IX86_BUILTIN_VPERMILPS256,
25442   IX86_BUILTIN_VPERMIL2PD,
25443   IX86_BUILTIN_VPERMIL2PS,
25444   IX86_BUILTIN_VPERMIL2PD256,
25445   IX86_BUILTIN_VPERMIL2PS256,
25446   IX86_BUILTIN_VPERM2F128PD256,
25447   IX86_BUILTIN_VPERM2F128PS256,
25448   IX86_BUILTIN_VPERM2F128SI256,
25449   IX86_BUILTIN_VBROADCASTSS,
25450   IX86_BUILTIN_VBROADCASTSD256,
25451   IX86_BUILTIN_VBROADCASTSS256,
25452   IX86_BUILTIN_VBROADCASTPD256,
25453   IX86_BUILTIN_VBROADCASTPS256,
25454   IX86_BUILTIN_VINSERTF128PD256,
25455   IX86_BUILTIN_VINSERTF128PS256,
25456   IX86_BUILTIN_VINSERTF128SI256,
25457   IX86_BUILTIN_LOADUPD256,
25458   IX86_BUILTIN_LOADUPS256,
25459   IX86_BUILTIN_STOREUPD256,
25460   IX86_BUILTIN_STOREUPS256,
25461   IX86_BUILTIN_LDDQU256,
25462   IX86_BUILTIN_MOVNTDQ256,
25463   IX86_BUILTIN_MOVNTPD256,
25464   IX86_BUILTIN_MOVNTPS256,
25465   IX86_BUILTIN_LOADDQU256,
25466   IX86_BUILTIN_STOREDQU256,
25467   IX86_BUILTIN_MASKLOADPD,
25468   IX86_BUILTIN_MASKLOADPS,
25469   IX86_BUILTIN_MASKSTOREPD,
25470   IX86_BUILTIN_MASKSTOREPS,
25471   IX86_BUILTIN_MASKLOADPD256,
25472   IX86_BUILTIN_MASKLOADPS256,
25473   IX86_BUILTIN_MASKSTOREPD256,
25474   IX86_BUILTIN_MASKSTOREPS256,
25475   IX86_BUILTIN_MOVSHDUP256,
25476   IX86_BUILTIN_MOVSLDUP256,
25477   IX86_BUILTIN_MOVDDUP256,
25478 
25479   IX86_BUILTIN_SQRTPD256,
25480   IX86_BUILTIN_SQRTPS256,
25481   IX86_BUILTIN_SQRTPS_NR256,
25482   IX86_BUILTIN_RSQRTPS256,
25483   IX86_BUILTIN_RSQRTPS_NR256,
25484 
25485   IX86_BUILTIN_RCPPS256,
25486 
25487   IX86_BUILTIN_ROUNDPD256,
25488   IX86_BUILTIN_ROUNDPS256,
25489 
25490   IX86_BUILTIN_FLOORPD256,
25491   IX86_BUILTIN_CEILPD256,
25492   IX86_BUILTIN_TRUNCPD256,
25493   IX86_BUILTIN_RINTPD256,
25494   IX86_BUILTIN_ROUNDPD_AZ256,
25495 
25496   IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25497   IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25498   IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25499 
25500   IX86_BUILTIN_FLOORPS256,
25501   IX86_BUILTIN_CEILPS256,
25502   IX86_BUILTIN_TRUNCPS256,
25503   IX86_BUILTIN_RINTPS256,
25504   IX86_BUILTIN_ROUNDPS_AZ256,
25505 
25506   IX86_BUILTIN_FLOORPS_SFIX256,
25507   IX86_BUILTIN_CEILPS_SFIX256,
25508   IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25509 
25510   IX86_BUILTIN_UNPCKHPD256,
25511   IX86_BUILTIN_UNPCKLPD256,
25512   IX86_BUILTIN_UNPCKHPS256,
25513   IX86_BUILTIN_UNPCKLPS256,
25514 
25515   IX86_BUILTIN_SI256_SI,
25516   IX86_BUILTIN_PS256_PS,
25517   IX86_BUILTIN_PD256_PD,
25518   IX86_BUILTIN_SI_SI256,
25519   IX86_BUILTIN_PS_PS256,
25520   IX86_BUILTIN_PD_PD256,
25521 
25522   IX86_BUILTIN_VTESTZPD,
25523   IX86_BUILTIN_VTESTCPD,
25524   IX86_BUILTIN_VTESTNZCPD,
25525   IX86_BUILTIN_VTESTZPS,
25526   IX86_BUILTIN_VTESTCPS,
25527   IX86_BUILTIN_VTESTNZCPS,
25528   IX86_BUILTIN_VTESTZPD256,
25529   IX86_BUILTIN_VTESTCPD256,
25530   IX86_BUILTIN_VTESTNZCPD256,
25531   IX86_BUILTIN_VTESTZPS256,
25532   IX86_BUILTIN_VTESTCPS256,
25533   IX86_BUILTIN_VTESTNZCPS256,
25534   IX86_BUILTIN_PTESTZ256,
25535   IX86_BUILTIN_PTESTC256,
25536   IX86_BUILTIN_PTESTNZC256,
25537 
25538   IX86_BUILTIN_MOVMSKPD256,
25539   IX86_BUILTIN_MOVMSKPS256,
25540 
25541   /* AVX2 */
25542   IX86_BUILTIN_MPSADBW256,
25543   IX86_BUILTIN_PABSB256,
25544   IX86_BUILTIN_PABSW256,
25545   IX86_BUILTIN_PABSD256,
25546   IX86_BUILTIN_PACKSSDW256,
25547   IX86_BUILTIN_PACKSSWB256,
25548   IX86_BUILTIN_PACKUSDW256,
25549   IX86_BUILTIN_PACKUSWB256,
25550   IX86_BUILTIN_PADDB256,
25551   IX86_BUILTIN_PADDW256,
25552   IX86_BUILTIN_PADDD256,
25553   IX86_BUILTIN_PADDQ256,
25554   IX86_BUILTIN_PADDSB256,
25555   IX86_BUILTIN_PADDSW256,
25556   IX86_BUILTIN_PADDUSB256,
25557   IX86_BUILTIN_PADDUSW256,
25558   IX86_BUILTIN_PALIGNR256,
25559   IX86_BUILTIN_AND256I,
25560   IX86_BUILTIN_ANDNOT256I,
25561   IX86_BUILTIN_PAVGB256,
25562   IX86_BUILTIN_PAVGW256,
25563   IX86_BUILTIN_PBLENDVB256,
25564   IX86_BUILTIN_PBLENDVW256,
25565   IX86_BUILTIN_PCMPEQB256,
25566   IX86_BUILTIN_PCMPEQW256,
25567   IX86_BUILTIN_PCMPEQD256,
25568   IX86_BUILTIN_PCMPEQQ256,
25569   IX86_BUILTIN_PCMPGTB256,
25570   IX86_BUILTIN_PCMPGTW256,
25571   IX86_BUILTIN_PCMPGTD256,
25572   IX86_BUILTIN_PCMPGTQ256,
25573   IX86_BUILTIN_PHADDW256,
25574   IX86_BUILTIN_PHADDD256,
25575   IX86_BUILTIN_PHADDSW256,
25576   IX86_BUILTIN_PHSUBW256,
25577   IX86_BUILTIN_PHSUBD256,
25578   IX86_BUILTIN_PHSUBSW256,
25579   IX86_BUILTIN_PMADDUBSW256,
25580   IX86_BUILTIN_PMADDWD256,
25581   IX86_BUILTIN_PMAXSB256,
25582   IX86_BUILTIN_PMAXSW256,
25583   IX86_BUILTIN_PMAXSD256,
25584   IX86_BUILTIN_PMAXUB256,
25585   IX86_BUILTIN_PMAXUW256,
25586   IX86_BUILTIN_PMAXUD256,
25587   IX86_BUILTIN_PMINSB256,
25588   IX86_BUILTIN_PMINSW256,
25589   IX86_BUILTIN_PMINSD256,
25590   IX86_BUILTIN_PMINUB256,
25591   IX86_BUILTIN_PMINUW256,
25592   IX86_BUILTIN_PMINUD256,
25593   IX86_BUILTIN_PMOVMSKB256,
25594   IX86_BUILTIN_PMOVSXBW256,
25595   IX86_BUILTIN_PMOVSXBD256,
25596   IX86_BUILTIN_PMOVSXBQ256,
25597   IX86_BUILTIN_PMOVSXWD256,
25598   IX86_BUILTIN_PMOVSXWQ256,
25599   IX86_BUILTIN_PMOVSXDQ256,
25600   IX86_BUILTIN_PMOVZXBW256,
25601   IX86_BUILTIN_PMOVZXBD256,
25602   IX86_BUILTIN_PMOVZXBQ256,
25603   IX86_BUILTIN_PMOVZXWD256,
25604   IX86_BUILTIN_PMOVZXWQ256,
25605   IX86_BUILTIN_PMOVZXDQ256,
25606   IX86_BUILTIN_PMULDQ256,
25607   IX86_BUILTIN_PMULHRSW256,
25608   IX86_BUILTIN_PMULHUW256,
25609   IX86_BUILTIN_PMULHW256,
25610   IX86_BUILTIN_PMULLW256,
25611   IX86_BUILTIN_PMULLD256,
25612   IX86_BUILTIN_PMULUDQ256,
25613   IX86_BUILTIN_POR256,
25614   IX86_BUILTIN_PSADBW256,
25615   IX86_BUILTIN_PSHUFB256,
25616   IX86_BUILTIN_PSHUFD256,
25617   IX86_BUILTIN_PSHUFHW256,
25618   IX86_BUILTIN_PSHUFLW256,
25619   IX86_BUILTIN_PSIGNB256,
25620   IX86_BUILTIN_PSIGNW256,
25621   IX86_BUILTIN_PSIGND256,
25622   IX86_BUILTIN_PSLLDQI256,
25623   IX86_BUILTIN_PSLLWI256,
25624   IX86_BUILTIN_PSLLW256,
25625   IX86_BUILTIN_PSLLDI256,
25626   IX86_BUILTIN_PSLLD256,
25627   IX86_BUILTIN_PSLLQI256,
25628   IX86_BUILTIN_PSLLQ256,
25629   IX86_BUILTIN_PSRAWI256,
25630   IX86_BUILTIN_PSRAW256,
25631   IX86_BUILTIN_PSRADI256,
25632   IX86_BUILTIN_PSRAD256,
25633   IX86_BUILTIN_PSRLDQI256,
25634   IX86_BUILTIN_PSRLWI256,
25635   IX86_BUILTIN_PSRLW256,
25636   IX86_BUILTIN_PSRLDI256,
25637   IX86_BUILTIN_PSRLD256,
25638   IX86_BUILTIN_PSRLQI256,
25639   IX86_BUILTIN_PSRLQ256,
25640   IX86_BUILTIN_PSUBB256,
25641   IX86_BUILTIN_PSUBW256,
25642   IX86_BUILTIN_PSUBD256,
25643   IX86_BUILTIN_PSUBQ256,
25644   IX86_BUILTIN_PSUBSB256,
25645   IX86_BUILTIN_PSUBSW256,
25646   IX86_BUILTIN_PSUBUSB256,
25647   IX86_BUILTIN_PSUBUSW256,
25648   IX86_BUILTIN_PUNPCKHBW256,
25649   IX86_BUILTIN_PUNPCKHWD256,
25650   IX86_BUILTIN_PUNPCKHDQ256,
25651   IX86_BUILTIN_PUNPCKHQDQ256,
25652   IX86_BUILTIN_PUNPCKLBW256,
25653   IX86_BUILTIN_PUNPCKLWD256,
25654   IX86_BUILTIN_PUNPCKLDQ256,
25655   IX86_BUILTIN_PUNPCKLQDQ256,
25656   IX86_BUILTIN_PXOR256,
25657   IX86_BUILTIN_MOVNTDQA256,
25658   IX86_BUILTIN_VBROADCASTSS_PS,
25659   IX86_BUILTIN_VBROADCASTSS_PS256,
25660   IX86_BUILTIN_VBROADCASTSD_PD256,
25661   IX86_BUILTIN_VBROADCASTSI256,
25662   IX86_BUILTIN_PBLENDD256,
25663   IX86_BUILTIN_PBLENDD128,
25664   IX86_BUILTIN_PBROADCASTB256,
25665   IX86_BUILTIN_PBROADCASTW256,
25666   IX86_BUILTIN_PBROADCASTD256,
25667   IX86_BUILTIN_PBROADCASTQ256,
25668   IX86_BUILTIN_PBROADCASTB128,
25669   IX86_BUILTIN_PBROADCASTW128,
25670   IX86_BUILTIN_PBROADCASTD128,
25671   IX86_BUILTIN_PBROADCASTQ128,
25672   IX86_BUILTIN_VPERMVARSI256,
25673   IX86_BUILTIN_VPERMDF256,
25674   IX86_BUILTIN_VPERMVARSF256,
25675   IX86_BUILTIN_VPERMDI256,
25676   IX86_BUILTIN_VPERMTI256,
25677   IX86_BUILTIN_VEXTRACT128I256,
25678   IX86_BUILTIN_VINSERT128I256,
25679   IX86_BUILTIN_MASKLOADD,
25680   IX86_BUILTIN_MASKLOADQ,
25681   IX86_BUILTIN_MASKLOADD256,
25682   IX86_BUILTIN_MASKLOADQ256,
25683   IX86_BUILTIN_MASKSTORED,
25684   IX86_BUILTIN_MASKSTOREQ,
25685   IX86_BUILTIN_MASKSTORED256,
25686   IX86_BUILTIN_MASKSTOREQ256,
25687   IX86_BUILTIN_PSLLVV4DI,
25688   IX86_BUILTIN_PSLLVV2DI,
25689   IX86_BUILTIN_PSLLVV8SI,
25690   IX86_BUILTIN_PSLLVV4SI,
25691   IX86_BUILTIN_PSRAVV8SI,
25692   IX86_BUILTIN_PSRAVV4SI,
25693   IX86_BUILTIN_PSRLVV4DI,
25694   IX86_BUILTIN_PSRLVV2DI,
25695   IX86_BUILTIN_PSRLVV8SI,
25696   IX86_BUILTIN_PSRLVV4SI,
25697 
25698   IX86_BUILTIN_GATHERSIV2DF,
25699   IX86_BUILTIN_GATHERSIV4DF,
25700   IX86_BUILTIN_GATHERDIV2DF,
25701   IX86_BUILTIN_GATHERDIV4DF,
25702   IX86_BUILTIN_GATHERSIV4SF,
25703   IX86_BUILTIN_GATHERSIV8SF,
25704   IX86_BUILTIN_GATHERDIV4SF,
25705   IX86_BUILTIN_GATHERDIV8SF,
25706   IX86_BUILTIN_GATHERSIV2DI,
25707   IX86_BUILTIN_GATHERSIV4DI,
25708   IX86_BUILTIN_GATHERDIV2DI,
25709   IX86_BUILTIN_GATHERDIV4DI,
25710   IX86_BUILTIN_GATHERSIV4SI,
25711   IX86_BUILTIN_GATHERSIV8SI,
25712   IX86_BUILTIN_GATHERDIV4SI,
25713   IX86_BUILTIN_GATHERDIV8SI,
25714 
25715   /* Alternate 4 element gather for the vectorizer where
25716      all operands are 32-byte wide.  */
25717   IX86_BUILTIN_GATHERALTSIV4DF,
25718   IX86_BUILTIN_GATHERALTDIV8SF,
25719   IX86_BUILTIN_GATHERALTSIV4DI,
25720   IX86_BUILTIN_GATHERALTDIV8SI,
25721 
25722   /* TFmode support builtins.  */
25723   IX86_BUILTIN_INFQ,
25724   IX86_BUILTIN_HUGE_VALQ,
25725   IX86_BUILTIN_FABSQ,
25726   IX86_BUILTIN_COPYSIGNQ,
25727 
25728   /* Vectorizer support builtins.  */
25729   IX86_BUILTIN_CPYSGNPS,
25730   IX86_BUILTIN_CPYSGNPD,
25731   IX86_BUILTIN_CPYSGNPS256,
25732   IX86_BUILTIN_CPYSGNPD256,
25733 
25734   /* FMA4 instructions.  */
25735   IX86_BUILTIN_VFMADDSS,
25736   IX86_BUILTIN_VFMADDSD,
25737   IX86_BUILTIN_VFMADDPS,
25738   IX86_BUILTIN_VFMADDPD,
25739   IX86_BUILTIN_VFMADDPS256,
25740   IX86_BUILTIN_VFMADDPD256,
25741   IX86_BUILTIN_VFMADDSUBPS,
25742   IX86_BUILTIN_VFMADDSUBPD,
25743   IX86_BUILTIN_VFMADDSUBPS256,
25744   IX86_BUILTIN_VFMADDSUBPD256,
25745 
25746   /* FMA3 instructions.  */
25747   IX86_BUILTIN_VFMADDSS3,
25748   IX86_BUILTIN_VFMADDSD3,
25749 
25750   /* XOP instructions.  */
25751   IX86_BUILTIN_VPCMOV,
25752   IX86_BUILTIN_VPCMOV_V2DI,
25753   IX86_BUILTIN_VPCMOV_V4SI,
25754   IX86_BUILTIN_VPCMOV_V8HI,
25755   IX86_BUILTIN_VPCMOV_V16QI,
25756   IX86_BUILTIN_VPCMOV_V4SF,
25757   IX86_BUILTIN_VPCMOV_V2DF,
25758   IX86_BUILTIN_VPCMOV256,
25759   IX86_BUILTIN_VPCMOV_V4DI256,
25760   IX86_BUILTIN_VPCMOV_V8SI256,
25761   IX86_BUILTIN_VPCMOV_V16HI256,
25762   IX86_BUILTIN_VPCMOV_V32QI256,
25763   IX86_BUILTIN_VPCMOV_V8SF256,
25764   IX86_BUILTIN_VPCMOV_V4DF256,
25765 
25766   IX86_BUILTIN_VPPERM,
25767 
25768   IX86_BUILTIN_VPMACSSWW,
25769   IX86_BUILTIN_VPMACSWW,
25770   IX86_BUILTIN_VPMACSSWD,
25771   IX86_BUILTIN_VPMACSWD,
25772   IX86_BUILTIN_VPMACSSDD,
25773   IX86_BUILTIN_VPMACSDD,
25774   IX86_BUILTIN_VPMACSSDQL,
25775   IX86_BUILTIN_VPMACSSDQH,
25776   IX86_BUILTIN_VPMACSDQL,
25777   IX86_BUILTIN_VPMACSDQH,
25778   IX86_BUILTIN_VPMADCSSWD,
25779   IX86_BUILTIN_VPMADCSWD,
25780 
25781   IX86_BUILTIN_VPHADDBW,
25782   IX86_BUILTIN_VPHADDBD,
25783   IX86_BUILTIN_VPHADDBQ,
25784   IX86_BUILTIN_VPHADDWD,
25785   IX86_BUILTIN_VPHADDWQ,
25786   IX86_BUILTIN_VPHADDDQ,
25787   IX86_BUILTIN_VPHADDUBW,
25788   IX86_BUILTIN_VPHADDUBD,
25789   IX86_BUILTIN_VPHADDUBQ,
25790   IX86_BUILTIN_VPHADDUWD,
25791   IX86_BUILTIN_VPHADDUWQ,
25792   IX86_BUILTIN_VPHADDUDQ,
25793   IX86_BUILTIN_VPHSUBBW,
25794   IX86_BUILTIN_VPHSUBWD,
25795   IX86_BUILTIN_VPHSUBDQ,
25796 
25797   IX86_BUILTIN_VPROTB,
25798   IX86_BUILTIN_VPROTW,
25799   IX86_BUILTIN_VPROTD,
25800   IX86_BUILTIN_VPROTQ,
25801   IX86_BUILTIN_VPROTB_IMM,
25802   IX86_BUILTIN_VPROTW_IMM,
25803   IX86_BUILTIN_VPROTD_IMM,
25804   IX86_BUILTIN_VPROTQ_IMM,
25805 
25806   IX86_BUILTIN_VPSHLB,
25807   IX86_BUILTIN_VPSHLW,
25808   IX86_BUILTIN_VPSHLD,
25809   IX86_BUILTIN_VPSHLQ,
25810   IX86_BUILTIN_VPSHAB,
25811   IX86_BUILTIN_VPSHAW,
25812   IX86_BUILTIN_VPSHAD,
25813   IX86_BUILTIN_VPSHAQ,
25814 
25815   IX86_BUILTIN_VFRCZSS,
25816   IX86_BUILTIN_VFRCZSD,
25817   IX86_BUILTIN_VFRCZPS,
25818   IX86_BUILTIN_VFRCZPD,
25819   IX86_BUILTIN_VFRCZPS256,
25820   IX86_BUILTIN_VFRCZPD256,
25821 
25822   IX86_BUILTIN_VPCOMEQUB,
25823   IX86_BUILTIN_VPCOMNEUB,
25824   IX86_BUILTIN_VPCOMLTUB,
25825   IX86_BUILTIN_VPCOMLEUB,
25826   IX86_BUILTIN_VPCOMGTUB,
25827   IX86_BUILTIN_VPCOMGEUB,
25828   IX86_BUILTIN_VPCOMFALSEUB,
25829   IX86_BUILTIN_VPCOMTRUEUB,
25830 
25831   IX86_BUILTIN_VPCOMEQUW,
25832   IX86_BUILTIN_VPCOMNEUW,
25833   IX86_BUILTIN_VPCOMLTUW,
25834   IX86_BUILTIN_VPCOMLEUW,
25835   IX86_BUILTIN_VPCOMGTUW,
25836   IX86_BUILTIN_VPCOMGEUW,
25837   IX86_BUILTIN_VPCOMFALSEUW,
25838   IX86_BUILTIN_VPCOMTRUEUW,
25839 
25840   IX86_BUILTIN_VPCOMEQUD,
25841   IX86_BUILTIN_VPCOMNEUD,
25842   IX86_BUILTIN_VPCOMLTUD,
25843   IX86_BUILTIN_VPCOMLEUD,
25844   IX86_BUILTIN_VPCOMGTUD,
25845   IX86_BUILTIN_VPCOMGEUD,
25846   IX86_BUILTIN_VPCOMFALSEUD,
25847   IX86_BUILTIN_VPCOMTRUEUD,
25848 
25849   IX86_BUILTIN_VPCOMEQUQ,
25850   IX86_BUILTIN_VPCOMNEUQ,
25851   IX86_BUILTIN_VPCOMLTUQ,
25852   IX86_BUILTIN_VPCOMLEUQ,
25853   IX86_BUILTIN_VPCOMGTUQ,
25854   IX86_BUILTIN_VPCOMGEUQ,
25855   IX86_BUILTIN_VPCOMFALSEUQ,
25856   IX86_BUILTIN_VPCOMTRUEUQ,
25857 
25858   IX86_BUILTIN_VPCOMEQB,
25859   IX86_BUILTIN_VPCOMNEB,
25860   IX86_BUILTIN_VPCOMLTB,
25861   IX86_BUILTIN_VPCOMLEB,
25862   IX86_BUILTIN_VPCOMGTB,
25863   IX86_BUILTIN_VPCOMGEB,
25864   IX86_BUILTIN_VPCOMFALSEB,
25865   IX86_BUILTIN_VPCOMTRUEB,
25866 
25867   IX86_BUILTIN_VPCOMEQW,
25868   IX86_BUILTIN_VPCOMNEW,
25869   IX86_BUILTIN_VPCOMLTW,
25870   IX86_BUILTIN_VPCOMLEW,
25871   IX86_BUILTIN_VPCOMGTW,
25872   IX86_BUILTIN_VPCOMGEW,
25873   IX86_BUILTIN_VPCOMFALSEW,
25874   IX86_BUILTIN_VPCOMTRUEW,
25875 
25876   IX86_BUILTIN_VPCOMEQD,
25877   IX86_BUILTIN_VPCOMNED,
25878   IX86_BUILTIN_VPCOMLTD,
25879   IX86_BUILTIN_VPCOMLED,
25880   IX86_BUILTIN_VPCOMGTD,
25881   IX86_BUILTIN_VPCOMGED,
25882   IX86_BUILTIN_VPCOMFALSED,
25883   IX86_BUILTIN_VPCOMTRUED,
25884 
25885   IX86_BUILTIN_VPCOMEQQ,
25886   IX86_BUILTIN_VPCOMNEQ,
25887   IX86_BUILTIN_VPCOMLTQ,
25888   IX86_BUILTIN_VPCOMLEQ,
25889   IX86_BUILTIN_VPCOMGTQ,
25890   IX86_BUILTIN_VPCOMGEQ,
25891   IX86_BUILTIN_VPCOMFALSEQ,
25892   IX86_BUILTIN_VPCOMTRUEQ,
25893 
25894   /* LWP instructions.  */
25895   IX86_BUILTIN_LLWPCB,
25896   IX86_BUILTIN_SLWPCB,
25897   IX86_BUILTIN_LWPVAL32,
25898   IX86_BUILTIN_LWPVAL64,
25899   IX86_BUILTIN_LWPINS32,
25900   IX86_BUILTIN_LWPINS64,
25901 
25902   IX86_BUILTIN_CLZS,
25903 
25904   /* BMI instructions.  */
25905   IX86_BUILTIN_BEXTR32,
25906   IX86_BUILTIN_BEXTR64,
25907   IX86_BUILTIN_CTZS,
25908 
25909   /* TBM instructions.  */
25910   IX86_BUILTIN_BEXTRI32,
25911   IX86_BUILTIN_BEXTRI64,
25912 
25913   /* BMI2 instructions. */
25914   IX86_BUILTIN_BZHI32,
25915   IX86_BUILTIN_BZHI64,
25916   IX86_BUILTIN_PDEP32,
25917   IX86_BUILTIN_PDEP64,
25918   IX86_BUILTIN_PEXT32,
25919   IX86_BUILTIN_PEXT64,
25920 
25921   /* FSGSBASE instructions.  */
25922   IX86_BUILTIN_RDFSBASE32,
25923   IX86_BUILTIN_RDFSBASE64,
25924   IX86_BUILTIN_RDGSBASE32,
25925   IX86_BUILTIN_RDGSBASE64,
25926   IX86_BUILTIN_WRFSBASE32,
25927   IX86_BUILTIN_WRFSBASE64,
25928   IX86_BUILTIN_WRGSBASE32,
25929   IX86_BUILTIN_WRGSBASE64,
25930 
25931   /* RDRND instructions.  */
25932   IX86_BUILTIN_RDRAND16_STEP,
25933   IX86_BUILTIN_RDRAND32_STEP,
25934   IX86_BUILTIN_RDRAND64_STEP,
25935 
25936   /* F16C instructions.  */
25937   IX86_BUILTIN_CVTPH2PS,
25938   IX86_BUILTIN_CVTPH2PS256,
25939   IX86_BUILTIN_CVTPS2PH,
25940   IX86_BUILTIN_CVTPS2PH256,
25941 
25942   /* CFString built-in for Darwin */
25943   IX86_BUILTIN_CFSTRING,
25944 
25945   IX86_BUILTIN_MAX
25946 };
25947 
25948 /* Table for the ix86 builtin decls.  */
25949 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25950 
25951 /* Table of all of the builtin functions that are possible with different ISAs
25952    but are waiting to be built until a function is declared to use that
25953    ISA.  */
25954 struct builtin_isa {
25955   const char *name;		/* function name */
25956   enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25957   HOST_WIDE_INT isa;		/* isa_flags this builtin is defined for */
25958   bool const_p;			/* true if the declaration is constant */
25959   bool set_and_not_built_p;	/* true if recorded but the decl is not yet built */
25960 };
25961 
25962 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25963 
25964 
25965 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
25966    of which isa_flags to use in the ix86_builtins_isa array.  Stores the
25967    function decl in the ix86_builtins array.  Returns the function decl,
25968    or NULL_TREE if the builtin was not added.
25969 
25970    If the front end has a special hook for builtin functions, delay adding
25971    builtin functions that aren't in the current ISA until the ISA is changed
25972    with function-specific optimization.  Doing so can save about 300K for the
25973    default compiler.  When the builtin is expanded, check at that time whether
25974    it is valid.
25975 
25976    If the front end doesn't have a special hook, record all builtins, even
25977    those not in the current ISA, in case the user uses function-specific
25978    options for a different ISA, so that we don't get scope errors if a
25979    builtin is added in the middle of a function scope.  */
25980 
25981 static inline tree
25982 def_builtin (HOST_WIDE_INT mask, const char *name,
25983 	     enum ix86_builtin_func_type tcode,
25984 	     enum ix86_builtins code)
25985 {
25986   tree decl = NULL_TREE;
25987 
25988   if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25989     {
25990       ix86_builtins_isa[(int) code].isa = mask;
25991 
25992       mask &= ~OPTION_MASK_ISA_64BIT;
25993       if (mask == 0
25994 	  || (mask & ix86_isa_flags) != 0
25995 	  || (lang_hooks.builtin_function
25996 	      == lang_hooks.builtin_function_ext_scope))
25997 
25998 	{
25999 	  tree type = ix86_get_builtin_func_type (tcode);
26000 	  decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26001 				       NULL, NULL_TREE);
26002 	  ix86_builtins[(int) code] = decl;
26003 	  ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26004 	}
26005       else
26006 	{
26007 	  ix86_builtins[(int) code] = NULL_TREE;
26008 	  ix86_builtins_isa[(int) code].tcode = tcode;
26009 	  ix86_builtins_isa[(int) code].name = name;
26010 	  ix86_builtins_isa[(int) code].const_p = false;
26011 	  ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26012 	}
26013     }
26014 
26015   return decl;
26016 }
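
/* As an illustration only (the real registrations appear later in this
   file), a builtin is typically registered along the lines of

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
		  VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);

   and the decl is built immediately only if SSE is already in
   ix86_isa_flags or the front end's builtin_function hook is the same
   as its ext_scope hook; otherwise it is deferred and later created by
   ix86_add_new_builtins.  */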
26017 
26018 /* Like def_builtin, but also marks the function decl "const".  */
26019 
26020 static inline tree
26021 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26022 		   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26023 {
26024   tree decl = def_builtin (mask, name, tcode, code);
26025   if (decl)
26026     TREE_READONLY (decl) = 1;
26027   else
26028     ix86_builtins_isa[(int) code].const_p = true;
26029 
26030   return decl;
26031 }
26032 
26033 /* Add any new builtin functions for a given ISA that may not have been
26034    declared.  This saves a bit of space compared to adding all of the
26035    declarations to the tree up front, whether or not they are used.  */
26036 
26037 static void
26038 ix86_add_new_builtins (HOST_WIDE_INT isa)
26039 {
26040   int i;
26041 
26042   for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26043     {
26044       if ((ix86_builtins_isa[i].isa & isa) != 0
26045 	  && ix86_builtins_isa[i].set_and_not_built_p)
26046 	{
26047 	  tree decl, type;
26048 
26049 	  /* Don't define the builtin again.  */
26050 	  ix86_builtins_isa[i].set_and_not_built_p = false;
26051 
26052 	  type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26053 	  decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26054 						 type, i, BUILT_IN_MD, NULL,
26055 						 NULL_TREE);
26056 
26057 	  ix86_builtins[i] = decl;
26058 	  if (ix86_builtins_isa[i].const_p)
26059 	    TREE_READONLY (decl) = 1;
26060 	}
26061     }
26062 }
26063 
26064 /* Bits for builtin_description.flag.  */
26065 
26066 /* Set when we don't support the comparison natively, and should
26067    swap the comparison operands in order to support it.  */
26068 #define BUILTIN_DESC_SWAP_OPERANDS	1
26069 
26070 struct builtin_description
26071 {
26072   const HOST_WIDE_INT mask;
26073   const enum insn_code icode;
26074   const char *const name;
26075   const enum ix86_builtins code;
26076   const enum rtx_code comparison;
26077   const int flag;
26078 };
26079 
26080 static const struct builtin_description bdesc_comi[] =
26081 {
26082   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26083   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26084   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26085   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26086   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26087   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26088   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26089   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26090   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26091   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26092   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26093   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26094   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26095   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26096   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26097   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26098   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26099   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26100   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26101   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26102   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26103   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26104   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26105   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26106 };
26107 
26108 static const struct builtin_description bdesc_pcmpestr[] =
26109 {
26110   /* SSE4.2 */
26111   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26112   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26113   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26114   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26115   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26116   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26117   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26118 };
26119 
26120 static const struct builtin_description bdesc_pcmpistr[] =
26121 {
26122   /* SSE4.2 */
26123   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26124   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26125   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26126   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26127   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26128   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26129   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26130 };
26131 
26132 /* Special builtins with variable number of arguments.  */
26133 static const struct builtin_description bdesc_special_args[] =
26134 {
26135   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26136   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26137   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26138 
26139   /* MMX */
26140   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26141 
26142   /* 3DNow! */
26143   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26144 
26145   /* SSE */
26146   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26147   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26148   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26149 
26150   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26151   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26152   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26153   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26154 
26155   /* SSE or 3DNow!A  */
26156   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26157   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26158 
26159   /* SSE2 */
26160   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26161   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26162   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26163   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26164   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26165   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26166   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26167   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26168   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26169   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26170 
26171   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26172   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26173 
26174   /* SSE3 */
26175   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26176 
26177   /* SSE4.1 */
26178   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26179 
26180   /* SSE4A */
26181   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26182   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26183 
26184   /* AVX */
26185   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26186   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26187 
26188   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26189   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26190   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26191   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26192   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26193 
26194   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26195   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26196   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26197   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26198   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26199   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26200   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26201 
26202   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26203   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26204   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26205 
26206   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26207   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26208   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26209   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26210   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26211   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26212   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26213   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26214 
26215   /* AVX2 */
26216   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26217   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26218   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26219   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26220   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26221   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26222   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26223   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26224   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26225 
26226   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26227   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26228   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26229   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26230   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26231   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26232 
26233   /* FSGSBASE */
26234   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26235   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26236   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26237   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26238   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26239   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26240   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26241   { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26242 };
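
/* The "special" builtins above differ from the bdesc_args table below
   in that their prototypes involve pointer (memory) operands or no
   operands at all -- loads, stores, fences and the like -- so they are
   expanded via ix86_expand_special_args_builtin rather than the
   generic register-argument path of ix86_expand_args_builtin.  */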
26243 
26244 /* Builtins with variable number of arguments.  */
26245 static const struct builtin_description bdesc_args[] =
26246 {
26247   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26248   { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26249   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26250   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26251   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26252   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26253   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26254 
26255   /* MMX */
26256   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26257   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26258   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26259   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26260   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26261   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26262 
26263   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26264   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26265   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26266   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26267   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26268   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26269   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26270   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26271 
26272   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26273   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26274 
26275   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26276   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26277   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26278   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26279 
26280   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26281   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26282   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26283   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26284   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26285   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26286 
26287   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26288   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26289   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26290   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26291   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26292   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26293 
26294   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26295   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26296   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26297 
26298   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26299 
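  /* For the shift entries below, the _COUNT suffix marks the last
     operand as a shift count: the *_SI_COUNT signatures take the count
     as an integer (the psllwi/pslldi/psllqi style builtins), while the
     vector *_COUNT signatures take it in a vector register (the
     psllw/pslld/psllq style builtins).  */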
26300   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26301   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26302   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26303   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26304   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26305   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26306 
26307   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26308   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26309   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26310   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26311   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26312   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26313 
26314   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26315   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26316   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26317   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26318 
26319   /* 3DNow! */
26320   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26321   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26322   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26323   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26324 
26325   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26326   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26327   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26328   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26329   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26330   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26331   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26332   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26333   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26334   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26335   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26336   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26337   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26338   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26339   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26340 
26341   /* 3DNow!A */
26342   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26343   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26344   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26345   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26346   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26347   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26348 
26349   /* SSE */
26350   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26351   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26352   { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26353   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26354   { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26355   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26356   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26357   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26358   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26359   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26360   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26361   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26362 
26363   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26364 
26365   { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26366   { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26367   { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26368   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26369   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3,  "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26370   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3,  "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26371   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3,  "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26372   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3,  "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26373 
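  /* For the compare entries below, the fifth field is the rtx
     comparison code passed to the maskcmp pattern, and a _SWAP suffix
     on the signature asks the expander to swap the two inputs, which
     is how the cmpgt/cmpge forms are realized in terms of LT/LE.  */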
26374   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26375   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26376   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26377   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26378   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26379   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26380   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26381   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26382   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26383   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26384   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26385   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26386   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26387   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26388   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26389   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26390   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26391   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26392   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26393   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26394   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26395   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26396 
26397   { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26398   { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26399   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26400   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26401 
26402   { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26403   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3,  "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26404   { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26405   { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3,  "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26406 
26407   { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3,  "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26408 
26409   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26410   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26411   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26412   { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26413   { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26414 
26415   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26416   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26417   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26418 
26419   { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26420 
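  /* The _VEC_MERGE signatures below take a single vector argument that
     is used both as the operand of the scalar operation and as the
     source of the untouched upper elements, matching the
     sqrtss/rsqrtss/rcpss behaviour of replacing only the low
     element.  */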
26421   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26422   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26423   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26424 
26425   /* SSE MMX or 3DNow!A */
26426   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26427   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26428   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26429 
26430   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26431   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26432   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26433   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26434 
26435   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26436   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26437 
26438   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26439 
26440   /* SSE2 */
26441   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26442 
26443   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF  },
26444   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26445   { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26446   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26447   { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26448 
26449   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26450   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26451   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26452   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26453   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26454 
26455   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26456 
26457   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26458   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26459   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26460   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26461 
26462   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26463   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26464   { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26465 
26466   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26467   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26468   { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26469   { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26470   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3,  "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26471   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3,  "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26472   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3,  "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26473   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3,  "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26474 
26475   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26476   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26477   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26478   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26479   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
26480   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26481   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26482   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26483   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26484   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26485   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26486   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26487   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26488   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26489   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26490   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26491   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26492   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26493   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26494   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26495 
26496   { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26497   { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26498   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26499   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26500 
26501   { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26502   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3,  "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26503   { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26504   { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3,  "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26505 
26506   { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3,  "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26507 
26508   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26509   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26510   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26511 
26512   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26513 
26514   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26515   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26516   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26517   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26518   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26519   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26520   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26521   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26522 
26523   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26524   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26525   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26526   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26527   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26528   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26529   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26530   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26531 
26532   { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26533   { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26534 
26535   { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26536   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26537   { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26538   { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26539 
26540   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26541   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26542 
26543   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26544   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26545   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
26546   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26547   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26548   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
26549 
26550   { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26551   { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26552   { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26553   { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26554 
26555   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26556   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI  },
26557   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN,  (int) V4SI_FTYPE_V4SI_V4SI },
26558   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26559   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26560   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26561   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26562   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26563 
26564   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26565   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26566   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26567 
26568   { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26569   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26570 
26571   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26572   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26573 
26574   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26575 
26576   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26577   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26578   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26579   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26580 
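  /* The _INT_CONVERT suffix indicates that the insn pattern operates
     in a different mode than the builtin's prototype -- here the
     whole-register byte shifts work in V1TImode internally while the
     builtin is declared with V2DI -- so the operands are reinterpreted
     at expansion time.  */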
26581   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26582   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26583   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26584   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26585   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26586   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26587   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26588 
26589   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26590   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26591   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26592   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26593   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26594   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26595   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26596 
26597   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26598   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26599   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26600   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26601 
26602   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26603   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26604   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26605 
26606   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26607 
26608   { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26609   { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26610 
26611   { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26612 
26613   /* SSE2 MMX */
26614   { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26615   { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26616 
26617   /* SSE3 */
26618   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26619   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26620 
26621   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26622   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26623   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26624   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26625   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26626   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26627 
26628   /* SSSE3 */
26629   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26630   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26631   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26632   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26633   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26634   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26635 
26636   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26637   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26638   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26639   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26640   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26641   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26642   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26643   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26644   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26645   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26646   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26647   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26648   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26649   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26650   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26651   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26652   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26653   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26654   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26655   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26656   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26657   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26658   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26659   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
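
  /* For reference: __builtin_ia32_pshufb128 is what <tmmintrin.h>'s
     _mm_shuffle_epi8 wraps; byte i of the result is source byte
     (mask[i] & 0x0f), or zero when the top bit of mask[i] is set.  */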
26660 
26661   /* SSSE3.  */
26662   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26663   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
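
  /* Note that the immediate of the palignr builtins is a bit count, not a
     byte count; the <tmmintrin.h> wrapper _mm_alignr_epi8 (x, y, n)
     expands roughly to __builtin_ia32_palignr128 (x, y, n * 8).  */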
26664 
26665   /* SSE4.1 */
26666   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26667   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26668   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26669   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26670   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26671   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26672   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26673   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26674   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26675   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26676 
26677   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26678   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26679   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26680   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26681   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26682   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26683   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26684   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26685   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26686   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26687   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26688   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26689   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
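
  /* The pmovsx*/pmovzx* entries above take the full 128-bit source (hence
     the V16QI/V8HI/V4SI argument types) and widen only its low elements;
     e.g. <smmintrin.h>'s _mm_cvtepi8_epi16 (x) is roughly
	 __builtin_ia32_pmovsxbw128 ((__v16qi) x).  */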
26690 
26691   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26692   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26693   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26694   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26695   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26696   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26697   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26698   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26699   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26700   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26701   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26702   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26703 
26704   /* SSE4.1 */
26705   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26706   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26707   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26708   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26709 
26710   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26711   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26712   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26713   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26714 
26715   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26716   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26717 
26718   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26719   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26720 
26721   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26722   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26723   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26724   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26725 
26726   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26727   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26728 
26729   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26730   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26731 
26732   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26733   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26734   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
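
  /* In the rounding entries above the comparison field is reused to carry
     the ROUND_* control, and in the ptest entries EQ/LTU/GTU select the
     ZF, CF and "neither" forms of PTEST; e.g. <smmintrin.h>'s
     _mm_testz_si128 (m, v) is roughly __builtin_ia32_ptestz128 (m, v).  */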
26735 
26736   /* SSE4.2 */
26737   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26738   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26739   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26740   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26741   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
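
  /* The crc32 builtins accumulate a CRC-32C value; <smmintrin.h> exposes
     them as _mm_crc32_u8/u16/u32 (and _mm_crc32_u64 in 64-bit mode),
     e.g. crc = __builtin_ia32_crc32si (crc, data).  */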
26742 
26743   /* SSE4A */
26744   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26745   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26746   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26747   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26748 
26749   /* AES */
26750   { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26751   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26752 
26753   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26754   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26755   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26756   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26757 
26758   /* PCLMUL */
26759   { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26760 
26761   /* AVX */
26762   { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26763   { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26764   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26765   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26766   { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26767   { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26768   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26769   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26770   { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26771   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26772   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26773   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26774   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26775   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26776   { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26777   { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26778   { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26779   { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26780   { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26781   { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26782   { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26783   { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26784   { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26785   { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26786   { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26787   { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26788 
26789   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26790   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26791   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26792   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26793 
26794   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26795   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26796   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26797   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26798   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26799   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26800   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26801   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26802   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26803   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26804   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26805   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26806   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26807   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26808   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26809   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26810   { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26811   { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26812   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26813   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26814   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26815   { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26816   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26817   { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26818   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26819   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26820   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26821   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26822   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26823   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26824   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26825   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26826   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26827   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
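
  /* The cmpsd/cmpss/cmppd/cmpps entries above take a 5-bit predicate
     immediate (the _CMP_* constants from <avxintrin.h>); e.g.
     _mm256_cmp_ps (a, b, _CMP_LT_OS) is roughly
	 __builtin_ia32_cmpps256 ((__v8sf) a, (__v8sf) b, _CMP_LT_OS).  */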
26828 
26829   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26830   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26831   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26832 
26833   { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26834   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26835   { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26836   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26837   { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26838 
26839   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26840 
26841   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26842   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26843 
26844   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26845   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26846   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26847   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26848 
26849   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26850   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26851 
26852   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26853   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26854 
26855   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26856   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26857   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26858   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26859 
26860   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26861   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26862 
26863   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26864   { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26865 
26866   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,  "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26867   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,  "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26868   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256,  "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26869   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256,  "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26870 
26871   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26872   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26873   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26874   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26875   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26876   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26877 
26878   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26879   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26880   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26881   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26882   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26883   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26884   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26885   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26886   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26887   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26888   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26889   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26890   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26891   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26892   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26893 
26894   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF  },
26895   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26896 
26897   { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3,  "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26898   { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3,  "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26899 
26900   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26901 
26902   /* AVX2 */
26903   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26904   { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26905   { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26906   { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26907   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256",  IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26908   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256",  IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26909   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256",  IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26910   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256",  IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26911   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26912   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26913   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26914   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26915   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26916   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26917   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26918   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26919   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26920   { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26921   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26922   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256",  IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26923   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256",  IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26924   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26925   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26926   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26927   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26928   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
26929   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
26930   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26931   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26932   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
26933   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
26934   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26935   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26936   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26937   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26938   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26939   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26940   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26941   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26942   { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26943   { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26944   { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26945   { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26946   { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26947   { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26948   { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26949   { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26950   { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26951   { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26952   { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26953   { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26954   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26955   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26956   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2  , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26957   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2  , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26958   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2  , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26959   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2  , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26960   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2  , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26961   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26962   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2  , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26963   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2  , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26964   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2  , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26965   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2  , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26966   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2  , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26967   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3  , "__builtin_ia32_pmuldq256"  , IX86_BUILTIN_PMULDQ256  , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26968   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26969   { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26970   { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256"  , IX86_BUILTIN_PMULHW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26971   { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256"  , IX86_BUILTIN_PMULLW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26972   { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256"  , IX86_BUILTIN_PMULLD256  , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26973   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3  , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26974   { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26975   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26976   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26977   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26978   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26979   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26980   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26981   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26982   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26983   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26984   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26985   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26986   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26987   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26988   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26989   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26990   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26991   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26992   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26993   { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26994   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26995   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26996   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26997   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26998   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26999   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27000   { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27001   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27002   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27003   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27004   { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27005   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27006   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27007   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27008   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27009   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27010   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI  },
27011   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN,  (int) V8SI_FTYPE_V8SI_V8SI },
27012   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27013   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27014   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27015   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27016   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27017   { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27018   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27019   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27020   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27021   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27022   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27023   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27024   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27025   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27026   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27027   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27028   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27029   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27030   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27031   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27032   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27033   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27034   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27035   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27036   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27037   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27038   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27039   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27040   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27041   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27042   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27043   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27044   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27045   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27046   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27047   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27048   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
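
  /* The psllv/psrav/psrlv entries above implement the AVX2 per-element
     variable shifts, where each lane is shifted by the count held in the
     corresponding lane of the second operand; <avx2intrin.h> exposes them
     as _mm256_sllv_epi32, _mm256_srav_epi32, _mm256_srlv_epi32 and friends.  */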
27049 
27050   { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt,   "__builtin_clzs",   IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16 },
27051 
27052   /* BMI */
27053   { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27054   { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27055   { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2,       "__builtin_ctzs",           IX86_BUILTIN_CTZS,    UNKNOWN, (int) UINT16_FTYPE_UINT16 },
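
  /* __builtin_ia32_bextr_u32/_u64 take the packed BEXTR control word
     (start in bits 0-7, length in bits 8-15); the <bmiintrin.h> wrapper
     _bextr_u32 (x, start, len) roughly passes (start | (len << 8)).  */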
27056 
27057   /* TBM */
27058   { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27059   { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27060 
27061   /* F16C */
27062   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27063   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27064   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27065   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27066 
27067   /* BMI2 */
27068   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27069   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27070   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27071   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27072   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27073   { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27074 };
27075 
27076 /* FMA4 and XOP.  */
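/* Each MULTI_ARG_* macro below is simply an alias for an
   ix86_builtin_func_type; the suffix encodes the operand count and element
   type.  For example, MULTI_ARG_3_SF is the three-operand V4SF signature,
   the "2" variants (MULTI_ARG_3_SF2 and friends) are the 256-bit forms,
   _IMM takes an immediate count, _CMP is a vector comparison and _TF a
   PCOM_TRUE/PCOM_FALSE pseudo-comparison (see bdesc_multi_arg and
   ix86_expand_multi_arg_builtin below).  */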
27077 #define MULTI_ARG_4_DF2_DI_I	V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27078 #define MULTI_ARG_4_DF2_DI_I1	V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27079 #define MULTI_ARG_4_SF2_SI_I	V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27080 #define MULTI_ARG_4_SF2_SI_I1	V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27081 #define MULTI_ARG_3_SF		V4SF_FTYPE_V4SF_V4SF_V4SF
27082 #define MULTI_ARG_3_DF		V2DF_FTYPE_V2DF_V2DF_V2DF
27083 #define MULTI_ARG_3_SF2		V8SF_FTYPE_V8SF_V8SF_V8SF
27084 #define MULTI_ARG_3_DF2		V4DF_FTYPE_V4DF_V4DF_V4DF
27085 #define MULTI_ARG_3_DI		V2DI_FTYPE_V2DI_V2DI_V2DI
27086 #define MULTI_ARG_3_SI		V4SI_FTYPE_V4SI_V4SI_V4SI
27087 #define MULTI_ARG_3_SI_DI	V4SI_FTYPE_V4SI_V4SI_V2DI
27088 #define MULTI_ARG_3_HI		V8HI_FTYPE_V8HI_V8HI_V8HI
27089 #define MULTI_ARG_3_HI_SI	V8HI_FTYPE_V8HI_V8HI_V4SI
27090 #define MULTI_ARG_3_QI		V16QI_FTYPE_V16QI_V16QI_V16QI
27091 #define MULTI_ARG_3_DI2		V4DI_FTYPE_V4DI_V4DI_V4DI
27092 #define MULTI_ARG_3_SI2		V8SI_FTYPE_V8SI_V8SI_V8SI
27093 #define MULTI_ARG_3_HI2		V16HI_FTYPE_V16HI_V16HI_V16HI
27094 #define MULTI_ARG_3_QI2		V32QI_FTYPE_V32QI_V32QI_V32QI
27095 #define MULTI_ARG_2_SF		V4SF_FTYPE_V4SF_V4SF
27096 #define MULTI_ARG_2_DF		V2DF_FTYPE_V2DF_V2DF
27097 #define MULTI_ARG_2_DI		V2DI_FTYPE_V2DI_V2DI
27098 #define MULTI_ARG_2_SI		V4SI_FTYPE_V4SI_V4SI
27099 #define MULTI_ARG_2_HI		V8HI_FTYPE_V8HI_V8HI
27100 #define MULTI_ARG_2_QI		V16QI_FTYPE_V16QI_V16QI
27101 #define MULTI_ARG_2_DI_IMM	V2DI_FTYPE_V2DI_SI
27102 #define MULTI_ARG_2_SI_IMM	V4SI_FTYPE_V4SI_SI
27103 #define MULTI_ARG_2_HI_IMM	V8HI_FTYPE_V8HI_SI
27104 #define MULTI_ARG_2_QI_IMM	V16QI_FTYPE_V16QI_SI
27105 #define MULTI_ARG_2_DI_CMP	V2DI_FTYPE_V2DI_V2DI_CMP
27106 #define MULTI_ARG_2_SI_CMP	V4SI_FTYPE_V4SI_V4SI_CMP
27107 #define MULTI_ARG_2_HI_CMP	V8HI_FTYPE_V8HI_V8HI_CMP
27108 #define MULTI_ARG_2_QI_CMP	V16QI_FTYPE_V16QI_V16QI_CMP
27109 #define MULTI_ARG_2_SF_TF	V4SF_FTYPE_V4SF_V4SF_TF
27110 #define MULTI_ARG_2_DF_TF	V2DF_FTYPE_V2DF_V2DF_TF
27111 #define MULTI_ARG_2_DI_TF	V2DI_FTYPE_V2DI_V2DI_TF
27112 #define MULTI_ARG_2_SI_TF	V4SI_FTYPE_V4SI_V4SI_TF
27113 #define MULTI_ARG_2_HI_TF	V8HI_FTYPE_V8HI_V8HI_TF
27114 #define MULTI_ARG_2_QI_TF	V16QI_FTYPE_V16QI_V16QI_TF
27115 #define MULTI_ARG_1_SF		V4SF_FTYPE_V4SF
27116 #define MULTI_ARG_1_DF		V2DF_FTYPE_V2DF
27117 #define MULTI_ARG_1_SF2		V8SF_FTYPE_V8SF
27118 #define MULTI_ARG_1_DF2		V4DF_FTYPE_V4DF
27119 #define MULTI_ARG_1_DI		V2DI_FTYPE_V2DI
27120 #define MULTI_ARG_1_SI		V4SI_FTYPE_V4SI
27121 #define MULTI_ARG_1_HI		V8HI_FTYPE_V8HI
27122 #define MULTI_ARG_1_QI		V16QI_FTYPE_V16QI
27123 #define MULTI_ARG_1_SI_DI	V2DI_FTYPE_V4SI
27124 #define MULTI_ARG_1_HI_DI	V2DI_FTYPE_V8HI
27125 #define MULTI_ARG_1_HI_SI	V4SI_FTYPE_V8HI
27126 #define MULTI_ARG_1_QI_DI	V2DI_FTYPE_V16QI
27127 #define MULTI_ARG_1_QI_SI	V4SI_FTYPE_V16QI
27128 #define MULTI_ARG_1_QI_HI	V8HI_FTYPE_V16QI
27129 
27130 static const struct builtin_description bdesc_multi_arg[] =
27131 {
27132   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27133     "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27134     UNKNOWN, (int)MULTI_ARG_3_SF },
27135   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27136     "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27137     UNKNOWN, (int)MULTI_ARG_3_DF },
27138 
27139   { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27140     "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27141     UNKNOWN, (int)MULTI_ARG_3_SF },
27142   { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27143     "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27144     UNKNOWN, (int)MULTI_ARG_3_DF },
27145 
27146   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27147     "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27148     UNKNOWN, (int)MULTI_ARG_3_SF },
27149   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27150     "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27151     UNKNOWN, (int)MULTI_ARG_3_DF },
27152   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27153     "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27154     UNKNOWN, (int)MULTI_ARG_3_SF2 },
27155   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27156     "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27157     UNKNOWN, (int)MULTI_ARG_3_DF2 },
27158 
27159   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27160     "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27161     UNKNOWN, (int)MULTI_ARG_3_SF },
27162   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27163     "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27164     UNKNOWN, (int)MULTI_ARG_3_DF },
27165   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27166     "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27167     UNKNOWN, (int)MULTI_ARG_3_SF2 },
27168   { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27169     "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27170     UNKNOWN, (int)MULTI_ARG_3_DF2 },
27171 
27172   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov",      IX86_BUILTIN_VPCMOV,	 UNKNOWN,      (int)MULTI_ARG_3_DI },
27173   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN,      (int)MULTI_ARG_3_DI },
27174   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si,        "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN,      (int)MULTI_ARG_3_SI },
27175   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi,        "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN,      (int)MULTI_ARG_3_HI },
27176   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi,       "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN,      (int)MULTI_ARG_3_QI },
27177   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df,        "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN,      (int)MULTI_ARG_3_DF },
27178   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf,        "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN,      (int)MULTI_ARG_3_SF },
27179 
27180   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov256",       IX86_BUILTIN_VPCMOV256,       UNKNOWN,      (int)MULTI_ARG_3_DI2 },
27181   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov_v4di256",  IX86_BUILTIN_VPCMOV_V4DI256,  UNKNOWN,      (int)MULTI_ARG_3_DI2 },
27182   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256,        "__builtin_ia32_vpcmov_v8si256",  IX86_BUILTIN_VPCMOV_V8SI256,  UNKNOWN,      (int)MULTI_ARG_3_SI2 },
27183   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256,       "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN,      (int)MULTI_ARG_3_HI2 },
27184   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256,       "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN,      (int)MULTI_ARG_3_QI2 },
27185   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256,        "__builtin_ia32_vpcmov_v4df256",  IX86_BUILTIN_VPCMOV_V4DF256,  UNKNOWN,      (int)MULTI_ARG_3_DF2 },
27186   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256,        "__builtin_ia32_vpcmov_v8sf256",  IX86_BUILTIN_VPCMOV_V8SF256,  UNKNOWN,      (int)MULTI_ARG_3_SF2 },
27187 
27188   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm,             "__builtin_ia32_vpperm",      IX86_BUILTIN_VPPERM,      UNKNOWN,      (int)MULTI_ARG_3_QI },
27189 
27190   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww,          "__builtin_ia32_vpmacssww",   IX86_BUILTIN_VPMACSSWW,   UNKNOWN,      (int)MULTI_ARG_3_HI },
27191   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww,           "__builtin_ia32_vpmacsww",    IX86_BUILTIN_VPMACSWW,    UNKNOWN,      (int)MULTI_ARG_3_HI },
27192   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd,          "__builtin_ia32_vpmacsswd",   IX86_BUILTIN_VPMACSSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27193   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd,           "__builtin_ia32_vpmacswd",    IX86_BUILTIN_VPMACSWD,    UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27194   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd,          "__builtin_ia32_vpmacssdd",   IX86_BUILTIN_VPMACSSDD,   UNKNOWN,      (int)MULTI_ARG_3_SI },
27195   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd,           "__builtin_ia32_vpmacsdd",    IX86_BUILTIN_VPMACSDD,    UNKNOWN,      (int)MULTI_ARG_3_SI },
27196   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql,         "__builtin_ia32_vpmacssdql",  IX86_BUILTIN_VPMACSSDQL,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27197   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh,         "__builtin_ia32_vpmacssdqh",  IX86_BUILTIN_VPMACSSDQH,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27198   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql,          "__builtin_ia32_vpmacsdql",   IX86_BUILTIN_VPMACSDQL,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27199   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh,          "__builtin_ia32_vpmacsdqh",   IX86_BUILTIN_VPMACSDQH,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
27200   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd,         "__builtin_ia32_vpmadcsswd",  IX86_BUILTIN_VPMADCSSWD,  UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27201   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd,          "__builtin_ia32_vpmadcswd",   IX86_BUILTIN_VPMADCSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
27202 
27203   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3,        "__builtin_ia32_vprotq",      IX86_BUILTIN_VPROTQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
27204   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3,        "__builtin_ia32_vprotd",      IX86_BUILTIN_VPROTD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
27205   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3,        "__builtin_ia32_vprotw",      IX86_BUILTIN_VPROTW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
27206   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3,       "__builtin_ia32_vprotb",      IX86_BUILTIN_VPROTB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
27207   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3,         "__builtin_ia32_vprotqi",     IX86_BUILTIN_VPROTQ_IMM,  UNKNOWN,      (int)MULTI_ARG_2_DI_IMM },
27208   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3,         "__builtin_ia32_vprotdi",     IX86_BUILTIN_VPROTD_IMM,  UNKNOWN,      (int)MULTI_ARG_2_SI_IMM },
27209   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3,         "__builtin_ia32_vprotwi",     IX86_BUILTIN_VPROTW_IMM,  UNKNOWN,      (int)MULTI_ARG_2_HI_IMM },
27210   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3,        "__builtin_ia32_vprotbi",     IX86_BUILTIN_VPROTB_IMM,  UNKNOWN,      (int)MULTI_ARG_2_QI_IMM },
27211   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3,         "__builtin_ia32_vpshaq",      IX86_BUILTIN_VPSHAQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
27212   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3,         "__builtin_ia32_vpshad",      IX86_BUILTIN_VPSHAD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
27213   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3,         "__builtin_ia32_vpshaw",      IX86_BUILTIN_VPSHAW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
27214   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3,        "__builtin_ia32_vpshab",      IX86_BUILTIN_VPSHAB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
27215   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3,         "__builtin_ia32_vpshlq",      IX86_BUILTIN_VPSHLQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
27216   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3,         "__builtin_ia32_vpshld",      IX86_BUILTIN_VPSHLD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
27217   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3,         "__builtin_ia32_vpshlw",      IX86_BUILTIN_VPSHLW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
27218   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3,        "__builtin_ia32_vpshlb",      IX86_BUILTIN_VPSHLB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
27219 
27220   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2,       "__builtin_ia32_vfrczss",     IX86_BUILTIN_VFRCZSS,     UNKNOWN,      (int)MULTI_ARG_1_SF },
27221   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2,       "__builtin_ia32_vfrczsd",     IX86_BUILTIN_VFRCZSD,     UNKNOWN,      (int)MULTI_ARG_1_DF },
27222   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2,         "__builtin_ia32_vfrczps",     IX86_BUILTIN_VFRCZPS,     UNKNOWN,      (int)MULTI_ARG_1_SF },
27223   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2,         "__builtin_ia32_vfrczpd",     IX86_BUILTIN_VFRCZPD,     UNKNOWN,      (int)MULTI_ARG_1_DF },
27224   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2,         "__builtin_ia32_vfrczps256",  IX86_BUILTIN_VFRCZPS256,  UNKNOWN,      (int)MULTI_ARG_1_SF2 },
27225   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2,         "__builtin_ia32_vfrczpd256",  IX86_BUILTIN_VFRCZPD256,  UNKNOWN,      (int)MULTI_ARG_1_DF2 },
27226 
27227   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw,           "__builtin_ia32_vphaddbw",    IX86_BUILTIN_VPHADDBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
27228   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd,           "__builtin_ia32_vphaddbd",    IX86_BUILTIN_VPHADDBD,    UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
27229   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq,           "__builtin_ia32_vphaddbq",    IX86_BUILTIN_VPHADDBQ,    UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
27230   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd,           "__builtin_ia32_vphaddwd",    IX86_BUILTIN_VPHADDWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
27231   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq,           "__builtin_ia32_vphaddwq",    IX86_BUILTIN_VPHADDWQ,    UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
27232   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq,           "__builtin_ia32_vphadddq",    IX86_BUILTIN_VPHADDDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
27233   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw,          "__builtin_ia32_vphaddubw",   IX86_BUILTIN_VPHADDUBW,   UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
27234   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd,          "__builtin_ia32_vphaddubd",   IX86_BUILTIN_VPHADDUBD,   UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
27235   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq,          "__builtin_ia32_vphaddubq",   IX86_BUILTIN_VPHADDUBQ,   UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
27236   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd,          "__builtin_ia32_vphadduwd",   IX86_BUILTIN_VPHADDUWD,   UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
27237   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq,          "__builtin_ia32_vphadduwq",   IX86_BUILTIN_VPHADDUWQ,   UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
27238   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq,          "__builtin_ia32_vphaddudq",   IX86_BUILTIN_VPHADDUDQ,   UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
27239   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw,           "__builtin_ia32_vphsubbw",    IX86_BUILTIN_VPHSUBBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
27240   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd,           "__builtin_ia32_vphsubwd",    IX86_BUILTIN_VPHSUBWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
27241   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq,           "__builtin_ia32_vphsubdq",    IX86_BUILTIN_VPHSUBDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
27242 
27243   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomeqb",    IX86_BUILTIN_VPCOMEQB,    EQ,           (int)MULTI_ARG_2_QI_CMP },
27244   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneb",    IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
27245   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneqb",   IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
27246   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomltb",    IX86_BUILTIN_VPCOMLTB,    LT,           (int)MULTI_ARG_2_QI_CMP },
27247   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomleb",    IX86_BUILTIN_VPCOMLEB,    LE,           (int)MULTI_ARG_2_QI_CMP },
27248   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgtb",    IX86_BUILTIN_VPCOMGTB,    GT,           (int)MULTI_ARG_2_QI_CMP },
27249   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgeb",    IX86_BUILTIN_VPCOMGEB,    GE,           (int)MULTI_ARG_2_QI_CMP },
27250 
27251   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomeqw",    IX86_BUILTIN_VPCOMEQW,    EQ,           (int)MULTI_ARG_2_HI_CMP },
27252   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomnew",    IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
27253   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomneqw",   IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
27254   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomltw",    IX86_BUILTIN_VPCOMLTW,    LT,           (int)MULTI_ARG_2_HI_CMP },
27255   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomlew",    IX86_BUILTIN_VPCOMLEW,    LE,           (int)MULTI_ARG_2_HI_CMP },
27256   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgtw",    IX86_BUILTIN_VPCOMGTW,    GT,           (int)MULTI_ARG_2_HI_CMP },
27257   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgew",    IX86_BUILTIN_VPCOMGEW,    GE,           (int)MULTI_ARG_2_HI_CMP },
27258 
27259   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomeqd",    IX86_BUILTIN_VPCOMEQD,    EQ,           (int)MULTI_ARG_2_SI_CMP },
27260   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomned",    IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
27261   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomneqd",   IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
27262   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomltd",    IX86_BUILTIN_VPCOMLTD,    LT,           (int)MULTI_ARG_2_SI_CMP },
27263   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomled",    IX86_BUILTIN_VPCOMLED,    LE,           (int)MULTI_ARG_2_SI_CMP },
27264   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomgtd",    IX86_BUILTIN_VPCOMGTD,    GT,           (int)MULTI_ARG_2_SI_CMP },
27265   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomged",    IX86_BUILTIN_VPCOMGED,    GE,           (int)MULTI_ARG_2_SI_CMP },
27266 
27267   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomeqq",    IX86_BUILTIN_VPCOMEQQ,    EQ,           (int)MULTI_ARG_2_DI_CMP },
27268   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneq",    IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
27269   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneqq",   IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
27270   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomltq",    IX86_BUILTIN_VPCOMLTQ,    LT,           (int)MULTI_ARG_2_DI_CMP },
27271   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomleq",    IX86_BUILTIN_VPCOMLEQ,    LE,           (int)MULTI_ARG_2_DI_CMP },
27272   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgtq",    IX86_BUILTIN_VPCOMGTQ,    GT,           (int)MULTI_ARG_2_DI_CMP },
27273   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgeq",    IX86_BUILTIN_VPCOMGEQ,    GE,           (int)MULTI_ARG_2_DI_CMP },
27274 
27275   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb",   IX86_BUILTIN_VPCOMEQUB,   EQ,           (int)MULTI_ARG_2_QI_CMP },
27276   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub",   IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
27277   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb",  IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
27278   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub",   IX86_BUILTIN_VPCOMLTUB,   LTU,          (int)MULTI_ARG_2_QI_CMP },
27279   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub",   IX86_BUILTIN_VPCOMLEUB,   LEU,          (int)MULTI_ARG_2_QI_CMP },
27280   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub",   IX86_BUILTIN_VPCOMGTUB,   GTU,          (int)MULTI_ARG_2_QI_CMP },
27281   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub",   IX86_BUILTIN_VPCOMGEUB,   GEU,          (int)MULTI_ARG_2_QI_CMP },
27282 
27283   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw",   IX86_BUILTIN_VPCOMEQUW,   EQ,           (int)MULTI_ARG_2_HI_CMP },
27284   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw",   IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
27285   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw",  IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
27286   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomltuw",   IX86_BUILTIN_VPCOMLTUW,   LTU,          (int)MULTI_ARG_2_HI_CMP },
27287   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomleuw",   IX86_BUILTIN_VPCOMLEUW,   LEU,          (int)MULTI_ARG_2_HI_CMP },
27288   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgtuw",   IX86_BUILTIN_VPCOMGTUW,   GTU,          (int)MULTI_ARG_2_HI_CMP },
27289   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgeuw",   IX86_BUILTIN_VPCOMGEUW,   GEU,          (int)MULTI_ARG_2_HI_CMP },
27290 
27291   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd",   IX86_BUILTIN_VPCOMEQUD,   EQ,           (int)MULTI_ARG_2_SI_CMP },
27292   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud",   IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
27293   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd",  IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
27294   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomltud",   IX86_BUILTIN_VPCOMLTUD,   LTU,          (int)MULTI_ARG_2_SI_CMP },
27295   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomleud",   IX86_BUILTIN_VPCOMLEUD,   LEU,          (int)MULTI_ARG_2_SI_CMP },
27296   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgtud",   IX86_BUILTIN_VPCOMGTUD,   GTU,          (int)MULTI_ARG_2_SI_CMP },
27297   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgeud",   IX86_BUILTIN_VPCOMGEUD,   GEU,          (int)MULTI_ARG_2_SI_CMP },
27298 
27299   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq",   IX86_BUILTIN_VPCOMEQUQ,   EQ,           (int)MULTI_ARG_2_DI_CMP },
27300   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq",   IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
27301   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq",  IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
27302   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomltuq",   IX86_BUILTIN_VPCOMLTUQ,   LTU,          (int)MULTI_ARG_2_DI_CMP },
27303   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomleuq",   IX86_BUILTIN_VPCOMLEUQ,   LEU,          (int)MULTI_ARG_2_DI_CMP },
27304   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgtuq",   IX86_BUILTIN_VPCOMGTUQ,   GTU,          (int)MULTI_ARG_2_DI_CMP },
27305   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgeuq",   IX86_BUILTIN_VPCOMGEUQ,   GEU,          (int)MULTI_ARG_2_DI_CMP },
27306 
27307   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
27308   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
27309   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
27310   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
27311   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
27312   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
27313   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
27314   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
27315 
27316   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueb",  IX86_BUILTIN_VPCOMTRUEB,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
27317   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtruew",  IX86_BUILTIN_VPCOMTRUEW,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
27318   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrued",  IX86_BUILTIN_VPCOMTRUED,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
27319   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueq",  IX86_BUILTIN_VPCOMTRUEQ,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
27320   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
27321   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
27322   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
27323   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
27324 
27325   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3,     "__builtin_ia32_vpermil2pd",  IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27326   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3,     "__builtin_ia32_vpermil2ps",  IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27327   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3,     "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27328   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3,     "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27329 
27330 };
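/* The table above is registered by the loop at the end of
   ix86_init_mmx_sse_builtins and expanded by ix86_expand_multi_arg_builtin;
   the comparison rows share a single maskcmp insn pattern and differ only
   in the rtx code recorded in the fifth field.  As an illustrative example
   (roughly what the xopintrin.h wrappers emit), the VPERMIL2PD entry is
   reached from user code such as

       __m128d r = (__m128d) __builtin_ia32_vpermil2pd ((__v2df) a,
                                                        (__v2df) b,
                                                        (__v2di) sel, 0);

   where the trailing operand must be a 2-bit immediate (enforced in
   ix86_expand_multi_arg_builtin).  */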
27331 
27332 /* TM vector builtins.  */
27333 
27334 /* Reuse the existing x86-specific `struct builtin_description' rather
27335    than defining a new one.  Add casts to make the fields fit.  */
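/* The names follow the libitm ABI: _ITM_{R,W}M{64,128,256} with the
   access-kind modifiers RaR/RaW/RfW (read-after-read, read-after-write,
   read-for-write) and WaR/WaW, plus _ITM_LM* for logging.  The
   "__builtin_" prefix is stripped again when the entries are registered
   in ix86_init_tm_builtins below.  */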
27336 static const struct builtin_description bdesc_tm[] =
27337 {
27338   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27339   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27340   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27341   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27342   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27343   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27344   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27345 
27346   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27347   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27348   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27349   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27350   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27351   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27352   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27353 
27354   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27355   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27356   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27357   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27358   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27359   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27360   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27361 
27362   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27363   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27364   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27365 };
27366 
27367 /* TM callbacks.  */
27368 
27369 /* Return the builtin decl needed to load a vector of TYPE.  */
27370 
27371 static tree
27372 ix86_builtin_tm_load (tree type)
27373 {
27374   if (TREE_CODE (type) == VECTOR_TYPE)
27375     {
27376       switch (tree_low_cst (TYPE_SIZE (type), 1))
27377 	{
27378 	case 64:
27379 	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27380 	case 128:
27381 	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27382 	case 256:
27383 	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27384 	}
27385     }
27386   return NULL_TREE;
27387 }
27388 
27389 /* Return the builtin decl needed to store a vector of TYPE.  */
27390 
27391 static tree
27392 ix86_builtin_tm_store (tree type)
27393 {
27394   if (TREE_CODE (type) == VECTOR_TYPE)
27395     {
27396       switch (tree_low_cst (TYPE_SIZE (type), 1))
27397 	{
27398 	case 64:
27399 	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27400 	case 128:
27401 	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27402 	case 256:
27403 	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27404 	}
27405     }
27406   return NULL_TREE;
27407 }
27408 
27409 /* Initialize the transactional memory vector load/store builtins.  */
27410 
27411 static void
27412 ix86_init_tm_builtins (void)
27413 {
27414   enum ix86_builtin_func_type ftype;
27415   const struct builtin_description *d;
27416   size_t i;
27417   tree decl;
27418   tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27419   tree attrs_log, attrs_type_log;
27420 
27421   if (!flag_tm)
27422     return;
27423 
27424   /* If there are no builtins defined, we must be compiling in a
27425      language without trans-mem support.  */
27426   if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27427     return;
27428 
27429   /* Use whatever attributes a normal TM load has.  */
27430   decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27431   attrs_load = DECL_ATTRIBUTES (decl);
27432   attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27433   /* Use whatever attributes a normal TM store has.  */
27434   decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27435   attrs_store = DECL_ATTRIBUTES (decl);
27436   attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27437   /* Use whatever attributes a normal TM log has.  */
27438   decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27439   attrs_log = DECL_ATTRIBUTES (decl);
27440   attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27441 
27442   for (i = 0, d = bdesc_tm;
27443        i < ARRAY_SIZE (bdesc_tm);
27444        i++, d++)
27445     {
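      /* Register the builtin when its ISA is currently enabled; front ends
         whose builtin_function hook is the external-scope variant get every
         entry registered regardless, mirroring the test in def_builtin.  */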
27446       if ((d->mask & ix86_isa_flags) != 0
27447 	  || (lang_hooks.builtin_function
27448 	      == lang_hooks.builtin_function_ext_scope))
27449 	{
27450 	  tree type, attrs, attrs_type;
27451 	  enum built_in_function code = (enum built_in_function) d->code;
27452 
27453 	  ftype = (enum ix86_builtin_func_type) d->flag;
27454 	  type = ix86_get_builtin_func_type (ftype);
27455 
27456 	  if (BUILTIN_TM_LOAD_P (code))
27457 	    {
27458 	      attrs = attrs_load;
27459 	      attrs_type = attrs_type_load;
27460 	    }
27461 	  else if (BUILTIN_TM_STORE_P (code))
27462 	    {
27463 	      attrs = attrs_store;
27464 	      attrs_type = attrs_type_store;
27465 	    }
27466 	  else
27467 	    {
27468 	      attrs = attrs_log;
27469 	      attrs_type = attrs_type_log;
27470 	    }
27471 	  decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27472 				       /* The name without the "__builtin_"
27473 					  prefix, for calling the function
27474 					  directly.  */
27474 				       d->name + strlen ("__builtin_"),
27475 				       attrs);
27476 	  /* add_builtin_function () has set the DECL_ATTRIBUTES; now
27477 	     set the TYPE_ATTRIBUTES as well.  */
27478 	  decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27479 
27480 	  set_builtin_decl (code, decl, false);
27481 	}
27482     }
27483 }
27484 
27485 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27486    not in the current target ISA, to allow the user to compile particular
27487    modules with target-specific options that differ from the command-line
27488    options.  */
27489 static void
27490 ix86_init_mmx_sse_builtins (void)
27491 {
27492   const struct builtin_description * d;
27493   enum ix86_builtin_func_type ftype;
27494   size_t i;
27495 
27496   /* Add all special builtins with variable number of operands.  */
27497   for (i = 0, d = bdesc_special_args;
27498        i < ARRAY_SIZE (bdesc_special_args);
27499        i++, d++)
27500     {
27501       if (d->name == 0)
27502 	continue;
27503 
27504       ftype = (enum ix86_builtin_func_type) d->flag;
27505       def_builtin (d->mask, d->name, ftype, d->code);
27506     }
27507 
27508   /* Add all builtins with variable number of operands.  */
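  /* Unlike the special builtins above, these are registered with
     def_builtin_const, which also marks the decl TREE_READONLY so that
     calls to the pure arithmetic builtins can be CSEd.  */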
27509   for (i = 0, d = bdesc_args;
27510        i < ARRAY_SIZE (bdesc_args);
27511        i++, d++)
27512     {
27513       if (d->name == 0)
27514 	continue;
27515 
27516       ftype = (enum ix86_builtin_func_type) d->flag;
27517       def_builtin_const (d->mask, d->name, ftype, d->code);
27518     }
27519 
27520   /* pcmpestr[im] insns.  */
27521   for (i = 0, d = bdesc_pcmpestr;
27522        i < ARRAY_SIZE (bdesc_pcmpestr);
27523        i++, d++)
27524     {
27525       if (d->code == IX86_BUILTIN_PCMPESTRM128)
27526 	ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27527       else
27528 	ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27529       def_builtin_const (d->mask, d->name, ftype, d->code);
27530     }
27531 
27532   /* pcmpistr[im] insns.  */
27533   for (i = 0, d = bdesc_pcmpistr;
27534        i < ARRAY_SIZE (bdesc_pcmpistr);
27535        i++, d++)
27536     {
27537       if (d->code == IX86_BUILTIN_PCMPISTRM128)
27538 	ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27539       else
27540 	ftype = INT_FTYPE_V16QI_V16QI_INT;
27541       def_builtin_const (d->mask, d->name, ftype, d->code);
27542     }
27543 
27544   /* comi/ucomi insns.  */
27545   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27546     {
27547       if (d->mask == OPTION_MASK_ISA_SSE2)
27548 	ftype = INT_FTYPE_V2DF_V2DF;
27549       else
27550 	ftype = INT_FTYPE_V4SF_V4SF;
27551       def_builtin_const (d->mask, d->name, ftype, d->code);
27552     }
27553 
27554   /* SSE */
27555   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27556 	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27557   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27558 	       UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27559 
27560   /* SSE or 3DNow!A */
27561   def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27562 	       "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27563 	       IX86_BUILTIN_MASKMOVQ);
27564 
27565   /* SSE2 */
27566   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27567 	       VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27568 
27569   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27570 	       VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27571   x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27572 			    VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27573 
27574   /* SSE3.  */
27575   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27576 	       VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27577   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27578 	       VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27579 
27580   /* AES */
27581   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27582 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27583   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27584 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27585   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27586 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27587   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27588 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27589   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27590 		     V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27591   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27592 		     V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27593 
27594   /* PCLMUL */
27595   def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27596 		     V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27597 
27598   /* RDRND */
27599   def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27600 	       INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27601   def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27602 	       INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27603   def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27604 	       "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27605 	       IX86_BUILTIN_RDRAND64_STEP);
27606 
27607   /* AVX2 */
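  /* The gather builtins all take their operands in the same order:
     (merge source, base pointer, index vector, mask, scale), with the
     scale as the trailing INT.  The "alt" variants below pair an index
     vector with more elements than the data vector and exist for the
     vectorizer (see ix86_vectorize_builtin_gather).  */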
27608   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27609 	       V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27610 	       IX86_BUILTIN_GATHERSIV2DF);
27611 
27612   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27613 	       V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27614 	       IX86_BUILTIN_GATHERSIV4DF);
27615 
27616   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27617 	       V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27618 	       IX86_BUILTIN_GATHERDIV2DF);
27619 
27620   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27621 	       V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27622 	       IX86_BUILTIN_GATHERDIV4DF);
27623 
27624   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27625 	       V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27626 	       IX86_BUILTIN_GATHERSIV4SF);
27627 
27628   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27629 	       V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27630 	       IX86_BUILTIN_GATHERSIV8SF);
27631 
27632   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27633 	       V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27634 	       IX86_BUILTIN_GATHERDIV4SF);
27635 
27636   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27637 	       V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27638 	       IX86_BUILTIN_GATHERDIV8SF);
27639 
27640   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27641 	       V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27642 	       IX86_BUILTIN_GATHERSIV2DI);
27643 
27644   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27645 	       V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27646 	       IX86_BUILTIN_GATHERSIV4DI);
27647 
27648   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27649 	       V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27650 	       IX86_BUILTIN_GATHERDIV2DI);
27651 
27652   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27653 	       V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27654 	       IX86_BUILTIN_GATHERDIV4DI);
27655 
27656   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27657 	       V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27658 	       IX86_BUILTIN_GATHERSIV4SI);
27659 
27660   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27661 	       V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27662 	       IX86_BUILTIN_GATHERSIV8SI);
27663 
27664   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27665 	       V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27666 	       IX86_BUILTIN_GATHERDIV4SI);
27667 
27668   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27669 	       V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27670 	       IX86_BUILTIN_GATHERDIV8SI);
27671 
27672   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27673 	       V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27674 	       IX86_BUILTIN_GATHERALTSIV4DF);
27675 
27676   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27677 	       V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27678 	       IX86_BUILTIN_GATHERALTDIV8SF);
27679 
27680   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27681 	       V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27682 	       IX86_BUILTIN_GATHERALTSIV4DI);
27683 
27684   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27685 	       V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27686 	       IX86_BUILTIN_GATHERALTDIV8SI);
27687 
27688   /* MMX access to the vec_init patterns.  */
27689   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27690 		     V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27691 
27692   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27693 		     V4HI_FTYPE_HI_HI_HI_HI,
27694 		     IX86_BUILTIN_VEC_INIT_V4HI);
27695 
27696   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27697 		     V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27698 		     IX86_BUILTIN_VEC_INIT_V8QI);
27699 
27700   /* Access to the vec_extract patterns.  */
27701   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27702 		     DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27703   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27704 		     DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27705   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27706 		     FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27707   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27708 		     SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27709   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27710 		     HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27711 
27712   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27713 		     "__builtin_ia32_vec_ext_v4hi",
27714 		     HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27715 
27716   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27717 		     SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27718 
27719   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27720 		     QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27721 
27722   /* Access to the vec_set patterns.  */
27723   def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27724 		     "__builtin_ia32_vec_set_v2di",
27725 		     V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27726 
27727   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27728 		     V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27729 
27730   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27731 		     V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27732 
27733   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27734 		     V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27735 
27736   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27737 		     "__builtin_ia32_vec_set_v4hi",
27738 		     V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27739 
27740   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27741 		     V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27742 
27743   /* Add the FMA4, FMA and XOP multi-arg instructions.  */
27744   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27745     {
27746       if (d->name == 0)
27747 	continue;
27748 
27749       ftype = (enum ix86_builtin_func_type) d->flag;
27750       def_builtin_const (d->mask, d->name, ftype, d->code);
27751     }
27752 }
27753 
27754 /* Internal subroutine of ix86_init_builtins.  */
27755 
27756 static void
27757 ix86_init_builtins_va_builtins_abi (void)
27758 {
27759   tree ms_va_ref, sysv_va_ref;
27760   tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27761   tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27762   tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27763   tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27764 
27765   if (!TARGET_64BIT)
27766     return;
27767   fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27768   fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27769   ms_va_ref = build_reference_type (ms_va_list_type_node);
27770   sysv_va_ref =
27771     build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27772 
27773   fnvoid_va_end_ms =
27774     build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27775   fnvoid_va_start_ms =
27776     build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27777   fnvoid_va_end_sysv =
27778     build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27779   fnvoid_va_start_sysv =
27780     build_varargs_function_type_list (void_type_node, sysv_va_ref,
27781     				       NULL_TREE);
27782   fnvoid_va_copy_ms =
27783     build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27784     			      NULL_TREE);
27785   fnvoid_va_copy_sysv =
27786     build_function_type_list (void_type_node, sysv_va_ref,
27787     			      sysv_va_ref, NULL_TREE);
27788 
27789   add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27790   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27791   add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27792   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27793   add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27794 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27795   add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27796   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27797   add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27798   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27799   add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27800 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27801 }
27802 
27803 static void
27804 ix86_init_builtin_types (void)
27805 {
27806   tree float128_type_node, float80_type_node;
27807 
27808   /* The __float80 type.  */
27809   float80_type_node = long_double_type_node;
27810   if (TYPE_MODE (float80_type_node) != XFmode)
27811     {
27812       /* long double is not the 80-bit extended type; build a distinct one.  */
27813       float80_type_node = make_node (REAL_TYPE);
27814 
27815       TYPE_PRECISION (float80_type_node) = 80;
27816       layout_type (float80_type_node);
27817     }
27818   lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27819 
27820   /* The __float128 type.  */
27821   float128_type_node = make_node (REAL_TYPE);
27822   TYPE_PRECISION (float128_type_node) = 128;
27823   layout_type (float128_type_node);
27824   lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27825 
27826   /* This macro is built by i386-builtin-types.awk.  */
27827   DEFINE_BUILTIN_PRIMITIVE_TYPES;
27828 }
27829 
27830 static void
27831 ix86_init_builtins (void)
27832 {
27833   tree t;
27834 
27835   ix86_init_builtin_types ();
27836 
27837   /* TFmode support builtins.  */
27838   def_builtin_const (0, "__builtin_infq",
27839 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27840   def_builtin_const (0, "__builtin_huge_valq",
27841 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27842 
27843   /* We will expand them into normal calls if SSE2 isn't available,
27844      since they are used by libgcc.  */
27845   t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27846   t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27847 			    BUILT_IN_MD, "__fabstf2", NULL_TREE);
27848   TREE_READONLY (t) = 1;
27849   ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27850 
27851   t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27852   t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27853 			    BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27854   TREE_READONLY (t) = 1;
27855   ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27856 
27857   ix86_init_tm_builtins ();
27858   ix86_init_mmx_sse_builtins ();
27859 
27860   if (TARGET_LP64)
27861     ix86_init_builtins_va_builtins_abi ();
27862 
27863 #ifdef SUBTARGET_INIT_BUILTINS
27864   SUBTARGET_INIT_BUILTINS;
27865 #endif
27866 }
27867 
27868 /* Return the ix86 builtin for CODE.  */
27869 
27870 static tree
27871 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27872 {
27873   if (code >= IX86_BUILTIN_MAX)
27874     return error_mark_node;
27875 
27876   return ix86_builtins[code];
27877 }
27878 
27879 /* Errors in the source file can cause expand_expr to return const0_rtx
27880    where we expect a vector.  To avoid crashing, use one of the vector
27881    clear instructions.  */
27882 static rtx
27883 safe_vector_operand (rtx x, enum machine_mode mode)
27884 {
27885   if (x == const0_rtx)
27886     x = CONST0_RTX (mode);
27887   return x;
27888 }
27889 
27890 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
27891 
27892 static rtx
27893 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27894 {
27895   rtx pat;
27896   tree arg0 = CALL_EXPR_ARG (exp, 0);
27897   tree arg1 = CALL_EXPR_ARG (exp, 1);
27898   rtx op0 = expand_normal (arg0);
27899   rtx op1 = expand_normal (arg1);
27900   enum machine_mode tmode = insn_data[icode].operand[0].mode;
27901   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27902   enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27903 
27904   if (VECTOR_MODE_P (mode0))
27905     op0 = safe_vector_operand (op0, mode0);
27906   if (VECTOR_MODE_P (mode1))
27907     op1 = safe_vector_operand (op1, mode1);
27908 
27909   if (optimize || !target
27910       || GET_MODE (target) != tmode
27911       || !insn_data[icode].operand[0].predicate (target, tmode))
27912     target = gen_reg_rtx (tmode);
27913 
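  /* If the second argument arrived as a 32-bit value but the insn wants a
     TImode operand, load it into the low element of a V4SI register and
     reinterpret that register as TImode.  */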
27914   if (GET_MODE (op1) == SImode && mode1 == TImode)
27915     {
27916       rtx x = gen_reg_rtx (V4SImode);
27917       emit_insn (gen_sse2_loadd (x, op1));
27918       op1 = gen_lowpart (TImode, x);
27919     }
27920 
27921   if (!insn_data[icode].operand[1].predicate (op0, mode0))
27922     op0 = copy_to_mode_reg (mode0, op0);
27923   if (!insn_data[icode].operand[2].predicate (op1, mode1))
27924     op1 = copy_to_mode_reg (mode1, op1);
27925 
27926   pat = GEN_FCN (icode) (target, op0, op1);
27927   if (! pat)
27928     return 0;
27929 
27930   emit_insn (pat);
27931 
27932   return target;
27933 }
27934 
27935 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
27936 
27937 static rtx
27938 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27939 			       enum ix86_builtin_func_type m_type,
27940 			       enum rtx_code sub_code)
27941 {
27942   rtx pat;
27943   int i;
27944   int nargs;
27945   bool comparison_p = false;
27946   bool tf_p = false;
27947   bool last_arg_constant = false;
27948   int num_memory = 0;
27949   struct {
27950     rtx op;
27951     enum machine_mode mode;
27952   } args[4];
27953 
27954   enum machine_mode tmode = insn_data[icode].operand[0].mode;
27955 
27956   switch (m_type)
27957     {
27958     case MULTI_ARG_4_DF2_DI_I:
27959     case MULTI_ARG_4_DF2_DI_I1:
27960     case MULTI_ARG_4_SF2_SI_I:
27961     case MULTI_ARG_4_SF2_SI_I1:
27962       nargs = 4;
27963       last_arg_constant = true;
27964       break;
27965 
27966     case MULTI_ARG_3_SF:
27967     case MULTI_ARG_3_DF:
27968     case MULTI_ARG_3_SF2:
27969     case MULTI_ARG_3_DF2:
27970     case MULTI_ARG_3_DI:
27971     case MULTI_ARG_3_SI:
27972     case MULTI_ARG_3_SI_DI:
27973     case MULTI_ARG_3_HI:
27974     case MULTI_ARG_3_HI_SI:
27975     case MULTI_ARG_3_QI:
27976     case MULTI_ARG_3_DI2:
27977     case MULTI_ARG_3_SI2:
27978     case MULTI_ARG_3_HI2:
27979     case MULTI_ARG_3_QI2:
27980       nargs = 3;
27981       break;
27982 
27983     case MULTI_ARG_2_SF:
27984     case MULTI_ARG_2_DF:
27985     case MULTI_ARG_2_DI:
27986     case MULTI_ARG_2_SI:
27987     case MULTI_ARG_2_HI:
27988     case MULTI_ARG_2_QI:
27989       nargs = 2;
27990       break;
27991 
27992     case MULTI_ARG_2_DI_IMM:
27993     case MULTI_ARG_2_SI_IMM:
27994     case MULTI_ARG_2_HI_IMM:
27995     case MULTI_ARG_2_QI_IMM:
27996       nargs = 2;
27997       last_arg_constant = true;
27998       break;
27999 
28000     case MULTI_ARG_1_SF:
28001     case MULTI_ARG_1_DF:
28002     case MULTI_ARG_1_SF2:
28003     case MULTI_ARG_1_DF2:
28004     case MULTI_ARG_1_DI:
28005     case MULTI_ARG_1_SI:
28006     case MULTI_ARG_1_HI:
28007     case MULTI_ARG_1_QI:
28008     case MULTI_ARG_1_SI_DI:
28009     case MULTI_ARG_1_HI_DI:
28010     case MULTI_ARG_1_HI_SI:
28011     case MULTI_ARG_1_QI_DI:
28012     case MULTI_ARG_1_QI_SI:
28013     case MULTI_ARG_1_QI_HI:
28014       nargs = 1;
28015       break;
28016 
28017     case MULTI_ARG_2_DI_CMP:
28018     case MULTI_ARG_2_SI_CMP:
28019     case MULTI_ARG_2_HI_CMP:
28020     case MULTI_ARG_2_QI_CMP:
28021       nargs = 2;
28022       comparison_p = true;
28023       break;
28024 
28025     case MULTI_ARG_2_SF_TF:
28026     case MULTI_ARG_2_DF_TF:
28027     case MULTI_ARG_2_DI_TF:
28028     case MULTI_ARG_2_SI_TF:
28029     case MULTI_ARG_2_HI_TF:
28030     case MULTI_ARG_2_QI_TF:
28031       nargs = 2;
28032       tf_p = true;
28033       break;
28034 
28035     default:
28036       gcc_unreachable ();
28037     }
28038 
28039   if (optimize || !target
28040       || GET_MODE (target) != tmode
28041       || !insn_data[icode].operand[0].predicate (target, tmode))
28042     target = gen_reg_rtx (tmode);
28043 
28044   gcc_assert (nargs <= 4);
28045 
28046   for (i = 0; i < nargs; i++)
28047     {
28048       tree arg = CALL_EXPR_ARG (exp, i);
28049       rtx op = expand_normal (arg);
28050       int adjust = (comparison_p) ? 1 : 0;
28051       enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28052 
28053       if (last_arg_constant && i == nargs - 1)
28054 	{
28055 	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28056 	    {
28057 	      enum insn_code new_icode = icode;
28058 	      switch (icode)
28059 		{
28060 		case CODE_FOR_xop_vpermil2v2df3:
28061 		case CODE_FOR_xop_vpermil2v4sf3:
28062 		case CODE_FOR_xop_vpermil2v4df3:
28063 		case CODE_FOR_xop_vpermil2v8sf3:
28064 		  error ("the last argument must be a 2-bit immediate");
28065 		  return gen_reg_rtx (tmode);
28066 		case CODE_FOR_xop_rotlv2di3:
28067 		  new_icode = CODE_FOR_rotlv2di3;
28068 		  goto xop_rotl;
28069 		case CODE_FOR_xop_rotlv4si3:
28070 		  new_icode = CODE_FOR_rotlv4si3;
28071 		  goto xop_rotl;
28072 		case CODE_FOR_xop_rotlv8hi3:
28073 		  new_icode = CODE_FOR_rotlv8hi3;
28074 		  goto xop_rotl;
28075 		case CODE_FOR_xop_rotlv16qi3:
28076 		  new_icode = CODE_FOR_rotlv16qi3;
28077 		xop_rotl:
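		  /* A constant rotate count is reduced modulo the element
		     width; a non-constant count falls back to the generic
		     rotate pattern via the non_constant path below.  */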
28078 		  if (CONST_INT_P (op))
28079 		    {
28080 		      int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28081 		      op = GEN_INT (INTVAL (op) & mask);
28082 		      gcc_checking_assert
28083 			(insn_data[icode].operand[i + 1].predicate (op, mode));
28084 		    }
28085 		  else
28086 		    {
28087 		      gcc_checking_assert
28088 			(nargs == 2
28089 			 && insn_data[new_icode].operand[0].mode == tmode
28090 			 && insn_data[new_icode].operand[1].mode == tmode
28091 			 && insn_data[new_icode].operand[2].mode == mode
28092 			 && insn_data[new_icode].operand[0].predicate
28093 			    == insn_data[icode].operand[0].predicate
28094 			 && insn_data[new_icode].operand[1].predicate
28095 			    == insn_data[icode].operand[1].predicate);
28096 		      icode = new_icode;
28097 		      goto non_constant;
28098 		    }
28099 		  break;
28100 		default:
28101 		  gcc_unreachable ();
28102 		}
28103 	    }
28104 	}
28105       else
28106 	{
28107 	non_constant:
28108 	  if (VECTOR_MODE_P (mode))
28109 	    op = safe_vector_operand (op, mode);
28110 
28111 	  /* If we aren't optimizing, only allow one memory operand to be
28112 	     generated.  */
28113 	  if (memory_operand (op, mode))
28114 	    num_memory++;
28115 
28116 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28117 
28118 	  if (optimize
28119 	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28120 	      || num_memory > 1)
28121 	    op = force_reg (mode, op);
28122 	}
28123 
28124       args[i].op = op;
28125       args[i].mode = mode;
28126     }
28127 
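  /* Emit the pattern.  For the _TF variants the rtx comparison code is
     passed to the insn as an extra integer operand; for the _CMP variants
     a comparison rtx in the target's mode is built from the two operands
     and passed as the insn's first input.  */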
28128   switch (nargs)
28129     {
28130     case 1:
28131       pat = GEN_FCN (icode) (target, args[0].op);
28132       break;
28133 
28134     case 2:
28135       if (tf_p)
28136 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28137 			       GEN_INT ((int)sub_code));
28138       else if (! comparison_p)
28139 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28140       else
28141 	{
28142 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28143 				       args[0].op,
28144 				       args[1].op);
28145 
28146 	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28147 	}
28148       break;
28149 
28150     case 3:
28151       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28152       break;
28153 
28154     case 4:
28155       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28156       break;
28157 
28158     default:
28159       gcc_unreachable ();
28160     }
28161 
28162   if (! pat)
28163     return 0;
28164 
28165   emit_insn (pat);
28166   return target;
28167 }
28168 
28169 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28170    insns with vec_merge.  */
28171 
28172 static rtx
28173 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28174 				    rtx target)
28175 {
28176   rtx pat;
28177   tree arg0 = CALL_EXPR_ARG (exp, 0);
28178   rtx op1, op0 = expand_normal (arg0);
28179   enum machine_mode tmode = insn_data[icode].operand[0].mode;
28180   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28181 
28182   if (optimize || !target
28183       || GET_MODE (target) != tmode
28184       || !insn_data[icode].operand[0].predicate (target, tmode))
28185     target = gen_reg_rtx (tmode);
28186 
28187   if (VECTOR_MODE_P (mode0))
28188     op0 = safe_vector_operand (op0, mode0);
28189 
28190   if ((optimize && !register_operand (op0, mode0))
28191       || !insn_data[icode].operand[1].predicate (op0, mode0))
28192     op0 = copy_to_mode_reg (mode0, op0);
28193 
28194   op1 = op0;
28195   if (!insn_data[icode].operand[2].predicate (op1, mode0))
28196     op1 = copy_to_mode_reg (mode0, op1);
28197 
28198   pat = GEN_FCN (icode) (target, op0, op1);
28199   if (! pat)
28200     return 0;
28201   emit_insn (pat);
28202   return target;
28203 }
28204 
28205 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
28206 
28207 static rtx
28208 ix86_expand_sse_compare (const struct builtin_description *d,
28209 			 tree exp, rtx target, bool swap)
28210 {
28211   rtx pat;
28212   tree arg0 = CALL_EXPR_ARG (exp, 0);
28213   tree arg1 = CALL_EXPR_ARG (exp, 1);
28214   rtx op0 = expand_normal (arg0);
28215   rtx op1 = expand_normal (arg1);
28216   rtx op2;
28217   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28218   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28219   enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28220   enum rtx_code comparison = d->comparison;
28221 
28222   if (VECTOR_MODE_P (mode0))
28223     op0 = safe_vector_operand (op0, mode0);
28224   if (VECTOR_MODE_P (mode1))
28225     op1 = safe_vector_operand (op1, mode1);
28226 
28227   /* Swap operands if we have a comparison that isn't available in
28228      hardware.  */
28229   if (swap)
28230     {
28231       rtx tmp = gen_reg_rtx (mode1);
28232       emit_move_insn (tmp, op1);
28233       op1 = op0;
28234       op0 = tmp;
28235     }
28236 
28237   if (optimize || !target
28238       || GET_MODE (target) != tmode
28239       || !insn_data[d->icode].operand[0].predicate (target, tmode))
28240     target = gen_reg_rtx (tmode);
28241 
28242   if ((optimize && !register_operand (op0, mode0))
28243       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28244     op0 = copy_to_mode_reg (mode0, op0);
28245   if ((optimize && !register_operand (op1, mode1))
28246       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28247     op1 = copy_to_mode_reg (mode1, op1);
28248 
28249   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28250   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28251   if (! pat)
28252     return 0;
28253   emit_insn (pat);
28254   return target;
28255 }
28256 
28257 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
28258 
28259 static rtx
28260 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28261 		      rtx target)
28262 {
28263   rtx pat;
28264   tree arg0 = CALL_EXPR_ARG (exp, 0);
28265   tree arg1 = CALL_EXPR_ARG (exp, 1);
28266   rtx op0 = expand_normal (arg0);
28267   rtx op1 = expand_normal (arg1);
28268   enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28269   enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28270   enum rtx_code comparison = d->comparison;
28271 
28272   if (VECTOR_MODE_P (mode0))
28273     op0 = safe_vector_operand (op0, mode0);
28274   if (VECTOR_MODE_P (mode1))
28275     op1 = safe_vector_operand (op1, mode1);
28276 
28277   /* Swap operands if we have a comparison that isn't available in
28278      hardware.  */
28279   if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28280     {
28281       rtx tmp = op1;
28282       op1 = op0;
28283       op0 = tmp;
28284     }
28285 
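  /* The result is a flag test: zero an SImode pseudo, then (below) set
     only its low byte from the requested comparison of the flags register
     set by the comi insn, and return the full SImode value.  */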
28286   target = gen_reg_rtx (SImode);
28287   emit_move_insn (target, const0_rtx);
28288   target = gen_rtx_SUBREG (QImode, target, 0);
28289 
28290   if ((optimize && !register_operand (op0, mode0))
28291       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28292     op0 = copy_to_mode_reg (mode0, op0);
28293   if ((optimize && !register_operand (op1, mode1))
28294       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28295     op1 = copy_to_mode_reg (mode1, op1);
28296 
28297   pat = GEN_FCN (d->icode) (op0, op1);
28298   if (! pat)
28299     return 0;
28300   emit_insn (pat);
28301   emit_insn (gen_rtx_SET (VOIDmode,
28302 			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28303 			  gen_rtx_fmt_ee (comparison, QImode,
28304 					  SET_DEST (pat),
28305 					  const0_rtx)));
28306 
28307   return SUBREG_REG (target);
28308 }
28309 
28310 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
28311 
28312 static rtx
28313 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28314 		       rtx target)
28315 {
28316   rtx pat;
28317   tree arg0 = CALL_EXPR_ARG (exp, 0);
28318   rtx op1, op0 = expand_normal (arg0);
28319   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28320   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28321 
28322   if (optimize || target == 0
28323       || GET_MODE (target) != tmode
28324       || !insn_data[d->icode].operand[0].predicate (target, tmode))
28325     target = gen_reg_rtx (tmode);
28326 
28327   if (VECTOR_MODE_P (mode0))
28328     op0 = safe_vector_operand (op0, mode0);
28329 
28330   if ((optimize && !register_operand (op0, mode0))
28331       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28332     op0 = copy_to_mode_reg (mode0, op0);
28333 
28334   op1 = GEN_INT (d->comparison);
28335 
28336   pat = GEN_FCN (d->icode) (target, op0, op1);
28337   if (! pat)
28338     return 0;
28339   emit_insn (pat);
28340   return target;
28341 }
28342 
28343 static rtx
28344 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28345 				     tree exp, rtx target)
28346 {
28347   rtx pat;
28348   tree arg0 = CALL_EXPR_ARG (exp, 0);
28349   tree arg1 = CALL_EXPR_ARG (exp, 1);
28350   rtx op0 = expand_normal (arg0);
28351   rtx op1 = expand_normal (arg1);
28352   rtx op2;
28353   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28354   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28355   enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28356 
28357   if (optimize || target == 0
28358       || GET_MODE (target) != tmode
28359       || !insn_data[d->icode].operand[0].predicate (target, tmode))
28360     target = gen_reg_rtx (tmode);
28361 
28362   op0 = safe_vector_operand (op0, mode0);
28363   op1 = safe_vector_operand (op1, mode1);
28364 
28365   if ((optimize && !register_operand (op0, mode0))
28366       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28367     op0 = copy_to_mode_reg (mode0, op0);
28368   if ((optimize && !register_operand (op1, mode1))
28369       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28370     op1 = copy_to_mode_reg (mode1, op1);
28371 
28372   op2 = GEN_INT (d->comparison);
28373 
28374   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28375   if (! pat)
28376     return 0;
28377   emit_insn (pat);
28378   return target;
28379 }
28380 
28381 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
28382 
28383 static rtx
28384 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28385 		       rtx target)
28386 {
28387   rtx pat;
28388   tree arg0 = CALL_EXPR_ARG (exp, 0);
28389   tree arg1 = CALL_EXPR_ARG (exp, 1);
28390   rtx op0 = expand_normal (arg0);
28391   rtx op1 = expand_normal (arg1);
28392   enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28393   enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28394   enum rtx_code comparison = d->comparison;
28395 
28396   if (VECTOR_MODE_P (mode0))
28397     op0 = safe_vector_operand (op0, mode0);
28398   if (VECTOR_MODE_P (mode1))
28399     op1 = safe_vector_operand (op1, mode1);
28400 
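  /* Same scheme as ix86_expand_sse_comi: zero an SImode pseudo and set
     only its low byte from the flags comparison emitted below.  */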
28401   target = gen_reg_rtx (SImode);
28402   emit_move_insn (target, const0_rtx);
28403   target = gen_rtx_SUBREG (QImode, target, 0);
28404 
28405   if ((optimize && !register_operand (op0, mode0))
28406       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28407     op0 = copy_to_mode_reg (mode0, op0);
28408   if ((optimize && !register_operand (op1, mode1))
28409       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28410     op1 = copy_to_mode_reg (mode1, op1);
28411 
28412   pat = GEN_FCN (d->icode) (op0, op1);
28413   if (! pat)
28414     return 0;
28415   emit_insn (pat);
28416   emit_insn (gen_rtx_SET (VOIDmode,
28417 			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28418 			  gen_rtx_fmt_ee (comparison, QImode,
28419 					  SET_DEST (pat),
28420 					  const0_rtx)));
28421 
28422   return SUBREG_REG (target);
28423 }
28424 
28425 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
28426 
28427 static rtx
28428 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28429 			  tree exp, rtx target)
28430 {
28431   rtx pat;
28432   tree arg0 = CALL_EXPR_ARG (exp, 0);
28433   tree arg1 = CALL_EXPR_ARG (exp, 1);
28434   tree arg2 = CALL_EXPR_ARG (exp, 2);
28435   tree arg3 = CALL_EXPR_ARG (exp, 3);
28436   tree arg4 = CALL_EXPR_ARG (exp, 4);
28437   rtx scratch0, scratch1;
28438   rtx op0 = expand_normal (arg0);
28439   rtx op1 = expand_normal (arg1);
28440   rtx op2 = expand_normal (arg2);
28441   rtx op3 = expand_normal (arg3);
28442   rtx op4 = expand_normal (arg4);
28443   enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28444 
28445   tmode0 = insn_data[d->icode].operand[0].mode;
28446   tmode1 = insn_data[d->icode].operand[1].mode;
28447   modev2 = insn_data[d->icode].operand[2].mode;
28448   modei3 = insn_data[d->icode].operand[3].mode;
28449   modev4 = insn_data[d->icode].operand[4].mode;
28450   modei5 = insn_data[d->icode].operand[5].mode;
28451   modeimm = insn_data[d->icode].operand[6].mode;
28452 
28453   if (VECTOR_MODE_P (modev2))
28454     op0 = safe_vector_operand (op0, modev2);
28455   if (VECTOR_MODE_P (modev4))
28456     op2 = safe_vector_operand (op2, modev4);
28457 
28458   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28459     op0 = copy_to_mode_reg (modev2, op0);
28460   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28461     op1 = copy_to_mode_reg (modei3, op1);
28462   if ((optimize && !register_operand (op2, modev4))
28463       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28464     op2 = copy_to_mode_reg (modev4, op2);
28465   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28466     op3 = copy_to_mode_reg (modei5, op3);
28467 
28468   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28469     {
28470       error ("the fifth argument must be an 8-bit immediate");
28471       return const0_rtx;
28472     }
28473 
28474   if (d->code == IX86_BUILTIN_PCMPESTRI128)
28475     {
28476       if (optimize || !target
28477 	  || GET_MODE (target) != tmode0
28478 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28479 	target = gen_reg_rtx (tmode0);
28480 
28481       scratch1 = gen_reg_rtx (tmode1);
28482 
28483       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28484     }
28485   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28486     {
28487       if (optimize || !target
28488 	  || GET_MODE (target) != tmode1
28489 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28490 	target = gen_reg_rtx (tmode1);
28491 
28492       scratch0 = gen_reg_rtx (tmode0);
28493 
28494       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28495     }
28496   else
28497     {
28498       gcc_assert (d->flag);
28499 
28500       scratch0 = gen_reg_rtx (tmode0);
28501       scratch1 = gen_reg_rtx (tmode1);
28502 
28503       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28504     }
28505 
28506   if (! pat)
28507     return 0;
28508 
28509   emit_insn (pat);
28510 
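  /* For the flag-extracting variants d->flag is nonzero and gives the
     mode of FLAGS_REG to test; turn that test into a 0/1 SImode value.
     The index/mask variants simply return the insn's primary result.  */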
28511   if (d->flag)
28512     {
28513       target = gen_reg_rtx (SImode);
28514       emit_move_insn (target, const0_rtx);
28515       target = gen_rtx_SUBREG (QImode, target, 0);
28516 
28517       emit_insn
28518 	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28519 		      gen_rtx_fmt_ee (EQ, QImode,
28520 				      gen_rtx_REG ((enum machine_mode) d->flag,
28521 						   FLAGS_REG),
28522 				      const0_rtx)));
28523       return SUBREG_REG (target);
28524     }
28525   else
28526     return target;
28527 }
28528 
28529 
28530 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
28531 
28532 static rtx
28533 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28534 			  tree exp, rtx target)
28535 {
28536   rtx pat;
28537   tree arg0 = CALL_EXPR_ARG (exp, 0);
28538   tree arg1 = CALL_EXPR_ARG (exp, 1);
28539   tree arg2 = CALL_EXPR_ARG (exp, 2);
28540   rtx scratch0, scratch1;
28541   rtx op0 = expand_normal (arg0);
28542   rtx op1 = expand_normal (arg1);
28543   rtx op2 = expand_normal (arg2);
28544   enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28545 
28546   tmode0 = insn_data[d->icode].operand[0].mode;
28547   tmode1 = insn_data[d->icode].operand[1].mode;
28548   modev2 = insn_data[d->icode].operand[2].mode;
28549   modev3 = insn_data[d->icode].operand[3].mode;
28550   modeimm = insn_data[d->icode].operand[4].mode;
28551 
28552   if (VECTOR_MODE_P (modev2))
28553     op0 = safe_vector_operand (op0, modev2);
28554   if (VECTOR_MODE_P (modev3))
28555     op1 = safe_vector_operand (op1, modev3);
28556 
28557   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28558     op0 = copy_to_mode_reg (modev2, op0);
28559   if ((optimize && !register_operand (op1, modev3))
28560       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28561     op1 = copy_to_mode_reg (modev3, op1);
28562 
28563   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28564     {
28565       error ("the third argument must be an 8-bit immediate");
28566       return const0_rtx;
28567     }
28568 
28569   if (d->code == IX86_BUILTIN_PCMPISTRI128)
28570     {
28571       if (optimize || !target
28572 	  || GET_MODE (target) != tmode0
28573 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28574 	target = gen_reg_rtx (tmode0);
28575 
28576       scratch1 = gen_reg_rtx (tmode1);
28577 
28578       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28579     }
28580   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28581     {
28582       if (optimize || !target
28583 	  || GET_MODE (target) != tmode1
28584 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28585 	target = gen_reg_rtx (tmode1);
28586 
28587       scratch0 = gen_reg_rtx (tmode0);
28588 
28589       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28590     }
28591   else
28592     {
28593       gcc_assert (d->flag);
28594 
28595       scratch0 = gen_reg_rtx (tmode0);
28596       scratch1 = gen_reg_rtx (tmode1);
28597 
28598       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28599     }
28600 
28601   if (! pat)
28602     return 0;
28603 
28604   emit_insn (pat);
28605 
28606   if (d->flag)
28607     {
28608       target = gen_reg_rtx (SImode);
28609       emit_move_insn (target, const0_rtx);
28610       target = gen_rtx_SUBREG (QImode, target, 0);
28611 
28612       emit_insn
28613 	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28614 		      gen_rtx_fmt_ee (EQ, QImode,
28615 				      gen_rtx_REG ((enum machine_mode) d->flag,
28616 						   FLAGS_REG),
28617 				      const0_rtx)));
28618       return SUBREG_REG (target);
28619     }
28620   else
28621     return target;
28622 }
28623 
28624 /* Subroutine of ix86_expand_builtin to take care of insns with
28625    variable number of operands.  */
28626 
28627 static rtx
28628 ix86_expand_args_builtin (const struct builtin_description *d,
28629 			  tree exp, rtx target)
28630 {
28631   rtx pat, real_target;
28632   unsigned int i, nargs;
28633   unsigned int nargs_constant = 0;
28634   int num_memory = 0;
28635   struct
28636     {
28637       rtx op;
28638       enum machine_mode mode;
28639     } args[4];
28640   bool last_arg_count = false;
28641   enum insn_code icode = d->icode;
28642   const struct insn_data_d *insn_p = &insn_data[icode];
28643   enum machine_mode tmode = insn_p->operand[0].mode;
28644   enum machine_mode rmode = VOIDmode;
28645   bool swap = false;
28646   enum rtx_code comparison = d->comparison;
28647 
28648   switch ((enum ix86_builtin_func_type) d->flag)
28649     {
28650     case V2DF_FTYPE_V2DF_ROUND:
28651     case V4DF_FTYPE_V4DF_ROUND:
28652     case V4SF_FTYPE_V4SF_ROUND:
28653     case V8SF_FTYPE_V8SF_ROUND:
28654     case V4SI_FTYPE_V4SF_ROUND:
28655     case V8SI_FTYPE_V8SF_ROUND:
28656       return ix86_expand_sse_round (d, exp, target);
28657     case V4SI_FTYPE_V2DF_V2DF_ROUND:
28658     case V8SI_FTYPE_V4DF_V4DF_ROUND:
28659       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28660     case INT_FTYPE_V8SF_V8SF_PTEST:
28661     case INT_FTYPE_V4DI_V4DI_PTEST:
28662     case INT_FTYPE_V4DF_V4DF_PTEST:
28663     case INT_FTYPE_V4SF_V4SF_PTEST:
28664     case INT_FTYPE_V2DI_V2DI_PTEST:
28665     case INT_FTYPE_V2DF_V2DF_PTEST:
28666       return ix86_expand_sse_ptest (d, exp, target);
28667     case FLOAT128_FTYPE_FLOAT128:
28668     case FLOAT_FTYPE_FLOAT:
28669     case INT_FTYPE_INT:
28670     case UINT64_FTYPE_INT:
28671     case UINT16_FTYPE_UINT16:
28672     case INT64_FTYPE_INT64:
28673     case INT64_FTYPE_V4SF:
28674     case INT64_FTYPE_V2DF:
28675     case INT_FTYPE_V16QI:
28676     case INT_FTYPE_V8QI:
28677     case INT_FTYPE_V8SF:
28678     case INT_FTYPE_V4DF:
28679     case INT_FTYPE_V4SF:
28680     case INT_FTYPE_V2DF:
28681     case INT_FTYPE_V32QI:
28682     case V16QI_FTYPE_V16QI:
28683     case V8SI_FTYPE_V8SF:
28684     case V8SI_FTYPE_V4SI:
28685     case V8HI_FTYPE_V8HI:
28686     case V8HI_FTYPE_V16QI:
28687     case V8QI_FTYPE_V8QI:
28688     case V8SF_FTYPE_V8SF:
28689     case V8SF_FTYPE_V8SI:
28690     case V8SF_FTYPE_V4SF:
28691     case V8SF_FTYPE_V8HI:
28692     case V4SI_FTYPE_V4SI:
28693     case V4SI_FTYPE_V16QI:
28694     case V4SI_FTYPE_V4SF:
28695     case V4SI_FTYPE_V8SI:
28696     case V4SI_FTYPE_V8HI:
28697     case V4SI_FTYPE_V4DF:
28698     case V4SI_FTYPE_V2DF:
28699     case V4HI_FTYPE_V4HI:
28700     case V4DF_FTYPE_V4DF:
28701     case V4DF_FTYPE_V4SI:
28702     case V4DF_FTYPE_V4SF:
28703     case V4DF_FTYPE_V2DF:
28704     case V4SF_FTYPE_V4SF:
28705     case V4SF_FTYPE_V4SI:
28706     case V4SF_FTYPE_V8SF:
28707     case V4SF_FTYPE_V4DF:
28708     case V4SF_FTYPE_V8HI:
28709     case V4SF_FTYPE_V2DF:
28710     case V2DI_FTYPE_V2DI:
28711     case V2DI_FTYPE_V16QI:
28712     case V2DI_FTYPE_V8HI:
28713     case V2DI_FTYPE_V4SI:
28714     case V2DF_FTYPE_V2DF:
28715     case V2DF_FTYPE_V4SI:
28716     case V2DF_FTYPE_V4DF:
28717     case V2DF_FTYPE_V4SF:
28718     case V2DF_FTYPE_V2SI:
28719     case V2SI_FTYPE_V2SI:
28720     case V2SI_FTYPE_V4SF:
28721     case V2SI_FTYPE_V2SF:
28722     case V2SI_FTYPE_V2DF:
28723     case V2SF_FTYPE_V2SF:
28724     case V2SF_FTYPE_V2SI:
28725     case V32QI_FTYPE_V32QI:
28726     case V32QI_FTYPE_V16QI:
28727     case V16HI_FTYPE_V16HI:
28728     case V16HI_FTYPE_V8HI:
28729     case V8SI_FTYPE_V8SI:
28730     case V16HI_FTYPE_V16QI:
28731     case V8SI_FTYPE_V16QI:
28732     case V4DI_FTYPE_V16QI:
28733     case V8SI_FTYPE_V8HI:
28734     case V4DI_FTYPE_V8HI:
28735     case V4DI_FTYPE_V4SI:
28736     case V4DI_FTYPE_V2DI:
28737       nargs = 1;
28738       break;
28739     case V4SF_FTYPE_V4SF_VEC_MERGE:
28740     case V2DF_FTYPE_V2DF_VEC_MERGE:
28741       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28742     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28743     case V16QI_FTYPE_V16QI_V16QI:
28744     case V16QI_FTYPE_V8HI_V8HI:
28745     case V8QI_FTYPE_V8QI_V8QI:
28746     case V8QI_FTYPE_V4HI_V4HI:
28747     case V8HI_FTYPE_V8HI_V8HI:
28748     case V8HI_FTYPE_V16QI_V16QI:
28749     case V8HI_FTYPE_V4SI_V4SI:
28750     case V8SF_FTYPE_V8SF_V8SF:
28751     case V8SF_FTYPE_V8SF_V8SI:
28752     case V4SI_FTYPE_V4SI_V4SI:
28753     case V4SI_FTYPE_V8HI_V8HI:
28754     case V4SI_FTYPE_V4SF_V4SF:
28755     case V4SI_FTYPE_V2DF_V2DF:
28756     case V4HI_FTYPE_V4HI_V4HI:
28757     case V4HI_FTYPE_V8QI_V8QI:
28758     case V4HI_FTYPE_V2SI_V2SI:
28759     case V4DF_FTYPE_V4DF_V4DF:
28760     case V4DF_FTYPE_V4DF_V4DI:
28761     case V4SF_FTYPE_V4SF_V4SF:
28762     case V4SF_FTYPE_V4SF_V4SI:
28763     case V4SF_FTYPE_V4SF_V2SI:
28764     case V4SF_FTYPE_V4SF_V2DF:
28765     case V4SF_FTYPE_V4SF_DI:
28766     case V4SF_FTYPE_V4SF_SI:
28767     case V2DI_FTYPE_V2DI_V2DI:
28768     case V2DI_FTYPE_V16QI_V16QI:
28769     case V2DI_FTYPE_V4SI_V4SI:
28770     case V2DI_FTYPE_V2DI_V16QI:
28771     case V2DI_FTYPE_V2DF_V2DF:
28772     case V2SI_FTYPE_V2SI_V2SI:
28773     case V2SI_FTYPE_V4HI_V4HI:
28774     case V2SI_FTYPE_V2SF_V2SF:
28775     case V2DF_FTYPE_V2DF_V2DF:
28776     case V2DF_FTYPE_V2DF_V4SF:
28777     case V2DF_FTYPE_V2DF_V2DI:
28778     case V2DF_FTYPE_V2DF_DI:
28779     case V2DF_FTYPE_V2DF_SI:
28780     case V2SF_FTYPE_V2SF_V2SF:
28781     case V1DI_FTYPE_V1DI_V1DI:
28782     case V1DI_FTYPE_V8QI_V8QI:
28783     case V1DI_FTYPE_V2SI_V2SI:
28784     case V32QI_FTYPE_V16HI_V16HI:
28785     case V16HI_FTYPE_V8SI_V8SI:
28786     case V32QI_FTYPE_V32QI_V32QI:
28787     case V16HI_FTYPE_V32QI_V32QI:
28788     case V16HI_FTYPE_V16HI_V16HI:
28789     case V8SI_FTYPE_V4DF_V4DF:
28790     case V8SI_FTYPE_V8SI_V8SI:
28791     case V8SI_FTYPE_V16HI_V16HI:
28792     case V4DI_FTYPE_V4DI_V4DI:
28793     case V4DI_FTYPE_V8SI_V8SI:
28794       if (comparison == UNKNOWN)
28795 	return ix86_expand_binop_builtin (icode, exp, target);
28796       nargs = 2;
28797       break;
28798     case V4SF_FTYPE_V4SF_V4SF_SWAP:
28799     case V2DF_FTYPE_V2DF_V2DF_SWAP:
28800       gcc_assert (comparison != UNKNOWN);
28801       nargs = 2;
28802       swap = true;
28803       break;
28804     case V16HI_FTYPE_V16HI_V8HI_COUNT:
28805     case V16HI_FTYPE_V16HI_SI_COUNT:
28806     case V8SI_FTYPE_V8SI_V4SI_COUNT:
28807     case V8SI_FTYPE_V8SI_SI_COUNT:
28808     case V4DI_FTYPE_V4DI_V2DI_COUNT:
28809     case V4DI_FTYPE_V4DI_INT_COUNT:
28810     case V8HI_FTYPE_V8HI_V8HI_COUNT:
28811     case V8HI_FTYPE_V8HI_SI_COUNT:
28812     case V4SI_FTYPE_V4SI_V4SI_COUNT:
28813     case V4SI_FTYPE_V4SI_SI_COUNT:
28814     case V4HI_FTYPE_V4HI_V4HI_COUNT:
28815     case V4HI_FTYPE_V4HI_SI_COUNT:
28816     case V2DI_FTYPE_V2DI_V2DI_COUNT:
28817     case V2DI_FTYPE_V2DI_SI_COUNT:
28818     case V2SI_FTYPE_V2SI_V2SI_COUNT:
28819     case V2SI_FTYPE_V2SI_SI_COUNT:
28820     case V1DI_FTYPE_V1DI_V1DI_COUNT:
28821     case V1DI_FTYPE_V1DI_SI_COUNT:
28822       nargs = 2;
28823       last_arg_count = true;
28824       break;
28825     case UINT64_FTYPE_UINT64_UINT64:
28826     case UINT_FTYPE_UINT_UINT:
28827     case UINT_FTYPE_UINT_USHORT:
28828     case UINT_FTYPE_UINT_UCHAR:
28829     case UINT16_FTYPE_UINT16_INT:
28830     case UINT8_FTYPE_UINT8_INT:
28831       nargs = 2;
28832       break;
28833     case V2DI_FTYPE_V2DI_INT_CONVERT:
28834       nargs = 2;
28835       rmode = V1TImode;
28836       nargs_constant = 1;
28837       break;
28838     case V4DI_FTYPE_V4DI_INT_CONVERT:
28839       nargs = 2;
28840       rmode = V2TImode;
28841       nargs_constant = 1;
28842       break;
28843     case V8HI_FTYPE_V8HI_INT:
28844     case V8HI_FTYPE_V8SF_INT:
28845     case V8HI_FTYPE_V4SF_INT:
28846     case V8SF_FTYPE_V8SF_INT:
28847     case V4SI_FTYPE_V4SI_INT:
28848     case V4SI_FTYPE_V8SI_INT:
28849     case V4HI_FTYPE_V4HI_INT:
28850     case V4DF_FTYPE_V4DF_INT:
28851     case V4SF_FTYPE_V4SF_INT:
28852     case V4SF_FTYPE_V8SF_INT:
28853     case V2DI_FTYPE_V2DI_INT:
28854     case V2DF_FTYPE_V2DF_INT:
28855     case V2DF_FTYPE_V4DF_INT:
28856     case V16HI_FTYPE_V16HI_INT:
28857     case V8SI_FTYPE_V8SI_INT:
28858     case V4DI_FTYPE_V4DI_INT:
28859     case V2DI_FTYPE_V4DI_INT:
28860       nargs = 2;
28861       nargs_constant = 1;
28862       break;
28863     case V16QI_FTYPE_V16QI_V16QI_V16QI:
28864     case V8SF_FTYPE_V8SF_V8SF_V8SF:
28865     case V4DF_FTYPE_V4DF_V4DF_V4DF:
28866     case V4SF_FTYPE_V4SF_V4SF_V4SF:
28867     case V2DF_FTYPE_V2DF_V2DF_V2DF:
28868     case V32QI_FTYPE_V32QI_V32QI_V32QI:
28869       nargs = 3;
28870       break;
28871     case V32QI_FTYPE_V32QI_V32QI_INT:
28872     case V16HI_FTYPE_V16HI_V16HI_INT:
28873     case V16QI_FTYPE_V16QI_V16QI_INT:
28874     case V4DI_FTYPE_V4DI_V4DI_INT:
28875     case V8HI_FTYPE_V8HI_V8HI_INT:
28876     case V8SI_FTYPE_V8SI_V8SI_INT:
28877     case V8SI_FTYPE_V8SI_V4SI_INT:
28878     case V8SF_FTYPE_V8SF_V8SF_INT:
28879     case V8SF_FTYPE_V8SF_V4SF_INT:
28880     case V4SI_FTYPE_V4SI_V4SI_INT:
28881     case V4DF_FTYPE_V4DF_V4DF_INT:
28882     case V4DF_FTYPE_V4DF_V2DF_INT:
28883     case V4SF_FTYPE_V4SF_V4SF_INT:
28884     case V2DI_FTYPE_V2DI_V2DI_INT:
28885     case V4DI_FTYPE_V4DI_V2DI_INT:
28886     case V2DF_FTYPE_V2DF_V2DF_INT:
28887       nargs = 3;
28888       nargs_constant = 1;
28889       break;
28890     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28891       nargs = 3;
28892       rmode = V4DImode;
28893       nargs_constant = 1;
28894       break;
28895     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28896       nargs = 3;
28897       rmode = V2DImode;
28898       nargs_constant = 1;
28899       break;
28900     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28901       nargs = 3;
28902       rmode = DImode;
28903       nargs_constant = 1;
28904       break;
28905     case V2DI_FTYPE_V2DI_UINT_UINT:
28906       nargs = 3;
28907       nargs_constant = 2;
28908       break;
28909     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28910     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28911     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28912     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28913       nargs = 4;
28914       nargs_constant = 1;
28915       break;
28916     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28917       nargs = 4;
28918       nargs_constant = 2;
28919       break;
28920     default:
28921       gcc_unreachable ();
28922     }
28923 
28924   gcc_assert (nargs <= ARRAY_SIZE (args));
28925 
28926   if (comparison != UNKNOWN)
28927     {
28928       gcc_assert (nargs == 2);
28929       return ix86_expand_sse_compare (d, exp, target, swap);
28930     }
28931 
28932   if (rmode == VOIDmode || rmode == tmode)
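  /* For the _CONVERT function types, rmode is the mode the result should
     live in; when it differs from the insn's operand 0 mode, allocate the
     result in rmode and hand the insn a tmode subreg of it.  */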
28933     {
28934       if (optimize
28935 	  || target == 0
28936 	  || GET_MODE (target) != tmode
28937 	  || !insn_p->operand[0].predicate (target, tmode))
28938 	target = gen_reg_rtx (tmode);
28939       real_target = target;
28940     }
28941   else
28942     {
28943       target = gen_reg_rtx (rmode);
28944       real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28945     }
28946 
28947   for (i = 0; i < nargs; i++)
28948     {
28949       tree arg = CALL_EXPR_ARG (exp, i);
28950       rtx op = expand_normal (arg);
28951       enum machine_mode mode = insn_p->operand[i + 1].mode;
28952       bool match = insn_p->operand[i + 1].predicate (op, mode);
28953 
28954       if (last_arg_count && (i + 1) == nargs)
28955 	{
28956 	  /* SIMD shift insns take either an 8-bit immediate or a register
28957 	     as the count, but the builtin functions take an int.  If the
28958 	     count operand doesn't match, put it in a register.  */
28959 	  if (!match)
28960 	    {
28961 	      op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28962 	      if (!insn_p->operand[i + 1].predicate (op, mode))
28963 		op = copy_to_reg (op);
28964 	    }
28965 	}
28966       else if ((nargs - i) <= nargs_constant)
28967 	{
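	  /* The trailing nargs_constant arguments must be immediates;
	     diagnose a non-constant operand with the immediate width the
	     insn requires.  */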
28968 	  if (!match)
28969 	    switch (icode)
28970 	      {
28971 	      case CODE_FOR_avx2_inserti128:
28972 	      case CODE_FOR_avx2_extracti128:
28973 		error ("the last argument must be a 1-bit immediate");
28974 		return const0_rtx;
28975 
28976 	      case CODE_FOR_sse4_1_roundsd:
28977 	      case CODE_FOR_sse4_1_roundss:
28978 
28979 	      case CODE_FOR_sse4_1_roundpd:
28980 	      case CODE_FOR_sse4_1_roundps:
28981 	      case CODE_FOR_avx_roundpd256:
28982 	      case CODE_FOR_avx_roundps256:
28983 
28984 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28985 	      case CODE_FOR_sse4_1_roundps_sfix:
28986 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28987 	      case CODE_FOR_avx_roundps_sfix256:
28988 
28989 	      case CODE_FOR_sse4_1_blendps:
28990 	      case CODE_FOR_avx_blendpd256:
28991 	      case CODE_FOR_avx_vpermilv4df:
28992 		error ("the last argument must be a 4-bit immediate");
28993 		return const0_rtx;
28994 
28995 	      case CODE_FOR_sse4_1_blendpd:
28996 	      case CODE_FOR_avx_vpermilv2df:
28997 	      case CODE_FOR_xop_vpermil2v2df3:
28998 	      case CODE_FOR_xop_vpermil2v4sf3:
28999 	      case CODE_FOR_xop_vpermil2v4df3:
29000 	      case CODE_FOR_xop_vpermil2v8sf3:
29001 		error ("the last argument must be a 2-bit immediate");
29002 		return const0_rtx;
29003 
29004 	      case CODE_FOR_avx_vextractf128v4df:
29005 	      case CODE_FOR_avx_vextractf128v8sf:
29006 	      case CODE_FOR_avx_vextractf128v8si:
29007 	      case CODE_FOR_avx_vinsertf128v4df:
29008 	      case CODE_FOR_avx_vinsertf128v8sf:
29009 	      case CODE_FOR_avx_vinsertf128v8si:
29010 		error ("the last argument must be a 1-bit immediate");
29011 		return const0_rtx;
29012 
29013 	      case CODE_FOR_avx_vmcmpv2df3:
29014 	      case CODE_FOR_avx_vmcmpv4sf3:
29015 	      case CODE_FOR_avx_cmpv2df3:
29016 	      case CODE_FOR_avx_cmpv4sf3:
29017 	      case CODE_FOR_avx_cmpv4df3:
29018 	      case CODE_FOR_avx_cmpv8sf3:
29019 		error ("the last argument must be a 5-bit immediate");
29020 		return const0_rtx;
29021 
29022 	      default:
29023 		switch (nargs_constant)
29024 		  {
29025 		  case 2:
29026 		    if ((nargs - i) == nargs_constant)
29027 		      {
29028 			error ("the next to last argument must be an 8-bit immediate");
29029 			break;
29030 		      }
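		    /* FALLTHRU */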
29031 		  case 1:
29032 		    error ("the last argument must be an 8-bit immediate");
29033 		    break;
29034 		  default:
29035 		    gcc_unreachable ();
29036 		  }
29037 		return const0_rtx;
29038 	      }
29039 	}
29040       else
29041 	{
29042 	  if (VECTOR_MODE_P (mode))
29043 	    op = safe_vector_operand (op, mode);
29044 
29045 	  /* If we aren't optimizing, only allow one memory operand to
29046 	     be generated.  */
29047 	  if (memory_operand (op, mode))
29048 	    num_memory++;
29049 
29050 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29051 	    {
29052 	      if (optimize || !match || num_memory > 1)
29053 		op = copy_to_mode_reg (mode, op);
29054 	    }
29055 	  else
29056 	    {
29057 	      op = copy_to_reg (op);
29058 	      op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29059 	    }
29060 	}
29061 
29062       args[i].op = op;
29063       args[i].mode = mode;
29064     }
29065 
29066   switch (nargs)
29067     {
29068     case 1:
29069       pat = GEN_FCN (icode) (real_target, args[0].op);
29070       break;
29071     case 2:
29072       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29073       break;
29074     case 3:
29075       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29076 			     args[2].op);
29077       break;
29078     case 4:
29079       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29080 			     args[2].op, args[3].op);
29081       break;
29082     default:
29083       gcc_unreachable ();
29084     }
29085 
29086   if (! pat)
29087     return 0;
29088 
29089   emit_insn (pat);
29090   return target;
29091 }
29092 
29093 /* Subroutine of ix86_expand_builtin to take care of special insns
29094    with variable number of operands.  */
29095 
29096 static rtx
29097 ix86_expand_special_args_builtin (const struct builtin_description *d,
29098 				    tree exp, rtx target)
29099 {
29100   tree arg;
29101   rtx pat, op;
29102   unsigned int i, nargs, arg_adjust, memory;
29103   struct
29104     {
29105       rtx op;
29106       enum machine_mode mode;
29107     } args[3];
29108   enum insn_code icode = d->icode;
29109   bool last_arg_constant = false;
29110   const struct insn_data_d *insn_p = &insn_data[icode];
29111   enum machine_mode tmode = insn_p->operand[0].mode;
29112   enum { load, store } klass;
29113 
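  /* klass says whether the builtin stores through its first argument or
     produces a value.  memory identifies the pointer operand that must be
     dereferenced: for loads it is an index into args, for stores a
     nonzero value means the store destination (the target) is a MEM.  */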
29114   switch ((enum ix86_builtin_func_type) d->flag)
29115     {
29116     case VOID_FTYPE_VOID:
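      /* The avx_vzeroupper pattern carries one const_int operand; here it
	 is set to vzeroupper_intrinsic (see the call_avx256_state enum
	 near the top of this file) so the AVX upper-state tracking can
	 tell this vzeroupper was requested explicitly by the user.  */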
29117       if (icode == CODE_FOR_avx_vzeroupper)
29118 	target = GEN_INT (vzeroupper_intrinsic);
29119       emit_insn (GEN_FCN (icode) (target));
29120       return 0;
29121     case VOID_FTYPE_UINT64:
29122     case VOID_FTYPE_UNSIGNED:
29123       nargs = 0;
29124       klass = store;
29125       memory = 0;
29126       break;
29127     case UINT64_FTYPE_VOID:
29128     case UNSIGNED_FTYPE_VOID:
29129       nargs = 0;
29130       klass = load;
29131       memory = 0;
29132       break;
29133     case UINT64_FTYPE_PUNSIGNED:
29134     case V2DI_FTYPE_PV2DI:
29135     case V4DI_FTYPE_PV4DI:
29136     case V32QI_FTYPE_PCCHAR:
29137     case V16QI_FTYPE_PCCHAR:
29138     case V8SF_FTYPE_PCV4SF:
29139     case V8SF_FTYPE_PCFLOAT:
29140     case V4SF_FTYPE_PCFLOAT:
29141     case V4DF_FTYPE_PCV2DF:
29142     case V4DF_FTYPE_PCDOUBLE:
29143     case V2DF_FTYPE_PCDOUBLE:
29144     case VOID_FTYPE_PVOID:
29145       nargs = 1;
29146       klass = load;
29147       memory = 0;
29148       break;
29149     case VOID_FTYPE_PV2SF_V4SF:
29150     case VOID_FTYPE_PV4DI_V4DI:
29151     case VOID_FTYPE_PV2DI_V2DI:
29152     case VOID_FTYPE_PCHAR_V32QI:
29153     case VOID_FTYPE_PCHAR_V16QI:
29154     case VOID_FTYPE_PFLOAT_V8SF:
29155     case VOID_FTYPE_PFLOAT_V4SF:
29156     case VOID_FTYPE_PDOUBLE_V4DF:
29157     case VOID_FTYPE_PDOUBLE_V2DF:
29158     case VOID_FTYPE_PLONGLONG_LONGLONG:
29159     case VOID_FTYPE_PULONGLONG_ULONGLONG:
29160     case VOID_FTYPE_PINT_INT:
29161       nargs = 1;
29162       klass = store;
29163       /* Reserve memory operand for target.  */
29164       memory = ARRAY_SIZE (args);
29165       break;
29166     case V4SF_FTYPE_V4SF_PCV2SF:
29167     case V2DF_FTYPE_V2DF_PCDOUBLE:
29168       nargs = 2;
29169       klass = load;
29170       memory = 1;
29171       break;
29172     case V8SF_FTYPE_PCV8SF_V8SI:
29173     case V4DF_FTYPE_PCV4DF_V4DI:
29174     case V4SF_FTYPE_PCV4SF_V4SI:
29175     case V2DF_FTYPE_PCV2DF_V2DI:
29176     case V8SI_FTYPE_PCV8SI_V8SI:
29177     case V4DI_FTYPE_PCV4DI_V4DI:
29178     case V4SI_FTYPE_PCV4SI_V4SI:
29179     case V2DI_FTYPE_PCV2DI_V2DI:
29180       nargs = 2;
29181       klass = load;
29182       memory = 0;
29183       break;
29184     case VOID_FTYPE_PV8SF_V8SI_V8SF:
29185     case VOID_FTYPE_PV4DF_V4DI_V4DF:
29186     case VOID_FTYPE_PV4SF_V4SI_V4SF:
29187     case VOID_FTYPE_PV2DF_V2DI_V2DF:
29188     case VOID_FTYPE_PV8SI_V8SI_V8SI:
29189     case VOID_FTYPE_PV4DI_V4DI_V4DI:
29190     case VOID_FTYPE_PV4SI_V4SI_V4SI:
29191     case VOID_FTYPE_PV2DI_V2DI_V2DI:
29192       nargs = 2;
29193       klass = store;
29194       /* Reserve memory operand for target.  */
29195       memory = ARRAY_SIZE (args);
29196       break;
29197     case VOID_FTYPE_UINT_UINT_UINT:
29198     case VOID_FTYPE_UINT64_UINT_UINT:
29199     case UCHAR_FTYPE_UINT_UINT_UINT:
29200     case UCHAR_FTYPE_UINT64_UINT_UINT:
29201       nargs = 3;
29202       klass = load;
29203       memory = ARRAY_SIZE (args);
29204       last_arg_constant = true;
29205       break;
29206     default:
29207       gcc_unreachable ();
29208     }
29209 
29210   gcc_assert (nargs <= ARRAY_SIZE (args));
29211 
29212   if (klass == store)
29213     {
29214       arg = CALL_EXPR_ARG (exp, 0);
29215       op = expand_normal (arg);
29216       gcc_assert (target == 0);
29217       if (memory)
29218 	{
29219 	  if (GET_MODE (op) != Pmode)
29220 	    op = convert_to_mode (Pmode, op, 1);
29221 	  target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29222 	}
29223       else
29224 	target = force_reg (tmode, op);
29225       arg_adjust = 1;
29226     }
29227   else
29228     {
29229       arg_adjust = 0;
29230       if (optimize
29231 	  || target == 0
29232 	  || !register_operand (target, tmode)
29233 	  || GET_MODE (target) != tmode)
29234 	target = gen_reg_rtx (tmode);
29235     }
29236 
29237   for (i = 0; i < nargs; i++)
29238     {
29239       enum machine_mode mode = insn_p->operand[i + 1].mode;
29240       bool match;
29241 
29242       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29243       op = expand_normal (arg);
29244       match = insn_p->operand[i + 1].predicate (op, mode);
29245 
29246       if (last_arg_constant && (i + 1) == nargs)
29247 	{
29248 	  if (!match)
29249 	    {
29250 	      if (icode == CODE_FOR_lwp_lwpvalsi3
29251 		  || icode == CODE_FOR_lwp_lwpinssi3
29252 		  || icode == CODE_FOR_lwp_lwpvaldi3
29253 		  || icode == CODE_FOR_lwp_lwpinsdi3)
29254 		error ("the last argument must be a 32-bit immediate");
29255 	      else
29256 		error ("the last argument must be an 8-bit immediate");
29257 	      return const0_rtx;
29258 	    }
29259 	}
29260       else
29261 	{
29262 	  if (i == memory)
29263 	    {
29264 	      /* This must be the memory operand.  */
29265 	      if (GET_MODE (op) != Pmode)
29266 		op = convert_to_mode (Pmode, op, 1);
29267 	      op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29268 	      gcc_assert (GET_MODE (op) == mode
29269 			  || GET_MODE (op) == VOIDmode);
29270 	    }
29271 	  else
29272 	    {
29273 	      /* This must be a register.  */
29274 	      if (VECTOR_MODE_P (mode))
29275 		op = safe_vector_operand (op, mode);
29276 
29277 	      gcc_assert (GET_MODE (op) == mode
29278 			  || GET_MODE (op) == VOIDmode);
29279 	      op = copy_to_mode_reg (mode, op);
29280 	    }
29281 	}
29282 
29283       args[i].op = op;
29284       args[i].mode = mode;
29285     }
29286 
29287   switch (nargs)
29288     {
29289     case 0:
29290       pat = GEN_FCN (icode) (target);
29291       break;
29292     case 1:
29293       pat = GEN_FCN (icode) (target, args[0].op);
29294       break;
29295     case 2:
29296       pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29297       break;
29298     case 3:
29299       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29300       break;
29301     default:
29302       gcc_unreachable ();
29303     }
29304 
29305   if (! pat)
29306     return 0;
29307   emit_insn (pat);
29308   return klass == store ? 0 : target;
29309 }
29310 
29311 /* Return the integer constant in ARG.  Constrain it to be in the range
29312    of the subparts of VEC_TYPE; issue an error if not.  */
29313 
29314 static int
29315 get_element_number (tree vec_type, tree arg)
29316 {
29317   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29318 
29319   if (!host_integerp (arg, 1)
29320       || (elt = tree_low_cst (arg, 1), elt > max))
29321     {
29322       error ("selector must be an integer constant in the range 0..%wi", max);
29323       return 0;
29324     }
29325 
29326   return elt;
29327 }
29328 
29329 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
29330    ix86_expand_vector_init.  We DO have language-level syntax for this, in
29331    the form of  (type){ init-list }.  Except that since we can't place emms
29332    instructions from inside the compiler, we can't allow the use of MMX
29333    registers unless the user explicitly asks for it.  So we do *not* define
29334    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
29335    we have builtins invoked by mmintrin.h that gives us license to emit
29336    we have builtins invoked by mmintrin.h that give us license to emit
29337 
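
/* For instance (an illustrative sketch, not code from this file): GCC's
   mmintrin.h implements _mm_set_pi16 roughly as
       return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
   and that builtin reaches ix86_expand_vec_init_builtin below.  */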
29338 static rtx
29339 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29340 {
29341   enum machine_mode tmode = TYPE_MODE (type);
29342   enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29343   int i, n_elt = GET_MODE_NUNITS (tmode);
29344   rtvec v = rtvec_alloc (n_elt);
29345 
29346   gcc_assert (VECTOR_MODE_P (tmode));
29347   gcc_assert (call_expr_nargs (exp) == n_elt);
29348 
29349   for (i = 0; i < n_elt; ++i)
29350     {
29351       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29352       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29353     }
29354 
29355   if (!target || !register_operand (target, tmode))
29356     target = gen_reg_rtx (tmode);
29357 
29358   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29359   return target;
29360 }
29361 
29362 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
29363    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
29364    had a language-level syntax for referencing vector elements.  */
29365 
29366 static rtx
29367 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29368 {
29369   enum machine_mode tmode, mode0;
29370   tree arg0, arg1;
29371   int elt;
29372   rtx op0;
29373 
29374   arg0 = CALL_EXPR_ARG (exp, 0);
29375   arg1 = CALL_EXPR_ARG (exp, 1);
29376 
29377   op0 = expand_normal (arg0);
29378   elt = get_element_number (TREE_TYPE (arg0), arg1);
29379 
29380   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29381   mode0 = TYPE_MODE (TREE_TYPE (arg0));
29382   gcc_assert (VECTOR_MODE_P (mode0));
29383 
29384   op0 = force_reg (mode0, op0);
29385 
29386   if (optimize || !target || !register_operand (target, tmode))
29387     target = gen_reg_rtx (tmode);
29388 
29389   ix86_expand_vector_extract (true, target, op0, elt);
29390 
29391   return target;
29392 }
29393 
29394 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
29395    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
29396    a language-level syntax for referencing vector elements.  */
29397 
29398 static rtx
29399 ix86_expand_vec_set_builtin (tree exp)
29400 {
29401   enum machine_mode tmode, mode1;
29402   tree arg0, arg1, arg2;
29403   int elt;
29404   rtx op0, op1, target;
29405 
29406   arg0 = CALL_EXPR_ARG (exp, 0);
29407   arg1 = CALL_EXPR_ARG (exp, 1);
29408   arg2 = CALL_EXPR_ARG (exp, 2);
29409 
29410   tmode = TYPE_MODE (TREE_TYPE (arg0));
29411   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29412   gcc_assert (VECTOR_MODE_P (tmode));
29413 
29414   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29415   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29416   elt = get_element_number (TREE_TYPE (arg0), arg2);
29417 
29418   if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29419     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29420 
29421   op0 = force_reg (tmode, op0);
29422   op1 = force_reg (mode1, op1);
29423 
29424   /* OP0 is the source of these builtin functions and shouldn't be
29425      modified.  Create a copy, use it and return it as target.  */
29426   target = gen_reg_rtx (tmode);
29427   emit_move_insn (target, op0);
29428   ix86_expand_vector_set (true, target, op1, elt);
29429 
29430   return target;
29431 }
29432 
29433 /* Expand an expression EXP that calls a built-in function,
29434    with result going to TARGET if that's convenient
29435    (and in mode MODE if that's convenient).
29436    SUBTARGET may be used as the target for computing one of EXP's operands.
29437    IGNORE is nonzero if the value is to be ignored.  */
29438 
29439 static rtx
29440 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29441 		     enum machine_mode mode ATTRIBUTE_UNUSED,
29442 		     int ignore ATTRIBUTE_UNUSED)
29443 {
29444   const struct builtin_description *d;
29445   size_t i;
29446   enum insn_code icode;
29447   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29448   tree arg0, arg1, arg2, arg3, arg4;
29449   rtx op0, op1, op2, op3, op4, pat;
29450   enum machine_mode mode0, mode1, mode2, mode3, mode4;
29451   unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29452 
29453   /* Determine whether the builtin function is available under the current ISA.
29454      Originally the builtin was not created if it wasn't applicable to the
29455      current ISA based on the command line switches.  With function specific
29456      options, we need to check in the context of the function making the call
29457      whether it is supported.  */
29458   if (ix86_builtins_isa[fcode].isa
29459       && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29460     {
29461       char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29462 				       NULL, (enum fpmath_unit) 0, false);
29463 
29464       if (!opts)
29465 	error ("%qE needs unknown isa option", fndecl);
29466       else
29467 	{
29468 	  gcc_assert (opts != NULL);
29469 	  error ("%qE needs isa option %s", fndecl, opts);
29470 	  free (opts);
29471 	}
29472       return const0_rtx;
29473     }
29474 
29475   switch (fcode)
29476     {
29477     case IX86_BUILTIN_MASKMOVQ:
29478     case IX86_BUILTIN_MASKMOVDQU:
29479       icode = (fcode == IX86_BUILTIN_MASKMOVQ
29480 	       ? CODE_FOR_mmx_maskmovq
29481 	       : CODE_FOR_sse2_maskmovdqu);
29482       /* Note the arg order is different from the operand order.  */
29483       arg1 = CALL_EXPR_ARG (exp, 0);
29484       arg2 = CALL_EXPR_ARG (exp, 1);
29485       arg0 = CALL_EXPR_ARG (exp, 2);
29486       op0 = expand_normal (arg0);
29487       op1 = expand_normal (arg1);
29488       op2 = expand_normal (arg2);
29489       mode0 = insn_data[icode].operand[0].mode;
29490       mode1 = insn_data[icode].operand[1].mode;
29491       mode2 = insn_data[icode].operand[2].mode;
29492 
29493       if (GET_MODE (op0) != Pmode)
29494 	op0 = convert_to_mode (Pmode, op0, 1);
29495       op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29496 
29497       if (!insn_data[icode].operand[0].predicate (op0, mode0))
29498 	op0 = copy_to_mode_reg (mode0, op0);
29499       if (!insn_data[icode].operand[1].predicate (op1, mode1))
29500 	op1 = copy_to_mode_reg (mode1, op1);
29501       if (!insn_data[icode].operand[2].predicate (op2, mode2))
29502 	op2 = copy_to_mode_reg (mode2, op2);
29503       pat = GEN_FCN (icode) (op0, op1, op2);
29504       if (! pat)
29505 	return 0;
29506       emit_insn (pat);
29507       return 0;
29508 
29509     case IX86_BUILTIN_LDMXCSR:
29510       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29511       target = assign_386_stack_local (SImode, SLOT_TEMP);
29512       emit_move_insn (target, op0);
29513       emit_insn (gen_sse_ldmxcsr (target));
29514       return 0;
29515 
29516     case IX86_BUILTIN_STMXCSR:
29517       target = assign_386_stack_local (SImode, SLOT_TEMP);
29518       emit_insn (gen_sse_stmxcsr (target));
29519       return copy_to_mode_reg (SImode, target);
29520 
29521     case IX86_BUILTIN_CLFLUSH:
29522       arg0 = CALL_EXPR_ARG (exp, 0);
29523       op0 = expand_normal (arg0);
29524       icode = CODE_FOR_sse2_clflush;
29525       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29526 	{
29527 	  if (GET_MODE (op0) != Pmode)
29528 	    op0 = convert_to_mode (Pmode, op0, 1);
29529 	  op0 = force_reg (Pmode, op0);
29530 	}
29531 
29532       emit_insn (gen_sse2_clflush (op0));
29533       return 0;
29534 
29535     case IX86_BUILTIN_MONITOR:
29536       arg0 = CALL_EXPR_ARG (exp, 0);
29537       arg1 = CALL_EXPR_ARG (exp, 1);
29538       arg2 = CALL_EXPR_ARG (exp, 2);
29539       op0 = expand_normal (arg0);
29540       op1 = expand_normal (arg1);
29541       op2 = expand_normal (arg2);
29542       if (!REG_P (op0))
29543 	{
29544 	  if (GET_MODE (op0) != Pmode)
29545 	    op0 = convert_to_mode (Pmode, op0, 1);
29546 	  op0 = force_reg (Pmode, op0);
29547 	}
29548       if (!REG_P (op1))
29549 	op1 = copy_to_mode_reg (SImode, op1);
29550       if (!REG_P (op2))
29551 	op2 = copy_to_mode_reg (SImode, op2);
29552       emit_insn (ix86_gen_monitor (op0, op1, op2));
29553       return 0;
29554 
29555     case IX86_BUILTIN_MWAIT:
29556       arg0 = CALL_EXPR_ARG (exp, 0);
29557       arg1 = CALL_EXPR_ARG (exp, 1);
29558       op0 = expand_normal (arg0);
29559       op1 = expand_normal (arg1);
29560       if (!REG_P (op0))
29561 	op0 = copy_to_mode_reg (SImode, op0);
29562       if (!REG_P (op1))
29563 	op1 = copy_to_mode_reg (SImode, op1);
29564       emit_insn (gen_sse3_mwait (op0, op1));
29565       return 0;
29566 
29567     case IX86_BUILTIN_VEC_INIT_V2SI:
29568     case IX86_BUILTIN_VEC_INIT_V4HI:
29569     case IX86_BUILTIN_VEC_INIT_V8QI:
29570       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29571 
29572     case IX86_BUILTIN_VEC_EXT_V2DF:
29573     case IX86_BUILTIN_VEC_EXT_V2DI:
29574     case IX86_BUILTIN_VEC_EXT_V4SF:
29575     case IX86_BUILTIN_VEC_EXT_V4SI:
29576     case IX86_BUILTIN_VEC_EXT_V8HI:
29577     case IX86_BUILTIN_VEC_EXT_V2SI:
29578     case IX86_BUILTIN_VEC_EXT_V4HI:
29579     case IX86_BUILTIN_VEC_EXT_V16QI:
29580       return ix86_expand_vec_ext_builtin (exp, target);
29581 
29582     case IX86_BUILTIN_VEC_SET_V2DI:
29583     case IX86_BUILTIN_VEC_SET_V4SF:
29584     case IX86_BUILTIN_VEC_SET_V4SI:
29585     case IX86_BUILTIN_VEC_SET_V8HI:
29586     case IX86_BUILTIN_VEC_SET_V4HI:
29587     case IX86_BUILTIN_VEC_SET_V16QI:
29588       return ix86_expand_vec_set_builtin (exp);
29589 
29590     case IX86_BUILTIN_INFQ:
29591     case IX86_BUILTIN_HUGE_VALQ:
29592       {
29593 	REAL_VALUE_TYPE inf;
29594 	rtx tmp;
29595 
29596 	real_inf (&inf);
29597 	tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29598 
29599 	tmp = validize_mem (force_const_mem (mode, tmp));
29600 
29601 	if (target == 0)
29602 	  target = gen_reg_rtx (mode);
29603 
29604 	emit_move_insn (target, tmp);
29605 	return target;
29606       }
29607 
29608     case IX86_BUILTIN_LLWPCB:
29609       arg0 = CALL_EXPR_ARG (exp, 0);
29610       op0 = expand_normal (arg0);
29611       icode = CODE_FOR_lwp_llwpcb;
29612       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29613 	{
29614 	  if (GET_MODE (op0) != Pmode)
29615 	    op0 = convert_to_mode (Pmode, op0, 1);
29616 	  op0 = force_reg (Pmode, op0);
29617 	}
29618       emit_insn (gen_lwp_llwpcb (op0));
29619       return 0;
29620 
29621     case IX86_BUILTIN_SLWPCB:
29622       icode = CODE_FOR_lwp_slwpcb;
29623       if (!target
29624 	  || !insn_data[icode].operand[0].predicate (target, Pmode))
29625 	target = gen_reg_rtx (Pmode);
29626       emit_insn (gen_lwp_slwpcb (target));
29627       return target;
29628 
29629     case IX86_BUILTIN_BEXTRI32:
29630     case IX86_BUILTIN_BEXTRI64:
29631       arg0 = CALL_EXPR_ARG (exp, 0);
29632       arg1 = CALL_EXPR_ARG (exp, 1);
29633       op0 = expand_normal (arg0);
29634       op1 = expand_normal (arg1);
29635       icode = (fcode == IX86_BUILTIN_BEXTRI32
29636 	  ? CODE_FOR_tbm_bextri_si
29637 	  : CODE_FOR_tbm_bextri_di);
29638       if (!CONST_INT_P (op1))
29639         {
29640           error ("last argument must be an immediate");
29641           return const0_rtx;
29642         }
29643       else
29644         {
29645           unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29646           unsigned char lsb_index = INTVAL (op1) & 0xFF;
29647           op1 = GEN_INT (length);
29648           op2 = GEN_INT (lsb_index);
29649           pat = GEN_FCN (icode) (target, op0, op1, op2);
29650           if (pat)
29651             emit_insn (pat);
29652           return target;
29653         }
29654 
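      /* Illustrative note (added for clarity, not in the original source):
	 the single immediate passed to __builtin_ia32_bextri_u32/u64 packs
	 the bit-field description as (length << 8) | lsb_index, e.g.

	   __builtin_ia32_bextri_u32 (x, (8 << 8) | 4);

	 extracts 8 bits starting at bit 4 of X.  The code above splits that
	 immediate into the two operands the tbm_bextri_* patterns expect.  */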
29655     case IX86_BUILTIN_RDRAND16_STEP:
29656       icode = CODE_FOR_rdrandhi_1;
29657       mode0 = HImode;
29658       goto rdrand_step;
29659 
29660     case IX86_BUILTIN_RDRAND32_STEP:
29661       icode = CODE_FOR_rdrandsi_1;
29662       mode0 = SImode;
29663       goto rdrand_step;
29664 
29665     case IX86_BUILTIN_RDRAND64_STEP:
29666       icode = CODE_FOR_rdranddi_1;
29667       mode0 = DImode;
29668 
29669 rdrand_step:
29670       op0 = gen_reg_rtx (mode0);
29671       emit_insn (GEN_FCN (icode) (op0));
29672 
29673       arg0 = CALL_EXPR_ARG (exp, 0);
29674       op1 = expand_normal (arg0);
29675       if (!address_operand (op1, VOIDmode))
29676 	{
29677 	  op1 = convert_memory_address (Pmode, op1);
29678 	  op1 = copy_addr_to_reg (op1);
29679 	}
29680       emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29681 
29682       op1 = gen_reg_rtx (SImode);
29683       emit_move_insn (op1, CONST1_RTX (SImode));
29684 
29685       /* Emit SImode conditional move.  */
29686       if (mode0 == HImode)
29687 	{
29688 	  op2 = gen_reg_rtx (SImode);
29689 	  emit_insn (gen_zero_extendhisi2 (op2, op0));
29690 	}
29691       else if (mode0 == SImode)
29692 	op2 = op0;
29693       else
29694 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
29695 
29696       if (target == 0
29697 	  || !register_operand (target, SImode))
29698 	target = gen_reg_rtx (SImode);
29699 
29700       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29701 			 const0_rtx);
29702       emit_insn (gen_rtx_SET (VOIDmode, target,
29703 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29704       return target;
29705 
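      /* Illustrative note (added for clarity, not in the original source):
	 the sequence above implements the _rdrand*_step style intrinsics.
	 Roughly:

	   unsigned int val;
	   if (__builtin_ia32_rdrand32_step (&val))  // 1: VAL is random
	     use (val);                              // 0: no entropy ready

	 The random value is stored through the pointer argument, and the
	 carry flag set by rdrand is converted into the 0/1 return value by
	 the conditional move emitted above.  */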
29706     case IX86_BUILTIN_GATHERSIV2DF:
29707       icode = CODE_FOR_avx2_gathersiv2df;
29708       goto gather_gen;
29709     case IX86_BUILTIN_GATHERSIV4DF:
29710       icode = CODE_FOR_avx2_gathersiv4df;
29711       goto gather_gen;
29712     case IX86_BUILTIN_GATHERDIV2DF:
29713       icode = CODE_FOR_avx2_gatherdiv2df;
29714       goto gather_gen;
29715     case IX86_BUILTIN_GATHERDIV4DF:
29716       icode = CODE_FOR_avx2_gatherdiv4df;
29717       goto gather_gen;
29718     case IX86_BUILTIN_GATHERSIV4SF:
29719       icode = CODE_FOR_avx2_gathersiv4sf;
29720       goto gather_gen;
29721     case IX86_BUILTIN_GATHERSIV8SF:
29722       icode = CODE_FOR_avx2_gathersiv8sf;
29723       goto gather_gen;
29724     case IX86_BUILTIN_GATHERDIV4SF:
29725       icode = CODE_FOR_avx2_gatherdiv4sf;
29726       goto gather_gen;
29727     case IX86_BUILTIN_GATHERDIV8SF:
29728       icode = CODE_FOR_avx2_gatherdiv8sf;
29729       goto gather_gen;
29730     case IX86_BUILTIN_GATHERSIV2DI:
29731       icode = CODE_FOR_avx2_gathersiv2di;
29732       goto gather_gen;
29733     case IX86_BUILTIN_GATHERSIV4DI:
29734       icode = CODE_FOR_avx2_gathersiv4di;
29735       goto gather_gen;
29736     case IX86_BUILTIN_GATHERDIV2DI:
29737       icode = CODE_FOR_avx2_gatherdiv2di;
29738       goto gather_gen;
29739     case IX86_BUILTIN_GATHERDIV4DI:
29740       icode = CODE_FOR_avx2_gatherdiv4di;
29741       goto gather_gen;
29742     case IX86_BUILTIN_GATHERSIV4SI:
29743       icode = CODE_FOR_avx2_gathersiv4si;
29744       goto gather_gen;
29745     case IX86_BUILTIN_GATHERSIV8SI:
29746       icode = CODE_FOR_avx2_gathersiv8si;
29747       goto gather_gen;
29748     case IX86_BUILTIN_GATHERDIV4SI:
29749       icode = CODE_FOR_avx2_gatherdiv4si;
29750       goto gather_gen;
29751     case IX86_BUILTIN_GATHERDIV8SI:
29752       icode = CODE_FOR_avx2_gatherdiv8si;
29753       goto gather_gen;
29754     case IX86_BUILTIN_GATHERALTSIV4DF:
29755       icode = CODE_FOR_avx2_gathersiv4df;
29756       goto gather_gen;
29757     case IX86_BUILTIN_GATHERALTDIV8SF:
29758       icode = CODE_FOR_avx2_gatherdiv8sf;
29759       goto gather_gen;
29760     case IX86_BUILTIN_GATHERALTSIV4DI:
29761       icode = CODE_FOR_avx2_gathersiv4di;
29762       goto gather_gen;
29763     case IX86_BUILTIN_GATHERALTDIV8SI:
29764       icode = CODE_FOR_avx2_gatherdiv8si;
29765       goto gather_gen;
29766 
29767     gather_gen:
29768       arg0 = CALL_EXPR_ARG (exp, 0);
29769       arg1 = CALL_EXPR_ARG (exp, 1);
29770       arg2 = CALL_EXPR_ARG (exp, 2);
29771       arg3 = CALL_EXPR_ARG (exp, 3);
29772       arg4 = CALL_EXPR_ARG (exp, 4);
29773       op0 = expand_normal (arg0);
29774       op1 = expand_normal (arg1);
29775       op2 = expand_normal (arg2);
29776       op3 = expand_normal (arg3);
29777       op4 = expand_normal (arg4);
29778       /* Note the arg order is different from the operand order.  */
29779       mode0 = insn_data[icode].operand[1].mode;
29780       mode2 = insn_data[icode].operand[3].mode;
29781       mode3 = insn_data[icode].operand[4].mode;
29782       mode4 = insn_data[icode].operand[5].mode;
29783 
29784       if (target == NULL_RTX
29785 	  || GET_MODE (target) != insn_data[icode].operand[0].mode
29786 	  || !insn_data[icode].operand[0].predicate (target,
29787 						     GET_MODE (target)))
29788 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29789       else
29790 	subtarget = target;
29791 
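      /* Note added for clarity (not in the original source): for the
	 GATHERALT* variants the vectorizer hands over operands whose vector
	 length does not match the underlying gather pattern.  The ALTSIV
	 forms receive a V8SI index for a 4-element gather, and the ALTDIV
	 forms receive 8-element source/mask operands for a 4-element result,
	 so the low halves are extracted below before the operands are passed
	 to the insn.  */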
29792       if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29793 	  || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29794 	{
29795 	  rtx half = gen_reg_rtx (V4SImode);
29796 	  if (!nonimmediate_operand (op2, V8SImode))
29797 	    op2 = copy_to_mode_reg (V8SImode, op2);
29798 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
29799 	  op2 = half;
29800 	}
29801       else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29802 	       || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29803 	{
29804 	  rtx (*gen) (rtx, rtx);
29805 	  rtx half = gen_reg_rtx (mode0);
29806 	  if (mode0 == V4SFmode)
29807 	    gen = gen_vec_extract_lo_v8sf;
29808 	  else
29809 	    gen = gen_vec_extract_lo_v8si;
29810 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
29811 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29812 	  emit_insn (gen (half, op0));
29813 	  op0 = half;
29814 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
29815 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29816 	  emit_insn (gen (half, op3));
29817 	  op3 = half;
29818 	}
29819 
29820       /* Force the memory operand to use only a base register here.  We
29821 	 don't want to do this for the memory operands of other builtin
29822 	 functions.  */
29823       if (GET_MODE (op1) != Pmode)
29824 	op1 = convert_to_mode (Pmode, op1, 1);
29825       op1 = force_reg (Pmode, op1);
29826 
29827       if (!insn_data[icode].operand[1].predicate (op0, mode0))
29828 	op0 = copy_to_mode_reg (mode0, op0);
29829       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29830 	op1 = copy_to_mode_reg (Pmode, op1);
29831       if (!insn_data[icode].operand[3].predicate (op2, mode2))
29832 	op2 = copy_to_mode_reg (mode2, op2);
29833       if (!insn_data[icode].operand[4].predicate (op3, mode3))
29834 	op3 = copy_to_mode_reg (mode3, op3);
29835       if (!insn_data[icode].operand[5].predicate (op4, mode4))
29836 	{
29837           error ("last argument must be scale 1, 2, 4, 8");
29838           return const0_rtx;
29839 	}
29840 
29841       /* Optimize.  If mask is known to have all high bits set,
29842 	 replace op0 with pc_rtx to signal that the instruction
29843 	 overwrites the whole destination and doesn't use its
29844 	 previous contents.  */
29845       if (optimize)
29846 	{
29847 	  if (TREE_CODE (arg3) == VECTOR_CST)
29848 	    {
29849 	      tree elt;
29850 	      unsigned int negative = 0;
29851 	      for (elt = TREE_VECTOR_CST_ELTS (arg3);
29852 		   elt; elt = TREE_CHAIN (elt))
29853 		{
29854 		  tree cst = TREE_VALUE (elt);
29855 		  if (TREE_CODE (cst) == INTEGER_CST
29856 		      && tree_int_cst_sign_bit (cst))
29857 		    negative++;
29858 		  else if (TREE_CODE (cst) == REAL_CST
29859 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29860 		    negative++;
29861 		}
29862 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29863 		op0 = pc_rtx;
29864 	    }
29865 	  else if (TREE_CODE (arg3) == SSA_NAME)
29866 	    {
29867 	      /* Recognize also when mask is like:
29868 		 __v2df src = _mm_setzero_pd ();
29869 		 __v2df mask = _mm_cmpeq_pd (src, src);
29870 		 or
29871 		 __v8sf src = _mm256_setzero_ps ();
29872 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29873 		 as that is a cheaper way to load all ones into
29874 		 a register than having to load a constant from
29875 		 memory.  */
29876 	      gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29877 	      if (is_gimple_call (def_stmt))
29878 		{
29879 		  tree fndecl = gimple_call_fndecl (def_stmt);
29880 		  if (fndecl
29881 		      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29882 		    switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29883 		      {
29884 		      case IX86_BUILTIN_CMPPD:
29885 		      case IX86_BUILTIN_CMPPS:
29886 		      case IX86_BUILTIN_CMPPD256:
29887 		      case IX86_BUILTIN_CMPPS256:
29888 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29889 			  break;
29890 			/* FALLTHRU */
29891 		      case IX86_BUILTIN_CMPEQPD:
29892 		      case IX86_BUILTIN_CMPEQPS:
29893 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29894 			    && initializer_zerop (gimple_call_arg (def_stmt,
29895 								   1)))
29896 			  op0 = pc_rtx;
29897 			break;
29898 		      default:
29899 			break;
29900 		      }
29901 		}
29902 	    }
29903 	}
29904 
29905       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29906       if (! pat)
29907 	return const0_rtx;
29908       emit_insn (pat);
29909 
29910       if (fcode == IX86_BUILTIN_GATHERDIV8SF
29911 	  || fcode == IX86_BUILTIN_GATHERDIV8SI)
29912 	{
29913 	  enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29914 				    ? V4SFmode : V4SImode;
29915 	  if (target == NULL_RTX)
29916 	    target = gen_reg_rtx (tmode);
29917 	  if (tmode == V4SFmode)
29918 	    emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29919 	  else
29920 	    emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29921 	}
29922       else
29923 	target = subtarget;
29924 
29925       return target;
29926 
29927     default:
29928       break;
29929     }
29930 
29931   for (i = 0, d = bdesc_special_args;
29932        i < ARRAY_SIZE (bdesc_special_args);
29933        i++, d++)
29934     if (d->code == fcode)
29935       return ix86_expand_special_args_builtin (d, exp, target);
29936 
29937   for (i = 0, d = bdesc_args;
29938        i < ARRAY_SIZE (bdesc_args);
29939        i++, d++)
29940     if (d->code == fcode)
29941       switch (fcode)
29942 	{
29943 	case IX86_BUILTIN_FABSQ:
29944 	case IX86_BUILTIN_COPYSIGNQ:
29945 	  if (!TARGET_SSE2)
29946 	    /* Emit a normal call if SSE2 isn't available.  */
29947 	    return expand_call (exp, target, ignore);
29948 	default:
29949 	  return ix86_expand_args_builtin (d, exp, target);
29950 	}
29951 
29952   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29953     if (d->code == fcode)
29954       return ix86_expand_sse_comi (d, exp, target);
29955 
29956   for (i = 0, d = bdesc_pcmpestr;
29957        i < ARRAY_SIZE (bdesc_pcmpestr);
29958        i++, d++)
29959     if (d->code == fcode)
29960       return ix86_expand_sse_pcmpestr (d, exp, target);
29961 
29962   for (i = 0, d = bdesc_pcmpistr;
29963        i < ARRAY_SIZE (bdesc_pcmpistr);
29964        i++, d++)
29965     if (d->code == fcode)
29966       return ix86_expand_sse_pcmpistr (d, exp, target);
29967 
29968   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29969     if (d->code == fcode)
29970       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29971 					    (enum ix86_builtin_func_type)
29972 					    d->flag, d->comparison);
29973 
29974   gcc_unreachable ();
29975 }
29976 
29977 /* Returns a function decl for a vectorized version of the builtin function
29978    FNDECL with result vector type TYPE_OUT and argument vector type TYPE_IN,
29979    or NULL_TREE if it is not available.  */
29980 
29981 static tree
29982 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29983 				  tree type_in)
29984 {
29985   enum machine_mode in_mode, out_mode;
29986   int in_n, out_n;
29987   enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29988 
29989   if (TREE_CODE (type_out) != VECTOR_TYPE
29990       || TREE_CODE (type_in) != VECTOR_TYPE
29991       || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29992     return NULL_TREE;
29993 
29994   out_mode = TYPE_MODE (TREE_TYPE (type_out));
29995   out_n = TYPE_VECTOR_SUBPARTS (type_out);
29996   in_mode = TYPE_MODE (TREE_TYPE (type_in));
29997   in_n = TYPE_VECTOR_SUBPARTS (type_in);
29998 
29999   switch (fn)
30000     {
30001     case BUILT_IN_SQRT:
30002       if (out_mode == DFmode && in_mode == DFmode)
30003 	{
30004 	  if (out_n == 2 && in_n == 2)
30005 	    return ix86_builtins[IX86_BUILTIN_SQRTPD];
30006 	  else if (out_n == 4 && in_n == 4)
30007 	    return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30008 	}
30009       break;
30010 
30011     case BUILT_IN_SQRTF:
30012       if (out_mode == SFmode && in_mode == SFmode)
30013 	{
30014 	  if (out_n == 4 && in_n == 4)
30015 	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30016 	  else if (out_n == 8 && in_n == 8)
30017 	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30018 	}
30019       break;
30020 
30021     case BUILT_IN_IFLOOR:
30022     case BUILT_IN_LFLOOR:
30023     case BUILT_IN_LLFLOOR:
30024       /* The round insn does not trap on denormals.  */
30025       if (flag_trapping_math || !TARGET_ROUND)
30026 	break;
30027 
30028       if (out_mode == SImode && in_mode == DFmode)
30029 	{
30030 	  if (out_n == 4 && in_n == 2)
30031 	    return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30032 	  else if (out_n == 8 && in_n == 4)
30033 	    return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30034 	}
30035       break;
30036 
30037     case BUILT_IN_IFLOORF:
30038     case BUILT_IN_LFLOORF:
30039     case BUILT_IN_LLFLOORF:
30040       /* The round insn does not trap on denormals.  */
30041       if (flag_trapping_math || !TARGET_ROUND)
30042 	break;
30043 
30044       if (out_mode == SImode && in_mode == SFmode)
30045 	{
30046 	  if (out_n == 4 && in_n == 4)
30047 	    return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30048 	  else if (out_n == 8 && in_n == 8)
30049 	    return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30050 	}
30051       break;
30052 
30053     case BUILT_IN_ICEIL:
30054     case BUILT_IN_LCEIL:
30055     case BUILT_IN_LLCEIL:
30056       /* The round insn does not trap on denormals.  */
30057       if (flag_trapping_math || !TARGET_ROUND)
30058 	break;
30059 
30060       if (out_mode == SImode && in_mode == DFmode)
30061 	{
30062 	  if (out_n == 4 && in_n == 2)
30063 	    return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30064 	  else if (out_n == 8 && in_n == 4)
30065 	    return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30066 	}
30067       break;
30068 
30069     case BUILT_IN_ICEILF:
30070     case BUILT_IN_LCEILF:
30071     case BUILT_IN_LLCEILF:
30072       /* The round insn does not trap on denormals.  */
30073       if (flag_trapping_math || !TARGET_ROUND)
30074 	break;
30075 
30076       if (out_mode == SImode && in_mode == SFmode)
30077 	{
30078 	  if (out_n == 4 && in_n == 4)
30079 	    return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30080 	  else if (out_n == 8 && in_n == 8)
30081 	    return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30082 	}
30083       break;
30084 
30085     case BUILT_IN_IRINT:
30086     case BUILT_IN_LRINT:
30087     case BUILT_IN_LLRINT:
30088       if (out_mode == SImode && in_mode == DFmode)
30089 	{
30090 	  if (out_n == 4 && in_n == 2)
30091 	    return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30092 	  else if (out_n == 8 && in_n == 4)
30093 	    return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30094 	}
30095       break;
30096 
30097     case BUILT_IN_IRINTF:
30098     case BUILT_IN_LRINTF:
30099     case BUILT_IN_LLRINTF:
30100       if (out_mode == SImode && in_mode == SFmode)
30101 	{
30102 	  if (out_n == 4 && in_n == 4)
30103 	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30104 	  else if (out_n == 8 && in_n == 8)
30105 	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30106 	}
30107       break;
30108 
30109     case BUILT_IN_IROUND:
30110     case BUILT_IN_LROUND:
30111     case BUILT_IN_LLROUND:
30112       /* The round insn does not trap on denormals.  */
30113       if (flag_trapping_math || !TARGET_ROUND)
30114 	break;
30115 
30116       if (out_mode == SImode && in_mode == DFmode)
30117 	{
30118 	  if (out_n == 4 && in_n == 2)
30119 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30120 	  else if (out_n == 8 && in_n == 4)
30121 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30122 	}
30123       break;
30124 
30125     case BUILT_IN_IROUNDF:
30126     case BUILT_IN_LROUNDF:
30127     case BUILT_IN_LLROUNDF:
30128       /* The round insn does not trap on denormals.  */
30129       if (flag_trapping_math || !TARGET_ROUND)
30130 	break;
30131 
30132       if (out_mode == SImode && in_mode == SFmode)
30133 	{
30134 	  if (out_n == 4 && in_n == 4)
30135 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30136 	  else if (out_n == 8 && in_n == 8)
30137 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30138 	}
30139       break;
30140 
30141     case BUILT_IN_COPYSIGN:
30142       if (out_mode == DFmode && in_mode == DFmode)
30143 	{
30144 	  if (out_n == 2 && in_n == 2)
30145 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30146 	  else if (out_n == 4 && in_n == 4)
30147 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30148 	}
30149       break;
30150 
30151     case BUILT_IN_COPYSIGNF:
30152       if (out_mode == SFmode && in_mode == SFmode)
30153 	{
30154 	  if (out_n == 4 && in_n == 4)
30155 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30156 	  else if (out_n == 8 && in_n == 8)
30157 	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30158 	}
30159       break;
30160 
30161     case BUILT_IN_FLOOR:
30162       /* The round insn does not trap on denormals.  */
30163       if (flag_trapping_math || !TARGET_ROUND)
30164 	break;
30165 
30166       if (out_mode == DFmode && in_mode == DFmode)
30167 	{
30168 	  if (out_n == 2 && in_n == 2)
30169 	    return ix86_builtins[IX86_BUILTIN_FLOORPD];
30170 	  else if (out_n == 4 && in_n == 4)
30171 	    return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30172 	}
30173       break;
30174 
30175     case BUILT_IN_FLOORF:
30176       /* The round insn does not trap on denormals.  */
30177       if (flag_trapping_math || !TARGET_ROUND)
30178 	break;
30179 
30180       if (out_mode == SFmode && in_mode == SFmode)
30181 	{
30182 	  if (out_n == 4 && in_n == 4)
30183 	    return ix86_builtins[IX86_BUILTIN_FLOORPS];
30184 	  else if (out_n == 8 && in_n == 8)
30185 	    return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30186 	}
30187       break;
30188 
30189     case BUILT_IN_CEIL:
30190       /* The round insn does not trap on denormals.  */
30191       if (flag_trapping_math || !TARGET_ROUND)
30192 	break;
30193 
30194       if (out_mode == DFmode && in_mode == DFmode)
30195 	{
30196 	  if (out_n == 2 && in_n == 2)
30197 	    return ix86_builtins[IX86_BUILTIN_CEILPD];
30198 	  else if (out_n == 4 && in_n == 4)
30199 	    return ix86_builtins[IX86_BUILTIN_CEILPD256];
30200 	}
30201       break;
30202 
30203     case BUILT_IN_CEILF:
30204       /* The round insn does not trap on denormals.  */
30205       if (flag_trapping_math || !TARGET_ROUND)
30206 	break;
30207 
30208       if (out_mode == SFmode && in_mode == SFmode)
30209 	{
30210 	  if (out_n == 4 && in_n == 4)
30211 	    return ix86_builtins[IX86_BUILTIN_CEILPS];
30212 	  else if (out_n == 8 && in_n == 8)
30213 	    return ix86_builtins[IX86_BUILTIN_CEILPS256];
30214 	}
30215       break;
30216 
30217     case BUILT_IN_TRUNC:
30218       /* The round insn does not trap on denormals.  */
30219       if (flag_trapping_math || !TARGET_ROUND)
30220 	break;
30221 
30222       if (out_mode == DFmode && in_mode == DFmode)
30223 	{
30224 	  if (out_n == 2 && in_n == 2)
30225 	    return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30226 	  else if (out_n == 4 && in_n == 4)
30227 	    return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30228 	}
30229       break;
30230 
30231     case BUILT_IN_TRUNCF:
30232       /* The round insn does not trap on denormals.  */
30233       if (flag_trapping_math || !TARGET_ROUND)
30234 	break;
30235 
30236       if (out_mode == SFmode && in_mode == SFmode)
30237 	{
30238 	  if (out_n == 4 && in_n == 4)
30239 	    return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30240 	  else if (out_n == 8 && in_n == 8)
30241 	    return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30242 	}
30243       break;
30244 
30245     case BUILT_IN_RINT:
30246       /* The round insn does not trap on denormals.  */
30247       if (flag_trapping_math || !TARGET_ROUND)
30248 	break;
30249 
30250       if (out_mode == DFmode && in_mode == DFmode)
30251 	{
30252 	  if (out_n == 2 && in_n == 2)
30253 	    return ix86_builtins[IX86_BUILTIN_RINTPD];
30254 	  else if (out_n == 4 && in_n == 4)
30255 	    return ix86_builtins[IX86_BUILTIN_RINTPD256];
30256 	}
30257       break;
30258 
30259     case BUILT_IN_RINTF:
30260       /* The round insn does not trap on denormals.  */
30261       if (flag_trapping_math || !TARGET_ROUND)
30262 	break;
30263 
30264       if (out_mode == SFmode && in_mode == SFmode)
30265 	{
30266 	  if (out_n == 4 && in_n == 4)
30267 	    return ix86_builtins[IX86_BUILTIN_RINTPS];
30268 	  else if (out_n == 8 && in_n == 8)
30269 	    return ix86_builtins[IX86_BUILTIN_RINTPS256];
30270 	}
30271       break;
30272 
30273     case BUILT_IN_ROUND:
30274       /* The round insn does not trap on denormals.  */
30275       if (flag_trapping_math || !TARGET_ROUND)
30276 	break;
30277 
30278       if (out_mode == DFmode && in_mode == DFmode)
30279 	{
30280 	  if (out_n == 2 && in_n == 2)
30281 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30282 	  else if (out_n == 4 && in_n == 4)
30283 	    return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30284 	}
30285       break;
30286 
30287     case BUILT_IN_ROUNDF:
30288       /* The round insn does not trap on denormals.  */
30289       if (flag_trapping_math || !TARGET_ROUND)
30290 	break;
30291 
30292       if (out_mode == SFmode && in_mode == SFmode)
30293 	{
30294 	  if (out_n == 4 && in_n == 4)
30295 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30296 	  else if (out_n == 8 && in_n == 8)
30297 	    return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30298 	}
30299       break;
30300 
30301     case BUILT_IN_FMA:
30302       if (out_mode == DFmode && in_mode == DFmode)
30303 	{
30304 	  if (out_n == 2 && in_n == 2)
30305 	    return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30306 	  if (out_n == 4 && in_n == 4)
30307 	    return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30308 	}
30309       break;
30310 
30311     case BUILT_IN_FMAF:
30312       if (out_mode == SFmode && in_mode == SFmode)
30313 	{
30314 	  if (out_n == 4 && in_n == 4)
30315 	    return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30316 	  if (out_n == 8 && in_n == 8)
30317 	    return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30318 	}
30319       break;
30320 
30321     default:
30322       break;
30323     }
30324 
30325   /* Dispatch to a handler for a vectorization library.  */
30326   if (ix86_veclib_handler)
30327     return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30328 				type_in);
30329 
30330   return NULL_TREE;
30331 }
30332 
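/* Illustrative example (added for clarity, not in the original source):
   given

     for (i = 0; i < n; i++)
       out[i] = sqrt (in[i]);      // double, i.e. BUILT_IN_SQRT

   the vectorizer asks this hook for a V2DF -> V2DF (or V4DF -> V4DF with
   AVX) variant and receives the decl of __builtin_ia32_sqrtpd
   (resp. __builtin_ia32_sqrtpd256), which then becomes the call target in
   the vectorized loop body.  */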
30333 /* Handler for an SVML-style interface to
30334    a library with vectorized intrinsics.  */
30335 
30336 static tree
30337 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30338 {
30339   char name[20];
30340   tree fntype, new_fndecl, args;
30341   unsigned arity;
30342   const char *bname;
30343   enum machine_mode el_mode, in_mode;
30344   int n, in_n;
30345 
30346   /* The SVML is suitable for unsafe math only.  */
30347   if (!flag_unsafe_math_optimizations)
30348     return NULL_TREE;
30349 
30350   el_mode = TYPE_MODE (TREE_TYPE (type_out));
30351   n = TYPE_VECTOR_SUBPARTS (type_out);
30352   in_mode = TYPE_MODE (TREE_TYPE (type_in));
30353   in_n = TYPE_VECTOR_SUBPARTS (type_in);
30354   if (el_mode != in_mode
30355       || n != in_n)
30356     return NULL_TREE;
30357 
30358   switch (fn)
30359     {
30360     case BUILT_IN_EXP:
30361     case BUILT_IN_LOG:
30362     case BUILT_IN_LOG10:
30363     case BUILT_IN_POW:
30364     case BUILT_IN_TANH:
30365     case BUILT_IN_TAN:
30366     case BUILT_IN_ATAN:
30367     case BUILT_IN_ATAN2:
30368     case BUILT_IN_ATANH:
30369     case BUILT_IN_CBRT:
30370     case BUILT_IN_SINH:
30371     case BUILT_IN_SIN:
30372     case BUILT_IN_ASINH:
30373     case BUILT_IN_ASIN:
30374     case BUILT_IN_COSH:
30375     case BUILT_IN_COS:
30376     case BUILT_IN_ACOSH:
30377     case BUILT_IN_ACOS:
30378       if (el_mode != DFmode || n != 2)
30379 	return NULL_TREE;
30380       break;
30381 
30382     case BUILT_IN_EXPF:
30383     case BUILT_IN_LOGF:
30384     case BUILT_IN_LOG10F:
30385     case BUILT_IN_POWF:
30386     case BUILT_IN_TANHF:
30387     case BUILT_IN_TANF:
30388     case BUILT_IN_ATANF:
30389     case BUILT_IN_ATAN2F:
30390     case BUILT_IN_ATANHF:
30391     case BUILT_IN_CBRTF:
30392     case BUILT_IN_SINHF:
30393     case BUILT_IN_SINF:
30394     case BUILT_IN_ASINHF:
30395     case BUILT_IN_ASINF:
30396     case BUILT_IN_COSHF:
30397     case BUILT_IN_COSF:
30398     case BUILT_IN_ACOSHF:
30399     case BUILT_IN_ACOSF:
30400       if (el_mode != SFmode || n != 4)
30401 	return NULL_TREE;
30402       break;
30403 
30404     default:
30405       return NULL_TREE;
30406     }
30407 
30408   bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30409 
30410   if (fn == BUILT_IN_LOGF)
30411     strcpy (name, "vmlsLn4");
30412   else if (fn == BUILT_IN_LOG)
30413     strcpy (name, "vmldLn2");
30414   else if (n == 4)
30415     {
30416       sprintf (name, "vmls%s", bname+10);
30417       name[strlen (name)-1] = '4';
30418     }
30419   else
30420     sprintf (name, "vmld%s2", bname+10);
30421 
30422   /* Convert to uppercase. */
30423   name[4] &= ~0x20;
30424 
30425   arity = 0;
30426   for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30427        args;
30428        args = TREE_CHAIN (args))
30429     arity++;
30430 
30431   if (arity == 1)
30432     fntype = build_function_type_list (type_out, type_in, NULL);
30433   else
30434     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30435 
30436   /* Build a function declaration for the vectorized function.  */
30437   new_fndecl = build_decl (BUILTINS_LOCATION,
30438 			   FUNCTION_DECL, get_identifier (name), fntype);
30439   TREE_PUBLIC (new_fndecl) = 1;
30440   DECL_EXTERNAL (new_fndecl) = 1;
30441   DECL_IS_NOVOPS (new_fndecl) = 1;
30442   TREE_READONLY (new_fndecl) = 1;
30443 
30444   return new_fndecl;
30445 }
30446 
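/* Illustrative example (added for clarity, not in the original source):
   for BUILT_IN_SIN with 2 x DFmode the code above strips the "__builtin_"
   prefix from "__builtin_sin" and builds the SVML entry point "vmldSin2";
   BUILT_IN_SINF with 4 x SFmode becomes "vmlsSin4" (the trailing 'f' is
   overwritten by the vector length), and the log functions are
   special-cased as "vmldLn2" / "vmlsLn4".  */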
30447 /* Handler for an ACML-style interface to
30448    a library with vectorized intrinsics.  */
30449 
30450 static tree
30451 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30452 {
30453   char name[20] = "__vr.._";
30454   tree fntype, new_fndecl, args;
30455   unsigned arity;
30456   const char *bname;
30457   enum machine_mode el_mode, in_mode;
30458   int n, in_n;
30459 
30460   /* ACML is 64-bit only and suitable for unsafe math only, as it
30461      does not correctly support parts of IEEE arithmetic, such as
30462      denormals, with the required precision.  */
30463   if (!TARGET_64BIT
30464       || !flag_unsafe_math_optimizations)
30465     return NULL_TREE;
30466 
30467   el_mode = TYPE_MODE (TREE_TYPE (type_out));
30468   n = TYPE_VECTOR_SUBPARTS (type_out);
30469   in_mode = TYPE_MODE (TREE_TYPE (type_in));
30470   in_n = TYPE_VECTOR_SUBPARTS (type_in);
30471   if (el_mode != in_mode
30472       || n != in_n)
30473     return NULL_TREE;
30474 
30475   switch (fn)
30476     {
30477     case BUILT_IN_SIN:
30478     case BUILT_IN_COS:
30479     case BUILT_IN_EXP:
30480     case BUILT_IN_LOG:
30481     case BUILT_IN_LOG2:
30482     case BUILT_IN_LOG10:
30483       name[4] = 'd';
30484       name[5] = '2';
30485       if (el_mode != DFmode
30486 	  || n != 2)
30487 	return NULL_TREE;
30488       break;
30489 
30490     case BUILT_IN_SINF:
30491     case BUILT_IN_COSF:
30492     case BUILT_IN_EXPF:
30493     case BUILT_IN_POWF:
30494     case BUILT_IN_LOGF:
30495     case BUILT_IN_LOG2F:
30496     case BUILT_IN_LOG10F:
30497       name[4] = 's';
30498       name[5] = '4';
30499       if (el_mode != SFmode
30500 	  || n != 4)
30501 	return NULL_TREE;
30502       break;
30503 
30504     default:
30505       return NULL_TREE;
30506     }
30507 
30508   bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30509   sprintf (name + 7, "%s", bname+10);
30510 
30511   arity = 0;
30512   for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30513        args;
30514        args = TREE_CHAIN (args))
30515     arity++;
30516 
30517   if (arity == 1)
30518     fntype = build_function_type_list (type_out, type_in, NULL);
30519   else
30520     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30521 
30522   /* Build a function declaration for the vectorized function.  */
30523   new_fndecl = build_decl (BUILTINS_LOCATION,
30524 			   FUNCTION_DECL, get_identifier (name), fntype);
30525   TREE_PUBLIC (new_fndecl) = 1;
30526   DECL_EXTERNAL (new_fndecl) = 1;
30527   DECL_IS_NOVOPS (new_fndecl) = 1;
30528   TREE_READONLY (new_fndecl) = 1;
30529 
30530   return new_fndecl;
30531 }
30532 
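/* Illustrative example (added for clarity, not in the original source):
   for BUILT_IN_SIN with 2 x DFmode the "__vr.._" template above becomes
   "__vrd2_" and the builtin name minus its "__builtin_" prefix is
   appended, giving the ACML entry point "__vrd2_sin"; BUILT_IN_SINF with
   4 x SFmode yields "__vrs4_sinf".  */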
30533 /* Returns a decl of a function that implements a gather load with
30534    memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30535    Returns NULL_TREE if it is not available.  */
30536 
30537 static tree
30538 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30539 			       const_tree index_type, int scale)
30540 {
30541   bool si;
30542   enum ix86_builtins code;
30543 
30544   if (! TARGET_AVX2)
30545     return NULL_TREE;
30546 
30547   if ((TREE_CODE (index_type) != INTEGER_TYPE
30548        && !POINTER_TYPE_P (index_type))
30549       || (TYPE_MODE (index_type) != SImode
30550 	  && TYPE_MODE (index_type) != DImode))
30551     return NULL_TREE;
30552 
30553   if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30554     return NULL_TREE;
30555 
30556   /* v*gather* insn sign extends index to pointer mode.  */
30557   if (TYPE_PRECISION (index_type) < POINTER_SIZE
30558       && TYPE_UNSIGNED (index_type))
30559     return NULL_TREE;
30560 
30561   if (scale <= 0
30562       || scale > 8
30563       || (scale & (scale - 1)) != 0)
30564     return NULL_TREE;
30565 
30566   si = TYPE_MODE (index_type) == SImode;
30567   switch (TYPE_MODE (mem_vectype))
30568     {
30569     case V2DFmode:
30570       code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30571       break;
30572     case V4DFmode:
30573       code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30574       break;
30575     case V2DImode:
30576       code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30577       break;
30578     case V4DImode:
30579       code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30580       break;
30581     case V4SFmode:
30582       code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30583       break;
30584     case V8SFmode:
30585       code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30586       break;
30587     case V4SImode:
30588       code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30589       break;
30590     case V8SImode:
30591       code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30592       break;
30593     default:
30594       return NULL_TREE;
30595     }
30596 
30597   return ix86_builtins[code];
30598 }
30599 
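/* Illustrative example (added for clarity, not in the original source):
   for a loop such as

     for (i = 0; i < n; i++)
       out[i] = tab[idx[i]];       // double tab[], int idx[]

   the vectorizer calls this hook with MEM_VECTYPE V2DF (or V4DF) and an
   SImode index type, gets back e.g. IX86_BUILTIN_GATHERSIV2DF, and that
   builtin is later expanded by the gather_gen code earlier in this file
   into a vgatherdpd instruction.  */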
30600 /* Returns a decl of a target-specific builtin that implements the
30601    reciprocal of the function FN, or NULL_TREE if not available.  */
30602 
30603 static tree
30604 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30605 			 bool sqrt ATTRIBUTE_UNUSED)
30606 {
30607   if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30608 	 && flag_finite_math_only && !flag_trapping_math
30609 	 && flag_unsafe_math_optimizations))
30610     return NULL_TREE;
30611 
30612   if (md_fn)
30613     /* Machine dependent builtins.  */
30614     switch (fn)
30615       {
30616 	/* Vectorized version of sqrt to rsqrt conversion.  */
30617       case IX86_BUILTIN_SQRTPS_NR:
30618 	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30619 
30620       case IX86_BUILTIN_SQRTPS_NR256:
30621 	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30622 
30623       default:
30624 	return NULL_TREE;
30625       }
30626   else
30627     /* Normal builtins.  */
30628     switch (fn)
30629       {
30630 	/* Sqrt to rsqrt conversion.  */
30631       case BUILT_IN_SQRTF:
30632 	return ix86_builtins[IX86_BUILTIN_RSQRTF];
30633 
30634       default:
30635 	return NULL_TREE;
30636       }
30637 }
30638 
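/* Illustrative note (added for clarity, not in the original source):
   under -ffast-math (finite math, no trapping, unsafe optimizations) the
   scalar 1.0f / sqrtf (x) can be rewritten via __builtin_ia32_rsqrtf, and
   the vectorized IX86_BUILTIN_SQRTPS_NR form maps to the RSQRTPS based
   builtin; the roughly 12-bit rsqrt approximation is then refined with a
   Newton-Raphson step elsewhere in this file.  */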
30639 /* Helper for avx_vpermilps256_operand et al.  This is also used by
30640    the expansion functions to turn the parallel back into a mask.
30641    The return value is 0 for no match and the imm8+1 for a match.  */
30642 
30643 int
30644 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30645 {
30646   unsigned i, nelt = GET_MODE_NUNITS (mode);
30647   unsigned mask = 0;
30648   unsigned char ipar[8];
30649 
30650   if (XVECLEN (par, 0) != (int) nelt)
30651     return 0;
30652 
30653   /* Validate that all of the elements are constants, and not totally
30654      out of range.  Copy the data into an integral array to make the
30655      subsequent checks easier.  */
30656   for (i = 0; i < nelt; ++i)
30657     {
30658       rtx er = XVECEXP (par, 0, i);
30659       unsigned HOST_WIDE_INT ei;
30660 
30661       if (!CONST_INT_P (er))
30662 	return 0;
30663       ei = INTVAL (er);
30664       if (ei >= nelt)
30665 	return 0;
30666       ipar[i] = ei;
30667     }
30668 
30669   switch (mode)
30670     {
30671     case V4DFmode:
30672       /* In the 256-bit DFmode case, we can only move elements within
30673          a 128-bit lane.  */
30674       for (i = 0; i < 2; ++i)
30675 	{
30676 	  if (ipar[i] >= 2)
30677 	    return 0;
30678 	  mask |= ipar[i] << i;
30679 	}
30680       for (i = 2; i < 4; ++i)
30681 	{
30682 	  if (ipar[i] < 2)
30683 	    return 0;
30684 	  mask |= (ipar[i] - 2) << i;
30685 	}
30686       break;
30687 
30688     case V8SFmode:
30689       /* In the 256-bit SFmode case, we have full freedom of movement
30690 	 within the low 128-bit lane, but the high 128-bit lane must
30691 	 mirror the exact same pattern.  */
30692       for (i = 0; i < 4; ++i)
30693 	if (ipar[i] + 4 != ipar[i + 4])
30694 	  return 0;
30695       nelt = 4;
30696       /* FALLTHRU */
30697 
30698     case V2DFmode:
30699     case V4SFmode:
30700       /* In the 128-bit case, we've full freedom in the placement of
30701 	 the elements from the source operand.  */
30702       for (i = 0; i < nelt; ++i)
30703 	mask |= ipar[i] << (i * (nelt / 2));
30704       break;
30705 
30706     default:
30707       gcc_unreachable ();
30708     }
30709 
30710   /* Make sure success has a non-zero value by adding one.  */
30711   return mask + 1;
30712 }
30713 
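/* Worked example (added for clarity, not in the original source): for a
   V4SF parallel selecting elements {2, 3, 0, 1}, nelt is 4 and each index
   occupies nelt/2 == 2 bits, so the loop above builds
   mask = 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4e and the function
   returns 0x4f, i.e. the vpermilps immediate plus one.  */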
30714 /* Helper for avx_vperm2f128_v4df_operand et al.  This is also used by
30715    the expansion functions to turn the parallel back into a mask.
30716    The return value is 0 for no match and the imm8+1 for a match.  */
30717 
30718 int
30719 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30720 {
30721   unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30722   unsigned mask = 0;
30723   unsigned char ipar[8];
30724 
30725   if (XVECLEN (par, 0) != (int) nelt)
30726     return 0;
30727 
30728   /* Validate that all of the elements are constants, and not totally
30729      out of range.  Copy the data into an integral array to make the
30730      subsequent checks easier.  */
30731   for (i = 0; i < nelt; ++i)
30732     {
30733       rtx er = XVECEXP (par, 0, i);
30734       unsigned HOST_WIDE_INT ei;
30735 
30736       if (!CONST_INT_P (er))
30737 	return 0;
30738       ei = INTVAL (er);
30739       if (ei >= 2 * nelt)
30740 	return 0;
30741       ipar[i] = ei;
30742     }
30743 
30744   /* Validate that each half of the permute is a run of consecutive elements.  */
30745   for (i = 0; i < nelt2 - 1; ++i)
30746     if (ipar[i] + 1 != ipar[i + 1])
30747       return 0;
30748   for (i = nelt2; i < nelt - 1; ++i)
30749     if (ipar[i] + 1 != ipar[i + 1])
30750       return 0;
30751 
30752   /* Reconstruct the mask.  */
30753   for (i = 0; i < 2; ++i)
30754     {
30755       unsigned e = ipar[i * nelt2];
30756       if (e % nelt2)
30757 	return 0;
30758       e /= nelt2;
30759       mask |= e << (i * 4);
30760     }
30761 
30762   /* Make sure success has a non-zero value by adding one.  */
30763   return mask + 1;
30764 }
30765 
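/* Worked example (added for clarity, not in the original source): for a
   V8SF parallel selecting {4, 5, 6, 7, 0, 1, 2, 3}, i.e. swapping the two
   128-bit lanes of the first operand, nelt2 is 4, both halves are
   consecutive runs, and the lane numbers are 4 / 4 == 1 and 0 / 4 == 0,
   so mask = 1 | (0 << 4) = 0x01 and the function returns 0x02.  */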
30766 /* Store OPERAND to memory after reload has completed.  This means
30767    that we can't easily use assign_stack_local.  */
30768 rtx
30769 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30770 {
30771   rtx result;
30772 
30773   gcc_assert (reload_completed);
30774   if (ix86_using_red_zone ())
30775     {
30776       result = gen_rtx_MEM (mode,
30777 			    gen_rtx_PLUS (Pmode,
30778 					  stack_pointer_rtx,
30779 					  GEN_INT (-RED_ZONE_SIZE)));
30780       emit_move_insn (result, operand);
30781     }
30782   else if (TARGET_64BIT)
30783     {
30784       switch (mode)
30785 	{
30786 	case HImode:
30787 	case SImode:
30788 	  operand = gen_lowpart (DImode, operand);
30789 	  /* FALLTHRU */
30790 	case DImode:
30791 	  emit_insn (
30792 		      gen_rtx_SET (VOIDmode,
30793 				   gen_rtx_MEM (DImode,
30794 						gen_rtx_PRE_DEC (DImode,
30795 							stack_pointer_rtx)),
30796 				   operand));
30797 	  break;
30798 	default:
30799 	  gcc_unreachable ();
30800 	}
30801       result = gen_rtx_MEM (mode, stack_pointer_rtx);
30802     }
30803   else
30804     {
30805       switch (mode)
30806 	{
30807 	case DImode:
30808 	  {
30809 	    rtx operands[2];
30810 	    split_double_mode (mode, &operand, 1, operands, operands + 1);
30811 	    emit_insn (
30812 			gen_rtx_SET (VOIDmode,
30813 				     gen_rtx_MEM (SImode,
30814 						  gen_rtx_PRE_DEC (Pmode,
30815 							stack_pointer_rtx)),
30816 				     operands[1]));
30817 	    emit_insn (
30818 			gen_rtx_SET (VOIDmode,
30819 				     gen_rtx_MEM (SImode,
30820 						  gen_rtx_PRE_DEC (Pmode,
30821 							stack_pointer_rtx)),
30822 				     operands[0]));
30823 	  }
30824 	  break;
30825 	case HImode:
30826 	  /* Store HImodes as SImodes.  */
30827 	  operand = gen_lowpart (SImode, operand);
30828 	  /* FALLTHRU */
30829 	case SImode:
30830 	  emit_insn (
30831 		      gen_rtx_SET (VOIDmode,
30832 				   gen_rtx_MEM (GET_MODE (operand),
30833 						gen_rtx_PRE_DEC (SImode,
30834 							stack_pointer_rtx)),
30835 				   operand));
30836 	  break;
30837 	default:
30838 	  gcc_unreachable ();
30839 	}
30840       result = gen_rtx_MEM (mode, stack_pointer_rtx);
30841     }
30842   return result;
30843 }
30844 
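/* Note added for clarity (not in the original source): when the red zone
   is available (e.g. the 128 bytes below the stack pointer in the 64-bit
   SysV ABI), the first branch above stores the operand at
   sp - RED_ZONE_SIZE without adjusting the stack pointer; otherwise the
   value is pushed, and ix86_free_from_memory below releases the slot.  */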
30845 /* Free the operand from memory, releasing the stack space allocated by
      ix86_force_to_memory.  */
30846 void
30847 ix86_free_from_memory (enum machine_mode mode)
30848 {
30849   if (!ix86_using_red_zone ())
30850     {
30851       int size;
30852 
30853       if (mode == DImode || TARGET_64BIT)
30854 	size = 8;
30855       else
30856 	size = 4;
30857       /* Use LEA to deallocate stack space.  In peephole2 it will be converted
30858          to a pop or add instruction if registers are available.  */
30859       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30860 			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30861 					    GEN_INT (size))));
30862     }
30863 }
30864 
30865 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30866 
30867    Put float CONST_DOUBLE in the constant pool instead of fp regs.
30868    QImode must go into class Q_REGS.
30869    Narrow ALL_REGS to GENERAL_REGS.  This supports allowing movsf and
30870    movdf to do mem-to-mem moves through integer regs.  */
30871 
30872 static reg_class_t
30873 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30874 {
30875   enum machine_mode mode = GET_MODE (x);
30876 
30877   /* We're only allowed to return a subclass of CLASS.  Many of the
30878      following checks fail for NO_REGS, so eliminate that early.  */
30879   if (regclass == NO_REGS)
30880     return NO_REGS;
30881 
30882   /* All classes can load zeros.  */
30883   if (x == CONST0_RTX (mode))
30884     return regclass;
30885 
30886   /* Force constants into memory if we are loading a (nonzero) constant into
30887      an MMX or SSE register.  This is because there are no MMX/SSE instructions
30888      to load from a constant.  */
30889   if (CONSTANT_P (x)
30890       && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30891     return NO_REGS;
30892 
30893   /* Prefer SSE regs only, if we can use them for math.  */
30894   if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30895     return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30896 
30897   /* Floating-point constants need more complex checks.  */
30898   if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30899     {
30900       /* General regs can load everything.  */
30901       if (reg_class_subset_p (regclass, GENERAL_REGS))
30902         return regclass;
30903 
30904       /* Floats can load 0 and 1 plus some others.  Note that we eliminated
30905 	 zero above.  We only want to wind up preferring 80387 registers if
30906 	 we plan on doing computation with them.  */
30907       if (TARGET_80387
30908 	  && standard_80387_constant_p (x) > 0)
30909 	{
30910 	  /* Limit class to non-sse.  */
30911 	  if (regclass == FLOAT_SSE_REGS)
30912 	    return FLOAT_REGS;
30913 	  if (regclass == FP_TOP_SSE_REGS)
30914 	    return FP_TOP_REG;
30915 	  if (regclass == FP_SECOND_SSE_REGS)
30916 	    return FP_SECOND_REG;
30917 	  if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30918 	    return regclass;
30919 	}
30920 
30921       return NO_REGS;
30922     }
30923 
30924   /* Generally when we see PLUS here, it's the function invariant
30925      (plus soft-fp const_int), which can only be computed into general
30926      regs.  */
30927   if (GET_CODE (x) == PLUS)
30928     return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30929 
30930   /* QImode constants are easy to load, but non-constant QImode data
30931      must go into Q_REGS.  */
30932   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30933     {
30934       if (reg_class_subset_p (regclass, Q_REGS))
30935 	return regclass;
30936       if (reg_class_subset_p (Q_REGS, regclass))
30937 	return Q_REGS;
30938       return NO_REGS;
30939     }
30940 
30941   return regclass;
30942 }
30943 
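/* Illustrative example (added for clarity, not in the original source):
   for "double d = x * 3.5;" compiled with SSE math, the constant 3.5
   cannot be loaded into an xmm register as an immediate, so the hook
   above returns NO_REGS for a CONST_DOUBLE in an SSE class and reload
   instead references the constant from the constant pool as a memory
   operand.  */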
30944 /* Discourage putting floating-point values in SSE registers unless
30945    SSE math is being used, and likewise for the 387 registers.  */
30946 static reg_class_t
30947 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30948 {
30949   enum machine_mode mode = GET_MODE (x);
30950 
30951   /* Restrict the output reload class to the register bank that we are doing
30952      math on.  If we would like not to return a subset of CLASS, reject this
30953      alternative: if reload cannot do this, it will still use its choice.  */
30954   mode = GET_MODE (x);
30955   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30956     return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30957 
30958   if (X87_FLOAT_MODE_P (mode))
30959     {
30960       if (regclass == FP_TOP_SSE_REGS)
30961 	return FP_TOP_REG;
30962       else if (regclass == FP_SECOND_SSE_REGS)
30963 	return FP_SECOND_REG;
30964       else
30965 	return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30966     }
30967 
30968   return regclass;
30969 }
30970 
30971 static reg_class_t
30972 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30973 		       enum machine_mode mode, secondary_reload_info *sri)
30974 {
30975   /* Double-word spills from general registers to non-offsettable memory
30976      references (zero-extended addresses) require special handling.  */
30977   if (TARGET_64BIT
30978       && MEM_P (x)
30979       && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30980       && rclass == GENERAL_REGS
30981       && !offsettable_memref_p (x))
30982     {
30983       sri->icode = (in_p
30984 		    ? CODE_FOR_reload_noff_load
30985 		    : CODE_FOR_reload_noff_store);
30986       /* Add the cost of moving address to a temporary.  */
30987       sri->extra_cost = 1;
30988 
30989       return NO_REGS;
30990     }
30991 
30992   /* QImode spills from non-QI registers require an
30993      intermediate register on 32-bit targets.  */
30994   if (!TARGET_64BIT
30995       && !in_p && mode == QImode
30996       && (rclass == GENERAL_REGS
30997 	  || rclass == LEGACY_REGS
30998 	  || rclass == INDEX_REGS))
30999     {
31000       int regno;
31001 
31002       if (REG_P (x))
31003 	regno = REGNO (x);
31004       else
31005 	regno = -1;
31006 
31007       if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31008 	regno = true_regnum (x);
31009 
31010       /* Return Q_REGS if the operand is in memory.  */
31011       if (regno == -1)
31012 	return Q_REGS;
31013     }
31014 
31015   /* This condition handles the corner case where an expression involving
31016      pointers gets vectorized.  We're trying to use the address of a
31017      stack slot as a vector initializer.
31018 
31019      (set (reg:V2DI 74 [ vect_cst_.2 ])
31020           (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31021 
31022      Eventually frame gets turned into sp+offset like this:
31023 
31024      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31025           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31026 	                               (const_int 392 [0x188]))))
31027 
31028      That later gets turned into:
31029 
31030      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31031           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31032 	    (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31033 
31034      We'll have the following reload recorded:
31035 
31036      Reload 0: reload_in (DI) =
31037            (plus:DI (reg/f:DI 7 sp)
31038             (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31039      reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31040      SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31041      reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31042      reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31043      reload_reg_rtx: (reg:V2DI 22 xmm1)
31044 
31045      Which isn't going to work since SSE instructions can't handle scalar
31046      additions.  Returning GENERAL_REGS forces the addition into integer
31047      register and reload can handle subsequent reloads without problems.  */
31048 
31049   if (in_p && GET_CODE (x) == PLUS
31050       && SSE_CLASS_P (rclass)
31051       && SCALAR_INT_MODE_P (mode))
31052     return GENERAL_REGS;
31053 
31054   return NO_REGS;
31055 }
31056 
31057 /* Implement TARGET_CLASS_LIKELY_SPILLED_P.  */
31058 
31059 static bool
31060 ix86_class_likely_spilled_p (reg_class_t rclass)
31061 {
31062   switch (rclass)
31063     {
31064       case AREG:
31065       case DREG:
31066       case CREG:
31067       case BREG:
31068       case AD_REGS:
31069       case SIREG:
31070       case DIREG:
31071       case SSE_FIRST_REG:
31072       case FP_TOP_REG:
31073       case FP_SECOND_REG:
31074 	return true;
31075 
31076       default:
31077 	break;
31078     }
31079 
31080   return false;
31081 }
31082 
31083 /* If we are copying between general and FP registers, we need a memory
31084    location. The same is true for SSE and MMX registers.
31085 
31086    To optimize register_move_cost performance, allow inline variant.
31087 
31088    The macro can't work reliably when one of the CLASSES is a class containing
31089    registers from multiple units (SSE, MMX, integer).  We avoid this by never
31090    combining those units in a single alternative in the machine description.
31091    Ensure that this constraint holds to avoid unexpected surprises.
31092 
31093    When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31094    enforce these sanity checks.  */
31095 
31096 static inline bool
31097 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31098 				enum machine_mode mode, int strict)
31099 {
31100   if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31101       || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31102       || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31103       || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31104       || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31105       || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31106     {
31107       gcc_assert (!strict);
31108       return true;
31109     }
31110 
31111   if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31112     return true;
31113 
31114   /* ??? This is a lie.  We do have moves between mmx/general and between
31115      mmx/sse2.  But by saying we need secondary memory we discourage the
31116      register allocator from using the mmx registers unless needed.  */
31117   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31118     return true;
31119 
31120   if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31121     {
31122       /* SSE1 doesn't have any direct moves from other classes.  */
31123       if (!TARGET_SSE2)
31124 	return true;
31125 
31126       /* If the target says that inter-unit moves are more expensive
31127 	 than moving through memory, then don't generate them.  */
31128       if (!TARGET_INTER_UNIT_MOVES)
31129 	return true;
31130 
31131       /* Between SSE and general, we have moves no larger than word size.  */
31132       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31133 	return true;
31134     }
31135 
31136   return false;
31137 }
31138 
31139 bool
31140 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31141 			      enum machine_mode mode, int strict)
31142 {
31143   return inline_secondary_memory_needed (class1, class2, mode, strict);
31144 }
31145 
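/* Illustrative example (added for clarity, not in the original source):
   moving a DImode value between general registers and an SSE register on
   a 32-bit target (mode larger than the word size), or any move between
   MMX and SSE classes, is reported as needing secondary memory, so the
   register allocator spills such values through a stack slot instead of
   emitting a direct inter-unit move.  */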
31146 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31147 
31148    On the 80386, this is the size of MODE in words,
31149    except in the FP regs, where a single reg is always enough.  */
31150 
31151 static unsigned char
31152 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31153 {
31154   if (MAYBE_INTEGER_CLASS_P (rclass))
31155     {
31156       if (mode == XFmode)
31157 	return (TARGET_64BIT ? 2 : 3);
31158       else if (mode == XCmode)
31159 	return (TARGET_64BIT ? 4 : 6);
31160       else
31161 	return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31162     }
31163   else
31164     {
31165       if (COMPLEX_MODE_P (mode))
31166 	return 2;
31167       else
31168 	return 1;
31169     }
31170 }
31171 
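/* Illustrative example (added for clarity, not in the original source):
   an XFmode (80-bit extended precision) value occupies 3 general
   registers on a 32-bit target and 2 on a 64-bit one, while the
   non-integer branch above always answers 1 (or 2 for complex modes)
   because a single x87 or SSE register is always enough.  */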
31172 /* Return true if the registers in CLASS cannot represent the change from
31173    modes FROM to TO.  */
31174 
31175 bool
31176 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31177 			       enum reg_class regclass)
31178 {
31179   if (from == to)
31180     return false;
31181 
31182   /* x87 registers can't do subreg at all, as all values are reformatted
31183      to extended precision.  */
31184   if (MAYBE_FLOAT_CLASS_P (regclass))
31185     return true;
31186 
31187   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31188     {
31189       /* Vector registers do not support QI or HImode loads.  If we don't
31190 	 disallow a change to these modes, reload will assume it's ok to
31191 	 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
31192 	 the vec_dupv4hi pattern.  */
31193       if (GET_MODE_SIZE (from) < 4)
31194 	return true;
31195 
31196       /* Vector registers do not support subreg with nonzero offsets, which
31197 	 are otherwise valid for integer registers.  Since we can't see
31198 	 whether we have a nonzero offset from here, prohibit all
31199          nonparadoxical subregs changing size.  */
31200       if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31201 	return true;
31202     }
31203 
31204   return false;
31205 }
31206 
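/* Illustrative note (added for clarity, not in the original source): for
   SSE or MMX classes a mode change from HImode is rejected because vector
   registers cannot do QImode or HImode loads, and any size-reducing
   change such as SImode to HImode is rejected as well; x87 classes reject
   every mode change because values are kept internally in extended
   precision.  */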
31207 /* Return the cost of moving data of mode M between a
31208    register and memory.  A value of 2 is the default; this cost is
31209    relative to those in `REGISTER_MOVE_COST'.
31210 
31211    This function is used extensively by register_move_cost, which is used to
31212    build tables at startup.  Make it inline in this case.
31213    When IN is 2, return the maximum of the in and out move costs.
31214 
31215    If moving between registers and memory is more expensive than
31216    between two registers, you should define this macro to express the
31217    relative cost.
31218 
31219    Also model the increased cost of moving QImode registers in
31220    non-Q_REGS classes.
31221  */
31222 static inline int
31223 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31224 			 int in)
31225 {
31226   int cost;
31227   if (FLOAT_CLASS_P (regclass))
31228     {
31229       int index;
31230       switch (mode)
31231 	{
31232 	  case SFmode:
31233 	    index = 0;
31234 	    break;
31235 	  case DFmode:
31236 	    index = 1;
31237 	    break;
31238 	  case XFmode:
31239 	    index = 2;
31240 	    break;
31241 	  default:
31242 	    return 100;
31243 	}
31244       if (in == 2)
31245         return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31246       return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31247     }
31248   if (SSE_CLASS_P (regclass))
31249     {
31250       int index;
31251       switch (GET_MODE_SIZE (mode))
31252 	{
31253 	  case 4:
31254 	    index = 0;
31255 	    break;
31256 	  case 8:
31257 	    index = 1;
31258 	    break;
31259 	  case 16:
31260 	    index = 2;
31261 	    break;
31262 	  default:
31263 	    return 100;
31264 	}
31265       if (in == 2)
31266         return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31267       return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31268     }
31269   if (MMX_CLASS_P (regclass))
31270     {
31271       int index;
31272       switch (GET_MODE_SIZE (mode))
31273 	{
31274 	  case 4:
31275 	    index = 0;
31276 	    break;
31277 	  case 8:
31278 	    index = 1;
31279 	    break;
31280 	  default:
31281 	    return 100;
31282 	}
31283       if (in == 2)
31284         return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31285       return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31286     }
31287   switch (GET_MODE_SIZE (mode))
31288     {
31289       case 1:
31290 	if (Q_CLASS_P (regclass) || TARGET_64BIT)
31291 	  {
31292 	    if (!in)
31293 	      return ix86_cost->int_store[0];
31294 	    if (TARGET_PARTIAL_REG_DEPENDENCY
31295 	        && optimize_function_for_speed_p (cfun))
31296 	      cost = ix86_cost->movzbl_load;
31297 	    else
31298 	      cost = ix86_cost->int_load[0];
31299 	    if (in == 2)
31300 	      return MAX (cost, ix86_cost->int_store[0]);
31301 	    return cost;
31302 	  }
31303 	else
31304 	  {
31305 	   if (in == 2)
31306 	     return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31307 	   if (in)
31308 	     return ix86_cost->movzbl_load;
31309 	   else
31310 	     return ix86_cost->int_store[0] + 4;
31311 	  }
31312 	break;
31313       case 2:
31314 	if (in == 2)
31315 	  return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31316 	return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31317       default:
31318 	/* Compute number of 32bit moves needed.  TFmode is moved as XFmode.  */
31319 	if (mode == TFmode)
31320 	  mode = XFmode;
31321 	if (in == 2)
31322 	  cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31323 	else if (in)
31324 	  cost = ix86_cost->int_load[2];
31325 	else
31326 	  cost = ix86_cost->int_store[2];
31327 	return (cost * (((int) GET_MODE_SIZE (mode)
31328 		        + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31329     }
31330 }
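/* An illustrative sketch of the cost selection above (the index values
   follow the switches in this function; concrete costs come from the
   active tuning's cost table, not from any number quoted here):

     inline_memory_move_cost (DFmode, FLOAT_REGS, 1)
       -> ix86_cost->fp_load[1]
     inline_memory_move_cost (V4SFmode, SSE_REGS, 2)
       -> MAX (ix86_cost->sse_load[2], ix86_cost->sse_store[2])
     inline_memory_move_cost (DImode, GENERAL_REGS, 0)   on a 32-bit target
       -> ix86_cost->int_store[2] * 2   (two 32-bit stores)
   */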
31331 
31332 static int
31333 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31334 		       bool in)
31335 {
31336   return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31337 }
31338 
31339 
31340 /* Return the cost of moving data from a register in class CLASS1 to
31341    one in class CLASS2.
31342 
31343    It is not required that the cost always equal 2 when FROM is the same as TO;
31344    on some machines it is expensive to move between registers if they are not
31345    general registers.  */
31346 
31347 static int
31348 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31349 			 reg_class_t class2_i)
31350 {
31351   enum reg_class class1 = (enum reg_class) class1_i;
31352   enum reg_class class2 = (enum reg_class) class2_i;
31353 
31354   /* If secondary memory is required, compute the cost of the store followed
31355      by the load.  To avoid bad register allocation choices, this needs to be
31356      *at least* as high as the symmetric MEMORY_MOVE_COST.  */
31357 
31358   if (inline_secondary_memory_needed (class1, class2, mode, 0))
31359     {
31360       int cost = 1;
31361 
31362       cost += inline_memory_move_cost (mode, class1, 2);
31363       cost += inline_memory_move_cost (mode, class2, 2);
31364 
31365       /* When copying from a general-purpose register we may emit multiple
31366          stores followed by a single load, causing a memory size mismatch
31367          stall.  Count this as an arbitrarily high cost of 20.  */
31368       if (targetm.class_max_nregs (class1, mode)
31369 	  > targetm.class_max_nregs (class2, mode))
31370 	cost += 20;
31371 
31372       /* In the case of FP/MMX moves, the registers actually overlap, and we
31373 	 have to switch modes in order to treat them differently.  */
31374       if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31375           || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31376 	cost += 20;
31377 
31378       return cost;
31379     }
31380 
31381   /* Moves between SSE/MMX and integer unit are expensive.  */
31382   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31383       || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31384 
31385     /* ??? By keeping returned value relatively high, we limit the number
31386        of moves between integer and MMX/SSE registers for all targets.
31387        Additionally, high value prevents problem with x86_modes_tieable_p(),
31388        where integer modes in MMX/SSE registers are not tieable
31389        because of missing QImode and HImode moves to, from or between
31390        MMX/SSE registers.  */
31391     return MAX (8, ix86_cost->mmxsse_to_integer);
31392 
31393   if (MAYBE_FLOAT_CLASS_P (class1))
31394     return ix86_cost->fp_move;
31395   if (MAYBE_SSE_CLASS_P (class1))
31396     return ix86_cost->sse_move;
31397   if (MAYBE_MMX_CLASS_P (class1))
31398     return ix86_cost->mmx_move;
31399   return 2;
31400 }
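/* For example, an SImode move between GENERAL_REGS and SSE_REGS that does
   not need secondary memory falls into the "SSE/MMX vs. integer unit" case
   above and costs MAX (8, ix86_cost->mmxsse_to_integer), while an SImode
   move between two general-purpose register classes simply costs 2.
   (Illustration only; whether secondary memory is needed depends on the
   target and its inter-unit move support.)  */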
31401 
31402 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31403    MODE.  */
31404 
31405 bool
31406 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31407 {
31408   /* The flags register, and only the flags register, can hold CCmode values.  */
31409   if (CC_REGNO_P (regno))
31410     return GET_MODE_CLASS (mode) == MODE_CC;
31411   if (GET_MODE_CLASS (mode) == MODE_CC
31412       || GET_MODE_CLASS (mode) == MODE_RANDOM
31413       || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31414     return false;
31415   if (FP_REGNO_P (regno))
31416     return VALID_FP_MODE_P (mode);
31417   if (SSE_REGNO_P (regno))
31418     {
31419       /* We implement the move patterns for all vector modes into and
31420 	 out of SSE registers, even when no operation instructions
31421 	 are available.  OImode move is available only when AVX is
31422 	 enabled.  */
31423       return ((TARGET_AVX && mode == OImode)
31424 	      || VALID_AVX256_REG_MODE (mode)
31425 	      || VALID_SSE_REG_MODE (mode)
31426 	      || VALID_SSE2_REG_MODE (mode)
31427 	      || VALID_MMX_REG_MODE (mode)
31428 	      || VALID_MMX_REG_MODE_3DNOW (mode));
31429     }
31430   if (MMX_REGNO_P (regno))
31431     {
31432       /* We implement the move patterns for 3DNOW modes even in MMX mode,
31433 	 so if the register is available at all, then we can move data of
31434 	 the given mode into or out of it.  */
31435       return (VALID_MMX_REG_MODE (mode)
31436 	      || VALID_MMX_REG_MODE_3DNOW (mode));
31437     }
31438 
31439   if (mode == QImode)
31440     {
31441       /* Take care with QImode values - they can live in non-QI regs,
31442 	 but they then cause partial register stalls.  */
31443       if (regno <= BX_REG || TARGET_64BIT)
31444 	return true;
31445       if (!TARGET_PARTIAL_REG_STALL)
31446 	return true;
31447       return !can_create_pseudo_p ();
31448     }
31449   /* We handle both integer and floats in the general purpose registers.  */
31450   else if (VALID_INT_MODE_P (mode))
31451     return true;
31452   else if (VALID_FP_MODE_P (mode))
31453     return true;
31454   else if (VALID_DFP_MODE_P (mode))
31455     return true;
31456   /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
31457      on to use that value in smaller contexts, this can easily force a
31458      pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
31459      supporting DImode, allow it.  */
31460   else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31461     return true;
31462 
31463   return false;
31464 }
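/* A few illustrative queries (assuming the named ISA extensions are
   enabled; these simply trace the cases above):

     ix86_hard_regno_mode_ok (FIRST_SSE_REG, V4SFmode)  -> true
     ix86_hard_regno_mode_ok (FIRST_SSE_REG, OImode)    -> true only with AVX
     ix86_hard_regno_mode_ok (FLAGS_REG, SImode)        -> false (MODE_CC only)
     ix86_hard_regno_mode_ok (AX_REG, DFmode)           -> true (VALID_FP_MODE_P)
   */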
31465 
31466 /* A subroutine of ix86_modes_tieable_p.  Return true if MODE is a
31467    tieable integer mode.  */
31468 
31469 static bool
31470 ix86_tieable_integer_mode_p (enum machine_mode mode)
31471 {
31472   switch (mode)
31473     {
31474     case HImode:
31475     case SImode:
31476       return true;
31477 
31478     case QImode:
31479       return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31480 
31481     case DImode:
31482       return TARGET_64BIT;
31483 
31484     default:
31485       return false;
31486     }
31487 }
31488 
31489 /* Return true if MODE1 is accessible in a register that can hold MODE2
31490    without copying.  That is, all register classes that can hold MODE2
31491    can also hold MODE1.  */
31492 
31493 bool
31494 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31495 {
31496   if (mode1 == mode2)
31497     return true;
31498 
31499   if (ix86_tieable_integer_mode_p (mode1)
31500       && ix86_tieable_integer_mode_p (mode2))
31501     return true;
31502 
31503   /* MODE2 being XFmode implies fp stack or general regs, which means we
31504      can tie any smaller floating point modes to it.  Note that we do not
31505      tie this with TFmode.  */
31506   if (mode2 == XFmode)
31507     return mode1 == SFmode || mode1 == DFmode;
31508 
31509   /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31510      that we can tie it with SFmode.  */
31511   if (mode2 == DFmode)
31512     return mode1 == SFmode;
31513 
31514   /* If MODE2 is only appropriate for an SSE register, then tie with
31515      any other mode acceptable to SSE registers.  */
31516   if (GET_MODE_SIZE (mode2) == 16
31517       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31518     return (GET_MODE_SIZE (mode1) == 16
31519 	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31520 
31521   /* If MODE2 is appropriate for an MMX register, then tie
31522      with any other mode acceptable to MMX registers.  */
31523   if (GET_MODE_SIZE (mode2) == 8
31524       && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31525     return (GET_MODE_SIZE (mode1) == 8
31526 	    && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31527 
31528   return false;
31529 }
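/* A few illustrative pairs (tracing the rules above):

     ix86_modes_tieable_p (HImode, SImode)     -> true  (both tieable integers)
     ix86_modes_tieable_p (SFmode, XFmode)     -> true  (fp stack or general regs)
     ix86_modes_tieable_p (SFmode, DFmode)     -> true
     ix86_modes_tieable_p (V4SFmode, V2DImode) -> true with SSE2 (both 16-byte SSE)
     ix86_modes_tieable_p (TFmode, XFmode)     -> false (deliberately not tied)
   */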
31530 
31531 /* Compute a (partial) cost for rtx X.  Return true if the complete
31532    cost has been computed, and false if subexpressions should be
31533    scanned.  In either case, *TOTAL contains the cost result.  */
31534 
31535 static bool
31536 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31537 		bool speed)
31538 {
31539   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31540   enum machine_mode mode = GET_MODE (x);
31541   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31542 
31543   switch (code)
31544     {
31545     case CONST_INT:
31546     case CONST:
31547     case LABEL_REF:
31548     case SYMBOL_REF:
31549       if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31550 	*total = 3;
31551       else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31552 	*total = 2;
31553       else if (flag_pic && SYMBOLIC_CONST (x)
31554 	       && (!TARGET_64BIT
31555 		   || (GET_CODE (x) != LABEL_REF
31556 		       && (GET_CODE (x) != SYMBOL_REF
31557 		           || !SYMBOL_REF_LOCAL_P (x)))))
31558 	*total = 1;
31559       else
31560 	*total = 0;
31561       return true;
31562 
31563     case CONST_DOUBLE:
31564       if (mode == VOIDmode)
31565 	*total = 0;
31566       else
31567 	switch (standard_80387_constant_p (x))
31568 	  {
31569 	  case 1: /* 0.0 */
31570 	    *total = 1;
31571 	    break;
31572 	  default: /* Other constants */
31573 	    *total = 2;
31574 	    break;
31575 	  case 0:
31576 	  case -1:
31577 	    /* Start with (MEM (SYMBOL_REF)), since that's where
31578 	       it'll probably end up.  Add a penalty for size.  */
31579 	    *total = (COSTS_N_INSNS (1)
31580 		      + (flag_pic != 0 && !TARGET_64BIT)
31581 		      + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31582 	    break;
31583 	  }
31584       return true;
31585 
31586     case ZERO_EXTEND:
31587       /* Zero extension is often completely free on x86_64, so make
31588 	 it as cheap as possible.  */
31589       if (TARGET_64BIT && mode == DImode
31590 	  && GET_MODE (XEXP (x, 0)) == SImode)
31591 	*total = 1;
31592       else if (TARGET_ZERO_EXTEND_WITH_AND)
31593 	*total = cost->add;
31594       else
31595 	*total = cost->movzx;
31596       return false;
31597 
31598     case SIGN_EXTEND:
31599       *total = cost->movsx;
31600       return false;
31601 
31602     case ASHIFT:
31603       if (CONST_INT_P (XEXP (x, 1))
31604 	  && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31605 	{
31606 	  HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31607 	  if (value == 1)
31608 	    {
31609 	      *total = cost->add;
31610 	      return false;
31611 	    }
31612 	  if ((value == 2 || value == 3)
31613 	      && cost->lea <= cost->shift_const)
31614 	    {
31615 	      *total = cost->lea;
31616 	      return false;
31617 	    }
31618 	}
31619       /* FALLTHRU */
31620 
31621     case ROTATE:
31622     case ASHIFTRT:
31623     case LSHIFTRT:
31624     case ROTATERT:
31625       if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31626 	{
31627 	  if (CONST_INT_P (XEXP (x, 1)))
31628 	    {
31629 	      if (INTVAL (XEXP (x, 1)) > 32)
31630 		*total = cost->shift_const + COSTS_N_INSNS (2);
31631 	      else
31632 		*total = cost->shift_const * 2;
31633 	    }
31634 	  else
31635 	    {
31636 	      if (GET_CODE (XEXP (x, 1)) == AND)
31637 		*total = cost->shift_var * 2;
31638 	      else
31639 		*total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31640 	    }
31641 	}
31642       else
31643 	{
31644 	  if (CONST_INT_P (XEXP (x, 1)))
31645 	    *total = cost->shift_const;
31646 	  else if (GET_CODE (XEXP (x, 1)) == SUBREG
31647 		   && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
31648 	    {
31649 	      /* Return the cost after shift-and truncation.  */
31650 	      *total = cost->shift_var;
31651 	      return true;
31652 	    }
31653 	  else
31654 	    *total = cost->shift_var;
31655 	}
31656       return false;
31657 
31658     case FMA:
31659       {
31660 	rtx sub;
31661 
31662         gcc_assert (FLOAT_MODE_P (mode));
31663         gcc_assert (TARGET_FMA || TARGET_FMA4);
31664 
31665         /* ??? SSE scalar/vector cost should be used here.  */
31666         /* ??? Bald assumption that fma has the same cost as fmul.  */
31667         *total = cost->fmul;
31668 	*total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31669 
31670         /* Negate in op0 or op2 is free: FMS, FNMA, FNMS.  */
31671 	sub = XEXP (x, 0);
31672 	if (GET_CODE (sub) == NEG)
31673 	  sub = XEXP (sub, 0);
31674 	*total += rtx_cost (sub, FMA, 0, speed);
31675 
31676 	sub = XEXP (x, 2);
31677 	if (GET_CODE (sub) == NEG)
31678 	  sub = XEXP (sub, 0);
31679 	*total += rtx_cost (sub, FMA, 2, speed);
31680 	return true;
31681       }
31682 
31683     case MULT:
31684       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31685 	{
31686 	  /* ??? SSE scalar cost should be used here.  */
31687 	  *total = cost->fmul;
31688 	  return false;
31689 	}
31690       else if (X87_FLOAT_MODE_P (mode))
31691 	{
31692 	  *total = cost->fmul;
31693 	  return false;
31694 	}
31695       else if (FLOAT_MODE_P (mode))
31696 	{
31697 	  /* ??? SSE vector cost should be used here.  */
31698 	  *total = cost->fmul;
31699 	  return false;
31700 	}
31701       else
31702 	{
31703 	  rtx op0 = XEXP (x, 0);
31704 	  rtx op1 = XEXP (x, 1);
31705 	  int nbits;
31706 	  if (CONST_INT_P (XEXP (x, 1)))
31707 	    {
31708 	      unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31709 	      for (nbits = 0; value != 0; value &= value - 1)
31710 	        nbits++;
31711 	    }
31712 	  else
31713 	    /* This is arbitrary.  */
31714 	    nbits = 7;
31715 
31716 	  /* Compute costs correctly for widening multiplication.  */
31717 	  if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31718 	      && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31719 	         == GET_MODE_SIZE (mode))
31720 	    {
31721 	      int is_mulwiden = 0;
31722 	      enum machine_mode inner_mode = GET_MODE (op0);
31723 
31724 	      if (GET_CODE (op0) == GET_CODE (op1))
31725 		is_mulwiden = 1, op1 = XEXP (op1, 0);
31726 	      else if (CONST_INT_P (op1))
31727 		{
31728 		  if (GET_CODE (op0) == SIGN_EXTEND)
31729 		    is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31730 			          == INTVAL (op1);
31731 		  else
31732 		    is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31733 	        }
31734 
31735 	      if (is_mulwiden)
31736 	        op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31737 	    }
31738 
31739   	  *total = (cost->mult_init[MODE_INDEX (mode)]
31740 		    + nbits * cost->mult_bit
31741 	            + rtx_cost (op0, outer_code, opno, speed)
31742 		    + rtx_cost (op1, outer_code, opno, speed));
31743 
31744           return true;
31745 	}
31746 
31747     case DIV:
31748     case UDIV:
31749     case MOD:
31750     case UMOD:
31751       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31752 	/* ??? SSE cost should be used here.  */
31753 	*total = cost->fdiv;
31754       else if (X87_FLOAT_MODE_P (mode))
31755 	*total = cost->fdiv;
31756       else if (FLOAT_MODE_P (mode))
31757 	/* ??? SSE vector cost should be used here.  */
31758 	*total = cost->fdiv;
31759       else
31760 	*total = cost->divide[MODE_INDEX (mode)];
31761       return false;
31762 
31763     case PLUS:
31764       if (GET_MODE_CLASS (mode) == MODE_INT
31765 	       && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31766 	{
31767 	  if (GET_CODE (XEXP (x, 0)) == PLUS
31768 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31769 	      && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31770 	      && CONSTANT_P (XEXP (x, 1)))
31771 	    {
31772 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31773 	      if (val == 2 || val == 4 || val == 8)
31774 		{
31775 		  *total = cost->lea;
31776 		  *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31777 				      outer_code, opno, speed);
31778 		  *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31779 				      outer_code, opno, speed);
31780 		  *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31781 		  return true;
31782 		}
31783 	    }
31784 	  else if (GET_CODE (XEXP (x, 0)) == MULT
31785 		   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31786 	    {
31787 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31788 	      if (val == 2 || val == 4 || val == 8)
31789 		{
31790 		  *total = cost->lea;
31791 		  *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31792 				      outer_code, opno, speed);
31793 		  *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31794 		  return true;
31795 		}
31796 	    }
31797 	  else if (GET_CODE (XEXP (x, 0)) == PLUS)
31798 	    {
31799 	      *total = cost->lea;
31800 	      *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31801 				  outer_code, opno, speed);
31802 	      *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31803 				  outer_code, opno, speed);
31804 	      *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31805 	      return true;
31806 	    }
31807 	}
31808       /* FALLTHRU */
31809 
31810     case MINUS:
31811       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31812 	{
31813 	  /* ??? SSE cost should be used here.  */
31814 	  *total = cost->fadd;
31815 	  return false;
31816 	}
31817       else if (X87_FLOAT_MODE_P (mode))
31818 	{
31819 	  *total = cost->fadd;
31820 	  return false;
31821 	}
31822       else if (FLOAT_MODE_P (mode))
31823 	{
31824 	  /* ??? SSE vector cost should be used here.  */
31825 	  *total = cost->fadd;
31826 	  return false;
31827 	}
31828       /* FALLTHRU */
31829 
31830     case AND:
31831     case IOR:
31832     case XOR:
31833       if (!TARGET_64BIT && mode == DImode)
31834 	{
31835 	  *total = (cost->add * 2
31836 		    + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31837 		       << (GET_MODE (XEXP (x, 0)) != DImode))
31838 		    + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31839 	               << (GET_MODE (XEXP (x, 1)) != DImode)));
31840 	  return true;
31841 	}
31842       /* FALLTHRU */
31843 
31844     case NEG:
31845       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31846 	{
31847 	  /* ??? SSE cost should be used here.  */
31848 	  *total = cost->fchs;
31849 	  return false;
31850 	}
31851       else if (X87_FLOAT_MODE_P (mode))
31852 	{
31853 	  *total = cost->fchs;
31854 	  return false;
31855 	}
31856       else if (FLOAT_MODE_P (mode))
31857 	{
31858 	  /* ??? SSE vector cost should be used here.  */
31859 	  *total = cost->fchs;
31860 	  return false;
31861 	}
31862       /* FALLTHRU */
31863 
31864     case NOT:
31865       if (!TARGET_64BIT && mode == DImode)
31866 	*total = cost->add * 2;
31867       else
31868 	*total = cost->add;
31869       return false;
31870 
31871     case COMPARE:
31872       if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31873 	  && XEXP (XEXP (x, 0), 1) == const1_rtx
31874 	  && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31875 	  && XEXP (x, 1) == const0_rtx)
31876 	{
31877 	  /* This kind of construct is implemented using test[bwl].
31878 	     Treat it as if we had an AND.  */
31879 	  *total = (cost->add
31880 		    + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31881 		    + rtx_cost (const1_rtx, outer_code, opno, speed));
31882 	  return true;
31883 	}
31884       return false;
31885 
31886     case FLOAT_EXTEND:
31887       if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31888 	*total = 0;
31889       return false;
31890 
31891     case ABS:
31892       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31893 	/* ??? SSE cost should be used here.  */
31894 	*total = cost->fabs;
31895       else if (X87_FLOAT_MODE_P (mode))
31896 	*total = cost->fabs;
31897       else if (FLOAT_MODE_P (mode))
31898 	/* ??? SSE vector cost should be used here.  */
31899 	*total = cost->fabs;
31900       return false;
31901 
31902     case SQRT:
31903       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31904 	/* ??? SSE cost should be used here.  */
31905 	*total = cost->fsqrt;
31906       else if (X87_FLOAT_MODE_P (mode))
31907 	*total = cost->fsqrt;
31908       else if (FLOAT_MODE_P (mode))
31909 	/* ??? SSE vector cost should be used here.  */
31910 	*total = cost->fsqrt;
31911       return false;
31912 
31913     case UNSPEC:
31914       if (XINT (x, 1) == UNSPEC_TP)
31915 	*total = 0;
31916       return false;
31917 
31918     case VEC_SELECT:
31919     case VEC_CONCAT:
31920     case VEC_MERGE:
31921     case VEC_DUPLICATE:
31922       /* ??? Assume all of these vector manipulation patterns are
31923 	 recognizable, in which case they all have pretty much the
31924 	 same cost.  */
31925      *total = COSTS_N_INSNS (1);
31926      return true;
31927 
31928     default:
31929       return false;
31930     }
31931 }
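/* A small worked example of the ASHIFT handling above (illustrative
   only): for (ashift:SI (reg:SI 100) (const_int 2)) the shift can be
   done with an LEA, so when cost->lea <= cost->shift_const the returned
   cost is cost->lea; a shift by 1 is costed as an ADD (cost->add).  */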
31932 
31933 #if TARGET_MACHO
31934 
31935 static int current_machopic_label_num;
31936 
31937 /* Given a symbol name and its associated stub, write out the
31938    definition of the stub.  */
31939 
31940 void
31941 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31942 {
31943   unsigned int length;
31944   char *binder_name, *symbol_name, lazy_ptr_name[32];
31945   int label = ++current_machopic_label_num;
31946 
31947   /* For 64-bit we shouldn't get here.  */
31948   gcc_assert (!TARGET_64BIT);
31949 
31950   /* Lose our funky encoding stuff so it doesn't contaminate the stub.  */
31951   symb = targetm.strip_name_encoding (symb);
31952 
31953   length = strlen (stub);
31954   binder_name = XALLOCAVEC (char, length + 32);
31955   GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31956 
31957   length = strlen (symb);
31958   symbol_name = XALLOCAVEC (char, length + 32);
31959   GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31960 
31961   sprintf (lazy_ptr_name, "L%d$lz", label);
31962 
31963   if (MACHOPIC_ATT_STUB)
31964     switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31965   else if (MACHOPIC_PURE)
31966     switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31967   else
31968     switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31969 
31970   fprintf (file, "%s:\n", stub);
31971   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31972 
31973   if (MACHOPIC_ATT_STUB)
31974     {
31975       fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31976     }
31977   else if (MACHOPIC_PURE)
31978     {
31979       /* PIC stub.  */
31980       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
31981       rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31982       output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
31983       fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31984 	       label, lazy_ptr_name, label);
31985       fprintf (file, "\tjmp\t*%%ecx\n");
31986     }
31987   else
31988     fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31989 
31990   /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31991      it needs no stub-binding-helper.  */
31992   if (MACHOPIC_ATT_STUB)
31993     return;
31994 
31995   fprintf (file, "%s:\n", binder_name);
31996 
31997   if (MACHOPIC_PURE)
31998     {
31999       fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32000       fprintf (file, "\tpushl\t%%ecx\n");
32001     }
32002   else
32003     fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32004 
32005   fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32006 
32007   /* N.B. Keep the correspondence of these
32008      'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32009      old-pic/new-pic/non-pic stubs; altering this will break
32010      compatibility with existing dylibs.  */
32011   if (MACHOPIC_PURE)
32012     {
32013       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
32014       switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32015     }
32016   else
32017     /* 16-byte -mdynamic-no-pic stub.  */
32018     switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
32019 
32020   fprintf (file, "%s:\n", lazy_ptr_name);
32021   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32022   fprintf (file, ASM_LONG "%s\n", binder_name);
32023 }
32024 #endif /* TARGET_MACHO */
32025 
32026 /* Order the registers for register allocator.  */
32027 
32028 void
32029 x86_order_regs_for_local_alloc (void)
32030 {
32031    int pos = 0;
32032    int i;
32033 
32034    /* First allocate the local general purpose registers.  */
32035    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32036      if (GENERAL_REGNO_P (i) && call_used_regs[i])
32037 	reg_alloc_order [pos++] = i;
32038 
32039    /* Global general purpose registers.  */
32040    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32041      if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32042 	reg_alloc_order [pos++] = i;
32043 
32044    /* x87 registers come first in case we are doing FP math
32045       using them.  */
32046    if (!TARGET_SSE_MATH)
32047      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32048        reg_alloc_order [pos++] = i;
32049 
32050    /* SSE registers.  */
32051    for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32052      reg_alloc_order [pos++] = i;
32053    for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32054      reg_alloc_order [pos++] = i;
32055 
32056    /* x87 registers.  */
32057    if (TARGET_SSE_MATH)
32058      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32059        reg_alloc_order [pos++] = i;
32060 
32061    for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32062      reg_alloc_order [pos++] = i;
32063 
32064    /* Initialize the rest of the array, as we do not allocate some registers
32065       at all.  */
32066    while (pos < FIRST_PSEUDO_REGISTER)
32067      reg_alloc_order [pos++] = 0;
32068 }
32069 
32070 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32071    in struct attribute_spec.handler.  */
32072 static tree
32073 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32074 					      tree args,
32075 					      int flags ATTRIBUTE_UNUSED,
32076 					      bool *no_add_attrs)
32077 {
32078   if (TREE_CODE (*node) != FUNCTION_TYPE
32079       && TREE_CODE (*node) != METHOD_TYPE
32080       && TREE_CODE (*node) != FIELD_DECL
32081       && TREE_CODE (*node) != TYPE_DECL)
32082     {
32083       warning (OPT_Wattributes, "%qE attribute only applies to functions",
32084 	       name);
32085       *no_add_attrs = true;
32086       return NULL_TREE;
32087     }
32088   if (TARGET_64BIT)
32089     {
32090       warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32091 	       name);
32092       *no_add_attrs = true;
32093       return NULL_TREE;
32094     }
32095   if (is_attribute_p ("callee_pop_aggregate_return", name))
32096     {
32097       tree cst;
32098 
32099       cst = TREE_VALUE (args);
32100       if (TREE_CODE (cst) != INTEGER_CST)
32101 	{
32102 	  warning (OPT_Wattributes,
32103 		   "%qE attribute requires an integer constant argument",
32104 		   name);
32105 	  *no_add_attrs = true;
32106 	}
32107       else if (compare_tree_int (cst, 0) != 0
32108 	       && compare_tree_int (cst, 1) != 0)
32109 	{
32110 	  warning (OPT_Wattributes,
32111 		   "argument to %qE attribute is neither zero, nor one",
32112 		   name);
32113 	  *no_add_attrs = true;
32114 	}
32115 
32116       return NULL_TREE;
32117     }
32118 
32119   return NULL_TREE;
32120 }
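/* Typical user-level usage of the attribute handled above (a sketch of a
   declaration only; the struct and function names are made up):

     struct big { int a, b, c; };
     struct big
     get_big (void) __attribute__ ((callee_pop_aggregate_return (1)));

   The single argument must be the integer constant 0 or 1, as enforced by
   the checks above, and the attribute is rejected on 64-bit targets.  */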
32121 
32122 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
32123    struct attribute_spec.handler.  */
32124 static tree
32125 ix86_handle_abi_attribute (tree *node, tree name,
32126 			      tree args ATTRIBUTE_UNUSED,
32127 			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32128 {
32129   if (TREE_CODE (*node) != FUNCTION_TYPE
32130       && TREE_CODE (*node) != METHOD_TYPE
32131       && TREE_CODE (*node) != FIELD_DECL
32132       && TREE_CODE (*node) != TYPE_DECL)
32133     {
32134       warning (OPT_Wattributes, "%qE attribute only applies to functions",
32135 	       name);
32136       *no_add_attrs = true;
32137       return NULL_TREE;
32138     }
32139 
32140   /* Can combine regparm with all attributes but fastcall.  */
32141   if (is_attribute_p ("ms_abi", name))
32142     {
32143       if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32144         {
32145 	  error ("ms_abi and sysv_abi attributes are not compatible");
32146 	}
32147 
32148       return NULL_TREE;
32149     }
32150   else if (is_attribute_p ("sysv_abi", name))
32151     {
32152       if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32153         {
32154 	  error ("ms_abi and sysv_abi attributes are not compatible");
32155 	}
32156 
32157       return NULL_TREE;
32158     }
32159 
32160   return NULL_TREE;
32161 }
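/* Example declarations using the attributes handled above (hypothetical
   function names, shown only to illustrate the accepted spellings):

     void f (void) __attribute__ ((ms_abi));
     void g (void) __attribute__ ((sysv_abi));

   Combining ms_abi and sysv_abi on the same declaration is rejected with
   the error issued above.  */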
32162 
32163 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32164    struct attribute_spec.handler.  */
32165 static tree
32166 ix86_handle_struct_attribute (tree *node, tree name,
32167 			      tree args ATTRIBUTE_UNUSED,
32168 			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32169 {
32170   tree *type = NULL;
32171   if (DECL_P (*node))
32172     {
32173       if (TREE_CODE (*node) == TYPE_DECL)
32174 	type = &TREE_TYPE (*node);
32175     }
32176   else
32177     type = node;
32178 
32179   if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
32180     {
32181       warning (OPT_Wattributes, "%qE attribute ignored",
32182 	       name);
32183       *no_add_attrs = true;
32184     }
32185 
32186   else if ((is_attribute_p ("ms_struct", name)
32187 	    && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32188 	   || ((is_attribute_p ("gcc_struct", name)
32189 		&& lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32190     {
32191       warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32192                name);
32193       *no_add_attrs = true;
32194     }
32195 
32196   return NULL_TREE;
32197 }
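/* Example use of the attributes handled above (the struct is made up):

     struct __attribute__ ((ms_struct)) S { char c; double d; };

   Applying both ms_struct and gcc_struct to the same type triggers the
   "incompatible attribute ignored" warning above.  */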
32198 
32199 static tree
32200 ix86_handle_fndecl_attribute (tree *node, tree name,
32201                               tree args ATTRIBUTE_UNUSED,
32202                               int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32203 {
32204   if (TREE_CODE (*node) != FUNCTION_DECL)
32205     {
32206       warning (OPT_Wattributes, "%qE attribute only applies to functions",
32207                name);
32208       *no_add_attrs = true;
32209     }
32210   return NULL_TREE;
32211 }
32212 
32213 static bool
32214 ix86_ms_bitfield_layout_p (const_tree record_type)
32215 {
32216   return ((TARGET_MS_BITFIELD_LAYOUT
32217 	   && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32218           || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32219 }
32220 
32221 /* Returns an expression indicating where the this parameter is
32222    located on entry to the FUNCTION.  */
32223 
32224 static rtx
32225 x86_this_parameter (tree function)
32226 {
32227   tree type = TREE_TYPE (function);
32228   bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32229   int nregs;
32230 
32231   if (TARGET_64BIT)
32232     {
32233       const int *parm_regs;
32234 
32235       if (ix86_function_type_abi (type) == MS_ABI)
32236         parm_regs = x86_64_ms_abi_int_parameter_registers;
32237       else
32238         parm_regs = x86_64_int_parameter_registers;
32239       return gen_rtx_REG (DImode, parm_regs[aggr]);
32240     }
32241 
32242   nregs = ix86_function_regparm (type, function);
32243 
32244   if (nregs > 0 && !stdarg_p (type))
32245     {
32246       int regno;
32247       unsigned int ccvt = ix86_get_callcvt (type);
32248 
32249       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32250 	regno = aggr ? DX_REG : CX_REG;
32251       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32252         {
32253 	  regno = CX_REG;
32254 	  if (aggr)
32255 	    return gen_rtx_MEM (SImode,
32256 				plus_constant (stack_pointer_rtx, 4));
32257 	}
32258       else
32259         {
32260 	  regno = AX_REG;
32261 	  if (aggr)
32262 	    {
32263 	      regno = DX_REG;
32264 	      if (nregs == 1)
32265 		return gen_rtx_MEM (SImode,
32266 				    plus_constant (stack_pointer_rtx, 4));
32267 	    }
32268 	}
32269       return gen_rtx_REG (SImode, regno);
32270     }
32271 
32272   return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32273 }
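/* Illustrative results, tracing the cases above (not exhaustive): on a
   64-bit SYSV target the this pointer is in %rdi, or %rsi when a hidden
   aggregate-return pointer comes first; for a 32-bit fastcall method it
   is in %ecx (%edx for the aggregate case); otherwise it is found on the
   stack at 4(%esp), or at 8(%esp) when an aggregate return pointer is
   passed first.  */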
32274 
32275 /* Determine whether x86_output_mi_thunk can succeed.  */
32276 
32277 static bool
32278 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32279 			 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32280 			 HOST_WIDE_INT vcall_offset, const_tree function)
32281 {
32282   /* 64-bit can handle anything.  */
32283   if (TARGET_64BIT)
32284     return true;
32285 
32286   /* For 32-bit, everything's fine if we have one free register.  */
32287   if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32288     return true;
32289 
32290   /* Need a free register for vcall_offset.  */
32291   if (vcall_offset)
32292     return false;
32293 
32294   /* Need a free register for GOT references.  */
32295   if (flag_pic && !targetm.binds_local_p (function))
32296     return false;
32297 
32298   /* Otherwise ok.  */
32299   return true;
32300 }
32301 
32302 /* Output the assembler code for a thunk function.  THUNK_DECL is the
32303    declaration for the thunk function itself, FUNCTION is the decl for
32304    the target function.  DELTA is an immediate constant offset to be
32305    added to THIS.  If VCALL_OFFSET is nonzero, the word at
32306    *(*this + vcall_offset) should be added to THIS.  */
32307 
32308 static void
32309 x86_output_mi_thunk (FILE *file,
32310 		     tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32311 		     HOST_WIDE_INT vcall_offset, tree function)
32312 {
32313   rtx this_param = x86_this_parameter (function);
32314   rtx this_reg, tmp, fnaddr;
32315   unsigned int tmp_regno;
32316 
32317   if (TARGET_64BIT)
32318     tmp_regno = R10_REG;
32319   else
32320     {
32321       unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32322       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32323 	tmp_regno = AX_REG;
32324       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32325 	tmp_regno = DX_REG;
32326       else
32327 	tmp_regno = CX_REG;
32328     }
32329 
32330   emit_note (NOTE_INSN_PROLOGUE_END);
32331 
32332   /* If VCALL_OFFSET, we'll need THIS in a register.  Might as well
32333      pull it in now and let DELTA benefit.  */
32334   if (REG_P (this_param))
32335     this_reg = this_param;
32336   else if (vcall_offset)
32337     {
32338       /* Put the this parameter into %eax.  */
32339       this_reg = gen_rtx_REG (Pmode, AX_REG);
32340       emit_move_insn (this_reg, this_param);
32341     }
32342   else
32343     this_reg = NULL_RTX;
32344 
32345   /* Adjust the this parameter by a fixed constant.  */
32346   if (delta)
32347     {
32348       rtx delta_rtx = GEN_INT (delta);
32349       rtx delta_dst = this_reg ? this_reg : this_param;
32350 
32351       if (TARGET_64BIT)
32352 	{
32353 	  if (!x86_64_general_operand (delta_rtx, Pmode))
32354 	    {
32355 	      tmp = gen_rtx_REG (Pmode, tmp_regno);
32356 	      emit_move_insn (tmp, delta_rtx);
32357 	      delta_rtx = tmp;
32358 	    }
32359 	}
32360 
32361       ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32362     }
32363 
32364   /* Adjust the this parameter by a value stored in the vtable.  */
32365   if (vcall_offset)
32366     {
32367       rtx vcall_addr, vcall_mem, this_mem;
32368 
32369       tmp = gen_rtx_REG (Pmode, tmp_regno);
32370 
32371       this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32372       if (Pmode != ptr_mode)
32373 	this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32374       emit_move_insn (tmp, this_mem);
32375 
32376       /* Adjust the this parameter.  */
32377       vcall_addr = plus_constant (tmp, vcall_offset);
32378       if (TARGET_64BIT
32379 	  && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32380 	{
32381 	  rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32382 	  emit_move_insn (tmp2, GEN_INT (vcall_offset));
32383 	  vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32384 	}
32385 
32386       vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32387       if (Pmode != ptr_mode)
32388 	emit_insn (gen_addsi_1_zext (this_reg,
32389 				     gen_rtx_REG (ptr_mode,
32390 						  REGNO (this_reg)),
32391 				     vcall_mem));
32392       else
32393 	ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32394     }
32395 
32396   /* If necessary, drop THIS back to its stack slot.  */
32397   if (this_reg && this_reg != this_param)
32398     emit_move_insn (this_param, this_reg);
32399 
32400   fnaddr = XEXP (DECL_RTL (function), 0);
32401   if (TARGET_64BIT)
32402     {
32403       if (!flag_pic || targetm.binds_local_p (function)
32404 	  || cfun->machine->call_abi == MS_ABI)
32405 	;
32406       else
32407 	{
32408 	  tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32409 	  tmp = gen_rtx_CONST (Pmode, tmp);
32410 	  fnaddr = gen_const_mem (Pmode, tmp);
32411 	}
32412     }
32413   else
32414     {
32415       if (!flag_pic || targetm.binds_local_p (function))
32416 	;
32417 #if TARGET_MACHO
32418       else if (TARGET_MACHO)
32419 	{
32420 	  fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32421 	  fnaddr = XEXP (fnaddr, 0);
32422 	}
32423 #endif /* TARGET_MACHO */
32424       else
32425 	{
32426 	  tmp = gen_rtx_REG (Pmode, CX_REG);
32427 	  output_set_got (tmp, NULL_RTX);
32428 
32429 	  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32430 	  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
32431 	  fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
32432 	  fnaddr = gen_const_mem (Pmode, fnaddr);
32433 	}
32434     }
32435 
32436   /* Our sibling call patterns do not allow memories, because we have no
32437      predicate that can distinguish between frame and non-frame memory.
32438      For our purposes here, we can get away with (ab)using a jump pattern,
32439      because we're going to do no optimization.  */
32440   if (MEM_P (fnaddr))
32441     emit_jump_insn (gen_indirect_jump (fnaddr));
32442   else
32443     {
32444       if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
32445 	fnaddr = legitimize_pic_address (fnaddr,
32446 					 gen_rtx_REG (Pmode, tmp_regno));
32447 
32448       if (!sibcall_insn_operand (fnaddr, Pmode))
32449 	{
32450 	  tmp = gen_rtx_REG (Pmode, tmp_regno);
32451 	  if (GET_MODE (fnaddr) != Pmode)
32452 	    fnaddr = gen_rtx_ZERO_EXTEND (Pmode, fnaddr);
32453 	  emit_move_insn (tmp, fnaddr);
32454 	  fnaddr = tmp;
32455 	}
32456 
32457       tmp = gen_rtx_MEM (QImode, fnaddr);
32458       tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32459       tmp = emit_call_insn (tmp);
32460       SIBLING_CALL_P (tmp) = 1;
32461     }
32462   emit_barrier ();
32463 
32464   /* Emit just enough of rest_of_compilation to get the insns emitted.
32465      Note that use_thunk calls assemble_start_function et al.  */
32466   tmp = get_insns ();
32467   insn_locators_alloc ();
32468   shorten_branches (tmp);
32469   final_start_function (tmp, file, 1);
32470   final (tmp, file, 1);
32471   final_end_function ();
32472 }
32473 
32474 static void
32475 x86_file_start (void)
32476 {
32477   default_file_start ();
32478 #if TARGET_MACHO
32479   darwin_file_start ();
32480 #endif
32481   if (X86_FILE_START_VERSION_DIRECTIVE)
32482     fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32483   if (X86_FILE_START_FLTUSED)
32484     fputs ("\t.global\t__fltused\n", asm_out_file);
32485   if (ix86_asm_dialect == ASM_INTEL)
32486     fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32487 }
32488 
32489 int
32490 x86_field_alignment (tree field, int computed)
32491 {
32492   enum machine_mode mode;
32493   tree type = TREE_TYPE (field);
32494 
32495   if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32496     return computed;
32497   mode = TYPE_MODE (strip_array_types (type));
32498   if (mode == DFmode || mode == DCmode
32499       || GET_MODE_CLASS (mode) == MODE_INT
32500       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32501     return MIN (32, computed);
32502   return computed;
32503 }
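/* For example (a sketch; a 32-bit target without -malign-double is
   assumed), given

     struct s { char c; double d; };

   the double field's natural 64-bit alignment is capped at 32 bits by the
   MIN (32, computed) above, matching the traditional ix86 ABI.  */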
32504 
32505 /* Output assembler code to FILE to increment profiler label # LABELNO
32506    for profiling a function entry.  */
32507 void
32508 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32509 {
32510   const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32511 					 : MCOUNT_NAME);
32512 
32513   if (TARGET_64BIT)
32514     {
32515 #ifndef NO_PROFILE_COUNTERS
32516       fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32517 #endif
32518 
32519       if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32520 	fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32521       else
32522 	fprintf (file, "\tcall\t%s\n", mcount_name);
32523     }
32524   else if (flag_pic)
32525     {
32526 #ifndef NO_PROFILE_COUNTERS
32527       fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32528 	       LPREFIX, labelno);
32529 #endif
32530       fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32531     }
32532   else
32533     {
32534 #ifndef NO_PROFILE_COUNTERS
32535       fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32536 	       LPREFIX, labelno);
32537 #endif
32538       fprintf (file, "\tcall\t%s\n", mcount_name);
32539     }
32540 }
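/* For a plain non-PIC 64-bit -pg compilation the code above emits
   something along the lines of (illustrative, profile counters disabled):

	call	mcount

   with the symbol name taken from MCOUNT_NAME, or from
   MCOUNT_NAME_BEFORE_PROLOGUE when -mfentry is in use.  */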
32541 
32542 /* We don't have exact information about the insn sizes, but we may assume
32543    quite safely that we are informed about all 1 byte insns and memory
32544    address sizes.  This is enough to eliminate unnecessary padding in
32545    99% of cases.  */
32546 
32547 static int
32548 min_insn_size (rtx insn)
32549 {
32550   int l = 0, len;
32551 
32552   if (!INSN_P (insn) || !active_insn_p (insn))
32553     return 0;
32554 
32555   /* Discard alignments we've emitted and jump table data.  */
32556   if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32557       && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32558     return 0;
32559   if (JUMP_TABLE_DATA_P (insn))
32560     return 0;
32561 
32562   /* Important case - calls are always 5 bytes.
32563      It is common to have many calls in a row.  */
32564   if (CALL_P (insn)
32565       && symbolic_reference_mentioned_p (PATTERN (insn))
32566       && !SIBLING_CALL_P (insn))
32567     return 5;
32568   len = get_attr_length (insn);
32569   if (len <= 1)
32570     return 1;
32571 
32572   /* For normal instructions we rely on get_attr_length being exact,
32573      with a few exceptions.  */
32574   if (!JUMP_P (insn))
32575     {
32576       enum attr_type type = get_attr_type (insn);
32577 
32578       switch (type)
32579 	{
32580 	case TYPE_MULTI:
32581 	  if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32582 	      || asm_noperands (PATTERN (insn)) >= 0)
32583 	    return 0;
32584 	  break;
32585 	case TYPE_OTHER:
32586 	case TYPE_FCMP:
32587 	  break;
32588 	default:
32589 	  /* Otherwise trust get_attr_length.  */
32590 	  return len;
32591 	}
32592 
32593       l = get_attr_length_address (insn);
32594       if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32595 	l = 4;
32596     }
32597   if (l)
32598     return 1+l;
32599   else
32600     return 2;
32601 }
32602 
32603 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32604 
32605 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
32606    window.  */
32607 
32608 static void
32609 ix86_avoid_jump_mispredicts (void)
32610 {
32611   rtx insn, start = get_insns ();
32612   int nbytes = 0, njumps = 0;
32613   int isjump = 0;
32614 
32615   /* Look for all minimal intervals of instructions containing 4 jumps.
32616      The intervals are bounded by START and INSN.  NBYTES is the total
32617      size of instructions in the interval including INSN and not including
32618      START.  When NBYTES is smaller than 16 bytes, it is possible
32619      that the end of START and INSN ends up in the same 16 byte page.
32620 
32621      The smallest offset in the page INSN can start is the case where START
32622      ends on the offset 0.  Offset of INSN is then NBYTES - sizeof (INSN).
32623      We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
32624      */
32625   for (insn = start; insn; insn = NEXT_INSN (insn))
32626     {
32627       int min_size;
32628 
32629       if (LABEL_P (insn))
32630 	{
32631 	  int align = label_to_alignment (insn);
32632 	  int max_skip = label_to_max_skip (insn);
32633 
32634 	  if (max_skip > 15)
32635 	    max_skip = 15;
32636 	  /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32637 	     already in the current 16 byte page, because otherwise
32638 	     ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32639 	     bytes to reach 16 byte boundary.  */
32640 	  if (align <= 0
32641 	      || (align <= 3 && max_skip != (1 << align) - 1))
32642 	    max_skip = 0;
32643 	  if (dump_file)
32644 	    fprintf (dump_file, "Label %i with max_skip %i\n",
32645 		     INSN_UID (insn), max_skip);
32646 	  if (max_skip)
32647 	    {
32648 	      while (nbytes + max_skip >= 16)
32649 		{
32650 		  start = NEXT_INSN (start);
32651 		  if ((JUMP_P (start)
32652 		       && GET_CODE (PATTERN (start)) != ADDR_VEC
32653 		       && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32654 		      || CALL_P (start))
32655 		    njumps--, isjump = 1;
32656 		  else
32657 		    isjump = 0;
32658 		  nbytes -= min_insn_size (start);
32659 		}
32660 	    }
32661 	  continue;
32662 	}
32663 
32664       min_size = min_insn_size (insn);
32665       nbytes += min_size;
32666       if (dump_file)
32667 	fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32668 		 INSN_UID (insn), min_size);
32669       if ((JUMP_P (insn)
32670 	   && GET_CODE (PATTERN (insn)) != ADDR_VEC
32671 	   && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32672 	  || CALL_P (insn))
32673 	njumps++;
32674       else
32675 	continue;
32676 
32677       while (njumps > 3)
32678 	{
32679 	  start = NEXT_INSN (start);
32680 	  if ((JUMP_P (start)
32681 	       && GET_CODE (PATTERN (start)) != ADDR_VEC
32682 	       && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32683 	      || CALL_P (start))
32684 	    njumps--, isjump = 1;
32685 	  else
32686 	    isjump = 0;
32687 	  nbytes -= min_insn_size (start);
32688 	}
32689       gcc_assert (njumps >= 0);
32690       if (dump_file)
32691         fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32692 		 INSN_UID (start), INSN_UID (insn), nbytes);
32693 
32694       if (njumps == 3 && isjump && nbytes < 16)
32695 	{
32696 	  int padsize = 15 - nbytes + min_insn_size (insn);
32697 
32698 	  if (dump_file)
32699 	    fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32700 		     INSN_UID (insn), padsize);
32701           emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32702 	}
32703     }
32704 }
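/* Worked example (the numbers are illustrative): if the minimal interval
   ending at INSN contains a fourth jump or call and NBYTES is 12, the
   jumps could all share one 16-byte window, so a pad of
   15 - 12 + min_insn_size (INSN) bytes is emitted before INSN, pushing
   it into the next 16-byte window.  */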
32705 #endif
32706 
32707 /* AMD Athlon works faster
32708    when RET is not the destination of a conditional jump or directly preceded
32709    by another jump instruction.  We avoid the penalty by padding the RET
32710    instruction in such cases.  */
32711 static void
32712 ix86_pad_returns (void)
32713 {
32714   edge e;
32715   edge_iterator ei;
32716 
32717   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32718     {
32719       basic_block bb = e->src;
32720       rtx ret = BB_END (bb);
32721       rtx prev;
32722       bool replace = false;
32723 
32724       if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32725 	  || optimize_bb_for_size_p (bb))
32726 	continue;
32727       for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32728 	if (active_insn_p (prev) || LABEL_P (prev))
32729 	  break;
32730       if (prev && LABEL_P (prev))
32731 	{
32732 	  edge e;
32733 	  edge_iterator ei;
32734 
32735 	  FOR_EACH_EDGE (e, ei, bb->preds)
32736 	    if (EDGE_FREQUENCY (e) && e->src->index >= 0
32737 		&& !(e->flags & EDGE_FALLTHRU))
32738 	      replace = true;
32739 	}
32740       if (!replace)
32741 	{
32742 	  prev = prev_active_insn (ret);
32743 	  if (prev
32744 	      && ((JUMP_P (prev) && any_condjump_p (prev))
32745 		  || CALL_P (prev)))
32746 	    replace = true;
32747 	  /* Empty functions get a branch mispredict even when
32748 	     the jump destination is not visible to us.  */
32749 	  if (!prev && !optimize_function_for_size_p (cfun))
32750 	    replace = true;
32751 	}
32752       if (replace)
32753 	{
32754 	  emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32755 	  delete_insn (ret);
32756 	}
32757     }
32758 }
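/* The replacement emitted above is the long return form (see the
   simple_return_internal_long pattern in i386.md); on the affected AMD
   cores this is typically assembled as "rep ret", which avoids placing a
   single-byte RET directly after a jump or call.  (Sketch of the intent;
   the exact encoding comes from the md pattern.)  */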
32759 
32760 /* Count the minimum number of instructions in BB.  Return 4 if the
32761    number of instructions >= 4.  */
32762 
32763 static int
32764 ix86_count_insn_bb (basic_block bb)
32765 {
32766   rtx insn;
32767   int insn_count = 0;
32768 
32769   /* Count number of instructions in this block.  Return 4 if the number
32770      of instructions >= 4.  */
32771   FOR_BB_INSNS (bb, insn)
32772     {
32773       /* This can only happen in exit blocks.  */
32774       if (JUMP_P (insn)
32775 	  && ANY_RETURN_P (PATTERN (insn)))
32776 	break;
32777 
32778       if (NONDEBUG_INSN_P (insn)
32779 	  && GET_CODE (PATTERN (insn)) != USE
32780 	  && GET_CODE (PATTERN (insn)) != CLOBBER)
32781 	{
32782 	  insn_count++;
32783 	  if (insn_count >= 4)
32784 	    return insn_count;
32785 	}
32786     }
32787 
32788   return insn_count;
32789 }
32790 
32791 
32792 /* Count the minimum number of instructions in code path in BB.
32793    Return 4 if the number of instructions >= 4.  */
32794 
32795 static int
32796 ix86_count_insn (basic_block bb)
32797 {
32798   edge e;
32799   edge_iterator ei;
32800   int min_prev_count;
32801 
32802   /* Only bother counting instructions along paths with no
32803      more than 2 basic blocks between entry and exit.  Given
32804      that BB has an edge to exit, determine if a predecessor
32805      of BB has an edge from entry.  If so, compute the number
32806      of instructions in the predecessor block.  If there
32807      happen to be multiple such blocks, compute the minimum.  */
32808   min_prev_count = 4;
32809   FOR_EACH_EDGE (e, ei, bb->preds)
32810     {
32811       edge prev_e;
32812       edge_iterator prev_ei;
32813 
32814       if (e->src == ENTRY_BLOCK_PTR)
32815 	{
32816 	  min_prev_count = 0;
32817 	  break;
32818 	}
32819       FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32820 	{
32821 	  if (prev_e->src == ENTRY_BLOCK_PTR)
32822 	    {
32823 	      int count = ix86_count_insn_bb (e->src);
32824 	      if (count < min_prev_count)
32825 		min_prev_count = count;
32826 	      break;
32827 	    }
32828 	}
32829     }
32830 
32831   if (min_prev_count < 4)
32832     min_prev_count += ix86_count_insn_bb (bb);
32833 
32834   return min_prev_count;
32835 }
32836 
32837 /* Pad short functions to 4 instructions.  */
32838 
32839 static void
32840 ix86_pad_short_function (void)
32841 {
32842   edge e;
32843   edge_iterator ei;
32844 
32845   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32846     {
32847       rtx ret = BB_END (e->src);
32848       if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32849 	{
32850 	  int insn_count = ix86_count_insn (e->src);
32851 
32852 	  /* Pad short function.  */
32853 	  if (insn_count < 4)
32854 	    {
32855 	      rtx insn = ret;
32856 
32857 	      /* Find epilogue.  */
32858 	      while (insn
32859 		     && (!NOTE_P (insn)
32860 			 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32861 		insn = PREV_INSN (insn);
32862 
32863 	      if (!insn)
32864 		insn = ret;
32865 
32866 	      /* Two NOPs count as one instruction.  */
32867 	      insn_count = 2 * (4 - insn_count);
32868 	      emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32869 	    }
32870 	}
32871     }
32872 }
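/* For instance, a function whose shortest entry-to-exit path contains only
   two instructions gets 2 * (4 - 2) = 4 NOPs emitted before its epilogue,
   following the convention above that two NOPs count as one instruction on
   the targets that enable TARGET_PAD_SHORT_FUNCTION.  */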
32873 
32874 /* Implement machine specific optimizations.  We implement padding of returns
32875    for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window.  */
32876 static void
32877 ix86_reorg (void)
32878 {
32879   /* We are freeing block_for_insn in the toplev to keep compatibility
32880      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
32881   compute_bb_for_insn ();
32882 
32883   /* Run the vzeroupper optimization if needed.  */
32884   if (TARGET_VZEROUPPER)
32885     move_or_delete_vzeroupper ();
32886 
32887   if (optimize && optimize_function_for_speed_p (cfun))
32888     {
32889       if (TARGET_PAD_SHORT_FUNCTION)
32890 	ix86_pad_short_function ();
32891       else if (TARGET_PAD_RETURNS)
32892 	ix86_pad_returns ();
32893 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32894       if (TARGET_FOUR_JUMP_LIMIT)
32895 	ix86_avoid_jump_mispredicts ();
32896 #endif
32897     }
32898 }
32899 
32900 /* Return nonzero when a QImode register that must be represented via a REX
32901    prefix is used.  */
32902 bool
32903 x86_extended_QIreg_mentioned_p (rtx insn)
32904 {
32905   int i;
32906   extract_insn_cached (insn);
32907   for (i = 0; i < recog_data.n_operands; i++)
32908     if (REG_P (recog_data.operand[i])
32909 	&& REGNO (recog_data.operand[i]) > BX_REG)
32910        return true;
32911   return false;
32912 }
32913 
32914 /* Return nonzero when P points to a register encoded via a REX prefix.
32915    Called via for_each_rtx.  */
32916 static int
32917 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32918 {
32919    unsigned int regno;
32920    if (!REG_P (*p))
32921      return 0;
32922    regno = REGNO (*p);
32923    return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32924 }
32925 
32926 /* Return true when INSN mentions a register that must be encoded using a REX
32927    prefix.  */
32928 bool
32929 x86_extended_reg_mentioned_p (rtx insn)
32930 {
32931   return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32932 		       extended_reg_mentioned_1, NULL);
32933 }
32934 
32935 /* If profitable, negate (without causing overflow) integer constant
32936    of mode MODE at location LOC.  Return true in this case.  */
32937 bool
32938 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32939 {
32940   HOST_WIDE_INT val;
32941 
32942   if (!CONST_INT_P (*loc))
32943     return false;
32944 
32945   switch (mode)
32946     {
32947     case DImode:
32948       /* DImode x86_64 constants must fit in 32 bits.  */
32949       gcc_assert (x86_64_immediate_operand (*loc, mode));
32950 
32951       mode = SImode;
32952       break;
32953 
32954     case SImode:
32955     case HImode:
32956     case QImode:
32957       break;
32958 
32959     default:
32960       gcc_unreachable ();
32961     }
32962 
32963   /* Avoid overflows.  */
32964   if (mode_signbit_p (mode, *loc))
32965     return false;
32966 
32967   val = INTVAL (*loc);
32968 
32969   /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32970      Exceptions: -128 encodes smaller than 128, so swap sign and op.  */
32971   if ((val < 0 && val != -128)
32972       || val == 128)
32973     {
32974       *loc = GEN_INT (-val);
32975       return true;
32976     }
32977 
32978   return false;
32979 }
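/* Example of the transformation above: an addition of 128 is rewritten as
   a subtraction of -128, because -128 fits in a sign-extended 8-bit
   immediate while +128 does not, e.g. "addl $128, %eax" becomes
   "subl $-128, %eax".  This function only negates the constant; the caller
   is responsible for flipping the operation.  */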
32980 
32981 /* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
32982    optabs would emit if we didn't have TFmode patterns.  */
32983 
32984 void
32985 x86_emit_floatuns (rtx operands[2])
32986 {
32987   rtx neglab, donelab, i0, i1, f0, in, out;
32988   enum machine_mode mode, inmode;
32989 
32990   inmode = GET_MODE (operands[1]);
32991   gcc_assert (inmode == SImode || inmode == DImode);
32992 
32993   out = operands[0];
32994   in = force_reg (inmode, operands[1]);
32995   mode = GET_MODE (out);
32996   neglab = gen_label_rtx ();
32997   donelab = gen_label_rtx ();
32998   f0 = gen_reg_rtx (mode);
32999 
33000   emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33001 
33002   expand_float (out, in, 0);
33003 
33004   emit_jump_insn (gen_jump (donelab));
33005   emit_barrier ();
33006 
33007   emit_label (neglab);
33008 
33009   i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33010 			    1, OPTAB_DIRECT);
33011   i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33012 			    1, OPTAB_DIRECT);
33013   i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33014 
33015   expand_float (f0, i0, 0);
33016 
33017   emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33018 
33019   emit_label (donelab);
33020 }
33021 
33022 /* AVX2 supports 32-byte integer vector operations,
33023    thus the longest vector we are faced with is V32QImode.  */
33024 #define MAX_VECT_LEN	32
33025 
33026 struct expand_vec_perm_d
33027 {
33028   rtx target, op0, op1;
33029   unsigned char perm[MAX_VECT_LEN];
33030   enum machine_mode vmode;
33031   unsigned char nelt;
33032   bool testing_p;
33033 };
33034 
33035 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33036 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33037 
33038 /* Get a vector mode of the same size as the original but with elements
33039    twice as wide.  This is only guaranteed to apply to integral vectors.  */
33040 
33041 static inline enum machine_mode
33042 get_mode_wider_vector (enum machine_mode o)
33043 {
33044   /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
33045   enum machine_mode n = GET_MODE_WIDER_MODE (o);
33046   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33047   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33048   return n;
33049 }
33050 
33051 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
33052    with all elements equal to VAR.  Return true if successful.  */
33053 
33054 static bool
33055 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33056 				   rtx target, rtx val)
33057 {
33058   bool ok;
33059 
33060   switch (mode)
33061     {
33062     case V2SImode:
33063     case V2SFmode:
33064       if (!mmx_ok)
33065 	return false;
33066       /* FALLTHRU */
33067 
33068     case V4DFmode:
33069     case V4DImode:
33070     case V8SFmode:
33071     case V8SImode:
33072     case V2DFmode:
33073     case V2DImode:
33074     case V4SFmode:
33075     case V4SImode:
33076       {
33077 	rtx insn, dup;
33078 
33079 	/* First attempt to recognize VAL as-is.  */
33080 	dup = gen_rtx_VEC_DUPLICATE (mode, val);
33081 	insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33082 	if (recog_memoized (insn) < 0)
33083 	  {
33084 	    rtx seq;
33085 	    /* If that fails, force VAL into a register.  */
33086 
33087 	    start_sequence ();
33088 	    XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33089 	    seq = get_insns ();
33090 	    end_sequence ();
33091 	    if (seq)
33092 	      emit_insn_before (seq, insn);
33093 
33094 	    ok = recog_memoized (insn) >= 0;
33095 	    gcc_assert (ok);
33096 	  }
33097       }
33098       return true;
33099 
33100     case V4HImode:
33101       if (!mmx_ok)
33102 	return false;
33103       if (TARGET_SSE || TARGET_3DNOW_A)
33104 	{
33105 	  rtx x;
33106 
33107 	  val = gen_lowpart (SImode, val);
33108 	  x = gen_rtx_TRUNCATE (HImode, val);
33109 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
33110 	  emit_insn (gen_rtx_SET (VOIDmode, target, x));
33111 	  return true;
33112 	}
33113       goto widen;
33114 
33115     case V8QImode:
33116       if (!mmx_ok)
33117 	return false;
33118       goto widen;
33119 
33120     case V8HImode:
33121       if (TARGET_SSE2)
33122 	{
33123 	  struct expand_vec_perm_d dperm;
33124 	  rtx tmp1, tmp2;
33125 
33126 	permute:
33127 	  memset (&dperm, 0, sizeof (dperm));
33128 	  dperm.target = target;
33129 	  dperm.vmode = mode;
33130 	  dperm.nelt = GET_MODE_NUNITS (mode);
33131 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33132 
33133 	  /* Extend to SImode using a paradoxical SUBREG.  */
33134 	  tmp1 = gen_reg_rtx (SImode);
33135 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
33136 
33137 	  /* Insert the SImode value as low element of a V4SImode vector. */
33138 	  tmp2 = gen_lowpart (V4SImode, dperm.op0);
33139 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33140 
33141 	  ok = (expand_vec_perm_1 (&dperm)
33142 		|| expand_vec_perm_broadcast_1 (&dperm));
33143 	  gcc_assert (ok);
33144 	  return ok;
33145 	}
33146       goto widen;
33147 
33148     case V16QImode:
33149       if (TARGET_SSE2)
33150 	goto permute;
33151       goto widen;
33152 
33153     widen:
33154       /* Replicate the value once into the next wider mode and recurse.  */
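            /* For example, a V8QImode broadcast of byte B first forms the
               HImode value (B << 8) | B and then broadcasts that value as a
               V4HImode vector of the same size.  */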
33155       {
33156 	enum machine_mode smode, wsmode, wvmode;
33157 	rtx x;
33158 
33159 	smode = GET_MODE_INNER (mode);
33160 	wvmode = get_mode_wider_vector (mode);
33161 	wsmode = GET_MODE_INNER (wvmode);
33162 
33163 	val = convert_modes (wsmode, smode, val, true);
33164 	x = expand_simple_binop (wsmode, ASHIFT, val,
33165 				 GEN_INT (GET_MODE_BITSIZE (smode)),
33166 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33167 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33168 
33169 	x = gen_lowpart (wvmode, target);
33170 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33171 	gcc_assert (ok);
33172 	return ok;
33173       }
33174 
33175     case V16HImode:
33176     case V32QImode:
33177       {
33178 	enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33179 	rtx x = gen_reg_rtx (hvmode);
33180 
33181 	ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33182 	gcc_assert (ok);
33183 
33184 	x = gen_rtx_VEC_CONCAT (mode, x, x);
33185 	emit_insn (gen_rtx_SET (VOIDmode, target, x));
33186       }
33187       return true;
33188 
33189     default:
33190       return false;
33191     }
33192 }
33193 
33194 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
33195    whose ONE_VAR element is VAR, and other elements are zero.  Return true
33196    if successful.  */
33197 
33198 static bool
33199 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33200 				     rtx target, rtx var, int one_var)
33201 {
33202   enum machine_mode vsimode;
33203   rtx new_target;
33204   rtx x, tmp;
33205   bool use_vector_set = false;
33206 
33207   switch (mode)
33208     {
33209     case V2DImode:
33210       /* For SSE4.1, we normally use vector set.  But if the second
33211 	 element is zero and inter-unit moves are OK, we use movq
33212 	 instead.  */
33213       use_vector_set = (TARGET_64BIT
33214 			&& TARGET_SSE4_1
33215 			&& !(TARGET_INTER_UNIT_MOVES
33216 			     && one_var == 0));
33217       break;
33218     case V16QImode:
33219     case V4SImode:
33220     case V4SFmode:
33221       use_vector_set = TARGET_SSE4_1;
33222       break;
33223     case V8HImode:
33224       use_vector_set = TARGET_SSE2;
33225       break;
33226     case V4HImode:
33227       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33228       break;
33229     case V32QImode:
33230     case V16HImode:
33231     case V8SImode:
33232     case V8SFmode:
33233     case V4DFmode:
33234       use_vector_set = TARGET_AVX;
33235       break;
33236     case V4DImode:
33237       /* Use ix86_expand_vector_set in 64bit mode only.  */
33238       use_vector_set = TARGET_AVX && TARGET_64BIT;
33239       break;
33240     default:
33241       break;
33242     }
33243 
33244   if (use_vector_set)
33245     {
33246       emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33247       var = force_reg (GET_MODE_INNER (mode), var);
33248       ix86_expand_vector_set (mmx_ok, target, var, one_var);
33249       return true;
33250     }
33251 
33252   switch (mode)
33253     {
33254     case V2SFmode:
33255     case V2SImode:
33256       if (!mmx_ok)
33257 	return false;
33258       /* FALLTHRU */
33259 
33260     case V2DFmode:
33261     case V2DImode:
33262       if (one_var != 0)
33263 	return false;
33264       var = force_reg (GET_MODE_INNER (mode), var);
33265       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33266       emit_insn (gen_rtx_SET (VOIDmode, target, x));
33267       return true;
33268 
33269     case V4SFmode:
33270     case V4SImode:
33271       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33272 	new_target = gen_reg_rtx (mode);
33273       else
33274 	new_target = target;
33275       var = force_reg (GET_MODE_INNER (mode), var);
33276       x = gen_rtx_VEC_DUPLICATE (mode, var);
33277       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33278       emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33279       if (one_var != 0)
33280 	{
33281 	  /* We need to shuffle the value to the correct position, so
33282 	     create a new pseudo to store the intermediate result.  */
33283 
33284 	  /* With SSE2, we can use the integer shuffle insns.  */
33285 	  if (mode != V4SFmode && TARGET_SSE2)
33286 	    {
33287 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33288 					    const1_rtx,
33289 					    GEN_INT (one_var == 1 ? 0 : 1),
33290 					    GEN_INT (one_var == 2 ? 0 : 1),
33291 					    GEN_INT (one_var == 3 ? 0 : 1)));
33292 	      if (target != new_target)
33293 		emit_move_insn (target, new_target);
33294 	      return true;
33295 	    }
33296 
33297 	  /* Otherwise convert the intermediate result to V4SFmode and
33298 	     use the SSE1 shuffle instructions.  */
33299 	  if (mode != V4SFmode)
33300 	    {
33301 	      tmp = gen_reg_rtx (V4SFmode);
33302 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33303 	    }
33304 	  else
33305 	    tmp = new_target;
33306 
33307 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33308 				       const1_rtx,
33309 				       GEN_INT (one_var == 1 ? 0 : 1),
33310 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
33311 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33312 
33313 	  if (mode != V4SFmode)
33314 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33315 	  else if (tmp != target)
33316 	    emit_move_insn (target, tmp);
33317 	}
33318       else if (target != new_target)
33319 	emit_move_insn (target, new_target);
33320       return true;
33321 
33322     case V8HImode:
33323     case V16QImode:
33324       vsimode = V4SImode;
33325       goto widen;
33326     case V4HImode:
33327     case V8QImode:
33328       if (!mmx_ok)
33329 	return false;
33330       vsimode = V2SImode;
33331       goto widen;
33332     widen:
33333       if (one_var != 0)
33334 	return false;
33335 
33336       /* Zero extend the variable element to SImode and recurse.  */
33337       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33338 
33339       x = gen_reg_rtx (vsimode);
33340       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33341 						var, one_var))
33342 	gcc_unreachable ();
33343 
33344       emit_move_insn (target, gen_lowpart (mode, x));
33345       return true;
33346 
33347     default:
33348       return false;
33349     }
33350 }
33351 
33352 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
33353    consisting of the values in VALS.  It is known that all elements
33354    except ONE_VAR are constants.  Return true if successful.  */
33355 
33356 static bool
33357 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33358 				 rtx target, rtx vals, int one_var)
33359 {
33360   rtx var = XVECEXP (vals, 0, one_var);
33361   enum machine_mode wmode;
33362   rtx const_vec, x;
33363 
33364   const_vec = copy_rtx (vals);
33365   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33366   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33367 
33368   switch (mode)
33369     {
33370     case V2DFmode:
33371     case V2DImode:
33372     case V2SFmode:
33373     case V2SImode:
33374       /* For the two element vectors, it's just as easy to use
33375 	 the general case.  */
33376       return false;
33377 
33378     case V4DImode:
33379       /* Use ix86_expand_vector_set in 64bit mode only.  */
33380       if (!TARGET_64BIT)
33381 	return false;
33382     case V4DFmode:
33383     case V8SFmode:
33384     case V8SImode:
33385     case V16HImode:
33386     case V32QImode:
33387     case V4SFmode:
33388     case V4SImode:
33389     case V8HImode:
33390     case V4HImode:
33391       break;
33392 
33393     case V16QImode:
33394       if (TARGET_SSE4_1)
33395 	break;
33396       wmode = V8HImode;
33397       goto widen;
33398     case V8QImode:
33399       wmode = V4HImode;
33400       goto widen;
33401     widen:
33402       /* There's no way to set one QImode entry easily.  Combine
33403 	 the variable value with its adjacent constant value, and
33404 	 promote to an HImode set.  */
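            /* For example, setting element 5 of a V16QImode vector becomes
               setting element 2 of the V8HImode view: VAR supplies the high
               byte of that HImode word and the adjacent constant element 4
               supplies the low byte.  */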
33405       x = XVECEXP (vals, 0, one_var ^ 1);
33406       if (one_var & 1)
33407 	{
33408 	  var = convert_modes (HImode, QImode, var, true);
33409 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33410 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
33411 	  x = GEN_INT (INTVAL (x) & 0xff);
33412 	}
33413       else
33414 	{
33415 	  var = convert_modes (HImode, QImode, var, true);
33416 	  x = gen_int_mode (INTVAL (x) << 8, HImode);
33417 	}
33418       if (x != const0_rtx)
33419 	var = expand_simple_binop (HImode, IOR, var, x, var,
33420 				   1, OPTAB_LIB_WIDEN);
33421 
33422       x = gen_reg_rtx (wmode);
33423       emit_move_insn (x, gen_lowpart (wmode, const_vec));
33424       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33425 
33426       emit_move_insn (target, gen_lowpart (mode, x));
33427       return true;
33428 
33429     default:
33430       return false;
33431     }
33432 
33433   emit_move_insn (target, const_vec);
33434   ix86_expand_vector_set (mmx_ok, target, var, one_var);
33435   return true;
33436 }
33437 
33438 /* A subroutine of ix86_expand_vector_init_general.  Use vector
33439    concatenate to handle the most general case: all values variable,
33440    and none identical.  */
33441 
33442 static void
33443 ix86_expand_vector_init_concat (enum machine_mode mode,
33444 				rtx target, rtx *ops, int n)
33445 {
33446   enum machine_mode cmode, hmode = VOIDmode;
33447   rtx first[8], second[4];
33448   rtvec v;
33449   int i, j;
33450 
33451   switch (n)
33452     {
33453     case 2:
33454       switch (mode)
33455 	{
33456 	case V8SImode:
33457 	  cmode = V4SImode;
33458 	  break;
33459 	case V8SFmode:
33460 	  cmode = V4SFmode;
33461 	  break;
33462 	case V4DImode:
33463 	  cmode = V2DImode;
33464 	  break;
33465 	case V4DFmode:
33466 	  cmode = V2DFmode;
33467 	  break;
33468 	case V4SImode:
33469 	  cmode = V2SImode;
33470 	  break;
33471 	case V4SFmode:
33472 	  cmode = V2SFmode;
33473 	  break;
33474 	case V2DImode:
33475 	  cmode = DImode;
33476 	  break;
33477 	case V2SImode:
33478 	  cmode = SImode;
33479 	  break;
33480 	case V2DFmode:
33481 	  cmode = DFmode;
33482 	  break;
33483 	case V2SFmode:
33484 	  cmode = SFmode;
33485 	  break;
33486 	default:
33487 	  gcc_unreachable ();
33488 	}
33489 
33490       if (!register_operand (ops[1], cmode))
33491 	ops[1] = force_reg (cmode, ops[1]);
33492       if (!register_operand (ops[0], cmode))
33493 	ops[0] = force_reg (cmode, ops[0]);
33494       emit_insn (gen_rtx_SET (VOIDmode, target,
33495 			      gen_rtx_VEC_CONCAT (mode, ops[0],
33496 						  ops[1])));
33497       break;
33498 
33499     case 4:
33500       switch (mode)
33501 	{
33502 	case V4DImode:
33503 	  cmode = V2DImode;
33504 	  break;
33505 	case V4DFmode:
33506 	  cmode = V2DFmode;
33507 	  break;
33508 	case V4SImode:
33509 	  cmode = V2SImode;
33510 	  break;
33511 	case V4SFmode:
33512 	  cmode = V2SFmode;
33513 	  break;
33514 	default:
33515 	  gcc_unreachable ();
33516 	}
33517       goto half;
33518 
33519     case 8:
33520       switch (mode)
33521 	{
33522 	case V8SImode:
33523 	  cmode = V2SImode;
33524 	  hmode = V4SImode;
33525 	  break;
33526 	case V8SFmode:
33527 	  cmode = V2SFmode;
33528 	  hmode = V4SFmode;
33529 	  break;
33530 	default:
33531 	  gcc_unreachable ();
33532 	}
33533       goto half;
33534 
33535 half:
33536       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
33537       i = n - 1;
33538       j = (n >> 1) - 1;
33539       for (; i > 0; i -= 2, j--)
33540 	{
33541 	  first[j] = gen_reg_rtx (cmode);
33542 	  v = gen_rtvec (2, ops[i - 1], ops[i]);
33543 	  ix86_expand_vector_init (false, first[j],
33544 				   gen_rtx_PARALLEL (cmode, v));
33545 	}
33546 
33547       n >>= 1;
33548       if (n > 2)
33549 	{
33550 	  gcc_assert (hmode != VOIDmode);
33551 	  for (i = j = 0; i < n; i += 2, j++)
33552 	    {
33553 	      second[j] = gen_reg_rtx (hmode);
33554 	      ix86_expand_vector_init_concat (hmode, second [j],
33555 					      &first [i], 2);
33556 	    }
33557 	  n >>= 1;
33558 	  ix86_expand_vector_init_concat (mode, target, second, n);
33559 	}
33560       else
33561 	ix86_expand_vector_init_concat (mode, target, first, n);
33562       break;
33563 
33564     default:
33565       gcc_unreachable ();
33566     }
33567 }
33568 
33569 /* A subroutine of ix86_expand_vector_init_general.  Use vector
33570    interleave to handle the most general case: all values variable,
33571    and none identical.  */
33572 
33573 static void
33574 ix86_expand_vector_init_interleave (enum machine_mode mode,
33575 				    rtx target, rtx *ops, int n)
33576 {
33577   enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33578   int i, j;
33579   rtx op0, op1;
33580   rtx (*gen_load_even) (rtx, rtx, rtx);
33581   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33582   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33583 
33584   switch (mode)
33585     {
33586     case V8HImode:
33587       gen_load_even = gen_vec_setv8hi;
33588       gen_interleave_first_low = gen_vec_interleave_lowv4si;
33589       gen_interleave_second_low = gen_vec_interleave_lowv2di;
33590       inner_mode = HImode;
33591       first_imode = V4SImode;
33592       second_imode = V2DImode;
33593       third_imode = VOIDmode;
33594       break;
33595     case V16QImode:
33596       gen_load_even = gen_vec_setv16qi;
33597       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33598       gen_interleave_second_low = gen_vec_interleave_lowv4si;
33599       inner_mode = QImode;
33600       first_imode = V8HImode;
33601       second_imode = V4SImode;
33602       third_imode = V2DImode;
33603       break;
33604     default:
33605       gcc_unreachable ();
33606     }
33607 
33608   for (i = 0; i < n; i++)
33609     {
33610       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
33611       op0 = gen_reg_rtx (SImode);
33612       emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33613 
33614       /* Insert the SImode value as the low element of a V4SImode vector.  */
33615       op1 = gen_reg_rtx (V4SImode);
33616       op0 = gen_rtx_VEC_MERGE (V4SImode,
33617 			       gen_rtx_VEC_DUPLICATE (V4SImode,
33618 						      op0),
33619 			       CONST0_RTX (V4SImode),
33620 			       const1_rtx);
33621       emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33622 
33623       /* Cast the V4SImode vector back to a vector in the original mode.  */
33624       op0 = gen_reg_rtx (mode);
33625       emit_move_insn (op0, gen_lowpart (mode, op1));
33626 
33627       /* Load even elements into the second position.  */
33628       emit_insn (gen_load_even (op0,
33629 				force_reg (inner_mode,
33630 					   ops [i + i + 1]),
33631 				const1_rtx));
33632 
33633       /* Cast vector to FIRST_IMODE vector.  */
33634       ops[i] = gen_reg_rtx (first_imode);
33635       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33636     }
33637 
33638   /* Interleave low FIRST_IMODE vectors.  */
33639   for (i = j = 0; i < n; i += 2, j++)
33640     {
33641       op0 = gen_reg_rtx (first_imode);
33642       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33643 
33644       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
33645       ops[j] = gen_reg_rtx (second_imode);
33646       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33647     }
33648 
33649   /* Interleave low SECOND_IMODE vectors.  */
33650   switch (second_imode)
33651     {
33652     case V4SImode:
33653       for (i = j = 0; i < n / 2; i += 2, j++)
33654 	{
33655 	  op0 = gen_reg_rtx (second_imode);
33656 	  emit_insn (gen_interleave_second_low (op0, ops[i],
33657 						ops[i + 1]));
33658 
33659 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33660 	     vector.  */
33661 	  ops[j] = gen_reg_rtx (third_imode);
33662 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33663 	}
33664       second_imode = V2DImode;
33665       gen_interleave_second_low = gen_vec_interleave_lowv2di;
33666       /* FALLTHRU */
33667 
33668     case V2DImode:
33669       op0 = gen_reg_rtx (second_imode);
33670       emit_insn (gen_interleave_second_low (op0, ops[0],
33671 					    ops[1]));
33672 
33673       /* Cast the SECOND_IMODE vector back to a vector in the original
33674 	 mode.  */
33675       emit_insn (gen_rtx_SET (VOIDmode, target,
33676 			      gen_lowpart (mode, op0)));
33677       break;
33678 
33679     default:
33680       gcc_unreachable ();
33681     }
33682 }
33683 
33684 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
33685    all values variable, and none identical.  */
33686 
33687 static void
33688 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33689 				 rtx target, rtx vals)
33690 {
33691   rtx ops[32], op0, op1;
33692   enum machine_mode half_mode = VOIDmode;
33693   int n, i;
33694 
33695   switch (mode)
33696     {
33697     case V2SFmode:
33698     case V2SImode:
33699       if (!mmx_ok && !TARGET_SSE)
33700 	break;
33701       /* FALLTHRU */
33702 
33703     case V8SFmode:
33704     case V8SImode:
33705     case V4DFmode:
33706     case V4DImode:
33707     case V4SFmode:
33708     case V4SImode:
33709     case V2DFmode:
33710     case V2DImode:
33711       n = GET_MODE_NUNITS (mode);
33712       for (i = 0; i < n; i++)
33713 	ops[i] = XVECEXP (vals, 0, i);
33714       ix86_expand_vector_init_concat (mode, target, ops, n);
33715       return;
33716 
33717     case V32QImode:
33718       half_mode = V16QImode;
33719       goto half;
33720 
33721     case V16HImode:
33722       half_mode = V8HImode;
33723       goto half;
33724 
33725 half:
33726       n = GET_MODE_NUNITS (mode);
33727       for (i = 0; i < n; i++)
33728 	ops[i] = XVECEXP (vals, 0, i);
33729       op0 = gen_reg_rtx (half_mode);
33730       op1 = gen_reg_rtx (half_mode);
33731       ix86_expand_vector_init_interleave (half_mode, op0, ops,
33732 					  n >> 2);
33733       ix86_expand_vector_init_interleave (half_mode, op1,
33734 					  &ops [n >> 1], n >> 2);
33735       emit_insn (gen_rtx_SET (VOIDmode, target,
33736 			      gen_rtx_VEC_CONCAT (mode, op0, op1)));
33737       return;
33738 
33739     case V16QImode:
33740       if (!TARGET_SSE4_1)
33741 	break;
33742       /* FALLTHRU */
33743 
33744     case V8HImode:
33745       if (!TARGET_SSE2)
33746 	break;
33747 
33748       /* Don't use ix86_expand_vector_init_interleave if we can't
33749 	 move from GPR to SSE register directly.  */
33750       if (!TARGET_INTER_UNIT_MOVES)
33751 	break;
33752 
33753       n = GET_MODE_NUNITS (mode);
33754       for (i = 0; i < n; i++)
33755 	ops[i] = XVECEXP (vals, 0, i);
33756       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33757       return;
33758 
33759     case V4HImode:
33760     case V8QImode:
33761       break;
33762 
33763     default:
33764       gcc_unreachable ();
33765     }
33766 
33767     {
33768       int i, j, n_elts, n_words, n_elt_per_word;
33769       enum machine_mode inner_mode;
33770       rtx words[4], shift;
33771 
33772       inner_mode = GET_MODE_INNER (mode);
33773       n_elts = GET_MODE_NUNITS (mode);
33774       n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33775       n_elt_per_word = n_elts / n_words;
33776       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33777 
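            /* Fallback: build each word-sized chunk of the vector in an
               integer register by shifting and ORing its elements together
               (highest index first, so the lowest-numbered element of each
               chunk ends up in the low bits), then assemble the words into
               the vector below.  */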
33778       for (i = 0; i < n_words; ++i)
33779 	{
33780 	  rtx word = NULL_RTX;
33781 
33782 	  for (j = 0; j < n_elt_per_word; ++j)
33783 	    {
33784 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33785 	      elt = convert_modes (word_mode, inner_mode, elt, true);
33786 
33787 	      if (j == 0)
33788 		word = elt;
33789 	      else
33790 		{
33791 		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33792 					      word, 1, OPTAB_LIB_WIDEN);
33793 		  word = expand_simple_binop (word_mode, IOR, word, elt,
33794 					      word, 1, OPTAB_LIB_WIDEN);
33795 		}
33796 	    }
33797 
33798 	  words[i] = word;
33799 	}
33800 
33801       if (n_words == 1)
33802 	emit_move_insn (target, gen_lowpart (mode, words[0]));
33803       else if (n_words == 2)
33804 	{
33805 	  rtx tmp = gen_reg_rtx (mode);
33806 	  emit_clobber (tmp);
33807 	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33808 	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33809 	  emit_move_insn (target, tmp);
33810 	}
33811       else if (n_words == 4)
33812 	{
33813 	  rtx tmp = gen_reg_rtx (V4SImode);
33814 	  gcc_assert (word_mode == SImode);
33815 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33816 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33817 	  emit_move_insn (target, gen_lowpart (mode, tmp));
33818 	}
33819       else
33820 	gcc_unreachable ();
33821     }
33822 }
33823 
33824 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
33825    instructions unless MMX_OK is true.  */
33826 
33827 void
33828 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33829 {
33830   enum machine_mode mode = GET_MODE (target);
33831   enum machine_mode inner_mode = GET_MODE_INNER (mode);
33832   int n_elts = GET_MODE_NUNITS (mode);
33833   int n_var = 0, one_var = -1;
33834   bool all_same = true, all_const_zero = true;
33835   int i;
33836   rtx x;
33837 
33838   for (i = 0; i < n_elts; ++i)
33839     {
33840       x = XVECEXP (vals, 0, i);
33841       if (!(CONST_INT_P (x)
33842 	    || GET_CODE (x) == CONST_DOUBLE
33843 	    || GET_CODE (x) == CONST_FIXED))
33844 	n_var++, one_var = i;
33845       else if (x != CONST0_RTX (inner_mode))
33846 	all_const_zero = false;
33847       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33848 	all_same = false;
33849     }
33850 
33851   /* Constants are best loaded from the constant pool.  */
33852   if (n_var == 0)
33853     {
33854       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33855       return;
33856     }
33857 
33858   /* If all values are identical, broadcast the value.  */
33859   if (all_same
33860       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33861 					    XVECEXP (vals, 0, 0)))
33862     return;
33863 
33864   /* Values where only one field is non-constant are best loaded from
33865      the pool and overwritten via move later.  */
33866   if (n_var == 1)
33867     {
33868       if (all_const_zero
33869 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33870 						  XVECEXP (vals, 0, one_var),
33871 						  one_var))
33872 	return;
33873 
33874       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33875 	return;
33876     }
33877 
33878   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33879 }
33880 
33881 void
33882 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33883 {
33884   enum machine_mode mode = GET_MODE (target);
33885   enum machine_mode inner_mode = GET_MODE_INNER (mode);
33886   enum machine_mode half_mode;
33887   bool use_vec_merge = false;
33888   rtx tmp;
33889   static rtx (*gen_extract[6][2]) (rtx, rtx)
33890     = {
33891 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33892 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33893 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33894 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33895 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33896 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33897       };
33898   static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33899     = {
33900 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33901 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33902 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33903 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33904 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33905 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33906       };
33907   int i, j, n;
33908 
33909   switch (mode)
33910     {
33911     case V2SFmode:
33912     case V2SImode:
33913       if (mmx_ok)
33914 	{
33915 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33916 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33917 	  if (elt == 0)
33918 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33919 	  else
33920 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33921 	  emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33922 	  return;
33923 	}
33924       break;
33925 
33926     case V2DImode:
33927       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33928       if (use_vec_merge)
33929 	break;
33930 
33931       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33932       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33933       if (elt == 0)
33934 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33935       else
33936 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33937       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33938       return;
33939 
33940     case V2DFmode:
33941       {
33942 	rtx op0, op1;
33943 
33944 	/* For the two element vectors, we implement a VEC_CONCAT with
33945 	   the extraction of the other element.  */
33946 
33947 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33948 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33949 
33950 	if (elt == 0)
33951 	  op0 = val, op1 = tmp;
33952 	else
33953 	  op0 = tmp, op1 = val;
33954 
33955 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33956 	emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33957       }
33958       return;
33959 
33960     case V4SFmode:
33961       use_vec_merge = TARGET_SSE4_1;
33962       if (use_vec_merge)
33963 	break;
33964 
33965       switch (elt)
33966 	{
33967 	case 0:
33968 	  use_vec_merge = true;
33969 	  break;
33970 
33971 	case 1:
33972 	  /* tmp = target = A B C D */
33973 	  tmp = copy_to_reg (target);
33974 	  /* target = A A B B */
33975 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33976 	  /* target = X A B B */
33977 	  ix86_expand_vector_set (false, target, val, 0);
33978 	  /* target = A X C D  */
33979 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33980 					  const1_rtx, const0_rtx,
33981 					  GEN_INT (2+4), GEN_INT (3+4)));
33982 	  return;
33983 
33984 	case 2:
33985 	  /* tmp = target = A B C D */
33986 	  tmp = copy_to_reg (target);
33987 	  /* tmp = X B C D */
33988 	  ix86_expand_vector_set (false, tmp, val, 0);
33989 	  /* target = A B X D */
33990 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33991 					  const0_rtx, const1_rtx,
33992 					  GEN_INT (0+4), GEN_INT (3+4)));
33993 	  return;
33994 
33995 	case 3:
33996 	  /* tmp = target = A B C D */
33997 	  tmp = copy_to_reg (target);
33998 	  /* tmp = X B C D */
33999 	  ix86_expand_vector_set (false, tmp, val, 0);
34000 	  /* target = A B C X */
34001 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34002 					  const0_rtx, const1_rtx,
34003 					  GEN_INT (2+4), GEN_INT (0+4)));
34004 	  return;
34005 
34006 	default:
34007 	  gcc_unreachable ();
34008 	}
34009       break;
34010 
34011     case V4SImode:
34012       use_vec_merge = TARGET_SSE4_1;
34013       if (use_vec_merge)
34014 	break;
34015 
34016       /* Element 0 handled by vec_merge below.  */
34017       if (elt == 0)
34018 	{
34019 	  use_vec_merge = true;
34020 	  break;
34021 	}
34022 
34023       if (TARGET_SSE2)
34024 	{
34025 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
34026 	     store into element 0, then shuffle them back.  */
34027 
34028 	  rtx order[4];
34029 
34030 	  order[0] = GEN_INT (elt);
34031 	  order[1] = const1_rtx;
34032 	  order[2] = const2_rtx;
34033 	  order[3] = GEN_INT (3);
34034 	  order[elt] = const0_rtx;
34035 
34036 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34037 					order[1], order[2], order[3]));
34038 
34039 	  ix86_expand_vector_set (false, target, val, 0);
34040 
34041 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34042 					order[1], order[2], order[3]));
34043 	}
34044       else
34045 	{
34046 	  /* For SSE1, we have to reuse the V4SF code.  */
34047 	  ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34048 				  gen_lowpart (SFmode, val), elt);
34049 	}
34050       return;
34051 
34052     case V8HImode:
34053       use_vec_merge = TARGET_SSE2;
34054       break;
34055     case V4HImode:
34056       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34057       break;
34058 
34059     case V16QImode:
34060       use_vec_merge = TARGET_SSE4_1;
34061       break;
34062 
34063     case V8QImode:
34064       break;
34065 
34066     case V32QImode:
34067       half_mode = V16QImode;
34068       j = 0;
34069       n = 16;
34070       goto half;
34071 
34072     case V16HImode:
34073       half_mode = V8HImode;
34074       j = 1;
34075       n = 8;
34076       goto half;
34077 
34078     case V8SImode:
34079       half_mode = V4SImode;
34080       j = 2;
34081       n = 4;
34082       goto half;
34083 
34084     case V4DImode:
34085       half_mode = V2DImode;
34086       j = 3;
34087       n = 2;
34088       goto half;
34089 
34090     case V8SFmode:
34091       half_mode = V4SFmode;
34092       j = 4;
34093       n = 4;
34094       goto half;
34095 
34096     case V4DFmode:
34097       half_mode = V2DFmode;
34098       j = 5;
34099       n = 2;
34100       goto half;
34101 
34102 half:
34103       /* Compute offset.  */
34104       i = elt / n;
34105       elt %= n;
34106 
34107       gcc_assert (i <= 1);
34108 
34109       /* Extract the half.  */
34110       tmp = gen_reg_rtx (half_mode);
34111       emit_insn (gen_extract[j][i] (tmp, target));
34112 
34113       /* Put val in tmp at elt.  */
34114       ix86_expand_vector_set (false, tmp, val, elt);
34115 
34116       /* Put it back.  */
34117       emit_insn (gen_insert[j][i] (target, target, tmp));
34118       return;
34119 
34120     default:
34121       break;
34122     }
34123 
34124   if (use_vec_merge)
34125     {
34126       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34127       tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34128       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34129     }
34130   else
34131     {
34132       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34133 
34134       emit_move_insn (mem, target);
34135 
34136       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34137       emit_move_insn (tmp, val);
34138 
34139       emit_move_insn (target, mem);
34140     }
34141 }
34142 
34143 void
34144 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34145 {
34146   enum machine_mode mode = GET_MODE (vec);
34147   enum machine_mode inner_mode = GET_MODE_INNER (mode);
34148   bool use_vec_extr = false;
34149   rtx tmp;
34150 
34151   switch (mode)
34152     {
34153     case V2SImode:
34154     case V2SFmode:
34155       if (!mmx_ok)
34156 	break;
34157       /* FALLTHRU */
34158 
34159     case V2DFmode:
34160     case V2DImode:
34161       use_vec_extr = true;
34162       break;
34163 
34164     case V4SFmode:
34165       use_vec_extr = TARGET_SSE4_1;
34166       if (use_vec_extr)
34167 	break;
34168 
34169       switch (elt)
34170 	{
34171 	case 0:
34172 	  tmp = vec;
34173 	  break;
34174 
34175 	case 1:
34176 	case 3:
34177 	  tmp = gen_reg_rtx (mode);
34178 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34179 				       GEN_INT (elt), GEN_INT (elt),
34180 				       GEN_INT (elt+4), GEN_INT (elt+4)));
34181 	  break;
34182 
34183 	case 2:
34184 	  tmp = gen_reg_rtx (mode);
34185 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34186 	  break;
34187 
34188 	default:
34189 	  gcc_unreachable ();
34190 	}
34191       vec = tmp;
34192       use_vec_extr = true;
34193       elt = 0;
34194       break;
34195 
34196     case V4SImode:
34197       use_vec_extr = TARGET_SSE4_1;
34198       if (use_vec_extr)
34199 	break;
34200 
34201       if (TARGET_SSE2)
34202 	{
34203 	  switch (elt)
34204 	    {
34205 	    case 0:
34206 	      tmp = vec;
34207 	      break;
34208 
34209 	    case 1:
34210 	    case 3:
34211 	      tmp = gen_reg_rtx (mode);
34212 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34213 					    GEN_INT (elt), GEN_INT (elt),
34214 					    GEN_INT (elt), GEN_INT (elt)));
34215 	      break;
34216 
34217 	    case 2:
34218 	      tmp = gen_reg_rtx (mode);
34219 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34220 	      break;
34221 
34222 	    default:
34223 	      gcc_unreachable ();
34224 	    }
34225 	  vec = tmp;
34226 	  use_vec_extr = true;
34227 	  elt = 0;
34228 	}
34229       else
34230 	{
34231 	  /* For SSE1, we have to reuse the V4SF code.  */
34232 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34233 				      gen_lowpart (V4SFmode, vec), elt);
34234 	  return;
34235 	}
34236       break;
34237 
34238     case V8HImode:
34239       use_vec_extr = TARGET_SSE2;
34240       break;
34241     case V4HImode:
34242       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34243       break;
34244 
34245     case V16QImode:
34246       use_vec_extr = TARGET_SSE4_1;
34247       break;
34248 
34249     case V8SFmode:
34250       if (TARGET_AVX)
34251 	{
34252 	  tmp = gen_reg_rtx (V4SFmode);
34253 	  if (elt < 4)
34254 	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34255 	  else
34256 	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34257 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
34258 	  return;
34259 	}
34260       break;
34261 
34262     case V4DFmode:
34263       if (TARGET_AVX)
34264 	{
34265 	  tmp = gen_reg_rtx (V2DFmode);
34266 	  if (elt < 2)
34267 	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34268 	  else
34269 	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34270 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
34271 	  return;
34272 	}
34273       break;
34274 
34275     case V32QImode:
34276       if (TARGET_AVX)
34277 	{
34278 	  tmp = gen_reg_rtx (V16QImode);
34279 	  if (elt < 16)
34280 	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34281 	  else
34282 	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34283 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
34284 	  return;
34285 	}
34286       break;
34287 
34288     case V16HImode:
34289       if (TARGET_AVX)
34290 	{
34291 	  tmp = gen_reg_rtx (V8HImode);
34292 	  if (elt < 8)
34293 	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34294 	  else
34295 	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34296 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
34297 	  return;
34298 	}
34299       break;
34300 
34301     case V8SImode:
34302       if (TARGET_AVX)
34303 	{
34304 	  tmp = gen_reg_rtx (V4SImode);
34305 	  if (elt < 4)
34306 	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34307 	  else
34308 	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34309 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
34310 	  return;
34311 	}
34312       break;
34313 
34314     case V4DImode:
34315       if (TARGET_AVX)
34316 	{
34317 	  tmp = gen_reg_rtx (V2DImode);
34318 	  if (elt < 2)
34319 	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34320 	  else
34321 	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34322 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
34323 	  return;
34324 	}
34325       break;
34326 
34327     case V8QImode:
34328       /* ??? Could extract the appropriate HImode element and shift.  */
34329     default:
34330       break;
34331     }
34332 
34333   if (use_vec_extr)
34334     {
34335       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34336       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34337 
34338       /* Let the rtl optimizers know about the zero extension performed.  */
34339       if (inner_mode == QImode || inner_mode == HImode)
34340 	{
34341 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34342 	  target = gen_lowpart (SImode, target);
34343 	}
34344 
34345       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34346     }
34347   else
34348     {
34349       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34350 
34351       emit_move_insn (mem, vec);
34352 
34353       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34354       emit_move_insn (target, tmp);
34355     }
34356 }
34357 
34358 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34359    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34360    The upper bits of DEST are undefined, though they shouldn't cause
34361    exceptions (some bits from src or all zeros are ok).  */
34362 
34363 static void
34364 emit_reduc_half (rtx dest, rtx src, int i)
34365 {
34366   rtx tem;
34367   switch (GET_MODE (src))
34368     {
34369     case V4SFmode:
34370       if (i == 128)
34371 	tem = gen_sse_movhlps (dest, src, src);
34372       else
34373 	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34374 				   GEN_INT (1 + 4), GEN_INT (1 + 4));
34375       break;
34376     case V2DFmode:
34377       tem = gen_vec_interleave_highv2df (dest, src, src);
34378       break;
34379     case V16QImode:
34380     case V8HImode:
34381     case V4SImode:
34382     case V2DImode:
34383       tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34384 				gen_lowpart (V1TImode, src),
34385 				GEN_INT (i / 2));
34386       break;
34387     case V8SFmode:
34388       if (i == 256)
34389 	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34390       else
34391 	tem = gen_avx_shufps256 (dest, src, src,
34392 				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34393       break;
34394     case V4DFmode:
34395       if (i == 256)
34396 	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34397       else
34398 	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34399       break;
34400     case V32QImode:
34401     case V16HImode:
34402     case V8SImode:
34403     case V4DImode:
34404       if (i == 256)
34405 	tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34406 				 gen_lowpart (V4DImode, src),
34407 				 gen_lowpart (V4DImode, src),
34408 				 const1_rtx);
34409       else
34410 	tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34411 				  gen_lowpart (V2TImode, src),
34412 				  GEN_INT (i / 2));
34413       break;
34414     default:
34415       gcc_unreachable ();
34416     }
34417   emit_insn (tem);
34418 }
34419 
34420 /* Expand a vector reduction.  FN is the binary pattern to reduce;
34421    DEST is the destination; IN is the input vector.  */
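      /* The loop below performs a log2(nelts)-step reduction: each iteration
         uses emit_reduc_half to move the upper half of the remaining elements
         into the lower half of a fresh register and combines the two halves
         with FN, so the reduced value ends up in element 0 of DEST (the other
         elements are left undefined).  */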
34422 
34423 void
34424 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34425 {
34426   rtx half, dst, vec = in;
34427   enum machine_mode mode = GET_MODE (in);
34428   int i;
34429 
34430   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
34431   if (TARGET_SSE4_1
34432       && mode == V8HImode
34433       && fn == gen_uminv8hi3)
34434     {
34435       emit_insn (gen_sse4_1_phminposuw (dest, in));
34436       return;
34437     }
34438 
34439   for (i = GET_MODE_BITSIZE (mode);
34440        i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34441        i >>= 1)
34442     {
34443       half = gen_reg_rtx (mode);
34444       emit_reduc_half (half, vec, i);
34445       if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34446 	dst = dest;
34447       else
34448 	dst = gen_reg_rtx (mode);
34449       emit_insn (fn (dst, half, vec));
34450       vec = dst;
34451     }
34452 }
34453 
34454 /* Target hook for scalar_mode_supported_p.  */
34455 static bool
34456 ix86_scalar_mode_supported_p (enum machine_mode mode)
34457 {
34458   if (DECIMAL_FLOAT_MODE_P (mode))
34459     return default_decimal_float_supported_p ();
34460   else if (mode == TFmode)
34461     return true;
34462   else
34463     return default_scalar_mode_supported_p (mode);
34464 }
34465 
34466 /* Implements target hook vector_mode_supported_p.  */
34467 static bool
34468 ix86_vector_mode_supported_p (enum machine_mode mode)
34469 {
34470   if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34471     return true;
34472   if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34473     return true;
34474   if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34475     return true;
34476   if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34477     return true;
34478   if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34479     return true;
34480   return false;
34481 }
34482 
34483 /* Target hook for c_mode_for_suffix.  */
34484 static enum machine_mode
34485 ix86_c_mode_for_suffix (char suffix)
34486 {
34487   if (suffix == 'q')
34488     return TFmode;
34489   if (suffix == 'w')
34490     return XFmode;
34491 
34492   return VOIDmode;
34493 }
34494 
34495 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34496 
34497    We do this in the new i386 backend to maintain source compatibility
34498    with the old cc0-based compiler.  */
34499 
34500 static tree
34501 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34502 		      tree inputs ATTRIBUTE_UNUSED,
34503 		      tree clobbers)
34504 {
34505   clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34506 			clobbers);
34507   clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34508 			clobbers);
34509   return clobbers;
34510 }
34511 
34512 /* Implements target vector targetm.asm.encode_section_info.  */
34513 
34514 static void ATTRIBUTE_UNUSED
34515 ix86_encode_section_info (tree decl, rtx rtl, int first)
34516 {
34517   default_encode_section_info (decl, rtl, first);
34518 
34519   if (TREE_CODE (decl) == VAR_DECL
34520       && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34521       && ix86_in_large_data_p (decl))
34522     SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34523 }
34524 
34525 /* Worker function for REVERSE_CONDITION.  */
34526 
34527 enum rtx_code
34528 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34529 {
34530   return (mode != CCFPmode && mode != CCFPUmode
34531 	  ? reverse_condition (code)
34532 	  : reverse_condition_maybe_unordered (code));
34533 }
34534 
34535 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34536    to OPERANDS[0].  */
34537 
34538 const char *
34539 output_387_reg_move (rtx insn, rtx *operands)
34540 {
34541   if (REG_P (operands[0]))
34542     {
34543       if (REG_P (operands[1])
34544 	  && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34545 	{
34546 	  if (REGNO (operands[0]) == FIRST_STACK_REG)
34547 	    return output_387_ffreep (operands, 0);
34548 	  return "fstp\t%y0";
34549 	}
34550       if (STACK_TOP_P (operands[0]))
34551 	return "fld%Z1\t%y1";
34552       return "fst\t%y0";
34553     }
34554   else if (MEM_P (operands[0]))
34555     {
34556       gcc_assert (REG_P (operands[1]));
34557       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34558 	return "fstp%Z0\t%y0";
34559       else
34560 	{
34561 	  /* There is no non-popping store to memory for XFmode.
34562 	     So if we need one, follow the store with a load.  */
34563 	  if (GET_MODE (operands[0]) == XFmode)
34564 	    return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34565 	  else
34566 	    return "fst%Z0\t%y0";
34567 	}
34568     }
34569   else
34570     gcc_unreachable();
34571 }
34572 
34573 /* Output code to perform a conditional jump to LABEL if the C2 flag in
34574    the FP status register is set.  */
34575 
34576 void
34577 ix86_emit_fp_unordered_jump (rtx label)
34578 {
34579   rtx reg = gen_reg_rtx (HImode);
34580   rtx temp;
34581 
34582   emit_insn (gen_x86_fnstsw_1 (reg));
34583 
34584   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34585     {
34586       emit_insn (gen_x86_sahf_1 (reg));
34587 
34588       temp = gen_rtx_REG (CCmode, FLAGS_REG);
34589       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34590     }
34591   else
34592     {
34593       emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34594 
34595       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34596       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34597     }
34598 
34599   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34600 			      gen_rtx_LABEL_REF (VOIDmode, label),
34601 			      pc_rtx);
34602   temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34603 
34604   emit_jump_insn (temp);
34605   predict_jump (REG_BR_PROB_BASE * 10 / 100);
34606 }
34607 
34608 /* Output code to perform a log1p XFmode calculation.  */
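      /* The i387 fyl2xp1 instruction computes y * log2(x + 1), but it is only
         specified for |x| < 1 - sqrt(2)/2 (about 0.2929).  The code below
         compares |op1| against that threshold: small arguments use fyl2xp1
         directly, larger ones fall back to fyl2x on 1 + op1.  In both cases
         y is loaded with ln(2), so the result is a natural logarithm.  */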
34609 
34610 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34611 {
34612   rtx label1 = gen_label_rtx ();
34613   rtx label2 = gen_label_rtx ();
34614 
34615   rtx tmp = gen_reg_rtx (XFmode);
34616   rtx tmp2 = gen_reg_rtx (XFmode);
34617   rtx test;
34618 
34619   emit_insn (gen_absxf2 (tmp, op1));
34620   test = gen_rtx_GE (VOIDmode, tmp,
34621     CONST_DOUBLE_FROM_REAL_VALUE (
34622        REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34623        XFmode));
34624   emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34625 
34626   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34627   emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34628   emit_jump (label2);
34629 
34630   emit_label (label1);
34631   emit_move_insn (tmp, CONST1_RTX (XFmode));
34632   emit_insn (gen_addxf3 (tmp, op1, tmp));
34633   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34634   emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34635 
34636   emit_label (label2);
34637 }
34638 
34639 /* Emit code for round calculation.  */
34640 void ix86_emit_i387_round (rtx op0, rtx op1)
34641 {
34642   enum machine_mode inmode = GET_MODE (op1);
34643   enum machine_mode outmode = GET_MODE (op0);
34644   rtx e1, e2, res, tmp, tmp1, half;
34645   rtx scratch = gen_reg_rtx (HImode);
34646   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34647   rtx jump_label = gen_label_rtx ();
34648   rtx insn;
34649   rtx (*gen_abs) (rtx, rtx);
34650   rtx (*gen_neg) (rtx, rtx);
34651 
34652   switch (inmode)
34653     {
34654     case SFmode:
34655       gen_abs = gen_abssf2;
34656       break;
34657     case DFmode:
34658       gen_abs = gen_absdf2;
34659       break;
34660     case XFmode:
34661       gen_abs = gen_absxf2;
34662       break;
34663     default:
34664       gcc_unreachable ();
34665     }
34666 
34667   switch (outmode)
34668     {
34669     case SFmode:
34670       gen_neg = gen_negsf2;
34671       break;
34672     case DFmode:
34673       gen_neg = gen_negdf2;
34674       break;
34675     case XFmode:
34676       gen_neg = gen_negxf2;
34677       break;
34678     case HImode:
34679       gen_neg = gen_neghi2;
34680       break;
34681     case SImode:
34682       gen_neg = gen_negsi2;
34683       break;
34684     case DImode:
34685       gen_neg = gen_negdi2;
34686       break;
34687     default:
34688       gcc_unreachable ();
34689     }
34690 
34691   e1 = gen_reg_rtx (inmode);
34692   e2 = gen_reg_rtx (inmode);
34693   res = gen_reg_rtx (outmode);
34694 
34695   half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34696 
34697   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
34698 
34699   /* scratch = fxam(op1) */
34700   emit_insn (gen_rtx_SET (VOIDmode, scratch,
34701 			  gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34702 					  UNSPEC_FXAM)));
34703   /* e1 = fabs(op1) */
34704   emit_insn (gen_abs (e1, op1));
34705 
34706   /* e2 = e1 + 0.5 */
34707   half = force_reg (inmode, half);
34708   emit_insn (gen_rtx_SET (VOIDmode, e2,
34709 			  gen_rtx_PLUS (inmode, e1, half)));
34710 
34711   /* res = floor(e2) */
34712   if (inmode != XFmode)
34713     {
34714       tmp1 = gen_reg_rtx (XFmode);
34715 
34716       emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34717 			      gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34718     }
34719   else
34720     tmp1 = e2;
34721 
34722   switch (outmode)
34723     {
34724     case SFmode:
34725     case DFmode:
34726       {
34727 	rtx tmp0 = gen_reg_rtx (XFmode);
34728 
34729 	emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34730 
34731 	emit_insn (gen_rtx_SET (VOIDmode, res,
34732 				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34733 						UNSPEC_TRUNC_NOOP)));
34734       }
34735       break;
34736     case XFmode:
34737       emit_insn (gen_frndintxf2_floor (res, tmp1));
34738       break;
34739     case HImode:
34740       emit_insn (gen_lfloorxfhi2 (res, tmp1));
34741       break;
34742     case SImode:
34743       emit_insn (gen_lfloorxfsi2 (res, tmp1));
34744       break;
34745     case DImode:
34746       emit_insn (gen_lfloorxfdi2 (res, tmp1));
34747       break;
34748     default:
34749       gcc_unreachable ();
34750     }
34751 
34752   /* flags = signbit(a) */
34753   emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34754 
34755   /* if (flags) then res = -res */
34756   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34757 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34758 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
34759 			      pc_rtx);
34760   insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34761   predict_jump (REG_BR_PROB_BASE * 50 / 100);
34762   JUMP_LABEL (insn) = jump_label;
34763 
34764   emit_insn (gen_neg (res, res));
34765 
34766   emit_label (jump_label);
34767   LABEL_NUSES (jump_label) = 1;
34768 
34769   emit_move_insn (op0, res);
34770 }
34771 
34772 /* Output code to perform a Newton-Raphson approximation of a single precision
34773    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
34774 
34775 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34776 {
34777   rtx x0, x1, e0, e1;
34778 
34779   x0 = gen_reg_rtx (mode);
34780   e0 = gen_reg_rtx (mode);
34781   e1 = gen_reg_rtx (mode);
34782   x1 = gen_reg_rtx (mode);
34783 
34784   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
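        /* This is one Newton-Raphson refinement of the rcp(b) estimate
           x0 ~ 1/b:  x1 = x0 * (2 - b*x0) = (x0 + x0) - b*x0*x0, which
           roughly doubles the number of correct bits of the estimate.  */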
34785 
34786   b = force_reg (mode, b);
34787 
34788   /* x0 = rcp(b) estimate */
34789   emit_insn (gen_rtx_SET (VOIDmode, x0,
34790 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34791 					  UNSPEC_RCP)));
34792   /* e0 = x0 * b */
34793   emit_insn (gen_rtx_SET (VOIDmode, e0,
34794 			  gen_rtx_MULT (mode, x0, b)));
34795 
34796   /* e0 = x0 * e0 */
34797   emit_insn (gen_rtx_SET (VOIDmode, e0,
34798 			  gen_rtx_MULT (mode, x0, e0)));
34799 
34800   /* e1 = x0 + x0 */
34801   emit_insn (gen_rtx_SET (VOIDmode, e1,
34802 			  gen_rtx_PLUS (mode, x0, x0)));
34803 
34804   /* x1 = e1 - e0 */
34805   emit_insn (gen_rtx_SET (VOIDmode, x1,
34806 			  gen_rtx_MINUS (mode, e1, e0)));
34807 
34808   /* res = a * x1 */
34809   emit_insn (gen_rtx_SET (VOIDmode, res,
34810 			  gen_rtx_MULT (mode, a, x1)));
34811 }
34812 
34813 /* Output code to perform a Newton-Raphson approximation of a
34814    single precision floating point [reciprocal] square root.  */
34815 
34816 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34817 			 bool recip)
34818 {
34819   rtx x0, e0, e1, e2, e3, mthree, mhalf;
34820   REAL_VALUE_TYPE r;
34821 
34822   x0 = gen_reg_rtx (mode);
34823   e0 = gen_reg_rtx (mode);
34824   e1 = gen_reg_rtx (mode);
34825   e2 = gen_reg_rtx (mode);
34826   e3 = gen_reg_rtx (mode);
34827 
34828   real_from_integer (&r, VOIDmode, -3, -1, 0);
34829   mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34830 
34831   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34832   mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34833 
34834   if (VECTOR_MODE_P (mode))
34835     {
34836       mthree = ix86_build_const_vector (mode, true, mthree);
34837       mhalf = ix86_build_const_vector (mode, true, mhalf);
34838     }
34839 
34840   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34841      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
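        /* Both formulas apply one Newton-Raphson step to the rsqrtss estimate
           x0 ~ 1/sqrt(a):  x1 = x0 * (3 - a*x0*x0) / 2
                               = -0.5 * x0 * (a*x0*x0 - 3),
           and for sqrt the extra factor of a turns the refined reciprocal
           square root into sqrt(a) = a / sqrt(a).  */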
34842 
34843   a = force_reg (mode, a);
34844 
34845   /* x0 = rsqrt(a) estimate */
34846   emit_insn (gen_rtx_SET (VOIDmode, x0,
34847 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34848 					  UNSPEC_RSQRT)));
34849 
34850   /* If a == 0.0, mask out the infinite rsqrt estimate to avoid NaN for sqrt(0.0).  */
34851   if (!recip)
34852     {
34853       rtx zero, mask;
34854 
34855       zero = gen_reg_rtx (mode);
34856       mask = gen_reg_rtx (mode);
34857 
34858       zero = force_reg (mode, CONST0_RTX(mode));
34859       emit_insn (gen_rtx_SET (VOIDmode, mask,
34860 			      gen_rtx_NE (mode, zero, a)));
34861 
34862       emit_insn (gen_rtx_SET (VOIDmode, x0,
34863 			      gen_rtx_AND (mode, x0, mask)));
34864     }
34865 
34866   /* e0 = x0 * a */
34867   emit_insn (gen_rtx_SET (VOIDmode, e0,
34868 			  gen_rtx_MULT (mode, x0, a)));
34869   /* e1 = e0 * x0 */
34870   emit_insn (gen_rtx_SET (VOIDmode, e1,
34871 			  gen_rtx_MULT (mode, e0, x0)));
34872 
34873   /* e2 = e1 - 3. */
34874   mthree = force_reg (mode, mthree);
34875   emit_insn (gen_rtx_SET (VOIDmode, e2,
34876 			  gen_rtx_PLUS (mode, e1, mthree)));
34877 
34878   mhalf = force_reg (mode, mhalf);
34879   if (recip)
34880     /* e3 = -.5 * x0 */
34881     emit_insn (gen_rtx_SET (VOIDmode, e3,
34882 			    gen_rtx_MULT (mode, x0, mhalf)));
34883   else
34884     /* e3 = -.5 * e0 */
34885     emit_insn (gen_rtx_SET (VOIDmode, e3,
34886 			    gen_rtx_MULT (mode, e0, mhalf)));
34887   /* ret = e2 * e3 */
34888   emit_insn (gen_rtx_SET (VOIDmode, res,
34889 			  gen_rtx_MULT (mode, e2, e3)));
34890 }
34891 
34892 #ifdef TARGET_SOLARIS
34893 /* Solaris implementation of TARGET_ASM_NAMED_SECTION.  */
34894 
34895 static void
34896 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34897 				tree decl)
34898 {
34899   /* With Binutils 2.15, the "@unwind" marker must be specified on
34900      every occurrence of the ".eh_frame" section, not just the first
34901      one.  */
34902   if (TARGET_64BIT
34903       && strcmp (name, ".eh_frame") == 0)
34904     {
34905       fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34906 	       flags & SECTION_WRITE ? "aw" : "a");
34907       return;
34908     }
34909 
34910 #ifndef USE_GAS
34911   if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34912     {
34913       solaris_elf_asm_comdat_section (name, flags, decl);
34914       return;
34915     }
34916 #endif
34917 
34918   default_elf_asm_named_section (name, flags, decl);
34919 }
34920 #endif /* TARGET_SOLARIS */
34921 
34922 /* Return the mangling of TYPE if it is an extended fundamental type.  */
34923 
34924 static const char *
34925 ix86_mangle_type (const_tree type)
34926 {
34927   type = TYPE_MAIN_VARIANT (type);
34928 
34929   if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34930       && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34931     return NULL;
34932 
34933   switch (TYPE_MODE (type))
34934     {
34935     case TFmode:
34936       /* __float128 is "g".  */
34937       return "g";
34938     case XFmode:
34939       /* "long double" or __float80 is "e".  */
34940       return "e";
34941     default:
34942       return NULL;
34943     }
34944 }
34945 
34946 /* For 32-bit code we can save PIC register setup by using the hidden
34947    __stack_chk_fail_local function instead of calling __stack_chk_fail
34948    directly.  64-bit code doesn't need to set up any PIC register, so it
34949    is better to call __stack_chk_fail directly.  */
34950 
34951 static tree ATTRIBUTE_UNUSED
34952 ix86_stack_protect_fail (void)
34953 {
34954 #if 0  /*  Still broken -- affects FreeBSD too  */
34955   return TARGET_64BIT
34956 	 ? default_external_stack_protect_fail ()
34957 	 : default_hidden_stack_protect_fail ();
34958 #else
34959   return default_external_stack_protect_fail ();
34960 #endif
34961 }
34962 
34963 /* Select a format to encode pointers in exception handling data.  CODE
34964    is 0 for data, 1 for code labels, 2 for function pointers.  GLOBAL is
34965    true if the symbol may be affected by dynamic relocations.
34966 
34967    ??? All x86 object file formats are capable of representing this.
34968    After all, the relocation needed is the same as for the call insn.
34969    Whether or not a particular assembler allows us to enter such, I
34970    guess we'll have to see.  */
34971 int
34972 asm_preferred_eh_data_format (int code, int global)
34973 {
34974   if (flag_pic)
34975     {
34976       int type = DW_EH_PE_sdata8;
34977       if (!TARGET_64BIT
34978 	  || ix86_cmodel == CM_SMALL_PIC
34979 	  || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34980 	type = DW_EH_PE_sdata4;
34981       return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34982     }
34983   if (ix86_cmodel == CM_SMALL
34984       || (ix86_cmodel == CM_MEDIUM && code))
34985     return DW_EH_PE_udata4;
34986   return DW_EH_PE_absptr;
34987 }
34988 
34989 /* Expand a copysign operation:  copy the sign of SIGN onto the positive
34990    value ABS_VALUE, storing the result in RESULT.  If MASK is non-null, it
34991    is a mask used to mask out the sign bit.  */
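/* In other words:  result = abs_value | (sign & signbit-mask).  When a MASK
   is supplied (e.g. the inverted mask produced by ix86_expand_sse_fabs,
   which has every bit except the sign bit set), it is complemented below,
   so the AND always isolates just the sign bit of SIGN before it is ORed
   into ABS_VALUE.  */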
34992 static void
34993 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34994 {
34995   enum machine_mode mode = GET_MODE (sign);
34996   rtx sgn = gen_reg_rtx (mode);
34997   if (mask == NULL_RTX)
34998     {
34999       enum machine_mode vmode;
35000 
35001       if (mode == SFmode)
35002 	vmode = V4SFmode;
35003       else if (mode == DFmode)
35004 	vmode = V2DFmode;
35005       else
35006 	vmode = mode;
35007 
35008       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35009       if (!VECTOR_MODE_P (mode))
35010 	{
35011 	  /* We need to generate a scalar mode mask in this case.  */
35012 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35013 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35014 	  mask = gen_reg_rtx (mode);
35015 	  emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35016 	}
35017     }
35018   else
35019     mask = gen_rtx_NOT (mode, mask);
35020   emit_insn (gen_rtx_SET (VOIDmode, sgn,
35021 			  gen_rtx_AND (mode, mask, sign)));
35022   emit_insn (gen_rtx_SET (VOIDmode, result,
35023 			  gen_rtx_IOR (mode, abs_value, sgn)));
35024 }
35025 
35026 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
35027    mask for masking out the sign-bit is stored in *SMASK, if that is
35028    non-null.  */
35029 static rtx
35030 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35031 {
35032   enum machine_mode vmode, mode = GET_MODE (op0);
35033   rtx xa, mask;
35034 
35035   xa = gen_reg_rtx (mode);
35036   if (mode == SFmode)
35037     vmode = V4SFmode;
35038   else if (mode == DFmode)
35039     vmode = V2DFmode;
35040   else
35041     vmode = mode;
35042   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35043   if (!VECTOR_MODE_P (mode))
35044     {
35045       /* We need to generate a scalar mode mask in this case.  */
35046       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35047       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35048       mask = gen_reg_rtx (mode);
35049       emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35050     }
35051   emit_insn (gen_rtx_SET (VOIDmode, xa,
35052 			  gen_rtx_AND (mode, op0, mask)));
35053 
35054   if (smask)
35055     *smask = mask;
35056 
35057   return xa;
35058 }
35059 
35060 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35061    swapping the operands if SWAP_OPERANDS is true.  The expanded
35062    code is a forward jump to a newly created label in case the
35063    comparison is true.  The generated label rtx is returned.  */
35064 static rtx
35065 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35066                                   bool swap_operands)
35067 {
35068   rtx label, tmp;
35069 
35070   if (swap_operands)
35071     {
35072       tmp = op0;
35073       op0 = op1;
35074       op1 = tmp;
35075     }
35076 
35077   label = gen_label_rtx ();
35078   tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35079   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35080 			  gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35081   tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35082   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35083 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35084   tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35085   JUMP_LABEL (tmp) = label;
35086 
35087   return label;
35088 }
35089 
35090 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35091    using comparison code CODE.  Operands are swapped for the comparison if
35092    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
35093 static rtx
35094 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35095 			      bool swap_operands)
35096 {
35097   rtx (*insn)(rtx, rtx, rtx, rtx);
35098   enum machine_mode mode = GET_MODE (op0);
35099   rtx mask = gen_reg_rtx (mode);
35100 
35101   if (swap_operands)
35102     {
35103       rtx tmp = op0;
35104       op0 = op1;
35105       op1 = tmp;
35106     }
35107 
35108   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35109 
35110   emit_insn (insn (mask, op0, op1,
35111 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
35112   return mask;
35113 }
35114 
35115 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35116    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
35117 static rtx
35118 ix86_gen_TWO52 (enum machine_mode mode)
35119 {
35120   REAL_VALUE_TYPE TWO52r;
35121   rtx TWO52;
35122 
35123   real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35124   TWO52 = const_double_from_real_value (TWO52r, mode);
35125   TWO52 = force_reg (mode, TWO52);
35126 
35127   return TWO52;
35128 }
35129 
35130 /* Expand SSE sequence for computing lround from OP1 storing
35131    into OP0.  */
35132 void
35133 ix86_expand_lround (rtx op0, rtx op1)
35134 {
35135   /* C code for the stuff we're doing below:
35136        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35137        return (long)tmp;
35138    */
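  /* nextafter (0.5, 0.0) is used instead of 0.5 so that the addition cannot
     round the sum up across the halfway point:  e.g. for op1 just below 0.5
     the sum op1 + 0.5 would round to exactly 1.0 and truncate to 1, whereas
     op1 + nextafter (0.5, 0.0) stays below 1.0 and truncates to 0.  */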
35139   enum machine_mode mode = GET_MODE (op1);
35140   const struct real_format *fmt;
35141   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35142   rtx adj;
35143 
35144   /* load nextafter (0.5, 0.0) */
35145   fmt = REAL_MODE_FORMAT (mode);
35146   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35147   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35148 
35149   /* adj = copysign (0.5, op1) */
35150   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35151   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35152 
35153   /* adj = op1 + adj */
35154   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35155 
35156   /* op0 = (imode)adj */
35157   expand_fix (op0, adj, 0);
35158 }
35159 
35160 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
35161    storing into OPERAND0.  */
35162 void
35163 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35164 {
35165   /* C code for the stuff we're doing below (for do_floor):
35166 	xi = (long)op1;
35167         xi -= (double)xi > op1 ? 1 : 0;
35168         return xi;
35169    */
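  /* The UNLE compare-and-jump below branches around the adjustment, so for
     floor the decrement only executes when (double)xi > op1 (ordered
     greater).  For ceil the operands are swapped and the adjustment is +1,
     executed only when (double)xi < op1.  */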
35170   enum machine_mode fmode = GET_MODE (op1);
35171   enum machine_mode imode = GET_MODE (op0);
35172   rtx ireg, freg, label, tmp;
35173 
35174   /* reg = (long)op1 */
35175   ireg = gen_reg_rtx (imode);
35176   expand_fix (ireg, op1, 0);
35177 
35178   /* freg = (double)reg */
35179   freg = gen_reg_rtx (fmode);
35180   expand_float (freg, ireg, 0);
35181 
35182   /* ireg = (freg > op1) ? ireg - 1 : ireg */
35183   label = ix86_expand_sse_compare_and_jump (UNLE,
35184 					    freg, op1, !do_floor);
35185   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35186 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35187   emit_move_insn (ireg, tmp);
35188 
35189   emit_label (label);
35190   LABEL_NUSES (label) = 1;
35191 
35192   emit_move_insn (op0, ireg);
35193 }
35194 
35195 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35196    result in OPERAND0.  */
35197 void
35198 ix86_expand_rint (rtx operand0, rtx operand1)
35199 {
35200   /* C code for the stuff we're doing below:
35201 	xa = fabs (operand1);
35202         if (!isless (xa, 2**52))
35203 	  return operand1;
35204         xa = xa + 2**52 - 2**52;
35205         return copysign (xa, operand1);
35206    */
35207   enum machine_mode mode = GET_MODE (operand0);
35208   rtx res, xa, label, TWO52, mask;
35209 
35210   res = gen_reg_rtx (mode);
35211   emit_move_insn (res, operand1);
35212 
35213   /* xa = abs (operand1) */
35214   xa = ix86_expand_sse_fabs (res, &mask);
35215 
35216   /* if (!isless (xa, TWO52)) goto label; */
35217   TWO52 = ix86_gen_TWO52 (mode);
35218   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35219 
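  /* For 0 <= xa < 2**52 (DFmode) the sum xa + TWO52 lies in [2**52, 2**53),
     where adjacent representable values are exactly 1.0 apart, so the
     addition rounds xa to an integer in the current rounding mode and the
     subtraction recovers that integer exactly.  TWO52 is 2**23 for SFmode.
     Larger values are already integral and take the branch above.  */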
35220   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35221   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35222 
35223   ix86_sse_copysign_to_positive (res, xa, res, mask);
35224 
35225   emit_label (label);
35226   LABEL_NUSES (label) = 1;
35227 
35228   emit_move_insn (operand0, res);
35229 }
35230 
35231 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35232    into OPERAND0.  */
35233 void
35234 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35235 {
35236   /* C code for the stuff we expand below.
35237         double xa = fabs (x), x2;
35238         if (!isless (xa, TWO52))
35239           return x;
35240         xa = xa + TWO52 - TWO52;
35241         x2 = copysign (xa, x);
35242      Compensate.  Floor:
35243         if (x2 > x)
35244           x2 -= 1;
35245      Compensate.  Ceil:
35246         if (x2 < x)
35247           x2 -= -1;
35248         return x2;
35249    */
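  /* E.g. floor (2.7):  xa rounds to 3.0 via the TWO52 trick, x2 = 3.0 is
     greater than 2.7, so 1.0 is subtracted, giving 2.0.  For floor (-2.7)
     the copysign gives x2 = -3.0, which is not greater than -2.7, so no
     adjustment is made and -3.0 is the result.  */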
35250   enum machine_mode mode = GET_MODE (operand0);
35251   rtx xa, TWO52, tmp, label, one, res, mask;
35252 
35253   TWO52 = ix86_gen_TWO52 (mode);
35254 
35255   /* Temporary for holding the result, initialized to the input
35256      operand to ease control flow.  */
35257   res = gen_reg_rtx (mode);
35258   emit_move_insn (res, operand1);
35259 
35260   /* xa = abs (operand1) */
35261   xa = ix86_expand_sse_fabs (res, &mask);
35262 
35263   /* if (!isless (xa, TWO52)) goto label; */
35264   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35265 
35266   /* xa = xa + TWO52 - TWO52; */
35267   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35268   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35269 
35270   /* xa = copysign (xa, operand1) */
35271   ix86_sse_copysign_to_positive (xa, xa, res, mask);
35272 
35273   /* generate 1.0 or -1.0 */
35274   one = force_reg (mode,
35275 	           const_double_from_real_value (do_floor
35276 						 ? dconst1 : dconstm1, mode));
35277 
35278   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35279   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35280   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35281                           gen_rtx_AND (mode, one, tmp)));
35282   /* We always need to subtract here to preserve signed zero.  */
35283   tmp = expand_simple_binop (mode, MINUS,
35284 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35285   emit_move_insn (res, tmp);
35286 
35287   emit_label (label);
35288   LABEL_NUSES (label) = 1;
35289 
35290   emit_move_insn (operand0, res);
35291 }
35292 
35293 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35294    into OPERAND0.  */
35295 void
35296 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35297 {
35298   /* C code for the stuff we expand below.
35299 	double xa = fabs (x), x2;
35300         if (!isless (xa, TWO52))
35301           return x;
35302 	x2 = (double)(long)x;
35303      Compensate.  Floor:
35304 	if (x2 > x)
35305 	  x2 -= 1;
35306      Compensate.  Ceil:
35307 	if (x2 < x)
35308 	  x2 += 1;
35309 	if (HONOR_SIGNED_ZEROS (mode))
35310 	  return copysign (x2, x);
35311 	return x2;
35312    */
35313   enum machine_mode mode = GET_MODE (operand0);
35314   rtx xa, xi, TWO52, tmp, label, one, res, mask;
35315 
35316   TWO52 = ix86_gen_TWO52 (mode);
35317 
35318   /* Temporary for holding the result, initialized to the input
35319      operand to ease control flow.  */
35320   res = gen_reg_rtx (mode);
35321   emit_move_insn (res, operand1);
35322 
35323   /* xa = abs (operand1) */
35324   xa = ix86_expand_sse_fabs (res, &mask);
35325 
35326   /* if (!isless (xa, TWO52)) goto label; */
35327   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35328 
35329   /* xa = (double)(long)x */
35330   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35331   expand_fix (xi, res, 0);
35332   expand_float (xa, xi, 0);
35333 
35334   /* generate 1.0 */
35335   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35336 
35337   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35338   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35339   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35340                           gen_rtx_AND (mode, one, tmp)));
35341   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35342 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35343   emit_move_insn (res, tmp);
35344 
35345   if (HONOR_SIGNED_ZEROS (mode))
35346     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35347 
35348   emit_label (label);
35349   LABEL_NUSES (label) = 1;
35350 
35351   emit_move_insn (operand0, res);
35352 }
35353 
35354 /* Expand SSE sequence for computing round from OPERAND1 storing
35355    into OPERAND0.  This sequence works without relying on DImode truncation
35356    via cvttsd2siq, which is only available on 64-bit targets.  */
35357 void
35358 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35359 {
35360   /* C code for the stuff we expand below.
35361         double xa = fabs (x), xa2, x2;
35362         if (!isless (xa, TWO52))
35363           return x;
35364      Using the absolute value and copying back sign makes
35365      -0.0 -> -0.0 correct.
35366         xa2 = xa + TWO52 - TWO52;
35367      Compensate.
35368 	dxa = xa2 - xa;
35369         if (dxa <= -0.5)
35370           xa2 += 1;
35371         else if (dxa > 0.5)
35372           xa2 -= 1;
35373         x2 = copysign (xa2, x);
35374         return x2;
35375    */
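  /* The compensation turns the round-to-even result of the TWO52 trick into
     the round-halfway-away-from-zero semantics of round (), assuming the
     default rounding mode:  e.g. for x = 2.5, xa2 = 2.0 (ties to even) and
     dxa = -0.5, so 1.0 is added back, giving 3.0; for x = 3.5, xa2 = 4.0
     and dxa = 0.5, so no adjustment is needed.  */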
35376   enum machine_mode mode = GET_MODE (operand0);
35377   rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35378 
35379   TWO52 = ix86_gen_TWO52 (mode);
35380 
35381   /* Temporary for holding the result, initialized to the input
35382      operand to ease control flow.  */
35383   res = gen_reg_rtx (mode);
35384   emit_move_insn (res, operand1);
35385 
35386   /* xa = abs (operand1) */
35387   xa = ix86_expand_sse_fabs (res, &mask);
35388 
35389   /* if (!isless (xa, TWO52)) goto label; */
35390   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35391 
35392   /* xa2 = xa + TWO52 - TWO52; */
35393   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35394   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35395 
35396   /* dxa = xa2 - xa; */
35397   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35398 
35399   /* generate 0.5, 1.0 and -0.5 */
35400   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35401   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35402   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35403 			       0, OPTAB_DIRECT);
35404 
35405   /* Compensate.  */
35406   tmp = gen_reg_rtx (mode);
35407   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35408   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35409   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35410                           gen_rtx_AND (mode, one, tmp)));
35411   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35412   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35413   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35414   emit_insn (gen_rtx_SET (VOIDmode, tmp,
35415                           gen_rtx_AND (mode, one, tmp)));
35416   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35417 
35418   /* res = copysign (xa2, operand1) */
35419   ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35420 
35421   emit_label (label);
35422   LABEL_NUSES (label) = 1;
35423 
35424   emit_move_insn (operand0, res);
35425 }
35426 
35427 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35428    into OPERAND0.  */
35429 void
35430 ix86_expand_trunc (rtx operand0, rtx operand1)
35431 {
35432   /* C code for SSE variant we expand below.
35433         double xa = fabs (x), x2;
35434         if (!isless (xa, TWO52))
35435           return x;
35436         x2 = (double)(long)x;
35437 	if (HONOR_SIGNED_ZEROS (mode))
35438 	  return copysign (x2, x);
35439 	return x2;
35440    */
35441   enum machine_mode mode = GET_MODE (operand0);
35442   rtx xa, xi, TWO52, label, res, mask;
35443 
35444   TWO52 = ix86_gen_TWO52 (mode);
35445 
35446   /* Temporary for holding the result, initialized to the input
35447      operand to ease control flow.  */
35448   res = gen_reg_rtx (mode);
35449   emit_move_insn (res, operand1);
35450 
35451   /* xa = abs (operand1) */
35452   xa = ix86_expand_sse_fabs (res, &mask);
35453 
35454   /* if (!isless (xa, TWO52)) goto label; */
35455   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35456 
35457   /* x = (double)(long)x */
35458   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35459   expand_fix (xi, res, 0);
35460   expand_float (res, xi, 0);
35461 
35462   if (HONOR_SIGNED_ZEROS (mode))
35463     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35464 
35465   emit_label (label);
35466   LABEL_NUSES (label) = 1;
35467 
35468   emit_move_insn (operand0, res);
35469 }
35470 
35471 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35472    into OPERAND0.  */
35473 void
35474 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35475 {
35476   enum machine_mode mode = GET_MODE (operand0);
35477   rtx xa, mask, TWO52, label, one, res, smask, tmp;
35478 
35479   /* C code for SSE variant we expand below.
35480         double xa = fabs (x), x2;
35481         if (!isless (xa, TWO52))
35482           return x;
35483         xa2 = xa + TWO52 - TWO52;
35484      Compensate:
35485         if (xa2 > xa)
35486           xa2 -= 1.0;
35487         x2 = copysign (xa2, x);
35488         return x2;
35489    */
35490 
35491   TWO52 = ix86_gen_TWO52 (mode);
35492 
35493   /* Temporary for holding the result, initialized to the input
35494      operand to ease control flow.  */
35495   res = gen_reg_rtx (mode);
35496   emit_move_insn (res, operand1);
35497 
35498   /* xa = abs (operand1) */
35499   xa = ix86_expand_sse_fabs (res, &smask);
35500 
35501   /* if (!isless (xa, TWO52)) goto label; */
35502   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35503 
35504   /* res = xa + TWO52 - TWO52; */
35505   tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35506   tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35507   emit_move_insn (res, tmp);
35508 
35509   /* generate 1.0 */
35510   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35511 
35512   /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
35513   mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35514   emit_insn (gen_rtx_SET (VOIDmode, mask,
35515                           gen_rtx_AND (mode, mask, one)));
35516   tmp = expand_simple_binop (mode, MINUS,
35517 			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35518   emit_move_insn (res, tmp);
35519 
35520   /* res = copysign (res, operand1) */
35521   ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35522 
35523   emit_label (label);
35524   LABEL_NUSES (label) = 1;
35525 
35526   emit_move_insn (operand0, res);
35527 }
35528 
35529 /* Expand SSE sequence for computing round from OPERAND1 storing
35530    into OPERAND0.  */
35531 void
35532 ix86_expand_round (rtx operand0, rtx operand1)
35533 {
35534   /* C code for the stuff we're doing below:
35535         double xa = fabs (x);
35536         if (!isless (xa, TWO52))
35537           return x;
35538         xa = (double)(long)(xa + nextafter (0.5, 0.0));
35539         return copysign (xa, x);
35540    */
35541   enum machine_mode mode = GET_MODE (operand0);
35542   rtx res, TWO52, xa, label, xi, half, mask;
35543   const struct real_format *fmt;
35544   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35545 
35546   /* Temporary for holding the result, initialized to the input
35547      operand to ease control flow.  */
35548   res = gen_reg_rtx (mode);
35549   emit_move_insn (res, operand1);
35550 
35551   TWO52 = ix86_gen_TWO52 (mode);
35552   xa = ix86_expand_sse_fabs (res, &mask);
35553   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35554 
35555   /* load nextafter (0.5, 0.0) */
35556   fmt = REAL_MODE_FORMAT (mode);
35557   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35558   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35559 
35560   /* xa = xa + 0.5 */
35561   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35562   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35563 
35564   /* xa = (double)(int64_t)xa */
35565   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35566   expand_fix (xi, xa, 0);
35567   expand_float (xa, xi, 0);
35568 
35569   /* res = copysign (xa, operand1) */
35570   ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35571 
35572   emit_label (label);
35573   LABEL_NUSES (label) = 1;
35574 
35575   emit_move_insn (operand0, res);
35576 }
35577 
35578 /* Expand SSE sequence for computing round
35579    from OP1 storing into OP0 using sse4 round insn.  */
35580 void
35581 ix86_expand_round_sse4 (rtx op0, rtx op1)
35582 {
35583   enum machine_mode mode = GET_MODE (op0);
35584   rtx e1, e2, res, half;
35585   const struct real_format *fmt;
35586   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35587   rtx (*gen_copysign) (rtx, rtx, rtx);
35588   rtx (*gen_round) (rtx, rtx, rtx);
35589 
35590   switch (mode)
35591     {
35592     case SFmode:
35593       gen_copysign = gen_copysignsf3;
35594       gen_round = gen_sse4_1_roundsf2;
35595       break;
35596     case DFmode:
35597       gen_copysign = gen_copysigndf3;
35598       gen_round = gen_sse4_1_rounddf2;
35599       break;
35600     default:
35601       gcc_unreachable ();
35602     }
35603 
35604   /* round (a) = trunc (a + copysign (0.5, a)) */
35605 
35606   /* load nextafter (0.5, 0.0) */
35607   fmt = REAL_MODE_FORMAT (mode);
35608   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35609   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35610   half = const_double_from_real_value (pred_half, mode);
35611 
35612   /* e1 = copysign (0.5, op1) */
35613   e1 = gen_reg_rtx (mode);
35614   emit_insn (gen_copysign (e1, half, op1));
35615 
35616   /* e2 = op1 + e1 */
35617   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35618 
35619   /* res = trunc (e2) */
35620   res = gen_reg_rtx (mode);
35621   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35622 
35623   emit_move_insn (op0, res);
35624 }
35625 
35626 
35627 /* Table of valid machine attributes.  */
35628 static const struct attribute_spec ix86_attribute_table[] =
35629 {
35630   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35631        affects_type_identity } */
35632   /* Stdcall attribute says callee is responsible for popping arguments
35633      if they are not variable.  */
35634   { "stdcall",   0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35635     true },
35636   /* Fastcall attribute says callee is responsible for popping arguments
35637      if they are not variable.  */
35638   { "fastcall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35639     true },
35640   /* Thiscall attribute says callee is responsible for popping arguments
35641      if they are not variable.  */
35642   { "thiscall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35643     true },
35644   /* Cdecl attribute says the callee is a normal C declaration */
35645   { "cdecl",     0, 0, false, true,  true,  ix86_handle_cconv_attribute,
35646     true },
35647   /* Regparm attribute specifies how many integer arguments are to be
35648      passed in registers.  */
35649   { "regparm",   1, 1, false, true,  true,  ix86_handle_cconv_attribute,
35650     true },
35651   /* Sseregparm attribute says we are using x86_64 calling conventions
35652      for FP arguments.  */
35653   { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35654     true },
35655   /* The transactional memory builtins are implicitly regparm or fastcall
35656      depending on the ABI.  Override the generic do-nothing attribute that
35657      these builtins were declared with.  */
35658   { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35659     true },
35660   /* force_align_arg_pointer says this function realigns the stack at entry.  */
35661   { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35662     false, true,  true, ix86_handle_cconv_attribute, false },
35663 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35664   { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35665   { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35666   { "shared",    0, 0, true,  false, false, ix86_handle_shared_attribute,
35667     false },
35668 #endif
35669   { "ms_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute,
35670     false },
35671   { "gcc_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute,
35672     false },
35673 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35674   SUBTARGET_ATTRIBUTE_TABLE,
35675 #endif
35676   /* ms_abi and sysv_abi calling convention function attributes.  */
35677   { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35678   { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35679   { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35680     false },
35681   { "callee_pop_aggregate_return", 1, 1, false, true, true,
35682     ix86_handle_callee_pop_aggregate_return, true },
35683   /* End element.  */
35684   { NULL,        0, 0, false, false, false, NULL, false }
35685 };
35686 
35687 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
35688 static int
35689 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35690                                  tree vectype ATTRIBUTE_UNUSED,
35691                                  int misalign ATTRIBUTE_UNUSED)
35692 {
35693   switch (type_of_cost)
35694     {
35695       case scalar_stmt:
35696         return ix86_cost->scalar_stmt_cost;
35697 
35698       case scalar_load:
35699         return ix86_cost->scalar_load_cost;
35700 
35701       case scalar_store:
35702         return ix86_cost->scalar_store_cost;
35703 
35704       case vector_stmt:
35705         return ix86_cost->vec_stmt_cost;
35706 
35707       case vector_load:
35708         return ix86_cost->vec_align_load_cost;
35709 
35710       case vector_store:
35711         return ix86_cost->vec_store_cost;
35712 
35713       case vec_to_scalar:
35714         return ix86_cost->vec_to_scalar_cost;
35715 
35716       case scalar_to_vec:
35717         return ix86_cost->scalar_to_vec_cost;
35718 
35719       case unaligned_load:
35720       case unaligned_store:
35721         return ix86_cost->vec_unalign_load_cost;
35722 
35723       case cond_branch_taken:
35724         return ix86_cost->cond_taken_branch_cost;
35725 
35726       case cond_branch_not_taken:
35727         return ix86_cost->cond_not_taken_branch_cost;
35728 
35729       case vec_perm:
35730       case vec_promote_demote:
35731         return ix86_cost->vec_stmt_cost;
35732 
35733       default:
35734         gcc_unreachable ();
35735     }
35736 }
35737 
35738 /* Construct (set target (vec_select op0 (parallel perm))) and
35739    return true if that's a valid instruction in the active ISA.  */
35740 
35741 static bool
35742 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35743 {
35744   rtx rperm[MAX_VECT_LEN], x;
35745   unsigned i;
35746 
35747   for (i = 0; i < nelt; ++i)
35748     rperm[i] = GEN_INT (perm[i]);
35749 
35750   x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35751   x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35752   x = gen_rtx_SET (VOIDmode, target, x);
35753 
35754   x = emit_insn (x);
35755   if (recog_memoized (x) < 0)
35756     {
35757       remove_insn (x);
35758       return false;
35759     }
35760   return true;
35761 }
35762 
35763 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
35764 
35765 static bool
35766 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35767 			const unsigned char *perm, unsigned nelt)
35768 {
35769   enum machine_mode v2mode;
35770   rtx x;
35771 
35772   v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35773   x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35774   return expand_vselect (target, x, perm, nelt);
35775 }
35776 
35777 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
35778    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
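/* For example, the V4SFmode permutation { 0, 5, 2, 7 } takes elements 1 and
   3 from the second operand and the rest from the first, so the mask below
   becomes 0b1010 and a single blendps with that immediate implements it.  */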
35779 
35780 static bool
35781 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35782 {
35783   enum machine_mode vmode = d->vmode;
35784   unsigned i, mask, nelt = d->nelt;
35785   rtx target, op0, op1, x;
35786   rtx rperm[32], vperm;
35787 
35788   if (d->op0 == d->op1)
35789     return false;
35790   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35791     ;
35792   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35793     ;
35794   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35795     ;
35796   else
35797     return false;
35798 
35799   /* This is a blend, not a permute.  Elements must stay in their
35800      respective lanes.  */
35801   for (i = 0; i < nelt; ++i)
35802     {
35803       unsigned e = d->perm[i];
35804       if (!(e == i || e == i + nelt))
35805 	return false;
35806     }
35807 
35808   if (d->testing_p)
35809     return true;
35810 
35811   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
35812      decision should be extracted elsewhere, so that we only try that
35813      sequence once all budget==3 options have been tried.  */
35814   target = d->target;
35815   op0 = d->op0;
35816   op1 = d->op1;
35817   mask = 0;
35818 
35819   switch (vmode)
35820     {
35821     case V4DFmode:
35822     case V8SFmode:
35823     case V2DFmode:
35824     case V4SFmode:
35825     case V8HImode:
35826     case V8SImode:
35827       for (i = 0; i < nelt; ++i)
35828 	mask |= (d->perm[i] >= nelt) << i;
35829       break;
35830 
35831     case V2DImode:
35832       for (i = 0; i < 2; ++i)
35833 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35834       vmode = V8HImode;
35835       goto do_subreg;
35836 
35837     case V4SImode:
35838       for (i = 0; i < 4; ++i)
35839 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35840       vmode = V8HImode;
35841       goto do_subreg;
35842 
35843     case V16QImode:
35844       /* See if bytes move in pairs so we can use pblendw with
35845 	 an immediate argument, rather than pblendvb with a vector
35846 	 argument.  */
35847       for (i = 0; i < 16; i += 2)
35848 	if (d->perm[i] + 1 != d->perm[i + 1])
35849 	  {
35850 	  use_pblendvb:
35851 	    for (i = 0; i < nelt; ++i)
35852 	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35853 
35854 	  finish_pblendvb:
35855 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35856 	    vperm = force_reg (vmode, vperm);
35857 
35858 	    if (GET_MODE_SIZE (vmode) == 16)
35859 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35860 	    else
35861 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35862 	    return true;
35863 	  }
35864 
35865       for (i = 0; i < 8; ++i)
35866 	mask |= (d->perm[i * 2] >= 16) << i;
35867       vmode = V8HImode;
35868       /* FALLTHRU */
35869 
35870     do_subreg:
35871       target = gen_lowpart (vmode, target);
35872       op0 = gen_lowpart (vmode, op0);
35873       op1 = gen_lowpart (vmode, op1);
35874       break;
35875 
35876     case V32QImode:
35877       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
35878       for (i = 0; i < 32; i += 2)
35879 	if (d->perm[i] + 1 != d->perm[i + 1])
35880 	  goto use_pblendvb;
35881       /* See if bytes move in quadruplets.  If yes, vpblendd
35882 	 with immediate can be used.  */
35883       for (i = 0; i < 32; i += 4)
35884 	if (d->perm[i] + 2 != d->perm[i + 2])
35885 	  break;
35886       if (i < 32)
35887 	{
35888 	  /* See if bytes move the same in both lanes.  If yes,
35889 	     vpblendw with immediate can be used.  */
35890 	  for (i = 0; i < 16; i += 2)
35891 	    if (d->perm[i] + 16 != d->perm[i + 16])
35892 	      goto use_pblendvb;
35893 
35894 	  /* Use vpblendw.  */
35895 	  for (i = 0; i < 16; ++i)
35896 	    mask |= (d->perm[i * 2] >= 32) << i;
35897 	  vmode = V16HImode;
35898 	  goto do_subreg;
35899 	}
35900 
35901       /* Use vpblendd.  */
35902       for (i = 0; i < 8; ++i)
35903 	mask |= (d->perm[i * 4] >= 32) << i;
35904       vmode = V8SImode;
35905       goto do_subreg;
35906 
35907     case V16HImode:
35908       /* See if words move in pairs.  If yes, vpblendd can be used.  */
35909       for (i = 0; i < 16; i += 2)
35910 	if (d->perm[i] + 1 != d->perm[i + 1])
35911 	  break;
35912       if (i < 16)
35913 	{
35914 	  /* See if words move the same in both lanes.  If not,
35915 	     vpblendvb must be used.  */
35916 	  for (i = 0; i < 8; i++)
35917 	    if (d->perm[i] + 8 != d->perm[i + 8])
35918 	      {
35919 		/* Use vpblendvb.  */
35920 		for (i = 0; i < 32; ++i)
35921 		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35922 
35923 		vmode = V32QImode;
35924 		nelt = 32;
35925 		target = gen_lowpart (vmode, target);
35926 		op0 = gen_lowpart (vmode, op0);
35927 		op1 = gen_lowpart (vmode, op1);
35928 		goto finish_pblendvb;
35929 	      }
35930 
35931 	  /* Use vpblendw.  */
35932 	  for (i = 0; i < 16; ++i)
35933 	    mask |= (d->perm[i] >= 16) << i;
35934 	  break;
35935 	}
35936 
35937       /* Use vpblendd.  */
35938       for (i = 0; i < 8; ++i)
35939 	mask |= (d->perm[i * 2] >= 16) << i;
35940       vmode = V8SImode;
35941       goto do_subreg;
35942 
35943     case V4DImode:
35944       /* Use vpblendd.  */
35945       for (i = 0; i < 4; ++i)
35946 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35947       vmode = V8SImode;
35948       goto do_subreg;
35949 
35950     default:
35951       gcc_unreachable ();
35952     }
35953 
35954   /* This matches five different patterns with the different modes.  */
35955   x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35956   x = gen_rtx_SET (VOIDmode, target, x);
35957   emit_insn (x);
35958 
35959   return true;
35960 }
35961 
35962 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
35963    in terms of the variable form of vpermilps.
35964 
35965    Note that we will have already failed the immediate input vpermilps,
35966    which requires that the high and low part shuffle be identical; the
35967    variable form doesn't require that.  */
35968 
35969 static bool
35970 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35971 {
35972   rtx rperm[8], vperm;
35973   unsigned i;
35974 
35975   if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35976     return false;
35977 
35978   /* We can only permute within the 128-bit lane.  */
35979   for (i = 0; i < 8; ++i)
35980     {
35981       unsigned e = d->perm[i];
35982       if (i < 4 ? e >= 4 : e < 4)
35983 	return false;
35984     }
35985 
35986   if (d->testing_p)
35987     return true;
35988 
35989   for (i = 0; i < 8; ++i)
35990     {
35991       unsigned e = d->perm[i];
35992 
35993       /* Within each 128-bit lane, the elements of op0 are numbered
35994 	 from 0 and the elements of op1 are numbered from 4.  */
35995       if (e >= 8 + 4)
35996 	e -= 8;
35997       else if (e >= 4)
35998 	e -= 4;
35999 
36000       rperm[i] = GEN_INT (e);
36001     }
36002 
36003   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36004   vperm = force_reg (V8SImode, vperm);
36005   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36006 
36007   return true;
36008 }
36009 
36010 /* Return true if permutation D can be performed as VMODE permutation
36011    instead.  */
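/* E.g. the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, ..., 15 }
   only moves aligned 4-byte chunks, so it is also the V4SImode permutation
   { 1, 0, 2, 3 } and can be performed in that wider mode.  */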
36012 
36013 static bool
36014 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36015 {
36016   unsigned int i, j, chunk;
36017 
36018   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36019       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36020       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36021     return false;
36022 
36023   if (GET_MODE_NUNITS (vmode) >= d->nelt)
36024     return true;
36025 
36026   chunk = d->nelt / GET_MODE_NUNITS (vmode);
36027   for (i = 0; i < d->nelt; i += chunk)
36028     if (d->perm[i] & (chunk - 1))
36029       return false;
36030     else
36031       for (j = 1; j < chunk; ++j)
36032 	if (d->perm[i] + j != d->perm[i + j])
36033 	  return false;
36034 
36035   return true;
36036 }
36037 
36038 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
36039    in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128.  */
36040 
36041 static bool
36042 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36043 {
36044   unsigned i, nelt, eltsz, mask;
36045   unsigned char perm[32];
36046   enum machine_mode vmode = V16QImode;
36047   rtx rperm[32], vperm, target, op0, op1;
36048 
36049   nelt = d->nelt;
36050 
36051   if (d->op0 != d->op1)
36052     {
36053       if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36054 	{
36055 	  if (TARGET_AVX2
36056 	      && valid_perm_using_mode_p (V2TImode, d))
36057 	    {
36058 	      if (d->testing_p)
36059 		return true;
36060 
36061 	      /* Use vperm2i128 insn.  The pattern uses
36062 		 V4DImode instead of V2TImode.  */
36063 	      target = gen_lowpart (V4DImode, d->target);
36064 	      op0 = gen_lowpart (V4DImode, d->op0);
36065 	      op1 = gen_lowpart (V4DImode, d->op1);
36066 	      rperm[0]
36067 		= GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
36068 			   || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
36069 	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36070 	      return true;
36071 	    }
36072 	  return false;
36073 	}
36074     }
36075   else
36076     {
36077       if (GET_MODE_SIZE (d->vmode) == 16)
36078 	{
36079 	  if (!TARGET_SSSE3)
36080 	    return false;
36081 	}
36082       else if (GET_MODE_SIZE (d->vmode) == 32)
36083 	{
36084 	  if (!TARGET_AVX2)
36085 	    return false;
36086 
36087 	  /* V4DImode should be already handled through
36088 	     expand_vselect by vpermq instruction.  */
36089 	  gcc_assert (d->vmode != V4DImode);
36090 
36091 	  vmode = V32QImode;
36092 	  if (d->vmode == V8SImode
36093 	      || d->vmode == V16HImode
36094 	      || d->vmode == V32QImode)
36095 	    {
36096 	      /* First see if vpermq can be used for
36097 		 V8SImode/V16HImode/V32QImode.  */
36098 	      if (valid_perm_using_mode_p (V4DImode, d))
36099 		{
36100 		  for (i = 0; i < 4; i++)
36101 		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36102 		  if (d->testing_p)
36103 		    return true;
36104 		  return expand_vselect (gen_lowpart (V4DImode, d->target),
36105 					 gen_lowpart (V4DImode, d->op0),
36106 					 perm, 4);
36107 		}
36108 
36109 	      /* Next see if vpermd can be used.  */
36110 	      if (valid_perm_using_mode_p (V8SImode, d))
36111 		vmode = V8SImode;
36112 	    }
36113 
36114 	  if (vmode == V32QImode)
36115 	    {
36116 	      /* vpshufb only works intra-lane; it is not
36117 		 possible to shuffle bytes between the lanes.  */
36118 	      for (i = 0; i < nelt; ++i)
36119 		if ((d->perm[i] ^ i) & (nelt / 2))
36120 		  return false;
36121 	    }
36122 	}
36123       else
36124 	return false;
36125     }
36126 
36127   if (d->testing_p)
36128     return true;
36129 
36130   if (vmode == V8SImode)
36131     for (i = 0; i < 8; ++i)
36132       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36133   else
36134     {
36135       eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36136       if (d->op0 != d->op1)
36137 	mask = 2 * nelt - 1;
36138       else if (vmode == V16QImode)
36139 	mask = nelt - 1;
36140       else
36141 	mask = nelt / 2 - 1;
36142 
36143       for (i = 0; i < nelt; ++i)
36144 	{
36145 	  unsigned j, e = d->perm[i] & mask;
36146 	  for (j = 0; j < eltsz; ++j)
36147 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36148 	}
36149     }
36150 
36151   vperm = gen_rtx_CONST_VECTOR (vmode,
36152 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36153   vperm = force_reg (vmode, vperm);
36154 
36155   target = gen_lowpart (vmode, d->target);
36156   op0 = gen_lowpart (vmode, d->op0);
36157   if (d->op0 == d->op1)
36158     {
36159       if (vmode == V16QImode)
36160 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36161       else if (vmode == V32QImode)
36162 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36163       else
36164 	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
36165     }
36166   else
36167     {
36168       op1 = gen_lowpart (vmode, d->op1);
36169       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36170     }
36171 
36172   return true;
36173 }
36174 
36175 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
36176    in a single instruction.  */
36177 
36178 static bool
36179 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36180 {
36181   unsigned i, nelt = d->nelt;
36182   unsigned char perm2[MAX_VECT_LEN];
36183 
36184   /* Check plain VEC_SELECT first, because AVX has instructions that could
36185      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36186      input where SEL+CONCAT may not.  */
36187   if (d->op0 == d->op1)
36188     {
36189       int mask = nelt - 1;
36190       bool identity_perm = true;
36191       bool broadcast_perm = true;
36192 
36193       for (i = 0; i < nelt; i++)
36194 	{
36195 	  perm2[i] = d->perm[i] & mask;
36196 	  if (perm2[i] != i)
36197 	    identity_perm = false;
36198 	  if (perm2[i])
36199 	    broadcast_perm = false;
36200 	}
36201 
36202       if (identity_perm)
36203 	{
36204 	  if (!d->testing_p)
36205 	    emit_move_insn (d->target, d->op0);
36206 	  return true;
36207 	}
36208       else if (broadcast_perm && TARGET_AVX2)
36209 	{
36210 	  /* Use vpbroadcast{b,w,d}.  */
36211 	  rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
36212 	  switch (d->vmode)
36213 	    {
36214 	    case V32QImode:
36215 	      op = gen_lowpart (V16QImode, op);
36216 	      gen = gen_avx2_pbroadcastv32qi;
36217 	      break;
36218 	    case V16HImode:
36219 	      op = gen_lowpart (V8HImode, op);
36220 	      gen = gen_avx2_pbroadcastv16hi;
36221 	      break;
36222 	    case V8SImode:
36223 	      op = gen_lowpart (V4SImode, op);
36224 	      gen = gen_avx2_pbroadcastv8si;
36225 	      break;
36226 	    case V16QImode:
36227 	      gen = gen_avx2_pbroadcastv16qi;
36228 	      break;
36229 	    case V8HImode:
36230 	      gen = gen_avx2_pbroadcastv8hi;
36231 	      break;
36232 	    /* For other modes prefer other shuffles this function creates.  */
36233 	    default: break;
36234 	    }
36235 	  if (gen != NULL)
36236 	    {
36237 	      if (!d->testing_p)
36238 		emit_insn (gen (d->target, op));
36239 	      return true;
36240 	    }
36241 	}
36242 
36243       if (expand_vselect (d->target, d->op0, perm2, nelt))
36244 	return true;
36245 
36246       /* There are plenty of patterns in sse.md that are written for
36247 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
36248 	 that should be changed, to avoid the nastiness here.  */
36249 
36250       /* Recognize interleave style patterns, which means incrementing
36251 	 every other permutation operand.  */
36252       for (i = 0; i < nelt; i += 2)
36253 	{
36254 	  perm2[i] = d->perm[i] & mask;
36255 	  perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36256 	}
36257       if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36258 	return true;
36259 
36260       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
36261       if (nelt >= 4)
36262 	{
36263 	  for (i = 0; i < nelt; i += 4)
36264 	    {
36265 	      perm2[i + 0] = d->perm[i + 0] & mask;
36266 	      perm2[i + 1] = d->perm[i + 1] & mask;
36267 	      perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36268 	      perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36269 	    }
36270 
36271 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36272 	    return true;
36273 	}
36274     }
36275 
36276   /* Finally, try the fully general two operand permute.  */
36277   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
36278     return true;
36279 
36280   /* Recognize interleave style patterns with reversed operands.  */
36281   if (d->op0 != d->op1)
36282     {
36283       for (i = 0; i < nelt; ++i)
36284 	{
36285 	  unsigned e = d->perm[i];
36286 	  if (e >= nelt)
36287 	    e -= nelt;
36288 	  else
36289 	    e += nelt;
36290 	  perm2[i] = e;
36291 	}
36292 
36293       if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
36294 	return true;
36295     }
36296 
36297   /* Try the SSE4.1 blend variable merge instructions.  */
36298   if (expand_vec_perm_blend (d))
36299     return true;
36300 
36301   /* Try one of the AVX vpermil variable permutations.  */
36302   if (expand_vec_perm_vpermil (d))
36303     return true;
36304 
36305   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36306      vpshufb, vpermd or vpermq variable permutation.  */
36307   if (expand_vec_perm_pshufb (d))
36308     return true;
36309 
36310   return false;
36311 }
36312 
36313 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
36314    in terms of a pair of pshuflw + pshufhw instructions.  */
36315 
36316 static bool
36317 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36318 {
36319   unsigned char perm2[MAX_VECT_LEN];
36320   unsigned i;
36321   bool ok;
36322 
36323   if (d->vmode != V8HImode || d->op0 != d->op1)
36324     return false;
36325 
36326   /* The two permutations only operate in 64-bit lanes.  */
36327   for (i = 0; i < 4; ++i)
36328     if (d->perm[i] >= 4)
36329       return false;
36330   for (i = 4; i < 8; ++i)
36331     if (d->perm[i] < 4)
36332       return false;
36333 
36334   if (d->testing_p)
36335     return true;
36336 
36337   /* Emit the pshuflw.  */
36338   memcpy (perm2, d->perm, 4);
36339   for (i = 4; i < 8; ++i)
36340     perm2[i] = i;
36341   ok = expand_vselect (d->target, d->op0, perm2, 8);
36342   gcc_assert (ok);
36343 
36344   /* Emit the pshufhw.  */
36345   memcpy (perm2 + 4, d->perm + 4, 4);
36346   for (i = 0; i < 4; ++i)
36347     perm2[i] = i;
36348   ok = expand_vselect (d->target, d->target, perm2, 8);
36349   gcc_assert (ok);
36350 
36351   return true;
36352 }
36353 
36354 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36355    the permutation using the SSSE3 palignr instruction.  This succeeds
36356    when all of the elements in PERM fit within one vector and we merely
36357    need to shift them down so that a single vector permutation has a
36358    chance to succeed.  */
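/* For example, the V16QImode permutation { 3, 4, ..., 18 } selects bytes
   3-15 of the first operand followed by bytes 0-2 of the second; a palignr
   by 3 bytes produces exactly that, the remapped permutation becomes the
   identity, and no further shuffle is needed.  */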
36359 
36360 static bool
36361 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36362 {
36363   unsigned i, nelt = d->nelt;
36364   unsigned min, max;
36365   bool in_order, ok;
36366   rtx shift;
36367 
36368   /* Even with AVX, palignr only operates on 128-bit vectors.  */
36369   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36370     return false;
36371 
36372   min = nelt, max = 0;
36373   for (i = 0; i < nelt; ++i)
36374     {
36375       unsigned e = d->perm[i];
36376       if (e < min)
36377 	min = e;
36378       if (e > max)
36379 	max = e;
36380     }
36381   if (min == 0 || max - min >= nelt)
36382     return false;
36383 
36384   /* Given that we have SSSE3, we know we'll be able to implement the
36385      single operand permutation after the palignr with pshufb.  */
36386   if (d->testing_p)
36387     return true;
36388 
36389   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36390   emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36391 				  gen_lowpart (TImode, d->op1),
36392 				  gen_lowpart (TImode, d->op0), shift));
36393 
36394   d->op0 = d->op1 = d->target;
36395 
36396   in_order = true;
36397   for (i = 0; i < nelt; ++i)
36398     {
36399       unsigned e = d->perm[i] - min;
36400       if (e != i)
36401 	in_order = false;
36402       d->perm[i] = e;
36403     }
36404 
36405   /* Test for the degenerate case where the alignment by itself
36406      produces the desired permutation.  */
36407   if (in_order)
36408     return true;
36409 
36410   ok = expand_vec_perm_1 (d);
36411   gcc_assert (ok);
36412 
36413   return ok;
36414 }
36415 
36416 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36417 
36418 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36419    a two vector permutation into a single vector permutation by using
36420    an interleave operation to merge the vectors.  */
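/* CONTENTS below is a bitmask of which of the 2*nelt input element positions
   the permutation references.  For 16-byte vectors, if all referenced
   elements fall into two of the four nelt/2-sized halves (h1-h4), a single
   punpckl, punpckh or shufps of the two inputs gathers them into one
   register and the remaining single-operand shuffle (dfinal) finishes the
   permutation; e.g. for V8HImode, indices drawn only from { 0-3, 8-11 }
   satisfy (contents & (h1 | h3)) == contents and the punpcklwd form is used.
   The 32-byte path below performs the analogous check on eight quarters.  */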
36421 
36422 static bool
36423 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36424 {
36425   struct expand_vec_perm_d dremap, dfinal;
36426   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36427   unsigned HOST_WIDE_INT contents;
36428   unsigned char remap[2 * MAX_VECT_LEN];
36429   rtx seq;
36430   bool ok, same_halves = false;
36431 
36432   if (GET_MODE_SIZE (d->vmode) == 16)
36433     {
36434       if (d->op0 == d->op1)
36435 	return false;
36436     }
36437   else if (GET_MODE_SIZE (d->vmode) == 32)
36438     {
36439       if (!TARGET_AVX)
36440 	return false;
36441       /* For 32-byte modes allow even d->op0 == d->op1.
36442 	 The lack of cross-lane shuffling in some instructions
36443 	 might prevent a single insn shuffle.  */
36444       dfinal = *d;
36445       dfinal.testing_p = true;
36446       /* If expand_vec_perm_interleave3 can expand this into
36447 	 a 3 insn sequence, give up and let it be expanded that
36448 	 way instead.  While that is one insn longer, it doesn't
36449 	 need a memory operand, and in the common case where the
36450 	 interleave low and interleave high permutations with the
36451 	 same operands appear adjacent, only 4 insns are needed
36452 	 for both after CSE.  */
36453       if (expand_vec_perm_interleave3 (&dfinal))
36454 	return false;
36455     }
36456   else
36457     return false;
36458 
36459   /* Examine from whence the elements come.  */
36460   contents = 0;
36461   for (i = 0; i < nelt; ++i)
36462     contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36463 
36464   memset (remap, 0xff, sizeof (remap));
36465   dremap = *d;
36466 
36467   if (GET_MODE_SIZE (d->vmode) == 16)
36468     {
36469       unsigned HOST_WIDE_INT h1, h2, h3, h4;
36470 
36471       /* Split the two input vectors into 4 halves.  */
36472       h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36473       h2 = h1 << nelt2;
36474       h3 = h2 << nelt2;
36475       h4 = h3 << nelt2;
36476 
36477       /* If the elements are all from the low halves, use interleave low; use
36478 	 interleave high when they are all from the high halves.  If the elements
36479 	 are from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
36480       if ((contents & (h1 | h3)) == contents)
36481 	{
36482 	  /* punpckl* */
36483 	  for (i = 0; i < nelt2; ++i)
36484 	    {
36485 	      remap[i] = i * 2;
36486 	      remap[i + nelt] = i * 2 + 1;
36487 	      dremap.perm[i * 2] = i;
36488 	      dremap.perm[i * 2 + 1] = i + nelt;
36489 	    }
36490 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
36491 	    dremap.vmode = V4SFmode;
36492 	}
36493       else if ((contents & (h2 | h4)) == contents)
36494 	{
36495 	  /* punpckh* */
36496 	  for (i = 0; i < nelt2; ++i)
36497 	    {
36498 	      remap[i + nelt2] = i * 2;
36499 	      remap[i + nelt + nelt2] = i * 2 + 1;
36500 	      dremap.perm[i * 2] = i + nelt2;
36501 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36502 	    }
36503 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
36504 	    dremap.vmode = V4SFmode;
36505 	}
36506       else if ((contents & (h1 | h4)) == contents)
36507 	{
36508 	  /* shufps */
36509 	  for (i = 0; i < nelt2; ++i)
36510 	    {
36511 	      remap[i] = i;
36512 	      remap[i + nelt + nelt2] = i + nelt2;
36513 	      dremap.perm[i] = i;
36514 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
36515 	    }
36516 	  if (nelt != 4)
36517 	    {
36518 	      /* shufpd */
36519 	      dremap.vmode = V2DImode;
36520 	      dremap.nelt = 2;
36521 	      dremap.perm[0] = 0;
36522 	      dremap.perm[1] = 3;
36523 	    }
36524 	}
36525       else if ((contents & (h2 | h3)) == contents)
36526 	{
36527 	  /* shufps */
36528 	  for (i = 0; i < nelt2; ++i)
36529 	    {
36530 	      remap[i + nelt2] = i;
36531 	      remap[i + nelt] = i + nelt2;
36532 	      dremap.perm[i] = i + nelt2;
36533 	      dremap.perm[i + nelt2] = i + nelt;
36534 	    }
36535 	  if (nelt != 4)
36536 	    {
36537 	      /* shufpd */
36538 	      dremap.vmode = V2DImode;
36539 	      dremap.nelt = 2;
36540 	      dremap.perm[0] = 1;
36541 	      dremap.perm[1] = 2;
36542 	    }
36543 	}
36544       else
36545 	return false;
36546     }
36547   else
36548     {
36549       unsigned int nelt4 = nelt / 4, nzcnt = 0;
36550       unsigned HOST_WIDE_INT q[8];
36551       unsigned int nonzero_halves[4];
36552 
36553       /* Split the two input vectors into 8 quarters.  */
36554       q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36555       for (i = 1; i < 8; ++i)
36556 	q[i] = q[0] << (nelt4 * i);
36557       for (i = 0; i < 4; ++i)
36558 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36559 	  {
36560 	    nonzero_halves[nzcnt] = i;
36561 	    ++nzcnt;
36562 	  }
36563 
36564       if (nzcnt == 1)
36565 	{
36566 	  gcc_assert (d->op0 == d->op1);
36567 	  nonzero_halves[1] = nonzero_halves[0];
36568 	  same_halves = true;
36569 	}
36570       else if (d->op0 == d->op1)
36571 	{
36572 	  gcc_assert (nonzero_halves[0] == 0);
36573 	  gcc_assert (nonzero_halves[1] == 1);
36574 	}
36575 
36576       if (nzcnt <= 2)
36577 	{
36578 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
36579 	    {
36580 	      /* Attempt to increase the likelihood that dfinal
36581 		 shuffle will be intra-lane.  */
36582 	      char tmph = nonzero_halves[0];
36583 	      nonzero_halves[0] = nonzero_halves[1];
36584 	      nonzero_halves[1] = tmph;
36585 	    }
36586 
36587 	  /* vperm2f128 or vperm2i128.  */
36588 	  for (i = 0; i < nelt2; ++i)
36589 	    {
36590 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36591 	      remap[i + nonzero_halves[0] * nelt2] = i;
36592 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36593 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36594 	    }
36595 
36596 	  if (d->vmode != V8SFmode
36597 	      && d->vmode != V4DFmode
36598 	      && d->vmode != V8SImode)
36599 	    {
36600 	      dremap.vmode = V8SImode;
36601 	      dremap.nelt = 8;
36602 	      for (i = 0; i < 4; ++i)
36603 		{
36604 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
36605 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36606 		}
36607 	    }
36608 	}
36609       else if (d->op0 == d->op1)
36610 	return false;
36611       else if (TARGET_AVX2
36612 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36613 	{
36614 	  /* vpunpckl* */
36615 	  for (i = 0; i < nelt4; ++i)
36616 	    {
36617 	      remap[i] = i * 2;
36618 	      remap[i + nelt] = i * 2 + 1;
36619 	      remap[i + nelt2] = i * 2 + nelt2;
36620 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36621 	      dremap.perm[i * 2] = i;
36622 	      dremap.perm[i * 2 + 1] = i + nelt;
36623 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
36624 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36625 	    }
36626 	}
36627       else if (TARGET_AVX2
36628 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36629 	{
36630 	  /* vpunpckh* */
36631 	  for (i = 0; i < nelt4; ++i)
36632 	    {
36633 	      remap[i + nelt4] = i * 2;
36634 	      remap[i + nelt + nelt4] = i * 2 + 1;
36635 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36636 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36637 	      dremap.perm[i * 2] = i + nelt4;
36638 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36639 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36640 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36641 	    }
36642 	}
36643       else
36644 	return false;
36645     }
36646 
36647   /* Use the remapping array set up above to move the elements from their
36648      swizzled locations into their final destinations.  */
36649   dfinal = *d;
36650   for (i = 0; i < nelt; ++i)
36651     {
36652       unsigned e = remap[d->perm[i]];
36653       gcc_assert (e < nelt);
36654       /* If same_halves is true, both halves of the remapped vector are the
36655 	 same.  Avoid cross-lane accesses if possible.  */
36656       if (same_halves && i >= nelt2)
36657 	{
36658 	  gcc_assert (e < nelt2);
36659 	  dfinal.perm[i] = e + nelt2;
36660 	}
36661       else
36662 	dfinal.perm[i] = e;
36663     }
36664 
36665   if (!d->testing_p)
36666     dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36667   dfinal.op1 = dfinal.op0;
36668   dremap.target = dfinal.op0;
36669 
36670   /* Test if the final remap can be done with a single insn.  For V4SFmode or
36671      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
36672   start_sequence ();
36673   ok = expand_vec_perm_1 (&dfinal);
36674   seq = get_insns ();
36675   end_sequence ();
36676 
36677   if (!ok)
36678     return false;
36679 
36680   if (d->testing_p)
36681     return true;
36682 
36683   if (dremap.vmode != dfinal.vmode)
36684     {
36685       dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36686       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36687       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36688     }
36689 
36690   ok = expand_vec_perm_1 (&dremap);
36691   gcc_assert (ok);
36692 
36693   emit_insn (seq);
36694   return true;
36695 }
36696 
36697 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36698    a single vector cross-lane permutation into vpermq followed
36699    by any of the single insn permutations.  */
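/* Loosely, the observation used here: with a single V32QImode or V16HImode
   operand, each 128-bit half of the result may draw its bytes from at most
   two of the four 64-bit quarters of the source.  A V4DImode permutation
   (vpermq) first moves those quarters into the lane that needs them, and a
   single in-lane shuffle such as vpshufb then puts the bytes into their
   final order.  */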
36700 
36701 static bool
36702 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36703 {
36704   struct expand_vec_perm_d dremap, dfinal;
36705   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36706   unsigned contents[2];
36707   bool ok;
36708 
36709   if (!(TARGET_AVX2
36710 	&& (d->vmode == V32QImode || d->vmode == V16HImode)
36711 	&& d->op0 == d->op1))
36712     return false;
36713 
36714   contents[0] = 0;
36715   contents[1] = 0;
36716   for (i = 0; i < nelt2; ++i)
36717     {
36718       contents[0] |= 1u << (d->perm[i] / nelt4);
36719       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36720     }
36721 
36722   for (i = 0; i < 2; ++i)
36723     {
36724       unsigned int cnt = 0;
36725       for (j = 0; j < 4; ++j)
36726 	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36727 	  return false;
36728     }
36729 
36730   if (d->testing_p)
36731     return true;
36732 
36733   dremap = *d;
36734   dremap.vmode = V4DImode;
36735   dremap.nelt = 4;
36736   dremap.target = gen_reg_rtx (V4DImode);
36737   dremap.op0 = gen_lowpart (V4DImode, d->op0);
36738   dremap.op1 = dremap.op0;
36739   for (i = 0; i < 2; ++i)
36740     {
36741       unsigned int cnt = 0;
36742       for (j = 0; j < 4; ++j)
36743 	if ((contents[i] & (1u << j)) != 0)
36744 	  dremap.perm[2 * i + cnt++] = j;
36745       for (; cnt < 2; ++cnt)
36746 	dremap.perm[2 * i + cnt] = 0;
36747     }
36748 
36749   dfinal = *d;
36750   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36751   dfinal.op1 = dfinal.op0;
36752   for (i = 0, j = 0; i < nelt; ++i)
36753     {
36754       if (i == nelt2)
36755 	j = 2;
36756       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36757       if ((d->perm[i] / nelt4) == dremap.perm[j])
36758 	;
36759       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36760 	dfinal.perm[i] |= nelt4;
36761       else
36762 	gcc_unreachable ();
36763     }
36764 
36765   ok = expand_vec_perm_1 (&dremap);
36766   gcc_assert (ok);
36767 
36768   ok = expand_vec_perm_1 (&dfinal);
36769   gcc_assert (ok);
36770 
36771   return true;
36772 }
36773 
36774 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
36775    a two vector permutation using 2 intra-lane interleave insns
36776    and cross-lane shuffle for 32-byte vectors.  */
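/* Concretely, the permutations accepted below have the form
   { e, e + nelt, e + 1, e + 1 + nelt, ... } with e equal to 0 or nelt / 2,
   e.g. { 0, 8, 1, 9, 2, 10, 3, 11 } for V8SImode; these are exactly the
   interleave-low and interleave-high patterns of the two operands.  */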
36777 
36778 static bool
36779 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36780 {
36781   unsigned i, nelt;
36782   rtx (*gen) (rtx, rtx, rtx);
36783 
36784   if (d->op0 == d->op1)
36785     return false;
36786   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36787     ;
36788   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36789     ;
36790   else
36791     return false;
36792 
36793   nelt = d->nelt;
36794   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36795     return false;
36796   for (i = 0; i < nelt; i += 2)
36797     if (d->perm[i] != d->perm[0] + i / 2
36798 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36799       return false;
36800 
36801   if (d->testing_p)
36802     return true;
36803 
36804   switch (d->vmode)
36805     {
36806     case V32QImode:
36807       if (d->perm[0])
36808 	gen = gen_vec_interleave_highv32qi;
36809       else
36810 	gen = gen_vec_interleave_lowv32qi;
36811       break;
36812     case V16HImode:
36813       if (d->perm[0])
36814 	gen = gen_vec_interleave_highv16hi;
36815       else
36816 	gen = gen_vec_interleave_lowv16hi;
36817       break;
36818     case V8SImode:
36819       if (d->perm[0])
36820 	gen = gen_vec_interleave_highv8si;
36821       else
36822 	gen = gen_vec_interleave_lowv8si;
36823       break;
36824     case V4DImode:
36825       if (d->perm[0])
36826 	gen = gen_vec_interleave_highv4di;
36827       else
36828 	gen = gen_vec_interleave_lowv4di;
36829       break;
36830     case V8SFmode:
36831       if (d->perm[0])
36832 	gen = gen_vec_interleave_highv8sf;
36833       else
36834 	gen = gen_vec_interleave_lowv8sf;
36835       break;
36836     case V4DFmode:
36837       if (d->perm[0])
36838 	gen = gen_vec_interleave_highv4df;
36839       else
36840 	gen = gen_vec_interleave_lowv4df;
36841       break;
36842     default:
36843       gcc_unreachable ();
36844     }
36845 
36846   emit_insn (gen (d->target, d->op0, d->op1));
36847   return true;
36848 }
36849 
36850 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
36851    permutation with two pshufb insns and an ior.  We should have already
36852    failed all two instruction sequences.  */
36853 
36854 static bool
36855 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36856 {
36857   rtx rperm[2][16], vperm, l, h, op, m128;
36858   unsigned int i, nelt, eltsz;
36859 
36860   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36861     return false;
36862   gcc_assert (d->op0 != d->op1);
36863 
36864   if (d->testing_p)
36865     return true;
36866 
36867   nelt = d->nelt;
36868   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36869 
36870   /* Generate two permutation masks.  If the required element is within
36871      the given vector it is shuffled into the proper lane.  If the required
36872      element is in the other vector, force a zero into the lane by setting
36873      bit 7 in the permutation mask.  */
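  /* For instance, if element 0 of a V16QImode result is source element 3
     (from the first vector) and element 1 is source element 17 (element 1
     of the second vector), the first mask starts { 3, -128, ... } and the
     second starts { -128, 1, ... }; the two pshufb results are then merged
     with a single por.  */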
36874   m128 = GEN_INT (-128);
36875   for (i = 0; i < nelt; ++i)
36876     {
36877       unsigned j, e = d->perm[i];
36878       unsigned which = (e >= nelt);
36879       if (e >= nelt)
36880 	e -= nelt;
36881 
36882       for (j = 0; j < eltsz; ++j)
36883 	{
36884 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36885 	  rperm[1-which][i*eltsz + j] = m128;
36886 	}
36887     }
36888 
36889   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36890   vperm = force_reg (V16QImode, vperm);
36891 
36892   l = gen_reg_rtx (V16QImode);
36893   op = gen_lowpart (V16QImode, d->op0);
36894   emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36895 
36896   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36897   vperm = force_reg (V16QImode, vperm);
36898 
36899   h = gen_reg_rtx (V16QImode);
36900   op = gen_lowpart (V16QImode, d->op1);
36901   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36902 
36903   op = gen_lowpart (V16QImode, d->target);
36904   emit_insn (gen_iorv16qi3 (op, l, h));
36905 
36906   return true;
36907 }
36908 
36909 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36910    with two vpshufb insns, vpermq and vpor.  We should have already failed
36911    all two or three instruction sequences.  */
36912 
36913 static bool
36914 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36915 {
36916   rtx rperm[2][32], vperm, l, h, hp, op, m128;
36917   unsigned int i, nelt, eltsz;
36918 
36919   if (!TARGET_AVX2
36920       || d->op0 != d->op1
36921       || (d->vmode != V32QImode && d->vmode != V16HImode))
36922     return false;
36923 
36924   if (d->testing_p)
36925     return true;
36926 
36927   nelt = d->nelt;
36928   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36929 
36930   /* Generate two permutation masks.  If the required element is within
36931      the same lane, it is shuffled in.  If the required element is from the
36932      other lane, force a zero by setting bit 7 in the permutation mask.
36933      The other mask has a non-negative element whenever an element is
36934      requested from the other lane; that element is also moved to the
36935      other lane, so that the result of vpshufb can have its two V2TImode
36936      halves swapped.  */
36937   m128 = GEN_INT (-128);
36938   for (i = 0; i < nelt; ++i)
36939     {
36940       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36941       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36942 
36943       for (j = 0; j < eltsz; ++j)
36944 	{
36945 	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36946 	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36947 	}
36948     }
36949 
36950   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36951   vperm = force_reg (V32QImode, vperm);
36952 
36953   h = gen_reg_rtx (V32QImode);
36954   op = gen_lowpart (V32QImode, d->op0);
36955   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36956 
36957   /* Swap the 128-bit lanes of h into hp.  */
36958   hp = gen_reg_rtx (V4DImode);
36959   op = gen_lowpart (V4DImode, h);
36960   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36961 				  const1_rtx));
36962 
36963   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36964   vperm = force_reg (V32QImode, vperm);
36965 
36966   l = gen_reg_rtx (V32QImode);
36967   op = gen_lowpart (V32QImode, d->op0);
36968   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36969 
36970   op = gen_lowpart (V32QImode, d->target);
36971   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36972 
36973   return true;
36974 }
36975 
36976 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
36977    and extract-odd permutations of two V32QImode or V16HImode operands
36978    with two vpshufb insns, vpor and vpermq.  We should have already
36979    failed all two or three instruction sequences.  */
36980 
36981 static bool
36982 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36983 {
36984   rtx rperm[2][32], vperm, l, h, ior, op, m128;
36985   unsigned int i, nelt, eltsz;
36986 
36987   if (!TARGET_AVX2
36988       || d->op0 == d->op1
36989       || (d->vmode != V32QImode && d->vmode != V16HImode))
36990     return false;
36991 
36992   for (i = 0; i < d->nelt; ++i)
36993     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36994       return false;
36995 
36996   if (d->testing_p)
36997     return true;
36998 
36999   nelt = d->nelt;
37000   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37001 
37002   /* Generate two permutation masks.  In the first permutation mask
37003      the first quarter will contain indexes for the first half
37004      of the op0, the second quarter will contain bit 7 set, third quarter
37005      will contain indexes for the second half of the op0 and the
37006      last quarter bit 7 set.  In the second permutation mask
37007      the first quarter will contain bit 7 set, the second quarter
37008      indexes for the first half of the op1, the third quarter bit 7 set
37009      and last quarter indexes for the second half of the op1.
37010      I.e. the first mask e.g. for V32QImode extract even will be:
37011      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37012      (all values masked with 0xf except for -128) and second mask
37013      for extract even will be
37014      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
37015   m128 = GEN_INT (-128);
37016   for (i = 0; i < nelt; ++i)
37017     {
37018       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37019       unsigned which = d->perm[i] >= nelt;
37020       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
37021 
37022       for (j = 0; j < eltsz; ++j)
37023 	{
37024 	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
37025 	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
37026 	}
37027     }
37028 
37029   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37030   vperm = force_reg (V32QImode, vperm);
37031 
37032   l = gen_reg_rtx (V32QImode);
37033   op = gen_lowpart (V32QImode, d->op0);
37034   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37035 
37036   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37037   vperm = force_reg (V32QImode, vperm);
37038 
37039   h = gen_reg_rtx (V32QImode);
37040   op = gen_lowpart (V32QImode, d->op1);
37041   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37042 
37043   ior = gen_reg_rtx (V32QImode);
37044   emit_insn (gen_iorv32qi3 (ior, l, h));
37045 
37046   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
37047   op = gen_lowpart (V4DImode, d->target);
37048   ior = gen_lowpart (V4DImode, ior);
37049   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37050 				  const1_rtx, GEN_INT (3)));
37051 
37052   return true;
37053 }
37054 
37055 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
37056    and extract-odd permutations.  */
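/* That is, ODD selects between the permutation { 0, 2, 4, ... } (extract
   even) and { 1, 3, 5, ... } (extract odd), taken over the concatenation
   of the two input operands.  */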
37057 
37058 static bool
37059 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37060 {
37061   rtx t1, t2, t3;
37062 
37063   switch (d->vmode)
37064     {
37065     case V4DFmode:
37066       if (d->testing_p)
37067 	break;
37068       t1 = gen_reg_rtx (V4DFmode);
37069       t2 = gen_reg_rtx (V4DFmode);
37070 
37071       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
37072       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37073       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
37074 
37075       /* Now an unpck[lh]pd will produce the result required.  */
37076       if (odd)
37077 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37078       else
37079 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37080       emit_insn (t3);
37081       break;
37082 
37083     case V8SFmode:
37084       {
37085 	int mask = odd ? 0xdd : 0x88;
37086 
37087 	if (d->testing_p)
37088 	  break;
37089 	t1 = gen_reg_rtx (V8SFmode);
37090 	t2 = gen_reg_rtx (V8SFmode);
37091 	t3 = gen_reg_rtx (V8SFmode);
37092 
37093 	/* Shuffle within the 128-bit lanes to produce:
37094 	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
37095 	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37096 				      GEN_INT (mask)));
37097 
37098 	/* Shuffle the lanes around to produce:
37099 	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
37100 	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37101 					    GEN_INT (0x3)));
37102 
37103 	/* Shuffle within the 128-bit lanes to produce:
37104 	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
37105 	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37106 
37107 	/* Shuffle within the 128-bit lanes to produce:
37108 	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
37109 	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37110 
37111 	/* Shuffle the lanes around to produce:
37112 	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
37113 	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37114 					    GEN_INT (0x20)));
37115       }
37116       break;
37117 
37118     case V2DFmode:
37119     case V4SFmode:
37120     case V2DImode:
37121     case V4SImode:
37122       /* These are always directly implementable by expand_vec_perm_1.  */
37123       gcc_unreachable ();
37124 
37125     case V8HImode:
37126       if (TARGET_SSSE3)
37127 	return expand_vec_perm_pshufb2 (d);
37128       else
37129 	{
37130 	  if (d->testing_p)
37131 	    break;
37132 	  /* We need 2*log2(N)-1 operations to achieve odd/even
37133 	     with interleave. */
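	  /* Sketch of the sequence for V8HImode extract-even (element
	     indices, op1 elements written as 8..15):
	       t1     = high (op0, op1)    = { 4 12  5 13  6 14  7 15 }
	       target = low  (op0, op1)    = { 0  8  1  9  2 10  3 11 }
	       t2     = high (target, t1)  = { 2  6 10 14  3  7 11 15 }
	       target = low  (target, t1)  = { 0  4  8 12  1  5  9 13 }
	       target = low  (target, t2)  = { 0  2  4  6  8 10 12 14 }
	     with interleave high used in the last step for extract-odd.  */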
37134 	  t1 = gen_reg_rtx (V8HImode);
37135 	  t2 = gen_reg_rtx (V8HImode);
37136 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37137 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37138 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37139 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37140 	  if (odd)
37141 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37142 	  else
37143 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37144 	  emit_insn (t3);
37145 	}
37146       break;
37147 
37148     case V16QImode:
37149       if (TARGET_SSSE3)
37150 	return expand_vec_perm_pshufb2 (d);
37151       else
37152 	{
37153 	  if (d->testing_p)
37154 	    break;
37155 	  t1 = gen_reg_rtx (V16QImode);
37156 	  t2 = gen_reg_rtx (V16QImode);
37157 	  t3 = gen_reg_rtx (V16QImode);
37158 	  emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37159 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37160 	  emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37161 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37162 	  emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37163 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37164 	  if (odd)
37165 	    t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37166 	  else
37167 	    t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37168 	  emit_insn (t3);
37169 	}
37170       break;
37171 
37172     case V16HImode:
37173     case V32QImode:
37174       return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37175 
37176     case V4DImode:
37177       if (!TARGET_AVX2)
37178 	{
37179 	  struct expand_vec_perm_d d_copy = *d;
37180 	  d_copy.vmode = V4DFmode;
37181 	  d_copy.target = gen_lowpart (V4DFmode, d->target);
37182 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37183 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37184 	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
37185 	}
37186 
37187       if (d->testing_p)
37188 	break;
37189 
37190       t1 = gen_reg_rtx (V4DImode);
37191       t2 = gen_reg_rtx (V4DImode);
37192 
37193       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
37194       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37195       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37196 
37197       /* Now a vpunpck[lh]qdq will produce the result required.  */
37198       if (odd)
37199 	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37200       else
37201 	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37202       emit_insn (t3);
37203       break;
37204 
37205     case V8SImode:
37206       if (!TARGET_AVX2)
37207 	{
37208 	  struct expand_vec_perm_d d_copy = *d;
37209 	  d_copy.vmode = V8SFmode;
37210 	  d_copy.target = gen_lowpart (V8SFmode, d->target);
37211 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37212 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37213 	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
37214 	}
37215 
37216       if (d->testing_p)
37217 	break;
37218 
37219       t1 = gen_reg_rtx (V8SImode);
37220       t2 = gen_reg_rtx (V8SImode);
37221 
37222       /* Shuffle the lanes around into
37223 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
37224       emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37225 				    gen_lowpart (V4DImode, d->op0),
37226 				    gen_lowpart (V4DImode, d->op1),
37227 				    GEN_INT (0x20)));
37228       emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37229 				    gen_lowpart (V4DImode, d->op0),
37230 				    gen_lowpart (V4DImode, d->op1),
37231 				    GEN_INT (0x31)));
37232 
37233       /* Swap the 2nd and 3rd position in each lane into
37234 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
37235       emit_insn (gen_avx2_pshufdv3 (t1, t1,
37236 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37237       emit_insn (gen_avx2_pshufdv3 (t2, t2,
37238 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37239 
37240       /* Now a vpunpck[lh]qdq will produce
37241 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
37242       if (odd)
37243 	t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37244 					   gen_lowpart (V4DImode, t1),
37245 					   gen_lowpart (V4DImode, t2));
37246       else
37247 	t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37248 					  gen_lowpart (V4DImode, t1),
37249 					  gen_lowpart (V4DImode, t2));
37250       emit_insn (t3);
37251       break;
37252 
37253     default:
37254       gcc_unreachable ();
37255     }
37256 
37257   return true;
37258 }
37259 
37260 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
37261    extract-even and extract-odd permutations.  */
37262 
37263 static bool
37264 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37265 {
37266   unsigned i, odd, nelt = d->nelt;
37267 
37268   odd = d->perm[0];
37269   if (odd != 0 && odd != 1)
37270     return false;
37271 
37272   for (i = 1; i < nelt; ++i)
37273     if (d->perm[i] != 2 * i + odd)
37274       return false;
37275 
37276   return expand_vec_perm_even_odd_1 (d, odd);
37277 }
37278 
37279 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
37280    permutations.  We assume that expand_vec_perm_1 has already failed.  */
37281 
37282 static bool
37283 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37284 {
37285   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37286   enum machine_mode vmode = d->vmode;
37287   unsigned char perm2[4];
37288   rtx op0 = d->op0;
37289   bool ok;
37290 
37291   switch (vmode)
37292     {
37293     case V4DFmode:
37294     case V8SFmode:
37295       /* These are special-cased in sse.md so that we can optionally
37296 	 use the vbroadcast instruction.  They expand to two insns
37297 	 if the input happens to be in a register.  */
37298       gcc_unreachable ();
37299 
37300     case V2DFmode:
37301     case V2DImode:
37302     case V4SFmode:
37303     case V4SImode:
37304       /* These are always implementable using standard shuffle patterns.  */
37305       gcc_unreachable ();
37306 
37307     case V8HImode:
37308     case V16QImode:
37309       /* These can be implemented via interleave.  We save one insn by
37310 	 stopping once we have promoted to V4SImode and then use pshufd.  */
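      /* For example, broadcasting element 5 of a V8HImode operand takes one
	 vec_interleave_highv8hi of the operand with itself, which yields
	 { 4 4 5 5 6 6 7 7 }; viewed as V4SImode the wanted value now fills
	 element 1, so a pshufd replicating that element finishes the job.  */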
37311       if (d->testing_p)
37312 	return true;
37313       do
37314 	{
37315 	  rtx dest;
37316 	  rtx (*gen) (rtx, rtx, rtx)
37317 	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37318 				 : gen_vec_interleave_lowv8hi;
37319 
37320 	  if (elt >= nelt2)
37321 	    {
37322 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37323 				       : gen_vec_interleave_highv8hi;
37324 	      elt -= nelt2;
37325 	    }
37326 	  nelt2 /= 2;
37327 
37328 	  dest = gen_reg_rtx (vmode);
37329 	  emit_insn (gen (dest, op0, op0));
37330 	  vmode = get_mode_wider_vector (vmode);
37331 	  op0 = gen_lowpart (vmode, dest);
37332 	}
37333       while (vmode != V4SImode);
37334 
37335       memset (perm2, elt, 4);
37336       ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
37337       gcc_assert (ok);
37338       return true;
37339 
37340     case V32QImode:
37341     case V16HImode:
37342     case V8SImode:
37343     case V4DImode:
37344       /* For AVX2 broadcasts of the first element vpbroadcast* or
37345 	 vpermq should be used by expand_vec_perm_1.  */
37346       gcc_assert (!TARGET_AVX2 || d->perm[0]);
37347       return false;
37348 
37349     default:
37350       gcc_unreachable ();
37351     }
37352 }
37353 
37354 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
37355    broadcast permutations.  */
37356 
37357 static bool
37358 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37359 {
37360   unsigned i, elt, nelt = d->nelt;
37361 
37362   if (d->op0 != d->op1)
37363     return false;
37364 
37365   elt = d->perm[0];
37366   for (i = 1; i < nelt; ++i)
37367     if (d->perm[i] != elt)
37368       return false;
37369 
37370   return expand_vec_perm_broadcast_1 (d);
37371 }
37372 
37373 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
37374    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
37375    all the shorter instruction sequences.  */
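/* In outline: rperm[0] and rperm[2] below are the vpshufb masks selecting
   the bytes of op0 resp. op1 that stay within their own 128-bit lane, while
   rperm[1] and rperm[3] select the bytes that must cross lanes; the
   cross-lane results are lane-swapped with vpermq and everything is merged
   with vpor.  Masks that turn out to be unused are skipped, so the emitted
   sequence can be shorter than the worst case named above.  */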
37376 
37377 static bool
37378 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37379 {
37380   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37381   unsigned int i, nelt, eltsz;
37382   bool used[4];
37383 
37384   if (!TARGET_AVX2
37385       || d->op0 == d->op1
37386       || (d->vmode != V32QImode && d->vmode != V16HImode))
37387     return false;
37388 
37389   if (d->testing_p)
37390     return true;
37391 
37392   nelt = d->nelt;
37393   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37394 
37395   /* Generate 4 permutation masks.  If the required element is within
37396      the same lane, it is shuffled in via the same-lane mask for that
37397      operand.  If the required element is from the other lane, the
37398      same-lane mask keeps a zero (bit 7 set) and the cross-lane mask gets
37399      a non-negative element, placed so that the vpshufb result can have
37400      its two V2TImode halves swapped afterwards.  One pair of masks is
37401      used per operand.  */
37402   m128 = GEN_INT (-128);
37403   for (i = 0; i < 32; ++i)
37404     {
37405       rperm[0][i] = m128;
37406       rperm[1][i] = m128;
37407       rperm[2][i] = m128;
37408       rperm[3][i] = m128;
37409     }
37410   used[0] = false;
37411   used[1] = false;
37412   used[2] = false;
37413   used[3] = false;
37414   for (i = 0; i < nelt; ++i)
37415     {
37416       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37417       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37418       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37419 
37420       for (j = 0; j < eltsz; ++j)
37421 	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37422       used[which] = true;
37423     }
37424 
37425   for (i = 0; i < 2; ++i)
37426     {
37427       if (!used[2 * i + 1])
37428 	{
37429 	  h[i] = NULL_RTX;
37430 	  continue;
37431 	}
37432       vperm = gen_rtx_CONST_VECTOR (V32QImode,
37433 				    gen_rtvec_v (32, rperm[2 * i + 1]));
37434       vperm = force_reg (V32QImode, vperm);
37435       h[i] = gen_reg_rtx (V32QImode);
37436       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37437       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37438     }
37439 
37440   /* Swap the 128-bit lanes of h[X].  */
37441   for (i = 0; i < 2; ++i)
37442    {
37443      if (h[i] == NULL_RTX)
37444        continue;
37445      op = gen_reg_rtx (V4DImode);
37446      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37447 				     const2_rtx, GEN_INT (3), const0_rtx,
37448 				     const1_rtx));
37449      h[i] = gen_lowpart (V32QImode, op);
37450    }
37451 
37452   for (i = 0; i < 2; ++i)
37453     {
37454       if (!used[2 * i])
37455 	{
37456 	  l[i] = NULL_RTX;
37457 	  continue;
37458 	}
37459       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37460       vperm = force_reg (V32QImode, vperm);
37461       l[i] = gen_reg_rtx (V32QImode);
37462       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37463       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37464     }
37465 
37466   for (i = 0; i < 2; ++i)
37467     {
37468       if (h[i] && l[i])
37469 	{
37470 	  op = gen_reg_rtx (V32QImode);
37471 	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37472 	  l[i] = op;
37473 	}
37474       else if (h[i])
37475 	l[i] = h[i];
37476     }
37477 
37478   gcc_assert (l[0] && l[1]);
37479   op = gen_lowpart (V32QImode, d->target);
37480   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37481   return true;
37482 }
37483 
37484 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37485    With all of the interface bits taken care of, perform the expansion
37486    in D and return true on success.  */
37487 
37488 static bool
37489 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37490 {
37491   /* Try a single instruction expansion.  */
37492   if (expand_vec_perm_1 (d))
37493     return true;
37494 
37495   /* Try sequences of two instructions.  */
37496 
37497   if (expand_vec_perm_pshuflw_pshufhw (d))
37498     return true;
37499 
37500   if (expand_vec_perm_palignr (d))
37501     return true;
37502 
37503   if (expand_vec_perm_interleave2 (d))
37504     return true;
37505 
37506   if (expand_vec_perm_broadcast (d))
37507     return true;
37508 
37509   if (expand_vec_perm_vpermq_perm_1 (d))
37510     return true;
37511 
37512   /* Try sequences of three instructions.  */
37513 
37514   if (expand_vec_perm_pshufb2 (d))
37515     return true;
37516 
37517   if (expand_vec_perm_interleave3 (d))
37518     return true;
37519 
37520   /* Try sequences of four instructions.  */
37521 
37522   if (expand_vec_perm_vpshufb2_vpermq (d))
37523     return true;
37524 
37525   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37526     return true;
37527 
37528   /* ??? Look for narrow permutations whose element orderings would
37529      allow the promotion to a wider mode.  */
37530 
37531   /* ??? Look for sequences of interleave or a wider permute that place
37532      the data into the correct lanes for a half-vector shuffle like
37533      pshuf[lh]w or vpermilps.  */
37534 
37535   /* ??? Look for sequences of interleave that produce the desired results.
37536      The combinatorics of punpck[lh] get pretty ugly... */
37537 
37538   if (expand_vec_perm_even_odd (d))
37539     return true;
37540 
37541   /* Even longer sequences.  */
37542   if (expand_vec_perm_vpshufb4_vpermq2 (d))
37543     return true;
37544 
37545   return false;
37546 }
37547 
37548 bool
37549 ix86_expand_vec_perm_const (rtx operands[4])
37550 {
37551   struct expand_vec_perm_d d;
37552   unsigned char perm[MAX_VECT_LEN];
37553   int i, nelt, which;
37554   rtx sel;
37555 
37556   d.target = operands[0];
37557   d.op0 = operands[1];
37558   d.op1 = operands[2];
37559   sel = operands[3];
37560 
37561   d.vmode = GET_MODE (d.target);
37562   gcc_assert (VECTOR_MODE_P (d.vmode));
37563   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37564   d.testing_p = false;
37565 
37566   gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37567   gcc_assert (XVECLEN (sel, 0) == nelt);
37568   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37569 
37570   for (i = which = 0; i < nelt; ++i)
37571     {
37572       rtx e = XVECEXP (sel, 0, i);
37573       int ei = INTVAL (e) & (2 * nelt - 1);
37574 
37575       which |= (ei < nelt ? 1 : 2);
37576       d.perm[i] = ei;
37577       perm[i] = ei;
37578     }
37579 
37580   switch (which)
37581     {
37582     default:
37583       gcc_unreachable ();
37584 
37585     case 3:
37586       if (!rtx_equal_p (d.op0, d.op1))
37587 	break;
37588 
37589       /* The elements of PERM do not suggest that only the first operand
37590 	 is used, but both operands are identical.  Allow easier matching
37591 	 of the permutation by folding the permutation into the single
37592 	 input vector.  */
37593       for (i = 0; i < nelt; ++i)
37594 	if (d.perm[i] >= nelt)
37595 	  d.perm[i] -= nelt;
37596       /* FALLTHRU */
37597 
37598     case 1:
37599       d.op1 = d.op0;
37600       break;
37601 
37602     case 2:
37603       for (i = 0; i < nelt; ++i)
37604         d.perm[i] -= nelt;
37605       d.op0 = d.op1;
37606       break;
37607     }
37608 
37609   if (ix86_expand_vec_perm_const_1 (&d))
37610     return true;
37611 
37612   /* If the mask says both arguments are needed, but they are the same,
37613      the above tried to expand with d.op0 == d.op1.  If that didn't work,
37614      retry with d.op0 != d.op1 as that is what testing has been done with.  */
37615   if (which == 3 && d.op0 == d.op1)
37616     {
37617       rtx seq;
37618       bool ok;
37619 
37620       memcpy (d.perm, perm, sizeof (perm));
37621       d.op1 = gen_reg_rtx (d.vmode);
37622       start_sequence ();
37623       ok = ix86_expand_vec_perm_const_1 (&d);
37624       seq = get_insns ();
37625       end_sequence ();
37626       if (ok)
37627 	{
37628 	  emit_move_insn (d.op1, d.op0);
37629 	  emit_insn (seq);
37630 	  return true;
37631 	}
37632     }
37633 
37634   return false;
37635 }
37636 
37637 /* Implement targetm.vectorize.vec_perm_const_ok.  */
37638 
37639 static bool
37640 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37641 				  const unsigned char *sel)
37642 {
37643   struct expand_vec_perm_d d;
37644   unsigned int i, nelt, which;
37645   bool ret, one_vec;
37646 
37647   d.vmode = vmode;
37648   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37649   d.testing_p = true;
37650 
37651   /* Given sufficient ISA support we can just return true here
37652      for selected vector modes.  */
37653   if (GET_MODE_SIZE (d.vmode) == 16)
37654     {
37655       /* All implementable with a single vpperm insn.  */
37656       if (TARGET_XOP)
37657 	return true;
37658       /* All implementable with 2 pshufb + 1 ior.  */
37659       if (TARGET_SSSE3)
37660 	return true;
37661       /* All implementable with shufpd or unpck[lh]pd.  */
37662       if (d.nelt == 2)
37663 	return true;
37664     }
37665 
37666   /* Extract the values from the vector CST into the permutation
37667      array in D.  */
37668   memcpy (d.perm, sel, nelt);
37669   for (i = which = 0; i < nelt; ++i)
37670     {
37671       unsigned char e = d.perm[i];
37672       gcc_assert (e < 2 * nelt);
37673       which |= (e < nelt ? 1 : 2);
37674     }
37675 
37676   /* For all elements from second vector, fold the elements to first.  */
37677   if (which == 2)
37678     for (i = 0; i < nelt; ++i)
37679       d.perm[i] -= nelt;
37680 
37681   /* Check whether the mask can be applied to the vector type.  */
37682   one_vec = (which != 3);
37683 
37684   /* Implementable with shufps or pshufd.  */
37685   if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37686     return true;
37687 
37688   /* Otherwise we have to go through the motions and see if we can
37689      figure out how to generate the requested permutation.  */
37690   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37691   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37692   if (!one_vec)
37693     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37694 
37695   start_sequence ();
37696   ret = ix86_expand_vec_perm_const_1 (&d);
37697   end_sequence ();
37698 
37699   return ret;
37700 }
37701 
37702 void
37703 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37704 {
37705   struct expand_vec_perm_d d;
37706   unsigned i, nelt;
37707 
37708   d.target = targ;
37709   d.op0 = op0;
37710   d.op1 = op1;
37711   d.vmode = GET_MODE (targ);
37712   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37713   d.testing_p = false;
37714 
37715   for (i = 0; i < nelt; ++i)
37716     d.perm[i] = i * 2 + odd;
37717 
37718   /* We'll either be able to implement the permutation directly...  */
37719   if (expand_vec_perm_1 (&d))
37720     return;
37721 
37722   /* ... or we use the special-case patterns.  */
37723   expand_vec_perm_even_odd_1 (&d, odd);
37724 }
37725 
37726 /* Expand an insert into a vector register through pinsr insn.
37727    Return true if successful.  */
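/* For instance, inserting a 16-bit value at bit offset 32 of a V8HImode
   destination maps to sse2_pinsrw: the bit offset is divided by the field
   size to give element index 2, which is passed to the insn pattern as
   GEN_INT (1 << 2).  */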
37728 
37729 bool
37730 ix86_expand_pinsr (rtx *operands)
37731 {
37732   rtx dst = operands[0];
37733   rtx src = operands[3];
37734 
37735   unsigned int size = INTVAL (operands[1]);
37736   unsigned int pos = INTVAL (operands[2]);
37737 
37738   if (GET_CODE (dst) == SUBREG)
37739     {
37740       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37741       dst = SUBREG_REG (dst);
37742     }
37743 
37744   if (GET_CODE (src) == SUBREG)
37745     src = SUBREG_REG (src);
37746 
37747   switch (GET_MODE (dst))
37748     {
37749     case V16QImode:
37750     case V8HImode:
37751     case V4SImode:
37752     case V2DImode:
37753       {
37754 	enum machine_mode srcmode, dstmode;
37755 	rtx (*pinsr)(rtx, rtx, rtx, rtx);
37756 
37757 	srcmode = mode_for_size (size, MODE_INT, 0);
37758 
37759 	switch (srcmode)
37760 	  {
37761 	  case QImode:
37762 	    if (!TARGET_SSE4_1)
37763 	      return false;
37764 	    dstmode = V16QImode;
37765 	    pinsr = gen_sse4_1_pinsrb;
37766 	    break;
37767 
37768 	  case HImode:
37769 	    if (!TARGET_SSE2)
37770 	      return false;
37771 	    dstmode = V8HImode;
37772 	    pinsr = gen_sse2_pinsrw;
37773 	    break;
37774 
37775 	  case SImode:
37776 	    if (!TARGET_SSE4_1)
37777 	      return false;
37778 	    dstmode = V4SImode;
37779 	    pinsr = gen_sse4_1_pinsrd;
37780 	    break;
37781 
37782 	  case DImode:
37783 	    gcc_assert (TARGET_64BIT);
37784 	    if (!TARGET_SSE4_1)
37785 	      return false;
37786 	    dstmode = V2DImode;
37787 	    pinsr = gen_sse4_1_pinsrq;
37788 	    break;
37789 
37790 	  default:
37791 	    return false;
37792 	  }
37793 
37794 	dst = gen_lowpart (dstmode, dst);
37795 	src = gen_lowpart (srcmode, src);
37796 
37797 	pos /= size;
37798 
37799 	emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37800 	return true;
37801       }
37802 
37803     default:
37804       return false;
37805     }
37806 }
37807 
37808 /* This function returns the calling-ABI-specific va_list type node.
37809    It returns the FNDECL-specific va_list type.  */
37810 
37811 static tree
37812 ix86_fn_abi_va_list (tree fndecl)
37813 {
37814   if (!TARGET_64BIT)
37815     return va_list_type_node;
37816   gcc_assert (fndecl != NULL_TREE);
37817 
37818   if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37819     return ms_va_list_type_node;
37820   else
37821     return sysv_va_list_type_node;
37822 }
37823 
37824 /* Returns the canonical va_list type specified by TYPE.  If there
37825    is no valid TYPE provided, it returns NULL_TREE.  */
37826 
37827 static tree
37828 ix86_canonical_va_list_type (tree type)
37829 {
37830   tree wtype, htype;
37831 
37832   /* Resolve references and pointers to va_list type.  */
37833   if (TREE_CODE (type) == MEM_REF)
37834     type = TREE_TYPE (type);
37835   else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE (type)))
37836     type = TREE_TYPE (type);
37837   else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37838     type = TREE_TYPE (type);
37839 
37840   if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37841     {
37842       wtype = va_list_type_node;
37843       gcc_assert (wtype != NULL_TREE);
37844       htype = type;
37845       if (TREE_CODE (wtype) == ARRAY_TYPE)
37846 	{
37847 	  /* If va_list is an array type, the argument may have decayed
37848 	     to a pointer type, e.g. by being passed to another function.
37849 	     In that case, unwrap both types so that we can compare the
37850 	     underlying records.  */
37851 	  if (TREE_CODE (htype) == ARRAY_TYPE
37852 	      || POINTER_TYPE_P (htype))
37853 	    {
37854 	      wtype = TREE_TYPE (wtype);
37855 	      htype = TREE_TYPE (htype);
37856 	    }
37857 	}
37858       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37859 	return va_list_type_node;
37860       wtype = sysv_va_list_type_node;
37861       gcc_assert (wtype != NULL_TREE);
37862       htype = type;
37863       if (TREE_CODE (wtype) == ARRAY_TYPE)
37864 	{
37865 	  /* If va_list is an array type, the argument may have decayed
37866 	     to a pointer type, e.g. by being passed to another function.
37867 	     In that case, unwrap both types so that we can compare the
37868 	     underlying records.  */
37869 	  if (TREE_CODE (htype) == ARRAY_TYPE
37870 	      || POINTER_TYPE_P (htype))
37871 	    {
37872 	      wtype = TREE_TYPE (wtype);
37873 	      htype = TREE_TYPE (htype);
37874 	    }
37875 	}
37876       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37877 	return sysv_va_list_type_node;
37878       wtype = ms_va_list_type_node;
37879       gcc_assert (wtype != NULL_TREE);
37880       htype = type;
37881       if (TREE_CODE (wtype) == ARRAY_TYPE)
37882 	{
37883 	  /* If va_list is an array type, the argument may have decayed
37884 	     to a pointer type, e.g. by being passed to another function.
37885 	     In that case, unwrap both types so that we can compare the
37886 	     underlying records.  */
37887 	  if (TREE_CODE (htype) == ARRAY_TYPE
37888 	      || POINTER_TYPE_P (htype))
37889 	    {
37890 	      wtype = TREE_TYPE (wtype);
37891 	      htype = TREE_TYPE (htype);
37892 	    }
37893 	}
37894       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37895 	return ms_va_list_type_node;
37896       return NULL_TREE;
37897     }
37898   return std_canonical_va_list_type (type);
37899 }
37900 
37901 /* Iterate through the target-specific builtin types for va_list.
37902    IDX denotes the iterator, *PTREE is set to the result type of
37903    the va_list builtin, and *PNAME to its internal type.
37904    Returns zero if there is no element for this index, otherwise
37905    IDX should be increased upon the next call.
37906    Note, do not iterate a base builtin's name like __builtin_va_list.
37907    Used from c_common_nodes_and_builtins.  */
37908 
37909 static int
37910 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37911 {
37912   if (TARGET_64BIT)
37913     {
37914       switch (idx)
37915 	{
37916 	default:
37917 	  break;
37918 
37919 	case 0:
37920 	  *ptree = ms_va_list_type_node;
37921 	  *pname = "__builtin_ms_va_list";
37922 	  return 1;
37923 
37924 	case 1:
37925 	  *ptree = sysv_va_list_type_node;
37926 	  *pname = "__builtin_sysv_va_list";
37927 	  return 1;
37928 	}
37929     }
37930 
37931   return 0;
37932 }
37933 
37934 #undef TARGET_SCHED_DISPATCH
37935 #define TARGET_SCHED_DISPATCH has_dispatch
37936 #undef TARGET_SCHED_DISPATCH_DO
37937 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37938 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37939 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37940 
37941 /* The size of the dispatch window is the total number of bytes of
37942    object code allowed in a window.  */
37943 #define DISPATCH_WINDOW_SIZE 16
37944 
37945 /* Number of dispatch windows considered for scheduling.  */
37946 #define MAX_DISPATCH_WINDOWS 3
37947 
37948 /* Maximum number of instructions in a window.  */
37949 #define MAX_INSN 4
37950 
37951 /* Maximum number of immediate operands in a window.  */
37952 #define MAX_IMM 4
37953 
37954 /* Maximum number of immediate bits allowed in a window.  */
37955 #define MAX_IMM_SIZE 128
37956 
37957 /* Maximum number of 32 bit immediates allowed in a window.  */
37958 #define MAX_IMM_32 4
37959 
37960 /* Maximum number of 64 bit immediates allowed in a window.  */
37961 #define MAX_IMM_64 2
37962 
37963 /* Maximum total of loads or prefetches allowed in a window.  */
37964 #define MAX_LOAD 2
37965 
37966 /* Maximum total of stores allowed in a window.  */
37967 #define MAX_STORE 1
37968 
37969 #undef BIG
37970 #define BIG 100
37971 
37972 
37973 /* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
37974 enum dispatch_group {
37975   disp_no_group = 0,
37976   disp_load,
37977   disp_store,
37978   disp_load_store,
37979   disp_prefetch,
37980   disp_imm,
37981   disp_imm_32,
37982   disp_imm_64,
37983   disp_branch,
37984   disp_cmp,
37985   disp_jcc,
37986   disp_last
37987 };
37988 
37989 /* Number of allowable groups in a dispatch window.  It is an array
37990    indexed by the dispatch_group enum.  100 is used as a big number,
37991    because the number of these kinds of operations does not have any
37992    effect on the dispatch window, but we need entries for them in the
37993    table for other reasons.  */
37994 static unsigned int num_allowable_groups[disp_last] = {
37995   0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37996 };
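/* Read together with the dispatch_group enum above, the entries allow per
   window: 2 loads, 1 store, 1 load-store, 2 prefetches, 4 immediates (at
   most 4 of 32 bits or 2 of 64 bits) and 1 branch; compares and jcc insns
   are effectively unconstrained here (BIG).  */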
37997 
37998 char group_name[disp_last + 1][16] = {
37999   "disp_no_group", "disp_load", "disp_store", "disp_load_store",
38000   "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
38001   "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
38002 };
38003 
38004 /* Instruction path.  */
38005 enum insn_path {
38006   no_path = 0,
38007   path_single, /* Single micro op.  */
38008   path_double, /* Double micro op.  */
38009   path_multi,  /* Instructions with more than 2 micro ops.  */
38010   last_path
38011 };
38012 
38013 /* sched_insn_info defines a window to the instructions scheduled in
38014    the basic block.  It contains a pointer to the insn_info table and
38015    the instruction scheduled.
38016 
38017    Windows are allocated for each basic block and are linked
38018    together.  */
38019 typedef struct sched_insn_info_s {
38020   rtx insn;
38021   enum dispatch_group group;
38022   enum insn_path path;
38023   int byte_len;
38024   int imm_bytes;
38025 } sched_insn_info;
38026 
38027 /* Linked list of dispatch windows.  This is a two way list of
38028    dispatch windows of a basic block.  It contains information about
38029    the number of uops in the window and the total number of
38030    instructions and of bytes in the object code for this dispatch
38031    window.  */
38032 typedef struct dispatch_windows_s {
38033   int num_insn;            /* Number of insn in the window.  */
38034   int num_uops;            /* Number of uops in the window.  */
38035   int window_size;         /* Number of bytes in the window.  */
38036   int window_num;          /* Window number, 0 or 1.  */
38037   int num_imm;             /* Number of immediates in an insn.  */
38038   int num_imm_32;          /* Number of 32 bit immediates in an insn.  */
38039   int num_imm_64;          /* Number of 64 bit immediates in an insn.  */
38040   int imm_size;            /* Total immediates in the window.  */
38041   int num_loads;           /* Total memory loads in the window.  */
38042   int num_stores;          /* Total memory stores in the window.  */
38043   int violation;          /* Violation exists in window.  */
38044   sched_insn_info *window; /* Pointer to the window.  */
38045   struct dispatch_windows_s *next;
38046   struct dispatch_windows_s *prev;
38047 } dispatch_windows;
38048 
38049 /* Immediate values used in an insn.  */
38050 typedef struct imm_info_s
38051   {
38052     int imm;
38053     int imm32;
38054     int imm64;
38055   } imm_info;
38056 
38057 static dispatch_windows *dispatch_window_list;
38058 static dispatch_windows *dispatch_window_list1;
38059 
38060 /* Get dispatch group of insn.  */
38061 
38062 static enum dispatch_group
38063 get_mem_group (rtx insn)
38064 {
38065   enum attr_memory memory;
38066 
38067   if (INSN_CODE (insn) < 0)
38068     return disp_no_group;
38069   memory = get_attr_memory (insn);
38070   if (memory == MEMORY_STORE)
38071     return disp_store;
38072 
38073   if (memory == MEMORY_LOAD)
38074     return disp_load;
38075 
38076   if (memory == MEMORY_BOTH)
38077     return disp_load_store;
38078 
38079   return disp_no_group;
38080 }
38081 
38082 /* Return true if insn is a compare instruction.  */
38083 
38084 static bool
38085 is_cmp (rtx insn)
38086 {
38087   enum attr_type type;
38088 
38089   type = get_attr_type (insn);
38090   return (type == TYPE_TEST
38091 	  || type == TYPE_ICMP
38092 	  || type == TYPE_FCMP
38093 	  || GET_CODE (PATTERN (insn)) == COMPARE);
38094 }
38095 
38096 /* Return true if a dispatch violation was encountered.  */
38097 
38098 static bool
38099 dispatch_violation (void)
38100 {
38101   if (dispatch_window_list->next)
38102     return dispatch_window_list->next->violation;
38103   return dispatch_window_list->violation;
38104 }
38105 
38106 /* Return true if insn is a branch instruction.  */
38107 
38108 static bool
38109 is_branch (rtx insn)
38110 {
38111   return (CALL_P (insn) || JUMP_P (insn));
38112 }
38113 
38114 /* Return true if insn is a prefetch instruction.  */
38115 
38116 static bool
38117 is_prefetch (rtx insn)
38118 {
38119   return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
38120 }
38121 
38122 /* This function initializes a dispatch window and the list container holding a
38123    pointer to the window.  */
38124 
38125 static void
38126 init_window (int window_num)
38127 {
38128   int i;
38129   dispatch_windows *new_list;
38130 
38131   if (window_num == 0)
38132     new_list = dispatch_window_list;
38133   else
38134     new_list = dispatch_window_list1;
38135 
38136   new_list->num_insn = 0;
38137   new_list->num_uops = 0;
38138   new_list->window_size = 0;
38139   new_list->next = NULL;
38140   new_list->prev = NULL;
38141   new_list->window_num = window_num;
38142   new_list->num_imm = 0;
38143   new_list->num_imm_32 = 0;
38144   new_list->num_imm_64 = 0;
38145   new_list->imm_size = 0;
38146   new_list->num_loads = 0;
38147   new_list->num_stores = 0;
38148   new_list->violation = false;
38149 
38150   for (i = 0; i < MAX_INSN; i++)
38151     {
38152       new_list->window[i].insn = NULL;
38153       new_list->window[i].group = disp_no_group;
38154       new_list->window[i].path = no_path;
38155       new_list->window[i].byte_len = 0;
38156       new_list->window[i].imm_bytes = 0;
38157     }
38158   return;
38159 }
38160 
38161 /* This function allocates and initializes a dispatch window and the
38162    list container holding a pointer to the window.  */
38163 
38164 static dispatch_windows *
38165 allocate_window (void)
38166 {
38167   dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38168   new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38169 
38170   return new_list;
38171 }
38172 
38173 /* This routine initializes the dispatch scheduling information.  It
38174    initiates building dispatch scheduler tables and constructs the
38175    first dispatch window.  */
38176 
38177 static void
38178 init_dispatch_sched (void)
38179 {
38180   /* Allocate a dispatch list and a window.  */
38181   dispatch_window_list = allocate_window ();
38182   dispatch_window_list1 = allocate_window ();
38183   init_window (0);
38184   init_window (1);
38185 }
38186 
38187 /* This function returns true if a branch is detected.  The end of a basic
38188    block does not have to be a branch, but here we assume that only branches
38189    end a window.  */
38190 
38191 static bool
38192 is_end_basic_block (enum dispatch_group group)
38193 {
38194   return group == disp_branch;
38195 }
38196 
38197 /* This function is called when the end of a window processing is reached.  */
38198 
38199 static void
38200 process_end_window (void)
38201 {
38202   gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38203   if (dispatch_window_list->next)
38204     {
38205       gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38206       gcc_assert (dispatch_window_list->window_size
38207 		  + dispatch_window_list1->window_size <= 48);
38208       init_window (1);
38209     }
38210   init_window (0);
38211 }
38212 
38213 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
38214    WINDOW_NUM is either 0 or 1.  A maximum of two windows are generated
38215    for 48 bytes of instructions.  Note that these windows are not dispatch
38216    windows whose sizes are DISPATCH_WINDOW_SIZE.  */
38217 
38218 static dispatch_windows *
38219 allocate_next_window (int window_num)
38220 {
38221   if (window_num == 0)
38222     {
38223       if (dispatch_window_list->next)
38224 	  init_window (1);
38225       init_window (0);
38226       return dispatch_window_list;
38227     }
38228 
38229   dispatch_window_list->next = dispatch_window_list1;
38230   dispatch_window_list1->prev = dispatch_window_list;
38231 
38232   return dispatch_window_list1;
38233 }
38234 
38235 /* Count immediate operands found in *IN_RTX, updating IMM_VALUES.  */
38236 
38237 static int
38238 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38239 {
38240   if (*in_rtx == 0)
38241     return 0;
38242 
38243   switch (GET_CODE (*in_rtx))
38244     {
38245     case CONST:
38246     case SYMBOL_REF:
38247     case CONST_INT:
38248       (imm_values->imm)++;
38249       if (x86_64_immediate_operand (*in_rtx, SImode))
38250 	(imm_values->imm32)++;
38251       else
38252 	(imm_values->imm64)++;
38253       break;
38254 
38255     case CONST_DOUBLE:
38256       (imm_values->imm)++;
38257       (imm_values->imm64)++;
38258       break;
38259 
38260     case CODE_LABEL:
38261       if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38262 	{
38263 	  (imm_values->imm)++;
38264 	  (imm_values->imm32)++;
38265 	}
38266       break;
38267 
38268     default:
38269       break;
38270     }
38271 
38272   return 0;
38273 }
38274 
38275 /* Compute number of immediate operands of an instruction.  */
38276 
38277 static void
38278 find_constant (rtx in_rtx, imm_info *imm_values)
38279 {
38280   for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38281 		(rtx_function) find_constant_1, (void *) imm_values);
38282 }
38283 
38284 /* Return the total size of the immediate operands of an instruction along
38285    with the number of corresponding immediate operands.  It initializes its
38286    parameters to zero before calling FIND_CONSTANT.
38287    INSN is the input instruction.  IMM is the total number of immediates.
38288    IMM32 is the number of 32-bit immediates.  IMM64 is the number of
38289    64-bit immediates.  */
38290 
38291 static int
38292 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38293 {
38294   imm_info imm_values = {0, 0, 0};
38295 
38296   find_constant (insn, &imm_values);
38297   *imm = imm_values.imm;
38298   *imm32 = imm_values.imm32;
38299   *imm64 = imm_values.imm64;
38300   return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38301 }
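
/* Illustrative example: for an insn such as "movl $100, %eax" the pattern
   contains a single CONST_INT that satisfies x86_64_immediate_operand in
   SImode, so *IMM and *IMM32 come back as 1, *IMM64 as 0, and the returned
   size is 4 bytes.  Since for_each_rtx visits every sub-rtx, constants that
   appear as address displacements are counted as immediates too.  */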
38302 
38303 /* This function indicates whether any operand of instruction INSN is an
38304    immediate.  */
38305 
38306 static bool
38307 has_immediate (rtx insn)
38308 {
38309   int num_imm_operand;
38310   int num_imm32_operand;
38311   int num_imm64_operand;
38312 
38313   if (insn)
38314     return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38315 			       &num_imm64_operand);
38316   return false;
38317 }
38318 
38319 /* Return the single, double or multi decode path of an instruction.  */
38320 
38321 static enum insn_path
38322 get_insn_path (rtx insn)
38323 {
38324   enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38325 
38326   if ((int)path == 0)
38327     return path_single;
38328 
38329   if ((int)path == 1)
38330     return path_double;
38331 
38332   return path_multi;
38333 }
38334 
38335 /* Return insn dispatch group.  */
38336 
38337 static enum dispatch_group
38338 get_insn_group (rtx insn)
38339 {
38340   enum dispatch_group group = get_mem_group (insn);
38341   if (group)
38342     return group;
38343 
38344   if (is_branch (insn))
38345     return disp_branch;
38346 
38347   if (is_cmp (insn))
38348     return disp_cmp;
38349 
38350   if (has_immediate (insn))
38351     return disp_imm;
38352 
38353   if (is_prefetch (insn))
38354     return disp_prefetch;
38355 
38356   return disp_no_group;
38357 }
38358 
38359 /* Count the number of GROUP-restricted instructions in a dispatch
38360    window WINDOW_LIST.  */
38361 
38362 static int
38363 count_num_restricted (rtx insn, dispatch_windows *window_list)
38364 {
38365   enum dispatch_group group = get_insn_group (insn);
38366   int imm_size;
38367   int num_imm_operand;
38368   int num_imm32_operand;
38369   int num_imm64_operand;
38370 
38371   if (group == disp_no_group)
38372     return 0;
38373 
38374   if (group == disp_imm)
38375     {
38376       imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38377 			      &num_imm64_operand);
38378       if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38379 	  || num_imm_operand + window_list->num_imm > MAX_IMM
38380 	  || (num_imm32_operand > 0
38381 	      && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38382 		  || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38383 	  || (num_imm64_operand > 0
38384 	      && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38385 		  || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38386 	  || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38387 	      && num_imm64_operand > 0
38388 	      && ((window_list->num_imm_64 > 0
38389 		   && window_list->num_insn >= 2)
38390 		  || window_list->num_insn >= 3)))
38391 	return BIG;
38392 
38393       return 1;
38394     }
38395 
38396   if ((group == disp_load_store
38397        && (window_list->num_loads >= MAX_LOAD
38398 	   || window_list->num_stores >= MAX_STORE))
38399       || ((group == disp_load
38400 	   || group == disp_prefetch)
38401 	  && window_list->num_loads >= MAX_LOAD)
38402       || (group == disp_store
38403 	  && window_list->num_stores >= MAX_STORE))
38404     return BIG;
38405 
38406   return 1;
38407 }
38408 
38409 /* This function returns true if insn satisfies dispatch rules on the
38410    last window scheduled.  */
38411 
38412 static bool
38413 fits_dispatch_window (rtx insn)
38414 {
38415   dispatch_windows *window_list = dispatch_window_list;
38416   dispatch_windows *window_list_next = dispatch_window_list->next;
38417   unsigned int num_restrict;
38418   enum dispatch_group group = get_insn_group (insn);
38419   enum insn_path path = get_insn_path (insn);
38420   int sum;
38421 
38422   /* Make disp_cmp and disp_jcc get scheduled last.  These
38423      instructions should be given the lowest priority in the
38424      Haifa scheduler so that they are scheduled in the same dispatch
38425      window as the instruction that references them.  */
38426   if (group == disp_jcc || group == disp_cmp)
38427     return false;
38428 
38429   /* Check nonrestricted.  */
38430   if (group == disp_no_group || group == disp_branch)
38431     return true;
38432 
38433   /* Get last dispatch window.  */
38434   if (window_list_next)
38435     window_list = window_list_next;
38436 
38437   if (window_list->window_num == 1)
38438     {
38439       sum = window_list->prev->window_size + window_list->window_size;
38440 
38441       if (sum == 32
38442 	  || (min_insn_size (insn) + sum) >= 48)
38443 	/* Window 1 is full.  Go for next window.  */
38444 	return true;
38445     }
38446 
38447   num_restrict = count_num_restricted (insn, window_list);
38448 
38449   if (num_restrict > num_allowable_groups[group])
38450     return false;
38451 
38452   /* See if it fits in the first window.  */
38453   if (window_list->window_num == 0)
38454     {
38455       /* The first window should have only single- and double-path
38456 	 uops.  */
38457       if (path == path_double
38458 	  && (window_list->num_uops + 2) > MAX_INSN)
38459 	return false;
38460       else if (path != path_single)
38461         return false;
38462     }
38463   return true;
38464 }
38465 
38466 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38467    dispatch window WINDOW_LIST.  */
38468 
38469 static void
38470 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38471 {
38472   int byte_len = min_insn_size (insn);
38473   int num_insn = window_list->num_insn;
38474   int imm_size;
38475   sched_insn_info *window = window_list->window;
38476   enum dispatch_group group = get_insn_group (insn);
38477   enum insn_path path = get_insn_path (insn);
38478   int num_imm_operand;
38479   int num_imm32_operand;
38480   int num_imm64_operand;
38481 
38482   if (!window_list->violation && group != disp_cmp
38483       && !fits_dispatch_window (insn))
38484     window_list->violation = true;
38485 
38486   imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38487 				 &num_imm64_operand);
38488 
38489   /* Initialize window with new instruction.  */
38490   window[num_insn].insn = insn;
38491   window[num_insn].byte_len = byte_len;
38492   window[num_insn].group = group;
38493   window[num_insn].path = path;
38494   window[num_insn].imm_bytes = imm_size;
38495 
38496   window_list->window_size += byte_len;
38497   window_list->num_insn = num_insn + 1;
38498   window_list->num_uops = window_list->num_uops + num_uops;
38499   window_list->imm_size += imm_size;
38500   window_list->num_imm += num_imm_operand;
38501   window_list->num_imm_32 += num_imm32_operand;
38502   window_list->num_imm_64 += num_imm64_operand;
38503 
38504   if (group == disp_store)
38505     window_list->num_stores += 1;
38506   else if (group == disp_load
38507 	   || group == disp_prefetch)
38508     window_list->num_loads += 1;
38509   else if (group == disp_load_store)
38510     {
38511       window_list->num_stores += 1;
38512       window_list->num_loads += 1;
38513     }
38514 }
38515 
38516 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38517    If the total bytes of instructions or the number of instructions in
38518    the window exceeds the allowable limit, it allocates a new window.  */
38519 
38520 static void
38521 add_to_dispatch_window (rtx insn)
38522 {
38523   int byte_len;
38524   dispatch_windows *window_list;
38525   dispatch_windows *next_list;
38526   dispatch_windows *window0_list;
38527   enum insn_path path;
38528   enum dispatch_group insn_group;
38529   bool insn_fits;
38530   int num_insn;
38531   int num_uops;
38532   int window_num;
38533   int insn_num_uops;
38534   int sum;
38535 
38536   if (INSN_CODE (insn) < 0)
38537     return;
38538 
38539   byte_len = min_insn_size (insn);
38540   window_list = dispatch_window_list;
38541   next_list = window_list->next;
38542   path = get_insn_path (insn);
38543   insn_group = get_insn_group (insn);
38544 
38545   /* Get the last dispatch window.  */
38546   if (next_list)
38547       window_list = dispatch_window_list->next;
38548 
38549   if (path == path_single)
38550     insn_num_uops = 1;
38551   else if (path == path_double)
38552     insn_num_uops = 2;
38553   else
38554     insn_num_uops = (int) path;
38555 
38556   /* If the current window is full, get a new window.
38557      Window number zero is full if MAX_INSN uops are scheduled in it.
38558      Window number one is full if window zero's bytes plus window
38559      one's bytes reach 32, or if adding the bytes of the new
38560      instruction brings the total to 48 or more, or if it already has
38561      MAX_INSN instructions in it.  */
38562   num_insn = window_list->num_insn;
38563   num_uops = window_list->num_uops;
38564   window_num = window_list->window_num;
38565   insn_fits = fits_dispatch_window (insn);
38566 
38567   if (num_insn >= MAX_INSN
38568       || num_uops + insn_num_uops > MAX_INSN
38569       || !(insn_fits))
38570     {
38571       window_num = ~window_num & 1;  /* Toggle between window 0 and 1.  */
38572       window_list = allocate_next_window (window_num);
38573     }
38574 
38575   if (window_num == 0)
38576     {
38577       add_insn_window (insn, window_list, insn_num_uops);
38578       if (window_list->num_insn >= MAX_INSN
38579 	  && insn_group == disp_branch)
38580 	{
38581 	  process_end_window ();
38582 	  return;
38583 	}
38584     }
38585   else if (window_num == 1)
38586     {
38587       window0_list = window_list->prev;
38588       sum = window0_list->window_size + window_list->window_size;
38589       if (sum == 32
38590 	  || (byte_len + sum) >= 48)
38591 	{
38592 	  process_end_window ();
38593 	  window_list = dispatch_window_list;
38594 	}
38595 
38596       add_insn_window (insn, window_list, insn_num_uops);
38597     }
38598   else
38599     gcc_unreachable ();
38600 
38601   if (is_end_basic_block (insn_group))
38602     {
38603       /* End of basic block reached; do end-of-basic-block processing.  */
38604       process_end_window ();
38605       return;
38606     }
38607 }
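
/* A rough walk-through of the scheme above, assuming MAX_INSN is four (its
   definition appears earlier in this file): four single-path instructions
   fill window 0; the next instruction toggles WINDOW_NUM and is placed in
   window 1; once the two windows together hold 32 bytes, or the incoming
   instruction would bring the combined size to 48 bytes or more,
   process_end_window resets both windows and dispatching resumes in
   window 0.  */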
38608 
38609 /* Print the dispatch window, WINDOW_NUM, to FILE.  */
38610 
38611 DEBUG_FUNCTION static void
38612 debug_dispatch_window_file (FILE *file, int window_num)
38613 {
38614   dispatch_windows *list;
38615   int i;
38616 
38617   if (window_num == 0)
38618     list = dispatch_window_list;
38619   else
38620     list = dispatch_window_list1;
38621 
38622   fprintf (file, "Window #%d:\n", list->window_num);
38623   fprintf (file, "  num_insn = %d, num_uops = %d, window_size = %d\n",
38624 	  list->num_insn, list->num_uops, list->window_size);
38625   fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38626 	   list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38627 
38628   fprintf (file, "  num_loads = %d, num_stores = %d\n", list->num_loads,
38629 	  list->num_stores);
38630   fprintf (file, " insn info:\n");
38631 
38632   for (i = 0; i < MAX_INSN; i++)
38633     {
38634       if (!list->window[i].insn)
38635 	break;
38636       fprintf (file, "    group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38637 	      i, group_name[list->window[i].group],
38638 	      i, (void *)list->window[i].insn,
38639 	      i, list->window[i].path,
38640 	      i, list->window[i].byte_len,
38641 	      i, list->window[i].imm_bytes);
38642     }
38643 }
38644 
38645 /* Print to stdout a dispatch window.  */
38646 
38647 DEBUG_FUNCTION void
38648 debug_dispatch_window (int window_num)
38649 {
38650   debug_dispatch_window_file (stdout, window_num);
38651 }
38652 
38653 /* Print INSN dispatch information to FILE.  */
38654 
38655 DEBUG_FUNCTION static void
38656 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38657 {
38658   int byte_len;
38659   enum insn_path path;
38660   enum dispatch_group group;
38661   int imm_size;
38662   int num_imm_operand;
38663   int num_imm32_operand;
38664   int num_imm64_operand;
38665 
38666   if (INSN_CODE (insn) < 0)
38667     return;
38668 
38669   byte_len = min_insn_size (insn);
38670   path = get_insn_path (insn);
38671   group = get_insn_group (insn);
38672   imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38673 				 &num_imm64_operand);
38674 
38675   fprintf (file, " insn info:\n");
38676   fprintf (file, "  group = %s, path = %d, byte_len = %d\n",
38677 	   group_name[group], path, byte_len);
38678   fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38679 	   num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38680 }
38681 
38682 /* Print to STDOUT the status of the ready list with respect to
38683    dispatch windows.  */
38684 
38685 DEBUG_FUNCTION void
38686 debug_ready_dispatch (void)
38687 {
38688   int i;
38689   int no_ready = number_in_ready ();
38690 
38691   fprintf (stdout, "Number of ready: %d\n", no_ready);
38692 
38693   for (i = 0; i < no_ready; i++)
38694     debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38695 }
38696 
38697 /* This routine is the driver of the dispatch scheduler.  */
38698 
38699 static void
38700 do_dispatch (rtx insn, int mode)
38701 {
38702   if (mode == DISPATCH_INIT)
38703     init_dispatch_sched ();
38704   else if (mode == ADD_TO_DISPATCH_WINDOW)
38705     add_to_dispatch_window (insn);
38706 }
38707 
38708 /* Return TRUE if the dispatch scheduling query ACTION holds for INSN.  */
38709 
38710 static bool
38711 has_dispatch (rtx insn, int action)
38712 {
38713   if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38714       && flag_dispatch_scheduler)
38715     switch (action)
38716       {
38717       default:
38718 	return false;
38719 
38720       case IS_DISPATCH_ON:
38721 	return true;
38722 	break;
38723 
38724       case IS_CMP:
38725 	return is_cmp (insn);
38726 
38727       case DISPATCH_VIOLATION:
38728 	return dispatch_violation ();
38729 
38730       case FITS_DISPATCH_WINDOW:
38731 	return fits_dispatch_window (insn);
38732       }
38733 
38734   return false;
38735 }
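
/* In practice the dispatch scheduler therefore only takes effect when
   compiling for the bdver1 or bdver2 tunings with dispatch scheduling
   requested (the -mdispatch-scheduler option sets flag_dispatch_scheduler);
   every other configuration answers false here.  */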
38736 
38737 /* Implementation of reassociation_width target hook used by
38738    reassoc phase to identify parallelism level in reassociated
38739    tree.  Statements tree_code is passed in OPC.  Arguments type
38740    is passed in MODE.
38741 
38742    Currently parallel reassociation is enabled for Atom
38743    processors only and we set reassociation width to be 2
38744    because Atom may issue up to 2 instructions per cycle.
38745 
38746    Return value should be fixed if parallel reassociation is
38747    enabled for other processors.  */
38748 
38749 static int
38750 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38751 			  enum machine_mode mode)
38752 {
38753   int res = 1;
38754 
38755   if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38756     res = 2;
38757   else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38758     res = 2;
38759 
38760   return res;
38761 }
38762 
38763 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38764    place emms and femms instructions.  */
38765 
38766 static enum machine_mode
38767 ix86_preferred_simd_mode (enum machine_mode mode)
38768 {
38769   if (!TARGET_SSE)
38770     return word_mode;
38771 
38772   switch (mode)
38773     {
38774     case QImode:
38775       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38776     case HImode:
38777       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38778     case SImode:
38779       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38780     case DImode:
38781       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38782 
38783     case SFmode:
38784       if (TARGET_AVX && !TARGET_PREFER_AVX128)
38785 	return V8SFmode;
38786       else
38787 	return V4SFmode;
38788 
38789     case DFmode:
38790       if (!TARGET_VECTORIZE_DOUBLE)
38791 	return word_mode;
38792       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38793 	return V4DFmode;
38794       else if (TARGET_SSE2)
38795 	return V2DFmode;
38796       /* FALLTHRU */
38797 
38798     default:
38799       return word_mode;
38800     }
38801 }
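
/* For example, with plain SSE the vectorizer is offered V4SFmode for
   SFmode data, while with AVX enabled and -mprefer-avx128 not given it
   is offered V8SFmode; the integer modes follow the same 128-bit/256-bit
   split.  */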
38802 
38803 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38804    vectors.  */
38805 
38806 static unsigned int
38807 ix86_autovectorize_vector_sizes (void)
38808 {
38809   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38810 }
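
/* The value above is a bitmask of vector sizes in bytes for the vectorizer
   to iterate over: 32 | 16 requests both 256-bit and 128-bit attempts,
   while 0 leaves only the preferred SIMD mode.  */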
38811 
38812 /* Initialize the GCC target structure.  */
38813 #undef TARGET_RETURN_IN_MEMORY
38814 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38815 
38816 #undef TARGET_LEGITIMIZE_ADDRESS
38817 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38818 
38819 #undef TARGET_ATTRIBUTE_TABLE
38820 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38821 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38822 #  undef TARGET_MERGE_DECL_ATTRIBUTES
38823 #  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38824 #endif
38825 
38826 #undef TARGET_COMP_TYPE_ATTRIBUTES
38827 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38828 
38829 #undef TARGET_INIT_BUILTINS
38830 #define TARGET_INIT_BUILTINS ix86_init_builtins
38831 #undef TARGET_BUILTIN_DECL
38832 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38833 #undef TARGET_EXPAND_BUILTIN
38834 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38835 
38836 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38837 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38838   ix86_builtin_vectorized_function
38839 
38840 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38841 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38842 
38843 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38844 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38845 
38846 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38847 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38848 
38849 #undef TARGET_BUILTIN_RECIPROCAL
38850 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38851 
38852 #undef TARGET_ASM_FUNCTION_EPILOGUE
38853 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38854 
38855 #undef TARGET_ENCODE_SECTION_INFO
38856 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38857 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38858 #else
38859 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38860 #endif
38861 
38862 #undef TARGET_ASM_OPEN_PAREN
38863 #define TARGET_ASM_OPEN_PAREN ""
38864 #undef TARGET_ASM_CLOSE_PAREN
38865 #define TARGET_ASM_CLOSE_PAREN ""
38866 
38867 #undef TARGET_ASM_BYTE_OP
38868 #define TARGET_ASM_BYTE_OP ASM_BYTE
38869 
38870 #undef TARGET_ASM_ALIGNED_HI_OP
38871 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38872 #undef TARGET_ASM_ALIGNED_SI_OP
38873 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38874 #ifdef ASM_QUAD
38875 #undef TARGET_ASM_ALIGNED_DI_OP
38876 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38877 #endif
38878 
38879 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38880 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38881 
38882 #undef TARGET_ASM_UNALIGNED_HI_OP
38883 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38884 #undef TARGET_ASM_UNALIGNED_SI_OP
38885 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38886 #undef TARGET_ASM_UNALIGNED_DI_OP
38887 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38888 
38889 #undef TARGET_PRINT_OPERAND
38890 #define TARGET_PRINT_OPERAND ix86_print_operand
38891 #undef TARGET_PRINT_OPERAND_ADDRESS
38892 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38893 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38894 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38895 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38896 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38897 
38898 #undef TARGET_SCHED_INIT_GLOBAL
38899 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38900 #undef TARGET_SCHED_ADJUST_COST
38901 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38902 #undef TARGET_SCHED_ISSUE_RATE
38903 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38904 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38905 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38906   ia32_multipass_dfa_lookahead
38907 
38908 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38909 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38910 
38911 #ifdef HAVE_AS_TLS
38912 #undef TARGET_HAVE_TLS
38913 #define TARGET_HAVE_TLS true
38914 #endif
38915 #undef TARGET_CANNOT_FORCE_CONST_MEM
38916 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38917 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38918 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38919 
38920 #undef TARGET_DELEGITIMIZE_ADDRESS
38921 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38922 
38923 #undef TARGET_MS_BITFIELD_LAYOUT_P
38924 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38925 
38926 #if TARGET_MACHO
38927 #undef TARGET_BINDS_LOCAL_P
38928 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38929 #endif
38930 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38931 #undef TARGET_BINDS_LOCAL_P
38932 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38933 #endif
38934 
38935 #undef TARGET_ASM_OUTPUT_MI_THUNK
38936 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38937 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38938 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38939 
38940 #undef TARGET_ASM_FILE_START
38941 #define TARGET_ASM_FILE_START x86_file_start
38942 
38943 #undef TARGET_OPTION_OVERRIDE
38944 #define TARGET_OPTION_OVERRIDE ix86_option_override
38945 
38946 #undef TARGET_REGISTER_MOVE_COST
38947 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38948 #undef TARGET_MEMORY_MOVE_COST
38949 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38950 #undef TARGET_RTX_COSTS
38951 #define TARGET_RTX_COSTS ix86_rtx_costs
38952 #undef TARGET_ADDRESS_COST
38953 #define TARGET_ADDRESS_COST ix86_address_cost
38954 
38955 #undef TARGET_FIXED_CONDITION_CODE_REGS
38956 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38957 #undef TARGET_CC_MODES_COMPATIBLE
38958 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38959 
38960 #undef TARGET_MACHINE_DEPENDENT_REORG
38961 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38962 
38963 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38964 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38965 
38966 #undef TARGET_BUILD_BUILTIN_VA_LIST
38967 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38968 
38969 #undef TARGET_ENUM_VA_LIST_P
38970 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38971 
38972 #undef TARGET_FN_ABI_VA_LIST
38973 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38974 
38975 #undef TARGET_CANONICAL_VA_LIST_TYPE
38976 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38977 
38978 #undef TARGET_EXPAND_BUILTIN_VA_START
38979 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38980 
38981 #undef TARGET_MD_ASM_CLOBBERS
38982 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38983 
38984 #undef TARGET_PROMOTE_PROTOTYPES
38985 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38986 #undef TARGET_STRUCT_VALUE_RTX
38987 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38988 #undef TARGET_SETUP_INCOMING_VARARGS
38989 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38990 #undef TARGET_MUST_PASS_IN_STACK
38991 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38992 #undef TARGET_FUNCTION_ARG_ADVANCE
38993 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38994 #undef TARGET_FUNCTION_ARG
38995 #define TARGET_FUNCTION_ARG ix86_function_arg
38996 #undef TARGET_FUNCTION_ARG_BOUNDARY
38997 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38998 #undef TARGET_PASS_BY_REFERENCE
38999 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
39000 #undef TARGET_INTERNAL_ARG_POINTER
39001 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
39002 #undef TARGET_UPDATE_STACK_BOUNDARY
39003 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
39004 #undef TARGET_GET_DRAP_RTX
39005 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
39006 #undef TARGET_STRICT_ARGUMENT_NAMING
39007 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
39008 #undef TARGET_STATIC_CHAIN
39009 #define TARGET_STATIC_CHAIN ix86_static_chain
39010 #undef TARGET_TRAMPOLINE_INIT
39011 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
39012 #undef TARGET_RETURN_POPS_ARGS
39013 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
39014 
39015 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
39016 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
39017 
39018 #undef TARGET_SCALAR_MODE_SUPPORTED_P
39019 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
39020 
39021 #undef TARGET_VECTOR_MODE_SUPPORTED_P
39022 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
39023 
39024 #undef TARGET_C_MODE_FOR_SUFFIX
39025 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
39026 
39027 #ifdef HAVE_AS_TLS
39028 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
39029 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
39030 #endif
39031 
39032 #ifdef SUBTARGET_INSERT_ATTRIBUTES
39033 #undef TARGET_INSERT_ATTRIBUTES
39034 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
39035 #endif
39036 
39037 #undef TARGET_MANGLE_TYPE
39038 #define TARGET_MANGLE_TYPE ix86_mangle_type
39039 
39040 #if !TARGET_MACHO
39041 #undef TARGET_STACK_PROTECT_FAIL
39042 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
39043 #endif
39044 
39045 #undef TARGET_FUNCTION_VALUE
39046 #define TARGET_FUNCTION_VALUE ix86_function_value
39047 
39048 #undef TARGET_FUNCTION_VALUE_REGNO_P
39049 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
39050 
39051 #undef TARGET_PROMOTE_FUNCTION_MODE
39052 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
39053 
39054 #undef TARGET_INSTANTIATE_DECLS
39055 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
39056 
39057 #undef TARGET_SECONDARY_RELOAD
39058 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
39059 
39060 #undef TARGET_CLASS_MAX_NREGS
39061 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
39062 
39063 #undef TARGET_PREFERRED_RELOAD_CLASS
39064 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
39065 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
39066 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
39067 #undef TARGET_CLASS_LIKELY_SPILLED_P
39068 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
39069 
39070 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
39071 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
39072   ix86_builtin_vectorization_cost
39073 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
39074 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
39075   ix86_vectorize_vec_perm_const_ok
39076 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
39077 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
39078   ix86_preferred_simd_mode
39079 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
39080 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
39081   ix86_autovectorize_vector_sizes
39082 
39083 #undef TARGET_SET_CURRENT_FUNCTION
39084 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
39085 
39086 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
39087 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
39088 
39089 #undef TARGET_OPTION_SAVE
39090 #define TARGET_OPTION_SAVE ix86_function_specific_save
39091 
39092 #undef TARGET_OPTION_RESTORE
39093 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
39094 
39095 #undef TARGET_OPTION_PRINT
39096 #define TARGET_OPTION_PRINT ix86_function_specific_print
39097 
39098 #undef TARGET_CAN_INLINE_P
39099 #define TARGET_CAN_INLINE_P ix86_can_inline_p
39100 
39101 #undef TARGET_EXPAND_TO_RTL_HOOK
39102 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
39103 
39104 #undef TARGET_LEGITIMATE_ADDRESS_P
39105 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
39106 
39107 #undef TARGET_LEGITIMATE_CONSTANT_P
39108 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
39109 
39110 #undef TARGET_FRAME_POINTER_REQUIRED
39111 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
39112 
39113 #undef TARGET_CAN_ELIMINATE
39114 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
39115 
39116 #undef TARGET_EXTRA_LIVE_ON_ENTRY
39117 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
39118 
39119 #undef TARGET_ASM_CODE_END
39120 #define TARGET_ASM_CODE_END ix86_code_end
39121 
39122 #undef TARGET_CONDITIONAL_REGISTER_USAGE
39123 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
39124 
39125 #if TARGET_MACHO
39126 #undef TARGET_INIT_LIBFUNCS
39127 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
39128 #endif
39129 
39130 struct gcc_target targetm = TARGET_INITIALIZER;
39131 
39132 #include "gt-i386.h"
39133