1 /* Subroutines used for code generation on IA-32.
2    Copyright (C) 1988-2018 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10 
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #define IN_TARGET_CODE 1
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
93 #include "wide-int-bitmask.h"
94 #include "debug.h"
95 #include "dwarf2out.h"
96 
97 /* This file should be included last.  */
98 #include "target-def.h"
99 
100 #include "x86-tune-costs.h"
101 
102 static rtx legitimize_dllimport_symbol (rtx, bool);
103 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
104 static rtx legitimize_pe_coff_symbol (rtx, bool);
105 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
106 static bool ix86_save_reg (unsigned int, bool, bool);
107 static bool ix86_function_naked (const_tree);
108 static bool ix86_notrack_prefixed_insn_p (rtx);
109 static void ix86_emit_restore_reg_using_pop (rtx);
110 
111 
112 #ifndef CHECK_STACK_LIMIT
113 #define CHECK_STACK_LIMIT (-1)
114 #endif
115 
116 /* Return index of given mode in mult and division cost tables.  */
117 #define MODE_INDEX(mode)					\
118   ((mode) == QImode ? 0						\
119    : (mode) == HImode ? 1					\
120    : (mode) == SImode ? 2					\
121    : (mode) == DImode ? 3					\
122    : 4)
123 
124 
125 /* Set by -mtune.  */
126 const struct processor_costs *ix86_tune_cost = NULL;
127 
128 /* Set by -mtune or -Os.  */
129 const struct processor_costs *ix86_cost = NULL;
130 
131 /* Processor feature/optimization bitmasks.  */
132 #define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386)
133 #define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486)
134 #define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM)
135 #define m_LAKEMONT (HOST_WIDE_INT_1U<<PROCESSOR_LAKEMONT)
136 #define m_PPRO (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUMPRO)
137 #define m_PENT4 (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM4)
138 #define m_NOCONA (HOST_WIDE_INT_1U<<PROCESSOR_NOCONA)
139 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
140 #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2)
141 #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
142 #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
143 #define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
144 #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL)
145 #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT)
146 #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL)
147 #define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM)
148 #define m_SKYLAKE (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE)
149 #define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512)
150 #define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE)
151 #define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT)
152 #define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)
153 #define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
154 		       | m_ICELAKE_CLIENT | m_ICELAKE_SERVER)
155 #define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
156 #define m_CORE_ALL (m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_CORE_AVX2)
157 #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
158 
159 #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
160 #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
161 #define m_K6_GEODE (m_K6 | m_GEODE)
162 #define m_K8 (HOST_WIDE_INT_1U<<PROCESSOR_K8)
163 #define m_ATHLON (HOST_WIDE_INT_1U<<PROCESSOR_ATHLON)
164 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
165 #define m_AMDFAM10 (HOST_WIDE_INT_1U<<PROCESSOR_AMDFAM10)
166 #define m_BDVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER1)
167 #define m_BDVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER2)
168 #define m_BDVER3 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER3)
169 #define m_BDVER4 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER4)
170 #define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1)
171 #define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
172 #define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
173 #define m_BDVER	(m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
174 #define m_BTVER (m_BTVER1 | m_BTVER2)
175 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
176 			| m_ZNVER1)
177 
178 #define m_GENERIC (HOST_WIDE_INT_1U<<PROCESSOR_GENERIC)
179 
180 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
181 #undef DEF_TUNE
182 #define DEF_TUNE(tune, name, selector) name,
183 #include "x86-tune.def"
184 #undef DEF_TUNE
185 };
186 
187 /* Feature tests against the various tunings.  */
188 unsigned char ix86_tune_features[X86_TUNE_LAST];
189 
190 /* Feature tests against the various tunings used to create ix86_tune_features
191    based on the processor mask.  */
192 static unsigned HOST_WIDE_INT initial_ix86_tune_features[X86_TUNE_LAST] = {
193 #undef DEF_TUNE
194 #define DEF_TUNE(tune, name, selector) selector,
195 #include "x86-tune.def"
196 #undef DEF_TUNE
197 };
198 
199 /* Feature tests against the various architecture variations.  */
200 unsigned char ix86_arch_features[X86_ARCH_LAST];
201 
202 /* Feature tests against the various architecture variations, used to create
203    ix86_arch_features based on the processor mask.  */
204 static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = {
205   /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
206   ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
207 
208   /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
209   ~m_386,
210 
211   /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
212   ~(m_386 | m_486),
213 
214   /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
215   ~m_386,
216 
217   /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
218   ~m_386,
219 };
220 
221 /* In case the average insn count for a single function invocation is
222    lower than this constant, emit fast (but longer) prologue and
223    epilogue code.  */
224 #define FAST_PROLOGUE_INSN_COUNT 20
225 
226 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively.  */
227 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
228 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
229 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
230 
231 /* Array of the smallest class containing reg number REGNO, indexed by
232    REGNO.  Used by REGNO_REG_CLASS in i386.h.  */
233 
234 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
235 {
236   /* ax, dx, cx, bx */
237   AREG, DREG, CREG, BREG,
238   /* si, di, bp, sp */
239   SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
240   /* FP registers */
241   FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
242   FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
243   /* arg pointer */
244   NON_Q_REGS,
245   /* flags, fpsr, fpcr, frame */
246   NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
247   /* SSE registers */
248   SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
249   SSE_REGS, SSE_REGS,
250   /* MMX registers */
251   MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
252   MMX_REGS, MMX_REGS,
253   /* REX registers */
254   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
255   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
256   /* SSE REX registers */
257   SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
258   SSE_REGS, SSE_REGS,
259   /* AVX-512 SSE registers */
260   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
261   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
262   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
263   EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
264   /* Mask registers.  */
265   MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
266   MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
267   /* MPX bound registers */
268   BND_REGS, BND_REGS, BND_REGS, BND_REGS,
269 };
270 
271 /* The "default" register map used in 32bit mode.  */
272 
273 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
274 {
275   0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
276   12, 13, 14, 15, 16, 17, 18, 19,	/* fp regs */
277   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
278   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE */
279   29, 30, 31, 32, 33, 34, 35, 36,       /* MMX */
280   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
281   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
282   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
283   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
284   93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
285   101, 102, 103, 104,			/* bound registers */
286 };
287 
288 /* The "default" register map used in 64bit mode.  */
289 
290 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
291 {
292   0, 1, 2, 3, 4, 5, 6, 7,		/* general regs */
293   33, 34, 35, 36, 37, 38, 39, 40,	/* fp regs */
294   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
295   17, 18, 19, 20, 21, 22, 23, 24,	/* SSE */
296   41, 42, 43, 44, 45, 46, 47, 48,       /* MMX */
297   8,9,10,11,12,13,14,15,		/* extended integer registers */
298   25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
299   67, 68, 69, 70, 71, 72, 73, 74,       /* AVX-512 registers 16-23 */
300   75, 76, 77, 78, 79, 80, 81, 82,       /* AVX-512 registers 24-31 */
301   118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
302   126, 127, 128, 129,			/* bound registers */
303 };
304 
305 /* Define the register numbers to be used in Dwarf debugging information.
306    The SVR4 reference port C compiler uses the following register numbers
307    in its Dwarf output code:
308 	0 for %eax (gcc regno = 0)
309 	1 for %ecx (gcc regno = 2)
310 	2 for %edx (gcc regno = 1)
311 	3 for %ebx (gcc regno = 3)
312 	4 for %esp (gcc regno = 7)
313 	5 for %ebp (gcc regno = 6)
314 	6 for %esi (gcc regno = 4)
315 	7 for %edi (gcc regno = 5)
316    The following three DWARF register numbers are never generated by
317    the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
318    believed these numbers to have these meanings.
319 	8  for %eip    (no gcc equivalent)
320 	9  for %eflags (gcc regno = 17)
321 	10 for %trapno (no gcc equivalent)
322    It is not at all clear how we should number the FP stack registers
323    for the x86 architecture.  If the version of SDB on x86/svr4 were
324    a bit less brain dead with respect to floating-point then we would
325    have a precedent to follow with respect to DWARF register numbers
326    for x86 FP registers, but the SDB on x86/svr4 was so completely
327    broken with respect to FP registers that it is hardly worth thinking
328    of it as something to strive for compatibility with.
329    The version of x86/svr4 SDB I had does (partially)
330    seem to believe that DWARF register number 11 is associated with
331    the x86 register %st(0), but that's about all.  Higher DWARF
332    register numbers don't seem to be associated with anything in
333    particular, and even for DWARF regno 11, SDB only seemed to under-
334    stand that it should say that a variable lives in %st(0) (when
335    asked via an `=' command) if we said it was in DWARF regno 11,
336    but SDB still printed garbage when asked for the value of the
337    variable in question (via a `/' command).
338    (Also note that the labels SDB printed for various FP stack regs
339    when doing an `x' command were all wrong.)
340    Note that these problems generally don't affect the native SVR4
341    C compiler because it doesn't allow the use of -O with -g and
342    because when it is *not* optimizing, it allocates a memory
343    location for each floating-point variable, and the memory
344    location is what gets described in the DWARF AT_location
345    attribute for the variable in question.
346    Regardless of the severe mental illness of the x86/svr4 SDB, we
347    do something sensible here and we use the following DWARF
348    register numbers.  Note that these are all stack-top-relative
349    numbers.
350 	11 for %st(0) (gcc regno = 8)
351 	12 for %st(1) (gcc regno = 9)
352 	13 for %st(2) (gcc regno = 10)
353 	14 for %st(3) (gcc regno = 11)
354 	15 for %st(4) (gcc regno = 12)
355 	16 for %st(5) (gcc regno = 13)
356 	17 for %st(6) (gcc regno = 14)
357 	18 for %st(7) (gcc regno = 15)
358 */
359 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
360 {
361   0, 2, 1, 3, 6, 7, 5, 4,		/* general regs */
362   11, 12, 13, 14, 15, 16, 17, 18,	/* fp regs */
363   -1, 9, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
364   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE registers */
365   29, 30, 31, 32, 33, 34, 35, 36,	/* MMX registers */
366   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
367   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
368   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
369   -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
370   93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
371   101, 102, 103, 104,			/* bound registers */
372 };
373 
374 /* Define parameter passing and return registers.  */
375 
376 static int const x86_64_int_parameter_registers[6] =
377 {
378   DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
379 };
380 
381 static int const x86_64_ms_abi_int_parameter_registers[4] =
382 {
383   CX_REG, DX_REG, R8_REG, R9_REG
384 };
385 
386 static int const x86_64_int_return_registers[4] =
387 {
388   AX_REG, DX_REG, DI_REG, SI_REG
389 };
390 
391 /* Additional registers that are clobbered by SYSV calls.  */
392 
393 #define NUM_X86_64_MS_CLOBBERED_REGS 12
394 static int const x86_64_ms_sysv_extra_clobbered_registers
395 		 [NUM_X86_64_MS_CLOBBERED_REGS] =
396 {
397   SI_REG, DI_REG,
398   XMM6_REG, XMM7_REG,
399   XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
400   XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
401 };
402 
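/* Kinds of out-of-line prologue/epilogue stubs used to save and restore
   the registers that SysV calls clobber in ms_abi functions: save,
   restore, and tail-call restore, each in a plain and a hard frame
   pointer (HFP) variant.  */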
403 enum xlogue_stub {
404   XLOGUE_STUB_SAVE,
405   XLOGUE_STUB_RESTORE,
406   XLOGUE_STUB_RESTORE_TAIL,
407   XLOGUE_STUB_SAVE_HFP,
408   XLOGUE_STUB_RESTORE_HFP,
409   XLOGUE_STUB_RESTORE_HFP_TAIL,
410 
411   XLOGUE_STUB_COUNT
412 };
413 
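/* Layout variants for the stubs above, selected by the incoming stack
   alignment state: 16-byte aligned, aligned plus 8 bytes of padding, and
   the corresponding hard-frame-pointer/realignment forms.  */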
414 enum xlogue_stub_sets {
415   XLOGUE_SET_ALIGNED,
416   XLOGUE_SET_ALIGNED_PLUS_8,
417   XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
418   XLOGUE_SET_HFP_ALIGNED_PLUS_8,
419 
420   XLOGUE_SET_COUNT
421 };
422 
423 /* Register save/restore layout used by out-of-line stubs.  */
424 class xlogue_layout {
425 public:
426   struct reginfo
427   {
428     unsigned regno;
429     HOST_WIDE_INT offset;	/* Offset used by stub base pointer (rax or
430 				   rsi) to where each register is stored.  */
431   };
432 
433   unsigned get_nregs () const			{return m_nregs;}
434   HOST_WIDE_INT get_stack_align_off_in () const	{return m_stack_align_off_in;}
435 
436   const reginfo &get_reginfo (unsigned reg) const
437   {
438     gcc_assert (reg < m_nregs);
439     return m_regs[reg];
440   }
441 
442   static const char *get_stub_name (enum xlogue_stub stub,
443 				    unsigned n_extra_args);
444 
445   /* Returns an rtx for the stub's symbol based upon
446        1.) the specified stub (save, restore or restore_ret) and
447        2.) the value of cfun->machine->call_ms2sysv_extra_regs and
448        3.) whether or not stack alignment is being performed.  */
449   static rtx get_stub_rtx (enum xlogue_stub stub);
450 
451   /* Returns the amount of stack space (including padding) that the stub
452      needs to store registers based upon data in the machine_function.  */
453   HOST_WIDE_INT get_stack_space_used () const
454   {
455     const struct machine_function *m = cfun->machine;
456     unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
457 
458     gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
459     return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
460   }
461 
462   /* Returns the offset for the base pointer used by the stub.  */
463   HOST_WIDE_INT get_stub_ptr_offset () const
464   {
465     return STUB_INDEX_OFFSET + m_stack_align_off_in;
466   }
467 
468   static const struct xlogue_layout &get_instance ();
469   static unsigned count_stub_managed_regs ();
470   static bool is_stub_managed_reg (unsigned regno, unsigned count);
471 
472   static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
473   static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
474   static const unsigned MAX_REGS = 18;
475   static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
476   static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
477   static const unsigned STUB_NAME_MAX_LEN = 20;
478   static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
479   static const unsigned REG_ORDER[MAX_REGS];
480   static const unsigned REG_ORDER_REALIGN[MAX_REGS];
481 
482 private:
483   xlogue_layout ();
484   xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
485   xlogue_layout (const xlogue_layout &);
486 
487   /* True if hard frame pointer is used.  */
488   bool m_hfp;
489 
490   /* Max number of registers this layout manages.  */
491   unsigned m_nregs;
492 
493   /* Incoming offset from 16-byte alignment.  */
494   HOST_WIDE_INT m_stack_align_off_in;
495 
496   /* Register order and offsets.  */
497   struct reginfo m_regs[MAX_REGS];
498 
499   /* Lazy-inited cache of symbol names for stubs.  */
500   static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
501 			  [STUB_NAME_MAX_LEN];
502 
503   static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
504 };
505 
506 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
507   "savms64",
508   "resms64",
509   "resms64x",
510   "savms64f",
511   "resms64f",
512   "resms64fx"
513 };
514 
515 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
516 /* The below offset values are where each register is stored for the layout
517    relative to incoming stack pointer.  The value of each m_regs[].offset will
518    be relative to the incoming base pointer (rax or rsi) used by the stub.
519 
520     s_instances:   0		1		2		3
521     Offset:					realigned or	aligned + 8
522     Register	   aligned	aligned + 8	aligned w/HFP	w/HFP	*/
523     XMM15_REG,	/* 0x10		0x18		0x10		0x18	*/
524     XMM14_REG,	/* 0x20		0x28		0x20		0x28	*/
525     XMM13_REG,	/* 0x30		0x38		0x30		0x38	*/
526     XMM12_REG,	/* 0x40		0x48		0x40		0x48	*/
527     XMM11_REG,	/* 0x50		0x58		0x50		0x58	*/
528     XMM10_REG,	/* 0x60		0x68		0x60		0x68	*/
529     XMM9_REG,	/* 0x70		0x78		0x70		0x78	*/
530     XMM8_REG,	/* 0x80		0x88		0x80		0x88	*/
531     XMM7_REG,	/* 0x90		0x98		0x90		0x98	*/
532     XMM6_REG,	/* 0xa0		0xa8		0xa0		0xa8	*/
533     SI_REG,	/* 0xa8		0xb0		0xa8		0xb0	*/
534     DI_REG,	/* 0xb0		0xb8		0xb0		0xb8	*/
535     BX_REG,	/* 0xb8		0xc0		0xb8		0xc0	*/
536     BP_REG,	/* 0xc0		0xc8		N/A		N/A	*/
537     R12_REG,	/* 0xc8		0xd0		0xc0		0xc8	*/
538     R13_REG,	/* 0xd0		0xd8		0xc8		0xd0	*/
539     R14_REG,	/* 0xd8		0xe0		0xd0		0xd8	*/
540     R15_REG,	/* 0xe0		0xe8		0xd8		0xe0	*/
541 };
542 
543 /* Instantiate static const values.  */
544 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
545 const unsigned xlogue_layout::MIN_REGS;
546 const unsigned xlogue_layout::MAX_REGS;
547 const unsigned xlogue_layout::MAX_EXTRA_REGS;
548 const unsigned xlogue_layout::VARIANT_COUNT;
549 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
550 
551 /* Initialize xlogue_layout::s_stub_names to zero.  */
552 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
553 				[STUB_NAME_MAX_LEN];
554 
555 /* Instantiates all xlogue_layout instances.  */
556 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
557   xlogue_layout (0, false),
558   xlogue_layout (8, false),
559   xlogue_layout (0, true),
560   xlogue_layout (8, true)
561 };
562 
563 /* Return an appropriate const instance of xlogue_layout based upon values
564    in cfun->machine and crtl.  */
565 const struct xlogue_layout &
566 xlogue_layout::get_instance ()
567 {
568   enum xlogue_stub_sets stub_set;
569   bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
570 
571   if (stack_realign_fp)
572     stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
573   else if (frame_pointer_needed)
574     stub_set = aligned_plus_8
575 	      ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
576 	      : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
577   else
578     stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
579 
580   return s_instances[stub_set];
581 }
582 
583 /* Determine how many clobbered registers can be saved by the stub.
584    Returns the count of registers the stub will save and restore.  */
585 unsigned
586 xlogue_layout::count_stub_managed_regs ()
587 {
588   bool hfp = frame_pointer_needed || stack_realign_fp;
589   unsigned i, count;
590   unsigned regno;
591 
592   for (count = i = MIN_REGS; i < MAX_REGS; ++i)
593     {
594       regno = REG_ORDER[i];
595       if (regno == BP_REG && hfp)
596 	continue;
597       if (!ix86_save_reg (regno, false, false))
598 	break;
599       ++count;
600     }
601   return count;
602 }
603 
604 /* Determine if register REGNO is a stub managed register given the
605    total COUNT of stub managed registers.  */
606 bool
607 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
608 {
609   bool hfp = frame_pointer_needed || stack_realign_fp;
610   unsigned i;
611 
612   for (i = 0; i < count; ++i)
613     {
614       gcc_assert (i < MAX_REGS);
615       if (REG_ORDER[i] == BP_REG && hfp)
616 	++count;
617       else if (REG_ORDER[i] == regno)
618 	return true;
619     }
620   return false;
621 }
622 
623 /* Constructor for xlogue_layout.  */
624 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
625   : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
626     m_stack_align_off_in (stack_align_off_in)
627 {
628   HOST_WIDE_INT offset = stack_align_off_in;
629   unsigned i, j;
630 
631   for (i = j = 0; i < MAX_REGS; ++i)
632     {
633       unsigned regno = REG_ORDER[i];
634 
635       if (regno == BP_REG && hfp)
636 	continue;
637       if (SSE_REGNO_P (regno))
638 	{
639 	  offset += 16;
640 	  /* Verify that SSE regs are always aligned.  */
641 	  gcc_assert (!((stack_align_off_in + offset) & 15));
642 	}
643       else
644 	offset += 8;
645 
646       m_regs[j].regno    = regno;
647       m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
648     }
649   gcc_assert (j == m_nregs);
650 }
651 
652 const char *
653 xlogue_layout::get_stub_name (enum xlogue_stub stub,
654 			      unsigned n_extra_regs)
655 {
656   const int have_avx = TARGET_AVX;
657   char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
658 
659   /* Lazy init */
660   if (!*name)
661     {
662       int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
663 			  (have_avx ? "avx" : "sse"),
664 			  STUB_BASE_NAMES[stub],
665 			  MIN_REGS + n_extra_regs);
666       gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
667     }
668 
669   return name;
670 }
671 
672 /* Return rtx of a symbol ref for the entry point (based upon
673    cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
674 rtx
675 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
676 {
677   const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
678   gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
679   gcc_assert (stub < XLOGUE_STUB_COUNT);
680   gcc_assert (crtl->stack_realign_finalized);
681 
682   return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
683 }
684 
685 /* Define the structure for the machine field in struct function.  */
686 
687 struct GTY(()) stack_local_entry {
688   unsigned short mode;
689   unsigned short n;
690   rtx rtl;
691   struct stack_local_entry *next;
692 };
693 
694 /* Which cpu are we scheduling for.  */
695 enum attr_cpu ix86_schedule;
696 
697 /* Which cpu are we optimizing for.  */
698 enum processor_type ix86_tune;
699 
700 /* Which instruction set architecture to use.  */
701 enum processor_type ix86_arch;
702 
703 /* True if processor has SSE prefetch instruction.  */
704 unsigned char x86_prefetch_sse;
705 
706 /* -mstackrealign option */
707 static const char ix86_force_align_arg_pointer_string[]
708   = "force_align_arg_pointer";
709 
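/* Indirect insn pattern generators.  They are initialized during option
   processing to the variant matching the target word size, so the rest
   of the backend need not test TARGET_64BIT at every use.  */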
710 static rtx (*ix86_gen_leave) (void);
711 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
713 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
714 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
715 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
716 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
717 static rtx (*ix86_gen_clzero) (rtx);
718 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
719 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
720 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
721 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
722 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
723 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
724 
725 /* Preferred alignment for stack boundary in bits.  */
726 unsigned int ix86_preferred_stack_boundary;
727 
728 /* Alignment for incoming stack boundary in bits specified at
729    command line.  */
730 static unsigned int ix86_user_incoming_stack_boundary;
731 
732 /* Default alignment for incoming stack boundary in bits.  */
733 static unsigned int ix86_default_incoming_stack_boundary;
734 
735 /* Alignment for incoming stack boundary in bits.  */
736 unsigned int ix86_incoming_stack_boundary;
737 
738 /* Calling-ABI-specific va_list type nodes.  */
739 static GTY(()) tree sysv_va_list_type_node;
740 static GTY(()) tree ms_va_list_type_node;
741 
742 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL.  */
743 char internal_label_prefix[16];
744 int internal_label_prefix_len;
745 
746 /* Fence to use after loop using movnt.  */
747 tree x86_mfence;
748 
749 /* Register class used for passing a given 64-bit part of the argument.
750    These represent classes as documented by the PS ABI, except that the
751    SSESF and SSEDF classes are basically the SSE class; GCC just uses an
752    SFmode or DFmode move instead of DImode to avoid reformatting penalties.
753 
754    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
755    whenever possible (the upper half does contain padding).  */
756 enum x86_64_reg_class
757   {
758     X86_64_NO_CLASS,
759     X86_64_INTEGER_CLASS,
760     X86_64_INTEGERSI_CLASS,
761     X86_64_SSE_CLASS,
762     X86_64_SSESF_CLASS,
763     X86_64_SSEDF_CLASS,
764     X86_64_SSEUP_CLASS,
765     X86_64_X87_CLASS,
766     X86_64_X87UP_CLASS,
767     X86_64_COMPLEX_X87_CLASS,
768     X86_64_MEMORY_CLASS
769   };
770 
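/* Maximum number of eightbyte chunks an argument may be split into when
   it is classified for the 64-bit ABI.  */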
771 #define MAX_CLASSES 8
772 
773 /* Table of constants used by fldpi, fldln2, etc....  */
774 static REAL_VALUE_TYPE ext_80387_constants_table [5];
775 static bool ext_80387_constants_init;
776 
777 
778 static struct machine_function * ix86_init_machine_status (void);
779 static rtx ix86_function_value (const_tree, const_tree, bool);
780 static bool ix86_function_value_regno_p (const unsigned int);
781 static unsigned int ix86_function_arg_boundary (machine_mode,
782 						const_tree);
783 static rtx ix86_static_chain (const_tree, bool);
784 static int ix86_function_regparm (const_tree, const_tree);
785 static void ix86_compute_frame_layout (void);
786 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
787 						 rtx, rtx, int);
788 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
789 static tree ix86_canonical_va_list_type (tree);
790 static void predict_jump (int);
791 static unsigned int split_stack_prologue_scratch_regno (void);
792 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
793 
794 enum ix86_function_specific_strings
795 {
796   IX86_FUNCTION_SPECIFIC_ARCH,
797   IX86_FUNCTION_SPECIFIC_TUNE,
798   IX86_FUNCTION_SPECIFIC_MAX
799 };
800 
801 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
802 				 const char *, const char *, enum fpmath_unit,
803 				 bool);
804 static void ix86_function_specific_save (struct cl_target_option *,
805 					 struct gcc_options *opts);
806 static void ix86_function_specific_restore (struct gcc_options *opts,
807 					    struct cl_target_option *);
808 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
809 static void ix86_function_specific_print (FILE *, int,
810 					  struct cl_target_option *);
811 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
812 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
813 						 struct gcc_options *,
814 						 struct gcc_options *,
815 						 struct gcc_options *);
816 static bool ix86_can_inline_p (tree, tree);
817 static void ix86_set_current_function (tree);
818 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
819 
820 static enum calling_abi ix86_function_abi (const_tree);
821 
822 
823 #ifndef SUBTARGET32_DEFAULT_CPU
824 #define SUBTARGET32_DEFAULT_CPU "i386"
825 #endif
826 
827 /* Whether -mtune= or -march= were specified */
828 static int ix86_tune_defaulted;
829 static int ix86_arch_specified;
830 
831 /* Vectorization library interface and handlers.  */
832 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
833 
834 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
835 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
836 
837 /* Processor target table, indexed by processor number */
838 struct ptt
839 {
840   const char *const name;			/* processor name  */
841   const struct processor_costs *cost;		/* Processor costs */
842   const int align_loop;				/* Default alignments.  */
843   const int align_loop_max_skip;
844   const int align_jump;
845   const int align_jump_max_skip;
846   const int align_func;
847 };
848 
849 /* This table must be in sync with enum processor_type in i386.h.  */
850 static const struct ptt processor_target_table[PROCESSOR_max] =
851 {
852   {"generic", &generic_cost, 16, 10, 16, 10, 16},
853   {"i386", &i386_cost, 4, 3, 4, 3, 4},
854   {"i486", &i486_cost, 16, 15, 16, 15, 16},
855   {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
856   {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
857   {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
858   {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
859   {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
860   {"core2", &core_cost, 16, 10, 16, 10, 16},
861   {"nehalem", &core_cost, 16, 10, 16, 10, 16},
862   {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
863   {"haswell", &core_cost, 16, 10, 16, 10, 16},
864   {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
865   {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
866   {"knl", &slm_cost, 16, 15, 16, 7, 16},
867   {"knm", &slm_cost, 16, 15, 16, 7, 16},
868   {"skylake", &skylake_cost, 16, 10, 16, 10, 16},
869   {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
870   {"cannonlake", &skylake_cost, 16, 10, 16, 10, 16},
871   {"icelake-client", &skylake_cost, 16, 10, 16, 10, 16},
872   {"icelake-server", &skylake_cost, 16, 10, 16, 10, 16},
873   {"intel", &intel_cost, 16, 15, 16, 7, 16},
874   {"geode", &geode_cost, 0, 0, 0, 0, 0},
875   {"k6", &k6_cost, 32, 7, 32, 7, 32},
876   {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
877   {"k8", &k8_cost, 16, 7, 16, 7, 16},
878   {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
879   {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
880   {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
881   {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
882   {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
883   {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
884   {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
885   {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
886 };
887 
888 static unsigned int
889 rest_of_handle_insert_vzeroupper (void)
890 {
891   int i;
892 
893   /* vzeroupper instructions are inserted immediately after reload to
894      account for possible spills from 256bit or 512bit registers.  The pass
895      reuses mode switching infrastructure by re-running mode insertion
896      pass, so disable entities that have already been processed.  */
897   for (i = 0; i < MAX_386_ENTITIES; i++)
898     ix86_optimize_mode_switching[i] = 0;
899 
900   ix86_optimize_mode_switching[AVX_U128] = 1;
901 
902   /* Call optimize_mode_switching.  */
903   g->get_passes ()->execute_pass_mode_switching ();
904   return 0;
905 }
906 
907 /* Return true if INSN uses or defines a hard register.
908    Hard register uses in a memory address are ignored.
909    Clobbers and flags definitions are ignored.  */
910 
911 static bool
912 has_non_address_hard_reg (rtx_insn *insn)
913 {
914   df_ref ref;
915   FOR_EACH_INSN_DEF (ref, insn)
916     if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
917 	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
918 	&& DF_REF_REGNO (ref) != FLAGS_REG)
919       return true;
920 
921   FOR_EACH_INSN_USE (ref, insn)
922     if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
923       return true;
924 
925   return false;
926 }
927 
928 /* Check if comparison INSN may be transformed
929    into a vector comparison.  Currently we transform
930    only zero checks, which look like:
931 
932    (set (reg:CCZ 17 flags)
933         (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
934                              (subreg:SI (reg:DI x) 0))
935 		     (const_int 0 [0])))  */
936 
937 static bool
938 convertible_comparison_p (rtx_insn *insn)
939 {
940   if (!TARGET_SSE4_1)
941     return false;
942 
943   rtx def_set = single_set (insn);
944 
945   gcc_assert (def_set);
946 
947   rtx src = SET_SRC (def_set);
948   rtx dst = SET_DEST (def_set);
949 
950   gcc_assert (GET_CODE (src) == COMPARE);
951 
952   if (GET_CODE (dst) != REG
953       || REGNO (dst) != FLAGS_REG
954       || GET_MODE (dst) != CCZmode)
955     return false;
956 
957   rtx op1 = XEXP (src, 0);
958   rtx op2 = XEXP (src, 1);
959 
960   if (op2 != CONST0_RTX (GET_MODE (op2)))
961     return false;
962 
963   if (GET_CODE (op1) != IOR)
964     return false;
965 
966   op2 = XEXP (op1, 1);
967   op1 = XEXP (op1, 0);
968 
969   if (!SUBREG_P (op1)
970       || !SUBREG_P (op2)
971       || GET_MODE (op1) != SImode
972       || GET_MODE (op2) != SImode
973       || ((SUBREG_BYTE (op1) != 0
974 	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
975 	  && (SUBREG_BYTE (op2) != 0
976 	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
977     return false;
978 
979   op1 = SUBREG_REG (op1);
980   op2 = SUBREG_REG (op2);
981 
982   if (op1 != op2
983       || !REG_P (op1)
984       || GET_MODE (op1) != DImode)
985     return false;
986 
987   return true;
988 }
989 
990 /* The DImode version of scalar_to_vector_candidate_p.  */
991 
992 static bool
993 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
994 {
995   rtx def_set = single_set (insn);
996 
997   if (!def_set)
998     return false;
999 
1000   if (has_non_address_hard_reg (insn))
1001     return false;
1002 
1003   rtx src = SET_SRC (def_set);
1004   rtx dst = SET_DEST (def_set);
1005 
1006   if (GET_CODE (src) == COMPARE)
1007     return convertible_comparison_p (insn);
1008 
1009   /* We are interested in DImode promotion only.  */
1010   if ((GET_MODE (src) != DImode
1011        && !CONST_INT_P (src))
1012       || GET_MODE (dst) != DImode)
1013     return false;
1014 
1015   if (!REG_P (dst) && !MEM_P (dst))
1016     return false;
1017 
1018   switch (GET_CODE (src))
1019     {
1020     case ASHIFTRT:
1021       if (!TARGET_AVX512VL)
1022 	return false;
1023       /* FALLTHRU */
1024 
1025     case ASHIFT:
1026     case LSHIFTRT:
1027       if (!CONST_INT_P (XEXP (src, 1))
1028 	  || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63))
1029 	return false;
1030       break;
1031 
1032     case PLUS:
1033     case MINUS:
1034     case IOR:
1035     case XOR:
1036     case AND:
1037       if (!REG_P (XEXP (src, 1))
1038 	  && !MEM_P (XEXP (src, 1))
1039 	  && !CONST_INT_P (XEXP (src, 1)))
1040 	return false;
1041 
1042       if (GET_MODE (XEXP (src, 1)) != DImode
1043 	  && !CONST_INT_P (XEXP (src, 1)))
1044 	return false;
1045       break;
1046 
1047     case NEG:
1048     case NOT:
1049       break;
1050 
1051     case REG:
1052       return true;
1053 
1054     case MEM:
1055     case CONST_INT:
1056       return REG_P (dst);
1057 
1058     default:
1059       return false;
1060     }
1061 
1062   if (!REG_P (XEXP (src, 0))
1063       && !MEM_P (XEXP (src, 0))
1064       && !CONST_INT_P (XEXP (src, 0))
1065       /* Check for andnot case.  */
1066       && (GET_CODE (src) != AND
1067 	  || GET_CODE (XEXP (src, 0)) != NOT
1068 	  || !REG_P (XEXP (XEXP (src, 0), 0))))
1069       return false;
1070 
1071   if (GET_MODE (XEXP (src, 0)) != DImode
1072       && !CONST_INT_P (XEXP (src, 0)))
1073     return false;
1074 
1075   return true;
1076 }
1077 
1078 /* The TImode version of scalar_to_vector_candidate_p.  */
1079 
1080 static bool
1081 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1082 {
1083   rtx def_set = single_set (insn);
1084 
1085   if (!def_set)
1086     return false;
1087 
1088   if (has_non_address_hard_reg (insn))
1089     return false;
1090 
1091   rtx src = SET_SRC (def_set);
1092   rtx dst = SET_DEST (def_set);
1093 
1094   /* Only TImode load and store are allowed.  */
1095   /* Only TImode loads and stores are allowed.  */
1096     return false;
1097 
1098   if (MEM_P (dst))
1099     {
1100       /* Check for a store.  Memory must be aligned, or an unaligned
1101 	 store must be optimal.  Only support stores from a register, a
1102 	 standard SSE constant, or a CONST_WIDE_INT from a piecewise store.
1103 
1104 	 ??? Verify performance impact before enabling CONST_INT for
1105 	 __int128 store.  */
1106       if (misaligned_operand (dst, TImode)
1107 	  && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1108 	return false;
1109 
1110       switch (GET_CODE (src))
1111 	{
1112 	default:
1113 	  return false;
1114 
1115 	case REG:
1116 	case CONST_WIDE_INT:
1117 	  return true;
1118 
1119 	case CONST_INT:
1120 	  return standard_sse_constant_p (src, TImode);
1121 	}
1122     }
1123   else if (MEM_P (src))
1124     {
1125       /* Check for a load.  Memory must be aligned, or an unaligned load
1126 	 must be optimal.  */
1127       return (REG_P (dst)
1128 	      && (!misaligned_operand (src, TImode)
1129 		  || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1130     }
1131 
1132   return false;
1133 }
1134 
1135 /* Return true if INSN may be converted into a vector
1136    instruction.  */
1137 
1138 static bool
1139 scalar_to_vector_candidate_p (rtx_insn *insn)
1140 {
1141   if (TARGET_64BIT)
1142     return timode_scalar_to_vector_candidate_p (insn);
1143   else
1144     return dimode_scalar_to_vector_candidate_p (insn);
1145 }
1146 
1147 /* The DImode version of remove_non_convertible_regs.  */
1148 
1149 static void
1150 dimode_remove_non_convertible_regs (bitmap candidates)
1151 {
1152   bitmap_iterator bi;
1153   unsigned id;
1154   bitmap regs = BITMAP_ALLOC (NULL);
1155 
1156   EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1157     {
1158       rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1159       rtx reg = SET_DEST (def_set);
1160 
1161       if (!REG_P (reg)
1162 	  || bitmap_bit_p (regs, REGNO (reg))
1163 	  || HARD_REGISTER_P (reg))
1164 	continue;
1165 
1166       for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1167 	   def;
1168 	   def = DF_REF_NEXT_REG (def))
1169 	{
1170 	  if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1171 	    {
1172 	      if (dump_file)
1173 		fprintf (dump_file,
1174 			 "r%d has non convertible definition in insn %d\n",
1175 			 REGNO (reg), DF_REF_INSN_UID (def));
1176 
1177 	      bitmap_set_bit (regs, REGNO (reg));
1178 	      break;
1179 	    }
1180 	}
1181     }
1182 
1183   EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1184     {
1185       for (df_ref def = DF_REG_DEF_CHAIN (id);
1186 	   def;
1187 	   def = DF_REF_NEXT_REG (def))
1188 	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1189 	  {
1190 	    if (dump_file)
1191 	      fprintf (dump_file, "Removing insn %d from candidates list\n",
1192 		       DF_REF_INSN_UID (def));
1193 
1194 	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1195 	  }
1196     }
1197 
1198   BITMAP_FREE (regs);
1199 }
1200 
1201 /* For a register REGNO, scan instructions for its defs and uses.
1202    Put REGNO in REGS if a def or use isn't in CANDIDATES.  */
1203 
1204 static void
1205 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1206 				   unsigned int regno)
1207 {
1208   for (df_ref def = DF_REG_DEF_CHAIN (regno);
1209        def;
1210        def = DF_REF_NEXT_REG (def))
1211     {
1212       if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1213 	{
1214 	  if (dump_file)
1215 	    fprintf (dump_file,
1216 		     "r%d has non convertible def in insn %d\n",
1217 		     regno, DF_REF_INSN_UID (def));
1218 
1219 	  bitmap_set_bit (regs, regno);
1220 	  break;
1221 	}
1222     }
1223 
1224   for (df_ref ref = DF_REG_USE_CHAIN (regno);
1225        ref;
1226        ref = DF_REF_NEXT_REG (ref))
1227     {
1228       /* Debug instructions are skipped.  */
1229       if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1230 	  && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1231 	{
1232 	  if (dump_file)
1233 	    fprintf (dump_file,
1234 		     "r%d has non convertible use in insn %d\n",
1235 		     regno, DF_REF_INSN_UID (ref));
1236 
1237 	  bitmap_set_bit (regs, regno);
1238 	  break;
1239 	}
1240     }
1241 }
1242 
1243 /* The TImode version of remove_non_convertible_regs.  */
1244 
1245 static void
1246 timode_remove_non_convertible_regs (bitmap candidates)
1247 {
1248   bitmap_iterator bi;
1249   unsigned id;
1250   bitmap regs = BITMAP_ALLOC (NULL);
1251 
1252   EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1253     {
1254       rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1255       rtx dest = SET_DEST (def_set);
1256       rtx src = SET_SRC (def_set);
1257 
1258       if ((!REG_P (dest)
1259 	   || bitmap_bit_p (regs, REGNO (dest))
1260 	   || HARD_REGISTER_P (dest))
1261 	  && (!REG_P (src)
1262 	      || bitmap_bit_p (regs, REGNO (src))
1263 	      || HARD_REGISTER_P (src)))
1264 	continue;
1265 
1266       if (REG_P (dest))
1267 	timode_check_non_convertible_regs (candidates, regs,
1268 					   REGNO (dest));
1269 
1270       if (REG_P (src))
1271 	timode_check_non_convertible_regs (candidates, regs,
1272 					   REGNO (src));
1273     }
1274 
1275   EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1276     {
1277       for (df_ref def = DF_REG_DEF_CHAIN (id);
1278 	   def;
1279 	   def = DF_REF_NEXT_REG (def))
1280 	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1281 	  {
1282 	    if (dump_file)
1283 	      fprintf (dump_file, "Removing insn %d from candidates list\n",
1284 		       DF_REF_INSN_UID (def));
1285 
1286 	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1287 	  }
1288 
1289       for (df_ref ref = DF_REG_USE_CHAIN (id);
1290 	   ref;
1291 	   ref = DF_REF_NEXT_REG (ref))
1292 	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1293 	  {
1294 	    if (dump_file)
1295 	      fprintf (dump_file, "Removing insn %d from candidates list\n",
1296 		       DF_REF_INSN_UID (ref));
1297 
1298 	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1299 	  }
1300     }
1301 
1302   BITMAP_FREE (regs);
1303 }
1304 
1305 /* For a given bitmap of insn UIDs, scan all instructions and
1306    remove an insn from CANDIDATES if it has both convertible
1307    and non-convertible definitions.
1308 
1309    All insns in a bitmap are conversion candidates according to
1310    scalar_to_vector_candidate_p.  Currently it implies all insns
1311    are single_set.  */
1312 
1313 static void
1314 remove_non_convertible_regs (bitmap candidates)
1315 {
1316   if (TARGET_64BIT)
1317     timode_remove_non_convertible_regs (candidates);
1318   else
1319     dimode_remove_non_convertible_regs (candidates);
1320 }
1321 
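/* A chain of instructions, connected through the pseudo registers they
   define and use, that the scalar-to-vector (STV) pass analyzes and
   converts to vector mode as a single unit.  */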
1322 class scalar_chain
1323 {
1324  public:
1325   scalar_chain ();
1326   virtual ~scalar_chain ();
1327 
1328   static unsigned max_id;
1329 
1330   /* ID of a chain.  */
1331   unsigned int chain_id;
1332   /* A queue of instructions to be included into a chain.  */
1333   bitmap queue;
1334   /* Instructions included into a chain.  */
1335   bitmap insns;
1336   /* All registers defined by a chain.  */
1337   bitmap defs;
1338   /* Registers used in both vector and scalar modes.  */
1339   bitmap defs_conv;
1340 
1341   void build (bitmap candidates, unsigned insn_uid);
1342   virtual int compute_convert_gain () = 0;
1343   int convert ();
1344 
1345  protected:
1346   void add_to_queue (unsigned insn_uid);
1347   void emit_conversion_insns (rtx insns, rtx_insn *pos);
1348 
1349  private:
1350   void add_insn (bitmap candidates, unsigned insn_uid);
1351   void analyze_register_chain (bitmap candidates, df_ref ref);
1352   virtual void mark_dual_mode_def (df_ref def) = 0;
1353   virtual void convert_insn (rtx_insn *insn) = 0;
1354   virtual void convert_registers () = 0;
1355 };
1356 
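/* Chain converting DImode computations to V2DImode (the !TARGET_64BIT
   case).  */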
1357 class dimode_scalar_chain : public scalar_chain
1358 {
1359  public:
1360   int compute_convert_gain ();
1361  private:
1362   void mark_dual_mode_def (df_ref def);
1363   rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1364   void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1365   void convert_insn (rtx_insn *insn);
1366   void convert_op (rtx *op, rtx_insn *insn);
1367   void convert_reg (unsigned regno);
1368   void make_vector_copies (unsigned regno);
1369   void convert_registers ();
1370   int vector_const_cost (rtx exp);
1371 };
1372 
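/* Chain converting TImode loads and stores to V1TImode (the TARGET_64BIT
   case).  */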
1373 class timode_scalar_chain : public scalar_chain
1374 {
1375  public:
1376   /* Converting from TImode to V1TImode is always faster.  */
1377   int compute_convert_gain () { return 1; }
1378 
1379  private:
1380   void mark_dual_mode_def (df_ref def);
1381   void fix_debug_reg_uses (rtx reg);
1382   void convert_insn (rtx_insn *insn);
1383   /* We don't convert registers to a different size.  */
1384   void convert_registers () {}
1385 };
1386 
1387 unsigned scalar_chain::max_id = 0;
1388 
1389 /* Initialize new chain.  */
1390 
1391 scalar_chain::scalar_chain ()
1392 {
1393   chain_id = ++max_id;
1394 
1395    if (dump_file)
1396     fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1397 
1398   bitmap_obstack_initialize (NULL);
1399   insns = BITMAP_ALLOC (NULL);
1400   defs = BITMAP_ALLOC (NULL);
1401   defs_conv = BITMAP_ALLOC (NULL);
1402   queue = NULL;
1403 }
1404 
1405 /* Free chain's data.  */
1406 
1407 scalar_chain::~scalar_chain ()
1408 {
1409   BITMAP_FREE (insns);
1410   BITMAP_FREE (defs);
1411   BITMAP_FREE (defs_conv);
1412   bitmap_obstack_release (NULL);
1413 }
1414 
1415 /* Add an instruction to the chain's queue.  */
1416 
1417 void
1418 scalar_chain::add_to_queue (unsigned insn_uid)
1419 {
1420   if (bitmap_bit_p (insns, insn_uid)
1421       || bitmap_bit_p (queue, insn_uid))
1422     return;
1423 
1424   if (dump_file)
1425     fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
1426 	     insn_uid, chain_id);
1427   bitmap_set_bit (queue, insn_uid);
1428 }
1429 
1430 /* For DImode conversion, mark register defined by DEF as requiring
1431    conversion.  */
1432 
1433 void
1434 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1435 {
1436   gcc_assert (DF_REF_REG_DEF_P (def));
1437 
1438   if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1439     return;
1440 
1441   if (dump_file)
1442     fprintf (dump_file,
1443 	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1444 	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1445 
1446   bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1447 }
1448 
1449 /* For TImode conversion, it is unused.  */
1450 
1451 void
1452 timode_scalar_chain::mark_dual_mode_def (df_ref)
1453 {
1454   gcc_unreachable ();
1455 }
1456 
1457 /* Check REF's chain to add new insns into a queue
1458    and find registers requiring conversion.  */
1459 
1460 void
1461 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1462 {
1463   df_link *chain;
1464 
1465   gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1466 	      || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1467   add_to_queue (DF_REF_INSN_UID (ref));
1468 
1469   for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1470     {
1471       unsigned uid = DF_REF_INSN_UID (chain->ref);
1472 
1473       if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1474 	continue;
1475 
1476       if (!DF_REF_REG_MEM_P (chain->ref))
1477 	{
1478 	  if (bitmap_bit_p (insns, uid))
1479 	    continue;
1480 
1481 	  if (bitmap_bit_p (candidates, uid))
1482 	    {
1483 	      add_to_queue (uid);
1484 	      continue;
1485 	    }
1486 	}
1487 
1488       if (DF_REF_REG_DEF_P (chain->ref))
1489 	{
1490 	  if (dump_file)
1491 	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
1492 		     DF_REF_REGNO (chain->ref), uid);
1493 	  mark_dual_mode_def (chain->ref);
1494 	}
1495       else
1496 	{
1497 	  if (dump_file)
1498 	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
1499 		     DF_REF_REGNO (chain->ref), uid);
1500 	  mark_dual_mode_def (ref);
1501 	}
1502     }
1503 }
1504 
1505 /* Add instruction into a chain.  */
1506 
1507 void
1508 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1509 {
1510   if (bitmap_bit_p (insns, insn_uid))
1511     return;
1512 
1513   if (dump_file)
1514     fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);
1515 
1516   bitmap_set_bit (insns, insn_uid);
1517 
1518   rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1519   rtx def_set = single_set (insn);
1520   if (def_set && REG_P (SET_DEST (def_set))
1521       && !HARD_REGISTER_P (SET_DEST (def_set)))
1522     bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1523 
1524   df_ref ref;
1525   df_ref def;
1526   for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1527     if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1528       for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1529 	   def;
1530 	   def = DF_REF_NEXT_REG (def))
1531 	analyze_register_chain (candidates, def);
1532   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1533     if (!DF_REF_REG_MEM_P (ref))
1534       analyze_register_chain (candidates, ref);
1535 }
1536 
1537 /* Build new chain starting from insn INSN_UID recursively
1538    adding all dependent uses and definitions.  */
1539 
1540 void
1541 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1542 {
1543   queue = BITMAP_ALLOC (NULL);
1544   bitmap_set_bit (queue, insn_uid);
1545 
1546   if (dump_file)
1547     fprintf (dump_file, "Building chain #%d...\n", chain_id);
1548 
1549   while (!bitmap_empty_p (queue))
1550     {
1551       insn_uid = bitmap_first_set_bit (queue);
1552       bitmap_clear_bit (queue, insn_uid);
1553       bitmap_clear_bit (candidates, insn_uid);
1554       add_insn (candidates, insn_uid);
1555     }
1556 
1557   if (dump_file)
1558     {
1559       fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1560       fprintf (dump_file, "  insns: ");
1561       dump_bitmap (dump_file, insns);
1562       if (!bitmap_empty_p (defs_conv))
1563 	{
1564 	  bitmap_iterator bi;
1565 	  unsigned id;
1566 	  const char *comma = "";
1567 	  fprintf (dump_file, "  defs to convert: ");
1568 	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1569 	    {
1570 	      fprintf (dump_file, "%sr%d", comma, id);
1571 	      comma = ", ";
1572 	    }
1573 	  fprintf (dump_file, "\n");
1574 	}
1575     }
1576 
1577   BITMAP_FREE (queue);
1578 }
1579 
1580 /* Return the cost of building a vector constant
1581    instead of using a scalar one.  */
1582 
1583 int
1584 dimode_scalar_chain::vector_const_cost (rtx exp)
1585 {
1586   gcc_assert (CONST_INT_P (exp));
1587 
1588   if (standard_sse_constant_p (exp, V2DImode))
1589     return COSTS_N_INSNS (1);
1590   return ix86_cost->sse_load[1];
1591 }
1592 
1593 /* Compute a gain for chain conversion.  */
1594 
1595 int
1596 dimode_scalar_chain::compute_convert_gain ()
1597 {
1598   bitmap_iterator bi;
1599   unsigned insn_uid;
1600   int gain = 0;
1601   int cost = 0;
1602 
1603   if (dump_file)
1604     fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1605 
1606   EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1607     {
1608       rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1609       rtx def_set = single_set (insn);
1610       rtx src = SET_SRC (def_set);
1611       rtx dst = SET_DEST (def_set);
1612 
1613       if (REG_P (src) && REG_P (dst))
1614 	gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1615       else if (REG_P (src) && MEM_P (dst))
1616 	gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1617       else if (MEM_P (src) && REG_P (dst))
1618 	gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1619       else if (GET_CODE (src) == ASHIFT
1620 	       || GET_CODE (src) == ASHIFTRT
1621 	       || GET_CODE (src) == LSHIFTRT)
1622 	{
1623     	  if (CONST_INT_P (XEXP (src, 0)))
1624 	    gain -= vector_const_cost (XEXP (src, 0));
1625 
1626 	  gain += ix86_cost->shift_const;
1627 	  if (INTVAL (XEXP (src, 1)) >= 32)
1628 	    gain -= COSTS_N_INSNS (1);
1629 	}
1630       else if (GET_CODE (src) == PLUS
1631 	       || GET_CODE (src) == MINUS
1632 	       || GET_CODE (src) == IOR
1633 	       || GET_CODE (src) == XOR
1634 	       || GET_CODE (src) == AND)
1635 	{
1636 	  gain += ix86_cost->add;
1637 	  /* Additional gain for andnot for targets without BMI.  */
1638 	  if (GET_CODE (XEXP (src, 0)) == NOT
1639 	      && !TARGET_BMI)
1640 	    gain += 2 * ix86_cost->add;
1641 
1642 	  if (CONST_INT_P (XEXP (src, 0)))
1643 	    gain -= vector_const_cost (XEXP (src, 0));
1644 	  if (CONST_INT_P (XEXP (src, 1)))
1645 	    gain -= vector_const_cost (XEXP (src, 1));
1646 	}
1647       else if (GET_CODE (src) == NEG
1648 	       || GET_CODE (src) == NOT)
1649 	gain += ix86_cost->add - COSTS_N_INSNS (1);
1650       else if (GET_CODE (src) == COMPARE)
1651 	{
1652 	  /* Assume comparison cost is the same.  */
1653 	}
1654       else if (CONST_INT_P (src))
1655 	{
1656 	  if (REG_P (dst))
1657 	    gain += COSTS_N_INSNS (2);
1658 	  else if (MEM_P (dst))
1659 	    gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1660 	  gain -= vector_const_cost (src);
1661 	}
1662       else
1663 	gcc_unreachable ();
1664     }
1665 
1666   if (dump_file)
1667     fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
1668 
1669   EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1670     cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1671 
1672   if (dump_file)
1673     fprintf (dump_file, "  Registers conversion cost: %d\n", cost);
1674 
1675   gain -= cost;
1676 
1677   if (dump_file)
1678     fprintf (dump_file, "  Total gain: %d\n", gain);
1679 
1680   return gain;
1681 }
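
/* Illustrative reading of the gain formula above (exact numbers depend on
   the active ix86_cost table): a chain with one DImode reg-to-reg move and
   one DImode reg-reg AND accumulates
       COSTS_N_INSNS (2) - xmm_move     (two SImode moves vs. one SSE move)
     + add                              (two SImode ALU insns vs. one PAND)
   and then pays DF_REG_DEF_COUNT (r) * mmxsse_to_integer for every
   register r whose value still has to move between the integer and SSE
   register files; the chain is converted only if the result is positive.  */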
1682 
1683 /* Replace REG in X with a V2DI subreg of NEW_REG.  */
1684 
1685 rtx
1686 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1687 {
1688   if (x == reg)
1689     return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1690 
1691   const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1692   int i, j;
1693   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1694     {
1695       if (fmt[i] == 'e')
1696 	XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1697       else if (fmt[i] == 'E')
1698 	for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1699 	  XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1700 						   reg, new_reg);
1701     }
1702 
1703   return x;
1704 }
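
/* For instance, with REG = (reg:DI 100) and NEW_REG = (reg:DI 110), an
   expression such as
       (plus:DI (reg:DI 100) (reg:DI 101))
   is rewritten in place to
       (plus:DI (subreg:V2DI (reg:DI 110) 0) (reg:DI 101));
   the outer code and mode are left alone here and are adjusted later in
   convert_insn.  */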
1705 
1706 /* Replace REG in INSN with a V2DI subreg of NEW_REG.  */
1707 
1708 void
1709 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1710 						  rtx reg, rtx new_reg)
1711 {
1712   replace_with_subreg (single_set (insn), reg, new_reg);
1713 }
1714 
1715 /* Insert generated conversion instruction sequence INSNS
1716    after instruction AFTER.  A new BB may be required when the
1717    instruction has an EH region attached.  */
1718 
1719 void
1720 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1721 {
1722   if (!control_flow_insn_p (after))
1723     {
1724       emit_insn_after (insns, after);
1725       return;
1726     }
1727 
1728   basic_block bb = BLOCK_FOR_INSN (after);
1729   edge e = find_fallthru_edge (bb->succs);
1730   gcc_assert (e);
1731 
1732   basic_block new_bb = split_edge (e);
1733   emit_insn_after (insns, BB_HEAD (new_bb));
1734 }
1735 
1736 /* Make vector copies for all definitions of register REGNO
1737    and replace its uses within the chain.  */
1738 
1739 void
1740 dimode_scalar_chain::make_vector_copies (unsigned regno)
1741 {
1742   rtx reg = regno_reg_rtx[regno];
1743   rtx vreg = gen_reg_rtx (DImode);
1744   df_ref ref;
1745 
1746   for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1747     if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1748       {
1749 	start_sequence ();
1750 
1751 	if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1752 	  {
1753 	    rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1754 	    emit_move_insn (adjust_address (tmp, SImode, 0),
1755 			    gen_rtx_SUBREG (SImode, reg, 0));
1756 	    emit_move_insn (adjust_address (tmp, SImode, 4),
1757 			    gen_rtx_SUBREG (SImode, reg, 4));
1758 	    emit_move_insn (vreg, tmp);
1759 	  }
1760 	else if (TARGET_SSE4_1)
1761 	  {
1762 	    emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1763 					CONST0_RTX (V4SImode),
1764 					gen_rtx_SUBREG (SImode, reg, 0)));
1765 	    emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1766 					  gen_rtx_SUBREG (V4SImode, vreg, 0),
1767 					  gen_rtx_SUBREG (SImode, reg, 4),
1768 					  GEN_INT (2)));
1769 	  }
1770 	else
1771 	  {
1772 	    rtx tmp = gen_reg_rtx (DImode);
1773 	    emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1774 					CONST0_RTX (V4SImode),
1775 					gen_rtx_SUBREG (SImode, reg, 0)));
1776 	    emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1777 					CONST0_RTX (V4SImode),
1778 					gen_rtx_SUBREG (SImode, reg, 4)));
1779 	    emit_insn (gen_vec_interleave_lowv4si
1780 		       (gen_rtx_SUBREG (V4SImode, vreg, 0),
1781 			gen_rtx_SUBREG (V4SImode, vreg, 0),
1782 			gen_rtx_SUBREG (V4SImode, tmp, 0)));
1783 	  }
1784 	rtx_insn *seq = get_insns ();
1785 	end_sequence ();
1786 	rtx_insn *insn = DF_REF_INSN (ref);
1787 	emit_conversion_insns (seq, insn);
1788 
1789 	if (dump_file)
1790 	  fprintf (dump_file,
1791 		   "  Copied r%d to a vector register r%d for insn %d\n",
1792 		   regno, REGNO (vreg), INSN_UID (insn));
1793       }
1794 
1795   for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1796     if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1797       {
1798 	rtx_insn *insn = DF_REF_INSN (ref);
1799 
1800 	replace_with_subreg_in_insn (insn, reg, vreg);
1801 
1802 	if (dump_file)
1803 	  fprintf (dump_file, "  Replaced r%d with r%d in insn %d\n",
1804 		   regno, REGNO (vreg), INSN_UID (insn));
1805       }
1806 }
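
/* As an illustration: when r100 is defined by an insn outside the chain
   but used by a converted insn, a fresh DImode pseudo VREG is filled with
   r100's value right after that definition -- through a stack slot when
   inter-unit moves are disabled, with movd + pinsrd on SSE4.1, or with two
   movd's and a punpckldq otherwise -- and the use inside the chain is then
   rewritten to (subreg:V2DI VREG 0).  */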
1807 
1808 /* Convert all definitions of register REGNO
1809    and fix its uses.  Scalar copies may be created
1810    when the register is used in a non-convertible insn.  */
1811 
1812 void
1813 dimode_scalar_chain::convert_reg (unsigned regno)
1814 {
1815   bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1816   rtx reg = regno_reg_rtx[regno];
1817   rtx scopy = NULL_RTX;
1818   df_ref ref;
1819   bitmap conv;
1820 
1821   conv = BITMAP_ALLOC (NULL);
1822   bitmap_copy (conv, insns);
1823 
1824   if (scalar_copy)
1825     scopy = gen_reg_rtx (DImode);
1826 
1827   for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1828     {
1829       rtx_insn *insn = DF_REF_INSN (ref);
1830       rtx def_set = single_set (insn);
1831       rtx src = SET_SRC (def_set);
1832       rtx reg = DF_REF_REG (ref);
1833 
1834       if (!MEM_P (src))
1835 	{
1836 	  replace_with_subreg_in_insn (insn, reg, reg);
1837 	  bitmap_clear_bit (conv, INSN_UID (insn));
1838 	}
1839 
1840       if (scalar_copy)
1841 	{
1842 	  start_sequence ();
1843 	  if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1844 	    {
1845 	      rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1846 	      emit_move_insn (tmp, reg);
1847 	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1848 			      adjust_address (tmp, SImode, 0));
1849 	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1850 			      adjust_address (tmp, SImode, 4));
1851 	    }
1852 	  else if (TARGET_SSE4_1)
1853 	    {
1854 	      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1855 	      emit_insn
1856 		(gen_rtx_SET
1857 		 (gen_rtx_SUBREG (SImode, scopy, 0),
1858 		  gen_rtx_VEC_SELECT (SImode,
1859 				      gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1860 
1861 	      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1862 	      emit_insn
1863 		(gen_rtx_SET
1864 		 (gen_rtx_SUBREG (SImode, scopy, 4),
1865 		  gen_rtx_VEC_SELECT (SImode,
1866 				      gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1867 	    }
1868 	  else
1869 	    {
1870 	      rtx vcopy = gen_reg_rtx (V2DImode);
1871 	      emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1872 	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1873 			      gen_rtx_SUBREG (SImode, vcopy, 0));
1874 	      emit_move_insn (vcopy,
1875 			      gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1876 	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1877 			      gen_rtx_SUBREG (SImode, vcopy, 0));
1878 	    }
1879 	  rtx_insn *seq = get_insns ();
1880 	  end_sequence ();
1881 	  emit_conversion_insns (seq, insn);
1882 
1883 	  if (dump_file)
1884 	    fprintf (dump_file,
1885 		     "  Copied r%d to a scalar register r%d for insn %d\n",
1886 		     regno, REGNO (scopy), INSN_UID (insn));
1887 	}
1888     }
1889 
1890   for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1891     if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1892       {
1893 	if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1894 	  {
1895 	    rtx_insn *insn = DF_REF_INSN (ref);
1896 
1897 	    rtx def_set = single_set (insn);
1898 	    gcc_assert (def_set);
1899 
1900 	    rtx src = SET_SRC (def_set);
1901 	    rtx dst = SET_DEST (def_set);
1902 
1903 	    if (!MEM_P (dst) || !REG_P (src))
1904 	      replace_with_subreg_in_insn (insn, reg, reg);
1905 
1906 	    bitmap_clear_bit (conv, INSN_UID (insn));
1907 	  }
1908       }
1909     /* Skip debug insns and uninitialized uses.  */
1910     else if (DF_REF_CHAIN (ref)
1911 	     && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
1912       {
1913 	gcc_assert (scopy);
1914 	replace_rtx (DF_REF_INSN (ref), reg, scopy);
1915 	df_insn_rescan (DF_REF_INSN (ref));
1916       }
1917 
1918   BITMAP_FREE (conv);
1919 }
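
/* As an illustration: if r100 is defined inside the chain but also read by
   an insn that stays scalar, a DImode copy SCOPY is produced right after
   each definition -- through a stack slot, with pextrd on SSE4.1, or with
   movd plus a psrlq by 32 otherwise -- and the scalar reader is redirected
   to SCOPY, while readers inside the chain keep using r100 through a V2DI
   subreg.  */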
1920 
1921 /* Convert operand OP in INSN.  We should handle
1922    memory operands and uninitialized registers.
1923    All other register uses are converted during
1924    register conversion.  */
1925 
1926 void
1927 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
1928 {
1929   *op = copy_rtx_if_shared (*op);
1930 
1931   if (GET_CODE (*op) == NOT)
1932     {
1933       convert_op (&XEXP (*op, 0), insn);
1934       PUT_MODE (*op, V2DImode);
1935     }
1936   else if (MEM_P (*op))
1937     {
1938       rtx tmp = gen_reg_rtx (DImode);
1939 
1940       emit_insn_before (gen_move_insn (tmp, *op), insn);
1941       *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
1942 
1943       if (dump_file)
1944 	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
1945 		 INSN_UID (insn), REGNO (tmp));
1946     }
1947   else if (REG_P (*op))
1948     {
1949       /* The register use may not have been converted when this
1950 	 register has no definition.  Otherwise it should have been
1951 	 converted in convert_reg.  */
1952       df_ref ref;
1953       FOR_EACH_INSN_USE (ref, insn)
1954 	if (DF_REF_REGNO (ref) == REGNO (*op))
1955 	  {
1956 	    gcc_assert (!DF_REF_CHAIN (ref));
1957 	    break;
1958 	  }
1959       *op = gen_rtx_SUBREG (V2DImode, *op, 0);
1960     }
1961   else if (CONST_INT_P (*op))
1962     {
1963       rtx vec_cst;
1964       rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
1965 
1966       /* Prefer all ones vector in case of -1.  */
1967       if (constm1_operand (*op, GET_MODE (*op)))
1968 	vec_cst = CONSTM1_RTX (V2DImode);
1969       else
1970 	vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
1971 					gen_rtvec (2, *op, const0_rtx));
1972 
1973       if (!standard_sse_constant_p (vec_cst, V2DImode))
1974 	{
1975 	  start_sequence ();
1976 	  vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
1977 	  rtx_insn *seq = get_insns ();
1978 	  end_sequence ();
1979 	  emit_insn_before (seq, insn);
1980 	}
1981 
1982       emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
1983       *op = tmp;
1984     }
1985   else
1986     {
1987       gcc_assert (SUBREG_P (*op));
1988       gcc_assert (GET_MODE (*op) == V2DImode);
1989     }
1990 }
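
/* As an illustration: a (const_int 42) operand becomes the V2DImode
   constant { 42, 0 } (or the all-ones vector for -1), is forced into the
   constant pool when it is not a standard SSE constant, and is loaded
   into a fresh pseudo just before INSN; a memory operand is likewise
   preloaded into a scratch DImode register, so the converted insn only
   ever sees V2DI subregs.  */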
1991 
1992 /* Convert INSN to vector mode.  */
1993 
1994 void
1995 dimode_scalar_chain::convert_insn (rtx_insn *insn)
1996 {
1997   rtx def_set = single_set (insn);
1998   rtx src = SET_SRC (def_set);
1999   rtx dst = SET_DEST (def_set);
2000   rtx subreg;
2001 
2002   if (MEM_P (dst) && !REG_P (src))
2003     {
2004       /* The converted operation cannot have a memory destination
2005 	 (SSE insns compute into registers), therefore a temporary
2006 	 register is required.  */
2006       rtx tmp = gen_reg_rtx (DImode);
2007       emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2008       dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2009     }
2010 
2011   switch (GET_CODE (src))
2012     {
2013     case ASHIFT:
2014     case ASHIFTRT:
2015     case LSHIFTRT:
2016       convert_op (&XEXP (src, 0), insn);
2017       PUT_MODE (src, V2DImode);
2018       break;
2019 
2020     case PLUS:
2021     case MINUS:
2022     case IOR:
2023     case XOR:
2024     case AND:
2025       convert_op (&XEXP (src, 0), insn);
2026       convert_op (&XEXP (src, 1), insn);
2027       PUT_MODE (src, V2DImode);
2028       break;
2029 
2030     case NEG:
2031       src = XEXP (src, 0);
2032       convert_op (&src, insn);
2033       subreg = gen_reg_rtx (V2DImode);
2034       emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2035       src = gen_rtx_MINUS (V2DImode, subreg, src);
2036       break;
2037 
2038     case NOT:
2039       src = XEXP (src, 0);
2040       convert_op (&src, insn);
2041       subreg = gen_reg_rtx (V2DImode);
2042       emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2043       src = gen_rtx_XOR (V2DImode, src, subreg);
2044       break;
2045 
2046     case MEM:
2047       if (!REG_P (dst))
2048 	convert_op (&src, insn);
2049       break;
2050 
2051     case REG:
2052       if (!MEM_P (dst))
2053 	convert_op (&src, insn);
2054       break;
2055 
2056     case SUBREG:
2057       gcc_assert (GET_MODE (src) == V2DImode);
2058       break;
2059 
2060     case COMPARE:
2061       src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2062 
2063       gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2064 		  || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2065 
2066       if (REG_P (src))
2067 	subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2068       else
2069 	subreg = copy_rtx_if_shared (src);
2070       emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2071 						    copy_rtx_if_shared (subreg),
2072 						    copy_rtx_if_shared (subreg)),
2073 			insn);
2074       dst = gen_rtx_REG (CCmode, FLAGS_REG);
2075       src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2076 					       copy_rtx_if_shared (src)),
2077 			    UNSPEC_PTEST);
2078       break;
2079 
2080     case CONST_INT:
2081       convert_op (&src, insn);
2082       break;
2083 
2084     default:
2085       gcc_unreachable ();
2086     }
2087 
2088   SET_SRC (def_set) = src;
2089   SET_DEST (def_set) = dst;
2090 
2091   /* Drop possible dead definitions.  */
2092   PATTERN (insn) = def_set;
2093 
2094   INSN_CODE (insn) = -1;
2095   recog_memoized (insn);
2096   df_insn_rescan (insn);
2097 }
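
/* As an illustration, on a 32-bit target an insn such as
       (set (reg:DI 100) (and:DI (reg:DI 100) (reg:DI 101)))
   has already had its registers rewritten by convert_reg and ends up as
       (set (subreg:V2DI (reg:DI 100) 0)
            (and:V2DI (subreg:V2DI (reg:DI 100) 0)
                      (subreg:V2DI (reg:DI 101) 0)))
   which matches the SSE pand pattern; only the low 64 bits of the result
   are meaningful to the rest of the function.  */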
2098 
2099 /* Fix uses of converted REG in debug insns.  */
2100 
2101 void
2102 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2103 {
2104   if (!flag_var_tracking)
2105     return;
2106 
2107   df_ref ref, next;
2108   for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2109     {
2110       rtx_insn *insn = DF_REF_INSN (ref);
2111       /* Make sure the next ref is for a different instruction,
2112          so that we're not affected by the rescan.  */
2113       next = DF_REF_NEXT_REG (ref);
2114       while (next && DF_REF_INSN (next) == insn)
2115 	next = DF_REF_NEXT_REG (next);
2116 
2117       if (DEBUG_INSN_P (insn))
2118 	{
2119 	  /* It may be a debug insn with a TImode variable in
2120 	     a register.  */
2121 	  bool changed = false;
2122 	  for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2123 	    {
2124 	      rtx *loc = DF_REF_LOC (ref);
2125 	      if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2126 		{
2127 		  *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2128 		  changed = true;
2129 		}
2130 	    }
2131 	  if (changed)
2132 	    df_insn_rescan (insn);
2133 	}
2134     }
2135 }
2136 
2137 /* Convert INSN from TImode to V1TImode.  */
2138 
2139 void
2140 timode_scalar_chain::convert_insn (rtx_insn *insn)
2141 {
2142   rtx def_set = single_set (insn);
2143   rtx src = SET_SRC (def_set);
2144   rtx dst = SET_DEST (def_set);
2145 
2146   switch (GET_CODE (dst))
2147     {
2148     case REG:
2149       {
2150 	rtx tmp = find_reg_equal_equiv_note (insn);
2151 	if (tmp)
2152 	  PUT_MODE (XEXP (tmp, 0), V1TImode);
2153 	PUT_MODE (dst, V1TImode);
2154 	fix_debug_reg_uses (dst);
2155       }
2156       break;
2157     case MEM:
2158       PUT_MODE (dst, V1TImode);
2159       break;
2160 
2161     default:
2162       gcc_unreachable ();
2163     }
2164 
2165   switch (GET_CODE (src))
2166     {
2167     case REG:
2168       PUT_MODE (src, V1TImode);
2169       /* Call fix_debug_reg_uses only if SRC is never defined.  */
2170       if (!DF_REG_DEF_CHAIN (REGNO (src)))
2171 	fix_debug_reg_uses (src);
2172       break;
2173 
2174     case MEM:
2175       PUT_MODE (src, V1TImode);
2176       break;
2177 
2178     case CONST_WIDE_INT:
2179       if (NONDEBUG_INSN_P (insn))
2180 	{
2181 	  /* Since there is no instruction to store a 128-bit constant,
2182 	     a temporary register is required.  */
2183 	  rtx tmp = gen_reg_rtx (V1TImode);
2184 	  start_sequence ();
2185 	  src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2186 	  src = validize_mem (force_const_mem (V1TImode, src));
2187 	  rtx_insn *seq = get_insns ();
2188 	  end_sequence ();
2189 	  if (seq)
2190 	    emit_insn_before (seq, insn);
2191 	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2192 	  dst = tmp;
2193 	}
2194       break;
2195 
2196     case CONST_INT:
2197       switch (standard_sse_constant_p (src, TImode))
2198 	{
2199 	case 1:
2200 	  src = CONST0_RTX (GET_MODE (dst));
2201 	  break;
2202 	case 2:
2203 	  src = CONSTM1_RTX (GET_MODE (dst));
2204 	  break;
2205 	default:
2206 	  gcc_unreachable ();
2207 	}
2208       if (NONDEBUG_INSN_P (insn))
2209 	{
2210 	  rtx tmp = gen_reg_rtx (V1TImode);
2211 	  /* Since there is no instruction to store a standard SSE
2212 	     constant, a temporary register is required.  */
2213 	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2214 	  dst = tmp;
2215 	}
2216       break;
2217 
2218     default:
2219       gcc_unreachable ();
2220     }
2221 
2222   SET_SRC (def_set) = src;
2223   SET_DEST (def_set) = dst;
2224 
2225   /* Drop possible dead definitions.  */
2226   PATTERN (insn) = def_set;
2227 
2228   INSN_CODE (insn) = -1;
2229   recog_memoized (insn);
2230   df_insn_rescan (insn);
2231 }
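
/* As an illustration, on a 64-bit target a TImode copy such as
       (set (mem:TI ...) (reg:TI 100))
   simply has both operands retagged to V1TImode, so that it can be carried
   out by one 128-bit SSE move (movdqa/movdqu) instead of two 64-bit
   integer moves; constant sources are either canonicalized to the
   all-zeros / all-ones vector or routed through the constant pool and a
   temporary register.  */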
2232 
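/* Convert registers involved in the chain: registers defined inside the
   chain are converted in place, registers that are only used by it (and
   defined elsewhere) get vector copies instead.  */
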
2233 void
2234 dimode_scalar_chain::convert_registers ()
2235 {
2236   bitmap_iterator bi;
2237   unsigned id;
2238 
2239   EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2240     convert_reg (id);
2241 
2242   EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2243     make_vector_copies (id);
2244 }
2245 
2246 /* Convert the whole chain, creating the required register
2247    conversions and copies.  */
2248 
2249 int
2250 scalar_chain::convert ()
2251 {
2252   bitmap_iterator bi;
2253   unsigned id;
2254   int converted_insns = 0;
2255 
2256   if (!dbg_cnt (stv_conversion))
2257     return 0;
2258 
2259   if (dump_file)
2260     fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2261 
2262   convert_registers ();
2263 
2264   EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2265     {
2266       convert_insn (DF_INSN_UID_GET (id)->insn);
2267       converted_insns++;
2268     }
2269 
2270   return converted_insns;
2271 }
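
/* To summarize the flow: convert_registers first rewrites or copies every
   register touched by the chain, then each insn recorded in the chain's
   bitmap is rewritten in place by convert_insn, so the value returned is
   simply how many insns were rewritten (zero when the stv_conversion
   debug counter suppresses the chain).  */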
2272 
2273 /* Main STV pass function.  Find and convert scalar
2274    instructions into vector mode when profitable.  */
2275 
2276 static unsigned int
2277 convert_scalars_to_vector ()
2278 {
2279   basic_block bb;
2280   bitmap candidates;
2281   int converted_insns = 0;
2282 
2283   bitmap_obstack_initialize (NULL);
2284   candidates = BITMAP_ALLOC (NULL);
2285 
2286   calculate_dominance_info (CDI_DOMINATORS);
2287   df_set_flags (DF_DEFER_INSN_RESCAN);
2288   df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2289   df_md_add_problem ();
2290   df_analyze ();
2291 
2292   /* Find all instructions we want to convert into vector mode.  */
2293   if (dump_file)
2294     fprintf (dump_file, "Searching for mode conversion candidates...\n");
2295 
2296   FOR_EACH_BB_FN (bb, cfun)
2297     {
2298       rtx_insn *insn;
2299       FOR_BB_INSNS (bb, insn)
2300 	if (scalar_to_vector_candidate_p (insn))
2301 	  {
2302 	    if (dump_file)
2303 	      fprintf (dump_file, "  insn %d is marked as a candidate\n",
2304 		       INSN_UID (insn));
2305 
2306 	    bitmap_set_bit (candidates, INSN_UID (insn));
2307 	  }
2308     }
2309 
2310   remove_non_convertible_regs (candidates);
2311 
2312   if (bitmap_empty_p (candidates))
2313     if (dump_file)
2314       fprintf (dump_file, "There are no candidates for optimization.\n");
2315 
2316   while (!bitmap_empty_p (candidates))
2317     {
2318       unsigned uid = bitmap_first_set_bit (candidates);
2319       scalar_chain *chain;
2320 
2321       if (TARGET_64BIT)
2322 	chain = new timode_scalar_chain;
2323       else
2324 	chain = new dimode_scalar_chain;
2325 
2326       /* Find the instruction chain we want to convert to vector mode.
2327 	 Check all uses and definitions to estimate all required
2328 	 conversions.  */
2329       chain->build (candidates, uid);
2330 
2331       if (chain->compute_convert_gain () > 0)
2332 	converted_insns += chain->convert ();
2333       else
2334 	if (dump_file)
2335 	  fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2336 		   chain->chain_id);
2337 
2338       delete chain;
2339     }
2340 
2341   if (dump_file)
2342     fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2343 
2344   BITMAP_FREE (candidates);
2345   bitmap_obstack_release (NULL);
2346   df_process_deferred_rescans ();
2347 
2348   /* Conversion means we may have 128-bit register spills/fills,
2349      which require an aligned stack.  */
2350   if (converted_insns)
2351     {
2352       if (crtl->stack_alignment_needed < 128)
2353 	crtl->stack_alignment_needed = 128;
2354       if (crtl->stack_alignment_estimated < 128)
2355 	crtl->stack_alignment_estimated = 128;
2356       /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
2357       if (TARGET_64BIT)
2358 	for (tree parm = DECL_ARGUMENTS (current_function_decl);
2359 	     parm; parm = DECL_CHAIN (parm))
2360 	  {
2361 	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2362 	      continue;
2363 	    if (DECL_RTL_SET_P (parm)
2364 		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
2365 	      {
2366 		rtx r = DECL_RTL (parm);
2367 		if (REG_P (r))
2368 		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2369 	      }
2370 	    if (DECL_INCOMING_RTL (parm)
2371 		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2372 	      {
2373 		rtx r = DECL_INCOMING_RTL (parm);
2374 		if (REG_P (r))
2375 		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2376 	      }
2377 	  }
2378     }
2379 
2380   return 0;
2381 }
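
/* Note that the STV pass below is instantiated twice through its timode_p
   parameter: a DImode flavour for 32-bit code, where a 64-bit scalar
   operation would otherwise need a register pair, and a TImode flavour for
   64-bit code, where a 128-bit move can use a single SSE register;
   convert_scalars_to_vector picks the matching chain kind via
   TARGET_64BIT.  */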
2382 
2383 namespace {
2384 
2385 const pass_data pass_data_insert_vzeroupper =
2386 {
2387   RTL_PASS, /* type */
2388   "vzeroupper", /* name */
2389   OPTGROUP_NONE, /* optinfo_flags */
2390   TV_MACH_DEP, /* tv_id */
2391   0, /* properties_required */
2392   0, /* properties_provided */
2393   0, /* properties_destroyed */
2394   0, /* todo_flags_start */
2395   TODO_df_finish, /* todo_flags_finish */
2396 };
2397 
2398 class pass_insert_vzeroupper : public rtl_opt_pass
2399 {
2400 public:
2401   pass_insert_vzeroupper(gcc::context *ctxt)
2402     : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2403   {}
2404 
2405   /* opt_pass methods: */
2406   virtual bool gate (function *)
2407     {
2408       return TARGET_AVX
2409 	     && TARGET_VZEROUPPER && flag_expensive_optimizations
2410 	     && !optimize_size;
2411     }
2412 
2413   virtual unsigned int execute (function *)
2414     {
2415       return rest_of_handle_insert_vzeroupper ();
2416     }
2417 
2418 }; // class pass_insert_vzeroupper
2419 
2420 const pass_data pass_data_stv =
2421 {
2422   RTL_PASS, /* type */
2423   "stv", /* name */
2424   OPTGROUP_NONE, /* optinfo_flags */
2425   TV_MACH_DEP, /* tv_id */
2426   0, /* properties_required */
2427   0, /* properties_provided */
2428   0, /* properties_destroyed */
2429   0, /* todo_flags_start */
2430   TODO_df_finish, /* todo_flags_finish */
2431 };
2432 
2433 class pass_stv : public rtl_opt_pass
2434 {
2435 public:
2436   pass_stv (gcc::context *ctxt)
2437     : rtl_opt_pass (pass_data_stv, ctxt),
2438       timode_p (false)
2439   {}
2440 
2441   /* opt_pass methods: */
2442   virtual bool gate (function *)
2443     {
2444       return (timode_p == !!TARGET_64BIT
2445 	      && TARGET_STV && TARGET_SSE2 && optimize > 1);
2446     }
2447 
2448   virtual unsigned int execute (function *)
2449     {
2450       return convert_scalars_to_vector ();
2451     }
2452 
2453   opt_pass *clone ()
2454     {
2455       return new pass_stv (m_ctxt);
2456     }
2457 
2458   void set_pass_param (unsigned int n, bool param)
2459     {
2460       gcc_assert (n == 0);
2461       timode_p = param;
2462     }
2463 
2464 private:
2465   bool timode_p;
2466 }; // class pass_stv
2467 
2468 } // anon namespace
2469 
2470 rtl_opt_pass *
2471 make_pass_insert_vzeroupper (gcc::context *ctxt)
2472 {
2473   return new pass_insert_vzeroupper (ctxt);
2474 }
2475 
2476 rtl_opt_pass *
2477 make_pass_stv (gcc::context *ctxt)
2478 {
2479   return new pass_stv (ctxt);
2480 }
2481 
2482 /* Inserting ENDBRANCH instructions.  */
2483 
2484 static unsigned int
2485 rest_of_insert_endbranch (void)
2486 {
2487   timevar_push (TV_MACH_DEP);
2488 
2489   rtx cet_eb;
2490   rtx_insn *insn;
2491   basic_block bb;
2492 
2493   /* Currently emit ENDBR if the function is tracked, i.e. 'nocf_check'
2494      is absent from the function attributes.  Later an optimization will
2495      be introduced to analyze whether the address of a static function is
2496      taken.  A static function whose address is not taken will get a
2497      nocf_check attribute, allowing the number of ENDBRs to be reduced.  */
2498 
2499   if (!lookup_attribute ("nocf_check",
2500 			 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2501       && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2502     {
2503       /* Queue ENDBR insertion to x86_function_profiler.  */
2504       if (crtl->profile && flag_fentry)
2505 	cfun->machine->endbr_queued_at_entrance = true;
2506       else
2507 	{
2508 	  cet_eb = gen_nop_endbr ();
2509 
2510 	  bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2511 	  insn = BB_HEAD (bb);
2512 	  emit_insn_before (cet_eb, insn);
2513 	}
2514     }
2515 
2516   bb = 0;
2517   FOR_EACH_BB_FN (bb, cfun)
2518     {
2519       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2520 	   insn = NEXT_INSN (insn))
2521 	{
2522 	  if (CALL_P (insn))
2523 	    {
2524 	      if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2525 		continue;
2526 	      /* Generate ENDBRANCH after a CALL that can return more than
2527 		 once, i.e. setjmp-like functions.  */
2528 
2529 	      cet_eb = gen_nop_endbr ();
2530 	      emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
2531 	      continue;
2532 	    }
2533 
2534 	  if (JUMP_P (insn) && flag_cet_switch)
2535 	    {
2536 	      rtx target = JUMP_LABEL (insn);
2537 	      if (target == NULL_RTX || ANY_RETURN_P (target))
2538 		continue;
2539 
2540 	      /* Check that the jump is a switch-table dispatch.  */
2541 	      rtx_insn *label = as_a<rtx_insn *> (target);
2542 	      rtx_insn *table = next_insn (label);
2543 	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2544 		continue;
2545 
2546 	      /* For the indirect jump, find all places it can jump to and
2547 		 insert ENDBRANCH there.  This is done under a special flag
2548 		 controlling ENDBRANCH generation for switch statements.  */
2549 	      edge_iterator ei;
2550 	      edge e;
2551 	      basic_block dest_blk;
2552 
2553 	      FOR_EACH_EDGE (e, ei, bb->succs)
2554 		{
2555 		  rtx_insn *insn;
2556 
2557 		  dest_blk = e->dest;
2558 		  insn = BB_HEAD (dest_blk);
2559 		  gcc_assert (LABEL_P (insn));
2560 		  cet_eb = gen_nop_endbr ();
2561 		  emit_insn_after (cet_eb, insn);
2562 		}
2563 	      continue;
2564 	    }
2565 
2566 	  if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2567 	      || (NOTE_P (insn)
2568 		  && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2569 	    /* TODO.  Check /s bit also.  */
2570 	    {
2571 	      cet_eb = gen_nop_endbr ();
2572 	      emit_insn_after (cet_eb, insn);
2573 	      continue;
2574 	    }
2575 	}
2576     }
2577 
2578   timevar_pop (TV_MACH_DEP);
2579   return 0;
2580 }
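
/* In effect an ENDBR marker is planted at every expected indirect-branch
   landing site: the function entry (or queued to x86_function_profiler
   when profiling with -mfentry), after calls that may return more than
   once (REG_SETJMP), at every successor block of a jump-table dispatch
   when -mcet-switch is in effect, and at preserved or deleted labels whose
   address may escape, so that a CPU with CET indirect-branch tracking
   accepts the control transfer.  */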
2581 
2582 namespace {
2583 
2584 const pass_data pass_data_insert_endbranch =
2585 {
2586   RTL_PASS, /* type.  */
2587   "cet", /* name.  */
2588   OPTGROUP_NONE, /* optinfo_flags.  */
2589   TV_MACH_DEP, /* tv_id.  */
2590   0, /* properties_required.  */
2591   0, /* properties_provided.  */
2592   0, /* properties_destroyed.  */
2593   0, /* todo_flags_start.  */
2594   0, /* todo_flags_finish.  */
2595 };
2596 
2597 class pass_insert_endbranch : public rtl_opt_pass
2598 {
2599 public:
2600   pass_insert_endbranch (gcc::context *ctxt)
2601     : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2602   {}
2603 
2604   /* opt_pass methods: */
2605   virtual bool gate (function *)
2606     {
2607       return ((flag_cf_protection & CF_BRANCH));
2608     }
2609 
2610   virtual unsigned int execute (function *)
2611     {
2612       return rest_of_insert_endbranch ();
2613     }
2614 
2615 }; // class pass_insert_endbranch
2616 
2617 } // anon namespace
2618 
2619 rtl_opt_pass *
2620 make_pass_insert_endbranch (gcc::context *ctxt)
2621 {
2622   return new pass_insert_endbranch (ctxt);
2623 }
2624 
2625 /* Return true if a red-zone is in use.  We can't use red-zone when
2626    there are local indirect jumps, like "indirect_jump" or "tablejump",
2627    which jumps to another place in the function, since "call" in the
2628    indirect thunk pushes the return address onto stack, destroying
2629    red-zone.
2630 
2631    TODO: If we can reserve the first 2 WORDs of the red-zone, one for
2632    PUSH and another for CALL, we can allow local indirect jumps with
2633    indirect thunk.  */
2634 
2635 bool
2636 ix86_using_red_zone (void)
2637 {
2638   return (TARGET_RED_ZONE
2639 	  && !TARGET_64BIT_MS_ABI
2640 	  && (!cfun->machine->has_local_indirect_jump
2641 	      || cfun->machine->indirect_branch_type == indirect_branch_keep));
2642 }
2643 
2644 /* Return a string that documents the current -m options.  The caller is
2645    responsible for freeing the string.  */
2646 
2647 static char *
2648 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2649 		    int flags, int flags2,
2650 		    const char *arch, const char *tune,
2651 		    enum fpmath_unit fpmath, bool add_nl_p)
2652 {
2653   struct ix86_target_opts
2654   {
2655     const char *option;		/* option string */
2656     HOST_WIDE_INT mask;		/* isa mask options */
2657   };
2658 
2659   /* This table is ordered so that options like -msse4.2 that imply other
2660      ISAs come first.  Target string will be displayed in the same order.  */
2661   static struct ix86_target_opts isa2_opts[] =
2662   {
2663     { "-mcx16",		OPTION_MASK_ISA_CX16 },
2664     { "-mmpx",		OPTION_MASK_ISA_MPX },
2665     { "-mvaes",		OPTION_MASK_ISA_VAES },
2666     { "-mrdpid",	OPTION_MASK_ISA_RDPID },
2667     { "-mpconfig",	OPTION_MASK_ISA_PCONFIG },
2668     { "-mwbnoinvd",     OPTION_MASK_ISA_WBNOINVD },
2669     { "-msgx",		OPTION_MASK_ISA_SGX },
2670     { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2671     { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2672     { "-mhle",		OPTION_MASK_ISA_HLE },
2673     { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
2674     { "-mclzero",	OPTION_MASK_ISA_CLZERO },
2675     { "-mmwaitx",	OPTION_MASK_ISA_MWAITX },
2676     { "-mmovdir64b",	OPTION_MASK_ISA_MOVDIR64B }
2677   };
2678   static struct ix86_target_opts isa_opts[] =
2679   {
2680     { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2681     { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2682     { "-mvpclmulqdq",	OPTION_MASK_ISA_VPCLMULQDQ },
2683     { "-mgfni",		OPTION_MASK_ISA_GFNI },
2684     { "-mavx512vnni",	OPTION_MASK_ISA_AVX512VNNI },
2685     { "-mavx512vbmi2",	OPTION_MASK_ISA_AVX512VBMI2 },
2686     { "-mavx512vbmi",	OPTION_MASK_ISA_AVX512VBMI },
2687     { "-mavx512ifma",	OPTION_MASK_ISA_AVX512IFMA },
2688     { "-mavx512vl",	OPTION_MASK_ISA_AVX512VL },
2689     { "-mavx512bw",	OPTION_MASK_ISA_AVX512BW },
2690     { "-mavx512dq",	OPTION_MASK_ISA_AVX512DQ },
2691     { "-mavx512er",	OPTION_MASK_ISA_AVX512ER },
2692     { "-mavx512pf",	OPTION_MASK_ISA_AVX512PF },
2693     { "-mavx512cd",	OPTION_MASK_ISA_AVX512CD },
2694     { "-mavx512f",	OPTION_MASK_ISA_AVX512F },
2695     { "-mavx2",		OPTION_MASK_ISA_AVX2 },
2696     { "-mfma",		OPTION_MASK_ISA_FMA },
2697     { "-mxop",		OPTION_MASK_ISA_XOP },
2698     { "-mfma4",		OPTION_MASK_ISA_FMA4 },
2699     { "-mf16c",		OPTION_MASK_ISA_F16C },
2700     { "-mavx",		OPTION_MASK_ISA_AVX },
2701 /*  { "-msse4"		OPTION_MASK_ISA_SSE4 }, */
2702     { "-msse4.2",	OPTION_MASK_ISA_SSE4_2 },
2703     { "-msse4.1",	OPTION_MASK_ISA_SSE4_1 },
2704     { "-msse4a",	OPTION_MASK_ISA_SSE4A },
2705     { "-mssse3",	OPTION_MASK_ISA_SSSE3 },
2706     { "-msse3",		OPTION_MASK_ISA_SSE3 },
2707     { "-maes",		OPTION_MASK_ISA_AES },
2708     { "-msha",		OPTION_MASK_ISA_SHA },
2709     { "-mpclmul",	OPTION_MASK_ISA_PCLMUL },
2710     { "-msse2",		OPTION_MASK_ISA_SSE2 },
2711     { "-msse",		OPTION_MASK_ISA_SSE },
2712     { "-m3dnowa",	OPTION_MASK_ISA_3DNOW_A },
2713     { "-m3dnow",	OPTION_MASK_ISA_3DNOW },
2714     { "-mmmx",		OPTION_MASK_ISA_MMX },
2715     { "-mrtm",		OPTION_MASK_ISA_RTM },
2716     { "-mprfchw",	OPTION_MASK_ISA_PRFCHW },
2717     { "-mrdseed",	OPTION_MASK_ISA_RDSEED },
2718     { "-madx",		OPTION_MASK_ISA_ADX },
2719     { "-mprefetchwt1",	OPTION_MASK_ISA_PREFETCHWT1 },
2720     { "-mclflushopt",	OPTION_MASK_ISA_CLFLUSHOPT },
2721     { "-mxsaves",	OPTION_MASK_ISA_XSAVES },
2722     { "-mxsavec",	OPTION_MASK_ISA_XSAVEC },
2723     { "-mxsaveopt",	OPTION_MASK_ISA_XSAVEOPT },
2724     { "-mxsave",	OPTION_MASK_ISA_XSAVE },
2725     { "-mabm",		OPTION_MASK_ISA_ABM },
2726     { "-mbmi",		OPTION_MASK_ISA_BMI },
2727     { "-mbmi2",		OPTION_MASK_ISA_BMI2 },
2728     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
2729     { "-mtbm",		OPTION_MASK_ISA_TBM },
2730     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
2731     { "-msahf",		OPTION_MASK_ISA_SAHF },
2732     { "-mcrc32",	OPTION_MASK_ISA_CRC32 },
2733     { "-mfsgsbase",	OPTION_MASK_ISA_FSGSBASE },
2734     { "-mrdrnd",	OPTION_MASK_ISA_RDRND },
2735     { "-mpku",		OPTION_MASK_ISA_PKU },
2736     { "-mlwp",		OPTION_MASK_ISA_LWP },
2737     { "-mfxsr",		OPTION_MASK_ISA_FXSR },
2738     { "-mclwb",		OPTION_MASK_ISA_CLWB },
2739     { "-mshstk",	OPTION_MASK_ISA_SHSTK },
2740     { "-mmovdiri",	OPTION_MASK_ISA_MOVDIRI }
2741   };
2742 
2743   /* Flag options.  */
2744   static struct ix86_target_opts flag_opts[] =
2745   {
2746     { "-m128bit-long-double",		MASK_128BIT_LONG_DOUBLE },
2747     { "-mlong-double-128",		MASK_LONG_DOUBLE_128 },
2748     { "-mlong-double-64",		MASK_LONG_DOUBLE_64 },
2749     { "-m80387",			MASK_80387 },
2750     { "-maccumulate-outgoing-args",	MASK_ACCUMULATE_OUTGOING_ARGS },
2751     { "-malign-double",			MASK_ALIGN_DOUBLE },
2752     { "-mcld",				MASK_CLD },
2753     { "-mfp-ret-in-387",		MASK_FLOAT_RETURNS },
2754     { "-mieee-fp",			MASK_IEEE_FP },
2755     { "-minline-all-stringops",		MASK_INLINE_ALL_STRINGOPS },
2756     { "-minline-stringops-dynamically",	MASK_INLINE_STRINGOPS_DYNAMICALLY },
2757     { "-mms-bitfields",			MASK_MS_BITFIELD_LAYOUT },
2758     { "-mno-align-stringops",		MASK_NO_ALIGN_STRINGOPS },
2759     { "-mno-fancy-math-387",		MASK_NO_FANCY_MATH_387 },
2760     { "-mno-push-args",			MASK_NO_PUSH_ARGS },
2761     { "-mno-red-zone",			MASK_NO_RED_ZONE },
2762     { "-momit-leaf-frame-pointer",	MASK_OMIT_LEAF_FRAME_POINTER },
2763     { "-mrecip",			MASK_RECIP },
2764     { "-mrtd",				MASK_RTD },
2765     { "-msseregparm",			MASK_SSEREGPARM },
2766     { "-mstack-arg-probe",		MASK_STACK_PROBE },
2767     { "-mtls-direct-seg-refs",		MASK_TLS_DIRECT_SEG_REFS },
2768     { "-mvect8-ret-in-mem",		MASK_VECT8_RETURNS },
2769     { "-m8bit-idiv",			MASK_USE_8BIT_IDIV },
2770     { "-mvzeroupper",			MASK_VZEROUPPER },
2771     { "-mstv",				MASK_STV },
2772     { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2773     { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE },
2774     { "-mcall-ms2sysv-xlogues",		MASK_CALL_MS2SYSV_XLOGUES }
2775   };
2776 
2777   /* Additional flag options.  */
2778   static struct ix86_target_opts flag2_opts[] =
2779   {
2780     { "-mgeneral-regs-only",		OPTION_MASK_GENERAL_REGS_ONLY }
2781   };
2782 
2783   const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2784 		   + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2785 
2786   char isa_other[40];
2787   char isa2_other[40];
2788   char flags_other[40];
2789   char flags2_other[40];
2790   unsigned num = 0;
2791   unsigned i, j;
2792   char *ret;
2793   char *ptr;
2794   size_t len;
2795   size_t line_len;
2796   size_t sep_len;
2797   const char *abi;
2798 
2799   memset (opts, '\0', sizeof (opts));
2800 
2801   /* Add -march= option.  */
2802   if (arch)
2803     {
2804       opts[num][0] = "-march=";
2805       opts[num++][1] = arch;
2806     }
2807 
2808   /* Add -mtune= option.  */
2809   if (tune)
2810     {
2811       opts[num][0] = "-mtune=";
2812       opts[num++][1] = tune;
2813     }
2814 
2815   /* Add -m32/-m64/-mx32.  */
2816   if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2817     {
2818       if ((isa & OPTION_MASK_ABI_64) != 0)
2819 	abi = "-m64";
2820       else
2821 	abi = "-mx32";
2822       isa &= ~ (OPTION_MASK_ISA_64BIT
2823 		| OPTION_MASK_ABI_64
2824 		| OPTION_MASK_ABI_X32);
2825     }
2826   else
2827     abi = "-m32";
2828   opts[num++][0] = abi;
2829 
2830   /* Pick out the options in isa2 options.  */
2831   for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2832     {
2833       if ((isa2 & isa2_opts[i].mask) != 0)
2834 	{
2835 	  opts[num++][0] = isa2_opts[i].option;
2836 	  isa2 &= ~ isa2_opts[i].mask;
2837 	}
2838     }
2839 
2840   if (isa2 && add_nl_p)
2841     {
2842       opts[num++][0] = isa2_other;
2843       sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2844     }
2845 
2846   /* Pick out the options in isa options.  */
2847   for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2848     {
2849       if ((isa & isa_opts[i].mask) != 0)
2850 	{
2851 	  opts[num++][0] = isa_opts[i].option;
2852 	  isa &= ~ isa_opts[i].mask;
2853 	}
2854     }
2855 
2856   if (isa && add_nl_p)
2857     {
2858       opts[num++][0] = isa_other;
2859       sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2860     }
2861 
2862   /* Add flag options.  */
2863   for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2864     {
2865       if ((flags & flag_opts[i].mask) != 0)
2866 	{
2867 	  opts[num++][0] = flag_opts[i].option;
2868 	  flags &= ~ flag_opts[i].mask;
2869 	}
2870     }
2871 
2872   if (flags && add_nl_p)
2873     {
2874       opts[num++][0] = flags_other;
2875       sprintf (flags_other, "(other flags: %#x)", flags);
2876     }
2877 
2878   /* Add additional flag options.  */
2879   for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2880     {
2881       if ((flags2 & flag2_opts[i].mask) != 0)
2882 	{
2883 	  opts[num++][0] = flag2_opts[i].option;
2884 	  flags2 &= ~ flag2_opts[i].mask;
2885 	}
2886     }
2887 
2888   if (flags2 && add_nl_p)
2889     {
2890       opts[num++][0] = flags2_other;
2891       sprintf (flags2_other, "(other flags2: %#x)", flags2);
2892     }
2893 
2894   /* Add -fpmath= option.  */
2895   if (fpmath)
2896     {
2897       opts[num][0] = "-mfpmath=";
2898       switch ((int) fpmath)
2899 	{
2900 	case FPMATH_387:
2901 	  opts[num++][1] = "387";
2902 	  break;
2903 
2904 	case FPMATH_SSE:
2905 	  opts[num++][1] = "sse";
2906 	  break;
2907 
2908 	case FPMATH_387 | FPMATH_SSE:
2909 	  opts[num++][1] = "sse+387";
2910 	  break;
2911 
2912 	default:
2913 	  gcc_unreachable ();
2914 	}
2915     }
2916 
2917   /* Any options?  */
2918   if (num == 0)
2919     return NULL;
2920 
2921   gcc_assert (num < ARRAY_SIZE (opts));
2922 
2923   /* Size the string.  */
2924   len = 0;
2925   sep_len = (add_nl_p) ? 3 : 1;
2926   for (i = 0; i < num; i++)
2927     {
2928       len += sep_len;
2929       for (j = 0; j < 2; j++)
2930 	if (opts[i][j])
2931 	  len += strlen (opts[i][j]);
2932     }
2933 
2934   /* Build the string.  */
2935   ret = ptr = (char *) xmalloc (len);
2936   line_len = 0;
2937 
2938   for (i = 0; i < num; i++)
2939     {
2940       size_t len2[2];
2941 
2942       for (j = 0; j < 2; j++)
2943 	len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2944 
2945       if (i != 0)
2946 	{
2947 	  *ptr++ = ' ';
2948 	  line_len++;
2949 
2950 	  if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2951 	    {
2952 	      *ptr++ = '\\';
2953 	      *ptr++ = '\n';
2954 	      line_len = 0;
2955 	    }
2956 	}
2957 
2958       for (j = 0; j < 2; j++)
2959 	if (opts[i][j])
2960 	  {
2961 	    memcpy (ptr, opts[i][j], len2[j]);
2962 	    ptr += len2[j];
2963 	    line_len += len2[j];
2964 	  }
2965     }
2966 
2967   *ptr = '\0';
2968   gcc_assert (ret + len >= ptr);
2969 
2970   return ret;
2971 }
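
/* A rough example of the output: for a 64-bit compilation with
   -march=x86-64 -msse4.2 the returned string looks something like
       "-march=x86-64 -mtune=generic -m64 -msse4.2 -msse4.1 ... -mfpmath=sse"
   i.e. -march/-mtune first, then the ABI selection, then ISA options from
   the most to the least inclusive, then flag options and the fpmath
   setting, with '\'-newlines inserted when ADD_NL_P is true and a line
   grows past ~70 columns.  */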
2972 
2973 /* Return true if profiling code should be emitted before the
2974    prologue, false otherwise.
2975    Note: for x86 with "hotfix" it is sorried, i.e. not supported.  */
2976 static bool
2977 ix86_profile_before_prologue (void)
2978 {
2979   return flag_fentry != 0;
2980 }
2981 
2982 /* Function that is callable from the debugger to print the current
2983    options.  */
2984 void ATTRIBUTE_UNUSED
2985 ix86_debug_options (void)
2986 {
2987   char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
2988 				   target_flags, ix86_target_flags,
2989 				   ix86_arch_string, ix86_tune_string,
2990 				   ix86_fpmath, true);
2991 
2992   if (opts)
2993     {
2994       fprintf (stderr, "%s\n\n", opts);
2995       free (opts);
2996     }
2997   else
2998     fputs ("<no options>\n\n", stderr);
2999 
3000   return;
3001 }
3002 
3003 /* Return true if T is one of the bytes we should avoid with
3004    -mmitigate-rop.  */
3005 
3006 static bool
3007 ix86_rop_should_change_byte_p (int t)
3008 {
3009   return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3010 }
3011 
3012 static const char *stringop_alg_names[] = {
3013 #define DEF_ENUM
3014 #define DEF_ALG(alg, name) #name,
3015 #include "stringop.def"
3016 #undef DEF_ENUM
3017 #undef DEF_ALG
3018 };
3019 
3020 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3021    The string is of the following form (or comma separated list of it):
3022 
3023      strategy_alg:max_size:[align|noalign]
3024 
3025    where the full size range for the strategy is either [0, max_size] or
3026    [min_size, max_size], in which min_size is the max_size + 1 of the
3027    preceding range.  The last size range must have max_size == -1.
3028 
3029    Examples:
3030 
3031    1.
3032       -mmemcpy-strategy=libcall:-1:noalign
3033 
3034       This is equivalent to (for known size memcpy) -mstringop-strategy=libcall.
3035 
3036 
3037    2.
3038       -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3039 
3040       This is to tell the compiler to use the following strategy for memset
3041       1) when the expected size is between [1, 16], use rep_8byte strategy;
3042       2) when the size is between [17, 2048], use vector_loop;
3043       3) when the size is > 2048, use libcall.  */
3044 
3045 struct stringop_size_range
3046 {
3047   int max;
3048   stringop_alg alg;
3049   bool noalign;
3050 };
3051 
3052 static void
3053 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3054 {
3055   const struct stringop_algs *default_algs;
3056   stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3057   char *curr_range_str, *next_range_str;
3058   const char *opt = is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=";
3059   int i = 0, n = 0;
3060 
3061   if (is_memset)
3062     default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3063   else
3064     default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3065 
3066   curr_range_str = strategy_str;
3067 
3068   do
3069     {
3070       int maxs;
3071       char alg_name[128];
3072       char align[16];
3073       next_range_str = strchr (curr_range_str, ',');
3074       if (next_range_str)
3075         *next_range_str++ = '\0';
3076 
3077       if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3078 		  align) != 3)
3079         {
3080 	  error ("wrong argument %qs to option %qs", curr_range_str, opt);
3081           return;
3082         }
3083 
3084       if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3085         {
3086 	  error ("size ranges of option %qs should be increasing", opt);
3087           return;
3088         }
3089 
3090       for (i = 0; i < last_alg; i++)
3091 	if (!strcmp (alg_name, stringop_alg_names[i]))
3092 	  break;
3093 
3094       if (i == last_alg)
3095         {
3096 	  error ("wrong strategy name %qs specified for option %qs",
3097 		 alg_name, opt);
3098 
3099 	  auto_vec <const char *> candidates;
3100 	  for (i = 0; i < last_alg; i++)
3101 	    if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3102 	      candidates.safe_push (stringop_alg_names[i]);
3103 
3104 	  char *s;
3105 	  const char *hint
3106 	    = candidates_list_and_hint (alg_name, s, candidates);
3107 	  if (hint)
3108 	    inform (input_location,
3109 		    "valid arguments to %qs are: %s; did you mean %qs?",
3110 		    opt, s, hint);
3111 	  else
3112 	    inform (input_location, "valid arguments to %qs are: %s",
3113 		    opt, s);
3114 	  XDELETEVEC (s);
3115           return;
3116         }
3117 
3118       if ((stringop_alg) i == rep_prefix_8_byte
3119 	  && !TARGET_64BIT)
3120 	{
3121 	  /* rep; movq isn't available in 32-bit code.  */
3122 	  error ("strategy name %qs specified for option %qs "
3123 		 "not supported for 32-bit code", alg_name, opt);
3124 	  return;
3125 	}
3126 
3127       input_ranges[n].max = maxs;
3128       input_ranges[n].alg = (stringop_alg) i;
3129       if (!strcmp (align, "align"))
3130         input_ranges[n].noalign = false;
3131       else if (!strcmp (align, "noalign"))
3132         input_ranges[n].noalign = true;
3133       else
3134         {
3135 	  error ("unknown alignment %qs specified for option %qs", align, opt);
3136           return;
3137         }
3138       n++;
3139       curr_range_str = next_range_str;
3140     }
3141   while (curr_range_str);
3142 
3143   if (input_ranges[n - 1].max != -1)
3144     {
3145       error ("the max value for the last size range should be -1"
3146              " for option %qs", opt);
3147       return;
3148     }
3149 
3150   if (n > MAX_STRINGOP_ALGS)
3151     {
3152       error ("too many size ranges specified in option %qs", opt);
3153       return;
3154     }
3155 
3156   /* Now override the default algs array.  */
3157   for (i = 0; i < n; i++)
3158     {
3159       *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3160       *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3161           = input_ranges[i].alg;
3162       *const_cast<int *>(&default_algs->size[i].noalign)
3163           = input_ranges[i].noalign;
3164     }
3165 }
3166 
3167 
3168 /* Parse the -mtune-ctrl= option.  When DUMP is true,
3169    print the features that are explicitly set.  */
3170 
3171 static void
3172 parse_mtune_ctrl_str (bool dump)
3173 {
3174   if (!ix86_tune_ctrl_string)
3175     return;
3176 
3177   char *next_feature_string = NULL;
3178   char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3179   char *orig = curr_feature_string;
3180   int i;
3181   do
3182     {
3183       bool clear = false;
3184 
3185       next_feature_string = strchr (curr_feature_string, ',');
3186       if (next_feature_string)
3187         *next_feature_string++ = '\0';
3188       if (*curr_feature_string == '^')
3189         {
3190           curr_feature_string++;
3191           clear = true;
3192         }
3193       for (i = 0; i < X86_TUNE_LAST; i++)
3194         {
3195           if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3196             {
3197               ix86_tune_features[i] = !clear;
3198               if (dump)
3199                 fprintf (stderr, "Explicitly %s feature %s\n",
3200                          clear ? "clear" : "set", ix86_tune_feature_names[i]);
3201               break;
3202             }
3203         }
3204       if (i == X86_TUNE_LAST)
3205         error ("unknown parameter to option -mtune-ctrl: %s",
3206                clear ? curr_feature_string - 1 : curr_feature_string);
3207       curr_feature_string = next_feature_string;
3208     }
3209   while (curr_feature_string);
3210   free (orig);
3211 }
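
/* Usage sketch (feature names are the strings from x86-tune.def): a value
   such as
       -mtune-ctrl=use_leave,^pad_returns
   would force the use_leave tuning on and pad_returns off, overriding
   whatever set_ix86_tune_features derived from the selected -mtune
   processor; an unknown name is diagnosed with an error.  */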
3212 
3213 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3214    processor type.  */
3215 
3216 static void
3217 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3218 {
3219   unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune;
3220   int i;
3221 
3222   for (i = 0; i < X86_TUNE_LAST; ++i)
3223     {
3224       if (ix86_tune_no_default)
3225         ix86_tune_features[i] = 0;
3226       else
3227 	ix86_tune_features[i]
3228 	  = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3229     }
3230 
3231   if (dump)
3232     {
3233       fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3234       for (i = 0; i < X86_TUNE_LAST; i++)
3235         fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3236                  ix86_tune_features[i] ? "on" : "off");
3237     }
3238 
3239   parse_mtune_ctrl_str (dump);
3240 }
3241 
3242 
3243 /* Default align_* from the processor table.  */
3244 
3245 static void
3246 ix86_default_align (struct gcc_options *opts)
3247 {
3248   if (opts->x_align_loops == 0)
3249     {
3250       opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3251       align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3252     }
3253   if (opts->x_align_jumps == 0)
3254     {
3255       opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3256       align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3257     }
3258   if (opts->x_align_functions == 0)
3259     {
3260       opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3261     }
3262 }
3263 
3264 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook.  */
3265 
3266 static void
3267 ix86_override_options_after_change (void)
3268 {
3269   ix86_default_align (&global_options);
3270 }
3271 
3272 /* Override various settings based on options.  If MAIN_ARGS_P, the
3273    options are from the command line, otherwise they are from
3274    attributes.  Return true if there's an error related to march
3275    option.  */
3276 
3277 static bool
3278 ix86_option_override_internal (bool main_args_p,
3279 			       struct gcc_options *opts,
3280 			       struct gcc_options *opts_set)
3281 {
3282   int i;
3283   unsigned HOST_WIDE_INT ix86_arch_mask;
3284   const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3285 
3286   const wide_int_bitmask PTA_3DNOW (HOST_WIDE_INT_1U << 0);
3287   const wide_int_bitmask PTA_3DNOW_A (HOST_WIDE_INT_1U << 1);
3288   const wide_int_bitmask PTA_64BIT (HOST_WIDE_INT_1U << 2);
3289   const wide_int_bitmask PTA_ABM (HOST_WIDE_INT_1U << 3);
3290   const wide_int_bitmask PTA_AES (HOST_WIDE_INT_1U << 4);
3291   const wide_int_bitmask PTA_AVX (HOST_WIDE_INT_1U << 5);
3292   const wide_int_bitmask PTA_BMI (HOST_WIDE_INT_1U << 6);
3293   const wide_int_bitmask PTA_CX16 (HOST_WIDE_INT_1U << 7);
3294   const wide_int_bitmask PTA_F16C (HOST_WIDE_INT_1U << 8);
3295   const wide_int_bitmask PTA_FMA (HOST_WIDE_INT_1U << 9);
3296   const wide_int_bitmask PTA_FMA4 (HOST_WIDE_INT_1U << 10);
3297   const wide_int_bitmask PTA_FSGSBASE (HOST_WIDE_INT_1U << 11);
3298   const wide_int_bitmask PTA_LWP (HOST_WIDE_INT_1U << 12);
3299   const wide_int_bitmask PTA_LZCNT (HOST_WIDE_INT_1U << 13);
3300   const wide_int_bitmask PTA_MMX (HOST_WIDE_INT_1U << 14);
3301   const wide_int_bitmask PTA_MOVBE (HOST_WIDE_INT_1U << 15);
3302   const wide_int_bitmask PTA_NO_SAHF (HOST_WIDE_INT_1U << 16);
3303   const wide_int_bitmask PTA_PCLMUL (HOST_WIDE_INT_1U << 17);
3304   const wide_int_bitmask PTA_POPCNT (HOST_WIDE_INT_1U << 18);
3305   const wide_int_bitmask PTA_PREFETCH_SSE (HOST_WIDE_INT_1U << 19);
3306   const wide_int_bitmask PTA_RDRND (HOST_WIDE_INT_1U << 20);
3307   const wide_int_bitmask PTA_SSE (HOST_WIDE_INT_1U << 21);
3308   const wide_int_bitmask PTA_SSE2 (HOST_WIDE_INT_1U << 22);
3309   const wide_int_bitmask PTA_SSE3 (HOST_WIDE_INT_1U << 23);
3310   const wide_int_bitmask PTA_SSE4_1 (HOST_WIDE_INT_1U << 24);
3311   const wide_int_bitmask PTA_SSE4_2 (HOST_WIDE_INT_1U << 25);
3312   const wide_int_bitmask PTA_SSE4A (HOST_WIDE_INT_1U << 26);
3313   const wide_int_bitmask PTA_SSSE3 (HOST_WIDE_INT_1U << 27);
3314   const wide_int_bitmask PTA_TBM (HOST_WIDE_INT_1U << 28);
3315   const wide_int_bitmask PTA_XOP (HOST_WIDE_INT_1U << 29);
3316   const wide_int_bitmask PTA_AVX2 (HOST_WIDE_INT_1U << 30);
3317   const wide_int_bitmask PTA_BMI2 (HOST_WIDE_INT_1U << 31);
3318   const wide_int_bitmask PTA_RTM (HOST_WIDE_INT_1U << 32);
3319   const wide_int_bitmask PTA_HLE (HOST_WIDE_INT_1U << 33);
3320   const wide_int_bitmask PTA_PRFCHW (HOST_WIDE_INT_1U << 34);
3321   const wide_int_bitmask PTA_RDSEED (HOST_WIDE_INT_1U << 35);
3322   const wide_int_bitmask PTA_ADX (HOST_WIDE_INT_1U << 36);
3323   const wide_int_bitmask PTA_FXSR (HOST_WIDE_INT_1U << 37);
3324   const wide_int_bitmask PTA_XSAVE (HOST_WIDE_INT_1U << 38);
3325   const wide_int_bitmask PTA_XSAVEOPT (HOST_WIDE_INT_1U << 39);
3326   const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40);
3327   const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41);
3328   const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42);
3329   const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43);
3330   const wide_int_bitmask PTA_MPX (HOST_WIDE_INT_1U << 44);
3331   const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45);
3332   const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46);
3333   const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47);
3334   const wide_int_bitmask PTA_XSAVEC (HOST_WIDE_INT_1U << 48);
3335   const wide_int_bitmask PTA_XSAVES (HOST_WIDE_INT_1U << 49);
3336   const wide_int_bitmask PTA_AVX512DQ (HOST_WIDE_INT_1U << 50);
3337   const wide_int_bitmask PTA_AVX512BW (HOST_WIDE_INT_1U << 51);
3338   const wide_int_bitmask PTA_AVX512VL (HOST_WIDE_INT_1U << 52);
3339   const wide_int_bitmask PTA_AVX512IFMA (HOST_WIDE_INT_1U << 53);
3340   const wide_int_bitmask PTA_AVX512VBMI (HOST_WIDE_INT_1U << 54);
3341   const wide_int_bitmask PTA_CLWB (HOST_WIDE_INT_1U << 55);
3342   const wide_int_bitmask PTA_MWAITX (HOST_WIDE_INT_1U << 56);
3343   const wide_int_bitmask PTA_CLZERO (HOST_WIDE_INT_1U << 57);
3344   const wide_int_bitmask PTA_NO_80387 (HOST_WIDE_INT_1U << 58);
3345   const wide_int_bitmask PTA_PKU (HOST_WIDE_INT_1U << 59);
3346   const wide_int_bitmask PTA_AVX5124VNNIW (HOST_WIDE_INT_1U << 60);
3347   const wide_int_bitmask PTA_AVX5124FMAPS (HOST_WIDE_INT_1U << 61);
3348   const wide_int_bitmask PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1U << 62);
3349   const wide_int_bitmask PTA_SGX (HOST_WIDE_INT_1U << 63);
3350   const wide_int_bitmask PTA_AVX512VNNI (0, HOST_WIDE_INT_1U);
3351   const wide_int_bitmask PTA_GFNI (0, HOST_WIDE_INT_1U << 1);
3352   const wide_int_bitmask PTA_VAES (0, HOST_WIDE_INT_1U << 2);
3353   const wide_int_bitmask PTA_AVX512VBMI2 (0, HOST_WIDE_INT_1U << 3);
3354   const wide_int_bitmask PTA_VPCLMULQDQ (0, HOST_WIDE_INT_1U << 4);
3355   const wide_int_bitmask PTA_AVX512BITALG (0, HOST_WIDE_INT_1U << 5);
3356   const wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6);
3357   const wide_int_bitmask PTA_PCONFIG (0, HOST_WIDE_INT_1U << 7);
3358   const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8);
3359 
3360   const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
3361     | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR;
3362   const wide_int_bitmask PTA_NEHALEM = PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2
3363     | PTA_POPCNT;
3364   const wide_int_bitmask PTA_WESTMERE = PTA_NEHALEM | PTA_AES | PTA_PCLMUL;
3365   const wide_int_bitmask PTA_SANDYBRIDGE = PTA_WESTMERE | PTA_AVX | PTA_XSAVE
3366     | PTA_XSAVEOPT;
3367   const wide_int_bitmask PTA_IVYBRIDGE = PTA_SANDYBRIDGE | PTA_FSGSBASE
3368     | PTA_RDRND | PTA_F16C;
3369   const wide_int_bitmask PTA_HASWELL = PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI
3370     | PTA_BMI2 | PTA_LZCNT | PTA_FMA | PTA_MOVBE | PTA_HLE;
3371   const wide_int_bitmask PTA_BROADWELL = PTA_HASWELL | PTA_ADX | PTA_PRFCHW
3372     | PTA_RDSEED;
3373   const wide_int_bitmask PTA_SKYLAKE = PTA_BROADWELL | PTA_CLFLUSHOPT
3374     | PTA_XSAVEC | PTA_XSAVES | PTA_SGX;
3375   const wide_int_bitmask PTA_SKYLAKE_AVX512 = PTA_SKYLAKE | PTA_AVX512F
3376     | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3377     | PTA_CLWB;
3378   const wide_int_bitmask PTA_CANNONLAKE = PTA_SKYLAKE | PTA_AVX512F
3379     | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3380     | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA;
3381   const wide_int_bitmask PTA_ICELAKE_CLIENT = PTA_CANNONLAKE | PTA_AVX512VNNI
3382     | PTA_GFNI | PTA_VAES | PTA_AVX512VBMI2 | PTA_VPCLMULQDQ | PTA_AVX512BITALG
3383     | PTA_RDPID | PTA_CLWB;
3384   const wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT | PTA_PCONFIG
3385     | PTA_WBNOINVD;
3386   const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER
3387     | PTA_AVX512F | PTA_AVX512CD;
3388   const wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
3389   const wide_int_bitmask PTA_SILVERMONT = PTA_WESTMERE | PTA_MOVBE | PTA_RDRND;
3390   const wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
3391     | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
3392 
3393   static struct pta
3394     {
3395       const char *const name;		/* processor name or nickname.  */
3396       const enum processor_type processor;
3397       const enum attr_cpu schedule;
3398       const wide_int_bitmask flags;
3399     }
3400   const processor_alias_table[] =
3401     {
3402       {"i386", PROCESSOR_I386, CPU_NONE, 0},
3403       {"i486", PROCESSOR_I486, CPU_NONE, 0},
3404       {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3405       {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3406       {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3407       {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3408       {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3409       {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3410       {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3411       {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3412       {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3413 	PTA_MMX | PTA_SSE | PTA_FXSR},
3414       {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3415         PTA_MMX | PTA_SSE | PTA_FXSR},
3416       {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3417         PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3418       {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3419         PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3420       {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3421       {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3422       {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3423       {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3424 	PTA_MMX | PTA_SSE | PTA_FXSR},
3425       {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3426 	PTA_MMX | PTA_SSE | PTA_FXSR},
3427       {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3428 	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3429       {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3430 	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3431       {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3432 	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3433       {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3434 	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3435       {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3436 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3437 	| PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3438       {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3439       {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3440       {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3441       {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3442       {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3443 	PTA_SANDYBRIDGE},
3444       {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3445 	PTA_SANDYBRIDGE},
3446       {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3447 	PTA_IVYBRIDGE},
3448       {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3449 	PTA_IVYBRIDGE},
3450       {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3451       {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3452       {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3453       {"skylake", PROCESSOR_SKYLAKE, CPU_HASWELL, PTA_SKYLAKE},
3454       {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3455         PTA_SKYLAKE_AVX512},
3456       {"cannonlake", PROCESSOR_CANNONLAKE, CPU_HASWELL, PTA_CANNONLAKE},
3457       {"icelake-client", PROCESSOR_ICELAKE_CLIENT, CPU_HASWELL,
3458 	PTA_ICELAKE_CLIENT},
3459       {"icelake-server", PROCESSOR_ICELAKE_SERVER, CPU_HASWELL,
3460 	PTA_ICELAKE_SERVER},
3461       {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3462       {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3463       {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3464       {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3465       {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3466       {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3467       {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3468       {"geode", PROCESSOR_GEODE, CPU_GEODE,
3469 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3470       {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3471       {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3472       {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3473       {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3474 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3475       {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3476 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3477       {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3478 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3479       {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3480 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3481       {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3482 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3483       {"x86-64", PROCESSOR_K8, CPU_K8,
3484 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3485       {"eden-x2", PROCESSOR_K8, CPU_K8,
3486         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3487       {"nano", PROCESSOR_K8, CPU_K8,
3488         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3489         | PTA_SSSE3 | PTA_FXSR},
3490       {"nano-1000", PROCESSOR_K8, CPU_K8,
3491         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3492         | PTA_SSSE3 | PTA_FXSR},
3493       {"nano-2000", PROCESSOR_K8, CPU_K8,
3494         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3495         | PTA_SSSE3 | PTA_FXSR},
3496       {"nano-3000", PROCESSOR_K8, CPU_K8,
3497         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3498         | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3499       {"nano-x2", PROCESSOR_K8, CPU_K8,
3500         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3501         | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3502       {"eden-x4", PROCESSOR_K8, CPU_K8,
3503         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3504         | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3505       {"nano-x4", PROCESSOR_K8, CPU_K8,
3506         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3507         | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3508       {"k8", PROCESSOR_K8, CPU_K8,
3509 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3510 	| PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3511       {"k8-sse3", PROCESSOR_K8, CPU_K8,
3512 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3513 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3514       {"opteron", PROCESSOR_K8, CPU_K8,
3515 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3516 	| PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3517       {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3518 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3519 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3520       {"athlon64", PROCESSOR_K8, CPU_K8,
3521 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3522 	| PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3523       {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3524 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3525 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3526       {"athlon-fx", PROCESSOR_K8, CPU_K8,
3527 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3528 	| PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3529       {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3530 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3531 	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3532       {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3533 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3534 	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3535       {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3536 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3537 	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3538 	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3539 	| PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3540       {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3541 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3542 	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3543 	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3544 	| PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3545 	| PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3546       {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3547 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3548 	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3549 	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3550 	| PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3551 	| PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3552 	| PTA_XSAVEOPT | PTA_FSGSBASE},
3553       {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3554 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3555 	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3556 	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3557 	| PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3558 	| PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3559 	| PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3560 	| PTA_MOVBE | PTA_MWAITX},
3561       {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3562 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3563 	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3564 	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3565 	| PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3566 	| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3567 	| PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3568 	| PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3569 	| PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3570       {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3571 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3572 	| PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3573 	| PTA_FXSR | PTA_XSAVE},
3574       {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3575 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3576 	| PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3577 	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3578 	| PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3579 	| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3580 
3581       {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3582 	PTA_64BIT
3583 	| PTA_HLE /* flags are only used for -march switch.  */ },
3584     };
3585 
3586   /* -mrecip options.  */
3587   static struct
3588     {
3589       const char *string;           /* option name */
3590       unsigned int mask;            /* mask bits to set */
3591     }
3592   const recip_options[] =
3593     {
3594       { "all",       RECIP_MASK_ALL },
3595       { "none",      RECIP_MASK_NONE },
3596       { "div",       RECIP_MASK_DIV },
3597       { "sqrt",      RECIP_MASK_SQRT },
3598       { "vec-div",   RECIP_MASK_VEC_DIV },
3599       { "vec-sqrt",  RECIP_MASK_VEC_SQRT },
3600     };
3601 
3602   int const pta_size = ARRAY_SIZE (processor_alias_table);
3603 
3604   /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3605      TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false.  */
3606   if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3607     opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3608 #ifdef TARGET_BI_ARCH
3609   else
3610     {
3611 #if TARGET_BI_ARCH == 1
3612       /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3613 	 is on and OPTION_MASK_ABI_X32 is off.  We turn off
3614 	 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3615 	 -mx32.  */
3616       if (TARGET_X32_P (opts->x_ix86_isa_flags))
3617 	opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3618 #else
3619       /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3620 	 on and OPTION_MASK_ABI_64 is off.  We turn off
3621 	 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3622 	 -m64 or OPTION_MASK_CODE16 is turned on by -m16.  */
3623       if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3624 	  || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3625 	opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3626 #endif
3627       if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3628 	  && TARGET_IAMCU_P (opts->x_target_flags))
3629 	sorry ("Intel MCU psABI isn%'t supported in %s mode",
3630 	       TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3631     }
3632 #endif
3633 
3634   if (TARGET_X32_P (opts->x_ix86_isa_flags))
3635     {
3636       /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3637 	 OPTION_MASK_ABI_64 for TARGET_X32.  */
3638       opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3639       opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3640     }
3641   else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3642     opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3643 				| OPTION_MASK_ABI_X32
3644 				| OPTION_MASK_ABI_64);
3645   else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3646     {
3647       /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3648 	 OPTION_MASK_ABI_X32 for TARGET_LP64.  */
3649       opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3650       opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3651     }
3652 
3653 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3654   SUBTARGET_OVERRIDE_OPTIONS;
3655 #endif
3656 
3657 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3658   SUBSUBTARGET_OVERRIDE_OPTIONS;
3659 #endif
3660 
3661   /* -fPIC is the default for x86_64.  */
3662   if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3663     opts->x_flag_pic = 2;
3664 
3665   /* Need to check -mtune=generic first.  */
3666   if (opts->x_ix86_tune_string)
3667     {
3668       /* As special support for cross compilers we read -mtune=native
3669 	 as -mtune=generic.  With native compilers we never see
3670 	 -mtune=native here, as it has already been rewritten by the driver.  */
3671       if (!strcmp (opts->x_ix86_tune_string, "native"))
3672 	{
3673 	  opts->x_ix86_tune_string = "generic";
3674 	}
3675       else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3676         warning (OPT_Wdeprecated,
3677 		 main_args_p
3678 		 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3679 		      "or %<-mtune=generic%> instead as appropriate")
3680 		 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3681 		      "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3682 		      " instead as appropriate"));
3683     }
3684   else
3685     {
3686       if (opts->x_ix86_arch_string)
3687 	opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3688       if (!opts->x_ix86_tune_string)
3689 	{
3690 	  opts->x_ix86_tune_string
3691 	    = processor_target_table[TARGET_CPU_DEFAULT].name;
3692 	  ix86_tune_defaulted = 1;
3693 	}
3694 
3695       /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3696 	 or defaulted.  We need to use a sensible tune option.  */
3697       if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3698 	{
3699 	  opts->x_ix86_tune_string = "generic";
3700 	}
3701     }
3702 
3703   if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3704       && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3705     {
3706       /* rep; movq isn't available in 32-bit code.  */
3707       error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3708       opts->x_ix86_stringop_alg = no_stringop;
3709     }
3710 
3711   if (!opts->x_ix86_arch_string)
3712     opts->x_ix86_arch_string
3713       = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3714 	? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3715   else
3716     ix86_arch_specified = 1;
3717 
3718   if (opts_set->x_ix86_pmode)
3719     {
3720       if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3721 	   && opts->x_ix86_pmode == PMODE_SI)
3722 	  || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3723 	       && opts->x_ix86_pmode == PMODE_DI))
3724 	error ("address mode %qs not supported in the %s bit mode",
3725 	       TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3726 	       TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3727     }
3728   else
3729     opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3730 			 ? PMODE_DI : PMODE_SI;
3731 
3732   if (!opts_set->x_ix86_abi)
3733     opts->x_ix86_abi = DEFAULT_ABI;
3734 
3735   if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3736     error ("-mabi=ms not supported with X32 ABI");
3737   gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3738 
3739   /* For targets using the MS ABI, enable ms-extensions unless it was
3740      explicitly turned off.  For a non-MS ABI we turn this option
3741      off.  */
3742   if (!opts_set->x_flag_ms_extensions)
3743     opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3744 
3745   if (opts_set->x_ix86_cmodel)
3746     {
3747       switch (opts->x_ix86_cmodel)
3748 	{
3749 	case CM_SMALL:
3750 	case CM_SMALL_PIC:
3751 	  if (opts->x_flag_pic)
3752 	    opts->x_ix86_cmodel = CM_SMALL_PIC;
3753 	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3754 	    error ("code model %qs not supported in the %s bit mode",
3755 		   "small", "32");
3756 	  break;
3757 
3758 	case CM_MEDIUM:
3759 	case CM_MEDIUM_PIC:
3760 	  if (opts->x_flag_pic)
3761 	    opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3762 	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3763 	    error ("code model %qs not supported in the %s bit mode",
3764 		   "medium", "32");
3765 	  else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3766 	    error ("code model %qs not supported in x32 mode",
3767 		   "medium");
3768 	  break;
3769 
3770 	case CM_LARGE:
3771 	case CM_LARGE_PIC:
3772 	  if (opts->x_flag_pic)
3773 	    opts->x_ix86_cmodel = CM_LARGE_PIC;
3774 	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3775 	    error ("code model %qs not supported in the %s bit mode",
3776 		   "large", "32");
3777 	  else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3778 	    error ("code model %qs not supported in x32 mode",
3779 		   "large");
3780 	  break;
3781 
3782 	case CM_32:
3783 	  if (opts->x_flag_pic)
3784 	    error ("code model %s does not support PIC mode", "32");
3785 	  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3786 	    error ("code model %qs not supported in the %s bit mode",
3787 		   "32", "64");
3788 	  break;
3789 
3790 	case CM_KERNEL:
3791 	  if (opts->x_flag_pic)
3792 	    {
3793 	      error ("code model %s does not support PIC mode", "kernel");
3794 	      opts->x_ix86_cmodel = CM_32;
3795 	    }
3796 	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3797 	    error ("code model %qs not supported in the %s bit mode",
3798 		   "kernel", "32");
3799 	  break;
3800 
3801 	default:
3802 	  gcc_unreachable ();
3803 	}
3804     }
3805   else
3806     {
3807       /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3808 	 use of rip-relative addressing.  This eliminates fixups that
3809 	 would otherwise be needed if this object is to be placed in a
3810 	 DLL, and is essentially just as efficient as direct addressing.  */
3811       if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3812 	  && (TARGET_RDOS || TARGET_PECOFF))
3813 	opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3814       else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3815 	opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3816       else
3817 	opts->x_ix86_cmodel = CM_32;
3818     }
3819   if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3820     {
3821       error ("-masm=intel not supported in this configuration");
3822       opts->x_ix86_asm_dialect = ASM_ATT;
3823     }
3824   if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3825       != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3826     sorry ("%i-bit mode not compiled in",
3827 	   (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3828 
3829   for (i = 0; i < pta_size; i++)
3830     if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3831       {
3832 	if (!strcmp (opts->x_ix86_arch_string, "generic"))
3833 	  {
3834 	    error (main_args_p
3835 		   ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3836 			"switch")
3837 		   : G_("%<generic%> CPU can be used only for "
3838 			"%<target(\"tune=\")%> attribute"));
3839 	    return false;
3840 	  }
3841 	else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3842 	  {
3843 	    error (main_args_p
3844 		   ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3845 			"switch")
3846 		   : G_("%<intel%> CPU can be used only for "
3847 			"%<target(\"tune=\")%> attribute"));
3848 	    return false;
3849 	  }
3850 
3851 	if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3852 	    && !((processor_alias_table[i].flags & PTA_64BIT) != 0))
3853 	  {
3854 	    error ("CPU you selected does not support x86-64 "
3855 		   "instruction set");
3856 	    return false;
3857 	  }
3858 
3859 	ix86_schedule = processor_alias_table[i].schedule;
3860 	ix86_arch = processor_alias_table[i].processor;
3861 	/* Default cpu tuning to the architecture.  */
3862 	ix86_tune = ix86_arch;
3863 
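	/* For the -march entry just matched, turn on each ISA implied by its
	   PTA flags, but only when that ISA was not set or cleared explicitly
	   by the user.  */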
3864 	if (((processor_alias_table[i].flags & PTA_MMX) != 0)
3865 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3866 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3867 	if (((processor_alias_table[i].flags & PTA_3DNOW) != 0)
3868 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3869 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3870 	if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0)
3871 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3872 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3873 	if (((processor_alias_table[i].flags & PTA_SSE) != 0)
3874 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3875 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3876 	if (((processor_alias_table[i].flags & PTA_SSE2) != 0)
3877 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3878 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3879 	if (((processor_alias_table[i].flags & PTA_SSE3) != 0)
3880 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3881 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3882 	if (((processor_alias_table[i].flags & PTA_SSSE3) != 0)
3883 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3884 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3885 	if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0)
3886 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3887 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3888 	if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0)
3889 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3890 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3891 	if (((processor_alias_table[i].flags & PTA_AVX) != 0)
3892 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3893 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3894 	if (((processor_alias_table[i].flags & PTA_AVX2) != 0)
3895 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3896 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3897 	if (((processor_alias_table[i].flags & PTA_FMA) != 0)
3898 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3899 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3900 	if (((processor_alias_table[i].flags & PTA_SSE4A) != 0)
3901 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3902 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3903 	if (((processor_alias_table[i].flags & PTA_FMA4) != 0)
3904 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3905 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3906 	if (((processor_alias_table[i].flags & PTA_XOP) != 0)
3907 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3908 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3909 	if (((processor_alias_table[i].flags & PTA_LWP) != 0)
3910 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3911 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3912 	if (((processor_alias_table[i].flags & PTA_ABM) != 0)
3913 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3914 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3915 	if (((processor_alias_table[i].flags & PTA_BMI) != 0)
3916 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3917 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
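	/* PTA_ABM also implies the LZCNT and POPCNT instructions, hence the
	   combined masks here and in the PTA_POPCNT test below.  */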
3918 	if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0)
3919 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3920 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3921 	if (((processor_alias_table[i].flags & PTA_TBM) != 0)
3922 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3923 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3924 	if (((processor_alias_table[i].flags & PTA_BMI2) != 0)
3925 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3926 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3927 	if (((processor_alias_table[i].flags & PTA_CX16) != 0)
3928 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
3929 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
3930 	if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0)
3931 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3932 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3933 	if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3934 	    && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0))
3935 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3936 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3937 	if (((processor_alias_table[i].flags & PTA_MOVBE) != 0)
3938 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
3939 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
3940 	if (((processor_alias_table[i].flags & PTA_AES) != 0)
3941 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3942 	  ix86_isa_flags |= OPTION_MASK_ISA_AES;
3943 	if (((processor_alias_table[i].flags & PTA_SHA) != 0)
3944 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
3945 	  ix86_isa_flags |= OPTION_MASK_ISA_SHA;
3946 	if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0)
3947 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3948 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3949 	if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0)
3950 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3951 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3952 	if (((processor_alias_table[i].flags & PTA_RDRND) != 0)
3953 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3954 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3955 	if (((processor_alias_table[i].flags & PTA_F16C) != 0)
3956 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3957 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3958 	if (((processor_alias_table[i].flags & PTA_RTM) != 0)
3959 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3960 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3961 	if (((processor_alias_table[i].flags & PTA_HLE) != 0)
3962 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
3963 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
3964 	if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0)
3965 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3966 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3967 	if (((processor_alias_table[i].flags & PTA_RDSEED) != 0)
3968 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3969 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3970 	if (((processor_alias_table[i].flags & PTA_ADX) != 0)
3971 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3972 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3973 	if (((processor_alias_table[i].flags & PTA_FXSR) != 0)
3974 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3975 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3976 	if (((processor_alias_table[i].flags & PTA_XSAVE) != 0)
3977 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3978 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3979 	if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0)
3980 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3981 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3982 	if (((processor_alias_table[i].flags & PTA_AVX512F) != 0)
3983 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3984 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3985 	if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0)
3986 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3987 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3988 	if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0)
3989 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3990 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3991 	if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0)
3992 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3993 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3994 	if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0)
3995 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
3996 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
3997 	if (((processor_alias_table[i].flags & PTA_CLWB) != 0)
3998 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
3999 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4000 	if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0)
4001 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4002 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4003 	if (((processor_alias_table[i].flags & PTA_CLZERO) != 0)
4004 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4005 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4006 	if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0)
4007 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4008 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4009 	if (((processor_alias_table[i].flags & PTA_XSAVES) != 0)
4010 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4011 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4012 	if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0)
4013 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4014 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4015 	if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0)
4016 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4017 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4018 	if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0)
4019 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4020 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4021 	if (((processor_alias_table[i].flags & PTA_MPX) != 0)
4022             && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4023           opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4024 	if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0)
4025 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4026 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4027 	if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0)
4028 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4029 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4030 	if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0)
4031 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI))
4032 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI;
4033 	if (((processor_alias_table[i].flags & PTA_GFNI) != 0)
4034 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI))
4035 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI;
4036 	if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0)
4037 	    && !(opts->x_ix86_isa_flags_explicit
4038 	    & OPTION_MASK_ISA_AVX512VBMI2))
4039 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2;
4040 	if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0)
4041 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ))
4042 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ;
4043 	if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0)
4044 	    && !(opts->x_ix86_isa_flags_explicit
4045 	    & OPTION_MASK_ISA_AVX512BITALG))
4046 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
4047 
4048 	if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
4049 	    && !(opts->x_ix86_isa_flags2_explicit
4050 		 & OPTION_MASK_ISA_AVX5124VNNIW))
4051 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4052 	if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0)
4053 	    && !(opts->x_ix86_isa_flags2_explicit
4054 		 & OPTION_MASK_ISA_AVX5124FMAPS))
4055 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4056 	if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0)
4057 	    && !(opts->x_ix86_isa_flags_explicit
4058 		 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4059 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4060 	if (((processor_alias_table[i].flags & PTA_SGX) != 0)
4061 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4062 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4063 	if (((processor_alias_table[i].flags & PTA_VAES) != 0)
4064 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES))
4065 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES;
4066 	if (((processor_alias_table[i].flags & PTA_RDPID) != 0)
4067 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID))
4068 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID;
4069 	if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0)
4070 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG))
4071 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG;
4072 	if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0)
4073 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD))
4074 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD;
4075 
4076 	if ((processor_alias_table[i].flags
4077 	   & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)
4078 	  x86_prefetch_sse = true;
4079 	if (((processor_alias_table[i].flags & PTA_MWAITX) != 0)
4080 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4081 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4082 	if (((processor_alias_table[i].flags & PTA_PKU) != 0)
4083 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4084 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4085 
4086 	/* Don't enable x87 instructions if only
4087 	   general registers are allowed.  */
4088 	if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4089 	    && !(opts_set->x_target_flags & MASK_80387))
4090 	  {
4091 	    if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
4092 	      opts->x_target_flags &= ~MASK_80387;
4093 	    else
4094 	      opts->x_target_flags |= MASK_80387;
4095 	  }
4096 	break;
4097       }
4098 
4099   if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4100     error ("Intel MPX does not support x32");
4101 
4102   if (TARGET_X32 && (ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4103     error ("Intel MPX does not support x32");
4104 
4105   if (i == pta_size)
4106     {
4107       error (main_args_p
4108 	     ? G_("bad value (%qs) for %<-march=%> switch")
4109 	     : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4110 	     opts->x_ix86_arch_string);
4111 
4112       auto_vec <const char *> candidates;
4113       for (i = 0; i < pta_size; i++)
4114 	if (strcmp (processor_alias_table[i].name, "generic")
4115 	    && strcmp (processor_alias_table[i].name, "intel")
4116 	    && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4117 		|| ((processor_alias_table[i].flags & PTA_64BIT) != 0)))
4118 	  candidates.safe_push (processor_alias_table[i].name);
4119 
4120 #ifdef HAVE_LOCAL_CPU_DETECT
4121       /* Also add "native" as a possible value.  */
4122       candidates.safe_push ("native");
4123 #endif
4124 
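      /* candidates_list_and_hint fills S with the candidate list (freed
	 below) and returns the closest match, if any.  */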
4125       char *s;
4126       const char *hint
4127 	= candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4128       if (hint)
4129 	inform (input_location,
4130 		main_args_p
4131 		? G_("valid arguments to %<-march=%> switch are: "
4132 		     "%s; did you mean %qs?")
4133 		: G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4134 		     "%s; did you mean %qs?"), s, hint);
4135       else
4136 	inform (input_location,
4137 		main_args_p
4138 		? G_("valid arguments to %<-march=%> switch are: %s")
4139 		: G_("valid arguments to %<target(\"arch=\")%> attribute "
4140 		     "are: %s"), s);
4141       XDELETEVEC (s);
4142     }
4143 
4144   ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
4145   for (i = 0; i < X86_ARCH_LAST; ++i)
4146     ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4147 
4148   for (i = 0; i < pta_size; i++)
4149     if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4150       {
4151 	ix86_schedule = processor_alias_table[i].schedule;
4152 	ix86_tune = processor_alias_table[i].processor;
4153 	if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4154 	  {
4155 	    if (!((processor_alias_table[i].flags & PTA_64BIT) != 0))
4156 	      {
4157 		if (ix86_tune_defaulted)
4158 		  {
4159 		    opts->x_ix86_tune_string = "x86-64";
4160 		    for (i = 0; i < pta_size; i++)
4161 		      if (! strcmp (opts->x_ix86_tune_string,
4162 				    processor_alias_table[i].name))
4163 			break;
4164 		    ix86_schedule = processor_alias_table[i].schedule;
4165 		    ix86_tune = processor_alias_table[i].processor;
4166 		  }
4167 		else
4168 		  error ("CPU you selected does not support x86-64 "
4169 			 "instruction set");
4170 	      }
4171 	  }
4172 	/* Intel CPUs have always interpreted SSE prefetch instructions as
4173 	   NOPs; so, we can enable SSE prefetch instructions even when
4174 	   -mtune (rather than -march) points us to a processor that has them.
4175 	   However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4176 	   higher processors.  */
4177 	if (TARGET_CMOV
4178 	    && ((processor_alias_table[i].flags
4179 	      & (PTA_PREFETCH_SSE | PTA_SSE)) != 0))
4180 	  x86_prefetch_sse = true;
4181 	break;
4182       }
4183 
4184   if (ix86_tune_specified && i == pta_size)
4185     {
4186       error (main_args_p
4187 	     ? G_("bad value (%qs) for %<-mtune=%> switch")
4188 	     : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4189 	     opts->x_ix86_tune_string);
4190 
4191       auto_vec <const char *> candidates;
4192       for (i = 0; i < pta_size; i++)
4193 	if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4194 	    || ((processor_alias_table[i].flags & PTA_64BIT) != 0))
4195 	  candidates.safe_push (processor_alias_table[i].name);
4196 
4197 #ifdef HAVE_LOCAL_CPU_DETECT
4198       /* Also add "native" as a possible value.  */
4199       candidates.safe_push ("native");
4200 #endif
4201 
4202       char *s;
4203       const char *hint
4204 	= candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4205       if (hint)
4206 	inform (input_location,
4207 		main_args_p
4208 		? G_("valid arguments to %<-mtune=%> switch are: "
4209 		     "%s; did you mean %qs?")
4210 		: G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4211 		     "%s; did you mean %qs?"), s, hint);
4212       else
4213 	inform (input_location,
4214 		main_args_p
4215 		? G_("valid arguments to %<-mtune=%> switch are: %s")
4216 		: G_("valid arguments to %<target(\"tune=\")%> attribute "
4217 		     "are: %s"), s);
4218       XDELETEVEC (s);
4219     }
4220 
4221   set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4222 
4223 #ifndef USE_IX86_FRAME_POINTER
4224 #define USE_IX86_FRAME_POINTER 0
4225 #endif
4226 
4227 #ifndef USE_X86_64_FRAME_POINTER
4228 #define USE_X86_64_FRAME_POINTER 0
4229 #endif
4230 
4231   /* Set the default values for switches whose default depends on TARGET_64BIT
4232      in case they weren't overwritten by command line options.  */
4233   if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4234     {
4235       if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4236 	opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4237       if (opts->x_flag_asynchronous_unwind_tables
4238 	  && !opts_set->x_flag_unwind_tables
4239 	  && TARGET_64BIT_MS_ABI)
4240 	opts->x_flag_unwind_tables = 1;
4241       if (opts->x_flag_asynchronous_unwind_tables == 2)
4242 	opts->x_flag_unwind_tables
4243 	  = opts->x_flag_asynchronous_unwind_tables = 1;
4244       if (opts->x_flag_pcc_struct_return == 2)
4245 	opts->x_flag_pcc_struct_return = 0;
4246     }
4247   else
4248     {
4249       if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4250 	opts->x_flag_omit_frame_pointer
4251 	  = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4252       if (opts->x_flag_asynchronous_unwind_tables == 2)
4253 	opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4254       if (opts->x_flag_pcc_struct_return == 2)
4255 	{
4256 	  /* Intel MCU psABI specifies that -freg-struct-return should
4257 	     be on.  Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4258 	     we check -miamcu so that -freg-struct-return is always
4259 	     turned on if -miamcu is used.  */
4260 	  if (TARGET_IAMCU_P (opts->x_target_flags))
4261 	    opts->x_flag_pcc_struct_return = 0;
4262 	  else
4263 	    opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4264 	}
4265     }
4266 
4267   ix86_tune_cost = processor_target_table[ix86_tune].cost;
4268   /* TODO: ix86_cost should be chosen at instruction or function granularity
4269      so that for cold code we use size_cost even in !optimize_size compilation.  */
4270   if (opts->x_optimize_size)
4271     ix86_cost = &ix86_size_cost;
4272   else
4273     ix86_cost = ix86_tune_cost;
4274 
4275   /* Arrange to set up i386_stack_locals for all functions.  */
4276   init_machine_status = ix86_init_machine_status;
4277 
4278   /* Validate -mregparm= value.  */
4279   if (opts_set->x_ix86_regparm)
4280     {
4281       if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4282 	warning (0, "-mregparm is ignored in 64-bit mode");
4283       else if (TARGET_IAMCU_P (opts->x_target_flags))
4284 	warning (0, "-mregparm is ignored for Intel MCU psABI");
4285       if (opts->x_ix86_regparm > REGPARM_MAX)
4286 	{
4287 	  error ("-mregparm=%d is not between 0 and %d",
4288 		 opts->x_ix86_regparm, REGPARM_MAX);
4289 	  opts->x_ix86_regparm = 0;
4290 	}
4291     }
4292   if (TARGET_IAMCU_P (opts->x_target_flags)
4293       || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4294     opts->x_ix86_regparm = REGPARM_MAX;
4295 
4296   /* Default align_* from the processor table.  */
4297   ix86_default_align (opts);
4298 
4299   /* Provide default for -mbranch-cost= value.  */
4300   if (!opts_set->x_ix86_branch_cost)
4301     opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4302 
4303   if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4304     {
4305       opts->x_target_flags
4306 	|= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4307 
4308       /* Enable by default the SSE and MMX builtins.  Do allow the user to
4309 	 explicitly disable any of these.  In particular, disabling SSE and
4310 	 MMX for kernel code is extremely useful.  */
4311       if (!ix86_arch_specified)
4312 	opts->x_ix86_isa_flags
4313 	  |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4314 	       | TARGET_SUBTARGET64_ISA_DEFAULT)
4315 	      & ~opts->x_ix86_isa_flags_explicit);
4316 
4317       if (TARGET_RTD_P (opts->x_target_flags))
4318 	warning (0,
4319 		 main_args_p
4320 		 ? G_("%<-mrtd%> is ignored in 64bit mode")
4321 		 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4322     }
4323   else
4324     {
4325       opts->x_target_flags
4326 	|= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4327 
4328       if (!ix86_arch_specified)
4329         opts->x_ix86_isa_flags
4330 	  |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4331 
4332       /* The i386 ABI does not specify a red zone.  It still makes sense to use
4333          one when the programmer takes care to keep the stack from being destroyed.  */
4334       if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4335         opts->x_target_flags |= MASK_NO_RED_ZONE;
4336     }
4337 
4338   /* Keep nonleaf frame pointers.  */
4339   if (opts->x_flag_omit_frame_pointer)
4340     opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4341   else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4342     opts->x_flag_omit_frame_pointer = 1;
4343 
4344   /* If we're doing fast math, we don't care about comparison order
4345      wrt NaNs.  This lets us use a shorter comparison sequence.  */
4346   if (opts->x_flag_finite_math_only)
4347     opts->x_target_flags &= ~MASK_IEEE_FP;
4348 
4349   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4350      since the insns won't need emulation.  */
4351   if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4352     opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4353 
4354   /* Likewise, if the target doesn't have a 387, or we've specified
4355      software floating point, don't use 387 inline intrinsics.  */
4356   if (!TARGET_80387_P (opts->x_target_flags))
4357     opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4358 
4359   /* Turn on MMX builtins for -msse.  */
4360   if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4361     opts->x_ix86_isa_flags
4362       |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4363 
4364   /* Enable SSE prefetch.  */
4365   if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4366       || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4367 	  && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4368       || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4369     x86_prefetch_sse = true;
4370 
4371   /* Enable popcnt instruction for -msse4.2 or -mabm.  */
4372   if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4373       || TARGET_ABM_P (opts->x_ix86_isa_flags))
4374     opts->x_ix86_isa_flags
4375       |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4376 
4377   /* Enable lzcnt instruction for -mabm.  */
4378   if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4379     opts->x_ix86_isa_flags
4380       |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4381 
4382   /* Disable BMI, BMI2 and TBM instructions for -m16.  */
4383   if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4384     opts->x_ix86_isa_flags
4385       &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4386 	   & ~opts->x_ix86_isa_flags_explicit);
4387 
4388   /* Validate -mpreferred-stack-boundary= value or default it to
4389      PREFERRED_STACK_BOUNDARY_DEFAULT.  */
4390   ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4391   if (opts_set->x_ix86_preferred_stack_boundary_arg)
4392     {
4393       int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4394       int max = TARGET_SEH ? 4 : 12;
4395 
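      /* The argument is the log2 of the boundary in bytes, e.g. a value of 4
	 requests 2^4 = 16-byte (128-bit) alignment; see the conversion to
	 bits below.  */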
4396       if (opts->x_ix86_preferred_stack_boundary_arg < min
4397 	  || opts->x_ix86_preferred_stack_boundary_arg > max)
4398 	{
4399 	  if (min == max)
4400 	    error ("-mpreferred-stack-boundary is not supported "
4401 		   "for this target");
4402 	  else
4403 	    error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4404 		   opts->x_ix86_preferred_stack_boundary_arg, min, max);
4405 	}
4406       else
4407 	ix86_preferred_stack_boundary
4408 	  = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
4409     }
4410 
4411   /* Set the default value for -mstackrealign.  */
4412   if (!opts_set->x_ix86_force_align_arg_pointer)
4413     opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4414 
4415   ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4416 
4417   /* Validate -mincoming-stack-boundary= value or default it to
4418      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
4419   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4420   if (opts_set->x_ix86_incoming_stack_boundary_arg)
4421     {
4422       int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4423 
4424       if (opts->x_ix86_incoming_stack_boundary_arg < min
4425 	  || opts->x_ix86_incoming_stack_boundary_arg > 12)
4426 	error ("-mincoming-stack-boundary=%d is not between %d and 12",
4427 	       opts->x_ix86_incoming_stack_boundary_arg, min);
4428       else
4429 	{
4430 	  ix86_user_incoming_stack_boundary
4431 	    = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4432 	  ix86_incoming_stack_boundary
4433 	    = ix86_user_incoming_stack_boundary;
4434 	}
4435     }
4436 
4437 #ifndef NO_PROFILE_COUNTERS
4438   if (flag_nop_mcount)
4439     error ("-mnop-mcount is not compatible with this target");
4440 #endif
4441   if (flag_nop_mcount && flag_pic)
4442     error ("-mnop-mcount is not implemented for -fPIC");
4443 
4444   /* Accept -msseregparm only if at least SSE support is enabled.  */
4445   if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4446       && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4447     error (main_args_p
4448 	   ? G_("%<-msseregparm%> used without SSE enabled")
4449 	   : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4450 
4451   if (opts_set->x_ix86_fpmath)
4452     {
4453       if (opts->x_ix86_fpmath & FPMATH_SSE)
4454 	{
4455 	  if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4456 	    {
4457 	      if (TARGET_80387_P (opts->x_target_flags))
4458 		{
4459 		  warning (0, "SSE instruction set disabled, using 387 arithmetics");
4460 		  opts->x_ix86_fpmath = FPMATH_387;
4461 		}
4462 	    }
4463 	  else if ((opts->x_ix86_fpmath & FPMATH_387)
4464 		   && !TARGET_80387_P (opts->x_target_flags))
4465 	    {
4466 	      warning (0, "387 instruction set disabled, using SSE arithmetics");
4467 	      opts->x_ix86_fpmath = FPMATH_SSE;
4468 	    }
4469 	}
4470     }
4471   /* For all chips supporting SSE2, -mfpmath=sse performs better than
4472      fpmath=387.  The latter is however the default on many targets since the
4473      extra 80-bit precision of temporaries is considered to be part of the ABI.
4474      Overwrite the default at least for -ffast-math.
4475      TODO: -mfpmath=both seems to produce similarly performing code with
4476      slightly smaller binaries.  It is however not clear if register allocation
4477      is ready for this setting.
4478      Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4479      codegen.  We may switch to 387 with -ffast-math for size optimized
4480      functions. */
4481   else if (fast_math_flags_set_p (&global_options)
4482 	   && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4483     opts->x_ix86_fpmath = FPMATH_SSE;
4484   else
4485     opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4486 
4487   /* Use external vectorized library in vectorizing intrinsics.  */
4488   if (opts_set->x_ix86_veclibabi_type)
4489     switch (opts->x_ix86_veclibabi_type)
4490       {
4491       case ix86_veclibabi_type_svml:
4492 	ix86_veclib_handler = ix86_veclibabi_svml;
4493 	break;
4494 
4495       case ix86_veclibabi_type_acml:
4496 	ix86_veclib_handler = ix86_veclibabi_acml;
4497 	break;
4498 
4499       default:
4500 	gcc_unreachable ();
4501       }
4502 
4503   if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4504       && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4505     opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4506 
4507   /* If stack probes are required, the space used for large function
4508      arguments on the stack must also be probed, so enable
4509      -maccumulate-outgoing-args so this happens in the prologue.  */
4510   if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4511       && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4512     {
4513       if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4514 	warning (0,
4515 		 main_args_p
4516 		 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4517 		      "for correctness")
4518 		 : G_("stack probing requires "
4519 		      "%<target(\"accumulate-outgoing-args\")%> for "
4520 		      "correctness"));
4521       opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4522     }
4523 
4524   /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4525      so enable -maccumulate-outgoing-args when %ebp is fixed.  */
4526   if (fixed_regs[BP_REG]
4527       && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4528     {
4529       if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4530 	warning (0,
4531 		 main_args_p
4532 		 ? G_("fixed ebp register requires "
4533 		      "%<-maccumulate-outgoing-args%>")
4534 		 : G_("fixed ebp register requires "
4535 		      "%<target(\"accumulate-outgoing-args\")%>"));
4536       opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4537     }
4538 
4539   /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
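  /* For example, with the common ELF definition of that macro the generated
     label is "*.LX0", so the prefix stored below becomes "*.L"; the exact
     string depends on the target's definition.  */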
4540   {
4541     char *p;
4542     ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4543     p = strchr (internal_label_prefix, 'X');
4544     internal_label_prefix_len = p - internal_label_prefix;
4545     *p = '\0';
4546   }
4547 
4548   /* When a scheduling description is not available, disable the scheduler pass
4549      so it won't slow down compilation or make x87 code slower.  */
4550   if (!TARGET_SCHEDULE)
4551     opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4552 
4553   maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4554 			 ix86_tune_cost->simultaneous_prefetches,
4555 			 opts->x_param_values,
4556 			 opts_set->x_param_values);
4557   maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4558 			 ix86_tune_cost->prefetch_block,
4559 			 opts->x_param_values,
4560 			 opts_set->x_param_values);
4561   maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4562 			 ix86_tune_cost->l1_cache_size,
4563 			 opts->x_param_values,
4564 			 opts_set->x_param_values);
4565   maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4566 			 ix86_tune_cost->l2_cache_size,
4567 			 opts->x_param_values,
4568 			 opts_set->x_param_values);
4569 
4570   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
4571   if (opts->x_flag_prefetch_loop_arrays < 0
4572       && HAVE_prefetch
4573       && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4574       && !opts->x_optimize_size
4575       && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4576     opts->x_flag_prefetch_loop_arrays = 1;
4577 
4578   /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4579      can be optimized to ap = __builtin_next_arg (0).  */
4580   if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4581     targetm.expand_builtin_va_start = NULL;
4582 
4583   if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4584     {
4585       ix86_gen_leave = gen_leave_rex64;
4586       if (Pmode == DImode)
4587 	{
4588 	  ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4589 	  ix86_gen_tls_local_dynamic_base_64
4590 	    = gen_tls_local_dynamic_base_64_di;
4591 	}
4592       else
4593 	{
4594 	  ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4595 	  ix86_gen_tls_local_dynamic_base_64
4596 	    = gen_tls_local_dynamic_base_64_si;
4597 	}
4598     }
4599   else
4600     ix86_gen_leave = gen_leave;
4601 
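  /* Note that under -mx32 Pmode is normally SImode, so even a 64-bit target
     can pick the SImode generators below.  */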
4602   if (Pmode == DImode)
4603     {
4604       ix86_gen_add3 = gen_adddi3;
4605       ix86_gen_sub3 = gen_subdi3;
4606       ix86_gen_sub3_carry = gen_subdi3_carry;
4607       ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4608       ix86_gen_andsp = gen_anddi3;
4609       ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4610       ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4611       ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4612       ix86_gen_monitor = gen_sse3_monitor_di;
4613       ix86_gen_monitorx = gen_monitorx_di;
4614       ix86_gen_clzero = gen_clzero_di;
4615     }
4616   else
4617     {
4618       ix86_gen_add3 = gen_addsi3;
4619       ix86_gen_sub3 = gen_subsi3;
4620       ix86_gen_sub3_carry = gen_subsi3_carry;
4621       ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4622       ix86_gen_andsp = gen_andsi3;
4623       ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4624       ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4625       ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4626       ix86_gen_monitor = gen_sse3_monitor_si;
4627       ix86_gen_monitorx = gen_monitorx_si;
4628       ix86_gen_clzero = gen_clzero_si;
4629     }
4630 
4631 #ifdef USE_IX86_CLD
4632   /* Use -mcld by default for 32-bit code if configured with --enable-cld.  */
4633   if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4634     opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4635 #endif
4636 
4637   /* Set the default value for -mfentry.  */
4638   if (!opts_set->x_flag_fentry)
4639     opts->x_flag_fentry = TARGET_SEH;
4640   else
4641     {
4642       if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4643 	  && opts->x_flag_fentry)
4644 	sorry ("-mfentry isn%'t supported for 32-bit in combination "
4645 	       "with -fpic");
4646       else if (TARGET_SEH && !opts->x_flag_fentry)
4647 	sorry ("-mno-fentry isn%'t compatible with SEH");
4648     }
4649 
4650   if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4651     sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4652 
4653   if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4654       && TARGET_EMIT_VZEROUPPER)
4655     opts->x_target_flags |= MASK_VZEROUPPER;
4656   if (!(opts_set->x_target_flags & MASK_STV))
4657     opts->x_target_flags |= MASK_STV;
4658   /* Disable STV if -mpreferred-stack-boundary={2,3} or
4659      -mincoming-stack-boundary={2,3} or -mstackrealign is used - the
4660      required stack realignment is an extra cost the pass doesn't take
4661      into account, and the pass can't realign the stack itself.  */
4662   if (ix86_preferred_stack_boundary < 128
4663       || ix86_incoming_stack_boundary < 128
4664       || opts->x_ix86_force_align_arg_pointer)
4665     opts->x_target_flags &= ~MASK_STV;
4666   if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4667       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4668     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4669   if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4670       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4671     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4672 
4673   /* Enable 128-bit AVX instruction generation
4674      for the auto-vectorizer.  */
4675   if (TARGET_AVX128_OPTIMAL
4676       && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4677     opts->x_prefer_vector_width_type = PVW_AVX128;
4678 
4679   /* Use 256-bit AVX instruction generation
4680      in the auto-vectorizer.  */
4681   if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4682       && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4683     opts->x_prefer_vector_width_type = PVW_AVX256;
4684 
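  /* Parse -mrecip=<list>.  The list is comma-separated and a leading '!'
     inverts an entry, e.g. something like -mrecip=all,!div enables every
     reciprocal approximation except division (valid names are those in the
     recip_options[] table; illustrative example).  */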
4685   if (opts->x_ix86_recip_name)
4686     {
4687       char *p = ASTRDUP (opts->x_ix86_recip_name);
4688       char *q;
4689       unsigned int mask, i;
4690       bool invert;
4691 
4692       while ((q = strtok (p, ",")) != NULL)
4693 	{
4694 	  p = NULL;
4695 	  if (*q == '!')
4696 	    {
4697 	      invert = true;
4698 	      q++;
4699 	    }
4700 	  else
4701 	    invert = false;
4702 
4703 	  if (!strcmp (q, "default"))
4704 	    mask = RECIP_MASK_ALL;
4705 	  else
4706 	    {
4707 	      for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4708 		if (!strcmp (q, recip_options[i].string))
4709 		  {
4710 		    mask = recip_options[i].mask;
4711 		    break;
4712 		  }
4713 
4714 	      if (i == ARRAY_SIZE (recip_options))
4715 		{
4716 		  error ("unknown option for -mrecip=%s", q);
4717 		  invert = false;
4718 		  mask = RECIP_MASK_NONE;
4719 		}
4720 	    }
4721 
4722 	  opts->x_recip_mask_explicit |= mask;
4723 	  if (invert)
4724 	    opts->x_recip_mask &= ~mask;
4725 	  else
4726 	    opts->x_recip_mask |= mask;
4727 	}
4728     }
4729 
4730   if (TARGET_RECIP_P (opts->x_target_flags))
4731     opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4732   else if (opts_set->x_target_flags & MASK_RECIP)
4733     opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4734 
4735   /* Default long double to 64-bit for 32-bit Bionic and to __float128
4736      for 64-bit Bionic.  Also default long double to 64-bit for Intel
4737      MCU psABI.  */
4738   if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4739       && !(opts_set->x_target_flags
4740 	   & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4741     opts->x_target_flags |= (TARGET_64BIT
4742 			     ? MASK_LONG_DOUBLE_128
4743 			     : MASK_LONG_DOUBLE_64);
4744 
4745   /* Only one of them can be active.  */
4746   gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4747 	      || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4748 
4749   /* Handle stack protector.  */
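  /* As an illustrative example, a kernel-style build might pass
     -mstack-protector-guard=tls -mstack-protector-guard-reg=gs
     -mstack-protector-guard-offset=40 so that the stack canary is loaded
     from %gs:40 (the actual offset is configuration-specific).  */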
4750   if (!opts_set->x_ix86_stack_protector_guard)
4751     {
4752 #ifdef TARGET_THREAD_SSP_OFFSET
4753       if (!TARGET_HAS_BIONIC)
4754 	opts->x_ix86_stack_protector_guard = SSP_TLS;
4755       else
4756 #endif
4757 	opts->x_ix86_stack_protector_guard = SSP_GLOBAL;
4758     }
4759 
4760 #ifdef TARGET_THREAD_SSP_OFFSET
4761   ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4762 #endif
4763 
4764   if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4765     {
4766       char *endp;
4767       const char *str = ix86_stack_protector_guard_offset_str;
4768 
4769       errno = 0;
4770       int64_t offset;
4771 
4772 #if defined(INT64_T_IS_LONG)
4773       offset = strtol (str, &endp, 0);
4774 #else
4775       offset = strtoll (str, &endp, 0);
4776 #endif
4777 
4778       if (!*str || *endp || errno)
4779 	error ("%qs is not a valid number "
4780 	       "in -mstack-protector-guard-offset=", str);
4781 
4782       if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4783 		     HOST_WIDE_INT_C (0x7fffffff)))
4784 	error ("%qs is not a valid offset "
4785 	       "in -mstack-protector-guard-offset=", str);
4786 
4787       ix86_stack_protector_guard_offset = offset;
4788     }
4789 
4790   ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4791 
4792   /* The kernel uses a different segment register for performance
4793      reasons; a system call would not have to trash the userspace
4794      segment register, which would be expensive.  */
4795   if (ix86_cmodel == CM_KERNEL)
4796     ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4797 
4798   if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4799     {
4800       const char *str = ix86_stack_protector_guard_reg_str;
4801       addr_space_t seg = ADDR_SPACE_GENERIC;
4802 
4803       /* Discard optional register prefix.  */
4804       if (str[0] == '%')
4805 	str++;
4806 
4807       if (strlen (str) == 2 && str[1] == 's')
4808 	{
4809 	  if (str[0] == 'f')
4810 	    seg = ADDR_SPACE_SEG_FS;
4811 	  else if (str[0] == 'g')
4812 	    seg = ADDR_SPACE_SEG_GS;
4813 	}
4814 
4815       if (seg == ADDR_SPACE_GENERIC)
4816 	error ("%qs is not a valid base register "
4817 	       "in -mstack-protector-guard-reg=",
4818 	       ix86_stack_protector_guard_reg_str);
4819 
4820       ix86_stack_protector_guard_reg = seg;
4821     }
4822 
4823   /* Handle the -mmemcpy-strategy= and -mmemset-strategy= options.  */
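  /* The value is a comma-separated list of <alg>:<max_size>:<alignment>
     triples, e.g. something like
     -mmemcpy-strategy=rep_8byte:1024:align,libcall:-1:align
     (algorithm names follow stringop_alg; -1 means no upper size bound;
     illustrative example).  */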
4824   if (opts->x_ix86_tune_memcpy_strategy)
4825     {
4826       char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4827       ix86_parse_stringop_strategy_string (str, false);
4828       free (str);
4829     }
4830 
4831   if (opts->x_ix86_tune_memset_strategy)
4832     {
4833       char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4834       ix86_parse_stringop_strategy_string (str, true);
4835       free (str);
4836     }
4837 
4838   /* Save the initial options in case the user does function specific
4839      options.  */
4840   if (main_args_p)
4841     target_option_default_node = target_option_current_node
4842       = build_target_option_node (opts);
4843 
4844   if (opts->x_flag_cf_protection != CF_NONE)
4845     opts->x_flag_cf_protection =
4846       (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4847 
4848   if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
4849     maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
4850 			   opts->x_param_values,
4851 			   opts_set->x_param_values);
4852 
4853   /* PR86952: jump table usage with retpolines is slow.
4854      The PR provides some numbers about the slowness.  */
4855   if (ix86_indirect_branch != indirect_branch_keep
4856       && !opts_set->x_flag_jump_tables)
4857     opts->x_flag_jump_tables = 0;
4858 
4859   return true;
4860 }
4861 
4862 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
4863 
4864 static void
4865 ix86_option_override (void)
4866 {
4867   ix86_option_override_internal (true, &global_options, &global_options_set);
4868 }
4869 
4870 /* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
4871 static char *
4872 ix86_offload_options (void)
4873 {
4874   if (TARGET_LP64)
4875     return xstrdup ("-foffload-abi=lp64");
4876   return xstrdup ("-foffload-abi=ilp32");
4877 }
4878 
4879 /* Update register usage after having seen the compiler flags.  */
4880 
4881 static void
4882 ix86_conditional_register_usage (void)
4883 {
4884   int i, c_mask;
4885 
4886   /* If there are no caller-saved registers, preserve all registers
4887      except fixed_regs and registers used for the function return value,
4888      since aggregate_value_p checks call_used_regs[regno] on the return
4889      value.  */
4890   if (cfun && cfun->machine->no_caller_saved_registers)
4891     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4892       if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4893 	call_used_regs[i] = 0;
4894 
4895   /* For 32-bit targets, squash the REX registers.  */
4896   if (! TARGET_64BIT)
4897     {
4898       for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4899 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4900       for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4901 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4902       for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4903 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4904     }
4905 
4906   /* See the definition of CALL_USED_REGISTERS in i386.h.  */
4907   c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4908 
4909   CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4910 
4911   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4912     {
4913       /* Set/reset conditionally defined registers from
4914 	 CALL_USED_REGISTERS initializer.  */
4915       if (call_used_regs[i] > 1)
4916 	call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4917 
4918       /* Calculate registers of CLOBBERED_REGS register set
4919 	 as call used registers from GENERAL_REGS register set.  */
4920       if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4921 	  && call_used_regs[i])
4922 	SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4923     }
4924 
4925   /* If MMX is disabled, squash the registers.  */
4926   if (! TARGET_MMX)
4927     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4928       if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4929 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4930 
4931   /* If SSE is disabled, squash the registers.  */
4932   if (! TARGET_SSE)
4933     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4934       if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4935 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4936 
4937   /* If the FPU is disabled, squash the registers.  */
4938   if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4939     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4940       if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4941 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4942 
4943   /* If AVX512F is disabled, squash the registers.  */
4944   if (! TARGET_AVX512F)
4945     {
4946       for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4947 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4948 
4949       for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4950 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4951     }
4952 
4953   /* If MPX is disabled, squash the registers.  */
4954   if (! TARGET_MPX)
4955     for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4956       fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4957 }
4958 
4959 /* Canonicalize a comparison from one we don't have to one we do have.  */
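/* E.g. when combine has produced a comparison whose first operand is a
   (float (mem ...)) and whose second operand is a register, roughly

     (lt (float:XF (mem:HI ...)) (reg:XF ...))

   the operands are swapped and LT becomes GT, so the memory operand ends
   up second as the ficom patterns expect (illustrative sketch only).  */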
4960 
4961 static void
4962 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
4963 			      bool op0_preserve_value)
4964 {
4965   /* The order of operands in an x87 ficom compare is forced by combine in
4966      the simplify_comparison () function.  A float operator is treated as
4967      RTX_OBJ with precedence over other operators and is always placed
4968      first.  Swap the condition and operands to match the ficom insn.  */
4969   if (!op0_preserve_value
4970       && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
4971     {
4972       enum rtx_code scode = swap_condition ((enum rtx_code) *code);
4973 
4974       /* We are called only for compares that are split to SAHF instruction.
4975 	 Ensure that we have setcc/jcc insn for the swapped condition.  */
4976       if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
4977 	{
4978 	  std::swap (*op0, *op1);
4979 	  *code = (int) scode;
4980 	}
4981     }
4982 }
4983 
4984 /* Save the current options.  */
4985 
4986 static void
4987 ix86_function_specific_save (struct cl_target_option *ptr,
4988 			     struct gcc_options *opts)
4989 {
4990   ptr->arch = ix86_arch;
4991   ptr->schedule = ix86_schedule;
4992   ptr->prefetch_sse = x86_prefetch_sse;
4993   ptr->tune = ix86_tune;
4994   ptr->branch_cost = ix86_branch_cost;
4995   ptr->tune_defaulted = ix86_tune_defaulted;
4996   ptr->arch_specified = ix86_arch_specified;
4997   ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4998   ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
4999   ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5000   ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5001   ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5002   ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5003   ptr->x_ix86_abi = opts->x_ix86_abi;
5004   ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5005   ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5006   ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5007   ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5008   ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5009   ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5010   ptr->x_ix86_pmode = opts->x_ix86_pmode;
5011   ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5012   ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5013   ptr->x_ix86_regparm = opts->x_ix86_regparm;
5014   ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5015   ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5016   ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5017   ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5018   ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5019   ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5020   ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5021   ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5022   ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5023   ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5024 
5025   /* The fields are char but the variables are not; make sure the
5026      values fit in the fields.  */
5027   gcc_assert (ptr->arch == ix86_arch);
5028   gcc_assert (ptr->schedule == ix86_schedule);
5029   gcc_assert (ptr->tune == ix86_tune);
5030   gcc_assert (ptr->branch_cost == ix86_branch_cost);
5031 }
5032 
5033 /* Restore the current options.  */
5034 
5035 static void
5036 ix86_function_specific_restore (struct gcc_options *opts,
5037 				struct cl_target_option *ptr)
5038 {
5039   enum processor_type old_tune = ix86_tune;
5040   enum processor_type old_arch = ix86_arch;
5041   unsigned HOST_WIDE_INT ix86_arch_mask;
5042   int i;
5043 
5044   /* We don't change -fPIC.  */
5045   opts->x_flag_pic = flag_pic;
5046 
5047   ix86_arch = (enum processor_type) ptr->arch;
5048   ix86_schedule = (enum attr_cpu) ptr->schedule;
5049   ix86_tune = (enum processor_type) ptr->tune;
5050   x86_prefetch_sse = ptr->prefetch_sse;
5051   opts->x_ix86_branch_cost = ptr->branch_cost;
5052   ix86_tune_defaulted = ptr->tune_defaulted;
5053   ix86_arch_specified = ptr->arch_specified;
5054   opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5055   opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5056   opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5057   opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5058   opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5059   opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5060   opts->x_ix86_abi = ptr->x_ix86_abi;
5061   opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5062   opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5063   opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5064   opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5065   opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5066   opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5067   opts->x_ix86_pmode = ptr->x_ix86_pmode;
5068   opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5069   opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5070   opts->x_ix86_regparm = ptr->x_ix86_regparm;
5071   opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5072   opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5073   opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5074   opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5075   opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5076   opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5077   opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5078   opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5079   opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5080   opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5081   ix86_tune_cost = processor_target_table[ix86_tune].cost;
5082   /* TODO: ix86_cost should be chosen at instruction or function granularity,
5083      so that cold code uses size_cost even in !optimize_size compilation.  */
5084   if (opts->x_optimize_size)
5085     ix86_cost = &ix86_size_cost;
5086   else
5087     ix86_cost = ix86_tune_cost;
5088 
5089   /* Recreate the arch feature tests if the arch changed.  */
5090   if (old_arch != ix86_arch)
5091     {
5092       ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
5093       for (i = 0; i < X86_ARCH_LAST; ++i)
5094 	ix86_arch_features[i]
5095 	  = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5096     }
5097 
5098   /* Recreate the tune optimization tests.  */
5099   if (old_tune != ix86_tune)
5100     set_ix86_tune_features (ix86_tune, false);
5101 }
5102 
5103 /* Adjust target options after streaming them in.  This is mainly about
5104    reconciling them with global options.  */
5105 
5106 static void
5107 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5108 {
5109   /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5110      partly computed from flag_pic.  If flag_pic is on, adjust x_ix86_cmodel
5111      for PIC, or error out.  */
5112   if (flag_pic)
5113     switch (ptr->x_ix86_cmodel)
5114       {
5115       case CM_SMALL:
5116 	ptr->x_ix86_cmodel = CM_SMALL_PIC;
5117 	break;
5118 
5119       case CM_MEDIUM:
5120 	ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5121 	break;
5122 
5123       case CM_LARGE:
5124 	ptr->x_ix86_cmodel = CM_LARGE_PIC;
5125 	break;
5126 
5127       case CM_KERNEL:
5128 	error ("code model %s does not support PIC mode", "kernel");
5129 	break;
5130 
5131       default:
5132 	break;
5133       }
5134   else
5135     switch (ptr->x_ix86_cmodel)
5136       {
5137       case CM_SMALL_PIC:
5138 	ptr->x_ix86_cmodel = CM_SMALL;
5139 	break;
5140 
5141       case CM_MEDIUM_PIC:
5142 	ptr->x_ix86_cmodel = CM_MEDIUM;
5143 	break;
5144 
5145       case CM_LARGE_PIC:
5146 	ptr->x_ix86_cmodel = CM_LARGE;
5147 	break;
5148 
5149       default:
5150 	break;
5151       }
5152 }
5153 
5154 /* Print the current options.  */
5155 
5156 static void
5157 ix86_function_specific_print (FILE *file, int indent,
5158 			      struct cl_target_option *ptr)
5159 {
5160   char *target_string
5161     = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5162 			  ptr->x_target_flags, ptr->x_ix86_target_flags,
5163 			  NULL, NULL, ptr->x_ix86_fpmath, false);
5164 
5165   gcc_assert (ptr->arch < PROCESSOR_max);
5166   fprintf (file, "%*sarch = %d (%s)\n",
5167 	   indent, "",
5168 	   ptr->arch, processor_target_table[ptr->arch].name);
5169 
5170   gcc_assert (ptr->tune < PROCESSOR_max);
5171   fprintf (file, "%*stune = %d (%s)\n",
5172 	   indent, "",
5173 	   ptr->tune, processor_target_table[ptr->tune].name);
5174 
5175   fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5176 
5177   if (target_string)
5178     {
5179       fprintf (file, "%*s%s\n", indent, "", target_string);
5180       free (target_string);
5181     }
5182 }
5183 
5184 
5185 /* Inner function to process attribute ((target (...))): take an argument
5186    and set the current options from it.  If we have a list, recursively go
5187    over the list.  */
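/* For example, a declaration such as

     void foo (void) __attribute__ ((target ("avx2,no-sse4a,arch=haswell")));

   reaches this function as a STRING_CST (or a TREE_LIST of them); each
   comma-separated token ("avx2", "no-sse4a", "arch=haswell") is then
   matched against the attrs[] table below (illustrative example).  */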
5188 
5189 static bool
5190 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5191 				     struct gcc_options *opts,
5192 				     struct gcc_options *opts_set,
5193 				     struct gcc_options *enum_opts_set)
5194 {
5195   char *next_optstr;
5196   bool ret = true;
5197 
5198 #define IX86_ATTR_ISA(S,O)   { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5199 #define IX86_ATTR_STR(S,O)   { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5200 #define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5201 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5202 #define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
5203 
5204   enum ix86_opt_type
5205   {
5206     ix86_opt_unknown,
5207     ix86_opt_yes,
5208     ix86_opt_no,
5209     ix86_opt_str,
5210     ix86_opt_enum,
5211     ix86_opt_isa
5212   };
5213 
5214   static const struct
5215   {
5216     const char *string;
5217     size_t len;
5218     enum ix86_opt_type type;
5219     int opt;
5220     int mask;
5221   } attrs[] = {
5222     /* isa options */
5223     IX86_ATTR_ISA ("pconfig",	OPT_mpconfig),
5224     IX86_ATTR_ISA ("wbnoinvd",	OPT_mwbnoinvd),
5225     IX86_ATTR_ISA ("sgx",	OPT_msgx),
5226     IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5227     IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5228     IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5229     IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5230     IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5231     IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5232 
5233     IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5234     IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5235     IX86_ATTR_ISA ("avx512vl",	OPT_mavx512vl),
5236     IX86_ATTR_ISA ("avx512bw",	OPT_mavx512bw),
5237     IX86_ATTR_ISA ("avx512dq",	OPT_mavx512dq),
5238     IX86_ATTR_ISA ("avx512er",	OPT_mavx512er),
5239     IX86_ATTR_ISA ("avx512pf",	OPT_mavx512pf),
5240     IX86_ATTR_ISA ("avx512cd",	OPT_mavx512cd),
5241     IX86_ATTR_ISA ("avx512f",	OPT_mavx512f),
5242     IX86_ATTR_ISA ("avx2",	OPT_mavx2),
5243     IX86_ATTR_ISA ("fma",	OPT_mfma),
5244     IX86_ATTR_ISA ("xop",	OPT_mxop),
5245     IX86_ATTR_ISA ("fma4",	OPT_mfma4),
5246     IX86_ATTR_ISA ("f16c",	OPT_mf16c),
5247     IX86_ATTR_ISA ("avx",	OPT_mavx),
5248     IX86_ATTR_ISA ("sse4",	OPT_msse4),
5249     IX86_ATTR_ISA ("sse4.2",	OPT_msse4_2),
5250     IX86_ATTR_ISA ("sse4.1",	OPT_msse4_1),
5251     IX86_ATTR_ISA ("sse4a",	OPT_msse4a),
5252     IX86_ATTR_ISA ("ssse3",	OPT_mssse3),
5253     IX86_ATTR_ISA ("sse3",	OPT_msse3),
5254     IX86_ATTR_ISA ("aes",	OPT_maes),
5255     IX86_ATTR_ISA ("sha",	OPT_msha),
5256     IX86_ATTR_ISA ("pclmul",	OPT_mpclmul),
5257     IX86_ATTR_ISA ("sse2",	OPT_msse2),
5258     IX86_ATTR_ISA ("sse",	OPT_msse),
5259     IX86_ATTR_ISA ("3dnowa",	OPT_m3dnowa),
5260     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
5261     IX86_ATTR_ISA ("mmx",	OPT_mmmx),
5262     IX86_ATTR_ISA ("rtm",	OPT_mrtm),
5263     IX86_ATTR_ISA ("prfchw",	OPT_mprfchw),
5264     IX86_ATTR_ISA ("rdseed",	OPT_mrdseed),
5265     IX86_ATTR_ISA ("adx",	OPT_madx),
5266     IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5267     IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5268     IX86_ATTR_ISA ("xsaves",	OPT_mxsaves),
5269     IX86_ATTR_ISA ("xsavec",	OPT_mxsavec),
5270     IX86_ATTR_ISA ("xsaveopt",	OPT_mxsaveopt),
5271     IX86_ATTR_ISA ("xsave",	OPT_mxsave),
5272     IX86_ATTR_ISA ("abm",	OPT_mabm),
5273     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
5274     IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
5275     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
5276     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
5277     IX86_ATTR_ISA ("popcnt",	OPT_mpopcnt),
5278     IX86_ATTR_ISA ("cx16",	OPT_mcx16),
5279     IX86_ATTR_ISA ("sahf",	OPT_msahf),
5280     IX86_ATTR_ISA ("movbe",	OPT_mmovbe),
5281     IX86_ATTR_ISA ("crc32",	OPT_mcrc32),
5282     IX86_ATTR_ISA ("fsgsbase",	OPT_mfsgsbase),
5283     IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
5284     IX86_ATTR_ISA ("mwaitx",	OPT_mmwaitx),
5285     IX86_ATTR_ISA ("clzero",	OPT_mclzero),
5286     IX86_ATTR_ISA ("pku",	OPT_mpku),
5287     IX86_ATTR_ISA ("lwp",	OPT_mlwp),
5288     IX86_ATTR_ISA ("hle",	OPT_mhle),
5289     IX86_ATTR_ISA ("fxsr",	OPT_mfxsr),
5290     IX86_ATTR_ISA ("mpx",	OPT_mmpx),
5291     IX86_ATTR_ISA ("clwb",	OPT_mclwb),
5292     IX86_ATTR_ISA ("rdpid",	OPT_mrdpid),
5293     IX86_ATTR_ISA ("gfni",	OPT_mgfni),
5294     IX86_ATTR_ISA ("shstk",	OPT_mshstk),
5295     IX86_ATTR_ISA ("vaes",	OPT_mvaes),
5296     IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5297     IX86_ATTR_ISA ("movdiri", OPT_mmovdiri),
5298     IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b),
5299 
5300     /* enum options */
5301     IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
5302 
5303     /* string options */
5304     IX86_ATTR_STR ("arch=",	IX86_FUNCTION_SPECIFIC_ARCH),
5305     IX86_ATTR_STR ("tune=",	IX86_FUNCTION_SPECIFIC_TUNE),
5306 
5307     /* flag options */
5308     IX86_ATTR_YES ("cld",
5309 		   OPT_mcld,
5310 		   MASK_CLD),
5311 
5312     IX86_ATTR_NO ("fancy-math-387",
5313 		  OPT_mfancy_math_387,
5314 		  MASK_NO_FANCY_MATH_387),
5315 
5316     IX86_ATTR_YES ("ieee-fp",
5317 		   OPT_mieee_fp,
5318 		   MASK_IEEE_FP),
5319 
5320     IX86_ATTR_YES ("inline-all-stringops",
5321 		   OPT_minline_all_stringops,
5322 		   MASK_INLINE_ALL_STRINGOPS),
5323 
5324     IX86_ATTR_YES ("inline-stringops-dynamically",
5325 		   OPT_minline_stringops_dynamically,
5326 		   MASK_INLINE_STRINGOPS_DYNAMICALLY),
5327 
5328     IX86_ATTR_NO ("align-stringops",
5329 		  OPT_mno_align_stringops,
5330 		  MASK_NO_ALIGN_STRINGOPS),
5331 
5332     IX86_ATTR_YES ("recip",
5333 		   OPT_mrecip,
5334 		   MASK_RECIP),
5335 
5336   };
5337 
5338   /* If this is a list, recurse to get the options.  */
5339   if (TREE_CODE (args) == TREE_LIST)
5340     {
5341       bool ret = true;
5342 
5343       for (; args; args = TREE_CHAIN (args))
5344 	if (TREE_VALUE (args)
5345 	    && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5346 						     p_strings, opts, opts_set,
5347 						     enum_opts_set))
5348 	  ret = false;
5349 
5350       return ret;
5351     }
5352 
5353   else if (TREE_CODE (args) != STRING_CST)
5354     {
5355       error ("attribute %<target%> argument not a string");
5356       return false;
5357     }
5358 
5359   /* Handle multiple arguments separated by commas.  */
5360   next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5361 
5362   while (next_optstr && *next_optstr != '\0')
5363     {
5364       char *p = next_optstr;
5365       char *orig_p = p;
5366       char *comma = strchr (next_optstr, ',');
5367       const char *opt_string;
5368       size_t len, opt_len;
5369       int opt;
5370       bool opt_set_p;
5371       char ch;
5372       unsigned i;
5373       enum ix86_opt_type type = ix86_opt_unknown;
5374       int mask = 0;
5375 
5376       if (comma)
5377 	{
5378 	  *comma = '\0';
5379 	  len = comma - next_optstr;
5380 	  next_optstr = comma + 1;
5381 	}
5382       else
5383 	{
5384 	  len = strlen (p);
5385 	  next_optstr = NULL;
5386 	}
5387 
5388       /* Recognize no-xxx.  */
5389       if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5390 	{
5391 	  opt_set_p = false;
5392 	  p += 3;
5393 	  len -= 3;
5394 	}
5395       else
5396 	opt_set_p = true;
5397 
5398       /* Find the option.  */
5399       ch = *p;
5400       opt = N_OPTS;
5401       for (i = 0; i < ARRAY_SIZE (attrs); i++)
5402 	{
5403 	  type = attrs[i].type;
5404 	  opt_len = attrs[i].len;
5405 	  if (ch == attrs[i].string[0]
5406 	      && ((type != ix86_opt_str && type != ix86_opt_enum)
5407 		  ? len == opt_len
5408 		  : len > opt_len)
5409 	      && memcmp (p, attrs[i].string, opt_len) == 0)
5410 	    {
5411 	      opt = attrs[i].opt;
5412 	      mask = attrs[i].mask;
5413 	      opt_string = attrs[i].string;
5414 	      break;
5415 	    }
5416 	}
5417 
5418       /* Process the option.  */
5419       if (opt == N_OPTS)
5420 	{
5421 	  error ("attribute(target(\"%s\")) is unknown", orig_p);
5422 	  ret = false;
5423 	}
5424 
5425       else if (type == ix86_opt_isa)
5426 	{
5427 	  struct cl_decoded_option decoded;
5428 
5429 	  generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5430 	  ix86_handle_option (opts, opts_set,
5431 			      &decoded, input_location);
5432 	}
5433 
5434       else if (type == ix86_opt_yes || type == ix86_opt_no)
5435 	{
5436 	  if (type == ix86_opt_no)
5437 	    opt_set_p = !opt_set_p;
5438 
5439 	  if (opt_set_p)
5440 	    opts->x_target_flags |= mask;
5441 	  else
5442 	    opts->x_target_flags &= ~mask;
5443 	}
5444 
5445       else if (type == ix86_opt_str)
5446 	{
5447 	  if (p_strings[opt])
5448 	    {
5449 	      error ("option(\"%s\") was already specified", opt_string);
5450 	      ret = false;
5451 	    }
5452 	  else
5453 	    p_strings[opt] = xstrdup (p + opt_len);
5454 	}
5455 
5456       else if (type == ix86_opt_enum)
5457 	{
5458 	  bool arg_ok;
5459 	  int value;
5460 
5461 	  arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5462 	  if (arg_ok)
5463 	    set_option (opts, enum_opts_set, opt, value,
5464 			p + opt_len, DK_UNSPECIFIED, input_location,
5465 			global_dc);
5466 	  else
5467 	    {
5468 	      error ("attribute(target(\"%s\")) is unknown", orig_p);
5469 	      ret = false;
5470 	    }
5471 	}
5472 
5473       else
5474 	gcc_unreachable ();
5475     }
5476 
5477   return ret;
5478 }
5479 
5480 /* Release allocated strings.  */
5481 static void
5482 release_options_strings (char **option_strings)
5483 {
5484   /* Free up memory allocated to hold the strings */
5485   for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5486     free (option_strings[i]);
5487 }
5488 
5489 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL.  */
5490 
5491 tree
5492 ix86_valid_target_attribute_tree (tree args,
5493 				  struct gcc_options *opts,
5494 				  struct gcc_options *opts_set)
5495 {
5496   const char *orig_arch_string = opts->x_ix86_arch_string;
5497   const char *orig_tune_string = opts->x_ix86_tune_string;
5498   enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5499   int orig_tune_defaulted = ix86_tune_defaulted;
5500   int orig_arch_specified = ix86_arch_specified;
5501   char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5502   tree t = NULL_TREE;
5503   struct cl_target_option *def
5504     = TREE_TARGET_OPTION (target_option_default_node);
5505   struct gcc_options enum_opts_set;
5506 
5507   memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5508 
5509   /* Process each of the options on the chain.  */
5510   if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5511 					     opts_set, &enum_opts_set))
5512     return error_mark_node;
5513 
5514   /* If the changed options are different from the default, rerun
5515      ix86_option_override_internal, and then save the options away.
5516      The string options are attribute options, and will be undone
5517      when we copy the save structure.  */
5518   if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5519       || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5520       || opts->x_target_flags != def->x_target_flags
5521       || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5522       || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5523       || enum_opts_set.x_ix86_fpmath)
5524     {
5525       /* If we are using the default tune= or arch=, undo the string assigned,
5526 	 and use the default.  */
5527       if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5528 	{
5529 	  opts->x_ix86_arch_string
5530 	    = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5531 
5532 	  /* If arch= is set,  clear all bits in x_ix86_isa_flags,
5533 	     except for ISA_64BIT, ABI_64, ABI_X32, and CODE16.  */
5534 	  opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5535 				     | OPTION_MASK_ABI_64
5536 				     | OPTION_MASK_ABI_X32
5537 				     | OPTION_MASK_CODE16);
5538 	  opts->x_ix86_isa_flags2 = 0;
5539 	}
5540       else if (!orig_arch_specified)
5541 	opts->x_ix86_arch_string = NULL;
5542 
5543       if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5544 	opts->x_ix86_tune_string
5545 	  = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5546       else if (orig_tune_defaulted)
5547 	opts->x_ix86_tune_string = NULL;
5548 
5549       /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
5550       if (enum_opts_set.x_ix86_fpmath)
5551 	opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5552 
5553       /* Do any overrides, such as arch=xxx, or tune=xxx support.  */
5554       bool r = ix86_option_override_internal (false, opts, opts_set);
5555       if (!r)
5556 	{
5557 	  release_options_strings (option_strings);
5558 	  return error_mark_node;
5559 	}
5560 
5561       /* Add any builtin functions with the new isa if any.  */
5562       ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5563 
5564       /* Save the current options unless we are validating options for
5565 	 #pragma.  */
5566       t = build_target_option_node (opts);
5567 
5568       opts->x_ix86_arch_string = orig_arch_string;
5569       opts->x_ix86_tune_string = orig_tune_string;
5570       opts_set->x_ix86_fpmath = orig_fpmath_set;
5571 
5572       release_options_strings (option_strings);
5573     }
5574 
5575   return t;
5576 }
5577 
5578 /* Hook to validate attribute((target("string"))).  */
5579 
5580 static bool
5581 ix86_valid_target_attribute_p (tree fndecl,
5582 			       tree ARG_UNUSED (name),
5583 			       tree args,
5584 			       int ARG_UNUSED (flags))
5585 {
5586   struct gcc_options func_options;
5587   tree new_target, new_optimize;
5588   bool ret = true;
5589 
5590   /* attribute((target("default"))) does nothing, beyond
5591      affecting multi-versioning.  */
5592   if (TREE_VALUE (args)
5593       && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5594       && TREE_CHAIN (args) == NULL_TREE
5595       && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5596     return true;
5597 
5598   tree old_optimize = build_optimization_node (&global_options);
5599 
5600   /* Get the optimization options of the current function.  */
5601   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5602 
5603   if (!func_optimize)
5604     func_optimize = old_optimize;
5605 
5606   /* Init func_options.  */
5607   memset (&func_options, 0, sizeof (func_options));
5608   init_options_struct (&func_options, NULL);
5609   lang_hooks.init_options_struct (&func_options);
5610 
5611   cl_optimization_restore (&func_options,
5612 			   TREE_OPTIMIZATION (func_optimize));
5613 
5614   /* Initialize func_options to the default before its target options can
5615      be set.  */
5616   cl_target_option_restore (&func_options,
5617 			    TREE_TARGET_OPTION (target_option_default_node));
5618 
5619   new_target = ix86_valid_target_attribute_tree (args, &func_options,
5620 						 &global_options_set);
5621 
5622   new_optimize = build_optimization_node (&func_options);
5623 
5624   if (new_target == error_mark_node)
5625     ret = false;
5626 
5627   else if (fndecl && new_target)
5628     {
5629       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5630 
5631       if (old_optimize != new_optimize)
5632 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5633     }
5634 
5635   finalize_options_struct (&func_options);
5636 
5637   return ret;
5638 }
5639 
5640 
5641 /* Hook to determine if one function can safely inline another.  */
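/* For instance, with function-specific targets such as

     __attribute__ ((target ("sse4.2"))) int caller (void);
     __attribute__ ((target ("sse2")))   int callee (void);

   inlining CALLEE into CALLER can be allowed (SSE2 is a subset of SSE4.2),
   while inlining in the opposite direction is rejected (illustrative
   example).  */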
5642 
5643 static bool
5644 ix86_can_inline_p (tree caller, tree callee)
5645 {
5646   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5647   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5648 
5649   /* Changes of these flags can be tolerated for always_inline functions.
5650      Let's hope the user knows what they are doing.  */
5651   const unsigned HOST_WIDE_INT always_inline_safe_mask
5652 	 = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
5653 	    | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
5654 	    | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
5655 	    | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
5656 	    | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
5657 	    | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
5658 	    | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER);
5659 
5660 
5661   if (!callee_tree)
5662     callee_tree = target_option_default_node;
5663   if (!caller_tree)
5664     caller_tree = target_option_default_node;
5665   if (callee_tree == caller_tree)
5666     return true;
5667 
5668   struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5669   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5670   bool ret = false;
5671   bool always_inline =
5672      (DECL_DISREGARD_INLINE_LIMITS (callee)
5673       && lookup_attribute ("always_inline",
5674 			   DECL_ATTRIBUTES (callee)));
5675 
5676   /* The callee's ISA options should be a subset of the caller's, i.e. an
5677      SSE4 function can inline an SSE2 function, but an SSE2 function can't
5678      inline an SSE4 function.  */
5679   if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5680        != callee_opts->x_ix86_isa_flags)
5681       || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5682 	  != callee_opts->x_ix86_isa_flags2))
5683     ret = false;
5684 
5685   /* See if we have the same non-isa options.  */
5686   else if ((!always_inline
5687 	    && caller_opts->x_target_flags != callee_opts->x_target_flags)
5688 	   || (caller_opts->x_target_flags & ~always_inline_safe_mask)
5689 	       != (callee_opts->x_target_flags & ~always_inline_safe_mask))
5690     ret = false;
5691 
5692   /* See if arch, tune, etc. are the same.  */
5693   else if (caller_opts->arch != callee_opts->arch)
5694     ret = false;
5695 
5696   else if (!always_inline && caller_opts->tune != callee_opts->tune)
5697     ret = false;
5698 
5699   else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5700 	   /* If the callee doesn't use FP expressions, differences in
5701 	      ix86_fpmath can be ignored.  We are called from FEs
5702 	      for multi-versioning call optimization, so beware of
5703 	      ipa_fn_summaries not being available.  */
5704 	   && (! ipa_fn_summaries
5705 	       || ipa_fn_summaries->get
5706 	       (cgraph_node::get (callee))->fp_expressions))
5707     ret = false;
5708 
5709   else if (!always_inline
5710 	   && caller_opts->branch_cost != callee_opts->branch_cost)
5711     ret = false;
5712 
5713   else
5714     ret = true;
5715 
5716   return ret;
5717 }
5718 
5719 
5720 /* Remember the last target of ix86_set_current_function.  */
5721 static GTY(()) tree ix86_previous_fndecl;
5722 
5723 /* Set target globals to the default (or the current #pragma GCC target
5724    if active).  Invalidate the ix86_previous_fndecl cache.  */
5725 
5726 void
5727 ix86_reset_previous_fndecl (void)
5728 {
5729   tree new_tree = target_option_current_node;
5730   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5731   if (TREE_TARGET_GLOBALS (new_tree))
5732     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5733   else if (new_tree == target_option_default_node)
5734     restore_target_globals (&default_target_globals);
5735   else
5736     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5737   ix86_previous_fndecl = NULL_TREE;
5738 }
5739 
5740 /* Set the func_type field from the function FNDECL.  */
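/* For example, a handler declared as

     void handler (struct interrupt_frame *frame)
       __attribute__ ((interrupt));

   is classified as TYPE_INTERRUPT, while a two-argument handler

     void handler (struct interrupt_frame *frame, uword_t error_code)
       __attribute__ ((interrupt));

   is classified as TYPE_EXCEPTION, since the extra argument carries the
   error code pushed by the CPU (illustrative example; uword_t is the
   target-provided unsigned word type).  */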
5741 
5742 static void
5743 ix86_set_func_type (tree fndecl)
5744 {
5745   if (cfun->machine->func_type == TYPE_UNKNOWN)
5746     {
5747       if (lookup_attribute ("interrupt",
5748 			    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5749 	{
5750 	  if (ix86_function_naked (fndecl))
5751 	    error_at (DECL_SOURCE_LOCATION (fndecl),
5752 		      "interrupt and naked attributes are not compatible");
5753 
5754 	  int nargs = 0;
5755 	  for (tree arg = DECL_ARGUMENTS (fndecl);
5756 	       arg;
5757 	       arg = TREE_CHAIN (arg))
5758 	    nargs++;
5759 	  cfun->machine->no_caller_saved_registers = true;
5760 	  cfun->machine->func_type
5761 	    = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5762 
5763 	  ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5764 
5765 	  /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument.  */
5766 	  if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5767 	    sorry ("Only DWARF debug format is supported for interrupt "
5768 		   "service routine.");
5769 	}
5770       else
5771 	{
5772 	  cfun->machine->func_type = TYPE_NORMAL;
5773 	  if (lookup_attribute ("no_caller_saved_registers",
5774 				TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5775 	    cfun->machine->no_caller_saved_registers = true;
5776 	}
5777     }
5778 }
5779 
5780 /* Set the indirect_branch_type field from the function FNDECL.  */
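/* For example, __attribute__ ((indirect_branch ("thunk-extern"))) on a
   function overrides the command-line -mindirect-branch= setting for that
   function only; the same scheme applies to the "function_return"
   attribute handled below (illustrative example).  */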
5781 
5782 static void
5783 ix86_set_indirect_branch_type (tree fndecl)
5784 {
5785   if (cfun->machine->indirect_branch_type == indirect_branch_unset)
5786     {
5787       tree attr = lookup_attribute ("indirect_branch",
5788 				    DECL_ATTRIBUTES (fndecl));
5789       if (attr != NULL)
5790 	{
5791 	  tree args = TREE_VALUE (attr);
5792 	  if (args == NULL)
5793 	    gcc_unreachable ();
5794 	  tree cst = TREE_VALUE (args);
5795 	  if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5796 	    cfun->machine->indirect_branch_type = indirect_branch_keep;
5797 	  else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5798 	    cfun->machine->indirect_branch_type = indirect_branch_thunk;
5799 	  else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5800 	    cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
5801 	  else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5802 	    cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
5803 	  else
5804 	    gcc_unreachable ();
5805 	}
5806       else
5807 	cfun->machine->indirect_branch_type = ix86_indirect_branch;
5808 
5809       /* -mcmodel=large is not compatible with -mindirect-branch=thunk
5810 	 nor -mindirect-branch=thunk-extern.  */
5811       if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5812 	  && ((cfun->machine->indirect_branch_type
5813 	       == indirect_branch_thunk_extern)
5814 	      || (cfun->machine->indirect_branch_type
5815 		  == indirect_branch_thunk)))
5816 	error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
5817 	       "compatible",
5818 	       ((cfun->machine->indirect_branch_type
5819 		 == indirect_branch_thunk_extern)
5820 		? "thunk-extern" : "thunk"));
5821 
5822       /* -mindirect-branch=thunk-extern, -fcf-protection=branch and
5823 	 -fcheck-pointer-bounds are not compatible.  */
5824       if ((cfun->machine->indirect_branch_type
5825 	   == indirect_branch_thunk_extern)
5826 	  && flag_check_pointer_bounds
5827 	  && (flag_cf_protection & CF_BRANCH) != 0)
5828 	error ("%<-mindirect-branch=thunk-extern%>, "
5829 	       "%<-fcf-protection=branch%> and "
5830 	       "%<-fcheck-pointer-bounds%> are not compatible");
5831     }
5832 
5833   if (cfun->machine->function_return_type == indirect_branch_unset)
5834     {
5835       tree attr = lookup_attribute ("function_return",
5836 				    DECL_ATTRIBUTES (fndecl));
5837       if (attr != NULL)
5838 	{
5839 	  tree args = TREE_VALUE (attr);
5840 	  if (args == NULL)
5841 	    gcc_unreachable ();
5842 	  tree cst = TREE_VALUE (args);
5843 	  if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5844 	    cfun->machine->function_return_type = indirect_branch_keep;
5845 	  else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5846 	    cfun->machine->function_return_type = indirect_branch_thunk;
5847 	  else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5848 	    cfun->machine->function_return_type = indirect_branch_thunk_inline;
5849 	  else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5850 	    cfun->machine->function_return_type = indirect_branch_thunk_extern;
5851 	  else
5852 	    gcc_unreachable ();
5853 	}
5854       else
5855 	cfun->machine->function_return_type = ix86_function_return;
5856 
5857       /* -mcmodel=large is not compatible with -mfunction-return=thunk
5858 	 nor -mfunction-return=thunk-extern.  */
5859       if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5860 	  && ((cfun->machine->function_return_type
5861 	       == indirect_branch_thunk_extern)
5862 	      || (cfun->machine->function_return_type
5863 		  == indirect_branch_thunk)))
5864 	error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
5865 	       "compatible",
5866 	       ((cfun->machine->function_return_type
5867 		 == indirect_branch_thunk_extern)
5868 		? "thunk-extern" : "thunk"));
5869     }
5870 }
5871 
5872 /* Establish appropriate back-end context for processing the function
5873    FNDECL.  The argument might be NULL to indicate processing at top
5874    level, outside of any function scope.  */
5875 static void
5876 ix86_set_current_function (tree fndecl)
5877 {
5878   /* Only change the context if the function changes.  This hook is called
5879      several times in the course of compiling a function, and we don't want to
5880      slow things down too much or call target_reinit when it isn't safe.  */
5881   if (fndecl == ix86_previous_fndecl)
5882     {
5883       /* There may be 2 function bodies for the same function FNDECL,
5884 	 one is extern inline and one isn't.  Call ix86_set_func_type
5885 	 to set the func_type field.  */
5886       if (fndecl != NULL_TREE)
5887 	{
5888 	  ix86_set_func_type (fndecl);
5889 	  ix86_set_indirect_branch_type (fndecl);
5890 	}
5891       return;
5892     }
5893 
5894   tree old_tree;
5895   if (ix86_previous_fndecl == NULL_TREE)
5896     old_tree = target_option_current_node;
5897   else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5898     old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5899   else
5900     old_tree = target_option_default_node;
5901 
5902   if (fndecl == NULL_TREE)
5903     {
5904       if (old_tree != target_option_current_node)
5905 	ix86_reset_previous_fndecl ();
5906       return;
5907     }
5908 
5909   ix86_set_func_type (fndecl);
5910   ix86_set_indirect_branch_type (fndecl);
5911 
5912   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5913   if (new_tree == NULL_TREE)
5914     new_tree = target_option_default_node;
5915 
5916   if (old_tree != new_tree)
5917     {
5918       cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5919       if (TREE_TARGET_GLOBALS (new_tree))
5920 	restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5921       else if (new_tree == target_option_default_node)
5922 	restore_target_globals (&default_target_globals);
5923       else
5924 	TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5925     }
5926   ix86_previous_fndecl = fndecl;
5927 
5928   static bool prev_no_caller_saved_registers;
5929 
5930   /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5931      Avoid expensive re-initialization of init_regs each time we switch
5932      function context.  */
5933   if (TARGET_64BIT
5934       && (call_used_regs[SI_REG]
5935 	  == (cfun->machine->call_abi == MS_ABI)))
5936     reinit_regs ();
5937   /* Need to re-initialize init_regs if caller-saved registers are
5938      changed.  */
5939   else if (prev_no_caller_saved_registers
5940 	   != cfun->machine->no_caller_saved_registers)
5941     reinit_regs ();
5942 
5943   if (cfun->machine->func_type != TYPE_NORMAL
5944       || cfun->machine->no_caller_saved_registers)
5945     {
5946       /* Don't allow MPX, SSE, MMX or x87 instructions, since they
5947 	 may change the processor state.  */
5948       const char *isa;
5949       if (TARGET_MPX)
5950 	isa = "MPX";
5951       else if (TARGET_SSE)
5952 	isa = "SSE";
5953       else if (TARGET_MMX)
5954 	isa = "MMX/3Dnow";
5955       else if (TARGET_80387)
5956 	isa = "80387";
5957       else
5958 	isa = NULL;
5959       if (isa != NULL)
5960 	{
5961 	  if (cfun->machine->func_type != TYPE_NORMAL)
5962 	    sorry ("%s instructions aren't allowed in %s service routine",
5963 		   isa, (cfun->machine->func_type == TYPE_EXCEPTION
5964 			 ? "exception" : "interrupt"));
5965 	  else
5966 	    sorry ("%s instructions aren't allowed in function with "
5967 		   "no_caller_saved_registers attribute", isa);
5968 	  /* Don't issue the same error twice.  */
5969 	  cfun->machine->func_type = TYPE_NORMAL;
5970 	  cfun->machine->no_caller_saved_registers = false;
5971 	}
5972     }
5973 
5974   prev_no_caller_saved_registers
5975     = cfun->machine->no_caller_saved_registers;
5976 }
5977 
5978 
5979 /* Return true if this goes in large data/bss.  */
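/* For example, with -mcmodel=medium a global array larger than
   ix86_section_threshold (settable with -mlarge-data-threshold=) is
   treated as large data and ends up in .ldata/.lbss rather than
   .data/.bss (illustrative example).  */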
5980 
5981 static bool
5982 ix86_in_large_data_p (tree exp)
5983 {
5984   if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5985     return false;
5986 
5987   if (exp == NULL_TREE)
5988     return false;
5989 
5990   /* Functions are never large data.  */
5991   if (TREE_CODE (exp) == FUNCTION_DECL)
5992     return false;
5993 
5994   /* Automatic variables are never large data.  */
5995   if (VAR_P (exp) && !is_global_var (exp))
5996     return false;
5997 
5998   if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5999     {
6000       const char *section = DECL_SECTION_NAME (exp);
6001       if (strcmp (section, ".ldata") == 0
6002 	  || strcmp (section, ".lbss") == 0)
6003 	return true;
6004       return false;
6005     }
6006   else
6007     {
6008       HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
6009 
6010       /* If this is an incomplete type with size 0, then we can't put it
6011 	 in data because it might be too big when completed.  Also,
6012 	 int_size_in_bytes returns -1 if the size can vary or is larger
6013 	 than an integer, in which case it is also safer to assume that
6014 	 it goes in large data.  */
6015       if (size <= 0 || size > ix86_section_threshold)
6016 	return true;
6017     }
6018 
6019   return false;
6020 }
6021 
6022 /* i386-specific section flag to mark large sections.  */
6023 #define SECTION_LARGE SECTION_MACH_DEP
6024 
6025 /* Switch to the appropriate section for output of DECL.
6026    DECL is either a `VAR_DECL' node or a constant of some sort.
6027    RELOC indicates whether forming the initial value of DECL requires
6028    link-time relocations.  */
6029 
6030 ATTRIBUTE_UNUSED static section *
6031 x86_64_elf_select_section (tree decl, int reloc,
6032 			   unsigned HOST_WIDE_INT align)
6033 {
6034   if (ix86_in_large_data_p (decl))
6035     {
6036       const char *sname = NULL;
6037       unsigned int flags = SECTION_WRITE | SECTION_LARGE;
6038       switch (categorize_decl_for_section (decl, reloc))
6039 	{
6040 	case SECCAT_DATA:
6041 	  sname = ".ldata";
6042 	  break;
6043 	case SECCAT_DATA_REL:
6044 	  sname = ".ldata.rel";
6045 	  break;
6046 	case SECCAT_DATA_REL_LOCAL:
6047 	  sname = ".ldata.rel.local";
6048 	  break;
6049 	case SECCAT_DATA_REL_RO:
6050 	  sname = ".ldata.rel.ro";
6051 	  break;
6052 	case SECCAT_DATA_REL_RO_LOCAL:
6053 	  sname = ".ldata.rel.ro.local";
6054 	  break;
6055 	case SECCAT_BSS:
6056 	  sname = ".lbss";
6057 	  flags |= SECTION_BSS;
6058 	  break;
6059 	case SECCAT_RODATA:
6060 	case SECCAT_RODATA_MERGE_STR:
6061 	case SECCAT_RODATA_MERGE_STR_INIT:
6062 	case SECCAT_RODATA_MERGE_CONST:
6063 	  sname = ".lrodata";
6064 	  flags &= ~SECTION_WRITE;
6065 	  break;
6066 	case SECCAT_SRODATA:
6067 	case SECCAT_SDATA:
6068 	case SECCAT_SBSS:
6069 	  gcc_unreachable ();
6070 	case SECCAT_TEXT:
6071 	case SECCAT_TDATA:
6072 	case SECCAT_TBSS:
6073 	  /* We don't split these for the medium model.  Place them into
6074 	     default sections and hope for the best.  */
6075 	  break;
6076 	}
6077       if (sname)
6078 	{
6079 	  /* We might get called with string constants, but get_named_section
6080 	     doesn't like them as they are not DECLs.  Also, we need to set
6081 	     flags in that case.  */
6082 	  if (!DECL_P (decl))
6083 	    return get_section (sname, flags, NULL);
6084 	  return get_named_section (decl, sname, reloc);
6085 	}
6086     }
6087   return default_elf_select_section (decl, reloc, align);
6088 }
6089 
6090 /* Select a set of attributes for section NAME based on the properties
6091    of DECL and whether or not RELOC indicates that DECL's initializer
6092    might contain runtime relocations.  */
6093 
6094 static unsigned int ATTRIBUTE_UNUSED
6095 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6096 {
6097   unsigned int flags = default_section_type_flags (decl, name, reloc);
6098 
6099   if (ix86_in_large_data_p (decl))
6100     flags |= SECTION_LARGE;
6101 
6102   if (decl == NULL_TREE
6103       && (strcmp (name, ".ldata.rel.ro") == 0
6104 	  || strcmp (name, ".ldata.rel.ro.local") == 0))
6105     flags |= SECTION_RELRO;
6106 
6107   if (strcmp (name, ".lbss") == 0
6108       || strncmp (name, ".lbss.", 5) == 0
6109       || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
6110     flags |= SECTION_BSS;
6111 
6112   return flags;
6113 }
6114 
6115 /* Build up a unique section name, expressed as a
6116    STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6117    RELOC indicates whether the initial value of EXP requires
6118    link-time relocations.  */
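/* For example, a large one-only BSS object named "foo" is placed in a
   section called ".gnu.linkonce.lb.foo", while an ordinary large BSS
   object gets ".lbss.foo".  */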
6119 
6120 static void ATTRIBUTE_UNUSED
6121 x86_64_elf_unique_section (tree decl, int reloc)
6122 {
6123   if (ix86_in_large_data_p (decl))
6124     {
6125       const char *prefix = NULL;
6126       /* We only need to use .gnu.linkonce if we don't have COMDAT groups.  */
6127       bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6128 
6129       switch (categorize_decl_for_section (decl, reloc))
6130 	{
6131 	case SECCAT_DATA:
6132 	case SECCAT_DATA_REL:
6133 	case SECCAT_DATA_REL_LOCAL:
6134 	case SECCAT_DATA_REL_RO:
6135 	case SECCAT_DATA_REL_RO_LOCAL:
6136           prefix = one_only ? ".ld" : ".ldata";
6137 	  break;
6138 	case SECCAT_BSS:
6139           prefix = one_only ? ".lb" : ".lbss";
6140 	  break;
6141 	case SECCAT_RODATA:
6142 	case SECCAT_RODATA_MERGE_STR:
6143 	case SECCAT_RODATA_MERGE_STR_INIT:
6144 	case SECCAT_RODATA_MERGE_CONST:
6145           prefix = one_only ? ".lr" : ".lrodata";
6146 	  break;
6147 	case SECCAT_SRODATA:
6148 	case SECCAT_SDATA:
6149 	case SECCAT_SBSS:
6150 	  gcc_unreachable ();
6151 	case SECCAT_TEXT:
6152 	case SECCAT_TDATA:
6153 	case SECCAT_TBSS:
6154 	  /* We don't split these for the medium model.  Place them into
6155 	     default sections and hope for the best.  */
6156 	  break;
6157 	}
6158       if (prefix)
6159 	{
6160 	  const char *name, *linkonce;
6161 	  char *string;
6162 
6163 	  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6164 	  name = targetm.strip_name_encoding (name);
6165 
6166 	  /* If we're using one_only, then there needs to be a .gnu.linkonce
6167      	     prefix to the section name.  */
6168 	  linkonce = one_only ? ".gnu.linkonce" : "";
6169 
6170 	  string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6171 
6172 	  set_decl_section_name (decl, string);
6173 	  return;
6174 	}
6175     }
6176   default_unique_section (decl, reloc);
6177 }
6178 
6179 #ifdef COMMON_ASM_OP
6180 
6181 #ifndef LARGECOMM_SECTION_ASM_OP
6182 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6183 #endif
6184 
6185 /* This says how to output assembler code to declare an
6186    uninitialized external linkage data object.
6187 
6188    For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
6189    directive for large objects.  */
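/* For example, a 64 KiB object "foo" aligned to 32 bytes is emitted
   roughly as ".largecomm foo,65536,32" (after switching to .lbss), while
   a small object falls back to the usual ".comm foo,SIZE,ALIGN".
   (Illustrative output; the exact text depends on the assembler macros.)  */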
6190 void
6191 x86_elf_aligned_decl_common (FILE *file, tree decl,
6192 			const char *name, unsigned HOST_WIDE_INT size,
6193 			int align)
6194 {
6195   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6196       && size > (unsigned int)ix86_section_threshold)
6197     {
6198       switch_to_section (get_named_section (decl, ".lbss", 0));
6199       fputs (LARGECOMM_SECTION_ASM_OP, file);
6200     }
6201   else
6202     fputs (COMMON_ASM_OP, file);
6203   assemble_name (file, name);
6204   fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6205 	   size, align / BITS_PER_UNIT);
6206 }
6207 #endif
6208 
6209 /* Utility function for targets to use in implementing
6210    ASM_OUTPUT_ALIGNED_BSS.  */
6211 
6212 void
6213 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6214 		       	unsigned HOST_WIDE_INT size, int align)
6215 {
6216   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6217       && size > (unsigned int)ix86_section_threshold)
6218     switch_to_section (get_named_section (decl, ".lbss", 0));
6219   else
6220     switch_to_section (bss_section);
6221   ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6222 #ifdef ASM_DECLARE_OBJECT_NAME
6223   last_assemble_variable_decl = decl;
6224   ASM_DECLARE_OBJECT_NAME (file, name, decl);
6225 #else
6226   /* Standard thing is just output label for the object.  */
6227   ASM_OUTPUT_LABEL (file, name);
6228 #endif /* ASM_DECLARE_OBJECT_NAME */
6229   ASM_OUTPUT_SKIP (file, size ? size : 1);
6230 }
6231 
6232 /* Decide whether we must probe the stack before any space allocation
6233    on this target.  It's essentially TARGET_STACK_PROBE except when
6234    -fstack-check causes the stack to be already probed differently.  */
6235 
6236 bool
6237 ix86_target_stack_probe (void)
6238 {
6239   /* Do not probe the stack twice if static stack checking is enabled.  */
6240   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6241     return false;
6242 
6243   return TARGET_STACK_PROBE;
6244 }
6245 
6246 /* Decide whether we can make a sibling call to a function.  DECL is the
6247    declaration of the function being targeted by the call and EXP is the
6248    CALL_EXPR representing the call.  */
6249 
6250 static bool
6251 ix86_function_ok_for_sibcall (tree decl, tree exp)
6252 {
6253   tree type, decl_or_type;
6254   rtx a, b;
6255   bool bind_global = decl && !targetm.binds_local_p (decl);
6256 
6257   if (ix86_function_naked (current_function_decl))
6258     return false;
6259 
6260   /* Sibling call isn't OK if there are no caller-saved registers
6261      since all registers must be preserved before return.  */
6262   if (cfun->machine->no_caller_saved_registers)
6263     return false;
6264 
6265   /* If we are generating position-independent code, we cannot sibcall
6266      optimize direct calls to global functions, as the PLT requires
6267      %ebx be live. (Darwin does not have a PLT.)  */
6268   if (!TARGET_MACHO
6269       && !TARGET_64BIT
6270       && flag_pic
6271       && flag_plt
6272       && bind_global)
6273     return false;
6274 
6275   /* If we need to align the outgoing stack, then sibcalling would
6276      unalign the stack, which may break the called function.  */
6277   if (ix86_minimum_incoming_stack_boundary (true)
6278       < PREFERRED_STACK_BOUNDARY)
6279     return false;
6280 
6281   if (decl)
6282     {
6283       decl_or_type = decl;
6284       type = TREE_TYPE (decl);
6285     }
6286   else
6287     {
6288       /* We're looking at the CALL_EXPR, we need the type of the function.  */
6289       type = CALL_EXPR_FN (exp);		/* pointer expression */
6290       type = TREE_TYPE (type);			/* pointer type */
6291       type = TREE_TYPE (type);			/* function type */
6292       decl_or_type = type;
6293     }
6294 
6295   /* Check that the return value locations are the same.  For example,
6296      if we are returning floats on the 80387 register stack, we cannot
6297      make a sibcall from a function that doesn't return a float to a
6298      function that does or, conversely, from a function that does return
6299      a float to a function that doesn't; the necessary stack adjustment
6300      would not be executed.  This is also the place we notice
6301      differences in the return value ABI.  Note that it is ok for one
6302      of the functions to have void return type as long as the return
6303      value of the other is passed in a register.  */
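  /* E.g. in 32-bit code a caller returning "int" must not sibcall a
     callee returning "double" on the 80387 stack: the value the callee
     leaves there would never be popped by the caller.  (Illustrative.)  */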
6304   a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6305   b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6306 			   cfun->decl, false);
6307   if (STACK_REG_P (a) || STACK_REG_P (b))
6308     {
6309       if (!rtx_equal_p (a, b))
6310 	return false;
6311     }
6312   else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6313     ;
6314   else if (!rtx_equal_p (a, b))
6315     return false;
6316 
6317   if (TARGET_64BIT)
6318     {
6319       /* The SYSV ABI has more call-clobbered registers;
6320 	 disallow sibcalls from MS to SYSV.  */
6321       if (cfun->machine->call_abi == MS_ABI
6322 	  && ix86_function_type_abi (type) == SYSV_ABI)
6323 	return false;
6324     }
6325   else
6326     {
6327       /* If this call is indirect, we'll need to be able to use a
6328 	 call-clobbered register for the address of the target function.
6329 	 Make sure that all such registers are not used for passing
6330 	 parameters.  Note that DLLIMPORT functions and calls to global
6331 	 functions via the GOT slot are indirect.
6332       if (!decl
6333 	  || (bind_global && flag_pic && !flag_plt)
6334 	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))
6335 	  || flag_force_indirect_call)
6336 	{
6337 	  /* Check if regparm >= 3 since arg_reg_available is set to
6338 	     false if regparm == 0.  If regparm is 1 or 2, there is
6339 	     always a call-clobbered register available.
6340 
6341 	     ??? The symbol indirect call doesn't need a call-clobbered
6342 	     register.  But we don't know if this is a symbol indirect
6343 	     call or not here.  */
6344 	  if (ix86_function_regparm (type, decl) >= 3
6345 	      && !cfun->machine->arg_reg_available)
6346 	    return false;
6347 	}
6348     }
6349 
6350   /* Otherwise okay.  That also includes certain types of indirect calls.  */
6351   return true;
6352 }
6353 
6354 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6355    and "sseregparm" calling convention attributes;
6356    arguments as in struct attribute_spec.handler.  */
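/* For example, a declaration such as

     int __attribute__ ((regparm (2))) f (int, int);

   reaches this handler with NAME identifying "regparm" and ARGS holding
   the INTEGER_CST 2.  (Illustrative only.)  */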
6357 
6358 static tree
6359 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6360 			     bool *no_add_attrs)
6361 {
6362   if (TREE_CODE (*node) != FUNCTION_TYPE
6363       && TREE_CODE (*node) != METHOD_TYPE
6364       && TREE_CODE (*node) != FIELD_DECL
6365       && TREE_CODE (*node) != TYPE_DECL)
6366     {
6367       warning (OPT_Wattributes, "%qE attribute only applies to functions",
6368 	       name);
6369       *no_add_attrs = true;
6370       return NULL_TREE;
6371     }
6372 
6373   /* Can combine regparm with all attributes but fastcall and thiscall.  */
6374   if (is_attribute_p ("regparm", name))
6375     {
6376       tree cst;
6377 
6378       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6379         {
6380 	  error ("fastcall and regparm attributes are not compatible");
6381 	}
6382 
6383       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6384 	{
6385 	  error ("regparm and thiscall attributes are not compatible");
6386 	}
6387 
6388       cst = TREE_VALUE (args);
6389       if (TREE_CODE (cst) != INTEGER_CST)
6390 	{
6391 	  warning (OPT_Wattributes,
6392 		   "%qE attribute requires an integer constant argument",
6393 		   name);
6394 	  *no_add_attrs = true;
6395 	}
6396       else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6397 	{
6398 	  warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6399 		   name, REGPARM_MAX);
6400 	  *no_add_attrs = true;
6401 	}
6402 
6403       return NULL_TREE;
6404     }
6405 
6406   if (TARGET_64BIT)
6407     {
6408       /* Do not warn when emulating the MS ABI.  */
6409       if ((TREE_CODE (*node) != FUNCTION_TYPE
6410 	   && TREE_CODE (*node) != METHOD_TYPE)
6411 	  || ix86_function_type_abi (*node) != MS_ABI)
6412 	warning (OPT_Wattributes, "%qE attribute ignored",
6413 	         name);
6414       *no_add_attrs = true;
6415       return NULL_TREE;
6416     }
6417 
6418   /* Can combine fastcall with stdcall (redundant) and sseregparm.  */
6419   if (is_attribute_p ("fastcall", name))
6420     {
6421       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6422         {
6423 	  error ("fastcall and cdecl attributes are not compatible");
6424 	}
6425       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6426         {
6427 	  error ("fastcall and stdcall attributes are not compatible");
6428 	}
6429       if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6430         {
6431 	  error ("fastcall and regparm attributes are not compatible");
6432 	}
6433       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6434 	{
6435 	  error ("fastcall and thiscall attributes are not compatible");
6436 	}
6437     }
6438 
6439   /* Can combine stdcall with fastcall (redundant), regparm and
6440      sseregparm.  */
6441   else if (is_attribute_p ("stdcall", name))
6442     {
6443       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6444         {
6445 	  error ("stdcall and cdecl attributes are not compatible");
6446 	}
6447       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6448         {
6449 	  error ("stdcall and fastcall attributes are not compatible");
6450 	}
6451       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6452 	{
6453 	  error ("stdcall and thiscall attributes are not compatible");
6454 	}
6455     }
6456 
6457   /* Can combine cdecl with regparm and sseregparm.  */
6458   else if (is_attribute_p ("cdecl", name))
6459     {
6460       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6461         {
6462 	  error ("stdcall and cdecl attributes are not compatible");
6463 	}
6464       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6465         {
6466 	  error ("fastcall and cdecl attributes are not compatible");
6467 	}
6468       if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6469 	{
6470 	  error ("cdecl and thiscall attributes are not compatible");
6471 	}
6472     }
6473   else if (is_attribute_p ("thiscall", name))
6474     {
6475       if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6476 	warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6477 	         name);
6478       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6479 	{
6480 	  error ("stdcall and thiscall attributes are not compatible");
6481 	}
6482       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6483 	{
6484 	  error ("fastcall and thiscall attributes are not compatible");
6485 	}
6486       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6487 	{
6488 	  error ("cdecl and thiscall attributes are not compatible");
6489 	}
6490     }
6491 
6492   /* Can combine sseregparm with all attributes.  */
6493 
6494   return NULL_TREE;
6495 }
6496 
6497 /* The transactional memory builtins are implicitly regparm or fastcall
6498    depending on the ABI.  Override the generic do-nothing attribute that
6499    these builtins were declared with, and replace it with one of the two
6500    attributes that we expect elsewhere.  */
6501 
6502 static tree
6503 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6504 				  int flags, bool *no_add_attrs)
6505 {
6506   tree alt;
6507 
6508   /* In no case do we want to add the placeholder attribute.  */
6509   *no_add_attrs = true;
6510 
6511   /* The 64-bit ABI is unchanged for transactional memory.  */
6512   if (TARGET_64BIT)
6513     return NULL_TREE;
6514 
6515   /* ??? Is there a better way to validate 32-bit Windows?  We have
6516      cfun->machine->call_abi, but that seems to be set only for 64-bit.  */
6517   if (CHECK_STACK_LIMIT > 0)
6518     alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6519   else
6520     {
6521       alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6522       alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6523     }
6524   decl_attributes (node, alt, flags);
6525 
6526   return NULL_TREE;
6527 }
6528 
6529 /* This function determines from TYPE the calling convention.  */
6530 
6531 unsigned int
6532 ix86_get_callcvt (const_tree type)
6533 {
6534   unsigned int ret = 0;
6535   bool is_stdarg;
6536   tree attrs;
6537 
6538   if (TARGET_64BIT)
6539     return IX86_CALLCVT_CDECL;
6540 
6541   attrs = TYPE_ATTRIBUTES (type);
6542   if (attrs != NULL_TREE)
6543     {
6544       if (lookup_attribute ("cdecl", attrs))
6545 	ret |= IX86_CALLCVT_CDECL;
6546       else if (lookup_attribute ("stdcall", attrs))
6547 	ret |= IX86_CALLCVT_STDCALL;
6548       else if (lookup_attribute ("fastcall", attrs))
6549 	ret |= IX86_CALLCVT_FASTCALL;
6550       else if (lookup_attribute ("thiscall", attrs))
6551 	ret |= IX86_CALLCVT_THISCALL;
6552 
6553       /* Regparm isn't allowed for thiscall and fastcall.  */
6554       if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6555 	{
6556 	  if (lookup_attribute ("regparm", attrs))
6557 	    ret |= IX86_CALLCVT_REGPARM;
6558 	  if (lookup_attribute ("sseregparm", attrs))
6559 	    ret |= IX86_CALLCVT_SSEREGPARM;
6560 	}
6561 
6562       if (IX86_BASE_CALLCVT(ret) != 0)
6563 	return ret;
6564     }
6565 
6566   is_stdarg = stdarg_p (type);
6567   if (TARGET_RTD && !is_stdarg)
6568     return IX86_CALLCVT_STDCALL | ret;
6569 
6570   if (ret != 0
6571       || is_stdarg
6572       || TREE_CODE (type) != METHOD_TYPE
6573       || ix86_function_type_abi (type) != MS_ABI)
6574     return IX86_CALLCVT_CDECL | ret;
6575 
6576   return IX86_CALLCVT_THISCALL;
6577 }
6578 
6579 /* Return 0 if the attributes for two types are incompatible, 1 if they
6580    are compatible, and 2 if they are nearly compatible (which causes a
6581    warning to be generated).  */
6582 
6583 static int
6584 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6585 {
6586   unsigned int ccvt1, ccvt2;
6587 
6588   if (TREE_CODE (type1) != FUNCTION_TYPE
6589       && TREE_CODE (type1) != METHOD_TYPE)
6590     return 1;
6591 
6592   ccvt1 = ix86_get_callcvt (type1);
6593   ccvt2 = ix86_get_callcvt (type2);
6594   if (ccvt1 != ccvt2)
6595     return 0;
6596   if (ix86_function_regparm (type1, NULL)
6597       != ix86_function_regparm (type2, NULL))
6598     return 0;
6599 
6600   return 1;
6601 }
6602 
6603 /* Return the regparm value for a function with the indicated TYPE and DECL.
6604    DECL may be NULL when calling a function indirectly
6605    or considering a libcall.  */
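/* For example, on 32-bit targets regparm (3) passes the first three
   integer arguments in EAX, EDX and ECX instead of on the stack, while
   fastcall uses ECX and EDX and thiscall just ECX.  (Informal summary.)  */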
6606 
6607 static int
6608 ix86_function_regparm (const_tree type, const_tree decl)
6609 {
6610   tree attr;
6611   int regparm;
6612   unsigned int ccvt;
6613 
6614   if (TARGET_64BIT)
6615     return (ix86_function_type_abi (type) == SYSV_ABI
6616 	    ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6617   ccvt = ix86_get_callcvt (type);
6618   regparm = ix86_regparm;
6619 
6620   if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6621     {
6622       attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6623       if (attr)
6624 	{
6625 	  regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6626 	  return regparm;
6627 	}
6628     }
6629   else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6630     return 2;
6631   else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6632     return 1;
6633 
6634   /* Use register calling convention for local functions when possible.  */
6635   if (decl
6636       && TREE_CODE (decl) == FUNCTION_DECL)
6637     {
6638       cgraph_node *target = cgraph_node::get (decl);
6639       if (target)
6640 	target = target->function_symbol ();
6641 
6642       /* Caller and callee must agree on the calling convention, so
6643 	 checking just the current function's `optimize' setting here would
6644 	 mean that with __attribute__((optimize (...))) the caller could use
6645 	 the regparm convention and the callee not, or vice versa.  Instead
6646 	 look at whether the callee is optimized or not.  */
6647       if (target && opt_for_fn (target->decl, optimize)
6648 	  && !(profile_flag && !flag_fentry))
6649 	{
6650 	  cgraph_local_info *i = &target->local;
6651 	  if (i && i->local && i->can_change_signature)
6652 	    {
6653 	      int local_regparm, globals = 0, regno;
6654 
6655 	      /* Make sure no regparm register is taken by a
6656 		 fixed register variable.  */
6657 	      for (local_regparm = 0; local_regparm < REGPARM_MAX;
6658 		   local_regparm++)
6659 		if (fixed_regs[local_regparm])
6660 		  break;
6661 
6662 	      /* We don't want to use regparm(3) for nested functions as
6663 		 these use a static chain pointer in the third argument.  */
6664 	      if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6665 		local_regparm = 2;
6666 
6667 	      /* Save a register for the split stack.  */
6668 	      if (flag_split_stack)
6669 		{
6670 		  if (local_regparm == 3)
6671 		    local_regparm = 2;
6672 		  else if (local_regparm == 2
6673 			   && DECL_STATIC_CHAIN (target->decl))
6674 		    local_regparm = 1;
6675 		}
6676 
6677 	      /* Each fixed register usage increases register pressure,
6678 		 so fewer registers should be used for argument passing.
6679 		 This functionality can be overridden by an explicit
6680 		 regparm value.  */
6681 	      for (regno = AX_REG; regno <= DI_REG; regno++)
6682 		if (fixed_regs[regno])
6683 		  globals++;
6684 
6685 	      local_regparm
6686 		= globals < local_regparm ? local_regparm - globals : 0;
6687 
6688 	      if (local_regparm > regparm)
6689 		regparm = local_regparm;
6690 	    }
6691 	}
6692     }
6693 
6694   return regparm;
6695 }
6696 
6697 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6698    DFmode (2) arguments in SSE registers for a function with the
6699    indicated TYPE and DECL.  DECL may be NULL when calling a function
6700    indirectly or considering a libcall.  Return -1 if any FP parameter
6701    should be rejected with an error.  This is used in situations where we
6702    imply the SSE calling convention but the function is called from another
6703    function with SSE disabled.  Otherwise return 0.  */
6704 
6705 static int
6706 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6707 {
6708   gcc_assert (!TARGET_64BIT);
6709 
6710   /* Use SSE registers to pass SFmode and DFmode arguments if requested
6711      by the sseregparm attribute.  */
6712   if (TARGET_SSEREGPARM
6713       || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6714     {
6715       if (!TARGET_SSE)
6716 	{
6717 	  if (warn)
6718 	    {
6719 	      if (decl)
6720 		error ("calling %qD with attribute sseregparm without "
6721 		       "SSE/SSE2 enabled", decl);
6722 	      else
6723 		error ("calling %qT with attribute sseregparm without "
6724 		       "SSE/SSE2 enabled", type);
6725 	    }
6726 	  return 0;
6727 	}
6728 
6729       return 2;
6730     }
6731 
6732   if (!decl)
6733     return 0;
6734 
6735   cgraph_node *target = cgraph_node::get (decl);
6736   if (target)
6737     target = target->function_symbol ();
6738 
6739   /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6740      (and DFmode for SSE2) arguments in SSE registers.  */
6741   if (target
6742       /* TARGET_SSE_MATH */
6743       && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6744       && opt_for_fn (target->decl, optimize)
6745       && !(profile_flag && !flag_fentry))
6746     {
6747       cgraph_local_info *i = &target->local;
6748       if (i && i->local && i->can_change_signature)
6749 	{
6750 	  /* Refuse to produce wrong code when a local function with SSE enabled
6751 	     is called from an SSE-disabled function.
6752 	     FIXME: We need a way to detect these cases across ltrans partitions
6753 	     and avoid using SSE calling conventions on local functions called
6754 	     from functions with SSE disabled.  For now at least delay the
6755 	     warning until we know we are going to produce wrong code.
6756 	     See PR66047.  */
6757 	  if (!TARGET_SSE && warn)
6758 	    return -1;
6759 	  return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6760 				->x_ix86_isa_flags) ? 2 : 1;
6761 	}
6762     }
6763 
6764   return 0;
6765 }
6766 
6767 /* Return true if EAX is live at the start of the function.  Used by
6768    ix86_expand_prologue to determine if we need special help before
6769    calling allocate_stack_worker.  */
6770 
6771 static bool
6772 ix86_eax_live_at_start_p (void)
6773 {
6774   /* Cheat.  Don't bother working forward from ix86_function_regparm
6775      to the function type to whether an actual argument is located in
6776      eax.  Instead just look at cfg info, which is still close enough
6777      to correct at this point.  This gives false positives for broken
6778      functions that might use uninitialized data that happens to be
6779      allocated in eax, but who cares?  */
6780   return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6781 }
6782 
6783 static bool
6784 ix86_keep_aggregate_return_pointer (tree fntype)
6785 {
6786   tree attr;
6787 
6788   if (!TARGET_64BIT)
6789     {
6790       attr = lookup_attribute ("callee_pop_aggregate_return",
6791 			       TYPE_ATTRIBUTES (fntype));
6792       if (attr)
6793 	return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6794 
6795       /* For the 32-bit MS ABI the default is to keep the aggregate
6796          return pointer.  */
6797       if (ix86_function_type_abi (fntype) == MS_ABI)
6798 	return true;
6799     }
6800   return KEEP_AGGREGATE_RETURN_POINTER != 0;
6801 }
6802 
6803 /* Value is the number of bytes of arguments automatically
6804    popped when returning from a subroutine call.
6805    FUNDECL is the declaration node of the function (as a tree),
6806    FUNTYPE is the data type of the function (as a tree),
6807    or for a library call it is an identifier node for the subroutine name.
6808    SIZE is the number of bytes of arguments passed on the stack.
6809 
6810    On the 80386, the RTD insn may be used to pop them if the number
6811      of args is fixed, but if the number is variable then the caller
6812      must pop them all.  RTD can't be used for library calls now
6813      because the library is compiled with the Unix compiler.
6814    Use of RTD is a selectable option, since it is incompatible with
6815    standard Unix calling sequences.  If the option is not selected,
6816    the caller must always pop the args.
6817 
6818    The attribute stdcall is equivalent to RTD on a per module basis.  */
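/* For example, "void __attribute__ ((stdcall)) f (int, int);" makes the
   callee pop its 8 bytes of arguments (typically via "ret $8"), so this
   hook returns 8 for it, while a plain cdecl function yields 0.
   (Illustrative.)  */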
6819 
6820 static poly_int64
6821 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6822 {
6823   unsigned int ccvt;
6824 
6825   /* None of the 64-bit ABIs pop arguments.  */
6826   if (TARGET_64BIT)
6827     return 0;
6828 
6829   ccvt = ix86_get_callcvt (funtype);
6830 
6831   if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6832 	       | IX86_CALLCVT_THISCALL)) != 0
6833       && ! stdarg_p (funtype))
6834     return size;
6835 
6836   /* Lose any fake structure return argument if it is passed on the stack.  */
6837   if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6838       && !ix86_keep_aggregate_return_pointer (funtype))
6839     {
6840       int nregs = ix86_function_regparm (funtype, fundecl);
6841       if (nregs == 0)
6842 	return GET_MODE_SIZE (Pmode);
6843     }
6844 
6845   return 0;
6846 }
6847 
6848 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */
6849 
6850 static bool
6851 ix86_legitimate_combined_insn (rtx_insn *insn)
6852 {
6853   int i;
6854 
6855   /* Check operand constraints in case hard registers were propagated
6856      into insn pattern.  This check prevents combine pass from
6857      generating insn patterns with invalid hard register operands.
6858      These invalid insns can eventually confuse reload to error out
6859      with a spill failure.  See also PRs 46829 and 46843.  */
6860 
6861   gcc_assert (INSN_CODE (insn) >= 0);
6862 
6863   extract_insn (insn);
6864   preprocess_constraints (insn);
6865 
6866   int n_operands = recog_data.n_operands;
6867   int n_alternatives = recog_data.n_alternatives;
6868   for (i = 0; i < n_operands; i++)
6869     {
6870       rtx op = recog_data.operand[i];
6871       machine_mode mode = GET_MODE (op);
6872       const operand_alternative *op_alt;
6873       int offset = 0;
6874       bool win;
6875       int j;
6876 
6877       /* A unary operator may be accepted by the predicate, but it
6878 	 is irrelevant for matching constraints.  */
6879       if (UNARY_P (op))
6880 	op = XEXP (op, 0);
6881 
6882       if (SUBREG_P (op))
6883 	{
6884 	  if (REG_P (SUBREG_REG (op))
6885 	      && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6886 	    offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6887 					  GET_MODE (SUBREG_REG (op)),
6888 					  SUBREG_BYTE (op),
6889 					  GET_MODE (op));
6890 	  op = SUBREG_REG (op);
6891 	}
6892 
6893       if (!(REG_P (op) && HARD_REGISTER_P (op)))
6894 	continue;
6895 
6896       op_alt = recog_op_alt;
6897 
6898       /* Operand has no constraints, anything is OK.  */
6899       win = !n_alternatives;
6900 
6901       alternative_mask preferred = get_preferred_alternatives (insn);
6902       for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6903 	{
6904 	  if (!TEST_BIT (preferred, j))
6905 	    continue;
6906 	  if (op_alt[i].anything_ok
6907 	      || (op_alt[i].matches != -1
6908 		  && operands_match_p
6909 		  (recog_data.operand[i],
6910 		   recog_data.operand[op_alt[i].matches]))
6911 	      || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6912 	    {
6913 	      win = true;
6914 	      break;
6915 	    }
6916 	}
6917 
6918       if (!win)
6919 	return false;
6920     }
6921 
6922   return true;
6923 }
6924 
6925 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
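/* AddressSanitizer maps each address to shadow memory roughly as
   shadow = (addr >> 3) + offset; the constant returned below is that
   offset (0x7fff8000 for LP64 Linux-style targets).  (Informal summary.)  */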
6926 
6927 static unsigned HOST_WIDE_INT
6928 ix86_asan_shadow_offset (void)
6929 {
6930   return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6931 				     : HOST_WIDE_INT_C (0x7fff8000))
6932 		     : (HOST_WIDE_INT_1 << 29);
6933 }
6934 
6935 /* Argument support functions.  */
6936 
6937 /* Return true when REGNO may be used to pass function parameters.  */
6938 bool
6939 ix86_function_arg_regno_p (int regno)
6940 {
6941   int i;
6942   enum calling_abi call_abi;
6943   const int *parm_regs;
6944 
6945   if (TARGET_MPX && BND_REGNO_P (regno))
6946     return true;
6947 
6948   if (!TARGET_64BIT)
6949     {
6950       if (TARGET_MACHO)
6951         return (regno < REGPARM_MAX
6952                 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6953       else
6954         return (regno < REGPARM_MAX
6955 	        || (TARGET_MMX && MMX_REGNO_P (regno)
6956 	  	    && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6957 	        || (TARGET_SSE && SSE_REGNO_P (regno)
6958 		    && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6959     }
6960 
6961   if (TARGET_SSE && SSE_REGNO_P (regno)
6962       && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6963     return true;
6964 
6965   /* TODO: The function should depend on current function ABI but
6966      builtins.c would need updating then. Therefore we use the
6967      default ABI.  */
6968   call_abi = ix86_cfun_abi ();
6969 
6970   /* RAX is used as hidden argument to va_arg functions.  */
6971   /* RAX is used as a hidden argument to varargs functions.  */
6972     return true;
6973 
6974   if (call_abi == MS_ABI)
6975     parm_regs = x86_64_ms_abi_int_parameter_registers;
6976   else
6977     parm_regs = x86_64_int_parameter_registers;
6978 
6979   for (i = 0; i < (call_abi == MS_ABI
6980 		   ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6981     if (regno == parm_regs[i])
6982       return true;
6983   return false;
6984 }
6985 
6986 /* Return if we do not know how to pass TYPE solely in registers.  */
6987 /* Return true if we do not know how to pass TYPE solely in registers.  */
6988 static bool
6989 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6990 {
6991   if (must_pass_in_stack_var_size_or_pad (mode, type))
6992     return true;
6993 
6994   /* For 32-bit, we want TImode aggregates to go on the stack.  But watch out!
6995      The layout_type routine is crafty and tries to trick us into passing
6996      currently unsupported vector types on the stack by using TImode.  */
6997   return (!TARGET_64BIT && mode == TImode
6998 	  && type && TREE_CODE (type) != VECTOR_TYPE);
6999 }
7000 
7001 /* Return the size, in bytes, of the area reserved for arguments passed
7002    in registers for the function represented by FNDECL, depending on the
7003    ABI used.  */
7004 int
7005 ix86_reg_parm_stack_space (const_tree fndecl)
7006 {
7007   enum calling_abi call_abi = SYSV_ABI;
7008   if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
7009     call_abi = ix86_function_abi (fndecl);
7010   else
7011     call_abi = ix86_function_type_abi (fndecl);
7012   if (TARGET_64BIT && call_abi == MS_ABI)
7013     return 32;
7014   return 0;
7015 }
7016 
7017 /* We add this as a workaround in order to use the libc_has_function
7018    hook in i386.md.  */
7019 bool
7020 ix86_libc_has_function (enum function_class fn_class)
7021 {
7022   return targetm.libc_has_function (fn_class);
7023 }
7024 
7025 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
7026    call ABI used.  */
7027 enum calling_abi
7028 ix86_function_type_abi (const_tree fntype)
7029 {
7030   enum calling_abi abi = ix86_abi;
7031 
7032   if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
7033     return abi;
7034 
7035   if (abi == SYSV_ABI
7036       && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
7037     {
7038       static int warned;
7039       if (TARGET_X32 && !warned)
7040 	{
7041 	  error ("X32 does not support ms_abi attribute");
7042 	  warned = 1;
7043 	}
7044 
7045       abi = MS_ABI;
7046     }
7047   else if (abi == MS_ABI
7048 	   && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
7049     abi = SYSV_ABI;
7050 
7051   return abi;
7052 }
7053 
7054 static enum calling_abi
7055 ix86_function_abi (const_tree fndecl)
7056 {
7057   return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
7058 }
7059 
7060 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
7061    call ABI used.  */
7062 enum calling_abi
7063 ix86_cfun_abi (void)
7064 {
7065   return cfun ? cfun->machine->call_abi : ix86_abi;
7066 }
7067 
7068 static bool
7069 ix86_function_ms_hook_prologue (const_tree fn)
7070 {
7071   if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
7072     {
7073       if (decl_function_context (fn) != NULL_TREE)
7074 	error_at (DECL_SOURCE_LOCATION (fn),
7075 		  "ms_hook_prologue is not compatible with nested function");
7076       else
7077         return true;
7078     }
7079   return false;
7080 }
7081 
7082 static bool
7083 ix86_function_naked (const_tree fn)
7084 {
7085   if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7086     return true;
7087 
7088   return false;
7089 }
7090 
7091 /* Write the extra assembler code needed to declare a function properly.  */
7092 
7093 void
7094 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7095 				tree decl)
7096 {
7097   bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7098 
7099   if (is_ms_hook)
7100     {
7101       int i, filler_count = (TARGET_64BIT ? 32 : 16);
7102       unsigned int filler_cc = 0xcccccccc;
7103 
7104       for (i = 0; i < filler_count; i += 4)
7105         fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7106     }
7107 
7108 #ifdef SUBTARGET_ASM_UNWIND_INIT
7109   SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7110 #endif
7111 
7112   ASM_OUTPUT_LABEL (asm_out_file, fname);
7113 
7114   /* Output magic byte marker, if hot-patch attribute is set.  */
7115   if (is_ms_hook)
7116     {
7117       if (TARGET_64BIT)
7118 	{
7119 	  /* leaq [%rsp + 0], %rsp  */
7120 	  fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7121 		 asm_out_file);
7122 	}
7123       else
7124 	{
7125           /* movl.s %edi, %edi
7126 	     push   %ebp
7127 	     movl.s %esp, %ebp */
7128 	  fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
7129 	}
7130     }
7131 }
7132 
7133 /* Implementation of the call ABI switching target hook.  The call
7134    register sets specific to FNDECL are set up here.  See also
7135    ix86_conditional_register_usage for more details.  */
7136 void
7137 ix86_call_abi_override (const_tree fndecl)
7138 {
7139   cfun->machine->call_abi = ix86_function_abi (fndecl);
7140 }
7141 
7142 /* Return true if a pseudo register should be created and used to hold
7143    the GOT address for PIC code.  */
7144 bool
7145 ix86_use_pseudo_pic_reg (void)
7146 {
7147   if ((TARGET_64BIT
7148        && (ix86_cmodel == CM_SMALL_PIC
7149 	   || TARGET_PECOFF))
7150       || !flag_pic)
7151     return false;
7152   return true;
7153 }
7154 
7155 /* Initialize large model PIC register.  */
7156 
7157 static void
7158 ix86_init_large_pic_reg (unsigned int tmp_regno)
7159 {
7160   rtx_code_label *label;
7161   rtx tmp_reg;
7162 
7163   gcc_assert (Pmode == DImode);
7164   label = gen_label_rtx ();
7165   emit_label (label);
7166   LABEL_PRESERVE_P (label) = 1;
7167   tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7168   gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7169   emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7170 				label));
7171   emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7172   emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7173 			    pic_offset_table_rtx, tmp_reg));
7174   const char *name = LABEL_NAME (label);
7175   PUT_CODE (label, NOTE);
7176   NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7177   NOTE_DELETED_LABEL_NAME (label) = name;
7178 }
7179 
7180 /* Create and initialize PIC register if required.  */
7181 static void
7182 ix86_init_pic_reg (void)
7183 {
7184   edge entry_edge;
7185   rtx_insn *seq;
7186 
7187   if (!ix86_use_pseudo_pic_reg ())
7188     return;
7189 
7190   start_sequence ();
7191 
7192   if (TARGET_64BIT)
7193     {
7194       if (ix86_cmodel == CM_LARGE_PIC)
7195 	ix86_init_large_pic_reg (R11_REG);
7196       else
7197 	emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7198     }
7199   else
7200     {
7201       /* If there will be an mcount call in the function, it is more profitable
7202 	  to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM.  */
7203       rtx reg = crtl->profile
7204 		? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7205 		: pic_offset_table_rtx;
7206       rtx_insn *insn = emit_insn (gen_set_got (reg));
7207       RTX_FRAME_RELATED_P (insn) = 1;
7208       if (crtl->profile)
7209         emit_move_insn (pic_offset_table_rtx, reg);
7210       add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7211     }
7212 
7213   seq = get_insns ();
7214   end_sequence ();
7215 
7216   entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7217   insert_insn_on_edge (seq, entry_edge);
7218   commit_one_edge_insertion (entry_edge);
7219 }
7220 
7221 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7222    for a call to a function whose data type is FNTYPE.
7223    For a library call, FNTYPE is 0.  */
7224 
7225 void
7226 init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
7227 		      tree fntype,	/* tree ptr for function decl */
7228 		      rtx libname,	/* SYMBOL_REF of library name or 0 */
7229 		      tree fndecl,
7230 		      int caller)
7231 {
7232   struct cgraph_local_info *i = NULL;
7233   struct cgraph_node *target = NULL;
7234 
7235   memset (cum, 0, sizeof (*cum));
7236 
7237   if (fndecl)
7238     {
7239       target = cgraph_node::get (fndecl);
7240       if (target)
7241 	{
7242 	  target = target->function_symbol ();
7243 	  i = cgraph_node::local_info (target->decl);
7244 	  cum->call_abi = ix86_function_abi (target->decl);
7245 	}
7246       else
7247 	cum->call_abi = ix86_function_abi (fndecl);
7248     }
7249   else
7250     cum->call_abi = ix86_function_type_abi (fntype);
7251 
7252   cum->caller = caller;
7253 
7254   /* Set up the number of registers to use for passing arguments.  */
7255   cum->nregs = ix86_regparm;
7256   if (TARGET_64BIT)
7257     {
7258       cum->nregs = (cum->call_abi == SYSV_ABI
7259                    ? X86_64_REGPARM_MAX
7260                    : X86_64_MS_REGPARM_MAX);
7261     }
7262   if (TARGET_SSE)
7263     {
7264       cum->sse_nregs = SSE_REGPARM_MAX;
7265       if (TARGET_64BIT)
7266         {
7267           cum->sse_nregs = (cum->call_abi == SYSV_ABI
7268                            ? X86_64_SSE_REGPARM_MAX
7269                            : X86_64_MS_SSE_REGPARM_MAX);
7270         }
7271     }
7272   if (TARGET_MMX)
7273     cum->mmx_nregs = MMX_REGPARM_MAX;
7274   cum->warn_avx512f = true;
7275   cum->warn_avx = true;
7276   cum->warn_sse = true;
7277   cum->warn_mmx = true;
7278 
7279   /* Because the type might mismatch between caller and callee, we need to
7280      use the actual type of the function for local calls.
7281      FIXME: cgraph_analyze can be told to actually record if a function uses
7282      va_start, so for local functions maybe_vaarg can be made more aggressive,
7283      helping K&R code.
7284      FIXME: once the type system is fixed, we won't need this code anymore.  */
7285   if (i && i->local && i->can_change_signature)
7286     fntype = TREE_TYPE (target->decl);
7287   cum->stdarg = stdarg_p (fntype);
7288   cum->maybe_vaarg = (fntype
7289 		      ? (!prototype_p (fntype) || stdarg_p (fntype))
7290 		      : !libname);
7291 
7292   cum->bnd_regno = FIRST_BND_REG;
7293   cum->bnds_in_bt = 0;
7294   cum->force_bnd_pass = 0;
7295   cum->decl = fndecl;
7296 
7297   cum->warn_empty = !warn_abi || cum->stdarg;
7298   if (!cum->warn_empty && fntype)
7299     {
7300       function_args_iterator iter;
7301       tree argtype;
7302       bool seen_empty_type = false;
7303       FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7304 	{
7305 	  if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7306 	    break;
7307 	  if (TYPE_EMPTY_P (argtype))
7308 	    seen_empty_type = true;
7309 	  else if (seen_empty_type)
7310 	    {
7311 	      cum->warn_empty = true;
7312 	      break;
7313 	    }
7314 	}
7315     }
7316 
7317   if (!TARGET_64BIT)
7318     {
7319       /* If there are variable arguments, then we won't pass anything
7320          in registers in 32-bit mode. */
7321       if (stdarg_p (fntype))
7322 	{
7323 	  cum->nregs = 0;
7324 	  /* Since in 32-bit mode variable arguments are always passed on
7325 	     the stack, there is a scratch register available for an indirect
7326 	     sibcall.  */
7327 	  cfun->machine->arg_reg_available = true;
7328 	  cum->sse_nregs = 0;
7329 	  cum->mmx_nregs = 0;
7330 	  cum->warn_avx512f = false;
7331 	  cum->warn_avx = false;
7332 	  cum->warn_sse = false;
7333 	  cum->warn_mmx = false;
7334 	  return;
7335 	}
7336 
7337       /* Use ecx and edx registers if function has fastcall attribute,
7338 	 else look for regparm information.  */
7339       if (fntype)
7340 	{
7341 	  unsigned int ccvt = ix86_get_callcvt (fntype);
7342 	  if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7343 	    {
7344 	      cum->nregs = 1;
7345 	      cum->fastcall = 1; /* Same first register as in fastcall.  */
7346 	    }
7347 	  else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7348 	    {
7349 	      cum->nregs = 2;
7350 	      cum->fastcall = 1;
7351 	    }
7352 	  else
7353 	    cum->nregs = ix86_function_regparm (fntype, fndecl);
7354 	}
7355 
7356       /* Set up the number of SSE registers used for passing SFmode
7357 	 and DFmode arguments.  Warn for mismatching ABI.  */
7358       cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7359     }
7360 
7361   cfun->machine->arg_reg_available = (cum->nregs > 0);
7362 }
7363 
7364 /* Return the "natural" mode for TYPE.  In most cases, this is just TYPE_MODE.
7365    But in the case of vector types, it is some vector mode.
7366 
7367    When we have only some of our vector isa extensions enabled, then there
7368    are some modes for which vector_mode_supported_p is false.  For these
7369    modes, the generic vector support in gcc will choose some non-vector mode
7370    in order to implement the type.  By computing the natural mode, we'll
7371    select the proper ABI location for the operand and not depend on whatever
7372    the middle-end decides to do with these vector types.
7373 
7374    The middle-end can't deal with vector types > 16 bytes.  In this
7375    case, we return the original mode and warn about the ABI change if
7376    CUM isn't NULL.
7377 
7378    If IN_RETURN is true, warn about the ABI change if the vector mode
7379    isn't available for the function return value.  */
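/* For example, "float __attribute__ ((vector_size (16)))" is classified
   here as V4SFmode even if the middle-end picked a non-vector mode for it
   (e.g. when SSE is disabled), so the ABI classification is based on the
   vector mode rather than on whatever the middle-end chose.
   (Illustrative.)  */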
7380 
7381 static machine_mode
7382 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7383 		   bool in_return)
7384 {
7385   machine_mode mode = TYPE_MODE (type);
7386 
7387   if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7388     {
7389       HOST_WIDE_INT size = int_size_in_bytes (type);
7390       if ((size == 8 || size == 16 || size == 32 || size == 64)
7391 	  /* ??? Generic code allows us to create width 1 vectors.  Ignore.  */
7392 	  && TYPE_VECTOR_SUBPARTS (type) > 1)
7393 	{
7394 	  machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7395 
7396 	  /* There are no XFmode vector modes.  */
7397 	  if (innermode == XFmode)
7398 	    return mode;
7399 
7400 	  if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7401 	    mode = MIN_MODE_VECTOR_FLOAT;
7402 	  else
7403 	    mode = MIN_MODE_VECTOR_INT;
7404 
7405 	  /* Get the mode which has this inner mode and number of units.  */
7406 	  FOR_EACH_MODE_FROM (mode, mode)
7407 	    if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7408 		&& GET_MODE_INNER (mode) == innermode)
7409 	      {
7410 		if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7411 		  {
7412 		    static bool warnedavx512f;
7413 		    static bool warnedavx512f_ret;
7414 
7415 		    if (cum && cum->warn_avx512f && !warnedavx512f)
7416 		      {
7417 			if (warning (OPT_Wpsabi, "AVX512F vector argument "
7418 				     "without AVX512F enabled changes the ABI"))
7419 			  warnedavx512f = true;
7420 		      }
7421 		    else if (in_return && !warnedavx512f_ret)
7422 		      {
7423 			if (warning (OPT_Wpsabi, "AVX512F vector return "
7424 				     "without AVX512F enabled changes the ABI"))
7425 			  warnedavx512f_ret = true;
7426 		      }
7427 
7428 		    return TYPE_MODE (type);
7429 		  }
7430 		else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7431 		  {
7432 		    static bool warnedavx;
7433 		    static bool warnedavx_ret;
7434 
7435 		    if (cum && cum->warn_avx && !warnedavx)
7436 		      {
7437 			if (warning (OPT_Wpsabi, "AVX vector argument "
7438 				     "without AVX enabled changes the ABI"))
7439 			  warnedavx = true;
7440 		      }
7441 		    else if (in_return && !warnedavx_ret)
7442 		      {
7443 			if (warning (OPT_Wpsabi, "AVX vector return "
7444 				     "without AVX enabled changes the ABI"))
7445 			  warnedavx_ret = true;
7446 		      }
7447 
7448 		    return TYPE_MODE (type);
7449 		  }
7450 		else if (((size == 8 && TARGET_64BIT) || size == 16)
7451 			 && !TARGET_SSE
7452 			 && !TARGET_IAMCU)
7453 		  {
7454 		    static bool warnedsse;
7455 		    static bool warnedsse_ret;
7456 
7457 		    if (cum && cum->warn_sse && !warnedsse)
7458 		      {
7459 			if (warning (OPT_Wpsabi, "SSE vector argument "
7460 				     "without SSE enabled changes the ABI"))
7461 			  warnedsse = true;
7462 		      }
7463 		    else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7464 		      {
7465 			if (warning (OPT_Wpsabi, "SSE vector return "
7466 				     "without SSE enabled changes the ABI"))
7467 			  warnedsse_ret = true;
7468 		      }
7469 		  }
7470 		else if ((size == 8 && !TARGET_64BIT)
7471 			 && (!cfun
7472 			     || cfun->machine->func_type == TYPE_NORMAL)
7473 			 && !TARGET_MMX
7474 			 && !TARGET_IAMCU)
7475 		  {
7476 		    static bool warnedmmx;
7477 		    static bool warnedmmx_ret;
7478 
7479 		    if (cum && cum->warn_mmx && !warnedmmx)
7480 		      {
7481 			if (warning (OPT_Wpsabi, "MMX vector argument "
7482 				     "without MMX enabled changes the ABI"))
7483 			  warnedmmx = true;
7484 		      }
7485 		    else if (in_return && !warnedmmx_ret)
7486 		      {
7487 			if (warning (OPT_Wpsabi, "MMX vector return "
7488 				     "without MMX enabled changes the ABI"))
7489 			  warnedmmx_ret = true;
7490 		      }
7491 		  }
7492 		return mode;
7493 	      }
7494 
7495 	  gcc_unreachable ();
7496 	}
7497     }
7498 
7499   return mode;
7500 }
7501 
7502 /* We want to pass a value in REGNO whose "natural" mode is MODE.  However,
7503    this may not agree with the mode that the type system has chosen for the
7504    register, which is ORIG_MODE.  If ORIG_MODE is not BLKmode, then we can
7505    go ahead and use it.  Otherwise we have to build a PARALLEL instead.  */
7506 
7507 static rtx
7508 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7509 		     unsigned int regno)
7510 {
7511   rtx tmp;
7512 
7513   if (orig_mode != BLKmode)
7514     tmp = gen_rtx_REG (orig_mode, regno);
7515   else
7516     {
7517       tmp = gen_rtx_REG (mode, regno);
7518       tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7519       tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7520     }
7521 
7522   return tmp;
7523 }
7524 
7525 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
7526    The goal of this code is to classify each eightbyte (8-byte chunk) of an
7527    incoming argument by register class and assign registers accordingly.  */
7528 
7529 /* Return the union class of CLASS1 and CLASS2.
7530    See the x86-64 PS ABI for details.  */
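/* For example, per rule #4 below, merging X86_64_INTEGERSI_CLASS with
   X86_64_SSESF_CLASS yields X86_64_INTEGERSI_CLASS, and any INTEGER
   operand forces the whole eightbyte into an integer register.  */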
7531 
7532 static enum x86_64_reg_class
7533 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7534 {
7535   /* Rule #1: If both classes are equal, this is the resulting class.  */
7536   if (class1 == class2)
7537     return class1;
7538 
7539   /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7540      the other class.  */
7541   if (class1 == X86_64_NO_CLASS)
7542     return class2;
7543   if (class2 == X86_64_NO_CLASS)
7544     return class1;
7545 
7546   /* Rule #3: If one of the classes is MEMORY, the result is MEMORY.  */
7547   if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7548     return X86_64_MEMORY_CLASS;
7549 
7550   /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
7551   if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7552       || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7553     return X86_64_INTEGERSI_CLASS;
7554   if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7555       || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7556     return X86_64_INTEGER_CLASS;
7557 
7558   /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7559      MEMORY is used.  */
7560   if (class1 == X86_64_X87_CLASS
7561       || class1 == X86_64_X87UP_CLASS
7562       || class1 == X86_64_COMPLEX_X87_CLASS
7563       || class2 == X86_64_X87_CLASS
7564       || class2 == X86_64_X87UP_CLASS
7565       || class2 == X86_64_COMPLEX_X87_CLASS)
7566     return X86_64_MEMORY_CLASS;
7567 
7568   /* Rule #6: Otherwise class SSE is used.  */
7569   return X86_64_SSE_CLASS;
7570 }
7571 
7572 /* Classify the argument of type TYPE and mode MODE.
7573    CLASSES will be filled by the register class used to pass each word
7574    of the operand.  The number of words is returned.  In case the parameter
7575    should be passed in memory, 0 is returned. As a special case for zero
7576    sized containers, classes[0] will be NO_CLASS and 1 is returned.
7577 
7578    BIT_OFFSET is used internally for handling records and specifies the
7579    offset in bits modulo 512 to avoid overflow cases.
7580 
7581    See the x86-64 PS ABI for details.
7582 */
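/* For example, "struct { double d; long l; }" classifies its first
   eightbyte as X86_64_SSEDF_CLASS and its second as X86_64_INTEGER_CLASS,
   so it is passed in one SSE and one integer register.  (Illustrative.)  */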
7583 
7584 static int
7585 classify_argument (machine_mode mode, const_tree type,
7586 		   enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7587 {
7588   HOST_WIDE_INT bytes =
7589     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7590   int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7591 
7592   /* Variable sized entities are always passed/returned in memory.  */
7593   if (bytes < 0)
7594     return 0;
7595 
7596   if (mode != VOIDmode
7597       && targetm.calls.must_pass_in_stack (mode, type))
7598     return 0;
7599 
7600   if (type && AGGREGATE_TYPE_P (type))
7601     {
7602       int i;
7603       tree field;
7604       enum x86_64_reg_class subclasses[MAX_CLASSES];
7605 
7606       /* On x86-64 we pass structures larger than 64 bytes on the stack.  */
7607       if (bytes > 64)
7608 	return 0;
7609 
7610       for (i = 0; i < words; i++)
7611 	classes[i] = X86_64_NO_CLASS;
7612 
7613       /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
7614 	 signal the memory class, so handle this as a special case.  */
7615       if (!words)
7616 	{
7617 	  classes[0] = X86_64_NO_CLASS;
7618 	  return 1;
7619 	}
7620 
7621       /* Classify each field of record and merge classes.  */
7622       switch (TREE_CODE (type))
7623 	{
7624 	case RECORD_TYPE:
7625 	  /* And now merge the fields of structure.  */
7626 	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7627 	    {
7628 	      if (TREE_CODE (field) == FIELD_DECL)
7629 		{
7630 		  int num;
7631 
7632 		  if (TREE_TYPE (field) == error_mark_node)
7633 		    continue;
7634 
7635 		  /* Bitfields are always classified as integer.  Handle them
7636 		     early, since later code would consider them to be
7637 		     misaligned integers.  */
7638 		  if (DECL_BIT_FIELD (field))
7639 		    {
7640 		      for (i = (int_bit_position (field)
7641 				+ (bit_offset % 64)) / 8 / 8;
7642 			   i < ((int_bit_position (field) + (bit_offset % 64))
7643 			        + tree_to_shwi (DECL_SIZE (field))
7644 				+ 63) / 8 / 8; i++)
7645 			classes[i] =
7646 			  merge_classes (X86_64_INTEGER_CLASS,
7647 					 classes[i]);
7648 		    }
7649 		  else
7650 		    {
7651 		      int pos;
7652 
7653 		      type = TREE_TYPE (field);
7654 
7655 		      /* Flexible array member is ignored.  */
7656 		      if (TYPE_MODE (type) == BLKmode
7657 			  && TREE_CODE (type) == ARRAY_TYPE
7658 			  && TYPE_SIZE (type) == NULL_TREE
7659 			  && TYPE_DOMAIN (type) != NULL_TREE
7660 			  && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7661 			      == NULL_TREE))
7662 			{
7663 			  static bool warned;
7664 
7665 			  if (!warned && warn_psabi)
7666 			    {
7667 			      warned = true;
7668 			      inform (input_location,
7669 				      "the ABI of passing struct with"
7670 				      " a flexible array member has"
7671 				      " changed in GCC 4.4");
7672 			    }
7673 			  continue;
7674 			}
7675 		      num = classify_argument (TYPE_MODE (type), type,
7676 					       subclasses,
7677 					       (int_bit_position (field)
7678 						+ bit_offset) % 512);
7679 		      if (!num)
7680 			return 0;
7681 		      pos = (int_bit_position (field)
7682 			     + (bit_offset % 64)) / 8 / 8;
7683 		      for (i = 0; i < num && (i + pos) < words; i++)
7684 			classes[i + pos] =
7685 			  merge_classes (subclasses[i], classes[i + pos]);
7686 		    }
7687 		}
7688 	    }
7689 	  break;
7690 
7691 	case ARRAY_TYPE:
7692 	  /* Arrays are handled as small records.  */
7693 	  {
7694 	    int num;
7695 	    num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7696 				     TREE_TYPE (type), subclasses, bit_offset);
7697 	    if (!num)
7698 	      return 0;
7699 
7700 	    /* The partial classes are now full classes.  */
7701 	    if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7702 	      subclasses[0] = X86_64_SSE_CLASS;
7703 	    if (subclasses[0] == X86_64_INTEGERSI_CLASS
7704 		&& !((bit_offset % 64) == 0 && bytes == 4))
7705 	      subclasses[0] = X86_64_INTEGER_CLASS;
7706 
7707 	    for (i = 0; i < words; i++)
7708 	      classes[i] = subclasses[i % num];
7709 
7710 	    break;
7711 	  }
7712 	case UNION_TYPE:
7713 	case QUAL_UNION_TYPE:
7714 	  /* Unions are similar to RECORD_TYPE but the offset is always 0.  */
7716 	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7717 	    {
7718 	      if (TREE_CODE (field) == FIELD_DECL)
7719 		{
7720 		  int num;
7721 
7722 		  if (TREE_TYPE (field) == error_mark_node)
7723 		    continue;
7724 
7725 		  num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7726 					   TREE_TYPE (field), subclasses,
7727 					   bit_offset);
7728 		  if (!num)
7729 		    return 0;
7730 		  for (i = 0; i < num && i < words; i++)
7731 		    classes[i] = merge_classes (subclasses[i], classes[i]);
7732 		}
7733 	    }
7734 	  break;
7735 
7736 	default:
7737 	  gcc_unreachable ();
7738 	}
7739 
7740       if (words > 2)
7741 	  /* When the size is > 16 bytes, everything is passed in
7742 	     memory unless the first word is X86_64_SSE_CLASS and
7743 	     all remaining words are X86_64_SSEUP_CLASS.  */
7745 	     memory.  */
7746 	  if (classes[0] != X86_64_SSE_CLASS)
7747 	      return 0;
7748 
7749 	  for (i = 1; i < words; i++)
7750 	    if (classes[i] != X86_64_SSEUP_CLASS)
7751 	      return 0;
7752 	}
7753 
7754       /* Final merger cleanup.  */
7755       for (i = 0; i < words; i++)
7756 	{
7757 	  /* If one class is MEMORY, everything should be passed in
7758 	     memory.  */
7759 	  if (classes[i] == X86_64_MEMORY_CLASS)
7760 	    return 0;
7761 
7762 	  /* The X86_64_SSEUP_CLASS should always be preceded by
7763 	     X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
7764 	  if (classes[i] == X86_64_SSEUP_CLASS
7765 	      && classes[i - 1] != X86_64_SSE_CLASS
7766 	      && classes[i - 1] != X86_64_SSEUP_CLASS)
7767 	    {
7768 	      /* The first one should never be X86_64_SSEUP_CLASS.  */
7769 	      gcc_assert (i != 0);
7770 	      classes[i] = X86_64_SSE_CLASS;
7771 	    }
7772 
7773 	  /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7774 	     everything should be passed in memory.  */
7775 	  if (classes[i] == X86_64_X87UP_CLASS
7776 	      && (classes[i - 1] != X86_64_X87_CLASS))
7777 	    {
7778 	      static bool warned;
7779 
7780 	      /* The first one should never be X86_64_X87UP_CLASS.  */
7781 	      gcc_assert (i != 0);
7782 	      if (!warned && warn_psabi)
7783 		{
7784 		  warned = true;
7785 		  inform (input_location,
7786 			  "the ABI of passing union with long double"
7787 			  " has changed in GCC 4.4");
7788 		}
7789 	      return 0;
7790 	    }
7791 	}
7792       return words;
7793     }
7794 
7795   /* Compute the alignment needed.  We align all types to natural boundaries,
7796      with the exception of XFmode, which is aligned to 64 bits.  */
7797   if (mode != VOIDmode && mode != BLKmode)
7798     {
7799       int mode_alignment = GET_MODE_BITSIZE (mode);
7800 
7801       if (mode == XFmode)
7802 	mode_alignment = 128;
7803       else if (mode == XCmode)
7804 	mode_alignment = 256;
7805       if (COMPLEX_MODE_P (mode))
7806 	mode_alignment /= 2;
7807       /* Misaligned fields are always returned in memory.  */
7808       if (bit_offset % mode_alignment)
7809 	return 0;
7810     }
7811 
7812   /* For V1xx modes, just use the base mode.  */
7813   if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7814       && GET_MODE_UNIT_SIZE (mode) == bytes)
7815     mode = GET_MODE_INNER (mode);
7816 
7817   /* Classification of atomic types.  */
7818   switch (mode)
7819     {
7820     case E_SDmode:
7821     case E_DDmode:
7822       classes[0] = X86_64_SSE_CLASS;
7823       return 1;
7824     case E_TDmode:
7825       classes[0] = X86_64_SSE_CLASS;
7826       classes[1] = X86_64_SSEUP_CLASS;
7827       return 2;
7828     case E_DImode:
7829     case E_SImode:
7830     case E_HImode:
7831     case E_QImode:
7832     case E_CSImode:
7833     case E_CHImode:
7834     case E_CQImode:
7835       {
7836 	int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7837 
7838 	/* Analyze last 128 bits only.  */
7839 	size = (size - 1) & 0x7f;
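	/* SIZE is now the bit index of the value's last bit within a
	   128-bit window, so the comparisons below decide how many
	   eightbytes the value ends in and whether the low eightbyte
	   only needs 32 bits (X86_64_INTEGERSI_CLASS).  */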
7840 
7841 	if (size < 32)
7842 	  {
7843 	    classes[0] = X86_64_INTEGERSI_CLASS;
7844 	    return 1;
7845 	  }
7846 	else if (size < 64)
7847 	  {
7848 	    classes[0] = X86_64_INTEGER_CLASS;
7849 	    return 1;
7850 	  }
7851 	else if (size < 64+32)
7852 	  {
7853 	    classes[0] = X86_64_INTEGER_CLASS;
7854 	    classes[1] = X86_64_INTEGERSI_CLASS;
7855 	    return 2;
7856 	  }
7857 	else if (size < 64+64)
7858 	  {
7859 	    classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7860 	    return 2;
7861 	  }
7862 	else
7863 	  gcc_unreachable ();
7864       }
7865     case E_CDImode:
7866     case E_TImode:
7867       classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7868       return 2;
7869     case E_COImode:
7870     case E_OImode:
7871       /* OImode shouldn't be used directly.  */
7872       gcc_unreachable ();
7873     case E_CTImode:
7874       return 0;
7875     case E_SFmode:
7876       if (!(bit_offset % 64))
7877 	classes[0] = X86_64_SSESF_CLASS;
7878       else
7879 	classes[0] = X86_64_SSE_CLASS;
7880       return 1;
7881     case E_DFmode:
7882       classes[0] = X86_64_SSEDF_CLASS;
7883       return 1;
7884     case E_XFmode:
7885       classes[0] = X86_64_X87_CLASS;
7886       classes[1] = X86_64_X87UP_CLASS;
7887       return 2;
7888     case E_TFmode:
7889       classes[0] = X86_64_SSE_CLASS;
7890       classes[1] = X86_64_SSEUP_CLASS;
7891       return 2;
7892     case E_SCmode:
7893       classes[0] = X86_64_SSE_CLASS;
7894       if (!(bit_offset % 64))
7895 	return 1;
7896       else
7897 	{
7898 	  static bool warned;
7899 
7900 	  if (!warned && warn_psabi)
7901 	    {
7902 	      warned = true;
7903 	      inform (input_location,
7904 		      "the ABI of passing structure with complex float"
7905 		      " member has changed in GCC 4.4");
7906 	    }
7907 	  classes[1] = X86_64_SSESF_CLASS;
7908 	  return 2;
7909 	}
7910     case E_DCmode:
7911       classes[0] = X86_64_SSEDF_CLASS;
7912       classes[1] = X86_64_SSEDF_CLASS;
7913       return 2;
7914     case E_XCmode:
7915       classes[0] = X86_64_COMPLEX_X87_CLASS;
7916       return 1;
7917     case E_TCmode:
7918       /* This mode is larger than 16 bytes.  */
7919       return 0;
7920     case E_V8SFmode:
7921     case E_V8SImode:
7922     case E_V32QImode:
7923     case E_V16HImode:
7924     case E_V4DFmode:
7925     case E_V4DImode:
7926       classes[0] = X86_64_SSE_CLASS;
7927       classes[1] = X86_64_SSEUP_CLASS;
7928       classes[2] = X86_64_SSEUP_CLASS;
7929       classes[3] = X86_64_SSEUP_CLASS;
7930       return 4;
7931     case E_V8DFmode:
7932     case E_V16SFmode:
7933     case E_V8DImode:
7934     case E_V16SImode:
7935     case E_V32HImode:
7936     case E_V64QImode:
7937       classes[0] = X86_64_SSE_CLASS;
7938       classes[1] = X86_64_SSEUP_CLASS;
7939       classes[2] = X86_64_SSEUP_CLASS;
7940       classes[3] = X86_64_SSEUP_CLASS;
7941       classes[4] = X86_64_SSEUP_CLASS;
7942       classes[5] = X86_64_SSEUP_CLASS;
7943       classes[6] = X86_64_SSEUP_CLASS;
7944       classes[7] = X86_64_SSEUP_CLASS;
7945       return 8;
7946     case E_V4SFmode:
7947     case E_V4SImode:
7948     case E_V16QImode:
7949     case E_V8HImode:
7950     case E_V2DFmode:
7951     case E_V2DImode:
7952       classes[0] = X86_64_SSE_CLASS;
7953       classes[1] = X86_64_SSEUP_CLASS;
7954       return 2;
7955     case E_V1TImode:
7956     case E_V1DImode:
7957     case E_V2SFmode:
7958     case E_V2SImode:
7959     case E_V4HImode:
7960     case E_V8QImode:
7961       classes[0] = X86_64_SSE_CLASS;
7962       return 1;
7963     case E_BLKmode:
7964     case E_VOIDmode:
7965       return 0;
7966     default:
7967       gcc_assert (VECTOR_MODE_P (mode));
7968 
7969       if (bytes > 16)
7970 	return 0;
7971 
7972       gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7973 
7974       if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7975 	classes[0] = X86_64_INTEGERSI_CLASS;
7976       else
7977 	classes[0] = X86_64_INTEGER_CLASS;
7978       classes[1] = X86_64_INTEGER_CLASS;
7979       return 1 + (bytes > 8);
7980     }
7981 }
7982 
7983 /* Examine the argument and set the number of registers required in each
7984    class.  Return true iff the parameter should be passed in memory.  */
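/* For instance, struct { double d; long l; } classifies as an SSE
   eightbyte followed by an INTEGER eightbyte, so *sse_nregs and
   *int_nregs each become 1 and the struct is passed in registers.  */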
7985 
7986 static bool
7987 examine_argument (machine_mode mode, const_tree type, int in_return,
7988 		  int *int_nregs, int *sse_nregs)
7989 {
7990   enum x86_64_reg_class regclass[MAX_CLASSES];
7991   int n = classify_argument (mode, type, regclass, 0);
7992 
7993   *int_nregs = 0;
7994   *sse_nregs = 0;
7995 
7996   if (!n)
7997     return true;
7998   for (n--; n >= 0; n--)
7999     switch (regclass[n])
8000       {
8001       case X86_64_INTEGER_CLASS:
8002       case X86_64_INTEGERSI_CLASS:
8003 	(*int_nregs)++;
8004 	break;
8005       case X86_64_SSE_CLASS:
8006       case X86_64_SSESF_CLASS:
8007       case X86_64_SSEDF_CLASS:
8008 	(*sse_nregs)++;
8009 	break;
8010       case X86_64_NO_CLASS:
8011       case X86_64_SSEUP_CLASS:
8012 	break;
8013       case X86_64_X87_CLASS:
8014       case X86_64_X87UP_CLASS:
8015       case X86_64_COMPLEX_X87_CLASS:
8016 	if (!in_return)
8017 	  return true;
8018 	break;
8019       case X86_64_MEMORY_CLASS:
8020 	gcc_unreachable ();
8021       }
8022 
8023   return false;
8024 }
8025 
8026 /* Construct container for the argument used by GCC interface.  See
8027    FUNCTION_ARG for the detailed description.  */
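/* The returned rtx is either a single hard REG holding the whole value or
   a PARALLEL of EXPR_LIST entries, each pairing a register piece with its
   byte offset into the argument; NULL means the value goes in memory (or
   the type is empty).  */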
8028 
8029 static rtx
8030 construct_container (machine_mode mode, machine_mode orig_mode,
8031 		     const_tree type, int in_return, int nintregs, int nsseregs,
8032 		     const int *intreg, int sse_regno)
8033 {
8034   /* The following variables hold the static issued_error state.  */
8035   static bool issued_sse_arg_error;
8036   static bool issued_sse_ret_error;
8037   static bool issued_x87_ret_error;
8038 
8039   machine_mode tmpmode;
8040   int bytes =
8041     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8042   enum x86_64_reg_class regclass[MAX_CLASSES];
8043   int n;
8044   int i;
8045   int nexps = 0;
8046   int needed_sseregs, needed_intregs;
8047   rtx exp[MAX_CLASSES];
8048   rtx ret;
8049 
8050   n = classify_argument (mode, type, regclass, 0);
8051   if (!n)
8052     return NULL;
8053   if (examine_argument (mode, type, in_return, &needed_intregs,
8054 			&needed_sseregs))
8055     return NULL;
8056   if (needed_intregs > nintregs || needed_sseregs > nsseregs)
8057     return NULL;
8058 
8059   /* We allowed the user to turn off SSE for kernel mode.  Don't crash if
8060      some less clueful developer tries to use floating-point anyway.  */
8061   if (needed_sseregs && !TARGET_SSE)
8062     {
8063       if (in_return)
8064 	{
8065 	  if (!issued_sse_ret_error)
8066 	    {
8067 	      error ("SSE register return with SSE disabled");
8068 	      issued_sse_ret_error = true;
8069 	    }
8070 	}
8071       else if (!issued_sse_arg_error)
8072 	{
8073 	  error ("SSE register argument with SSE disabled");
8074 	  issued_sse_arg_error = true;
8075 	}
8076       return NULL;
8077     }
8078 
8079   /* Likewise, error if the ABI requires us to return values in the
8080      x87 registers and the user specified -mno-80387.  */
8081   if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8082     for (i = 0; i < n; i++)
8083       if (regclass[i] == X86_64_X87_CLASS
8084 	  || regclass[i] == X86_64_X87UP_CLASS
8085 	  || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8086 	{
8087 	  if (!issued_x87_ret_error)
8088 	    {
8089 	      error ("x87 register return with x87 disabled");
8090 	      issued_x87_ret_error = true;
8091 	    }
8092 	  return NULL;
8093 	}
8094 
8095   /* First construct simple cases.  Avoid SCmode, since we want to use
8096      a single register to pass this type.  */
8097   if (n == 1 && mode != SCmode)
8098     switch (regclass[0])
8099       {
8100       case X86_64_INTEGER_CLASS:
8101       case X86_64_INTEGERSI_CLASS:
8102 	return gen_rtx_REG (mode, intreg[0]);
8103       case X86_64_SSE_CLASS:
8104       case X86_64_SSESF_CLASS:
8105       case X86_64_SSEDF_CLASS:
8106 	if (mode != BLKmode)
8107 	  return gen_reg_or_parallel (mode, orig_mode,
8108 				      GET_SSE_REGNO (sse_regno));
8109 	break;
8110       case X86_64_X87_CLASS:
8111       case X86_64_COMPLEX_X87_CLASS:
8112 	return gen_rtx_REG (mode, FIRST_STACK_REG);
8113       case X86_64_NO_CLASS:
8114 	/* Zero sized array, struct or class.  */
8115 	return NULL;
8116       default:
8117 	gcc_unreachable ();
8118       }
8119   if (n == 2
8120       && regclass[0] == X86_64_SSE_CLASS
8121       && regclass[1] == X86_64_SSEUP_CLASS
8122       && mode != BLKmode)
8123     return gen_reg_or_parallel (mode, orig_mode,
8124 				GET_SSE_REGNO (sse_regno));
8125   if (n == 4
8126       && regclass[0] == X86_64_SSE_CLASS
8127       && regclass[1] == X86_64_SSEUP_CLASS
8128       && regclass[2] == X86_64_SSEUP_CLASS
8129       && regclass[3] == X86_64_SSEUP_CLASS
8130       && mode != BLKmode)
8131     return gen_reg_or_parallel (mode, orig_mode,
8132 				GET_SSE_REGNO (sse_regno));
8133   if (n == 8
8134       && regclass[0] == X86_64_SSE_CLASS
8135       && regclass[1] == X86_64_SSEUP_CLASS
8136       && regclass[2] == X86_64_SSEUP_CLASS
8137       && regclass[3] == X86_64_SSEUP_CLASS
8138       && regclass[4] == X86_64_SSEUP_CLASS
8139       && regclass[5] == X86_64_SSEUP_CLASS
8140       && regclass[6] == X86_64_SSEUP_CLASS
8141       && regclass[7] == X86_64_SSEUP_CLASS
8142       && mode != BLKmode)
8143     return gen_reg_or_parallel (mode, orig_mode,
8144 				GET_SSE_REGNO (sse_regno));
8145   if (n == 2
8146       && regclass[0] == X86_64_X87_CLASS
8147       && regclass[1] == X86_64_X87UP_CLASS)
8148     return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8149 
8150   if (n == 2
8151       && regclass[0] == X86_64_INTEGER_CLASS
8152       && regclass[1] == X86_64_INTEGER_CLASS
8153       && (mode == CDImode || mode == TImode || mode == BLKmode)
8154       && intreg[0] + 1 == intreg[1])
8155     {
8156       if (mode == BLKmode)
8157 	{
8158 	  /* Use TImode for BLKmode values in 2 integer registers.  */
8159 	  exp[0] = gen_rtx_EXPR_LIST (VOIDmode,
8160 				      gen_rtx_REG (TImode, intreg[0]),
8161 				      GEN_INT (0));
8162 	  ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1));
8163 	  XVECEXP (ret, 0, 0) = exp[0];
8164 	  return ret;
8165 	}
8166       else
8167 	return gen_rtx_REG (mode, intreg[0]);
8168     }
8169 
8170   /* Otherwise figure out the entries of the PARALLEL.  */
8171   for (i = 0; i < n; i++)
8172     {
8173       int pos;
8174 
8175       switch (regclass[i])
8176         {
8177 	  case X86_64_NO_CLASS:
8178 	    break;
8179 	  case X86_64_INTEGER_CLASS:
8180 	  case X86_64_INTEGERSI_CLASS:
8181 	    /* Merge TImodes on aligned occasions here too.  */
8182 	    if (i * 8 + 8 > bytes)
8183 	      {
8184 		unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8185 		if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8186 		  /* We've requested 24 bytes we
8187 		  /* We've requested a size (e.g. 24 bits) for which no
8188 		     integer mode exists.  Use DImode.  */
8189 	      }
8190 	    else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8191 	      tmpmode = SImode;
8192 	    else
8193 	      tmpmode = DImode;
8194 	    exp [nexps++]
8195 	      = gen_rtx_EXPR_LIST (VOIDmode,
8196 				   gen_rtx_REG (tmpmode, *intreg),
8197 				   GEN_INT (i*8));
8198 	    intreg++;
8199 	    break;
8200 	  case X86_64_SSESF_CLASS:
8201 	    exp [nexps++]
8202 	      = gen_rtx_EXPR_LIST (VOIDmode,
8203 				   gen_rtx_REG (SFmode,
8204 						GET_SSE_REGNO (sse_regno)),
8205 				   GEN_INT (i*8));
8206 	    sse_regno++;
8207 	    break;
8208 	  case X86_64_SSEDF_CLASS:
8209 	    exp [nexps++]
8210 	      = gen_rtx_EXPR_LIST (VOIDmode,
8211 				   gen_rtx_REG (DFmode,
8212 						GET_SSE_REGNO (sse_regno)),
8213 				   GEN_INT (i*8));
8214 	    sse_regno++;
8215 	    break;
8216 	  case X86_64_SSE_CLASS:
8217 	    pos = i;
8218 	    switch (n)
8219 	      {
8220 	      case 1:
8221 		tmpmode = DImode;
8222 		break;
8223 	      case 2:
8224 		if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8225 		  {
8226 		    tmpmode = TImode;
8227 		    i++;
8228 		  }
8229 		else
8230 		  tmpmode = DImode;
8231 		break;
8232 	      case 4:
8233 		gcc_assert (i == 0
8234 			    && regclass[1] == X86_64_SSEUP_CLASS
8235 			    && regclass[2] == X86_64_SSEUP_CLASS
8236 			    && regclass[3] == X86_64_SSEUP_CLASS);
8237 		tmpmode = OImode;
8238 		i += 3;
8239 		break;
8240 	      case 8:
8241 		gcc_assert (i == 0
8242 			    && regclass[1] == X86_64_SSEUP_CLASS
8243 			    && regclass[2] == X86_64_SSEUP_CLASS
8244 			    && regclass[3] == X86_64_SSEUP_CLASS
8245 			    && regclass[4] == X86_64_SSEUP_CLASS
8246 			    && regclass[5] == X86_64_SSEUP_CLASS
8247 			    && regclass[6] == X86_64_SSEUP_CLASS
8248 			    && regclass[7] == X86_64_SSEUP_CLASS);
8249 		tmpmode = XImode;
8250 		i += 7;
8251 		break;
8252 	      default:
8253 		gcc_unreachable ();
8254 	      }
8255 	    exp [nexps++]
8256 	      = gen_rtx_EXPR_LIST (VOIDmode,
8257 				   gen_rtx_REG (tmpmode,
8258 						GET_SSE_REGNO (sse_regno)),
8259 				   GEN_INT (pos*8));
8260 	    sse_regno++;
8261 	    break;
8262 	  default:
8263 	    gcc_unreachable ();
8264 	}
8265     }
8266 
8267   /* Empty aligned struct, union or class.  */
8268   if (nexps == 0)
8269     return NULL;
8270 
8271   ret =  gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8272   for (i = 0; i < nexps; i++)
8273     XVECEXP (ret, 0, i) = exp [i];
8274   return ret;
8275 }
8276 
8277 /* Update the data in CUM to advance over an argument of mode MODE
8278    and data type TYPE.  (TYPE is null for libcalls where that information
8279    may not be available.)
8280 
8281    Return the number of integer registers advanced over.  */
8282 
8283 static int
8284 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8285 			 const_tree type, HOST_WIDE_INT bytes,
8286 			 HOST_WIDE_INT words)
8287 {
8288   int res = 0;
8289   bool error_p = false;
8290 
8291   if (TARGET_IAMCU)
8292     {
8293       /* Intel MCU psABI passes scalars and aggregates no larger than 8
8294 	 bytes in registers.  */
8295       if (!VECTOR_MODE_P (mode) && bytes <= 8)
8296 	goto pass_in_reg;
8297       return res;
8298     }
8299 
8300   switch (mode)
8301     {
8302     default:
8303       break;
8304 
8305     case E_BLKmode:
8306       if (bytes < 0)
8307 	break;
8308       /* FALLTHRU */
8309 
8310     case E_DImode:
8311     case E_SImode:
8312     case E_HImode:
8313     case E_QImode:
8314 pass_in_reg:
8315       cum->words += words;
8316       cum->nregs -= words;
8317       cum->regno += words;
8318       if (cum->nregs >= 0)
8319 	res = words;
8320       if (cum->nregs <= 0)
8321 	{
8322 	  cum->nregs = 0;
8323 	  cfun->machine->arg_reg_available = false;
8324 	  cum->regno = 0;
8325 	}
8326       break;
8327 
8328     case E_OImode:
8329       /* OImode shouldn't be used directly.  */
8330       gcc_unreachable ();
8331 
8332     case E_DFmode:
8333       if (cum->float_in_sse == -1)
8334 	error_p = true;
8335       if (cum->float_in_sse < 2)
8336 	break;
8337       /* FALLTHRU */
8338     case E_SFmode:
8339       if (cum->float_in_sse == -1)
8340 	error_p = true;
8341       if (cum->float_in_sse < 1)
8342 	break;
8343       /* FALLTHRU */
8344 
8345     case E_V8SFmode:
8346     case E_V8SImode:
8347     case E_V64QImode:
8348     case E_V32HImode:
8349     case E_V16SImode:
8350     case E_V8DImode:
8351     case E_V16SFmode:
8352     case E_V8DFmode:
8353     case E_V32QImode:
8354     case E_V16HImode:
8355     case E_V4DFmode:
8356     case E_V4DImode:
8357     case E_TImode:
8358     case E_V16QImode:
8359     case E_V8HImode:
8360     case E_V4SImode:
8361     case E_V2DImode:
8362     case E_V4SFmode:
8363     case E_V2DFmode:
8364       if (!type || !AGGREGATE_TYPE_P (type))
8365 	{
8366 	  cum->sse_words += words;
8367 	  cum->sse_nregs -= 1;
8368 	  cum->sse_regno += 1;
8369 	  if (cum->sse_nregs <= 0)
8370 	    {
8371 	      cum->sse_nregs = 0;
8372 	      cum->sse_regno = 0;
8373 	    }
8374 	}
8375       break;
8376 
8377     case E_V8QImode:
8378     case E_V4HImode:
8379     case E_V2SImode:
8380     case E_V2SFmode:
8381     case E_V1TImode:
8382     case E_V1DImode:
8383       if (!type || !AGGREGATE_TYPE_P (type))
8384 	{
8385 	  cum->mmx_words += words;
8386 	  cum->mmx_nregs -= 1;
8387 	  cum->mmx_regno += 1;
8388 	  if (cum->mmx_nregs <= 0)
8389 	    {
8390 	      cum->mmx_nregs = 0;
8391 	      cum->mmx_regno = 0;
8392 	    }
8393 	}
8394       break;
8395     }
8396   if (error_p)
8397     {
8398       cum->float_in_sse = 0;
8399       error ("calling %qD with SSE calling convention without "
8400 	     "SSE/SSE2 enabled", cum->decl);
8401       sorry ("this is a GCC bug that can be worked around by adding "
8402 	     "attribute used to function called");
8403     }
8404 
8405   return res;
8406 }
8407 
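/* 64-bit SysV variant: if the argument still fits in the remaining
   integer and SSE registers, consume them; otherwise it is passed on the
   stack and only CUM->words advances.  */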
8408 static int
8409 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8410 			 const_tree type, HOST_WIDE_INT words, bool named)
8411 {
8412   int int_nregs, sse_nregs;
8413 
8414   /* Unnamed 512 and 256bit vector mode parameters are passed on stack.  */
8415   if (!named && (VALID_AVX512F_REG_MODE (mode)
8416 		 || VALID_AVX256_REG_MODE (mode)))
8417     return 0;
8418 
8419   if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8420       && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8421     {
8422       cum->nregs -= int_nregs;
8423       cum->sse_nregs -= sse_nregs;
8424       cum->regno += int_nregs;
8425       cum->sse_regno += sse_nregs;
8426       return int_nregs;
8427     }
8428   else
8429     {
8430       int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8431       cum->words = ROUND_UP (cum->words, align);
8432       cum->words += words;
8433       return 0;
8434     }
8435 }
8436 
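/* MS ABI variant: every argument occupies one of the four argument
   register slots regardless of type; values larger than 8 bytes are
   passed by reference, so only sizes 1, 2, 4 and 8 reach this point.  */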
8437 static int
8438 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8439 			    HOST_WIDE_INT words)
8440 {
8441   /* Otherwise, this should be passed indirect.  */
8442   gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8443 
8444   cum->words += words;
8445   if (cum->nregs > 0)
8446     {
8447       cum->nregs -= 1;
8448       cum->regno += 1;
8449       return 1;
8450     }
8451   return 0;
8452 }
8453 
8454 /* Update the data in CUM to advance over an argument of mode MODE and
8455    data type TYPE.  (TYPE is null for libcalls where that information
8456    may not be available.)  */
8457 
8458 static void
8459 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8460 			   const_tree type, bool named)
8461 {
8462   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8463   HOST_WIDE_INT bytes, words;
8464   int nregs;
8465 
8466   /* The argument of interrupt handler is a special case and is
8467      handled in ix86_function_arg.  */
8468   if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8469     return;
8470 
8471   if (mode == BLKmode)
8472     bytes = int_size_in_bytes (type);
8473   else
8474     bytes = GET_MODE_SIZE (mode);
8475   words = CEIL (bytes, UNITS_PER_WORD);
8476 
8477   if (type)
8478     mode = type_natural_mode (type, NULL, false);
8479 
8480   if ((type && POINTER_BOUNDS_TYPE_P (type))
8481       || POINTER_BOUNDS_MODE_P (mode))
8482     {
8483       /* If we pass bounds in BT then just update the remaining bounds count.  */
8484       if (cum->bnds_in_bt)
8485 	{
8486 	  cum->bnds_in_bt--;
8487 	  return;
8488 	}
8489 
8490       /* Update the remaining number of bounds to force.  */
8491       if (cum->force_bnd_pass)
8492 	cum->force_bnd_pass--;
8493 
8494       cum->bnd_regno++;
8495 
8496       return;
8497     }
8498 
8499   /* The first arg not going to Bounds Tables resets this counter.  */
8500   cum->bnds_in_bt = 0;
8501   /* For unnamed args we always pass bounds to avoid a bounds mismatch when
8502      the passed and received types do not match.  If bounds do not follow an
8503      unnamed arg, still pretend the required number of bounds were passed.  */
8504   if (cum->force_bnd_pass)
8505     {
8506       cum->bnd_regno += cum->force_bnd_pass;
8507       cum->force_bnd_pass = 0;
8508     }
8509 
8510   if (TARGET_64BIT)
8511     {
8512       enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8513 
8514       if (call_abi == MS_ABI)
8515 	nregs = function_arg_advance_ms_64 (cum, bytes, words);
8516       else
8517 	nregs = function_arg_advance_64 (cum, mode, type, words, named);
8518     }
8519   else
8520     nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8521 
8522   /* For stdarg we expect bounds to be passed for each value passed
8523      in register.  */
8524   if (cum->stdarg)
8525     cum->force_bnd_pass = nregs;
8526   /* For pointers passed in memory we expect bounds passed in Bounds
8527      Table.  */
8528   if (!nregs)
8529     {
8530       /* Track if there are outgoing arguments on stack.  */
8531       if (cum->caller)
8532 	cfun->machine->outgoing_args_on_stack = true;
8533 
8534       if (flag_check_pointer_bounds)
8535 	cum->bnds_in_bt = chkp_type_bounds_count (type);
8536     }
8537 }
8538 
8539 /* Define where to put the arguments to a function.
8540    Value is zero to push the argument on the stack,
8541    or a hard register in which to store the argument.
8542 
8543    MODE is the argument's machine mode.
8544    TYPE is the data type of the argument (as a tree).
8545     This is null for libcalls where that information may
8546     not be available.
8547    CUM is a variable of type CUMULATIVE_ARGS which gives info about
8548     the preceding args and about the function being called.
8549    NAMED is nonzero if this argument is a named parameter
8550     (otherwise it is an extra parameter matching an ellipsis).  */
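/* On ia32 the integer argument registers are EAX, EDX and ECX (regno 0, 1
   and 2), which is the order regparm hands them out below; fastcall
   instead starts at ECX, which is why function_arg_32 remaps the first
   register from EAX to ECX.  */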
8551 
8552 static rtx
8553 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8554 		 machine_mode orig_mode, const_tree type,
8555 		 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8556 {
8557   bool error_p = false;
8558 
8559   /* Avoid the AL settings for the Unix64 ABI.  */
8560   if (mode == VOIDmode)
8561     return constm1_rtx;
8562 
8563   if (TARGET_IAMCU)
8564     {
8565       /* Intel MCU psABI passes scalars and aggregates no larger than 8
8566 	 bytes in registers.  */
8567       if (!VECTOR_MODE_P (mode) && bytes <= 8)
8568 	goto pass_in_reg;
8569       return NULL_RTX;
8570     }
8571 
8572   switch (mode)
8573     {
8574     default:
8575       break;
8576 
8577     case E_BLKmode:
8578       if (bytes < 0)
8579 	break;
8580       /* FALLTHRU */
8581     case E_DImode:
8582     case E_SImode:
8583     case E_HImode:
8584     case E_QImode:
8585 pass_in_reg:
8586       if (words <= cum->nregs)
8587 	{
8588 	  int regno = cum->regno;
8589 
8590 	  /* Fastcall allocates the first two DWORD (SImode) or
8591 	     smaller arguments to ECX and EDX if the argument isn't
8592 	     an aggregate type.  */
8593 	  if (cum->fastcall)
8594 	    {
8595 	      if (mode == BLKmode
8596 		  || mode == DImode
8597 		  || (type && AGGREGATE_TYPE_P (type)))
8598 	        break;
8599 
8600 	      /* ECX not EAX is the first allocated register.  */
8601 	      if (regno == AX_REG)
8602 		regno = CX_REG;
8603 	    }
8604 	  return gen_rtx_REG (mode, regno);
8605 	}
8606       break;
8607 
8608     case E_DFmode:
8609       if (cum->float_in_sse == -1)
8610 	error_p = true;
8611       if (cum->float_in_sse < 2)
8612 	break;
8613       /* FALLTHRU */
8614     case E_SFmode:
8615       if (cum->float_in_sse == -1)
8616 	error_p = true;
8617       if (cum->float_in_sse < 1)
8618 	break;
8619       /* FALLTHRU */
8620     case E_TImode:
8621       /* In 32bit, we pass TImode in xmm registers.  */
8622     case E_V16QImode:
8623     case E_V8HImode:
8624     case E_V4SImode:
8625     case E_V2DImode:
8626     case E_V4SFmode:
8627     case E_V2DFmode:
8628       if (!type || !AGGREGATE_TYPE_P (type))
8629 	{
8630 	  if (cum->sse_nregs)
8631 	    return gen_reg_or_parallel (mode, orig_mode,
8632 				        cum->sse_regno + FIRST_SSE_REG);
8633 	}
8634       break;
8635 
8636     case E_OImode:
8637     case E_XImode:
8638       /* OImode and XImode shouldn't be used directly.  */
8639       gcc_unreachable ();
8640 
8641     case E_V64QImode:
8642     case E_V32HImode:
8643     case E_V16SImode:
8644     case E_V8DImode:
8645     case E_V16SFmode:
8646     case E_V8DFmode:
8647     case E_V8SFmode:
8648     case E_V8SImode:
8649     case E_V32QImode:
8650     case E_V16HImode:
8651     case E_V4DFmode:
8652     case E_V4DImode:
8653       if (!type || !AGGREGATE_TYPE_P (type))
8654 	{
8655 	  if (cum->sse_nregs)
8656 	    return gen_reg_or_parallel (mode, orig_mode,
8657 				        cum->sse_regno + FIRST_SSE_REG);
8658 	}
8659       break;
8660 
8661     case E_V8QImode:
8662     case E_V4HImode:
8663     case E_V2SImode:
8664     case E_V2SFmode:
8665     case E_V1TImode:
8666     case E_V1DImode:
8667       if (!type || !AGGREGATE_TYPE_P (type))
8668 	{
8669 	  if (cum->mmx_nregs)
8670 	    return gen_reg_or_parallel (mode, orig_mode,
8671 				        cum->mmx_regno + FIRST_MMX_REG);
8672 	}
8673       break;
8674     }
8675   if (error_p)
8676     {
8677       cum->float_in_sse = 0;
8678       error ("calling %qD with SSE calling convention without "
8679 	     "SSE/SSE2 enabled", cum->decl);
8680       sorry ("this is a GCC bug that can be worked around by adding "
8681 	     "attribute used to function called");
8682     }
8683 
8684   return NULL_RTX;
8685 }
8686 
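/* 64-bit SysV variant: a VOIDmode "argument" requests the hidden value of
   AL, the number of SSE registers used by a varargs call; real arguments
   are laid out by construct_container.  */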
8687 static rtx
8688 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8689 		 machine_mode orig_mode, const_tree type, bool named)
8690 {
8691   /* Handle a hidden AL argument containing number of registers
8692      for varargs x86-64 functions.  */
8693   if (mode == VOIDmode)
8694     return GEN_INT (cum->maybe_vaarg
8695 		    ? (cum->sse_nregs < 0
8696 		       ? X86_64_SSE_REGPARM_MAX
8697 		       : cum->sse_regno)
8698 		    : -1);
8699 
8700   switch (mode)
8701     {
8702     default:
8703       break;
8704 
8705     case E_V8SFmode:
8706     case E_V8SImode:
8707     case E_V32QImode:
8708     case E_V16HImode:
8709     case E_V4DFmode:
8710     case E_V4DImode:
8711     case E_V16SFmode:
8712     case E_V16SImode:
8713     case E_V64QImode:
8714     case E_V32HImode:
8715     case E_V8DFmode:
8716     case E_V8DImode:
8717       /* Unnamed 256 and 512bit vector mode parameters are passed on stack.  */
8718       if (!named)
8719 	return NULL;
8720       break;
8721     }
8722 
8723   return construct_container (mode, orig_mode, type, 0, cum->nregs,
8724 			      cum->sse_nregs,
8725 			      &x86_64_int_parameter_registers [cum->regno],
8726 			      cum->sse_regno);
8727 }
8728 
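/* MS ABI variant: named SFmode/DFmode arguments go in the SSE register
   matching their slot, while unnamed floating arguments are passed in
   both the SSE and the integer register of that slot so a varargs callee
   can find them either way.  */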
8729 static rtx
8730 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8731 		    machine_mode orig_mode, bool named,
8732 		    HOST_WIDE_INT bytes)
8733 {
8734   unsigned int regno;
8735 
8736   /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8737      We use the value -2 to specify that the current function call is MSABI.  */
8738   if (mode == VOIDmode)
8739     return GEN_INT (-2);
8740 
8741   /* If we've run out of registers, it goes on the stack.  */
8742   if (cum->nregs == 0)
8743     return NULL_RTX;
8744 
8745   regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8746 
8747   /* Only floating point modes are passed in anything but integer regs.  */
8748   if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8749     {
8750       if (named)
8751 	regno = cum->regno + FIRST_SSE_REG;
8752       else
8753 	{
8754 	  rtx t1, t2;
8755 
8756 	  /* Unnamed floating parameters are passed in both the
8757 	     SSE and integer registers.  */
8758 	  t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8759 	  t2 = gen_rtx_REG (mode, regno);
8760 	  t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8761 	  t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8762 	  return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8763 	}
8764     }
8765   /* Handle aggregate types passed in a register.  */
8766   if (orig_mode == BLKmode)
8767     {
8768       if (bytes > 0 && bytes <= 8)
8769         mode = (bytes > 4 ? DImode : SImode);
8770       if (mode == BLKmode)
8771         mode = DImode;
8772     }
8773 
8774   return gen_reg_or_parallel (mode, orig_mode, regno);
8775 }
8776 
8777 /* Return where to put the arguments to a function.
8778    Return zero to push the argument on the stack, or a hard register in which to store the argument.
8779 
8780    MODE is the argument's machine mode.  TYPE is the data type of the
8781    argument.  It is null for libcalls where that information may not be
8782    available.  CUM gives information about the preceding args and about
8783    the function being called.  NAMED is nonzero if this argument is a
8784    named parameter (otherwise it is an extra parameter matching an
8785    ellipsis).  */
8786 
8787 static rtx
8788 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8789 		   const_tree type, bool named)
8790 {
8791   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8792   machine_mode mode = omode;
8793   HOST_WIDE_INT bytes, words;
8794   rtx arg;
8795 
8796   if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8797     {
8798       gcc_assert (type != NULL_TREE);
8799       if (POINTER_TYPE_P (type))
8800 	{
8801 	  /* This is the pointer argument.  */
8802 	  gcc_assert (TYPE_MODE (type) == Pmode);
8803 	  /* It is at -WORD(AP) in the current frame in interrupt and
8804 	     exception handlers.  */
8805 	  arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8806 	}
8807       else
8808 	{
8809 	  gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8810 		      && TREE_CODE (type) == INTEGER_TYPE
8811 		      && TYPE_MODE (type) == word_mode);
8812 	  /* The error code is the word-mode integer argument at
8813 	     -2 * WORD(AP) in the current frame of the exception
8814 	     handler.  */
8815 	  arg = gen_rtx_MEM (word_mode,
8816 			     plus_constant (Pmode,
8817 					    arg_pointer_rtx,
8818 					    -2 * UNITS_PER_WORD));
8819 	}
8820       return arg;
8821     }
8822 
8823   /* All pointer bounds arguments are handled separately here.  */
8824   if ((type && POINTER_BOUNDS_TYPE_P (type))
8825       || POINTER_BOUNDS_MODE_P (mode))
8826     {
8827       /* Return NULL if bounds are forced to go in Bounds Table.  */
8828       if (cum->bnds_in_bt)
8829 	arg = NULL;
8830       /* Return the next available bound reg if any.  */
8831       else if (cum->bnd_regno <= LAST_BND_REG)
8832 	arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8833       /* Return the next special slot number otherwise.  */
8834       else
8835 	arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8836 
8837       return arg;
8838     }
8839 
8840   if (mode == BLKmode)
8841     bytes = int_size_in_bytes (type);
8842   else
8843     bytes = GET_MODE_SIZE (mode);
8844   words = CEIL (bytes, UNITS_PER_WORD);
8845 
8846   /* To simplify the code below, represent vector types with a vector mode
8847      even if MMX/SSE are not active.  */
8848   if (type && TREE_CODE (type) == VECTOR_TYPE)
8849     mode = type_natural_mode (type, cum, false);
8850 
8851   if (TARGET_64BIT)
8852     {
8853       enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8854 
8855       if (call_abi == MS_ABI)
8856 	arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8857       else
8858 	arg = function_arg_64 (cum, mode, omode, type, named);
8859     }
8860   else
8861     arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8862 
8863   /* Track if there are outgoing arguments on stack.  */
8864   if (arg == NULL_RTX && cum->caller)
8865     cfun->machine->outgoing_args_on_stack = true;
8866 
8867   return arg;
8868 }
8869 
8870 /* A C expression that indicates when an argument must be passed by
8871    reference.  If nonzero for an argument, a copy of that argument is
8872    made in memory and a pointer to the argument is passed instead of
8873    the argument itself.  The pointer is passed in whatever way is
8874    appropriate for passing a pointer to that type.  */
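/* Under the Microsoft x64 ABI this means any argument whose size is not
   exactly 1, 2, 4 or 8 bytes (arrays, odd-sized structs, __m128) is passed
   by reference; for the 64-bit SysV ABI only variable-sized types are, and
   on ia32 nothing is passed by reference here.  */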
8875 
8876 static bool
8877 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8878 			const_tree type, bool)
8879 {
8880   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8881 
8882   /* Bounds are never passed by reference.  */
8883   if ((type && POINTER_BOUNDS_TYPE_P (type))
8884       || POINTER_BOUNDS_MODE_P (mode))
8885     return false;
8886 
8887   if (TARGET_64BIT)
8888     {
8889       enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8890 
8891       /* See Windows x64 Software Convention.  */
8892       if (call_abi == MS_ABI)
8893 	{
8894 	  HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8895 
8896 	  if (type)
8897 	    {
8898 	      /* Arrays are passed by reference.  */
8899 	      if (TREE_CODE (type) == ARRAY_TYPE)
8900 		return true;
8901 
8902 	      if (RECORD_OR_UNION_TYPE_P (type))
8903 		{
8904 		  /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8905 		     are passed by reference.  */
8906 		  msize = int_size_in_bytes (type);
8907 		}
8908 	    }
8909 
8910 	  /* __m128 is passed by reference.  */
8911 	  return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8912 	}
8913       else if (type && int_size_in_bytes (type) == -1)
8914 	return true;
8915     }
8916 
8917   return false;
8918 }
8919 
8920 /* Return true when TYPE should be 128bit aligned for 32bit argument
8921    passing ABI.  XXX: This function is obsolete and is only used for
8922    checking psABI compatibility with previous versions of GCC.  */
8923 
8924 static bool
8925 ix86_compat_aligned_value_p (const_tree type)
8926 {
8927   machine_mode mode = TYPE_MODE (type);
8928   if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8929        || mode == TDmode
8930        || mode == TFmode
8931        || mode == TCmode)
8932       && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8933     return true;
8934   if (TYPE_ALIGN (type) < 128)
8935     return false;
8936 
8937   if (AGGREGATE_TYPE_P (type))
8938     {
8939       /* Walk the aggregates recursively.  */
8940       switch (TREE_CODE (type))
8941 	{
8942 	case RECORD_TYPE:
8943 	case UNION_TYPE:
8944 	case QUAL_UNION_TYPE:
8945 	  {
8946 	    tree field;
8947 
8948 	    /* Walk all the structure fields.  */
8949 	    for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8950 	      {
8951 		if (TREE_CODE (field) == FIELD_DECL
8952 		    && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8953 		  return true;
8954 	      }
8955 	    break;
8956 	  }
8957 
8958 	case ARRAY_TYPE:
8959 	  /* Just for use if some languages pass arrays by value.  */
8960 	  if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8961 	    return true;
8962 	  break;
8963 
8964 	default:
8965 	  gcc_unreachable ();
8966 	}
8967     }
8968   return false;
8969 }
8970 
8971 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8972    XXX: This function is obsolete and is only used for checking psABI
8973    compatibility with previous versions of GCC.  */
8974 
8975 static unsigned int
8976 ix86_compat_function_arg_boundary (machine_mode mode,
8977 				   const_tree type, unsigned int align)
8978 {
8979   /* In 32bit, only _Decimal128 and __float128 are aligned to their
8980      natural boundaries.  */
8981   if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8982     {
8983       /* i386 ABI defines all arguments to be 4 byte aligned.  We have to
8984 	 make an exception for SSE modes since these require 128bit
8985 	 alignment.
8986 
8987 	 The handling here differs from field_alignment.  ICC aligns MMX
8988 	 arguments to 4 byte boundaries, while structure fields are aligned
8989 	 to 8 byte boundaries.  */
8990       if (!type)
8991 	{
8992 	  if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8993 	    align = PARM_BOUNDARY;
8994 	}
8995       else
8996 	{
8997 	  if (!ix86_compat_aligned_value_p (type))
8998 	    align = PARM_BOUNDARY;
8999 	}
9000     }
9001   if (align > BIGGEST_ALIGNMENT)
9002     align = BIGGEST_ALIGNMENT;
9003   return align;
9004 }
9005 
9006 /* Return true when TYPE should be 128bit aligned for 32bit argument
9007    passing ABI.  */
9008 
9009 static bool
9010 ix86_contains_aligned_value_p (const_tree type)
9011 {
9012   machine_mode mode = TYPE_MODE (type);
9013 
9014   if (mode == XFmode || mode == XCmode)
9015     return false;
9016 
9017   if (TYPE_ALIGN (type) < 128)
9018     return false;
9019 
9020   if (AGGREGATE_TYPE_P (type))
9021     {
9022       /* Walk the aggregates recursively.  */
9023       switch (TREE_CODE (type))
9024 	{
9025 	case RECORD_TYPE:
9026 	case UNION_TYPE:
9027 	case QUAL_UNION_TYPE:
9028 	  {
9029 	    tree field;
9030 
9031 	    /* Walk all the structure fields.  */
9032 	    for (field = TYPE_FIELDS (type);
9033 		 field;
9034 		 field = DECL_CHAIN (field))
9035 	      {
9036 		if (TREE_CODE (field) == FIELD_DECL
9037 		    && ix86_contains_aligned_value_p (TREE_TYPE (field)))
9038 		  return true;
9039 	      }
9040 	    break;
9041 	  }
9042 
9043 	case ARRAY_TYPE:
9044 	  /* Just for use if some languages pass arrays by value.  */
9045 	  if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
9046 	    return true;
9047 	  break;
9048 
9049 	default:
9050 	  gcc_unreachable ();
9051 	}
9052     }
9053   else
9054     return TYPE_ALIGN (type) >= 128;
9055 
9056   return false;
9057 }
9058 
9059 /* Gives the alignment boundary, in bits, of an argument with the
9060    specified mode and type.  */
9061 
9062 static unsigned int
9063 ix86_function_arg_boundary (machine_mode mode, const_tree type)
9064 {
9065   unsigned int align;
9066   if (type)
9067     {
9068       /* Since the main variant type is used for the call, convert TYPE
9069 	 to its main variant.  */
9070       type = TYPE_MAIN_VARIANT (type);
9071       align = TYPE_ALIGN (type);
9072       if (TYPE_EMPTY_P (type))
9073 	return PARM_BOUNDARY;
9074     }
9075   else
9076     align = GET_MODE_ALIGNMENT (mode);
9077   if (align < PARM_BOUNDARY)
9078     align = PARM_BOUNDARY;
9079   else
9080     {
9081       static bool warned;
9082       unsigned int saved_align = align;
9083 
9084       if (!TARGET_64BIT)
9085 	{
9086 	  /* i386 ABI defines XFmode arguments to be 4 byte aligned.  */
9087 	  if (!type)
9088 	    {
9089 	      if (mode == XFmode || mode == XCmode)
9090 		align = PARM_BOUNDARY;
9091 	    }
9092 	  else if (!ix86_contains_aligned_value_p (type))
9093 	    align = PARM_BOUNDARY;
9094 
9095 	  if (align < 128)
9096 	    align = PARM_BOUNDARY;
9097 	}
9098 
9099       if (warn_psabi
9100 	  && !warned
9101 	  && align != ix86_compat_function_arg_boundary (mode, type,
9102 							 saved_align))
9103 	{
9104 	  warned = true;
9105 	  inform (input_location,
9106 		  "The ABI for passing parameters with %d-byte"
9107 		  " alignment has changed in GCC 4.6",
9108 		  align / BITS_PER_UNIT);
9109 	}
9110     }
9111 
9112   return align;
9113 }
9114 
9115 /* Return true if N is a possible register number of function value.  */
9116 
9117 static bool
9118 ix86_function_value_regno_p (const unsigned int regno)
9119 {
9120   switch (regno)
9121     {
9122     case AX_REG:
9123       return true;
9124     case DX_REG:
9125       return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9126     case DI_REG:
9127     case SI_REG:
9128       return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9129 
9130     case BND0_REG:
9131     case BND1_REG:
9132       return chkp_function_instrumented_p (current_function_decl);
9133 
9134       /* Complex values are returned in %st(0)/%st(1) pair.  */
9135     case ST0_REG:
9136     case ST1_REG:
9137       /* TODO: The function should depend on current function ABI but
9138        builtins.c would need updating then. Therefore we use the
9139        default ABI.  */
9140       if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9141 	return false;
9142       return TARGET_FLOAT_RETURNS_IN_80387;
9143 
9144       /* Complex values are returned in %xmm0/%xmm1 pair.  */
9145     case XMM0_REG:
9146     case XMM1_REG:
9147       return TARGET_SSE;
9148 
9149     case MM0_REG:
9150       if (TARGET_MACHO || TARGET_64BIT)
9151 	return false;
9152       return TARGET_MMX;
9153     }
9154 
9155   return false;
9156 }
9157 
9158 /* Define how to find the value returned by a function.
9159    VALTYPE is the data type of the value (as a tree).
9160    If the precise function being called is known, FUNC is its FUNCTION_DECL;
9161    otherwise, FUNC is 0.  */
9162 
9163 static rtx
9164 function_value_32 (machine_mode orig_mode, machine_mode mode,
9165 		   const_tree fntype, const_tree fn)
9166 {
9167   unsigned int regno;
9168 
9169   /* 8-byte vector modes in %mm0.  See ix86_return_in_memory for where
9170      we normally prevent this case when MMX is not available.  However,
9171      some ABIs may require the result to be returned like DImode.  */
9172   if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9173     regno = FIRST_MMX_REG;
9174 
9175   /* 16-byte vector modes in %xmm0.  See ix86_return_in_memory for where
9176      we prevent this case when sse is not available.  However some ABIs
9177      may require the result to be returned like integer TImode.  */
9178   else if (mode == TImode
9179 	   || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9180     regno = FIRST_SSE_REG;
9181 
9182   /* 32-byte vector modes in %ymm0.   */
9183   else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9184     regno = FIRST_SSE_REG;
9185 
9186   /* 64-byte vector modes in %zmm0.   */
9187   else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9188     regno = FIRST_SSE_REG;
9189 
9190   /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387).  */
9191   else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9192     regno = FIRST_FLOAT_REG;
9193   else
9194     /* Most things go in %eax.  */
9195     regno = AX_REG;
9196 
9197   /* Override FP return register with %xmm0 for local functions when
9198      SSE math is enabled or for functions with sseregparm attribute.  */
9199   if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9200     {
9201       int sse_level = ix86_function_sseregparm (fntype, fn, false);
9202       if (sse_level == -1)
9203 	{
9204 	  error ("calling %qD with SSE calling convention without "
9205 		 "SSE/SSE2 enabled", fn);
9206 	  sorry ("this is a GCC bug that can be worked around by adding "
9207 		 "attribute used to function called");
9208 	}
9209       else if ((sse_level >= 1 && mode == SFmode)
9210 	       || (sse_level == 2 && mode == DFmode))
9211 	regno = FIRST_SSE_REG;
9212     }
9213 
9214   /* OImode shouldn't be used directly.  */
9215   gcc_assert (mode != OImode);
9216 
9217   return gen_rtx_REG (orig_mode, regno);
9218 }
9219 
9220 static rtx
9221 function_value_64 (machine_mode orig_mode, machine_mode mode,
9222 		   const_tree valtype)
9223 {
9224   rtx ret;
9225 
9226   /* Handle libcalls, which don't provide a type node.  */
9227   if (valtype == NULL)
9228     {
9229       unsigned int regno;
9230 
9231       switch (mode)
9232 	{
9233 	case E_SFmode:
9234 	case E_SCmode:
9235 	case E_DFmode:
9236 	case E_DCmode:
9237 	case E_TFmode:
9238 	case E_SDmode:
9239 	case E_DDmode:
9240 	case E_TDmode:
9241 	  regno = FIRST_SSE_REG;
9242 	  break;
9243 	case E_XFmode:
9244 	case E_XCmode:
9245 	  regno = FIRST_FLOAT_REG;
9246 	  break;
9247 	case E_TCmode:
9248 	  return NULL;
9249 	default:
9250 	  regno = AX_REG;
9251 	}
9252 
9253       return gen_rtx_REG (mode, regno);
9254     }
9255   else if (POINTER_TYPE_P (valtype))
9256     {
9257       /* Pointers are always returned in word_mode.  */
9258       mode = word_mode;
9259     }
9260 
9261   ret = construct_container (mode, orig_mode, valtype, 1,
9262 			     X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9263 			     x86_64_int_return_registers, 0);
9264 
9265   /* For zero-sized structures, construct_container returns NULL, but we
9266      need to keep the rest of the compiler happy by returning a meaningful value.  */
9267   if (!ret)
9268     ret = gen_rtx_REG (orig_mode, AX_REG);
9269 
9270   return ret;
9271 }
9272 
9273 static rtx
9274 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9275 		      const_tree valtype)
9276 {
9277   unsigned int regno = AX_REG;
9278 
9279   if (TARGET_SSE)
9280     {
9281       switch (GET_MODE_SIZE (mode))
9282 	{
9283 	case 16:
9284 	  if (valtype != NULL_TREE
9285 	      && !VECTOR_INTEGER_TYPE_P (valtype)
9286 	      && !VECTOR_INTEGER_TYPE_P (valtype)
9287 	      && !INTEGRAL_TYPE_P (valtype)
9288 	      && !VECTOR_FLOAT_TYPE_P (valtype))
9289 	    break;
9290 	  if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9291 	      && !COMPLEX_MODE_P (mode))
9292 	    regno = FIRST_SSE_REG;
9293 	  break;
9294 	case 8:
9295 	case 4:
9296 	  if (mode == SFmode || mode == DFmode)
9297 	    regno = FIRST_SSE_REG;
9298 	  break;
9299 	default:
9300 	  break;
9301         }
9302     }
9303   return gen_rtx_REG (orig_mode, regno);
9304 }
9305 
9306 static rtx
9307 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9308 		       machine_mode orig_mode, machine_mode mode)
9309 {
9310   const_tree fn, fntype;
9311 
9312   fn = NULL_TREE;
9313   if (fntype_or_decl && DECL_P (fntype_or_decl))
9314     fn = fntype_or_decl;
9315   fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9316 
9317   if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9318       || POINTER_BOUNDS_MODE_P (mode))
9319     return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9320   else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9321     return function_value_ms_64 (orig_mode, mode, valtype);
9322   else if (TARGET_64BIT)
9323     return function_value_64 (orig_mode, mode, valtype);
9324   else
9325     return function_value_32 (orig_mode, mode, fntype, fn);
9326 }
9327 
9328 static rtx
9329 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9330 {
9331   machine_mode mode, orig_mode;
9332 
9333   orig_mode = TYPE_MODE (valtype);
9334   mode = type_natural_mode (valtype, NULL, true);
9335   return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9336 }
9337 
9338 /*  Return an RTX representing a place where a function returns
9339 /*  Return an RTX representing a place where a function returns
9340     or receives pointer bounds, or NULL if no bounds are returned.
9341     VALTYPE is a data type of a value returned by the function.
9342 
9343     FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9344     or FUNCTION_TYPE of the function.
9345 
9346     If OUTGOING is false, return a place in which the caller will
9347     see the return value.  Otherwise, return a place where a
9348     function returns a value.  */
9349 
9350 static rtx
9351 ix86_function_value_bounds (const_tree valtype,
9352 			    const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9353 			    bool outgoing ATTRIBUTE_UNUSED)
9354 {
9355   rtx res = NULL_RTX;
9356 
9357   if (BOUNDED_TYPE_P (valtype))
9358     res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9359   else if (chkp_type_has_pointer (valtype))
9360     {
9361       bitmap slots;
9362       rtx bounds[2];
9363       bitmap_iterator bi;
9364       unsigned i, bnd_no = 0;
9365 
9366       bitmap_obstack_initialize (NULL);
9367       slots = BITMAP_ALLOC (NULL);
9368       chkp_find_bound_slots (valtype, slots);
9369 
9370       EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9371 	{
9372 	  rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9373 	  rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9374 	  gcc_assert (bnd_no < 2);
9375 	  bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9376 	}
9377 
9378       res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9379 
9380       BITMAP_FREE (slots);
9381       bitmap_obstack_release (NULL);
9382     }
9383   else
9384     res = NULL_RTX;
9385 
9386   return res;
9387 }
9388 
9389 /* Pointer function arguments and return values are promoted to
9390    word_mode for normal functions.  */
9391 
9392 static machine_mode
9393 ix86_promote_function_mode (const_tree type, machine_mode mode,
9394 			    int *punsignedp, const_tree fntype,
9395 			    int for_return)
9396 {
9397   if (cfun->machine->func_type == TYPE_NORMAL
9398       && type != NULL_TREE
9399       && POINTER_TYPE_P (type))
9400     {
9401       *punsignedp = POINTERS_EXTEND_UNSIGNED;
9402       return word_mode;
9403     }
9404   return default_promote_function_mode (type, mode, punsignedp, fntype,
9405 					for_return);
9406 }
9407 
9408 /* Return true if a structure, union or array with MODE containing FIELD
9409    should be accessed using BLKmode.  */
9410 
9411 static bool
9412 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9413 {
9414   /* Union with XFmode must be in BLKmode.  */
9415   return (mode == XFmode
9416 	  && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9417 	      || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9418 }
9419 
9420 rtx
9421 ix86_libcall_value (machine_mode mode)
9422 {
9423   return ix86_function_value_1 (NULL, NULL, mode, mode);
9424 }
9425 
9426 /* Return true iff type is returned in memory.  */
9427 
9428 static bool
9429 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9430 {
9431 #ifdef SUBTARGET_RETURN_IN_MEMORY
9432   return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9433 #else
9434   const machine_mode mode = type_natural_mode (type, NULL, true);
9435   HOST_WIDE_INT size;
9436 
9437   if (POINTER_BOUNDS_TYPE_P (type))
9438     return false;
9439 
9440   if (TARGET_64BIT)
9441     {
9442       if (ix86_function_type_abi (fntype) == MS_ABI)
9443 	{
9444 	  size = int_size_in_bytes (type);
9445 
9446 	  /* __m128 is returned in xmm0.  */
9447 	  if ((!type || VECTOR_INTEGER_TYPE_P (type)
9448 	       || INTEGRAL_TYPE_P (type)
9449 	       || VECTOR_FLOAT_TYPE_P (type))
9450 	      && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9451 	      && !COMPLEX_MODE_P (mode)
9452 	      && (GET_MODE_SIZE (mode) == 16 || size == 16))
9453 	    return false;
9454 
9455 	  /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
9456 	  return size != 1 && size != 2 && size != 4 && size != 8;
9457 	}
9458       else
9459 	{
9460 	  int needed_intregs, needed_sseregs;
9461 
9462 	  return examine_argument (mode, type, 1,
9463 				   &needed_intregs, &needed_sseregs);
9464 	}
9465     }
9466   else
9467     {
9468       size = int_size_in_bytes (type);
9469 
9470       /* Intel MCU psABI returns scalars and aggregates no larger than 8
9471 	 bytes in registers.  */
9472       if (TARGET_IAMCU)
9473 	return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9474 
9475       if (mode == BLKmode)
9476 	return true;
9477 
9478       if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9479 	return false;
9480 
9481       if (VECTOR_MODE_P (mode) || mode == TImode)
9482 	{
9483 	  /* User-created vectors small enough to fit in EAX.  */
9484 	  if (size < 8)
9485 	    return false;
9486 
9487 	  /* Unless the ABI prescribes otherwise,
9488 	     MMX/3dNow values are returned in MM0 if available.  */
9489 
9490 	  if (size == 8)
9491 	    return TARGET_VECT8_RETURNS || !TARGET_MMX;
9492 
9493 	  /* SSE values are returned in XMM0 if available.  */
9494 	  if (size == 16)
9495 	    return !TARGET_SSE;
9496 
9497 	  /* AVX values are returned in YMM0 if available.  */
9498 	  if (size == 32)
9499 	    return !TARGET_AVX;
9500 
9501 	  /* AVX512F values are returned in ZMM0 if available.  */
9502 	  if (size == 64)
9503 	    return !TARGET_AVX512F;
9504 	}
9505 
9506       if (mode == XFmode)
9507 	return false;
9508 
9509       if (size > 12)
9510 	return true;
9511 
9512       /* OImode shouldn't be used directly.  */
9513       gcc_assert (mode != OImode);
9514 
9515       return false;
9516     }
9517 #endif
9518 }
9519 
9520 
9521 /* Create the va_list data type.  */
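/* For the 64-bit SysV ABI the record built below corresponds to

     struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     };

   and va_list is an array of one such record.  */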
9522 
9523 static tree
9524 ix86_build_builtin_va_list_64 (void)
9525 {
9526   tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9527 
9528   record = lang_hooks.types.make_type (RECORD_TYPE);
9529   type_decl = build_decl (BUILTINS_LOCATION,
9530 			  TYPE_DECL, get_identifier ("__va_list_tag"), record);
9531 
9532   f_gpr = build_decl (BUILTINS_LOCATION,
9533 		      FIELD_DECL, get_identifier ("gp_offset"),
9534 		      unsigned_type_node);
9535   f_fpr = build_decl (BUILTINS_LOCATION,
9536 		      FIELD_DECL, get_identifier ("fp_offset"),
9537 		      unsigned_type_node);
9538   f_ovf = build_decl (BUILTINS_LOCATION,
9539 		      FIELD_DECL, get_identifier ("overflow_arg_area"),
9540 		      ptr_type_node);
9541   f_sav = build_decl (BUILTINS_LOCATION,
9542 		      FIELD_DECL, get_identifier ("reg_save_area"),
9543 		      ptr_type_node);
9544 
9545   va_list_gpr_counter_field = f_gpr;
9546   va_list_fpr_counter_field = f_fpr;
9547 
9548   DECL_FIELD_CONTEXT (f_gpr) = record;
9549   DECL_FIELD_CONTEXT (f_fpr) = record;
9550   DECL_FIELD_CONTEXT (f_ovf) = record;
9551   DECL_FIELD_CONTEXT (f_sav) = record;
9552 
9553   TYPE_STUB_DECL (record) = type_decl;
9554   TYPE_NAME (record) = type_decl;
9555   TYPE_FIELDS (record) = f_gpr;
9556   DECL_CHAIN (f_gpr) = f_fpr;
9557   DECL_CHAIN (f_fpr) = f_ovf;
9558   DECL_CHAIN (f_ovf) = f_sav;
9559 
9560   layout_type (record);
9561 
9562   TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9563 					NULL_TREE, TYPE_ATTRIBUTES (record));
9564 
9565   /* The correct type is an array type of one element.  */
9566   return build_array_type (record, build_index_type (size_zero_node));
9567 }
9568 
9569 /* Setup the builtin va_list data type and for 64-bit the additional
9570    calling convention specific va_list data types.  */
9571 
9572 static tree
9573 ix86_build_builtin_va_list (void)
9574 {
9575   if (TARGET_64BIT)
9576     {
9577       /* Initialize ABI specific va_list builtin types.
9578 
9579 	 In lto1, we can encounter two va_list types:
9580 	 - one as a result of the type-merge across TUs, and
9581 	 - the one constructed here.
9582 	 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9583 	 a type identity check in canonical_va_list_type based on
9584 	 TYPE_MAIN_VARIANT (which we used to have) will not work.
9585 	 Instead, we tag each va_list_type_node with its unique attribute, and
9586 	 look for the attribute in the type identity check in
9587 	 canonical_va_list_type.
9588 
9589 	 Tagging sysv_va_list_type_node directly with the attribute is
9590 	 problematic since it's an array of one record, which will decay into a
9591 	 pointer to the record when used as a parameter (see build_va_arg comments
9592 	 an example), dropping the attribute in the process.  So we tag the
9593 	 record instead.  */
9594 
9595       /* For SYSV_ABI we use an array of one record.  */
9596       sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9597 
9598       /* For MS_ABI we use plain pointer to argument area.  */
9599       tree char_ptr_type = build_pointer_type (char_type_node);
9600       tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9601 			     TYPE_ATTRIBUTES (char_ptr_type));
9602       ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9603 
9604       return ((ix86_abi == MS_ABI)
9605 	      ? ms_va_list_type_node
9606 	      : sysv_va_list_type_node);
9607     }
9608   else
9609     {
9610       /* For i386 we use plain pointer to argument area.  */
9611       return build_pointer_type (char_type_node);
9612     }
9613 }
9614 
9615 /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
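/* The register save area filled in below starts with up to six integer
   argument registers (6 * 8 = 48 bytes) and, when SSE is enabled, is
   followed by up to eight 16-byte SSE argument register slots
   (8 * 16 = 128 bytes).  */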
9616 
9617 static void
9618 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9619 {
9620   rtx save_area, mem;
9621   alias_set_type set;
9622   int i, max;
9623 
9624   /* GPR size of varargs save area.  */
9625   if (cfun->va_list_gpr_size)
9626     ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9627   else
9628     ix86_varargs_gpr_size = 0;
9629 
9630   /* FPR size of varargs save area.  We don't need it if we don't pass
9631      anything in SSE registers.  */
9632   if (TARGET_SSE && cfun->va_list_fpr_size)
9633     ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9634   else
9635     ix86_varargs_fpr_size = 0;
9636 
9637   if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9638     return;
9639 
9640   save_area = frame_pointer_rtx;
9641   set = get_varargs_alias_set ();
9642 
9643   max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9644   if (max > X86_64_REGPARM_MAX)
9645     max = X86_64_REGPARM_MAX;
9646 
9647   for (i = cum->regno; i < max; i++)
9648     {
9649       mem = gen_rtx_MEM (word_mode,
9650 			 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9651       MEM_NOTRAP_P (mem) = 1;
9652       set_mem_alias_set (mem, set);
9653       emit_move_insn (mem,
9654 		      gen_rtx_REG (word_mode,
9655 				   x86_64_int_parameter_registers[i]));
9656     }
9657 
9658   if (ix86_varargs_fpr_size)
9659     {
9660       machine_mode smode;
9661       rtx_code_label *label;
9662       rtx test;
9663 
9664       /* Now emit code to save SSE registers.  The AX parameter contains the number
9665 	 of SSE parameter registers used to call this function, though all we
9666 	 actually check here is the zero/non-zero status.  */
9667 
9668       label = gen_label_rtx ();
9669       test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9670       emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9671 				      label));
9672 
9673       /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9674 	 we used movdqa (i.e. TImode) instead?  Perhaps even better would
9675 	 be if we could determine the real mode of the data, via a hook
9676 	 into pass_stdarg.  Ignore all that for now.  */
9677       smode = V4SFmode;
9678       if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9679 	crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9680 
9681       max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9682       if (max > X86_64_SSE_REGPARM_MAX)
9683 	max = X86_64_SSE_REGPARM_MAX;
9684 
9685       for (i = cum->sse_regno; i < max; ++i)
9686 	{
9687 	  mem = plus_constant (Pmode, save_area,
9688 			       i * 16 + ix86_varargs_gpr_size);
9689 	  mem = gen_rtx_MEM (smode, mem);
9690 	  MEM_NOTRAP_P (mem) = 1;
9691 	  set_mem_alias_set (mem, set);
9692 	  set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9693 
9694 	  emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i)));
9695 	}
9696 
9697       emit_label (label);
9698     }
9699 }
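
/* A sketch of the register save area laid out above, assuming the usual
   X86_64_REGPARM_MAX == 6, X86_64_SSE_REGPARM_MAX == 8, 8-byte words and
   both parts of the area being needed:

     save_area +   0 ..  47   six integer argument registers
                              (rdi, rsi, rdx, rcx, r8, r9), one word each
     save_area +  48 .. 175   eight SSE argument registers, 16 bytes each

   The gp_offset and fp_offset va_list fields index into this block, which
   is why ix86_va_start seeds fp_offset with 8 * X86_64_REGPARM_MAX.  */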
9700 
9701 static void
9702 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9703 {
9704   alias_set_type set = get_varargs_alias_set ();
9705   int i;
9706 
9707   /* Reset to zero, as there might be a sysv va_arg used
9708      before.  */
9709   ix86_varargs_gpr_size = 0;
9710   ix86_varargs_fpr_size = 0;
9711 
9712   for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9713     {
9714       rtx reg, mem;
9715 
9716       mem = gen_rtx_MEM (Pmode,
9717 			 plus_constant (Pmode, virtual_incoming_args_rtx,
9718 					i * UNITS_PER_WORD));
9719       MEM_NOTRAP_P (mem) = 1;
9720       set_mem_alias_set (mem, set);
9721 
9722       reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9723       emit_move_insn (mem, reg);
9724     }
9725 }
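
/* Note that for the MS ABI the loop above simply spills the (up to four)
   register parameters rcx, rdx, r8 and r9 into their caller-allocated home
   slots just above the return address, so the plain char * va_list used for
   this ABI can walk every argument as one contiguous stack array.  */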
9726 
9727 static void
9728 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9729 			     tree type, int *, int no_rtl)
9730 {
9731   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9732   CUMULATIVE_ARGS next_cum;
9733   tree fntype;
9734 
9735   /* This argument doesn't appear to be used anymore, which is good,
9736      because the old code here didn't suppress rtl generation.  */
9737   gcc_assert (!no_rtl);
9738 
9739   if (!TARGET_64BIT)
9740     return;
9741 
9742   fntype = TREE_TYPE (current_function_decl);
9743 
9744   /* For varargs, we do not want to skip the dummy va_dcl argument.
9745      For stdargs, we do want to skip the last named argument.  */
9746   next_cum = *cum;
9747   if (stdarg_p (fntype))
9748     ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9749 			       true);
9750 
9751   if (cum->call_abi == MS_ABI)
9752     setup_incoming_varargs_ms_64 (&next_cum);
9753   else
9754     setup_incoming_varargs_64 (&next_cum);
9755 }
9756 
9757 static void
9758 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9759 				   machine_mode mode,
9760 				   tree type,
9761 				   int *pretend_size ATTRIBUTE_UNUSED,
9762 				   int no_rtl)
9763 {
9764   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9765   CUMULATIVE_ARGS next_cum;
9766   tree fntype;
9767   rtx save_area;
9768   int bnd_reg, i, max;
9769 
9770   gcc_assert (!no_rtl);
9771 
9772   /* Do nothing if we use plain pointer to argument area.  */
9773   if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9774     return;
9775 
9776   fntype = TREE_TYPE (current_function_decl);
9777 
9778   /* For varargs, we do not want to skip the dummy va_dcl argument.
9779      For stdargs, we do want to skip the last named argument.  */
9780   next_cum = *cum;
9781   if (stdarg_p (fntype))
9782     ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9783 			       true);
9784   save_area = frame_pointer_rtx;
9785 
9786   max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9787   if (max > X86_64_REGPARM_MAX)
9788     max = X86_64_REGPARM_MAX;
9789 
9790   bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9791   if (chkp_function_instrumented_p (current_function_decl))
9792     for (i = cum->regno; i < max; i++)
9793       {
9794 	rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9795 	rtx ptr = gen_rtx_REG (Pmode,
9796 			       x86_64_int_parameter_registers[i]);
9797 	rtx bounds;
9798 
9799 	if (bnd_reg <= LAST_BND_REG)
9800 	  bounds = gen_rtx_REG (BNDmode, bnd_reg);
9801 	else
9802 	  {
9803 	    rtx ldx_addr =
9804 	      plus_constant (Pmode, arg_pointer_rtx,
9805 			     (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9806 	    bounds = gen_reg_rtx (BNDmode);
9807 	    emit_insn (BNDmode == BND64mode
9808 		       ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9809 		       : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9810 	  }
9811 
9812 	emit_insn (BNDmode == BND64mode
9813 		   ? gen_bnd64_stx (addr, ptr, bounds)
9814 		   : gen_bnd32_stx (addr, ptr, bounds));
9815 
9816 	bnd_reg++;
9817       }
9818 }
9819 
9820 
9821 /* Check whether TYPE is a va_list type that is a plain char pointer.  */
9822 
9823 static bool
9824 is_va_list_char_pointer (tree type)
9825 {
9826   tree canonic;
9827 
9828   /* For 32-bit it is always true.  */
9829   if (!TARGET_64BIT)
9830     return true;
9831   canonic = ix86_canonical_va_list_type (type);
9832   return (canonic == ms_va_list_type_node
9833           || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9834 }
9835 
9836 /* Implement va_start.  */
9837 
9838 static void
9839 ix86_va_start (tree valist, rtx nextarg)
9840 {
9841   HOST_WIDE_INT words, n_gpr, n_fpr;
9842   tree f_gpr, f_fpr, f_ovf, f_sav;
9843   tree gpr, fpr, ovf, sav, t;
9844   tree type;
9845   rtx ovf_rtx;
9846 
9847   if (flag_split_stack
9848       && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9849     {
9850       unsigned int scratch_regno;
9851 
9852       /* When we are splitting the stack, we can't refer to the stack
9853 	 arguments using internal_arg_pointer, because they may be on
9854 	 the old stack.  The split stack prologue will arrange to
9855 	 leave a pointer to the old stack arguments in a scratch
9856 	 register, which we here copy to a pseudo-register.  The split
9857 	 stack prologue can't set the pseudo-register directly because
9858 	 it (the prologue) runs before any registers have been saved.  */
9859 
9860       scratch_regno = split_stack_prologue_scratch_regno ();
9861       if (scratch_regno != INVALID_REGNUM)
9862 	{
9863 	  rtx reg;
9864 	  rtx_insn *seq;
9865 
9866 	  reg = gen_reg_rtx (Pmode);
9867 	  cfun->machine->split_stack_varargs_pointer = reg;
9868 
9869 	  start_sequence ();
9870 	  emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9871 	  seq = get_insns ();
9872 	  end_sequence ();
9873 
9874 	  push_topmost_sequence ();
9875 	  emit_insn_after (seq, entry_of_function ());
9876 	  pop_topmost_sequence ();
9877 	}
9878     }
9879 
9880   /* Only 64-bit targets need something special.  */
9881   if (is_va_list_char_pointer (TREE_TYPE (valist)))
9882     {
9883       if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9884 	std_expand_builtin_va_start (valist, nextarg);
9885       else
9886 	{
9887 	  rtx va_r, next;
9888 
9889 	  va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9890 	  next = expand_binop (ptr_mode, add_optab,
9891 			       cfun->machine->split_stack_varargs_pointer,
9892 			       crtl->args.arg_offset_rtx,
9893 			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
9894 	  convert_move (va_r, next, 0);
9895 
9896 	  /* Store zero bounds for va_list.  */
9897 	  if (chkp_function_instrumented_p (current_function_decl))
9898 	    chkp_expand_bounds_reset_for_mem (valist,
9899 					      make_tree (TREE_TYPE (valist),
9900 							 next));
9901 
9902 	}
9903       return;
9904     }
9905 
9906   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9907   f_fpr = DECL_CHAIN (f_gpr);
9908   f_ovf = DECL_CHAIN (f_fpr);
9909   f_sav = DECL_CHAIN (f_ovf);
9910 
9911   valist = build_simple_mem_ref (valist);
9912   TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9913   /* The following should be folded into the MEM_REF offset.  */
9914   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9915 		f_gpr, NULL_TREE);
9916   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9917 		f_fpr, NULL_TREE);
9918   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9919 		f_ovf, NULL_TREE);
9920   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9921 		f_sav, NULL_TREE);
9922 
9923   /* Count number of gp and fp argument registers used.  */
9924   words = crtl->args.info.words;
9925   n_gpr = crtl->args.info.regno;
9926   n_fpr = crtl->args.info.sse_regno;
9927 
9928   if (cfun->va_list_gpr_size)
9929     {
9930       type = TREE_TYPE (gpr);
9931       t = build2 (MODIFY_EXPR, type,
9932 		  gpr, build_int_cst (type, n_gpr * 8));
9933       TREE_SIDE_EFFECTS (t) = 1;
9934       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9935     }
9936 
9937   if (TARGET_SSE && cfun->va_list_fpr_size)
9938     {
9939       type = TREE_TYPE (fpr);
9940       t = build2 (MODIFY_EXPR, type, fpr,
9941 		  build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9942       TREE_SIDE_EFFECTS (t) = 1;
9943       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9944     }
9945 
9946   /* Find the overflow area.  */
9947   type = TREE_TYPE (ovf);
9948   if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9949     ovf_rtx = crtl->args.internal_arg_pointer;
9950   else
9951     ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9952   t = make_tree (type, ovf_rtx);
9953   if (words != 0)
9954     t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9955 
9956   /* Store zero bounds for overflow area pointer.  */
9957   if (chkp_function_instrumented_p (current_function_decl))
9958     chkp_expand_bounds_reset_for_mem (ovf, t);
9959 
9960   t = build2 (MODIFY_EXPR, type, ovf, t);
9961   TREE_SIDE_EFFECTS (t) = 1;
9962   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9963 
9964   if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9965     {
9966       /* Find the register save area.
9967 	 The function prologue saves it right above the stack frame.  */
9968       type = TREE_TYPE (sav);
9969       t = make_tree (type, frame_pointer_rtx);
9970       if (!ix86_varargs_gpr_size)
9971 	t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9972 
9973       /* Store zero bounds for save area pointer.  */
9974       if (chkp_function_instrumented_p (current_function_decl))
9975 	chkp_expand_bounds_reset_for_mem (sav, t);
9976 
9977       t = build2 (MODIFY_EXPR, type, sav, t);
9978       TREE_SIDE_EFFECTS (t) = 1;
9979       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9980     }
9981 }
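
/* For the SysV case the expansion above behaves roughly like the following
   C (a sketch only; the helper names are illustrative and 6 * 8 assumes the
   usual X86_64_REGPARM_MAX of 6):

     ap->gp_offset = n_named_gp_regs * 8;
     ap->fp_offset = 6 * 8 + n_named_sse_regs * 16;
     ap->overflow_arg_area = first_stack_argument_address;
     ap->reg_save_area = register_save_area_address;  */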
9982 
9983 /* Implement va_arg.  */
9984 
9985 static tree
9986 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9987 		      gimple_seq *post_p)
9988 {
9989   static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9990   tree f_gpr, f_fpr, f_ovf, f_sav;
9991   tree gpr, fpr, ovf, sav, t;
9992   int size, rsize;
9993   tree lab_false, lab_over = NULL_TREE;
9994   tree addr, t2;
9995   rtx container;
9996   int indirect_p = 0;
9997   tree ptrtype;
9998   machine_mode nat_mode;
9999   unsigned int arg_boundary;
10000 
10001   /* Only 64-bit targets need something special.  */
10002   if (is_va_list_char_pointer (TREE_TYPE (valist)))
10003     return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
10004 
10005   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10006   f_fpr = DECL_CHAIN (f_gpr);
10007   f_ovf = DECL_CHAIN (f_fpr);
10008   f_sav = DECL_CHAIN (f_ovf);
10009 
10010   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
10011 		valist, f_gpr, NULL_TREE);
10012 
10013   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
10014   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
10015   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
10016 
10017   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10018   if (indirect_p)
10019     type = build_pointer_type (type);
10020   size = arg_int_size_in_bytes (type);
10021   rsize = CEIL (size, UNITS_PER_WORD);
10022 
10023   nat_mode = type_natural_mode (type, NULL, false);
10024   switch (nat_mode)
10025     {
10026     case E_V8SFmode:
10027     case E_V8SImode:
10028     case E_V32QImode:
10029     case E_V16HImode:
10030     case E_V4DFmode:
10031     case E_V4DImode:
10032     case E_V16SFmode:
10033     case E_V16SImode:
10034     case E_V64QImode:
10035     case E_V32HImode:
10036     case E_V8DFmode:
10037     case E_V8DImode:
10038       /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack.  */
10039       if (!TARGET_64BIT_MS_ABI)
10040 	{
10041 	  container = NULL;
10042 	  break;
10043 	}
10044       /* FALLTHRU */
10045 
10046     default:
10047       container = construct_container (nat_mode, TYPE_MODE (type),
10048 				       type, 0, X86_64_REGPARM_MAX,
10049 				       X86_64_SSE_REGPARM_MAX, intreg,
10050 				       0);
10051       break;
10052     }
10053 
10054   /* Pull the value out of the saved registers.  */
10055 
10056   addr = create_tmp_var (ptr_type_node, "addr");
10057 
10058   if (container)
10059     {
10060       int needed_intregs, needed_sseregs;
10061       bool need_temp;
10062       tree int_addr, sse_addr;
10063 
10064       lab_false = create_artificial_label (UNKNOWN_LOCATION);
10065       lab_over = create_artificial_label (UNKNOWN_LOCATION);
10066 
10067       examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
10068 
10069       need_temp = (!REG_P (container)
10070 		   && ((needed_intregs && TYPE_ALIGN (type) > 64)
10071 		       || TYPE_ALIGN (type) > 128));
10072 
10073       /* In case we are passing a structure, verify that it is a consecutive
10074          block in the register save area.  If not, we need to do moves.  */
10075       if (!need_temp && !REG_P (container))
10076 	{
10077 	  /* Verify that all registers are strictly consecutive.  */
10078 	  if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
10079 	    {
10080 	      int i;
10081 
10082 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10083 		{
10084 		  rtx slot = XVECEXP (container, 0, i);
10085 		  if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
10086 		      || INTVAL (XEXP (slot, 1)) != i * 16)
10087 		    need_temp = true;
10088 		}
10089 	    }
10090 	  else
10091 	    {
10092 	      int i;
10093 
10094 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10095 		{
10096 		  rtx slot = XVECEXP (container, 0, i);
10097 		  if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10098 		      || INTVAL (XEXP (slot, 1)) != i * 8)
10099 		    need_temp = true;
10100 		}
10101 	    }
10102 	}
10103       if (!need_temp)
10104 	{
10105 	  int_addr = addr;
10106 	  sse_addr = addr;
10107 	}
10108       else
10109 	{
10110 	  int_addr = create_tmp_var (ptr_type_node, "int_addr");
10111 	  sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10112 	}
10113 
10114       /* First ensure that we fit completely in registers.  */
10115       if (needed_intregs)
10116 	{
10117 	  t = build_int_cst (TREE_TYPE (gpr),
10118 			     (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10119 	  t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10120 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10121 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10122 	  gimplify_and_add (t, pre_p);
10123 	}
10124       if (needed_sseregs)
10125 	{
10126 	  t = build_int_cst (TREE_TYPE (fpr),
10127 			     (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10128 			     + X86_64_REGPARM_MAX * 8);
10129 	  t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10130 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10131 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10132 	  gimplify_and_add (t, pre_p);
10133 	}
10134 
10135       /* Compute index to start of area used for integer regs.  */
10136       if (needed_intregs)
10137 	{
10138 	  /* int_addr = gpr + sav; */
10139 	  t = fold_build_pointer_plus (sav, gpr);
10140 	  gimplify_assign (int_addr, t, pre_p);
10141 	}
10142       if (needed_sseregs)
10143 	{
10144 	  /* sse_addr = fpr + sav; */
10145 	  t = fold_build_pointer_plus (sav, fpr);
10146 	  gimplify_assign (sse_addr, t, pre_p);
10147 	}
10148       if (need_temp)
10149 	{
10150 	  int i, prev_size = 0;
10151 	  tree temp = create_tmp_var (type, "va_arg_tmp");
10152 
10153 	  /* addr = &temp; */
10154 	  t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10155 	  gimplify_assign (addr, t, pre_p);
10156 
10157 	  for (i = 0; i < XVECLEN (container, 0); i++)
10158 	    {
10159 	      rtx slot = XVECEXP (container, 0, i);
10160 	      rtx reg = XEXP (slot, 0);
10161 	      machine_mode mode = GET_MODE (reg);
10162 	      tree piece_type;
10163 	      tree addr_type;
10164 	      tree daddr_type;
10165 	      tree src_addr, src;
10166 	      int src_offset;
10167 	      tree dest_addr, dest;
10168 	      int cur_size = GET_MODE_SIZE (mode);
10169 
10170 	      gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10171 	      prev_size = INTVAL (XEXP (slot, 1));
10172 	      if (prev_size + cur_size > size)
10173 		{
10174 		  cur_size = size - prev_size;
10175 		  unsigned int nbits = cur_size * BITS_PER_UNIT;
10176 		  if (!int_mode_for_size (nbits, 1).exists (&mode))
10177 		    mode = QImode;
10178 		}
10179 	      piece_type = lang_hooks.types.type_for_mode (mode, 1);
10180 	      if (mode == GET_MODE (reg))
10181 		addr_type = build_pointer_type (piece_type);
10182 	      else
10183 		addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10184 							 true);
10185 	      daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10186 							true);
10187 
10188 	      if (SSE_REGNO_P (REGNO (reg)))
10189 		{
10190 		  src_addr = sse_addr;
10191 		  src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10192 		}
10193 	      else
10194 		{
10195 		  src_addr = int_addr;
10196 		  src_offset = REGNO (reg) * 8;
10197 		}
10198 	      src_addr = fold_convert (addr_type, src_addr);
10199 	      src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10200 
10201 	      dest_addr = fold_convert (daddr_type, addr);
10202 	      dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10203 	      if (cur_size == GET_MODE_SIZE (mode))
10204 		{
10205 		  src = build_va_arg_indirect_ref (src_addr);
10206 		  dest = build_va_arg_indirect_ref (dest_addr);
10207 
10208 		  gimplify_assign (dest, src, pre_p);
10209 		}
10210 	      else
10211 		{
10212 		  tree copy
10213 		    = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10214 				       3, dest_addr, src_addr,
10215 				       size_int (cur_size));
10216 		  gimplify_and_add (copy, pre_p);
10217 		}
10218 	      prev_size += cur_size;
10219 	    }
10220 	}
10221 
10222       if (needed_intregs)
10223 	{
10224 	  t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10225 		      build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10226 	  gimplify_assign (gpr, t, pre_p);
10227 	}
10228 
10229       if (needed_sseregs)
10230 	{
10231 	  t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10232 		      build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10233 	  gimplify_assign (unshare_expr (fpr), t, pre_p);
10234 	}
10235 
10236       gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10237 
10238       gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10239     }
10240 
10241   /* ... otherwise out of the overflow area.  */
10242 
10243   /* When the caller aligns a parameter on the stack, a parameter whose
10244      alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT is only aligned
10245      at MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee with the caller
10246      here.  */
10247   arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10248   if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10249     arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10250 
10251   /* Care for on-stack alignment if needed.  */
10252   if (arg_boundary <= 64 || size == 0)
10253     t = ovf;
10254   else
10255     {
10256       HOST_WIDE_INT align = arg_boundary / 8;
10257       t = fold_build_pointer_plus_hwi (ovf, align - 1);
10258       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10259 		  build_int_cst (TREE_TYPE (t), -align));
10260     }
10261 
10262   gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10263   gimplify_assign (addr, t, pre_p);
10264 
10265   t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10266   gimplify_assign (unshare_expr (ovf), t, pre_p);
10267 
10268   if (container)
10269     gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10270 
10271   ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10272   addr = fold_convert (ptrtype, addr);
10273 
10274   if (indirect_p)
10275     addr = build_va_arg_indirect_ref (addr);
10276   return build_va_arg_indirect_ref (addr);
10277 }
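
/* For a plain integer argument the GIMPLE emitted above corresponds roughly
   to this C (a sketch; 6 * 8 comes from X86_64_REGPARM_MAX, "overflow" and
   "done" play the roles of lab_false and lab_over, and no extra on-stack
   alignment is assumed):

     if (ap->gp_offset >= 6 * 8)
       goto overflow;
     addr = (char *) ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   overflow:
     addr = ap->overflow_arg_area;
     ap->overflow_arg_area = (char *) addr + 8;
   done:
     value = *(int *) addr;  */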
10278 
10279 /* Return true if OPNUM's MEM should be matched
10280    in movabs* patterns.  */
10281 
10282 bool
10283 ix86_check_movabs (rtx insn, int opnum)
10284 {
10285   rtx set, mem;
10286 
10287   set = PATTERN (insn);
10288   if (GET_CODE (set) == PARALLEL)
10289     set = XVECEXP (set, 0, 0);
10290   gcc_assert (GET_CODE (set) == SET);
10291   mem = XEXP (set, opnum);
10292   while (SUBREG_P (mem))
10293     mem = SUBREG_REG (mem);
10294   gcc_assert (MEM_P (mem));
10295   return volatile_ok || !MEM_VOLATILE_P (mem);
10296 }
10297 
10298 /* Return false if INSN contains a MEM with a non-default address space.  */
10299 bool
10300 ix86_check_no_addr_space (rtx insn)
10301 {
10302   subrtx_var_iterator::array_type array;
10303   FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10304     {
10305       rtx x = *iter;
10306       if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10307 	return false;
10308     }
10309   return true;
10310 }
10311 
10312 /* Initialize the table of extra 80387 mathematical constants.  */
10313 
10314 static void
10315 init_ext_80387_constants (void)
10316 {
10317   static const char * cst[5] =
10318   {
10319     "0.3010299956639811952256464283594894482",  /* 0: fldlg2  */
10320     "0.6931471805599453094286904741849753009",  /* 1: fldln2  */
10321     "1.4426950408889634073876517827983434472",  /* 2: fldl2e  */
10322     "3.3219280948873623478083405569094566090",  /* 3: fldl2t  */
10323     "3.1415926535897932385128089594061862044",  /* 4: fldpi   */
10324   };
10325   int i;
10326 
10327   for (i = 0; i < 5; i++)
10328     {
10329       real_from_string (&ext_80387_constants_table[i], cst[i]);
10330       /* Ensure each constant is rounded to XFmode precision.  */
10331       real_convert (&ext_80387_constants_table[i],
10332 		    XFmode, &ext_80387_constants_table[i]);
10333     }
10334 
10335   ext_80387_constants_init = 1;
10336 }
10337 
10338 /* Return non-zero if the constant is something that
10339    can be loaded with a special instruction.  */
10340 
10341 int
10342 standard_80387_constant_p (rtx x)
10343 {
10344   machine_mode mode = GET_MODE (x);
10345 
10346   const REAL_VALUE_TYPE *r;
10347 
10348   if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10349     return -1;
10350 
10351   if (x == CONST0_RTX (mode))
10352     return 1;
10353   if (x == CONST1_RTX (mode))
10354     return 2;
10355 
10356   r = CONST_DOUBLE_REAL_VALUE (x);
10357 
10358   /* For XFmode constants, try to find a special 80387 instruction when
10359      optimizing for size or on those CPUs that benefit from them.  */
10360   if (mode == XFmode
10361       && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10362     {
10363       int i;
10364 
10365       if (! ext_80387_constants_init)
10366 	init_ext_80387_constants ();
10367 
10368       for (i = 0; i < 5; i++)
10369         if (real_identical (r, &ext_80387_constants_table[i]))
10370 	  return i + 3;
10371     }
10372 
10373   /* Load of the constant -0.0 or -1.0 will be split as
10374      fldz;fchs or fld1;fchs sequence.  */
10375   if (real_isnegzero (r))
10376     return 8;
10377   if (real_identical (r, &dconstm1))
10378     return 9;
10379 
10380   return 0;
10381 }
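
/* Summarizing the encoding used above: 1 -> fldz, 2 -> fld1, 3..7 -> the
   extended constants (fldlg2, fldln2, fldl2e, fldl2t, fldpi), 8/9 -> the
   fldz;fchs / fld1;fchs splits for -0.0 and -1.0, 0 -> no special
   instruction, -1 -> not an x87 FP constant at all.
   standard_80387_constant_opcode below relies on this encoding.  */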
10382 
10383 /* Return the opcode of the special instruction to be used to load
10384    the constant X.  */
10385 
10386 const char *
10387 standard_80387_constant_opcode (rtx x)
10388 {
10389   switch (standard_80387_constant_p (x))
10390     {
10391     case 1:
10392       return "fldz";
10393     case 2:
10394       return "fld1";
10395     case 3:
10396       return "fldlg2";
10397     case 4:
10398       return "fldln2";
10399     case 5:
10400       return "fldl2e";
10401     case 6:
10402       return "fldl2t";
10403     case 7:
10404       return "fldpi";
10405     case 8:
10406     case 9:
10407       return "#";
10408     default:
10409       gcc_unreachable ();
10410     }
10411 }
10412 
10413 /* Return the CONST_DOUBLE representing the 80387 constant that is
10414    loaded by the specified special instruction.  The argument IDX
10415    matches the return value from standard_80387_constant_p.  */
10416 
10417 rtx
10418 standard_80387_constant_rtx (int idx)
10419 {
10420   int i;
10421 
10422   if (! ext_80387_constants_init)
10423     init_ext_80387_constants ();
10424 
10425   switch (idx)
10426     {
10427     case 3:
10428     case 4:
10429     case 5:
10430     case 6:
10431     case 7:
10432       i = idx - 3;
10433       break;
10434 
10435     default:
10436       gcc_unreachable ();
10437     }
10438 
10439   return const_double_from_real_value (ext_80387_constants_table[i],
10440 				       XFmode);
10441 }
10442 
10443 /* Return 1 if X is all zero bits and 2 if X is all one bits
10444    in a supported SSE/AVX vector mode, and 0 otherwise.  */
10445 
10446 int
10447 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10448 {
10449   machine_mode mode;
10450 
10451   if (!TARGET_SSE)
10452     return 0;
10453 
10454   mode = GET_MODE (x);
10455 
10456   if (x == const0_rtx || const0_operand (x, mode))
10457     return 1;
10458 
10459   if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10460     {
10461       /* VOIDmode integer constant, get mode from the predicate.  */
10462       if (mode == VOIDmode)
10463 	mode = pred_mode;
10464 
10465       switch (GET_MODE_SIZE (mode))
10466 	{
10467 	case 64:
10468 	  if (TARGET_AVX512F)
10469 	    return 2;
10470 	  break;
10471 	case 32:
10472 	  if (TARGET_AVX2)
10473 	    return 2;
10474 	  break;
10475 	case 16:
10476 	  if (TARGET_SSE2)
10477 	    return 2;
10478 	  break;
10479 	case 0:
10480 	  /* VOIDmode */
10481 	  gcc_unreachable ();
10482 	default:
10483 	  break;
10484 	}
10485     }
10486 
10487   return 0;
10488 }
10489 
10490 /* Return the opcode of the special instruction to be used to load
10491    the constant operands[1] into operands[0].  */
10492 
10493 const char *
10494 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10495 {
10496   machine_mode mode;
10497   rtx x = operands[1];
10498 
10499   gcc_assert (TARGET_SSE);
10500 
10501   mode = GET_MODE (x);
10502 
10503   if (x == const0_rtx || const0_operand (x, mode))
10504     {
10505       switch (get_attr_mode (insn))
10506 	{
10507 	case MODE_TI:
10508 	  if (!EXT_REX_SSE_REG_P (operands[0]))
10509 	    return "%vpxor\t%0, %d0";
10510 	  /* FALLTHRU */
10511 	case MODE_XI:
10512 	case MODE_OI:
10513 	  if (EXT_REX_SSE_REG_P (operands[0]))
10514 	    return (TARGET_AVX512VL
10515 		    ? "vpxord\t%x0, %x0, %x0"
10516 		    : "vpxord\t%g0, %g0, %g0");
10517 	  return "vpxor\t%x0, %x0, %x0";
10518 
10519 	case MODE_V2DF:
10520 	  if (!EXT_REX_SSE_REG_P (operands[0]))
10521 	    return "%vxorpd\t%0, %d0";
10522 	  /* FALLTHRU */
10523 	case MODE_V8DF:
10524 	case MODE_V4DF:
10525 	  if (!EXT_REX_SSE_REG_P (operands[0]))
10526 	    return "vxorpd\t%x0, %x0, %x0";
10527 	  else if (TARGET_AVX512DQ)
10528 	    return (TARGET_AVX512VL
10529 		    ? "vxorpd\t%x0, %x0, %x0"
10530 		    : "vxorpd\t%g0, %g0, %g0");
10531 	  else
10532 	    return (TARGET_AVX512VL
10533 		    ? "vpxorq\t%x0, %x0, %x0"
10534 		    : "vpxorq\t%g0, %g0, %g0");
10535 
10536 	case MODE_V4SF:
10537 	  if (!EXT_REX_SSE_REG_P (operands[0]))
10538 	    return "%vxorps\t%0, %d0";
10539 	  /* FALLTHRU */
10540 	case MODE_V16SF:
10541 	case MODE_V8SF:
10542 	  if (!EXT_REX_SSE_REG_P (operands[0]))
10543 	    return "vxorps\t%x0, %x0, %x0";
10544 	  else if (TARGET_AVX512DQ)
10545 	    return (TARGET_AVX512VL
10546 		    ? "vxorps\t%x0, %x0, %x0"
10547 		    : "vxorps\t%g0, %g0, %g0");
10548 	  else
10549 	    return (TARGET_AVX512VL
10550 		    ? "vpxord\t%x0, %x0, %x0"
10551 		    : "vpxord\t%g0, %g0, %g0");
10552 
10553 	default:
10554 	  gcc_unreachable ();
10555 	}
10556     }
10557   else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10558     {
10559       enum attr_mode insn_mode = get_attr_mode (insn);
10560 
10561       switch (insn_mode)
10562 	{
10563 	case MODE_XI:
10564 	case MODE_V8DF:
10565 	case MODE_V16SF:
10566 	  gcc_assert (TARGET_AVX512F);
10567 	  return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10568 
10569 	case MODE_OI:
10570 	case MODE_V4DF:
10571 	case MODE_V8SF:
10572 	  gcc_assert (TARGET_AVX2);
10573 	  /* FALLTHRU */
10574 	case MODE_TI:
10575 	case MODE_V2DF:
10576 	case MODE_V4SF:
10577 	  gcc_assert (TARGET_SSE2);
10578 	  if (!EXT_REX_SSE_REG_P (operands[0]))
10579 	    return (TARGET_AVX
10580 		    ? "vpcmpeqd\t%0, %0, %0"
10581 		    : "pcmpeqd\t%0, %0");
10582 	  else if (TARGET_AVX512VL)
10583 	    return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10584 	  else
10585 	    return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10586 
10587 	default:
10588 	  gcc_unreachable ();
10589 	}
10590    }
10591 
10592   gcc_unreachable ();
10593 }
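
/* For example, a standard zero constant in a V4SF register is cleared with
   "xorps %xmm0, %xmm0" (or "vxorps" under AVX), and an all-ones TImode
   constant becomes "pcmpeqd %xmm0, %xmm0"; the EVEX-only registers
   (%xmm16 and up) fall back to the vpxord / vpternlogd forms above.  */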
10594 
10595 /* Returns true if INSN can be transformed from a memory load
10596    to a supported FP constant load.  */
10597 
10598 bool
10599 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10600 {
10601   rtx src = find_constant_src (insn);
10602 
10603   gcc_assert (REG_P (dst));
10604 
10605   if (src == NULL
10606       || (SSE_REGNO_P (REGNO (dst))
10607 	  && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10608       || (STACK_REGNO_P (REGNO (dst))
10609 	   && standard_80387_constant_p (src) < 1))
10610     return false;
10611 
10612   return true;
10613 }
10614 
10615 /* Returns true if OP contains a symbol reference.  */
10616 
10617 bool
10618 symbolic_reference_mentioned_p (rtx op)
10619 {
10620   const char *fmt;
10621   int i;
10622 
10623   if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10624     return true;
10625 
10626   fmt = GET_RTX_FORMAT (GET_CODE (op));
10627   for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10628     {
10629       if (fmt[i] == 'E')
10630 	{
10631 	  int j;
10632 
10633 	  for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10634 	    if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10635 	      return true;
10636 	}
10637 
10638       else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10639 	return true;
10640     }
10641 
10642   return false;
10643 }
10644 
10645 /* Return true if it is appropriate to emit `ret' instructions in the
10646    body of a function.  Do this only if the epilogue is simple, needing a
10647    couple of insns.  Prior to reloading, we can't tell how many registers
10648    must be saved, so return false then.  Return false if there is no frame
10649    marker to de-allocate.  */
10650 
10651 bool
10652 ix86_can_use_return_insn_p (void)
10653 {
10654   if (ix86_function_naked (current_function_decl))
10655     return false;
10656 
10657   /* Don't use `ret' instruction in interrupt handler.  */
10658   if (! reload_completed
10659       || frame_pointer_needed
10660       || cfun->machine->func_type != TYPE_NORMAL)
10661     return 0;
10662 
10663   /* Don't allow more than 32k pop, since that's all we can do
10664      with one instruction.  */
10665   if (crtl->args.pops_args && crtl->args.size >= 32768)
10666     return 0;
10667 
10668   struct ix86_frame &frame = cfun->machine->frame;
10669   return (frame.stack_pointer_offset == UNITS_PER_WORD
10670 	  && (frame.nregs + frame.nsseregs) == 0);
10671 }
10672 
10673 /* Value should be nonzero if functions must have frame pointers.
10674    Zero means the frame pointer need not be set up (and parms may
10675    be accessed via the stack pointer) in functions that seem suitable.  */
10676 
10677 static bool
10678 ix86_frame_pointer_required (void)
10679 {
10680   /* If we accessed previous frames, then the generated code expects
10681      to be able to access the saved ebp value in our frame.  */
10682   if (cfun->machine->accesses_prev_frame)
10683     return true;
10684 
10685   /* Several x86 OSes need a frame pointer for other reasons,
10686      usually pertaining to setjmp.  */
10687   if (SUBTARGET_FRAME_POINTER_REQUIRED)
10688     return true;
10689 
10690   /* For older 32-bit runtimes, setjmp requires a valid frame pointer.  */
10691   if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10692     return true;
10693 
10694   /* For Win64 SEH, very large frames need a frame pointer, as the maximum
10695      stack allocation is 4GB.  */
10696   if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10697     return true;
10698 
10699   /* SSE saves require a frame pointer when the stack is misaligned.  */
10700   if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10701     return true;
10702 
10703   /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10704      turns off the frame pointer by default.  Turn it back on now if
10705      we've not got a leaf function.  */
10706   if (TARGET_OMIT_LEAF_FRAME_POINTER
10707       && (!crtl->is_leaf
10708 	  || ix86_current_function_calls_tls_descriptor))
10709     return true;
10710 
10711   if (crtl->profile && !flag_fentry)
10712     return true;
10713 
10714   return false;
10715 }
10716 
10717 /* Record that the current function accesses previous call frames.  */
10718 
10719 void
10720 ix86_setup_frame_addresses (void)
10721 {
10722   cfun->machine->accesses_prev_frame = 1;
10723 }
10724 
10725 #ifndef USE_HIDDEN_LINKONCE
10726 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10727 #  define USE_HIDDEN_LINKONCE 1
10728 # else
10729 #  define USE_HIDDEN_LINKONCE 0
10730 # endif
10731 #endif
10732 
10733 /* Label count for call and return thunks.  It is used to make unique
10734    labels in call and return thunks.  */
10735 static int indirectlabelno;
10736 
10737 /* True if call thunk function is needed.  */
10738 static bool indirect_thunk_needed = false;
10739 /* True if call thunk function with the BND prefix is needed.  */
10740 static bool indirect_thunk_bnd_needed = false;
10741 
10742 /* Bit mask of integer registers which contain the branch target, used
10743    by call thunk functions.  */
10744 static int indirect_thunks_used;
10745 /* Bit mask of integer registers which contain the branch target, used
10746    by call thunk functions with the BND prefix.  */
10747 static int indirect_thunks_bnd_used;
10748 
10749 /* True if return thunk function is needed.  */
10750 static bool indirect_return_needed = false;
10751 /* True if return thunk function with the BND prefix is needed.  */
10752 static bool indirect_return_bnd_needed = false;
10753 
10754 /* True if return thunk function via CX is needed.  */
10755 static bool indirect_return_via_cx;
10756 /* True if return thunk function via CX with the BND prefix is
10757    needed.  */
10758 static bool indirect_return_via_cx_bnd;
10759 
10760 #ifndef INDIRECT_LABEL
10761 # define INDIRECT_LABEL "LIND"
10762 #endif
10763 
10764 /* Indicate what prefix is needed for an indirect branch.  */
10765 enum indirect_thunk_prefix
10766 {
10767   indirect_thunk_prefix_none,
10768   indirect_thunk_prefix_bnd,
10769   indirect_thunk_prefix_nt
10770 };
10771 
10772 /* Return the prefix needed for an indirect branch INSN.  */
10773 
10774 enum indirect_thunk_prefix
10775 indirect_thunk_need_prefix (rtx_insn *insn)
10776 {
10777   enum indirect_thunk_prefix need_prefix;
10778   if (ix86_bnd_prefixed_insn_p (insn))
10779     need_prefix = indirect_thunk_prefix_bnd;
10780   else if ((cfun->machine->indirect_branch_type
10781 	    == indirect_branch_thunk_extern)
10782 	   && ix86_notrack_prefixed_insn_p (insn))
10783     {
10784       /* NOTRACK prefix is only used with external thunk so that it
10785 	 can be properly updated to support CET at run-time.  */
10786       need_prefix = indirect_thunk_prefix_nt;
10787     }
10788   else
10789     need_prefix = indirect_thunk_prefix_none;
10790   return need_prefix;
10791 }
10792 
10793 /* Fills in the label name that should be used for the indirect thunk.  */
10794 
10795 static void
10796 indirect_thunk_name (char name[32], unsigned int regno,
10797 		     enum indirect_thunk_prefix need_prefix,
10798 		     bool ret_p)
10799 {
10800   if (regno != INVALID_REGNUM && regno != CX_REG && ret_p)
10801     gcc_unreachable ();
10802 
10803   if (USE_HIDDEN_LINKONCE)
10804     {
10805       const char *prefix;
10806 
10807       if (need_prefix == indirect_thunk_prefix_bnd)
10808 	prefix = "_bnd";
10809       else if (need_prefix == indirect_thunk_prefix_nt
10810 	       && regno != INVALID_REGNUM)
10811 	{
10812 	  /* The NOTRACK prefix is only used with an external thunk reached via
10813 	     a register, so that the NOTRACK prefix can be added to the indirect
10814 	     branch through that register to support CET at run-time.  */
10815 	  prefix = "_nt";
10816 	}
10817       else
10818 	prefix = "";
10819 
10820       const char *ret = ret_p ? "return" : "indirect";
10821 
10822       if (regno != INVALID_REGNUM)
10823 	{
10824 	  const char *reg_prefix;
10825 	  if (LEGACY_INT_REGNO_P (regno))
10826 	    reg_prefix = TARGET_64BIT ? "r" : "e";
10827 	  else
10828 	    reg_prefix = "";
10829 	  sprintf (name, "__x86_%s_thunk%s_%s%s",
10830 		   ret, prefix, reg_prefix, reg_names[regno]);
10831 	}
10832       else
10833 	sprintf (name, "__x86_%s_thunk%s", ret, prefix);
10834     }
10835   else
10836     {
10837       if (regno != INVALID_REGNUM)
10838 	{
10839 	  if (need_prefix == indirect_thunk_prefix_bnd)
10840 	    ASM_GENERATE_INTERNAL_LABEL (name, "LITBR", regno);
10841 	  else
10842 	    ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
10843 	}
10844       else
10845 	{
10846 	  if (ret_p)
10847 	    {
10848 	      if (need_prefix == indirect_thunk_prefix_bnd)
10849 		ASM_GENERATE_INTERNAL_LABEL (name, "LRTB", 0);
10850 	      else
10851 		ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
10852 	    }
10853 	  else
10854 	    {
10855 	      if (need_prefix == indirect_thunk_prefix_bnd)
10856 		ASM_GENERATE_INTERNAL_LABEL (name, "LITB", 0);
10857 	      else
10858 		ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
10859 	    }
10860 	}
10861     }
10862 }
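
/* With USE_HIDDEN_LINKONCE the scheme above yields names such as
   "__x86_indirect_thunk" (target address on the stack),
   "__x86_indirect_thunk_rax" (target address in %rax on 64-bit) and
   "__x86_return_thunk_bnd" (return thunk with the BND prefix); these are
   just example instantiations of the sprintf formats above.  */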
10863 
10864 /* Output a call and return thunk for indirect branch.  If NEED_PREFIX is
10865    indirect_thunk_prefix_bnd, the BND prefix is needed.  If REGNO is not
10866    INVALID_REGNUM, the function address is in REGNO and the thunk looks like:
10867 
10868 	call	L2
10869    L1:
10870 	pause
10871 	lfence
10872 	jmp	L1
10873    L2:
10874 	mov	%REG, (%sp)
10875 	ret
10876 
10877    Otherwise, the function address is on the top of stack and the
10878    call and return thunk looks like:
10879 
10880 	call L2
10881   L1:
10882 	pause
10883 	lfence
10884 	jmp L1
10885   L2:
10886 	lea WORD_SIZE(%sp), %sp
10887 	ret
10888  */
10889 
10890 static void
10891 output_indirect_thunk (enum indirect_thunk_prefix need_prefix,
10892 		       unsigned int regno)
10893 {
10894   char indirectlabel1[32];
10895   char indirectlabel2[32];
10896 
10897   ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
10898 			       indirectlabelno++);
10899   ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
10900 			       indirectlabelno++);
10901 
10902   /* Call */
10903   if (need_prefix == indirect_thunk_prefix_bnd)
10904     fputs ("\tbnd call\t", asm_out_file);
10905   else
10906     fputs ("\tcall\t", asm_out_file);
10907   assemble_name_raw (asm_out_file, indirectlabel2);
10908   fputc ('\n', asm_out_file);
10909 
10910   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
10911 
10912   /* AMD and Intel CPUs each prefer a different instruction as the loop filler.
10913      Using both pause + lfence is a compromise solution.  */
10914   fprintf (asm_out_file, "\tpause\n\tlfence\n");
10915 
10916   /* Jump.  */
10917   fputs ("\tjmp\t", asm_out_file);
10918   assemble_name_raw (asm_out_file, indirectlabel1);
10919   fputc ('\n', asm_out_file);
10920 
10921   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
10922 
10923   /* The above call insn pushed a word onto the stack.  Adjust CFI info.  */
10924   if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ())
10925     {
10926       if (! dwarf2out_do_cfi_asm ())
10927 	{
10928 	  dw_cfi_ref xcfi = ggc_cleared_alloc<dw_cfi_node> ();
10929 	  xcfi->dw_cfi_opc = DW_CFA_advance_loc4;
10930 	  xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2);
10931 	  vec_safe_push (cfun->fde->dw_fde_cfi, xcfi);
10932 	}
10933       dw_cfi_ref xcfi = ggc_cleared_alloc<dw_cfi_node> ();
10934       xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset;
10935       xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD;
10936       vec_safe_push (cfun->fde->dw_fde_cfi, xcfi);
10937       dwarf2out_emit_cfi (xcfi);
10938     }
10939 
10940   if (regno != INVALID_REGNUM)
10941     {
10942       /* MOV.  */
10943       rtx xops[2];
10944       xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
10945       xops[1] = gen_rtx_REG (word_mode, regno);
10946       output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
10947     }
10948   else
10949     {
10950       /* LEA.  */
10951       rtx xops[2];
10952       xops[0] = stack_pointer_rtx;
10953       xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10954       output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
10955     }
10956 
10957   if (need_prefix == indirect_thunk_prefix_bnd)
10958     fputs ("\tbnd ret\n", asm_out_file);
10959   else
10960     fputs ("\tret\n", asm_out_file);
10961 }
10962 
10963 /* Output a function with a call and return thunk for indirect branch.
10964    If NEED_PREFIX is indirect_thunk_prefix_bnd, the BND prefix is needed.
10965    If REGNO != INVALID_REGNUM, the function address is in REGNO.
10966    Otherwise, the function address is on the top of the stack.  The thunk
10967    is used for a function return if RET_P is true.  */
10968 
10969 static void
10970 output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix,
10971 				unsigned int regno, bool ret_p)
10972 {
10973   char name[32];
10974   tree decl;
10975 
10976   /* Create __x86_indirect_thunk/__x86_indirect_thunk_bnd.  */
10977   indirect_thunk_name (name, regno, need_prefix, ret_p);
10978   decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10979 		     get_identifier (name),
10980 		     build_function_type_list (void_type_node, NULL_TREE));
10981   DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10982 				   NULL_TREE, void_type_node);
10983   TREE_PUBLIC (decl) = 1;
10984   TREE_STATIC (decl) = 1;
10985   DECL_IGNORED_P (decl) = 1;
10986 
10987 #if TARGET_MACHO
10988   if (TARGET_MACHO)
10989     {
10990       switch_to_section (darwin_sections[picbase_thunk_section]);
10991       fputs ("\t.weak_definition\t", asm_out_file);
10992       assemble_name (asm_out_file, name);
10993       fputs ("\n\t.private_extern\t", asm_out_file);
10994       assemble_name (asm_out_file, name);
10995       putc ('\n', asm_out_file);
10996       ASM_OUTPUT_LABEL (asm_out_file, name);
10997       DECL_WEAK (decl) = 1;
10998     }
10999   else
11000 #endif
11001     if (USE_HIDDEN_LINKONCE)
11002       {
11003 	cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11004 
11005 	targetm.asm_out.unique_section (decl, 0);
11006 	switch_to_section (get_named_section (decl, NULL, 0));
11007 
11008 	targetm.asm_out.globalize_label (asm_out_file, name);
11009 	fputs ("\t.hidden\t", asm_out_file);
11010 	assemble_name (asm_out_file, name);
11011 	putc ('\n', asm_out_file);
11012 	ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11013       }
11014     else
11015       {
11016 	switch_to_section (text_section);
11017 	ASM_OUTPUT_LABEL (asm_out_file, name);
11018       }
11019 
11020   DECL_INITIAL (decl) = make_node (BLOCK);
11021   current_function_decl = decl;
11022   allocate_struct_function (decl, false);
11023   init_function_start (decl);
11024   /* We're about to hide the function body from callees of final_* by
11025      emitting it directly; tell them we're a thunk, if they care.  */
11026   cfun->is_thunk = true;
11027   first_function_block_is_cold = false;
11028   /* Make sure unwind info is emitted for the thunk if needed.  */
11029   final_start_function (emit_barrier (), asm_out_file, 1);
11030 
11031   output_indirect_thunk (need_prefix, regno);
11032 
11033   final_end_function ();
11034   init_insn_lengths ();
11035   free_after_compilation (cfun);
11036   set_cfun (NULL);
11037   current_function_decl = NULL;
11038 }
11039 
11040 static int pic_labels_used;
11041 
11042 /* Fills in the label name that should be used for a pc thunk for
11043    the given register.  */
11044 
11045 static void
11046 get_pc_thunk_name (char name[32], unsigned int regno)
11047 {
11048   gcc_assert (!TARGET_64BIT);
11049 
11050   if (USE_HIDDEN_LINKONCE)
11051     sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11052   else
11053     ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11054 }
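
/* With USE_HIDDEN_LINKONCE this yields, for example, "__x86.get_pc_thunk.bx"
   for %ebx; the thunk bodies themselves are emitted by ix86_code_end.  */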
11055 
11056 
11057 /* Emit the pending thunks: the indirect-branch thunks and, for -fpic,
11058    the pc thunks that load a register with the caller's return address.  */
11059 
11060 static void
11061 ix86_code_end (void)
11062 {
11063   rtx xops[2];
11064   unsigned int regno;
11065 
11066   if (indirect_return_needed)
11067     output_indirect_thunk_function (indirect_thunk_prefix_none,
11068 				    INVALID_REGNUM, true);
11069   if (indirect_return_bnd_needed)
11070     output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11071 				    INVALID_REGNUM, true);
11072 
11073   if (indirect_return_via_cx)
11074     output_indirect_thunk_function (indirect_thunk_prefix_none,
11075 				    CX_REG, true);
11076   if (indirect_return_via_cx_bnd)
11077     output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11078 				    CX_REG, true);
11079 
11080   if (indirect_thunk_needed)
11081     output_indirect_thunk_function (indirect_thunk_prefix_none,
11082 				    INVALID_REGNUM, false);
11083   if (indirect_thunk_bnd_needed)
11084     output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11085 				    INVALID_REGNUM, false);
11086 
11087   for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
11088     {
11089       unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
11090       if ((indirect_thunks_used & (1 << i)))
11091 	output_indirect_thunk_function (indirect_thunk_prefix_none,
11092 					regno, false);
11093 
11094       if ((indirect_thunks_bnd_used & (1 << i)))
11095 	output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11096 					regno, false);
11097     }
11098 
11099   for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
11100     {
11101       char name[32];
11102       tree decl;
11103 
11104       if ((indirect_thunks_used & (1 << regno)))
11105 	output_indirect_thunk_function (indirect_thunk_prefix_none,
11106 					regno, false);
11107 
11108       if ((indirect_thunks_bnd_used & (1 << regno)))
11109 	output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11110 					regno, false);
11111 
11112       if (!(pic_labels_used & (1 << regno)))
11113 	continue;
11114 
11115       get_pc_thunk_name (name, regno);
11116 
11117       decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11118 			 get_identifier (name),
11119 			 build_function_type_list (void_type_node, NULL_TREE));
11120       DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11121 				       NULL_TREE, void_type_node);
11122       TREE_PUBLIC (decl) = 1;
11123       TREE_STATIC (decl) = 1;
11124       DECL_IGNORED_P (decl) = 1;
11125 
11126 #if TARGET_MACHO
11127       if (TARGET_MACHO)
11128 	{
11129 	  switch_to_section (darwin_sections[picbase_thunk_section]);
11130 	  fputs ("\t.weak_definition\t", asm_out_file);
11131 	  assemble_name (asm_out_file, name);
11132 	  fputs ("\n\t.private_extern\t", asm_out_file);
11133 	  assemble_name (asm_out_file, name);
11134 	  putc ('\n', asm_out_file);
11135 	  ASM_OUTPUT_LABEL (asm_out_file, name);
11136 	  DECL_WEAK (decl) = 1;
11137 	}
11138       else
11139 #endif
11140       if (USE_HIDDEN_LINKONCE)
11141 	{
11142 	  cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11143 
11144 	  targetm.asm_out.unique_section (decl, 0);
11145 	  switch_to_section (get_named_section (decl, NULL, 0));
11146 
11147 	  targetm.asm_out.globalize_label (asm_out_file, name);
11148 	  fputs ("\t.hidden\t", asm_out_file);
11149 	  assemble_name (asm_out_file, name);
11150 	  putc ('\n', asm_out_file);
11151 	  ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11152 	}
11153       else
11154 	{
11155 	  switch_to_section (text_section);
11156 	  ASM_OUTPUT_LABEL (asm_out_file, name);
11157 	}
11158 
11159       DECL_INITIAL (decl) = make_node (BLOCK);
11160       current_function_decl = decl;
11161       allocate_struct_function (decl, false);
11162       init_function_start (decl);
11163       /* We're about to hide the function body from callees of final_* by
11164 	 emitting it directly; tell them we're a thunk, if they care.  */
11165       cfun->is_thunk = true;
11166       first_function_block_is_cold = false;
11167       /* Make sure unwind info is emitted for the thunk if needed.  */
11168       final_start_function (emit_barrier (), asm_out_file, 1);
11169 
11170       /* Pad stack IP move with 4 instructions (two NOPs count
11171 	 as one instruction).  */
11172       if (TARGET_PAD_SHORT_FUNCTION)
11173 	{
11174 	  int i = 8;
11175 
11176 	  while (i--)
11177 	    fputs ("\tnop\n", asm_out_file);
11178 	}
11179 
11180       xops[0] = gen_rtx_REG (Pmode, regno);
11181       xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11182       output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11183       output_asm_insn ("%!ret", NULL);
11184       final_end_function ();
11185       init_insn_lengths ();
11186       free_after_compilation (cfun);
11187       set_cfun (NULL);
11188       current_function_decl = NULL;
11189     }
11190 
11191   if (flag_split_stack)
11192     file_end_indicate_split_stack ();
11193 }
11194 
11195 /* Emit code for the SET_GOT patterns.  */
11196 
11197 const char *
11198 output_set_got (rtx dest, rtx label)
11199 {
11200   rtx xops[3];
11201 
11202   xops[0] = dest;
11203 
11204   if (TARGET_VXWORKS_RTP && flag_pic)
11205     {
11206       /* Load (*VXWORKS_GOTT_BASE) into the PIC register.  */
11207       xops[2] = gen_rtx_MEM (Pmode,
11208 			     gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11209       output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11210 
11211       /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11212 	 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11213 	 an unadorned address.  */
11214       xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11215       SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11216       output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11217       return "";
11218     }
11219 
11220   xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11221 
11222   if (flag_pic)
11223     {
11224       char name[32];
11225       get_pc_thunk_name (name, REGNO (dest));
11226       pic_labels_used |= 1 << REGNO (dest);
11227 
11228       xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11229       xops[2] = gen_rtx_MEM (QImode, xops[2]);
11230       output_asm_insn ("%!call\t%X2", xops);
11231 
11232 #if TARGET_MACHO
11233       /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11234          This is what will be referenced by the Mach-O PIC subsystem.  */
11235       if (machopic_should_output_picbase_label () || !label)
11236 	ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11237 
11238       /* When we are restoring the pic base at the site of a nonlocal label,
11239          and we decided to emit the pic base above, we will still output a
11240          local label used for calculating the correction offset (even though
11241          the offset will be 0 in that case).  */
11242       if (label)
11243         targetm.asm_out.internal_label (asm_out_file, "L",
11244 					   CODE_LABEL_NUMBER (label));
11245 #endif
11246     }
11247   else
11248     {
11249       if (TARGET_MACHO)
11250 	/* We don't need a pic base, we're not producing pic.  */
11251 	gcc_unreachable ();
11252 
11253       xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11254       output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11255       targetm.asm_out.internal_label (asm_out_file, "L",
11256 				      CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11257     }
11258 
11259   if (!TARGET_MACHO)
11260     output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11261 
11262   return "";
11263 }
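
/* For the common 32-bit -fpic case the sequence emitted above is,
   schematically (AT&T syntax, %ebx as the GOT register):

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   The referenced pc thunk is emitted later by ix86_code_end.  */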
11264 
11265 /* Generate a "push" pattern for input ARG.  */
11266 
11267 static rtx
11268 gen_push (rtx arg)
11269 {
11270   struct machine_function *m = cfun->machine;
11271 
11272   if (m->fs.cfa_reg == stack_pointer_rtx)
11273     m->fs.cfa_offset += UNITS_PER_WORD;
11274   m->fs.sp_offset += UNITS_PER_WORD;
11275 
11276   if (REG_P (arg) && GET_MODE (arg) != word_mode)
11277     arg = gen_rtx_REG (word_mode, REGNO (arg));
11278 
11279   return gen_rtx_SET (gen_rtx_MEM (word_mode,
11280 				   gen_rtx_PRE_DEC (Pmode,
11281 						    stack_pointer_rtx)),
11282 		      arg);
11283 }
11284 
11285 /* Generate a "pop" pattern for input ARG.  */
11286 
11287 static rtx
11288 gen_pop (rtx arg)
11289 {
11290   if (REG_P (arg) && GET_MODE (arg) != word_mode)
11291     arg = gen_rtx_REG (word_mode, REGNO (arg));
11292 
11293   return gen_rtx_SET (arg,
11294 		      gen_rtx_MEM (word_mode,
11295 				   gen_rtx_POST_INC (Pmode,
11296 						     stack_pointer_rtx)));
11297 }
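
/* Schematically, for a word-mode register ARG on 64-bit these two helpers
   build RTL of the form:

     push:  (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))
     pop:   (set (reg:DI arg) (mem:DI (post_inc:DI (reg:DI sp))))  */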
11298 
11299 /* Return the number of an unused call-clobbered register available for
11300    the entire function, or INVALID_REGNUM if there is none.  */
11301 
11302 static unsigned int
11303 ix86_select_alt_pic_regnum (void)
11304 {
11305   if (ix86_use_pseudo_pic_reg ())
11306     return INVALID_REGNUM;
11307 
11308   if (crtl->is_leaf
11309       && !crtl->profile
11310       && !ix86_current_function_calls_tls_descriptor)
11311     {
11312       int i, drap;
11313       /* Can't use the same register for both PIC and DRAP.  */
11314       if (crtl->drap_reg)
11315 	drap = REGNO (crtl->drap_reg);
11316       else
11317 	drap = -1;
11318       for (i = 2; i >= 0; --i)
11319         if (i != drap && !df_regs_ever_live_p (i))
11320 	  return i;
11321     }
11322 
11323   return INVALID_REGNUM;
11324 }
11325 
11326 /* Return true if REGNO is used by the epilogue.  */
11327 
11328 bool
11329 ix86_epilogue_uses (int regno)
11330 {
11331   /* If there are no caller-saved registers, we preserve all registers,
11332      except for MMX and x87 registers which aren't supported when saving
11333      and restoring registers.  Don't explicitly save SP register since
11334      it is always preserved.  */
11335   return (epilogue_completed
11336 	  && cfun->machine->no_caller_saved_registers
11337 	  && !fixed_regs[regno]
11338 	  && !STACK_REGNO_P (regno)
11339 	  && !MMX_REGNO_P (regno));
11340 }
11341 
11342 /* Return nonzero if register REGNO can be used as a scratch register
11343    in peephole2.  */
11344 
11345 static bool
11346 ix86_hard_regno_scratch_ok (unsigned int regno)
11347 {
11348   /* If there are no caller-saved registers, we can't use any register
11349      as a scratch register after epilogue and use REGNO as scratch
11350      register only if it has been used before to avoid saving and
11351      restoring it.  */
11352   return (!cfun->machine->no_caller_saved_registers
11353 	  || (!epilogue_completed
11354 	      && df_regs_ever_live_p (regno)));
11355 }
11356 
11357 /* Return true if register class CL should be an additional allocno
11358    class.  */
11359 
11360 static bool
11361 ix86_additional_allocno_class_p (reg_class_t cl)
11362 {
11363   return cl == MOD4_SSE_REGS;
11364 }
11365 
11366 /* Return TRUE if we need to save REGNO.  */
11367 
11368 static bool
11369 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
11370 {
11371   /* If there are no caller-saved registers, we preserve all registers,
11372      except for MMX and x87 registers which aren't supported when saving
11373      and restoring registers.  Don't explicitly save SP register since
11374      it is always preserved.  */
11375   if (cfun->machine->no_caller_saved_registers)
11376     {
11377       /* Don't preserve registers used for function return value.  */
11378       rtx reg = crtl->return_rtx;
11379       if (reg)
11380 	{
11381 	  unsigned int i = REGNO (reg);
11382 	  unsigned int nregs = REG_NREGS (reg);
11383 	  while (nregs-- > 0)
11384 	    if ((i + nregs) == regno)
11385 	      return false;
11386 
11387 	  reg = crtl->return_bnd;
11388 	  if (reg)
11389 	    {
11390 	      i = REGNO (reg);
11391 	      nregs = REG_NREGS (reg);
11392 	      while (nregs-- > 0)
11393 		if ((i + nregs) == regno)
11394 		  return false;
11395 	    }
11396 	}
11397 
11398       return (df_regs_ever_live_p (regno)
11399 	      && !fixed_regs[regno]
11400 	      && !STACK_REGNO_P (regno)
11401 	      && !MMX_REGNO_P (regno)
11402 	      && (regno != HARD_FRAME_POINTER_REGNUM
11403 		  || !frame_pointer_needed));
11404     }
11405 
11406   if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
11407       && pic_offset_table_rtx)
11408     {
11409       if (ix86_use_pseudo_pic_reg ())
11410 	{
11411 	  /* REAL_PIC_OFFSET_TABLE_REGNUM is used by the call to _mcount
11412 	     in the prologue.  */
11413 	  if (!TARGET_64BIT && flag_pic && crtl->profile)
11414 	    return true;
11415 	}
11416       else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11417 	       || crtl->profile
11418 	       || crtl->calls_eh_return
11419 	       || crtl->uses_const_pool
11420 	       || cfun->has_nonlocal_label)
11421         return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
11422     }
11423 
11424   if (crtl->calls_eh_return && maybe_eh_return)
11425     {
11426       unsigned i;
11427       for (i = 0; ; i++)
11428 	{
11429 	  unsigned test = EH_RETURN_DATA_REGNO (i);
11430 	  if (test == INVALID_REGNUM)
11431 	    break;
11432 	  if (test == regno)
11433 	    return true;
11434 	}
11435     }
11436 
11437   if (ignore_outlined && cfun->machine->call_ms2sysv)
11438     {
11439       unsigned count = cfun->machine->call_ms2sysv_extra_regs
11440 		       + xlogue_layout::MIN_REGS;
11441       if (xlogue_layout::is_stub_managed_reg (regno, count))
11442 	return false;
11443     }
11444 
11445   if (crtl->drap_reg
11446       && regno == REGNO (crtl->drap_reg)
11447       && !cfun->machine->no_drap_save_restore)
11448     return true;
11449 
11450   return (df_regs_ever_live_p (regno)
11451 	  && !call_used_regs[regno]
11452 	  && !fixed_regs[regno]
11453 	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11454 }
11455 
11456 /* Return number of saved general purpose registers.  */
11457 
11458 static int
11459 ix86_nsaved_regs (void)
11460 {
11461   int nregs = 0;
11462   int regno;
11463 
11464   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11465     if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11466       nregs ++;
11467   return nregs;
11468 }
11469 
11470 /* Return number of saved SSE registers.  */
11471 
11472 static int
11473 ix86_nsaved_sseregs (void)
11474 {
11475   int nregs = 0;
11476   int regno;
11477 
11478   if (!TARGET_64BIT_MS_ABI)
11479     return 0;
11480   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11481     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11482       nregs ++;
11483   return nregs;
11484 }
11485 
11486 /* Given FROM and TO register numbers, say whether this elimination is
11487    allowed.  If stack alignment is needed, we can only replace argument
11488    pointer with hard frame pointer, or replace frame pointer with stack
11489    pointer.  Otherwise, frame pointer elimination is automatically
11490    handled and all other eliminations are valid.  */
11491 
11492 static bool
11493 ix86_can_eliminate (const int from, const int to)
11494 {
11495   if (stack_realign_fp)
11496     return ((from == ARG_POINTER_REGNUM
11497 	     && to == HARD_FRAME_POINTER_REGNUM)
11498 	    || (from == FRAME_POINTER_REGNUM
11499 		&& to == STACK_POINTER_REGNUM));
11500   else
11501     return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11502 }
11503 
11504 /* Return the offset between two registers, one to be eliminated, and the other
11505    its replacement, at the start of a routine.  */
11506 
11507 HOST_WIDE_INT
11508 ix86_initial_elimination_offset (int from, int to)
11509 {
11510   struct ix86_frame &frame = cfun->machine->frame;
11511 
11512   if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11513     return frame.hard_frame_pointer_offset;
11514   else if (from == FRAME_POINTER_REGNUM
11515 	   && to == HARD_FRAME_POINTER_REGNUM)
11516     return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11517   else
11518     {
11519       gcc_assert (to == STACK_POINTER_REGNUM);
11520 
11521       if (from == ARG_POINTER_REGNUM)
11522 	return frame.stack_pointer_offset;
11523 
11524       gcc_assert (from == FRAME_POINTER_REGNUM);
11525       return frame.stack_pointer_offset - frame.frame_pointer_offset;
11526     }
11527 }
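
/* Worked example with hypothetical offsets: if frame.stack_pointer_offset
   is 64 and frame.frame_pointer_offset is 16, eliminating
   FRAME_POINTER_REGNUM to STACK_POINTER_REGNUM yields 64 - 16 = 48, i.e.
   the soft frame pointer lies 48 bytes above the final stack pointer.  */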
11528 
11529 /* In a dynamically-aligned function, we can't know the offset from
11530    stack pointer to frame pointer, so we must ensure that setjmp
11531    eliminates fp against the hard fp (%ebp) rather than trying to
11532    index from %esp up to the top of the frame across a gap that is
11533    of unknown (at compile-time) size.  */
11534 static rtx
11535 ix86_builtin_setjmp_frame_value (void)
11536 {
11537   return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11538 }
11539 
11540 /* Emit a one-time warning for unsupported ms_abi to sysv prologues/epilogues.  */
11541 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11542 {
11543   static bool warned_once = false;
11544   if (!warned_once)
11545     {
11546       warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11547 	       feature);
11548       warned_once = true;
11549     }
11550 }
11551 
11552 /* Return the probing interval for -fstack-clash-protection.  */
11553 
11554 static HOST_WIDE_INT
11555 get_probe_interval (void)
11556 {
11557   if (flag_stack_clash_protection)
11558     return (HOST_WIDE_INT_1U
11559 	    << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
11560   else
11561     return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
11562 }
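
/* For example, assuming the usual default of 12 for the stack-clash probe
   interval parameter (and for STACK_CHECK_PROBE_INTERVAL_EXP), either
   branch above yields 1 << 12 == 4096, i.e. a probe at least every 4 KiB
   of allocated stack.  */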
11563 
11564 /* When using -fsplit-stack, the allocation routines set a field in
11565    the TCB to the bottom of the stack plus this much space, measured
11566    in bytes.  */
11567 
11568 #define SPLIT_STACK_AVAILABLE 256
11569 
11570 /* Fill the ix86_frame structure describing the current function's frame.  */
11571 
11572 static void
11573 ix86_compute_frame_layout (void)
11574 {
11575   struct ix86_frame *frame = &cfun->machine->frame;
11576   struct machine_function *m = cfun->machine;
11577   unsigned HOST_WIDE_INT stack_alignment_needed;
11578   HOST_WIDE_INT offset;
11579   unsigned HOST_WIDE_INT preferred_alignment;
11580   HOST_WIDE_INT size = get_frame_size ();
11581   HOST_WIDE_INT to_allocate;
11582 
11583   /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11584    * ms_abi functions that call a sysv function.  We now need to prune away
11585    * cases where it should be disabled.  */
11586   if (TARGET_64BIT && m->call_ms2sysv)
11587     {
11588       gcc_assert (TARGET_64BIT_MS_ABI);
11589       gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11590       gcc_assert (!TARGET_SEH);
11591       gcc_assert (TARGET_SSE);
11592       gcc_assert (!ix86_using_red_zone ());
11593 
11594       if (crtl->calls_eh_return)
11595 	{
11596 	  gcc_assert (!reload_completed);
11597 	  m->call_ms2sysv = false;
11598 	  warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11599 	}
11600 
11601       else if (ix86_static_chain_on_stack)
11602 	{
11603 	  gcc_assert (!reload_completed);
11604 	  m->call_ms2sysv = false;
11605 	  warn_once_call_ms2sysv_xlogues ("static call chains");
11606 	}
11607 
11608       /* Finally, compute which registers the stub will manage.  */
11609       else
11610 	{
11611 	  unsigned count = xlogue_layout::count_stub_managed_regs ();
11612 	  m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11613 	  m->call_ms2sysv_pad_in = 0;
11614 	}
11615     }
11616 
11617   frame->nregs = ix86_nsaved_regs ();
11618   frame->nsseregs = ix86_nsaved_sseregs ();
11619 
11620   /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11621      except for function prologues, leaf functions and when the default
11622      incoming stack boundary is overridden at the command line or via the
11623      force_align_arg_pointer attribute.
11624 
11625      Darwin's ABI specifies 128-bit alignment for both 32- and 64-bit
11626      variants at call sites, including profile function calls.
11627  */
11628   if (((TARGET_64BIT_MS_ABI || TARGET_MACHO)
11629         && crtl->preferred_stack_boundary < 128)
11630       && (!crtl->is_leaf || cfun->calls_alloca != 0
11631 	  || ix86_current_function_calls_tls_descriptor
11632 	  || (TARGET_MACHO && crtl->profile)
11633 	  || ix86_incoming_stack_boundary < 128))
11634     {
11635       crtl->preferred_stack_boundary = 128;
11636       crtl->stack_alignment_needed = 128;
11637     }
11638 
11639   stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11640   preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11641 
11642   gcc_assert (!size || stack_alignment_needed);
11643   gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11644   gcc_assert (preferred_alignment <= stack_alignment_needed);
11645 
11646   /* The only ABI saving SSE regs should be 64-bit ms_abi.  */
11647   gcc_assert (TARGET_64BIT || !frame->nsseregs);
11648   if (TARGET_64BIT && m->call_ms2sysv)
11649     {
11650       gcc_assert (stack_alignment_needed >= 16);
11651       gcc_assert (!frame->nsseregs);
11652     }
11653 
11654   /* For SEH we have to limit the amount of code movement into the prologue.
11655      At present we do this via a BLOCKAGE, at which point there's very little
11656      scheduling that can be done, which means that there's very little point
11657      in doing anything except PUSHs.  */
11658   if (TARGET_SEH)
11659     m->use_fast_prologue_epilogue = false;
11660   else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11661     {
11662       int count = frame->nregs;
11663       struct cgraph_node *node = cgraph_node::get (current_function_decl);
11664 
11665       /* The fast prologue uses move instead of push to save registers.  This
11666          is significantly longer, but also executes faster as modern hardware
11667          can execute the moves in parallel, but can't do that for push/pop.
11668 
11669 	 Be careful about choosing which prologue to emit:  when the function
11670 	 takes many instructions to execute, or is known to lie outside a hot
11671 	 spot (the latter is known only with profile feedback), we may use the
11672 	 slow version as well.  Weight the size of the function by the number
11673 	 of registers to save, as it is cheap to use one or two push
11674 	 instructions but very slow to use many of them.  */
11675       if (count)
11676 	count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11677       if (node->frequency < NODE_FREQUENCY_NORMAL
11678 	  || (flag_branch_probabilities
11679 	      && node->frequency < NODE_FREQUENCY_HOT))
11680 	m->use_fast_prologue_epilogue = false;
11681       else
11682 	m->use_fast_prologue_epilogue
11683 	   = !expensive_function_p (count);
11684     }
11685 
11686   frame->save_regs_using_mov
11687     = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11688        /* If static stack checking is enabled and done with probes,
11689 	  the registers need to be saved before allocating the frame.  */
11690        && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11691 
11692   /* Skip return address and error code in exception handler.  */
11693   offset = INCOMING_FRAME_SP_OFFSET;
11694 
11695   /* Skip pushed static chain.  */
11696   if (ix86_static_chain_on_stack)
11697     offset += UNITS_PER_WORD;
11698 
11699   /* Skip saved base pointer.  */
11700   if (frame_pointer_needed)
11701     offset += UNITS_PER_WORD;
11702   frame->hfp_save_offset = offset;
11703 
11704   /* The traditional frame pointer location is at the top of the frame.  */
11705   frame->hard_frame_pointer_offset = offset;
11706 
11707   /* Register save area */
11708   offset += frame->nregs * UNITS_PER_WORD;
11709   frame->reg_save_offset = offset;
11710 
11711   /* On SEH target, registers are pushed just before the frame pointer
11712      location.  */
11713   if (TARGET_SEH)
11714     frame->hard_frame_pointer_offset = offset;
11715 
11716   /* Calculate the size of the va-arg area (not including padding, if any).  */
11717   frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11718 
11719   /* Also adjust stack_realign_offset for the largest alignment of
11720      stack slot actually used.  */
11721   if (stack_realign_fp
11722       || (cfun->machine->max_used_stack_alignment != 0
11723 	  && (offset % cfun->machine->max_used_stack_alignment) != 0))
11724     {
11725       /* We may need a 16-byte aligned stack for the remainder of the
11726 	 register save area, but the stack frame for the local function
11727 	 may require a greater alignment if using AVX/2/512.  In order
11728 	 to avoid wasting space, we first calculate the space needed for
11729 	 the rest of the register saves, add that to the stack pointer,
11730 	 and then realign the stack to the boundary of the start of the
11731 	 frame for the local function.  */
11732       HOST_WIDE_INT space_needed = 0;
11733       HOST_WIDE_INT sse_reg_space_needed = 0;
11734 
11735       if (TARGET_64BIT)
11736 	{
11737 	  if (m->call_ms2sysv)
11738 	    {
11739 	      m->call_ms2sysv_pad_in = 0;
11740 	      space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11741 	    }
11742 
11743 	  else if (frame->nsseregs)
11744 	    /* The only ABI that has saved SSE registers (Win64) also has a
11745 	       16-byte aligned default stack.  However, many programs violate
11746 	       the ABI, and Wine64 forces stack realignment to compensate.  */
11747 	    space_needed = frame->nsseregs * 16;
11748 
11749 	  sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11750 
11751 	  /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11752 	     we round anyway to be pedantic.  */
11753 	  space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11754 	}
11755       else
11756 	space_needed = frame->va_arg_size;
11757 
11758       /* Record the allocation size required prior to the realignment AND.  */
11759       frame->stack_realign_allocate = space_needed;
11760 
11761       /* The re-aligned stack starts at frame->stack_realign_offset.  Values
11762 	 before this point are not directly comparable with values below
11763 	 this point.  Use sp_valid_at to determine if the stack pointer is
11764 	 valid for a given offset, fp_valid_at for the frame pointer, or
11765 	 choose_baseaddr to have a base register chosen for you.
11766 
11767 	 Note that the result of (frame->stack_realign_offset
11768 	 & (stack_alignment_needed - 1)) may not equal zero.  */
11769       offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11770       frame->stack_realign_offset = offset - space_needed;
11771       frame->sse_reg_save_offset = frame->stack_realign_offset
11772 							+ sse_reg_space_needed;
11773     }
11774   else
11775     {
11776       frame->stack_realign_offset = offset;
11777 
11778       if (TARGET_64BIT && m->call_ms2sysv)
11779 	{
11780 	  m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11781 	  offset += xlogue_layout::get_instance ().get_stack_space_used ();
11782 	}
11783 
11784       /* Align and set SSE register save area.  */
11785       else if (frame->nsseregs)
11786 	{
11787 	  /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11788 	     required and the DRAP re-alignment boundary is at least 16 bytes,
11789 	     then we want the SSE register save area properly aligned.  */
11790 	  if (ix86_incoming_stack_boundary >= 128
11791 		  || (stack_realign_drap && stack_alignment_needed >= 16))
11792 	    offset = ROUND_UP (offset, 16);
11793 	  offset += frame->nsseregs * 16;
11794 	}
11795       frame->sse_reg_save_offset = offset;
11796       offset += frame->va_arg_size;
11797     }
11798 
11799   /* Align start of frame for local function.  When a function call
11800      is removed, it may become a leaf function.  But if argument may
11801      be passed on stack, we need to align the stack when there is no
11802      tail call.  */
11803   if (m->call_ms2sysv
11804       || frame->va_arg_size != 0
11805       || size != 0
11806       || !crtl->is_leaf
11807       || (!crtl->tail_call_emit
11808 	  && cfun->machine->outgoing_args_on_stack)
11809       || cfun->calls_alloca
11810       || ix86_current_function_calls_tls_descriptor)
11811     offset = ROUND_UP (offset, stack_alignment_needed);
11812 
11813   /* Frame pointer points here.  */
11814   frame->frame_pointer_offset = offset;
11815 
11816   offset += size;
11817 
11818   /* Add outgoing arguments area.  Can be skipped if we eliminated
11819      all the function calls as dead code.
11820      Skipping is however impossible when function calls alloca.  Alloca
11821      expander assumes that last crtl->outgoing_args_size
11822      of stack frame are unused.  */
11823   if (ACCUMULATE_OUTGOING_ARGS
11824       && (!crtl->is_leaf || cfun->calls_alloca
11825 	  || ix86_current_function_calls_tls_descriptor))
11826     {
11827       offset += crtl->outgoing_args_size;
11828       frame->outgoing_arguments_size = crtl->outgoing_args_size;
11829     }
11830   else
11831     frame->outgoing_arguments_size = 0;
11832 
11833   /* Align stack boundary.  Only needed if we're calling another function
11834      or using alloca.  */
11835   if (!crtl->is_leaf || cfun->calls_alloca
11836       || ix86_current_function_calls_tls_descriptor)
11837     offset = ROUND_UP (offset, preferred_alignment);
11838 
11839   /* We've reached end of stack frame.  */
11840   frame->stack_pointer_offset = offset;
11841 
11842   /* Size prologue needs to allocate.  */
11843   to_allocate = offset - frame->sse_reg_save_offset;
11844 
11845   if ((!to_allocate && frame->nregs <= 1)
11846       || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
11847       /* If stack clash probing needs a loop, then it needs a
11848 	 scratch register.  But the returned register is only guaranteed
11849 	 to be safe to use after register saves are complete.  So if
11850 	 stack clash protections are enabled and the allocated frame is
11851 	 larger than the probe interval, then use pushes to save
11852 	 callee saved registers.  */
11853       || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
11854     frame->save_regs_using_mov = false;
11855 
11856   if (ix86_using_red_zone ()
11857       && crtl->sp_is_unchanging
11858       && crtl->is_leaf
11859       && !ix86_pc_thunk_call_expanded
11860       && !ix86_current_function_calls_tls_descriptor)
11861     {
11862       frame->red_zone_size = to_allocate;
11863       if (frame->save_regs_using_mov)
11864 	frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11865       if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11866 	frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11867     }
11868   else
11869     frame->red_zone_size = 0;
11870   frame->stack_pointer_offset -= frame->red_zone_size;
11871 
11872   /* The SEH frame pointer location is near the bottom of the frame.
11873      This is enforced by the fact that the difference between the
11874      stack pointer and the frame pointer is limited to 240 bytes in
11875      the unwind data structure.  */
11876   if (TARGET_SEH)
11877     {
11878       HOST_WIDE_INT diff;
11879 
11880       /* If we can leave the frame pointer where it is, do so.  This also
11881 	 makes __builtin_frame_address (0) return the establisher frame.  */
11882       diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11883       if (diff <= SEH_MAX_FRAME_SIZE
11884 	  && (diff > 240 || (diff & 15) != 0)
11885 	  && !crtl->accesses_prior_frames)
11886 	{
11887 	  /* Ideally we'd determine what portion of the local stack frame
11888 	     (within the constraint of the lowest 240) is most heavily used.
11889 	     But without that complication, simply bias the frame pointer
11890 	     by 128 bytes so as to maximize the amount of the local stack
11891 	     frame that is addressable with 8-bit offsets.  */
11892 	  frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
11893 	}
11894     }
11895 }
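
/* Roughly, for a common 64-bit frame the offsets computed above describe,
   from the CFA downwards: the return address (plus the error code for
   exception handlers), an optional pushed static chain, the saved frame
   pointer, the GP register save area, the SSE/stub-managed save area and
   va_arg area, the aligned local frame, outgoing argument space, and
   finally the stack pointer (less any red zone usage).  This summary is
   descriptive only; the authoritative layout is the code above.  */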
11896 
11897 /* This is semi-inlined memory_address_length, but simplified
11898    since we know that we're always dealing with reg+offset, and
11899    to avoid having to create and discard all that rtl.  */
11900 
11901 static inline int
11902 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11903 {
11904   int len = 4;
11905 
11906   if (offset == 0)
11907     {
11908       /* EBP and R13 cannot be encoded without an offset.  */
11909       len = (regno == BP_REG || regno == R13_REG);
11910     }
11911   else if (IN_RANGE (offset, -128, 127))
11912     len = 1;
11913 
11914   /* ESP and R12 must be encoded with a SIB byte.  */
11915   if (regno == SP_REG || regno == R12_REG)
11916     len++;
11917 
11918   return len;
11919 }
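
/* Worked examples for the encoding lengths above: 0(%rbp) and 0(%r13) still
   need a zero disp8, so the length is 1; 16(%rsp) needs a disp8 plus the
   mandatory SIB byte, so 2; 512(%rbx) is outside the disp8 range and needs
   a 4-byte disp32.  (Illustrative cases only.)  */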
11920 
11921 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11922    the frame save area.  The register is saved at CFA - CFA_OFFSET.  */
11923 
11924 static bool
11925 sp_valid_at (HOST_WIDE_INT cfa_offset)
11926 {
11927   const struct machine_frame_state &fs = cfun->machine->fs;
11928   if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11929     {
11930       /* Validate that the cfa_offset isn't in a "no-man's land".  */
11931       gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11932       return false;
11933     }
11934   return fs.sp_valid;
11935 }
11936 
11937 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11938    the frame save area.  The register is saved at CFA - CFA_OFFSET.  */
11939 
11940 static inline bool
11941 fp_valid_at (HOST_WIDE_INT cfa_offset)
11942 {
11943   const struct machine_frame_state &fs = cfun->machine->fs;
11944   if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11945     {
11946       /* Validate that the cfa_offset isn't in a "no-man's land".  */
11947       gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11948       return false;
11949     }
11950   return fs.fp_valid;
11951 }
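
/* Illustration with made-up numbers: with the stack pointer realigned,
   fs.sp_realigned_offset == 32 and fs.sp_realigned_fp_last == 64, CFA
   offsets <= 32 may only be accessed through the frame pointer, offsets
   > 64 only through the realigned stack pointer, and offsets in between
   through either, provided the corresponding fp_valid/sp_valid flag is
   set.  */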
11952 
11953 /* Choose a base register based upon alignment requested, speed and/or
11954    size.  */
11955 
11956 static void
11957 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11958 		HOST_WIDE_INT &base_offset,
11959 		unsigned int align_reqested, unsigned int *align)
11960 {
11961   const struct machine_function *m = cfun->machine;
11962   unsigned int hfp_align;
11963   unsigned int drap_align;
11964   unsigned int sp_align;
11965   bool hfp_ok  = fp_valid_at (cfa_offset);
11966   bool drap_ok = m->fs.drap_valid;
11967   bool sp_ok   = sp_valid_at (cfa_offset);
11968 
11969   hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11970 
11971   /* Filter out any registers that don't meet the requested alignment
11972      criteria.  */
11973   if (align_reqested)
11974     {
11975       if (m->fs.realigned)
11976 	hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11977       /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11978 	 notes (which we would need in order to use a realigned stack
11979 	 pointer), so disable this on SEH targets.  */
11980       else if (m->fs.sp_realigned)
11981 	sp_align = crtl->stack_alignment_needed;
11982 
11983       hfp_ok = hfp_ok && hfp_align >= align_reqested;
11984       drap_ok = drap_ok && drap_align >= align_reqested;
11985       sp_ok = sp_ok && sp_align >= align_reqested;
11986     }
11987 
11988   if (m->use_fast_prologue_epilogue)
11989     {
11990       /* Choose the base register most likely to allow the most scheduling
11991          opportunities.  Generally FP is valid throughout the function,
11992          while DRAP must be reloaded within the epilogue.  But choose either
11993          over the SP due to increased encoding size.  */
11994 
11995       if (hfp_ok)
11996 	{
11997 	  base_reg = hard_frame_pointer_rtx;
11998 	  base_offset = m->fs.fp_offset - cfa_offset;
11999 	}
12000       else if (drap_ok)
12001 	{
12002 	  base_reg = crtl->drap_reg;
12003 	  base_offset = 0 - cfa_offset;
12004 	}
12005       else if (sp_ok)
12006 	{
12007 	  base_reg = stack_pointer_rtx;
12008 	  base_offset = m->fs.sp_offset - cfa_offset;
12009 	}
12010     }
12011   else
12012     {
12013       HOST_WIDE_INT toffset;
12014       int len = 16, tlen;
12015 
12016       /* Choose the base register with the smallest address encoding.
12017          With a tie, choose FP > DRAP > SP.  */
12018       if (sp_ok)
12019 	{
12020 	  base_reg = stack_pointer_rtx;
12021 	  base_offset = m->fs.sp_offset - cfa_offset;
12022           len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12023 	}
12024       if (drap_ok)
12025 	{
12026 	  toffset = 0 - cfa_offset;
12027 	  tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12028 	  if (tlen <= len)
12029 	    {
12030 	      base_reg = crtl->drap_reg;
12031 	      base_offset = toffset;
12032 	      len = tlen;
12033 	    }
12034 	}
12035       if (hfp_ok)
12036 	{
12037 	  toffset = m->fs.fp_offset - cfa_offset;
12038 	  tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12039 	  if (tlen <= len)
12040 	    {
12041 	      base_reg = hard_frame_pointer_rtx;
12042 	      base_offset = toffset;
12043 	      len = tlen;
12044 	    }
12045 	}
12046     }
12047 
12048     /* Set the align return value.  */
12049     if (align)
12050       {
12051 	if (base_reg == stack_pointer_rtx)
12052 	  *align = sp_align;
12053 	else if (base_reg == crtl->drap_reg)
12054 	  *align = drap_align;
12055 	else if (base_reg == hard_frame_pointer_rtx)
12056 	  *align = hfp_align;
12057       }
12058 }
12059 
12060 /* Return an RTX that points to CFA_OFFSET within the stack frame and
12061    the alignment of address.  If ALIGN is non-null, it should point to
12062    an alignment value (in bits) that is preferred or zero and will
12063    receive the alignment of the base register that was selected,
12064    irrespective of whether or not CFA_OFFSET is a multiple of that
12065    alignment value.  If it is possible for the base register offset to be
12066    non-immediate then SCRATCH_REGNO should specify a scratch register to
12067    use.
12068 
12069    The valid base registers are taken from CFUN->MACHINE->FS.  */
12070 
12071 static rtx
12072 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
12073 		 unsigned int scratch_regno = INVALID_REGNUM)
12074 {
12075   rtx base_reg = NULL;
12076   HOST_WIDE_INT base_offset = 0;
12077 
12078   /* If a specific alignment is requested, try to get a base register
12079      with that alignment first.  */
12080   if (align && *align)
12081     choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
12082 
12083   if (!base_reg)
12084     choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
12085 
12086   gcc_assert (base_reg != NULL);
12087 
12088   rtx base_offset_rtx = GEN_INT (base_offset);
12089 
12090   if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
12091     {
12092       gcc_assert (scratch_regno != INVALID_REGNUM);
12093 
12094       rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
12095       emit_move_insn (scratch_reg, base_offset_rtx);
12096 
12097       return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
12098     }
12099 
12100   return plus_constant (Pmode, base_reg, base_offset);
12101 }
12102 
12103 /* Emit code to save registers in the prologue.  */
12104 
12105 static void
12106 ix86_emit_save_regs (void)
12107 {
12108   unsigned int regno;
12109   rtx_insn *insn;
12110 
12111   for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12112     if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12113       {
12114 	insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12115 	RTX_FRAME_RELATED_P (insn) = 1;
12116       }
12117 }
12118 
12119 /* Emit a single register save at CFA - CFA_OFFSET.  */
12120 
12121 static void
12122 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12123 			      HOST_WIDE_INT cfa_offset)
12124 {
12125   struct machine_function *m = cfun->machine;
12126   rtx reg = gen_rtx_REG (mode, regno);
12127   rtx mem, addr, base, insn;
12128   unsigned int align = GET_MODE_ALIGNMENT (mode);
12129 
12130   addr = choose_baseaddr (cfa_offset, &align);
12131   mem = gen_frame_mem (mode, addr);
12132 
12133   /* The location alignment depends upon the base register.  */
12134   align = MIN (GET_MODE_ALIGNMENT (mode), align);
12135   gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
12136   set_mem_align (mem, align);
12137 
12138   insn = emit_insn (gen_rtx_SET (mem, reg));
12139   RTX_FRAME_RELATED_P (insn) = 1;
12140 
12141   base = addr;
12142   if (GET_CODE (base) == PLUS)
12143     base = XEXP (base, 0);
12144   gcc_checking_assert (REG_P (base));
12145 
12146   /* When saving registers into a re-aligned local stack frame, avoid
12147      any tricky guessing by dwarf2out.  */
12148   if (m->fs.realigned)
12149     {
12150       gcc_checking_assert (stack_realign_drap);
12151 
12152       if (regno == REGNO (crtl->drap_reg))
12153 	{
12154 	  /* A bit of a hack.  We force the DRAP register to be saved in
12155 	     the re-aligned stack frame, which provides us with a copy
12156 	     of the CFA that will last past the prologue.  Install it.  */
12157 	  gcc_checking_assert (cfun->machine->fs.fp_valid);
12158 	  addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12159 				cfun->machine->fs.fp_offset - cfa_offset);
12160 	  mem = gen_rtx_MEM (mode, addr);
12161 	  add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12162 	}
12163       else
12164 	{
12165 	  /* The frame pointer is a stable reference within the
12166 	     aligned frame.  Use it.  */
12167 	  gcc_checking_assert (cfun->machine->fs.fp_valid);
12168 	  addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12169 				cfun->machine->fs.fp_offset - cfa_offset);
12170 	  mem = gen_rtx_MEM (mode, addr);
12171 	  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12172 	}
12173     }
12174 
12175   else if (base == stack_pointer_rtx && m->fs.sp_realigned
12176 	   && cfa_offset >= m->fs.sp_realigned_offset)
12177     {
12178       gcc_checking_assert (stack_realign_fp);
12179       add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12180     }
12181 
12182   /* The memory may not be relative to the current CFA register,
12183      which means that we may need to generate a new pattern for
12184      use by the unwind info.  */
12185   else if (base != m->fs.cfa_reg)
12186     {
12187       addr = plus_constant (Pmode, m->fs.cfa_reg,
12188 			    m->fs.cfa_offset - cfa_offset);
12189       mem = gen_rtx_MEM (mode, addr);
12190       add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12191     }
12192 }
12193 
12194 /* Emit code to save registers using MOV insns.
12195    First register is stored at CFA - CFA_OFFSET.  */
12196 static void
12197 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12198 {
12199   unsigned int regno;
12200 
12201   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12202     if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12203       {
12204         ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12205 	cfa_offset -= UNITS_PER_WORD;
12206       }
12207 }
12208 
12209 /* Emit code to save SSE registers using MOV insns.
12210    First register is stored at CFA - CFA_OFFSET.  */
12211 static void
12212 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12213 {
12214   unsigned int regno;
12215 
12216   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12217     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12218       {
12219 	ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12220 	cfa_offset -= GET_MODE_SIZE (V4SFmode);
12221       }
12222 }
12223 
12224 static GTY(()) rtx queued_cfa_restores;
12225 
12226 /* Add a REG_CFA_RESTORE REG note to INSN or queue it until the next stack
12227    manipulation insn.  The value is on the stack at CFA - CFA_OFFSET.
12228    Don't add the note if the previously saved value will be left untouched
12229    within the stack red zone until return, as unwinders can find the same value
12230    in the register and on the stack.  */
12231 
12232 static void
12233 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12234 {
12235   if (!crtl->shrink_wrapped
12236       && cfa_offset <= cfun->machine->fs.red_zone_offset)
12237     return;
12238 
12239   if (insn)
12240     {
12241       add_reg_note (insn, REG_CFA_RESTORE, reg);
12242       RTX_FRAME_RELATED_P (insn) = 1;
12243     }
12244   else
12245     queued_cfa_restores
12246       = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12247 }
12248 
12249 /* Add queued REG_CFA_RESTORE notes if any to INSN.  */
12250 
12251 static void
12252 ix86_add_queued_cfa_restore_notes (rtx insn)
12253 {
12254   rtx last;
12255   if (!queued_cfa_restores)
12256     return;
12257   for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12258     ;
12259   XEXP (last, 1) = REG_NOTES (insn);
12260   REG_NOTES (insn) = queued_cfa_restores;
12261   queued_cfa_restores = NULL_RTX;
12262   RTX_FRAME_RELATED_P (insn) = 1;
12263 }
12264 
12265 /* Expand prologue or epilogue stack adjustment.
12266    The pattern exists to put a dependency on all ebp-based memory accesses.
12267    STYLE should be negative if instructions should be marked as frame related,
12268    zero if the %r11 register is live and cannot be freely used, and positive
12269    otherwise.  */
12270 
12271 static rtx
12272 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12273 			   int style, bool set_cfa)
12274 {
12275   struct machine_function *m = cfun->machine;
12276   rtx insn;
12277   bool add_frame_related_expr = false;
12278 
12279   if (Pmode == SImode)
12280     insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12281   else if (x86_64_immediate_operand (offset, DImode))
12282     insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12283   else
12284     {
12285       rtx tmp;
12286       /* r11 is used by indirect sibcall return as well, set before the
12287 	 epilogue and used after the epilogue.  */
12288       if (style)
12289         tmp = gen_rtx_REG (DImode, R11_REG);
12290       else
12291 	{
12292 	  gcc_assert (src != hard_frame_pointer_rtx
12293 		      && dest != hard_frame_pointer_rtx);
12294 	  tmp = hard_frame_pointer_rtx;
12295 	}
12296       insn = emit_insn (gen_rtx_SET (tmp, offset));
12297       if (style < 0)
12298 	add_frame_related_expr = true;
12299 
12300       insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12301     }
12302 
12303   insn = emit_insn (insn);
12304   if (style >= 0)
12305     ix86_add_queued_cfa_restore_notes (insn);
12306 
12307   if (set_cfa)
12308     {
12309       rtx r;
12310 
12311       gcc_assert (m->fs.cfa_reg == src);
12312       m->fs.cfa_offset += INTVAL (offset);
12313       m->fs.cfa_reg = dest;
12314 
12315       r = gen_rtx_PLUS (Pmode, src, offset);
12316       r = gen_rtx_SET (dest, r);
12317       add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12318       RTX_FRAME_RELATED_P (insn) = 1;
12319     }
12320   else if (style < 0)
12321     {
12322       RTX_FRAME_RELATED_P (insn) = 1;
12323       if (add_frame_related_expr)
12324 	{
12325 	  rtx r = gen_rtx_PLUS (Pmode, src, offset);
12326 	  r = gen_rtx_SET (dest, r);
12327 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12328 	}
12329     }
12330 
12331   if (dest == stack_pointer_rtx)
12332     {
12333       HOST_WIDE_INT ooffset = m->fs.sp_offset;
12334       bool valid = m->fs.sp_valid;
12335       bool realigned = m->fs.sp_realigned;
12336 
12337       if (src == hard_frame_pointer_rtx)
12338 	{
12339 	  valid = m->fs.fp_valid;
12340 	  realigned = false;
12341 	  ooffset = m->fs.fp_offset;
12342 	}
12343       else if (src == crtl->drap_reg)
12344 	{
12345 	  valid = m->fs.drap_valid;
12346 	  realigned = false;
12347 	  ooffset = 0;
12348 	}
12349       else
12350 	{
12351 	  /* Else there are two possibilities: SP itself, which we set
12352 	     up as the default above.  Or EH_RETURN_STACKADJ_RTX, which is
12353 	     taken care of by hand along the eh_return path.  */
12354 	  gcc_checking_assert (src == stack_pointer_rtx
12355 			       || offset == const0_rtx);
12356 	}
12357 
12358       m->fs.sp_offset = ooffset - INTVAL (offset);
12359       m->fs.sp_valid = valid;
12360       m->fs.sp_realigned = realigned;
12361     }
12362   return insn;
12363 }
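
/* Typical use, as in the prologue code further below: allocating SIZE bytes
   of frame is written as

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-size), -1,
				m->fs.cfa_reg == stack_pointer_rtx);

   i.e. DEST and SRC are both the stack pointer, a negative OFFSET allocates,
   STYLE of -1 marks the insns frame-related, and SET_CFA is true only while
   the CFA is still being tracked through the stack pointer.  */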
12364 
12365 /* Find an available register to be used as the dynamic realign argument
12366    pointer register.  Such a register will be written in the prologue and
12367    used at the beginning of the body, so it must not be
12368 	1. a parameter passing register.
12369 	2. the GOT pointer.
12370    We reuse the static chain register if it is available.  Otherwise, we
12371    use DI for i386 and R13 for x86-64.  We chose R13 since it has a
12372    shorter encoding.
12373 
12374    Return: the regno of the chosen register.  */
12375 
12376 static unsigned int
12377 find_drap_reg (void)
12378 {
12379   tree decl = cfun->decl;
12380 
12381   /* Always use callee-saved register if there are no caller-saved
12382      registers.  */
12383   if (TARGET_64BIT)
12384     {
12385       /* Use R13 for a nested function or a function that needs a static
12386 	 chain.  Since a function with a tail call may use any caller-saved
12387 	 register in the epilogue, DRAP must not use a caller-saved
12388 	 register in such a case.  */
12389       if (DECL_STATIC_CHAIN (decl)
12390 	  || cfun->machine->no_caller_saved_registers
12391 	  || crtl->tail_call_emit)
12392 	return R13_REG;
12393 
12394       return R10_REG;
12395     }
12396   else
12397     {
12398       /* Use DI for a nested function or a function that needs a static
12399 	 chain.  Since a function with a tail call may use any caller-saved
12400 	 register in the epilogue, DRAP must not use a caller-saved
12401 	 register in such a case.  */
12402       if (DECL_STATIC_CHAIN (decl)
12403 	  || cfun->machine->no_caller_saved_registers
12404 	  || crtl->tail_call_emit)
12405 	return DI_REG;
12406 
12407       /* Reuse static chain register if it isn't used for parameter
12408          passing.  */
12409       if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12410 	{
12411 	  unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12412 	  if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12413 	    return CX_REG;
12414 	}
12415       return DI_REG;
12416     }
12417 }
12418 
12419 /* Handle a "force_align_arg_pointer" attribute.  */
12420 
12421 static tree
12422 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12423 					       tree, int, bool *no_add_attrs)
12424 {
12425   if (TREE_CODE (*node) != FUNCTION_TYPE
12426       && TREE_CODE (*node) != METHOD_TYPE
12427       && TREE_CODE (*node) != FIELD_DECL
12428       && TREE_CODE (*node) != TYPE_DECL)
12429     {
12430       warning (OPT_Wattributes, "%qE attribute only applies to functions",
12431 	       name);
12432       *no_add_attrs = true;
12433     }
12434 
12435   return NULL_TREE;
12436 }
12437 
12438 /* Return minimum incoming stack alignment.  */
12439 
12440 static unsigned int
12441 ix86_minimum_incoming_stack_boundary (bool sibcall)
12442 {
12443   unsigned int incoming_stack_boundary;
12444 
12445   /* Stack of interrupt handler is aligned to 128 bits in 64bit mode.  */
12446   if (cfun->machine->func_type != TYPE_NORMAL)
12447     incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
12448   /* Prefer the one specified on the command line.  */
12449   else if (ix86_user_incoming_stack_boundary)
12450     incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12451   /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12452      when -mstackrealign is in effect, this is not a sibcall check, and the
12453      estimated stack alignment is 128 bits.  */
12454   else if (!sibcall
12455 	   && ix86_force_align_arg_pointer
12456 	   && crtl->stack_alignment_estimated == 128)
12457     incoming_stack_boundary = MIN_STACK_BOUNDARY;
12458   else
12459     incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12460 
12461   /* Incoming stack alignment can be changed on individual functions
12462      via force_align_arg_pointer attribute.  We use the smallest
12463      incoming stack boundary.  */
12464   if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12465       && lookup_attribute (ix86_force_align_arg_pointer_string,
12466 			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12467     incoming_stack_boundary = MIN_STACK_BOUNDARY;
12468 
12469   /* The incoming stack frame has to be aligned at least at
12470      parm_stack_boundary.  */
12471   if (incoming_stack_boundary < crtl->parm_stack_boundary)
12472     incoming_stack_boundary = crtl->parm_stack_boundary;
12473 
12474   /* The stack at the entry of main is aligned by the runtime.  We use
12475      the smallest incoming stack boundary.  */
12476   if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12477       && DECL_NAME (current_function_decl)
12478       && MAIN_NAME_P (DECL_NAME (current_function_decl))
12479       && DECL_FILE_SCOPE_P (current_function_decl))
12480     incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12481 
12482   return incoming_stack_boundary;
12483 }
12484 
12485 /* Update incoming stack boundary and estimated stack alignment.  */
12486 
12487 static void
12488 ix86_update_stack_boundary (void)
12489 {
12490   ix86_incoming_stack_boundary
12491     = ix86_minimum_incoming_stack_boundary (false);
12492 
12493   /* x86_64 vararg needs 16byte stack alignment for register save
12494      area.  */
12495   if (TARGET_64BIT
12496       && cfun->stdarg
12497       && crtl->stack_alignment_estimated < 128)
12498     crtl->stack_alignment_estimated = 128;
12499 
12500   /* __tls_get_addr needs to be called with 16-byte aligned stack.  */
12501   if (ix86_tls_descriptor_calls_expanded_in_cfun
12502       && crtl->preferred_stack_boundary < 128)
12503     crtl->preferred_stack_boundary = 128;
12504 }
12505 
12506 /* Handle the TARGET_GET_DRAP_RTX hook.  Return NULL if no DRAP is
12507    needed or an rtx for DRAP otherwise.  */
12508 
12509 static rtx
12510 ix86_get_drap_rtx (void)
12511 {
12512   /* We must use DRAP if there are outgoing arguments on stack and
12513      ACCUMULATE_OUTGOING_ARGS is false.  */
12514   if (ix86_force_drap
12515       || (cfun->machine->outgoing_args_on_stack
12516 	  && !ACCUMULATE_OUTGOING_ARGS))
12517     crtl->need_drap = true;
12518 
12519   if (stack_realign_drap)
12520     {
12521       /* Assign DRAP to vDRAP and return vDRAP.  */
12522       unsigned int regno = find_drap_reg ();
12523       rtx drap_vreg;
12524       rtx arg_ptr;
12525       rtx_insn *seq, *insn;
12526 
12527       arg_ptr = gen_rtx_REG (Pmode, regno);
12528       crtl->drap_reg = arg_ptr;
12529 
12530       start_sequence ();
12531       drap_vreg = copy_to_reg (arg_ptr);
12532       seq = get_insns ();
12533       end_sequence ();
12534 
12535       insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12536       if (!optimize)
12537 	{
12538 	  add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12539 	  RTX_FRAME_RELATED_P (insn) = 1;
12540 	}
12541       return drap_vreg;
12542     }
12543   else
12544     return NULL;
12545 }
12546 
12547 /* Handle the TARGET_INTERNAL_ARG_POINTER hook.  */
12548 
12549 static rtx
12550 ix86_internal_arg_pointer (void)
12551 {
12552   return virtual_incoming_args_rtx;
12553 }
12554 
12555 struct scratch_reg {
12556   rtx reg;
12557   bool saved;
12558 };
12559 
12560 /* Return a short-lived scratch register for use on function entry.
12561    In 32-bit mode, it is valid only after the registers are saved
12562    in the prologue.  This register must be released by means of
12563    release_scratch_register_on_entry once it is dead.  */
12564 
12565 static void
12566 get_scratch_register_on_entry (struct scratch_reg *sr)
12567 {
12568   int regno;
12569 
12570   sr->saved = false;
12571 
12572   if (TARGET_64BIT)
12573     {
12574       /* We always use R11 in 64-bit mode.  */
12575       regno = R11_REG;
12576     }
12577   else
12578     {
12579       tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12580       bool fastcall_p
12581 	= lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12582       bool thiscall_p
12583 	= lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12584       bool static_chain_p = DECL_STATIC_CHAIN (decl);
12585       int regparm = ix86_function_regparm (fntype, decl);
12586       int drap_regno
12587 	= crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12588 
12589       /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12590 	  for the static chain register.  */
12591       if ((regparm < 1 || (fastcall_p && !static_chain_p))
12592 	  && drap_regno != AX_REG)
12593 	regno = AX_REG;
12594       /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12595 	  for the static chain register.  */
12596       else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12597         regno = AX_REG;
12598       else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12599 	regno = DX_REG;
12600       /* ecx is the static chain register.  */
12601       else if (regparm < 3 && !fastcall_p && !thiscall_p
12602 	       && !static_chain_p
12603 	       && drap_regno != CX_REG)
12604 	regno = CX_REG;
12605       else if (ix86_save_reg (BX_REG, true, false))
12606 	regno = BX_REG;
12607       /* esi is the static chain register.  */
12608       else if (!(regparm == 3 && static_chain_p)
12609 	       && ix86_save_reg (SI_REG, true, false))
12610 	regno = SI_REG;
12611       else if (ix86_save_reg (DI_REG, true, false))
12612 	regno = DI_REG;
12613       else
12614 	{
12615 	  regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12616 	  sr->saved = true;
12617 	}
12618     }
12619 
12620   sr->reg = gen_rtx_REG (Pmode, regno);
12621   if (sr->saved)
12622     {
12623       rtx_insn *insn = emit_insn (gen_push (sr->reg));
12624       RTX_FRAME_RELATED_P (insn) = 1;
12625     }
12626 }
12627 
12628 /* Release a scratch register obtained from the preceding function.
12629 
12630    If RELEASE_VIA_POP is true, we just pop the register off the stack
12631    to release it.  This is what non-Linux systems use with -fstack-check.
12632 
12633    Otherwise we use OFFSET to locate the saved register and the
12634    allocated stack space becomes part of the local frame and is
12635    deallocated by the epilogue.  */
12636 
12637 static void
12638 release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
12639 				   bool release_via_pop)
12640 {
12641   if (sr->saved)
12642     {
12643       if (release_via_pop)
12644 	{
12645 	  struct machine_function *m = cfun->machine;
12646 	  rtx x, insn = emit_insn (gen_pop (sr->reg));
12647 
12648 	  /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop.  */
12649 	  RTX_FRAME_RELATED_P (insn) = 1;
12650 	  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12651 	  x = gen_rtx_SET (stack_pointer_rtx, x);
12652 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12653 	  m->fs.sp_offset -= UNITS_PER_WORD;
12654 	}
12655       else
12656 	{
12657 	  rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
12658 	  x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
12659 	  emit_insn (x);
12660 	}
12661     }
12662 }
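
/* Usage sketch (mirroring the probing routines below): the two helpers are
   always paired, e.g.

     struct scratch_reg sr;
     get_scratch_register_on_entry (&sr);
     ... use sr.reg as a temporary ...
     release_scratch_register_on_entry (&sr, offset, release_via_pop);

   If no suitable register was free, sr.saved is true and the register was
   pushed on entry; the release then either pops it back or reloads it from
   OFFSET, as described above.  */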
12663 
12664 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12665 
12666    This differs from the next routine in that it tries hard to prevent
12667    attacks that jump the stack guard.  Thus it is never allowed to allocate
12668    more than PROBE_INTERVAL bytes of stack space without a suitable
12669    probe.
12670 
12671    INT_REGISTERS_SAVED is true if integer registers have already been
12672    pushed on the stack.  */
12673 
12674 static void
12675 ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
12676 					 const bool int_registers_saved)
12677 {
12678   struct machine_function *m = cfun->machine;
12679 
12680   /* If this function does not statically allocate stack space, then
12681      no probes are needed.  */
12682   if (!size)
12683     {
12684       /* However, the allocation of space via pushes for register
12685 	 saves could be viewed as allocating space, but without the
12686 	 need to probe.  */
12687       if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12688         dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12689       else
12690 	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12691       return;
12692     }
12693 
12694   /* If we are a noreturn function, then we have to consider the
12695      possibility that we're called via a jump rather than a call.
12696 
12697      Thus we don't have the implicit probe generated by saving the
12698      return address into the stack at the call.  Thus, the stack
12699      pointer could be anywhere in the guard page.  The safe thing
12700      to do is emit a probe now.
12701 
12702      The probe can be avoided if we have already emitted any callee
12703      register saves into the stack or have a frame pointer (which will
12704      have been saved as well).  Those saves will function as implicit
12705      probes.
12706 
12707      ?!? This should be revamped to work like aarch64 and s390 where
12708      we track the offset from the most recent probe.  Normally that
12709      offset would be zero.  For a noreturn function we would reset
12710      it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT).   Then
12711      we just probe when we cross PROBE_INTERVAL.  */
12712   if (TREE_THIS_VOLATILE (cfun->decl)
12713       && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12714     {
12715       /* We can safely use any register here since we're just going to push
12716 	 its value and immediately pop it back.  But we do try and avoid
12717 	 argument passing registers so as not to introduce dependencies in
12718 	 the pipeline.  For 32 bit we use %esi and for 64 bit we use %rax.  */
12719       rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12720       rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12721       rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12722       m->fs.sp_offset -= UNITS_PER_WORD;
12723       if (m->fs.cfa_reg == stack_pointer_rtx)
12724 	{
12725 	  m->fs.cfa_offset -= UNITS_PER_WORD;
12726 	  rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12727 	  x = gen_rtx_SET (stack_pointer_rtx, x);
12728 	  add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12729 	  RTX_FRAME_RELATED_P (insn_push) = 1;
12730 	  x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12731 	  x = gen_rtx_SET (stack_pointer_rtx, x);
12732 	  add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12733 	  RTX_FRAME_RELATED_P (insn_pop) = 1;
12734 	}
12735       emit_insn (gen_blockage ());
12736     }
12737 
12738   /* If we allocate less than the size of the guard statically,
12739      then no probing is necessary, but we do need to allocate
12740      the stack.  */
12741   if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12742     {
12743       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12744 			         GEN_INT (-size), -1,
12745 			         m->fs.cfa_reg == stack_pointer_rtx);
12746       dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12747       return;
12748     }
12749 
12750   /* We're allocating a large enough stack frame that we need to
12751      emit probes.  Either emit them inline or in a loop depending
12752      on the size.  */
12753   HOST_WIDE_INT probe_interval = get_probe_interval ();
12754   if (size <= 4 * probe_interval)
12755     {
12756       HOST_WIDE_INT i;
12757       for (i = probe_interval; i <= size; i += probe_interval)
12758 	{
12759 	  /* Allocate PROBE_INTERVAL bytes.  */
12760 	  rtx insn
12761 	    = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12762 					 GEN_INT (-probe_interval), -1,
12763 					 m->fs.cfa_reg == stack_pointer_rtx);
12764 	  add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12765 
12766 	  /* And probe at *sp.  */
12767 	  emit_stack_probe (stack_pointer_rtx);
12768 	  emit_insn (gen_blockage ());
12769 	}
12770 
12771       /* We need to allocate space for the residual, but we do not need
12772 	 to probe the residual.  */
12773       HOST_WIDE_INT residual = (i - probe_interval - size);
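      /* Example with made-up numbers: for size == 10000 and a 4096-byte
	 probe interval, the loop above allocates and probes at 4096 and
	 8192 and exits with i == 12288, so residual == 12288 - 4096 - 10000
	 == -1808; the adjustment below then allocates the remaining 1808
	 bytes without probing them.  */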
12774       if (residual)
12775 	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12776 				   GEN_INT (residual), -1,
12777 				   m->fs.cfa_reg == stack_pointer_rtx);
12778       dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12779     }
12780   else
12781     {
12782       /* We expect the GP registers to be saved when probes are used
12783 	 as the probing sequences might need a scratch register and
12784 	 the routine to allocate one assumes the integer registers
12785 	 have already been saved.  */
12786       gcc_assert (int_registers_saved);
12787 
12788       struct scratch_reg sr;
12789       get_scratch_register_on_entry (&sr);
12790 
12791       /* If we needed to save a register, then account for any space
12792 	 that was pushed (we are not going to pop the register when
12793 	 we do the restore).  */
12794       if (sr.saved)
12795 	size -= UNITS_PER_WORD;
12796 
12797       /* Step 1: round SIZE down to a multiple of the interval.  */
12798       HOST_WIDE_INT rounded_size = size & -probe_interval;
12799 
12800       /* Step 2: compute final value of the loop counter.  Use lea if
12801 	 possible.  */
12802       rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12803       rtx insn;
12804       if (address_no_seg_operand (addr, Pmode))
12805 	insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12806       else
12807 	{
12808 	  emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12809 	  insn = emit_insn (gen_rtx_SET (sr.reg,
12810 					 gen_rtx_PLUS (Pmode, sr.reg,
12811 						       stack_pointer_rtx)));
12812 	}
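      /* Added illustration (assumes x86-64 and that the displacement fits
	 the addressing mode): the lea branch above emits a single
	 "leaq -ROUNDED_SIZE(%rsp), %scratch", while the fallback emits
	 "movq $-ROUNDED_SIZE, %scratch" followed by "addq %rsp, %scratch",
	 where %scratch stands for sr.reg.  */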
12813       if (m->fs.cfa_reg == stack_pointer_rtx)
12814 	{
12815 	  add_reg_note (insn, REG_CFA_DEF_CFA,
12816 			plus_constant (Pmode, sr.reg,
12817 				       m->fs.cfa_offset + rounded_size));
12818 	  RTX_FRAME_RELATED_P (insn) = 1;
12819 	}
12820 
12821       /* Step 3: the loop.  */
12822       rtx size_rtx = GEN_INT (rounded_size);
12823       insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12824 							 size_rtx));
12825       if (m->fs.cfa_reg == stack_pointer_rtx)
12826 	{
12827 	  m->fs.cfa_offset += rounded_size;
12828 	  add_reg_note (insn, REG_CFA_DEF_CFA,
12829 			plus_constant (Pmode, stack_pointer_rtx,
12830 				       m->fs.cfa_offset));
12831 	  RTX_FRAME_RELATED_P (insn) = 1;
12832 	}
12833       m->fs.sp_offset += rounded_size;
12834       emit_insn (gen_blockage ());
12835 
12836       /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12837 	 is equal to ROUNDED_SIZE.  */
12838 
12839       if (size != rounded_size)
12840 	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12841 				   GEN_INT (rounded_size - size), -1,
12842 				   m->fs.cfa_reg == stack_pointer_rtx);
12843       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
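      /* Added illustration (not from the original source): with a 4 KiB
	 probe interval and SIZE == 100000, ROUNDED_SIZE is 98304 (24
	 intervals); the loop allocates and probes those 24 pages, and the
	 Step 4 adjustment above allocates the remaining 1696 bytes without
	 probing them.  */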
12844 
12845       /* This does not deallocate the space reserved for the scratch
12846 	 register.  That will be deallocated in the epilogue.  */
12847       release_scratch_register_on_entry (&sr, size, false);
12848     }
12849 
12850   /* Make sure nothing is scheduled before we are done.  */
12851   emit_insn (gen_blockage ());
12852 }
12853 
12854 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12855 
12856    INT_REGISTERS_SAVED is true if integer registers have already been
12857    pushed on the stack.  */
12858 
12859 static void
12860 ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
12861 			     const bool int_registers_saved)
12862 {
12863   /* We skip the probe for the first interval + a small dope of 4 words and
12864      probe that many bytes past the specified size to maintain a protection
12865      area at the bottom of the stack.  */
12866   const int dope = 4 * UNITS_PER_WORD;
12867   rtx size_rtx = GEN_INT (size), last;
12868 
12869   /* See if we have a constant small number of probes to generate.  If so,
12870      that's the easy case.  The run-time loop is made up of 9 insns in the
12871      generic case while the compile-time loop is made up of 3+2*(n-1) insns
12872      for n # of intervals.  */
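  /* Added illustration: with n == 4 intervals the unrolled sequence is
     3 + 2*(4-1) = 9 insns, i.e. about the same as the run-time loop, which
     is why the cut-off below is 4 * PROBE_INTERVAL.  */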
12873   if (size <= 4 * get_probe_interval ())
12874     {
12875       HOST_WIDE_INT i, adjust;
12876       bool first_probe = true;
12877 
12878       /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12879 	 values of N from 1 until it exceeds SIZE.  If only one probe is
12880 	 needed, this will not generate any code.  Then adjust and probe
12881 	 to PROBE_INTERVAL + SIZE.  */
12882       for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12883 	{
12884 	  if (first_probe)
12885 	    {
12886 	      adjust = 2 * get_probe_interval () + dope;
12887 	      first_probe = false;
12888 	    }
12889 	  else
12890 	    adjust = get_probe_interval ();
12891 
12892 	  emit_insn (gen_rtx_SET (stack_pointer_rtx,
12893 				  plus_constant (Pmode, stack_pointer_rtx,
12894 						 -adjust)));
12895 	  emit_stack_probe (stack_pointer_rtx);
12896 	}
12897 
12898       if (first_probe)
12899 	adjust = size + get_probe_interval () + dope;
12900       else
12901         adjust = size + get_probe_interval () - i;
12902 
12903       emit_insn (gen_rtx_SET (stack_pointer_rtx,
12904 			      plus_constant (Pmode, stack_pointer_rtx,
12905 					     -adjust)));
12906       emit_stack_probe (stack_pointer_rtx);
12907 
12908       /* Adjust back to account for the additional first interval.  */
12909       last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12910 				     plus_constant (Pmode, stack_pointer_rtx,
12911 						    (get_probe_interval ()
12912 						     + dope))));
12913     }
12914 
12915   /* Otherwise, do the same as above, but in a loop.  Note that we must be
12916      extra careful with variables wrapping around because we might be at
12917      the very top (or the very bottom) of the address space and we have
12918      to be able to handle this case properly; in particular, we use an
12919      equality test for the loop condition.  */
12920   else
12921     {
12922       /* We expect the GP registers to be saved when probes are used
12923 	 as the probing sequences might need a scratch register and
12924 	 the routine to allocate one assumes the integer registers
12925 	 have already been saved.  */
12926       gcc_assert (int_registers_saved);
12927 
12928       HOST_WIDE_INT rounded_size;
12929       struct scratch_reg sr;
12930 
12931       get_scratch_register_on_entry (&sr);
12932 
12933       /* If we needed to save a register, then account for any space
12934 	 that was pushed (we are not going to pop the register when
12935 	 we do the restore).  */
12936       if (sr.saved)
12937 	size -= UNITS_PER_WORD;
12938 
12939       /* Step 1: round SIZE to the previous multiple of the interval.  */
12940 
12941       rounded_size = ROUND_DOWN (size, get_probe_interval ());
12942 
12943 
12944       /* Step 2: compute initial and final value of the loop counter.  */
12945 
12946       /* SP = SP_0 + PROBE_INTERVAL.  */
12947       emit_insn (gen_rtx_SET (stack_pointer_rtx,
12948 			      plus_constant (Pmode, stack_pointer_rtx,
12949 					     - (get_probe_interval () + dope))));
12950 
12951       /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE.  */
12952       if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12953 	emit_insn (gen_rtx_SET (sr.reg,
12954 				plus_constant (Pmode, stack_pointer_rtx,
12955 					       -rounded_size)));
12956       else
12957 	{
12958 	  emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12959 	  emit_insn (gen_rtx_SET (sr.reg,
12960 				  gen_rtx_PLUS (Pmode, sr.reg,
12961 						stack_pointer_rtx)));
12962 	}
12963 
12964 
12965       /* Step 3: the loop
12966 
12967 	 do
12968 	   {
12969 	     SP = SP + PROBE_INTERVAL
12970 	     probe at SP
12971 	   }
12972 	 while (SP != LAST_ADDR)
12973 
12974 	 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12975 	 values of N from 1 until it is equal to ROUNDED_SIZE.  */
12976 
12977       emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12978 
12979 
12980       /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12981 	 assert at compile-time that SIZE is equal to ROUNDED_SIZE.  */
12982 
12983       if (size != rounded_size)
12984 	{
12985 	  emit_insn (gen_rtx_SET (stack_pointer_rtx,
12986 			          plus_constant (Pmode, stack_pointer_rtx,
12987 						 rounded_size - size)));
12988 	  emit_stack_probe (stack_pointer_rtx);
12989 	}
12990 
12991       /* Adjust back to account for the additional first interval.  */
12992       last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12993 				     plus_constant (Pmode, stack_pointer_rtx,
12994 						    (get_probe_interval ()
12995 						     + dope))));
12996 
12997       /* This does not deallocate the space reserved for the scratch
12998 	 register.  That will be deallocated in the epilogue.  */
12999       release_scratch_register_on_entry (&sr, size, false);
13000     }
13001 
13002   /* Even if the stack pointer isn't the CFA register, we need to correctly
13003      describe the adjustments made to it, in particular differentiate the
13004      frame-related ones from the frame-unrelated ones.  */
13005   if (size > 0)
13006     {
13007       rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13008       XVECEXP (expr, 0, 0)
13009 	= gen_rtx_SET (stack_pointer_rtx,
13010 		       plus_constant (Pmode, stack_pointer_rtx, -size));
13011       XVECEXP (expr, 0, 1)
13012 	= gen_rtx_SET (stack_pointer_rtx,
13013 		       plus_constant (Pmode, stack_pointer_rtx,
13014 				      get_probe_interval () + dope + size));
13015       add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13016       RTX_FRAME_RELATED_P (last) = 1;
13017 
13018       cfun->machine->fs.sp_offset += size;
13019     }
13020 
13021   /* Make sure nothing is scheduled before we are done.  */
13022   emit_insn (gen_blockage ());
13023 }
13024 
13025 /* Adjust the stack pointer up to REG while probing it.  */
13026 
13027 const char *
13028 output_adjust_stack_and_probe (rtx reg)
13029 {
13030   static int labelno = 0;
13031   char loop_lab[32];
13032   rtx xops[2];
13033 
13034   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13035 
13036   /* Loop.  */
13037   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13038 
13039   /* SP = SP + PROBE_INTERVAL.  */
13040   xops[0] = stack_pointer_rtx;
13041   xops[1] = GEN_INT (get_probe_interval ());
13042   output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
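  /* Added note: the "{AT&T|Intel}" braces in these templates select between
     the AT&T and Intel assembler dialects, and %z0 expands to the
     operand-size suffix (e.g. 'q' for 64-bit operands).  */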
13043 
13044   /* Probe at SP.  */
13045   xops[1] = const0_rtx;
13046   output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13047 
13048   /* Test if SP == LAST_ADDR.  */
13049   xops[0] = stack_pointer_rtx;
13050   xops[1] = reg;
13051   output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13052 
13053   /* Branch.  */
13054   fputs ("\tjne\t", asm_out_file);
13055   assemble_name_raw (asm_out_file, loop_lab);
13056   fputc ('\n', asm_out_file);
13057 
13058   return "";
13059 }
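/* Added illustration (not part of the original source): assuming a 4 KiB
   probe interval, 64-bit code and LAST_ADDR held in %rax, the loop emitted
   above expands to roughly:

	.LPSRL0:
		subq	$4096, %rsp
		orq	$0, (%rsp)
		cmpq	%rax, %rsp
		jne	.LPSRL0
*/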
13060 
13061 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13062    inclusive.  These are offsets from the current stack pointer.
13063 
13064    INT_REGISTERS_SAVED is true if integer registers have already been
13065    pushed on the stack.  */
13066 
13067 static void
13068 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
13069 			     const bool int_registers_saved)
13070 {
13071   /* See if we have a constant small number of probes to generate.  If so,
13072      that's the easy case.  The run-time loop is made up of 6 insns in the
13073      generic case while the compile-time loop is made up of n insns for n #
13074      of intervals.  */
13075   if (size <= 6 * get_probe_interval ())
13076     {
13077       HOST_WIDE_INT i;
13078 
13079       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13080 	 it exceeds SIZE.  If only one probe is needed, this will not
13081 	 generate any code.  Then probe at FIRST + SIZE.  */
13082       for (i = get_probe_interval (); i < size; i += get_probe_interval ())
13083 	emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13084 					 -(first + i)));
13085 
13086       emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13087 				       -(first + size)));
13088     }
13089 
13090   /* Otherwise, do the same as above, but in a loop.  Note that we must be
13091      extra careful with variables wrapping around because we might be at
13092      the very top (or the very bottom) of the address space and we have
13093      to be able to handle this case properly; in particular, we use an
13094      equality test for the loop condition.  */
13095   else
13096     {
13097       /* We expect the GP registers to be saved when probes are used
13098 	 as the probing sequences might need a scratch register and
13099 	 the routine to allocate one assumes the integer registers
13100 	 have already been saved.  */
13101       gcc_assert (int_registers_saved);
13102 
13103       HOST_WIDE_INT rounded_size, last;
13104       struct scratch_reg sr;
13105 
13106       get_scratch_register_on_entry (&sr);
13107 
13108 
13109       /* Step 1: round SIZE to the previous multiple of the interval.  */
13110 
13111       rounded_size = ROUND_DOWN (size, get_probe_interval ());
13112 
13113 
13114       /* Step 2: compute initial and final value of the loop counter.  */
13115 
13116       /* TEST_OFFSET = FIRST.  */
13117       emit_move_insn (sr.reg, GEN_INT (-first));
13118 
13119       /* LAST_OFFSET = FIRST + ROUNDED_SIZE.  */
13120       last = first + rounded_size;
13121 
13122 
13123       /* Step 3: the loop
13124 
13125 	 do
13126 	   {
13127 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13128 	     probe at TEST_ADDR
13129 	   }
13130 	 while (TEST_ADDR != LAST_ADDR)
13131 
13132          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13133          until it is equal to ROUNDED_SIZE.  */
13134 
13135       emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13136 
13137 
13138       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13139 	 that SIZE is equal to ROUNDED_SIZE.  */
13140 
13141       if (size != rounded_size)
13142 	emit_stack_probe (plus_constant (Pmode,
13143 					 gen_rtx_PLUS (Pmode,
13144 						       stack_pointer_rtx,
13145 						       sr.reg),
13146 					 rounded_size - size));
13147 
13148       release_scratch_register_on_entry (&sr, size, true);
13149     }
13150 
13151   /* Make sure nothing is scheduled before we are done.  */
13152   emit_insn (gen_blockage ());
13153 }
13154 
13155 /* Probe a range of stack addresses from REG to END, inclusive.  These are
13156    offsets from the current stack pointer.  */
13157 
13158 const char *
13159 output_probe_stack_range (rtx reg, rtx end)
13160 {
13161   static int labelno = 0;
13162   char loop_lab[32];
13163   rtx xops[3];
13164 
13165   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13166 
13167   /* Loop.  */
13168   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13169 
13170   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
13171   xops[0] = reg;
13172   xops[1] = GEN_INT (get_probe_interval ());
13173   output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13174 
13175   /* Probe at TEST_ADDR.  */
13176   xops[0] = stack_pointer_rtx;
13177   xops[1] = reg;
13178   xops[2] = const0_rtx;
13179   output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13180 
13181   /* Test if TEST_ADDR == LAST_ADDR.  */
13182   xops[0] = reg;
13183   xops[1] = end;
13184   output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13185 
13186   /* Branch.  */
13187   fputs ("\tjne\t", asm_out_file);
13188   assemble_name_raw (asm_out_file, loop_lab);
13189   fputc ('\n', asm_out_file);
13190 
13191   return "";
13192 }
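/* Added illustration (not part of the original source): assuming a 4 KiB
   probe interval, 64-bit code, the test register in %rax and END printed as
   an immediate, the loop emitted above expands to roughly:

	.LPSRL1:
		subq	$4096, %rax
		orq	$0, (%rsp,%rax)
		cmpq	$-98304, %rax
		jne	.LPSRL1
*/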
13193 
13194 /* Return true if stack frame is required.  Update STACK_ALIGNMENT
13195    to the largest alignment, in bits, of stack slot used if stack
13196    frame is required and CHECK_STACK_SLOT is true.  */
13197 
13198 static bool
13199 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
13200 				    bool check_stack_slot)
13201 {
13202   HARD_REG_SET set_up_by_prologue, prologue_used;
13203   basic_block bb;
13204 
13205   CLEAR_HARD_REG_SET (prologue_used);
13206   CLEAR_HARD_REG_SET (set_up_by_prologue);
13207   add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13208   add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13209   add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13210 		       HARD_FRAME_POINTER_REGNUM);
13211 
13212   /* The preferred stack alignment is the minimum stack alignment.  */
13213   if (stack_alignment > crtl->preferred_stack_boundary)
13214     stack_alignment = crtl->preferred_stack_boundary;
13215 
13216   bool require_stack_frame = false;
13217 
13218   FOR_EACH_BB_FN (bb, cfun)
13219     {
13220       rtx_insn *insn;
13221       FOR_BB_INSNS (bb, insn)
13222 	if (NONDEBUG_INSN_P (insn)
13223 	    && requires_stack_frame_p (insn, prologue_used,
13224 				       set_up_by_prologue))
13225 	  {
13226 	    require_stack_frame = true;
13227 
13228 	    if (check_stack_slot)
13229 	      {
13230 		/* Find the maximum stack alignment.  */
13231 		subrtx_iterator::array_type array;
13232 		FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
13233 		  if (MEM_P (*iter)
13234 		      && (reg_mentioned_p (stack_pointer_rtx,
13235 					   *iter)
13236 			  || reg_mentioned_p (frame_pointer_rtx,
13237 					      *iter)))
13238 		    {
13239 		      unsigned int alignment = MEM_ALIGN (*iter);
13240 		      if (alignment > stack_alignment)
13241 			stack_alignment = alignment;
13242 		    }
13243 	      }
13244 	  }
13245     }
13246 
13247   return require_stack_frame;
13248 }
13249 
13250 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
13251    will guide prologue/epilogue to be generated in correct form.  */
13252 
13253 static void
13254 ix86_finalize_stack_frame_flags (void)
13255 {
13256   /* Check if stack realign is really needed after reload, and
13257      store the result in cfun.  */
13258   unsigned int incoming_stack_boundary
13259     = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13260        ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13261   unsigned int stack_alignment
13262     = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13263        ? crtl->max_used_stack_slot_alignment
13264        : crtl->stack_alignment_needed);
13265   unsigned int stack_realign
13266     = (incoming_stack_boundary < stack_alignment);
13267   bool recompute_frame_layout_p = false;
13268 
13269   if (crtl->stack_realign_finalized)
13270     {
13271       /* After stack_realign_needed is finalized, we can no longer
13272 	 change it.  */
13273       gcc_assert (crtl->stack_realign_needed == stack_realign);
13274       return;
13275     }
13276 
13277   /* If the only reason for frame_pointer_needed is that we conservatively
13278      assumed stack realignment might be needed or -fno-omit-frame-pointer
13279      is used, but in the end nothing that needed the stack alignment was
13280      spilled and there was no stack access, clear frame_pointer_needed and say we
13281      don't need stack realignment.  */
13282   if ((stack_realign || (!flag_omit_frame_pointer && optimize))
13283       && frame_pointer_needed
13284       && crtl->is_leaf
13285       && crtl->sp_is_unchanging
13286       && !ix86_current_function_calls_tls_descriptor
13287       && !crtl->accesses_prior_frames
13288       && !cfun->calls_alloca
13289       && !crtl->calls_eh_return
13290       /* See ira_setup_eliminable_regset for the rationale.  */
13291       && !(STACK_CHECK_MOVING_SP
13292 	   && flag_stack_check
13293 	   && flag_exceptions
13294 	   && cfun->can_throw_non_call_exceptions)
13295       && !ix86_frame_pointer_required ()
13296       && get_frame_size () == 0
13297       && ix86_nsaved_sseregs () == 0
13298       && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13299     {
13300       if (ix86_find_max_used_stack_alignment (stack_alignment,
13301 					      stack_realign))
13302 	{
13303 	  /* Stack frame is required.  If stack alignment needed is less
13304 	     than incoming stack boundary, don't realign stack.  */
13305 	  stack_realign = incoming_stack_boundary < stack_alignment;
13306 	  if (!stack_realign)
13307 	    {
13308 	      crtl->max_used_stack_slot_alignment
13309 		= incoming_stack_boundary;
13310 	      crtl->stack_alignment_needed
13311 		= incoming_stack_boundary;
13312 	      /* Also update preferred_stack_boundary for leaf
13313 	         functions.  */
13314 	      crtl->preferred_stack_boundary
13315 		= incoming_stack_boundary;
13316 	    }
13317 	}
13318       else
13319 	{
13320 	  /* If drap has been set, but it actually isn't live at the
13321 	     start of the function, there is no reason to set it up.  */
13322 	  if (crtl->drap_reg)
13323 	    {
13324 	      basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13325 	      if (! REGNO_REG_SET_P (DF_LR_IN (bb),
13326 				     REGNO (crtl->drap_reg)))
13327 		{
13328 		  crtl->drap_reg = NULL_RTX;
13329 		  crtl->need_drap = false;
13330 		}
13331 	    }
13332 	  else
13333 	    cfun->machine->no_drap_save_restore = true;
13334 
13335 	  frame_pointer_needed = false;
13336 	  stack_realign = false;
13337 	  crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13338 	  crtl->stack_alignment_needed = incoming_stack_boundary;
13339 	  crtl->stack_alignment_estimated = incoming_stack_boundary;
13340 	  if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13341 	    crtl->preferred_stack_boundary = incoming_stack_boundary;
13342 	  df_finish_pass (true);
13343 	  df_scan_alloc (NULL);
13344 	  df_scan_blocks ();
13345 	  df_compute_regs_ever_live (true);
13346 	  df_analyze ();
13347 
13348 	  if (flag_var_tracking)
13349 	    {
13350 	      /* Since frame pointer is no longer available, replace it with
13351 		 stack pointer - UNITS_PER_WORD in debug insns.  */
13352 	      df_ref ref, next;
13353 	      for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
13354 		   ref; ref = next)
13355 		{
13356 		  next = DF_REF_NEXT_REG (ref);
13357 		  if (!DF_REF_INSN_INFO (ref))
13358 		    continue;
13359 
13360 		  /* Make sure the next ref is for a different instruction,
13361 		     so that we're not affected by the rescan.  */
13362 		  rtx_insn *insn = DF_REF_INSN (ref);
13363 		  while (next && DF_REF_INSN (next) == insn)
13364 		    next = DF_REF_NEXT_REG (next);
13365 
13366 		  if (DEBUG_INSN_P (insn))
13367 		    {
13368 		      bool changed = false;
13369 		      for (; ref != next; ref = DF_REF_NEXT_REG (ref))
13370 			{
13371 			  rtx *loc = DF_REF_LOC (ref);
13372 			  if (*loc == hard_frame_pointer_rtx)
13373 			    {
13374 			      *loc = plus_constant (Pmode,
13375 						    stack_pointer_rtx,
13376 						    -UNITS_PER_WORD);
13377 			      changed = true;
13378 			    }
13379 			}
13380 		      if (changed)
13381 			df_insn_rescan (insn);
13382 		    }
13383 		}
13384 	    }
13385 
13386 	  recompute_frame_layout_p = true;
13387 	}
13388     }
13389   else if (crtl->max_used_stack_slot_alignment >= 128)
13390     {
13391       /* We don't need to realign the stack.  max_used_stack_alignment is
13392 	 used to decide how the stack frame should be aligned.  This is
13393 	 independent of any psABI and of 32-bit vs 64-bit.  It is always
13394 	 safe to compute max_used_stack_alignment.  We compute it only
13395 	 if a 128-bit aligned load/store may be generated on a misaligned
13396 	 stack slot, which would lead to a segfault.  */
13397       if (ix86_find_max_used_stack_alignment (stack_alignment, true))
13398 	cfun->machine->max_used_stack_alignment
13399 	  = stack_alignment / BITS_PER_UNIT;
13400     }
13401 
13402   if (crtl->stack_realign_needed != stack_realign)
13403     recompute_frame_layout_p = true;
13404   crtl->stack_realign_needed = stack_realign;
13405   crtl->stack_realign_finalized = true;
13406   if (recompute_frame_layout_p)
13407     ix86_compute_frame_layout ();
13408 }
13409 
13410 /* Delete SET_GOT right after entry block if it is allocated to reg.  */
13411 
13412 static void
13413 ix86_elim_entry_set_got (rtx reg)
13414 {
13415   basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13416   rtx_insn *c_insn = BB_HEAD (bb);
13417   if (!NONDEBUG_INSN_P (c_insn))
13418     c_insn = next_nonnote_nondebug_insn (c_insn);
13419   if (c_insn && NONJUMP_INSN_P (c_insn))
13420     {
13421       rtx pat = PATTERN (c_insn);
13422       if (GET_CODE (pat) == PARALLEL)
13423 	{
13424 	  rtx vec = XVECEXP (pat, 0, 0);
13425 	  if (GET_CODE (vec) == SET
13426 	      && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13427 	      && REGNO (XEXP (vec, 0)) == REGNO (reg))
13428 	    delete_insn (c_insn);
13429 	}
13430     }
13431 }
13432 
13433 static rtx
13434 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
13435 {
13436   rtx addr, mem;
13437 
13438   if (offset)
13439     addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
13440   mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
13441   return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
13442 }
13443 
13444 static inline rtx
13445 gen_frame_load (rtx reg, rtx frame_reg, int offset)
13446 {
13447   return gen_frame_set (reg, frame_reg, offset, false);
13448 }
13449 
13450 static inline rtx
13451 gen_frame_store (rtx reg, rtx frame_reg, int offset)
13452 {
13453   return gen_frame_set (reg, frame_reg, offset, true);
13454 }
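/* Added illustration (not from the original source): on x86-64,
   gen_frame_store applied to xmm6 in V4SFmode with base register rax and
   offset -16 builds RTL roughly of the form
   (set (mem:V4SF (plus:DI (reg:DI ax) (const_int -16))) (reg:V4SF xmm6)),
   i.e. a store of the register into the frame at the given offset from the
   base register; gen_frame_load builds the mirror-image load.  */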
13455 
13456 static void
13457 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
13458 {
13459   struct machine_function *m = cfun->machine;
13460   const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13461 			  + m->call_ms2sysv_extra_regs;
13462   rtvec v = rtvec_alloc (ncregs + 1);
13463   unsigned int align, i, vi = 0;
13464   rtx_insn *insn;
13465   rtx sym, addr;
13466   rtx rax = gen_rtx_REG (word_mode, AX_REG);
13467   const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13468 
13469   /* AL should only be live with sysv_abi.  */
13470   gcc_assert (!ix86_eax_live_at_start_p ());
13471   gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
13472 
13473   /* Set up RAX as the stub's base pointer.  We use stack_realign_offset
13474      regardless of whether we've actually realigned the stack or not.  */
13475   align = GET_MODE_ALIGNMENT (V4SFmode);
13476   addr = choose_baseaddr (frame.stack_realign_offset
13477 			  + xlogue.get_stub_ptr_offset (), &align, AX_REG);
13478   gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13479 
13480   emit_insn (gen_rtx_SET (rax, addr));
13481 
13482   /* Get the stub symbol.  */
13483   sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
13484 						  : XLOGUE_STUB_SAVE);
13485   RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13486 
13487   for (i = 0; i < ncregs; ++i)
13488     {
13489       const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13490       rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
13491 			     r.regno);
13492       RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
13493     }
13494 
13495   gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
13496 
13497   insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
13498   RTX_FRAME_RELATED_P (insn) = true;
13499 }
13500 
13501 /* Expand the prologue into a bunch of separate insns.  */
13502 
13503 void
13504 ix86_expand_prologue (void)
13505 {
13506   struct machine_function *m = cfun->machine;
13507   rtx insn, t;
13508   HOST_WIDE_INT allocate;
13509   bool int_registers_saved;
13510   bool sse_registers_saved;
13511   bool save_stub_call_needed;
13512   rtx static_chain = NULL_RTX;
13513 
13514   if (ix86_function_naked (current_function_decl))
13515     return;
13516 
13517   ix86_finalize_stack_frame_flags ();
13518 
13519   /* DRAP should not coexist with stack_realign_fp */
13520   gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13521 
13522   memset (&m->fs, 0, sizeof (m->fs));
13523 
13524   /* Initialize CFA state for before the prologue.  */
13525   m->fs.cfa_reg = stack_pointer_rtx;
13526   m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13527 
13528   /* Track SP offset to the CFA.  We continue tracking this after we've
13529      swapped the CFA register away from SP.  In the case of re-alignment
13530      this is fudged; we're interested to offsets within the local frame.  */
13531      this is fudged; we're interested in offsets within the local frame.  */
13532   m->fs.sp_valid = true;
13533   m->fs.sp_realigned = false;
13534 
13535   const struct ix86_frame &frame = cfun->machine->frame;
13536 
13537   if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13538     {
13539       /* We should have already generated an error for any use of
13540          ms_hook on a nested function.  */
13541       gcc_checking_assert (!ix86_static_chain_on_stack);
13542 
13543       /* Check if profiling is active and we shall use the profiling before
13544          prologue variant.  If so, sorry.  */
13545       if (crtl->profile && flag_fentry != 0)
13546         sorry ("ms_hook_prologue attribute isn%'t compatible "
13547 	       "with -mfentry for 32-bit");
13548 
13549       /* In ix86_asm_output_function_label we emitted:
13550 	 8b ff     movl.s %edi,%edi
13551 	 55        push   %ebp
13552 	 8b ec     movl.s %esp,%ebp
13553 
13554 	 This matches the hookable function prologue in Win32 API
13555 	 functions in Microsoft Windows XP Service Pack 2 and newer.
13556 	 Wine uses this to enable Windows apps to hook the Win32 API
13557 	 functions provided by Wine.
13558 
13559 	 What that means is that we've already set up the frame pointer.  */
13560 
13561       if (frame_pointer_needed
13562 	  && !(crtl->drap_reg && crtl->stack_realign_needed))
13563 	{
13564 	  rtx push, mov;
13565 
13566 	  /* We've decided to use the frame pointer already set up.
13567 	     Describe this to the unwinder by pretending that both
13568 	     push and mov insns happen right here.
13569 
13570 	     Putting the unwind info here at the end of the ms_hook
13571 	     is done so that we can make absolutely certain we get
13572 	     the required byte sequence at the start of the function,
13573 	     rather than relying on an assembler that can produce
13574 	     the exact encoding required.
13575 
13576 	     However it does mean (in the unpatched case) that we have
13577 	     a 1 insn window where the asynchronous unwind info is
13578 	     incorrect.  However, if we placed the unwind info at
13579 	     its correct location we would have incorrect unwind info
13580 	     in the patched case.  Which is probably all moot since
13581 	     I don't expect Wine generates dwarf2 unwind info for the
13582 	     system libraries that use this feature.  */
13583 
13584 	  insn = emit_insn (gen_blockage ());
13585 
13586 	  push = gen_push (hard_frame_pointer_rtx);
13587 	  mov = gen_rtx_SET (hard_frame_pointer_rtx,
13588 			     stack_pointer_rtx);
13589 	  RTX_FRAME_RELATED_P (push) = 1;
13590 	  RTX_FRAME_RELATED_P (mov) = 1;
13591 
13592 	  RTX_FRAME_RELATED_P (insn) = 1;
13593 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13594 			gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13595 
13596 	  /* Note that gen_push incremented m->fs.cfa_offset, even
13597 	     though we didn't emit the push insn here.  */
13598 	  m->fs.cfa_reg = hard_frame_pointer_rtx;
13599 	  m->fs.fp_offset = m->fs.cfa_offset;
13600 	  m->fs.fp_valid = true;
13601 	}
13602       else
13603 	{
13604 	  /* The frame pointer is not needed so pop %ebp again.
13605 	     This leaves us with a pristine state.  */
13606 	  emit_insn (gen_pop (hard_frame_pointer_rtx));
13607 	}
13608     }
13609 
13610   /* The first insn of a function that accepts its static chain on the
13611      stack is to push the register that would be filled in by a direct
13612      call.  This insn will be skipped by the trampoline.  */
13613   else if (ix86_static_chain_on_stack)
13614     {
13615       static_chain = ix86_static_chain (cfun->decl, false);
13616       insn = emit_insn (gen_push (static_chain));
13617       emit_insn (gen_blockage ());
13618 
13619       /* We don't want to interpret this push insn as a register save,
13620 	 only as a stack adjustment.  The real copy of the register as
13621 	 a save will be done later, if needed.  */
13622       t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13623       t = gen_rtx_SET (stack_pointer_rtx, t);
13624       add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13625       RTX_FRAME_RELATED_P (insn) = 1;
13626     }
13627 
13628   /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13629      DRAP is needed and stack realignment is really needed after reload.  */
13630   if (stack_realign_drap)
13631     {
13632       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13633 
13634       /* Can't use DRAP in interrupt function.  */
13635       if (cfun->machine->func_type != TYPE_NORMAL)
13636 	sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13637 	       "in interrupt service routine.  This may be worked "
13638 	       "around by avoiding functions with aggregate return.");
13639 
13640       /* Only need to push parameter pointer reg if it is caller saved.  */
13641       if (!call_used_regs[REGNO (crtl->drap_reg)])
13642 	{
13643 	  /* Push arg pointer reg */
13644 	  insn = emit_insn (gen_push (crtl->drap_reg));
13645 	  RTX_FRAME_RELATED_P (insn) = 1;
13646 	}
13647 
13648       /* Grab the argument pointer.  */
13649       t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13650       insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13651       RTX_FRAME_RELATED_P (insn) = 1;
13652       m->fs.cfa_reg = crtl->drap_reg;
13653       m->fs.cfa_offset = 0;
13654 
13655       /* Align the stack.  */
13656       insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13657 					stack_pointer_rtx,
13658 					GEN_INT (-align_bytes)));
13659       RTX_FRAME_RELATED_P (insn) = 1;
13660 
13661       /* Replicate the return address on the stack so that return
13662 	 address can be reached via (argp - 1) slot.  This is needed
13663 	 to implement macro RETURN_ADDR_RTX and intrinsic function
13664 	 expand_builtin_return_addr etc.  */
13665       t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13666       t = gen_frame_mem (word_mode, t);
13667       insn = emit_insn (gen_push (t));
13668       RTX_FRAME_RELATED_P (insn) = 1;
13669 
13670       /* For the purposes of frame and register save area addressing,
13671 	 we've started over with a new frame.  */
13672       m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13673       m->fs.realigned = true;
13674 
13675       if (static_chain)
13676 	{
13677 	  /* Replicate static chain on the stack so that static chain
13678 	     can be reached via (argp - 2) slot.  This is needed for
13679 	     nested function with stack realignment.  */
13680 	  insn = emit_insn (gen_push (static_chain));
13681 	  RTX_FRAME_RELATED_P (insn) = 1;
13682 	}
13683     }
13684 
13685   int_registers_saved = (frame.nregs == 0);
13686   sse_registers_saved = (frame.nsseregs == 0);
13687   save_stub_call_needed = (m->call_ms2sysv);
13688   gcc_assert (sse_registers_saved || !save_stub_call_needed);
13689 
13690   if (frame_pointer_needed && !m->fs.fp_valid)
13691     {
13692       /* Note: AT&T enter does NOT have reversed args.  Enter is probably
13693          slower on all targets.  Also sdb didn't like it.  */
13694       insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13695       RTX_FRAME_RELATED_P (insn) = 1;
13696 
13697       /* Push registers now, before setting the frame pointer
13698 	 on SEH target.  */
13699       if (!int_registers_saved
13700 	  && TARGET_SEH
13701 	  && !frame.save_regs_using_mov)
13702 	{
13703 	  ix86_emit_save_regs ();
13704 	  int_registers_saved = true;
13705 	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13706 	}
13707 
13708       if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13709 	{
13710 	  insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13711 	  RTX_FRAME_RELATED_P (insn) = 1;
13712 
13713 	  if (m->fs.cfa_reg == stack_pointer_rtx)
13714 	    m->fs.cfa_reg = hard_frame_pointer_rtx;
13715 	  m->fs.fp_offset = m->fs.sp_offset;
13716 	  m->fs.fp_valid = true;
13717 	}
13718     }
13719 
13720   if (!int_registers_saved)
13721     {
13722       /* If saving registers via PUSH, do so now.  */
13723       if (!frame.save_regs_using_mov)
13724 	{
13725 	  ix86_emit_save_regs ();
13726 	  int_registers_saved = true;
13727 	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13728 	}
13729 
13730       /* When using the red zone we may start register saving before allocating
13731 	 the stack frame, saving one cycle of the prologue.  However, avoid
13732 	 doing this if we have to probe the stack; at least on x86_64 the
13733 	 stack probe can turn into a call that clobbers a red zone location. */
13734       else if (ix86_using_red_zone ()
13735 	       && (! TARGET_STACK_PROBE
13736 		   || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13737 	{
13738 	  ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13739 	  int_registers_saved = true;
13740 	}
13741     }
13742 
13743   if (stack_realign_fp)
13744     {
13745       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13746       gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13747 
13748       /* Record last valid frame pointer offset.  */
13749       m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13750 
13751       /* The computation of the size of the re-aligned stack frame means
13752 	 that we must allocate the size of the register save area before
13753 	 performing the actual alignment.  Otherwise we cannot guarantee
13754 	 that there's enough storage above the realignment point.  */
13755       allocate = frame.reg_save_offset - m->fs.sp_offset
13756 		 + frame.stack_realign_allocate;
13757       if (allocate)
13758         pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13759 				   GEN_INT (-allocate), -1, false);
13760 
13761       /* Align the stack.  */
13762       insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13763 					stack_pointer_rtx,
13764 					GEN_INT (-align_bytes)));
13765       m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13766       m->fs.sp_realigned_offset = m->fs.sp_offset
13767 					      - frame.stack_realign_allocate;
13768       /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13769 	 Beyond this point, stack access should be done via choose_baseaddr or
13770 	 by using sp_valid_at and fp_valid_at to determine the correct base
13771 	 register.  Henceforth, any CFA offset should be thought of as logical
13772 	 and not physical.  */
13773       gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13774       gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13775       m->fs.sp_realigned = true;
13776 
13777       /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13778 	 is needed to describe where a register is saved using a realigned
13779 	 stack pointer, so we need to invalidate the stack pointer for that
13780 	 target.  */
13781       if (TARGET_SEH)
13782 	m->fs.sp_valid = false;
13783 
13784       /* If SP offset is non-immediate after allocation of the stack frame,
13785 	 then emit SSE saves or stub call prior to allocating the rest of the
13786 	 stack frame.  This is less efficient for the out-of-line stub because
13787 	 we can't combine allocations across the call barrier, but it's better
13788 	 than using a scratch register.  */
13789       else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13790 						   - m->fs.sp_realigned_offset),
13791 					  Pmode))
13792 	{
13793 	  if (!sse_registers_saved)
13794 	    {
13795 	      ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13796 	      sse_registers_saved = true;
13797 	    }
13798 	  else if (save_stub_call_needed)
13799 	    {
13800 	      ix86_emit_outlined_ms2sysv_save (frame);
13801 	      save_stub_call_needed = false;
13802 	    }
13803 	}
13804     }
13805 
13806   allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13807 
13808   if (flag_stack_usage_info)
13809     {
13810       /* We start to count from ARG_POINTER.  */
13811       HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13812 
13813       /* If it was realigned, take into account the fake frame.  */
13814       if (stack_realign_drap)
13815 	{
13816 	  if (ix86_static_chain_on_stack)
13817 	    stack_size += UNITS_PER_WORD;
13818 
13819 	  if (!call_used_regs[REGNO (crtl->drap_reg)])
13820 	    stack_size += UNITS_PER_WORD;
13821 
13822 	  /* This over-estimates by 1 minimal-stack-alignment-unit but
13823 	     mitigates that by counting in the new return address slot.  */
13824 	  current_function_dynamic_stack_size
13825 	    += crtl->stack_alignment_needed / BITS_PER_UNIT;
13826 	}
13827 
13828       current_function_static_stack_size = stack_size;
13829     }
13830 
13831   /* On SEH target with very large frame size, allocate an area to save
13832      SSE registers (as the very large allocation won't be described).  */
13833   if (TARGET_SEH
13834       && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13835       && !sse_registers_saved)
13836     {
13837       HOST_WIDE_INT sse_size =
13838 	frame.sse_reg_save_offset - frame.reg_save_offset;
13839 
13840       gcc_assert (int_registers_saved);
13841 
13842       /* No need to do stack checking as the area will be immediately
13843 	 written.  */
13844       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13845 			         GEN_INT (-sse_size), -1,
13846 				 m->fs.cfa_reg == stack_pointer_rtx);
13847       allocate -= sse_size;
13848       ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13849       sse_registers_saved = true;
13850     }
13851 
13852   /* The stack has already been decremented by the instruction calling us
13853      so probe if the size is non-negative to preserve the protection area.  */
13854   if (allocate >= 0
13855       && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13856 	  || flag_stack_clash_protection))
13857     {
13858       if (flag_stack_clash_protection)
13859 	{
13860 	  ix86_adjust_stack_and_probe_stack_clash (allocate,
13861 						   int_registers_saved);
13862 	  allocate = 0;
13863 	}
13864       else if (STACK_CHECK_MOVING_SP)
13865 	{
13866 	  if (!(crtl->is_leaf && !cfun->calls_alloca
13867 		&& allocate <= get_probe_interval ()))
13868 	    {
13869 	      ix86_adjust_stack_and_probe (allocate, int_registers_saved);
13870 	      allocate = 0;
13871 	    }
13872 	}
13873       else
13874 	{
13875 	  HOST_WIDE_INT size = allocate;
13876 
13877 	  if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13878 	    size = 0x80000000 - get_stack_check_protect () - 1;
13879 
13880 	  if (TARGET_STACK_PROBE)
13881 	    {
13882 	      if (crtl->is_leaf && !cfun->calls_alloca)
13883 		{
13884 		  if (size > get_probe_interval ())
13885 		    ix86_emit_probe_stack_range (0, size, int_registers_saved);
13886 		}
13887 	      else
13888 		ix86_emit_probe_stack_range (0,
13889 					     size + get_stack_check_protect (),
13890 					     int_registers_saved);
13891 	    }
13892 	  else
13893 	    {
13894 	      if (crtl->is_leaf && !cfun->calls_alloca)
13895 		{
13896 		  if (size > get_probe_interval ()
13897 		      && size > get_stack_check_protect ())
13898 		    ix86_emit_probe_stack_range (get_stack_check_protect (),
13899 						 (size
13900 						  - get_stack_check_protect ()),
13901 						 int_registers_saved);
13902 		}
13903 	      else
13904 		ix86_emit_probe_stack_range (get_stack_check_protect (), size,
13905 					     int_registers_saved);
13906 	    }
13907 	}
13908     }
13909 
13910   if (allocate == 0)
13911     ;
13912   else if (!ix86_target_stack_probe ()
13913 	   || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13914     {
13915       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13916 			         GEN_INT (-allocate), -1,
13917 			         m->fs.cfa_reg == stack_pointer_rtx);
13918     }
13919   else
13920     {
13921       rtx eax = gen_rtx_REG (Pmode, AX_REG);
13922       rtx r10 = NULL;
13923       rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13924       const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13925       bool eax_live = ix86_eax_live_at_start_p ();
13926       bool r10_live = false;
13927 
13928       if (TARGET_64BIT)
13929         r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13930 
13931       if (eax_live)
13932 	{
13933 	  insn = emit_insn (gen_push (eax));
13934 	  allocate -= UNITS_PER_WORD;
13935 	  /* Note that SEH directives need to continue tracking the stack
13936 	     pointer even after the frame pointer has been set up.  */
13937 	  if (sp_is_cfa_reg || TARGET_SEH)
13938 	    {
13939 	      if (sp_is_cfa_reg)
13940 		m->fs.cfa_offset += UNITS_PER_WORD;
13941 	      RTX_FRAME_RELATED_P (insn) = 1;
13942 	      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13943 			    gen_rtx_SET (stack_pointer_rtx,
13944 					 plus_constant (Pmode, stack_pointer_rtx,
13945 							-UNITS_PER_WORD)));
13946 	    }
13947 	}
13948 
13949       if (r10_live)
13950 	{
13951 	  r10 = gen_rtx_REG (Pmode, R10_REG);
13952 	  insn = emit_insn (gen_push (r10));
13953 	  allocate -= UNITS_PER_WORD;
13954 	  if (sp_is_cfa_reg || TARGET_SEH)
13955 	    {
13956 	      if (sp_is_cfa_reg)
13957 		m->fs.cfa_offset += UNITS_PER_WORD;
13958 	      RTX_FRAME_RELATED_P (insn) = 1;
13959 	      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13960 			    gen_rtx_SET (stack_pointer_rtx,
13961 					 plus_constant (Pmode, stack_pointer_rtx,
13962 							-UNITS_PER_WORD)));
13963 	    }
13964 	}
13965 
13966       emit_move_insn (eax, GEN_INT (allocate));
13967       emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
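      /* Added note: on targets where ix86_target_stack_probe () is true this
	 worker typically expands to a call to a chkstk-style OS helper that
	 touches each page of the allocation, with the size passed in the
	 AX register.  */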
13968 
13969       /* Use the fact that AX still contains ALLOCATE.  */
13970       adjust_stack_insn = (Pmode == DImode
13971 			   ? gen_pro_epilogue_adjust_stack_di_sub
13972 			   : gen_pro_epilogue_adjust_stack_si_sub);
13973 
13974       insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13975 					   stack_pointer_rtx, eax));
13976 
13977       if (sp_is_cfa_reg || TARGET_SEH)
13978 	{
13979 	  if (sp_is_cfa_reg)
13980 	    m->fs.cfa_offset += allocate;
13981 	  RTX_FRAME_RELATED_P (insn) = 1;
13982 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13983 			gen_rtx_SET (stack_pointer_rtx,
13984 				     plus_constant (Pmode, stack_pointer_rtx,
13985 						    -allocate)));
13986 	}
13987       m->fs.sp_offset += allocate;
13988 
13989       /* Use stack_pointer_rtx for relative addressing so that code works for
13990 	 realigned stack.  But this means that we need a blockage to prevent
13991 	 stores based on the frame pointer from being scheduled before.  */
13992       if (r10_live && eax_live)
13993         {
13994 	  t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13995 	  emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13996 			  gen_frame_mem (word_mode, t));
13997 	  t = plus_constant (Pmode, t, UNITS_PER_WORD);
13998 	  emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13999 			  gen_frame_mem (word_mode, t));
14000 	  emit_insn (gen_memory_blockage ());
14001 	}
14002       else if (eax_live || r10_live)
14003 	{
14004 	  t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14005 	  emit_move_insn (gen_rtx_REG (word_mode,
14006 				       (eax_live ? AX_REG : R10_REG)),
14007 			  gen_frame_mem (word_mode, t));
14008 	  emit_insn (gen_memory_blockage ());
14009 	}
14010     }
14011   gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14012 
14013   /* If we haven't already set up the frame pointer, do so now.  */
14014   if (frame_pointer_needed && !m->fs.fp_valid)
14015     {
14016       insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14017 			    GEN_INT (frame.stack_pointer_offset
14018 				     - frame.hard_frame_pointer_offset));
14019       insn = emit_insn (insn);
14020       RTX_FRAME_RELATED_P (insn) = 1;
14021       add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14022 
14023       if (m->fs.cfa_reg == stack_pointer_rtx)
14024 	m->fs.cfa_reg = hard_frame_pointer_rtx;
14025       m->fs.fp_offset = frame.hard_frame_pointer_offset;
14026       m->fs.fp_valid = true;
14027     }
14028 
14029   if (!int_registers_saved)
14030     ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14031   if (!sse_registers_saved)
14032     ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14033   else if (save_stub_call_needed)
14034     ix86_emit_outlined_ms2sysv_save (frame);
14035 
14036   /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14037      in PROLOGUE.  */
14038   if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14039     {
14040       rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14041       insn = emit_insn (gen_set_got (pic));
14042       RTX_FRAME_RELATED_P (insn) = 1;
14043       add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14044       emit_insn (gen_prologue_use (pic));
14045       /* Delete an already emitted SET_GOT if it exists and is allocated
14046 	 to REAL_PIC_OFFSET_TABLE_REGNUM.  */
14047       ix86_elim_entry_set_got (pic);
14048     }
14049 
14050   if (crtl->drap_reg && !crtl->stack_realign_needed)
14051     {
14052       /* vDRAP is set up, but after reload it turns out stack realignment
14053          isn't necessary; here we will emit prologue code to set up DRAP
14054          without the stack realignment adjustment.  */
14055       t = choose_baseaddr (0, NULL);
14056       emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14057     }
14058 
14059   /* Prevent instructions from being scheduled into the register save push
14060      sequence when access to the redzone area is done through the frame pointer.
14061      The offset between the frame pointer and the stack pointer is calculated
14062      relative to the value of the stack pointer at the end of the function
14063      prologue, and moving instructions that access redzone area via frame
14064      pointer inside push sequence violates this assumption.  */
14065   if (frame_pointer_needed && frame.red_zone_size)
14066     emit_insn (gen_memory_blockage ());
14067 
14068   /* SEH requires that the prologue end within 256 bytes of the start of
14069      the function.  Prevent instruction schedules that would extend that.
14070      Further, prevent alloca modifications to the stack pointer from being
14071      combined with prologue modifications.  */
14072   if (TARGET_SEH)
14073     emit_insn (gen_prologue_use (stack_pointer_rtx));
14074 }
14075 
14076 /* Emit code to restore REG using a POP insn.  */
14077 
14078 static void
14079 ix86_emit_restore_reg_using_pop (rtx reg)
14080 {
14081   struct machine_function *m = cfun->machine;
14082   rtx_insn *insn = emit_insn (gen_pop (reg));
14083 
14084   ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14085   m->fs.sp_offset -= UNITS_PER_WORD;
14086 
14087   if (m->fs.cfa_reg == crtl->drap_reg
14088       && REGNO (reg) == REGNO (crtl->drap_reg))
14089     {
14090       /* Previously we'd represented the CFA as an expression
14091 	 like *(%ebp - 8).  We've just popped that value from
14092 	 the stack, which means we need to reset the CFA to
14093 	 the drap register.  This will remain until we restore
14094 	 the stack pointer.  */
14095       add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14096       RTX_FRAME_RELATED_P (insn) = 1;
14097 
14098       /* This means that the DRAP register is valid for addressing too.  */
14099       m->fs.drap_valid = true;
14100       return;
14101     }
14102 
14103   if (m->fs.cfa_reg == stack_pointer_rtx)
14104     {
14105       rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14106       x = gen_rtx_SET (stack_pointer_rtx, x);
14107       add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14108       RTX_FRAME_RELATED_P (insn) = 1;
14109 
14110       m->fs.cfa_offset -= UNITS_PER_WORD;
14111     }
14112 
14113   /* When the frame pointer is the CFA, and we pop it, we are
14114      swapping back to the stack pointer as the CFA.  This happens
14115      for stack frames that don't allocate other data, so we assume
14116      the stack pointer is now pointing at the return address, i.e.
14117      the function entry state, which makes the offset be 1 word.  */
14118   if (reg == hard_frame_pointer_rtx)
14119     {
14120       m->fs.fp_valid = false;
14121       if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14122 	{
14123 	  m->fs.cfa_reg = stack_pointer_rtx;
14124 	  m->fs.cfa_offset -= UNITS_PER_WORD;
14125 
14126 	  add_reg_note (insn, REG_CFA_DEF_CFA,
14127 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14128 				      GEN_INT (m->fs.cfa_offset)));
14129 	  RTX_FRAME_RELATED_P (insn) = 1;
14130 	}
14131     }
14132 }
14133 
14134 /* Emit code to restore saved registers using POP insns.  */
14135 
14136 static void
14137 ix86_emit_restore_regs_using_pop (void)
14138 {
14139   unsigned int regno;
14140 
14141   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14142     if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14143       ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14144 }
14145 
14146 /* Emit code and notes for the LEAVE instruction.  If insn is non-null,
14147    the emit is omitted and only the notes are attached.  */
14148 
14149 static void
14150 ix86_emit_leave (rtx_insn *insn)
14151 {
14152   struct machine_function *m = cfun->machine;
14153   if (!insn)
14154     insn = emit_insn (ix86_gen_leave ());
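  /* Added note: "leave" behaves like moving the frame pointer into the stack
     pointer and then popping the frame pointer, so afterwards SP points one
     word above the slot that held the saved frame pointer; the state updates
     below reflect that.  */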
14155 
14156   ix86_add_queued_cfa_restore_notes (insn);
14157 
14158   gcc_assert (m->fs.fp_valid);
14159   m->fs.sp_valid = true;
14160   m->fs.sp_realigned = false;
14161   m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14162   m->fs.fp_valid = false;
14163 
14164   if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14165     {
14166       m->fs.cfa_reg = stack_pointer_rtx;
14167       m->fs.cfa_offset = m->fs.sp_offset;
14168 
14169       add_reg_note (insn, REG_CFA_DEF_CFA,
14170 		    plus_constant (Pmode, stack_pointer_rtx,
14171 				   m->fs.sp_offset));
14172       RTX_FRAME_RELATED_P (insn) = 1;
14173     }
14174   ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14175 			     m->fs.fp_offset);
14176 }
14177 
14178 /* Emit code to restore saved registers using MOV insns.
14179    First register is restored from CFA - CFA_OFFSET.  */
14180 static void
14181 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14182 				  bool maybe_eh_return)
14183 {
14184   struct machine_function *m = cfun->machine;
14185   unsigned int regno;
14186 
14187   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14188     if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14189       {
14190 	rtx reg = gen_rtx_REG (word_mode, regno);
14191 	rtx mem;
14192 	rtx_insn *insn;
14193 
14194 	mem = choose_baseaddr (cfa_offset, NULL);
14195 	mem = gen_frame_mem (word_mode, mem);
14196 	insn = emit_move_insn (reg, mem);
14197 
14198         if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14199 	  {
14200 	    /* Previously we'd represented the CFA as an expression
14201 	       like *(%ebp - 8).  We've just loaded that value from
14202 	       the stack, which means we need to reset the CFA to
14203 	       the drap register.  This will remain until we restore
14204 	       the stack pointer.  */
14205 	    add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14206 	    RTX_FRAME_RELATED_P (insn) = 1;
14207 
14208 	    /* This means that the DRAP register is valid for addressing.  */
14209 	    m->fs.drap_valid = true;
14210 	  }
14211 	else
14212 	  ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14213 
14214 	cfa_offset -= UNITS_PER_WORD;
14215       }
14216 }
14217 
14218 /* Emit code to restore saved SSE registers using MOV insns.
14219    First register is restored from CFA - CFA_OFFSET.  */
14220 static void
14221 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14222 				      bool maybe_eh_return)
14223 {
14224   unsigned int regno;
14225 
14226   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14227     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14228       {
14229 	rtx reg = gen_rtx_REG (V4SFmode, regno);
14230 	rtx mem;
14231 	unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14232 
14233 	mem = choose_baseaddr (cfa_offset, &align);
14234 	mem = gen_rtx_MEM (V4SFmode, mem);
14235 
14236 	/* The location alignment depends upon the base register.  */
14237 	align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14238 	gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14239 	set_mem_align (mem, align);
14240 	emit_insn (gen_rtx_SET (reg, mem));
14241 
14242 	ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14243 
14244 	cfa_offset -= GET_MODE_SIZE (V4SFmode);
14245       }
14246 }
14247 
14248 static void
14249 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
14250 				  bool use_call, int style)
14251 {
14252   struct machine_function *m = cfun->machine;
14253   const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14254 			  + m->call_ms2sysv_extra_regs;
14255   rtvec v;
14256   unsigned int elems_needed, align, i, vi = 0;
14257   rtx_insn *insn;
14258   rtx sym, tmp;
14259   rtx rsi = gen_rtx_REG (word_mode, SI_REG);
14260   rtx r10 = NULL_RTX;
14261   const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14262   HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
14263   HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
14264   rtx rsi_frame_load = NULL_RTX;
14265   HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
14266   enum xlogue_stub stub;
14267 
14268   gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
14269 
14270   /* If using a realigned stack, we should never start with padding.  */
14271   gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
14272 
14273   /* Set up RSI as the stub's base pointer.  */
14274   align = GET_MODE_ALIGNMENT (V4SFmode);
14275   tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
14276   gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14277 
14278   emit_insn (gen_rtx_SET (rsi, tmp));
14279 
14280   /* Get a symbol for the stub.  */
14281   if (frame_pointer_needed)
14282     stub = use_call ? XLOGUE_STUB_RESTORE_HFP
14283 		    : XLOGUE_STUB_RESTORE_HFP_TAIL;
14284   else
14285     stub = use_call ? XLOGUE_STUB_RESTORE
14286 		    : XLOGUE_STUB_RESTORE_TAIL;
14287   sym = xlogue.get_stub_rtx (stub);
14288 
14289   elems_needed = ncregs;
14290   if (use_call)
14291     elems_needed += 1;
14292   else
14293     elems_needed += frame_pointer_needed ? 5 : 3;
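  /* Beyond the NCREGS frame loads, the extra elements are: the USE of the
     stub symbol when calling the stub; or, for a tail call, the return,
     the stub USE and the SP restore, plus the RBP reload and the memory
     clobber when a frame pointer is used (see the RTVEC_ELT stores
     below).  */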
14294   v = rtvec_alloc (elems_needed);
14295 
14296   /* We call the epilogue stub when we need to pop incoming args or when a
14297      sibling call will be the tail call.  Otherwise, we emit a jmp to the
14298      epilogue stub, and the stub itself is the tail call.  */
14299   if (use_call)
14300       RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14301   else
14302     {
14303       RTVEC_ELT (v, vi++) = ret_rtx;
14304       RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14305       if (frame_pointer_needed)
14306 	{
14307 	  rtx rbp = gen_rtx_REG (DImode, BP_REG);
14308 	  gcc_assert (m->fs.fp_valid);
14309 	  gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
14310 
14311 	  tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
14312 	  RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
14313 	  RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
14314 	  tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
14315 	  RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
14316 	}
14317       else
14318 	{
14319 	  /* If no hard frame pointer, we set R10 to the SP restore value.  */
14320 	  gcc_assert (!m->fs.fp_valid);
14321 	  gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14322 	  gcc_assert (m->fs.sp_valid);
14323 
14324 	  r10 = gen_rtx_REG (DImode, R10_REG);
14325 	  tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
14326 	  emit_insn (gen_rtx_SET (r10, tmp));
14327 
14328 	  RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
14329 	}
14330     }
14331 
14332   /* Generate frame load insns and restore notes.  */
14333   for (i = 0; i < ncregs; ++i)
14334     {
14335       const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14336       machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
14337       rtx reg, frame_load;
14338 
14339       reg = gen_rtx_REG (mode, r.regno);
14340       frame_load = gen_frame_load (reg, rsi, r.offset);
14341 
14342       /* Save RSI frame load insn & note to add last.  */
14343       if (r.regno == SI_REG)
14344 	{
14345 	  gcc_assert (!rsi_frame_load);
14346 	  rsi_frame_load = frame_load;
14347 	  rsi_restore_offset = r.offset;
14348 	}
14349       else
14350 	{
14351 	  RTVEC_ELT (v, vi++) = frame_load;
14352 	  ix86_add_cfa_restore_note (NULL, reg, r.offset);
14353 	}
14354     }
14355 
14356   /* Add RSI frame load & restore note at the end.  */
14357   gcc_assert (rsi_frame_load);
14358   gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
14359   RTVEC_ELT (v, vi++) = rsi_frame_load;
14360   ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
14361 			     rsi_restore_offset);
14362 
14363   /* Finally, for tail-call w/o a hard frame pointer, set SP to R10.  */
14364   if (!use_call && !frame_pointer_needed)
14365     {
14366       gcc_assert (m->fs.sp_valid);
14367       gcc_assert (!m->fs.sp_realigned);
14368 
14369       /* At this point, R10 should point to frame.stack_realign_offset.  */
14370       if (m->fs.cfa_reg == stack_pointer_rtx)
14371 	m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
14372       m->fs.sp_offset = frame.stack_realign_offset;
14373     }
14374 
14375   gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
14376   tmp = gen_rtx_PARALLEL (VOIDmode, v);
14377   if (use_call)
14378       insn = emit_insn (tmp);
14379   else
14380     {
14381       insn = emit_jump_insn (tmp);
14382       JUMP_LABEL (insn) = ret_rtx;
14383 
14384       if (frame_pointer_needed)
14385 	ix86_emit_leave (insn);
14386       else
14387 	{
14388 	  /* Need CFA adjust note.  */
14389 	  tmp = gen_rtx_SET (stack_pointer_rtx, r10);
14390 	  add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
14391 	}
14392     }
14393 
14394   RTX_FRAME_RELATED_P (insn) = true;
14395   ix86_add_queued_cfa_restore_notes (insn);
14396 
14397   /* If we're not doing a tail-call, we need to adjust the stack.  */
14398   if (use_call && m->fs.sp_valid)
14399     {
14400       HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
14401       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14402 				GEN_INT (dealloc), style,
14403 				m->fs.cfa_reg == stack_pointer_rtx);
14404     }
14405 }
14406 
14407 /* Restore function stack, frame, and registers.  */
14408 
14409 void
14410 ix86_expand_epilogue (int style)
14411 {
14412   struct machine_function *m = cfun->machine;
14413   struct machine_frame_state frame_state_save = m->fs;
14414   bool restore_regs_via_mov;
14415   bool using_drap;
14416   bool restore_stub_is_tail = false;
14417 
14418   if (ix86_function_naked (current_function_decl))
14419     {
14420       /* The program should not reach this point.  */
14421       emit_insn (gen_ud2 ());
14422       return;
14423     }
14424 
14425   ix86_finalize_stack_frame_flags ();
14426   const struct ix86_frame &frame = cfun->machine->frame;
14427 
14428   m->fs.sp_realigned = stack_realign_fp;
14429   m->fs.sp_valid = stack_realign_fp
14430 		   || !frame_pointer_needed
14431 		   || crtl->sp_is_unchanging;
14432   gcc_assert (!m->fs.sp_valid
14433 	      || m->fs.sp_offset == frame.stack_pointer_offset);
14434 
14435   /* The FP must be valid if the frame pointer is present.  */
14436   gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14437   gcc_assert (!m->fs.fp_valid
14438 	      || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14439 
14440   /* We must have *some* valid pointer to the stack frame.  */
14441   gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14442 
14443   /* The DRAP is never valid at this point.  */
14444   gcc_assert (!m->fs.drap_valid);
14445 
14446   /* See the comment about red zone and frame
14447      pointer usage in ix86_expand_prologue.  */
14448   if (frame_pointer_needed && frame.red_zone_size)
14449     emit_insn (gen_memory_blockage ());
14450 
14451   using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14452   gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14453 
14454   /* Determine the CFA offset of the end of the red-zone.  */
14455   m->fs.red_zone_offset = 0;
14456   if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14457     {
14458       /* The red-zone begins below the return address and the error code in
14459 	 the exception handler.  */
14460       m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
14461 
14462       /* When the register save area is in the aligned portion of
14463          the stack, determine the maximum runtime displacement that
14464 	 matches up with the aligned frame.  */
14465       if (stack_realign_drap)
14466 	m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14467 				  + UNITS_PER_WORD);
14468     }
14469 
14470   HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
14471 
14472   /* Special care must be taken for the normal return case of a function
14473      using eh_return: the eax and edx registers are marked as saved, but
14474      not restored along this path.  Adjust the save location to match.  */
14475   if (crtl->calls_eh_return && style != 2)
14476     reg_save_offset -= 2 * UNITS_PER_WORD;
14477 
14478   /* EH_RETURN requires the use of moves to function properly.  */
14479   if (crtl->calls_eh_return)
14480     restore_regs_via_mov = true;
14481   /* SEH requires the use of pops to identify the epilogue.  */
14482   else if (TARGET_SEH)
14483     restore_regs_via_mov = false;
14484   /* If we're only restoring one register and sp cannot be used, then
14485      use a move instruction to restore the register, since it's
14486      less work than reloading sp and popping the register.  */
14487   else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
14488     restore_regs_via_mov = true;
14489   else if (TARGET_EPILOGUE_USING_MOVE
14490 	   && cfun->machine->use_fast_prologue_epilogue
14491 	   && (frame.nregs > 1
14492 	       || m->fs.sp_offset != reg_save_offset))
14493     restore_regs_via_mov = true;
14494   else if (frame_pointer_needed
14495 	   && !frame.nregs
14496 	   && m->fs.sp_offset != reg_save_offset)
14497     restore_regs_via_mov = true;
14498   else if (frame_pointer_needed
14499 	   && TARGET_USE_LEAVE
14500 	   && cfun->machine->use_fast_prologue_epilogue
14501 	   && frame.nregs == 1)
14502     restore_regs_via_mov = true;
14503   else
14504     restore_regs_via_mov = false;
14505 
14506   if (restore_regs_via_mov || frame.nsseregs)
14507     {
14508       /* Ensure that the entire register save area is addressable via
14509 	 the stack pointer, if we will restore SSE regs via sp.  */
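      /* Offsets above 0x7fffffff do not fit in the signed 32-bit
	 displacement of an x86-64 memory operand, hence the explicit
	 stack adjustment below.  */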
14510       if (TARGET_64BIT
14511 	  && m->fs.sp_offset > 0x7fffffff
14512 	  && sp_valid_at (frame.stack_realign_offset + 1)
14513 	  && (frame.nsseregs + frame.nregs) != 0)
14514 	{
14515 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14516 				     GEN_INT (m->fs.sp_offset
14517 					      - frame.sse_reg_save_offset),
14518 				     style,
14519 				     m->fs.cfa_reg == stack_pointer_rtx);
14520 	}
14521     }
14522 
14523   /* If there are any SSE registers to restore, then we have to do it
14524      via moves, since there's obviously no pop for SSE regs.  */
14525   if (frame.nsseregs)
14526     ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14527 					  style == 2);
14528 
14529   if (m->call_ms2sysv)
14530     {
14531       int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14532 
14533       /* We cannot use a tail-call for the stub if:
14534 	 1. We have to pop incoming args,
14535 	 2. We have additional int regs to restore,
14536 	 3. A sibling call will be the tail call, or
14537 	 4. We are emitting an eh_return_internal epilogue.
14538 
14539 	 TODO: Item 4 has not yet been tested!
14540 
14541 	 If any of the above are true, we will call the stub rather than
14542 	 jump to it.  */
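      /* Note that STYLE != 1 below covers both the sibcall (style 0) and
	 the eh_return (style 2) cases from the list above.  */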
14543       restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14544       ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
14545     }
14546 
14547   /* If using an out-of-line stub that is a tail call, then...  */
14548   if (m->call_ms2sysv && restore_stub_is_tail)
14549     {
14550       /* TODO: paranoid tests.  (Remove eventually.)  */
14551       gcc_assert (m->fs.sp_valid);
14552       gcc_assert (!m->fs.sp_realigned);
14553       gcc_assert (!m->fs.fp_valid);
14554       gcc_assert (!m->fs.realigned);
14555       gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14556       gcc_assert (!crtl->drap_reg);
14557       gcc_assert (!frame.nregs);
14558     }
14559   else if (restore_regs_via_mov)
14560     {
14561       rtx t;
14562 
14563       if (frame.nregs)
14564 	ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
14565 
14566       /* eh_return epilogues need %ecx added to the stack pointer.  */
14567       if (style == 2)
14568 	{
14569 	  rtx sa = EH_RETURN_STACKADJ_RTX;
14570 	  rtx_insn *insn;
14571 
14572 	  /* %ecx can't be used for both DRAP register and eh_return.  */
14573 	  if (crtl->drap_reg)
14574 	    gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14575 
14576 	  /* regparm nested functions don't work with eh_return.  */
14577 	  gcc_assert (!ix86_static_chain_on_stack);
14578 
14579 	  if (frame_pointer_needed)
14580 	    {
14581 	      t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14582 	      t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14583 	      emit_insn (gen_rtx_SET (sa, t));
14584 
14585 	      t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14586 	      insn = emit_move_insn (hard_frame_pointer_rtx, t);
14587 
14588 	      /* Note that we use SA as a temporary CFA, as the return
14589 		 address is at the proper place relative to it.  We
14590 		 pretend this happens at the FP restore insn because
14591 		 prior to this insn the FP would be stored at the wrong
14592 		 offset relative to SA, and after this insn we have no
14593 		 other reasonable register to use for the CFA.  We don't
14594 		 bother resetting the CFA to the SP for the duration of
14595 		 the return insn, unless the control flow instrumentation
14596 		 is done.  In this case the SP is used later and we have
14597 		 to reset CFA to SP.  */
14598 	      add_reg_note (insn, REG_CFA_DEF_CFA,
14599 			    plus_constant (Pmode, sa, UNITS_PER_WORD));
14600 	      ix86_add_queued_cfa_restore_notes (insn);
14601 	      add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14602 	      RTX_FRAME_RELATED_P (insn) = 1;
14603 
14604 	      m->fs.cfa_reg = sa;
14605 	      m->fs.cfa_offset = UNITS_PER_WORD;
14606 	      m->fs.fp_valid = false;
14607 
14608 	      pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14609 					 const0_rtx, style,
14610 					 flag_cf_protection);
14611 	    }
14612 	  else
14613 	    {
14614 	      t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14615 	      t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14616 	      insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14617 	      ix86_add_queued_cfa_restore_notes (insn);
14618 
14619 	      gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14620 	      if (m->fs.cfa_offset != UNITS_PER_WORD)
14621 		{
14622 		  m->fs.cfa_offset = UNITS_PER_WORD;
14623 		  add_reg_note (insn, REG_CFA_DEF_CFA,
14624 				plus_constant (Pmode, stack_pointer_rtx,
14625 					       UNITS_PER_WORD));
14626 		  RTX_FRAME_RELATED_P (insn) = 1;
14627 		}
14628 	    }
14629 	  m->fs.sp_offset = UNITS_PER_WORD;
14630 	  m->fs.sp_valid = true;
14631 	  m->fs.sp_realigned = false;
14632 	}
14633     }
14634   else
14635     {
14636       /* SEH requires that the function end with (1) a stack adjustment
14637 	 if necessary, (2) a sequence of pops, and (3) a return or
14638 	 jump instruction.  Prevent insns from the function body from
14639 	 being scheduled into this sequence.  */
14640       if (TARGET_SEH)
14641 	{
14642 	  /* Prevent a catch region from being adjacent to the standard
14643 	     epilogue sequence.  Unfortunately neither crtl->uses_eh_lsda
14644 	     nor several other flags that would be interesting to test are
14645 	     set up yet.  */
14646 	  if (flag_non_call_exceptions)
14647 	    emit_insn (gen_nops (const1_rtx));
14648 	  else
14649 	    emit_insn (gen_blockage ());
14650 	}
14651 
14652       /* First step is to deallocate the stack frame so that we can
14653 	 pop the registers.  If the stack pointer was realigned, it needs
14654 	 to be restored now.  Also do it on SEH target for very large
14655 	 frame as the emitted instructions aren't allowed by the ABI
14656 	 in epilogues.  */
14657       if (!m->fs.sp_valid || m->fs.sp_realigned
14658  	  || (TARGET_SEH
14659 	      && (m->fs.sp_offset - reg_save_offset
14660 		  >= SEH_MAX_FRAME_SIZE)))
14661 	{
14662 	  pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14663 				     GEN_INT (m->fs.fp_offset
14664 					      - reg_save_offset),
14665 				     style, false);
14666 	}
14667       else if (m->fs.sp_offset != reg_save_offset)
14668 	{
14669 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14670 				     GEN_INT (m->fs.sp_offset
14671 					      - reg_save_offset),
14672 				     style,
14673 				     m->fs.cfa_reg == stack_pointer_rtx);
14674 	}
14675 
14676       ix86_emit_restore_regs_using_pop ();
14677     }
14678 
14679   /* If we used a frame pointer and haven't already got rid of it,
14680      then do so now.  */
14681   if (m->fs.fp_valid)
14682     {
14683       /* If the stack pointer is valid and pointing at the frame
14684 	 pointer store address, then we only need a pop.  */
14685       if (sp_valid_at (frame.hfp_save_offset)
14686 	  && m->fs.sp_offset == frame.hfp_save_offset)
14687 	ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14688       /* Leave results in shorter dependency chains on CPUs that are
14689 	 able to grok it fast.  */
14690       else if (TARGET_USE_LEAVE
14691 	       || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14692 	       || !cfun->machine->use_fast_prologue_epilogue)
14693 	ix86_emit_leave (NULL);
14694       else
14695         {
14696 	  pro_epilogue_adjust_stack (stack_pointer_rtx,
14697 				     hard_frame_pointer_rtx,
14698 				     const0_rtx, style, !using_drap);
14699 	  ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14700         }
14701     }
14702 
14703   if (using_drap)
14704     {
14705       int param_ptr_offset = UNITS_PER_WORD;
14706       rtx_insn *insn;
14707 
14708       gcc_assert (stack_realign_drap);
14709 
14710       if (ix86_static_chain_on_stack)
14711 	param_ptr_offset += UNITS_PER_WORD;
14712       if (!call_used_regs[REGNO (crtl->drap_reg)])
14713 	param_ptr_offset += UNITS_PER_WORD;
14714 
14715       insn = emit_insn (gen_rtx_SET
14716 			(stack_pointer_rtx,
14717 			 gen_rtx_PLUS (Pmode,
14718 				       crtl->drap_reg,
14719 				       GEN_INT (-param_ptr_offset))));
14720       m->fs.cfa_reg = stack_pointer_rtx;
14721       m->fs.cfa_offset = param_ptr_offset;
14722       m->fs.sp_offset = param_ptr_offset;
14723       m->fs.realigned = false;
14724 
14725       add_reg_note (insn, REG_CFA_DEF_CFA,
14726 		    gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14727 				  GEN_INT (param_ptr_offset)));
14728       RTX_FRAME_RELATED_P (insn) = 1;
14729 
14730       if (!call_used_regs[REGNO (crtl->drap_reg)])
14731 	ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14732     }
14733 
14734   /* At this point the stack pointer must be valid, and we must have
14735      restored all of the registers.  We may not have deallocated the
14736      entire stack frame.  We've delayed this until now because it may
14737      be possible to merge the local stack deallocation with the
14738      deallocation forced by ix86_static_chain_on_stack.   */
14739   gcc_assert (m->fs.sp_valid);
14740   gcc_assert (!m->fs.sp_realigned);
14741   gcc_assert (!m->fs.fp_valid);
14742   gcc_assert (!m->fs.realigned);
14743   if (m->fs.sp_offset != UNITS_PER_WORD)
14744     {
14745       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14746 				 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14747 				 style, true);
14748     }
14749   else
14750     ix86_add_queued_cfa_restore_notes (get_last_insn ());
14751 
14752   /* Sibcall epilogues don't want a return instruction.  */
14753   if (style == 0)
14754     {
14755       m->fs = frame_state_save;
14756       return;
14757     }
14758 
14759   if (cfun->machine->func_type != TYPE_NORMAL)
14760     emit_jump_insn (gen_interrupt_return ());
14761   else if (crtl->args.pops_args && crtl->args.size)
14762     {
14763       rtx popc = GEN_INT (crtl->args.pops_args);
14764 
14765       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
14766 	 address, do an explicit add, and jump indirectly to the caller.  */
14767 
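      /* Sketch of what is emitted below: pop the return address into
	 %ecx, add the popped-args size to %esp, then jump back to the
	 caller through %ecx.  */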
14768       if (crtl->args.pops_args >= 65536)
14769 	{
14770 	  rtx ecx = gen_rtx_REG (SImode, CX_REG);
14771 	  rtx_insn *insn;
14772 
14773 	  /* There is no "pascal" calling convention in any 64bit ABI.  */
14774 	  gcc_assert (!TARGET_64BIT);
14775 
14776 	  insn = emit_insn (gen_pop (ecx));
14777 	  m->fs.cfa_offset -= UNITS_PER_WORD;
14778 	  m->fs.sp_offset -= UNITS_PER_WORD;
14779 
14780 	  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14781 	  x = gen_rtx_SET (stack_pointer_rtx, x);
14782 	  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14783 	  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14784 	  RTX_FRAME_RELATED_P (insn) = 1;
14785 
14786 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14787 				     popc, -1, true);
14788 	  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14789 	}
14790       else
14791 	emit_jump_insn (gen_simple_return_pop_internal (popc));
14792     }
14793   else if (!m->call_ms2sysv || !restore_stub_is_tail)
14794     {
14795       /* In case of return from EH a simple return cannot be used
14796 	 as a return address will be compared with a shadow stack
14797 	 return address.  Use indirect jump instead.  */
14798       if (style == 2 && flag_cf_protection)
14799 	{
14800 	  /* Register used in indirect jump must be in word_mode.  But
14801 	     Pmode may not be the same as word_mode for x32.  */
14802 	  rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14803 	  rtx_insn *insn;
14804 
14805 	  insn = emit_insn (gen_pop (ecx));
14806 	  m->fs.cfa_offset -= UNITS_PER_WORD;
14807 	  m->fs.sp_offset -= UNITS_PER_WORD;
14808 
14809 	  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14810 	  x = gen_rtx_SET (stack_pointer_rtx, x);
14811 	  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14812 	  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14813 	  RTX_FRAME_RELATED_P (insn) = 1;
14814 
14815 	  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14816 	}
14817       else
14818 	emit_jump_insn (gen_simple_return_internal ());
14819     }
14820 
14821   /* Restore the state back to the state from the prologue,
14822      so that it's correct for the next epilogue.  */
14823   m->fs = frame_state_save;
14824 }
14825 
14826 /* Reset from the function's potential modifications.  */
14827 
14828 static void
14829 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14830 {
14831   if (pic_offset_table_rtx
14832       && !ix86_use_pseudo_pic_reg ())
14833     SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14834 
14835   if (TARGET_MACHO)
14836     {
14837       rtx_insn *insn = get_last_insn ();
14838       rtx_insn *deleted_debug_label = NULL;
14839 
14840       /* Mach-O doesn't support labels at the end of objects, so if
14841          it looks like we might want one, take special action.
14842         First, collect any sequence of deleted debug labels.  */
14843       while (insn
14844 	     && NOTE_P (insn)
14845 	     && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14846 	{
14847 	  /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14848 	     notes only, instead set their CODE_LABEL_NUMBER to -1,
14849 	     otherwise there would be code generation differences
14850 	     in between -g and -g0.  */
14851 	  if (NOTE_P (insn) && NOTE_KIND (insn)
14852 	      == NOTE_INSN_DELETED_DEBUG_LABEL)
14853 	    deleted_debug_label = insn;
14854 	  insn = PREV_INSN (insn);
14855 	}
14856 
14857       /* If we have:
14858 	 label:
14859 	    barrier
14860 	  then this needs to be detected, so skip past the barrier.  */
14861 
14862       if (insn && BARRIER_P (insn))
14863 	insn = PREV_INSN (insn);
14864 
14865       /* Up to now we've only seen notes or barriers.  */
14866       if (insn)
14867 	{
14868 	  if (LABEL_P (insn)
14869 	      || (NOTE_P (insn)
14870 		  && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14871 	    /* Trailing label.  */
14872 	    fputs ("\tnop\n", file);
14873 	  else if (cfun && ! cfun->is_thunk)
14874 	    {
14875 	      /* See if we have a completely empty function body, skipping
14876 	         the special case of the picbase thunk emitted as asm.  */
14877 	      while (insn && ! INSN_P (insn))
14878 		insn = PREV_INSN (insn);
14879 	      /* If we don't find any insns, we've got an empty function body;
14880 		 I.e. completely empty - without a return or branch.  This is
14881 		 i.e. completely empty, without a return or branch.  This is
14882 		 because it contains an inline __builtin_unreachable().  GCC
14883 		 declares that reaching __builtin_unreachable() means UB so
14884 		 we're not obliged to do anything special; however, we want
14885 		 non-zero-sized function bodies.  To meet this, and help the
14886 		 user out, let's trap the case.  */
14887 	      if (insn == NULL)
14888 		fputs ("\tud2\n", file);
14889 	    }
14890 	}
14891       else if (deleted_debug_label)
14892 	for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14893 	  if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14894 	    CODE_LABEL_NUMBER (insn) = -1;
14895     }
14896 }
14897 
14898 /* Return a scratch register to use in the split stack prologue.  The
14899    split stack prologue is used for -fsplit-stack.  It is the first
14900    instructions in the function, even before the regular prologue.
14901    The scratch register can be any caller-saved register which is not
14902    used for parameters or for the static chain.  */
14903 
14904 static unsigned int
14905 split_stack_prologue_scratch_regno (void)
14906 {
14907   if (TARGET_64BIT)
14908     return R11_REG;
14909   else
14910     {
14911       bool is_fastcall, is_thiscall;
14912       int regparm;
14913 
14914       is_fastcall = (lookup_attribute ("fastcall",
14915 				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14916 		     != NULL);
14917       is_thiscall = (lookup_attribute ("thiscall",
14918 				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14919 		     != NULL);
14920       regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14921 
14922       if (is_fastcall)
14923 	{
14924 	  if (DECL_STATIC_CHAIN (cfun->decl))
14925 	    {
14926 	      sorry ("-fsplit-stack does not support fastcall with "
14927 		     "nested function");
14928 	      return INVALID_REGNUM;
14929 	    }
14930 	  return AX_REG;
14931 	}
14932       else if (is_thiscall)
14933         {
14934 	  if (!DECL_STATIC_CHAIN (cfun->decl))
14935 	    return DX_REG;
14936 	  return AX_REG;
14937 	}
14938       else if (regparm < 3)
14939 	{
14940 	  if (!DECL_STATIC_CHAIN (cfun->decl))
14941 	    return CX_REG;
14942 	  else
14943 	    {
14944 	      if (regparm >= 2)
14945 		{
14946 		  sorry ("-fsplit-stack does not support 2 register "
14947 			 "parameters for a nested function");
14948 		  return INVALID_REGNUM;
14949 		}
14950 	      return DX_REG;
14951 	    }
14952 	}
14953       else
14954 	{
14955 	  /* FIXME: We could make this work by pushing a register
14956 	     around the addition and comparison.  */
14957 	  sorry ("-fsplit-stack does not support 3 register parameters");
14958 	  return INVALID_REGNUM;
14959 	}
14960     }
14961 }
14962 
14963 /* A SYMBOL_REF for the function which allocates new stack space for
14964    -fsplit-stack.  */
14965 
14966 static GTY(()) rtx split_stack_fn;
14967 
14968 /* A SYMBOL_REF for the function which allocates new stack space when
14969    using the large model.  */
14970 
14971 static GTY(()) rtx split_stack_fn_large;
14972 
14973 /* Return location of the stack guard value in the TLS block.  */
14974 
14975 rtx
14976 ix86_split_stack_guard (void)
14977 {
14978   int offset;
14979   addr_space_t as = DEFAULT_TLS_SEG_REG;
14980   rtx r;
14981 
14982   gcc_assert (flag_split_stack);
14983 
14984 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14985   offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14986 #else
14987   gcc_unreachable ();
14988 #endif
14989 
14990   r = GEN_INT (offset);
14991   r = gen_const_mem (Pmode, r);
14992   set_mem_addr_space (r, as);
14993 
14994   return r;
14995 }
14996 
14997 /* Handle -fsplit-stack.  These are the first instructions in the
14998    function, even before the regular prologue.  */
14999 
15000 void
15001 ix86_expand_split_stack_prologue (void)
15002 {
15003   HOST_WIDE_INT allocate;
15004   unsigned HOST_WIDE_INT args_size;
15005   rtx_code_label *label;
15006   rtx limit, current, allocate_rtx, call_insn, call_fusage;
15007   rtx scratch_reg = NULL_RTX;
15008   rtx_code_label *varargs_label = NULL;
15009   rtx fn;
15010 
15011   gcc_assert (flag_split_stack && reload_completed);
15012 
15013   ix86_finalize_stack_frame_flags ();
15014   struct ix86_frame &frame = cfun->machine->frame;
15015   allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15016 
15017   /* This is the label we will branch to if we have enough stack
15018      space.  We expect the basic block reordering pass to reverse this
15019      branch if optimizing, so that we branch in the unlikely case.  */
15020   label = gen_label_rtx ();
15021 
15022   /* We need to compare the stack pointer minus the frame size with
15023      the stack boundary in the TCB.  The stack boundary always gives
15024      us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15025      can compare directly.  Otherwise we need to do an addition.  */
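  /* An illustrative sketch of the common case (64-bit, small frame), where
     <guard> is the TLS slot returned by ix86_split_stack_guard:

	 cmp	<guard>, %rsp
	 jae	.Lenough
	 <pass allocation and argument sizes in %r10/%r11>
	 call	__morestack
	 ret
     .Lenough:  */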
15026 
15027   limit = ix86_split_stack_guard ();
15028 
15029   if (allocate < SPLIT_STACK_AVAILABLE)
15030     current = stack_pointer_rtx;
15031   else
15032     {
15033       unsigned int scratch_regno;
15034       rtx offset;
15035 
15036       /* We need a scratch register to hold the stack pointer minus
15037 	 the required frame size.  Since this is the very start of the
15038 	 function, the scratch register can be any caller-saved
15039 	 register which is not used for parameters.  */
15040       offset = GEN_INT (- allocate);
15041       scratch_regno = split_stack_prologue_scratch_regno ();
15042       if (scratch_regno == INVALID_REGNUM)
15043 	return;
15044       scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15045       if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15046 	{
15047 	  /* We don't use ix86_gen_add3 in this case because it will
15048 	     want to split to lea, but when not optimizing the insn
15049 	     will not be split after this point.  */
15050 	  emit_insn (gen_rtx_SET (scratch_reg,
15051 				  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15052 						offset)));
15053 	}
15054       else
15055 	{
15056 	  emit_move_insn (scratch_reg, offset);
15057 	  emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15058 				    stack_pointer_rtx));
15059 	}
15060       current = scratch_reg;
15061     }
15062 
15063   ix86_expand_branch (GEU, current, limit, label);
15064   rtx_insn *jump_insn = get_last_insn ();
15065   JUMP_LABEL (jump_insn) = label;
15066 
15067   /* Mark the jump as very likely to be taken.  */
15068   add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15069 
15070   if (split_stack_fn == NULL_RTX)
15071     {
15072       split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15073       SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15074     }
15075   fn = split_stack_fn;
15076 
15077   /* Get more stack space.  We pass in the desired stack space and the
15078      size of the arguments to copy to the new stack.  In 32-bit mode
15079      we push the parameters; __morestack will return on a new stack
15080      anyhow.  In 64-bit mode we pass the parameters in r10 and
15081      r11.  */
15082   allocate_rtx = GEN_INT (allocate);
15083   args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
15084   call_fusage = NULL_RTX;
15085   rtx pop = NULL_RTX;
15086   if (TARGET_64BIT)
15087     {
15088       rtx reg10, reg11;
15089 
15090       reg10 = gen_rtx_REG (Pmode, R10_REG);
15091       reg11 = gen_rtx_REG (Pmode, R11_REG);
15092 
15093       /* If this function uses a static chain, it will be in %r10.
15094 	 Preserve it across the call to __morestack.  */
15095       if (DECL_STATIC_CHAIN (cfun->decl))
15096 	{
15097 	  rtx rax;
15098 
15099 	  rax = gen_rtx_REG (word_mode, AX_REG);
15100 	  emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15101 	  use_reg (&call_fusage, rax);
15102 	}
15103 
15104       if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15105           && !TARGET_PECOFF)
15106 	{
15107 	  HOST_WIDE_INT argval;
15108 
15109 	  gcc_assert (Pmode == DImode);
15110 	  /* When using the large model we need to load the address
15111 	     into a register, and we've run out of registers.  So we
15112 	     switch to a different calling convention, and we call a
15113 	     different function: __morestack_large.  We pass the
15114 	     argument size in the upper 32 bits of r10 and pass the
15115 	     frame size in the lower 32 bits.  */
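	  /* For example, with purely illustrative values allocate == 0x1000
	     and args_size == 0x20, argval == 0x0000002000001000: the
	     argument size ends up in the upper half of r10 and the frame
	     size in the lower half.  */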
15116 	  gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15117 	  gcc_assert ((args_size & 0xffffffff) == args_size);
15118 
15119 	  if (split_stack_fn_large == NULL_RTX)
15120 	    {
15121 	      split_stack_fn_large =
15122 	        gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15123 	      SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15124 	    }
15125 	  if (ix86_cmodel == CM_LARGE_PIC)
15126 	    {
15127 	      rtx_code_label *label;
15128 	      rtx x;
15129 
15130 	      label = gen_label_rtx ();
15131 	      emit_label (label);
15132 	      LABEL_PRESERVE_P (label) = 1;
15133 	      emit_insn (gen_set_rip_rex64 (reg10, label));
15134 	      emit_insn (gen_set_got_offset_rex64 (reg11, label));
15135 	      emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15136 	      x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15137 				  UNSPEC_GOT);
15138 	      x = gen_rtx_CONST (Pmode, x);
15139 	      emit_move_insn (reg11, x);
15140 	      x = gen_rtx_PLUS (Pmode, reg10, reg11);
15141 	      x = gen_const_mem (Pmode, x);
15142 	      emit_move_insn (reg11, x);
15143 	    }
15144 	  else
15145 	    emit_move_insn (reg11, split_stack_fn_large);
15146 
15147 	  fn = reg11;
15148 
15149 	  argval = ((args_size << 16) << 16) + allocate;
15150 	  emit_move_insn (reg10, GEN_INT (argval));
15151 	}
15152       else
15153 	{
15154 	  emit_move_insn (reg10, allocate_rtx);
15155 	  emit_move_insn (reg11, GEN_INT (args_size));
15156 	  use_reg (&call_fusage, reg11);
15157 	}
15158 
15159       use_reg (&call_fusage, reg10);
15160     }
15161   else
15162     {
15163       rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15164       add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15165       insn = emit_insn (gen_push (allocate_rtx));
15166       add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15167       pop = GEN_INT (2 * UNITS_PER_WORD);
15168     }
15169   call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15170 				GEN_INT (UNITS_PER_WORD), constm1_rtx,
15171 				pop, false);
15172   add_function_usage_to (call_insn, call_fusage);
15173   if (!TARGET_64BIT)
15174     add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15175   /* Indicate that this function can't jump to non-local gotos.  */
15176   make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15177 
15178   /* In order to make call/return prediction work right, we now need
15179      to execute a return instruction.  See
15180      libgcc/config/i386/morestack.S for the details on how this works.
15181 
15182      For flow purposes gcc must not see this as a return
15183      instruction--we need control flow to continue at the subsequent
15184      label.  Therefore, we use an unspec.  */
15185   gcc_assert (crtl->args.pops_args < 65536);
15186   rtx_insn *ret_insn
15187     = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15188 
15189   if ((flag_cf_protection & CF_BRANCH))
15190     {
15191       /* Insert ENDBR since __morestack will jump back here via indirect
15192 	 call.  */
15193       rtx cet_eb = gen_nop_endbr ();
15194       emit_insn_after (cet_eb, ret_insn);
15195     }
15196 
15197   /* If we are in 64-bit mode and this function uses a static chain,
15198      we saved %r10 in %rax before calling __morestack.  */
15199   if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15200     emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15201 		    gen_rtx_REG (word_mode, AX_REG));
15202 
15203   /* If this function calls va_start, we need to store a pointer to
15204      the arguments on the old stack, because they may not have been
15205      all copied to the new stack.  At this point the old stack can be
15206      found at the frame pointer value used by __morestack, because
15207      __morestack has set that up before calling back to us.  Here we
15208      store that pointer in a scratch register, and in
15209      ix86_expand_prologue we store the scratch register in a stack
15210      slot.  */
15211   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15212     {
15213       unsigned int scratch_regno;
15214       rtx frame_reg;
15215       int words;
15216 
15217       scratch_regno = split_stack_prologue_scratch_regno ();
15218       scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15219       frame_reg = gen_rtx_REG (Pmode, BP_REG);
15220 
15221       /* 64-bit:
15222 	 fp -> old fp value
15223 	       return address within this function
15224 	       return address of caller of this function
15225 	       stack arguments
15226 	 So we add three words to get to the stack arguments.
15227 
15228 	 32-bit:
15229 	 fp -> old fp value
15230 	       return address within this function
15231                first argument to __morestack
15232                second argument to __morestack
15233                return address of caller of this function
15234                stack arguments
15235          So we add five words to get to the stack arguments.
15236       */
15237       words = TARGET_64BIT ? 3 : 5;
15238       emit_insn (gen_rtx_SET (scratch_reg,
15239 			      gen_rtx_PLUS (Pmode, frame_reg,
15240 					    GEN_INT (words * UNITS_PER_WORD))));
15241 
15242       varargs_label = gen_label_rtx ();
15243       emit_jump_insn (gen_jump (varargs_label));
15244       JUMP_LABEL (get_last_insn ()) = varargs_label;
15245 
15246       emit_barrier ();
15247     }
15248 
15249   emit_label (label);
15250   LABEL_NUSES (label) = 1;
15251 
15252   /* If this function calls va_start, we now have to set the scratch
15253      register for the case where we do not call __morestack.  In this
15254      case we need to set it based on the stack pointer.  */
15255   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15256     {
15257       emit_insn (gen_rtx_SET (scratch_reg,
15258 			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15259 					    GEN_INT (UNITS_PER_WORD))));
15260 
15261       emit_label (varargs_label);
15262       LABEL_NUSES (varargs_label) = 1;
15263     }
15264 }
15265 
15266 /* We may have to tell the dataflow pass that the split stack prologue
15267    is initializing a scratch register.  */
15268 
15269 static void
15270 ix86_live_on_entry (bitmap regs)
15271 {
15272   if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15273     {
15274       gcc_assert (flag_split_stack);
15275       bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15276     }
15277 }
15278 
15279 /* Extract the parts of an RTL expression that is a valid memory address
15280    for an instruction.  Return 0 if the structure of the address is
15281    grossly off.  Return -1 if the address contains ASHIFT, so it is not
15282    strictly valid, but still used for computing the length of a lea instruction.  */
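/* For example (an illustrative case): the address
     (plus:DI (plus:DI (mult:DI (reg:DI i) (const_int 4)) (reg:DI b))
	      (const_int 16))
   decomposes into base = (reg b), index = (reg i), scale = 4,
   disp = (const_int 16) and seg = ADDR_SPACE_GENERIC, and the return
   value is 1.  */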
15283 
15284 int
15285 ix86_decompose_address (rtx addr, struct ix86_address *out)
15286 {
15287   rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15288   rtx base_reg, index_reg;
15289   HOST_WIDE_INT scale = 1;
15290   rtx scale_rtx = NULL_RTX;
15291   rtx tmp;
15292   int retval = 1;
15293   addr_space_t seg = ADDR_SPACE_GENERIC;
15294 
15295   /* Allow zero-extended SImode addresses,
15296      they will be emitted with addr32 prefix.  */
15297   if (TARGET_64BIT && GET_MODE (addr) == DImode)
15298     {
15299       if (GET_CODE (addr) == ZERO_EXTEND
15300 	  && GET_MODE (XEXP (addr, 0)) == SImode)
15301 	{
15302 	  addr = XEXP (addr, 0);
15303 	  if (CONST_INT_P (addr))
15304 	    return 0;
15305 	}
15306       else if (GET_CODE (addr) == AND
15307 	       && const_32bit_mask (XEXP (addr, 1), DImode))
15308 	{
15309 	  addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15310 	  if (addr == NULL_RTX)
15311 	    return 0;
15312 
15313 	  if (CONST_INT_P (addr))
15314 	    return 0;
15315 	}
15316     }
15317 
15318   /* Allow SImode subregs of DImode addresses,
15319      they will be emitted with addr32 prefix.  */
15320   if (TARGET_64BIT && GET_MODE (addr) == SImode)
15321     {
15322       if (SUBREG_P (addr)
15323 	  && GET_MODE (SUBREG_REG (addr)) == DImode)
15324 	{
15325 	  addr = SUBREG_REG (addr);
15326 	  if (CONST_INT_P (addr))
15327 	    return 0;
15328 	}
15329     }
15330 
15331   if (REG_P (addr))
15332     base = addr;
15333   else if (SUBREG_P (addr))
15334     {
15335       if (REG_P (SUBREG_REG (addr)))
15336 	base = addr;
15337       else
15338 	return 0;
15339     }
15340   else if (GET_CODE (addr) == PLUS)
15341     {
15342       rtx addends[4], op;
15343       int n = 0, i;
15344 
15345       op = addr;
15346       do
15347 	{
15348 	  if (n >= 4)
15349 	    return 0;
15350 	  addends[n++] = XEXP (op, 1);
15351 	  op = XEXP (op, 0);
15352 	}
15353       while (GET_CODE (op) == PLUS);
15354       if (n >= 4)
15355 	return 0;
15356       addends[n] = op;
15357 
15358       for (i = n; i >= 0; --i)
15359 	{
15360 	  op = addends[i];
15361 	  switch (GET_CODE (op))
15362 	    {
15363 	    case MULT:
15364 	      if (index)
15365 		return 0;
15366 	      index = XEXP (op, 0);
15367 	      scale_rtx = XEXP (op, 1);
15368 	      break;
15369 
15370 	    case ASHIFT:
15371 	      if (index)
15372 		return 0;
15373 	      index = XEXP (op, 0);
15374 	      tmp = XEXP (op, 1);
15375 	      if (!CONST_INT_P (tmp))
15376 		return 0;
15377 	      scale = INTVAL (tmp);
15378 	      if ((unsigned HOST_WIDE_INT) scale > 3)
15379 		return 0;
15380 	      scale = 1 << scale;
15381 	      break;
15382 
15383 	    case ZERO_EXTEND:
15384 	      op = XEXP (op, 0);
15385 	      if (GET_CODE (op) != UNSPEC)
15386 		return 0;
15387 	      /* FALLTHRU */
15388 
15389 	    case UNSPEC:
15390 	      if (XINT (op, 1) == UNSPEC_TP
15391 	          && TARGET_TLS_DIRECT_SEG_REFS
15392 	          && seg == ADDR_SPACE_GENERIC)
15393 		seg = DEFAULT_TLS_SEG_REG;
15394 	      else
15395 		return 0;
15396 	      break;
15397 
15398 	    case SUBREG:
15399 	      if (!REG_P (SUBREG_REG (op)))
15400 		return 0;
15401 	      /* FALLTHRU */
15402 
15403 	    case REG:
15404 	      if (!base)
15405 		base = op;
15406 	      else if (!index)
15407 		index = op;
15408 	      else
15409 		return 0;
15410 	      break;
15411 
15412 	    case CONST:
15413 	    case CONST_INT:
15414 	    case SYMBOL_REF:
15415 	    case LABEL_REF:
15416 	      if (disp)
15417 		return 0;
15418 	      disp = op;
15419 	      break;
15420 
15421 	    default:
15422 	      return 0;
15423 	    }
15424 	}
15425     }
15426   else if (GET_CODE (addr) == MULT)
15427     {
15428       index = XEXP (addr, 0);		/* index*scale */
15429       scale_rtx = XEXP (addr, 1);
15430     }
15431   else if (GET_CODE (addr) == ASHIFT)
15432     {
15433       /* We're called for lea too, which implements ashift on occasion.  */
15434       index = XEXP (addr, 0);
15435       tmp = XEXP (addr, 1);
15436       if (!CONST_INT_P (tmp))
15437 	return 0;
15438       scale = INTVAL (tmp);
15439       if ((unsigned HOST_WIDE_INT) scale > 3)
15440 	return 0;
15441       scale = 1 << scale;
15442       retval = -1;
15443     }
15444   else
15445     disp = addr;			/* displacement */
15446 
15447   if (index)
15448     {
15449       if (REG_P (index))
15450 	;
15451       else if (SUBREG_P (index)
15452 	       && REG_P (SUBREG_REG (index)))
15453 	;
15454       else
15455 	return 0;
15456     }
15457 
15458   /* Extract the integral value of scale.  */
15459   if (scale_rtx)
15460     {
15461       if (!CONST_INT_P (scale_rtx))
15462 	return 0;
15463       scale = INTVAL (scale_rtx);
15464     }
15465 
15466   base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15467   index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15468 
15469   /* Avoid useless 0 displacement.  */
15470   if (disp == const0_rtx && (base || index))
15471     disp = NULL_RTX;
15472 
15473   /* Allow arg pointer and stack pointer as index if there is no scaling.  */
15474   if (base_reg && index_reg && scale == 1
15475       && (REGNO (index_reg) == ARG_POINTER_REGNUM
15476 	  || REGNO (index_reg) == FRAME_POINTER_REGNUM
15477 	  || REGNO (index_reg) == SP_REG))
15478     {
15479       std::swap (base, index);
15480       std::swap (base_reg, index_reg);
15481     }
15482 
15483   /* Special case: %ebp cannot be encoded as a base without a displacement.
15484      Similarly %r13.  */
15485   if (!disp && base_reg
15486       && (REGNO (base_reg) == ARG_POINTER_REGNUM
15487 	  || REGNO (base_reg) == FRAME_POINTER_REGNUM
15488 	  || REGNO (base_reg) == BP_REG
15489 	  || REGNO (base_reg) == R13_REG))
15490     disp = const0_rtx;
15491 
15492   /* Special case: on K6, [%esi] makes the instruction vector decoded.
15493      Avoid this by transforming to [%esi+0].
15494      Reload calls address legitimization without cfun defined, so we need
15495      to test cfun for being non-NULL. */
15496   if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15497       && base_reg && !index_reg && !disp
15498       && REGNO (base_reg) == SI_REG)
15499     disp = const0_rtx;
15500 
15501   /* Special case: encode reg+reg instead of reg*2.  */
15502   if (!base && index && scale == 2)
15503     base = index, base_reg = index_reg, scale = 1;
15504 
15505   /* Special case: scaling cannot be encoded without base or displacement.  */
15506   if (!base && !disp && index && scale != 1)
15507     disp = const0_rtx;
15508 
15509   out->base = base;
15510   out->index = index;
15511   out->disp = disp;
15512   out->scale = scale;
15513   out->seg = seg;
15514 
15515   return retval;
15516 }
15517 
15518 /* Return cost of the memory address x.
15519    For i386, it is better to use a complex address than let gcc copy
15520    the address into a reg and make a new pseudo.  But not if the address
15521    requires two regs - that would mean more pseudos with longer
15522    lifetimes.  */
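/* For instance, before register allocation (base and index still pseudos),
   an address of the form base + index*scale + disp is given cost 3 below,
   while a bare constant or symbolic displacement is given cost 1.  */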
15523 static int
15524 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15525 {
15526   struct ix86_address parts;
15527   int cost = 1;
15528   int ok = ix86_decompose_address (x, &parts);
15529 
15530   gcc_assert (ok);
15531 
15532   if (parts.base && SUBREG_P (parts.base))
15533     parts.base = SUBREG_REG (parts.base);
15534   if (parts.index && SUBREG_P (parts.index))
15535     parts.index = SUBREG_REG (parts.index);
15536 
15537   /* Attempt to minimize number of registers in the address by increasing
15538      address cost for each used register.  We don't increase address cost
15539      for "pic_offset_table_rtx".  When a memory operand using "pic_offset_table_rtx"
15540      is not invariant itself, it most likely means that the base or index is not
15541      invariant.  Therefore only "pic_offset_table_rtx" could be hoisted out,
15542      which is not profitable for x86.  */
15543   if (parts.base
15544       && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15545       && (current_pass->type == GIMPLE_PASS
15546 	  || !pic_offset_table_rtx
15547 	  || !REG_P (parts.base)
15548 	  || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15549     cost++;
15550 
15551   if (parts.index
15552       && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15553       && (current_pass->type == GIMPLE_PASS
15554 	  || !pic_offset_table_rtx
15555 	  || !REG_P (parts.index)
15556 	  || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15557     cost++;
15558 
15559   /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15560      since its predecode logic can't detect the length of instructions
15561      and they degenerate to vector decoding.  Increase the cost of such
15562      addresses here.  The penalty is minimally 2 cycles.  It may be worthwhile
15563      to split such addresses or even refuse such addresses at all.
15564 
15565      Following addressing modes are affected:
15566       [base+scale*index]
15567       [scale*index+disp]
15568       [base+index]
15569 
15570      The first and last cases may be avoidable by explicitly coding the zero in
15571      the memory address, but I don't have an AMD K6 machine handy to check this
15572      theory.  */
15573 
15574   if (TARGET_K6
15575       && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15576 	  || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15577 	  || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15578     cost += 10;
15579 
15580   return cost;
15581 }
15582 
15583 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15584    this is used to form addresses to local data when -fPIC is in
15585    use.  */
15586 
15587 static bool
15588 darwin_local_data_pic (rtx disp)
15589 {
15590   return (GET_CODE (disp) == UNSPEC
15591 	  && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15592 }
15593 
15594 /* True if operand X should be loaded from the GOT.  */
15595 
15596 bool
15597 ix86_force_load_from_GOT_p (rtx x)
15598 {
15599   return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15600 	  && !TARGET_PECOFF && !TARGET_MACHO
15601 	  && !flag_plt && !flag_pic
15602 	  && ix86_cmodel != CM_LARGE
15603 	  && GET_CODE (x) == SYMBOL_REF
15604 	  && SYMBOL_REF_FUNCTION_P (x)
15605 	  && !SYMBOL_REF_LOCAL_P (x));
15606 }
15607 
15608 /* Determine if a given RTX is a valid constant.  We already know this
15609    satisfies CONSTANT_P.  */
15610 
15611 static bool
15612 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15613 {
15614   /* Pointer bounds constants are not valid.  */
15615   if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15616     return false;
15617 
15618   switch (GET_CODE (x))
15619     {
15620     case CONST:
15621       x = XEXP (x, 0);
15622 
15623       if (GET_CODE (x) == PLUS)
15624 	{
15625 	  if (!CONST_INT_P (XEXP (x, 1)))
15626 	    return false;
15627 	  x = XEXP (x, 0);
15628 	}
15629 
15630       if (TARGET_MACHO && darwin_local_data_pic (x))
15631 	return true;
15632 
15633       /* Only some unspecs are valid as "constants".  */
15634       if (GET_CODE (x) == UNSPEC)
15635 	switch (XINT (x, 1))
15636 	  {
15637 	  case UNSPEC_GOT:
15638 	  case UNSPEC_GOTOFF:
15639 	  case UNSPEC_PLTOFF:
15640 	    return TARGET_64BIT;
15641 	  case UNSPEC_TPOFF:
15642 	  case UNSPEC_NTPOFF:
15643 	    x = XVECEXP (x, 0, 0);
15644 	    return (GET_CODE (x) == SYMBOL_REF
15645 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15646 	  case UNSPEC_DTPOFF:
15647 	    x = XVECEXP (x, 0, 0);
15648 	    return (GET_CODE (x) == SYMBOL_REF
15649 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15650 	  default:
15651 	    return false;
15652 	  }
15653 
15654       /* We must have drilled down to a symbol.  */
15655       if (GET_CODE (x) == LABEL_REF)
15656 	return true;
15657       if (GET_CODE (x) != SYMBOL_REF)
15658 	return false;
15659       /* FALLTHRU */
15660 
15661     case SYMBOL_REF:
15662       /* TLS symbols are never valid.  */
15663       if (SYMBOL_REF_TLS_MODEL (x))
15664 	return false;
15665 
15666       /* DLLIMPORT symbols are never valid.  */
15667       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15668 	  && SYMBOL_REF_DLLIMPORT_P (x))
15669 	return false;
15670 
15671 #if TARGET_MACHO
15672       /* mdynamic-no-pic */
15673       if (MACHO_DYNAMIC_NO_PIC_P)
15674 	return machopic_symbol_defined_p (x);
15675 #endif
15676 
15677       /* External function address should be loaded
15678 	 via the GOT slot to avoid PLT.  */
15679       if (ix86_force_load_from_GOT_p (x))
15680 	return false;
15681 
15682       break;
15683 
15684     CASE_CONST_SCALAR_INT:
15685       switch (mode)
15686 	{
15687 	case E_TImode:
15688 	  if (TARGET_64BIT)
15689 	    return true;
15690 	  /* FALLTHRU */
15691 	case E_OImode:
15692 	case E_XImode:
15693 	  if (!standard_sse_constant_p (x, mode))
15694 	    return false;
15695 	default:
15696 	  break;
15697 	}
15698       break;
15699 
15700     case CONST_VECTOR:
15701       if (!standard_sse_constant_p (x, mode))
15702 	return false;
15703 
15704     default:
15705       break;
15706     }
15707 
15708   /* Otherwise we handle everything else in the move patterns.  */
15709   return true;
15710 }
15711 
15712 /* Determine if it's legal to put X into the constant pool.  This
15713    is not possible for the address of thread-local symbols, which
15714    is checked above.  */
15715 
15716 static bool
15717 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15718 {
15719   /* We can put any immediate constant in memory.  */
15720   switch (GET_CODE (x))
15721     {
15722     CASE_CONST_ANY:
15723       return false;
15724 
15725     default:
15726       break;
15727     }
15728 
15729   return !ix86_legitimate_constant_p (mode, x);
15730 }
15731 
15732 /* Nonzero if the symbol is marked as dllimport or as a stub variable,
15733    otherwise zero.  */
15734 
15735 static bool
15736 is_imported_p (rtx x)
15737 {
15738   if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15739       || GET_CODE (x) != SYMBOL_REF)
15740     return false;
15741 
15742   return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15743 }
15744 
15745 
15746 /* Nonzero if the constant value X is a legitimate general operand
15747    when generating PIC code.  It is given that flag_pic is on and
15748    that X satisfies CONSTANT_P.  */
15749 
15750 bool
15751 legitimate_pic_operand_p (rtx x)
15752 {
15753   rtx inner;
15754 
15755   switch (GET_CODE (x))
15756     {
15757     case CONST:
15758       inner = XEXP (x, 0);
15759       if (GET_CODE (inner) == PLUS
15760 	  && CONST_INT_P (XEXP (inner, 1)))
15761 	inner = XEXP (inner, 0);
15762 
15763       /* Only some unspecs are valid as "constants".  */
15764       if (GET_CODE (inner) == UNSPEC)
15765 	switch (XINT (inner, 1))
15766 	  {
15767 	  case UNSPEC_GOT:
15768 	  case UNSPEC_GOTOFF:
15769 	  case UNSPEC_PLTOFF:
15770 	    return TARGET_64BIT;
15771 	  case UNSPEC_TPOFF:
15772 	    x = XVECEXP (inner, 0, 0);
15773 	    return (GET_CODE (x) == SYMBOL_REF
15774 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15775 	  case UNSPEC_MACHOPIC_OFFSET:
15776 	    return legitimate_pic_address_disp_p (x);
15777 	  default:
15778 	    return false;
15779 	  }
15780       /* FALLTHRU */
15781 
15782     case SYMBOL_REF:
15783     case LABEL_REF:
15784       return legitimate_pic_address_disp_p (x);
15785 
15786     default:
15787       return true;
15788     }
15789 }
15790 
15791 /* Determine if a given CONST RTX is a valid memory displacement
15792    in PIC mode.  */
15793 
15794 bool
15795 legitimate_pic_address_disp_p (rtx disp)
15796 {
15797   bool saw_plus;
15798 
15799   /* In 64bit mode we can allow direct addresses of symbols and labels
15800      when they are not dynamic symbols.  */
15801   if (TARGET_64BIT)
15802     {
15803       rtx op0 = disp, op1;
15804 
15805       switch (GET_CODE (disp))
15806 	{
15807 	case LABEL_REF:
15808 	  return true;
15809 
15810 	case CONST:
15811 	  if (GET_CODE (XEXP (disp, 0)) != PLUS)
15812 	    break;
15813 	  op0 = XEXP (XEXP (disp, 0), 0);
15814 	  op1 = XEXP (XEXP (disp, 0), 1);
15815 	  if (!CONST_INT_P (op1))
15816 	    break;
15817 	  if (GET_CODE (op0) == UNSPEC
15818 	      && (XINT (op0, 1) == UNSPEC_DTPOFF
15819 		  || XINT (op0, 1) == UNSPEC_NTPOFF)
15820 	      && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15821 	    return true;
15822 	  if (INTVAL (op1) >= 16*1024*1024
15823 	      || INTVAL (op1) < -16*1024*1024)
15824 	    break;
15825 	  if (GET_CODE (op0) == LABEL_REF)
15826 	    return true;
15827 	  if (GET_CODE (op0) == CONST
15828 	      && GET_CODE (XEXP (op0, 0)) == UNSPEC
15829 	      && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15830 	    return true;
15831 	  if (GET_CODE (op0) == UNSPEC
15832 	      && XINT (op0, 1) == UNSPEC_PCREL)
15833 	    return true;
15834 	  if (GET_CODE (op0) != SYMBOL_REF)
15835 	    break;
15836 	  /* FALLTHRU */
15837 
15838 	case SYMBOL_REF:
15839 	  /* TLS references should always be enclosed in UNSPEC.
15840 	     A dllimported symbol always needs to be resolved.  */
15841 	  if (SYMBOL_REF_TLS_MODEL (op0)
15842 	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15843 	    return false;
15844 
15845 	  if (TARGET_PECOFF)
15846 	    {
15847 	      if (is_imported_p (op0))
15848 		return true;
15849 
15850 	      if (SYMBOL_REF_FAR_ADDR_P (op0)
15851 		  || !SYMBOL_REF_LOCAL_P (op0))
15852 		break;
15853 
15854 	      /* Function symbols need to be resolved only for the
15855 	         large model.
15856 	         For the small model we don't need to resolve anything
15857 	         here.  */
15858 	      if ((ix86_cmodel != CM_LARGE_PIC
15859 	           && SYMBOL_REF_FUNCTION_P (op0))
15860 		  || ix86_cmodel == CM_SMALL_PIC)
15861 		return true;
15862 	      /* Non-external symbols don't need to be resolved for the
15863 	         large and medium models.  */
15864 	      if ((ix86_cmodel == CM_LARGE_PIC
15865 		   || ix86_cmodel == CM_MEDIUM_PIC)
15866 		  && !SYMBOL_REF_EXTERNAL_P (op0))
15867 		return true;
15868 	    }
15869 	  else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15870 		   && (SYMBOL_REF_LOCAL_P (op0)
15871 		       || (HAVE_LD_PIE_COPYRELOC
15872 			   && flag_pie
15873 			   && !SYMBOL_REF_WEAK (op0)
15874 			   && !SYMBOL_REF_FUNCTION_P (op0)))
15875 		   && ix86_cmodel != CM_LARGE_PIC)
15876 	    return true;
15877 	  break;
15878 
15879 	default:
15880 	  break;
15881 	}
15882     }
15883   if (GET_CODE (disp) != CONST)
15884     return false;
15885   disp = XEXP (disp, 0);
15886 
15887   if (TARGET_64BIT)
15888     {
15889       /* It is unsafe to allow PLUS expressions; this limits the allowed
15890          distance of GOT references.  We should not need them anyway.  */
15891       if (GET_CODE (disp) != UNSPEC
15892 	  || (XINT (disp, 1) != UNSPEC_GOTPCREL
15893 	      && XINT (disp, 1) != UNSPEC_GOTOFF
15894 	      && XINT (disp, 1) != UNSPEC_PCREL
15895 	      && XINT (disp, 1) != UNSPEC_PLTOFF))
15896 	return false;
15897 
15898       if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15899 	  && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15900 	return false;
15901       return true;
15902     }
15903 
15904   saw_plus = false;
15905   if (GET_CODE (disp) == PLUS)
15906     {
15907       if (!CONST_INT_P (XEXP (disp, 1)))
15908 	return false;
15909       disp = XEXP (disp, 0);
15910       saw_plus = true;
15911     }
15912 
15913   if (TARGET_MACHO && darwin_local_data_pic (disp))
15914     return true;
15915 
15916   if (GET_CODE (disp) != UNSPEC)
15917     return false;
15918 
15919   switch (XINT (disp, 1))
15920     {
15921     case UNSPEC_GOT:
15922       if (saw_plus)
15923 	return false;
15924       /* We need to check for both symbols and labels because VxWorks loads
15925 	 text labels with @GOT rather than @GOTOFF.  See gotoff_operand for
15926 	 details.  */
15927       return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15928 	      || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15929     case UNSPEC_GOTOFF:
15930       /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15931 	 While the ABI also specifies a 32bit relocation, we don't produce it
15932 	 in the small PIC model at all.  */
15933       if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15934 	   || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15935 	  && !TARGET_64BIT)
15936         return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15937       return false;
15938     case UNSPEC_GOTTPOFF:
15939     case UNSPEC_GOTNTPOFF:
15940     case UNSPEC_INDNTPOFF:
15941       if (saw_plus)
15942 	return false;
15943       disp = XVECEXP (disp, 0, 0);
15944       return (GET_CODE (disp) == SYMBOL_REF
15945 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15946     case UNSPEC_NTPOFF:
15947       disp = XVECEXP (disp, 0, 0);
15948       return (GET_CODE (disp) == SYMBOL_REF
15949 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15950     case UNSPEC_DTPOFF:
15951       disp = XVECEXP (disp, 0, 0);
15952       return (GET_CODE (disp) == SYMBOL_REF
15953 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15954     }
15955 
15956   return false;
15957 }
15958 
15959 /* Determine if OP is a suitable RTX for an address register.
15960    Return the naked register if a register or a register subreg is
15961    found, otherwise return NULL_RTX.  */
15962 
15963 static rtx
15964 ix86_validate_address_register (rtx op)
15965 {
15966   machine_mode mode = GET_MODE (op);
15967 
15968   /* Only SImode or DImode registers can form the address.  */
15969   if (mode != SImode && mode != DImode)
15970     return NULL_RTX;
15971 
15972   if (REG_P (op))
15973     return op;
15974   else if (SUBREG_P (op))
15975     {
15976       rtx reg = SUBREG_REG (op);
15977 
15978       if (!REG_P (reg))
15979 	return NULL_RTX;
15980 
15981       mode = GET_MODE (reg);
15982 
15983       /* Don't allow SUBREGs that span more than a word.  It can
15984 	 lead to spill failures when the register is one word out
15985 	 of a two word structure.  */
15986       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15987 	return NULL_RTX;
15988 
15989       /* Allow only SUBREGs of non-eliminable hard registers.  */
15990       if (register_no_elim_operand (reg, mode))
15991 	return reg;
15992     }
15993 
15994   /* Op is not a register.  */
15995   return NULL_RTX;
15996 }
15997 
15998 /* Recognizes RTL expressions that are valid memory addresses for an
15999    instruction.  The MODE argument is the machine mode for the MEM
16000    expression that wants to use this address.
16001 
16002    It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
16003    convert common non-canonical forms to canonical form so that they will
16004    be recognized.  */
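/* As an illustration, a canonical x86 address has the shape

     base + index * scale + disp

   which as RTL looks roughly like
     (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 8))
   for an operand such as 8(%base,%index,4); scale is restricted to
   1, 2, 4 or 8, and an optional %fs/%gs segment override is carried
   in the address space rather than in the address itself.  */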
16005 
16006 static bool
16007 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16008 {
16009   struct ix86_address parts;
16010   rtx base, index, disp;
16011   HOST_WIDE_INT scale;
16012   addr_space_t seg;
16013 
16014   if (ix86_decompose_address (addr, &parts) <= 0)
16015     /* Decomposition failed.  */
16016     return false;
16017 
16018   base = parts.base;
16019   index = parts.index;
16020   disp = parts.disp;
16021   scale = parts.scale;
16022   seg = parts.seg;
16023 
16024   /* Validate base register.  */
16025   if (base)
16026     {
16027       rtx reg = ix86_validate_address_register (base);
16028 
16029       if (reg == NULL_RTX)
16030 	return false;
16031 
16032       if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16033 	  || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16034 	/* Base is not valid.  */
16035 	return false;
16036     }
16037 
16038   /* Validate index register.  */
16039   if (index)
16040     {
16041       rtx reg = ix86_validate_address_register (index);
16042 
16043       if (reg == NULL_RTX)
16044 	return false;
16045 
16046       if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16047 	  || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16048 	/* Index is not valid.  */
16049 	return false;
16050     }
16051 
16052   /* Index and base should have the same mode.  */
16053   if (base && index
16054       && GET_MODE (base) != GET_MODE (index))
16055     return false;
16056 
16057   /* Address override works only on the (%reg) part of %fs:(%reg).  */
16058   if (seg != ADDR_SPACE_GENERIC
16059       && ((base && GET_MODE (base) != word_mode)
16060 	  || (index && GET_MODE (index) != word_mode)))
16061     return false;
16062 
16063   /* Validate scale factor.  */
16064   if (scale != 1)
16065     {
16066       if (!index)
16067 	/* Scale without index.  */
16068 	return false;
16069 
16070       if (scale != 2 && scale != 4 && scale != 8)
16071 	/* Scale is not a valid multiplier.  */
16072 	return false;
16073     }
16074 
16075   /* Validate displacement.  */
16076   if (disp)
16077     {
16078       if (GET_CODE (disp) == CONST
16079 	  && GET_CODE (XEXP (disp, 0)) == UNSPEC
16080 	  && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16081 	switch (XINT (XEXP (disp, 0), 1))
16082 	  {
16083 	  /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
16084 	     when used.  While the ABI also specifies 32bit relocations, we
16085 	     don't produce them at all and use IP-relative addressing instead.
16086 	     Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16087 	     should be loaded via the GOT.  */
16088 	  case UNSPEC_GOT:
16089 	    if (!TARGET_64BIT
16090 		&& ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16091 	      goto is_legitimate_pic;
16092 	    /* FALLTHRU */
16093 	  case UNSPEC_GOTOFF:
16094 	    gcc_assert (flag_pic);
16095 	    if (!TARGET_64BIT)
16096 	      goto is_legitimate_pic;
16097 
16098 	    /* 64bit address unspec.  */
16099 	    return false;
16100 
16101 	  case UNSPEC_GOTPCREL:
16102 	    if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16103 	      goto is_legitimate_pic;
16104 	    /* FALLTHRU */
16105 	  case UNSPEC_PCREL:
16106 	    gcc_assert (flag_pic);
16107 	    goto is_legitimate_pic;
16108 
16109 	  case UNSPEC_GOTTPOFF:
16110 	  case UNSPEC_GOTNTPOFF:
16111 	  case UNSPEC_INDNTPOFF:
16112 	  case UNSPEC_NTPOFF:
16113 	  case UNSPEC_DTPOFF:
16114 	    break;
16115 
16116 	  default:
16117 	    /* Invalid address unspec.  */
16118 	    return false;
16119 	  }
16120 
16121       else if (SYMBOLIC_CONST (disp)
16122 	       && (flag_pic
16123 		   || (TARGET_MACHO
16124 #if TARGET_MACHO
16125 		       && MACHOPIC_INDIRECT
16126 		       && !machopic_operand_p (disp)
16127 #endif
16128 	       )))
16129 	{
16130 
16131 	is_legitimate_pic:
16132 	  if (TARGET_64BIT && (index || base))
16133 	    {
16134 	      /* foo@dtpoff(%rX) is ok.  */
16135 	      if (GET_CODE (disp) != CONST
16136 		  || GET_CODE (XEXP (disp, 0)) != PLUS
16137 		  || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16138 		  || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16139 		  || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16140 		      && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16141 		/* Non-constant pic memory reference.  */
16142 		return false;
16143 	    }
16144 	  else if ((!TARGET_MACHO || flag_pic)
16145 		    && ! legitimate_pic_address_disp_p (disp))
16146 	    /* Displacement is an invalid pic construct.  */
16147 	    return false;
16148 #if TARGET_MACHO
16149 	  else if (MACHO_DYNAMIC_NO_PIC_P
16150 		   && !ix86_legitimate_constant_p (Pmode, disp))
16151 	    /* Displacement must be referenced via non_lazy_pointer.  */
16152 	    return false;
16153 #endif
16154 
16155           /* This code used to verify that a symbolic pic displacement
16156 	     includes the pic_offset_table_rtx register.
16157 
16158 	     While this is a good idea, unfortunately these constructs may
16159 	     be created by the "adds using lea" optimization for incorrect
16160 	     code like:
16161 
16162 	     int a;
16163 	     int foo(int i)
16164 	       {
16165 	         return *(&a+i);
16166 	       }
16167 
16168 	     This code is nonsensical, but results in addressing the
16169 	     GOT table with a pic_offset_table_rtx base.  We can't
16170 	     just refuse it easily, since it gets matched by the
16171 	     "addsi3" pattern, which later gets split to an lea when
16172 	     the output register differs from the input.  While this
16173 	     could be handled by a separate addsi pattern for this case
16174 	     that never results in an lea, disabling this test seems to
16175 	     be the easier and correct fix for the crash.  */
16176 	}
16177       else if (GET_CODE (disp) != LABEL_REF
16178 	       && !CONST_INT_P (disp)
16179 	       && (GET_CODE (disp) != CONST
16180 		   || !ix86_legitimate_constant_p (Pmode, disp))
16181 	       && (GET_CODE (disp) != SYMBOL_REF
16182 		   || !ix86_legitimate_constant_p (Pmode, disp)))
16183 	/* Displacement is not constant.  */
16184 	return false;
16185       else if (TARGET_64BIT
16186 	       && !x86_64_immediate_operand (disp, VOIDmode))
16187 	/* Displacement is out of range.  */
16188 	return false;
16189       /* In x32 mode, constant addresses are sign extended to 64bit, so
16190 	 we have to prevent addresses from 0x80000000 to 0xffffffff.  */
16191       else if (TARGET_X32 && !(index || base)
16192 	       && CONST_INT_P (disp)
16193 	       && val_signbit_known_set_p (SImode, INTVAL (disp)))
16194 	return false;
16195     }
16196 
16197   /* Everything looks valid.  */
16198   return true;
16199 }
16200 
16201 /* Determine if a given RTX is a valid constant address.  */
16202 
16203 bool
16204 constant_address_p (rtx x)
16205 {
16206   return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16207 }
16208 
16209 /* Return a unique alias set for the GOT.  */
16210 
16211 static alias_set_type
16212 ix86_GOT_alias_set (void)
16213 {
16214   static alias_set_type set = -1;
16215   if (set == -1)
16216     set = new_alias_set ();
16217   return set;
16218 }
16219 
16220 /* Return a legitimate reference for ORIG (an address) using the
16221    register REG.  If REG is 0, a new pseudo is generated.
16222 
16223    There are two types of references that must be handled:
16224 
16225    1. Global data references must load the address from the GOT, via
16226       the PIC reg.  An insn is emitted to do this load, and the reg is
16227       returned.
16228 
16229    2. Static data references, constant pool addresses, and code labels
16230       compute the address as an offset from the GOT, whose base is in
16231       the PIC reg.  Static data objects have SYMBOL_FLAG_LOCAL set to
16232       differentiate them from global data objects.  The returned
16233       address is the PIC reg + an unspec constant.
16234 
16235    TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16236    reg also appears in the address.  */
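/* As a rough illustration for 32-bit PIC code with the GOT pointer in
   %ebx (exact sequences depend on the target and code model):

     case 1:  movl  foo@GOT(%ebx), %eax      load the address from the GOT
     case 2:  leal  bar@GOTOFF(%ebx), %eax   PIC reg + an unspec offset  */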
16237 
16238 static rtx
16239 legitimize_pic_address (rtx orig, rtx reg)
16240 {
16241   rtx addr = orig;
16242   rtx new_rtx = orig;
16243 
16244 #if TARGET_MACHO
16245   if (TARGET_MACHO && !TARGET_64BIT)
16246     {
16247       if (reg == 0)
16248 	reg = gen_reg_rtx (Pmode);
16249       /* Use the generic Mach-O PIC machinery.  */
16250       return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16251     }
16252 #endif
16253 
16254   if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16255     {
16256       rtx tmp = legitimize_pe_coff_symbol (addr, true);
16257       if (tmp)
16258         return tmp;
16259     }
16260 
16261   if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16262     new_rtx = addr;
16263   else if ((!TARGET_64BIT
16264 	    || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16265 	   && !TARGET_PECOFF
16266 	   && gotoff_operand (addr, Pmode))
16267     {
16268       /* This symbol may be referenced via a displacement
16269 	 from the PIC base address (@GOTOFF).  */
16270       if (GET_CODE (addr) == CONST)
16271 	addr = XEXP (addr, 0);
16272 
16273       if (GET_CODE (addr) == PLUS)
16274 	  {
16275             new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16276 				      UNSPEC_GOTOFF);
16277 	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16278 	  }
16279 	else
16280           new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16281 
16282       new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16283 
16284       if (TARGET_64BIT)
16285 	new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16286 
16287       if (reg != 0)
16288 	{
16289  	  gcc_assert (REG_P (reg));
16290 	  new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16291 					 new_rtx, reg, 1, OPTAB_DIRECT);
16292  	}
16293       else
16294 	new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16295     }
16296   else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16297 	   /* We can't use @GOTOFF for text labels
16298 	      on VxWorks, see gotoff_operand.  */
16299 	   || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16300     {
16301       rtx tmp = legitimize_pe_coff_symbol (addr, true);
16302       if (tmp)
16303         return tmp;
16304 
16305       /* For x64 PE-COFF there is no GOT table,
16306 	 so we use the address directly.  */
16307       if (TARGET_64BIT && TARGET_PECOFF)
16308 	{
16309 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16310 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16311 	}
16312       else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16313 	{
16314 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16315 				    UNSPEC_GOTPCREL);
16316 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16317 	  new_rtx = gen_const_mem (Pmode, new_rtx);
16318 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16319 	}
16320       else
16321 	{
16322 	  /* This symbol must be referenced via a load
16323 	     from the Global Offset Table (@GOT).  */
16324 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16325 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16326 	  if (TARGET_64BIT)
16327 	    new_rtx = force_reg (Pmode, new_rtx);
16328 	  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16329 	  new_rtx = gen_const_mem (Pmode, new_rtx);
16330 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16331 	}
16332 
16333       new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16334     }
16335   else
16336     {
16337       if (CONST_INT_P (addr)
16338 	  && !x86_64_immediate_operand (addr, VOIDmode))
16339 	new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16340       else if (GET_CODE (addr) == CONST)
16341 	{
16342 	  addr = XEXP (addr, 0);
16343 
16344 	  /* We must match stuff we generate before.  Assume the only
16345 	     unspecs that can get here are ours.  Not that we could do
16346 	     anything with them anyway....  */
16347 	  if (GET_CODE (addr) == UNSPEC
16348 	      || (GET_CODE (addr) == PLUS
16349 		  && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16350 	    return orig;
16351 	  gcc_assert (GET_CODE (addr) == PLUS);
16352 	}
16353 
16354       if (GET_CODE (addr) == PLUS)
16355 	{
16356 	  rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16357 
16358 	  /* Check first to see if this is a constant
16359 	     offset from a @GOTOFF symbol reference.  */
16360 	  if (!TARGET_PECOFF
16361 	      && gotoff_operand (op0, Pmode)
16362 	      && CONST_INT_P (op1))
16363 	    {
16364 	      if (!TARGET_64BIT)
16365 		{
16366 		  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16367 					    UNSPEC_GOTOFF);
16368 		  new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16369 		  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16370 
16371 		  if (reg != 0)
16372 		    {
16373 		      gcc_assert (REG_P (reg));
16374 		      new_rtx = expand_simple_binop (Pmode, PLUS,
16375 						     pic_offset_table_rtx,
16376 						     new_rtx, reg, 1,
16377 						     OPTAB_DIRECT);
16378 		    }
16379 		  else
16380 		    new_rtx
16381 		      = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16382 		}
16383 	      else
16384 		{
16385 		  if (INTVAL (op1) < -16*1024*1024
16386 		      || INTVAL (op1) >= 16*1024*1024)
16387 		    {
16388 		      if (!x86_64_immediate_operand (op1, Pmode))
16389 			op1 = force_reg (Pmode, op1);
16390 
16391 		      new_rtx
16392 			= gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16393 		    }
16394 		}
16395 	    }
16396 	  else
16397 	    {
16398 	      rtx base = legitimize_pic_address (op0, reg);
16399 	      machine_mode mode = GET_MODE (base);
16400 	      new_rtx
16401 	        = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16402 
16403 	      if (CONST_INT_P (new_rtx))
16404 		{
16405 		  if (INTVAL (new_rtx) < -16*1024*1024
16406 		      || INTVAL (new_rtx) >= 16*1024*1024)
16407 		    {
16408 		      if (!x86_64_immediate_operand (new_rtx, mode))
16409 			new_rtx = force_reg (mode, new_rtx);
16410 
16411 		      new_rtx
16412 		        = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16413 		    }
16414 		  else
16415 		    new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16416 		}
16417 	      else
16418 		{
16419 		  /* For %rip addressing, we have to use
16420 		     just disp32, with neither base nor index.  */
16421 		  if (TARGET_64BIT
16422 		      && (GET_CODE (base) == SYMBOL_REF
16423 			  || GET_CODE (base) == LABEL_REF))
16424 		    base = force_reg (mode, base);
16425 		  if (GET_CODE (new_rtx) == PLUS
16426 		      && CONSTANT_P (XEXP (new_rtx, 1)))
16427 		    {
16428 		      base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16429 		      new_rtx = XEXP (new_rtx, 1);
16430 		    }
16431 		  new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16432 		}
16433 	    }
16434 	}
16435     }
16436   return new_rtx;
16437 }
16438 
16439 /* Load the thread pointer.  If TO_REG is true, force it into a register.  */
16440 
16441 static rtx
16442 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16443 {
16444   rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16445 
16446   if (GET_MODE (tp) != tp_mode)
16447     {
16448       gcc_assert (GET_MODE (tp) == SImode);
16449       gcc_assert (tp_mode == DImode);
16450 
16451       tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16452     }
16453 
16454   if (to_reg)
16455     tp = copy_to_mode_reg (tp_mode, tp);
16456 
16457   return tp;
16458 }
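/* On typical GNU/Linux targets the UNSPEC_TP above ends up as the
   thread-pointer segment base, i.e. %fs:0 in 64-bit code and %gs:0 in
   32-bit code (illustrative; the segment is target-dependent).  */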
16459 
16460 /* Construct the SYMBOL_REF for the tls_get_addr function.  */
16461 
16462 static GTY(()) rtx ix86_tls_symbol;
16463 
16464 static rtx
16465 ix86_tls_get_addr (void)
16466 {
16467   if (!ix86_tls_symbol)
16468     {
16469       const char *sym
16470 	= ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16471 	   ? "___tls_get_addr" : "__tls_get_addr");
16472 
16473       ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16474     }
16475 
16476   if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16477     {
16478       rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16479 				   UNSPEC_PLTOFF);
16480       return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16481 			   gen_rtx_CONST (Pmode, unspec));
16482     }
16483 
16484   return ix86_tls_symbol;
16485 }
16486 
16487 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol.  */
16488 
16489 static GTY(()) rtx ix86_tls_module_base_symbol;
16490 
16491 rtx
16492 ix86_tls_module_base (void)
16493 {
16494   if (!ix86_tls_module_base_symbol)
16495     {
16496       ix86_tls_module_base_symbol
16497 	= gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16498 
16499       SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16500 	|= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16501     }
16502 
16503   return ix86_tls_module_base_symbol;
16504 }
16505 
16506 /* A subroutine of ix86_legitimize_address and ix86_expand_move.  FOR_MOV is
16507    false if we expect this to be used for a memory address and true if
16508    we expect to load the address into a register.  */
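/* Roughly, on 64-bit GNU/Linux the four TLS models expand to sequences
   along these lines (illustrative only, before any linker relaxation):

     global dynamic:  leaq  x@tlsgd(%rip), %rdi
                      call  __tls_get_addr@PLT
     local dynamic:   leaq  x@tlsld(%rip), %rdi
                      call  __tls_get_addr@PLT
                      leaq  x@dtpoff(%rax), %rax
     initial exec:    movq  %fs:0, %rax
                      addq  x@gottpoff(%rip), %rax
     local exec:      movq  %fs:0, %rax
                      leaq  x@tpoff(%rax), %rax  */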
16509 
16510 static rtx
16511 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16512 {
16513   rtx dest, base, off;
16514   rtx pic = NULL_RTX, tp = NULL_RTX;
16515   machine_mode tp_mode = Pmode;
16516   int type;
16517 
16518   /* Fall back to global dynamic model if tool chain cannot support local
16519      dynamic.  */
16520   if (TARGET_SUN_TLS && !TARGET_64BIT
16521       && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16522       && model == TLS_MODEL_LOCAL_DYNAMIC)
16523     model = TLS_MODEL_GLOBAL_DYNAMIC;
16524 
16525   switch (model)
16526     {
16527     case TLS_MODEL_GLOBAL_DYNAMIC:
16528       dest = gen_reg_rtx (Pmode);
16529 
16530       if (!TARGET_64BIT)
16531 	{
16532 	  if (flag_pic && !TARGET_PECOFF)
16533 	    pic = pic_offset_table_rtx;
16534 	  else
16535 	    {
16536 	      pic = gen_reg_rtx (Pmode);
16537 	      emit_insn (gen_set_got (pic));
16538 	    }
16539 	}
16540 
16541       if (TARGET_GNU2_TLS)
16542 	{
16543 	  if (TARGET_64BIT)
16544 	    emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16545 	  else
16546 	    emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16547 
16548 	  tp = get_thread_pointer (Pmode, true);
16549 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16550 
16551 	  if (GET_MODE (x) != Pmode)
16552 	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
16553 
16554 	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16555 	}
16556       else
16557 	{
16558 	  rtx caddr = ix86_tls_get_addr ();
16559 
16560 	  if (TARGET_64BIT)
16561 	    {
16562 	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
16563 	      rtx_insn *insns;
16564 
16565 	      start_sequence ();
16566 	      emit_call_insn
16567 		(ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16568 	      insns = get_insns ();
16569 	      end_sequence ();
16570 
16571 	      if (GET_MODE (x) != Pmode)
16572 		x = gen_rtx_ZERO_EXTEND (Pmode, x);
16573 
16574 	      RTL_CONST_CALL_P (insns) = 1;
16575 	      emit_libcall_block (insns, dest, rax, x);
16576 	    }
16577 	  else
16578 	    emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16579 	}
16580       break;
16581 
16582     case TLS_MODEL_LOCAL_DYNAMIC:
16583       base = gen_reg_rtx (Pmode);
16584 
16585       if (!TARGET_64BIT)
16586 	{
16587 	  if (flag_pic)
16588 	    pic = pic_offset_table_rtx;
16589 	  else
16590 	    {
16591 	      pic = gen_reg_rtx (Pmode);
16592 	      emit_insn (gen_set_got (pic));
16593 	    }
16594 	}
16595 
16596       if (TARGET_GNU2_TLS)
16597 	{
16598 	  rtx tmp = ix86_tls_module_base ();
16599 
16600 	  if (TARGET_64BIT)
16601 	    emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16602 	  else
16603 	    emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16604 
16605 	  tp = get_thread_pointer (Pmode, true);
16606 	  set_unique_reg_note (get_last_insn (), REG_EQUAL,
16607 			       gen_rtx_MINUS (Pmode, tmp, tp));
16608 	}
16609       else
16610 	{
16611 	  rtx caddr = ix86_tls_get_addr ();
16612 
16613 	  if (TARGET_64BIT)
16614 	    {
16615 	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
16616 	      rtx_insn *insns;
16617 	      rtx eqv;
16618 
16619 	      start_sequence ();
16620 	      emit_call_insn
16621 		(ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16622 	      insns = get_insns ();
16623 	      end_sequence ();
16624 
16625 	      /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16626 		 share the LD_BASE result with other LD model accesses.  */
16627 	      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16628 				    UNSPEC_TLS_LD_BASE);
16629 
16630 	      RTL_CONST_CALL_P (insns) = 1;
16631 	      emit_libcall_block (insns, base, rax, eqv);
16632 	    }
16633 	  else
16634 	    emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16635 	}
16636 
16637       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16638       off = gen_rtx_CONST (Pmode, off);
16639 
16640       dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16641 
16642       if (TARGET_GNU2_TLS)
16643 	{
16644 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16645 
16646 	  if (GET_MODE (x) != Pmode)
16647 	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
16648 
16649 	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16650 	}
16651       break;
16652 
16653     case TLS_MODEL_INITIAL_EXEC:
16654       if (TARGET_64BIT)
16655 	{
16656 	  if (TARGET_SUN_TLS && !TARGET_X32)
16657 	    {
16658 	      /* The Sun linker took the AMD64 TLS spec literally
16659 		 and can only handle %rax as the destination of the
16660 		 initial-exec code sequence.  */
16661 
16662 	      dest = gen_reg_rtx (DImode);
16663 	      emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16664 	      return dest;
16665 	    }
16666 
16667 	  /* Generate DImode references to avoid %fs:(%reg32)
16668 	     problems and the linker IE->LE relaxation bug.  */
16669 	  tp_mode = DImode;
16670 	  pic = NULL;
16671 	  type = UNSPEC_GOTNTPOFF;
16672 	}
16673       else if (flag_pic)
16674 	{
16675 	  pic = pic_offset_table_rtx;
16676 	  type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16677 	}
16678       else if (!TARGET_ANY_GNU_TLS)
16679 	{
16680 	  pic = gen_reg_rtx (Pmode);
16681 	  emit_insn (gen_set_got (pic));
16682 	  type = UNSPEC_GOTTPOFF;
16683 	}
16684       else
16685 	{
16686 	  pic = NULL;
16687 	  type = UNSPEC_INDNTPOFF;
16688 	}
16689 
16690       off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16691       off = gen_rtx_CONST (tp_mode, off);
16692       if (pic)
16693 	off = gen_rtx_PLUS (tp_mode, pic, off);
16694       off = gen_const_mem (tp_mode, off);
16695       set_mem_alias_set (off, ix86_GOT_alias_set ());
16696 
16697       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16698 	{
16699 	  base = get_thread_pointer (tp_mode,
16700 				     for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16701 	  off = force_reg (tp_mode, off);
16702 	  dest = gen_rtx_PLUS (tp_mode, base, off);
16703 	  if (tp_mode != Pmode)
16704 	    dest = convert_to_mode (Pmode, dest, 1);
16705 	}
16706       else
16707 	{
16708 	  base = get_thread_pointer (Pmode, true);
16709 	  dest = gen_reg_rtx (Pmode);
16710 	  emit_insn (ix86_gen_sub3 (dest, base, off));
16711 	}
16712       break;
16713 
16714     case TLS_MODEL_LOCAL_EXEC:
16715       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16716 			    (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16717 			    ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16718       off = gen_rtx_CONST (Pmode, off);
16719 
16720       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16721 	{
16722 	  base = get_thread_pointer (Pmode,
16723 				     for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16724 	  return gen_rtx_PLUS (Pmode, base, off);
16725 	}
16726       else
16727 	{
16728 	  base = get_thread_pointer (Pmode, true);
16729 	  dest = gen_reg_rtx (Pmode);
16730 	  emit_insn (ix86_gen_sub3 (dest, base, off));
16731 	}
16732       break;
16733 
16734     default:
16735       gcc_unreachable ();
16736     }
16737 
16738   return dest;
16739 }
16740 
16741 /* Return true if OP refers to a TLS address.  */
16742 bool
16743 ix86_tls_address_pattern_p (rtx op)
16744 {
16745   subrtx_var_iterator::array_type array;
16746   FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16747     {
16748       rtx op = *iter;
16749       if (MEM_P (op))
16750 	{
16751 	  rtx *x = &XEXP (op, 0);
16752 	  while (GET_CODE (*x) == PLUS)
16753 	    {
16754 	      int i;
16755 	      for (i = 0; i < 2; i++)
16756 		{
16757 		  rtx u = XEXP (*x, i);
16758 		  if (GET_CODE (u) == ZERO_EXTEND)
16759 		    u = XEXP (u, 0);
16760 		  if (GET_CODE (u) == UNSPEC
16761 		      && XINT (u, 1) == UNSPEC_TP)
16762 		    return true;
16763 		}
16764 	      x = &XEXP (*x, 0);
16765 	    }
16766 
16767 	  iter.skip_subrtxes ();
16768 	}
16769     }
16770 
16771   return false;
16772 }
16773 
16774 /* Rewrite *LOC so that it refers to a default TLS address space.  */
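/* E.g. an address of the form
     (plus (unspec [(const_int 0)] UNSPEC_TP) (reg R))
   inside a MEM is rewritten to plain (reg R), with the MEM moved into
   the thread-pointer (%fs/%gs) address space instead.  */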
16775 void
16776 ix86_rewrite_tls_address_1 (rtx *loc)
16777 {
16778   subrtx_ptr_iterator::array_type array;
16779   FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16780     {
16781       rtx *loc = *iter;
16782       if (MEM_P (*loc))
16783 	{
16784 	  rtx addr = XEXP (*loc, 0);
16785 	  rtx *x = &addr;
16786 	  while (GET_CODE (*x) == PLUS)
16787 	    {
16788 	      int i;
16789 	      for (i = 0; i < 2; i++)
16790 		{
16791 		  rtx u = XEXP (*x, i);
16792 		  if (GET_CODE (u) == ZERO_EXTEND)
16793 		    u = XEXP (u, 0);
16794 		  if (GET_CODE (u) == UNSPEC
16795 		      && XINT (u, 1) == UNSPEC_TP)
16796 		    {
16797 		      addr_space_t as = DEFAULT_TLS_SEG_REG;
16798 
16799 		      *x = XEXP (*x, 1 - i);
16800 
16801 		      *loc = replace_equiv_address_nv (*loc, addr, true);
16802 		      set_mem_addr_space (*loc, as);
16803 		      return;
16804 		    }
16805 		}
16806 	      x = &XEXP (*x, 0);
16807 	    }
16808 
16809 	  iter.skip_subrtxes ();
16810 	}
16811     }
16812 }
16813 
16814 /* Rewrite an instruction pattern involving a TLS address
16815    so that it refers to the default TLS address space.  */
16816 rtx
16817 ix86_rewrite_tls_address (rtx pattern)
16818 {
16819   pattern = copy_insn (pattern);
16820   ix86_rewrite_tls_address_1 (&pattern);
16821   return pattern;
16822 }
16823 
16824 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16825    to symbol DECL if BEIMPORT is true.  Otherwise create or return the
16826    unique refptr-DECL symbol corresponding to symbol DECL.  */
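/* For example, a dllimport'ed DECL named "foo" is referenced through a
   stub symbol on the order of __imp_foo, while the refptr case uses a
   .refptr.foo style name (illustrative; the exact prefix depends on the
   user label prefix and calling convention, see below).  */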
16827 
16828 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16829 {
16830   static inline hashval_t hash (tree_map *m) { return m->hash; }
16831   static inline bool
16832   equal (tree_map *a, tree_map *b)
16833   {
16834     return a->base.from == b->base.from;
16835   }
16836 
16837   static int
16838   keep_cache_entry (tree_map *&m)
16839   {
16840     return ggc_marked_p (m->base.from);
16841   }
16842 };
16843 
16844 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16845 
16846 static tree
16847 get_dllimport_decl (tree decl, bool beimport)
16848 {
16849   struct tree_map *h, in;
16850   const char *name;
16851   const char *prefix;
16852   size_t namelen, prefixlen;
16853   char *imp_name;
16854   tree to;
16855   rtx rtl;
16856 
16857   if (!dllimport_map)
16858     dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16859 
16860   in.hash = htab_hash_pointer (decl);
16861   in.base.from = decl;
16862   tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16863   h = *loc;
16864   if (h)
16865     return h->to;
16866 
16867   *loc = h = ggc_alloc<tree_map> ();
16868   h->hash = in.hash;
16869   h->base.from = decl;
16870   h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16871 			   VAR_DECL, NULL, ptr_type_node);
16872   DECL_ARTIFICIAL (to) = 1;
16873   DECL_IGNORED_P (to) = 1;
16874   DECL_EXTERNAL (to) = 1;
16875   TREE_READONLY (to) = 1;
16876 
16877   name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16878   name = targetm.strip_name_encoding (name);
16879   if (beimport)
16880     prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16881       ? "*__imp_" : "*__imp__";
16882   else
16883     prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16884   namelen = strlen (name);
16885   prefixlen = strlen (prefix);
16886   imp_name = (char *) alloca (namelen + prefixlen + 1);
16887   memcpy (imp_name, prefix, prefixlen);
16888   memcpy (imp_name + prefixlen, name, namelen + 1);
16889 
16890   name = ggc_alloc_string (imp_name, namelen + prefixlen);
16891   rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16892   SET_SYMBOL_REF_DECL (rtl, to);
16893   SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16894   if (!beimport)
16895     {
16896       SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16897 #ifdef SUB_TARGET_RECORD_STUB
16898       SUB_TARGET_RECORD_STUB (name);
16899 #endif
16900     }
16901 
16902   rtl = gen_const_mem (Pmode, rtl);
16903   set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16904 
16905   SET_DECL_RTL (to, rtl);
16906   SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16907 
16908   return to;
16909 }
16910 
16911 /* Expand SYMBOL into its corresponding far-address symbol.
16912    WANT_REG is true if we require the result be a register.  */
16913 
16914 static rtx
16915 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16916 {
16917   tree imp_decl;
16918   rtx x;
16919 
16920   gcc_assert (SYMBOL_REF_DECL (symbol));
16921   imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16922 
16923   x = DECL_RTL (imp_decl);
16924   if (want_reg)
16925     x = force_reg (Pmode, x);
16926   return x;
16927 }
16928 
16929 /* Expand SYMBOL into its corresponding dllimport symbol.  WANT_REG is
16930    true if we require the result be a register.  */
16931 
16932 static rtx
16933 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16934 {
16935   tree imp_decl;
16936   rtx x;
16937 
16938   gcc_assert (SYMBOL_REF_DECL (symbol));
16939   imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16940 
16941   x = DECL_RTL (imp_decl);
16942   if (want_reg)
16943     x = force_reg (Pmode, x);
16944   return x;
16945 }
16946 
16947 /* Expand ADDR into its corresponding dllimport or refptr symbol.  INREG
16948    is true if we require the result be a register.  */
16949 
16950 static rtx
16951 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16952 {
16953   if (!TARGET_PECOFF)
16954     return NULL_RTX;
16955 
16956   if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16957     {
16958       if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16959 	return legitimize_dllimport_symbol (addr, inreg);
16960       if (GET_CODE (addr) == CONST
16961 	  && GET_CODE (XEXP (addr, 0)) == PLUS
16962 	  && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16963 	  && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16964 	{
16965 	  rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16966 	  return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16967 	}
16968     }
16969 
16970   if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16971     return NULL_RTX;
16972   if (GET_CODE (addr) == SYMBOL_REF
16973       && !is_imported_p (addr)
16974       && SYMBOL_REF_EXTERNAL_P (addr)
16975       && SYMBOL_REF_DECL (addr))
16976     return legitimize_pe_coff_extern_decl (addr, inreg);
16977 
16978   if (GET_CODE (addr) == CONST
16979       && GET_CODE (XEXP (addr, 0)) == PLUS
16980       && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16981       && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16982       && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16983       && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16984     {
16985       rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16986       return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16987     }
16988   return NULL_RTX;
16989 }
16990 
16991 /* Try machine-dependent ways of modifying an illegitimate address
16992    to be legitimate.  If we find one, return the new, valid address.
16993    This macro is used in only one place: `memory_address' in explow.c.
16994 
16995    OLDX is the address as it was before break_out_memory_refs was called.
16996    In some cases it is useful to look at this to decide what needs to be done.
16997 
16998    It is always safe for this macro to do nothing.  It exists to recognize
16999    opportunities to optimize the output.
17000 
17001    For the 80386, we handle X+REG by loading X into a register R and
17002    using R+REG.  R will go in a general reg and indexing will be used.
17003    However, if REG is a broken-out memory address or multiplication,
17004    nothing needs to be done because REG can certainly go in a general reg.
17005 
17006    When -fpic is used, special handling is needed for symbolic references.
17007    See comments by legitimize_pic_address in i386.c for details.  */
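/* For instance, the shift canonicalization below turns
     (plus (ashift (reg R) (const_int 3)) (reg B))
   into
     (plus (mult (reg R) (const_int 8)) (reg B))
   which matches the scaled-index addressing form 0(%B,%R,8).  */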
17008 
17009 static rtx
17010 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17011 {
17012   bool changed = false;
17013   unsigned log;
17014 
17015   log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17016   if (log)
17017     return legitimize_tls_address (x, (enum tls_model) log, false);
17018   if (GET_CODE (x) == CONST
17019       && GET_CODE (XEXP (x, 0)) == PLUS
17020       && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17021       && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17022     {
17023       rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17024 				      (enum tls_model) log, false);
17025       return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17026     }
17027 
17028   if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17029     {
17030       rtx tmp = legitimize_pe_coff_symbol (x, true);
17031       if (tmp)
17032         return tmp;
17033     }
17034 
17035   if (flag_pic && SYMBOLIC_CONST (x))
17036     return legitimize_pic_address (x, 0);
17037 
17038 #if TARGET_MACHO
17039   if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17040     return machopic_indirect_data_reference (x, 0);
17041 #endif
17042 
17043   /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
17044   if (GET_CODE (x) == ASHIFT
17045       && CONST_INT_P (XEXP (x, 1))
17046       && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17047     {
17048       changed = true;
17049       log = INTVAL (XEXP (x, 1));
17050       x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17051 			GEN_INT (1 << log));
17052     }
17053 
17054   if (GET_CODE (x) == PLUS)
17055     {
17056       /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
17057 
17058       if (GET_CODE (XEXP (x, 0)) == ASHIFT
17059 	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17060 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17061 	{
17062 	  changed = true;
17063 	  log = INTVAL (XEXP (XEXP (x, 0), 1));
17064 	  XEXP (x, 0) = gen_rtx_MULT (Pmode,
17065 				      force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17066 				      GEN_INT (1 << log));
17067 	}
17068 
17069       if (GET_CODE (XEXP (x, 1)) == ASHIFT
17070 	  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17071 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17072 	{
17073 	  changed = true;
17074 	  log = INTVAL (XEXP (XEXP (x, 1), 1));
17075 	  XEXP (x, 1) = gen_rtx_MULT (Pmode,
17076 				      force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17077 				      GEN_INT (1 << log));
17078 	}
17079 
17080       /* Put multiply first if it isn't already.  */
17081       if (GET_CODE (XEXP (x, 1)) == MULT)
17082 	{
17083 	  std::swap (XEXP (x, 0), XEXP (x, 1));
17084 	  changed = true;
17085 	}
17086 
17087       /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17088 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  This can be
17089 	 created by virtual register instantiation, register elimination, and
17090 	 similar optimizations.  */
17091       if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17092 	{
17093 	  changed = true;
17094 	  x = gen_rtx_PLUS (Pmode,
17095 			    gen_rtx_PLUS (Pmode, XEXP (x, 0),
17096 					  XEXP (XEXP (x, 1), 0)),
17097 			    XEXP (XEXP (x, 1), 1));
17098 	}
17099 
17100       /* Canonicalize
17101 	 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17102 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  */
17103       else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17104 	       && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17105 	       && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17106 	       && CONSTANT_P (XEXP (x, 1)))
17107 	{
17108 	  rtx constant;
17109 	  rtx other = NULL_RTX;
17110 
17111 	  if (CONST_INT_P (XEXP (x, 1)))
17112 	    {
17113 	      constant = XEXP (x, 1);
17114 	      other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17115 	    }
17116 	  else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17117 	    {
17118 	      constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17119 	      other = XEXP (x, 1);
17120 	    }
17121 	  else
17122 	    constant = 0;
17123 
17124 	  if (constant)
17125 	    {
17126 	      changed = true;
17127 	      x = gen_rtx_PLUS (Pmode,
17128 				gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17129 					      XEXP (XEXP (XEXP (x, 0), 1), 0)),
17130 				plus_constant (Pmode, other,
17131 					       INTVAL (constant)));
17132 	    }
17133 	}
17134 
17135       if (changed && ix86_legitimate_address_p (mode, x, false))
17136 	return x;
17137 
17138       if (GET_CODE (XEXP (x, 0)) == MULT)
17139 	{
17140 	  changed = true;
17141 	  XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17142 	}
17143 
17144       if (GET_CODE (XEXP (x, 1)) == MULT)
17145 	{
17146 	  changed = true;
17147 	  XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17148 	}
17149 
17150       if (changed
17151 	  && REG_P (XEXP (x, 1))
17152 	  && REG_P (XEXP (x, 0)))
17153 	return x;
17154 
17155       if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17156 	{
17157 	  changed = true;
17158 	  x = legitimize_pic_address (x, 0);
17159 	}
17160 
17161       if (changed && ix86_legitimate_address_p (mode, x, false))
17162 	return x;
17163 
17164       if (REG_P (XEXP (x, 0)))
17165 	{
17166 	  rtx temp = gen_reg_rtx (Pmode);
17167 	  rtx val  = force_operand (XEXP (x, 1), temp);
17168 	  if (val != temp)
17169 	    {
17170 	      val = convert_to_mode (Pmode, val, 1);
17171 	      emit_move_insn (temp, val);
17172 	    }
17173 
17174 	  XEXP (x, 1) = temp;
17175 	  return x;
17176 	}
17177 
17178       else if (REG_P (XEXP (x, 1)))
17179 	{
17180 	  rtx temp = gen_reg_rtx (Pmode);
17181 	  rtx val  = force_operand (XEXP (x, 0), temp);
17182 	  if (val != temp)
17183 	    {
17184 	      val = convert_to_mode (Pmode, val, 1);
17185 	      emit_move_insn (temp, val);
17186 	    }
17187 
17188 	  XEXP (x, 0) = temp;
17189 	  return x;
17190 	}
17191     }
17192 
17193   return x;
17194 }
17195 
17196 /* Print an integer constant expression in assembler syntax.  Addition
17197    and subtraction are the only arithmetic that may appear in these
17198    expressions.  FILE is the stdio stream to write to, X is the rtx, and
17199    CODE is the operand print code from the output string.  */
17200 
17201 static void
17202 output_pic_addr_const (FILE *file, rtx x, int code)
17203 {
17204   char buf[256];
17205 
17206   switch (GET_CODE (x))
17207     {
17208     case PC:
17209       gcc_assert (flag_pic);
17210       putc ('.', file);
17211       break;
17212 
17213     case SYMBOL_REF:
17214       if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17215 	output_addr_const (file, x);
17216       else
17217 	{
17218 	  const char *name = XSTR (x, 0);
17219 
17220 	  /* Mark the decl as referenced so that cgraph will
17221 	     output the function.  */
17222 	  if (SYMBOL_REF_DECL (x))
17223 	    mark_decl_referenced (SYMBOL_REF_DECL (x));
17224 
17225 #if TARGET_MACHO
17226 	  if (MACHOPIC_INDIRECT
17227 	      && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17228 	    name = machopic_indirection_name (x, /*stub_p=*/true);
17229 #endif
17230 	  assemble_name (file, name);
17231 	}
17232       if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17233 	  && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17234 	fputs ("@PLT", file);
17235       break;
17236 
17237     case LABEL_REF:
17238       x = XEXP (x, 0);
17239       /* FALLTHRU */
17240     case CODE_LABEL:
17241       ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17242       assemble_name (asm_out_file, buf);
17243       break;
17244 
17245     case CONST_INT:
17246       fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17247       break;
17248 
17249     case CONST:
17250       /* This used to output parentheses around the expression,
17251 	 but that does not work on the 386 (either ATT or BSD assembler).  */
17252       output_pic_addr_const (file, XEXP (x, 0), code);
17253       break;
17254 
17255     case CONST_DOUBLE:
17256       /* We can't handle floating point constants;
17257 	 TARGET_PRINT_OPERAND must handle them.  */
17258       output_operand_lossage ("floating constant misused");
17259       break;
17260 
17261     case PLUS:
17262       /* Some assemblers need integer constants to appear first.  */
17263       if (CONST_INT_P (XEXP (x, 0)))
17264 	{
17265 	  output_pic_addr_const (file, XEXP (x, 0), code);
17266 	  putc ('+', file);
17267 	  output_pic_addr_const (file, XEXP (x, 1), code);
17268 	}
17269       else
17270 	{
17271 	  gcc_assert (CONST_INT_P (XEXP (x, 1)));
17272 	  output_pic_addr_const (file, XEXP (x, 1), code);
17273 	  putc ('+', file);
17274 	  output_pic_addr_const (file, XEXP (x, 0), code);
17275 	}
17276       break;
17277 
17278     case MINUS:
17279       if (!TARGET_MACHO)
17280 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17281       output_pic_addr_const (file, XEXP (x, 0), code);
17282       putc ('-', file);
17283       output_pic_addr_const (file, XEXP (x, 1), code);
17284       if (!TARGET_MACHO)
17285 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17286       break;
17287 
17288     case UNSPEC:
17289       gcc_assert (XVECLEN (x, 0) == 1);
17290       output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17291       switch (XINT (x, 1))
17292 	{
17293 	case UNSPEC_GOT:
17294 	  fputs ("@GOT", file);
17295 	  break;
17296 	case UNSPEC_GOTOFF:
17297 	  fputs ("@GOTOFF", file);
17298 	  break;
17299 	case UNSPEC_PLTOFF:
17300 	  fputs ("@PLTOFF", file);
17301 	  break;
17302 	case UNSPEC_PCREL:
17303 	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17304 		 "(%rip)" : "[rip]", file);
17305 	  break;
17306 	case UNSPEC_GOTPCREL:
17307 	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17308 		 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17309 	  break;
17310 	case UNSPEC_GOTTPOFF:
17311 	  /* FIXME: This might be @TPOFF in Sun ld too.  */
17312 	  fputs ("@gottpoff", file);
17313 	  break;
17314 	case UNSPEC_TPOFF:
17315 	  fputs ("@tpoff", file);
17316 	  break;
17317 	case UNSPEC_NTPOFF:
17318 	  if (TARGET_64BIT)
17319 	    fputs ("@tpoff", file);
17320 	  else
17321 	    fputs ("@ntpoff", file);
17322 	  break;
17323 	case UNSPEC_DTPOFF:
17324 	  fputs ("@dtpoff", file);
17325 	  break;
17326 	case UNSPEC_GOTNTPOFF:
17327 	  if (TARGET_64BIT)
17328 	    fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17329 		   "@gottpoff(%rip)": "@gottpoff[rip]", file);
17330 	  else
17331 	    fputs ("@gotntpoff", file);
17332 	  break;
17333 	case UNSPEC_INDNTPOFF:
17334 	  fputs ("@indntpoff", file);
17335 	  break;
17336 #if TARGET_MACHO
17337 	case UNSPEC_MACHOPIC_OFFSET:
17338 	  putc ('-', file);
17339 	  machopic_output_function_base_name (file);
17340 	  break;
17341 #endif
17342 	default:
17343 	  output_operand_lossage ("invalid UNSPEC as operand");
17344 	  break;
17345 	}
17346        break;
17347 
17348     default:
17349       output_operand_lossage ("invalid expression as operand");
17350     }
17351 }
17352 
17353 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17354    We need to emit DTP-relative relocations.  */
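/* For a 4-byte entry this ends up emitting something like
   ".long foo@dtpoff", and for an 8-byte entry ".long foo@dtpoff, 0"
   (illustrative; ASM_LONG is whatever the target's 32-bit data
   directive happens to be).  */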
17355 
17356 static void ATTRIBUTE_UNUSED
17357 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17358 {
17359   fputs (ASM_LONG, file);
17360   output_addr_const (file, x);
17361   fputs ("@dtpoff", file);
17362   switch (size)
17363     {
17364     case 4:
17365       break;
17366     case 8:
17367       fputs (", 0", file);
17368       break;
17369     default:
17370       gcc_unreachable ();
17371    }
17372 }
17373 
17374 /* Return true if X is a representation of the PIC register.  This copes
17375    with calls from ix86_find_base_term, where the register might have
17376    been replaced by a cselib value.  */
17377 
17378 static bool
17379 ix86_pic_register_p (rtx x)
17380 {
17381   if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17382     return (pic_offset_table_rtx
17383 	    && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17384   else if (!REG_P (x))
17385     return false;
17386   else if (pic_offset_table_rtx)
17387     {
17388       if (REGNO (x) == REGNO (pic_offset_table_rtx))
17389 	return true;
17390       if (HARD_REGISTER_P (x)
17391 	  && !HARD_REGISTER_P (pic_offset_table_rtx)
17392 	  && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17393 	return true;
17394       return false;
17395     }
17396   else
17397     return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17398 }
17399 
17400 /* Helper function for ix86_delegitimize_address.
17401    Attempt to delegitimize TLS local-exec accesses.  */
17402 
17403 static rtx
17404 ix86_delegitimize_tls_address (rtx orig_x)
17405 {
17406   rtx x = orig_x, unspec;
17407   struct ix86_address addr;
17408 
17409   if (!TARGET_TLS_DIRECT_SEG_REFS)
17410     return orig_x;
17411   if (MEM_P (x))
17412     x = XEXP (x, 0);
17413   if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17414     return orig_x;
17415   if (ix86_decompose_address (x, &addr) == 0
17416       || addr.seg != DEFAULT_TLS_SEG_REG
17417       || addr.disp == NULL_RTX
17418       || GET_CODE (addr.disp) != CONST)
17419     return orig_x;
17420   unspec = XEXP (addr.disp, 0);
17421   if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17422     unspec = XEXP (unspec, 0);
17423   if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17424     return orig_x;
17425   x = XVECEXP (unspec, 0, 0);
17426   gcc_assert (GET_CODE (x) == SYMBOL_REF);
17427   if (unspec != XEXP (addr.disp, 0))
17428     x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17429   if (addr.index)
17430     {
17431       rtx idx = addr.index;
17432       if (addr.scale != 1)
17433 	idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17434       x = gen_rtx_PLUS (Pmode, idx, x);
17435     }
17436   if (addr.base)
17437     x = gen_rtx_PLUS (Pmode, addr.base, x);
17438   if (MEM_P (orig_x))
17439     x = replace_equiv_address_nv (orig_x, x);
17440   return x;
17441 }
17442 
17443 /* In the name of slightly smaller debug output, and to cater to
17444    general assembler lossage, recognize PIC+GOTOFF and turn it back
17445    into a direct symbol reference.
17446 
17447    On Darwin, this is necessary to avoid a crash, because Darwin
17448    has a different PIC label for each routine but the DWARF debugging
17449    information is not associated with any particular routine, so it's
17450    necessary to remove references to the PIC label from RTL stored by
17451    the DWARF output code.
17452 
17453    This helper is used in the normal ix86_delegitimize_address
17454    entrypoint (e.g. used in the target delegitimization hook) and
17455    in ix86_find_base_term.  As a compile-time memory optimization, we
17456    avoid allocating rtxes that will not change anything about the outcome
17457    of the callers (find_base_value and find_base_term).  */
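/* E.g. for -m32 PIC an address such as
     (plus (reg %ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   is turned back into (symbol_ref "foo") here, with any constant offset
   re-applied on top (illustrative form only).  */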
17458 
17459 static inline rtx
17460 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17461 {
17462   rtx orig_x = delegitimize_mem_from_attrs (x);
17463   /* addend is NULL or some rtx if x is something+GOTOFF where
17464      something doesn't include the PIC register.  */
17465   rtx addend = NULL_RTX;
17466   /* reg_addend is NULL or a multiple of some register.  */
17467   rtx reg_addend = NULL_RTX;
17468   /* const_addend is NULL or a const_int.  */
17469   rtx const_addend = NULL_RTX;
17470   /* This is the result, or NULL.  */
17471   rtx result = NULL_RTX;
17472 
17473   x = orig_x;
17474 
17475   if (MEM_P (x))
17476     x = XEXP (x, 0);
17477 
17478   if (TARGET_64BIT)
17479     {
17480       if (GET_CODE (x) == CONST
17481           && GET_CODE (XEXP (x, 0)) == PLUS
17482           && GET_MODE (XEXP (x, 0)) == Pmode
17483           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17484           && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17485           && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17486         {
17487 	  /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17488 	     base.  A CONST can't be arg_pointer_rtx based.  */
17489 	  if (base_term_p && MEM_P (orig_x))
17490 	    return orig_x;
17491 	  rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17492 	  x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17493 	  if (MEM_P (orig_x))
17494 	    x = replace_equiv_address_nv (orig_x, x);
17495 	  return x;
17496 	}
17497 
17498       if (GET_CODE (x) == CONST
17499 	  && GET_CODE (XEXP (x, 0)) == UNSPEC
17500 	  && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17501 	      || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17502 	  && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17503 	{
17504 	  x = XVECEXP (XEXP (x, 0), 0, 0);
17505 	  if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17506 	    {
17507 	      x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17508 	      if (x == NULL_RTX)
17509 		return orig_x;
17510 	    }
17511 	  return x;
17512 	}
17513 
17514       if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17515 	return ix86_delegitimize_tls_address (orig_x);
17516 
17517       /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17518 	 and -mcmodel=medium -fpic.  */
17519     }
17520 
17521   if (GET_CODE (x) != PLUS
17522       || GET_CODE (XEXP (x, 1)) != CONST)
17523     return ix86_delegitimize_tls_address (orig_x);
17524 
17525   if (ix86_pic_register_p (XEXP (x, 0)))
17526     /* %ebx + GOT/GOTOFF */
17527     ;
17528   else if (GET_CODE (XEXP (x, 0)) == PLUS)
17529     {
17530       /* %ebx + %reg * scale + GOT/GOTOFF */
17531       reg_addend = XEXP (x, 0);
17532       if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17533 	reg_addend = XEXP (reg_addend, 1);
17534       else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17535 	reg_addend = XEXP (reg_addend, 0);
17536       else
17537 	{
17538 	  reg_addend = NULL_RTX;
17539 	  addend = XEXP (x, 0);
17540 	}
17541     }
17542   else
17543     addend = XEXP (x, 0);
17544 
17545   x = XEXP (XEXP (x, 1), 0);
17546   if (GET_CODE (x) == PLUS
17547       && CONST_INT_P (XEXP (x, 1)))
17548     {
17549       const_addend = XEXP (x, 1);
17550       x = XEXP (x, 0);
17551     }
17552 
17553   if (GET_CODE (x) == UNSPEC
17554       && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17555 	  || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17556 	  || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17557 	      && !MEM_P (orig_x) && !addend)))
17558     result = XVECEXP (x, 0, 0);
17559 
17560   if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17561       && !MEM_P (orig_x))
17562     result = XVECEXP (x, 0, 0);
17563 
17564   if (! result)
17565     return ix86_delegitimize_tls_address (orig_x);
17566 
17567   /* For (PLUS something CONST_INT) both find_base_{value,term} just
17568      recurse on the first operand.  */
17569   if (const_addend && !base_term_p)
17570     result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17571   if (reg_addend)
17572     result = gen_rtx_PLUS (Pmode, reg_addend, result);
17573   if (addend)
17574     {
17575       /* If the rest of original X doesn't involve the PIC register, add
17576 	 addend and subtract pic_offset_table_rtx.  This can happen e.g.
17577 	 for code like:
17578 	 leal (%ebx, %ecx, 4), %ecx
17579 	 ...
17580 	 movl foo@GOTOFF(%ecx), %edx
17581 	 in which case we return (%ecx - %ebx) + foo
17582 	 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17583 	 and reload has completed.  Don't do the latter for debug,
17584 	 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly.  */
17585       if (pic_offset_table_rtx
17586 	  && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17587         result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17588 						     pic_offset_table_rtx),
17589 			       result);
17590       else if (base_term_p
17591 	       && pic_offset_table_rtx
17592 	       && !TARGET_MACHO
17593 	       && !TARGET_VXWORKS_RTP)
17594 	{
17595 	  rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17596 	  tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17597 	  result = gen_rtx_PLUS (Pmode, tmp, result);
17598 	}
17599       else
17600 	return orig_x;
17601     }
17602   if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17603     {
17604       result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17605       if (result == NULL_RTX)
17606 	return orig_x;
17607     }
17608   return result;
17609 }
17610 
17611 /* The normal instantiation of the above template.  */
17612 
17613 static rtx
17614 ix86_delegitimize_address (rtx x)
17615 {
17616   return ix86_delegitimize_address_1 (x, false);
17617 }
17618 
17619 /* If X is a machine specific address (i.e. a symbol or label being
17620    referenced as a displacement from the GOT implemented using an
17621    UNSPEC), then return the base term.  Otherwise return X.  */
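/* For instance (illustrative only): on x86-64 a GOT-relative reference like
     (const:DI (unspec:DI [(symbol_ref:DI "foo")] UNSPEC_GOTPCREL))
   has (symbol_ref:DI "foo") as its base term.  */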
17622 
17623 rtx
17624 ix86_find_base_term (rtx x)
17625 {
17626   rtx term;
17627 
17628   if (TARGET_64BIT)
17629     {
17630       if (GET_CODE (x) != CONST)
17631 	return x;
17632       term = XEXP (x, 0);
17633       if (GET_CODE (term) == PLUS
17634 	  && CONST_INT_P (XEXP (term, 1)))
17635 	term = XEXP (term, 0);
17636       if (GET_CODE (term) != UNSPEC
17637 	  || (XINT (term, 1) != UNSPEC_GOTPCREL
17638 	      && XINT (term, 1) != UNSPEC_PCREL))
17639 	return x;
17640 
17641       return XVECEXP (term, 0, 0);
17642     }
17643 
17644   return ix86_delegitimize_address_1 (x, true);
17645 }
17646 
17647 /* Return true if X shouldn't be emitted into the debug info.
17648    Disallow UNSPECs other than @gotoff - we can't easily emit the
17649    _GLOBAL_OFFSET_TABLE_ symbol into the .debug_info section, so we
17650    don't delegitimize it, but instead assemble it as @gotoff.
17651    Disallow the _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17652    assembles that as the _GLOBAL_OFFSET_TABLE_-. expression.  */
17653 
17654 static bool
17655 ix86_const_not_ok_for_debug_p (rtx x)
17656 {
17657   if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17658     return true;
17659 
17660   if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17661     return true;
17662 
17663   return false;
17664 }
17665 
17666 static void
17667 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17668 		    bool fp, FILE *file)
17669 {
17670   const char *suffix;
17671 
17672   if (mode == CCFPmode)
17673     {
17674       code = ix86_fp_compare_code_to_integer (code);
17675       mode = CCmode;
17676     }
17677   if (reverse)
17678     code = reverse_condition (code);
17679 
17680   switch (code)
17681     {
17682     case EQ:
17683       gcc_assert (mode != CCGZmode);
17684       switch (mode)
17685 	{
17686 	case E_CCAmode:
17687 	  suffix = "a";
17688 	  break;
17689 	case E_CCCmode:
17690 	  suffix = "c";
17691 	  break;
17692 	case E_CCOmode:
17693 	  suffix = "o";
17694 	  break;
17695 	case E_CCPmode:
17696 	  suffix = "p";
17697 	  break;
17698 	case E_CCSmode:
17699 	  suffix = "s";
17700 	  break;
17701 	default:
17702 	  suffix = "e";
17703 	  break;
17704 	}
17705       break;
17706     case NE:
17707       gcc_assert (mode != CCGZmode);
17708       switch (mode)
17709 	{
17710 	case E_CCAmode:
17711 	  suffix = "na";
17712 	  break;
17713 	case E_CCCmode:
17714 	  suffix = "nc";
17715 	  break;
17716 	case E_CCOmode:
17717 	  suffix = "no";
17718 	  break;
17719 	case E_CCPmode:
17720 	  suffix = "np";
17721 	  break;
17722 	case E_CCSmode:
17723 	  suffix = "ns";
17724 	  break;
17725 	default:
17726 	  suffix = "ne";
17727 	  break;
17728 	}
17729       break;
17730     case GT:
17731       gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17732       suffix = "g";
17733       break;
17734     case GTU:
17735       /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17736 	 Those same assemblers have the same but opposite lossage on cmov.  */
17737       if (mode == CCmode)
17738 	suffix = fp ? "nbe" : "a";
17739       else
17740 	gcc_unreachable ();
17741       break;
17742     case LT:
17743       switch (mode)
17744 	{
17745 	case E_CCNOmode:
17746 	case E_CCGOCmode:
17747 	  suffix = "s";
17748 	  break;
17749 
17750 	case E_CCmode:
17751 	case E_CCGCmode:
17752 	case E_CCGZmode:
17753 	  suffix = "l";
17754 	  break;
17755 
17756 	default:
17757 	  gcc_unreachable ();
17758 	}
17759       break;
17760     case LTU:
17761       if (mode == CCmode || mode == CCGZmode)
17762 	suffix = "b";
17763       else if (mode == CCCmode)
17764 	suffix = fp ? "b" : "c";
17765       else
17766 	gcc_unreachable ();
17767       break;
17768     case GE:
17769       switch (mode)
17770 	{
17771 	case E_CCNOmode:
17772 	case E_CCGOCmode:
17773 	  suffix = "ns";
17774 	  break;
17775 
17776 	case E_CCmode:
17777 	case E_CCGCmode:
17778 	case E_CCGZmode:
17779 	  suffix = "ge";
17780 	  break;
17781 
17782 	default:
17783 	  gcc_unreachable ();
17784 	}
17785       break;
17786     case GEU:
17787       if (mode == CCmode || mode == CCGZmode)
17788 	suffix = "nb";
17789       else if (mode == CCCmode)
17790 	suffix = fp ? "nb" : "nc";
17791       else
17792 	gcc_unreachable ();
17793       break;
17794     case LE:
17795       gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17796       suffix = "le";
17797       break;
17798     case LEU:
17799       if (mode == CCmode)
17800 	suffix = "be";
17801       else
17802 	gcc_unreachable ();
17803       break;
17804     case UNORDERED:
17805       suffix = fp ? "u" : "p";
17806       break;
17807     case ORDERED:
17808       suffix = fp ? "nu" : "np";
17809       break;
17810     default:
17811       gcc_unreachable ();
17812     }
17813   fputs (suffix, file);
17814 }
17815 
17816 /* Print the name of register X to FILE based on its machine mode and number.
17817    If CODE is 'w', pretend the mode is HImode.
17818    If CODE is 'b', pretend the mode is QImode.
17819    If CODE is 'k', pretend the mode is SImode.
17820    If CODE is 'q', pretend the mode is DImode.
17821    If CODE is 'x', pretend the mode is V4SFmode.
17822    If CODE is 't', pretend the mode is V8SFmode.
17823    If CODE is 'g', pretend the mode is V16SFmode.
17824    If CODE is 'h', pretend the reg is the 'high' byte register.
17825    If CODE is 'y', print "st(0)" instead of "st" if the reg is the stack top.
17826    If CODE is 'd', duplicate the operand for AVX instruction.
17827    If CODE is 'V', print naked full integer register name without %.
17828  */
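/* For instance (illustrative): for hard register 0 (the "ax" register),
   print_reg emits "%al" for code 'b', "%ax" for 'w', "%eax" for 'k' and,
   on 64-bit targets, "%rax" for 'q' in AT&T syntax (the same names without
   the '%' in Intel syntax).  */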
17829 
17830 void
17831 print_reg (rtx x, int code, FILE *file)
17832 {
17833   const char *reg;
17834   int msize;
17835   unsigned int regno;
17836   bool duplicated;
17837 
17838   if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
17839     putc ('%', file);
17840 
17841   if (x == pc_rtx)
17842     {
17843       gcc_assert (TARGET_64BIT);
17844       fputs ("rip", file);
17845       return;
17846     }
17847 
17848   if (code == 'y' && STACK_TOP_P (x))
17849     {
17850       fputs ("st(0)", file);
17851       return;
17852     }
17853 
17854   if (code == 'w')
17855     msize = 2;
17856   else if (code == 'b')
17857     msize = 1;
17858   else if (code == 'k')
17859     msize = 4;
17860   else if (code == 'q')
17861     msize = 8;
17862   else if (code == 'h')
17863     msize = 0;
17864   else if (code == 'x')
17865     msize = 16;
17866   else if (code == 't')
17867     msize = 32;
17868   else if (code == 'g')
17869     msize = 64;
17870   else
17871     msize = GET_MODE_SIZE (GET_MODE (x));
17872 
17873   regno = REGNO (x);
17874 
17875   if (regno == ARG_POINTER_REGNUM
17876       || regno == FRAME_POINTER_REGNUM
17877       || regno == FPSR_REG
17878       || regno == FPCR_REG)
17879     {
17880       output_operand_lossage
17881 	("invalid use of register '%s'", reg_names[regno]);
17882       return;
17883     }
17884   else if (regno == FLAGS_REG)
17885     {
17886       output_operand_lossage ("invalid use of asm flag output");
17887       return;
17888     }
17889 
17890   if (code == 'V')
17891     {
17892       if (GENERAL_REGNO_P (regno))
17893 	msize = GET_MODE_SIZE (word_mode);
17894       else
17895 	error ("'V' modifier on non-integer register");
17896     }
17897 
17898   duplicated = code == 'd' && TARGET_AVX;
17899 
17900   switch (msize)
17901     {
17902     case 16:
17903     case 12:
17904     case 8:
17905       if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17906 	warning (0, "unsupported size for integer register");
17907       /* FALLTHRU */
17908     case 4:
17909       if (LEGACY_INT_REGNO_P (regno))
17910 	putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17911       /* FALLTHRU */
17912     case 2:
17913     normal:
17914       reg = hi_reg_name[regno];
17915       break;
17916     case 1:
17917       if (regno >= ARRAY_SIZE (qi_reg_name))
17918 	goto normal;
17919       if (!ANY_QI_REGNO_P (regno))
17920 	error ("unsupported size for integer register");
17921       reg = qi_reg_name[regno];
17922       break;
17923     case 0:
17924       if (regno >= ARRAY_SIZE (qi_high_reg_name))
17925 	goto normal;
17926       reg = qi_high_reg_name[regno];
17927       break;
17928     case 32:
17929     case 64:
17930       if (SSE_REGNO_P (regno))
17931 	{
17932 	  gcc_assert (!duplicated);
17933 	  putc (msize == 32 ? 'y' : 'z', file);
17934 	  reg = hi_reg_name[regno] + 1;
17935 	  break;
17936 	}
17937       goto normal;
17938     default:
17939       gcc_unreachable ();
17940     }
17941 
17942   fputs (reg, file);
17943 
17944   /* Irritatingly, AMD extended registers use a
17945      different naming convention: "r%d[bwd]".  */
17946   if (REX_INT_REGNO_P (regno))
17947     {
17948       gcc_assert (TARGET_64BIT);
17949       switch (msize)
17950 	{
17951 	  case 0:
17952 	    error ("extended registers have no high halves");
17953 	    break;
17954 	  case 1:
17955 	    putc ('b', file);
17956 	    break;
17957 	  case 2:
17958 	    putc ('w', file);
17959 	    break;
17960 	  case 4:
17961 	    putc ('d', file);
17962 	    break;
17963 	  case 8:
17964 	    /* no suffix */
17965 	    break;
17966 	  default:
17967 	    error ("unsupported operand size for extended register");
17968 	    break;
17969 	}
17970       return;
17971     }
17972 
17973   if (duplicated)
17974     {
17975       if (ASSEMBLER_DIALECT == ASM_ATT)
17976 	fprintf (file, ", %%%s", reg);
17977       else
17978 	fprintf (file, ", %s", reg);
17979     }
17980 }
17981 
17982 /* Meaning of CODE:
17983    L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17984    C -- print opcode suffix for set/cmov insn.
17985    c -- like C, but print reversed condition
17986    F,f -- likewise, but for floating-point.
17987    O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17988 	otherwise nothing
17989    R -- print embedded rounding and sae.
17990    r -- print only sae.
17991    z -- print the opcode suffix for the size of the current operand.
17992    Z -- likewise, with special suffixes for x87 instructions.
17993    * -- print a star (in certain assembler syntax)
17994    A -- print an absolute memory reference.
17995    E -- print address with DImode register names if TARGET_64BIT.
17996    s -- print a shift double count, followed by the assembler's argument
17997    s -- print a shift double count, followed by the assemblers argument
17998 	delimiter.
17999    b -- print the QImode name of the register for the indicated operand.
18000 	%b0 would print %al if operands[0] is reg 0.
18001    w --  likewise, print the HImode name of the register.
18002    k --  likewise, print the SImode name of the register.
18003    q --  likewise, print the DImode name of the register.
18004    x --  likewise, print the V4SFmode name of the register.
18005    t --  likewise, print the V8SFmode name of the register.
18006    g --  likewise, print the V16SFmode name of the register.
18007    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18008    y -- print "st(0)" instead of "st" as a register.
18009    d -- print duplicated register operand for AVX instruction.
18010    D -- print condition for SSE cmp instruction.
18011    P -- if PIC, print an @PLT suffix.
18012    p -- print raw symbol name.
18013    X -- don't print any sort of PIC '@' suffix for a symbol.
18014    & -- print some in-use local-dynamic symbol name.
18015    H -- print a memory address offset by 8; used for sse high-parts
18016    Y -- print condition for XOP pcom* instruction.
18017    V -- print naked full integer register name without %.
18018    + -- print a branch hint as 'cs' or 'ds' prefix
18019    ; -- print a semicolon (after prefixes due to bug in older gas).
18020    ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18021    ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18022    M -- print addr32 prefix for TARGET_X32 with VSIB address.
18023    ! -- print MPX prefix for jxx/call/ret instructions if required.
18024  */
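/* As a rough illustration (not necessarily verbatim from i386.md): an output
   template such as "cmov%O2%C1\t{%2, %0|%0, %2}" would use 'O' for the
   optional Sun-syntax size suffix and 'C' for the condition suffix derived
   from the comparison in operand 1.  */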
18025 
18026 void
18027 ix86_print_operand (FILE *file, rtx x, int code)
18028 {
18029   if (code)
18030     {
18031       switch (code)
18032 	{
18033 	case 'A':
18034 	  switch (ASSEMBLER_DIALECT)
18035 	    {
18036 	    case ASM_ATT:
18037 	      putc ('*', file);
18038 	      break;
18039 
18040 	    case ASM_INTEL:
18041 	      /* Intel syntax.  For absolute addresses, registers should not
18042 		 be surrounded by brackets.  */
18043 	      if (!REG_P (x))
18044 		{
18045 		  putc ('[', file);
18046 		  ix86_print_operand (file, x, 0);
18047 		  putc (']', file);
18048 		  return;
18049 		}
18050 	      break;
18051 
18052 	    default:
18053 	      gcc_unreachable ();
18054 	    }
18055 
18056 	  ix86_print_operand (file, x, 0);
18057 	  return;
18058 
18059 	case 'E':
18060 	  /* Wrap address in an UNSPEC to declare special handling.  */
18061 	  if (TARGET_64BIT)
18062 	    x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18063 
18064 	  output_address (VOIDmode, x);
18065 	  return;
18066 
18067 	case 'L':
18068 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18069 	    putc ('l', file);
18070 	  return;
18071 
18072 	case 'W':
18073 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18074 	    putc ('w', file);
18075 	  return;
18076 
18077 	case 'B':
18078 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18079 	    putc ('b', file);
18080 	  return;
18081 
18082 	case 'Q':
18083 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18084 	    putc ('l', file);
18085 	  return;
18086 
18087 	case 'S':
18088 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18089 	    putc ('s', file);
18090 	  return;
18091 
18092 	case 'T':
18093 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18094 	    putc ('t', file);
18095 	  return;
18096 
18097 	case 'O':
18098 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18099 	  if (ASSEMBLER_DIALECT != ASM_ATT)
18100 	    return;
18101 
18102 	  switch (GET_MODE_SIZE (GET_MODE (x)))
18103 	    {
18104 	    case 2:
18105 	      putc ('w', file);
18106 	      break;
18107 
18108 	    case 4:
18109 	      putc ('l', file);
18110 	      break;
18111 
18112 	    case 8:
18113 	      putc ('q', file);
18114 	      break;
18115 
18116 	    default:
18117 	      output_operand_lossage ("invalid operand size for operand "
18118 				      "code 'O'");
18119 	      return;
18120 	    }
18121 
18122 	  putc ('.', file);
18123 #endif
18124 	  return;
18125 
18126 	case 'z':
18127 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18128 	    {
18129 	      /* Opcodes don't get size suffixes when using Intel syntax.  */
18130 	      if (ASSEMBLER_DIALECT == ASM_INTEL)
18131 		return;
18132 
18133 	      switch (GET_MODE_SIZE (GET_MODE (x)))
18134 		{
18135 		case 1:
18136 		  putc ('b', file);
18137 		  return;
18138 
18139 		case 2:
18140 		  putc ('w', file);
18141 		  return;
18142 
18143 		case 4:
18144 		  putc ('l', file);
18145 		  return;
18146 
18147 		case 8:
18148 		  putc ('q', file);
18149 		  return;
18150 
18151 		default:
18152 		  output_operand_lossage ("invalid operand size for operand "
18153 					  "code 'z'");
18154 		  return;
18155 		}
18156 	    }
18157 
18158 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18159 	    warning (0, "non-integer operand used with operand code 'z'");
18160 	  /* FALLTHRU */
18161 
18162 	case 'Z':
18163 	  /* 387 opcodes don't get size suffixes when using Intel syntax.  */
18164 	  if (ASSEMBLER_DIALECT == ASM_INTEL)
18165 	    return;
18166 
18167 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18168 	    {
18169 	      switch (GET_MODE_SIZE (GET_MODE (x)))
18170 		{
18171 		case 2:
18172 #ifdef HAVE_AS_IX86_FILDS
18173 		  putc ('s', file);
18174 #endif
18175 		  return;
18176 
18177 		case 4:
18178 		  putc ('l', file);
18179 		  return;
18180 
18181 		case 8:
18182 #ifdef HAVE_AS_IX86_FILDQ
18183 		  putc ('q', file);
18184 #else
18185 		  fputs ("ll", file);
18186 #endif
18187 		  return;
18188 
18189 		default:
18190 		  break;
18191 		}
18192 	    }
18193 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18194 	    {
18195 	      /* 387 opcodes don't get size suffixes
18196 		 if the operands are registers.  */
18197 	      if (STACK_REG_P (x))
18198 		return;
18199 
18200 	      switch (GET_MODE_SIZE (GET_MODE (x)))
18201 		{
18202 		case 4:
18203 		  putc ('s', file);
18204 		  return;
18205 
18206 		case 8:
18207 		  putc ('l', file);
18208 		  return;
18209 
18210 		case 12:
18211 		case 16:
18212 		  putc ('t', file);
18213 		  return;
18214 
18215 		default:
18216 		  break;
18217 		}
18218 	    }
18219 	  else
18220 	    {
18221 	      output_operand_lossage ("invalid operand type used with "
18222 				      "operand code 'Z'");
18223 	      return;
18224 	    }
18225 
18226 	  output_operand_lossage ("invalid operand size for operand code 'Z'");
18227 	  return;
18228 
18229 	case 'd':
18230 	case 'b':
18231 	case 'w':
18232 	case 'k':
18233 	case 'q':
18234 	case 'h':
18235 	case 't':
18236 	case 'g':
18237 	case 'y':
18238 	case 'x':
18239 	case 'X':
18240 	case 'P':
18241 	case 'p':
18242 	case 'V':
18243 	  break;
18244 
18245 	case 's':
18246 	  if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18247 	    {
18248 	      ix86_print_operand (file, x, 0);
18249 	      fputs (", ", file);
18250 	    }
18251 	  return;
18252 
18253 	case 'Y':
18254 	  switch (GET_CODE (x))
18255 	    {
18256 	    case NE:
18257 	      fputs ("neq", file);
18258 	      break;
18259 	    case EQ:
18260 	      fputs ("eq", file);
18261 	      break;
18262 	    case GE:
18263 	    case GEU:
18264 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18265 	      break;
18266 	    case GT:
18267 	    case GTU:
18268 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18269 	      break;
18270 	    case LE:
18271 	    case LEU:
18272 	      fputs ("le", file);
18273 	      break;
18274 	    case LT:
18275 	    case LTU:
18276 	      fputs ("lt", file);
18277 	      break;
18278 	    case UNORDERED:
18279 	      fputs ("unord", file);
18280 	      break;
18281 	    case ORDERED:
18282 	      fputs ("ord", file);
18283 	      break;
18284 	    case UNEQ:
18285 	      fputs ("ueq", file);
18286 	      break;
18287 	    case UNGE:
18288 	      fputs ("nlt", file);
18289 	      break;
18290 	    case UNGT:
18291 	      fputs ("nle", file);
18292 	      break;
18293 	    case UNLE:
18294 	      fputs ("ule", file);
18295 	      break;
18296 	    case UNLT:
18297 	      fputs ("ult", file);
18298 	      break;
18299 	    case LTGT:
18300 	      fputs ("une", file);
18301 	      break;
18302 	    default:
18303 	      output_operand_lossage ("operand is not a condition code, "
18304 				      "invalid operand code 'Y'");
18305 	      return;
18306 	    }
18307 	  return;
18308 
18309 	case 'D':
18310 	  /* Little bit of braindamage here.  The SSE compare instructions
18311 	     use completely different names for the comparisons than the
18312 	     fp conditional moves do.  */
18313 	  switch (GET_CODE (x))
18314 	    {
18315 	    case UNEQ:
18316 	      if (TARGET_AVX)
18317 		{
18318 		  fputs ("eq_us", file);
18319 		  break;
18320 		}
18321 	     /* FALLTHRU */
18322 	    case EQ:
18323 	      fputs ("eq", file);
18324 	      break;
18325 	    case UNLT:
18326 	      if (TARGET_AVX)
18327 		{
18328 		  fputs ("nge", file);
18329 		  break;
18330 		}
18331 	     /* FALLTHRU */
18332 	    case LT:
18333 	      fputs ("lt", file);
18334 	      break;
18335 	    case UNLE:
18336 	      if (TARGET_AVX)
18337 		{
18338 		  fputs ("ngt", file);
18339 		  break;
18340 		}
18341 	     /* FALLTHRU */
18342 	    case LE:
18343 	      fputs ("le", file);
18344 	      break;
18345 	    case UNORDERED:
18346 	      fputs ("unord", file);
18347 	      break;
18348 	    case LTGT:
18349 	      if (TARGET_AVX)
18350 		{
18351 		  fputs ("neq_oq", file);
18352 		  break;
18353 		}
18354 	     /* FALLTHRU */
18355 	    case NE:
18356 	      fputs ("neq", file);
18357 	      break;
18358 	    case GE:
18359 	      if (TARGET_AVX)
18360 		{
18361 		  fputs ("ge", file);
18362 		  break;
18363 		}
18364 	     /* FALLTHRU */
18365 	    case UNGE:
18366 	      fputs ("nlt", file);
18367 	      break;
18368 	    case GT:
18369 	      if (TARGET_AVX)
18370 		{
18371 		  fputs ("gt", file);
18372 		  break;
18373 		}
18374 	     /* FALLTHRU */
18375 	    case UNGT:
18376 	      fputs ("nle", file);
18377 	      break;
18378 	    case ORDERED:
18379 	      fputs ("ord", file);
18380 	      break;
18381 	    default:
18382 	      output_operand_lossage ("operand is not a condition code, "
18383 				      "invalid operand code 'D'");
18384 	      return;
18385 	    }
18386 	  return;
18387 
18388 	case 'F':
18389 	case 'f':
18390 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18391 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18392 	    putc ('.', file);
18393 	  gcc_fallthrough ();
18394 #endif
18395 
18396 	case 'C':
18397 	case 'c':
18398 	  if (!COMPARISON_P (x))
18399 	    {
18400 	      output_operand_lossage ("operand is not a condition code, "
18401 				      "invalid operand code '%c'", code);
18402 	      return;
18403 	    }
18404 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18405 			      code == 'c' || code == 'f',
18406 			      code == 'F' || code == 'f',
18407 			      file);
18408 	  return;
18409 
18410 	case 'H':
18411 	  if (!offsettable_memref_p (x))
18412 	    {
18413 	      output_operand_lossage ("operand is not an offsettable memory "
18414 				      "reference, invalid operand code 'H'");
18415 	      return;
18416 	    }
18417 	  /* It doesn't actually matter what mode we use here, as we're
18418 	     only going to use this for printing.  */
18419 	  x = adjust_address_nv (x, DImode, 8);
18420 	  /* Output 'qword ptr' for intel assembler dialect.  */
18421 	  if (ASSEMBLER_DIALECT == ASM_INTEL)
18422 	    code = 'q';
18423 	  break;
18424 
18425 	case 'K':
18426 	  if (!CONST_INT_P (x))
18427 	    {
18428 	      output_operand_lossage ("operand is not an integer, invalid "
18429 				      "operand code 'K'");
18430 	      return;
18431 	    }
18432 
18433 	  if (INTVAL (x) & IX86_HLE_ACQUIRE)
18434 #ifdef HAVE_AS_IX86_HLE
18435 	    fputs ("xacquire ", file);
18436 #else
18437 	    fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18438 #endif
18439 	  else if (INTVAL (x) & IX86_HLE_RELEASE)
18440 #ifdef HAVE_AS_IX86_HLE
18441 	    fputs ("xrelease ", file);
18442 #else
18443 	    fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18444 #endif
18445 	  /* We do not want to print value of the operand.  */
18446 	  return;
18447 
18448 	case 'N':
18449 	  if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18450 	    fputs ("{z}", file);
18451 	  return;
18452 
18453 	case 'r':
18454 	  if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18455 	    {
18456 	      output_operand_lossage ("operand is not a specific integer, "
18457 				      "invalid operand code 'r'");
18458 	      return;
18459 	    }
18460 
18461 	  if (ASSEMBLER_DIALECT == ASM_INTEL)
18462 	    fputs (", ", file);
18463 
18464 	  fputs ("{sae}", file);
18465 
18466 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18467 	    fputs (", ", file);
18468 
18469 	  return;
18470 
18471 	case 'R':
18472 	  if (!CONST_INT_P (x))
18473 	    {
18474 	      output_operand_lossage ("operand is not an integer, invalid "
18475 				      "operand code 'R'");
18476 	      return;
18477 	    }
18478 
18479 	  if (ASSEMBLER_DIALECT == ASM_INTEL)
18480 	    fputs (", ", file);
18481 
18482 	  switch (INTVAL (x))
18483 	    {
18484 	    case ROUND_NEAREST_INT | ROUND_SAE:
18485 	      fputs ("{rn-sae}", file);
18486 	      break;
18487 	    case ROUND_NEG_INF | ROUND_SAE:
18488 	      fputs ("{rd-sae}", file);
18489 	      break;
18490 	    case ROUND_POS_INF | ROUND_SAE:
18491 	      fputs ("{ru-sae}", file);
18492 	      break;
18493 	    case ROUND_ZERO | ROUND_SAE:
18494 	      fputs ("{rz-sae}", file);
18495 	      break;
18496 	    default:
18497 	      output_operand_lossage ("operand is not a specific integer, "
18498 				      "invalid operand code 'R'");
18499 	    }
18500 
18501 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18502 	    fputs (", ", file);
18503 
18504 	  return;
18505 
18506 	case '*':
18507 	  if (ASSEMBLER_DIALECT == ASM_ATT)
18508 	    putc ('*', file);
18509 	  return;
18510 
18511 	case '&':
18512 	  {
18513 	    const char *name = get_some_local_dynamic_name ();
18514 	    if (name == NULL)
18515 	      output_operand_lossage ("'%%&' used without any "
18516 				      "local dynamic TLS references");
18517 	    else
18518 	      assemble_name (file, name);
18519 	    return;
18520 	  }
18521 
18522 	case '+':
18523 	  {
18524 	    rtx x;
18525 
18526 	    if (!optimize
18527 	        || optimize_function_for_size_p (cfun)
18528 		|| !TARGET_BRANCH_PREDICTION_HINTS)
18529 	      return;
18530 
18531 	    x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18532 	    if (x)
18533 	      {
18534 		int pred_val = profile_probability::from_reg_br_prob_note
18535 				 (XINT (x, 0)).to_reg_br_prob_base ();
18536 
18537 		if (pred_val < REG_BR_PROB_BASE * 45 / 100
18538 		    || pred_val > REG_BR_PROB_BASE * 55 / 100)
18539 		  {
18540 		    bool taken = pred_val > REG_BR_PROB_BASE / 2;
18541 		    bool cputaken
18542 		      = final_forward_branch_p (current_output_insn) == 0;
18543 
18544 		    /* Emit hints only in the case default branch prediction
18545 		       heuristics would fail.  */
18546 		    if (taken != cputaken)
18547 		      {
18548 			/* We use 3e (DS) prefix for taken branches and
18549 			   2e (CS) prefix for not taken branches.  */
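			/* For instance (illustrative): a forward branch that
			   the profile predicts taken would be emitted as
			   something like "ds ; jne .L3".  */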
18550 			if (taken)
18551 			  fputs ("ds ; ", file);
18552 			else
18553 			  fputs ("cs ; ", file);
18554 		      }
18555 		  }
18556 	      }
18557 	    return;
18558 	  }
18559 
18560 	case ';':
18561 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18562 	  putc (';', file);
18563 #endif
18564 	  return;
18565 
18566 	case '~':
18567 	  putc (TARGET_AVX2 ? 'i' : 'f', file);
18568 	  return;
18569 
18570 	case 'M':
18571 	  if (TARGET_X32)
18572 	    {
18573 	      /* NB: 32-bit indices in a VSIB address are sign-extended
18574 		 to 64 bits.  In x32, a 32-bit address like 0xf7fa3010 would
18575 		 be sign-extended to 0xfffffffff7fa3010, which is an invalid
18576 		 address.  Add the addr32 prefix if there is neither a base
18577 		 register nor a symbol.  */
18578 	      bool ok;
18579 	      struct ix86_address parts;
18580 	      ok = ix86_decompose_address (x, &parts);
18581 	      gcc_assert (ok && parts.index == NULL_RTX);
18582 	      if (parts.base == NULL_RTX
18583 		  && (parts.disp == NULL_RTX
18584 		      || !symbolic_operand (parts.disp,
18585 					    GET_MODE (parts.disp))))
18586 		fputs ("addr32 ", file);
18587 	    }
18588 	  return;
18589 
18590 	case '^':
18591 	  if (TARGET_64BIT && Pmode != word_mode)
18592 	    fputs ("addr32 ", file);
18593 	  return;
18594 
18595 	case '!':
18596 	  if (ix86_bnd_prefixed_insn_p (current_output_insn))
18597 	    fputs ("bnd ", file);
18598 	  if (ix86_notrack_prefixed_insn_p (current_output_insn))
18599 	    fputs ("notrack ", file);
18600 	  return;
18601 
18602 	default:
18603 	  output_operand_lossage ("invalid operand code '%c'", code);
18604 	}
18605     }
18606 
18607   if (REG_P (x))
18608     print_reg (x, code, file);
18609 
18610   else if (MEM_P (x))
18611     {
18612       rtx addr = XEXP (x, 0);
18613 
18614       /* No `byte ptr' prefix for call instructions ... */
18615       if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18616 	{
18617 	  machine_mode mode = GET_MODE (x);
18618 	  const char *size;
18619 
18620 	  /* Check for explicit size override codes.  */
18621 	  if (code == 'b')
18622 	    size = "BYTE";
18623 	  else if (code == 'w')
18624 	    size = "WORD";
18625 	  else if (code == 'k')
18626 	    size = "DWORD";
18627 	  else if (code == 'q')
18628 	    size = "QWORD";
18629 	  else if (code == 'x')
18630 	    size = "XMMWORD";
18631 	  else if (code == 't')
18632 	    size = "YMMWORD";
18633 	  else if (code == 'g')
18634 	    size = "ZMMWORD";
18635 	  else if (mode == BLKmode)
18636 	    /* ... or BLKmode operands, when not overridden.  */
18637 	    size = NULL;
18638 	  else
18639 	    switch (GET_MODE_SIZE (mode))
18640 	      {
18641 	      case 1: size = "BYTE"; break;
18642 	      case 2: size = "WORD"; break;
18643 	      case 4: size = "DWORD"; break;
18644 	      case 8: size = "QWORD"; break;
18645 	      case 12: size = "TBYTE"; break;
18646 	      case 16:
18647 		if (mode == XFmode)
18648 		  size = "TBYTE";
18649 		else
18650 		  size = "XMMWORD";
18651 		break;
18652 	      case 32: size = "YMMWORD"; break;
18653 	      case 64: size = "ZMMWORD"; break;
18654 	      default:
18655 		gcc_unreachable ();
18656 	      }
18657 	  if (size)
18658 	    {
18659 	      fputs (size, file);
18660 	      fputs (" PTR ", file);
18661 	    }
18662 	}
18663 
18664       if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18665 	output_operand_lossage ("invalid constraints for operand");
18666       else
18667 	ix86_print_operand_address_as
18668 	  (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18669     }
18670 
18671   else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18672     {
18673       long l;
18674 
18675       REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18676 
18677       if (ASSEMBLER_DIALECT == ASM_ATT)
18678 	putc ('$', file);
18679       /* Sign extend 32bit SFmode immediate to 8 bytes.  */
18680       if (code == 'q')
18681 	fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18682 		 (unsigned long long) (int) l);
18683       else
18684 	fprintf (file, "0x%08x", (unsigned int) l);
18685     }
18686 
18687   else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18688     {
18689       long l[2];
18690 
18691       REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18692 
18693       if (ASSEMBLER_DIALECT == ASM_ATT)
18694 	putc ('$', file);
18695       fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18696     }
18697 
18698   /* These float cases don't actually occur as immediate operands.  */
18699   else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18700     {
18701       char dstr[30];
18702 
18703       real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18704       fputs (dstr, file);
18705     }
18706 
18707   else
18708     {
18709       /* We have patterns that allow zero sets of memory, for instance.
18710 	 In 64-bit mode, we should probably support all 8-byte vectors,
18711 	 since we can in fact encode that into an immediate.  */
18712       if (GET_CODE (x) == CONST_VECTOR)
18713 	{
18714 	  if (x != CONST0_RTX (GET_MODE (x)))
18715 	    output_operand_lossage ("invalid vector immediate");
18716 	  x = const0_rtx;
18717 	}
18718 
18719       if (code != 'P' && code != 'p')
18720 	{
18721 	  if (CONST_INT_P (x))
18722 	    {
18723 	      if (ASSEMBLER_DIALECT == ASM_ATT)
18724 		putc ('$', file);
18725 	    }
18726 	  else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18727 		   || GET_CODE (x) == LABEL_REF)
18728 	    {
18729 	      if (ASSEMBLER_DIALECT == ASM_ATT)
18730 		putc ('$', file);
18731 	      else
18732 		fputs ("OFFSET FLAT:", file);
18733 	    }
18734 	}
18735       if (CONST_INT_P (x))
18736 	fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18737       else if (flag_pic || MACHOPIC_INDIRECT)
18738 	output_pic_addr_const (file, x, code);
18739       else
18740 	output_addr_const (file, x);
18741     }
18742 }
18743 
18744 static bool
18745 ix86_print_operand_punct_valid_p (unsigned char code)
18746 {
18747   return (code == '*' || code == '+' || code == '&' || code == ';'
18748 	  || code == '~' || code == '^' || code == '!');
18749 }
18750 
18751 /* Print a memory operand whose address is ADDR.  */
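/* For example (illustrative): an address with base %ebx, index %ecx, scale 4
   and displacement 16 is printed as "16(%ebx,%ecx,4)" in AT&T syntax and as
   "[ebx+16+ecx*4]" in Intel syntax.  */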
18752 
18753 static void
18754 ix86_print_operand_address_as (FILE *file, rtx addr,
18755 			       addr_space_t as, bool no_rip)
18756 {
18757   struct ix86_address parts;
18758   rtx base, index, disp;
18759   int scale;
18760   int ok;
18761   bool vsib = false;
18762   int code = 0;
18763 
18764   if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18765     {
18766       ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18767       gcc_assert (parts.index == NULL_RTX);
18768       parts.index = XVECEXP (addr, 0, 1);
18769       parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18770       addr = XVECEXP (addr, 0, 0);
18771       vsib = true;
18772     }
18773   else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18774     {
18775       gcc_assert (TARGET_64BIT);
18776       ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18777       code = 'q';
18778     }
18779   else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18780     {
18781       ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18782       gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18783       if (parts.base != NULL_RTX)
18784 	{
18785 	  parts.index = parts.base;
18786 	  parts.scale = 1;
18787 	}
18788       parts.base = XVECEXP (addr, 0, 0);
18789       addr = XVECEXP (addr, 0, 0);
18790     }
18791   else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18792     {
18793       ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18794       gcc_assert (parts.index == NULL_RTX);
18795       parts.index = XVECEXP (addr, 0, 1);
18796       addr = XVECEXP (addr, 0, 0);
18797     }
18798   else
18799     ok = ix86_decompose_address (addr, &parts);
18800 
18801   gcc_assert (ok);
18802 
18803   base = parts.base;
18804   index = parts.index;
18805   disp = parts.disp;
18806   scale = parts.scale;
18807 
18808   if (ADDR_SPACE_GENERIC_P (as))
18809     as = parts.seg;
18810   else
18811     gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18812 
18813   if (!ADDR_SPACE_GENERIC_P (as))
18814     {
18815       const char *string;
18816 
18817       if (as == ADDR_SPACE_SEG_FS)
18818 	string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18819       else if (as == ADDR_SPACE_SEG_GS)
18820 	string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18821       else
18822 	gcc_unreachable ();
18823       fputs (string, file);
18824     }
18825 
18826   /* Use the one byte shorter RIP-relative addressing for 64-bit mode.  */
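  /* (In 64-bit mode a bare [disp32] absolute address needs an extra SIB
     byte, while the RIP-relative encoding carries the same disp32 without
     one - hence "one byte shorter".)  */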
18827   if (TARGET_64BIT && !base && !index && !no_rip)
18828     {
18829       rtx symbol = disp;
18830 
18831       if (GET_CODE (disp) == CONST
18832 	  && GET_CODE (XEXP (disp, 0)) == PLUS
18833 	  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18834 	symbol = XEXP (XEXP (disp, 0), 0);
18835 
18836       if (GET_CODE (symbol) == LABEL_REF
18837 	  || (GET_CODE (symbol) == SYMBOL_REF
18838 	      && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18839 	base = pc_rtx;
18840     }
18841 
18842   if (!base && !index)
18843     {
18844       /* A displacement-only address requires special attention.  */
18845       if (CONST_INT_P (disp))
18846 	{
18847 	  if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18848 	    fputs ("ds:", file);
18849 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18850 	}
18851       /* Load the external function address via the GOT slot to avoid PLT.  */
18852       else if (GET_CODE (disp) == CONST
18853 	       && GET_CODE (XEXP (disp, 0)) == UNSPEC
18854 	       && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18855 		   || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18856 	       && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18857 	output_pic_addr_const (file, disp, 0);
18858       else if (flag_pic)
18859 	output_pic_addr_const (file, disp, 0);
18860       else
18861 	output_addr_const (file, disp);
18862     }
18863   else
18864     {
18865       /* Print SImode register names to force addr32 prefix.  */
18866       if (SImode_address_operand (addr, VOIDmode))
18867 	{
18868 	  if (flag_checking)
18869 	    {
18870 	      gcc_assert (TARGET_64BIT);
18871 	      switch (GET_CODE (addr))
18872 		{
18873 		case SUBREG:
18874 		  gcc_assert (GET_MODE (addr) == SImode);
18875 		  gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18876 		  break;
18877 		case ZERO_EXTEND:
18878 		case AND:
18879 		  gcc_assert (GET_MODE (addr) == DImode);
18880 		  break;
18881 		default:
18882 		  gcc_unreachable ();
18883 		}
18884 	    }
18885 	  gcc_assert (!code);
18886 	  code = 'k';
18887 	}
18888       else if (code == 0
18889 	       && TARGET_X32
18890 	       && disp
18891 	       && CONST_INT_P (disp)
18892 	       && INTVAL (disp) < -16*1024*1024)
18893 	{
18894 	  /* X32 runs in 64-bit mode, where displacement, DISP, in
18895 	     address DISP(%r64), is encoded as 32-bit immediate sign-
18896 	     extended from 32-bit to 64-bit.  For -0x40000300(%r64),
18897 	     address is %r64 + 0xffffffffbffffd00.  When %r64 <
18898 	     0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18899 	     which is invalid for x32.  The correct address is %r64
18900 	     - 0x40000300 == 0xf7ffdd64.  To properly encode
18901 	     -0x40000300(%r64) for x32, we zero-extend negative
18902 	     displacement by forcing addr32 prefix which truncates
18903 	     0xfffffffff7ffdd64 to 0xf7ffdd64.  In theory, we should
18904 	     zero-extend all negative displacements, including -1(%rsp).
18905 	     However, for small negative displacements, sign-extension
18906 	     won't cause overflow.  We only zero-extend negative
18907 	     displacements if they are < -16*1024*1024, which is also the
18908 	     bound used to check legitimate address displacements for PIC.  */
18909 	  code = 'k';
18910 	}
18911 
18912       /* Since the upper 32 bits of RSP are always zero for x32,
18913 	 we can encode %esp as %rsp to avoid 0x67 prefix if
18914 	 there is no index register.  */
18915       if (TARGET_X32 && Pmode == SImode
18916 	  && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18917 	code = 'q';
18918 
18919       if (ASSEMBLER_DIALECT == ASM_ATT)
18920 	{
18921 	  if (disp)
18922 	    {
18923 	      if (flag_pic)
18924 		output_pic_addr_const (file, disp, 0);
18925 	      else if (GET_CODE (disp) == LABEL_REF)
18926 		output_asm_label (disp);
18927 	      else
18928 		output_addr_const (file, disp);
18929 	    }
18930 
18931 	  putc ('(', file);
18932 	  if (base)
18933 	    print_reg (base, code, file);
18934 	  if (index)
18935 	    {
18936 	      putc (',', file);
18937 	      print_reg (index, vsib ? 0 : code, file);
18938 	      if (scale != 1 || vsib)
18939 		fprintf (file, ",%d", scale);
18940 	    }
18941 	  putc (')', file);
18942 	}
18943       else
18944 	{
18945 	  rtx offset = NULL_RTX;
18946 
18947 	  if (disp)
18948 	    {
18949 	      /* Pull out the offset of a symbol; print any symbol itself.  */
18950 	      if (GET_CODE (disp) == CONST
18951 		  && GET_CODE (XEXP (disp, 0)) == PLUS
18952 		  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18953 		{
18954 		  offset = XEXP (XEXP (disp, 0), 1);
18955 		  disp = gen_rtx_CONST (VOIDmode,
18956 					XEXP (XEXP (disp, 0), 0));
18957 		}
18958 
18959 	      if (flag_pic)
18960 		output_pic_addr_const (file, disp, 0);
18961 	      else if (GET_CODE (disp) == LABEL_REF)
18962 		output_asm_label (disp);
18963 	      else if (CONST_INT_P (disp))
18964 		offset = disp;
18965 	      else
18966 		output_addr_const (file, disp);
18967 	    }
18968 
18969 	  putc ('[', file);
18970 	  if (base)
18971 	    {
18972 	      print_reg (base, code, file);
18973 	      if (offset)
18974 		{
18975 		  if (INTVAL (offset) >= 0)
18976 		    putc ('+', file);
18977 		  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18978 		}
18979 	    }
18980 	  else if (offset)
18981 	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18982 	  else
18983 	    putc ('0', file);
18984 
18985 	  if (index)
18986 	    {
18987 	      putc ('+', file);
18988 	      print_reg (index, vsib ? 0 : code, file);
18989 	      if (scale != 1 || vsib)
18990 		fprintf (file, "*%d", scale);
18991 	    }
18992 	  putc (']', file);
18993 	}
18994     }
18995 }
18996 
18997 static void
18998 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18999 {
19000   ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19001 }
19002 
19003 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
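/* For instance (illustrative): (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF) is
   printed as "foo@gotoff", while UNSPEC_NTPOFF is printed as "foo@tpoff" on
   64-bit targets and "foo@ntpoff" on 32-bit ones.  */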
19004 
19005 static bool
19006 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19007 {
19008   rtx op;
19009 
19010   if (GET_CODE (x) != UNSPEC)
19011     return false;
19012 
19013   op = XVECEXP (x, 0, 0);
19014   switch (XINT (x, 1))
19015     {
19016     case UNSPEC_GOTOFF:
19017       output_addr_const (file, op);
19018       fputs ("@gotoff", file);
19019       break;
19020     case UNSPEC_GOTTPOFF:
19021       output_addr_const (file, op);
19022       /* FIXME: This might be @TPOFF in Sun ld.  */
19023       fputs ("@gottpoff", file);
19024       break;
19025     case UNSPEC_TPOFF:
19026       output_addr_const (file, op);
19027       fputs ("@tpoff", file);
19028       break;
19029     case UNSPEC_NTPOFF:
19030       output_addr_const (file, op);
19031       if (TARGET_64BIT)
19032 	fputs ("@tpoff", file);
19033       else
19034 	fputs ("@ntpoff", file);
19035       break;
19036     case UNSPEC_DTPOFF:
19037       output_addr_const (file, op);
19038       fputs ("@dtpoff", file);
19039       break;
19040     case UNSPEC_GOTNTPOFF:
19041       output_addr_const (file, op);
19042       if (TARGET_64BIT)
19043 	fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19044 	       "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19045       else
19046 	fputs ("@gotntpoff", file);
19047       break;
19048     case UNSPEC_INDNTPOFF:
19049       output_addr_const (file, op);
19050       fputs ("@indntpoff", file);
19051       break;
19052 #if TARGET_MACHO
19053     case UNSPEC_MACHOPIC_OFFSET:
19054       output_addr_const (file, op);
19055       putc ('-', file);
19056       machopic_output_function_base_name (file);
19057       break;
19058 #endif
19059 
19060     default:
19061       return false;
19062     }
19063 
19064   return true;
19065 }
19066 
19067 /* Split one or more double-mode RTL references into pairs of half-mode
19068    references.  The RTL can be REG, offsettable MEM, integer constant, or
19069    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
19070    split and "num" is its length.  lo_half and hi_half are output arrays
19071    that parallel "operands".  */
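/* For example (illustrative): on a 32-bit target, splitting a DImode register
   operand yields its SImode low and high halves (byte offsets 0 and 4) in
   lo_half[] and hi_half[], while a DImode MEM is split into two adjacent
   SImode MEMs.  */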
19072 
19073 void
19074 split_double_mode (machine_mode mode, rtx operands[],
19075 		   int num, rtx lo_half[], rtx hi_half[])
19076 {
19077   machine_mode half_mode;
19078   unsigned int byte;
19079 
19080   switch (mode)
19081     {
19082     case E_TImode:
19083       half_mode = DImode;
19084       break;
19085     case E_DImode:
19086       half_mode = SImode;
19087       break;
19088     default:
19089       gcc_unreachable ();
19090     }
19091 
19092   byte = GET_MODE_SIZE (half_mode);
19093 
19094   while (num--)
19095     {
19096       rtx op = operands[num];
19097 
19098       /* simplify_subreg refuses to split volatile memory addresses,
19099          but we still have to handle them.  */
19100       if (MEM_P (op))
19101 	{
19102 	  lo_half[num] = adjust_address (op, half_mode, 0);
19103 	  hi_half[num] = adjust_address (op, half_mode, byte);
19104 	}
19105       else
19106 	{
19107 	  lo_half[num] = simplify_gen_subreg (half_mode, op,
19108 					      GET_MODE (op) == VOIDmode
19109 					      ? mode : GET_MODE (op), 0);
19110 	  hi_half[num] = simplify_gen_subreg (half_mode, op,
19111 					      GET_MODE (op) == VOIDmode
19112 					      ? mode : GET_MODE (op), byte);
19113 	}
19114     }
19115 }
19116 
19117 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19118    MINUS, MULT or DIV.  OPERANDS are the insn operands, where operands[3]
19119    is the expression of the binary operation.  The output may either be
19120    emitted here, or returned to the caller, like all output_* functions.
19121 
19122    There is no guarantee that the operands are the same mode, as they
19123    might be within FLOAT or FLOAT_EXTEND expressions.  */
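/* As a rough illustration (assuming plain DFmode register-stack operands and
   that operand 2 is not dead): for
     (set (reg st0) (plus:DF (reg st0) (reg st1)))
   this returns "fadd\t{%y2, %0|%0, %y2}", which prints as
   "fadd %st(1), %st" in AT&T syntax.  */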
19124 
19125 #ifndef SYSV386_COMPAT
19126 /* Set to 1 for compatibility with brain-damaged assemblers.  No-one
19127    wants to fix the assemblers because that causes incompatibility
19128    with gcc.  No-one wants to fix gcc because that causes
19129    incompatibility with assemblers...  You can use the option of
19130    -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way.  */
19131 #define SYSV386_COMPAT 1
19132 #endif
19133 
19134 const char *
19135 output_387_binary_op (rtx_insn *insn, rtx *operands)
19136 {
19137   static char buf[40];
19138   const char *p;
19139   bool is_sse
19140     = (SSE_REG_P (operands[0])
19141        || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
19142 
19143   if (is_sse)
19144     p = "%v";
19145   else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19146 	   || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19147     p = "fi";
19148   else
19149     p = "f";
19150 
19151   strcpy (buf, p);
19152 
19153   switch (GET_CODE (operands[3]))
19154     {
19155     case PLUS:
19156       p = "add"; break;
19157     case MINUS:
19158       p = "sub"; break;
19159     case MULT:
19160       p = "mul"; break;
19161     case DIV:
19162       p = "div"; break;
19163     default:
19164       gcc_unreachable ();
19165     }
19166 
19167   strcat (buf, p);
19168 
19169   if (is_sse)
19170    {
19171      p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
19172      strcat (buf, p);
19173 
19174      if (TARGET_AVX)
19175        p = "\t{%2, %1, %0|%0, %1, %2}";
19176      else
19177        p = "\t{%2, %0|%0, %2}";
19178 
19179      strcat (buf, p);
19180      return buf;
19181    }
19182 
19183   /* Even if we do not want to check the inputs, this documents the input
19184      constraints, which helps in understanding the following code.  */
19185   if (flag_checking)
19186     {
19187       if (STACK_REG_P (operands[0])
19188 	  && ((REG_P (operands[1])
19189 	       && REGNO (operands[0]) == REGNO (operands[1])
19190 	       && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19191 	      || (REG_P (operands[2])
19192 		  && REGNO (operands[0]) == REGNO (operands[2])
19193 		  && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19194 	  && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19195 	; /* ok */
19196       else
19197 	gcc_unreachable ();
19198     }
19199 
19200   switch (GET_CODE (operands[3]))
19201     {
19202     case MULT:
19203     case PLUS:
19204       if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19205 	std::swap (operands[1], operands[2]);
19206 
19207       /* We know operands[0] == operands[1].  */
19208 
19209       if (MEM_P (operands[2]))
19210 	{
19211 	  p = "%Z2\t%2";
19212 	  break;
19213 	}
19214 
19215       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19216 	{
19217 	  if (STACK_TOP_P (operands[0]))
19218 	    /* How is it that we are storing to a dead operand[2]?
19219 	       Well, presumably operands[1] is dead too.  We can't
19220 	       store the result to st(0) as st(0) gets popped on this
19221 	       instruction.  Instead store to operands[2] (which I
19222 	       think has to be st(1)).  st(1) will be popped later.
19223 	       gcc <= 2.8.1 didn't have this check and generated
19224 	       assembly code that the Unixware assembler rejected.  */
19225 	    p = "p\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
19226 	  else
19227 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
19228 	  break;
19229 	}
19230 
19231       if (STACK_TOP_P (operands[0]))
19232 	p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
19233       else
19234 	p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
19235       break;
19236 
19237     case MINUS:
19238     case DIV:
19239       if (MEM_P (operands[1]))
19240 	{
19241 	  p = "r%Z1\t%1";
19242 	  break;
19243 	}
19244 
19245       if (MEM_P (operands[2]))
19246 	{
19247 	  p = "%Z2\t%2";
19248 	  break;
19249 	}
19250 
19251       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19252 	{
19253 #if SYSV386_COMPAT
19254 	  /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19255 	     derived assemblers, confusingly reverse the direction of
19256 	     the operation for fsub{r} and fdiv{r} when the
19257 	     destination register is not st(0).  The Intel assembler
19258 	     doesn't have this brain damage.  Read !SYSV386_COMPAT to
19259 	     figure out what the hardware really does.  */
19260 	  if (STACK_TOP_P (operands[0]))
19261 	    p = "{p\t%0, %2|rp\t%2, %0}";
19262 	  else
19263 	    p = "{rp\t%2, %0|p\t%0, %2}";
19264 #else
19265 	  if (STACK_TOP_P (operands[0]))
19266 	    /* As above for fmul/fadd, we can't store to st(0).  */
19267 	    p = "rp\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
19268 	  else
19269 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
19270 #endif
19271 	  break;
19272 	}
19273 
19274       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19275 	{
19276 #if SYSV386_COMPAT
19277 	  if (STACK_TOP_P (operands[0]))
19278 	    p = "{rp\t%0, %1|p\t%1, %0}";
19279 	  else
19280 	    p = "{p\t%1, %0|rp\t%0, %1}";
19281 #else
19282 	  if (STACK_TOP_P (operands[0]))
19283 	    p = "p\t{%0, %1|%1, %0}";	/* st(1) = st(1) op st(0); pop */
19284 	  else
19285 	    p = "rp\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2); pop */
19286 #endif
19287 	  break;
19288 	}
19289 
19290       if (STACK_TOP_P (operands[0]))
19291 	{
19292 	  if (STACK_TOP_P (operands[1]))
19293 	    p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
19294 	  else
19295 	    p = "r\t{%y1, %0|%0, %y1}";	/* st(0) = st(r1) op st(0) */
19296 	  break;
19297 	}
19298       else if (STACK_TOP_P (operands[1]))
19299 	{
19300 #if SYSV386_COMPAT
19301 	  p = "{\t%1, %0|r\t%0, %1}";
19302 #else
19303 	  p = "r\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2) */
19304 #endif
19305 	}
19306       else
19307 	{
19308 #if SYSV386_COMPAT
19309 	  p = "{r\t%2, %0|\t%0, %2}";
19310 #else
19311 	  p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
19312 #endif
19313 	}
19314       break;
19315 
19316     default:
19317       gcc_unreachable ();
19318     }
19319 
19320   strcat (buf, p);
19321   return buf;
19322 }
19323 
19324 /* Return needed mode for entity in optimize_mode_switching pass.  */
19325 
19326 static int
19327 ix86_dirflag_mode_needed (rtx_insn *insn)
19328 {
19329   if (CALL_P (insn))
19330     {
19331       if (cfun->machine->func_type == TYPE_NORMAL)
19332 	return X86_DIRFLAG_ANY;
19333       else
19334 	/* No need to emit CLD in interrupt handler for TARGET_CLD.  */
19335 	return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19336     }
19337 
19338   if (recog_memoized (insn) < 0)
19339     return X86_DIRFLAG_ANY;
19340 
19341   if (get_attr_type (insn) == TYPE_STR)
19342     {
19343       /* Emit cld instruction if stringops are used in the function.  */
19344       if (cfun->machine->func_type == TYPE_NORMAL)
19345 	return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19346       else
19347 	return X86_DIRFLAG_RESET;
19348     }
19349 
19350   return X86_DIRFLAG_ANY;
19351 }
19352 
19353 /* Check if a 256bit or 512bit AVX register is referenced inside EXP.  */
19354 
19355 static bool
19356 ix86_check_avx_upper_register (const_rtx exp)
19357 {
19358   if (SUBREG_P (exp))
19359     exp = SUBREG_REG (exp);
19360 
19361   return (REG_P (exp)
19362 	&& (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
19363 	|| VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
19364 }
19365 
19366 /* Return needed mode for entity in optimize_mode_switching pass.  */
19367 
19368 static int
19369 ix86_avx_u128_mode_needed (rtx_insn *insn)
19370 {
19371   if (CALL_P (insn))
19372     {
19373       rtx link;
19374 
19375       /* The needed mode is set to AVX_U128_CLEAN if there are
19376 	 no 256bit or 512bit modes used in the function arguments.  */
19377       for (link = CALL_INSN_FUNCTION_USAGE (insn);
19378 	   link;
19379 	   link = XEXP (link, 1))
19380 	{
19381 	  if (GET_CODE (XEXP (link, 0)) == USE)
19382 	    {
19383 	      rtx arg = XEXP (XEXP (link, 0), 0);
19384 
19385 	      if (ix86_check_avx_upper_register (arg))
19386 		return AVX_U128_DIRTY;
19387 	    }
19388 	}
19389 
19390       return AVX_U128_CLEAN;
19391     }
19392 
19393   /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
19394      Hardware changes state only when a 256bit register is written to,
19395      but we need to prevent the compiler from moving the optimal insertion
19396      point above an eventual read from a 256bit or 512bit register.  */
19397   subrtx_iterator::array_type array;
19398   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19399     if (ix86_check_avx_upper_register (*iter))
19400       return AVX_U128_DIRTY;
19401 
19402   return AVX_U128_ANY;
19403 }
19404 
19405 /* Return mode that i387 must be switched into
19406    prior to the execution of insn.  */
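/* For background (illustrative): the I387_TRUNC, I387_FLOOR and I387_CEIL
   entities track the x87 rounding-control bits of the control word, so an
   insn whose i387_cw attribute is e.g. "floor" needs round-toward-negative-
   infinity mode in effect before it executes; I387_MASK_PM tracks the
   precision-exception mask bit.  */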
19407 
19408 static int
19409 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19410 {
19411   enum attr_i387_cw mode;
19412 
19413   /* The mode UNINITIALIZED is used to store the control word after a
19414      function call or ASM pattern.  The mode ANY specifies that the function
19415      has no requirements on the control word and makes no changes to the
19416      bits we are interested in.  */
19417 
19418   if (CALL_P (insn)
19419       || (NONJUMP_INSN_P (insn)
19420 	  && (asm_noperands (PATTERN (insn)) >= 0
19421 	      || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19422     return I387_CW_UNINITIALIZED;
19423 
19424   if (recog_memoized (insn) < 0)
19425     return I387_CW_ANY;
19426 
19427   mode = get_attr_i387_cw (insn);
19428 
19429   switch (entity)
19430     {
19431     case I387_TRUNC:
19432       if (mode == I387_CW_TRUNC)
19433 	return mode;
19434       break;
19435 
19436     case I387_FLOOR:
19437       if (mode == I387_CW_FLOOR)
19438 	return mode;
19439       break;
19440 
19441     case I387_CEIL:
19442       if (mode == I387_CW_CEIL)
19443 	return mode;
19444       break;
19445 
19446     case I387_MASK_PM:
19447       if (mode == I387_CW_MASK_PM)
19448 	return mode;
19449       break;
19450 
19451     default:
19452       gcc_unreachable ();
19453     }
19454 
19455   return I387_CW_ANY;
19456 }
19457 
19458 /* Return mode that entity must be switched into
19459    prior to the execution of insn.  */
19460 
19461 static int
19462 ix86_mode_needed (int entity, rtx_insn *insn)
19463 {
19464   switch (entity)
19465     {
19466     case X86_DIRFLAG:
19467       return ix86_dirflag_mode_needed (insn);
19468     case AVX_U128:
19469       return ix86_avx_u128_mode_needed (insn);
19470     case I387_TRUNC:
19471     case I387_FLOOR:
19472     case I387_CEIL:
19473     case I387_MASK_PM:
19474       return ix86_i387_mode_needed (entity, insn);
19475     default:
19476       gcc_unreachable ();
19477     }
19478   return 0;
19479 }
19480 
19481 /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
19482 
19483 static void
19484 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
19485 {
19486   if (ix86_check_avx_upper_register (dest))
19487     {
19488       bool *used = (bool *) data;
19489       *used = true;
19490     }
19491 }
19492 
19493 /* Calculate mode of upper 128bit AVX registers after the insn.  */
19494 
19495 static int
19496 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19497 {
19498   rtx pat = PATTERN (insn);
19499 
19500   if (vzeroupper_operation (pat, VOIDmode)
19501       || vzeroall_operation (pat, VOIDmode))
19502     return AVX_U128_CLEAN;
19503 
19504   /* We know that the state is clean after a CALL insn if no 256bit
19505      or 512bit register is used as the function return register.  */
19506   if (CALL_P (insn))
19507     {
19508       bool avx_upper_reg_found = false;
19509       note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
19510 
19511       return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19512     }
19513 
19514   /* Otherwise, return current mode.  Remember that if insn
19515      references AVX 256bit or 512bit registers, the mode was already
19516      changed to DIRTY from MODE_NEEDED.  */
19517   return mode;
19518 }
19519 
19520 /* Return the mode that an insn results in.  */
19521 
19522 static int
19523 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19524 {
19525   switch (entity)
19526     {
19527     case X86_DIRFLAG:
19528       return mode;
19529     case AVX_U128:
19530       return ix86_avx_u128_mode_after (mode, insn);
19531     case I387_TRUNC:
19532     case I387_FLOOR:
19533     case I387_CEIL:
19534     case I387_MASK_PM:
19535       return mode;
19536     default:
19537       gcc_unreachable ();
19538     }
19539 }
19540 
19541 static int
19542 ix86_dirflag_mode_entry (void)
19543 {
19544   /* With TARGET_CLD or in an interrupt handler we can't assume the
19545      direction flag state at function entry.  */
19546   if (TARGET_CLD
19547       || cfun->machine->func_type != TYPE_NORMAL)
19548     return X86_DIRFLAG_ANY;
19549 
19550   return X86_DIRFLAG_RESET;
19551 }
19552 
19553 static int
19554 ix86_avx_u128_mode_entry (void)
19555 {
19556   tree arg;
19557 
19558   /* Entry mode is set to AVX_U128_DIRTY if there are
19559      256bit or 512bit modes used in function arguments.  */
19560   for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19561        arg = TREE_CHAIN (arg))
19562     {
19563       rtx incoming = DECL_INCOMING_RTL (arg);
19564 
19565       if (incoming && ix86_check_avx_upper_register (incoming))
19566 	return AVX_U128_DIRTY;
19567     }
19568 
19569   return AVX_U128_CLEAN;
19570 }
19571 
19572 /* Return a mode that ENTITY is assumed to be
19573    switched to at function entry.  */
19574 
19575 static int
19576 ix86_mode_entry (int entity)
19577 {
19578   switch (entity)
19579     {
19580     case X86_DIRFLAG:
19581       return ix86_dirflag_mode_entry ();
19582     case AVX_U128:
19583       return ix86_avx_u128_mode_entry ();
19584     case I387_TRUNC:
19585     case I387_FLOOR:
19586     case I387_CEIL:
19587     case I387_MASK_PM:
19588       return I387_CW_ANY;
19589     default:
19590       gcc_unreachable ();
19591     }
19592 }
19593 
19594 static int
19595 ix86_avx_u128_mode_exit (void)
19596 {
19597   rtx reg = crtl->return_rtx;
19598 
19599   /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19600      or 512bit modes used in the function return register.  */
19601   if (reg && ix86_check_avx_upper_register (reg))
19602     return AVX_U128_DIRTY;
19603 
19604   return AVX_U128_CLEAN;
19605 }
19606 
19607 /* Return a mode that ENTITY is assumed to be
19608    switched to at function exit.  */
19609 
19610 static int
19611 ix86_mode_exit (int entity)
19612 {
19613   switch (entity)
19614     {
19615     case X86_DIRFLAG:
19616       return X86_DIRFLAG_ANY;
19617     case AVX_U128:
19618       return ix86_avx_u128_mode_exit ();
19619     case I387_TRUNC:
19620     case I387_FLOOR:
19621     case I387_CEIL:
19622     case I387_MASK_PM:
19623       return I387_CW_ANY;
19624     default:
19625       gcc_unreachable ();
19626     }
19627 }
19628 
19629 static int
19630 ix86_mode_priority (int, int n)
19631 {
19632   return n;
19633 }
19634 
19635 /* Output code to initialize control word copies used by trunc?f?i and
19636    rounding patterns.  MODE selects the control word to set up; the
19637    modified copy is stored in the stack slot corresponding to MODE.  */
19638 
19639 static void
19640 emit_i387_cw_initialization (int mode)
19641 {
19642   rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19643   rtx new_mode;
19644 
19645   enum ix86_stack_slot slot;
19646 
19647   rtx reg = gen_reg_rtx (HImode);
19648 
19649   emit_insn (gen_x86_fnstcw_1 (stored_mode));
19650   emit_move_insn (reg, copy_rtx (stored_mode));
19651 
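  /* x87 control word layout, as far as these patterns care: bits 10-11
     hold the rounding control (00 = nearest, 01 = down, 10 = up,
     11 = truncate toward zero) and bit 5 masks the precision exception.
     Each case below sets the desired field on top of the saved word.  */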
19652   switch (mode)
19653     {
19654     case I387_CW_TRUNC:
19655       /* round toward zero (truncate) */
19656       emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19657       slot = SLOT_CW_TRUNC;
19658       break;
19659 
19660     case I387_CW_FLOOR:
19661       /* round down toward -oo */
19662       emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19663       emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19664       slot = SLOT_CW_FLOOR;
19665       break;
19666 
19667     case I387_CW_CEIL:
19668       /* round up toward +oo */
19669       emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19670       emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19671       slot = SLOT_CW_CEIL;
19672       break;
19673 
19674     case I387_CW_MASK_PM:
19675       /* mask precision exception for nearbyint() */
19676       emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19677       slot = SLOT_CW_MASK_PM;
19678       break;
19679 
19680     default:
19681       gcc_unreachable ();
19682     }
19683 
19684   gcc_assert (slot < MAX_386_STACK_LOCALS);
19685 
19686   new_mode = assign_386_stack_local (HImode, slot);
19687   emit_move_insn (new_mode, reg);
19688 }
19689 
19690 /* Emit vzeroupper.  */
19691 
19692 void
19693 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19694 {
19695   int i;
19696 
19697   /* Cancel automatic vzeroupper insertion if there are
19698      live call-saved SSE registers at the insertion point.  */
19699 
19700   for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19701     if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19702       return;
19703 
19704   if (TARGET_64BIT)
19705     for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19706       if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19707 	return;
19708 
19709   emit_insn (gen_avx_vzeroupper ());
19710 }
19711 
19714 /* Generate one or more insns to set ENTITY to MODE.  REGS_LIVE
19715    is the set of hard registers live at the point where the insn(s)
19716    are to be inserted.  */
19717 
19718 static void
19719 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19720 		    HARD_REG_SET regs_live)
19721 {
19722   switch (entity)
19723     {
19724     case X86_DIRFLAG:
19725       if (mode == X86_DIRFLAG_RESET)
19726 	emit_insn (gen_cld ());
19727       break;
19728     case AVX_U128:
19729       if (mode == AVX_U128_CLEAN)
19730 	ix86_avx_emit_vzeroupper (regs_live);
19731       break;
19732     case I387_TRUNC:
19733     case I387_FLOOR:
19734     case I387_CEIL:
19735     case I387_MASK_PM:
19736       if (mode != I387_CW_ANY
19737 	  && mode != I387_CW_UNINITIALIZED)
19738 	emit_i387_cw_initialization (mode);
19739       break;
19740     default:
19741       gcc_unreachable ();
19742     }
19743 }
19744 
19745 /* Output code for INSN to convert a float to a signed int.  OPERANDS
19746    are the insn operands.  The output may be [HSD]Imode and the input
19747    operand may be [SDX]Fmode.  */
19748 
19749 const char *
19750 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19751 {
19752   bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19753   bool dimode_p = GET_MODE (operands[0]) == DImode;
19754   int round_mode = get_attr_i387_cw (insn);
19755 
19756   static char buf[40];
19757   const char *p;
19758 
19759   /* Jump through a hoop or two for DImode, since the hardware has no
19760      non-popping instruction.  We used to do this a different way, but
19761      that was somewhat fragile and broke with post-reload splitters.  */
19762   if ((dimode_p || fisttp) && !stack_top_dies)
19763     output_asm_insn ("fld\t%y1", operands);
19764 
19765   gcc_assert (STACK_TOP_P (operands[1]));
19766   gcc_assert (MEM_P (operands[0]));
19767   gcc_assert (GET_MODE (operands[1]) != TFmode);
19768 
19769   if (fisttp)
19770     return "fisttp%Z0\t%0";
19771 
19772   strcpy (buf, "fist");
19773 
19774   if (round_mode != I387_CW_ANY)
19775     output_asm_insn ("fldcw\t%3", operands);
19776 
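  /* Emit the popping "fistp" when the stack top dies or for DImode
     (which has no non-popping form); otherwise skip the leading 'p'
     so a plain "fist" is produced.  */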
19777   p = "p%Z0\t%0";
19778   strcat (buf, p + !(stack_top_dies || dimode_p));
19779 
19780   output_asm_insn (buf, operands);
19781 
19782   if (round_mode != I387_CW_ANY)
19783     output_asm_insn ("fldcw\t%2", operands);
19784 
19785   return "";
19786 }
19787 
19788 /* Output code for x87 ffreep insn.  The OPNO argument, which may only
19789    have the values zero or one, indicates the ffreep insn's operand
19790    from the OPERANDS array.  */
19791 
19792 static const char *
19793 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19794 {
19795   if (TARGET_USE_FFREEP)
19796 #ifdef HAVE_AS_IX86_FFREEP
19797     return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19798 #else
19799     {
19800       static char retval[32];
19801       int regno = REGNO (operands[opno]);
19802 
19803       gcc_assert (STACK_REGNO_P (regno));
19804 
19805       regno -= FIRST_STACK_REG;
19806 
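      /* Without assembler support for ffreep, emit its two opcode bytes
	 (0xdf, 0xc0 + regno) as raw data; on this little-endian target
	 the 16-bit word 0xc<regno>df lays the bytes out in that order.  */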
19807       snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19808       return retval;
19809     }
19810 #endif
19811 
19812   return opno ? "fstp\t%y1" : "fstp\t%y0";
19813 }
19814 
19815 
19816 /* Output code for INSN to compare OPERANDS.  EFLAGS_P is 1 when fcomi
19817    should be used.  UNORDERED_P is true when fucom should be used.  */
19818 
19819 const char *
19820 output_fp_compare (rtx_insn *insn, rtx *operands,
19821 		   bool eflags_p, bool unordered_p)
19822 {
19823   rtx *xops = eflags_p ? &operands[0] : &operands[1];
19824   bool stack_top_dies;
19825 
19826   static char buf[40];
19827   const char *p;
19828 
19829   gcc_assert (STACK_TOP_P (xops[0]));
19830 
19831   stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19832 
19833   if (eflags_p)
19834     {
19835       p = unordered_p ? "fucomi" : "fcomi";
19836       strcpy (buf, p);
19837 
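      /* Append the popping 'p' suffix (fcomip/fucomip) only when the
	 stack top dies; otherwise skip it.  */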
19838       p = "p\t{%y1, %0|%0, %y1}";
19839       strcat (buf, p + !stack_top_dies);
19840 
19841       return buf;
19842     }
19843 
19844   if (STACK_REG_P (xops[1])
19845       && stack_top_dies
19846       && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19847     {
19848       gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19849 
19850       /* If both the top of the 387 stack and the other operand (also
19851 	 a stack register) die, then this must be a `fcompp' float
19852 	 compare.  */
19853       p = unordered_p ? "fucompp" : "fcompp";
19854       strcpy (buf, p);
19855     }
19856   else if (const0_operand (xops[1], VOIDmode))
19857     {
19858       gcc_assert (!unordered_p);
19859       strcpy (buf, "ftst");
19860     }
19861   else
19862     {
19863       if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19864 	{
19865 	  gcc_assert (!unordered_p);
19866 	  p = "ficom";
19867 	}
19868       else
19869 	p = unordered_p ? "fucom" : "fcom";
19870 
19871       strcpy (buf, p);
19872 
19873       p = "p%Z2\t%y2";
19874       strcat (buf, p + !stack_top_dies);
19875     }
19876 
19877   output_asm_insn (buf, operands);
19878   return "fnstsw\t%0";
19879 }
19880 
19881 void
19882 ix86_output_addr_vec_elt (FILE *file, int value)
19883 {
19884   const char *directive = ASM_LONG;
19885 
19886 #ifdef ASM_QUAD
19887   if (TARGET_LP64)
19888     directive = ASM_QUAD;
19889 #else
19890   gcc_assert (!TARGET_64BIT);
19891 #endif
19892 
19893   fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19894 }
19895 
19896 void
19897 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19898 {
19899   const char *directive = ASM_LONG;
19900 
19901 #ifdef ASM_QUAD
19902   if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19903     directive = ASM_QUAD;
19904 #else
19905   gcc_assert (!TARGET_64BIT);
19906 #endif
19907   /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand.  */
19908   if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19909     fprintf (file, "%s%s%d-%s%d\n",
19910 	     directive, LPREFIX, value, LPREFIX, rel);
19911   else if (HAVE_AS_GOTOFF_IN_DATA)
19912     fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19913 #if TARGET_MACHO
19914   else if (TARGET_MACHO)
19915     {
19916       fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19917       machopic_output_function_base_name (file);
19918       putc ('\n', file);
19919     }
19920 #endif
19921   else
19922     asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19923 		 GOT_SYMBOL_NAME, LPREFIX, value);
19924 }
19925 
19926 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19927    for the target.  */
19928 
19929 void
19930 ix86_expand_clear (rtx dest)
19931 {
19932   rtx tmp;
19933 
19934   /* We play register width games, which are only valid after reload.  */
19935   gcc_assert (reload_completed);
19936 
19937   /* Avoid HImode and its attendant prefix byte.  */
19938   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19939     dest = gen_rtx_REG (SImode, REGNO (dest));
19940   tmp = gen_rtx_SET (dest, const0_rtx);
19941 
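  /* "xor reg, reg" clobbers the flags, so when it is chosen the set must
     be wrapped in a parallel with a FLAGS_REG clobber; "mov $0, reg"
     leaves the flags untouched.  */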
19942   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19943     {
19944       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19945       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19946     }
19947 
19948   emit_insn (tmp);
19949 }
19950 
19951 void
19952 ix86_expand_move (machine_mode mode, rtx operands[])
19953 {
19954   rtx op0, op1;
19955   rtx tmp, addend = NULL_RTX;
19956   enum tls_model model;
19957 
19958   op0 = operands[0];
19959   op1 = operands[1];
19960 
19961   switch (GET_CODE (op1))
19962     {
19963     case CONST:
19964       tmp = XEXP (op1, 0);
19965 
19966       if (GET_CODE (tmp) != PLUS
19967 	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19968 	break;
19969 
19970       op1 = XEXP (tmp, 0);
19971       addend = XEXP (tmp, 1);
19972       /* FALLTHRU */
19973 
19974     case SYMBOL_REF:
19975       model = SYMBOL_REF_TLS_MODEL (op1);
19976 
19977       if (model)
19978 	op1 = legitimize_tls_address (op1, model, true);
19979       else if (ix86_force_load_from_GOT_p (op1))
19980 	{
19981 	  /* Load the external function address via GOT slot to avoid PLT.  */
19982 	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19983 				(TARGET_64BIT
19984 				 ? UNSPEC_GOTPCREL
19985 				 : UNSPEC_GOT));
19986 	  op1 = gen_rtx_CONST (Pmode, op1);
19987 	  op1 = gen_const_mem (Pmode, op1);
19988 	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
19989 	}
19990       else
19991 	{
19992 	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19993 	  if (tmp)
19994 	    {
19995 	      op1 = tmp;
19996 	      if (!addend)
19997 		break;
19998 	    }
19999 	  else
20000 	    {
20001 	      op1 = operands[1];
20002 	      break;
20003 	    }
20004 	}
20005 
20006       if (addend)
20007 	{
20008 	  op1 = force_operand (op1, NULL_RTX);
20009 	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20010 				     op0, 1, OPTAB_DIRECT);
20011 	}
20012       else
20013 	op1 = force_operand (op1, op0);
20014 
20015       if (op1 == op0)
20016 	return;
20017 
20018       op1 = convert_to_mode (mode, op1, 1);
20019 
20020     default:
20021       break;
20022     }
20023 
20024   if ((flag_pic || MACHOPIC_INDIRECT)
20025       && symbolic_operand (op1, mode))
20026     {
20027       if (TARGET_MACHO && !TARGET_64BIT)
20028 	{
20029 #if TARGET_MACHO
20030 	  /* dynamic-no-pic */
20031 	  if (MACHOPIC_INDIRECT)
20032 	    {
20033 	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20034 			 ? op0 : gen_reg_rtx (Pmode);
20035 	      op1 = machopic_indirect_data_reference (op1, temp);
20036 	      if (MACHOPIC_PURE)
20037 		op1 = machopic_legitimize_pic_address (op1, mode,
20038 						       temp == op1 ? 0 : temp);
20039 	    }
20040 	  if (op0 != op1 && GET_CODE (op0) != MEM)
20041 	    {
20042 	      rtx insn = gen_rtx_SET (op0, op1);
20043 	      emit_insn (insn);
20044 	      return;
20045 	    }
20046 	  if (GET_CODE (op0) == MEM)
20047 	    op1 = force_reg (Pmode, op1);
20048 	  else
20049 	    {
20050 	      rtx temp = op0;
20051 	      if (GET_CODE (temp) != REG)
20052 		temp = gen_reg_rtx (Pmode);
20053 	      temp = legitimize_pic_address (op1, temp);
20054 	      if (temp == op0)
20055 		return;
20056 	      op1 = temp;
20057 	    }
20058       /* dynamic-no-pic */
20059 #endif
20060 	}
20061       else
20062 	{
20063 	  if (MEM_P (op0))
20064 	    op1 = force_reg (mode, op1);
20065 	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20066 	    {
20067 	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20068 	      op1 = legitimize_pic_address (op1, reg);
20069 	      if (op0 == op1)
20070 		return;
20071 	      op1 = convert_to_mode (mode, op1, 1);
20072 	    }
20073 	}
20074     }
20075   else
20076     {
20077       if (MEM_P (op0)
20078 	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20079 	      || !push_operand (op0, mode))
20080 	  && MEM_P (op1))
20081 	op1 = force_reg (mode, op1);
20082 
20083       if (push_operand (op0, mode)
20084 	  && ! general_no_elim_operand (op1, mode))
20085 	op1 = copy_to_mode_reg (mode, op1);
20086 
20087       /* Force large constants in 64bit compilation into a register
20088 	 so that they get CSEd.  */
20089       if (can_create_pseudo_p ()
20090 	  && (mode == DImode) && TARGET_64BIT
20091 	  && immediate_operand (op1, mode)
20092 	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
20093 	  && !register_operand (op0, mode)
20094 	  && optimize)
20095 	op1 = copy_to_mode_reg (mode, op1);
20096 
20097       if (can_create_pseudo_p ()
20098 	  && CONST_DOUBLE_P (op1))
20099 	{
20100 	  /* If we are loading a floating point constant to a register,
20101 	     force the value to memory now, since we'll get better code
20102 	     out the back end.  */
20103 
20104 	  op1 = validize_mem (force_const_mem (mode, op1));
20105 	  if (!register_operand (op0, mode))
20106 	    {
20107 	      rtx temp = gen_reg_rtx (mode);
20108 	      emit_insn (gen_rtx_SET (temp, op1));
20109 	      emit_move_insn (op0, temp);
20110 	      return;
20111 	    }
20112 	}
20113     }
20114 
20115   emit_insn (gen_rtx_SET (op0, op1));
20116 }
20117 
20118 void
20119 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20120 {
20121   rtx op0 = operands[0], op1 = operands[1];
20122   /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
20123      psABI, since the biggest alignment there is 4 bytes.  */
20124   unsigned int align = (TARGET_IAMCU
20125 			? GET_MODE_BITSIZE (mode)
20126 			: GET_MODE_ALIGNMENT (mode));
20127 
20128   if (push_operand (op0, VOIDmode))
20129     op0 = emit_move_resolve_push (mode, op0);
20130 
20131   /* Force constants other than zero into memory.  We do not know how
20132      the instructions used to build constants modify the upper 64 bits
20133      of the register; once we have that information we may be able
20134      to handle some of them more efficiently.  */
20135   if (can_create_pseudo_p ()
20136       && (CONSTANT_P (op1)
20137 	  || (SUBREG_P (op1)
20138 	      && CONSTANT_P (SUBREG_REG (op1))))
20139       && ((register_operand (op0, mode)
20140 	   && !standard_sse_constant_p (op1, mode))
20141 	  /* ix86_expand_vector_move_misalign() does not like constants.  */
20142 	  || (SSE_REG_MODE_P (mode)
20143 	      && MEM_P (op0)
20144 	      && MEM_ALIGN (op0) < align)))
20145     {
20146       if (SUBREG_P (op1))
20147 	{
20148 	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
20149 	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
20150 	  if (r)
20151 	    r = validize_mem (r);
20152 	  else
20153 	    r = force_reg (imode, SUBREG_REG (op1));
20154 	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20155 	}
20156       else
20157 	op1 = validize_mem (force_const_mem (mode, op1));
20158     }
20159 
20160   /* We need to check memory alignment for SSE mode since attribute
20161      can make operands unaligned.  */
20162   if (can_create_pseudo_p ()
20163       && SSE_REG_MODE_P (mode)
20164       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20165 	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20166     {
20167       rtx tmp[2];
20168 
20169       /* ix86_expand_vector_move_misalign() does not like both
20170 	 arguments in memory.  */
20171       if (!register_operand (op0, mode)
20172 	  && !register_operand (op1, mode))
20173 	op1 = force_reg (mode, op1);
20174 
20175       tmp[0] = op0; tmp[1] = op1;
20176       ix86_expand_vector_move_misalign (mode, tmp);
20177       return;
20178     }
20179 
20180   /* Make operand1 a register if neither operand is already one.  */
20181   if (can_create_pseudo_p ()
20182       && !register_operand (op0, mode)
20183       && !register_operand (op1, mode))
20184     {
20185       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20186       return;
20187     }
20188 
20189   emit_insn (gen_rtx_SET (op0, op1));
20190 }
20191 
20192 /* Split 32-byte AVX unaligned load and store if needed.  */
20193 
20194 static void
20195 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20196 {
20197   rtx m;
20198   rtx (*extract) (rtx, rtx, rtx);
20199   machine_mode mode;
20200 
20201   if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20202       || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20203     {
20204       emit_insn (gen_rtx_SET (op0, op1));
20205       return;
20206     }
20207 
20208   rtx orig_op0 = NULL_RTX;
20209   mode = GET_MODE (op0);
20210   switch (GET_MODE_CLASS (mode))
20211     {
20212     case MODE_VECTOR_INT:
20213     case MODE_INT:
20214       if (mode != V32QImode)
20215 	{
20216 	  if (!MEM_P (op0))
20217 	    {
20218 	      orig_op0 = op0;
20219 	      op0 = gen_reg_rtx (V32QImode);
20220 	    }
20221 	  else
20222 	    op0 = gen_lowpart (V32QImode, op0);
20223 	  op1 = gen_lowpart (V32QImode, op1);
20224 	  mode = V32QImode;
20225 	}
20226       break;
20227     case MODE_VECTOR_FLOAT:
20228       break;
20229     default:
20230       gcc_unreachable ();
20231     }
20232 
20233   switch (mode)
20234     {
20235     default:
20236       gcc_unreachable ();
20237     case E_V32QImode:
20238       extract = gen_avx_vextractf128v32qi;
20239       mode = V16QImode;
20240       break;
20241     case E_V8SFmode:
20242       extract = gen_avx_vextractf128v8sf;
20243       mode = V4SFmode;
20244       break;
20245     case E_V4DFmode:
20246       extract = gen_avx_vextractf128v4df;
20247       mode = V2DFmode;
20248       break;
20249     }
20250 
20251   if (MEM_P (op1))
20252     {
20253       rtx r = gen_reg_rtx (mode);
20254       m = adjust_address (op1, mode, 0);
20255       emit_move_insn (r, m);
20256       m = adjust_address (op1, mode, 16);
20257       r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20258       emit_move_insn (op0, r);
20259     }
20260   else if (MEM_P (op0))
20261     {
20262       m = adjust_address (op0, mode, 0);
20263       emit_insn (extract (m, op1, const0_rtx));
20264       m = adjust_address (op0, mode, 16);
20265       emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20266     }
20267   else
20268     gcc_unreachable ();
20269 
20270   if (orig_op0)
20271     emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20272 }
20273 
20274 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
20275    straight to ix86_expand_vector_move.  */
20276 /* Code generation for scalar reg-reg moves of single and double precision data:
20277      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20278        movaps reg, reg
20279      else
20280        movss reg, reg
20281      if (x86_sse_partial_reg_dependency == true)
20282        movapd reg, reg
20283      else
20284        movsd reg, reg
20285 
20286    Code generation for scalar loads of double precision data:
20287      if (x86_sse_split_regs == true)
20288        movlpd mem, reg      (gas syntax)
20289      else
20290        movsd mem, reg
20291 
20292    Code generation for unaligned packed loads of single precision data
20293    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20294      if (x86_sse_unaligned_move_optimal)
20295        movups mem, reg
20296 
20297      if (x86_sse_partial_reg_dependency == true)
20298        {
20299          xorps  reg, reg
20300          movlps mem, reg
20301          movhps mem+8, reg
20302        }
20303      else
20304        {
20305          movlps mem, reg
20306          movhps mem+8, reg
20307        }
20308 
20309    Code generation for unaligned packed loads of double precision data
20310    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20311      if (x86_sse_unaligned_move_optimal)
20312        movupd mem, reg
20313 
20314      if (x86_sse_split_regs == true)
20315        {
20316          movlpd mem, reg
20317          movhpd mem+8, reg
20318        }
20319      else
20320        {
20321          movsd  mem, reg
20322          movhpd mem+8, reg
20323        }
20324  */
20325 
20326 void
20327 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20328 {
20329   rtx op0, op1, m;
20330 
20331   op0 = operands[0];
20332   op1 = operands[1];
20333 
20334   /* Use unaligned load/store for AVX512 or when optimizing for size.  */
20335   if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20336     {
20337       emit_insn (gen_rtx_SET (op0, op1));
20338       return;
20339     }
20340 
20341   if (TARGET_AVX)
20342     {
20343       if (GET_MODE_SIZE (mode) == 32)
20344 	ix86_avx256_split_vector_move_misalign (op0, op1);
20345       else
20346 	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
20347 	emit_insn (gen_rtx_SET (op0, op1));
20348       return;
20349     }
20350 
20351   if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20352       || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20353     {
20354       emit_insn (gen_rtx_SET (op0, op1));
20355       return;
20356     }
20357 
20358   /* ??? If we have typed data, then it would appear that using
20359      movdqu is the only way to get unaligned data loaded with
20360      integer type.  */
20361   if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20362     {
20363       emit_insn (gen_rtx_SET (op0, op1));
20364       return;
20365     }
20366 
20367   if (MEM_P (op1))
20368     {
20369       if (TARGET_SSE2 && mode == V2DFmode)
20370         {
20371           rtx zero;
20372 
20373 	  /* When SSE registers are split into halves, we can avoid
20374 	     writing to the top half twice.  */
20375 	  if (TARGET_SSE_SPLIT_REGS)
20376 	    {
20377 	      emit_clobber (op0);
20378 	      zero = op0;
20379 	    }
20380 	  else
20381 	    {
20382 	      /* ??? Not sure about the best option for the Intel chips.
20383 		 The following would seem to satisfy; the register is
20384 		 entirely cleared, breaking the dependency chain.  We
20385 		 then store to the upper half, with a dependency depth
20386 		 of one.  A rumor has it that Intel recommends two movsd
20387 		 followed by an unpacklpd, but this is unconfirmed.  And
20388 		 given that the dependency depth of the unpacklpd would
20389 		 still be one, I'm not sure why this would be better.  */
20390 	      zero = CONST0_RTX (V2DFmode);
20391 	    }
20392 
20393 	  m = adjust_address (op1, DFmode, 0);
20394 	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
20395 	  m = adjust_address (op1, DFmode, 8);
20396 	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
20397 	}
20398       else
20399         {
20400 	  rtx t;
20401 
20402 	  if (mode != V4SFmode)
20403 	    t = gen_reg_rtx (V4SFmode);
20404 	  else
20405 	    t = op0;
20406 
20407 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20408 	    emit_move_insn (t, CONST0_RTX (V4SFmode));
20409 	  else
20410 	    emit_clobber (t);
20411 
20412 	  m = adjust_address (op1, V2SFmode, 0);
20413 	  emit_insn (gen_sse_loadlps (t, t, m));
20414 	  m = adjust_address (op1, V2SFmode, 8);
20415 	  emit_insn (gen_sse_loadhps (t, t, m));
20416 	  if (mode != V4SFmode)
20417 	    emit_move_insn (op0, gen_lowpart (mode, t));
20418 	}
20419     }
20420   else if (MEM_P (op0))
20421     {
20422       if (TARGET_SSE2 && mode == V2DFmode)
20423 	{
20424 	  m = adjust_address (op0, DFmode, 0);
20425 	  emit_insn (gen_sse2_storelpd (m, op1));
20426 	  m = adjust_address (op0, DFmode, 8);
20427 	  emit_insn (gen_sse2_storehpd (m, op1));
20428 	}
20429       else
20430 	{
20431 	  if (mode != V4SFmode)
20432 	    op1 = gen_lowpart (V4SFmode, op1);
20433 
20434 	  m = adjust_address (op0, V2SFmode, 0);
20435 	  emit_insn (gen_sse_storelps (m, op1));
20436 	  m = adjust_address (op0, V2SFmode, 8);
20437 	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20438 	}
20439     }
20440   else
20441     gcc_unreachable ();
20442 }
20443 
20444 /* Helper function of ix86_fixup_binary_operands to canonicalize
20445    operand order.  Returns true if the operands should be swapped.  */
20446 
20447 static bool
20448 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20449 			     rtx operands[])
20450 {
20451   rtx dst = operands[0];
20452   rtx src1 = operands[1];
20453   rtx src2 = operands[2];
20454 
20455   /* If the operation is not commutative, we can't do anything.  */
20456   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
20457       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
20458     return false;
20459 
20460   /* Highest priority is that src1 should match dst.  */
20461   if (rtx_equal_p (dst, src1))
20462     return false;
20463   if (rtx_equal_p (dst, src2))
20464     return true;
20465 
20466   /* Next highest priority is that immediate constants come second.  */
20467   if (immediate_operand (src2, mode))
20468     return false;
20469   if (immediate_operand (src1, mode))
20470     return true;
20471 
20472   /* Lowest priority is that memory references should come second.  */
20473   if (MEM_P (src2))
20474     return false;
20475   if (MEM_P (src1))
20476     return true;
20477 
20478   return false;
20479 }
20480 
20481 
20482 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
20483    destination to use for the operation.  If different from the true
20484    destination in operands[0], a copy operation will be required.  */
20485 
20486 rtx
20487 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20488 			    rtx operands[])
20489 {
20490   rtx dst = operands[0];
20491   rtx src1 = operands[1];
20492   rtx src2 = operands[2];
20493 
20494   /* Canonicalize operand order.  */
20495   if (ix86_swap_binary_operands_p (code, mode, operands))
20496     {
20497       /* It is invalid to swap operands of different modes.  */
20498       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20499 
20500       std::swap (src1, src2);
20501     }
20502 
20503   /* Both source operands cannot be in memory.  */
20504   if (MEM_P (src1) && MEM_P (src2))
20505     {
20506       /* Optimization: Only read from memory once.  */
20507       if (rtx_equal_p (src1, src2))
20508 	{
20509 	  src2 = force_reg (mode, src2);
20510 	  src1 = src2;
20511 	}
20512       else if (rtx_equal_p (dst, src1))
20513 	src2 = force_reg (mode, src2);
20514       else
20515 	src1 = force_reg (mode, src1);
20516     }
20517 
20518   /* If the destination is memory, and we do not have matching source
20519      operands, do things in registers.  */
20520   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20521     dst = gen_reg_rtx (mode);
20522 
20523   /* Source 1 cannot be a constant.  */
20524   if (CONSTANT_P (src1))
20525     src1 = force_reg (mode, src1);
20526 
20527   /* Source 1 cannot be a non-matching memory.  */
20528   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20529     src1 = force_reg (mode, src1);
20530 
20531   /* Improve address combine.  */
20532   if (code == PLUS
20533       && GET_MODE_CLASS (mode) == MODE_INT
20534       && MEM_P (src2))
20535     src2 = force_reg (mode, src2);
20536 
20537   operands[1] = src1;
20538   operands[2] = src2;
20539   return dst;
20540 }
20541 
20542 /* Similarly, but assume that the destination has already been
20543    set up properly.  */
20544 
20545 void
20546 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20547 				    machine_mode mode, rtx operands[])
20548 {
20549   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20550   gcc_assert (dst == operands[0]);
20551 }
20552 
20553 /* Attempt to expand a binary operator.  Make the expansion closer to the
20554    actual machine, then just general_operand, which will allow 3 separate
20555    memory references (one output, two input) in a single insn.  */
20556 
20557 void
20558 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20559 			     rtx operands[])
20560 {
20561   rtx src1, src2, dst, op, clob;
20562 
20563   dst = ix86_fixup_binary_operands (code, mode, operands);
20564   src1 = operands[1];
20565   src2 = operands[2];
20566 
20567   /* Emit the instruction.  */
20568 
20569   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20570 
20571   if (reload_completed
20572       && code == PLUS
20573       && !rtx_equal_p (dst, src1))
20574     {
20575       /* This is going to be an LEA; avoid splitting it later.  */
20576       emit_insn (op);
20577     }
20578   else
20579     {
20580       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20581       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20582     }
20583 
20584   /* Fix up the destination if needed.  */
20585   if (dst != operands[0])
20586     emit_move_insn (operands[0], dst);
20587 }
20588 
20589 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20590    the given OPERANDS.  */
20591 
20592 void
20593 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20594 				     rtx operands[])
20595 {
20596   rtx op1 = NULL_RTX, op2 = NULL_RTX;
20597   if (SUBREG_P (operands[1]))
20598     {
20599       op1 = operands[1];
20600       op2 = operands[2];
20601     }
20602   else if (SUBREG_P (operands[2]))
20603     {
20604       op1 = operands[2];
20605       op2 = operands[1];
20606     }
20607   /* Optimize (__m128i) d | (__m128i) e and similar code, where d and e
20608      are float vectors, into a float vector logical insn.  In C/C++,
20609      without using intrinsics, there is no way to express a vector
20610      logical operation on float vectors other than to cast them
20611      temporarily to integer vectors.  */
20612   if (op1
20613       && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20614       && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20615       && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20616       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20617       && SUBREG_BYTE (op1) == 0
20618       && (GET_CODE (op2) == CONST_VECTOR
20619 	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20620 	      && SUBREG_BYTE (op2) == 0))
20621       && can_create_pseudo_p ())
20622     {
20623       rtx dst;
20624       switch (GET_MODE (SUBREG_REG (op1)))
20625 	{
20626 	case E_V4SFmode:
20627 	case E_V8SFmode:
20628 	case E_V16SFmode:
20629 	case E_V2DFmode:
20630 	case E_V4DFmode:
20631 	case E_V8DFmode:
20632 	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20633 	  if (GET_CODE (op2) == CONST_VECTOR)
20634 	    {
20635 	      op2 = gen_lowpart (GET_MODE (dst), op2);
20636 	      op2 = force_reg (GET_MODE (dst), op2);
20637 	    }
20638 	  else
20639 	    {
20640 	      op1 = operands[1];
20641 	      op2 = SUBREG_REG (operands[2]);
20642 	      if (!vector_operand (op2, GET_MODE (dst)))
20643 		op2 = force_reg (GET_MODE (dst), op2);
20644 	    }
20645 	  op1 = SUBREG_REG (op1);
20646 	  if (!vector_operand (op1, GET_MODE (dst)))
20647 	    op1 = force_reg (GET_MODE (dst), op1);
20648 	  emit_insn (gen_rtx_SET (dst,
20649 				  gen_rtx_fmt_ee (code, GET_MODE (dst),
20650 						  op1, op2)));
20651 	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
20652 	  return;
20653 	default:
20654 	  break;
20655 	}
20656     }
20657   if (!vector_operand (operands[1], mode))
20658     operands[1] = force_reg (mode, operands[1]);
20659   if (!vector_operand (operands[2], mode))
20660     operands[2] = force_reg (mode, operands[2]);
20661   ix86_fixup_binary_operands_no_copy (code, mode, operands);
20662   emit_insn (gen_rtx_SET (operands[0],
20663 			  gen_rtx_fmt_ee (code, mode, operands[1],
20664 					  operands[2])));
20665 }
20666 
20667 /* Return TRUE or FALSE depending on whether the binary operator meets the
20668    appropriate constraints.  */
20669 
20670 bool
20671 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20672 			 rtx operands[3])
20673 {
20674   rtx dst = operands[0];
20675   rtx src1 = operands[1];
20676   rtx src2 = operands[2];
20677 
20678   /* Both source operands cannot be in memory.  */
20679   if (MEM_P (src1) && MEM_P (src2))
20680     return false;
20681 
20682   /* Canonicalize operand order for commutative operators.  */
20683   if (ix86_swap_binary_operands_p (code, mode, operands))
20684     std::swap (src1, src2);
20685 
20686   /* If the destination is memory, we must have a matching source operand.  */
20687   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20688     return false;
20689 
20690   /* Source 1 cannot be a constant.  */
20691   if (CONSTANT_P (src1))
20692     return false;
20693 
20694   /* Source 1 cannot be a non-matching memory.  */
20695   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20696     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
20697     return (code == AND
20698 	    && (mode == HImode
20699 		|| mode == SImode
20700 		|| (TARGET_64BIT && mode == DImode))
20701 	    && satisfies_constraint_L (src2));
20702 
20703   return true;
20704 }
20705 
20706 /* Attempt to expand a unary operator.  Make the expansion closer to the
20707    actual machine, then just general_operand, which will allow 2 separate
20708    memory references (one output, one input) in a single insn.  */
20709 
20710 void
20711 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20712 			    rtx operands[])
20713 {
20714   bool matching_memory = false;
20715   rtx src, dst, op, clob;
20716 
20717   dst = operands[0];
20718   src = operands[1];
20719 
20720   /* If the destination is memory, and we do not have matching source
20721      operands, do things in registers.  */
20722   if (MEM_P (dst))
20723     {
20724       if (rtx_equal_p (dst, src))
20725 	matching_memory = true;
20726       else
20727 	dst = gen_reg_rtx (mode);
20728     }
20729 
20730   /* When source operand is memory, destination must match.  */
20731   if (MEM_P (src) && !matching_memory)
20732     src = force_reg (mode, src);
20733 
20734   /* Emit the instruction.  */
20735 
20736   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20737 
20738   if (code == NOT)
20739     emit_insn (op);
20740   else
20741     {
20742       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20743       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20744     }
20745 
20746   /* Fix up the destination if needed.  */
20747   if (dst != operands[0])
20748     emit_move_insn (operands[0], dst);
20749 }
20750 
20751 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20752    divisor are within the range [0-255].  */
20753 
20754 void
20755 ix86_split_idivmod (machine_mode mode, rtx operands[],
20756 		    bool signed_p)
20757 {
20758   rtx_code_label *end_label, *qimode_label;
20759   rtx div, mod;
20760   rtx_insn *insn;
20761   rtx scratch, tmp0, tmp1, tmp2;
20762   rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20763   rtx (*gen_zero_extend) (rtx, rtx);
20764   rtx (*gen_test_ccno_1) (rtx, rtx);
20765 
20766   switch (mode)
20767     {
20768     case E_SImode:
20769       if (GET_MODE (operands[0]) == SImode)
20770 	{
20771 	  if (GET_MODE (operands[1]) == SImode)
20772 	    gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20773 	  else
20774 	    gen_divmod4_1
20775 	      = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20776 	  gen_zero_extend = gen_zero_extendqisi2;
20777 	}
20778       else
20779 	{
20780 	  gen_divmod4_1
20781 	    = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20782 	  gen_zero_extend = gen_zero_extendqidi2;
20783 	}
20784       gen_test_ccno_1 = gen_testsi_ccno_1;
20785       break;
20786     case E_DImode:
20787       gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20788       gen_test_ccno_1 = gen_testdi_ccno_1;
20789       gen_zero_extend = gen_zero_extendqidi2;
20790       break;
20791     default:
20792       gcc_unreachable ();
20793     }
20794 
20795   end_label = gen_label_rtx ();
20796   qimode_label = gen_label_rtx ();
20797 
20798   scratch = gen_reg_rtx (mode);
20799 
20800   /* Use 8bit unsigned divmod if dividend and divisor are within
20801      the range [0-255].  */
20802   emit_move_insn (scratch, operands[2]);
20803   scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20804 				 scratch, 1, OPTAB_DIRECT);
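  /* scratch now holds dividend | divisor; testing it against -0x100
     (all bits above bit 7) sets ZF exactly when both values fit in
     [0, 255], so the branch below selects the 8bit path.  */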
20805   emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20806   tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20807   tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20808   tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20809 			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20810 			       pc_rtx);
20811   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20812   predict_jump (REG_BR_PROB_BASE * 50 / 100);
20813   JUMP_LABEL (insn) = qimode_label;
20814 
20815   /* Generate original signed/unsigned divmod.  */
20816   div = gen_divmod4_1 (operands[0], operands[1],
20817 		       operands[2], operands[3]);
20818   emit_insn (div);
20819 
20820   /* Branch to the end.  */
20821   emit_jump_insn (gen_jump (end_label));
20822   emit_barrier ();
20823 
20824   /* Generate 8bit unsigned divide.  */
20825   emit_label (qimode_label);
20826   /* Don't use operands[0] for result of 8bit divide since not all
20827      registers support QImode ZERO_EXTRACT.  */
20828   tmp0 = lowpart_subreg (HImode, scratch, mode);
20829   tmp1 = lowpart_subreg (HImode, operands[2], mode);
20830   tmp2 = lowpart_subreg (QImode, operands[3], mode);
20831   emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20832 
20833   if (signed_p)
20834     {
20835       div = gen_rtx_DIV (mode, operands[2], operands[3]);
20836       mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20837     }
20838   else
20839     {
20840       div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20841       mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20842     }
20843   if (mode == SImode)
20844     {
20845       if (GET_MODE (operands[0]) != SImode)
20846 	div = gen_rtx_ZERO_EXTEND (DImode, div);
20847       if (GET_MODE (operands[1]) != SImode)
20848 	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20849     }
20850 
20851   /* Extract remainder from AH.  */
20852   tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20853 			       tmp0, GEN_INT (8), GEN_INT (8));
20854   if (REG_P (operands[1]))
20855     insn = emit_move_insn (operands[1], tmp1);
20856   else
20857     {
20858       /* Need a new scratch register since the old one has result
20859 	 of 8bit divide.  */
20860       scratch = gen_reg_rtx (GET_MODE (operands[1]));
20861       emit_move_insn (scratch, tmp1);
20862       insn = emit_move_insn (operands[1], scratch);
20863     }
20864   set_unique_reg_note (insn, REG_EQUAL, mod);
20865 
20866   /* Zero extend quotient from AL.  */
20867   tmp1 = gen_lowpart (QImode, tmp0);
20868   insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20869   set_unique_reg_note (insn, REG_EQUAL, div);
20870 
20871   emit_label (end_label);
20872 }
20873 
20874 #define LEA_MAX_STALL (3)
20875 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
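/* Note: the *_in_bb routines below accumulate distances in half-cycles
   and compare them against LEA_SEARCH_THRESHOLD, while their callers halve
   the result back to full cycles before comparing with LEA_MAX_STALL.  */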
20876 
20877 /* Increase given DISTANCE in half-cycles according to
20878    dependencies between PREV and NEXT instructions.
20879    Add 1 half-cycle if there is no dependency and
20880    go to the next cycle if there is some dependency.  */
20881 
20882 static unsigned int
20883 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20884 {
20885   df_ref def, use;
20886 
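  /* "distance + (distance & 1) + 2" rounds DISTANCE up to the next cycle
     boundary and then advances it by one full cycle (two half-cycles).  */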
20887   if (!prev || !next)
20888     return distance + (distance & 1) + 2;
20889 
20890   if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20891     return distance + 1;
20892 
20893   FOR_EACH_INSN_USE (use, next)
20894     FOR_EACH_INSN_DEF (def, prev)
20895       if (!DF_REF_IS_ARTIFICIAL (def)
20896 	  && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20897 	return distance + (distance & 1) + 2;
20898 
20899   return distance + 1;
20900 }
20901 
20902 /* Function checks if instruction INSN defines register number
20903    REGNO1 or REGNO2.  */
20904 
20905 static bool
20906 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20907 		  rtx_insn *insn)
20908 {
20909   df_ref def;
20910 
20911   FOR_EACH_INSN_DEF (def, insn)
20912     if (DF_REF_REG_DEF_P (def)
20913 	&& !DF_REF_IS_ARTIFICIAL (def)
20914 	&& (regno1 == DF_REF_REGNO (def)
20915 	    || regno2 == DF_REF_REGNO (def)))
20916       return true;
20917 
20918   return false;
20919 }
20920 
20921 /* Function checks if instruction INSN uses register number
20922    REGNO as a part of address expression.  */
20923 
20924 static bool
20925 insn_uses_reg_mem (unsigned int regno, rtx insn)
20926 {
20927   df_ref use;
20928 
20929   FOR_EACH_INSN_USE (use, insn)
20930     if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20931       return true;
20932 
20933   return false;
20934 }
20935 
20936 /* Search backward for non-agu definition of register number REGNO1
20937    or register number REGNO2 in basic block starting from instruction
20938    START up to head of basic block or instruction INSN.
20939 
20940    Function puts true value into *FOUND var if definition was found
20941    and false otherwise.
20942 
20943    Distance in half-cycles between START and found instruction or head
20944    of BB is added to DISTANCE and returned.  */
20945 
20946 static int
20947 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20948 			       rtx_insn *insn, int distance,
20949 			       rtx_insn *start, bool *found)
20950 {
20951   basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20952   rtx_insn *prev = start;
20953   rtx_insn *next = NULL;
20954 
20955   *found = false;
20956 
20957   while (prev
20958 	 && prev != insn
20959 	 && distance < LEA_SEARCH_THRESHOLD)
20960     {
20961       if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20962 	{
20963 	  distance = increase_distance (prev, next, distance);
20964 	  if (insn_defines_reg (regno1, regno2, prev))
20965 	    {
20966 	      if (recog_memoized (prev) < 0
20967 		  || get_attr_type (prev) != TYPE_LEA)
20968 		{
20969 		  *found = true;
20970 		  return distance;
20971 		}
20972 	    }
20973 
20974 	  next = prev;
20975 	}
20976       if (prev == BB_HEAD (bb))
20977 	break;
20978 
20979       prev = PREV_INSN (prev);
20980     }
20981 
20982   return distance;
20983 }
20984 
20985 /* Search backward for non-agu definition of register number REGNO1
20986    or register number REGNO2 in INSN's basic block until
20987    1. Pass LEA_SEARCH_THRESHOLD instructions, or
20988    2. Reach neighbor BBs boundary, or
20989    3. Reach agu definition.
20990    Returns the distance between the non-agu definition point and INSN.
20991    If no definition point, returns -1.  */
20992 
20993 static int
20994 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20995 			 rtx_insn *insn)
20996 {
20997   basic_block bb = BLOCK_FOR_INSN (insn);
20998   int distance = 0;
20999   bool found = false;
21000 
21001   if (insn != BB_HEAD (bb))
21002     distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21003 					      distance, PREV_INSN (insn),
21004 					      &found);
21005 
21006   if (!found && distance < LEA_SEARCH_THRESHOLD)
21007     {
21008       edge e;
21009       edge_iterator ei;
21010       bool simple_loop = false;
21011 
21012       FOR_EACH_EDGE (e, ei, bb->preds)
21013 	if (e->src == bb)
21014 	  {
21015 	    simple_loop = true;
21016 	    break;
21017 	  }
21018 
21019       if (simple_loop)
21020 	distance = distance_non_agu_define_in_bb (regno1, regno2,
21021 						  insn, distance,
21022 						  BB_END (bb), &found);
21023       else
21024 	{
21025 	  int shortest_dist = -1;
21026 	  bool found_in_bb = false;
21027 
21028 	  FOR_EACH_EDGE (e, ei, bb->preds)
21029 	    {
21030 	      int bb_dist
21031 		= distance_non_agu_define_in_bb (regno1, regno2,
21032 						 insn, distance,
21033 						 BB_END (e->src),
21034 						 &found_in_bb);
21035 	      if (found_in_bb)
21036 		{
21037 		  if (shortest_dist < 0)
21038 		    shortest_dist = bb_dist;
21039 		  else if (bb_dist > 0)
21040 		    shortest_dist = MIN (bb_dist, shortest_dist);
21041 
21042 		  found = true;
21043 		}
21044 	    }
21045 
21046 	  distance = shortest_dist;
21047 	}
21048     }
21049 
21050   /* get_attr_type may modify recog data.  We want to make sure
21051      that recog data is valid for instruction INSN, on which
21052      distance_non_agu_define is called.  INSN is unchanged here.  */
21053   extract_insn_cached (insn);
21054 
21055   if (!found)
21056     return -1;
21057 
21058   return distance >> 1;
21059 }
21060 
21061 /* Return the distance in half-cycles between INSN and the next
21062    insn that uses register number REGNO in a memory address, added
21063    to DISTANCE.  Return -1 if REGNO is set.
21064 
21065    Put true value into *FOUND if register usage was found and
21066    false otherwise.
21067    Put true value into *REDEFINED if register redefinition was
21068    found and false otherwise.  */
21069 
21070 static int
21071 distance_agu_use_in_bb (unsigned int regno,
21072 			rtx_insn *insn, int distance, rtx_insn *start,
21073 			bool *found, bool *redefined)
21074 {
21075   basic_block bb = NULL;
21076   rtx_insn *next = start;
21077   rtx_insn *prev = NULL;
21078 
21079   *found = false;
21080   *redefined = false;
21081 
21082   if (start != NULL_RTX)
21083     {
21084       bb = BLOCK_FOR_INSN (start);
21085       if (start != BB_HEAD (bb))
21086 	/* If insn and start belong to the same bb, set prev to insn,
21087 	   so the call to increase_distance will increase the distance
21088 	   between insns by 1.  */
21089 	prev = insn;
21090     }
21091 
21092   while (next
21093 	 && next != insn
21094 	 && distance < LEA_SEARCH_THRESHOLD)
21095     {
21096       if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21097 	{
21098 	  distance = increase_distance (prev, next, distance);
21099 	  if (insn_uses_reg_mem (regno, next))
21100 	    {
21101 	      /* Return DISTANCE if OP0 is used in memory
21102 		 address in NEXT.  */
21103 	      *found = true;
21104 	      return distance;
21105 	    }
21106 
21107 	  if (insn_defines_reg (regno, INVALID_REGNUM, next))
21108 	    {
21109 	      /* Return -1 if OP0 is set in NEXT.  */
21110 	      *redefined = true;
21111 	      return -1;
21112 	    }
21113 
21114 	  prev = next;
21115 	}
21116 
21117       if (next == BB_END (bb))
21118 	break;
21119 
21120       next = NEXT_INSN (next);
21121     }
21122 
21123   return distance;
21124 }
21125 
21126 /* Return the distance between INSN and the next insn that uses
21127    register number REGNO0 in a memory address.  Return -1 if no such
21128    use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
21129 
21130 static int
21131 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21132 {
21133   basic_block bb = BLOCK_FOR_INSN (insn);
21134   int distance = 0;
21135   bool found = false;
21136   bool redefined = false;
21137 
21138   if (insn != BB_END (bb))
21139     distance = distance_agu_use_in_bb (regno0, insn, distance,
21140 				       NEXT_INSN (insn),
21141 				       &found, &redefined);
21142 
21143   if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21144     {
21145       edge e;
21146       edge_iterator ei;
21147       bool simple_loop = false;
21148 
21149       FOR_EACH_EDGE (e, ei, bb->succs)
21150         if (e->dest == bb)
21151 	  {
21152 	    simple_loop = true;
21153 	    break;
21154 	  }
21155 
21156       if (simple_loop)
21157 	distance = distance_agu_use_in_bb (regno0, insn,
21158 					   distance, BB_HEAD (bb),
21159 					   &found, &redefined);
21160       else
21161 	{
21162 	  int shortest_dist = -1;
21163 	  bool found_in_bb = false;
21164 	  bool redefined_in_bb = false;
21165 
21166 	  FOR_EACH_EDGE (e, ei, bb->succs)
21167 	    {
21168 	      int bb_dist
21169 		= distance_agu_use_in_bb (regno0, insn,
21170 					  distance, BB_HEAD (e->dest),
21171 					  &found_in_bb, &redefined_in_bb);
21172 	      if (found_in_bb)
21173 		{
21174 		  if (shortest_dist < 0)
21175 		    shortest_dist = bb_dist;
21176 		  else if (bb_dist > 0)
21177 		    shortest_dist = MIN (bb_dist, shortest_dist);
21178 
21179 		  found = true;
21180 		}
21181 	    }
21182 
21183 	  distance = shortest_dist;
21184 	}
21185     }
21186 
21187   if (!found || redefined)
21188     return -1;
21189 
21190   return distance >> 1;
21191 }
21192 
21193 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21194    there is a choice between LEA and ADD:
21195    Negative value: ADD is preferred over LEA
21196    Zero: Neutral
21197    Positive value: LEA is preferred over ADD.  */
21198 #define IX86_LEA_PRIORITY 0
21199 
21200 /* Return true if using the lea INSN has a performance advantage
21201    over a sequence of instructions.  The instruction sequence has
21202    SPLIT_COST cycles higher latency than the lea latency.  */
21203 
21204 static bool
21205 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21206 		      unsigned int regno2, int split_cost, bool has_scale)
21207 {
21208   int dist_define, dist_use;
21209 
21210   /* For Silvermont, if a 2-source or 3-source LEA is used for a
21211      non-destructive destination, or to gain the ability to use a
21212      scale, the use of LEA is justified.  */
21213   if (TARGET_SILVERMONT || TARGET_INTEL)
21214     {
21215       if (has_scale)
21216 	return true;
21217       if (split_cost < 1)
21218 	return false;
21219       if (regno0 == regno1 || regno0 == regno2)
21220 	return false;
21221       return true;
21222     }
21223 
21224   dist_define = distance_non_agu_define (regno1, regno2, insn);
21225   dist_use = distance_agu_use (regno0, insn);
21226 
21227   if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21228     {
21229       /* If there is no non-AGU operand definition, no AGU
21230 	 operand use and the split cost is 0, then both the lea
21231 	 and non-lea variants have the same priority.  Currently
21232 	 we prefer lea for 64-bit code and non-lea for 32-bit
21233 	 code.  */
21234       if (dist_use < 0 && split_cost == 0)
21235 	return TARGET_64BIT || IX86_LEA_PRIORITY;
21236       else
21237 	return true;
21238     }
21239 
21240   /* The longer the definition distance, the more preferable lea is.
21241      Adjust it here to take the splitting cost and lea priority into
21242      account.  */
21243   dist_define += split_cost + IX86_LEA_PRIORITY;
21244 
21245   /* If there is no use in a memory address then we just check
21246      that the split cost exceeds the AGU stall.  */
21247   if (dist_use < 0)
21248     return dist_define > LEA_MAX_STALL;
21249 
21250   /* If this insn has both a backward non-AGU dependence and a forward
21251      AGU dependence, the one with the shorter distance takes effect.  */
21252   return dist_define >= dist_use;
21253 }
21254 
21255 /* Return true if it is legal to clobber flags by INSN and
21256    false otherwise.  */
21257 
21258 static bool
21259 ix86_ok_to_clobber_flags (rtx_insn *insn)
21260 {
21261   basic_block bb = BLOCK_FOR_INSN (insn);
21262   df_ref use;
21263   bitmap live;
21264 
21265   while (insn)
21266     {
21267       if (NONDEBUG_INSN_P (insn))
21268 	{
21269 	  FOR_EACH_INSN_USE (use, insn)
21270 	    if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21271 	      return false;
21272 
21273 	  if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21274 	    return true;
21275 	}
21276 
21277       if (insn == BB_END (bb))
21278 	break;
21279 
21280       insn = NEXT_INSN (insn);
21281     }
21282 
21283   live = df_get_live_out(bb);
21284   return !REGNO_REG_SET_P (live, FLAGS_REG);
21285 }
21286 
21287 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21288    move and add to avoid AGU stalls.  */
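/* As an illustration (a sketch, not an exhaustive description of the
   cases handled below): an add matching r0 = r1 + r2 with r0 distinct
   from both sources would otherwise be emitted as "lea (%r1,%r2), %r0";
   splitting it yields
       mov %r1, %r0
       add %r2, %r0
   avoiding the lea, and thus the AGU stall, at the cost of one extra
   move.  */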
21289 
21290 bool
21291 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21292 {
21293   unsigned int regno0, regno1, regno2;
21294 
21295   /* Check if we need to optimize.  */
21296   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21297     return false;
21298 
21299   /* Check it is correct to split here.  */
21300   if (!ix86_ok_to_clobber_flags(insn))
21301     return false;
21302 
21303   regno0 = true_regnum (operands[0]);
21304   regno1 = true_regnum (operands[1]);
21305   regno2 = true_regnum (operands[2]);
21306 
21307   /* We need to split only adds with a non-destructive
21308      destination operand.  */
21309   if (regno0 == regno1 || regno0 == regno2)
21310     return false;
21311   else
21312     return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21313 }
21314 
21315 /* Return true if we should emit an lea instruction instead of a mov
21316    instruction.  */
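/* Illustrative only: on AGU-sensitive targets a copy such as
   "mov %rsi, %rdi" may instead be emitted as "lea (%rsi), %rdi" when
   the copied value is about to be used in a memory address, so that
   the result is produced on the AGU where it will be consumed.  */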
21317 
21318 bool
21319 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21320 {
21321   unsigned int regno0, regno1;
21322 
21323   /* Check if we need to optimize.  */
21324   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21325     return false;
21326 
21327   /* Use lea for reg to reg moves only.  */
21328   if (!REG_P (operands[0]) || !REG_P (operands[1]))
21329     return false;
21330 
21331   regno0 = true_regnum (operands[0]);
21332   regno1 = true_regnum (operands[1]);
21333 
21334   return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21335 }
21336 
21337 /* Return true if we need to split an lea into a sequence of
21338    instructions to avoid AGU stalls.  */
21339 
21340 bool
21341 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21342 {
21343   unsigned int regno0, regno1, regno2;
21344   int split_cost;
21345   struct ix86_address parts;
21346   int ok;
21347 
21348   /* Check if we need to optimize.  */
21349   if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21350     return false;
21351 
21352   /* The "at least two components" test below might not catch simple
21353      move or zero extension insns if parts.base is non-NULL and parts.disp
21354      is const0_rtx as the only components in the address, e.g. if the
21355      register is %rbp or %r13.  As this test is much cheaper and moves or
21356      zero extensions are the common case, do this check first.  */
21357   if (REG_P (operands[1])
21358       || (SImode_address_operand (operands[1], VOIDmode)
21359 	  && REG_P (XEXP (operands[1], 0))))
21360     return false;
21361 
21362   /* Check if it is OK to split here.  */
21363   if (!ix86_ok_to_clobber_flags (insn))
21364     return false;
21365 
21366   ok = ix86_decompose_address (operands[1], &parts);
21367   gcc_assert (ok);
21368 
21369   /* There should be at least two components in the address.  */
21370   if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21371       + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21372     return false;
21373 
21374   /* We should not split into add if a non-legitimate PIC
21375      operand is used as the displacement.  */
21376   if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21377     return false;
21378 
21379   regno0 = true_regnum (operands[0]);
21380   regno1 = INVALID_REGNUM;
21381   regno2 = INVALID_REGNUM;
21382 
21383   if (parts.base)
21384     regno1 = true_regnum (parts.base);
21385   if (parts.index)
21386     regno2 = true_regnum (parts.index);
21387 
21388   split_cost = 0;
21389 
21390   /* Compute how many cycles we will add to execution time
21391      if we split the lea into a sequence of instructions.  */
21392   if (parts.base || parts.index)
21393     {
21394       /* Have to use a mov instruction if the non-destructive
21395 	 destination form is used.  */
21396       if (regno1 != regno0 && regno2 != regno0)
21397 	split_cost += 1;
21398 
21399       /* Have to add index to base if both exist.  */
21400       if (parts.base && parts.index)
21401 	split_cost += 1;
21402 
21403       /* Have to use shift and adds if scale is 2 or greater.  */
21404       if (parts.scale > 1)
21405 	{
21406 	  if (regno0 != regno1)
21407 	    split_cost += 1;
21408 	  else if (regno2 == regno0)
21409 	    split_cost += 4;
21410 	  else
21411 	    split_cost += parts.scale;
21412 	}
21413 
21414       /* Have to use an add instruction with an immediate if
21415 	 disp is nonzero.  */
21416       if (parts.disp && parts.disp != const0_rtx)
21417 	split_cost += 1;
21418 
21419       /* Subtract the price of lea.  */
21420       split_cost -= 1;
21421     }
21422 
21423   return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21424 				parts.scale > 1);
21425 }
21426 
21427 /* Emit x86 binary operand CODE in mode MODE, where the first operand
21428    matches destination.  RTX includes clobber of FLAGS_REG.  */
21429 
21430 static void
21431 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21432 		 rtx dst, rtx src)
21433 {
21434   rtx op, clob;
21435 
21436   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21437   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21438 
21439   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21440 }
21441 
21442 /* Return true if the def of REGNO1 is nearest to the insn.  */
21443 
21444 static bool
21445 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21446 {
21447   rtx_insn *prev = insn;
21448   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21449 
21450   if (insn == start)
21451     return false;
21452   while (prev && prev != start)
21453     {
21454       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21455 	{
21456 	  prev = PREV_INSN (prev);
21457 	  continue;
21458 	}
21459       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21460 	return true;
21461       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21462 	return false;
21463       prev = PREV_INSN (prev);
21464     }
21465 
21466   /* None of the regs is defined in the bb.  */
21467   return false;
21468 }
21469 
21470 /* Split an lea instruction into a sequence of instructions
21471    that execute on the ALU to avoid AGU stalls.
21472    It is assumed that the flags register may be clobbered
21473    at the lea's position.  */
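/* As a sketch of the transformation (register names purely
   illustrative), "lea 0x4(%rbx,%rcx,2), %rax" may become
       mov %rcx, %rax
       shl $1, %rax
       add %rbx, %rax
       add $0x4, %rax
   with the exact sequence chosen below depending on how the
   destination overlaps the base and index registers.  */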
21474 
21475 void
21476 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21477 {
21478   unsigned int regno0, regno1, regno2;
21479   struct ix86_address parts;
21480   rtx target, tmp;
21481   int ok, adds;
21482 
21483   ok = ix86_decompose_address (operands[1], &parts);
21484   gcc_assert (ok);
21485 
21486   target = gen_lowpart (mode, operands[0]);
21487 
21488   regno0 = true_regnum (target);
21489   regno1 = INVALID_REGNUM;
21490   regno2 = INVALID_REGNUM;
21491 
21492   if (parts.base)
21493     {
21494       parts.base = gen_lowpart (mode, parts.base);
21495       regno1 = true_regnum (parts.base);
21496     }
21497 
21498   if (parts.index)
21499     {
21500       parts.index = gen_lowpart (mode, parts.index);
21501       regno2 = true_regnum (parts.index);
21502     }
21503 
21504   if (parts.disp)
21505     parts.disp = gen_lowpart (mode, parts.disp);
21506 
21507   if (parts.scale > 1)
21508     {
21509       /* Case r1 = r1 + ...  */
21510       if (regno1 == regno0)
21511 	{
21512 	  /* If we have the case r1 = r1 + C * r2 then we
21513 	     would have to use multiplication, which is very
21514 	     expensive.  Assume the cost model is wrong if we
21515 	     have such a case here.  */
21516 	  gcc_assert (regno2 != regno0);
21517 
21518 	  for (adds = parts.scale; adds > 0; adds--)
21519 	    ix86_emit_binop (PLUS, mode, target, parts.index);
21520 	}
21521       else
21522 	{
21523 	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
21524 	  if (regno0 != regno2)
21525 	    emit_insn (gen_rtx_SET (target, parts.index));
21526 
21527 	  /* Use shift for scaling.  */
21528 	  ix86_emit_binop (ASHIFT, mode, target,
21529 			   GEN_INT (exact_log2 (parts.scale)));
21530 
21531 	  if (parts.base)
21532 	    ix86_emit_binop (PLUS, mode, target, parts.base);
21533 
21534 	  if (parts.disp && parts.disp != const0_rtx)
21535 	    ix86_emit_binop (PLUS, mode, target, parts.disp);
21536 	}
21537     }
21538   else if (!parts.base && !parts.index)
21539     {
21540       gcc_assert(parts.disp);
21541       emit_insn (gen_rtx_SET (target, parts.disp));
21542     }
21543   else
21544     {
21545       if (!parts.base)
21546 	{
21547 	  if (regno0 != regno2)
21548 	    emit_insn (gen_rtx_SET (target, parts.index));
21549 	}
21550       else if (!parts.index)
21551 	{
21552 	  if (regno0 != regno1)
21553 	    emit_insn (gen_rtx_SET (target, parts.base));
21554 	}
21555       else
21556 	{
21557 	  if (regno0 == regno1)
21558 	    tmp = parts.index;
21559 	  else if (regno0 == regno2)
21560 	    tmp = parts.base;
21561 	  else
21562 	    {
21563 	      rtx tmp1;
21564 
21565 	      /* Find better operand for SET instruction, depending
21566 		 on which definition is farther from the insn.  */
21567 	      if (find_nearest_reg_def (insn, regno1, regno2))
21568 		tmp = parts.index, tmp1 = parts.base;
21569 	      else
21570 		tmp = parts.base, tmp1 = parts.index;
21571 
21572 	      emit_insn (gen_rtx_SET (target, tmp));
21573 
21574 	      if (parts.disp && parts.disp != const0_rtx)
21575 		ix86_emit_binop (PLUS, mode, target, parts.disp);
21576 
21577 	      ix86_emit_binop (PLUS, mode, target, tmp1);
21578 	      return;
21579 	    }
21580 
21581 	  ix86_emit_binop (PLUS, mode, target, tmp);
21582 	}
21583 
21584       if (parts.disp && parts.disp != const0_rtx)
21585 	ix86_emit_binop (PLUS, mode, target, parts.disp);
21586     }
21587 }
21588 
21589 /* Return true if it is OK to optimize an ADD operation into an LEA
21590    operation to avoid flag register consumption.  For most processors,
21591    ADD is faster than LEA.  For processors like BONNELL, if the
21592    destination register of the LEA holds an actual address which will be
21593    used soon, LEA is better; otherwise ADD is better.  */
21594 
21595 bool
21596 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21597 {
21598   unsigned int regno0 = true_regnum (operands[0]);
21599   unsigned int regno1 = true_regnum (operands[1]);
21600   unsigned int regno2 = true_regnum (operands[2]);
21601 
21602   /* If a = b + c (a != b && a != c), we must use the lea form.  */
21603   if (regno0 != regno1 && regno0 != regno2)
21604     return true;
21605 
21606   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21607     return false;
21608 
21609   return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21610 }
21611 
21612 /* Return true if destination reg of SET_BODY is shift count of
21613    USE_BODY.  */
21614 
21615 static bool
21616 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21617 {
21618   rtx set_dest;
21619   rtx shift_rtx;
21620   int i;
21621 
21622   /* Retrieve destination of SET_BODY.  */
21623   switch (GET_CODE (set_body))
21624     {
21625     case SET:
21626       set_dest = SET_DEST (set_body);
21627       if (!set_dest || !REG_P (set_dest))
21628 	return false;
21629       break;
21630     case PARALLEL:
21631       for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21632 	if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21633 					  use_body))
21634 	  return true;
21635       /* FALLTHROUGH */
21636     default:
21637       return false;
21638     }
21639 
21640   /* Retrieve shift count of USE_BODY.  */
21641   switch (GET_CODE (use_body))
21642     {
21643     case SET:
21644       shift_rtx = XEXP (use_body, 1);
21645       break;
21646     case PARALLEL:
21647       for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21648 	if (ix86_dep_by_shift_count_body (set_body,
21649 					  XVECEXP (use_body, 0, i)))
21650 	  return true;
21651       /* FALLTHROUGH */
21652     default:
21653       return false;
21654     }
21655 
21656   if (shift_rtx
21657       && (GET_CODE (shift_rtx) == ASHIFT
21658 	  || GET_CODE (shift_rtx) == LSHIFTRT
21659 	  || GET_CODE (shift_rtx) == ASHIFTRT
21660 	  || GET_CODE (shift_rtx) == ROTATE
21661 	  || GET_CODE (shift_rtx) == ROTATERT))
21662     {
21663       rtx shift_count = XEXP (shift_rtx, 1);
21664 
21665       /* Return true if shift count is dest of SET_BODY.  */
21666       if (REG_P (shift_count))
21667 	{
21668 	  /* Add this check since the function can be invoked before
21669 	     register allocation by the pre-reload scheduler.  */
21670 	  if (reload_completed
21671 	      && true_regnum (set_dest) == true_regnum (shift_count))
21672 	    return true;
21673 	  else if (REGNO(set_dest) == REGNO(shift_count))
21674 	    return true;
21675 	}
21676     }
21677 
21678   return false;
21679 }
21680 
21681 /* Return true if destination reg of SET_INSN is shift count of
21682    USE_INSN.  */
21683 
21684 bool
21685 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21686 {
21687   return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21688 				       PATTERN (use_insn));
21689 }
21690 
21691 /* Return TRUE or FALSE depending on whether the unary operator meets the
21692    appropriate constraints.  */
21693 
21694 bool
21695 ix86_unary_operator_ok (enum rtx_code,
21696 			machine_mode,
21697 			rtx operands[2])
21698 {
21699   /* If one of operands is memory, source and destination must match.  */
21700   if ((MEM_P (operands[0])
21701        || MEM_P (operands[1]))
21702       && ! rtx_equal_p (operands[0], operands[1]))
21703     return false;
21704   return true;
21705 }
21706 
21707 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21708    are ok, keeping in mind the possible movddup alternative.  */
21709 
21710 bool
21711 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21712 {
21713   if (MEM_P (operands[0]))
21714     return rtx_equal_p (operands[0], operands[1 + high]);
21715   if (MEM_P (operands[1]) && MEM_P (operands[2]))
21716     return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21717   return true;
21718 }
21719 
21720 /* Post-reload splitter for converting an SFmode or DFmode value in an
21721    SSE register into an unsigned SImode value.  */
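/* The idea, sketched here and implemented branch-free below: if the
   value is >= 0x1p31, subtract 0x1p31 before the (signed) truncating
   conversion and xor 0x80000000 back into the integer result;
   otherwise convert directly.  A comparison mask selects between the
   two cases per element.  */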
21722 
21723 void
21724 ix86_split_convert_uns_si_sse (rtx operands[])
21725 {
21726   machine_mode vecmode;
21727   rtx value, large, zero_or_two31, input, two31, x;
21728 
21729   large = operands[1];
21730   zero_or_two31 = operands[2];
21731   input = operands[3];
21732   two31 = operands[4];
21733   vecmode = GET_MODE (large);
21734   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21735 
21736   /* Load up the value into the low element.  We must ensure that the other
21737      elements are valid floats -- zero is the easiest such value.  */
21738   if (MEM_P (input))
21739     {
21740       if (vecmode == V4SFmode)
21741 	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21742       else
21743 	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21744     }
21745   else
21746     {
21747       input = gen_rtx_REG (vecmode, REGNO (input));
21748       emit_move_insn (value, CONST0_RTX (vecmode));
21749       if (vecmode == V4SFmode)
21750 	emit_insn (gen_sse_movss (value, value, input));
21751       else
21752 	emit_insn (gen_sse2_movsd (value, value, input));
21753     }
21754 
21755   emit_move_insn (large, two31);
21756   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21757 
21758   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21759   emit_insn (gen_rtx_SET (large, x));
21760 
21761   x = gen_rtx_AND (vecmode, zero_or_two31, large);
21762   emit_insn (gen_rtx_SET (zero_or_two31, x));
21763 
21764   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21765   emit_insn (gen_rtx_SET (value, x));
21766 
21767   large = gen_rtx_REG (V4SImode, REGNO (large));
21768   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21769 
21770   x = gen_rtx_REG (V4SImode, REGNO (value));
21771   if (vecmode == V4SFmode)
21772     emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21773   else
21774     emit_insn (gen_sse2_cvttpd2dq (x, value));
21775   value = x;
21776 
21777   emit_insn (gen_xorv4si3 (value, value, large));
21778 }
21779 
21780 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21781    Expects the 64-bit DImode to be supplied in a pair of integral
21782    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
21783    -mfpmath=sse, !optimize_size only.  */
21784 
21785 void
21786 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21787 {
21788   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21789   rtx int_xmm, fp_xmm;
21790   rtx biases, exponents;
21791   rtx x;
21792 
21793   int_xmm = gen_reg_rtx (V4SImode);
21794   if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21795     emit_insn (gen_movdi_to_sse (int_xmm, input));
21796   else if (TARGET_SSE_SPLIT_REGS)
21797     {
21798       emit_clobber (int_xmm);
21799       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21800     }
21801   else
21802     {
21803       x = gen_reg_rtx (V2DImode);
21804       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21805       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21806     }
21807 
21808   x = gen_rtx_CONST_VECTOR (V4SImode,
21809 			    gen_rtvec (4, GEN_INT (0x43300000UL),
21810 				       GEN_INT (0x45300000UL),
21811 				       const0_rtx, const0_rtx));
21812   exponents = validize_mem (force_const_mem (V4SImode, x));
21813 
21814   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21815   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21816 
21817   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21818      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21819      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21820      (0x1.0p84 + double(fp_value_hi_xmm)).
21821      Note these exponents differ by 32.  */
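  /* Put differently (a sketch of the arithmetic): with INPUT
     = hi * 2**32 + lo, the two double lanes hold exactly
     (0x1.0p52 + lo) and (0x1.0p84 + hi * 2**32); subtracting the
     biases below is exact, and the final addition of the two lanes
     rounds once, yielding (double) INPUT.  */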
21822 
21823   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21824 
21825   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21826      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
21827   real_ldexp (&bias_lo_rvt, &dconst1, 52);
21828   real_ldexp (&bias_hi_rvt, &dconst1, 84);
21829   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21830   x = const_double_from_real_value (bias_hi_rvt, DFmode);
21831   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21832   biases = validize_mem (force_const_mem (V2DFmode, biases));
21833   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21834 
21835   /* Add the upper and lower DFmode values together.  */
21836   if (TARGET_SSE3)
21837     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21838   else
21839     {
21840       x = copy_to_mode_reg (V2DFmode, fp_xmm);
21841       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21842       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21843     }
21844 
21845   ix86_expand_vector_extract (false, target, fp_xmm, 0);
21846 }
21847 
21848 /* Not used, but eases macroization of patterns.  */
21849 void
21850 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21851 {
21852   gcc_unreachable ();
21853 }
21854 
21855 /* Convert an unsigned SImode value into a DFmode.  Only currently used
21856    for SSE, but applicable anywhere.  */
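/* A sketch of the expansion below (not a literal source transformation):
     (double) u == (double) (int) (u - 0x80000000u) + 0x1p31
   i.e. bias the unsigned value into the signed range, use the signed
   int-to-double conversion, and add 2**31 back in floating point,
   where both steps are exact in DFmode.  */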
21857 
21858 void
21859 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21860 {
21861   REAL_VALUE_TYPE TWO31r;
21862   rtx x, fp;
21863 
21864   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21865 			   NULL, 1, OPTAB_DIRECT);
21866 
21867   fp = gen_reg_rtx (DFmode);
21868   emit_insn (gen_floatsidf2 (fp, x));
21869 
21870   real_ldexp (&TWO31r, &dconst1, 31);
21871   x = const_double_from_real_value (TWO31r, DFmode);
21872 
21873   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21874   if (x != target)
21875     emit_move_insn (target, x);
21876 }
21877 
21878 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
21879    32-bit mode; otherwise we have a direct convert instruction.  */
21880 
21881 void
21882 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21883 {
21884   REAL_VALUE_TYPE TWO32r;
21885   rtx fp_lo, fp_hi, x;
21886 
21887   fp_lo = gen_reg_rtx (DFmode);
21888   fp_hi = gen_reg_rtx (DFmode);
21889 
21890   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21891 
21892   real_ldexp (&TWO32r, &dconst1, 32);
21893   x = const_double_from_real_value (TWO32r, DFmode);
21894   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21895 
21896   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21897 
21898   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21899 			   0, OPTAB_DIRECT);
21900   if (x != target)
21901     emit_move_insn (target, x);
21902 }
21903 
21904 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21905    For x86_32, -mfpmath=sse, !optimize_size only.  */
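/* Roughly:  (float) u == (float) (u >> 16) * 0x1p16f + (float) (u & 0xffff).
   Both 16-bit halves convert exactly to SFmode, and only the final
   addition rounds.  */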
21906 void
21907 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21908 {
21909   REAL_VALUE_TYPE ONE16r;
21910   rtx fp_hi, fp_lo, int_hi, int_lo, x;
21911 
21912   real_ldexp (&ONE16r, &dconst1, 16);
21913   x = const_double_from_real_value (ONE16r, SFmode);
21914   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21915 				      NULL, 0, OPTAB_DIRECT);
21916   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21917 				      NULL, 0, OPTAB_DIRECT);
21918   fp_hi = gen_reg_rtx (SFmode);
21919   fp_lo = gen_reg_rtx (SFmode);
21920   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21921   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21922   fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21923 			       0, OPTAB_DIRECT);
21924   fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21925 			       0, OPTAB_DIRECT);
21926   if (!rtx_equal_p (target, fp_hi))
21927     emit_move_insn (target, fp_hi);
21928 }
21929 
21930 /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
21931    a vector of unsigned ints VAL to vector of floats TARGET.  */
21932 
21933 void
21934 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21935 {
21936   rtx tmp[8];
21937   REAL_VALUE_TYPE TWO16r;
21938   machine_mode intmode = GET_MODE (val);
21939   machine_mode fltmode = GET_MODE (target);
21940   rtx (*cvt) (rtx, rtx);
21941 
21942   if (intmode == V4SImode)
21943     cvt = gen_floatv4siv4sf2;
21944   else
21945     cvt = gen_floatv8siv8sf2;
21946   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21947   tmp[0] = force_reg (intmode, tmp[0]);
21948   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21949 				OPTAB_DIRECT);
21950   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21951 				NULL_RTX, 1, OPTAB_DIRECT);
21952   tmp[3] = gen_reg_rtx (fltmode);
21953   emit_insn (cvt (tmp[3], tmp[1]));
21954   tmp[4] = gen_reg_rtx (fltmode);
21955   emit_insn (cvt (tmp[4], tmp[2]));
21956   real_ldexp (&TWO16r, &dconst1, 16);
21957   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21958   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21959   tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21960 				OPTAB_DIRECT);
21961   tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21962 				OPTAB_DIRECT);
21963   if (tmp[7] != target)
21964     emit_move_insn (target, tmp[7]);
21965 }
21966 
21967 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21968    pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21969    This is done by doing just signed conversion if < 0x1p31, and otherwise by
21970    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
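/* In effect, per element (a sketch):
     if (val < 0x1p31)  adjusted = val,           xor-lane = 0;
     else               adjusted = val - 0x1p31,  xor-lane = 0x80000000;
   so the caller's signed fix_trunc of the adjusted value, xor'ed with
   *XORP, gives the unsigned result.  */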
21971 
21972 rtx
21973 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21974 {
21975   REAL_VALUE_TYPE TWO31r;
21976   rtx two31r, tmp[4];
21977   machine_mode mode = GET_MODE (val);
21978   machine_mode scalarmode = GET_MODE_INNER (mode);
21979   machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21980   rtx (*cmp) (rtx, rtx, rtx, rtx);
21981   int i;
21982 
21983   for (i = 0; i < 3; i++)
21984     tmp[i] = gen_reg_rtx (mode);
21985   real_ldexp (&TWO31r, &dconst1, 31);
21986   two31r = const_double_from_real_value (TWO31r, scalarmode);
21987   two31r = ix86_build_const_vector (mode, 1, two31r);
21988   two31r = force_reg (mode, two31r);
21989   switch (mode)
21990     {
21991     case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21992     case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21993     case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21994     case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21995     default: gcc_unreachable ();
21996     }
21997   tmp[3] = gen_rtx_LE (mode, two31r, val);
21998   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21999   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22000 				0, OPTAB_DIRECT);
22001   if (intmode == V4SImode || TARGET_AVX2)
22002     *xorp = expand_simple_binop (intmode, ASHIFT,
22003 				 gen_lowpart (intmode, tmp[0]),
22004 				 GEN_INT (31), NULL_RTX, 0,
22005 				 OPTAB_DIRECT);
22006   else
22007     {
22008       rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22009       two31 = ix86_build_const_vector (intmode, 1, two31);
22010       *xorp = expand_simple_binop (intmode, AND,
22011 				   gen_lowpart (intmode, tmp[0]),
22012 				   two31, NULL_RTX, 0,
22013 				   OPTAB_DIRECT);
22014     }
22015   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22016 			      0, OPTAB_DIRECT);
22017 }
22018 
22019 /* A subroutine of ix86_build_signbit_mask.  If VECT is true,
22020    then replicate the value for all elements of the vector
22021    register.  */
22022 
22023 rtx
22024 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22025 {
22026   int i, n_elt;
22027   rtvec v;
22028   machine_mode scalar_mode;
22029 
22030   switch (mode)
22031     {
22032     case E_V64QImode:
22033     case E_V32QImode:
22034     case E_V16QImode:
22035     case E_V32HImode:
22036     case E_V16HImode:
22037     case E_V8HImode:
22038     case E_V16SImode:
22039     case E_V8SImode:
22040     case E_V4SImode:
22041     case E_V8DImode:
22042     case E_V4DImode:
22043     case E_V2DImode:
22044       gcc_assert (vect);
22045       /* FALLTHRU */
22046     case E_V16SFmode:
22047     case E_V8SFmode:
22048     case E_V4SFmode:
22049     case E_V8DFmode:
22050     case E_V4DFmode:
22051     case E_V2DFmode:
22052       n_elt = GET_MODE_NUNITS (mode);
22053       v = rtvec_alloc (n_elt);
22054       scalar_mode = GET_MODE_INNER (mode);
22055 
22056       RTVEC_ELT (v, 0) = value;
22057 
22058       for (i = 1; i < n_elt; ++i)
22059 	RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22060 
22061       return gen_rtx_CONST_VECTOR (mode, v);
22062 
22063     default:
22064       gcc_unreachable ();
22065     }
22066 }
22067 
22068 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22069    and ix86_expand_int_vcond.  Create a mask for the sign bit in MODE
22070    for an SSE register.  If VECT is true, then replicate the mask for
22071    all elements of the vector register.  If INVERT is true, then create
22072    a mask excluding the sign bit.  */
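/* For example (illustrative values): for V4SFmode each selected element
   is 0x80000000 (or its complement 0x7fffffff when INVERT); for V2DFmode
   it is 0x8000000000000000 (or 0x7fffffffffffffff).  */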
22073 
22074 rtx
22075 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22076 {
22077   machine_mode vec_mode, imode;
22078   wide_int w;
22079   rtx mask, v;
22080 
22081   switch (mode)
22082     {
22083     case E_V16SImode:
22084     case E_V16SFmode:
22085     case E_V8SImode:
22086     case E_V4SImode:
22087     case E_V8SFmode:
22088     case E_V4SFmode:
22089       vec_mode = mode;
22090       imode = SImode;
22091       break;
22092 
22093     case E_V8DImode:
22094     case E_V4DImode:
22095     case E_V2DImode:
22096     case E_V8DFmode:
22097     case E_V4DFmode:
22098     case E_V2DFmode:
22099       vec_mode = mode;
22100       imode = DImode;
22101       break;
22102 
22103     case E_TImode:
22104     case E_TFmode:
22105       vec_mode = VOIDmode;
22106       imode = TImode;
22107       break;
22108 
22109     default:
22110       gcc_unreachable ();
22111     }
22112 
22113   machine_mode inner_mode = GET_MODE_INNER (mode);
22114   w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22115 			   GET_MODE_BITSIZE (inner_mode));
22116   if (invert)
22117     w = wi::bit_not (w);
22118 
22119   /* Force this value into the low part of a fp vector constant.  */
22120   mask = immed_wide_int_const (w, imode);
22121   mask = gen_lowpart (inner_mode, mask);
22122 
22123   if (vec_mode == VOIDmode)
22124     return force_reg (inner_mode, mask);
22125 
22126   v = ix86_build_const_vector (vec_mode, vect, mask);
22127   return force_reg (vec_mode, v);
22128 }
22129 
22130 /* Generate code for floating point ABS or NEG.  */
22131 
22132 void
22133 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22134 				rtx operands[])
22135 {
22136   rtx mask, set, dst, src;
22137   bool use_sse = false;
22138   bool vector_mode = VECTOR_MODE_P (mode);
22139   machine_mode vmode = mode;
22140 
22141   if (vector_mode)
22142     use_sse = true;
22143   else if (mode == TFmode)
22144     use_sse = true;
22145   else if (TARGET_SSE_MATH)
22146     {
22147       use_sse = SSE_FLOAT_MODE_P (mode);
22148       if (mode == SFmode)
22149 	vmode = V4SFmode;
22150       else if (mode == DFmode)
22151 	vmode = V2DFmode;
22152     }
22153 
22154   /* NEG and ABS performed with SSE use bitwise mask operations.
22155      Create the appropriate mask now.  */
22156   if (use_sse)
22157     mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22158   else
22159     mask = NULL_RTX;
22160 
22161   dst = operands[0];
22162   src = operands[1];
22163 
22164   set = gen_rtx_fmt_e (code, mode, src);
22165   set = gen_rtx_SET (dst, set);
22166 
22167   if (mask)
22168     {
22169       rtx use, clob;
22170       rtvec par;
22171 
22172       use = gen_rtx_USE (VOIDmode, mask);
22173       if (vector_mode)
22174 	par = gen_rtvec (2, set, use);
22175       else
22176 	{
22177           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22178 	  par = gen_rtvec (3, set, use, clob);
22179         }
22180       emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22181     }
22182   else
22183     emit_insn (set);
22184 }
22185 
22186 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
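/* The underlying bit-twiddling (a sketch; the constant and variable
   splitters below handle the details) is
     copysign (x, y) = (x & ~SIGNMASK) | (y & SIGNMASK)
   carried out in the corresponding vector mode.  */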
22187 
22188 void
22189 ix86_expand_copysign (rtx operands[])
22190 {
22191   machine_mode mode, vmode;
22192   rtx dest, op0, op1, mask, nmask;
22193 
22194   dest = operands[0];
22195   op0 = operands[1];
22196   op1 = operands[2];
22197 
22198   mode = GET_MODE (dest);
22199 
22200   if (mode == SFmode)
22201     vmode = V4SFmode;
22202   else if (mode == DFmode)
22203     vmode = V2DFmode;
22204   else
22205     vmode = mode;
22206 
22207   if (CONST_DOUBLE_P (op0))
22208     {
22209       rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22210 
22211       if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22212 	op0 = simplify_unary_operation (ABS, mode, op0, mode);
22213 
22214       if (mode == SFmode || mode == DFmode)
22215 	{
22216 	  if (op0 == CONST0_RTX (mode))
22217 	    op0 = CONST0_RTX (vmode);
22218 	  else
22219 	    {
22220 	      rtx v = ix86_build_const_vector (vmode, false, op0);
22221 
22222 	      op0 = force_reg (vmode, v);
22223 	    }
22224 	}
22225       else if (op0 != CONST0_RTX (mode))
22226 	op0 = force_reg (mode, op0);
22227 
22228       mask = ix86_build_signbit_mask (vmode, 0, 0);
22229 
22230       if (mode == SFmode)
22231 	copysign_insn = gen_copysignsf3_const;
22232       else if (mode == DFmode)
22233 	copysign_insn = gen_copysigndf3_const;
22234       else
22235 	copysign_insn = gen_copysigntf3_const;
22236 
22237       emit_insn (copysign_insn (dest, op0, op1, mask));
22238     }
22239   else
22240     {
22241       rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22242 
22243       nmask = ix86_build_signbit_mask (vmode, 0, 1);
22244       mask = ix86_build_signbit_mask (vmode, 0, 0);
22245 
22246       if (mode == SFmode)
22247 	copysign_insn = gen_copysignsf3_var;
22248       else if (mode == DFmode)
22249 	copysign_insn = gen_copysigndf3_var;
22250       else
22251 	copysign_insn = gen_copysigntf3_var;
22252 
22253       emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22254     }
22255 }
22256 
22257 /* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
22258    be a constant, and so has already been expanded into a vector constant.  */
22259 
22260 void
22261 ix86_split_copysign_const (rtx operands[])
22262 {
22263   machine_mode mode, vmode;
22264   rtx dest, op0, mask, x;
22265 
22266   dest = operands[0];
22267   op0 = operands[1];
22268   mask = operands[3];
22269 
22270   mode = GET_MODE (dest);
22271   vmode = GET_MODE (mask);
22272 
22273   dest = lowpart_subreg (vmode, dest, mode);
22274   x = gen_rtx_AND (vmode, dest, mask);
22275   emit_insn (gen_rtx_SET (dest, x));
22276 
22277   if (op0 != CONST0_RTX (vmode))
22278     {
22279       x = gen_rtx_IOR (vmode, dest, op0);
22280       emit_insn (gen_rtx_SET (dest, x));
22281     }
22282 }
22283 
22284 /* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
22285    so we have to do two masks.  */
22286 
22287 void
22288 ix86_split_copysign_var (rtx operands[])
22289 {
22290   machine_mode mode, vmode;
22291   rtx dest, scratch, op0, op1, mask, nmask, x;
22292 
22293   dest = operands[0];
22294   scratch = operands[1];
22295   op0 = operands[2];
22296   op1 = operands[3];
22297   nmask = operands[4];
22298   mask = operands[5];
22299 
22300   mode = GET_MODE (dest);
22301   vmode = GET_MODE (mask);
22302 
22303   if (rtx_equal_p (op0, op1))
22304     {
22305       /* Shouldn't happen often (it's useless, obviously), but when it does
22306 	 we'd generate incorrect code if we continue below.  */
22307       emit_move_insn (dest, op0);
22308       return;
22309     }
22310 
22311   if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
22312     {
22313       gcc_assert (REGNO (op1) == REGNO (scratch));
22314 
22315       x = gen_rtx_AND (vmode, scratch, mask);
22316       emit_insn (gen_rtx_SET (scratch, x));
22317 
22318       dest = mask;
22319       op0 = lowpart_subreg (vmode, op0, mode);
22320       x = gen_rtx_NOT (vmode, dest);
22321       x = gen_rtx_AND (vmode, x, op0);
22322       emit_insn (gen_rtx_SET (dest, x));
22323     }
22324   else
22325     {
22326       if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
22327 	{
22328 	  x = gen_rtx_AND (vmode, scratch, mask);
22329 	}
22330       else						/* alternative 2,4 */
22331 	{
22332           gcc_assert (REGNO (mask) == REGNO (scratch));
22333           op1 = lowpart_subreg (vmode, op1, mode);
22334 	  x = gen_rtx_AND (vmode, scratch, op1);
22335 	}
22336       emit_insn (gen_rtx_SET (scratch, x));
22337 
22338       if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
22339 	{
22340 	  dest = lowpart_subreg (vmode, op0, mode);
22341 	  x = gen_rtx_AND (vmode, dest, nmask);
22342 	}
22343       else						/* alternative 3,4 */
22344 	{
22345           gcc_assert (REGNO (nmask) == REGNO (dest));
22346 	  dest = nmask;
22347 	  op0 = lowpart_subreg (vmode, op0, mode);
22348 	  x = gen_rtx_AND (vmode, dest, op0);
22349 	}
22350       emit_insn (gen_rtx_SET (dest, x));
22351     }
22352 
22353   x = gen_rtx_IOR (vmode, dest, scratch);
22354   emit_insn (gen_rtx_SET (dest, x));
22355 }
22356 
22357 /* Return TRUE or FALSE depending on whether the first SET in INSN
22358    has source and destination with matching CC modes, and that the
22359    CC mode is at least as constrained as REQ_MODE.  */
22360 
22361 bool
22362 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22363 {
22364   rtx set;
22365   machine_mode set_mode;
22366 
22367   set = PATTERN (insn);
22368   if (GET_CODE (set) == PARALLEL)
22369     set = XVECEXP (set, 0, 0);
22370   gcc_assert (GET_CODE (set) == SET);
22371   gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22372 
22373   set_mode = GET_MODE (SET_DEST (set));
22374   switch (set_mode)
22375     {
22376     case E_CCNOmode:
22377       if (req_mode != CCNOmode
22378 	  && (req_mode != CCmode
22379 	      || XEXP (SET_SRC (set), 1) != const0_rtx))
22380 	return false;
22381       break;
22382     case E_CCmode:
22383       if (req_mode == CCGCmode)
22384 	return false;
22385       /* FALLTHRU */
22386     case E_CCGCmode:
22387       if (req_mode == CCGOCmode || req_mode == CCNOmode)
22388 	return false;
22389       /* FALLTHRU */
22390     case E_CCGOCmode:
22391       if (req_mode == CCZmode)
22392 	return false;
22393       /* FALLTHRU */
22394     case E_CCZmode:
22395       break;
22396 
22397     case E_CCGZmode:
22398 
22399     case E_CCAmode:
22400     case E_CCCmode:
22401     case E_CCOmode:
22402     case E_CCPmode:
22403     case E_CCSmode:
22404       if (set_mode != req_mode)
22405 	return false;
22406       break;
22407 
22408     default:
22409       gcc_unreachable ();
22410     }
22411 
22412   return GET_MODE (SET_SRC (set)) == set_mode;
22413 }
22414 
22415 /* Generate insn patterns to do an integer compare of OPERANDS.  */
22416 
22417 static rtx
22418 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22419 {
22420   machine_mode cmpmode;
22421   rtx tmp, flags;
22422 
22423   cmpmode = SELECT_CC_MODE (code, op0, op1);
22424   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22425 
22426   /* This is very simple, but making the interface the same as in the
22427      FP case makes the rest of the code easier.  */
22428   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22429   emit_insn (gen_rtx_SET (flags, tmp));
22430 
22431   /* Return the test that should be put into the flags user, i.e.
22432      the bcc, scc, or cmov instruction.  */
22433   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22434 }
22435 
22436 /* Figure out whether to use unordered fp comparisons.  */
22437 
22438 static bool
22439 ix86_unordered_fp_compare (enum rtx_code code)
22440 {
22441   if (!TARGET_IEEE_FP)
22442     return false;
22443 
22444   switch (code)
22445     {
22446     case GT:
22447     case GE:
22448     case LT:
22449     case LE:
22450       return false;
22451 
22452     case EQ:
22453     case NE:
22454 
22455     case LTGT:
22456     case UNORDERED:
22457     case ORDERED:
22458     case UNLT:
22459     case UNLE:
22460     case UNGT:
22461     case UNGE:
22462     case UNEQ:
22463       return true;
22464 
22465     default:
22466       gcc_unreachable ();
22467     }
22468 }
22469 
22470 machine_mode
22471 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22472 {
22473   machine_mode mode = GET_MODE (op0);
22474 
22475   if (SCALAR_FLOAT_MODE_P (mode))
22476     {
22477       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22478       return CCFPmode;
22479     }
22480 
22481   switch (code)
22482     {
22483       /* Only zero flag is needed.  */
22484     case EQ:			/* ZF=0 */
22485     case NE:			/* ZF!=0 */
22486       return CCZmode;
22487       /* Codes needing carry flag.  */
22488     case GEU:			/* CF=0 */
22489     case LTU:			/* CF=1 */
22490       /* Detect overflow checks.  They need just the carry flag.  */
22491       if (GET_CODE (op0) == PLUS
22492 	  && (rtx_equal_p (op1, XEXP (op0, 0))
22493 	      || rtx_equal_p (op1, XEXP (op0, 1))))
22494 	return CCCmode;
22495       else
22496 	return CCmode;
22497     case GTU:			/* CF=0 & ZF=0 */
22498     case LEU:			/* CF=1 | ZF=1 */
22499       return CCmode;
22500       /* Codes possibly doable only with sign flag when
22501          comparing against zero.  */
22502     case GE:			/* SF=OF   or   SF=0 */
22503     case LT:			/* SF<>OF  or   SF=1 */
22504       if (op1 == const0_rtx)
22505 	return CCGOCmode;
22506       else
22507 	/* For other cases Carry flag is not required.  */
22508 	return CCGCmode;
22509       /* Codes doable with only the sign flag when comparing
22510          against zero, but for which we lack a jump instruction,
22511          so we use relational tests against overflow, which thus
22512          needs to be zero.  */
22513     case GT:			/* ZF=0 & SF=OF */
22514     case LE:			/* ZF=1 | SF<>OF */
22515       if (op1 == const0_rtx)
22516 	return CCNOmode;
22517       else
22518 	return CCGCmode;
22519       /* The strcmp pattern does (use flags), and combine may ask us for
22520 	 the proper mode.  */
22521     case USE:
22522       return CCmode;
22523     default:
22524       gcc_unreachable ();
22525     }
22526 }
22527 
22528 /* Return the fixed registers used for condition codes.  */
22529 
22530 static bool
22531 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22532 {
22533   *p1 = FLAGS_REG;
22534   *p2 = FPSR_REG;
22535   return true;
22536 }
22537 
22538 /* If two condition code modes are compatible, return a condition code
22539    mode which is compatible with both.  Otherwise, return
22540    VOIDmode.  */
22541 
22542 static machine_mode
22543 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22544 {
22545   if (m1 == m2)
22546     return m1;
22547 
22548   if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22549     return VOIDmode;
22550 
22551   if ((m1 == CCGCmode && m2 == CCGOCmode)
22552       || (m1 == CCGOCmode && m2 == CCGCmode))
22553     return CCGCmode;
22554 
22555   if ((m1 == CCNOmode && m2 == CCGOCmode)
22556       || (m1 == CCGOCmode && m2 == CCNOmode))
22557     return CCNOmode;
22558 
22559   if (m1 == CCZmode
22560       && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22561     return m2;
22562   else if (m2 == CCZmode
22563 	   && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22564     return m1;
22565 
22566   switch (m1)
22567     {
22568     default:
22569       gcc_unreachable ();
22570 
22571     case E_CCmode:
22572     case E_CCGCmode:
22573     case E_CCGOCmode:
22574     case E_CCNOmode:
22575     case E_CCAmode:
22576     case E_CCCmode:
22577     case E_CCOmode:
22578     case E_CCPmode:
22579     case E_CCSmode:
22580     case E_CCZmode:
22581       switch (m2)
22582 	{
22583 	default:
22584 	  return VOIDmode;
22585 
22586 	case E_CCmode:
22587 	case E_CCGCmode:
22588 	case E_CCGOCmode:
22589 	case E_CCNOmode:
22590 	case E_CCAmode:
22591 	case E_CCCmode:
22592 	case E_CCOmode:
22593 	case E_CCPmode:
22594 	case E_CCSmode:
22595 	case E_CCZmode:
22596 	  return CCmode;
22597 	}
22598 
22599     case E_CCFPmode:
22600       /* These are only compatible with themselves, which we already
22601 	 checked above.  */
22602       return VOIDmode;
22603     }
22604 }
22605 
22606 
22607 /* Return a comparison we can do that is equivalent to
22608    swap_condition (code), except possibly for orderedness.
22609    But never change orderedness if TARGET_IEEE_FP, returning
22610    UNKNOWN in that case if necessary.  */
22611 
22612 static enum rtx_code
22613 ix86_fp_swap_condition (enum rtx_code code)
22614 {
22615   switch (code)
22616     {
22617     case GT:                   /* GTU - CF=0 & ZF=0 */
22618       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22619     case GE:                   /* GEU - CF=0 */
22620       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22621     case UNLT:                 /* LTU - CF=1 */
22622       return TARGET_IEEE_FP ? UNKNOWN : GT;
22623     case UNLE:                 /* LEU - CF=1 | ZF=1 */
22624       return TARGET_IEEE_FP ? UNKNOWN : GE;
22625     default:
22626       return swap_condition (code);
22627     }
22628 }
22629 
22630 /* Return the cost of comparison CODE using the best strategy for performance.
22631    All of the following functions use the number of instructions as a cost metric.
22632    In the future this should be tweaked to compute bytes for optimize_size and
22633    to take into account the performance of various instructions on various CPUs.  */
22634 
22635 static int
22636 ix86_fp_comparison_cost (enum rtx_code code)
22637 {
22638   int arith_cost;
22639 
22640   /* The cost of code using bit-twiddling on %ah.  */
22641   switch (code)
22642     {
22643     case UNLE:
22644     case UNLT:
22645     case LTGT:
22646     case GT:
22647     case GE:
22648     case UNORDERED:
22649     case ORDERED:
22650     case UNEQ:
22651       arith_cost = 4;
22652       break;
22653     case LT:
22654     case NE:
22655     case EQ:
22656     case UNGE:
22657       arith_cost = TARGET_IEEE_FP ? 5 : 4;
22658       break;
22659     case LE:
22660     case UNGT:
22661       arith_cost = TARGET_IEEE_FP ? 6 : 4;
22662       break;
22663     default:
22664       gcc_unreachable ();
22665     }
22666 
22667   switch (ix86_fp_comparison_strategy (code))
22668     {
22669     case IX86_FPCMP_COMI:
22670       return arith_cost > 4 ? 3 : 2;
22671     case IX86_FPCMP_SAHF:
22672       return arith_cost > 4 ? 4 : 3;
22673     default:
22674       return arith_cost;
22675     }
22676 }
22677 
22678 /* Return the strategy to use for floating-point comparisons.  We assume that
22679    fcomi is always preferable where available, since that is also true when
22680    looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
22681 
22682 enum ix86_fpcmp_strategy
22683 ix86_fp_comparison_strategy (enum rtx_code)
22684 {
22685   /* Do fcomi/sahf based test when profitable.  */
22686 
22687   if (TARGET_CMOVE)
22688     return IX86_FPCMP_COMI;
22689 
22690   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22691     return IX86_FPCMP_SAHF;
22692 
22693   return IX86_FPCMP_ARITH;
22694 }
22695 
22696 /* Swap, force into registers, or otherwise massage the two operands
22697    to a fp comparison.  The operands are updated in place; the new
22698    comparison code is returned.  */
22699 
22700 static enum rtx_code
22701 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22702 {
22703   bool unordered_compare = ix86_unordered_fp_compare (code);
22704   rtx op0 = *pop0, op1 = *pop1;
22705   machine_mode op_mode = GET_MODE (op0);
22706   bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22707 
22708   /* All of the unordered compare instructions only work on registers.
22709      The same is true of the fcomi compare instructions.  The XFmode
22710      compare instructions require registers except when comparing
22711      against zero or when converting operand 1 from fixed point to
22712      floating point.  */
22713 
22714   if (!is_sse
22715       && (unordered_compare
22716 	  || (op_mode == XFmode
22717 	      && ! (standard_80387_constant_p (op0) == 1
22718 		    || standard_80387_constant_p (op1) == 1)
22719 	      && GET_CODE (op1) != FLOAT)
22720 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22721     {
22722       op0 = force_reg (op_mode, op0);
22723       op1 = force_reg (op_mode, op1);
22724     }
22725   else
22726     {
22727       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
22728 	 things around if they appear profitable, otherwise force op0
22729 	 into a register.  */
22730 
22731       if (standard_80387_constant_p (op0) == 0
22732 	  || (MEM_P (op0)
22733 	      && ! (standard_80387_constant_p (op1) == 0
22734 		    || MEM_P (op1))))
22735 	{
22736 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
22737 	  if (new_code != UNKNOWN)
22738 	    {
22739 	      std::swap (op0, op1);
22740 	      code = new_code;
22741 	    }
22742 	}
22743 
22744       if (!REG_P (op0))
22745 	op0 = force_reg (op_mode, op0);
22746 
22747       if (CONSTANT_P (op1))
22748 	{
22749 	  int tmp = standard_80387_constant_p (op1);
22750 	  if (tmp == 0)
22751 	    op1 = validize_mem (force_const_mem (op_mode, op1));
22752 	  else if (tmp == 1)
22753 	    {
22754 	      if (TARGET_CMOVE)
22755 		op1 = force_reg (op_mode, op1);
22756 	    }
22757 	  else
22758 	    op1 = force_reg (op_mode, op1);
22759 	}
22760     }
22761 
22762   /* Try to rearrange the comparison to make it cheaper.  */
22763   if (ix86_fp_comparison_cost (code)
22764       > ix86_fp_comparison_cost (swap_condition (code))
22765       && (REG_P (op1) || can_create_pseudo_p ()))
22766     {
22767       std::swap (op0, op1);
22768       code = swap_condition (code);
22769       if (!REG_P (op0))
22770 	op0 = force_reg (op_mode, op0);
22771     }
22772 
22773   *pop0 = op0;
22774   *pop1 = op1;
22775   return code;
22776 }
22777 
22778 /* Convert comparison codes we use to represent FP comparison to integer
22779    code that will result in proper branch.  Return UNKNOWN if no such code
22780    is available.  */
22781 
22782 enum rtx_code
22783 ix86_fp_compare_code_to_integer (enum rtx_code code)
22784 {
22785   switch (code)
22786     {
22787     case GT:
22788       return GTU;
22789     case GE:
22790       return GEU;
22791     case ORDERED:
22792     case UNORDERED:
22793       return code;
22794     case UNEQ:
22795       return EQ;
22796     case UNLT:
22797       return LTU;
22798     case UNLE:
22799       return LEU;
22800     case LTGT:
22801       return NE;
22802     default:
22803       return UNKNOWN;
22804     }
22805 }
22806 
22807 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
22808 
22809 static rtx
22810 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22811 {
22812   bool unordered_compare = ix86_unordered_fp_compare (code);
22813   machine_mode intcmp_mode;
22814   rtx tmp, tmp2;
22815 
22816   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22817 
22818   /* Do fcomi/sahf based test when profitable.  */
22819   switch (ix86_fp_comparison_strategy (code))
22820     {
22821     case IX86_FPCMP_COMI:
22822       intcmp_mode = CCFPmode;
22823       tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22824       if (unordered_compare)
22825 	tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22826       emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22827       break;
22828 
22829     case IX86_FPCMP_SAHF:
22830       intcmp_mode = CCFPmode;
22831       tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22832       if (unordered_compare)
22833 	tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22834       tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22835       if (!scratch)
22836 	scratch = gen_reg_rtx (HImode);
22837       tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22838       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22839       break;
22840 
22841     case IX86_FPCMP_ARITH:
22842       /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first.  */
22843       tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22844       if (unordered_compare)
22845 	tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22846       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22847       if (!scratch)
22848 	scratch = gen_reg_rtx (HImode);
22849       emit_insn (gen_rtx_SET (scratch, tmp));
22850 
22851       /* In the unordered case, we have to check C2 for NaN's, which
22852 	 doesn't happen to work out to anything nice combination-wise.
22853 	 So do some bit twiddling on the value we've got in AH to come
22854 	 up with an appropriate set of condition codes.  */
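      /* Reminder about the x87 status bits once they land in AH:
	 C0 is bit 0 (0x01), C2 is bit 2 (0x04) and C3 is bit 6 (0x40),
	 so the 0x45 masks below test C0|C2|C3.  For a compare, C0 means
	 "below", C3 means "equal", and C2 flags an unordered result.  */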
22855 
22856       intcmp_mode = CCNOmode;
22857       switch (code)
22858 	{
22859 	case GT:
22860 	case UNGT:
22861 	  if (code == GT || !TARGET_IEEE_FP)
22862 	    {
22863 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22864 	      code = EQ;
22865 	    }
22866 	  else
22867 	    {
22868 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22869 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22870 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22871 	      intcmp_mode = CCmode;
22872 	      code = GEU;
22873 	    }
22874 	  break;
22875 	case LT:
22876 	case UNLT:
22877 	  if (code == LT && TARGET_IEEE_FP)
22878 	    {
22879 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22880 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22881 	      intcmp_mode = CCmode;
22882 	      code = EQ;
22883 	    }
22884 	  else
22885 	    {
22886 	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22887 	      code = NE;
22888 	    }
22889 	  break;
22890 	case GE:
22891 	case UNGE:
22892 	  if (code == GE || !TARGET_IEEE_FP)
22893 	    {
22894 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22895 	      code = EQ;
22896 	    }
22897 	  else
22898 	    {
22899 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22900 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22901 	      code = NE;
22902 	    }
22903 	  break;
22904 	case LE:
22905 	case UNLE:
22906 	  if (code == LE && TARGET_IEEE_FP)
22907 	    {
22908 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22909 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22910 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22911 	      intcmp_mode = CCmode;
22912 	      code = LTU;
22913 	    }
22914 	  else
22915 	    {
22916 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22917 	      code = NE;
22918 	    }
22919 	  break;
22920 	case EQ:
22921 	case UNEQ:
22922 	  if (code == EQ && TARGET_IEEE_FP)
22923 	    {
22924 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22925 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22926 	      intcmp_mode = CCmode;
22927 	      code = EQ;
22928 	    }
22929 	  else
22930 	    {
22931 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22932 	      code = NE;
22933 	    }
22934 	  break;
22935 	case NE:
22936 	case LTGT:
22937 	  if (code == NE && TARGET_IEEE_FP)
22938 	    {
22939 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22940 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22941 					     GEN_INT (0x40)));
22942 	      code = NE;
22943 	    }
22944 	  else
22945 	    {
22946 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22947 	      code = EQ;
22948 	    }
22949 	  break;
22950 
22951 	case UNORDERED:
22952 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22953 	  code = NE;
22954 	  break;
22955 	case ORDERED:
22956 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22957 	  code = EQ;
22958 	  break;
22959 
22960 	default:
22961 	  gcc_unreachable ();
22962 	}
22963 	break;
22964 
22965     default:
22966       gcc_unreachable ();
22967     }
22968 
22969   /* Return the test that should be put into the flags user, i.e.
22970      the bcc, scc, or cmov instruction.  */
22971   return gen_rtx_fmt_ee (code, VOIDmode,
22972 			 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22973 			 const0_rtx);
22974 }
22975 
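/* Expand a comparison with code CODE of operands OP0 and OP1, emitting any
   insns needed to compute the condition, and return the comparison rtx the
   caller can use in a conditional jump, setcc or conditional move.  */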
22976 static rtx
22977 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22978 {
22979   rtx ret;
22980 
22981   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22982     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22983 
22984   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22985     {
22986       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22987       ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22988     }
22989   else
22990     ret = ix86_expand_int_compare (code, op0, op1);
22991 
22992   return ret;
22993 }
22994 
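/* Expand a conditional branch: emit a jump to LABEL that is taken when the
   comparison CODE of OP0 and OP1 is true.  */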
22995 void
22996 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22997 {
22998   machine_mode mode = GET_MODE (op0);
22999   rtx tmp;
23000 
23001   /* Handle the special case of a vector comparison with a boolean result;
23002      transform it using the ptest instruction.  */
23003   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23004     {
23005       rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23006       machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23007 
23008       gcc_assert (code == EQ || code == NE);
23009       /* Generate XOR since we can't check that one operand is a zero vector.  */
23010       tmp = gen_reg_rtx (mode);
23011       emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23012       tmp = gen_lowpart (p_mode, tmp);
23013       emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23014 			      gen_rtx_UNSPEC (CCmode,
23015 					      gen_rtvec (2, tmp, tmp),
23016 					      UNSPEC_PTEST)));
23017       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23018       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23019 				  gen_rtx_LABEL_REF (VOIDmode, label),
23020 				  pc_rtx);
23021       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23022       return;
23023     }
23024 
23025   switch (mode)
23026     {
23027     case E_SFmode:
23028     case E_DFmode:
23029     case E_XFmode:
23030     case E_QImode:
23031     case E_HImode:
23032     case E_SImode:
23033       simple:
23034       tmp = ix86_expand_compare (code, op0, op1);
23035       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23036 				  gen_rtx_LABEL_REF (VOIDmode, label),
23037 				  pc_rtx);
23038       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23039       return;
23040 
23041     case E_DImode:
23042       if (TARGET_64BIT)
23043 	goto simple;
23044       /* For a 32-bit target, a DImode comparison may be performed in
23045 	 SSE registers.  To allow this we should avoid the split into
23046 	 SImode, which is achieved by doing the xor in DImode and
23047 	 then comparing against zero (which is recognized by the
23048 	 STV pass).  We don't compare using xor when optimizing
23049 	 for size.  */
23050       if (!optimize_insn_for_size_p ()
23051 	  && TARGET_STV
23052 	  && (code == EQ || code == NE))
23053 	{
23054 	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23055 	  op1 = const0_rtx;
23056 	}
23057       /* FALLTHRU */
23058     case E_TImode:
23059       /* Expand a DImode or TImode branch into multiple compare+branch.  */
23060       {
23061 	rtx lo[2], hi[2];
23062 	rtx_code_label *label2;
23063 	enum rtx_code code1, code2, code3;
23064 	machine_mode submode;
23065 
23066 	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23067 	  {
23068 	    std::swap (op0, op1);
23069 	    code = swap_condition (code);
23070 	  }
23071 
23072 	split_double_mode (mode, &op0, 1, lo+0, hi+0);
23073 	split_double_mode (mode, &op1, 1, lo+1, hi+1);
23074 
23075 	submode = mode == DImode ? SImode : DImode;
23076 
23077 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23078 	   avoid two branches.  This costs one extra insn, so disable when
23079 	   optimizing for size.  */
23080 
23081 	if ((code == EQ || code == NE)
23082 	    && (!optimize_insn_for_size_p ()
23083 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
23084 	  {
23085 	    rtx xor0, xor1;
23086 
23087 	    xor1 = hi[0];
23088 	    if (hi[1] != const0_rtx)
23089 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23090 				   NULL_RTX, 0, OPTAB_WIDEN);
23091 
23092 	    xor0 = lo[0];
23093 	    if (lo[1] != const0_rtx)
23094 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23095 				   NULL_RTX, 0, OPTAB_WIDEN);
23096 
23097 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
23098 				NULL_RTX, 0, OPTAB_WIDEN);
23099 
23100 	    ix86_expand_branch (code, tmp, const0_rtx, label);
23101 	    return;
23102 	  }
23103 
23104 	/* Otherwise, if we are doing less-than or greater-than-or-equal,
23105 	   op1 is a constant and the low word is zero, then we can just
23106 	   examine the high word.  Similarly for a low word of -1 and
23107 	   less-than-or-equal or greater-than.  */
23108 
23109 	if (CONST_INT_P (hi[1]))
23110 	  switch (code)
23111 	    {
23112 	    case LT: case LTU: case GE: case GEU:
23113 	      if (lo[1] == const0_rtx)
23114 		{
23115 		  ix86_expand_branch (code, hi[0], hi[1], label);
23116 		  return;
23117 		}
23118 	      break;
23119 	    case LE: case LEU: case GT: case GTU:
23120 	      if (lo[1] == constm1_rtx)
23121 		{
23122 		  ix86_expand_branch (code, hi[0], hi[1], label);
23123 		  return;
23124 		}
23125 	      break;
23126 	    default:
23127 	      break;
23128 	    }
23129 
23130 	/* Emulate comparisons that do not depend on Zero flag with
23131 	   double-word subtraction.  Note that only Overflow, Sign
23132 	   and Carry flags are valid, so swap arguments and condition
23133 	   of comparisons that would otherwise test Zero flag.  */
23134 
23135 	switch (code)
23136 	  {
23137 	  case LE: case LEU: case GT: case GTU:
23138 	    std::swap (lo[0], lo[1]);
23139 	    std::swap (hi[0], hi[1]);
23140 	    code = swap_condition (code);
23141 	    /* FALLTHRU */
23142 
23143 	  case LT: case LTU: case GE: case GEU:
23144 	    {
23145 	      rtx (*cmp_insn) (rtx, rtx);
23146 	      rtx (*sbb_insn) (rtx, rtx, rtx);
23147 	      bool uns = (code == LTU || code == GEU);
23148 
23149 	      if (TARGET_64BIT)
23150 		{
23151 		  cmp_insn = gen_cmpdi_1;
23152 		  sbb_insn
23153 		    = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
23154 		}
23155 	      else
23156 		{
23157 		  cmp_insn = gen_cmpsi_1;
23158 		  sbb_insn
23159 		    = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
23160 		}
23161 
23162 	      if (!nonimmediate_operand (lo[0], submode))
23163 		lo[0] = force_reg (submode, lo[0]);
23164 	      if (!x86_64_general_operand (lo[1], submode))
23165 		lo[1] = force_reg (submode, lo[1]);
23166 
23167 	      if (!register_operand (hi[0], submode))
23168 		hi[0] = force_reg (submode, hi[0]);
23169 	      if ((uns && !nonimmediate_operand (hi[1], submode))
23170 		  || (!uns && !x86_64_general_operand (hi[1], submode)))
23171 		hi[1] = force_reg (submode, hi[1]);
23172 
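	      /* Compare the low words, then subtract-with-borrow the high
		 words; the resulting flags reflect the full double-word
		 comparison.  */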
23173 	      emit_insn (cmp_insn (lo[0], lo[1]));
23174 	      emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
23175 
23176 	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
23177 
23178 	      ix86_expand_branch (code, tmp, const0_rtx, label);
23179 	      return;
23180 	    }
23181 
23182 	  default:
23183 	    break;
23184 	  }
23185 
23186 	/* Otherwise, we need two or three jumps.  */
23187 
23188 	label2 = gen_label_rtx ();
23189 
23190 	code1 = code;
23191 	code2 = swap_condition (code);
23192 	code3 = unsigned_condition (code);
23193 
23194 	switch (code)
23195 	  {
23196 	  case LT: case GT: case LTU: case GTU:
23197 	    break;
23198 
23199 	  case LE:   code1 = LT;  code2 = GT;  break;
23200 	  case GE:   code1 = GT;  code2 = LT;  break;
23201 	  case LEU:  code1 = LTU; code2 = GTU; break;
23202 	  case GEU:  code1 = GTU; code2 = LTU; break;
23203 
23204 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
23205 	  case NE:   code2 = UNKNOWN; break;
23206 
23207 	  default:
23208 	    gcc_unreachable ();
23209 	  }
23210 
23211 	/*
23212 	 * a < b =>
23213 	 *    if (hi(a) < hi(b)) goto true;
23214 	 *    if (hi(a) > hi(b)) goto false;
23215 	 *    if (lo(a) < lo(b)) goto true;
23216 	 *  false:
23217 	 */
23218 
23219 	if (code1 != UNKNOWN)
23220 	  ix86_expand_branch (code1, hi[0], hi[1], label);
23221 	if (code2 != UNKNOWN)
23222 	  ix86_expand_branch (code2, hi[0], hi[1], label2);
23223 
23224 	ix86_expand_branch (code3, lo[0], lo[1], label);
23225 
23226 	if (code2 != UNKNOWN)
23227 	  emit_label (label2);
23228 	return;
23229       }
23230 
23231     default:
23232       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23233       goto simple;
23234     }
23235 }
23236 
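/* Expand a setcc: store into the QImode register DEST the result of
   comparison CODE of OP0 and OP1.  */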
23237 void
23238 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23239 {
23240   rtx ret;
23241 
23242   gcc_assert (GET_MODE (dest) == QImode);
23243 
23244   ret = ix86_expand_compare (code, op0, op1);
23245   PUT_MODE (ret, QImode);
23246   emit_insn (gen_rtx_SET (dest, ret));
23247 }
23248 
23249 /* Expand comparison setting or clearing carry flag.  Return true when
23250    successful and set pop for the operation.  */
23251 static bool
23252 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23253 {
23254   machine_mode mode =
23255     GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23256 
23257   /* Do not handle double-mode compares that go through special path.  */
23258   if (mode == (TARGET_64BIT ? TImode : DImode))
23259     return false;
23260 
23261   if (SCALAR_FLOAT_MODE_P (mode))
23262     {
23263       rtx compare_op;
23264       rtx_insn *compare_seq;
23265 
23266       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23267 
23268       /* Shortcut:  the following common codes never translate
23269 	 into carry-flag compares.  */
23270       if (code == EQ || code == NE || code == UNEQ || code == LTGT
23271 	  || code == ORDERED || code == UNORDERED)
23272 	return false;
23273 
23274       /* These comparisons require the zero flag; swap operands so they don't.  */
23275       if ((code == GT || code == UNLE || code == LE || code == UNGT)
23276 	  && !TARGET_IEEE_FP)
23277 	{
23278 	  std::swap (op0, op1);
23279 	  code = swap_condition (code);
23280 	}
23281 
23282       /* Try to expand the comparison and verify that we end up with
23283 	 a carry-flag based comparison.  This fails only when we decide
23284 	 to expand the comparison using arithmetic, which is not a
23285 	 common scenario.  */
23286       start_sequence ();
23287       compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23288       compare_seq = get_insns ();
23289       end_sequence ();
23290 
23291       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
23292         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23293       else
23294 	code = GET_CODE (compare_op);
23295 
23296       if (code != LTU && code != GEU)
23297 	return false;
23298 
23299       emit_insn (compare_seq);
23300       *pop = compare_op;
23301       return true;
23302     }
23303 
23304   if (!INTEGRAL_MODE_P (mode))
23305     return false;
23306 
23307   switch (code)
23308     {
23309     case LTU:
23310     case GEU:
23311       break;
23312 
23313     /* Convert a==0 into (unsigned)a<1.  */
23314     case EQ:
23315     case NE:
23316       if (op1 != const0_rtx)
23317 	return false;
23318       op1 = const1_rtx;
23319       code = (code == EQ ? LTU : GEU);
23320       break;
23321 
23322     /* Convert a>b into b<a or a>=b+1.  */
23323     case GTU:
23324     case LEU:
23325       if (CONST_INT_P (op1))
23326 	{
23327 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23328 	  /* Bail out on overflow.  We could still swap the operands, but
23329 	     that would force loading of the constant into a register.  */
23330 	  if (op1 == const0_rtx
23331 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23332 	    return false;
23333 	  code = (code == GTU ? GEU : LTU);
23334 	}
23335       else
23336 	{
23337 	  std::swap (op0, op1);
23338 	  code = (code == GTU ? LTU : GEU);
23339 	}
23340       break;
23341 
23342     /* Convert a>=0 into (unsigned)a<0x80000000.  */
23343     case LT:
23344     case GE:
23345       if (mode == DImode || op1 != const0_rtx)
23346 	return false;
23347       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23348       code = (code == LT ? GEU : LTU);
23349       break;
23350     case LE:
23351     case GT:
23352       if (mode == DImode || op1 != constm1_rtx)
23353 	return false;
23354       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23355       code = (code == LE ? GEU : LTU);
23356       break;
23357 
23358     default:
23359       return false;
23360     }
23361   /* Swapping the operands may cause a constant to appear as the first operand.  */
23362   if (!nonimmediate_operand (op0, VOIDmode))
23363     {
23364       if (!can_create_pseudo_p ())
23365 	return false;
23366       op0 = force_reg (mode, op0);
23367     }
23368   *pop = ix86_expand_compare (code, op0, op1);
23369   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23370   return true;
23371 }
23372 
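/* Expand an integer conditional move: set operands[0] to operands[2]
   when the comparison operands[1] holds and to operands[3] otherwise.
   Return true when the expansion succeeded.  */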
23373 bool
23374 ix86_expand_int_movcc (rtx operands[])
23375 {
23376   enum rtx_code code = GET_CODE (operands[1]), compare_code;
23377   rtx_insn *compare_seq;
23378   rtx compare_op;
23379   machine_mode mode = GET_MODE (operands[0]);
23380   bool sign_bit_compare_p = false;
23381   rtx op0 = XEXP (operands[1], 0);
23382   rtx op1 = XEXP (operands[1], 1);
23383 
23384   if (GET_MODE (op0) == TImode
23385       || (GET_MODE (op0) == DImode
23386 	  && !TARGET_64BIT))
23387     return false;
23388 
23389   start_sequence ();
23390   compare_op = ix86_expand_compare (code, op0, op1);
23391   compare_seq = get_insns ();
23392   end_sequence ();
23393 
23394   compare_code = GET_CODE (compare_op);
23395 
23396   if ((op1 == const0_rtx && (code == GE || code == LT))
23397       || (op1 == constm1_rtx && (code == GT || code == LE)))
23398     sign_bit_compare_p = true;
23399 
23400   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23401      HImode insns, we'd be swallowed in word prefix ops.  */
23402 
23403   if ((mode != HImode || TARGET_FAST_PREFIX)
23404       && (mode != (TARGET_64BIT ? TImode : DImode))
23405       && CONST_INT_P (operands[2])
23406       && CONST_INT_P (operands[3]))
23407     {
23408       rtx out = operands[0];
23409       HOST_WIDE_INT ct = INTVAL (operands[2]);
23410       HOST_WIDE_INT cf = INTVAL (operands[3]);
23411       HOST_WIDE_INT diff;
23412 
23413       diff = ct - cf;
23414       /* Sign bit compares are better done using shifts than by using
23415 	 sbb.  */
23416       if (sign_bit_compare_p
23417 	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23418 	{
23419 	  /* Detect overlap between destination and compare sources.  */
23420 	  rtx tmp = out;
23421 
23422           if (!sign_bit_compare_p)
23423 	    {
23424 	      rtx flags;
23425 	      bool fpcmp = false;
23426 
23427 	      compare_code = GET_CODE (compare_op);
23428 
23429 	      flags = XEXP (compare_op, 0);
23430 
23431 	      if (GET_MODE (flags) == CCFPmode)
23432 		{
23433 		  fpcmp = true;
23434 		  compare_code
23435 		    = ix86_fp_compare_code_to_integer (compare_code);
23436 		}
23437 
23438 	      /* To simplify the rest of the code, restrict to the GEU case.  */
23439 	      if (compare_code == LTU)
23440 		{
23441 		  std::swap (ct, cf);
23442 		  compare_code = reverse_condition (compare_code);
23443 		  code = reverse_condition (code);
23444 		}
23445 	      else
23446 		{
23447 		  if (fpcmp)
23448 		    PUT_CODE (compare_op,
23449 			      reverse_condition_maybe_unordered
23450 			        (GET_CODE (compare_op)));
23451 		  else
23452 		    PUT_CODE (compare_op,
23453 			      reverse_condition (GET_CODE (compare_op)));
23454 		}
23455 	      diff = ct - cf;
23456 
23457 	      if (reg_overlap_mentioned_p (out, op0)
23458 		  || reg_overlap_mentioned_p (out, op1))
23459 		tmp = gen_reg_rtx (mode);
23460 
23461 	      if (mode == DImode)
23462 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23463 	      else
23464 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
23465 						 flags, compare_op));
23466 	    }
23467 	  else
23468 	    {
23469 	      if (code == GT || code == GE)
23470 		code = reverse_condition (code);
23471 	      else
23472 		{
23473 		  std::swap (ct, cf);
23474 		  diff = ct - cf;
23475 		}
23476 	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23477 	    }
23478 
23479 	  if (diff == 1)
23480 	    {
23481 	      /*
23482 	       * cmpl op0,op1
23483 	       * sbbl dest,dest
23484 	       * [addl dest, ct]
23485 	       *
23486 	       * Size 5 - 8.
23487 	       */
23488 	      if (ct)
23489 		tmp = expand_simple_binop (mode, PLUS,
23490 					   tmp, GEN_INT (ct),
23491 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
23492 	    }
23493 	  else if (cf == -1)
23494 	    {
23495 	      /*
23496 	       * cmpl op0,op1
23497 	       * sbbl dest,dest
23498 	       * orl $ct, dest
23499 	       *
23500 	       * Size 8.
23501 	       */
23502 	      tmp = expand_simple_binop (mode, IOR,
23503 					 tmp, GEN_INT (ct),
23504 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
23505 	    }
23506 	  else if (diff == -1 && ct)
23507 	    {
23508 	      /*
23509 	       * cmpl op0,op1
23510 	       * sbbl dest,dest
23511 	       * notl dest
23512 	       * [addl dest, cf]
23513 	       *
23514 	       * Size 8 - 11.
23515 	       */
23516 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23517 	      if (cf)
23518 		tmp = expand_simple_binop (mode, PLUS,
23519 					   copy_rtx (tmp), GEN_INT (cf),
23520 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
23521 	    }
23522 	  else
23523 	    {
23524 	      /*
23525 	       * cmpl op0,op1
23526 	       * sbbl dest,dest
23527 	       * [notl dest]
23528 	       * andl cf - ct, dest
23529 	       * [addl dest, ct]
23530 	       *
23531 	       * Size 8 - 11.
23532 	       */
23533 
23534 	      if (cf == 0)
23535 		{
23536 		  cf = ct;
23537 		  ct = 0;
23538 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23539 		}
23540 
23541 	      tmp = expand_simple_binop (mode, AND,
23542 					 copy_rtx (tmp),
23543 					 gen_int_mode (cf - ct, mode),
23544 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
23545 	      if (ct)
23546 		tmp = expand_simple_binop (mode, PLUS,
23547 					   copy_rtx (tmp), GEN_INT (ct),
23548 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
23549 	    }
23550 
23551 	  if (!rtx_equal_p (tmp, out))
23552 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23553 
23554 	  return true;
23555 	}
23556 
23557       if (diff < 0)
23558 	{
23559 	  machine_mode cmp_mode = GET_MODE (op0);
23560 	  enum rtx_code new_code;
23561 
23562 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
23563 	    {
23564 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23565 
23566 	      /* We may be reversing an unordered compare to a normal compare,
23567 		 which is not valid in general (we may convert a non-trapping
23568 		 condition to a trapping one); however, on i386 we currently
23569 		 emit all comparisons unordered.  */
23570 	      new_code = reverse_condition_maybe_unordered (code);
23571 	    }
23572 	  else
23573 	    new_code = ix86_reverse_condition (code, cmp_mode);
23574 	  if (new_code != UNKNOWN)
23575 	    {
23576 	      std::swap (ct, cf);
23577 	      diff = -diff;
23578 	      code = new_code;
23579 	    }
23580 	}
23581 
23582       compare_code = UNKNOWN;
23583       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23584 	  && CONST_INT_P (op1))
23585 	{
23586 	  if (op1 == const0_rtx
23587 	      && (code == LT || code == GE))
23588 	    compare_code = code;
23589 	  else if (op1 == constm1_rtx)
23590 	    {
23591 	      if (code == LE)
23592 		compare_code = LT;
23593 	      else if (code == GT)
23594 		compare_code = GE;
23595 	    }
23596 	}
23597 
23598       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
23599       if (compare_code != UNKNOWN
23600 	  && GET_MODE (op0) == GET_MODE (out)
23601 	  && (cf == -1 || ct == -1))
23602 	{
23603 	  /* If lea code below could be used, only optimize
23604 	     if it results in a 2 insn sequence.  */
23605 
23606 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23607 		 || diff == 3 || diff == 5 || diff == 9)
23608 	      || (compare_code == LT && ct == -1)
23609 	      || (compare_code == GE && cf == -1))
23610 	    {
23611 	      /*
23612 	       * notl op1	(if necessary)
23613 	       * sarl $31, op1
23614 	       * orl cf, op1
23615 	       */
23616 	      if (ct != -1)
23617 		{
23618 		  cf = ct;
23619 		  ct = -1;
23620 		  code = reverse_condition (code);
23621 		}
23622 
23623 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23624 
23625 	      out = expand_simple_binop (mode, IOR,
23626 					 out, GEN_INT (cf),
23627 					 out, 1, OPTAB_DIRECT);
23628 	      if (out != operands[0])
23629 		emit_move_insn (operands[0], out);
23630 
23631 	      return true;
23632 	    }
23633 	}
23634 
23635 
23636       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23637 	   || diff == 3 || diff == 5 || diff == 9)
23638 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23639 	  && (mode != DImode
23640 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23641 	{
23642 	  /*
23643 	   * xorl dest,dest
23644 	   * cmpl op1,op2
23645 	   * setcc dest
23646 	   * lea cf(dest*(ct-cf)),dest
23647 	   *
23648 	   * Size 14.
23649 	   *
23650 	   * This also catches the degenerate setcc-only case.
23651 	   */
23652 
23653 	  rtx tmp;
23654 	  int nops;
23655 
23656 	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23657 
23658 	  nops = 0;
23659 	  /* On x86_64 the lea instruction operates on Pmode, so we need
23660 	     to get the arithmetic done in the proper mode to match.  */
23661 	  if (diff == 1)
23662 	    tmp = copy_rtx (out);
23663 	  else
23664 	    {
23665 	      rtx out1;
23666 	      out1 = copy_rtx (out);
23667 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23668 	      nops++;
23669 	      if (diff & 1)
23670 		{
23671 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
23672 		  nops++;
23673 		}
23674 	    }
23675 	  if (cf != 0)
23676 	    {
23677 	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23678 	      nops++;
23679 	    }
23680 	  if (!rtx_equal_p (tmp, out))
23681 	    {
23682 	      if (nops == 1)
23683 		out = force_operand (tmp, copy_rtx (out));
23684 	      else
23685 		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23686 	    }
23687 	  if (!rtx_equal_p (out, operands[0]))
23688 	    emit_move_insn (operands[0], copy_rtx (out));
23689 
23690 	  return true;
23691 	}
23692 
23693       /*
23694        * General case:			Jumpful:
23695        *   xorl dest,dest		cmpl op1, op2
23696        *   cmpl op1, op2		movl ct, dest
23697        *   setcc dest			jcc 1f
23698        *   decl dest			movl cf, dest
23699        *   andl (cf-ct),dest		1:
23700        *   addl ct,dest
23701        *
23702        * Size 20.			Size 14.
23703        *
23704        * This is reasonably steep, but branch mispredict costs are
23705        * high on modern cpus, so consider failing only if optimizing
23706        * for space.
23707        */
23708 
23709       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23710 	  && BRANCH_COST (optimize_insn_for_speed_p (),
23711 		  	  false) >= 2)
23712 	{
23713 	  if (cf == 0)
23714 	    {
23715 	      machine_mode cmp_mode = GET_MODE (op0);
23716 	      enum rtx_code new_code;
23717 
23718 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
23719 		{
23720 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23721 
23722 		  /* We may be reversing an unordered compare to a normal
23723 		     compare, which is not valid in general (we may convert a
23724 		     non-trapping condition to a trapping one); however, on
23725 		     i386 we currently emit all comparisons unordered.  */
23726 		  new_code = reverse_condition_maybe_unordered (code);
23727 		}
23728 	      else
23729 		{
23730 		  new_code = ix86_reverse_condition (code, cmp_mode);
23731 		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
23732 		    compare_code = reverse_condition (compare_code);
23733 		}
23734 
23735 	      if (new_code != UNKNOWN)
23736 		{
23737 		  cf = ct;
23738 		  ct = 0;
23739 		  code = new_code;
23740 		}
23741 	    }
23742 
23743 	  if (compare_code != UNKNOWN)
23744 	    {
23745 	      /* notl op1	(if needed)
23746 		 sarl $31, op1
23747 		 andl (cf-ct), op1
23748 		 addl ct, op1
23749 
23750 		 For x < 0 (resp. x <= -1) there will be no notl,
23751 		 so if possible swap the constants to get rid of the
23752 		 complement.
23753 		 True/false will be -1/0 while code below (store flag
23754 		 followed by decrement) is 0/-1, so the constants need
23755 		 to be exchanged once more.  */
23756 
23757 	      if (compare_code == GE || !cf)
23758 		{
23759 		  code = reverse_condition (code);
23760 		  compare_code = LT;
23761 		}
23762 	      else
23763 		std::swap (ct, cf);
23764 
23765 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23766 	    }
23767 	  else
23768 	    {
23769 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23770 
23771 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23772 					 constm1_rtx,
23773 					 copy_rtx (out), 1, OPTAB_DIRECT);
23774 	    }
23775 
23776 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
23777 				     gen_int_mode (cf - ct, mode),
23778 				     copy_rtx (out), 1, OPTAB_DIRECT);
23779 	  if (ct)
23780 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23781 				       copy_rtx (out), 1, OPTAB_DIRECT);
23782 	  if (!rtx_equal_p (out, operands[0]))
23783 	    emit_move_insn (operands[0], copy_rtx (out));
23784 
23785 	  return true;
23786 	}
23787     }
23788 
23789   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23790     {
23791       /* Try a few things more with specific constants and a variable.  */
23792 
23793       optab op;
23794       rtx var, orig_out, out, tmp;
23795 
23796       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23797 	return false;
23798 
23799       /* If one of the two operands is an interesting constant, load a
23800 	 constant with the above and mask it in with a logical operation.  */
23801 
23802       if (CONST_INT_P (operands[2]))
23803 	{
23804 	  var = operands[3];
23805 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23806 	    operands[3] = constm1_rtx, op = and_optab;
23807 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23808 	    operands[3] = const0_rtx, op = ior_optab;
23809 	  else
23810 	    return false;
23811 	}
23812       else if (CONST_INT_P (operands[3]))
23813 	{
23814 	  var = operands[2];
23815 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23816 	    operands[2] = constm1_rtx, op = and_optab;
23817 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23818 	    operands[2] = const0_rtx, op = ior_optab;
23819 	  else
23820 	    return false;
23821 	}
23822       else
23823         return false;
23824 
23825       orig_out = operands[0];
23826       tmp = gen_reg_rtx (mode);
23827       operands[0] = tmp;
23828 
23829       /* Recurse to get the constant loaded.  */
23830       if (!ix86_expand_int_movcc (operands))
23831         return false;
23832 
23833       /* Mask in the interesting variable.  */
23834       out = expand_binop (mode, op, var, tmp, orig_out, 0,
23835 			  OPTAB_WIDEN);
23836       if (!rtx_equal_p (out, orig_out))
23837 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23838 
23839       return true;
23840     }
23841 
23842   /*
23843    * For comparison with above,
23844    *
23845    * movl cf,dest
23846    * movl ct,tmp
23847    * cmpl op1,op2
23848    * cmovcc tmp,dest
23849    *
23850    * Size 15.
23851    */
23852 
23853   if (! nonimmediate_operand (operands[2], mode))
23854     operands[2] = force_reg (mode, operands[2]);
23855   if (! nonimmediate_operand (operands[3], mode))
23856     operands[3] = force_reg (mode, operands[3]);
23857 
23858   if (! register_operand (operands[2], VOIDmode)
23859       && (mode == QImode
23860           || ! register_operand (operands[3], VOIDmode)))
23861     operands[2] = force_reg (mode, operands[2]);
23862 
23863   if (mode == QImode
23864       && ! register_operand (operands[3], VOIDmode))
23865     operands[3] = force_reg (mode, operands[3]);
23866 
23867   emit_insn (compare_seq);
23868   emit_insn (gen_rtx_SET (operands[0],
23869 			  gen_rtx_IF_THEN_ELSE (mode,
23870 						compare_op, operands[2],
23871 						operands[3])));
23872   return true;
23873 }
23874 
23875 /* Swap, force into registers, or otherwise massage the two operands
23876    to an sse comparison with a mask result.  Thus we differ a bit from
23877    ix86_prepare_fp_compare_args which expects to produce a flags result.
23878 
23879    The DEST operand exists to help determine whether to commute commutative
23880    operators.  The POP0/POP1 operands are updated in place.  The new
23881    comparison code is returned, or UNKNOWN if not implementable.  */
23882 
23883 static enum rtx_code
23884 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23885 				  rtx *pop0, rtx *pop1)
23886 {
23887   switch (code)
23888     {
23889     case LTGT:
23890     case UNEQ:
23891       /* AVX supports all the needed comparisons.  */
23892       if (TARGET_AVX)
23893 	break;
23894       /* We have no LTGT as an operator.  We could implement it with
23895 	 NE & ORDERED, but this requires an extra temporary.  It's
23896 	 not clear that it's worth it.  */
23897       return UNKNOWN;
23898 
23899     case LT:
23900     case LE:
23901     case UNGT:
23902     case UNGE:
23903       /* These are supported directly.  */
23904       break;
23905 
23906     case EQ:
23907     case NE:
23908     case UNORDERED:
23909     case ORDERED:
23910       /* AVX has 3 operand comparisons, no need to swap anything.  */
23911       if (TARGET_AVX)
23912 	break;
23913       /* For commutative operators, try to canonicalize the destination
23914 	 operand to be first in the comparison - this helps reload to
23915 	 avoid extra moves.  */
23916       if (!dest || !rtx_equal_p (dest, *pop1))
23917 	break;
23918       /* FALLTHRU */
23919 
23920     case GE:
23921     case GT:
23922     case UNLE:
23923     case UNLT:
23924       /* These are not supported directly before AVX, and furthermore
23925 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
23926 	 comparison operands to transform into something that is
23927 	 supported.  */
23928       std::swap (*pop0, *pop1);
23929       code = swap_condition (code);
23930       break;
23931 
23932     default:
23933       gcc_unreachable ();
23934     }
23935 
23936   return code;
23937 }
23938 
23939 /* Detect conditional moves that exactly match min/max operational
23940    semantics.  Note that this is IEEE safe, as long as we don't
23941    interchange the operands.
23942 
23943    Returns FALSE if this conditional move doesn't match a MIN/MAX,
23944    and TRUE if the operation is successful and instructions are emitted.  */
23945 
23946 static bool
23947 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23948 			   rtx cmp_op1, rtx if_true, rtx if_false)
23949 {
23950   machine_mode mode;
23951   bool is_min;
23952   rtx tmp;
23953 
23954   if (code == LT)
23955     ;
23956   else if (code == UNGE)
23957     std::swap (if_true, if_false);
23958   else
23959     return false;
23960 
23961   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23962     is_min = true;
23963   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23964     is_min = false;
23965   else
23966     return false;
23967 
23968   mode = GET_MODE (dest);
23969 
23970   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23971      but MODE may be a vector mode and thus not appropriate.  */
23972   if (!flag_finite_math_only || flag_signed_zeros)
23973     {
23974       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23975       rtvec v;
23976 
23977       if_true = force_reg (mode, if_true);
23978       v = gen_rtvec (2, if_true, if_false);
23979       tmp = gen_rtx_UNSPEC (mode, v, u);
23980     }
23981   else
23982     {
23983       code = is_min ? SMIN : SMAX;
23984       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23985     }
23986 
23987   emit_insn (gen_rtx_SET (dest, tmp));
23988   return true;
23989 }
23990 
23991 /* Expand an SSE comparison.  Return the register with the result.  */
23992 
23993 static rtx
23994 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23995 		     rtx op_true, rtx op_false)
23996 {
23997   machine_mode mode = GET_MODE (dest);
23998   machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23999 
24000   /* In general the result of a comparison can differ from the operands' type.  */
24001   machine_mode cmp_mode;
24002 
24003   /* In AVX512F the result of comparison is an integer mask.  */
24004   bool maskcmp = false;
24005   rtx x;
24006 
24007   if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24008     {
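      /* An AVX-512 comparison yields a mask with one bit per vector
	 element, hence an integer mode of GET_MODE_NUNITS bits.  */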
24009       unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
24010       cmp_mode = int_mode_for_size (nbits, 0).require ();
24011       maskcmp = true;
24012     }
24013   else
24014     cmp_mode = cmp_ops_mode;
24015 
24016   cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24017 
24018   int (*op1_predicate)(rtx, machine_mode)
24019     = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
24020 
24021   if (!op1_predicate (cmp_op1, cmp_ops_mode))
24022     cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24023 
24024   if (optimize
24025       || (maskcmp && cmp_mode != mode)
24026       || (op_true && reg_overlap_mentioned_p (dest, op_true))
24027       || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24028     dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24029 
24030   /* Compare patterns for int modes are unspec in AVX512F only.  */
24031   if (maskcmp && (code == GT || code == EQ))
24032     {
24033       rtx (*gen)(rtx, rtx, rtx);
24034 
24035       switch (cmp_ops_mode)
24036 	{
24037 	case E_V64QImode:
24038 	  gcc_assert (TARGET_AVX512BW);
24039 	  gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24040 	  break;
24041 	case E_V32HImode:
24042 	  gcc_assert (TARGET_AVX512BW);
24043 	  gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24044 	  break;
24045 	case E_V16SImode:
24046 	  gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24047 	  break;
24048 	case E_V8DImode:
24049 	  gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24050 	  break;
24051 	default:
24052 	  gen = NULL;
24053 	}
24054 
24055       if (gen)
24056 	{
24057 	  emit_insn (gen (dest, cmp_op0, cmp_op1));
24058 	  return dest;
24059 	}
24060     }
24061   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24062 
24063   if (cmp_mode != mode && !maskcmp)
24064     {
24065       x = force_reg (cmp_ops_mode, x);
24066       convert_move (dest, x, false);
24067     }
24068   else
24069     emit_insn (gen_rtx_SET (dest, x));
24070 
24071   return dest;
24072 }
24073 
24074 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24075    operations.  This is used for both scalar and vector conditional moves.  */
24076 
24077 void
24078 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24079 {
24080   machine_mode mode = GET_MODE (dest);
24081   machine_mode cmpmode = GET_MODE (cmp);
24082 
24083   /* In AVX512F the result of comparison is an integer mask.  */
24084   bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24085 
24086   rtx t2, t3, x;
24087 
24088   /* If we have an integer mask and an FP value then we need
24089      to cast the mask to the FP mode.  */
24090   if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24091     {
24092       cmp = force_reg (cmpmode, cmp);
24093       cmp = gen_rtx_SUBREG (mode, cmp, 0);
24094     }
24095 
24096   if (vector_all_ones_operand (op_true, mode)
24097       && rtx_equal_p (op_false, CONST0_RTX (mode))
24098       && !maskcmp)
24099     {
24100       emit_insn (gen_rtx_SET (dest, cmp));
24101     }
24102   else if (op_false == CONST0_RTX (mode)
24103       && !maskcmp)
24104     {
24105       op_true = force_reg (mode, op_true);
24106       x = gen_rtx_AND (mode, cmp, op_true);
24107       emit_insn (gen_rtx_SET (dest, x));
24108     }
24109   else if (op_true == CONST0_RTX (mode)
24110       && !maskcmp)
24111     {
24112       op_false = force_reg (mode, op_false);
24113       x = gen_rtx_NOT (mode, cmp);
24114       x = gen_rtx_AND (mode, x, op_false);
24115       emit_insn (gen_rtx_SET (dest, x));
24116     }
24117   else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24118       && !maskcmp)
24119     {
24120       op_false = force_reg (mode, op_false);
24121       x = gen_rtx_IOR (mode, cmp, op_false);
24122       emit_insn (gen_rtx_SET (dest, x));
24123     }
24124   else if (TARGET_XOP
24125       && !maskcmp)
24126     {
24127       op_true = force_reg (mode, op_true);
24128 
24129       if (!nonimmediate_operand (op_false, mode))
24130 	op_false = force_reg (mode, op_false);
24131 
24132       emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24133 							  op_true,
24134 							  op_false)));
24135     }
24136   else
24137     {
24138       rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24139       rtx d = dest;
24140 
24141       if (!vector_operand (op_true, mode))
24142 	op_true = force_reg (mode, op_true);
24143 
24144       op_false = force_reg (mode, op_false);
24145 
24146       switch (mode)
24147 	{
24148 	case E_V4SFmode:
24149 	  if (TARGET_SSE4_1)
24150 	    gen = gen_sse4_1_blendvps;
24151 	  break;
24152 	case E_V2DFmode:
24153 	  if (TARGET_SSE4_1)
24154 	    gen = gen_sse4_1_blendvpd;
24155 	  break;
24156 	case E_V16QImode:
24157 	case E_V8HImode:
24158 	case E_V4SImode:
24159 	case E_V2DImode:
24160 	  if (TARGET_SSE4_1)
24161 	    {
24162 	      gen = gen_sse4_1_pblendvb;
24163 	      if (mode != V16QImode)
24164 		d = gen_reg_rtx (V16QImode);
24165 	      op_false = gen_lowpart (V16QImode, op_false);
24166 	      op_true = gen_lowpart (V16QImode, op_true);
24167 	      cmp = gen_lowpart (V16QImode, cmp);
24168 	    }
24169 	  break;
24170 	case E_V8SFmode:
24171 	  if (TARGET_AVX)
24172 	    gen = gen_avx_blendvps256;
24173 	  break;
24174 	case E_V4DFmode:
24175 	  if (TARGET_AVX)
24176 	    gen = gen_avx_blendvpd256;
24177 	  break;
24178 	case E_V32QImode:
24179 	case E_V16HImode:
24180 	case E_V8SImode:
24181 	case E_V4DImode:
24182 	  if (TARGET_AVX2)
24183 	    {
24184 	      gen = gen_avx2_pblendvb;
24185 	      if (mode != V32QImode)
24186 		d = gen_reg_rtx (V32QImode);
24187 	      op_false = gen_lowpart (V32QImode, op_false);
24188 	      op_true = gen_lowpart (V32QImode, op_true);
24189 	      cmp = gen_lowpart (V32QImode, cmp);
24190 	    }
24191 	  break;
24192 
24193 	case E_V64QImode:
24194 	  gen = gen_avx512bw_blendmv64qi;
24195 	  break;
24196 	case E_V32HImode:
24197 	  gen = gen_avx512bw_blendmv32hi;
24198 	  break;
24199 	case E_V16SImode:
24200 	  gen = gen_avx512f_blendmv16si;
24201 	  break;
24202 	case E_V8DImode:
24203 	  gen = gen_avx512f_blendmv8di;
24204 	  break;
24205 	case E_V8DFmode:
24206 	  gen = gen_avx512f_blendmv8df;
24207 	  break;
24208 	case E_V16SFmode:
24209 	  gen = gen_avx512f_blendmv16sf;
24210 	  break;
24211 
24212 	default:
24213 	  break;
24214 	}
24215 
24216       if (gen != NULL)
24217 	{
24218 	  emit_insn (gen (d, op_false, op_true, cmp));
24219 	  if (d != dest)
24220 	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24221 	}
24222       else
24223 	{
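	  /* No blend instruction is available, so emulate the select as
	     (op_true & cmp) | (op_false & ~cmp).  */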
24224 	  op_true = force_reg (mode, op_true);
24225 
24226 	  t2 = gen_reg_rtx (mode);
24227 	  if (optimize)
24228 	    t3 = gen_reg_rtx (mode);
24229 	  else
24230 	    t3 = dest;
24231 
24232 	  x = gen_rtx_AND (mode, op_true, cmp);
24233 	  emit_insn (gen_rtx_SET (t2, x));
24234 
24235 	  x = gen_rtx_NOT (mode, cmp);
24236 	  x = gen_rtx_AND (mode, x, op_false);
24237 	  emit_insn (gen_rtx_SET (t3, x));
24238 
24239 	  x = gen_rtx_IOR (mode, t3, t2);
24240 	  emit_insn (gen_rtx_SET (dest, x));
24241 	}
24242     }
24243 }
24244 
24245 /* Expand a floating-point conditional move.  Return true if successful.  */
24246 
24247 bool
24248 ix86_expand_fp_movcc (rtx operands[])
24249 {
24250   machine_mode mode = GET_MODE (operands[0]);
24251   enum rtx_code code = GET_CODE (operands[1]);
24252   rtx tmp, compare_op;
24253   rtx op0 = XEXP (operands[1], 0);
24254   rtx op1 = XEXP (operands[1], 1);
24255 
24256   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24257     {
24258       machine_mode cmode;
24259 
24260       /* Since we've no cmove for sse registers, don't force bad register
24261 	 allocation just to gain access to it.  Deny movcc when the
24262 	 comparison mode doesn't match the move mode.  */
24263       cmode = GET_MODE (op0);
24264       if (cmode == VOIDmode)
24265 	cmode = GET_MODE (op1);
24266       if (cmode != mode)
24267 	return false;
24268 
24269       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24270       if (code == UNKNOWN)
24271 	return false;
24272 
24273       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24274 				     operands[2], operands[3]))
24275 	return true;
24276 
24277       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24278 				 operands[2], operands[3]);
24279       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24280       return true;
24281     }
24282 
24283   if (GET_MODE (op0) == TImode
24284       || (GET_MODE (op0) == DImode
24285 	  && !TARGET_64BIT))
24286     return false;
24287 
24288   /* The floating point conditional move instructions don't directly
24289      support conditions resulting from a signed integer comparison.  */
24290 
24291   compare_op = ix86_expand_compare (code, op0, op1);
24292   if (!fcmov_comparison_operator (compare_op, VOIDmode))
24293     {
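      /* Materialize the comparison into a QImode value with setcc and
	 test that value for being non-zero instead.  */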
24294       tmp = gen_reg_rtx (QImode);
24295       ix86_expand_setcc (tmp, code, op0, op1);
24296 
24297       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24298     }
24299 
24300   emit_insn (gen_rtx_SET (operands[0],
24301 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
24302 						operands[2], operands[3])));
24303 
24304   return true;
24305 }
24306 
24307 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
24308 
24309 static int
24310 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24311 {
24312   switch (code)
24313     {
24314     case EQ:
24315       return 0;
24316     case LT:
24317     case LTU:
24318       return 1;
24319     case LE:
24320     case LEU:
24321       return 2;
24322     case NE:
24323       return 4;
24324     case GE:
24325     case GEU:
24326       return 5;
24327     case GT:
24328     case GTU:
24329       return 6;
24330     default:
24331       gcc_unreachable ();
24332     }
24333 }
24334 
24335 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
24336 
24337 static int
24338 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24339 {
24340   switch (code)
24341     {
24342     case EQ:
24343       return 0x00;
24344     case NE:
24345       return 0x04;
24346     case GT:
24347       return 0x0e;
24348     case LE:
24349       return 0x02;
24350     case GE:
24351       return 0x0d;
24352     case LT:
24353       return 0x01;
24354     case UNLE:
24355       return 0x0a;
24356     case UNLT:
24357       return 0x09;
24358     case UNGE:
24359       return 0x05;
24360     case UNGT:
24361       return 0x06;
24362     case UNEQ:
24363       return 0x18;
24364     case LTGT:
24365       return 0x0c;
24366     case ORDERED:
24367       return 0x07;
24368     case UNORDERED:
24369       return 0x03;
24370     default:
24371       gcc_unreachable ();
24372     }
24373 }
24374 
24375 /* Return immediate value to be used in UNSPEC_PCMP
24376    for comparison CODE in MODE.  */
24377 
24378 static int
24379 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24380 {
24381   if (FLOAT_MODE_P (mode))
24382     return ix86_fp_cmp_code_to_pcmp_immediate (code);
24383   return ix86_int_cmp_code_to_pcmp_immediate (code);
24384 }
24385 
24386 /* Expand AVX-512 vector comparison.  */
24387 
24388 bool
24389 ix86_expand_mask_vec_cmp (rtx operands[])
24390 {
24391   machine_mode mask_mode = GET_MODE (operands[0]);
24392   machine_mode cmp_mode = GET_MODE (operands[2]);
24393   enum rtx_code code = GET_CODE (operands[1]);
24394   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24395   int unspec_code;
24396   rtx unspec;
24397 
24398   switch (code)
24399     {
24400     case LEU:
24401     case GTU:
24402     case GEU:
24403     case LTU:
24404       unspec_code = UNSPEC_UNSIGNED_PCMP;
24405       break;
24406 
24407     default:
24408       unspec_code = UNSPEC_PCMP;
24409     }
24410 
24411   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24412 						 operands[3], imm),
24413 			   unspec_code);
24414   emit_insn (gen_rtx_SET (operands[0], unspec));
24415 
24416   return true;
24417 }
24418 
24419 /* Expand fp vector comparison.  */
24420 
24421 bool
24422 ix86_expand_fp_vec_cmp (rtx operands[])
24423 {
24424   enum rtx_code code = GET_CODE (operands[1]);
24425   rtx cmp;
24426 
24427   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24428 					   &operands[2], &operands[3]);
24429   if (code == UNKNOWN)
24430     {
24431       rtx temp;
24432       switch (GET_CODE (operands[1]))
24433 	{
24434 	case LTGT:
24435 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24436 				      operands[3], NULL, NULL);
24437 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24438 				     operands[3], NULL, NULL);
24439 	  code = AND;
24440 	  break;
24441 	case UNEQ:
24442 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24443 				      operands[3], NULL, NULL);
24444 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24445 				     operands[3], NULL, NULL);
24446 	  code = IOR;
24447 	  break;
24448 	default:
24449 	  gcc_unreachable ();
24450 	}
24451       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24452 				 OPTAB_DIRECT);
24453     }
24454   else
24455     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24456 			       operands[1], operands[2]);
24457 
24458   if (operands[0] != cmp)
24459     emit_move_insn (operands[0], cmp);
24460 
24461   return true;
24462 }
24463 
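/* Expand an integer vector comparison CODE of COP0 and COP1 into SSE/AVX
   compare insns, producing a vector mask in the mode of DEST.  OP_TRUE and
   OP_FALSE, if given, are the values of a following conditional move.
   *NEGATE is set when the computed mask is the inverse of the requested
   comparison and the caller must compensate (e.g. by swapping the selected
   values).  Return the mask, or NULL_RTX when the comparison cannot be
   expanded.  */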
24464 static rtx
24465 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24466 			 rtx op_true, rtx op_false, bool *negate)
24467 {
24468   machine_mode data_mode = GET_MODE (dest);
24469   machine_mode mode = GET_MODE (cop0);
24470   rtx x;
24471 
24472   *negate = false;
24473 
24474   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
24475   if (TARGET_XOP
24476       && (mode == V16QImode || mode == V8HImode
24477 	  || mode == V4SImode || mode == V2DImode))
24478     ;
24479   else
24480     {
24481       /* Canonicalize the comparison to EQ, GT, GTU.  */
24482       switch (code)
24483 	{
24484 	case EQ:
24485 	case GT:
24486 	case GTU:
24487 	  break;
24488 
24489 	case NE:
24490 	case LE:
24491 	case LEU:
24492 	  code = reverse_condition (code);
24493 	  *negate = true;
24494 	  break;
24495 
24496 	case GE:
24497 	case GEU:
24498 	  code = reverse_condition (code);
24499 	  *negate = true;
24500 	  /* FALLTHRU */
24501 
24502 	case LT:
24503 	case LTU:
24504 	  std::swap (cop0, cop1);
24505 	  code = swap_condition (code);
24506 	  break;
24507 
24508 	default:
24509 	  gcc_unreachable ();
24510 	}
24511 
24512       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
24513       if (mode == V2DImode)
24514 	{
24515 	  switch (code)
24516 	    {
24517 	    case EQ:
24518 	      /* SSE4.1 supports EQ.  */
24519 	      if (!TARGET_SSE4_1)
24520 		return NULL;
24521 	      break;
24522 
24523 	    case GT:
24524 	    case GTU:
24525 	      /* SSE4.2 supports GT/GTU.  */
24526 	      if (!TARGET_SSE4_2)
24527 		return NULL;
24528 	      break;
24529 
24530 	    default:
24531 	      gcc_unreachable ();
24532 	    }
24533 	}
24534 
24535       /* Unsigned parallel compare is not supported by the hardware.
24536 	 Play some tricks to turn this into a signed comparison
24537 	 against 0.  */
24538       if (code == GTU)
24539 	{
24540 	  cop0 = force_reg (mode, cop0);
24541 
24542 	  switch (mode)
24543 	    {
24544 	    case E_V16SImode:
24545 	    case E_V8DImode:
24546 	    case E_V8SImode:
24547 	    case E_V4DImode:
24548 	    case E_V4SImode:
24549 	    case E_V2DImode:
24550 		{
24551 		  rtx t1, t2, mask;
24552 		  rtx (*gen_sub3) (rtx, rtx, rtx);
24553 
24554 		  switch (mode)
24555 		    {
24556 		    case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24557 		    case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24558 		    case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24559 		    case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24560 		    case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24561 		    case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24562 		    default:
24563 		      gcc_unreachable ();
24564 		    }
24565 		  /* Subtract (-(INT MAX) - 1) from both operands to make
24566 		     them signed.  */
24567 		  mask = ix86_build_signbit_mask (mode, true, false);
24568 		  t1 = gen_reg_rtx (mode);
24569 		  emit_insn (gen_sub3 (t1, cop0, mask));
24570 
24571 		  t2 = gen_reg_rtx (mode);
24572 		  emit_insn (gen_sub3 (t2, cop1, mask));
24573 
24574 		  cop0 = t1;
24575 		  cop1 = t2;
24576 		  code = GT;
24577 		}
24578 	      break;
24579 
24580 	    case E_V64QImode:
24581 	    case E_V32HImode:
24582 	    case E_V32QImode:
24583 	    case E_V16HImode:
24584 	    case E_V16QImode:
24585 	    case E_V8HImode:
24586 	      /* Perform a parallel unsigned saturating subtraction.  */
24587 	      x = gen_reg_rtx (mode);
24588 	      emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24589 							   cop1)));
24590 
24591 	      cop0 = x;
24592 	      cop1 = CONST0_RTX (mode);
24593 	      code = EQ;
24594 	      *negate = !*negate;
24595 	      break;
24596 
24597 	    default:
24598 	      gcc_unreachable ();
24599 	    }
24600 	}
24601     }
24602 
24603   if (*negate)
24604     std::swap (op_true, op_false);
24605 
24606   /* Allow the comparison to be done in one mode, but the movcc to
24607      happen in another mode.  */
24608   if (data_mode == mode)
24609     {
24610       x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24611 			       op_true, op_false);
24612     }
24613   else
24614     {
24615       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24616       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24617 			       op_true, op_false);
24618       if (GET_MODE (x) == mode)
24619 	x = gen_lowpart (data_mode, x);
24620     }
24621 
24622   return x;
24623 }
24624 
24625 /* Expand integer vector comparison.  */
24626 
24627 bool
24628 ix86_expand_int_vec_cmp (rtx operands[])
24629 {
24630   rtx_code code = GET_CODE (operands[1]);
24631   bool negate = false;
24632   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24633 				     operands[3], NULL, NULL, &negate);
24634 
24635   if (!cmp)
24636     return false;
24637 
24638   if (negate)
24639     cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24640 				   CONST0_RTX (GET_MODE (cmp)),
24641 				   NULL, NULL, &negate);
24642 
24643   gcc_assert (!negate);
24644 
24645   if (operands[0] != cmp)
24646     emit_move_insn (operands[0], cmp);
24647 
24648   return true;
24649 }
24650 
24651 /* Expand a floating-point vector conditional move; a vcond operation
24652    rather than a movcc operation.  */
24653 
24654 bool
24655 ix86_expand_fp_vcond (rtx operands[])
24656 {
24657   enum rtx_code code = GET_CODE (operands[3]);
24658   rtx cmp;
24659 
24660   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24661 					   &operands[4], &operands[5]);
24662   if (code == UNKNOWN)
24663     {
24664       rtx temp;
24665       switch (GET_CODE (operands[3]))
24666 	{
24667 	case LTGT:
24668 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24669 				      operands[5], operands[0], operands[0]);
24670 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24671 				     operands[5], operands[1], operands[2]);
24672 	  code = AND;
24673 	  break;
24674 	case UNEQ:
24675 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24676 				      operands[5], operands[0], operands[0]);
24677 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24678 				     operands[5], operands[1], operands[2]);
24679 	  code = IOR;
24680 	  break;
24681 	default:
24682 	  gcc_unreachable ();
24683 	}
24684       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24685 				 OPTAB_DIRECT);
24686       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24687       return true;
24688     }
24689 
24690   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24691 				 operands[5], operands[1], operands[2]))
24692     return true;
24693 
24694   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24695 			     operands[1], operands[2]);
24696   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24697   return true;
24698 }
24699 
24700 /* Expand a signed/unsigned integral vector conditional move.  */
24701 
24702 bool
24703 ix86_expand_int_vcond (rtx operands[])
24704 {
24705   machine_mode data_mode = GET_MODE (operands[0]);
24706   machine_mode mode = GET_MODE (operands[4]);
24707   enum rtx_code code = GET_CODE (operands[3]);
24708   bool negate = false;
24709   rtx x, cop0, cop1;
24710 
24711   cop0 = operands[4];
24712   cop1 = operands[5];
24713 
24714   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24715      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
24716   if ((code == LT || code == GE)
24717       && data_mode == mode
24718       && cop1 == CONST0_RTX (mode)
24719       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24720       && GET_MODE_UNIT_SIZE (data_mode) > 1
24721       && GET_MODE_UNIT_SIZE (data_mode) <= 8
24722       && (GET_MODE_SIZE (data_mode) == 16
24723 	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24724     {
24725       rtx negop = operands[2 - (code == LT)];
24726       int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24727       if (negop == CONST1_RTX (data_mode))
24728 	{
24729 	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24730 					 operands[0], 1, OPTAB_DIRECT);
24731 	  if (res != operands[0])
24732 	    emit_move_insn (operands[0], res);
24733 	  return true;
24734 	}
24735       else if (GET_MODE_INNER (data_mode) != DImode
24736 	       && vector_all_ones_operand (negop, data_mode))
24737 	{
24738 	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24739 					 operands[0], 0, OPTAB_DIRECT);
24740 	  if (res != operands[0])
24741 	    emit_move_insn (operands[0], res);
24742 	  return true;
24743 	}
24744     }
24745 
24746   if (!nonimmediate_operand (cop1, mode))
24747     cop1 = force_reg (mode, cop1);
24748   if (!general_operand (operands[1], data_mode))
24749     operands[1] = force_reg (data_mode, operands[1]);
24750   if (!general_operand (operands[2], data_mode))
24751     operands[2] = force_reg (data_mode, operands[2]);
24752 
24753   x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24754 			       operands[1], operands[2], &negate);
24755 
24756   if (!x)
24757     return false;
24758 
24759   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24760 			 operands[2-negate]);
24761   return true;
24762 }
24763 
24764 /* AVX512F does support 64-byte integer vector operations,
24765    thus the longest vector we are faced with is V64QImode.  */
24766 #define MAX_VECT_LEN	64
24767 
24768 struct expand_vec_perm_d
24769 {
24770   rtx target, op0, op1;
24771   unsigned char perm[MAX_VECT_LEN];
24772   machine_mode vmode;
24773   unsigned char nelt;
24774   bool one_operand_p;
24775   bool testing_p;
24776 };
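/* For illustration: reversing a V4SImode vector would be described by
   vmode = V4SImode, nelt = 4, one_operand_p = true and
   perm[] = { 3, 2, 1, 0 }; indices of NELT or more select elements from
   op1 in a two-operand permutation.  */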
24777 
24778 static bool
24779 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24780 			      struct expand_vec_perm_d *d)
24781 {
24782   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24783      expander, so args are either in d, or in op0, op1 etc.  */
24784   machine_mode mode = GET_MODE (d ? d->op0 : op0);
24785   machine_mode maskmode = mode;
24786   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24787 
24788   switch (mode)
24789     {
24790     case E_V8HImode:
24791       if (TARGET_AVX512VL && TARGET_AVX512BW)
24792 	gen = gen_avx512vl_vpermt2varv8hi3;
24793       break;
24794     case E_V16HImode:
24795       if (TARGET_AVX512VL && TARGET_AVX512BW)
24796 	gen = gen_avx512vl_vpermt2varv16hi3;
24797       break;
24798     case E_V64QImode:
24799       if (TARGET_AVX512VBMI)
24800 	gen = gen_avx512bw_vpermt2varv64qi3;
24801       break;
24802     case E_V32HImode:
24803       if (TARGET_AVX512BW)
24804 	gen = gen_avx512bw_vpermt2varv32hi3;
24805       break;
24806     case E_V4SImode:
24807       if (TARGET_AVX512VL)
24808 	gen = gen_avx512vl_vpermt2varv4si3;
24809       break;
24810     case E_V8SImode:
24811       if (TARGET_AVX512VL)
24812 	gen = gen_avx512vl_vpermt2varv8si3;
24813       break;
24814     case E_V16SImode:
24815       if (TARGET_AVX512F)
24816 	gen = gen_avx512f_vpermt2varv16si3;
24817       break;
24818     case E_V4SFmode:
24819       if (TARGET_AVX512VL)
24820 	{
24821 	  gen = gen_avx512vl_vpermt2varv4sf3;
24822 	  maskmode = V4SImode;
24823 	}
24824       break;
24825     case E_V8SFmode:
24826       if (TARGET_AVX512VL)
24827 	{
24828 	  gen = gen_avx512vl_vpermt2varv8sf3;
24829 	  maskmode = V8SImode;
24830 	}
24831       break;
24832     case E_V16SFmode:
24833       if (TARGET_AVX512F)
24834 	{
24835 	  gen = gen_avx512f_vpermt2varv16sf3;
24836 	  maskmode = V16SImode;
24837 	}
24838       break;
24839     case E_V2DImode:
24840       if (TARGET_AVX512VL)
24841 	gen = gen_avx512vl_vpermt2varv2di3;
24842       break;
24843     case E_V4DImode:
24844       if (TARGET_AVX512VL)
24845 	gen = gen_avx512vl_vpermt2varv4di3;
24846       break;
24847     case E_V8DImode:
24848       if (TARGET_AVX512F)
24849 	gen = gen_avx512f_vpermt2varv8di3;
24850       break;
24851     case E_V2DFmode:
24852       if (TARGET_AVX512VL)
24853 	{
24854 	  gen = gen_avx512vl_vpermt2varv2df3;
24855 	  maskmode = V2DImode;
24856 	}
24857       break;
24858     case E_V4DFmode:
24859       if (TARGET_AVX512VL)
24860 	{
24861 	  gen = gen_avx512vl_vpermt2varv4df3;
24862 	  maskmode = V4DImode;
24863 	}
24864       break;
24865     case E_V8DFmode:
24866       if (TARGET_AVX512F)
24867 	{
24868 	  gen = gen_avx512f_vpermt2varv8df3;
24869 	  maskmode = V8DImode;
24870 	}
24871       break;
24872     default:
24873       break;
24874     }
24875 
24876   if (gen == NULL)
24877     return false;
24878 
24879   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24880      expander, so args are either in d, or in op0, op1 etc.  */
24881   if (d)
24882     {
24883       rtx vec[64];
24884       target = d->target;
24885       op0 = d->op0;
24886       op1 = d->op1;
24887       for (int i = 0; i < d->nelt; ++i)
24888 	vec[i] = GEN_INT (d->perm[i]);
24889       mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24890     }
24891 
24892   emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24893   return true;
24894 }
24895 
24896 /* Expand a variable vector permutation.  */
24897 
24898 void
24899 ix86_expand_vec_perm (rtx operands[])
24900 {
24901   rtx target = operands[0];
24902   rtx op0 = operands[1];
24903   rtx op1 = operands[2];
24904   rtx mask = operands[3];
24905   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24906   machine_mode mode = GET_MODE (op0);
24907   machine_mode maskmode = GET_MODE (mask);
24908   int w, e, i;
24909   bool one_operand_shuffle = rtx_equal_p (op0, op1);
24910 
24911   /* Number of elements in the vector.  */
24912   w = GET_MODE_NUNITS (mode);
24913   e = GET_MODE_UNIT_SIZE (mode);
24914   gcc_assert (w <= 64);
24915 
24916   if (TARGET_AVX512F && one_operand_shuffle)
24917     {
24918       rtx (*gen) (rtx, rtx, rtx) = NULL;
24919       switch (mode)
24920 	{
24921 	case E_V16SImode:
24922 	  gen = gen_avx512f_permvarv16si;
24923 	  break;
24924 	case E_V16SFmode:
24925 	  gen = gen_avx512f_permvarv16sf;
24926 	  break;
24927 	case E_V8DImode:
24928 	  gen = gen_avx512f_permvarv8di;
24929 	  break;
24930 	case E_V8DFmode:
24931 	  gen = gen_avx512f_permvarv8df;
24932 	  break;
24933 	default:
24934 	  break;
24935 	}
24936       if (gen != NULL)
24937 	{
24938 	  emit_insn (gen (target, op0, mask));
24939 	  return;
24940 	}
24941     }
24942 
24943   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24944     return;
24945 
24946   if (TARGET_AVX2)
24947     {
24948       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24949 	{
24950 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24951 	     a constant shuffle operand.  With a tiny bit of effort we can
24952 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
24953 	     unfortunate but there's no avoiding it.
24954 	     Similarly, for V16HImode we don't have instructions for variable
24955 	     shuffling, while for V32QImode we can use vpshufb; vpshufb; vpermq;
24956 	     vpor after preparing suitable masks.  */
24957 
24958 	  if (mode == V16HImode)
24959 	    {
24960 	      maskmode = mode = V32QImode;
24961 	      w = 32;
24962 	      e = 1;
24963 	    }
24964 	  else
24965 	    {
24966 	      maskmode = mode = V8SImode;
24967 	      w = 8;
24968 	      e = 4;
24969 	    }
24970 	  t1 = gen_reg_rtx (maskmode);
24971 
24972 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
24973 	       mask = { A B C D }
24974 	       t1 = { A A B B C C D D }.  */
24975 	  for (i = 0; i < w / 2; ++i)
24976 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24977 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24978 	  vt = force_reg (maskmode, vt);
24979 	  mask = gen_lowpart (maskmode, mask);
24980 	  if (maskmode == V8SImode)
24981 	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24982 	  else
24983 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24984 
24985 	  /* Multiply the shuffle indices by two.  */
24986 	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24987 				    OPTAB_DIRECT);
24988 
24989 	  /* Add one to the odd shuffle indices:
24990 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
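	  /* For instance, a V4DImode mask of { 3 0 2 1 } becomes the V8SImode
	     dword mask { 6 7  0 1  4 5  2 3 } once the replication, doubling
	     and odd-index increment steps are done.  */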
24991 	  for (i = 0; i < w / 2; ++i)
24992 	    {
24993 	      vec[i * 2] = const0_rtx;
24994 	      vec[i * 2 + 1] = const1_rtx;
24995 	    }
24996 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24997 	  vt = validize_mem (force_const_mem (maskmode, vt));
24998 	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24999 				    OPTAB_DIRECT);
25000 
25001 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
25002 	  operands[3] = mask = t1;
25003 	  target = gen_reg_rtx (mode);
25004 	  op0 = gen_lowpart (mode, op0);
25005 	  op1 = gen_lowpart (mode, op1);
25006 	}
25007 
25008       switch (mode)
25009 	{
25010 	case E_V8SImode:
25011 	  /* The VPERMD and VPERMPS instructions already properly ignore
25012 	     the high bits of the shuffle elements.  No need for us to
25013 	     perform an AND ourselves.  */
25014 	  if (one_operand_shuffle)
25015 	    {
25016 	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25017 	      if (target != operands[0])
25018 		emit_move_insn (operands[0],
25019 				gen_lowpart (GET_MODE (operands[0]), target));
25020 	    }
25021 	  else
25022 	    {
25023 	      t1 = gen_reg_rtx (V8SImode);
25024 	      t2 = gen_reg_rtx (V8SImode);
25025 	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25026 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25027 	      goto merge_two;
25028 	    }
25029 	  return;
25030 
25031 	case E_V8SFmode:
25032 	  mask = gen_lowpart (V8SImode, mask);
25033 	  if (one_operand_shuffle)
25034 	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25035 	  else
25036 	    {
25037 	      t1 = gen_reg_rtx (V8SFmode);
25038 	      t2 = gen_reg_rtx (V8SFmode);
25039 	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25040 	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25041 	      goto merge_two;
25042 	    }
25043 	  return;
25044 
25045         case E_V4SImode:
25046 	  /* By combining the two 128-bit input vectors into one 256-bit
25047 	     input vector, we can use VPERMD and VPERMPS for the full
25048 	     two-operand shuffle.  */
25049 	  t1 = gen_reg_rtx (V8SImode);
25050 	  t2 = gen_reg_rtx (V8SImode);
25051 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25052 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25053 	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25054 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25055 	  return;
25056 
25057         case E_V4SFmode:
25058 	  t1 = gen_reg_rtx (V8SFmode);
25059 	  t2 = gen_reg_rtx (V8SImode);
25060 	  mask = gen_lowpart (V4SImode, mask);
25061 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25062 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25063 	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25064 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25065 	  return;
25066 
25067 	case E_V32QImode:
25068 	  t1 = gen_reg_rtx (V32QImode);
25069 	  t2 = gen_reg_rtx (V32QImode);
25070 	  t3 = gen_reg_rtx (V32QImode);
25071 	  vt2 = GEN_INT (-128);
25072 	  vt = gen_const_vec_duplicate (V32QImode, vt2);
25073 	  vt = force_reg (V32QImode, vt);
25074 	  for (i = 0; i < 32; i++)
25075 	    vec[i] = i < 16 ? vt2 : const0_rtx;
25076 	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25077 	  vt2 = force_reg (V32QImode, vt2);
25078 	  /* From mask create two adjusted masks, which contain the same
25079 	     bits as mask in the low 7 bits of each vector element.
25080 	     The first mask will have the most significant bit clear
25081 	     if it requests element from the same 128-bit lane
25082 	     and MSB set if it requests element from the other 128-bit lane.
25083 	     The second mask will have the opposite values of the MSB,
25084 	     and additionally will have its 128-bit lanes swapped.
25085 	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25086 	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
25087 	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25088 	     stands for other 12 bytes.  */
25089 	  /* The bit that says whether an element is from the same lane or the
25090 	     other lane is bit 4, so shift it up by 3 to the MSB position.  */
25091 	  t5 = gen_reg_rtx (V4DImode);
25092 	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25093 				    GEN_INT (3)));
25094 	  /* Clear MSB bits from the mask just in case it had them set.  */
25095 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25096 	  /* After this t1 will have MSB set for elements from other lane.  */
25097 	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25098 	  /* Clear bits other than MSB.  */
25099 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
25100 	  /* Or in the lower bits from mask into t3.  */
25101 	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
25102 	  /* And invert MSB bits in t1, so MSB is set for elements from the same
25103 	     lane.  */
25104 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
25105 	  /* Swap 128-bit lanes in t3.  */
25106 	  t6 = gen_reg_rtx (V4DImode);
25107 	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25108 					  const2_rtx, GEN_INT (3),
25109 					  const0_rtx, const1_rtx));
25110 	  /* And or in the lower bits from mask into t1.  */
25111 	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
25112 	  if (one_operand_shuffle)
25113 	    {
25114 	      /* Each of these shuffles will put 0s in places where an
25115 		 element from the other 128-bit lane is needed, and otherwise
25116 		 will shuffle in the requested value.  */
25117 	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25118 						gen_lowpart (V32QImode, t6)));
25119 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25120 	      /* For t3 the 128-bit lanes are swapped again.  */
25121 	      t7 = gen_reg_rtx (V4DImode);
25122 	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25123 					      const2_rtx, GEN_INT (3),
25124 					      const0_rtx, const1_rtx));
25125 	      /* And oring both together leads to the result.  */
25126 	      emit_insn (gen_iorv32qi3 (target, t1,
25127 					gen_lowpart (V32QImode, t7)));
25128 	      if (target != operands[0])
25129 		emit_move_insn (operands[0],
25130 				gen_lowpart (GET_MODE (operands[0]), target));
25131 	      return;
25132 	    }
25133 
25134 	  t4 = gen_reg_rtx (V32QImode);
25135 	  /* Similarly to the above one_operand_shuffle code, just repeated
25136 	     twice, once for each operand.  The merge_two: code below will
25137 	     merge the two results together.  */
25138 	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25139 					    gen_lowpart (V32QImode, t6)));
25140 	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25141 					    gen_lowpart (V32QImode, t6)));
25142 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25143 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25144 	  t7 = gen_reg_rtx (V4DImode);
25145 	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25146 					  const2_rtx, GEN_INT (3),
25147 					  const0_rtx, const1_rtx));
25148 	  t8 = gen_reg_rtx (V4DImode);
25149 	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25150 					  const2_rtx, GEN_INT (3),
25151 					  const0_rtx, const1_rtx));
25152 	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25153 	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25154 	  t1 = t4;
25155 	  t2 = t3;
25156 	  goto merge_two;
25157 
25158 	default:
25159 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
25160 	  break;
25161 	}
25162     }
25163 
25164   if (TARGET_XOP)
25165     {
25166       /* The XOP VPPERM insn supports three inputs.  By ignoring the
25167 	 one_operand_shuffle special case, we avoid creating another
25168 	 set of constant vectors in memory.  */
25169       one_operand_shuffle = false;
25170 
25171       /* mask = mask & {2*w-1, ...} */
25172       vt = GEN_INT (2*w - 1);
25173     }
25174   else
25175     {
25176       /* mask = mask & {w-1, ...} */
25177       vt = GEN_INT (w - 1);
25178     }
25179 
25180   vt = gen_const_vec_duplicate (maskmode, vt);
25181   mask = expand_simple_binop (maskmode, AND, mask, vt,
25182 			      NULL_RTX, 0, OPTAB_DIRECT);
25183 
25184   /* For non-QImode operations, convert the word permutation control
25185      into a byte permutation control.  */
25186   if (mode != V16QImode)
25187     {
25188       mask = expand_simple_binop (maskmode, ASHIFT, mask,
25189 				  GEN_INT (exact_log2 (e)),
25190 				  NULL_RTX, 0, OPTAB_DIRECT);
25191 
25192       /* Convert mask to vector of chars.  */
25193       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25194 
25195       /* Replicate each of the input bytes into byte positions:
25196 	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25197 	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25198 	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
25199       for (i = 0; i < 16; ++i)
25200 	vec[i] = GEN_INT (i/e * e);
25201       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25202       vt = validize_mem (force_const_mem (V16QImode, vt));
25203       if (TARGET_XOP)
25204 	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25205       else
25206 	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25207 
25208       /* Convert it into the byte positions by doing
25209 	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
25210       for (i = 0; i < 16; ++i)
25211 	vec[i] = GEN_INT (i % e);
25212       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25213       vt = validize_mem (force_const_mem (V16QImode, vt));
25214       emit_insn (gen_addv16qi3 (mask, mask, vt));
25215     }
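  /* A worked example of the conversion above: for V4SImode a word mask of
     { 2 0 3 1 } is first scaled to byte offsets { 8 0 12 4 }, replicated to
     { 8 8 8 8  0 0 0 0  12 12 12 12  4 4 4 4 } and finally turned into the
     byte mask { 8 9 10 11  0 1 2 3  12 13 14 15  4 5 6 7 }.  */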
25216 
25217   /* The actual shuffle operations all operate on V16QImode.  */
25218   op0 = gen_lowpart (V16QImode, op0);
25219   op1 = gen_lowpart (V16QImode, op1);
25220 
25221   if (TARGET_XOP)
25222     {
25223       if (GET_MODE (target) != V16QImode)
25224 	target = gen_reg_rtx (V16QImode);
25225       emit_insn (gen_xop_pperm (target, op0, op1, mask));
25226       if (target != operands[0])
25227 	emit_move_insn (operands[0],
25228 			gen_lowpart (GET_MODE (operands[0]), target));
25229     }
25230   else if (one_operand_shuffle)
25231     {
25232       if (GET_MODE (target) != V16QImode)
25233 	target = gen_reg_rtx (V16QImode);
25234       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25235       if (target != operands[0])
25236 	emit_move_insn (operands[0],
25237 			gen_lowpart (GET_MODE (operands[0]), target));
25238     }
25239   else
25240     {
25241       rtx xops[6];
25242       bool ok;
25243 
25244       /* Shuffle the two input vectors independently.  */
25245       t1 = gen_reg_rtx (V16QImode);
25246       t2 = gen_reg_rtx (V16QImode);
25247       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25248       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25249 
25250  merge_two:
25251       /* Then merge them together.  The key is whether any given control
25252          element contained a bit set that indicates the second word.  */
25253       mask = operands[3];
25254       vt = GEN_INT (w);
25255       if (maskmode == V2DImode && !TARGET_SSE4_1)
25256 	{
25257 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
25258 	     more shuffle to convert the V2DI input mask into a V4SI
25259 	     input mask.  At that point the masking that expand_int_vcond
25260 	     performs will work as desired.  */
25261 	  rtx t3 = gen_reg_rtx (V4SImode);
25262 	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25263 				        const0_rtx, const0_rtx,
25264 				        const2_rtx, const2_rtx));
25265 	  mask = t3;
25266 	  maskmode = V4SImode;
25267 	  e = w = 4;
25268 	}
25269 
25270       vt = gen_const_vec_duplicate (maskmode, vt);
25271       vt = force_reg (maskmode, vt);
25272       mask = expand_simple_binop (maskmode, AND, mask, vt,
25273 				  NULL_RTX, 0, OPTAB_DIRECT);
25274 
25275       if (GET_MODE (target) != mode)
25276 	target = gen_reg_rtx (mode);
25277       xops[0] = target;
25278       xops[1] = gen_lowpart (mode, t2);
25279       xops[2] = gen_lowpart (mode, t1);
25280       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25281       xops[4] = mask;
25282       xops[5] = vt;
25283       ok = ix86_expand_int_vcond (xops);
25284       gcc_assert (ok);
25285       if (target != operands[0])
25286 	emit_move_insn (operands[0],
25287 			gen_lowpart (GET_MODE (operands[0]), target));
25288     }
25289 }
25290 
25291 /* Unpack SRC into the next wider integer vector type, storing it in DEST.
25292    UNSIGNED_P is true if we should do zero extension, else sign extension.
25293    HIGH_P is true if we want the N/2 high elements, else the low elements.  */
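/* For example, unpacking the high half of a V16QImode SRC with UNSIGNED_P
   set yields a V8HImode DEST holding the zero-extended bytes 8..15; with
   UNSIGNED_P clear the same bytes are sign extended instead.  */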
25294 
25295 void
25296 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25297 {
25298   machine_mode imode = GET_MODE (src);
25299   rtx tmp;
25300 
25301   if (TARGET_SSE4_1)
25302     {
25303       rtx (*unpack)(rtx, rtx);
25304       rtx (*extract)(rtx, rtx) = NULL;
25305       machine_mode halfmode = BLKmode;
25306 
25307       switch (imode)
25308 	{
25309 	case E_V64QImode:
25310 	  if (unsigned_p)
25311 	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25312 	  else
25313 	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25314 	  halfmode = V32QImode;
25315 	  extract
25316 	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25317 	  break;
25318 	case E_V32QImode:
25319 	  if (unsigned_p)
25320 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
25321 	  else
25322 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
25323 	  halfmode = V16QImode;
25324 	  extract
25325 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25326 	  break;
25327 	case E_V32HImode:
25328 	  if (unsigned_p)
25329 	    unpack = gen_avx512f_zero_extendv16hiv16si2;
25330 	  else
25331 	    unpack = gen_avx512f_sign_extendv16hiv16si2;
25332 	  halfmode = V16HImode;
25333 	  extract
25334 	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25335 	  break;
25336 	case E_V16HImode:
25337 	  if (unsigned_p)
25338 	    unpack = gen_avx2_zero_extendv8hiv8si2;
25339 	  else
25340 	    unpack = gen_avx2_sign_extendv8hiv8si2;
25341 	  halfmode = V8HImode;
25342 	  extract
25343 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25344 	  break;
25345 	case E_V16SImode:
25346 	  if (unsigned_p)
25347 	    unpack = gen_avx512f_zero_extendv8siv8di2;
25348 	  else
25349 	    unpack = gen_avx512f_sign_extendv8siv8di2;
25350 	  halfmode = V8SImode;
25351 	  extract
25352 	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25353 	  break;
25354 	case E_V8SImode:
25355 	  if (unsigned_p)
25356 	    unpack = gen_avx2_zero_extendv4siv4di2;
25357 	  else
25358 	    unpack = gen_avx2_sign_extendv4siv4di2;
25359 	  halfmode = V4SImode;
25360 	  extract
25361 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25362 	  break;
25363 	case E_V16QImode:
25364 	  if (unsigned_p)
25365 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25366 	  else
25367 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25368 	  break;
25369 	case E_V8HImode:
25370 	  if (unsigned_p)
25371 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
25372 	  else
25373 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
25374 	  break;
25375 	case E_V4SImode:
25376 	  if (unsigned_p)
25377 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
25378 	  else
25379 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
25380 	  break;
25381 	default:
25382 	  gcc_unreachable ();
25383 	}
25384 
25385       if (GET_MODE_SIZE (imode) >= 32)
25386 	{
25387 	  tmp = gen_reg_rtx (halfmode);
25388 	  emit_insn (extract (tmp, src));
25389 	}
25390       else if (high_p)
25391 	{
25392 	  /* Shift higher 8 bytes to lower 8 bytes.  */
25393 	  tmp = gen_reg_rtx (V1TImode);
25394 	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25395 					 GEN_INT (64)));
25396 	  tmp = gen_lowpart (imode, tmp);
25397 	}
25398       else
25399 	tmp = src;
25400 
25401       emit_insn (unpack (dest, tmp));
25402     }
25403   else
25404     {
25405       rtx (*unpack)(rtx, rtx, rtx);
25406 
25407       switch (imode)
25408 	{
25409 	case E_V16QImode:
25410 	  if (high_p)
25411 	    unpack = gen_vec_interleave_highv16qi;
25412 	  else
25413 	    unpack = gen_vec_interleave_lowv16qi;
25414 	  break;
25415 	case E_V8HImode:
25416 	  if (high_p)
25417 	    unpack = gen_vec_interleave_highv8hi;
25418 	  else
25419 	    unpack = gen_vec_interleave_lowv8hi;
25420 	  break;
25421 	case E_V4SImode:
25422 	  if (high_p)
25423 	    unpack = gen_vec_interleave_highv4si;
25424 	  else
25425 	    unpack = gen_vec_interleave_lowv4si;
25426 	  break;
25427 	default:
25428 	  gcc_unreachable ();
25429 	}
25430 
25431       if (unsigned_p)
25432 	tmp = force_reg (imode, CONST0_RTX (imode));
25433       else
25434 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25435 				   src, pc_rtx, pc_rtx);
25436 
25437       rtx tmp2 = gen_reg_rtx (imode);
25438       emit_insn (unpack (tmp2, src, tmp));
25439       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25440     }
25441 }
25442 
25443 /* Expand conditional increment or decrement using adc/sbb instructions.
25444    The default case using setcc followed by the conditional move can be
25445    done by generic code.  */
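/* A rough sketch of the intent, using hypothetical operands: a statement
   such as "x += (a < b);" where the comparison maps onto the carry flag can
   be emitted as "cmp a, b ; adc x, 0" (Intel syntax) rather than a setcc
   followed by an add; the decrement case uses sbb in the same way.  */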
25446 bool
25447 ix86_expand_int_addcc (rtx operands[])
25448 {
25449   enum rtx_code code = GET_CODE (operands[1]);
25450   rtx flags;
25451   rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25452   rtx compare_op;
25453   rtx val = const0_rtx;
25454   bool fpcmp = false;
25455   machine_mode mode;
25456   rtx op0 = XEXP (operands[1], 0);
25457   rtx op1 = XEXP (operands[1], 1);
25458 
25459   if (operands[3] != const1_rtx
25460       && operands[3] != constm1_rtx)
25461     return false;
25462   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25463      return false;
25464   code = GET_CODE (compare_op);
25465 
25466   flags = XEXP (compare_op, 0);
25467 
25468   if (GET_MODE (flags) == CCFPmode)
25469     {
25470       fpcmp = true;
25471       code = ix86_fp_compare_code_to_integer (code);
25472     }
25473 
25474   if (code != LTU)
25475     {
25476       val = constm1_rtx;
25477       if (fpcmp)
25478 	PUT_CODE (compare_op,
25479 		  reverse_condition_maybe_unordered
25480 		    (GET_CODE (compare_op)));
25481       else
25482 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25483     }
25484 
25485   mode = GET_MODE (operands[0]);
25486 
25487   /* Construct either adc or sbb insn.  */
25488   if ((code == LTU) == (operands[3] == constm1_rtx))
25489     {
25490       switch (mode)
25491 	{
25492 	  case E_QImode:
25493 	    insn = gen_subqi3_carry;
25494 	    break;
25495 	  case E_HImode:
25496 	    insn = gen_subhi3_carry;
25497 	    break;
25498 	  case E_SImode:
25499 	    insn = gen_subsi3_carry;
25500 	    break;
25501 	  case E_DImode:
25502 	    insn = gen_subdi3_carry;
25503 	    break;
25504 	  default:
25505 	    gcc_unreachable ();
25506 	}
25507     }
25508   else
25509     {
25510       switch (mode)
25511 	{
25512 	  case E_QImode:
25513 	    insn = gen_addqi3_carry;
25514 	    break;
25515 	  case E_HImode:
25516 	    insn = gen_addhi3_carry;
25517 	    break;
25518 	  case E_SImode:
25519 	    insn = gen_addsi3_carry;
25520 	    break;
25521 	  case E_DImode:
25522 	    insn = gen_adddi3_carry;
25523 	    break;
25524 	  default:
25525 	    gcc_unreachable ();
25526 	}
25527     }
25528   emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25529 
25530   return true;
25531 }
25532 
25533 
25534 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
25535    but works for floating point parameters and non-offsettable memories.
25536    For pushes, it returns just stack offsets; the values will be saved
25537    in the right order.  Maximally four parts are generated.  */
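/* For example, on a 32-bit target a DFmode operand is split into two SImode
   parts, an XFmode operand into three and a TFmode operand into four; on a
   64-bit target XFmode and TFmode each split into two parts.  */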
25538 
25539 static int
25540 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25541 {
25542   int size;
25543 
25544   if (!TARGET_64BIT)
25545     size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25546   else
25547     size = (GET_MODE_SIZE (mode) + 4) / 8;
25548 
25549   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25550   gcc_assert (size >= 2 && size <= 4);
25551 
25552   /* Optimize constant pool reference to immediates.  This is used by fp
25553      moves, that force all constants to memory to allow combining.  */
25554   if (MEM_P (operand) && MEM_READONLY_P (operand))
25555     operand = avoid_constant_pool_reference (operand);
25556 
25557   if (MEM_P (operand) && !offsettable_memref_p (operand))
25558     {
25559       /* The only non-offsettable memories we handle are pushes.  */
25560       int ok = push_operand (operand, VOIDmode);
25561 
25562       gcc_assert (ok);
25563 
25564       operand = copy_rtx (operand);
25565       PUT_MODE (operand, word_mode);
25566       parts[0] = parts[1] = parts[2] = parts[3] = operand;
25567       return size;
25568     }
25569 
25570   if (GET_CODE (operand) == CONST_VECTOR)
25571     {
25572       scalar_int_mode imode = int_mode_for_mode (mode).require ();
25573       /* Caution: if we looked through a constant pool memory above,
25574 	 the operand may actually have a different mode now.  That's
25575 	 ok, since we want to pun this all the way back to an integer.  */
25576       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25577       gcc_assert (operand != NULL);
25578       mode = imode;
25579     }
25580 
25581   if (!TARGET_64BIT)
25582     {
25583       if (mode == DImode)
25584 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25585       else
25586 	{
25587 	  int i;
25588 
25589 	  if (REG_P (operand))
25590 	    {
25591 	      gcc_assert (reload_completed);
25592 	      for (i = 0; i < size; i++)
25593 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25594 	    }
25595 	  else if (offsettable_memref_p (operand))
25596 	    {
25597 	      operand = adjust_address (operand, SImode, 0);
25598 	      parts[0] = operand;
25599 	      for (i = 1; i < size; i++)
25600 		parts[i] = adjust_address (operand, SImode, 4 * i);
25601 	    }
25602 	  else if (CONST_DOUBLE_P (operand))
25603 	    {
25604 	      const REAL_VALUE_TYPE *r;
25605 	      long l[4];
25606 
25607 	      r = CONST_DOUBLE_REAL_VALUE (operand);
25608 	      switch (mode)
25609 		{
25610 		case E_TFmode:
25611 		  real_to_target (l, r, mode);
25612 		  parts[3] = gen_int_mode (l[3], SImode);
25613 		  parts[2] = gen_int_mode (l[2], SImode);
25614 		  break;
25615 		case E_XFmode:
25616 		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25617 		     long double may not be 80-bit.  */
25618 		  real_to_target (l, r, mode);
25619 		  parts[2] = gen_int_mode (l[2], SImode);
25620 		  break;
25621 		case E_DFmode:
25622 		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25623 		  break;
25624 		default:
25625 		  gcc_unreachable ();
25626 		}
25627 	      parts[1] = gen_int_mode (l[1], SImode);
25628 	      parts[0] = gen_int_mode (l[0], SImode);
25629 	    }
25630 	  else
25631 	    gcc_unreachable ();
25632 	}
25633     }
25634   else
25635     {
25636       if (mode == TImode)
25637 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25638       if (mode == XFmode || mode == TFmode)
25639 	{
25640 	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25641 	  if (REG_P (operand))
25642 	    {
25643 	      gcc_assert (reload_completed);
25644 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25645 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25646 	    }
25647 	  else if (offsettable_memref_p (operand))
25648 	    {
25649 	      operand = adjust_address (operand, DImode, 0);
25650 	      parts[0] = operand;
25651 	      parts[1] = adjust_address (operand, upper_mode, 8);
25652 	    }
25653 	  else if (CONST_DOUBLE_P (operand))
25654 	    {
25655 	      long l[4];
25656 
25657 	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25658 
25659 	      /* real_to_target puts 32-bit pieces in each long.  */
25660 	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25661 				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25662 					  << 32), DImode);
25663 
25664 	      if (upper_mode == SImode)
25665 	        parts[1] = gen_int_mode (l[2], SImode);
25666 	      else
25667 	        parts[1]
25668 		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25669 				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25670 				     << 32), DImode);
25671 	    }
25672 	  else
25673 	    gcc_unreachable ();
25674 	}
25675     }
25676 
25677   return size;
25678 }
25679 
25680 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25681    All required insns are emitted here, so the caller needs no further
25682    moves.  Operands 2-5 are filled with the destination parts in the
25683    correct order; operands 6-9 are filled with the source parts.  */
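/* E.g. on a 32-bit target a DImode register-to-memory move is emitted here
   as two SImode moves, ordered so that a source address register is not
   overwritten before it has been used.  */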
25684 
25685 void
25686 ix86_split_long_move (rtx operands[])
25687 {
25688   rtx part[2][4];
25689   int nparts, i, j;
25690   int push = 0;
25691   int collisions = 0;
25692   machine_mode mode = GET_MODE (operands[0]);
25693   bool collisionparts[4];
25694 
25695   /* The DFmode expanders may ask us to move a double.
25696      For a 64-bit target this is a single move.  By hiding the fact
25697      here we simplify the i386.md splitters.  */
25698   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25699     {
25700       /* Optimize constant pool reference to immediates.  This is used by
25701 	 fp moves, that force all constants to memory to allow combining.  */
25702 
25703       if (MEM_P (operands[1])
25704 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25705 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25706 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
25707       if (push_operand (operands[0], VOIDmode))
25708 	{
25709 	  operands[0] = copy_rtx (operands[0]);
25710 	  PUT_MODE (operands[0], word_mode);
25711 	}
25712       else
25713         operands[0] = gen_lowpart (DImode, operands[0]);
25714       operands[1] = gen_lowpart (DImode, operands[1]);
25715       emit_move_insn (operands[0], operands[1]);
25716       return;
25717     }
25718 
25719   /* The only non-offsettable memory we handle is push.  */
25720   if (push_operand (operands[0], VOIDmode))
25721     push = 1;
25722   else
25723     gcc_assert (!MEM_P (operands[0])
25724 		|| offsettable_memref_p (operands[0]));
25725 
25726   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25727   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25728 
25729   /* When emitting a push, take care of source operands on the stack.  */
25730   if (push && MEM_P (operands[1])
25731       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25732     {
25733       rtx src_base = XEXP (part[1][nparts - 1], 0);
25734 
25735       /* Compensate for the stack decrement by 4.  */
25736       if (!TARGET_64BIT && nparts == 3
25737 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25738 	src_base = plus_constant (Pmode, src_base, 4);
25739 
25740       /* src_base refers to the stack pointer and is
25741 	 automatically decreased by emitted push.  */
25742       for (i = 0; i < nparts; i++)
25743 	part[1][i] = change_address (part[1][i],
25744 				     GET_MODE (part[1][i]), src_base);
25745     }
25746 
25747   /* We need to do the copy in the right order in case an address register
25748      of the source overlaps the destination.  */
25749   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25750     {
25751       rtx tmp;
25752 
25753       for (i = 0; i < nparts; i++)
25754 	{
25755 	  collisionparts[i]
25756 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25757 	  if (collisionparts[i])
25758 	    collisions++;
25759 	}
25760 
25761       /* Collision in the middle part can be handled by reordering.  */
25762       if (collisions == 1 && nparts == 3 && collisionparts [1])
25763 	{
25764 	  std::swap (part[0][1], part[0][2]);
25765 	  std::swap (part[1][1], part[1][2]);
25766 	}
25767       else if (collisions == 1
25768 	       && nparts == 4
25769 	       && (collisionparts [1] || collisionparts [2]))
25770 	{
25771 	  if (collisionparts [1])
25772 	    {
25773 	      std::swap (part[0][1], part[0][2]);
25774 	      std::swap (part[1][1], part[1][2]);
25775 	    }
25776 	  else
25777 	    {
25778 	      std::swap (part[0][2], part[0][3]);
25779 	      std::swap (part[1][2], part[1][3]);
25780 	    }
25781 	}
25782 
25783       /* If there are more collisions, we can't handle it by reordering.
25784 	 Do an lea to the last part and use only one colliding move.  */
25785       else if (collisions > 1)
25786 	{
25787 	  rtx base, addr;
25788 
25789 	  collisions = 1;
25790 
25791 	  base = part[0][nparts - 1];
25792 
25793 	  /* Handle the case when the last part isn't valid for lea.
25794 	     Happens in 64-bit mode storing the 12-byte XFmode.  */
25795 	  if (GET_MODE (base) != Pmode)
25796 	    base = gen_rtx_REG (Pmode, REGNO (base));
25797 
25798 	  addr = XEXP (part[1][0], 0);
25799 	  if (TARGET_TLS_DIRECT_SEG_REFS)
25800 	    {
25801 	      struct ix86_address parts;
25802 	      int ok = ix86_decompose_address (addr, &parts);
25803 	      gcc_assert (ok);
25804 	      /* It is not valid to use %gs: or %fs: in lea.  */
25805 	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25806 	    }
25807 	  emit_insn (gen_rtx_SET (base, addr));
25808 	  part[1][0] = replace_equiv_address (part[1][0], base);
25809 	  for (i = 1; i < nparts; i++)
25810 	    {
25811 	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25812 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
25813 	    }
25814 	}
25815     }
25816 
25817   if (push)
25818     {
25819       if (!TARGET_64BIT)
25820 	{
25821 	  if (nparts == 3)
25822 	    {
25823 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25824                 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25825 					  stack_pointer_rtx, GEN_INT (-4)));
25826 	      emit_move_insn (part[0][2], part[1][2]);
25827 	    }
25828 	  else if (nparts == 4)
25829 	    {
25830 	      emit_move_insn (part[0][3], part[1][3]);
25831 	      emit_move_insn (part[0][2], part[1][2]);
25832 	    }
25833 	}
25834       else
25835 	{
25836 	  /* In 64-bit mode we don't have a 32-bit push available.  If this is
25837 	     a register, that is OK - we will just use the larger counterpart.
25838 	     We also retype memory - this comes from an attempt to avoid a REX
25839 	     prefix when moving the second half of a TFmode value.  */
25840 	  if (GET_MODE (part[1][1]) == SImode)
25841 	    {
25842 	      switch (GET_CODE (part[1][1]))
25843 		{
25844 		case MEM:
25845 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
25846 		  break;
25847 
25848 		case REG:
25849 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25850 		  break;
25851 
25852 		default:
25853 		  gcc_unreachable ();
25854 		}
25855 
25856 	      if (GET_MODE (part[1][0]) == SImode)
25857 		part[1][0] = part[1][1];
25858 	    }
25859 	}
25860       emit_move_insn (part[0][1], part[1][1]);
25861       emit_move_insn (part[0][0], part[1][0]);
25862       return;
25863     }
25864 
25865   /* Choose correct order to not overwrite the source before it is copied.  */
25866   if ((REG_P (part[0][0])
25867        && REG_P (part[1][1])
25868        && (REGNO (part[0][0]) == REGNO (part[1][1])
25869 	   || (nparts == 3
25870 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
25871 	   || (nparts == 4
25872 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
25873       || (collisions > 0
25874 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25875     {
25876       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25877 	{
25878 	  operands[2 + i] = part[0][j];
25879 	  operands[6 + i] = part[1][j];
25880 	}
25881     }
25882   else
25883     {
25884       for (i = 0; i < nparts; i++)
25885 	{
25886 	  operands[2 + i] = part[0][i];
25887 	  operands[6 + i] = part[1][i];
25888 	}
25889     }
25890 
25891   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
25892   if (optimize_insn_for_size_p ())
25893     {
25894       for (j = 0; j < nparts - 1; j++)
25895 	if (CONST_INT_P (operands[6 + j])
25896 	    && operands[6 + j] != const0_rtx
25897 	    && REG_P (operands[2 + j]))
25898 	  for (i = j; i < nparts - 1; i++)
25899 	    if (CONST_INT_P (operands[7 + i])
25900 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25901 	      operands[7 + i] = operands[2 + j];
25902     }
25903 
25904   for (i = 0; i < nparts; i++)
25905     emit_move_insn (operands[2 + i], operands[6 + i]);
25906 
25907   return;
25908 }
25909 
25910 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25911    left shift by a constant, either using a single shift or
25912    a sequence of add instructions.  */
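/* For instance, when additions are cheap enough a left shift by 2 may be
   emitted as two register self-additions ("add %eax, %eax" twice in 32-bit
   code) instead of a single "shl $2" - a sketch only, the choice follows
   the cost comparison below.  */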
25913 
25914 static void
25915 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25916 {
25917   rtx (*insn)(rtx, rtx, rtx);
25918 
25919   if (count == 1
25920       || (count * ix86_cost->add <= ix86_cost->shift_const
25921 	  && !optimize_insn_for_size_p ()))
25922     {
25923       insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25924       while (count-- > 0)
25925 	emit_insn (insn (operand, operand, operand));
25926     }
25927   else
25928     {
25929       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25930       emit_insn (insn (operand, operand, GEN_INT (count)));
25931     }
25932 }
25933 
25934 void
25935 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25936 {
25937   rtx (*gen_ashl3)(rtx, rtx, rtx);
25938   rtx (*gen_shld)(rtx, rtx, rtx);
25939   int half_width = GET_MODE_BITSIZE (mode) >> 1;
25940 
25941   rtx low[2], high[2];
25942   int count;
25943 
25944   if (CONST_INT_P (operands[2]))
25945     {
25946       split_double_mode (mode, operands, 2, low, high);
25947       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25948 
25949       if (count >= half_width)
25950 	{
25951 	  emit_move_insn (high[0], low[1]);
25952 	  emit_move_insn (low[0], const0_rtx);
25953 
25954 	  if (count > half_width)
25955 	    ix86_expand_ashl_const (high[0], count - half_width, mode);
25956 	}
25957       else
25958 	{
25959 	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25960 
25961 	  if (!rtx_equal_p (operands[0], operands[1]))
25962 	    emit_move_insn (operands[0], operands[1]);
25963 
25964 	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25965 	  ix86_expand_ashl_const (low[0], count, mode);
25966 	}
25967       return;
25968     }
25969 
25970   split_double_mode (mode, operands, 1, low, high);
25971 
25972   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25973 
25974   if (operands[1] == const1_rtx)
25975     {
25976       /* Assuming we've chosen QImode-capable registers, then 1 << N
25977 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
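      /* A sketch of the 32-bit sequence this produces (register names are
	 only illustrative):
	     xor   %eax, %eax       ; low  = 0
	     xor   %edx, %edx       ; high = 0
	     test  $32, %cl
	     sete  %al              ; low  = 1 if count < 32
	     setne %dl              ; high = 1 if count >= 32
	     shl   %cl, %eax        ; shl masks the count to 0..31
	     shl   %cl, %edx  */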
25978       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25979 	{
25980 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25981 
25982 	  ix86_expand_clear (low[0]);
25983 	  ix86_expand_clear (high[0]);
25984 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25985 
25986 	  d = gen_lowpart (QImode, low[0]);
25987 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25988 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
25989 	  emit_insn (gen_rtx_SET (d, s));
25990 
25991 	  d = gen_lowpart (QImode, high[0]);
25992 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25993 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
25994 	  emit_insn (gen_rtx_SET (d, s));
25995 	}
25996 
25997       /* Otherwise, we can get the same results by manually performing
25998 	 a bit extract operation on bit 5/6, and then performing the two
25999 	 shifts.  The two methods of getting 0/1 into low/high are exactly
26000 	 the same size.  Avoiding the shift in the bit extract case helps
26001 	 pentium4 a bit; no one else seems to care much either way.  */
26002       else
26003 	{
26004 	  machine_mode half_mode;
26005 	  rtx (*gen_lshr3)(rtx, rtx, rtx);
26006 	  rtx (*gen_and3)(rtx, rtx, rtx);
26007 	  rtx (*gen_xor3)(rtx, rtx, rtx);
26008 	  HOST_WIDE_INT bits;
26009 	  rtx x;
26010 
26011 	  if (mode == DImode)
26012 	    {
26013 	      half_mode = SImode;
26014 	      gen_lshr3 = gen_lshrsi3;
26015 	      gen_and3 = gen_andsi3;
26016 	      gen_xor3 = gen_xorsi3;
26017 	      bits = 5;
26018 	    }
26019 	  else
26020 	    {
26021 	      half_mode = DImode;
26022 	      gen_lshr3 = gen_lshrdi3;
26023 	      gen_and3 = gen_anddi3;
26024 	      gen_xor3 = gen_xordi3;
26025 	      bits = 6;
26026 	    }
26027 
26028 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26029 	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26030 	  else
26031 	    x = gen_lowpart (half_mode, operands[2]);
26032 	  emit_insn (gen_rtx_SET (high[0], x));
26033 
26034 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26035 	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26036 	  emit_move_insn (low[0], high[0]);
26037 	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26038 	}
26039 
26040       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26041       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26042       return;
26043     }
26044 
26045   if (operands[1] == constm1_rtx)
26046     {
26047       /* For -1 << N, we can avoid the shld instruction, because we
26048 	 know that we're shifting 0...31/63 ones into a -1.  */
26049       emit_move_insn (low[0], constm1_rtx);
26050       if (optimize_insn_for_size_p ())
26051 	emit_move_insn (high[0], low[0]);
26052       else
26053 	emit_move_insn (high[0], constm1_rtx);
26054     }
26055   else
26056     {
26057       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26058 
26059       if (!rtx_equal_p (operands[0], operands[1]))
26060 	emit_move_insn (operands[0], operands[1]);
26061 
26062       split_double_mode (mode, operands, 1, low, high);
26063       emit_insn (gen_shld (high[0], low[0], operands[2]));
26064     }
26065 
26066   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26067 
26068   if (TARGET_CMOVE && scratch)
26069     {
26070       rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26071 	= mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26072 
26073       ix86_expand_clear (scratch);
26074       emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26075     }
26076   else
26077     {
26078       rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26079 	= mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26080 
26081       emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
26082     }
26083 }
26084 
26085 void
26086 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26087 {
26088   rtx (*gen_ashr3)(rtx, rtx, rtx)
26089     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26090   rtx (*gen_shrd)(rtx, rtx, rtx);
26091   int half_width = GET_MODE_BITSIZE (mode) >> 1;
26092 
26093   rtx low[2], high[2];
26094   int count;
26095 
26096   if (CONST_INT_P (operands[2]))
26097     {
26098       split_double_mode (mode, operands, 2, low, high);
26099       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26100 
26101       if (count == GET_MODE_BITSIZE (mode) - 1)
26102 	{
26103 	  emit_move_insn (high[0], high[1]);
26104 	  emit_insn (gen_ashr3 (high[0], high[0],
26105 				GEN_INT (half_width - 1)));
26106 	  emit_move_insn (low[0], high[0]);
26107 
26108 	}
26109       else if (count >= half_width)
26110 	{
26111 	  emit_move_insn (low[0], high[1]);
26112 	  emit_move_insn (high[0], low[0]);
26113 	  emit_insn (gen_ashr3 (high[0], high[0],
26114 				GEN_INT (half_width - 1)));
26115 
26116 	  if (count > half_width)
26117 	    emit_insn (gen_ashr3 (low[0], low[0],
26118 				  GEN_INT (count - half_width)));
26119 	}
26120       else
26121 	{
26122 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26123 
26124 	  if (!rtx_equal_p (operands[0], operands[1]))
26125 	    emit_move_insn (operands[0], operands[1]);
26126 
26127 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26128 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26129 	}
26130     }
26131   else
26132     {
26133       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26134 
26135       if (!rtx_equal_p (operands[0], operands[1]))
26136 	emit_move_insn (operands[0], operands[1]);
26137 
26138       split_double_mode (mode, operands, 1, low, high);
26139 
26140       emit_insn (gen_shrd (low[0], high[0], operands[2]));
26141       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26142 
26143       if (TARGET_CMOVE && scratch)
26144 	{
26145 	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26146 	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26147 
26148 	  emit_move_insn (scratch, high[0]);
26149 	  emit_insn (gen_ashr3 (scratch, scratch,
26150 				GEN_INT (half_width - 1)));
26151 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26152 					  scratch));
26153 	}
26154       else
26155 	{
26156 	  rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26157 	    = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26158 
26159 	  emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26160 	}
26161     }
26162 }
26163 
26164 void
26165 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26166 {
26167   rtx (*gen_lshr3)(rtx, rtx, rtx)
26168     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26169   rtx (*gen_shrd)(rtx, rtx, rtx);
26170   int half_width = GET_MODE_BITSIZE (mode) >> 1;
26171 
26172   rtx low[2], high[2];
26173   int count;
26174 
26175   if (CONST_INT_P (operands[2]))
26176     {
26177       split_double_mode (mode, operands, 2, low, high);
26178       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26179 
26180       if (count >= half_width)
26181 	{
26182 	  emit_move_insn (low[0], high[1]);
26183 	  ix86_expand_clear (high[0]);
26184 
26185 	  if (count > half_width)
26186 	    emit_insn (gen_lshr3 (low[0], low[0],
26187 				  GEN_INT (count - half_width)));
26188 	}
26189       else
26190 	{
26191 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26192 
26193 	  if (!rtx_equal_p (operands[0], operands[1]))
26194 	    emit_move_insn (operands[0], operands[1]);
26195 
26196 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26197 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26198 	}
26199     }
26200   else
26201     {
26202       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26203 
26204       if (!rtx_equal_p (operands[0], operands[1]))
26205 	emit_move_insn (operands[0], operands[1]);
26206 
26207       split_double_mode (mode, operands, 1, low, high);
26208 
26209       emit_insn (gen_shrd (low[0], high[0], operands[2]));
26210       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26211 
26212       if (TARGET_CMOVE && scratch)
26213 	{
26214 	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26215 	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26216 
26217 	  ix86_expand_clear (scratch);
26218 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26219 					  scratch));
26220 	}
26221       else
26222 	{
26223 	  rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26224 	    = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26225 
26226 	  emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26227 	}
26228     }
26229 }
26230 
26231 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
26232 static void
26233 predict_jump (int prob)
26234 {
26235   rtx_insn *insn = get_last_insn ();
26236   gcc_assert (JUMP_P (insn));
26237   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26238 }
26239 
26240 /* Helper function for the string operations below.  Test whether VARIABLE
26241    is aligned to VALUE bytes.  If it is, jump to the returned label.  */
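/* E.g. a call such as ix86_expand_aligntest (destreg, 4, false) emits,
   roughly, "test $4, destreg ; je label" and returns the label, so the
   caller can handle the misaligned case on the fall-through path.  */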
26242 static rtx_code_label *
26243 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26244 {
26245   rtx_code_label *label = gen_label_rtx ();
26246   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26247   if (GET_MODE (variable) == DImode)
26248     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26249   else
26250     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26251   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26252 			   1, label);
26253   if (epilogue)
26254     predict_jump (REG_BR_PROB_BASE * 50 / 100);
26255   else
26256     predict_jump (REG_BR_PROB_BASE * 90 / 100);
26257   return label;
26258 }
26259 
26260 /* Adjust COUNTREG by VALUE, i.e. decrement it by VALUE.  */
26261 static void
26262 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26263 {
26264   rtx (*gen_add)(rtx, rtx, rtx)
26265     = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26266 
26267   emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26268 }
26269 
26270 /* Zero extend EXP, which is possibly SImode, to a Pmode register.  */
26271 rtx
26272 ix86_zero_extend_to_Pmode (rtx exp)
26273 {
26274   return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26275 }
26276 
26277 /* Divide COUNTREG by SCALE.  */
26278 static rtx
26279 scale_counter (rtx countreg, int scale)
26280 {
26281   rtx sc;
26282 
26283   if (scale == 1)
26284     return countreg;
26285   if (CONST_INT_P (countreg))
26286     return GEN_INT (INTVAL (countreg) / scale);
26287   gcc_assert (REG_P (countreg));
26288 
26289   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26290 			    GEN_INT (exact_log2 (scale)),
26291 			    NULL, 1, OPTAB_DIRECT);
26292   return sc;
26293 }
26294 
26295 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
26296    DImode for constant loop counts.  */
26297 
26298 static machine_mode
26299 counter_mode (rtx count_exp)
26300 {
26301   if (GET_MODE (count_exp) != VOIDmode)
26302     return GET_MODE (count_exp);
26303   if (!CONST_INT_P (count_exp))
26304     return Pmode;
26305   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26306     return DImode;
26307   return SImode;
26308 }
26309 
26310 /* Copy the address to a Pmode register.  This is used for x32 to
26311    truncate a DImode TLS address to an SImode register.  */
26312 
26313 static rtx
26314 ix86_copy_addr_to_reg (rtx addr)
26315 {
26316   rtx reg;
26317   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26318     {
26319       reg = copy_addr_to_reg (addr);
26320       REG_POINTER (reg) = 1;
26321       return reg;
26322     }
26323   else
26324     {
26325       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26326       reg = copy_to_mode_reg (DImode, addr);
26327       REG_POINTER (reg) = 1;
26328       return gen_rtx_SUBREG (SImode, reg, 0);
26329     }
26330 }
26331 
26332 /* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
26333    to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
26334    COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
26335    loop to set memory to VALUE (supposed to be in MODE).
26336 
26337    The size is rounded down to a whole number of the chunk size moved at once.
26338    SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.  */
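/* The emitted loop is roughly equivalent to the following sketch (shown for
   the copy case with MODE = DImode and UNROLL = 2, i.e. a 16-byte chunk):

     size = count & ~15;
     for (iter = 0; iter < size; iter += 16)
       copy 16 bytes from src + iter to dest + iter;   -- two DImode moves
     dest += iter;
     src += iter;  */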
26339 
26340 
26341 static void
26342 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26343 			       rtx destptr, rtx srcptr, rtx value,
26344 			       rtx count, machine_mode mode, int unroll,
26345 			       int expected_size, bool issetmem)
26346 {
26347   rtx_code_label *out_label, *top_label;
26348   rtx iter, tmp;
26349   machine_mode iter_mode = counter_mode (count);
26350   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26351   rtx piece_size = GEN_INT (piece_size_n);
26352   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26353   rtx size;
26354   int i;
26355 
26356   top_label = gen_label_rtx ();
26357   out_label = gen_label_rtx ();
26358   iter = gen_reg_rtx (iter_mode);
26359 
26360   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26361 			      NULL, 1, OPTAB_DIRECT);
26362   /* Those two should combine.  */
26363   if (piece_size == const1_rtx)
26364     {
26365       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26366 			       true, out_label);
26367       predict_jump (REG_BR_PROB_BASE * 10 / 100);
26368     }
26369   emit_move_insn (iter, const0_rtx);
26370 
26371   emit_label (top_label);
26372 
26373   tmp = convert_modes (Pmode, iter_mode, iter, true);
26374 
26375   /* This assert could be relaxed - in that case we'll need to compute
26376      the smallest power of two containing PIECE_SIZE_N and pass it to
26377      offset_address.  */
26378   gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26379   destmem = offset_address (destmem, tmp, piece_size_n);
26380   destmem = adjust_address (destmem, mode, 0);
26381 
26382   if (!issetmem)
26383     {
26384       srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26385       srcmem = adjust_address (srcmem, mode, 0);
26386 
26387       /* When unrolling for chips that reorder memory reads and writes,
26388 	 we can save registers by using a single temporary.
26389 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
26390       if (!TARGET_64BIT && 0)
26391 	{
26392 	  for (i = 0; i < unroll; i++)
26393 	    {
26394 	      if (i)
26395 		{
26396 		  destmem =
26397 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26398 		  srcmem =
26399 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26400 		}
26401 	      emit_move_insn (destmem, srcmem);
26402 	    }
26403 	}
26404       else
26405 	{
26406 	  rtx tmpreg[4];
26407 	  gcc_assert (unroll <= 4);
26408 	  for (i = 0; i < unroll; i++)
26409 	    {
26410 	      tmpreg[i] = gen_reg_rtx (mode);
26411 	      if (i)
26412 		{
26413 		  srcmem =
26414 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26415 		}
26416 	      emit_move_insn (tmpreg[i], srcmem);
26417 	    }
26418 	  for (i = 0; i < unroll; i++)
26419 	    {
26420 	      if (i)
26421 		{
26422 		  destmem =
26423 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26424 		}
26425 	      emit_move_insn (destmem, tmpreg[i]);
26426 	    }
26427 	}
26428     }
26429   else
26430     for (i = 0; i < unroll; i++)
26431       {
26432 	if (i)
26433 	  destmem =
26434 	    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26435 	emit_move_insn (destmem, value);
26436       }
26437 
26438   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26439 			     true, OPTAB_LIB_WIDEN);
26440   if (tmp != iter)
26441     emit_move_insn (iter, tmp);
26442 
26443   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26444 			   true, top_label);
26445   if (expected_size != -1)
26446     {
26447       expected_size /= GET_MODE_SIZE (mode) * unroll;
26448       if (expected_size == 0)
26449 	predict_jump (0);
26450       else if (expected_size > REG_BR_PROB_BASE)
26451 	predict_jump (REG_BR_PROB_BASE - 1);
26452       else
26453         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
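	/* E.g. an expected trip count of 4 gives a predicted probability of
	   roughly REG_BR_PROB_BASE * 3 / 4 for the backward branch, i.e. it
	   is expected to be taken on all but the last iteration.  */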
26454     }
26455   else
26456     predict_jump (REG_BR_PROB_BASE * 80 / 100);
26457   iter = ix86_zero_extend_to_Pmode (iter);
26458   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26459 			     true, OPTAB_LIB_WIDEN);
26460   if (tmp != destptr)
26461     emit_move_insn (destptr, tmp);
26462   if (!issetmem)
26463     {
26464       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26465 				 true, OPTAB_LIB_WIDEN);
26466       if (tmp != srcptr)
26467 	emit_move_insn (srcptr, tmp);
26468     }
26469   emit_label (out_label);
26470 }
26471 
26472 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26473    When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26474    When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26475    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26476    ORIG_VALUE is the original value passed to memset to fill the memory with.
26477    Other arguments have the same meaning as for the previous function.  */
26478 
26479 static void
26480 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26481 			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26482 			   rtx count,
26483 			   machine_mode mode, bool issetmem)
26484 {
26485   rtx destexp;
26486   rtx srcexp;
26487   rtx countreg;
26488   HOST_WIDE_INT rounded_count;
26489 
26490   /* If possible, it is shorter to use rep movs.
26491      TODO: Maybe it is better to move this logic to decide_alg.  */
26492   if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26493       && (!issetmem || orig_value == const0_rtx))
26494     mode = SImode;
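  /* For instance, a 64-byte zeroing memset is then expanded as 16 SImode
     "rep stos" iterations rather than 64 QImode ones.  */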
26495 
26496   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26497     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26498 
26499   countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26500 						       GET_MODE_SIZE (mode)));
26501   if (mode != QImode)
26502     {
26503       destexp = gen_rtx_ASHIFT (Pmode, countreg,
26504 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26505       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26506     }
26507   else
26508     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
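  /* In either case DESTEXP is the value the destination pointer will have
     once the "rep" insn has finished: DESTPTR plus the number of bytes
     processed (COUNTREG scaled back up to bytes for the wider modes).  */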
26509   if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26510     {
26511       rounded_count
26512 	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26513       destmem = shallow_copy_rtx (destmem);
26514       set_mem_size (destmem, rounded_count);
26515     }
26516   else if (MEM_SIZE_KNOWN_P (destmem))
26517     clear_mem_size (destmem);
26518 
26519   if (issetmem)
26520     {
26521       value = force_reg (mode, gen_lowpart (mode, value));
26522       emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26523     }
26524   else
26525     {
26526       if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26527 	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26528       if (mode != QImode)
26529 	{
26530 	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26531 				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26532 	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26533 	}
26534       else
26535 	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26536       if (CONST_INT_P (count))
26537 	{
26538 	  rounded_count
26539 	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26540 	  srcmem = shallow_copy_rtx (srcmem);
26541 	  set_mem_size (srcmem, rounded_count);
26542 	}
26543       else
26544 	{
26545 	  if (MEM_SIZE_KNOWN_P (srcmem))
26546 	    clear_mem_size (srcmem);
26547 	}
26548       emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26549 			      destexp, srcexp));
26550     }
26551 }
26552 
26553 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26554    DESTMEM.
26555    SRCMEM is passed by pointer so that it can be updated on return.
26556    Return value is the updated DESTMEM.  */
26557 static rtx
26558 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26559 	     HOST_WIDE_INT size_to_move)
26560 {
26561   rtx dst = destmem, src = *srcmem, adjust, tempreg;
26562   enum insn_code code;
26563   machine_mode move_mode;
26564   int piece_size, i;
26565 
26566   /* Find the widest mode in which we could perform moves.
26567      Start with the biggest power of 2 no larger than SIZE_TO_MOVE and halve
26568      it until a move of that size is supported.  */
26569   piece_size = 1 << floor_log2 (size_to_move);
26570   while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26571 	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26572     {
26573       gcc_assert (piece_size > 1);
26574       piece_size >>= 1;
26575     }
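  /* For example, SIZE_TO_MOVE == 16 starts with a 16-byte piece; if no
     16-byte integer move is available the piece size drops to 8 and the
     loop below emits two 8-byte moves instead.  */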
26576 
26577   /* Find the corresponding vector mode with the same size as MOVE_MODE.
26578      MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
26579   if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26580     {
26581       int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26582       if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26583 	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26584 	{
26585 	  move_mode = word_mode;
26586 	  piece_size = GET_MODE_SIZE (move_mode);
26587 	  code = optab_handler (mov_optab, move_mode);
26588 	}
26589     }
26590   gcc_assert (code != CODE_FOR_nothing);
26591 
26592   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26593   src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26594 
26595   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
26596   gcc_assert (size_to_move % piece_size == 0);
26597   adjust = GEN_INT (piece_size);
26598   for (i = 0; i < size_to_move; i += piece_size)
26599     {
26600       /* We move from memory to memory, so we'll need to do it via
26601 	 a temporary register.  */
26602       tempreg = gen_reg_rtx (move_mode);
26603       emit_insn (GEN_FCN (code) (tempreg, src));
26604       emit_insn (GEN_FCN (code) (dst, tempreg));
26605 
26606       emit_move_insn (destptr,
26607 		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26608       emit_move_insn (srcptr,
26609 		      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26610 
26611       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26612 					  piece_size);
26613       src = adjust_automodify_address_nv (src, move_mode, srcptr,
26614 					  piece_size);
26615     }
26616 
26617   /* Update DST and SRC rtx.  */
26618   *srcmem = src;
26619   return dst;
26620 }
26621 
26622 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
26623 static void
26624 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26625 			rtx destptr, rtx srcptr, rtx count, int max_size)
26626 {
26627   rtx src, dest;
26628   if (CONST_INT_P (count))
26629     {
26630       HOST_WIDE_INT countval = INTVAL (count);
26631       HOST_WIDE_INT epilogue_size = countval % max_size;
26632       int i;
26633 
26634       /* For now MAX_SIZE should be a power of 2.  This assert could be
26635 	 relaxed, but it'll require a bit more complicated epilogue
26636 	 expanding.  */
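      /* For example, COUNT == 23 with MAX_SIZE == 16 leaves an epilogue of
	 7 bytes, emitted below as one 4-byte, one 2-byte and one 1-byte
	 move.  */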
26637       gcc_assert ((max_size & (max_size - 1)) == 0);
26638       for (i = max_size; i >= 1; i >>= 1)
26639 	{
26640 	  if (epilogue_size & i)
26641 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26642 	}
26643       return;
26644     }
26645   if (max_size > 8)
26646     {
26647       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26648 				    count, 1, OPTAB_DIRECT);
26649       expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26650 				     count, QImode, 1, 4, false);
26651       return;
26652     }
26653 
26654   /* When there are stringops, we can cheaply increase dest and src pointers.
26655      Otherwise we save code size by maintaining offset (zero is readily
26656      available from preceding rep operation) and using x86 addressing modes.
26657    */
26658   if (TARGET_SINGLE_STRINGOP)
26659     {
26660       if (max_size > 4)
26661 	{
26662 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26663 	  src = change_address (srcmem, SImode, srcptr);
26664 	  dest = change_address (destmem, SImode, destptr);
26665 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
26666 	  emit_label (label);
26667 	  LABEL_NUSES (label) = 1;
26668 	}
26669       if (max_size > 2)
26670 	{
26671 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26672 	  src = change_address (srcmem, HImode, srcptr);
26673 	  dest = change_address (destmem, HImode, destptr);
26674 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
26675 	  emit_label (label);
26676 	  LABEL_NUSES (label) = 1;
26677 	}
26678       if (max_size > 1)
26679 	{
26680 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26681 	  src = change_address (srcmem, QImode, srcptr);
26682 	  dest = change_address (destmem, QImode, destptr);
26683 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
26684 	  emit_label (label);
26685 	  LABEL_NUSES (label) = 1;
26686 	}
26687     }
26688   else
26689     {
26690       rtx offset = force_reg (Pmode, const0_rtx);
26691       rtx tmp;
26692 
26693       if (max_size > 4)
26694 	{
26695 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26696 	  src = change_address (srcmem, SImode, srcptr);
26697 	  dest = change_address (destmem, SImode, destptr);
26698 	  emit_move_insn (dest, src);
26699 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26700 				     true, OPTAB_LIB_WIDEN);
26701 	  if (tmp != offset)
26702 	    emit_move_insn (offset, tmp);
26703 	  emit_label (label);
26704 	  LABEL_NUSES (label) = 1;
26705 	}
26706       if (max_size > 2)
26707 	{
26708 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26709 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26710 	  src = change_address (srcmem, HImode, tmp);
26711 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26712 	  dest = change_address (destmem, HImode, tmp);
26713 	  emit_move_insn (dest, src);
26714 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26715 				     true, OPTAB_LIB_WIDEN);
26716 	  if (tmp != offset)
26717 	    emit_move_insn (offset, tmp);
26718 	  emit_label (label);
26719 	  LABEL_NUSES (label) = 1;
26720 	}
26721       if (max_size > 1)
26722 	{
26723 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26724 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26725 	  src = change_address (srcmem, QImode, tmp);
26726 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26727 	  dest = change_address (destmem, QImode, tmp);
26728 	  emit_move_insn (dest, src);
26729 	  emit_label (label);
26730 	  LABEL_NUSES (label) = 1;
26731 	}
26732     }
26733 }
26734 
26735 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26736    with value PROMOTED_VAL.
26737    Unlike emit_memmov, there is no source operand to update here.
26738    Return value is the updated DESTMEM.  */
26739 static rtx
26740 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26741 	     HOST_WIDE_INT size_to_move)
26742 {
26743   rtx dst = destmem, adjust;
26744   enum insn_code code;
26745   machine_mode move_mode;
26746   int piece_size, i;
26747 
26748   /* Choose the widest mode in which we could perform moves.
26749      Start with the mode of PROMOTED_VAL and narrow it when SIZE_TO_MOVE
26750      is smaller than that mode's size.  */
26751   move_mode = GET_MODE (promoted_val);
26752   if (move_mode == VOIDmode)
26753     move_mode = QImode;
26754   if (size_to_move < GET_MODE_SIZE (move_mode))
26755     {
26756       unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26757       move_mode = int_mode_for_size (move_bits, 0).require ();
26758       promoted_val = gen_lowpart (move_mode, promoted_val);
26759     }
26760   piece_size = GET_MODE_SIZE (move_mode);
26761   code = optab_handler (mov_optab, move_mode);
26762   gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26763 
26764   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26765 
26766   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
26767   gcc_assert (size_to_move % piece_size == 0);
26768   adjust = GEN_INT (piece_size);
26769   for (i = 0; i < size_to_move; i += piece_size)
26770     {
26771       if (piece_size <= GET_MODE_SIZE (word_mode))
26772 	{
26773 	  emit_insn (gen_strset (destptr, dst, promoted_val));
26774 	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26775 					      piece_size);
26776 	  continue;
26777 	}
26778 
26779       emit_insn (GEN_FCN (code) (dst, promoted_val));
26780 
26781       emit_move_insn (destptr,
26782 		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26783 
26784       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26785 					  piece_size);
26786     }
26787 
26788   /* Update DST rtx.  */
26789   return dst;
26790 }
26791 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
26792 static void
26793 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26794 				 rtx count, int max_size)
26795 {
26796   count =
26797     expand_simple_binop (counter_mode (count), AND, count,
26798 			 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26799   expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26800 				 gen_lowpart (QImode, value), count, QImode,
26801 				 1, max_size / 2, true);
26802 }
26803 
26804 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
26805 static void
26806 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26807 			rtx count, int max_size)
26808 {
26809   rtx dest;
26810 
26811   if (CONST_INT_P (count))
26812     {
26813       HOST_WIDE_INT countval = INTVAL (count);
26814       HOST_WIDE_INT epilogue_size = countval % max_size;
26815       int i;
26816 
26817       /* For now MAX_SIZE should be a power of 2.  This assert could be
26818 	 relaxed, but it'll require a bit more complicated epilogue
26819 	 expanding.  */
26820       gcc_assert ((max_size & (max_size - 1)) == 0);
26821       for (i = max_size; i >= 1; i >>= 1)
26822 	{
26823 	  if (epilogue_size & i)
26824 	    {
26825 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26826 		destmem = emit_memset (destmem, destptr, vec_value, i);
26827 	      else
26828 		destmem = emit_memset (destmem, destptr, value, i);
26829 	    }
26830 	}
26831       return;
26832     }
26833   if (max_size > 32)
26834     {
26835       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26836       return;
26837     }
26838   if (max_size > 16)
26839     {
26840       rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26841       if (TARGET_64BIT)
26842 	{
26843 	  dest = change_address (destmem, DImode, destptr);
26844 	  emit_insn (gen_strset (destptr, dest, value));
26845 	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26846 	  emit_insn (gen_strset (destptr, dest, value));
26847 	}
26848       else
26849 	{
26850 	  dest = change_address (destmem, SImode, destptr);
26851 	  emit_insn (gen_strset (destptr, dest, value));
26852 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26853 	  emit_insn (gen_strset (destptr, dest, value));
26854 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26855 	  emit_insn (gen_strset (destptr, dest, value));
26856 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26857 	  emit_insn (gen_strset (destptr, dest, value));
26858 	}
26859       emit_label (label);
26860       LABEL_NUSES (label) = 1;
26861     }
26862   if (max_size > 8)
26863     {
26864       rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26865       if (TARGET_64BIT)
26866 	{
26867 	  dest = change_address (destmem, DImode, destptr);
26868 	  emit_insn (gen_strset (destptr, dest, value));
26869 	}
26870       else
26871 	{
26872 	  dest = change_address (destmem, SImode, destptr);
26873 	  emit_insn (gen_strset (destptr, dest, value));
26874 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26875 	  emit_insn (gen_strset (destptr, dest, value));
26876 	}
26877       emit_label (label);
26878       LABEL_NUSES (label) = 1;
26879     }
26880   if (max_size > 4)
26881     {
26882       rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26883       dest = change_address (destmem, SImode, destptr);
26884       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26885       emit_label (label);
26886       LABEL_NUSES (label) = 1;
26887     }
26888   if (max_size > 2)
26889     {
26890       rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26891       dest = change_address (destmem, HImode, destptr);
26892       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26893       emit_label (label);
26894       LABEL_NUSES (label) = 1;
26895     }
26896   if (max_size > 1)
26897     {
26898       rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26899       dest = change_address (destmem, QImode, destptr);
26900       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26901       emit_label (label);
26902       LABEL_NUSES (label) = 1;
26903     }
26904 }
26905 
26906 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26907    DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
26908    Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26909    ignored.
26910    Return value is updated DESTMEM.  */
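/* Illustration: with ALIGN == 4 and DESIRED_ALIGNMENT == 16 the loop below
   emits two conditional blocks, one copying (or setting) 4 bytes when bit 2
   of DESTPTR is set and one copying (or setting) 8 bytes when bit 3 is set;
   given the initial 4-byte alignment, DESTPTR is 16-byte aligned
   afterwards.  */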
26911 static rtx
26912 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26913 				  rtx destptr, rtx srcptr, rtx value,
26914 				  rtx vec_value, rtx count, int align,
26915 				  int desired_alignment, bool issetmem)
26916 {
26917   int i;
26918   for (i = 1; i < desired_alignment; i <<= 1)
26919     {
26920       if (align <= i)
26921 	{
26922 	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26923 	  if (issetmem)
26924 	    {
26925 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26926 		destmem = emit_memset (destmem, destptr, vec_value, i);
26927 	      else
26928 		destmem = emit_memset (destmem, destptr, value, i);
26929 	    }
26930 	  else
26931 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26932 	  ix86_adjust_counter (count, i);
26933 	  emit_label (label);
26934 	  LABEL_NUSES (label) = 1;
26935 	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26936 	}
26937     }
26938   return destmem;
26939 }
26940 
26941 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26942    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26943    and jump to DONE_LABEL.  */
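/* As a concrete illustration, with SIZE == 4 and a runtime COUNT of 6 the
   code below copies (or sets) bytes 0..3 and then bytes COUNT-4..COUNT-1,
   i.e. bytes 2..5; the two ranges overlap harmlessly and together cover the
   whole block for any COUNT in SIZE..2*SIZE-1.  */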
26944 static void
26945 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26946 			       rtx destptr, rtx srcptr,
26947 			       rtx value, rtx vec_value,
26948 			       rtx count, int size,
26949 			       rtx done_label, bool issetmem)
26950 {
26951   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26952   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26953   rtx modesize;
26954   int n;
26955 
26956   /* If we do not have vector value to copy, we must reduce size.  */
26957   if (issetmem)
26958     {
26959       if (!vec_value)
26960 	{
26961 	  if (GET_MODE (value) == VOIDmode && size > 8)
26962 	    mode = Pmode;
26963 	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26964 	    mode = GET_MODE (value);
26965 	}
26966       else
26967 	mode = GET_MODE (vec_value), value = vec_value;
26968     }
26969   else
26970     {
26971       /* Choose appropriate vector mode.  */
26972       if (size >= 32)
26973 	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26974       else if (size >= 16)
26975 	mode = TARGET_SSE ? V16QImode : DImode;
26976       srcmem = change_address (srcmem, mode, srcptr);
26977     }
26978   destmem = change_address (destmem, mode, destptr);
26979   modesize = GEN_INT (GET_MODE_SIZE (mode));
26980   gcc_assert (GET_MODE_SIZE (mode) <= size);
26981   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26982     {
26983       if (issetmem)
26984 	emit_move_insn (destmem, gen_lowpart (mode, value));
26985       else
26986 	{
26987           emit_move_insn (destmem, srcmem);
26988           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26989 	}
26990       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26991     }
26992 
26993   destmem = offset_address (destmem, count, 1);
26994   destmem = offset_address (destmem, GEN_INT (-2 * size),
26995 			    GET_MODE_SIZE (mode));
26996   if (!issetmem)
26997     {
26998       srcmem = offset_address (srcmem, count, 1);
26999       srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27000 			       GET_MODE_SIZE (mode));
27001     }
27002   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27003     {
27004       if (issetmem)
27005 	emit_move_insn (destmem, gen_lowpart (mode, value));
27006       else
27007 	{
27008 	  emit_move_insn (destmem, srcmem);
27009 	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27010 	}
27011       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27012     }
27013   emit_jump_insn (gen_jump (done_label));
27014   emit_barrier ();
27015 
27016   emit_label (label);
27017   LABEL_NUSES (label) = 1;
27018 }
27019 
27020 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27021    and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27022    bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets
27023    us proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
27024    DONE_LABEL is a label after the whole copying sequence.  The label is created
27025    on demand if *DONE_LABEL is NULL.
27026    MIN_SIZE is the minimal size of the block copied.  This value gets adjusted
27027    for the new bounds after the initial copies.
27028 
27029    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27030    DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
27031    we will dispatch to a library call for large blocks.
27032 
27033    In pseudocode we do:
27034 
27035    if (COUNT < SIZE)
27036      {
27037        Assume that SIZE is 4. Bigger sizes are handled analogously
27038        if (COUNT & 4)
27039 	 {
27040 	    copy 4 bytes from SRCPTR to DESTPTR
27041 	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27042 	    goto done_label
27043 	 }
27044        if (!COUNT)
27045 	 goto done_label;
27046        copy 1 byte from SRCPTR to DESTPTR
27047        if (COUNT & 2)
27048 	 {
27049 	    copy 2 bytes from SRCPTR to DESTPTR
27050 	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27051 	 }
27052      }
27053    else
27054      {
27055        copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27056        copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
27057 
27058        OLD_DESTPTR = DESTPTR;
27059        Align DESTPTR up to DESIRED_ALIGN
27060        SRCPTR += DESTPTR - OLD_DESTPTR
27061        COUNT -= DESTPTR - OLD_DESTPTR
27062        if (DYNAMIC_CHECK)
27063 	 Round COUNT down to multiple of SIZE
27064        << optional caller supplied zero size guard is here >>
27065        << optional caller supplied dynamic check is here >>
27066        << caller supplied main copy loop is here >>
27067      }
27068    done_label:
27069   */
27070 static void
27071 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27072 							    rtx *destptr, rtx *srcptr,
27073 							    machine_mode mode,
27074 							    rtx value, rtx vec_value,
27075 							    rtx *count,
27076 							    rtx_code_label **done_label,
27077 							    int size,
27078 							    int desired_align,
27079 							    int align,
27080 							    unsigned HOST_WIDE_INT *min_size,
27081 							    bool dynamic_check,
27082 							    bool issetmem)
27083 {
27084   rtx_code_label *loop_label = NULL, *label;
27085   int n;
27086   rtx modesize;
27087   int prolog_size = 0;
27088   rtx mode_value;
27089 
27090   /* Choose the proper value to copy.  */
27091   if (issetmem && VECTOR_MODE_P (mode))
27092     mode_value = vec_value;
27093   else
27094     mode_value = value;
27095   gcc_assert (GET_MODE_SIZE (mode) <= size);
27096 
27097   /* See if block is big or small, handle small blocks.  */
27098   if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27099     {
27100       int size2 = size;
27101       loop_label = gen_label_rtx ();
27102 
27103       if (!*done_label)
27104 	*done_label = gen_label_rtx ();
27105 
27106       emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27107 			       1, loop_label);
27108       size2 >>= 1;
27109 
27110       /* Handle sizes > 3.  */
27111       for (;size2 > 2; size2 >>= 1)
27112 	expand_small_movmem_or_setmem (destmem, srcmem,
27113 				       *destptr, *srcptr,
27114 				       value, vec_value,
27115 				       *count,
27116 				       size2, *done_label, issetmem);
27117       /* Nothing to copy?  Jump to DONE_LABEL if so */
27118       emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27119 			       1, *done_label);
27120 
27121       /* Do a byte copy.  */
27122       destmem = change_address (destmem, QImode, *destptr);
27123       if (issetmem)
27124 	emit_move_insn (destmem, gen_lowpart (QImode, value));
27125       else
27126 	{
27127           srcmem = change_address (srcmem, QImode, *srcptr);
27128           emit_move_insn (destmem, srcmem);
27129 	}
27130 
27131       /* Handle sizes 2 and 3.  */
27132       label = ix86_expand_aligntest (*count, 2, false);
27133       destmem = change_address (destmem, HImode, *destptr);
27134       destmem = offset_address (destmem, *count, 1);
27135       destmem = offset_address (destmem, GEN_INT (-2), 2);
27136       if (issetmem)
27137         emit_move_insn (destmem, gen_lowpart (HImode, value));
27138       else
27139 	{
27140 	  srcmem = change_address (srcmem, HImode, *srcptr);
27141 	  srcmem = offset_address (srcmem, *count, 1);
27142 	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27143 	  emit_move_insn (destmem, srcmem);
27144 	}
27145 
27146       emit_label (label);
27147       LABEL_NUSES (label) = 1;
27148       emit_jump_insn (gen_jump (*done_label));
27149       emit_barrier ();
27150     }
27151   else
27152     gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27153 		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27154 
27155   /* Start memcpy for COUNT >= SIZE.  */
27156   if (loop_label)
27157     {
27158        emit_label (loop_label);
27159        LABEL_NUSES (loop_label) = 1;
27160     }
27161 
27162   /* Copy first desired_align bytes.  */
27163   if (!issetmem)
27164     srcmem = change_address (srcmem, mode, *srcptr);
27165   destmem = change_address (destmem, mode, *destptr);
27166   modesize = GEN_INT (GET_MODE_SIZE (mode));
27167   for (n = 0; prolog_size < desired_align - align; n++)
27168     {
27169       if (issetmem)
27170         emit_move_insn (destmem, mode_value);
27171       else
27172 	{
27173           emit_move_insn (destmem, srcmem);
27174           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27175 	}
27176       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27177       prolog_size += GET_MODE_SIZE (mode);
27178     }
27179 
27180 
27181   /* Copy last SIZE bytes.  */
27182   destmem = offset_address (destmem, *count, 1);
27183   destmem = offset_address (destmem,
27184 			    GEN_INT (-size - prolog_size),
27185 			    1);
27186   if (issetmem)
27187     emit_move_insn (destmem, mode_value);
27188   else
27189     {
27190       srcmem = offset_address (srcmem, *count, 1);
27191       srcmem = offset_address (srcmem,
27192 			       GEN_INT (-size - prolog_size),
27193 			       1);
27194       emit_move_insn (destmem, srcmem);
27195     }
27196   for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27197     {
27198       destmem = offset_address (destmem, modesize, 1);
27199       if (issetmem)
27200 	emit_move_insn (destmem, mode_value);
27201       else
27202 	{
27203           srcmem = offset_address (srcmem, modesize, 1);
27204           emit_move_insn (destmem, srcmem);
27205 	}
27206     }
27207 
27208   /* Align destination.  */
27209   if (desired_align > 1 && desired_align > align)
27210     {
27211       rtx saveddest = *destptr;
27212 
27213       gcc_assert (desired_align <= size);
27214       /* Align destptr up, placing the result in a new register.  */
27215       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27216 				      GEN_INT (prolog_size),
27217 				      NULL_RTX, 1, OPTAB_DIRECT);
27218       if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27219 	REG_POINTER (*destptr) = 1;
27220       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27221 				      GEN_INT (-desired_align),
27222 				      *destptr, 1, OPTAB_DIRECT);
27223       /* See how many bytes we skipped.  */
27224       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27225 				       *destptr,
27226 				       saveddest, 1, OPTAB_DIRECT);
27227       /* Adjust srcptr and count.  */
27228       if (!issetmem)
27229 	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27230 				       saveddest, *srcptr, 1, OPTAB_DIRECT);
27231       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27232 				    saveddest, *count, 1, OPTAB_DIRECT);
27233       /* We copied at most size + prolog_size.  */
27234       if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27235 	*min_size
27236 	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27237       else
27238 	*min_size = 0;
27239 
27240       /* Our loops always round down the block size, but for dispatch to
27241          library we need precise value.  */
27242       if (dynamic_check)
27243 	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
27244 				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27245     }
27246   else
27247     {
27248       gcc_assert (prolog_size == 0);
27249       /* Decrease count, so we won't end up copying last word twice.  */
27250       if (!CONST_INT_P (*count))
27251 	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27252 				      constm1_rtx, *count, 1, OPTAB_DIRECT);
27253       else
27254 	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27255 				      (unsigned HOST_WIDE_INT)size));
27256       if (*min_size)
27257 	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27258     }
27259 }
27260 
27261 
27262 /* This function is like the previous one, except here we know how many bytes
27263    need to be copied.  That allows us to update alignment not only of DST, which
27264    is returned, but also of SRC, which is passed as a pointer for that
27265    reason.  */
27266 static rtx
27267 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27268 					   rtx srcreg, rtx value, rtx vec_value,
27269 					   int desired_align, int align_bytes,
27270 					   bool issetmem)
27271 {
27272   rtx src = NULL;
27273   rtx orig_dst = dst;
27274   rtx orig_src = NULL;
27275   int piece_size = 1;
27276   int copied_bytes = 0;
27277 
27278   if (!issetmem)
27279     {
27280       gcc_assert (srcp != NULL);
27281       src = *srcp;
27282       orig_src = src;
27283     }
27284 
27285   for (piece_size = 1;
27286        piece_size <= desired_align && copied_bytes < align_bytes;
27287        piece_size <<= 1)
27288     {
27289       if (align_bytes & piece_size)
27290 	{
27291 	  if (issetmem)
27292 	    {
27293 	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27294 		dst = emit_memset (dst, destreg, vec_value, piece_size);
27295 	      else
27296 		dst = emit_memset (dst, destreg, value, piece_size);
27297 	    }
27298 	  else
27299 	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27300 	  copied_bytes += piece_size;
27301 	}
27302     }
27303   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27304     set_mem_align (dst, desired_align * BITS_PER_UNIT);
27305   if (MEM_SIZE_KNOWN_P (orig_dst))
27306     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27307 
27308   if (!issetmem)
27309     {
27310       int src_align_bytes = get_mem_align_offset (src, desired_align
27311 						       * BITS_PER_UNIT);
27312       if (src_align_bytes >= 0)
27313 	src_align_bytes = desired_align - src_align_bytes;
27314       if (src_align_bytes >= 0)
27315 	{
27316 	  unsigned int src_align;
27317 	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27318 	    {
27319 	      if ((src_align_bytes & (src_align - 1))
27320 		   == (align_bytes & (src_align - 1)))
27321 		break;
27322 	    }
27323 	  if (src_align > (unsigned int) desired_align)
27324 	    src_align = desired_align;
27325 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27326 	    set_mem_align (src, src_align * BITS_PER_UNIT);
27327 	}
27328       if (MEM_SIZE_KNOWN_P (orig_src))
27329 	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27330       *srcp = src;
27331     }
27332 
27333   return dst;
27334 }
27335 
27336 /* Return true if ALG can be used in current context.
27337    Assume we expand memset if MEMSET is true.  */
27338 static bool
27339 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27340 {
27341   if (alg == no_stringop)
27342     return false;
27343   if (alg == vector_loop)
27344     return TARGET_SSE || TARGET_AVX;
27345   /* Algorithms using the rep prefix want at least edi and ecx;
27346      additionally, memset wants eax and memcpy wants esi.  Don't
27347      consider such algorithms if the user has appropriated those
27348      registers for their own purposes, or if we have a non-default
27349      address space, since some string insns cannot override the segment.  */
27350   if (alg == rep_prefix_1_byte
27351       || alg == rep_prefix_4_byte
27352       || alg == rep_prefix_8_byte)
27353     {
27354       if (have_as)
27355 	return false;
27356       if (fixed_regs[CX_REG]
27357 	  || fixed_regs[DI_REG]
27358 	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27359 	return false;
27360     }
27361   return true;
27362 }
27363 
27364 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
27365 static enum stringop_alg
27366 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27367 	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27368 	    bool memset, bool zero_memset, bool have_as,
27369 	    int *dynamic_check, bool *noalign, bool recur)
27370 {
27371   const struct stringop_algs *algs;
27372   bool optimize_for_speed;
27373   int max = 0;
27374   const struct processor_costs *cost;
27375   int i;
27376   bool any_alg_usable_p = false;
27377 
27378   *noalign = false;
27379   *dynamic_check = -1;
27380 
27381   /* Even if the string operation call is cold, we still might spend a lot
27382      of time processing large blocks.  */
27383   if (optimize_function_for_size_p (cfun)
27384       || (optimize_insn_for_size_p ()
27385  	  && (max_size < 256
27386               || (expected_size != -1 && expected_size < 256))))
27387     optimize_for_speed = false;
27388   else
27389     optimize_for_speed = true;
27390 
27391   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27392   if (memset)
27393     algs = &cost->memset[TARGET_64BIT != 0];
27394   else
27395     algs = &cost->memcpy[TARGET_64BIT != 0];
27396 
27397   /* See maximal size for user defined algorithm.  */
27398   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27399     {
27400       enum stringop_alg candidate = algs->size[i].alg;
27401       bool usable = alg_usable_p (candidate, memset, have_as);
27402       any_alg_usable_p |= usable;
27403 
27404       if (candidate != libcall && candidate && usable)
27405 	max = algs->size[i].max;
27406     }
27407 
27408   /* If the expected size is not known but the max size is small enough
27409      that the inline version is a win, set the expected size into
27410      the range.  */
27411   if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27412       && expected_size == -1)
27413     expected_size = min_size / 2 + max_size / 2;
27414 
27415   /* If user specified the algorithm, honor it if possible.  */
27416   if (ix86_stringop_alg != no_stringop
27417       && alg_usable_p (ix86_stringop_alg, memset, have_as))
27418     return ix86_stringop_alg;
27419   /* rep; movq or rep; movl is the smallest variant.  */
27420   else if (!optimize_for_speed)
27421     {
27422       *noalign = true;
27423       if (!count || (count & 3) || (memset && !zero_memset))
27424 	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27425 	       ? rep_prefix_1_byte : loop_1_byte;
27426       else
27427 	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27428 	       ? rep_prefix_4_byte : loop;
27429     }
27430   /* Very tiny blocks are best handled via the loop; REP is expensive to
27431      set up.  */
27432   else if (expected_size != -1 && expected_size < 4)
27433     return loop_1_byte;
27434   else if (expected_size != -1)
27435     {
27436       enum stringop_alg alg = libcall;
27437       bool alg_noalign = false;
27438       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27439 	{
27440 	  /* We get here if the algorithms that were not libcall-based
27441 	     were rep-prefix based and we are unable to use rep prefixes
27442 	     based on global register usage.  Break out of the loop and
27443 	     use the heuristic below.  */
27444 	  if (algs->size[i].max == 0)
27445 	    break;
27446 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27447 	    {
27448 	      enum stringop_alg candidate = algs->size[i].alg;
27449 
27450 	      if (candidate != libcall
27451 		  && alg_usable_p (candidate, memset, have_as))
27452 		{
27453 		  alg = candidate;
27454 		  alg_noalign = algs->size[i].noalign;
27455 		}
27456 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27457 		 last non-libcall inline algorithm.  */
27458 	      if (TARGET_INLINE_ALL_STRINGOPS)
27459 		{
27460 		  /* When the current size is best to be copied by a libcall,
27461 		     but we are still forced to inline, run the heuristic below
27462 		     that will pick code for medium sized blocks.  */
27463 		  if (alg != libcall)
27464 		    {
27465 		      *noalign = alg_noalign;
27466 		      return alg;
27467 		    }
27468 		  else if (!any_alg_usable_p)
27469 		    break;
27470 		}
27471 	      else if (alg_usable_p (candidate, memset, have_as))
27472 		{
27473 		  *noalign = algs->size[i].noalign;
27474 		  return candidate;
27475 		}
27476 	    }
27477 	}
27478     }
27479   /* When asked to inline the call anyway, try to pick a meaningful choice.
27480      We look for the maximal size of block that is faster to copy by hand and
27481      take blocks of at most that size, guessing that the average size will
27482      be roughly half of the block.
27483 
27484      If this turns out to be bad, we might simply specify the preferred
27485      choice in ix86_costs.  */
27486   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27487       && (algs->unknown_size == libcall
27488 	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
27489     {
27490       enum stringop_alg alg;
27491       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27492 
27493       /* If there aren't any usable algorithms or if recursing already,
27494 	 then recursing on smaller sizes or same size isn't going to
27495 	 find anything.  Just return the simple byte-at-a-time copy loop.  */
27496       if (!any_alg_usable_p || recur)
27497 	{
27498 	  /* Pick something reasonable.  */
27499 	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27500 	    *dynamic_check = 128;
27501 	  return loop_1_byte;
27502 	}
27503       alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27504 			zero_memset, have_as, dynamic_check, noalign, true);
27505       gcc_assert (*dynamic_check == -1);
27506       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27507 	*dynamic_check = max;
27508       else
27509 	gcc_assert (alg != libcall);
27510       return alg;
27511     }
27512   return (alg_usable_p (algs->unknown_size, memset, have_as)
27513 	  ? algs->unknown_size : libcall);
27514 }
27515 
27516 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
27517    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
27518 static int
27519 decide_alignment (int align,
27520 		  enum stringop_alg alg,
27521 		  int expected_size,
27522 		  machine_mode move_mode)
27523 {
27524   int desired_align = 0;
27525 
27526   gcc_assert (alg != no_stringop);
27527 
27528   if (alg == libcall)
27529     return 0;
27530   if (move_mode == VOIDmode)
27531     return 0;
27532 
27533   desired_align = GET_MODE_SIZE (move_mode);
27534   /* PentiumPro has special logic triggering for 8-byte aligned blocks,
27535      copying a whole cacheline at once.  */
27536   if (TARGET_PENTIUMPRO
27537       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27538     desired_align = 8;
27539 
27540   if (optimize_size)
27541     desired_align = 1;
27542   if (desired_align < align)
27543     desired_align = align;
27544   if (expected_size != -1 && expected_size < 4)
27545     desired_align = align;
27546 
27547   return desired_align;
27548 }
27549 
27550 
27551 /* Helper function for memset expansion.  For a QImode value 0xXY produce
27552    0xXYXYXYXY of the width specified by MODE.  This is essentially
27553    a * 0x01010101 (0x0101010101010101 for DImode), but we can do slightly
27554    better than synth_mult by unwinding the sequence by hand on CPUs with
27555    slow multiply.  */
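/* Worked example: VAL == 0x41 becomes 0x41414141 in SImode and
   0x4141414141414141 in DImode; for a non-constant VAL the same result is
   built at run time either with a multiply or with a shift/insert-and-IOR
   sequence, whichever the cost model below prefers.  */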
27556 static rtx
27557 promote_duplicated_reg (machine_mode mode, rtx val)
27558 {
27559   machine_mode valmode = GET_MODE (val);
27560   rtx tmp;
27561   int nops = mode == DImode ? 3 : 2;
27562 
27563   gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27564   if (val == const0_rtx)
27565     return copy_to_mode_reg (mode, CONST0_RTX (mode));
27566   if (CONST_INT_P (val))
27567     {
27568       HOST_WIDE_INT v = INTVAL (val) & 255;
27569 
27570       v |= v << 8;
27571       v |= v << 16;
27572       if (mode == DImode)
27573         v |= (v << 16) << 16;
27574       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27575     }
27576 
27577   if (valmode == VOIDmode)
27578     valmode = QImode;
27579   if (valmode != QImode)
27580     val = gen_lowpart (QImode, val);
27581   if (mode == QImode)
27582     return val;
27583   if (!TARGET_PARTIAL_REG_STALL)
27584     nops--;
27585   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27586       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27587       <= (ix86_cost->shift_const + ix86_cost->add) * nops
27588           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27589     {
27590       rtx reg = convert_modes (mode, QImode, val, true);
27591       tmp = promote_duplicated_reg (mode, const1_rtx);
27592       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27593 				  OPTAB_DIRECT);
27594     }
27595   else
27596     {
27597       rtx reg = convert_modes (mode, QImode, val, true);
27598 
27599       if (!TARGET_PARTIAL_REG_STALL)
27600 	if (mode == SImode)
27601 	  emit_insn (gen_insvsi_1 (reg, reg));
27602 	else
27603 	  emit_insn (gen_insvdi_1 (reg, reg));
27604       else
27605 	{
27606 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27607 				     NULL, 1, OPTAB_DIRECT);
27608 	  reg =
27609 	    expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27610 	}
27611       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27612 			         NULL, 1, OPTAB_DIRECT);
27613       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27614       if (mode == SImode)
27615 	return reg;
27616       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27617 				 NULL, 1, OPTAB_DIRECT);
27618       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27619       return reg;
27620     }
27621 }
27622 
27623 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
27624    will be needed by the main loop copying SIZE_NEEDED chunks and by the
27625    prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
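/* For instance, a main loop moving 16-byte chunks (SIZE_NEEDED == 16) on a
   64-bit target gets a DImode-promoted value, while a plain byte loop with
   no alignment prologue keeps VAL unchanged.  */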
27626 static rtx
27627 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27628 				int align)
27629 {
27630   rtx promoted_val;
27631 
27632   if (TARGET_64BIT
27633       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27634     promoted_val = promote_duplicated_reg (DImode, val);
27635   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27636     promoted_val = promote_duplicated_reg (SImode, val);
27637   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27638     promoted_val = promote_duplicated_reg (HImode, val);
27639   else
27640     promoted_val = val;
27641 
27642   return promoted_val;
27643 }
27644 
27645 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
27646    operations when profitable.  The code depends upon architecture, block size
27647    and alignment, but always has one of the following overall structures:
27648 
27649    Aligned move sequence:
27650 
27651      1) Prologue guard: Conditional that jumps up to epilogues for small
27652 	blocks that can be handled by the epilogue alone.  This is faster
27653 	but also needed for correctness, since the prologue assumes the block
27654 	is larger than the desired alignment.
27655 
27656 	Optional dynamic check for size and libcall for large
27657 	blocks is emitted here too, with -minline-stringops-dynamically.
27658 
27659      2) Prologue: copy first few bytes in order to get destination
27660 	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
27661 	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27662 	copied.  We emit either a jump tree on power of two sized
27663 	blocks, or a byte loop.
27664 
27665      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27666 	with specified algorithm.
27667 
27668      4) Epilogue: code copying tail of the block that is too small to be
27669 	handled by main body (or up to size guarded by prologue guard).
27670 
27671   Misaligned move sequence
27672 
27673      1) misaligned move prologue/epilogue containing:
27674         a) Prologue handling small memory blocks and jumping to done_label
27675 	   (skipped if blocks are known to be large enough)
27676 	b) Single possibly misaligned move copying the first DESIRED_ALIGN-ALIGN
27677            bytes to reach the desired alignment
27678 	   (skipped if alignment is not needed)
27679         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27680 
27681      2) Zero size guard dispatching to done_label, if needed
27682 
27683      3) Dispatch to library call, if needed
27684 
27685      4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27686 	with specified algorithm.  */
27687 bool
27688 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27689 			   rtx align_exp, rtx expected_align_exp,
27690 			   rtx expected_size_exp, rtx min_size_exp,
27691 			   rtx max_size_exp, rtx probable_max_size_exp,
27692 			   bool issetmem)
27693 {
27694   rtx destreg;
27695   rtx srcreg = NULL;
27696   rtx_code_label *label = NULL;
27697   rtx tmp;
27698   rtx_code_label *jump_around_label = NULL;
27699   HOST_WIDE_INT align = 1;
27700   unsigned HOST_WIDE_INT count = 0;
27701   HOST_WIDE_INT expected_size = -1;
27702   int size_needed = 0, epilogue_size_needed;
27703   int desired_align = 0, align_bytes = 0;
27704   enum stringop_alg alg;
27705   rtx promoted_val = NULL;
27706   rtx vec_promoted_val = NULL;
27707   bool force_loopy_epilogue = false;
27708   int dynamic_check;
27709   bool need_zero_guard = false;
27710   bool noalign;
27711   machine_mode move_mode = VOIDmode;
27712   machine_mode wider_mode;
27713   int unroll_factor = 1;
27714   /* TODO: Once value ranges are available, fill in proper data.  */
27715   unsigned HOST_WIDE_INT min_size = 0;
27716   unsigned HOST_WIDE_INT max_size = -1;
27717   unsigned HOST_WIDE_INT probable_max_size = -1;
27718   bool misaligned_prologue_used = false;
27719   bool have_as;
27720 
27721   if (CONST_INT_P (align_exp))
27722     align = INTVAL (align_exp);
27723   /* i386 can do misaligned access at a reasonably increased cost.  */
27724   if (CONST_INT_P (expected_align_exp)
27725       && INTVAL (expected_align_exp) > align)
27726     align = INTVAL (expected_align_exp);
27727   /* ALIGN is the minimum of destination and source alignment, but we care here
27728      just about destination alignment.  */
27729   else if (!issetmem
27730 	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27731     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27732 
27733   if (CONST_INT_P (count_exp))
27734     {
27735       min_size = max_size = probable_max_size = count = expected_size
27736 	= INTVAL (count_exp);
27737       /* When COUNT is 0, there is nothing to do.  */
27738       if (!count)
27739 	return true;
27740     }
27741   else
27742     {
27743       if (min_size_exp)
27744 	min_size = INTVAL (min_size_exp);
27745       if (max_size_exp)
27746 	max_size = INTVAL (max_size_exp);
27747       if (probable_max_size_exp)
27748 	probable_max_size = INTVAL (probable_max_size_exp);
27749       if (CONST_INT_P (expected_size_exp))
27750 	expected_size = INTVAL (expected_size_exp);
27751      }
27752 
27753   /* Make sure we don't need to care about overflow later on.  */
27754   if (count > (HOST_WIDE_INT_1U << 30))
27755     return false;
27756 
27757   have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27758   if (!issetmem)
27759     have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27760 
27761   /* Step 0: Decide on preferred algorithm, desired alignment and
27762      size of chunks to be copied by main loop.  */
27763   alg = decide_alg (count, expected_size, min_size, probable_max_size,
27764 		    issetmem,
27765 		    issetmem && val_exp == const0_rtx, have_as,
27766 		    &dynamic_check, &noalign, false);
27767   if (alg == libcall)
27768     return false;
27769   gcc_assert (alg != no_stringop);
27770 
27771   /* For now the vector version of memset is generated only for memory zeroing,
27772      as creating the promoted vector value is very cheap in this case.  */
27773   if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27774     alg = unrolled_loop;
27775 
27776   if (!count)
27777     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27778   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27779   if (!issetmem)
27780     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27781 
27782   unroll_factor = 1;
27783   move_mode = word_mode;
27784   switch (alg)
27785     {
27786     case libcall:
27787     case no_stringop:
27788     case last_alg:
27789       gcc_unreachable ();
27790     case loop_1_byte:
27791       need_zero_guard = true;
27792       move_mode = QImode;
27793       break;
27794     case loop:
27795       need_zero_guard = true;
27796       break;
27797     case unrolled_loop:
27798       need_zero_guard = true;
27799       unroll_factor = (TARGET_64BIT ? 4 : 2);
27800       break;
27801     case vector_loop:
27802       need_zero_guard = true;
27803       unroll_factor = 4;
27804       /* Find the widest supported mode.  */
27805       move_mode = word_mode;
27806       while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27807 	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27808 	move_mode = wider_mode;
27809 
27810       if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27811 	move_mode = TImode;
27812 
27813       /* Find the corresponding vector mode with the same size as MOVE_MODE.
27814 	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
27815       if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27816 	{
27817 	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27818 	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27819 	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27820 	    move_mode = word_mode;
27821 	}
27822       gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27823       break;
27824     case rep_prefix_8_byte:
27825       move_mode = DImode;
27826       break;
27827     case rep_prefix_4_byte:
27828       move_mode = SImode;
27829       break;
27830     case rep_prefix_1_byte:
27831       move_mode = QImode;
27832       break;
27833     }
27834   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27835   epilogue_size_needed = size_needed;
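  /* E.g. the unrolled_loop algorithm on a 64-bit target moves
     8 * 4 == 32 bytes per main-loop iteration, leaving up to 31 trailing
     bytes for the epilogue.  */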
27836 
27837   /* If we are going to call any library calls conditionally, make sure any
27838      pending stack adjustments happen before the first conditional branch,
27839      otherwise they will be emitted before the library call only and won't
27840      happen from the other branches.  */
27841   if (dynamic_check != -1)
27842     do_pending_stack_adjust ();
27843 
27844   desired_align = decide_alignment (align, alg, expected_size, move_mode);
27845   if (!TARGET_ALIGN_STRINGOPS || noalign)
27846     align = desired_align;
27847 
27848   /* Step 1: Prologue guard.  */
27849 
27850   /* Alignment code needs count to be in register.  */
27851   if (CONST_INT_P (count_exp) && desired_align > align)
27852     {
27853       if (INTVAL (count_exp) > desired_align
27854 	  && INTVAL (count_exp) > size_needed)
27855 	{
27856 	  align_bytes
27857 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27858 	  if (align_bytes <= 0)
27859 	    align_bytes = 0;
27860 	  else
27861 	    align_bytes = desired_align - align_bytes;
27862 	}
27863       if (align_bytes == 0)
27864 	count_exp = force_reg (counter_mode (count_exp), count_exp);
27865     }
27866   gcc_assert (desired_align >= 1 && align >= 1);
27867 
27868   /* Misaligned move sequences handle both the prologue and epilogue at once.
27869      Default code generation results in smaller code for large alignments
27870      and also avoids redundant work when sizes are known precisely.  */
27871   misaligned_prologue_used
27872     = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27873        && MAX (desired_align, epilogue_size_needed) <= 32
27874        && desired_align <= epilogue_size_needed
27875        && ((desired_align > align && !align_bytes)
27876 	   || (!count && epilogue_size_needed > 1)));
27877 
27878   /* Do the cheap promotion to allow better CSE across the
27879      main loop and epilogue (i.e. one load of the big constant in
27880      front of all the code).
27881      For now the misaligned move sequences do not have a fast path
27882      without broadcasting.  */
27883   if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27884     {
27885       if (alg == vector_loop)
27886 	{
27887 	  gcc_assert (val_exp == const0_rtx);
27888 	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27889 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
27890 							 GET_MODE_SIZE (word_mode),
27891 							 desired_align, align);
27892 	}
27893       else
27894 	{
27895 	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27896 							 desired_align, align);
27897 	}
27898     }
27899   /* Misaligned move sequences handle both prologues and epilogues at once.
27900      Default code generation results in smaller code for large alignments and
27901      also avoids redundant work when sizes are known precisely.  */
27902   if (misaligned_prologue_used)
27903     {
27904       /* The misaligned move prologue handles small blocks by itself.  */
27905       expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27906 	   (dst, src, &destreg, &srcreg,
27907 	    move_mode, promoted_val, vec_promoted_val,
27908 	    &count_exp,
27909 	    &jump_around_label,
27910             desired_align < align
27911 	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27912 	    desired_align, align, &min_size, dynamic_check, issetmem);
27913       if (!issetmem)
27914         src = change_address (src, BLKmode, srcreg);
27915       dst = change_address (dst, BLKmode, destreg);
27916       set_mem_align (dst, desired_align * BITS_PER_UNIT);
27917       epilogue_size_needed = 0;
27918       if (need_zero_guard
27919 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
27920 	{
27921 	  /* It is possible that we copied enough so the main loop will not
27922 	     execute.  */
27923 	  gcc_assert (size_needed > 1);
27924 	  if (jump_around_label == NULL_RTX)
27925 	    jump_around_label = gen_label_rtx ();
27926 	  emit_cmp_and_jump_insns (count_exp,
27927 				   GEN_INT (size_needed),
27928 				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27929 	  if (expected_size == -1
27930 	      || expected_size < (desired_align - align) / 2 + size_needed)
27931 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
27932 	  else
27933 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
27934 	}
27935     }
27936   /* Ensure that alignment prologue won't copy past end of block.  */
27937   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27938     {
27939       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27940       /* The epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27941 	 Make sure it is a power of 2.  */
27942       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
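      /* For illustration (added note): with size_needed == 16 and no extra
	 alignment bytes, MAX (15, 0) == 15 and floor_log2 (15) == 3, so
	 epilogue_size_needed becomes 1 << 4 == 16, i.e. the smallest power
	 of 2 strictly greater than 15.  */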
27943 
27944       /* To improve the performance of small blocks, we jump around the VAL
27945 	 promoting code.  This means that if the promoted VAL is not constant,
27946 	 we might not use it in the epilogue and have to use the byte
27947 	 loop variant.  */
27948       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27949 	force_loopy_epilogue = true;
27950       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27951 	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27952 	{
27953 	  /* If main algorithm works on QImode, no epilogue is needed.
27954 	     For small sizes just don't align anything.  */
27955 	  if (size_needed == 1)
27956 	    desired_align = align;
27957 	  else
27958 	    goto epilogue;
27959 	}
27960       else if (!count
27961 	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27962 	{
27963 	  label = gen_label_rtx ();
27964 	  emit_cmp_and_jump_insns (count_exp,
27965 				   GEN_INT (epilogue_size_needed),
27966 				   LTU, 0, counter_mode (count_exp), 1, label);
27967 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
27968 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
27969 	  else
27970 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
27971 	}
27972     }
27973 
27974   /* Emit code to decide on runtime whether library call or inline should be
27975      used.  */
27976   if (dynamic_check != -1)
27977     {
27978       if (!issetmem && CONST_INT_P (count_exp))
27979 	{
27980 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27981 	    {
27982 	      emit_block_copy_via_libcall (dst, src, count_exp);
27983 	      count_exp = const0_rtx;
27984 	      goto epilogue;
27985 	    }
27986 	}
27987       else
27988 	{
27989 	  rtx_code_label *hot_label = gen_label_rtx ();
27990 	  if (jump_around_label == NULL_RTX)
27991 	    jump_around_label = gen_label_rtx ();
27992 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27993 				   LEU, 0, counter_mode (count_exp),
27994 				   1, hot_label);
27995 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
27996 	  if (issetmem)
27997 	    set_storage_via_libcall (dst, count_exp, val_exp);
27998 	  else
27999 	    emit_block_copy_via_libcall (dst, src, count_exp);
28000 	  emit_jump (jump_around_label);
28001 	  emit_label (hot_label);
28002 	}
28003     }
28004 
28005   /* Step 2: Alignment prologue.  */
28006   /* Do the expensive promotion once we branched off the small blocks.  */
28007   if (issetmem && !promoted_val)
28008     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28009 						   desired_align, align);
28010 
28011   if (desired_align > align && !misaligned_prologue_used)
28012     {
28013       if (align_bytes == 0)
28014 	{
28015 	  /* Except for the first move in the prologue, we no longer know
28016 	     the constant offset in the aliasing info.  It doesn't seem worth
28017 	     the pain to maintain it for the first move, so throw away
28018 	     the info early.  */
28019 	  dst = change_address (dst, BLKmode, destreg);
28020 	  if (!issetmem)
28021 	    src = change_address (src, BLKmode, srcreg);
28022 	  dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28023 					    promoted_val, vec_promoted_val,
28024 					    count_exp, align, desired_align,
28025 					    issetmem);
28026 	  /* At most desired_align - align bytes are copied.  */
28027 	  if (min_size < (unsigned)(desired_align - align))
28028 	    min_size = 0;
28029 	  else
28030 	    min_size -= desired_align - align;
28031 	}
28032       else
28033 	{
28034 	  /* If we know how many bytes need to be stored before dst is
28035 	     sufficiently aligned, maintain aliasing info accurately.  */
28036 	  dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28037 							   srcreg,
28038 							   promoted_val,
28039 							   vec_promoted_val,
28040 							   desired_align,
28041 							   align_bytes,
28042 							   issetmem);
28043 
28044 	  count_exp = plus_constant (counter_mode (count_exp),
28045 				     count_exp, -align_bytes);
28046 	  count -= align_bytes;
28047 	  min_size -= align_bytes;
28048 	  max_size -= align_bytes;
28049 	}
28050       if (need_zero_guard
28051 	  && min_size < (unsigned HOST_WIDE_INT) size_needed
28052 	  && (count < (unsigned HOST_WIDE_INT) size_needed
28053 	      || (align_bytes == 0
28054 		  && count < ((unsigned HOST_WIDE_INT) size_needed
28055 			      + desired_align - align))))
28056 	{
28057 	  /* It is possible that we copied enough so the main loop will not
28058 	     execute.  */
28059 	  gcc_assert (size_needed > 1);
28060 	  if (label == NULL_RTX)
28061 	    label = gen_label_rtx ();
28062 	  emit_cmp_and_jump_insns (count_exp,
28063 				   GEN_INT (size_needed),
28064 				   LTU, 0, counter_mode (count_exp), 1, label);
28065 	  if (expected_size == -1
28066 	      || expected_size < (desired_align - align) / 2 + size_needed)
28067 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
28068 	  else
28069 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
28070 	}
28071     }
28072   if (label && size_needed == 1)
28073     {
28074       emit_label (label);
28075       LABEL_NUSES (label) = 1;
28076       label = NULL;
28077       epilogue_size_needed = 1;
28078       if (issetmem)
28079 	promoted_val = val_exp;
28080     }
28081   else if (label == NULL_RTX && !misaligned_prologue_used)
28082     epilogue_size_needed = size_needed;
28083 
28084   /* Step 3: Main loop.  */
28085 
28086   switch (alg)
28087     {
28088     case libcall:
28089     case no_stringop:
28090     case last_alg:
28091       gcc_unreachable ();
28092     case loop_1_byte:
28093     case loop:
28094     case unrolled_loop:
28095       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28096 				     count_exp, move_mode, unroll_factor,
28097 				     expected_size, issetmem);
28098       break;
28099     case vector_loop:
28100       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28101 				     vec_promoted_val, count_exp, move_mode,
28102 				     unroll_factor, expected_size, issetmem);
28103       break;
28104     case rep_prefix_8_byte:
28105     case rep_prefix_4_byte:
28106     case rep_prefix_1_byte:
28107       expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28108 				       val_exp, count_exp, move_mode, issetmem);
28109       break;
28110     }
28111   /* Properly adjust the offsets of the src and dest memory for aliasing.  */
28112   if (CONST_INT_P (count_exp))
28113     {
28114       if (!issetmem)
28115 	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28116 					    (count / size_needed) * size_needed);
28117       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28118 					  (count / size_needed) * size_needed);
28119     }
28120   else
28121     {
28122       if (!issetmem)
28123 	src = change_address (src, BLKmode, srcreg);
28124       dst = change_address (dst, BLKmode, destreg);
28125     }
28126 
28127   /* Step 4: Epilogue to copy the remaining bytes.  */
28128  epilogue:
28129   if (label)
28130     {
28131       /* When the main loop is done, COUNT_EXP might hold the original count,
28132 	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28133 	 The epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28134 	 bytes.  Compensate if needed.  */
28135 
28136       if (size_needed < epilogue_size_needed)
28137 	{
28138 	  tmp =
28139 	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28140 				 GEN_INT (size_needed - 1), count_exp, 1,
28141 				 OPTAB_DIRECT);
28142 	  if (tmp != count_exp)
28143 	    emit_move_insn (count_exp, tmp);
28144 	}
28145       emit_label (label);
28146       LABEL_NUSES (label) = 1;
28147     }
28148 
28149   if (count_exp != const0_rtx && epilogue_size_needed > 1)
28150     {
28151       if (force_loopy_epilogue)
28152 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28153 					 epilogue_size_needed);
28154       else
28155 	{
28156 	  if (issetmem)
28157 	    expand_setmem_epilogue (dst, destreg, promoted_val,
28158 				    vec_promoted_val, count_exp,
28159 				    epilogue_size_needed);
28160 	  else
28161 	    expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28162 				    epilogue_size_needed);
28163 	}
28164     }
28165   if (jump_around_label)
28166     emit_label (jump_around_label);
28167   return true;
28168 }
28169 
28170 
28171 /* Expand the appropriate insns for doing strlen if not just doing
28172    repnz; scasb
28173 
28174    out = result, initialized with the start address
28175    align_rtx = alignment of the address.
28176    scratch = scratch register, initialized with the start address when
28177 	not aligned, otherwise undefined
28178 
28179    This is just the body. It needs the initializations mentioned above and
28180    some address computing at the end.  These things are done in i386.md.  */
28181 
28182 static void
28183 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28184 {
28185   int align;
28186   rtx tmp;
28187   rtx_code_label *align_2_label = NULL;
28188   rtx_code_label *align_3_label = NULL;
28189   rtx_code_label *align_4_label = gen_label_rtx ();
28190   rtx_code_label *end_0_label = gen_label_rtx ();
28191   rtx mem;
28192   rtx tmpreg = gen_reg_rtx (SImode);
28193   rtx scratch = gen_reg_rtx (SImode);
28194   rtx cmp;
28195 
28196   align = 0;
28197   if (CONST_INT_P (align_rtx))
28198     align = INTVAL (align_rtx);
28199 
28200   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
28201 
28202   /* Is there a known alignment and is it less than 4?  */
28203   if (align < 4)
28204     {
28205       rtx scratch1 = gen_reg_rtx (Pmode);
28206       emit_move_insn (scratch1, out);
28207       /* Is there a known alignment and is it not 2? */
28208       if (align != 2)
28209 	{
28210 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28211 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28212 
28213 	  /* Leave just the two lower bits.  */
28214 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28215 				    NULL_RTX, 0, OPTAB_WIDEN);
28216 
28217 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28218 				   Pmode, 1, align_4_label);
28219 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28220 				   Pmode, 1, align_2_label);
28221 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28222 				   Pmode, 1, align_3_label);
28223 	}
28224       else
28225         {
28226 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
28227 	     check whether it is aligned to a 4-byte boundary.  */
28228 
28229 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28230 				    NULL_RTX, 0, OPTAB_WIDEN);
28231 
28232 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28233 				   Pmode, 1, align_4_label);
28234         }
28235 
28236       mem = change_address (src, QImode, out);
28237 
28238       /* Now compare the bytes.  */
28239 
28240       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
28241       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28242 			       QImode, 1, end_0_label);
28243 
28244       /* Increment the address.  */
28245       emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28246 
28247       /* Not needed with an alignment of 2.  */
28248       if (align != 2)
28249 	{
28250 	  emit_label (align_2_label);
28251 
28252 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28253 				   end_0_label);
28254 
28255 	  emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28256 
28257 	  emit_label (align_3_label);
28258 	}
28259 
28260       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28261 			       end_0_label);
28262 
28263       emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28264     }
28265 
28266   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
28267      align this loop; it only makes the program larger and does not help to
28268      speed it up.  */
28269   emit_label (align_4_label);
28270 
28271   mem = change_address (src, SImode, out);
28272   emit_move_insn (scratch, mem);
28273   emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28274 
28275   /* This formula yields a nonzero result iff one of the bytes is zero.
28276      This saves three branches inside the loop and many cycles.  */
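  /* Worked example (added note): the value computed below is
     (x - 0x01010101) & ~x & 0x80808080, which is nonzero exactly when some
     byte of x is zero.  For x = 0x11002233:
	x - 0x01010101 = 0x0FFF2132,  ~x = 0xEEFFDDCC,
	(x - 0x01010101) & ~x = 0x0EFF0100,  & 0x80808080 = 0x00800000 != 0,
     flagging the zero byte held in bits 16..23 of x.  */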
28277 
28278   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28279   emit_insn (gen_one_cmplsi2 (scratch, scratch));
28280   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28281   emit_insn (gen_andsi3 (tmpreg, tmpreg,
28282 			 gen_int_mode (0x80808080, SImode)));
28283   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28284 			   align_4_label);
28285 
28286   if (TARGET_CMOVE)
28287     {
28288        rtx reg = gen_reg_rtx (SImode);
28289        rtx reg2 = gen_reg_rtx (Pmode);
28290        emit_move_insn (reg, tmpreg);
28291        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28292 
28293        /* If zero is not in the first two bytes, move two bytes forward.  */
28294        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28295        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28296        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28297        emit_insn (gen_rtx_SET (tmpreg,
28298 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
28299 						     reg,
28300 						     tmpreg)));
28301        /* Emit lea manually to avoid clobbering of flags.  */
28302        emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28303 
28304        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28305        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28306        emit_insn (gen_rtx_SET (out,
28307 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28308 						     reg2,
28309 						     out)));
28310     }
28311   else
28312     {
28313        rtx_code_label *end_2_label = gen_label_rtx ();
28314        /* Is zero in the first two bytes? */
28315 
28316        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28317        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28318        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28319        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28320                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28321                             pc_rtx);
28322        tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28323        JUMP_LABEL (tmp) = end_2_label;
28324 
28325        /* Not in the first two.  Move two bytes forward.  */
28326        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28327        emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28328 
28329        emit_label (end_2_label);
28330 
28331     }
28332 
28333   /* Avoid branch in fixing the byte.  */
28334   tmpreg = gen_lowpart (QImode, tmpreg);
28335   emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28336   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28337   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28338   emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28339 
28340   emit_label (end_0_label);
28341 }
28342 
28343 /* Expand strlen.  */
28344 
28345 bool
28346 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28347 {
28348   rtx addr, scratch1, scratch2, scratch3, scratch4;
28349 
28350   /* The generic case of the strlen expander is long.  Avoid
28351      expanding it unless TARGET_INLINE_ALL_STRINGOPS.  */
28352 
28353   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28354       && !TARGET_INLINE_ALL_STRINGOPS
28355       && !optimize_insn_for_size_p ()
28356       && (!CONST_INT_P (align) || INTVAL (align) < 4))
28357     return false;
28358 
28359   addr = force_reg (Pmode, XEXP (src, 0));
28360   scratch1 = gen_reg_rtx (Pmode);
28361 
28362   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28363       && !optimize_insn_for_size_p ())
28364     {
28365       /* Well, it seems that some optimizer does not combine a call like
28366          foo(strlen(bar), strlen(bar));
28367          when the move and the subtraction are done here.  It does calculate
28368          the length just once when these instructions are done inside
28369          output_strlen_unroll().  But I think that since &bar[strlen(bar)] is
28370          often used and I use one fewer register for the lifetime of
28371          output_strlen_unroll() this is better.  */
28372 
28373       emit_move_insn (out, addr);
28374 
28375       ix86_expand_strlensi_unroll_1 (out, src, align);
28376 
28377       /* strlensi_unroll_1 returns the address of the zero at the end of
28378          the string, like memchr(), so compute the length by subtracting
28379          the start address.  */
28380       emit_insn (ix86_gen_sub3 (out, out, addr));
28381     }
28382   else
28383     {
28384       rtx unspec;
28385 
28386       /* Can't use this if the user has appropriated eax, ecx, or edi.  */
28387       if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28388         return false;
28389       /* Can't use this for non-default address spaces.  */
28390       if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28391 	return false;
28392 
28393       scratch2 = gen_reg_rtx (Pmode);
28394       scratch3 = gen_reg_rtx (Pmode);
28395       scratch4 = force_reg (Pmode, constm1_rtx);
28396 
28397       emit_move_insn (scratch3, addr);
28398       eoschar = force_reg (QImode, eoschar);
28399 
28400       src = replace_equiv_address_nv (src, scratch3);
28401 
28402       /* If .md starts supporting :P, this can be done in .md.  */
28403       unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28404 						 scratch4), UNSPEC_SCAS);
28405       emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28406       emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28407       emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28408     }
28409   return true;
28410 }
28411 
28412 /* For a given symbol (function), construct code to compute the address of its
28413    PLT entry in the large x86-64 PIC model.  */
28414 static rtx
28415 construct_plt_address (rtx symbol)
28416 {
28417   rtx tmp, unspec;
28418 
28419   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28420   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28421   gcc_assert (Pmode == DImode);
28422 
28423   tmp = gen_reg_rtx (Pmode);
28424   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28425 
28426   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28427   emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28428   return tmp;
28429 }
28430 
28431 rtx
28432 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28433 		  rtx callarg2,
28434 		  rtx pop, bool sibcall)
28435 {
28436   rtx vec[3];
28437   rtx use = NULL, call;
28438   unsigned int vec_len = 0;
28439   tree fndecl;
28440 
28441   if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28442     {
28443       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28444       if (fndecl
28445 	  && (lookup_attribute ("interrupt",
28446 				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28447 	error ("interrupt service routine can't be called directly");
28448     }
28449   else
28450     fndecl = NULL_TREE;
28451 
28452   if (pop == const0_rtx)
28453     pop = NULL;
28454   gcc_assert (!TARGET_64BIT || !pop);
28455 
28456   if (TARGET_MACHO && !TARGET_64BIT)
28457     {
28458 #if TARGET_MACHO
28459       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28460 	fnaddr = machopic_indirect_call_target (fnaddr);
28461 #endif
28462     }
28463   else
28464     {
28465       /* Static functions and indirect calls don't need the PIC register.  Also,
28466 	 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
28467 	 attribute, making it an indirect call.  */
28468       rtx addr = XEXP (fnaddr, 0);
28469       if (flag_pic
28470 	  && GET_CODE (addr) == SYMBOL_REF
28471 	  && !SYMBOL_REF_LOCAL_P (addr))
28472 	{
28473 	  if (flag_plt
28474 	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
28475 		  || !lookup_attribute ("noplt",
28476 					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28477 	    {
28478 	      if (!TARGET_64BIT
28479 		  || (ix86_cmodel == CM_LARGE_PIC
28480 		      && DEFAULT_ABI != MS_ABI))
28481 		{
28482 		  use_reg (&use, gen_rtx_REG (Pmode,
28483 					      REAL_PIC_OFFSET_TABLE_REGNUM));
28484 		  if (ix86_use_pseudo_pic_reg ())
28485 		    emit_move_insn (gen_rtx_REG (Pmode,
28486 						 REAL_PIC_OFFSET_TABLE_REGNUM),
28487 				    pic_offset_table_rtx);
28488 		}
28489 	    }
28490 	  else if (!TARGET_PECOFF && !TARGET_MACHO)
28491 	    {
28492 	      if (TARGET_64BIT)
28493 		{
28494 		  fnaddr = gen_rtx_UNSPEC (Pmode,
28495 					   gen_rtvec (1, addr),
28496 					   UNSPEC_GOTPCREL);
28497 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28498 		}
28499 	      else
28500 		{
28501 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28502 					   UNSPEC_GOT);
28503 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28504 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28505 					 fnaddr);
28506 		}
28507 	      fnaddr = gen_const_mem (Pmode, fnaddr);
28508 	      /* Pmode may not be the same as word_mode for x32, which
28509 		 doesn't support indirect branch via 32-bit memory slot.
28510 		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28511 		 indirect branch via x32 GOT slot is OK.  */
28512 	      if (GET_MODE (fnaddr) != word_mode)
28513 		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28514 	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
28515 	    }
28516 	}
28517     }
28518 
28519   /* Skip setting up RAX register for -mskip-rax-setup when there are no
28520      parameters passed in vector registers.  */
28521   if (TARGET_64BIT
28522       && (INTVAL (callarg2) > 0
28523 	  || (INTVAL (callarg2) == 0
28524 	      && (TARGET_SSE || !flag_skip_rax_setup))))
28525     {
28526       rtx al = gen_rtx_REG (QImode, AX_REG);
28527       emit_move_insn (al, callarg2);
28528       use_reg (&use, al);
28529     }
28530 
28531   if (ix86_cmodel == CM_LARGE_PIC
28532       && !TARGET_PECOFF
28533       && MEM_P (fnaddr)
28534       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28535       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28536     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28537   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28538      branch via x32 GOT slot is OK.  */
28539   else if (!(TARGET_X32
28540 	     && MEM_P (fnaddr)
28541 	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28542 	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28543 	   && (sibcall
28544 	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28545 	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28546     {
28547       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28548       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28549     }
28550 
28551   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28552 
28553   if (retval)
28554     {
28555       /* We should add bounds as a destination register in case a
28556 	 pointer with bounds may be returned.  */
28557       if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28558 	{
28559 	  rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28560 	  rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28561 	  if (GET_CODE (retval) == PARALLEL)
28562 	    {
28563 	      b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28564 	      b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28565 	      rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28566 	      retval = chkp_join_splitted_slot (retval, par);
28567 	    }
28568 	  else
28569 	    {
28570 	      retval = gen_rtx_PARALLEL (VOIDmode,
28571 					 gen_rtvec (3, retval, b0, b1));
28572 	      chkp_put_regs_to_expr_list (retval);
28573 	    }
28574 	}
28575 
28576       call = gen_rtx_SET (retval, call);
28577     }
28578   vec[vec_len++] = call;
28579 
28580   if (pop)
28581     {
28582       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28583       pop = gen_rtx_SET (stack_pointer_rtx, pop);
28584       vec[vec_len++] = pop;
28585     }
28586 
28587   if (cfun->machine->no_caller_saved_registers
28588       && (!fndecl
28589 	  || (!TREE_THIS_VOLATILE (fndecl)
28590 	      && !lookup_attribute ("no_caller_saved_registers",
28591 				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28592     {
28593       static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28594       bool is_64bit_ms_abi = (TARGET_64BIT
28595 			      && ix86_function_abi (fndecl) == MS_ABI);
28596       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28597 
28598       /* If there are no caller-saved registers, add all registers
28599 	 that are clobbered by the call which returns.  */
28600       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28601 	if (!fixed_regs[i]
28602 	    && (ix86_call_used_regs[i] == 1
28603 		|| (ix86_call_used_regs[i] & c_mask))
28604 	    && !STACK_REGNO_P (i)
28605 	    && !MMX_REGNO_P (i))
28606 	  clobber_reg (&use,
28607 		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28608     }
28609   else if (TARGET_64BIT_MS_ABI
28610 	   && (!callarg2 || INTVAL (callarg2) != -2))
28611     {
28612       unsigned i;
28613 
28614       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28615 	{
28616 	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28617 	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28618 
28619 	  clobber_reg (&use, gen_rtx_REG (mode, regno));
28620 	}
28621 
28622       /* Set here, but it may get cleared later.  */
28623       if (TARGET_CALL_MS2SYSV_XLOGUES)
28624 	{
28625 	  if (!TARGET_SSE)
28626 	    ;
28627 
28628 	  /* Don't break hot-patched functions.  */
28629 	  else if (ix86_function_ms_hook_prologue (current_function_decl))
28630 	    ;
28631 
28632 	  /* TODO: Cases not yet examined.  */
28633 	  else if (flag_split_stack)
28634 	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28635 
28636 	  else
28637 	    {
28638 	      gcc_assert (!reload_completed);
28639 	      cfun->machine->call_ms2sysv = true;
28640 	    }
28641 	}
28642     }
28643 
28644   if (vec_len > 1)
28645     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28646   call = emit_call_insn (call);
28647   if (use)
28648     CALL_INSN_FUNCTION_USAGE (call) = use;
28649 
28650   return call;
28651 }
28652 
28653 /* Return true if the function being called was marked with attribute
28654    "noplt" or using -fno-plt and we are compiling for non-PIC.  We need
28655    to handle the non-PIC case in the backend because there is no easy
28656    interface for the front-end to force non-PLT calls to use the GOT.
28657    This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28658    to call the function marked "noplt" indirectly.  */
28659 
28660 static bool
28661 ix86_nopic_noplt_attribute_p (rtx call_op)
28662 {
28663   if (flag_pic || ix86_cmodel == CM_LARGE
28664       || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28665       || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28666       || SYMBOL_REF_LOCAL_P (call_op))
28667     return false;
28668 
28669   tree symbol_decl = SYMBOL_REF_DECL (call_op);
28670 
28671   if (!flag_plt
28672       || (symbol_decl != NULL_TREE
28673           && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28674     return true;
28675 
28676   return false;
28677 }
28678 
28679 /* Output indirect branch via a call and return thunk.  CALL_OP is a
28680    register which contains the branch target.  The branch is a tail
28681    call if SIBCALL_P is true.
28682    A normal call is converted to:
28683 
28684 	call __x86_indirect_thunk_reg
28685 
28686    and a tail call is converted to:
28687 
28688 	jmp __x86_indirect_thunk_reg
28689  */
28690 
28691 static void
28692 ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
28693 {
28694   char thunk_name_buf[32];
28695   char *thunk_name;
28696   enum indirect_thunk_prefix need_prefix
28697     = indirect_thunk_need_prefix (current_output_insn);
28698   int regno = REGNO (call_op);
28699 
28700   if (cfun->machine->indirect_branch_type
28701       != indirect_branch_thunk_inline)
28702     {
28703       if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28704 	{
28705 	  int i = regno;
28706 	  if (i >= FIRST_REX_INT_REG)
28707 	    i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
28708 	  if (need_prefix == indirect_thunk_prefix_bnd)
28709 	    indirect_thunks_bnd_used |= 1 << i;
28710 	  else
28711 	    indirect_thunks_used |= 1 << i;
28712 	}
28713       indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28714       thunk_name = thunk_name_buf;
28715     }
28716   else
28717     thunk_name = NULL;
28718 
28719   if (sibcall_p)
28720     {
28721       if (thunk_name != NULL)
28722 	{
28723 	  if (need_prefix == indirect_thunk_prefix_bnd)
28724 	    fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28725 	  else
28726 	    fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28727 	}
28728       else
28729 	output_indirect_thunk (need_prefix, regno);
28730     }
28731   else
28732     {
28733       if (thunk_name != NULL)
28734 	{
28735 	  if (need_prefix == indirect_thunk_prefix_bnd)
28736 	    fprintf (asm_out_file, "\tbnd call\t%s\n", thunk_name);
28737 	  else
28738 	    fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
28739 	  return;
28740 	}
28741 
28742       char indirectlabel1[32];
28743       char indirectlabel2[32];
28744 
28745       ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28746 				   INDIRECT_LABEL,
28747 				   indirectlabelno++);
28748       ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28749 				   INDIRECT_LABEL,
28750 				   indirectlabelno++);
28751 
28752       /* Jump.  */
28753       if (need_prefix == indirect_thunk_prefix_bnd)
28754 	fputs ("\tbnd jmp\t", asm_out_file);
28755       else
28756 	fputs ("\tjmp\t", asm_out_file);
28757       assemble_name_raw (asm_out_file, indirectlabel2);
28758       fputc ('\n', asm_out_file);
28759 
28760       ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28761 
28762       if (thunk_name != NULL)
28763 	{
28764 	  if (need_prefix == indirect_thunk_prefix_bnd)
28765 	    fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28766 	  else
28767 	    fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28768 	}
28769       else
28770 	output_indirect_thunk (need_prefix, regno);
28771 
28772       ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28773 
28774       /* Call.  */
28775       if (need_prefix == indirect_thunk_prefix_bnd)
28776 	fputs ("\tbnd call\t", asm_out_file);
28777       else
28778 	fputs ("\tcall\t", asm_out_file);
28779       assemble_name_raw (asm_out_file, indirectlabel1);
28780       fputc ('\n', asm_out_file);
28781     }
28782 }
28783 
28784 /* Output indirect branch via a call and return thunk.  CALL_OP is
28785    the branch target.  XASM is the assembly template for CALL_OP.
28786    Branch is a tail call if SIBCALL_P is true.  A normal call is
28787    converted to:
28788 
28789 	jmp L2
28790    L1:
28791 	push CALL_OP
28792 	jmp __x86_indirect_thunk
28793    L2:
28794 	call L1
28795 
28796    and a tail call is converted to:
28797 
28798 	push CALL_OP
28799 	jmp __x86_indirect_thunk
28800  */
28801 
28802 static void
28803 ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
28804 				      bool sibcall_p)
28805 {
28806   char thunk_name_buf[32];
28807   char *thunk_name;
28808   char push_buf[64];
28809   enum indirect_thunk_prefix need_prefix
28810     = indirect_thunk_need_prefix (current_output_insn);
28811   int regno = -1;
28812 
28813   if (cfun->machine->indirect_branch_type
28814       != indirect_branch_thunk_inline)
28815     {
28816       if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28817 	{
28818 	  if (need_prefix == indirect_thunk_prefix_bnd)
28819 	    indirect_thunk_bnd_needed = true;
28820 	  else
28821 	    indirect_thunk_needed = true;
28822 	}
28823       indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28824       thunk_name = thunk_name_buf;
28825     }
28826   else
28827     thunk_name = NULL;
28828 
28829   snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
28830 	    TARGET_64BIT ? 'q' : 'l', xasm);
28831 
28832   if (sibcall_p)
28833     {
28834       output_asm_insn (push_buf, &call_op);
28835       if (thunk_name != NULL)
28836 	{
28837 	  if (need_prefix == indirect_thunk_prefix_bnd)
28838 	    fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28839 	  else
28840 	    fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28841 	}
28842       else
28843 	output_indirect_thunk (need_prefix, regno);
28844     }
28845   else
28846     {
28847       char indirectlabel1[32];
28848       char indirectlabel2[32];
28849 
28850       ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28851 				   INDIRECT_LABEL,
28852 				   indirectlabelno++);
28853       ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28854 				   INDIRECT_LABEL,
28855 				   indirectlabelno++);
28856 
28857       /* Jump.  */
28858       if (need_prefix == indirect_thunk_prefix_bnd)
28859 	fputs ("\tbnd jmp\t", asm_out_file);
28860       else
28861 	fputs ("\tjmp\t", asm_out_file);
28862       assemble_name_raw (asm_out_file, indirectlabel2);
28863       fputc ('\n', asm_out_file);
28864 
28865       ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28866 
28867       /* An external function may be called via GOT, instead of PLT.  */
28868       if (MEM_P (call_op))
28869 	{
28870 	  struct ix86_address parts;
28871 	  rtx addr = XEXP (call_op, 0);
28872 	  if (ix86_decompose_address (addr, &parts)
28873 	      && parts.base == stack_pointer_rtx)
28874 	    {
28875 	      /* Since call will adjust stack by -UNITS_PER_WORD,
28876 		 we must convert "disp(stack, index, scale)" to
28877 		 "disp+UNITS_PER_WORD(stack, index, scale)".  */
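	      /* For example (added note), on x86-64 where UNITS_PER_WORD is 8,
		 an operand such as 8(%rsp) has to be rewritten as 16(%rsp),
		 because the "call" emitted below has already pushed its
		 return address by the time the push of CALL_OP executes.  */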
28878 	      if (parts.index)
28879 		{
28880 		  addr = gen_rtx_MULT (Pmode, parts.index,
28881 				       GEN_INT (parts.scale));
28882 		  addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
28883 				       addr);
28884 		}
28885 	      else
28886 		addr = stack_pointer_rtx;
28887 
28888 	      rtx disp;
28889 	      if (parts.disp != NULL_RTX)
28890 		disp = plus_constant (Pmode, parts.disp,
28891 				      UNITS_PER_WORD);
28892 	      else
28893 		disp = GEN_INT (UNITS_PER_WORD);
28894 
28895 	      addr = gen_rtx_PLUS (Pmode, addr, disp);
28896 	      call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
28897 	    }
28898 	}
28899 
28900       output_asm_insn (push_buf, &call_op);
28901 
28902       if (thunk_name != NULL)
28903 	{
28904 	  if (need_prefix == indirect_thunk_prefix_bnd)
28905 	    fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28906 	  else
28907 	    fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28908 	}
28909       else
28910 	output_indirect_thunk (need_prefix, regno);
28911 
28912       ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28913 
28914       /* Call.  */
28915       if (need_prefix == indirect_thunk_prefix_bnd)
28916 	fputs ("\tbnd call\t", asm_out_file);
28917       else
28918 	fputs ("\tcall\t", asm_out_file);
28919       assemble_name_raw (asm_out_file, indirectlabel1);
28920       fputc ('\n', asm_out_file);
28921     }
28922 }
28923 
28924 /* Output indirect branch via a call and return thunk.  CALL_OP is
28925    the branch target.  XASM is the assembly template for CALL_OP.
28926    Branch is a tail call if SIBCALL_P is true.   */
28927 
28928 static void
28929 ix86_output_indirect_branch (rtx call_op, const char *xasm,
28930 			     bool sibcall_p)
28931 {
28932   if (REG_P (call_op))
28933     ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
28934   else
28935     ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
28936 }
28937 
28938 /* Output indirect jump.  CALL_OP is the jump target.  */
28939 
28940 const char *
28941 ix86_output_indirect_jmp (rtx call_op)
28942 {
28943   if (cfun->machine->indirect_branch_type != indirect_branch_keep)
28944     {
28945       /* We can't have a red zone since "call" in the indirect thunk
28946          pushes the return address onto the stack, destroying the red zone.  */
28947       if (ix86_red_zone_size != 0)
28948 	gcc_unreachable ();
28949 
28950       ix86_output_indirect_branch (call_op, "%0", true);
28951       return "";
28952     }
28953   else
28954     return "%!jmp\t%A0";
28955 }
28956 
28957 /* Output the function return.  Add a REP prefix to RET if LONG_P is
28958    true and the function return is kept.  */
28959 
28960 const char *
28961 ix86_output_function_return (bool long_p)
28962 {
28963   if (cfun->machine->function_return_type != indirect_branch_keep)
28964     {
28965       char thunk_name[32];
28966       enum indirect_thunk_prefix need_prefix
28967 	= indirect_thunk_need_prefix (current_output_insn);
28968 
28969       if (cfun->machine->function_return_type
28970 	  != indirect_branch_thunk_inline)
28971 	{
28972 	  bool need_thunk = (cfun->machine->function_return_type
28973 			     == indirect_branch_thunk);
28974 	  indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix,
28975 			       true);
28976 	  if (need_prefix == indirect_thunk_prefix_bnd)
28977 	    {
28978 	      indirect_return_bnd_needed |= need_thunk;
28979 	      fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28980 	    }
28981 	  else
28982 	    {
28983 	      indirect_return_needed |= need_thunk;
28984 	      fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28985 	    }
28986 	}
28987       else
28988 	output_indirect_thunk (need_prefix, INVALID_REGNUM);
28989 
28990       return "";
28991     }
28992 
28993   if (!long_p || ix86_bnd_prefixed_insn_p (current_output_insn))
28994     return "%!ret";
28995 
28996   return "rep%; ret";
28997 }
28998 
28999 /* Output indirect function return.  RET_OP is the function return
29000    target.  */
29001 
29002 const char *
29003 ix86_output_indirect_function_return (rtx ret_op)
29004 {
29005   if (cfun->machine->function_return_type != indirect_branch_keep)
29006     {
29007       char thunk_name[32];
29008       enum indirect_thunk_prefix need_prefix
29009 	= indirect_thunk_need_prefix (current_output_insn);
29010       unsigned int regno = REGNO (ret_op);
29011       gcc_assert (regno == CX_REG);
29012 
29013       if (cfun->machine->function_return_type
29014 	  != indirect_branch_thunk_inline)
29015 	{
29016 	  bool need_thunk = (cfun->machine->function_return_type
29017 			     == indirect_branch_thunk);
29018 	  indirect_thunk_name (thunk_name, regno, need_prefix, true);
29019 	  if (need_prefix == indirect_thunk_prefix_bnd)
29020 	    {
29021 	      if (need_thunk)
29022 		{
29023 		  indirect_return_via_cx_bnd = true;
29024 		  indirect_thunks_bnd_used |= 1 << CX_REG;
29025 		}
29026 	      fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
29027 	    }
29028 	  else
29029 	    {
29030 	      if (need_thunk)
29031 		{
29032 		  indirect_return_via_cx = true;
29033 		  indirect_thunks_used |= 1 << CX_REG;
29034 		}
29035 	      fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
29036 	    }
29037 	}
29038       else
29039 	output_indirect_thunk (need_prefix, regno);
29040 
29041       return "";
29042     }
29043   else
29044     return "%!jmp\t%A0";
29045 }
29046 
29047 /* Split a simple return that pops POPC bytes from the stack into an
29048    indirect branch with a stack adjustment.  */
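/* For instance (added note), on ia32 a "ret $8" is in effect replaced by

	popl	%ecx
	addl	$8, %esp
	jmp	*%ecx

   which corresponds to the pop, stack-pointer adjustment, and indirect
   jump emitted below.  */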
29049 
29050 void
29051 ix86_split_simple_return_pop_internal (rtx popc)
29052 {
29053   struct machine_function *m = cfun->machine;
29054   rtx ecx = gen_rtx_REG (SImode, CX_REG);
29055   rtx_insn *insn;
29056 
29057   /* There is no "pascal" calling convention in any 64-bit ABI.  */
29058   gcc_assert (!TARGET_64BIT);
29059 
29060   insn = emit_insn (gen_pop (ecx));
29061   m->fs.cfa_offset -= UNITS_PER_WORD;
29062   m->fs.sp_offset -= UNITS_PER_WORD;
29063 
29064   rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
29065   x = gen_rtx_SET (stack_pointer_rtx, x);
29066   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29067   add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
29068   RTX_FRAME_RELATED_P (insn) = 1;
29069 
29070   x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
29071   x = gen_rtx_SET (stack_pointer_rtx, x);
29072   insn = emit_insn (x);
29073   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29074   RTX_FRAME_RELATED_P (insn) = 1;
29075 
29076   /* Now return address is in ECX.  */
29077   emit_jump_insn (gen_simple_return_indirect_internal (ecx));
29078 }
29079 
29080 /* Output the assembly for a call instruction.  */
29081 
29082 const char *
29083 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29084 {
29085   bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29086   bool output_indirect_p
29087     = (!TARGET_SEH
29088        && cfun->machine->indirect_branch_type != indirect_branch_keep);
29089   bool seh_nop_p = false;
29090   const char *xasm;
29091 
29092   if (SIBLING_CALL_P (insn))
29093     {
29094       if (direct_p)
29095 	{
29096 	  if (ix86_nopic_noplt_attribute_p (call_op))
29097 	    {
29098 	      direct_p = false;
29099 	      if (TARGET_64BIT)
29100 		{
29101 		  if (output_indirect_p)
29102 		    xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29103 		  else
29104 		    xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29105 		}
29106 	      else
29107 		{
29108 		  if (output_indirect_p)
29109 		    xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29110 		  else
29111 		    xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29112 		}
29113 	    }
29114 	  else
29115 	    xasm = "%!jmp\t%P0";
29116 	}
29117       /* SEH epilogue detection requires the indirect branch case
29118 	 to include REX.W.  */
29119       else if (TARGET_SEH)
29120 	xasm = "%!rex.W jmp\t%A0";
29121       else
29122 	{
29123 	  if (output_indirect_p)
29124 	    xasm = "%0";
29125 	  else
29126 	    xasm = "%!jmp\t%A0";
29127 	}
29128 
29129       if (output_indirect_p && !direct_p)
29130 	ix86_output_indirect_branch (call_op, xasm, true);
29131       else
29132 	output_asm_insn (xasm, &call_op);
29133       return "";
29134     }
29135 
29136   /* SEH unwinding can require an extra nop to be emitted in several
29137      circumstances.  Determine if we have one of those.  */
29138   if (TARGET_SEH)
29139     {
29140       rtx_insn *i;
29141 
29142       for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29143 	{
29144 	  /* Prevent a catch region from being adjacent to a jump that would
29145 	     be interpreted as an epilogue sequence by the unwinder.  */
29146 	  if (JUMP_P(i) && CROSSING_JUMP_P (i))
29147 	    {
29148 	      seh_nop_p = true;
29149 	      break;
29150 	    }
29151 
29152 	  /* If we get to another real insn, we don't need the nop.  */
29153 	  if (INSN_P (i))
29154 	    break;
29155 
29156 	  /* If we get to the epilogue note, prevent a catch region from
29157 	     being adjacent to the standard epilogue sequence.  If non-call
29158 	     exceptions are enabled, we'll have done this during epilogue
29159 	     emission.  */
29159 	  if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29160 	      && !flag_non_call_exceptions
29161 	      && !can_throw_internal (insn))
29162 	    {
29163 	      seh_nop_p = true;
29164 	      break;
29165 	    }
29166 	}
29167 
29168       /* If we didn't find a real insn following the call, prevent the
29169 	 unwinder from looking into the next function.  */
29170       if (i == NULL)
29171 	seh_nop_p = true;
29172     }
29173 
29174   if (direct_p)
29175     {
29176       if (ix86_nopic_noplt_attribute_p (call_op))
29177 	{
29178 	  direct_p = false;
29179 	  if (TARGET_64BIT)
29180 	    {
29181 	      if (output_indirect_p)
29182 		xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29183 	      else
29184 		xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29185 	    }
29186 	  else
29187 	    {
29188 	      if (output_indirect_p)
29189 		xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29190 	      else
29191 		xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29192 	    }
29193 	}
29194       else
29195 	xasm = "%!call\t%P0";
29196     }
29197   else
29198     {
29199       if (output_indirect_p)
29200 	xasm = "%0";
29201       else
29202 	xasm = "%!call\t%A0";
29203     }
29204 
29205   if (output_indirect_p && !direct_p)
29206     ix86_output_indirect_branch (call_op, xasm, false);
29207   else
29208     output_asm_insn (xasm, &call_op);
29209 
29210   if (seh_nop_p)
29211     return "nop";
29212 
29213   return "";
29214 }
29215 
29216 /* Clear stack slot assignments remembered from previous functions.
29217    This is called from INIT_EXPANDERS once before RTL is emitted for each
29218    function.  */
29219 
29220 static struct machine_function *
29221 ix86_init_machine_status (void)
29222 {
29223   struct machine_function *f;
29224 
29225   f = ggc_cleared_alloc<machine_function> ();
29226   f->call_abi = ix86_abi;
29227 
29228   return f;
29229 }
29230 
29231 /* Return a MEM corresponding to a stack slot with mode MODE.
29232    Allocate a new slot if necessary.
29233 
29234    The RTL for a function can have several slots available: N is
29235    which slot to use.  */
29236 
29237 rtx
29238 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29239 {
29240   struct stack_local_entry *s;
29241 
29242   gcc_assert (n < MAX_386_STACK_LOCALS);
29243 
29244   for (s = ix86_stack_locals; s; s = s->next)
29245     if (s->mode == mode && s->n == n)
29246       return validize_mem (copy_rtx (s->rtl));
29247 
29248   s = ggc_alloc<stack_local_entry> ();
29249   s->n = n;
29250   s->mode = mode;
29251   s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29252 
29253   s->next = ix86_stack_locals;
29254   ix86_stack_locals = s;
29255   return validize_mem (copy_rtx (s->rtl));
29256 }
29257 
29258 static void
29259 ix86_instantiate_decls (void)
29260 {
29261   struct stack_local_entry *s;
29262 
29263   for (s = ix86_stack_locals; s; s = s->next)
29264     if (s->rtl != NULL_RTX)
29265       instantiate_decl_rtl (s->rtl);
29266 }
29267 
29268 /* Return the number used for encoding REG, in the range 0..7.  */
29269 
29270 static int
29271 reg_encoded_number (rtx reg)
29272 {
29273   unsigned regno = REGNO (reg);
29274   switch (regno)
29275     {
29276     case AX_REG:
29277       return 0;
29278     case CX_REG:
29279       return 1;
29280     case DX_REG:
29281       return 2;
29282     case BX_REG:
29283       return 3;
29284     case SP_REG:
29285       return 4;
29286     case BP_REG:
29287       return 5;
29288     case SI_REG:
29289       return 6;
29290     case DI_REG:
29291       return 7;
29292     default:
29293       break;
29294     }
29295   if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29296     return regno - FIRST_STACK_REG;
29297   if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29298     return regno - FIRST_SSE_REG;
29299   if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29300     return regno - FIRST_MMX_REG;
29301   if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29302     return regno - FIRST_REX_SSE_REG;
29303   if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29304     return regno - FIRST_REX_INT_REG;
29305   if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29306     return regno - FIRST_MASK_REG;
29307   if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29308     return regno - FIRST_BND_REG;
29309   return -1;
29310 }
29311 
29312 /* Given an insn INSN whose NOPERANDS operands are in OPERANDS, return the
29313    modr/m byte used in its encoding if it could be relevant for ROP
29314    mitigation, otherwise return -1.  If POPNO0 and POPNO1 are nonnull,
29315    store the operand numbers used for calculating it into them.  */
29316 
29317 static int
29318 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29319 			int *popno0 = 0, int *popno1 = 0)
29320 {
29321   if (asm_noperands (PATTERN (insn)) >= 0)
29322     return -1;
29323   int has_modrm = get_attr_modrm (insn);
29324   if (!has_modrm)
29325     return -1;
29326   enum attr_modrm_class cls = get_attr_modrm_class (insn);
29327   rtx op0, op1;
29328   switch (cls)
29329     {
29330     case MODRM_CLASS_OP02:
29331       gcc_assert (noperands >= 3);
29332       if (popno0)
29333 	{
29334 	  *popno0 = 0;
29335 	  *popno1 = 2;
29336 	}
29337       op0 = operands[0];
29338       op1 = operands[2];
29339       break;
29340     case MODRM_CLASS_OP01:
29341       gcc_assert (noperands >= 2);
29342       if (popno0)
29343 	{
29344 	  *popno0 = 0;
29345 	  *popno1 = 1;
29346 	}
29347       op0 = operands[0];
29348       op1 = operands[1];
29349       break;
29350     default:
29351       return -1;
29352     }
29353   if (REG_P (op0) && REG_P (op1))
29354     {
29355       int enc0 = reg_encoded_number (op0);
29356       int enc1 = reg_encoded_number (op1);
29357       return 0xc0 + (enc1 << 3) + enc0;
29358     }
29359   return -1;
29360 }
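
/* Worked example (added note, assuming a register-to-register insn whose
   modrm_class is OP01): for AT&T "add %eax, %ecx", op0 is %ecx (encoding 1)
   and op1 is %eax (encoding 0), so the routine above returns
   0xc0 + (0 << 3) + 1 = 0xc1, the mod=11 modr/m byte of that instruction.  */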
29361 
29362 /* Check whether x86 address PARTS is a pc-relative address.  */
29363 
29364 bool
29365 ix86_rip_relative_addr_p (struct ix86_address *parts)
29366 {
29367   rtx base, index, disp;
29368 
29369   base = parts->base;
29370   index = parts->index;
29371   disp = parts->disp;
29372 
29373   if (disp && !base && !index)
29374     {
29375       if (TARGET_64BIT)
29376 	{
29377 	  rtx symbol = disp;
29378 
29379 	  if (GET_CODE (disp) == CONST)
29380 	    symbol = XEXP (disp, 0);
29381 	  if (GET_CODE (symbol) == PLUS
29382 	      && CONST_INT_P (XEXP (symbol, 1)))
29383 	    symbol = XEXP (symbol, 0);
29384 
29385 	  if (GET_CODE (symbol) == LABEL_REF
29386 	      || (GET_CODE (symbol) == SYMBOL_REF
29387 		  && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29388 	      || (GET_CODE (symbol) == UNSPEC
29389 		  && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29390 		      || XINT (symbol, 1) == UNSPEC_PCREL
29391 		      || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29392 	    return true;
29393 	}
29394     }
29395   return false;
29396 }
29397 
29398 /* Calculate the length of the memory address in the instruction encoding.
29399    Includes the addr32 prefix; does not include the one-byte modrm, opcode,
29400    or other prefixes.  We never generate an addr32 prefix for the LEA insn.  */
29401 
29402 int
29403 memory_address_length (rtx addr, bool lea)
29404 {
29405   struct ix86_address parts;
29406   rtx base, index, disp;
29407   int len;
29408   int ok;
29409 
29410   if (GET_CODE (addr) == PRE_DEC
29411       || GET_CODE (addr) == POST_INC
29412       || GET_CODE (addr) == PRE_MODIFY
29413       || GET_CODE (addr) == POST_MODIFY)
29414     return 0;
29415 
29416   ok = ix86_decompose_address (addr, &parts);
29417   gcc_assert (ok);
29418 
29419   len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29420 
29421   /* If this is not an LEA instruction, add the length of the addr32 prefix.  */
29422   if (TARGET_64BIT && !lea
29423       && (SImode_address_operand (addr, VOIDmode)
29424 	  || (parts.base && GET_MODE (parts.base) == SImode)
29425 	  || (parts.index && GET_MODE (parts.index) == SImode)))
29426     len++;
29427 
29428   base = parts.base;
29429   index = parts.index;
29430   disp = parts.disp;
29431 
29432   if (base && SUBREG_P (base))
29433     base = SUBREG_REG (base);
29434   if (index && SUBREG_P (index))
29435     index = SUBREG_REG (index);
29436 
29437   gcc_assert (base == NULL_RTX || REG_P (base));
29438   gcc_assert (index == NULL_RTX || REG_P (index));
29439 
29440   /* Rule of thumb:
29441        - esp as the base always wants an index,
29442        - ebp as the base always wants a displacement,
29443        - r12 as the base always wants an index,
29444        - r13 as the base always wants a displacement.  */
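
  /* For example (added note): a plain (%esp) operand needs an extra SIB byte,
     and a plain (%ebp) operand is encoded as 0(%ebp) with an extra disp8 byte,
     which is why both cases below add one to the length.  */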
29445 
29446   /* Register Indirect.  */
29447   if (base && !index && !disp)
29448     {
29449       /* esp (for its index) and ebp (for its displacement) need
29450 	 the two-byte modrm form.  Similarly for r12 and r13 in 64-bit
29451 	 code.  */
29452       if (base == arg_pointer_rtx
29453 	  || base == frame_pointer_rtx
29454 	  || REGNO (base) == SP_REG
29455 	  || REGNO (base) == BP_REG
29456 	  || REGNO (base) == R12_REG
29457 	  || REGNO (base) == R13_REG)
29458 	len++;
29459     }
29460 
29461   /* Direct Addressing.  In 64-bit mode mod 00 r/m 5
29462      is not disp32, but disp32(%rip), so for disp32
29463      a SIB byte is needed, unless print_operand_address
29464      optimizes it into disp32(%rip) or (%rip) is implied
29465      by UNSPEC.  */
29466   else if (disp && !base && !index)
29467     {
29468       len += 4;
29469       if (!ix86_rip_relative_addr_p (&parts))
29470 	len++;
29471     }
29472   else
29473     {
29474       /* Find the length of the displacement constant.  */
29475       if (disp)
29476 	{
29477 	  if (base && satisfies_constraint_K (disp))
29478 	    len += 1;
29479 	  else
29480 	    len += 4;
29481 	}
29482       /* ebp always wants a displacement.  Similarly r13.  */
29483       else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29484 	len++;
29485 
29486       /* An index requires the two-byte modrm form....  */
29487       if (index
29488 	  /* ...like esp (or r12), which always wants an index.  */
29489 	  || base == arg_pointer_rtx
29490 	  || base == frame_pointer_rtx
29491 	  || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29492 	len++;
29493     }
29494 
29495   return len;
29496 }
29497 
29498 /* Compute default value for "length_immediate" attribute.  When SHORTFORM
29499    is set, expect that the insn has an 8-bit immediate alternative.  */
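/* For example, with SHORTFORM set an immediate of 5 in MODE_SI counts as
   1 byte (the imm8 alternative applies), while an immediate of 300 still
   counts as 4 bytes.  */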
29500 int
29501 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29502 {
29503   int len = 0;
29504   int i;
29505   extract_insn_cached (insn);
29506   for (i = recog_data.n_operands - 1; i >= 0; --i)
29507     if (CONSTANT_P (recog_data.operand[i]))
29508       {
29509         enum attr_mode mode = get_attr_mode (insn);
29510 
29511 	gcc_assert (!len);
29512 	if (shortform && CONST_INT_P (recog_data.operand[i]))
29513 	  {
29514 	    HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29515 	    switch (mode)
29516 	      {
29517 	      case MODE_QI:
29518 		len = 1;
29519 		continue;
29520 	      case MODE_HI:
29521 		ival = trunc_int_for_mode (ival, HImode);
29522 		break;
29523 	      case MODE_SI:
29524 		ival = trunc_int_for_mode (ival, SImode);
29525 		break;
29526 	      default:
29527 		break;
29528 	      }
29529 	    if (IN_RANGE (ival, -128, 127))
29530 	      {
29531 		len = 1;
29532 		continue;
29533 	      }
29534 	  }
29535 	switch (mode)
29536 	  {
29537 	  case MODE_QI:
29538 	    len = 1;
29539 	    break;
29540 	  case MODE_HI:
29541 	    len = 2;
29542 	    break;
29543 	  case MODE_SI:
29544 	    len = 4;
29545 	    break;
29546 	  /* Immediates for DImode instructions are encoded
29547 	     as 32-bit sign-extended values.  */
29548 	  case MODE_DI:
29549 	    len = 4;
29550 	    break;
29551 	  default:
29552 	    fatal_insn ("unknown insn mode", insn);
29553 	}
29554       }
29555   return len;
29556 }
29557 
29558 /* Compute default value for "length_address" attribute.  */
29559 int
29560 ix86_attr_length_address_default (rtx_insn *insn)
29561 {
29562   int i;
29563 
29564   if (get_attr_type (insn) == TYPE_LEA)
29565     {
29566       rtx set = PATTERN (insn), addr;
29567 
29568       if (GET_CODE (set) == PARALLEL)
29569 	set = XVECEXP (set, 0, 0);
29570 
29571       gcc_assert (GET_CODE (set) == SET);
29572 
29573       addr = SET_SRC (set);
29574 
29575       return memory_address_length (addr, true);
29576     }
29577 
29578   extract_insn_cached (insn);
29579   for (i = recog_data.n_operands - 1; i >= 0; --i)
29580     {
29581       rtx op = recog_data.operand[i];
29582       if (MEM_P (op))
29583 	{
29584 	  constrain_operands_cached (insn, reload_completed);
29585 	  if (which_alternative != -1)
29586 	    {
29587 	      const char *constraints = recog_data.constraints[i];
29588 	      int alt = which_alternative;
29589 
29590 	      while (*constraints == '=' || *constraints == '+')
29591 		constraints++;
29592 	      while (alt-- > 0)
29593 	        while (*constraints++ != ',')
29594 		  ;
29595 	      /* Skip ignored operands.  */
29596 	      if (*constraints == 'X')
29597 		continue;
29598 	    }
29599 
29600 	  int len = memory_address_length (XEXP (op, 0), false);
29601 
29602 	  /* Account for segment prefix for non-default addr spaces.  */
29603 	  if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29604 	    len++;
29605 
29606 	  return len;
29607 	}
29608     }
29609   return 0;
29610 }
29611 
29612 /* Compute default value for "length_vex" attribute.  It includes
29613    the 2- or 3-byte VEX prefix and 1 opcode byte.  */
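/* For example, an AVX insn with a 0f opcode and only xmm/ymm register
   operands can use the 2-byte prefix (2 + 1 here), while a DImode general
   register operand (REX.W) or a memory operand whose base or index is one
   of r8-r15 (REX.X/REX.B) forces the 3-byte prefix (3 + 1).  */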
29614 
29615 int
29616 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29617 			      bool has_vex_w)
29618 {
29619   int i;
29620 
29621   /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
29622      requires the 3-byte VEX prefix.  */
29623   if (!has_0f_opcode || has_vex_w)
29624     return 3 + 1;
29625 
29626   /* We can always use the 2-byte VEX prefix in 32-bit mode.  */
29627   if (!TARGET_64BIT)
29628     return 2 + 1;
29629 
29630   extract_insn_cached (insn);
29631 
29632   for (i = recog_data.n_operands - 1; i >= 0; --i)
29633     if (REG_P (recog_data.operand[i]))
29634       {
29635 	/* REX.W bit uses 3 byte VEX prefix.  */
29636 	if (GET_MODE (recog_data.operand[i]) == DImode
29637 	    && GENERAL_REG_P (recog_data.operand[i]))
29638 	  return 3 + 1;
29639       }
29640     else
29641       {
29642 	/* REX.X or REX.B bits use 3 byte VEX prefix.  */
29643 	if (MEM_P (recog_data.operand[i])
29644 	    && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29645 	  return 3 + 1;
29646       }
29647 
29648   return 2 + 1;
29649 }
29650 
29651 
29652 static bool
29653 ix86_class_likely_spilled_p (reg_class_t);
29654 
29655 /* Return true if the LHS of INSN is a HW function argument register, and
29656    set IS_SPILLED to true if it is a likely spilled HW register.  */
29657 static bool
29658 insn_is_function_arg (rtx insn, bool* is_spilled)
29659 {
29660   rtx dst;
29661 
29662   if (!NONDEBUG_INSN_P (insn))
29663     return false;
29664   /* Call instructions are not movable; ignore them.  */
29665   if (CALL_P (insn))
29666     return false;
29667   insn = PATTERN (insn);
29668   if (GET_CODE (insn) == PARALLEL)
29669     insn = XVECEXP (insn, 0, 0);
29670   if (GET_CODE (insn) != SET)
29671     return false;
29672   dst = SET_DEST (insn);
29673   if (REG_P (dst) && HARD_REGISTER_P (dst)
29674       && ix86_function_arg_regno_p (REGNO (dst)))
29675     {
29676       /* Is it a likely spilled HW register?  */
29677       if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29678 	  && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29679 	*is_spilled = true;
29680       return true;
29681     }
29682   return false;
29683 }
29684 
29685 /* Add output dependencies for a chain of adjacent function arguments, but
29686    only if there is a move to a likely spilled HW register.  Return the first
29687    argument if at least one dependence was added, or NULL otherwise.  */
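/* For instance, given a sequence such as
     mov $1, %edi
     mov $2, %esi
     call foo
   the moves into the likely spilled argument registers form such a chain,
   and output dependencies between them keep the pre-reload scheduler from
   reordering the chain.  */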
29688 static rtx_insn *
29689 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29690 {
29691   rtx_insn *insn;
29692   rtx_insn *last = call;
29693   rtx_insn *first_arg = NULL;
29694   bool is_spilled = false;
29695 
29696   head = PREV_INSN (head);
29697 
29698   /* Find the argument-passing instruction nearest to the call.  */
29699   while (true)
29700     {
29701       last = PREV_INSN (last);
29702       if (last == head)
29703 	return NULL;
29704       if (!NONDEBUG_INSN_P (last))
29705 	continue;
29706       if (insn_is_function_arg (last, &is_spilled))
29707 	break;
29708       return NULL;
29709     }
29710 
29711   first_arg = last;
29712   while (true)
29713     {
29714       insn = PREV_INSN (last);
29715       if (!INSN_P (insn))
29716 	break;
29717       if (insn == head)
29718 	break;
29719       if (!NONDEBUG_INSN_P (insn))
29720 	{
29721 	  last = insn;
29722 	  continue;
29723 	}
29724       if (insn_is_function_arg (insn, &is_spilled))
29725 	{
29726 	  /* Add an output dependence between two function arguments if the chain
29727 	     of output arguments contains likely spilled HW registers.  */
29728 	  if (is_spilled)
29729 	    add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29730 	  first_arg = last = insn;
29731 	}
29732       else
29733 	break;
29734     }
29735   if (!is_spilled)
29736     return NULL;
29737   return first_arg;
29738 }
29739 
29740 /* Add an output or anti dependency from INSN to FIRST_ARG to restrict its
29741    code motion.  */
29742 static void
29743 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29744 {
29745   rtx set;
29746   rtx tmp;
29747 
29748   /* Add anti dependencies for bounds stores.  */
29749   if (INSN_P (insn)
29750       && GET_CODE (PATTERN (insn)) == PARALLEL
29751       && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29752       && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29753     {
29754       add_dependence (first_arg, insn, REG_DEP_ANTI);
29755       return;
29756     }
29757 
29758   set = single_set (insn);
29759   if (!set)
29760     return;
29761   tmp = SET_DEST (set);
29762   if (REG_P (tmp))
29763     {
29764       /* Add output dependency to the first function argument.  */
29765       add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29766       return;
29767     }
29768   /* Add anti dependency.  */
29769   add_dependence (first_arg, insn, REG_DEP_ANTI);
29770 }
29771 
29772 /* Avoid cross-block motion of a function argument by adding a dependency
29773    from the first non-jump instruction in BB.  */
29774 static void
29775 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29776 {
29777   rtx_insn *insn = BB_END (bb);
29778 
29779   while (insn)
29780     {
29781       if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29782 	{
29783 	  rtx set = single_set (insn);
29784 	  if (set)
29785 	    {
29786 	      avoid_func_arg_motion (arg, insn);
29787 	      return;
29788 	    }
29789 	}
29790       if (insn == BB_HEAD (bb))
29791 	return;
29792       insn = PREV_INSN (insn);
29793     }
29794 }
29795 
29796 /* Hook for pre-reload schedule - avoid motion of function arguments
29797    passed in likely spilled HW registers.  */
29798 static void
29799 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29800 {
29801   rtx_insn *insn;
29802   rtx_insn *first_arg = NULL;
29803   if (reload_completed)
29804     return;
29805   while (head != tail && DEBUG_INSN_P (head))
29806     head = NEXT_INSN (head);
29807   for (insn = tail; insn != head; insn = PREV_INSN (insn))
29808     if (INSN_P (insn) && CALL_P (insn))
29809       {
29810 	first_arg = add_parameter_dependencies (insn, head);
29811 	if (first_arg)
29812 	  {
29813 	    /* Add a dependee for the first argument to predecessors, but only
29814 	       if the region contains more than one block.  */
29815 	    basic_block bb =  BLOCK_FOR_INSN (insn);
29816 	    int rgn = CONTAINING_RGN (bb->index);
29817 	    int nr_blks = RGN_NR_BLOCKS (rgn);
29818 	    /* Skip trivial regions and region head blocks that can have
29819 	       predecessors outside of the region.  */
29820 	    if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29821 	      {
29822 		edge e;
29823 		edge_iterator ei;
29824 
29825 		/* Regions are SCCs with the exception of selective
29826 		   scheduling with pipelining of outer blocks enabled.
29827 		   So also check that immediate predecessors of a non-head
29828 		   block are in the same region.  */
29829 		FOR_EACH_EDGE (e, ei, bb->preds)
29830 		  {
29831 		    /* Avoid creating loop-carried dependencies by using the
29832 		       topological ordering in the region.  */
29833 		    if (rgn == CONTAINING_RGN (e->src->index)
29834 			&& BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29835 		      add_dependee_for_func_arg (first_arg, e->src);
29836 		  }
29837 	      }
29838 	    insn = first_arg;
29839 	    if (insn == head)
29840 	      break;
29841 	  }
29842       }
29843     else if (first_arg)
29844       avoid_func_arg_motion (first_arg, insn);
29845 }
29846 
29847 /* Hook for pre-reload schedule - set the priority of moves from likely
29848    spilled HW registers to the maximum, to schedule them as soon as possible.
29849    These are moves from function argument registers at the top of the function
29850    entry and moves from function return value registers after a call.  */
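/* For example, the move of an incoming argument from %edi into a pseudo at
   the top of a function gets the maximum priority, so the likely spilled
   hard register is vacated as early as possible.  */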
29851 static int
29852 ix86_adjust_priority (rtx_insn *insn, int priority)
29853 {
29854   rtx set;
29855 
29856   if (reload_completed)
29857     return priority;
29858 
29859   if (!NONDEBUG_INSN_P (insn))
29860     return priority;
29861 
29862   set = single_set (insn);
29863   if (set)
29864     {
29865       rtx tmp = SET_SRC (set);
29866       if (REG_P (tmp)
29867           && HARD_REGISTER_P (tmp)
29868           && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29869           && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29870 	return current_sched_info->sched_max_insns_priority;
29871     }
29872 
29873   return priority;
29874 }
29875 
29876 /* Prepare for scheduling pass.  */
29877 static void
29878 ix86_sched_init_global (FILE *, int, int)
29879 {
29880   /* Install scheduling hooks for the current CPU.  Some of these hooks are used
29881      in time-critical parts of the scheduler, so we only set them up when
29882      they are actually used.  */
29883   switch (ix86_tune)
29884     {
29885     case PROCESSOR_CORE2:
29886     case PROCESSOR_NEHALEM:
29887     case PROCESSOR_SANDYBRIDGE:
29888     case PROCESSOR_HASWELL:
29889     case PROCESSOR_GENERIC:
29890       /* Do not perform multipass scheduling for pre-reload schedule
29891          to save compile time.  */
29892       if (reload_completed)
29893 	{
29894 	  ix86_core2i7_init_hooks ();
29895 	  break;
29896 	}
29897       /* Fall through.  */
29898     default:
29899       targetm.sched.dfa_post_advance_cycle = NULL;
29900       targetm.sched.first_cycle_multipass_init = NULL;
29901       targetm.sched.first_cycle_multipass_begin = NULL;
29902       targetm.sched.first_cycle_multipass_issue = NULL;
29903       targetm.sched.first_cycle_multipass_backtrack = NULL;
29904       targetm.sched.first_cycle_multipass_end = NULL;
29905       targetm.sched.first_cycle_multipass_fini = NULL;
29906       break;
29907     }
29908 }
29909 
29910 
29911 /* Implement TARGET_STATIC_RTX_ALIGNMENT.  */
29912 
29913 static HOST_WIDE_INT
29914 ix86_static_rtx_alignment (machine_mode mode)
29915 {
29916   if (mode == DFmode)
29917     return 64;
29918   if (ALIGN_MODE_128 (mode))
29919     return MAX (128, GET_MODE_ALIGNMENT (mode));
29920   return GET_MODE_ALIGNMENT (mode);
29921 }
29922 
29923 /* Implement TARGET_CONSTANT_ALIGNMENT.  */
29924 
29925 static HOST_WIDE_INT
29926 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
29927 {
29928   if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
29929       || TREE_CODE (exp) == INTEGER_CST)
29930     {
29931       machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
29932       HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
29933       return MAX (mode_align, align);
29934     }
29935   else if (!optimize_size && TREE_CODE (exp) == STRING_CST
29936 	   && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
29937     return BITS_PER_WORD;
29938 
29939   return align;
29940 }
29941 
29942 /* Implement TARGET_EMPTY_RECORD_P.  */
29943 
29944 static bool
29945 ix86_is_empty_record (const_tree type)
29946 {
29947   if (!TARGET_64BIT)
29948     return false;
29949   return default_is_empty_record (type);
29950 }
29951 
29952 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI.  */
29953 
29954 static void
29955 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
29956 {
29957   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
29958 
29959   if (!cum->warn_empty)
29960     return;
29961 
29962   if (!TYPE_EMPTY_P (type))
29963     return;
29964 
29965   /* Don't warn if the function isn't visible outside of the TU.  */
29966   if (cum->decl && !TREE_PUBLIC (cum->decl))
29967     return;
29968 
29969   const_tree ctx = get_ultimate_context (cum->decl);
29970   if (ctx != NULL_TREE
29971       && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
29972     return;
29973 
29974   /* If the actual size of the type is zero, then there is no change
29975      in how objects of this size are passed.  */
29976   if (int_size_in_bytes (type) == 0)
29977     return;
29978 
29979   warning (OPT_Wabi, "empty class %qT parameter passing ABI "
29980 	   "changes in -fabi-version=12 (GCC 8)", type);
29981 
29982   /* Only warn once.  */
29983   cum->warn_empty = false;
29984 }
29985 
29986 /* Compute the alignment for a variable for Intel MCU psABI.  TYPE is
29987    the data type, and ALIGN is the alignment that the object would
29988    ordinarily have.  */
29989 
29990 static int
29991 iamcu_alignment (tree type, int align)
29992 {
29993   machine_mode mode;
29994 
29995   if (align < 32 || TYPE_USER_ALIGN (type))
29996     return align;
29997 
29998   /* The Intel MCU psABI specifies that scalar types larger than 4 bytes
29999      are aligned to 4 bytes.  */
30000   mode = TYPE_MODE (strip_array_types (type));
30001   switch (GET_MODE_CLASS (mode))
30002     {
30003     case MODE_INT:
30004     case MODE_COMPLEX_INT:
30005     case MODE_COMPLEX_FLOAT:
30006     case MODE_FLOAT:
30007     case MODE_DECIMAL_FLOAT:
30008       return 32;
30009     default:
30010       return align;
30011     }
30012 }
30013 
30014 /* Compute the alignment for a static variable.
30015    TYPE is the data type, and ALIGN is the alignment that
30016    the object would ordinarily have.  The value of this function is used
30017    instead of that alignment to align the object.  */
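/* For example, assuming the common 64-byte prefetch block, a global
   aggregate at least as large as the cache line is aligned to it, and on
   x86-64 an array of 16 bytes or more gets at least 16-byte alignment per
   the ABI rule handled below.  */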
30018 
30019 int
30020 ix86_data_alignment (tree type, int align, bool opt)
30021 {
30022   /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30023      for symbols from other compilation units or symbols that don't need
30024      to bind locally.  In order to preserve some ABI compatibility with
30025      those compilers, ensure we don't decrease alignment from what we
30026      used to assume.  */
30027 
30028   int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30029 
30030   /* A data structure equal to or greater than the size of a cache line
30031      (64 bytes in the Pentium 4 and other recent Intel processors, including
30032      processors based on the Intel Core microarchitecture) should be aligned
30033      so that its base address is a multiple of the cache line size.  */
30034 
30035   int max_align
30036     = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30037 
30038   if (max_align < BITS_PER_WORD)
30039     max_align = BITS_PER_WORD;
30040 
30041   switch (ix86_align_data_type)
30042     {
30043     case ix86_align_data_type_abi: opt = false; break;
30044     case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30045     case ix86_align_data_type_cacheline: break;
30046     }
30047 
30048   if (TARGET_IAMCU)
30049     align = iamcu_alignment (type, align);
30050 
30051   if (opt
30052       && AGGREGATE_TYPE_P (type)
30053       && TYPE_SIZE (type)
30054       && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30055     {
30056       if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
30057 	  && align < max_align_compat)
30058 	align = max_align_compat;
30059       if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
30060 	  && align < max_align)
30061 	align = max_align;
30062     }
30063 
30064   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
30065      to a 16-byte boundary.  */
30066   if (TARGET_64BIT)
30067     {
30068       if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30069 	  && TYPE_SIZE (type)
30070 	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30071 	  && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30072 	  && align < 128)
30073 	return 128;
30074     }
30075 
30076   if (!opt)
30077     return align;
30078 
30079   if (TREE_CODE (type) == ARRAY_TYPE)
30080     {
30081       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30082 	return 64;
30083       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30084 	return 128;
30085     }
30086   else if (TREE_CODE (type) == COMPLEX_TYPE)
30087     {
30088 
30089       if (TYPE_MODE (type) == DCmode && align < 64)
30090 	return 64;
30091       if ((TYPE_MODE (type) == XCmode
30092 	   || TYPE_MODE (type) == TCmode) && align < 128)
30093 	return 128;
30094     }
30095   else if ((TREE_CODE (type) == RECORD_TYPE
30096 	    || TREE_CODE (type) == UNION_TYPE
30097 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
30098 	   && TYPE_FIELDS (type))
30099     {
30100       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30101 	return 64;
30102       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30103 	return 128;
30104     }
30105   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30106 	   || TREE_CODE (type) == INTEGER_TYPE)
30107     {
30108       if (TYPE_MODE (type) == DFmode && align < 64)
30109 	return 64;
30110       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30111 	return 128;
30112     }
30113 
30114   return align;
30115 }
30116 
30117 /* Compute the alignment for a local variable or a stack slot.  EXP is
30118    the data type or decl itself, MODE is the widest mode available and
30119    ALIGN is the alignment that the object would ordinarily have.  The
30120    value of this macro is used instead of that alignment to align the
30121    object.  */
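/* For example, when optimizing for speed with SSE enabled on x86-64, a
   local aggregate of at least 16 bytes (other than a va_list) is given at
   least 16-byte alignment so that aligned SSE accesses can be used.  */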
30122 
30123 unsigned int
30124 ix86_local_alignment (tree exp, machine_mode mode,
30125 		      unsigned int align)
30126 {
30127   tree type, decl;
30128 
30129   if (exp && DECL_P (exp))
30130     {
30131       type = TREE_TYPE (exp);
30132       decl = exp;
30133     }
30134   else
30135     {
30136       type = exp;
30137       decl = NULL;
30138     }
30139 
30140   /* Don't do dynamic stack realignment for long long objects with
30141      -mpreferred-stack-boundary=2.  */
30142   if (!TARGET_64BIT
30143       && align == 64
30144       && ix86_preferred_stack_boundary < 64
30145       && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30146       && (!type || !TYPE_USER_ALIGN (type))
30147       && (!decl || !DECL_USER_ALIGN (decl)))
30148     align = 32;
30149 
30150   /* If TYPE is NULL, we are allocating a stack slot for caller-save
30151   /* If TYPE is NULL, we are allocating a stack slot for a caller-save
30152      register in MODE.  Return the larger of the XFmode and DFmode
30153      alignments.  */
30154     {
30155       if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30156 	align = GET_MODE_ALIGNMENT (DFmode);
30157       return align;
30158     }
30159 
30160   /* Don't increase alignment for Intel MCU psABI.  */
30161   if (TARGET_IAMCU)
30162     return align;
30163 
30164   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
30165      to a 16-byte boundary.  The exact wording is:
30166 
30167      An array uses the same alignment as its elements, except that a local or
30168      global array variable of length at least 16 bytes or
30169      a C99 variable-length array variable always has alignment of at least 16 bytes.
30170 
30171      This was added to allow the use of aligned SSE instructions on arrays.  The
30172      rule is meant for static storage (where the compiler cannot do the analysis
30173      by itself).  We follow it for automatic variables only when convenient.
30174      We fully control everything in the compiled function, and functions from
30175      other units cannot rely on the alignment.
30176 
30177      Exclude the va_list type.  It is the common case of a local array where
30178      we cannot benefit from the alignment.
30179 
30180      TODO: Probably one should optimize for size only when var does not escape.  */
30181   if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30182       && TARGET_SSE)
30183     {
30184       if (AGGREGATE_TYPE_P (type)
30185 	  && (va_list_type_node == NULL_TREE
30186 	      || (TYPE_MAIN_VARIANT (type)
30187 		  != TYPE_MAIN_VARIANT (va_list_type_node)))
30188 	  && TYPE_SIZE (type)
30189 	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30190 	  && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30191 	  && align < 128)
30192 	return 128;
30193     }
30194   if (TREE_CODE (type) == ARRAY_TYPE)
30195     {
30196       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30197 	return 64;
30198       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30199 	return 128;
30200     }
30201   else if (TREE_CODE (type) == COMPLEX_TYPE)
30202     {
30203       if (TYPE_MODE (type) == DCmode && align < 64)
30204 	return 64;
30205       if ((TYPE_MODE (type) == XCmode
30206 	   || TYPE_MODE (type) == TCmode) && align < 128)
30207 	return 128;
30208     }
30209   else if ((TREE_CODE (type) == RECORD_TYPE
30210 	    || TREE_CODE (type) == UNION_TYPE
30211 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
30212 	   && TYPE_FIELDS (type))
30213     {
30214       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30215 	return 64;
30216       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30217 	return 128;
30218     }
30219   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30220 	   || TREE_CODE (type) == INTEGER_TYPE)
30221     {
30222 
30223       if (TYPE_MODE (type) == DFmode && align < 64)
30224 	return 64;
30225       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30226 	return 128;
30227     }
30228   return align;
30229 }
30230 
30231 /* Compute the minimum required alignment for dynamic stack realignment
30232    purposes for a local variable, parameter or a stack slot.  EXP is
30233    the data type or decl itself, MODE is its mode and ALIGN is the
30234    alignment that the object would ordinarily have.  */
30235 
30236 unsigned int
30237 ix86_minimum_alignment (tree exp, machine_mode mode,
30238 			unsigned int align)
30239 {
30240   tree type, decl;
30241 
30242   if (exp && DECL_P (exp))
30243     {
30244       type = TREE_TYPE (exp);
30245       decl = exp;
30246     }
30247   else
30248     {
30249       type = exp;
30250       decl = NULL;
30251     }
30252 
30253   if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30254     return align;
30255 
30256   /* Don't do dynamic stack realignment for long long objects with
30257      -mpreferred-stack-boundary=2.  */
30258   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30259       && (!type || !TYPE_USER_ALIGN (type))
30260       && (!decl || !DECL_USER_ALIGN (decl)))
30261     {
30262       gcc_checking_assert (!TARGET_STV);
30263       return 32;
30264     }
30265 
30266   return align;
30267 }
30268 
30269 /* Find a location for the static chain incoming to a nested function.
30270    This is a register, unless all free registers are used by arguments.  */
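/* In practice this means r10 in 64-bit mode, ecx for the default 32-bit
   conventions, eax for fastcall and thiscall, and a stack slot (with esi
   used at the alternate entry point) for regparm(3) functions.  */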
30271 
30272 static rtx
30273 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30274 {
30275   unsigned regno;
30276 
30277   if (TARGET_64BIT)
30278     {
30279       /* We always use R10 in 64-bit mode.  */
30280       regno = R10_REG;
30281     }
30282   else
30283     {
30284       const_tree fntype, fndecl;
30285       unsigned int ccvt;
30286 
30287       /* By default in 32-bit mode we use ECX to pass the static chain.  */
30288       regno = CX_REG;
30289 
30290       if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30291 	{
30292           fntype = TREE_TYPE (fndecl_or_type);
30293 	  fndecl = fndecl_or_type;
30294 	}
30295       else
30296 	{
30297 	  fntype = fndecl_or_type;
30298 	  fndecl = NULL;
30299 	}
30300 
30301       ccvt = ix86_get_callcvt (fntype);
30302       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30303 	{
30304 	  /* Fastcall functions use ecx/edx for arguments, which leaves
30305 	     us with EAX for the static chain.
30306 	     Thiscall functions use ecx for arguments, which also
30307 	     leaves us with EAX for the static chain.  */
30308 	  regno = AX_REG;
30309 	}
30310       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30311 	{
30312 	  /* Thiscall functions use ecx for arguments, which leaves
30313 	     us with EAX and EDX for the static chain.
30314 	     For ABI compatibility we use EAX.  */
30315 	  regno = AX_REG;
30316 	}
30317       else if (ix86_function_regparm (fntype, fndecl) == 3)
30318 	{
30319 	  /* For regparm 3, we have no free call-clobbered registers in
30320 	     which to store the static chain.  In order to implement this,
30321 	     we have the trampoline push the static chain to the stack.
30322 	     However, we can't push a value below the return address when
30323 	     we call the nested function directly, so we have to use an
30324 	     alternate entry point.  For this we use ESI, and have the
30325 	     alternate entry point push ESI, so that things appear the
30326 	     same once we're executing the nested function.  */
30327 	  if (incoming_p)
30328 	    {
30329 	      if (fndecl == current_function_decl
30330 		  && !ix86_static_chain_on_stack)
30331 		{
30332 		  gcc_assert (!reload_completed);
30333 		  ix86_static_chain_on_stack = true;
30334 		}
30335 	      return gen_frame_mem (SImode,
30336 				    plus_constant (Pmode,
30337 						   arg_pointer_rtx, -8));
30338 	    }
30339 	  regno = SI_REG;
30340 	}
30341     }
30342 
30343   return gen_rtx_REG (Pmode, regno);
30344 }
30345 
30346 /* Emit RTL insns to initialize the variable parts of a trampoline.
30347    FNDECL is the decl of the target address; M_TRAMP is a MEM for
30348    the trampoline, and CHAIN_VALUE is an RTX for the static chain
30349    to be passed to the target function.  */
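/* For the 64-bit case the emitted template is, in effect:
     endbr64                      (only when CF_BRANCH protection is enabled)
     mov[l|abs] $FNADDR, %r11
     mov[l|abs] $CHAIN,  %r10
     jmp *%r11 ; nop
   The 32-bit case loads the static chain into a register (or pushes it)
   and then emits a relative jmp to the target.  */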
30350 
30351 static void
30352 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30353 {
30354   rtx mem, fnaddr;
30355   int opcode;
30356   int offset = 0;
30357   bool need_endbr = (flag_cf_protection & CF_BRANCH);
30358 
30359   fnaddr = XEXP (DECL_RTL (fndecl), 0);
30360 
30361   if (TARGET_64BIT)
30362     {
30363       int size;
30364 
30365       if (need_endbr)
30366 	{
30367 	  /* Insert ENDBR64.  */
30368 	  mem = adjust_address (m_tramp, SImode, offset);
30369 	  emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode));
30370 	  offset += 4;
30371 	}
30372 
30373       /* Load the function address into r11.  Try to load the address
30374 	 using the shorter movl instead of movabs.  We may want to support
30375 	 movq for kernel mode, but the kernel does not use trampolines at
30376 	 the moment.  FNADDR is a 32-bit address and may not be in
30377 	 DImode when ptr_mode == SImode.  Always use movl in this
30378 	 case.  */
30379       if (ptr_mode == SImode
30380 	  || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30381 	{
30382 	  fnaddr = copy_addr_to_reg (fnaddr);
30383 
30384 	  mem = adjust_address (m_tramp, HImode, offset);
30385 	  emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30386 
30387 	  mem = adjust_address (m_tramp, SImode, offset + 2);
30388 	  emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30389 	  offset += 6;
30390 	}
30391       else
30392 	{
30393 	  mem = adjust_address (m_tramp, HImode, offset);
30394 	  emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30395 
30396 	  mem = adjust_address (m_tramp, DImode, offset + 2);
30397 	  emit_move_insn (mem, fnaddr);
30398 	  offset += 10;
30399 	}
30400 
30401       /* Load the static chain into r10 using movabs.  Use the shorter movl
30402          instead of movabs when ptr_mode == SImode.  */
30403       if (ptr_mode == SImode)
30404 	{
30405 	  opcode = 0xba41;
30406 	  size = 6;
30407 	}
30408       else
30409 	{
30410 	  opcode = 0xba49;
30411 	  size = 10;
30412 	}
30413 
30414       mem = adjust_address (m_tramp, HImode, offset);
30415       emit_move_insn (mem, gen_int_mode (opcode, HImode));
30416 
30417       mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30418       emit_move_insn (mem, chain_value);
30419       offset += size;
30420 
30421       /* Jump to r11; the last (unused) byte is a nop, only there to
30422 	 pad the write out to a single 32-bit store.  */
30423       mem = adjust_address (m_tramp, SImode, offset);
30424       emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30425       offset += 4;
30426     }
30427   else
30428     {
30429       rtx disp, chain;
30430 
30431       /* Depending on the static chain location, either load a register
30432 	 with a constant, or push the constant to the stack.  All of the
30433 	 instructions are the same size.  */
30434       chain = ix86_static_chain (fndecl, true);
30435       if (REG_P (chain))
30436 	{
30437 	  switch (REGNO (chain))
30438 	    {
30439 	    case AX_REG:
30440 	      opcode = 0xb8; break;
30441 	    case CX_REG:
30442 	      opcode = 0xb9; break;
30443 	    default:
30444 	      gcc_unreachable ();
30445 	    }
30446 	}
30447       else
30448 	opcode = 0x68;
30449 
30450       if (need_endbr)
30451 	{
30452 	  /* Insert ENDBR32.  */
30453 	  mem = adjust_address (m_tramp, SImode, offset);
30454 	  emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode));
30455 	  offset += 4;
30456 	}
30457 
30458       mem = adjust_address (m_tramp, QImode, offset);
30459       emit_move_insn (mem, gen_int_mode (opcode, QImode));
30460 
30461       mem = adjust_address (m_tramp, SImode, offset + 1);
30462       emit_move_insn (mem, chain_value);
30463       offset += 5;
30464 
30465       mem = adjust_address (m_tramp, QImode, offset);
30466       emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30467 
30468       mem = adjust_address (m_tramp, SImode, offset + 1);
30469 
30470       /* Compute offset from the end of the jmp to the target function.
30471 	 In the case in which the trampoline stores the static chain on
30472 	 the stack, we need to skip the first insn which pushes the
30473 	 (call-saved) register static chain; this push is 1 byte.  */
30474       offset += 5;
30475       disp = expand_binop (SImode, sub_optab, fnaddr,
30476 			   plus_constant (Pmode, XEXP (m_tramp, 0),
30477 					  offset - (MEM_P (chain) ? 1 : 0)),
30478 			   NULL_RTX, 1, OPTAB_DIRECT);
30479       emit_move_insn (mem, disp);
30480     }
30481 
30482   gcc_assert (offset <= TRAMPOLINE_SIZE);
30483 
30484 #ifdef HAVE_ENABLE_EXECUTE_STACK
30485 #ifdef CHECK_EXECUTE_STACK_ENABLED
30486   if (CHECK_EXECUTE_STACK_ENABLED)
30487 #endif
30488   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30489 		     LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
30490 #endif
30491 }
30492 
30493 static bool
30494 ix86_allocate_stack_slots_for_args (void)
30495 {
30496   /* Naked functions should not allocate stack slots for arguments.  */
30497   return !ix86_function_naked (current_function_decl);
30498 }
30499 
30500 static bool
30501 ix86_warn_func_return (tree decl)
30502 {
30503   /* Naked functions are implemented entirely in assembly, including the
30504      return sequence, so suppress warnings about this.  */
30505   return !ix86_function_naked (decl);
30506 }
30507 
30508 /* The following file contains several enumerations and data structures
30509    built from the definitions in i386-builtin-types.def.  */
30510 
30511 #include "i386-builtin-types.inc"
30512 
30513 /* Table for the ix86 builtin non-function types.  */
30514 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30515 
30516 /* Retrieve an element from the above table, building some of
30517    the types lazily.  */
30518 
30519 static tree
30520 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30521 {
30522   unsigned int index;
30523   tree type, itype;
30524 
30525   gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30526 
30527   type = ix86_builtin_type_tab[(int) tcode];
30528   if (type != NULL)
30529     return type;
30530 
30531   gcc_assert (tcode > IX86_BT_LAST_PRIM);
30532   if (tcode <= IX86_BT_LAST_VECT)
30533     {
30534       machine_mode mode;
30535 
30536       index = tcode - IX86_BT_LAST_PRIM - 1;
30537       itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30538       mode = ix86_builtin_type_vect_mode[index];
30539 
30540       type = build_vector_type_for_mode (itype, mode);
30541     }
30542   else
30543     {
30544       int quals;
30545 
30546       index = tcode - IX86_BT_LAST_VECT - 1;
30547       if (tcode <= IX86_BT_LAST_PTR)
30548 	quals = TYPE_UNQUALIFIED;
30549       else
30550 	quals = TYPE_QUAL_CONST;
30551 
30552       itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30553       if (quals != TYPE_UNQUALIFIED)
30554 	itype = build_qualified_type (itype, quals);
30555 
30556       type = build_pointer_type (itype);
30557     }
30558 
30559   ix86_builtin_type_tab[(int) tcode] = type;
30560   return type;
30561 }
30562 
30563 /* Table for the ix86 builtin function types.  */
30564 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30565 
30566 /* Retrieve an element from the above table, building some of
30567    the types lazily.  */
30568 
30569 static tree
30570 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30571 {
30572   tree type;
30573 
30574   gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30575 
30576   type = ix86_builtin_func_type_tab[(int) tcode];
30577   if (type != NULL)
30578     return type;
30579 
30580   if (tcode <= IX86_BT_LAST_FUNC)
30581     {
30582       unsigned start = ix86_builtin_func_start[(int) tcode];
30583       unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30584       tree rtype, atype, args = void_list_node;
30585       unsigned i;
30586 
30587       rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30588       for (i = after - 1; i > start; --i)
30589 	{
30590 	  atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30591 	  args = tree_cons (NULL, atype, args);
30592 	}
30593 
30594       type = build_function_type (rtype, args);
30595     }
30596   else
30597     {
30598       unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30599       enum ix86_builtin_func_type icode;
30600 
30601       icode = ix86_builtin_func_alias_base[index];
30602       type = ix86_get_builtin_func_type (icode);
30603     }
30604 
30605   ix86_builtin_func_type_tab[(int) tcode] = type;
30606   return type;
30607 }
30608 
30609 
30610 /* Codes for all the SSE/MMX builtins.  Builtins not mentioned in any
30611    bdesc_* arrays below should come first, then builtins for each bdesc_*
30612    array in ascending order, so that we can use direct array accesses.  */
30613 enum ix86_builtins
30614 {
30615   IX86_BUILTIN_MASKMOVQ,
30616   IX86_BUILTIN_LDMXCSR,
30617   IX86_BUILTIN_STMXCSR,
30618   IX86_BUILTIN_MASKMOVDQU,
30619   IX86_BUILTIN_PSLLDQ128,
30620   IX86_BUILTIN_CLFLUSH,
30621   IX86_BUILTIN_MONITOR,
30622   IX86_BUILTIN_MWAIT,
30623   IX86_BUILTIN_CLZERO,
30624   IX86_BUILTIN_VEC_INIT_V2SI,
30625   IX86_BUILTIN_VEC_INIT_V4HI,
30626   IX86_BUILTIN_VEC_INIT_V8QI,
30627   IX86_BUILTIN_VEC_EXT_V2DF,
30628   IX86_BUILTIN_VEC_EXT_V2DI,
30629   IX86_BUILTIN_VEC_EXT_V4SF,
30630   IX86_BUILTIN_VEC_EXT_V4SI,
30631   IX86_BUILTIN_VEC_EXT_V8HI,
30632   IX86_BUILTIN_VEC_EXT_V2SI,
30633   IX86_BUILTIN_VEC_EXT_V4HI,
30634   IX86_BUILTIN_VEC_EXT_V16QI,
30635   IX86_BUILTIN_VEC_SET_V2DI,
30636   IX86_BUILTIN_VEC_SET_V4SF,
30637   IX86_BUILTIN_VEC_SET_V4SI,
30638   IX86_BUILTIN_VEC_SET_V8HI,
30639   IX86_BUILTIN_VEC_SET_V4HI,
30640   IX86_BUILTIN_VEC_SET_V16QI,
30641   IX86_BUILTIN_GATHERSIV2DF,
30642   IX86_BUILTIN_GATHERSIV4DF,
30643   IX86_BUILTIN_GATHERDIV2DF,
30644   IX86_BUILTIN_GATHERDIV4DF,
30645   IX86_BUILTIN_GATHERSIV4SF,
30646   IX86_BUILTIN_GATHERSIV8SF,
30647   IX86_BUILTIN_GATHERDIV4SF,
30648   IX86_BUILTIN_GATHERDIV8SF,
30649   IX86_BUILTIN_GATHERSIV2DI,
30650   IX86_BUILTIN_GATHERSIV4DI,
30651   IX86_BUILTIN_GATHERDIV2DI,
30652   IX86_BUILTIN_GATHERDIV4DI,
30653   IX86_BUILTIN_GATHERSIV4SI,
30654   IX86_BUILTIN_GATHERSIV8SI,
30655   IX86_BUILTIN_GATHERDIV4SI,
30656   IX86_BUILTIN_GATHERDIV8SI,
30657   IX86_BUILTIN_VFMSUBSD3_MASK3,
30658   IX86_BUILTIN_VFMSUBSS3_MASK3,
30659   IX86_BUILTIN_GATHER3SIV8SF,
30660   IX86_BUILTIN_GATHER3SIV4SF,
30661   IX86_BUILTIN_GATHER3SIV4DF,
30662   IX86_BUILTIN_GATHER3SIV2DF,
30663   IX86_BUILTIN_GATHER3DIV8SF,
30664   IX86_BUILTIN_GATHER3DIV4SF,
30665   IX86_BUILTIN_GATHER3DIV4DF,
30666   IX86_BUILTIN_GATHER3DIV2DF,
30667   IX86_BUILTIN_GATHER3SIV8SI,
30668   IX86_BUILTIN_GATHER3SIV4SI,
30669   IX86_BUILTIN_GATHER3SIV4DI,
30670   IX86_BUILTIN_GATHER3SIV2DI,
30671   IX86_BUILTIN_GATHER3DIV8SI,
30672   IX86_BUILTIN_GATHER3DIV4SI,
30673   IX86_BUILTIN_GATHER3DIV4DI,
30674   IX86_BUILTIN_GATHER3DIV2DI,
30675   IX86_BUILTIN_SCATTERSIV8SF,
30676   IX86_BUILTIN_SCATTERSIV4SF,
30677   IX86_BUILTIN_SCATTERSIV4DF,
30678   IX86_BUILTIN_SCATTERSIV2DF,
30679   IX86_BUILTIN_SCATTERDIV8SF,
30680   IX86_BUILTIN_SCATTERDIV4SF,
30681   IX86_BUILTIN_SCATTERDIV4DF,
30682   IX86_BUILTIN_SCATTERDIV2DF,
30683   IX86_BUILTIN_SCATTERSIV8SI,
30684   IX86_BUILTIN_SCATTERSIV4SI,
30685   IX86_BUILTIN_SCATTERSIV4DI,
30686   IX86_BUILTIN_SCATTERSIV2DI,
30687   IX86_BUILTIN_SCATTERDIV8SI,
30688   IX86_BUILTIN_SCATTERDIV4SI,
30689   IX86_BUILTIN_SCATTERDIV4DI,
30690   IX86_BUILTIN_SCATTERDIV2DI,
30691   /* Alternate 4 and 8 element gather/scatter for the vectorizer
30692      where all operands are 32-byte or 64-byte wide respectively.  */
30693   IX86_BUILTIN_GATHERALTSIV4DF,
30694   IX86_BUILTIN_GATHERALTDIV8SF,
30695   IX86_BUILTIN_GATHERALTSIV4DI,
30696   IX86_BUILTIN_GATHERALTDIV8SI,
30697   IX86_BUILTIN_GATHER3ALTDIV16SF,
30698   IX86_BUILTIN_GATHER3ALTDIV16SI,
30699   IX86_BUILTIN_GATHER3ALTSIV4DF,
30700   IX86_BUILTIN_GATHER3ALTDIV8SF,
30701   IX86_BUILTIN_GATHER3ALTSIV4DI,
30702   IX86_BUILTIN_GATHER3ALTDIV8SI,
30703   IX86_BUILTIN_GATHER3ALTSIV8DF,
30704   IX86_BUILTIN_GATHER3ALTSIV8DI,
30705   IX86_BUILTIN_GATHER3DIV16SF,
30706   IX86_BUILTIN_GATHER3DIV16SI,
30707   IX86_BUILTIN_GATHER3DIV8DF,
30708   IX86_BUILTIN_GATHER3DIV8DI,
30709   IX86_BUILTIN_GATHER3SIV16SF,
30710   IX86_BUILTIN_GATHER3SIV16SI,
30711   IX86_BUILTIN_GATHER3SIV8DF,
30712   IX86_BUILTIN_GATHER3SIV8DI,
30713   IX86_BUILTIN_SCATTERALTSIV8DF,
30714   IX86_BUILTIN_SCATTERALTDIV16SF,
30715   IX86_BUILTIN_SCATTERALTSIV8DI,
30716   IX86_BUILTIN_SCATTERALTDIV16SI,
30717   IX86_BUILTIN_SCATTERDIV16SF,
30718   IX86_BUILTIN_SCATTERDIV16SI,
30719   IX86_BUILTIN_SCATTERDIV8DF,
30720   IX86_BUILTIN_SCATTERDIV8DI,
30721   IX86_BUILTIN_SCATTERSIV16SF,
30722   IX86_BUILTIN_SCATTERSIV16SI,
30723   IX86_BUILTIN_SCATTERSIV8DF,
30724   IX86_BUILTIN_SCATTERSIV8DI,
30725   IX86_BUILTIN_GATHERPFQPD,
30726   IX86_BUILTIN_GATHERPFDPS,
30727   IX86_BUILTIN_GATHERPFDPD,
30728   IX86_BUILTIN_GATHERPFQPS,
30729   IX86_BUILTIN_SCATTERPFDPD,
30730   IX86_BUILTIN_SCATTERPFDPS,
30731   IX86_BUILTIN_SCATTERPFQPD,
30732   IX86_BUILTIN_SCATTERPFQPS,
30733   IX86_BUILTIN_CLWB,
30734   IX86_BUILTIN_CLFLUSHOPT,
30735   IX86_BUILTIN_INFQ,
30736   IX86_BUILTIN_HUGE_VALQ,
30737   IX86_BUILTIN_NANQ,
30738   IX86_BUILTIN_NANSQ,
30739   IX86_BUILTIN_XABORT,
30740   IX86_BUILTIN_ADDCARRYX32,
30741   IX86_BUILTIN_ADDCARRYX64,
30742   IX86_BUILTIN_SBB32,
30743   IX86_BUILTIN_SBB64,
30744   IX86_BUILTIN_RDRAND16_STEP,
30745   IX86_BUILTIN_RDRAND32_STEP,
30746   IX86_BUILTIN_RDRAND64_STEP,
30747   IX86_BUILTIN_RDSEED16_STEP,
30748   IX86_BUILTIN_RDSEED32_STEP,
30749   IX86_BUILTIN_RDSEED64_STEP,
30750   IX86_BUILTIN_MONITORX,
30751   IX86_BUILTIN_MWAITX,
30752   IX86_BUILTIN_CFSTRING,
30753   IX86_BUILTIN_CPU_INIT,
30754   IX86_BUILTIN_CPU_IS,
30755   IX86_BUILTIN_CPU_SUPPORTS,
30756   IX86_BUILTIN_READ_FLAGS,
30757   IX86_BUILTIN_WRITE_FLAGS,
30758 
30759   /* All the remaining builtins are tracked in bdesc_* arrays in
30760      i386-builtin.def.  Don't add any IX86_BUILTIN_* enumerators after
30761      this point.  */
30762 #define BDESC(mask, icode, name, code, comparison, flag) \
30763   code,
30764 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30765   code,									    \
30766   IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30767 #define BDESC_END(kind, next_kind)
30768 
30769 #include "i386-builtin.def"
30770 
30771 #undef BDESC
30772 #undef BDESC_FIRST
30773 #undef BDESC_END
30774 
30775   IX86_BUILTIN_MAX,
30776 
30777   IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30778 
30779   /* Now just the aliases for bdesc_* start/end.  */
30780 #define BDESC(mask, icode, name, code, comparison, flag)
30781 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30782 #define BDESC_END(kind, next_kind) \
30783   IX86_BUILTIN__BDESC_##kind##_LAST					    \
30784     = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30785 
30786 #include "i386-builtin.def"
30787 
30788 #undef BDESC
30789 #undef BDESC_FIRST
30790 #undef BDESC_END
30791 
30792   /* Just to make sure there is no comma after the last enumerator.  */
30793   IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30794 };
30795 
30796 /* Table for the ix86 builtin decls.  */
30797 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30798 
30799 /* Table of all of the builtin functions that are possible with different ISAs
30800    but are waiting to be built until a function is declared to use that
30801    ISA.  */
30802 struct builtin_isa {
30803   HOST_WIDE_INT isa;		/* isa_flags this builtin is defined for */
30804   HOST_WIDE_INT isa2;		/* additional isa_flags this builtin is defined for */
30805   const char *name;		/* function name */
30806   enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30807   unsigned char const_p:1;	/* true if the declaration is constant */
30808   unsigned char pure_p:1;	/* true if the declaration has pure attribute */
30809   bool leaf_p;			/* true if the declaration has leaf attribute */
30810   bool nothrow_p;		/* true if the declaration has nothrow attribute */
30811   bool set_and_not_built_p;
30812 };
30813 
30814 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30815 
30816 /* Bits that can still enable any inclusion of a builtin.  */
30817 static HOST_WIDE_INT deferred_isa_values = 0;
30818 static HOST_WIDE_INT deferred_isa_values2 = 0;
30819 
30820 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
30821    of which isa_flags to use in the ix86_builtins_isa array.  Stores the
30822    function decl in the ix86_builtins array.  Returns the function decl or
30823    NULL_TREE, if the builtin was not added.
30824 
30825    If the front end has a special hook for builtin functions, delay adding
30826    builtin functions that aren't in the current ISA until the ISA is changed
30827    with function specific optimization.  Doing so can save about 300K for the
30828    default compiler.  When the builtin is expanded, check at that time whether
30829    it is valid.
30830 
30831    If the front end doesn't have a special hook, record all builtins, even
30832    those whose instruction set isn't in the current ISA, in case the user uses
30833    function specific options for a different ISA, so that we don't get scope
30834    errors if a builtin is added in the middle of a function scope.  */
30835 
30836 static inline tree
30837 def_builtin (HOST_WIDE_INT mask, const char *name,
30838 	     enum ix86_builtin_func_type tcode,
30839 	     enum ix86_builtins code)
30840 {
30841   tree decl = NULL_TREE;
30842 
30843   if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30844     {
30845       ix86_builtins_isa[(int) code].isa = mask;
30846 
30847       mask &= ~OPTION_MASK_ISA_64BIT;
30848 
30849       /* Filter out the masks that are most often ORed together with others.  */
30850       if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30851 	  && mask != OPTION_MASK_ISA_AVX512VL)
30852 	mask &= ~OPTION_MASK_ISA_AVX512VL;
30853       if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
30854 	  && mask != OPTION_MASK_ISA_AVX512BW)
30855 	mask &= ~OPTION_MASK_ISA_AVX512BW;
30856 
30857       if (mask == 0
30858 	  || (mask & ix86_isa_flags) != 0
30859 	  || (lang_hooks.builtin_function
30860 	      == lang_hooks.builtin_function_ext_scope))
30861 	{
30862 	  tree type = ix86_get_builtin_func_type (tcode);
30863 	  decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30864 				       NULL, NULL_TREE);
30865 	  ix86_builtins[(int) code] = decl;
30866 	  ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30867 	}
30868       else
30869 	{
30870 	  /* Only a MASK recorded with set_and_not_built_p == true can
30871 	     potentially still include a builtin.  */
30872 	  deferred_isa_values |= mask;
30873 	  ix86_builtins[(int) code] = NULL_TREE;
30874 	  ix86_builtins_isa[(int) code].tcode = tcode;
30875 	  ix86_builtins_isa[(int) code].name = name;
30876 	  ix86_builtins_isa[(int) code].leaf_p = false;
30877 	  ix86_builtins_isa[(int) code].nothrow_p = false;
30878 	  ix86_builtins_isa[(int) code].const_p = false;
30879 	  ix86_builtins_isa[(int) code].pure_p = false;
30880 	  ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30881 	}
30882     }
30883 
30884   return decl;
30885 }
30886 
30887 /* Like def_builtin, but also marks the function decl "const".  */
30888 
30889 static inline tree
30890 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30891 		   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30892 {
30893   tree decl = def_builtin (mask, name, tcode, code);
30894   if (decl)
30895     TREE_READONLY (decl) = 1;
30896   else
30897     ix86_builtins_isa[(int) code].const_p = true;
30898 
30899   return decl;
30900 }
30901 
30902 /* Like def_builtin, but also marks the function decl "pure".  */
30903 
30904 static inline tree
30905 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
30906 		  enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30907 {
30908   tree decl = def_builtin (mask, name, tcode, code);
30909   if (decl)
30910     DECL_PURE_P (decl) = 1;
30911   else
30912     ix86_builtins_isa[(int) code].pure_p = true;
30913 
30914   return decl;
30915 }
30916 
30917 /* Like def_builtin, but for additional isa2 flags.  */
30918 
30919 static inline tree
30920 def_builtin2 (HOST_WIDE_INT mask, const char *name,
30921 	      enum ix86_builtin_func_type tcode,
30922 	      enum ix86_builtins code)
30923 {
30924   tree decl = NULL_TREE;
30925 
30926   ix86_builtins_isa[(int) code].isa2 = mask;
30927 
30928   if (mask == 0
30929       || (mask & ix86_isa_flags2) != 0
30930       || (lang_hooks.builtin_function
30931 	  == lang_hooks.builtin_function_ext_scope))
30932 
30933     {
30934       tree type = ix86_get_builtin_func_type (tcode);
30935       decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30936 				   NULL, NULL_TREE);
30937       ix86_builtins[(int) code] = decl;
30938       ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30939     }
30940   else
30941     {
30942       /* Only a MASK recorded with set_and_not_built_p == true can
30943 	 potentially still include a builtin.  */
30944       deferred_isa_values2 |= mask;
30945       ix86_builtins[(int) code] = NULL_TREE;
30946       ix86_builtins_isa[(int) code].tcode = tcode;
30947       ix86_builtins_isa[(int) code].name = name;
30948       ix86_builtins_isa[(int) code].leaf_p = false;
30949       ix86_builtins_isa[(int) code].nothrow_p = false;
30950       ix86_builtins_isa[(int) code].const_p = false;
30951       ix86_builtins_isa[(int) code].pure_p = false;
30952       ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30953     }
30954 
30955   return decl;
30956 }
30957 
30958 /* Like def_builtin, but also marks the function decl "const".  */
30959 
30960 static inline tree
30961 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
30962 		    enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30963 {
30964   tree decl = def_builtin2 (mask, name, tcode, code);
30965   if (decl)
30966     TREE_READONLY (decl) = 1;
30967   else
30968     ix86_builtins_isa[(int) code].const_p = true;
30969 
30970   return decl;
30971 }
30972 
30973 /* Like def_builtin, but also marks the function decl "pure".  */
30974 
30975 static inline tree
30976 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
30977 		   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30978 {
30979   tree decl = def_builtin2 (mask, name, tcode, code);
30980   if (decl)
30981     DECL_PURE_P (decl) = 1;
30982   else
30983     ix86_builtins_isa[(int) code].pure_p = true;
30984 
30985   return decl;
30986 }
30987 
30988 /* Add any new builtin functions for a given ISA that may not have been
30989    declared.  This saves a bit of space compared to adding all of the
30990    declarations to the tree, even if we didn't use them.  */
30991 
30992 static void
30993 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
30994 {
30995   isa &= ~OPTION_MASK_ISA_64BIT;
30996 
30997   if ((isa & deferred_isa_values) == 0
30998       && (isa2 & deferred_isa_values2) == 0)
30999     return;
31000 
31001   /* Bits in ISA value can be removed from potential isa values.  */
31002   deferred_isa_values &= ~isa;
31003   deferred_isa_values2 &= ~isa2;
31004 
31005   int i;
31006   tree saved_current_target_pragma = current_target_pragma;
31007   current_target_pragma = NULL_TREE;
31008 
31009   for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
31010     {
31011       if (((ix86_builtins_isa[i].isa & isa) != 0
31012 	   || (ix86_builtins_isa[i].isa2 & isa2) != 0)
31013 	  && ix86_builtins_isa[i].set_and_not_built_p)
31014 	{
31015 	  tree decl, type;
31016 
31017 	  /* Don't define the builtin again.  */
31018 	  ix86_builtins_isa[i].set_and_not_built_p = false;
31019 
31020 	  type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
31021 	  decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
31022 						 type, i, BUILT_IN_MD, NULL,
31023 						 NULL_TREE);
31024 
31025 	  ix86_builtins[i] = decl;
31026 	  if (ix86_builtins_isa[i].const_p)
31027 	    TREE_READONLY (decl) = 1;
31028 	  if (ix86_builtins_isa[i].pure_p)
31029 	    DECL_PURE_P (decl) = 1;
31030 	  if (ix86_builtins_isa[i].leaf_p)
31031 	    DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31032 						      NULL_TREE);
31033 	  if (ix86_builtins_isa[i].nothrow_p)
31034 	    TREE_NOTHROW (decl) = 1;
31035 	}
31036     }
31037 
31038   current_target_pragma = saved_current_target_pragma;
31039 }
31040 
31041 /* Bits for builtin_description.flag.  */
31042 
31043 /* Set when we don't support the comparison natively, and should
31044    swap_comparison in order to support it.  */
31045 #define BUILTIN_DESC_SWAP_OPERANDS	1
31046 
31047 struct builtin_description
31048 {
31049   const HOST_WIDE_INT mask;
31050   const enum insn_code icode;
31051   const char *const name;
31052   const enum ix86_builtins code;
31053   const enum rtx_code comparison;
31054   const int flag;
31055 };
31056 
31057 #define MULTI_ARG_4_DF2_DI_I	V2DF_FTYPE_V2DF_V2DF_V2DI_INT
31058 #define MULTI_ARG_4_DF2_DI_I1	V4DF_FTYPE_V4DF_V4DF_V4DI_INT
31059 #define MULTI_ARG_4_SF2_SI_I	V4SF_FTYPE_V4SF_V4SF_V4SI_INT
31060 #define MULTI_ARG_4_SF2_SI_I1	V8SF_FTYPE_V8SF_V8SF_V8SI_INT
31061 #define MULTI_ARG_3_SF		V4SF_FTYPE_V4SF_V4SF_V4SF
31062 #define MULTI_ARG_3_DF		V2DF_FTYPE_V2DF_V2DF_V2DF
31063 #define MULTI_ARG_3_SF2		V8SF_FTYPE_V8SF_V8SF_V8SF
31064 #define MULTI_ARG_3_DF2		V4DF_FTYPE_V4DF_V4DF_V4DF
31065 #define MULTI_ARG_3_DI		V2DI_FTYPE_V2DI_V2DI_V2DI
31066 #define MULTI_ARG_3_SI		V4SI_FTYPE_V4SI_V4SI_V4SI
31067 #define MULTI_ARG_3_SI_DI	V4SI_FTYPE_V4SI_V4SI_V2DI
31068 #define MULTI_ARG_3_HI		V8HI_FTYPE_V8HI_V8HI_V8HI
31069 #define MULTI_ARG_3_HI_SI	V8HI_FTYPE_V8HI_V8HI_V4SI
31070 #define MULTI_ARG_3_QI		V16QI_FTYPE_V16QI_V16QI_V16QI
31071 #define MULTI_ARG_3_DI2		V4DI_FTYPE_V4DI_V4DI_V4DI
31072 #define MULTI_ARG_3_SI2		V8SI_FTYPE_V8SI_V8SI_V8SI
31073 #define MULTI_ARG_3_HI2		V16HI_FTYPE_V16HI_V16HI_V16HI
31074 #define MULTI_ARG_3_QI2		V32QI_FTYPE_V32QI_V32QI_V32QI
31075 #define MULTI_ARG_2_SF		V4SF_FTYPE_V4SF_V4SF
31076 #define MULTI_ARG_2_DF		V2DF_FTYPE_V2DF_V2DF
31077 #define MULTI_ARG_2_DI		V2DI_FTYPE_V2DI_V2DI
31078 #define MULTI_ARG_2_SI		V4SI_FTYPE_V4SI_V4SI
31079 #define MULTI_ARG_2_HI		V8HI_FTYPE_V8HI_V8HI
31080 #define MULTI_ARG_2_QI		V16QI_FTYPE_V16QI_V16QI
31081 #define MULTI_ARG_2_DI_IMM	V2DI_FTYPE_V2DI_SI
31082 #define MULTI_ARG_2_SI_IMM	V4SI_FTYPE_V4SI_SI
31083 #define MULTI_ARG_2_HI_IMM	V8HI_FTYPE_V8HI_SI
31084 #define MULTI_ARG_2_QI_IMM	V16QI_FTYPE_V16QI_SI
31085 #define MULTI_ARG_2_DI_CMP	V2DI_FTYPE_V2DI_V2DI_CMP
31086 #define MULTI_ARG_2_SI_CMP	V4SI_FTYPE_V4SI_V4SI_CMP
31087 #define MULTI_ARG_2_HI_CMP	V8HI_FTYPE_V8HI_V8HI_CMP
31088 #define MULTI_ARG_2_QI_CMP	V16QI_FTYPE_V16QI_V16QI_CMP
31089 #define MULTI_ARG_2_SF_TF	V4SF_FTYPE_V4SF_V4SF_TF
31090 #define MULTI_ARG_2_DF_TF	V2DF_FTYPE_V2DF_V2DF_TF
31091 #define MULTI_ARG_2_DI_TF	V2DI_FTYPE_V2DI_V2DI_TF
31092 #define MULTI_ARG_2_SI_TF	V4SI_FTYPE_V4SI_V4SI_TF
31093 #define MULTI_ARG_2_HI_TF	V8HI_FTYPE_V8HI_V8HI_TF
31094 #define MULTI_ARG_2_QI_TF	V16QI_FTYPE_V16QI_V16QI_TF
31095 #define MULTI_ARG_1_SF		V4SF_FTYPE_V4SF
31096 #define MULTI_ARG_1_DF		V2DF_FTYPE_V2DF
31097 #define MULTI_ARG_1_SF2		V8SF_FTYPE_V8SF
31098 #define MULTI_ARG_1_DF2		V4DF_FTYPE_V4DF
31099 #define MULTI_ARG_1_DI		V2DI_FTYPE_V2DI
31100 #define MULTI_ARG_1_SI		V4SI_FTYPE_V4SI
31101 #define MULTI_ARG_1_HI		V8HI_FTYPE_V8HI
31102 #define MULTI_ARG_1_QI		V16QI_FTYPE_V16QI
31103 #define MULTI_ARG_1_SI_DI	V2DI_FTYPE_V4SI
31104 #define MULTI_ARG_1_HI_DI	V2DI_FTYPE_V8HI
31105 #define MULTI_ARG_1_HI_SI	V4SI_FTYPE_V8HI
31106 #define MULTI_ARG_1_QI_DI	V2DI_FTYPE_V16QI
31107 #define MULTI_ARG_1_QI_SI	V4SI_FTYPE_V16QI
31108 #define MULTI_ARG_1_QI_HI	V8HI_FTYPE_V16QI
31109 
31110 #define BDESC(mask, icode, name, code, comparison, flag) \
31111   { mask, icode, name, code, comparison, flag },
31112 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31113 static const struct builtin_description bdesc_##kind[] =		    \
31114 {									    \
31115   BDESC (mask, icode, name, code, comparison, flag)
31116 #define BDESC_END(kind, next_kind) \
31117 };
31118 
31119 #include "i386-builtin.def"
31120 
31121 #undef BDESC
31122 #undef BDESC_FIRST
31123 #undef BDESC_END
31124 
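/* Illustrative sketch (not part of the original sources): with the BDESC
   macros above, an entry in i386-builtin.def such as

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_example_insn,
            "__builtin_ia32_example", IX86_BUILTIN_EXAMPLE,
            UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)

   expands to one initializer of the enclosing bdesc_<kind>[] table:

     { OPTION_MASK_ISA_SSE2, CODE_FOR_example_insn, "__builtin_ia32_example",
       IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },

   CODE_FOR_example_insn and IX86_BUILTIN_EXAMPLE are hypothetical names
   used only for this illustration.  */
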
31125 /* TM vector builtins.  */
31126 
31127 /* Reuse the existing x86-specific `struct builtin_description' because
31128    we're lazy.  Add casts to make them fit.  */
31129 static const struct builtin_description bdesc_tm[] =
31130 {
31131   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31132   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31133   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31134   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31135   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31136   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31137   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31138 
31139   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31140   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31141   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31142   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31143   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31144   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31145   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31146 
31147   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31148   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31149   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31150   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31151   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31152   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31153   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31154 
31155   { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31156   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31157   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31158 };
31159 
31160 /* Initialize the transactional memory vector load/store builtins.  */
31161 
31162 static void
31163 ix86_init_tm_builtins (void)
31164 {
31165   enum ix86_builtin_func_type ftype;
31166   const struct builtin_description *d;
31167   size_t i;
31168   tree decl;
31169   tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31170   tree attrs_log, attrs_type_log;
31171 
31172   if (!flag_tm)
31173     return;
31174 
31175   /* If there are no builtins defined, we must be compiling in a
31176      language without trans-mem support.  */
31177   if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31178     return;
31179 
31180   /* Use whatever attributes a normal TM load has.  */
31181   decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31182   attrs_load = DECL_ATTRIBUTES (decl);
31183   attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31184   /* Use whatever attributes a normal TM store has.  */
31185   decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31186   attrs_store = DECL_ATTRIBUTES (decl);
31187   attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31188   /* Use whatever attributes a normal TM log has.  */
31189   decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31190   attrs_log = DECL_ATTRIBUTES (decl);
31191   attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31192 
31193   for (i = 0, d = bdesc_tm;
31194        i < ARRAY_SIZE (bdesc_tm);
31195        i++, d++)
31196     {
31197       if ((d->mask & ix86_isa_flags) != 0
31198 	  || (lang_hooks.builtin_function
31199 	      == lang_hooks.builtin_function_ext_scope))
31200 	{
31201 	  tree type, attrs, attrs_type;
31202 	  enum built_in_function code = (enum built_in_function) d->code;
31203 
31204 	  ftype = (enum ix86_builtin_func_type) d->flag;
31205 	  type = ix86_get_builtin_func_type (ftype);
31206 
31207 	  if (BUILTIN_TM_LOAD_P (code))
31208 	    {
31209 	      attrs = attrs_load;
31210 	      attrs_type = attrs_type_load;
31211 	    }
31212 	  else if (BUILTIN_TM_STORE_P (code))
31213 	    {
31214 	      attrs = attrs_store;
31215 	      attrs_type = attrs_type_store;
31216 	    }
31217 	  else
31218 	    {
31219 	      attrs = attrs_log;
31220 	      attrs_type = attrs_type_log;
31221 	    }
31222 	  decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31223 				       /* The builtin without the prefix for
31224 					  calling it directly.  */
31225 				       d->name + strlen ("__builtin_"),
31226 				       attrs);
31227 	  /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31228 	     set the TYPE_ATTRIBUTES.  */
31229 	  decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31230 
31231 	  set_builtin_decl (code, decl, false);
31232 	}
31233     }
31234 }
31235 
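/* Illustrative note (a sketch, not taken from the sources): for the
   bdesc_tm entry "__builtin__ITM_WM128" above, the expression
   d->name + strlen ("__builtin_") passed to add_builtin_function is the
   library name of the builtin:

     const char *name = "__builtin__ITM_WM128";
     const char *libname = name + strlen ("__builtin_");   // "_ITM_WM128"

   so calls to the builtin that are not expanded inline resolve to the
   _ITM_WM128 entry point of the transactional memory runtime.  */
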
31236 /* Macros for verification of enum ix86_builtins order.  */
31237 #define BDESC_VERIFY(x, y, z) \
31238   gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31239 #define BDESC_VERIFYS(x, y, z) \
31240   STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31241 
31242 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31243 	       IX86_BUILTIN__BDESC_COMI_LAST, 1);
31244 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31245 	       IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31246 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31247 	       IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31248 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31249 	       IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31250 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31251 	       IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31252 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31253 	       IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31254 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31255 	       IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31256 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31257 	       IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
31258 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31259 	       IX86_BUILTIN__BDESC_MPX_LAST, 1);
31260 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31261 	       IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31262 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
31263 	       IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31264 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31265 	       IX86_BUILTIN__BDESC_CET_LAST, 1);
31266 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31267 	       IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
31268 
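/* Illustrative sketch of the assumed enum layout (for explanation only):
   the STATIC_ASSERTs above encode that the IX86_BUILTIN__BDESC_* ranges
   sit back to back in enum ix86_builtins, roughly

     IX86_BUILTIN__BDESC_COMI_LAST,
     IX86_BUILTIN__BDESC_PCMPESTR_FIRST,    // == COMI_LAST + 1
     ...
     IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
     IX86_BUILTIN_MAX,                      // == CET_NORMAL_LAST + 1

   so that the "FIRST + i" indexing used by the loops below stays in sync
   with the tables generated from i386-builtin.def.  */
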
31269 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31270    in the current target ISA, to allow the user to compile particular modules
31271    with target-specific options that differ from the command-line
31272    options.  */
31273 static void
31274 ix86_init_mmx_sse_builtins (void)
31275 {
31276   const struct builtin_description * d;
31277   enum ix86_builtin_func_type ftype;
31278   size_t i;
31279 
31280   /* Add all special builtins with variable number of operands.  */
31281   for (i = 0, d = bdesc_special_args;
31282        i < ARRAY_SIZE (bdesc_special_args);
31283        i++, d++)
31284     {
31285       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31286       if (d->name == 0)
31287 	continue;
31288 
31289       ftype = (enum ix86_builtin_func_type) d->flag;
31290       def_builtin (d->mask, d->name, ftype, d->code);
31291     }
31292   BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31293 		 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31294 		 ARRAY_SIZE (bdesc_special_args) - 1);
31295 
31296   /* Likewise for the second table of special builtins.  */
31297   for (i = 0, d = bdesc_special_args2;
31298        i < ARRAY_SIZE (bdesc_special_args2);
31299        i++, d++)
31300     {
31301       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
31302       if (d->name == 0)
31303 	continue;
31304 
31305       ftype = (enum ix86_builtin_func_type) d->flag;
31306       def_builtin2 (d->mask, d->name, ftype, d->code);
31307     }
31308   BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
31309 		 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31310 		 ARRAY_SIZE (bdesc_special_args2) - 1);
31311 
31312   /* Add all builtins with variable number of operands.  */
31313   for (i = 0, d = bdesc_args;
31314        i < ARRAY_SIZE (bdesc_args);
31315        i++, d++)
31316     {
31317       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31318       if (d->name == 0)
31319 	continue;
31320 
31321       ftype = (enum ix86_builtin_func_type) d->flag;
31322       def_builtin_const (d->mask, d->name, ftype, d->code);
31323     }
31324   BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31325 		 IX86_BUILTIN__BDESC_ARGS_FIRST,
31326 		 ARRAY_SIZE (bdesc_args) - 1);
31327 
31328   /* Likewise for the second table of builtins.  */
31329   for (i = 0, d = bdesc_args2;
31330        i < ARRAY_SIZE (bdesc_args2);
31331        i++, d++)
31332     {
31333       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
31334       if (d->name == 0)
31335 	continue;
31336 
31337       ftype = (enum ix86_builtin_func_type) d->flag;
31338       def_builtin_const2 (d->mask, d->name, ftype, d->code);
31339     }
31340   BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
31341 		 IX86_BUILTIN__BDESC_ARGS2_FIRST,
31342 		 ARRAY_SIZE (bdesc_args2) - 1);
31343 
31344   /* Add all builtins with rounding.  */
31345   for (i = 0, d = bdesc_round_args;
31346        i < ARRAY_SIZE (bdesc_round_args);
31347        i++, d++)
31348     {
31349       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31350       if (d->name == 0)
31351 	continue;
31352 
31353       ftype = (enum ix86_builtin_func_type) d->flag;
31354       def_builtin_const (d->mask, d->name, ftype, d->code);
31355     }
31356   BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31357 		 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31358 		 ARRAY_SIZE (bdesc_round_args) - 1);
31359 
31360   /* pcmpestr[im] insns.  */
31361   for (i = 0, d = bdesc_pcmpestr;
31362        i < ARRAY_SIZE (bdesc_pcmpestr);
31363        i++, d++)
31364     {
31365       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31366       if (d->code == IX86_BUILTIN_PCMPESTRM128)
31367 	ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31368       else
31369 	ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31370       def_builtin_const (d->mask, d->name, ftype, d->code);
31371     }
31372   BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31373 		 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31374 		 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31375 
31376   /* pcmpistr[im] insns.  */
31377   for (i = 0, d = bdesc_pcmpistr;
31378        i < ARRAY_SIZE (bdesc_pcmpistr);
31379        i++, d++)
31380     {
31381       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31382       if (d->code == IX86_BUILTIN_PCMPISTRM128)
31383 	ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31384       else
31385 	ftype = INT_FTYPE_V16QI_V16QI_INT;
31386       def_builtin_const (d->mask, d->name, ftype, d->code);
31387     }
31388   BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31389 		 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31390 		 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31391 
31392   /* comi/ucomi insns.  */
31393   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31394     {
31395       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31396       if (d->mask == OPTION_MASK_ISA_SSE2)
31397 	ftype = INT_FTYPE_V2DF_V2DF;
31398       else
31399 	ftype = INT_FTYPE_V4SF_V4SF;
31400       def_builtin_const (d->mask, d->name, ftype, d->code);
31401     }
31402   BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31403 		 IX86_BUILTIN__BDESC_COMI_FIRST,
31404 		 ARRAY_SIZE (bdesc_comi) - 1);
31405 
31406   /* SSE */
31407   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31408 	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31409   def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31410 		    UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31411 
31412   /* SSE or 3DNow!A */
31413   def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31414 	       /* As it uses V4HImode, we have to require -mmmx too.  */
31415 	       | OPTION_MASK_ISA_MMX,
31416 	       "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31417 	       IX86_BUILTIN_MASKMOVQ);
31418 
31419   /* SSE2 */
31420   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31421 	       VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31422 
31423   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31424 	       VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31425   x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31426 			    VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31427 
31428   /* SSE3.  */
31429   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31430 	       VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31431   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31432 	       VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31433 
31434   /* AES */
31435   def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31436 		     "__builtin_ia32_aesenc128",
31437 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31438   def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31439 		     "__builtin_ia32_aesenclast128",
31440 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31441   def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31442 		     "__builtin_ia32_aesdec128",
31443 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31444   def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31445 		     "__builtin_ia32_aesdeclast128",
31446 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31447   def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31448 		     "__builtin_ia32_aesimc128",
31449 		     V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31450   def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31451 		     "__builtin_ia32_aeskeygenassist128",
31452 		     V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31453 
31454   /* PCLMUL */
31455   def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2,
31456 		     "__builtin_ia32_pclmulqdq128",
31457 		     V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31458 
31459   /* RDRND */
31460   def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31461 	       INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31462   def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31463 	       INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31464   def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31465 	       "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31466 	       IX86_BUILTIN_RDRAND64_STEP);
31467 
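  /* A minimal usage sketch (an assumption for illustration; user code
     normally reaches this through _rdrand32_step in <immintrin.h>):

       unsigned int val;
       if (__builtin_ia32_rdrand32_step (&val))
         consume (val);   // carry set: VAL holds a random value
       else
         retry ();        // carry clear: no entropy available, retry

     The builtin returns the carry flag produced by RDRAND; consume and
     retry are hypothetical helpers.  */
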
31468   /* AVX2 */
31469   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31470 		    V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31471 		    IX86_BUILTIN_GATHERSIV2DF);
31472 
31473   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31474 		    V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31475 		    IX86_BUILTIN_GATHERSIV4DF);
31476 
31477   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31478 		    V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31479 		    IX86_BUILTIN_GATHERDIV2DF);
31480 
31481   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31482 		    V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31483 		    IX86_BUILTIN_GATHERDIV4DF);
31484 
31485   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31486 		    V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31487 		    IX86_BUILTIN_GATHERSIV4SF);
31488 
31489   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31490 		    V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31491 		    IX86_BUILTIN_GATHERSIV8SF);
31492 
31493   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31494 		    V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31495 		    IX86_BUILTIN_GATHERDIV4SF);
31496 
31497   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31498 		    V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31499 		    IX86_BUILTIN_GATHERDIV8SF);
31500 
31501   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31502 		    V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31503 		    IX86_BUILTIN_GATHERSIV2DI);
31504 
31505   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31506 		    V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31507 		    IX86_BUILTIN_GATHERSIV4DI);
31508 
31509   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31510 		    V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31511 		    IX86_BUILTIN_GATHERDIV2DI);
31512 
31513   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31514 		    V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31515 		    IX86_BUILTIN_GATHERDIV4DI);
31516 
31517   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31518 		    V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31519 		    IX86_BUILTIN_GATHERSIV4SI);
31520 
31521   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31522 		    V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31523 		    IX86_BUILTIN_GATHERSIV8SI);
31524 
31525   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31526 		    V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31527 		    IX86_BUILTIN_GATHERDIV4SI);
31528 
31529   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31530 		    V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31531 		    IX86_BUILTIN_GATHERDIV8SI);
31532 
31533   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31534 		    V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31535 		    IX86_BUILTIN_GATHERALTSIV4DF);
31536 
31537   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31538 		    V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31539 		    IX86_BUILTIN_GATHERALTDIV8SF);
31540 
31541   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31542 		    V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31543 		    IX86_BUILTIN_GATHERALTSIV4DI);
31544 
31545   def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31546 		    V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31547 		    IX86_BUILTIN_GATHERALTDIV8SI);
31548 
31549   /* AVX512F */
31550   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31551 		    V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31552 		    IX86_BUILTIN_GATHER3SIV16SF);
31553 
31554   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31555 		    V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31556 		    IX86_BUILTIN_GATHER3SIV8DF);
31557 
31558   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31559 		    V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31560 		    IX86_BUILTIN_GATHER3DIV16SF);
31561 
31562   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31563 		    V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31564 		    IX86_BUILTIN_GATHER3DIV8DF);
31565 
31566   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31567 		    V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31568 		    IX86_BUILTIN_GATHER3SIV16SI);
31569 
31570   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31571 		    V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31572 		    IX86_BUILTIN_GATHER3SIV8DI);
31573 
31574   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31575 		    V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31576 		    IX86_BUILTIN_GATHER3DIV16SI);
31577 
31578   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31579 		    V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31580 		    IX86_BUILTIN_GATHER3DIV8DI);
31581 
31582   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31583 		    V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31584 		    IX86_BUILTIN_GATHER3ALTSIV8DF);
31585 
31586   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31587 		    V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31588 		    IX86_BUILTIN_GATHER3ALTDIV16SF);
31589 
31590   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31591 		    V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31592 		    IX86_BUILTIN_GATHER3ALTSIV8DI);
31593 
31594   def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31595 		    V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31596 		    IX86_BUILTIN_GATHER3ALTDIV16SI);
31597 
31598   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31599 	       VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31600 	       IX86_BUILTIN_SCATTERSIV16SF);
31601 
31602   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31603 	       VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31604 	       IX86_BUILTIN_SCATTERSIV8DF);
31605 
31606   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31607 	       VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31608 	       IX86_BUILTIN_SCATTERDIV16SF);
31609 
31610   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31611 	       VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31612 	       IX86_BUILTIN_SCATTERDIV8DF);
31613 
31614   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31615 	       VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31616 	       IX86_BUILTIN_SCATTERSIV16SI);
31617 
31618   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31619 	       VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31620 	       IX86_BUILTIN_SCATTERSIV8DI);
31621 
31622   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31623 	       VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31624 	       IX86_BUILTIN_SCATTERDIV16SI);
31625 
31626   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31627 	       VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31628 	       IX86_BUILTIN_SCATTERDIV8DI);
31629 
31630   /* AVX512VL */
31631   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31632 		    V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31633 		    IX86_BUILTIN_GATHER3SIV2DF);
31634 
31635   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31636 		    V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31637 		    IX86_BUILTIN_GATHER3SIV4DF);
31638 
31639   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31640 		    V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31641 		    IX86_BUILTIN_GATHER3DIV2DF);
31642 
31643   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31644 		    V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31645 		    IX86_BUILTIN_GATHER3DIV4DF);
31646 
31647   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31648 		    V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31649 		    IX86_BUILTIN_GATHER3SIV4SF);
31650 
31651   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31652 		    V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31653 		    IX86_BUILTIN_GATHER3SIV8SF);
31654 
31655   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31656 		    V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31657 		    IX86_BUILTIN_GATHER3DIV4SF);
31658 
31659   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31660 		    V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31661 		    IX86_BUILTIN_GATHER3DIV8SF);
31662 
31663   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31664 		    V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31665 		    IX86_BUILTIN_GATHER3SIV2DI);
31666 
31667   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31668 		    V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31669 		    IX86_BUILTIN_GATHER3SIV4DI);
31670 
31671   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31672 		    V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31673 		    IX86_BUILTIN_GATHER3DIV2DI);
31674 
31675   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31676 		    V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31677 		    IX86_BUILTIN_GATHER3DIV4DI);
31678 
31679   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31680 		    V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31681 		    IX86_BUILTIN_GATHER3SIV4SI);
31682 
31683   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31684 		    V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31685 		    IX86_BUILTIN_GATHER3SIV8SI);
31686 
31687   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31688 		    V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31689 		    IX86_BUILTIN_GATHER3DIV4SI);
31690 
31691   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31692 		    V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31693 		    IX86_BUILTIN_GATHER3DIV8SI);
31694 
31695   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31696 		    V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31697 		    IX86_BUILTIN_GATHER3ALTSIV4DF);
31698 
31699   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31700 		    V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31701 		    IX86_BUILTIN_GATHER3ALTDIV8SF);
31702 
31703   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31704 		    V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31705 		    IX86_BUILTIN_GATHER3ALTSIV4DI);
31706 
31707   def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31708 		    V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31709 		    IX86_BUILTIN_GATHER3ALTDIV8SI);
31710 
31711   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31712 	       VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31713 	       IX86_BUILTIN_SCATTERSIV8SF);
31714 
31715   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31716 	       VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31717 	       IX86_BUILTIN_SCATTERSIV4SF);
31718 
31719   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31720 	       VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31721 	       IX86_BUILTIN_SCATTERSIV4DF);
31722 
31723   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31724 	       VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31725 	       IX86_BUILTIN_SCATTERSIV2DF);
31726 
31727   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31728 	       VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31729 	       IX86_BUILTIN_SCATTERDIV8SF);
31730 
31731   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31732 	       VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31733 	       IX86_BUILTIN_SCATTERDIV4SF);
31734 
31735   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31736 	       VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31737 	       IX86_BUILTIN_SCATTERDIV4DF);
31738 
31739   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31740 	       VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31741 	       IX86_BUILTIN_SCATTERDIV2DF);
31742 
31743   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31744 	       VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31745 	       IX86_BUILTIN_SCATTERSIV8SI);
31746 
31747   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31748 	       VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31749 	       IX86_BUILTIN_SCATTERSIV4SI);
31750 
31751   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31752 	       VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31753 	       IX86_BUILTIN_SCATTERSIV4DI);
31754 
31755   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31756 	       VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31757 	       IX86_BUILTIN_SCATTERSIV2DI);
31758 
31759   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31760 	       VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31761 	       IX86_BUILTIN_SCATTERDIV8SI);
31762 
31763   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31764 	       VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31765 	       IX86_BUILTIN_SCATTERDIV4SI);
31766 
31767   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31768 	       VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31769 	       IX86_BUILTIN_SCATTERDIV4DI);
31770 
31771   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31772 	       VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31773 	       IX86_BUILTIN_SCATTERDIV2DI);
31774   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31775 	       VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31776 	       IX86_BUILTIN_SCATTERALTSIV8DF);
31777 
31778   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31779 	       VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31780 	       IX86_BUILTIN_SCATTERALTDIV16SF);
31781 
31782   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31783 	       VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31784 	       IX86_BUILTIN_SCATTERALTSIV8DI);
31785 
31786   def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31787 	       VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31788 	       IX86_BUILTIN_SCATTERALTDIV16SI);
31789 
31790   /* AVX512PF */
31791   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31792 	       VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31793 	       IX86_BUILTIN_GATHERPFDPD);
31794   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31795 	       VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31796 	       IX86_BUILTIN_GATHERPFDPS);
31797   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31798 	       VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31799 	       IX86_BUILTIN_GATHERPFQPD);
31800   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31801 	       VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31802 	       IX86_BUILTIN_GATHERPFQPS);
31803   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31804 	       VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31805 	       IX86_BUILTIN_SCATTERPFDPD);
31806   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31807 	       VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31808 	       IX86_BUILTIN_SCATTERPFDPS);
31809   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31810 	       VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31811 	       IX86_BUILTIN_SCATTERPFQPD);
31812   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31813 	       VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31814 	       IX86_BUILTIN_SCATTERPFQPS);
31815 
31816   /* SHA */
31817   def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31818 		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31819   def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31820 		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31821   def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31822 		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31823   def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31824 		     V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31825   def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31826 		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31827   def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31828 		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31829   def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31830 		     V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31831 
31832   /* RTM.  */
31833   def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31834 	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31835 
31836   /* MMX access to the vec_init patterns.  */
31837   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31838 		     V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31839 
31840   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31841 		     V4HI_FTYPE_HI_HI_HI_HI,
31842 		     IX86_BUILTIN_VEC_INIT_V4HI);
31843 
31844   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31845 		     V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31846 		     IX86_BUILTIN_VEC_INIT_V8QI);
31847 
31848   /* Access to the vec_extract patterns.  */
31849   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31850 		     DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31851   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31852 		     DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31853   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31854 		     FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31855   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31856 		     SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31857   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31858 		     HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31859 
31860   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31861 		     /* As it uses V4HImode, we have to require -mmmx too.  */
31862 		     | OPTION_MASK_ISA_MMX,
31863 		     "__builtin_ia32_vec_ext_v4hi",
31864 		     HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31865 
31866   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31867 		     SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31868 
31869   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31870 		     QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31871 
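  /* A minimal usage sketch (an assumption for illustration; the MMX
     intrinsics in <mmintrin.h> are the usual way to reach these):

       typedef int v2si __attribute__ ((vector_size (8)));

       v2si v = __builtin_ia32_vec_init_v2si (1, 2);
       int e = __builtin_ia32_vec_ext_v2si (v, 0);   // extract element 0

     Both builtins require -mmmx; the element index is expected to be a
     compile-time constant.  */
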
31872   /* Access to the vec_set patterns.  */
31873   def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31874 		     "__builtin_ia32_vec_set_v2di",
31875 		     V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31876 
31877   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31878 		     V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31879 
31880   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31881 		     V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31882 
31883   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31884 		     V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31885 
31886   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31887 		     /* As it uses V4HImode, we have to require -mmmx too.  */
31888 		     | OPTION_MASK_ISA_MMX,
31889 		     "__builtin_ia32_vec_set_v4hi",
31890 		     V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31891 
31892   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31893 		     V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31894 
31895   /* RDSEED */
31896   def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31897 	       INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31898   def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31899 	       INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31900   def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31901 	       "__builtin_ia32_rdseed_di_step",
31902 	       INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31903 
31904   /* ADCX */
31905   def_builtin (0, "__builtin_ia32_addcarryx_u32",
31906 	       UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31907   def_builtin (OPTION_MASK_ISA_64BIT,
31908 	       "__builtin_ia32_addcarryx_u64",
31909 	       UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31910 	       IX86_BUILTIN_ADDCARRYX64);
31911 
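  /* A minimal usage sketch (an assumption for illustration; <immintrin.h>
     wraps this as _addcarryx_u32).  Chaining the carry implements a
     64-bit add from two 32-bit halves:

       unsigned int lo, hi;
       unsigned char c;

       c = __builtin_ia32_addcarryx_u32 (0, a0, b0, &lo);
       c = __builtin_ia32_addcarryx_u32 (c, a1, b1, &hi);
       // hi:lo is the sum of a1:a0 and b1:b0, c the final carry-out.

     a0, a1, b0 and b1 are hypothetical 32-bit operands.  */
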
31912   /* SBB */
31913   def_builtin (0, "__builtin_ia32_sbb_u32",
31914 	       UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31915   def_builtin (OPTION_MASK_ISA_64BIT,
31916 	       "__builtin_ia32_sbb_u64",
31917 	       UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31918 	       IX86_BUILTIN_SBB64);
31919 
31920   /* Read/write FLAGS.  */
31921   if (TARGET_64BIT)
31922     {
31923       def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31924 		   UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31925       def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31926 		   VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31927     }
31928   else
31929     {
31930       def_builtin (0, "__builtin_ia32_readeflags_u32",
31931 		   UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31932       def_builtin (0, "__builtin_ia32_writeeflags_u32",
31933 		   VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31934     }
31935 
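  /* A minimal usage sketch (an assumption for illustration; <x86intrin.h>
     exposes these as __readeflags and __writeeflags), shown for the
     64-bit variants:

       unsigned long long flags = __builtin_ia32_readeflags_u64 ();
       flags |= 0x400;                 // set the direction flag (DF)
       __builtin_ia32_writeeflags_u64 (flags);

     The 0x400 mask for DF is given here only as an example.  */
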
31936   /* CLFLUSHOPT.  */
31937   def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31938 	       VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31939 
31940   /* CLWB.  */
31941   def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31942 	       VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31943 
31944   /* MONITORX and MWAITX.  */
31945   def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31946 		VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31947   def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31948 		VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31949 
31950   /* CLZERO.  */
31951   def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31952 		VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31953 
31954   /* Add FMA4 multi-argument instructions.  */
31955   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31956     {
31957       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31958       if (d->name == 0)
31959 	continue;
31960 
31961       ftype = (enum ix86_builtin_func_type) d->flag;
31962       def_builtin_const (d->mask, d->name, ftype, d->code);
31963     }
31964   BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31965 		 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31966 		 ARRAY_SIZE (bdesc_multi_arg) - 1);
31967 
31968   /* Add CET intrinsics.  */
31969   for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
31970     {
31971       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
31972       if (d->name == 0)
31973 	continue;
31974 
31975       ftype = (enum ix86_builtin_func_type) d->flag;
31976       def_builtin (d->mask, d->name, ftype, d->code);
31977     }
31978   BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
31979 		 IX86_BUILTIN__BDESC_CET_FIRST,
31980 		 ARRAY_SIZE (bdesc_cet) - 1);
31981 
31982   for (i = 0, d = bdesc_cet_rdssp;
31983        i < ARRAY_SIZE (bdesc_cet_rdssp);
31984        i++, d++)
31985     {
31986       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
31987       if (d->name == 0)
31988 	continue;
31989 
31990       ftype = (enum ix86_builtin_func_type) d->flag;
31991       def_builtin (d->mask, d->name, ftype, d->code);
31992     }
31993   BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
31994 		 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31995 		 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
31996 }
31997 
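/* Set up all the MPX builtins from bdesc_mpx and bdesc_mpx_const.  */
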
31998 static void
31999 ix86_init_mpx_builtins ()
32000 {
32001   const struct builtin_description * d;
32002   enum ix86_builtin_func_type ftype;
32003   tree decl;
32004   size_t i;
32005 
32006   for (i = 0, d = bdesc_mpx;
32007        i < ARRAY_SIZE (bdesc_mpx);
32008        i++, d++)
32009     {
32010       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
32011       if (d->name == 0)
32012 	continue;
32013 
32014       ftype = (enum ix86_builtin_func_type) d->flag;
32015       decl = def_builtin2 (d->mask, d->name, ftype, d->code);
32016 
32017       /* Without the leaf and nothrow flags on MPX builtins,
32018 	 abnormal edges may follow calls to them when setjmp
32019 	 is present in the function.  Since we may have a lot
32020 	 of MPX builtin calls, this causes lots of useless
32021 	 edges and enormous PHI nodes.  To avoid this we mark
32022 	 MPX builtins as leaf and nothrow.  */
32023       if (decl)
32024 	{
32025 	  DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32026 						    NULL_TREE);
32027 	  TREE_NOTHROW (decl) = 1;
32028 	}
32029       else
32030 	{
32031 	  ix86_builtins_isa[(int)d->code].leaf_p = true;
32032 	  ix86_builtins_isa[(int)d->code].nothrow_p = true;
32033 	}
32034     }
32035   BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
32036 		 IX86_BUILTIN__BDESC_MPX_FIRST,
32037 		 ARRAY_SIZE (bdesc_mpx) - 1);
32038 
32039   for (i = 0, d = bdesc_mpx_const;
32040        i < ARRAY_SIZE (bdesc_mpx_const);
32041        i++, d++)
32042     {
32043       BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
32044       if (d->name == 0)
32045 	continue;
32046 
32047       ftype = (enum ix86_builtin_func_type) d->flag;
32048       decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
32049 
32050       if (decl)
32051 	{
32052 	  DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32053 						    NULL_TREE);
32054 	  TREE_NOTHROW (decl) = 1;
32055 	}
32056       else
32057 	{
32058 	  ix86_builtins_isa[(int)d->code].leaf_p = true;
32059 	  ix86_builtins_isa[(int)d->code].nothrow_p = true;
32060 	}
32061     }
32062   BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
32063 		 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32064 		 ARRAY_SIZE (bdesc_mpx_const) - 1);
32065 }
32066 #undef BDESC_VERIFY
32067 #undef BDESC_VERIFYS
32068 
32069 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
32070    to return a pointer to VERSION_DECL if the outcome of the expression
32071    formed by PREDICATE_CHAIN is true.  This function will be called during
32072    version dispatch to decide which function version to execute.  It returns
32073    the basic block at the end, to which more conditions can be added.  */
32074 
32075 static basic_block
32076 add_condition_to_bb (tree function_decl, tree version_decl,
32077 		     tree predicate_chain, basic_block new_bb)
32078 {
32079   gimple *return_stmt;
32080   tree convert_expr, result_var;
32081   gimple *convert_stmt;
32082   gimple *call_cond_stmt;
32083   gimple *if_else_stmt;
32084 
32085   basic_block bb1, bb2, bb3;
32086   edge e12, e23;
32087 
32088   tree cond_var, and_expr_var = NULL_TREE;
32089   gimple_seq gseq;
32090 
32091   tree predicate_decl, predicate_arg;
32092 
32093   push_cfun (DECL_STRUCT_FUNCTION (function_decl));
32094 
32095   gcc_assert (new_bb != NULL);
32096   gseq = bb_seq (new_bb);
32097 
32098 
32099   convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
32100 	     		 build_fold_addr_expr (version_decl));
32101   result_var = create_tmp_var (ptr_type_node);
32102   convert_stmt = gimple_build_assign (result_var, convert_expr);
32103   return_stmt = gimple_build_return (result_var);
32104 
32105   if (predicate_chain == NULL_TREE)
32106     {
32107       gimple_seq_add_stmt (&gseq, convert_stmt);
32108       gimple_seq_add_stmt (&gseq, return_stmt);
32109       set_bb_seq (new_bb, gseq);
32110       gimple_set_bb (convert_stmt, new_bb);
32111       gimple_set_bb (return_stmt, new_bb);
32112       pop_cfun ();
32113       return new_bb;
32114     }
32115 
32116   while (predicate_chain != NULL)
32117     {
32118       cond_var = create_tmp_var (integer_type_node);
32119       predicate_decl = TREE_PURPOSE (predicate_chain);
32120       predicate_arg = TREE_VALUE (predicate_chain);
32121       call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
32122       gimple_call_set_lhs (call_cond_stmt, cond_var);
32123 
32124       gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
32125       gimple_set_bb (call_cond_stmt, new_bb);
32126       gimple_seq_add_stmt (&gseq, call_cond_stmt);
32127 
32128       predicate_chain = TREE_CHAIN (predicate_chain);
32129 
32130       if (and_expr_var == NULL)
32131         and_expr_var = cond_var;
32132       else
32133 	{
32134 	  gimple *assign_stmt;
32135 	  /* Use MIN_EXPR to check whether any of the integers is zero:
32136 	     and_expr_var = min_expr <cond_var, and_expr_var>.  */
32137 	  assign_stmt = gimple_build_assign (and_expr_var,
32138 			  build2 (MIN_EXPR, integer_type_node,
32139 				  cond_var, and_expr_var));
32140 
32141 	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
32142 	  gimple_set_bb (assign_stmt, new_bb);
32143 	  gimple_seq_add_stmt (&gseq, assign_stmt);
32144 	}
32145     }
32146 
32147   if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
32148 	  		            integer_zero_node,
32149 				    NULL_TREE, NULL_TREE);
32150   gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
32151   gimple_set_bb (if_else_stmt, new_bb);
32152   gimple_seq_add_stmt (&gseq, if_else_stmt);
32153 
32154   gimple_seq_add_stmt (&gseq, convert_stmt);
32155   gimple_seq_add_stmt (&gseq, return_stmt);
32156   set_bb_seq (new_bb, gseq);
32157 
32158   bb1 = new_bb;
32159   e12 = split_block (bb1, if_else_stmt);
32160   bb2 = e12->dest;
32161   e12->flags &= ~EDGE_FALLTHRU;
32162   e12->flags |= EDGE_TRUE_VALUE;
32163 
32164   e23 = split_block (bb2, return_stmt);
32165 
32166   gimple_set_bb (convert_stmt, bb2);
32167   gimple_set_bb (return_stmt, bb2);
32168 
32169   bb3 = e23->dest;
32170   make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32171 
32172   remove_edge (e23);
32173   make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32174 
32175   pop_cfun ();
32176 
32177   return bb3;
32178 }
32179 
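/* Illustrative sketch (not from the sources) of the code the function
   above builds for one version, in C-like pseudocode:

     cond_1 = __builtin_cpu_supports ("feat1");
     cond_2 = __builtin_cpu_supports ("feat2");
     and_expr = MIN (cond_1, cond_2);    // zero iff any predicate failed
     if (and_expr > 0)
       return (void *) &version_decl;
     // otherwise control falls through to BB3, where more conditions go

   Each predicate call comes from PREDICATE_CHAIN, which pairs a predicate
   builtin (TREE_PURPOSE) with its string argument (TREE_VALUE);
   "feat1" and "feat2" are hypothetical feature names.  */
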
32180 /* This parses the attribute arguments to target in DECL and determines
32181    the right builtin to use to match the platform specification.
32182    It returns the priority value for this version decl.  If PREDICATE_LIST
32183    is not NULL, it stores the list of cpu features that need to be checked
32184    before dispatching this function.  */
32185 
32186 static unsigned int
32187 get_builtin_code_for_version (tree decl, tree *predicate_list)
32188 {
32189   tree attrs;
32190   struct cl_target_option cur_target;
32191   tree target_node;
32192   struct cl_target_option *new_target;
32193   const char *arg_str = NULL;
32194   const char *attrs_str = NULL;
32195   char *tok_str = NULL;
32196   char *token;
32197 
32198   /* Priority of i386 features; a greater value means higher priority.  This is
32199      used to decide the order in which function dispatch must happen.  For
32200      instance, a version specialized for SSE4.2 should be checked for dispatch
32201      before a version for SSE3, as SSE4.2 implies SSE3.  */
32202   enum feature_priority
32203   {
32204     P_ZERO = 0,
32205     P_MMX,
32206     P_SSE,
32207     P_SSE2,
32208     P_SSE3,
32209     P_SSSE3,
32210     P_PROC_SSSE3,
32211     P_SSE4_A,
32212     P_PROC_SSE4_A,
32213     P_SSE4_1,
32214     P_SSE4_2,
32215     P_PROC_SSE4_2,
32216     P_POPCNT,
32217     P_AES,
32218     P_PCLMUL,
32219     P_AVX,
32220     P_PROC_AVX,
32221     P_BMI,
32222     P_PROC_BMI,
32223     P_FMA4,
32224     P_XOP,
32225     P_PROC_XOP,
32226     P_FMA,
32227     P_PROC_FMA,
32228     P_BMI2,
32229     P_AVX2,
32230     P_PROC_AVX2,
32231     P_AVX512F,
32232     P_PROC_AVX512F
32233   };
32234 
32235   enum feature_priority priority = P_ZERO;
32236 
32237   /* These are the target attribute strings for which a dispatcher is
32238      available, from fold_builtin_cpu.  */
32239 
32240   static struct _feature_list
32241     {
32242       const char *const name;
32243       const enum feature_priority priority;
32244     }
32245   const feature_list[] =
32246     {
32247       {"mmx", P_MMX},
32248       {"sse", P_SSE},
32249       {"sse2", P_SSE2},
32250       {"sse3", P_SSE3},
32251       {"sse4a", P_SSE4_A},
32252       {"ssse3", P_SSSE3},
32253       {"sse4.1", P_SSE4_1},
32254       {"sse4.2", P_SSE4_2},
32255       {"popcnt", P_POPCNT},
32256       {"aes", P_AES},
32257       {"pclmul", P_PCLMUL},
32258       {"avx", P_AVX},
32259       {"bmi", P_BMI},
32260       {"fma4", P_FMA4},
32261       {"xop", P_XOP},
32262       {"fma", P_FMA},
32263       {"bmi2", P_BMI2},
32264       {"avx2", P_AVX2},
32265       {"avx512f", P_AVX512F}
32266     };
32267 
32268 
32269   static unsigned int NUM_FEATURES
32270     = sizeof (feature_list) / sizeof (struct _feature_list);
32271 
32272   unsigned int i;
32273 
32274   tree predicate_chain = NULL_TREE;
32275   tree predicate_decl, predicate_arg;
32276 
32277   attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32278   gcc_assert (attrs != NULL);
32279 
32280   attrs = TREE_VALUE (TREE_VALUE (attrs));
32281 
32282   gcc_assert (TREE_CODE (attrs) == STRING_CST);
32283   attrs_str = TREE_STRING_POINTER (attrs);
32284 
32285   /* Return priority zero for default function.  */
32286   if (strcmp (attrs_str, "default") == 0)
32287     return 0;
32288 
32289   /* Handle arch= if specified.  For priority, set it to be 1 more than
32290      the best instruction set the processor can handle.  For instance, if
32291      there is a version for atom and a version for ssse3 (the highest ISA
32292      priority for atom), the atom version must be checked for dispatch
32293      before the ssse3 version. */
32294   if (strstr (attrs_str, "arch=") != NULL)
32295     {
32296       cl_target_option_save (&cur_target, &global_options);
32297       target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32298 						      &global_options_set);
32299 
32300       gcc_assert (target_node);
32301       if (target_node == error_mark_node)
32302 	return 0;
32303       new_target = TREE_TARGET_OPTION (target_node);
32304       gcc_assert (new_target);
32305 
32306       if (new_target->arch_specified && new_target->arch > 0)
32307 	{
32308 	  switch (new_target->arch)
32309 	    {
32310 	    case PROCESSOR_CORE2:
32311 	      arg_str = "core2";
32312 	      priority = P_PROC_SSSE3;
32313 	      break;
32314 	    case PROCESSOR_NEHALEM:
32315 	      if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32316 		{
32317 		  arg_str = "westmere";
32318 		  priority = P_AES;
32319 		}
32320 	      else
32321 		{
32322 		  /* We translate "arch=corei7" and "arch=nehalem" to
32323 		     "corei7" so that it will be mapped to M_INTEL_COREI7
32324 		     as cpu type to cover all M_INTEL_COREI7_XXXs.  */
32325 		  arg_str = "corei7";
32326 		  priority = P_PROC_SSE4_2;
32327 		}
32328 	      break;
32329 	    case PROCESSOR_SANDYBRIDGE:
32330 	      if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32331 		arg_str = "ivybridge";
32332 	      else
32333 		arg_str = "sandybridge";
32334 	      priority = P_PROC_AVX;
32335 	      break;
32336 	    case PROCESSOR_HASWELL:
32337 	      if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32338 		arg_str = "broadwell";
32339 	      else
32340 		arg_str = "haswell";
32341 	      priority = P_PROC_AVX2;
32342 	      break;
32343 	    case PROCESSOR_SKYLAKE:
32344 	      arg_str = "skylake";
32345 	      priority = P_PROC_AVX2;
32346 	      break;
32347 	    case PROCESSOR_SKYLAKE_AVX512:
32348 	      arg_str = "skylake-avx512";
32349 	      priority = P_PROC_AVX512F;
32350 	      break;
32351 	    case PROCESSOR_CANNONLAKE:
32352 	      arg_str = "cannonlake";
32353 	      priority = P_PROC_AVX512F;
32354 	      break;
32355 	    case PROCESSOR_ICELAKE_CLIENT:
32356 	      arg_str = "icelake-client";
32357 	      priority = P_PROC_AVX512F;
32358 	      break;
32359 	    case PROCESSOR_ICELAKE_SERVER:
32360 	      arg_str = "icelake-server";
32361 	      priority = P_PROC_AVX512F;
32362 	      break;
32363 	    case PROCESSOR_BONNELL:
32364 	      arg_str = "bonnell";
32365 	      priority = P_PROC_SSSE3;
32366 	      break;
32367 	    case PROCESSOR_KNL:
32368 	      arg_str = "knl";
32369 	      priority = P_PROC_AVX512F;
32370 	      break;
32371 	    case PROCESSOR_KNM:
32372 	      arg_str = "knm";
32373 	      priority = P_PROC_AVX512F;
32374 	      break;
32375 	    case PROCESSOR_SILVERMONT:
32376 	      arg_str = "silvermont";
32377 	      priority = P_PROC_SSE4_2;
32378 	      break;
32379 	    case PROCESSOR_AMDFAM10:
32380 	      arg_str = "amdfam10h";
32381 	      priority = P_PROC_SSE4_A;
32382 	      break;
32383 	    case PROCESSOR_BTVER1:
32384 	      arg_str = "btver1";
32385 	      priority = P_PROC_SSE4_A;
32386 	      break;
32387 	    case PROCESSOR_BTVER2:
32388 	      arg_str = "btver2";
32389 	      priority = P_PROC_BMI;
32390 	      break;
32391 	    case PROCESSOR_BDVER1:
32392 	      arg_str = "bdver1";
32393 	      priority = P_PROC_XOP;
32394 	      break;
32395 	    case PROCESSOR_BDVER2:
32396 	      arg_str = "bdver2";
32397 	      priority = P_PROC_FMA;
32398 	      break;
32399 	    case PROCESSOR_BDVER3:
32400 	      arg_str = "bdver3";
32401 	      priority = P_PROC_FMA;
32402 	      break;
32403 	    case PROCESSOR_BDVER4:
32404 	      arg_str = "bdver4";
32405 	      priority = P_PROC_AVX2;
32406 	      break;
32407 	    case PROCESSOR_ZNVER1:
32408 	      arg_str = "znver1";
32409 	      priority = P_PROC_AVX2;
32410 	      break;
32411 	    }
32412 	}
32413 
32414       cl_target_option_restore (&global_options, &cur_target);
32415 
32416       if (predicate_list && arg_str == NULL)
32417 	{
32418 	  error_at (DECL_SOURCE_LOCATION (decl),
32419 	    	"No dispatcher found for the versioning attributes");
32420 	  return 0;
32421 	}
32422 
32423       if (predicate_list)
32424 	{
32425           predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32426           /* For a C string literal the length includes the trailing NULL.  */
32427           predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32428           predicate_chain = tree_cons (predicate_decl, predicate_arg,
32429 				       predicate_chain);
32430 	}
32431     }
32432 
32433   /* Process feature name.  */
32434   tok_str =  (char *) xmalloc (strlen (attrs_str) + 1);
32435   strcpy (tok_str, attrs_str);
32436   token = strtok (tok_str, ",");
32437   predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32438 
32439   while (token != NULL)
32440     {
32441       /* Do not process "arch=".  */
32442       if (strncmp (token, "arch=", 5) == 0)
32443 	{
32444 	  token = strtok (NULL, ",");
32445 	  continue;
32446 	}
32447       for (i = 0; i < NUM_FEATURES; ++i)
32448 	{
32449 	  if (strcmp (token, feature_list[i].name) == 0)
32450 	    {
32451 	      if (predicate_list)
32452 		{
32453 		  predicate_arg = build_string_literal (
32454 				  strlen (feature_list[i].name) + 1,
32455 				  feature_list[i].name);
32456 		  predicate_chain = tree_cons (predicate_decl, predicate_arg,
32457 					       predicate_chain);
32458 		}
32459 	      /* Find the maximum priority feature.  */
32460 	      if (feature_list[i].priority > priority)
32461 		priority = feature_list[i].priority;
32462 
32463 	      break;
32464 	    }
32465 	}
32466       if (predicate_list && i == NUM_FEATURES)
32467 	{
32468 	  error_at (DECL_SOURCE_LOCATION (decl),
32469 		    "No dispatcher found for %s", token);
32470 	  return 0;
32471 	}
32472       token = strtok (NULL, ",");
32473     }
32474   free (tok_str);
32475 
32476   if (predicate_list && predicate_chain == NULL_TREE)
32477     {
32478       error_at (DECL_SOURCE_LOCATION (decl),
32479 	        "No dispatcher found for the versioning attributes : %s",
32480 	        attrs_str);
32481       return 0;
32482     }
32483   else if (predicate_list)
32484     {
32485       predicate_chain = nreverse (predicate_chain);
32486       *predicate_list = predicate_chain;
32487     }
32488 
32489   return priority;
32490 }
32491 
32492 /* This compares the priority of target features in function DECL1
32493    and DECL2.  It returns a positive value if DECL1 is higher priority,
32494    a negative value if DECL2 is higher priority, and 0 if they are the
32495    same.  */
32496 
32497 static int
32498 ix86_compare_version_priority (tree decl1, tree decl2)
32499 {
32500   unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32501   unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32502 
32503   return (int)priority1 - (int)priority2;
32504 }
32505 
32506 /* V1 and V2 point to function versions with different priorities
32507    based on the target ISA.  This function compares their priorities.  */
32508 
32509 static int
32510 feature_compare (const void *v1, const void *v2)
32511 {
32512   typedef struct _function_version_info
32513     {
32514       tree version_decl;
32515       tree predicate_chain;
32516       unsigned int dispatch_priority;
32517     } function_version_info;
32518 
32519   const function_version_info c1 = *(const function_version_info *)v1;
32520   const function_version_info c2 = *(const function_version_info *)v2;
32521   return (c2.dispatch_priority - c1.dispatch_priority);
32522 }
32523 
32524 /* This function generates the dispatch function for
32525    multi-versioned functions.  DISPATCH_DECL is the function which will
32526    contain the dispatch logic.  FNDECLS is a vector of the candidate
32527    function decls for dispatch.  EMPTY_BB is the basic block pointer
32528    in DISPATCH_DECL in which the dispatch code is generated.  */
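/* Roughly, the resolver body that gets built here has the following
   shape (an illustrative sketch only; the names are hypothetical):

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (<predicates of the highest-priority version hold>)
	 return foo_highest_priority_version;
       ...
       return foo_default_version;
     }

   Each predicate is a __builtin_cpu_is / __builtin_cpu_supports test
   derived from that version's "target" attribute.  */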
32529 
32530 static int
32531 dispatch_function_versions (tree dispatch_decl,
32532 			    void *fndecls_p,
32533 			    basic_block *empty_bb)
32534 {
32535   tree default_decl;
32536   gimple *ifunc_cpu_init_stmt;
32537   gimple_seq gseq;
32538   int ix;
32539   tree ele;
32540   vec<tree> *fndecls;
32541   unsigned int num_versions = 0;
32542   unsigned int actual_versions = 0;
32543   unsigned int i;
32544 
32545   struct _function_version_info
32546     {
32547       tree version_decl;
32548       tree predicate_chain;
32549       unsigned int dispatch_priority;
32550     }*function_version_info;
32551 
32552   gcc_assert (dispatch_decl != NULL
32553 	      && fndecls_p != NULL
32554 	      && empty_bb != NULL);
32555 
32556 	  /* fndecls_p is actually a vector.  */
32557   fndecls = static_cast<vec<tree> *> (fndecls_p);
32558 
32559   /* At least one more version other than the default.  */
32560   num_versions = fndecls->length ();
32561   gcc_assert (num_versions >= 2);
32562 
32563   function_version_info = (struct _function_version_info *)
32564     XNEWVEC (struct _function_version_info, (num_versions - 1));
32565 
32566   /* The first version in the vector is the default decl.  */
32567   default_decl = (*fndecls)[0];
32568 
32569   push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32570 
32571   gseq = bb_seq (*empty_bb);
32572   /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
32573 	     constructors, so explicitly call __builtin_cpu_init here.  */
32574   ifunc_cpu_init_stmt = gimple_build_call_vec (
32575                      ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32576   gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32577   gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32578   set_bb_seq (*empty_bb, gseq);
32579 
32580   pop_cfun ();
32581 
32582 
32583   for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32584     {
32585       tree version_decl = ele;
32586       tree predicate_chain = NULL_TREE;
32587       unsigned int priority;
32588       /* Get attribute string, parse it and find the right predicate decl.
32589          The predicate function could be a lengthy combination of many
32590 	 features, like arch-type and various isa-variants.  */
32591       priority = get_builtin_code_for_version (version_decl,
32592 	 			               &predicate_chain);
32593 
32594       if (predicate_chain == NULL_TREE)
32595 	continue;
32596 
32597       function_version_info [actual_versions].version_decl = version_decl;
32598       function_version_info [actual_versions].predicate_chain
32599 	 = predicate_chain;
32600       function_version_info [actual_versions].dispatch_priority = priority;
32601       actual_versions++;
32602     }
32603 
32604   /* Sort the versions according to descending order of dispatch priority.  The
32605      priority is based on the ISA.  This is not a perfect solution.  There
32606      could still be ambiguity.  If more than one function version is suitable
32607      to execute, which one should be dispatched?  In the future, allow the user
32608      to specify a dispatch priority next to the version.  */
32609   qsort (function_version_info, actual_versions,
32610          sizeof (struct _function_version_info), feature_compare);
32611 
32612   for  (i = 0; i < actual_versions; ++i)
32613     *empty_bb = add_condition_to_bb (dispatch_decl,
32614 				     function_version_info[i].version_decl,
32615 				     function_version_info[i].predicate_chain,
32616 				     *empty_bb);
32617 
32618   /* Dispatch the default version at the end.  */
32619   *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32620 				   NULL, *empty_bb);
32621 
32622   free (function_version_info);
32623   return 0;
32624 }
32625 
32626 /* This function changes the assembler name for functions that are
32627    versions.  If DECL is a function version and has a "target"
32628    attribute, it appends the attribute string to its assembler name.  */
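/* For example (illustrative only), a version of foo declared with
   __attribute__ ((target ("avx2"))) ends up with an assembler name of
   the form "foo.avx2", applied on top of whatever mangling the front
   end already did, while the "default" version keeps its original
   assembler name.  */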
32629 
32630 static tree
32631 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32632 {
32633   tree version_attr;
32634   const char *orig_name, *version_string;
32635   char *attr_str, *assembler_name;
32636 
32637   if (DECL_DECLARED_INLINE_P (decl)
32638       && lookup_attribute ("gnu_inline",
32639 			   DECL_ATTRIBUTES (decl)))
32640     error_at (DECL_SOURCE_LOCATION (decl),
32641 	      "Function versions cannot be marked as gnu_inline,"
32642 	      " bodies have to be generated");
32643 
32644   if (DECL_VIRTUAL_P (decl)
32645       || DECL_VINDEX (decl))
32646     sorry ("Virtual function multiversioning not supported");
32647 
32648   version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32649 
32650   /* target attribute string cannot be NULL.  */
32651   gcc_assert (version_attr != NULL_TREE);
32652 
32653   orig_name = IDENTIFIER_POINTER (id);
32654   version_string
32655     = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32656 
32657   if (strcmp (version_string, "default") == 0)
32658     return id;
32659 
32660   attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32661   assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32662 
32663   sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32664 
32665   /* Allow assembler name to be modified if already set.  */
32666   if (DECL_ASSEMBLER_NAME_SET_P (decl))
32667     SET_DECL_RTL (decl, NULL);
32668 
32669   tree ret = get_identifier (assembler_name);
32670   XDELETEVEC (attr_str);
32671   XDELETEVEC (assembler_name);
32672   return ret;
32673 }
32674 
32675 
32676 static tree
32677 ix86_mangle_decl_assembler_name (tree decl, tree id)
32678 {
32679   /* For function version, add the target suffix to the assembler name.  */
32680   if (TREE_CODE (decl) == FUNCTION_DECL
32681       && DECL_FUNCTION_VERSIONED (decl))
32682     id = ix86_mangle_function_version_assembler_name (decl, id);
32683 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32684   id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32685 #endif
32686 
32687   return id;
32688 }
32689 
32690 /* Make a dispatcher declaration for the multi-versioned function DECL.
32691    Calls to DECL function will be replaced with calls to the dispatcher
32692    by the front-end.  Returns the decl of the dispatcher function.  */
32693 
32694 static tree
32695 ix86_get_function_versions_dispatcher (void *decl)
32696 {
32697   tree fn = (tree) decl;
32698   struct cgraph_node *node = NULL;
32699   struct cgraph_node *default_node = NULL;
32700   struct cgraph_function_version_info *node_v = NULL;
32701   struct cgraph_function_version_info *first_v = NULL;
32702 
32703   tree dispatch_decl = NULL;
32704 
32705   struct cgraph_function_version_info *default_version_info = NULL;
32706 
32707   gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32708 
32709   node = cgraph_node::get (fn);
32710   gcc_assert (node != NULL);
32711 
32712   node_v = node->function_version ();
32713   gcc_assert (node_v != NULL);
32714 
32715   if (node_v->dispatcher_resolver != NULL)
32716     return node_v->dispatcher_resolver;
32717 
32718   /* Find the default version and make it the first node.  */
32719   first_v = node_v;
32720   /* Go to the beginning of the chain.  */
32721   while (first_v->prev != NULL)
32722     first_v = first_v->prev;
32723   default_version_info = first_v;
32724   while (default_version_info != NULL)
32725     {
32726       if (is_function_default_version
32727 	    (default_version_info->this_node->decl))
32728         break;
32729       default_version_info = default_version_info->next;
32730     }
32731 
32732   /* If there is no default node, just return NULL.  */
32733   if (default_version_info == NULL)
32734     return NULL;
32735 
32736   /* Make default info the first node.  */
32737   if (first_v != default_version_info)
32738     {
32739       default_version_info->prev->next = default_version_info->next;
32740       if (default_version_info->next)
32741         default_version_info->next->prev = default_version_info->prev;
32742       first_v->prev = default_version_info;
32743       default_version_info->next = first_v;
32744       default_version_info->prev = NULL;
32745     }
32746 
32747   default_node = default_version_info->this_node;
32748 
32749 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32750   if (targetm.has_ifunc_p ())
32751     {
32752       struct cgraph_function_version_info *it_v = NULL;
32753       struct cgraph_node *dispatcher_node = NULL;
32754       struct cgraph_function_version_info *dispatcher_version_info = NULL;
32755 
32756       /* Right now, the dispatching is done via ifunc.  */
32757       dispatch_decl = make_dispatcher_decl (default_node->decl);
32758 
32759       dispatcher_node = cgraph_node::get_create (dispatch_decl);
32760       gcc_assert (dispatcher_node != NULL);
32761       dispatcher_node->dispatcher_function = 1;
32762       dispatcher_version_info
32763 	= dispatcher_node->insert_new_function_version ();
32764       dispatcher_version_info->next = default_version_info;
32765       dispatcher_node->definition = 1;
32766 
32767       /* Set the dispatcher for all the versions.  */
32768       it_v = default_version_info;
32769       while (it_v != NULL)
32770 	{
32771 	  it_v->dispatcher_resolver = dispatch_decl;
32772 	  it_v = it_v->next;
32773 	}
32774     }
32775   else
32776 #endif
32777     {
32778       error_at (DECL_SOURCE_LOCATION (default_node->decl),
32779 		"multiversioning needs ifunc which is not supported "
32780 		"on this target");
32781     }
32782 
32783   return dispatch_decl;
32784 }
32785 
32786 /* Make the resolver function decl to dispatch the versions of
32787    a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is the
32788    ifunc alias that will point to the created resolver.  Create an
32789    empty basic block in the resolver and store the pointer in
32790    EMPTY_BB.  Return the decl of the resolver function.  */
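/* At the source level this corresponds roughly to

     void *foo_resolver (void) { ... }
     int foo (void) __attribute__ ((ifunc ("foo_resolver")));

   an illustrative sketch with hypothetical names; the real resolver
   name comes from make_unique_name, and the resolver body is filled in
   later by dispatch_function_versions.  */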
32791 
32792 static tree
32793 make_resolver_func (const tree default_decl,
32794 		    const tree ifunc_alias_decl,
32795 		    basic_block *empty_bb)
32796 {
32797   char *resolver_name;
32798   tree decl, type, decl_name, t;
32799 
32800   /* IFUNCs have to be globally visible.  So, if the default_decl is
32801      not, then the name of the IFUNC should be made unique.  */
32802   if (TREE_PUBLIC (default_decl) == 0)
32803     {
32804       char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
32805       symtab->change_decl_assembler_name (ifunc_alias_decl,
32806 					  get_identifier (ifunc_name));
32807       XDELETEVEC (ifunc_name);
32808     }
32809 
32810   resolver_name = make_unique_name (default_decl, "resolver", false);
32811 
32812   /* The resolver function should return a (void *). */
32813   type = build_function_type_list (ptr_type_node, NULL_TREE);
32814 
32815   decl = build_fn_decl (resolver_name, type);
32816   decl_name = get_identifier (resolver_name);
32817   SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32818 
32819   DECL_NAME (decl) = decl_name;
32820   TREE_USED (decl) = 1;
32821   DECL_ARTIFICIAL (decl) = 1;
32822   DECL_IGNORED_P (decl) = 1;
32823   TREE_PUBLIC (decl) = 0;
32824   DECL_UNINLINABLE (decl) = 1;
32825 
32826   /* Resolver is not external, body is generated.  */
32827   DECL_EXTERNAL (decl) = 0;
32828   DECL_EXTERNAL (ifunc_alias_decl) = 0;
32829 
32830   DECL_CONTEXT (decl) = NULL_TREE;
32831   DECL_INITIAL (decl) = make_node (BLOCK);
32832   DECL_STATIC_CONSTRUCTOR (decl) = 0;
32833 
32834   if (DECL_COMDAT_GROUP (default_decl)
32835       || TREE_PUBLIC (default_decl))
32836     {
32837       /* In this case, each translation unit with a call to this
32838 	 versioned function will put out a resolver.  Ensure it
32839 	 is comdat to keep just one copy.  */
32840       DECL_COMDAT (decl) = 1;
32841       make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32842     }
32843   /* Build result decl and add to function_decl. */
32844   t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32845   DECL_ARTIFICIAL (t) = 1;
32846   DECL_IGNORED_P (t) = 1;
32847   DECL_RESULT (decl) = t;
32848 
32849   gimplify_function_tree (decl);
32850   push_cfun (DECL_STRUCT_FUNCTION (decl));
32851   *empty_bb = init_lowered_empty_function (decl, false,
32852 					   profile_count::uninitialized ());
32853 
32854   cgraph_node::add_new_function (decl, true);
32855   symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32856 
32857   pop_cfun ();
32858 
32859   gcc_assert (ifunc_alias_decl != NULL);
32860   /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
32861   DECL_ATTRIBUTES (ifunc_alias_decl)
32862     = make_attribute ("ifunc", resolver_name,
32863 		      DECL_ATTRIBUTES (ifunc_alias_decl));
32864 
32865   /* Create the alias for dispatch to resolver here.  */
32866   cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
32867   XDELETEVEC (resolver_name);
32868   return decl;
32869 }
32870 
32871 /* Generate the dispatching code body to dispatch multi-versioned function
32872    DECL.  The target hook is called to process the "target" attributes and
32873    provide the code to dispatch the right function at run-time.  NODE points
32874    to the dispatcher decl whose body will be created.  */
32875 
32876 static tree
32877 ix86_generate_version_dispatcher_body (void *node_p)
32878 {
32879   tree resolver_decl;
32880   basic_block empty_bb;
32881   tree default_ver_decl;
32882   struct cgraph_node *versn;
32883   struct cgraph_node *node;
32884 
32885   struct cgraph_function_version_info *node_version_info = NULL;
32886   struct cgraph_function_version_info *versn_info = NULL;
32887 
32888   node = (cgraph_node *)node_p;
32889 
32890   node_version_info = node->function_version ();
32891   gcc_assert (node->dispatcher_function
32892 	      && node_version_info != NULL);
32893 
32894   if (node_version_info->dispatcher_resolver)
32895     return node_version_info->dispatcher_resolver;
32896 
32897   /* The first version in the chain corresponds to the default version.  */
32898   default_ver_decl = node_version_info->next->this_node->decl;
32899 
32900   /* node is going to be an alias, so remove the finalized bit.  */
32901   node->definition = false;
32902 
32903   resolver_decl = make_resolver_func (default_ver_decl,
32904 				      node->decl, &empty_bb);
32905 
32906   node_version_info->dispatcher_resolver = resolver_decl;
32907 
32908   push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32909 
32910   auto_vec<tree, 2> fn_ver_vec;
32911 
32912   for (versn_info = node_version_info->next; versn_info;
32913        versn_info = versn_info->next)
32914     {
32915       versn = versn_info->this_node;
32916       /* Check for virtual functions here again, as by this time it should
32917 	 have been determined if this function needs a vtable index or
32918 	 not.  This happens for methods in derived classes that override
32919 	 virtual methods in base classes but are not explicitly marked as
32920 	 virtual.  */
32921       if (DECL_VINDEX (versn->decl))
32922 	sorry ("Virtual function multiversioning not supported");
32923 
32924       fn_ver_vec.safe_push (versn->decl);
32925     }
32926 
32927   dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32928   cgraph_edge::rebuild_edges ();
32929   pop_cfun ();
32930   return resolver_decl;
32931 }
32932 /* This builds the processor_model struct type defined in
32933    libgcc/config/i386/cpuinfo.c  */
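/* For reference, the libgcc side is declared roughly as

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   and the record built below mirrors that layout.  */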
32934 
32935 static tree
32936 build_processor_model_struct (void)
32937 {
32938   const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32939 			      "__cpu_features"};
32940   tree field = NULL_TREE, field_chain = NULL_TREE;
32941   int i;
32942   tree type = make_node (RECORD_TYPE);
32943 
32944   /* The first 3 fields are unsigned int.  */
32945   for (i = 0; i < 3; ++i)
32946     {
32947       field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32948 			  get_identifier (field_name[i]), unsigned_type_node);
32949       if (field_chain != NULL_TREE)
32950 	DECL_CHAIN (field) = field_chain;
32951       field_chain = field;
32952     }
32953 
32954   /* The last field is an array of unsigned integers of size one.  */
32955   field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32956 		      get_identifier (field_name[3]),
32957 		      build_array_type (unsigned_type_node,
32958 					build_index_type (size_one_node)));
32959   if (field_chain != NULL_TREE)
32960     DECL_CHAIN (field) = field_chain;
32961   field_chain = field;
32962 
32963   finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
32964   return type;
32965 }
32966 
32967 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
32968 
32969 static tree
32970 make_var_decl (tree type, const char *name)
32971 {
32972   tree new_decl;
32973 
32974   new_decl = build_decl (UNKNOWN_LOCATION,
32975 	                 VAR_DECL,
32976 	  	         get_identifier(name),
32977 		         type);
32978 
32979   DECL_EXTERNAL (new_decl) = 1;
32980   TREE_STATIC (new_decl) = 1;
32981   TREE_PUBLIC (new_decl) = 1;
32982   DECL_INITIAL (new_decl) = 0;
32983   DECL_ARTIFICIAL (new_decl) = 0;
32984   DECL_PRESERVE_P (new_decl) = 1;
32985 
32986   make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
32987   assemble_variable (new_decl, 0, 0, 0);
32988 
32989   return new_decl;
32990 }
32991 
32992 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
32993    into an integer defined in libgcc/config/i386/cpuinfo.c */
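/* Roughly, __builtin_cpu_is ("amd") folds to a tree of the form
   (int) (__cpu_model.__cpu_vendor == M_AMD), and
   __builtin_cpu_supports ("avx2") folds to
   (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2)).
   This is an illustrative sketch of the trees built below.  */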
32994 
32995 static tree
32996 fold_builtin_cpu (tree fndecl, tree *args)
32997 {
32998   unsigned int i;
32999   enum ix86_builtins fn_code = (enum ix86_builtins)
33000 				DECL_FUNCTION_CODE (fndecl);
33001   tree param_string_cst = NULL;
33002 
33003   /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33004   enum processor_features
33005   {
33006     F_CMOV = 0,
33007     F_MMX,
33008     F_POPCNT,
33009     F_SSE,
33010     F_SSE2,
33011     F_SSE3,
33012     F_SSSE3,
33013     F_SSE4_1,
33014     F_SSE4_2,
33015     F_AVX,
33016     F_AVX2,
33017     F_SSE4_A,
33018     F_FMA4,
33019     F_XOP,
33020     F_FMA,
33021     F_AVX512F,
33022     F_BMI,
33023     F_BMI2,
33024     F_AES,
33025     F_PCLMUL,
33026     F_AVX512VL,
33027     F_AVX512BW,
33028     F_AVX512DQ,
33029     F_AVX512CD,
33030     F_AVX512ER,
33031     F_AVX512PF,
33032     F_AVX512VBMI,
33033     F_AVX512IFMA,
33034     F_AVX5124VNNIW,
33035     F_AVX5124FMAPS,
33036     F_AVX512VPOPCNTDQ,
33037     F_AVX512VBMI2,
33038     F_GFNI,
33039     F_VPCLMULQDQ,
33040     F_AVX512VNNI,
33041     F_AVX512BITALG,
33042     F_MAX
33043   };
33044 
33045   /* These are the values for vendor types and cpu types and subtypes
33046      in cpuinfo.c.  Cpu types and subtypes should have the corresponding
33047      start value subtracted from them.  */
33048   enum processor_model
33049   {
33050     M_INTEL = 1,
33051     M_AMD,
33052     M_CPU_TYPE_START,
33053     M_INTEL_BONNELL,
33054     M_INTEL_CORE2,
33055     M_INTEL_COREI7,
33056     M_AMDFAM10H,
33057     M_AMDFAM15H,
33058     M_INTEL_SILVERMONT,
33059     M_INTEL_KNL,
33060     M_AMD_BTVER1,
33061     M_AMD_BTVER2,
33062     M_AMDFAM17H,
33063     M_INTEL_KNM,
33064     M_CPU_SUBTYPE_START,
33065     M_INTEL_COREI7_NEHALEM,
33066     M_INTEL_COREI7_WESTMERE,
33067     M_INTEL_COREI7_SANDYBRIDGE,
33068     M_AMDFAM10H_BARCELONA,
33069     M_AMDFAM10H_SHANGHAI,
33070     M_AMDFAM10H_ISTANBUL,
33071     M_AMDFAM15H_BDVER1,
33072     M_AMDFAM15H_BDVER2,
33073     M_AMDFAM15H_BDVER3,
33074     M_AMDFAM15H_BDVER4,
33075     M_AMDFAM17H_ZNVER1,
33076     M_INTEL_COREI7_IVYBRIDGE,
33077     M_INTEL_COREI7_HASWELL,
33078     M_INTEL_COREI7_BROADWELL,
33079     M_INTEL_COREI7_SKYLAKE,
33080     M_INTEL_COREI7_SKYLAKE_AVX512,
33081     M_INTEL_COREI7_CANNONLAKE,
33082     M_INTEL_COREI7_ICELAKE_CLIENT,
33083     M_INTEL_COREI7_ICELAKE_SERVER
33084   };
33085 
33086   static struct _arch_names_table
33087     {
33088       const char *const name;
33089       const enum processor_model model;
33090     }
33091   const arch_names_table[] =
33092     {
33093       {"amd", M_AMD},
33094       {"intel", M_INTEL},
33095       {"atom", M_INTEL_BONNELL},
33096       {"slm", M_INTEL_SILVERMONT},
33097       {"core2", M_INTEL_CORE2},
33098       {"corei7", M_INTEL_COREI7},
33099       {"nehalem", M_INTEL_COREI7_NEHALEM},
33100       {"westmere", M_INTEL_COREI7_WESTMERE},
33101       {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33102       {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33103       {"haswell", M_INTEL_COREI7_HASWELL},
33104       {"broadwell", M_INTEL_COREI7_BROADWELL},
33105       {"skylake", M_INTEL_COREI7_SKYLAKE},
33106       {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33107       {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
33108       {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT},
33109       {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER},
33110       {"bonnell", M_INTEL_BONNELL},
33111       {"silvermont", M_INTEL_SILVERMONT},
33112       {"knl", M_INTEL_KNL},
33113       {"knm", M_INTEL_KNM},
33114       {"amdfam10h", M_AMDFAM10H},
33115       {"barcelona", M_AMDFAM10H_BARCELONA},
33116       {"shanghai", M_AMDFAM10H_SHANGHAI},
33117       {"istanbul", M_AMDFAM10H_ISTANBUL},
33118       {"btver1", M_AMD_BTVER1},
33119       {"amdfam15h", M_AMDFAM15H},
33120       {"bdver1", M_AMDFAM15H_BDVER1},
33121       {"bdver2", M_AMDFAM15H_BDVER2},
33122       {"bdver3", M_AMDFAM15H_BDVER3},
33123       {"bdver4", M_AMDFAM15H_BDVER4},
33124       {"btver2", M_AMD_BTVER2},
33125       {"amdfam17h", M_AMDFAM17H},
33126       {"znver1", M_AMDFAM17H_ZNVER1},
33127     };
33128 
33129   static struct _isa_names_table
33130     {
33131       const char *const name;
33132       const enum processor_features feature;
33133     }
33134   const isa_names_table[] =
33135     {
33136       {"cmov",    F_CMOV},
33137       {"mmx",     F_MMX},
33138       {"popcnt",  F_POPCNT},
33139       {"sse",     F_SSE},
33140       {"sse2",    F_SSE2},
33141       {"sse3",    F_SSE3},
33142       {"ssse3",   F_SSSE3},
33143       {"sse4a",   F_SSE4_A},
33144       {"sse4.1",  F_SSE4_1},
33145       {"sse4.2",  F_SSE4_2},
33146       {"avx",     F_AVX},
33147       {"fma4",    F_FMA4},
33148       {"xop",     F_XOP},
33149       {"fma",     F_FMA},
33150       {"avx2",    F_AVX2},
33151       {"avx512f", F_AVX512F},
33152       {"bmi",     F_BMI},
33153       {"bmi2",    F_BMI2},
33154       {"aes",     F_AES},
33155       {"pclmul",  F_PCLMUL},
33156       {"avx512vl",F_AVX512VL},
33157       {"avx512bw",F_AVX512BW},
33158       {"avx512dq",F_AVX512DQ},
33159       {"avx512cd",F_AVX512CD},
33160       {"avx512er",F_AVX512ER},
33161       {"avx512pf",F_AVX512PF},
33162       {"avx512vbmi",F_AVX512VBMI},
33163       {"avx512ifma",F_AVX512IFMA},
33164       {"avx5124vnniw",F_AVX5124VNNIW},
33165       {"avx5124fmaps",F_AVX5124FMAPS},
33166       {"avx512vpopcntdq",F_AVX512VPOPCNTDQ},
33167       {"avx512vbmi2", F_AVX512VBMI2},
33168       {"gfni", F_GFNI},
33169       {"vpclmulqdq", F_VPCLMULQDQ},
33170       {"avx512vnni", F_AVX512VNNI},
33171       {"avx512bitalg", F_AVX512BITALG}
33172     };
33173 
33174   tree __processor_model_type = build_processor_model_struct ();
33175   tree __cpu_model_var = make_var_decl (__processor_model_type,
33176 					"__cpu_model");
33177 
33178 
33179   varpool_node::add (__cpu_model_var);
33180 
33181   gcc_assert ((args != NULL) && (*args != NULL));
33182 
33183   param_string_cst = *args;
33184   while (param_string_cst
33185 	 && TREE_CODE (param_string_cst) !=  STRING_CST)
33186     {
33187       /* *args must be an expr that can contain other EXPRs leading to a
33188 	 STRING_CST.  */
33189       if (!EXPR_P (param_string_cst))
33190  	{
33191 	  error ("Parameter to builtin must be a string constant or literal");
33192 	  return integer_zero_node;
33193 	}
33194       param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33195     }
33196 
33197   gcc_assert (param_string_cst);
33198 
33199   if (fn_code == IX86_BUILTIN_CPU_IS)
33200     {
33201       tree ref;
33202       tree field;
33203       tree final;
33204 
33205       unsigned int field_val = 0;
33206       unsigned int NUM_ARCH_NAMES
33207 	= sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33208 
33209       for (i = 0; i < NUM_ARCH_NAMES; i++)
33210 	if (strcmp (arch_names_table[i].name,
33211 	    TREE_STRING_POINTER (param_string_cst)) == 0)
33212 	  break;
33213 
33214       if (i == NUM_ARCH_NAMES)
33215 	{
33216 	  error ("Parameter to builtin not valid: %s",
33217 	         TREE_STRING_POINTER (param_string_cst));
33218 	  return integer_zero_node;
33219 	}
33220 
33221       field = TYPE_FIELDS (__processor_model_type);
33222       field_val = arch_names_table[i].model;
33223 
33224       /* CPU types are stored in the next field.  */
33225       if (field_val > M_CPU_TYPE_START
33226 	  && field_val < M_CPU_SUBTYPE_START)
33227 	{
33228 	  field = DECL_CHAIN (field);
33229 	  field_val -= M_CPU_TYPE_START;
33230 	}
33231 
33232       /* CPU subtypes are stored in the next field.  */
33233       if (field_val > M_CPU_SUBTYPE_START)
33234 	{
33235 	  field = DECL_CHAIN ( DECL_CHAIN (field));
33236 	  field_val -= M_CPU_SUBTYPE_START;
33237 	}
33238 
33239       /* Get the appropriate field in __cpu_model.  */
33240       ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33241 		    field, NULL_TREE);
33242 
33243       /* Check the value.  */
33244       final = build2 (EQ_EXPR, unsigned_type_node, ref,
33245 		      build_int_cstu (unsigned_type_node, field_val));
33246       return build1 (CONVERT_EXPR, integer_type_node, final);
33247     }
33248   else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33249     {
33250       tree ref;
33251       tree array_elt;
33252       tree field;
33253       tree final;
33254 
33255       unsigned int field_val = 0;
33256       unsigned int NUM_ISA_NAMES
33257 	= sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33258 
33259       for (i = 0; i < NUM_ISA_NAMES; i++)
33260 	if (strcmp (isa_names_table[i].name,
33261 	    TREE_STRING_POINTER (param_string_cst)) == 0)
33262 	  break;
33263 
33264       if (i == NUM_ISA_NAMES)
33265 	{
33266 	  error ("Parameter to builtin not valid: %s",
33267 	       	 TREE_STRING_POINTER (param_string_cst));
33268 	  return integer_zero_node;
33269 	}
33270 
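      /* Features numbered 32 and above do not fit in the first
	 __cpu_features word; they are tested against the separate
	 __cpu_features2 variable exported by libgcc.  */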
33271       if (isa_names_table[i].feature >= 32)
33272 	{
33273 	  tree __cpu_features2_var = make_var_decl (unsigned_type_node,
33274 						    "__cpu_features2");
33275 
33276 	  varpool_node::add (__cpu_features2_var);
33277 	  field_val = (1U << (isa_names_table[i].feature - 32));
33278 	  /* Return __cpu_features2 & field_val  */
33279 	  final = build2 (BIT_AND_EXPR, unsigned_type_node,
33280 			  __cpu_features2_var,
33281 			  build_int_cstu (unsigned_type_node, field_val));
33282 	  return build1 (CONVERT_EXPR, integer_type_node, final);
33283 	}
33284 
33285       field = TYPE_FIELDS (__processor_model_type);
33286       /* Get the last field, which is __cpu_features.  */
33287       while (DECL_CHAIN (field))
33288         field = DECL_CHAIN (field);
33289 
33290       /* Get the appropriate field: __cpu_model.__cpu_features  */
33291       ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33292 		    field, NULL_TREE);
33293 
33294       /* Access the 0th element of __cpu_features array.  */
33295       array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33296 			  integer_zero_node, NULL_TREE, NULL_TREE);
33297 
33298       field_val = (1U << isa_names_table[i].feature);
33299       /* Return __cpu_model.__cpu_features[0] & field_val  */
33300       final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33301 		      build_int_cstu (unsigned_type_node, field_val));
33302       return build1 (CONVERT_EXPR, integer_type_node, final);
33303     }
33304   gcc_unreachable ();
33305 }
33306 
33307 static tree
33308 ix86_fold_builtin (tree fndecl, int n_args,
33309 		   tree *args, bool ignore ATTRIBUTE_UNUSED)
33310 {
33311   if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33312     {
33313       enum ix86_builtins fn_code = (enum ix86_builtins)
33314 				   DECL_FUNCTION_CODE (fndecl);
33315       switch (fn_code)
33316 	{
33317 	case IX86_BUILTIN_CPU_IS:
33318 	case IX86_BUILTIN_CPU_SUPPORTS:
33319 	  gcc_assert (n_args == 1);
33320 	  return fold_builtin_cpu (fndecl, args);
33321 
33322 	case IX86_BUILTIN_NANQ:
33323 	case IX86_BUILTIN_NANSQ:
33324 	  {
33325 	    tree type = TREE_TYPE (TREE_TYPE (fndecl));
33326 	    const char *str = c_getstr (*args);
33327 	    int quiet = fn_code == IX86_BUILTIN_NANQ;
33328 	    REAL_VALUE_TYPE real;
33329 
33330 	    if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33331 	      return build_real (type, real);
33332 	    return NULL_TREE;
33333 	  }
33334 
33335 	case IX86_BUILTIN_INFQ:
33336 	case IX86_BUILTIN_HUGE_VALQ:
33337 	  {
33338 	    tree type = TREE_TYPE (TREE_TYPE (fndecl));
33339 	    REAL_VALUE_TYPE inf;
33340 	    real_inf (&inf);
33341 	    return build_real (type, inf);
33342 	  }
33343 
33344 	case IX86_BUILTIN_TZCNT16:
33345 	case IX86_BUILTIN_CTZS:
33346 	case IX86_BUILTIN_TZCNT32:
33347 	case IX86_BUILTIN_TZCNT64:
33348 	  gcc_assert (n_args == 1);
33349 	  if (TREE_CODE (args[0]) == INTEGER_CST)
33350 	    {
33351 	      tree type = TREE_TYPE (TREE_TYPE (fndecl));
33352 	      tree arg = args[0];
33353 	      if (fn_code == IX86_BUILTIN_TZCNT16
33354 		  || fn_code == IX86_BUILTIN_CTZS)
33355 		arg = fold_convert (short_unsigned_type_node, arg);
33356 	      if (integer_zerop (arg))
33357 		return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33358 	      else
33359 		return fold_const_call (CFN_CTZ, type, arg);
33360 	    }
33361 	  break;
33362 
33363 	case IX86_BUILTIN_LZCNT16:
33364 	case IX86_BUILTIN_CLZS:
33365 	case IX86_BUILTIN_LZCNT32:
33366 	case IX86_BUILTIN_LZCNT64:
33367 	  gcc_assert (n_args == 1);
33368 	  if (TREE_CODE (args[0]) == INTEGER_CST)
33369 	    {
33370 	      tree type = TREE_TYPE (TREE_TYPE (fndecl));
33371 	      tree arg = args[0];
33372 	      if (fn_code == IX86_BUILTIN_LZCNT16
33373 		  || fn_code == IX86_BUILTIN_CLZS)
33374 		arg = fold_convert (short_unsigned_type_node, arg);
33375 	      if (integer_zerop (arg))
33376 		return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33377 	      else
33378 		return fold_const_call (CFN_CLZ, type, arg);
33379 	    }
33380 	  break;
33381 
33382 	case IX86_BUILTIN_BEXTR32:
33383 	case IX86_BUILTIN_BEXTR64:
33384 	case IX86_BUILTIN_BEXTRI32:
33385 	case IX86_BUILTIN_BEXTRI64:
33386 	  gcc_assert (n_args == 2);
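	  /* The second argument is the BEXTR control word: bits [7:0]
	     give the start bit and bits [15:8] give the field length,
	     which is what the constant folding below decodes.  */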
33387 	  if (tree_fits_uhwi_p (args[1]))
33388 	    {
33389 	      unsigned HOST_WIDE_INT res = 0;
33390 	      unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33391 	      unsigned int start = tree_to_uhwi (args[1]);
33392 	      unsigned int len = (start & 0xff00) >> 8;
33393 	      start &= 0xff;
33394 	      if (start >= prec || len == 0)
33395 		res = 0;
33396 	      else if (!tree_fits_uhwi_p (args[0]))
33397 		break;
33398 	      else
33399 		res = tree_to_uhwi (args[0]) >> start;
33400 	      if (len > prec)
33401 		len = prec;
33402 	      if (len < HOST_BITS_PER_WIDE_INT)
33403 		res &= (HOST_WIDE_INT_1U << len) - 1;
33404 	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33405 	    }
33406 	  break;
33407 
33408 	case IX86_BUILTIN_BZHI32:
33409 	case IX86_BUILTIN_BZHI64:
33410 	  gcc_assert (n_args == 2);
33411 	  if (tree_fits_uhwi_p (args[1]))
33412 	    {
33413 	      unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33414 	      if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33415 		return args[0];
33416 	      if (!tree_fits_uhwi_p (args[0]))
33417 		break;
33418 	      unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33419 	      res &= ~(HOST_WIDE_INT_M1U << idx);
33420 	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33421 	    }
33422 	  break;
33423 
33424 	case IX86_BUILTIN_PDEP32:
33425 	case IX86_BUILTIN_PDEP64:
33426 	  gcc_assert (n_args == 2);
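	  /* PDEP deposits the low bits of the first argument into the
	     bit positions selected by the mask argument; the loop below
	     just evaluates that at compile time.  */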
33427 	  if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33428 	    {
33429 	      unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33430 	      unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33431 	      unsigned HOST_WIDE_INT res = 0;
33432 	      unsigned HOST_WIDE_INT m, k = 1;
33433 	      for (m = 1; m; m <<= 1)
33434 		if ((mask & m) != 0)
33435 		  {
33436 		    if ((src & k) != 0)
33437 		      res |= m;
33438 		    k <<= 1;
33439 		  }
33440 	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33441 	    }
33442 	  break;
33443 
33444 	case IX86_BUILTIN_PEXT32:
33445 	case IX86_BUILTIN_PEXT64:
33446 	  gcc_assert (n_args == 2);
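	  /* PEXT is the inverse operation: it gathers the bits of the
	     first argument selected by the mask into the low bits of
	     the result.  */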
33447 	  if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33448 	    {
33449 	      unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33450 	      unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33451 	      unsigned HOST_WIDE_INT res = 0;
33452 	      unsigned HOST_WIDE_INT m, k = 1;
33453 	      for (m = 1; m; m <<= 1)
33454 		if ((mask & m) != 0)
33455 		  {
33456 		    if ((src & m) != 0)
33457 		      res |= k;
33458 		    k <<= 1;
33459 		  }
33460 	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33461 	    }
33462 	  break;
33463 
33464 	default:
33465 	  break;
33466 	}
33467     }
33468 
33469 #ifdef SUBTARGET_FOLD_BUILTIN
33470   return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33471 #endif
33472 
33473   return NULL_TREE;
33474 }
33475 
33476 /* Fold an MD builtin (use ix86_fold_builtin for folding into
33477    a constant) in GIMPLE.  */
33478 
33479 bool
33480 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33481 {
33482   gimple *stmt = gsi_stmt (*gsi);
33483   tree fndecl = gimple_call_fndecl (stmt);
33484   gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33485   int n_args = gimple_call_num_args (stmt);
33486   enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33487   tree decl = NULL_TREE;
33488   tree arg0, arg1;
33489 
33490   switch (fn_code)
33491     {
33492     case IX86_BUILTIN_TZCNT32:
33493       decl = builtin_decl_implicit (BUILT_IN_CTZ);
33494       goto fold_tzcnt_lzcnt;
33495 
33496     case IX86_BUILTIN_TZCNT64:
33497       decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33498       goto fold_tzcnt_lzcnt;
33499 
33500     case IX86_BUILTIN_LZCNT32:
33501       decl = builtin_decl_implicit (BUILT_IN_CLZ);
33502       goto fold_tzcnt_lzcnt;
33503 
33504     case IX86_BUILTIN_LZCNT64:
33505       decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33506       goto fold_tzcnt_lzcnt;
33507 
33508     fold_tzcnt_lzcnt:
33509       gcc_assert (n_args == 1);
33510       arg0 = gimple_call_arg (stmt, 0);
33511       if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33512 	{
33513 	  int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33514 	  /* If arg0 is provably non-zero, optimize into the generic
33515 	     __builtin_c[tl]z{,ll} functions, which the middle-end handles
33516 	     better.  */
33517 	  if (!expr_not_equal_to (arg0, wi::zero (prec)))
33518 	    return false;
33519 
33520 	  location_t loc = gimple_location (stmt);
33521 	  gimple *g = gimple_build_call (decl, 1, arg0);
33522 	  gimple_set_location (g, loc);
33523 	  tree lhs = make_ssa_name (integer_type_node);
33524 	  gimple_call_set_lhs (g, lhs);
33525 	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
33526 	  g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33527 	  gimple_set_location (g, loc);
33528 	  gsi_replace (gsi, g, false);
33529 	  return true;
33530 	}
33531       break;
33532 
33533     case IX86_BUILTIN_BZHI32:
33534     case IX86_BUILTIN_BZHI64:
33535       gcc_assert (n_args == 2);
33536       arg1 = gimple_call_arg (stmt, 1);
33537       if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33538 	{
33539 	  unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33540 	  arg0 = gimple_call_arg (stmt, 0);
33541 	  if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33542 	    break;
33543 	  location_t loc = gimple_location (stmt);
33544 	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33545 	  gimple_set_location (g, loc);
33546 	  gsi_replace (gsi, g, false);
33547 	  return true;
33548 	}
33549       break;
33550 
33551     case IX86_BUILTIN_PDEP32:
33552     case IX86_BUILTIN_PDEP64:
33553     case IX86_BUILTIN_PEXT32:
33554     case IX86_BUILTIN_PEXT64:
33555       gcc_assert (n_args == 2);
33556       arg1 = gimple_call_arg (stmt, 1);
33557       if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33558 	{
33559 	  location_t loc = gimple_location (stmt);
33560 	  arg0 = gimple_call_arg (stmt, 0);
33561 	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33562 	  gimple_set_location (g, loc);
33563 	  gsi_replace (gsi, g, false);
33564 	  return true;
33565 	}
33566       break;
33567 
33568     default:
33569       break;
33570     }
33571 
33572   return false;
33573 }
33574 
33575 /* Make builtins to detect cpu type and features supported.  NAME is
33576    the builtin name, CODE is the builtin code, FTYPE is the function
33577    type of the builtin, and IS_CONST says whether to mark it TREE_READONLY.  */
33578 
33579 static void
33580 make_cpu_type_builtin (const char* name, int code,
33581 		       enum ix86_builtin_func_type ftype, bool is_const)
33582 {
33583   tree decl;
33584   tree type;
33585 
33586   type = ix86_get_builtin_func_type (ftype);
33587   decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33588 			       NULL, NULL_TREE);
33589   gcc_assert (decl != NULL_TREE);
33590   ix86_builtins[(int) code] = decl;
33591   TREE_READONLY (decl) = is_const;
33592 }
33593 
33594 /* Make builtins to get CPU type and features supported.  The created
33595    builtins are:
33596 
33597    __builtin_cpu_init (), to detect cpu type and features,
33598    __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33599    __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
33600    */
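/* Illustrative use from user code (not part of this file):

     if (__builtin_cpu_is ("haswell") || __builtin_cpu_supports ("avx2"))
       use_avx2_path ();
     else
       use_generic_path ();

   where use_avx2_path and use_generic_path are hypothetical user
   functions.  */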
33601 
33602 static void
33603 ix86_init_platform_type_builtins (void)
33604 {
33605   make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33606 			 INT_FTYPE_VOID, false);
33607   make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33608 			 INT_FTYPE_PCCHAR, true);
33609   make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33610 			 INT_FTYPE_PCCHAR, true);
33611 }
33612 
33613 /* Internal method for ix86_init_builtins.  */
33614 
33615 static void
33616 ix86_init_builtins_va_builtins_abi (void)
33617 {
33618   tree ms_va_ref, sysv_va_ref;
33619   tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33620   tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33621   tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33622   tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33623 
33624   if (!TARGET_64BIT)
33625     return;
33626   fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33627   fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33628   ms_va_ref = build_reference_type (ms_va_list_type_node);
33629   sysv_va_ref =
33630     build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33631 
33632   fnvoid_va_end_ms =
33633     build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33634   fnvoid_va_start_ms =
33635     build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33636   fnvoid_va_end_sysv =
33637     build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33638   fnvoid_va_start_sysv =
33639     build_varargs_function_type_list (void_type_node, sysv_va_ref,
33640     				       NULL_TREE);
33641   fnvoid_va_copy_ms =
33642     build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33643     			      NULL_TREE);
33644   fnvoid_va_copy_sysv =
33645     build_function_type_list (void_type_node, sysv_va_ref,
33646     			      sysv_va_ref, NULL_TREE);
33647 
33648   add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33649   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33650   add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33651   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33652   add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33653 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33654   add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33655   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33656   add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33657   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33658   add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33659 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33660 }
33661 
33662 static void
33663 ix86_init_builtin_types (void)
33664 {
33665   tree float80_type_node, const_string_type_node;
33666 
33667   /* The __float80 type.  */
33668   float80_type_node = long_double_type_node;
33669   if (TYPE_MODE (float80_type_node) != XFmode)
33670     {
33671       if (float64x_type_node != NULL_TREE
33672 	  && TYPE_MODE (float64x_type_node) == XFmode)
33673 	float80_type_node = float64x_type_node;
33674       else
33675 	{
33676 	  /* The __float80 type.  */
33677 	  float80_type_node = make_node (REAL_TYPE);
33678 
33679 	  TYPE_PRECISION (float80_type_node) = 80;
33680 	  layout_type (float80_type_node);
33681 	}
33682     }
33683   lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33684 
33685   /* The __float128 type.  The node has already been created as
33686      _Float128, so we only need to register the __float128 name for
33687      it.  */
33688   lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33689 
33690   const_string_type_node
33691     = build_pointer_type (build_qualified_type
33692 			  (char_type_node, TYPE_QUAL_CONST));
33693 
33694   /* This macro is built by i386-builtin-types.awk.  */
33695   DEFINE_BUILTIN_PRIMITIVE_TYPES;
33696 }
33697 
33698 static void
33699 ix86_init_builtins (void)
33700 {
33701   tree ftype, decl;
33702 
33703   ix86_init_builtin_types ();
33704 
33705   /* Builtins to get CPU type and features. */
33706   ix86_init_platform_type_builtins ();
33707 
33708   /* TFmode support builtins.  */
33709   def_builtin_const (0, "__builtin_infq",
33710 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33711   def_builtin_const (0, "__builtin_huge_valq",
33712 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33713 
33714   ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33715   decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33716 			       BUILT_IN_MD, "nanq", NULL_TREE);
33717   TREE_READONLY (decl) = 1;
33718   ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33719 
33720   decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33721 			       BUILT_IN_MD, "nansq", NULL_TREE);
33722   TREE_READONLY (decl) = 1;
33723   ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33724 
33725   /* We will expand them to a normal call if SSE isn't available since
33726      they are used by libgcc.  */
33727   ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33728   decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33729 			       BUILT_IN_MD, "__fabstf2", NULL_TREE);
33730   TREE_READONLY (decl) = 1;
33731   ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33732 
33733   ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33734   decl = add_builtin_function ("__builtin_copysignq", ftype,
33735 			       IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33736 			       "__copysigntf3", NULL_TREE);
33737   TREE_READONLY (decl) = 1;
33738   ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33739 
33740   ix86_init_tm_builtins ();
33741   ix86_init_mmx_sse_builtins ();
33742   ix86_init_mpx_builtins ();
33743 
33744   if (TARGET_LP64)
33745     ix86_init_builtins_va_builtins_abi ();
33746 
33747 #ifdef SUBTARGET_INIT_BUILTINS
33748   SUBTARGET_INIT_BUILTINS;
33749 #endif
33750 }
33751 
33752 /* Return the ix86 builtin for CODE.  */
33753 
33754 static tree
33755 ix86_builtin_decl (unsigned code, bool)
33756 {
33757   if (code >= IX86_BUILTIN_MAX)
33758     return error_mark_node;
33759 
33760   return ix86_builtins[code];
33761 }
33762 
33763 /* Errors in the source file can cause expand_expr to return const0_rtx
33764    where we expect a vector.  To avoid crashing, use one of the vector
33765    clear instructions.  */
33766 static rtx
33767 safe_vector_operand (rtx x, machine_mode mode)
33768 {
33769   if (x == const0_rtx)
33770     x = CONST0_RTX (mode);
33771   return x;
33772 }
33773 
33774 /* Fixup modeless constants to fit required mode.  */
33775 /* Fix up modeless constants to fit the required mode.  */
33776 fixup_modeless_constant (rtx x, machine_mode mode)
33777 {
33778   if (GET_MODE (x) == VOIDmode)
33779     x = convert_to_mode (mode, x, 1);
33780   return x;
33781 }
33782 
33783 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
33784 
33785 static rtx
33786 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33787 {
33788   rtx pat;
33789   tree arg0 = CALL_EXPR_ARG (exp, 0);
33790   tree arg1 = CALL_EXPR_ARG (exp, 1);
33791   rtx op0 = expand_normal (arg0);
33792   rtx op1 = expand_normal (arg1);
33793   machine_mode tmode = insn_data[icode].operand[0].mode;
33794   machine_mode mode0 = insn_data[icode].operand[1].mode;
33795   machine_mode mode1 = insn_data[icode].operand[2].mode;
33796 
33797   if (VECTOR_MODE_P (mode0))
33798     op0 = safe_vector_operand (op0, mode0);
33799   if (VECTOR_MODE_P (mode1))
33800     op1 = safe_vector_operand (op1, mode1);
33801 
33802   if (optimize || !target
33803       || GET_MODE (target) != tmode
33804       || !insn_data[icode].operand[0].predicate (target, tmode))
33805     target = gen_reg_rtx (tmode);
33806 
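  /* If the pattern wants a TImode operand but the argument expanded to
     an SImode value, load it into a vector register first and use the
     TImode lowpart of that register.  */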
33807   if (GET_MODE (op1) == SImode && mode1 == TImode)
33808     {
33809       rtx x = gen_reg_rtx (V4SImode);
33810       emit_insn (gen_sse2_loadd (x, op1));
33811       op1 = gen_lowpart (TImode, x);
33812     }
33813 
33814   if (!insn_data[icode].operand[1].predicate (op0, mode0))
33815     op0 = copy_to_mode_reg (mode0, op0);
33816   if (!insn_data[icode].operand[2].predicate (op1, mode1))
33817     op1 = copy_to_mode_reg (mode1, op1);
33818 
33819   pat = GEN_FCN (icode) (target, op0, op1);
33820   if (! pat)
33821     return 0;
33822 
33823   emit_insn (pat);
33824 
33825   return target;
33826 }
33827 
33828 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
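/* The M_TYPE value encodes how many operands the builtin takes and in
   which modes, and SUB_CODE supplies the comparison or condition code
   used by the *_CMP and *_TF variants (see the switch below).  */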
33829 
33830 static rtx
33831 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33832 			       enum ix86_builtin_func_type m_type,
33833 			       enum rtx_code sub_code)
33834 {
33835   rtx pat;
33836   int i;
33837   int nargs;
33838   bool comparison_p = false;
33839   bool tf_p = false;
33840   bool last_arg_constant = false;
33841   int num_memory = 0;
33842   struct {
33843     rtx op;
33844     machine_mode mode;
33845   } args[4];
33846 
33847   machine_mode tmode = insn_data[icode].operand[0].mode;
33848 
33849   switch (m_type)
33850     {
33851     case MULTI_ARG_4_DF2_DI_I:
33852     case MULTI_ARG_4_DF2_DI_I1:
33853     case MULTI_ARG_4_SF2_SI_I:
33854     case MULTI_ARG_4_SF2_SI_I1:
33855       nargs = 4;
33856       last_arg_constant = true;
33857       break;
33858 
33859     case MULTI_ARG_3_SF:
33860     case MULTI_ARG_3_DF:
33861     case MULTI_ARG_3_SF2:
33862     case MULTI_ARG_3_DF2:
33863     case MULTI_ARG_3_DI:
33864     case MULTI_ARG_3_SI:
33865     case MULTI_ARG_3_SI_DI:
33866     case MULTI_ARG_3_HI:
33867     case MULTI_ARG_3_HI_SI:
33868     case MULTI_ARG_3_QI:
33869     case MULTI_ARG_3_DI2:
33870     case MULTI_ARG_3_SI2:
33871     case MULTI_ARG_3_HI2:
33872     case MULTI_ARG_3_QI2:
33873       nargs = 3;
33874       break;
33875 
33876     case MULTI_ARG_2_SF:
33877     case MULTI_ARG_2_DF:
33878     case MULTI_ARG_2_DI:
33879     case MULTI_ARG_2_SI:
33880     case MULTI_ARG_2_HI:
33881     case MULTI_ARG_2_QI:
33882       nargs = 2;
33883       break;
33884 
33885     case MULTI_ARG_2_DI_IMM:
33886     case MULTI_ARG_2_SI_IMM:
33887     case MULTI_ARG_2_HI_IMM:
33888     case MULTI_ARG_2_QI_IMM:
33889       nargs = 2;
33890       last_arg_constant = true;
33891       break;
33892 
33893     case MULTI_ARG_1_SF:
33894     case MULTI_ARG_1_DF:
33895     case MULTI_ARG_1_SF2:
33896     case MULTI_ARG_1_DF2:
33897     case MULTI_ARG_1_DI:
33898     case MULTI_ARG_1_SI:
33899     case MULTI_ARG_1_HI:
33900     case MULTI_ARG_1_QI:
33901     case MULTI_ARG_1_SI_DI:
33902     case MULTI_ARG_1_HI_DI:
33903     case MULTI_ARG_1_HI_SI:
33904     case MULTI_ARG_1_QI_DI:
33905     case MULTI_ARG_1_QI_SI:
33906     case MULTI_ARG_1_QI_HI:
33907       nargs = 1;
33908       break;
33909 
33910     case MULTI_ARG_2_DI_CMP:
33911     case MULTI_ARG_2_SI_CMP:
33912     case MULTI_ARG_2_HI_CMP:
33913     case MULTI_ARG_2_QI_CMP:
33914       nargs = 2;
33915       comparison_p = true;
33916       break;
33917 
33918     case MULTI_ARG_2_SF_TF:
33919     case MULTI_ARG_2_DF_TF:
33920     case MULTI_ARG_2_DI_TF:
33921     case MULTI_ARG_2_SI_TF:
33922     case MULTI_ARG_2_HI_TF:
33923     case MULTI_ARG_2_QI_TF:
33924       nargs = 2;
33925       tf_p = true;
33926       break;
33927 
33928     default:
33929       gcc_unreachable ();
33930     }
33931 
33932   if (optimize || !target
33933       || GET_MODE (target) != tmode
33934       || !insn_data[icode].operand[0].predicate (target, tmode))
33935     target = gen_reg_rtx (tmode);
33936   else if (memory_operand (target, tmode))
33937     num_memory++;
33938 
33939   gcc_assert (nargs <= 4);
33940 
33941   for (i = 0; i < nargs; i++)
33942     {
33943       tree arg = CALL_EXPR_ARG (exp, i);
33944       rtx op = expand_normal (arg);
33945       int adjust = (comparison_p) ? 1 : 0;
33946       machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33947 
33948       if (last_arg_constant && i == nargs - 1)
33949 	{
33950 	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33951 	    {
33952 	      enum insn_code new_icode = icode;
33953 	      switch (icode)
33954 		{
33955 		case CODE_FOR_xop_vpermil2v2df3:
33956 		case CODE_FOR_xop_vpermil2v4sf3:
33957 		case CODE_FOR_xop_vpermil2v4df3:
33958 		case CODE_FOR_xop_vpermil2v8sf3:
33959 		  error ("the last argument must be a 2-bit immediate");
33960 		  return gen_reg_rtx (tmode);
33961 		case CODE_FOR_xop_rotlv2di3:
33962 		  new_icode = CODE_FOR_rotlv2di3;
33963 		  goto xop_rotl;
33964 		case CODE_FOR_xop_rotlv4si3:
33965 		  new_icode = CODE_FOR_rotlv4si3;
33966 		  goto xop_rotl;
33967 		case CODE_FOR_xop_rotlv8hi3:
33968 		  new_icode = CODE_FOR_rotlv8hi3;
33969 		  goto xop_rotl;
33970 		case CODE_FOR_xop_rotlv16qi3:
33971 		  new_icode = CODE_FOR_rotlv16qi3;
33972 		xop_rotl:
33973 		  if (CONST_INT_P (op))
33974 		    {
33975 		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33976 		      op = GEN_INT (INTVAL (op) & mask);
33977 		      gcc_checking_assert
33978 			(insn_data[icode].operand[i + 1].predicate (op, mode));
33979 		    }
33980 		  else
33981 		    {
33982 		      gcc_checking_assert
33983 			(nargs == 2
33984 			 && insn_data[new_icode].operand[0].mode == tmode
33985 			 && insn_data[new_icode].operand[1].mode == tmode
33986 			 && insn_data[new_icode].operand[2].mode == mode
33987 			 && insn_data[new_icode].operand[0].predicate
33988 			    == insn_data[icode].operand[0].predicate
33989 			 && insn_data[new_icode].operand[1].predicate
33990 			    == insn_data[icode].operand[1].predicate);
33991 		      icode = new_icode;
33992 		      goto non_constant;
33993 		    }
33994 		  break;
33995 		default:
33996 		  gcc_unreachable ();
33997 		}
33998 	    }
33999 	}
34000       else
34001 	{
34002 	non_constant:
34003 	  if (VECTOR_MODE_P (mode))
34004 	    op = safe_vector_operand (op, mode);
34005 
34006 	  /* If we aren't optimizing, only allow one memory operand to be
34007 	     generated.  */
34008 	  if (memory_operand (op, mode))
34009 	    num_memory++;
34010 
34011 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34012 
34013 	  if (optimize
34014 	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34015 	      || num_memory > 1)
34016 	    op = force_reg (mode, op);
34017 	}
34018 
34019       args[i].op = op;
34020       args[i].mode = mode;
34021     }
34022 
34023   switch (nargs)
34024     {
34025     case 1:
34026       pat = GEN_FCN (icode) (target, args[0].op);
34027       break;
34028 
34029     case 2:
34030       if (tf_p)
34031 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34032 			       GEN_INT ((int)sub_code));
34033       else if (! comparison_p)
34034 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34035       else
34036 	{
34037 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34038 				       args[0].op,
34039 				       args[1].op);
34040 
34041 	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34042 	}
34043       break;
34044 
34045     case 3:
34046       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34047       break;
34048 
34049     case 4:
34050       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34051       break;
34052 
34053     default:
34054       gcc_unreachable ();
34055     }
34056 
34057   if (! pat)
34058     return 0;
34059 
34060   emit_insn (pat);
34061   return target;
34062 }
34063 
34064 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34065    insns with vec_merge.  */
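/* The single source operand is passed twice to the pattern: once as the
   operand of the unary operation and once as the vec_merge source that
   supplies the untouched upper elements.  This expander is reached from
   ix86_expand_args_builtin below for the *_VEC_MERGE signatures
   (e.g. V4SF_FTYPE_V4SF_VEC_MERGE).  */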
34066 
34067 static rtx
34068 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34069 				    rtx target)
34070 {
34071   rtx pat;
34072   tree arg0 = CALL_EXPR_ARG (exp, 0);
34073   rtx op1, op0 = expand_normal (arg0);
34074   machine_mode tmode = insn_data[icode].operand[0].mode;
34075   machine_mode mode0 = insn_data[icode].operand[1].mode;
34076 
34077   if (optimize || !target
34078       || GET_MODE (target) != tmode
34079       || !insn_data[icode].operand[0].predicate (target, tmode))
34080     target = gen_reg_rtx (tmode);
34081 
34082   if (VECTOR_MODE_P (mode0))
34083     op0 = safe_vector_operand (op0, mode0);
34084 
34085   if ((optimize && !register_operand (op0, mode0))
34086       || !insn_data[icode].operand[1].predicate (op0, mode0))
34087     op0 = copy_to_mode_reg (mode0, op0);
34088 
34089   op1 = op0;
34090   if (!insn_data[icode].operand[2].predicate (op1, mode0))
34091     op1 = copy_to_mode_reg (mode0, op1);
34092 
34093   pat = GEN_FCN (icode) (target, op0, op1);
34094   if (! pat)
34095     return 0;
34096   emit_insn (pat);
34097   return target;
34098 }
34099 
34100 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
34101 
34102 static rtx
34103 ix86_expand_sse_compare (const struct builtin_description *d,
34104 			 tree exp, rtx target, bool swap)
34105 {
34106   rtx pat;
34107   tree arg0 = CALL_EXPR_ARG (exp, 0);
34108   tree arg1 = CALL_EXPR_ARG (exp, 1);
34109   rtx op0 = expand_normal (arg0);
34110   rtx op1 = expand_normal (arg1);
34111   rtx op2;
34112   machine_mode tmode = insn_data[d->icode].operand[0].mode;
34113   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34114   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34115   enum rtx_code comparison = d->comparison;
34116 
34117   if (VECTOR_MODE_P (mode0))
34118     op0 = safe_vector_operand (op0, mode0);
34119   if (VECTOR_MODE_P (mode1))
34120     op1 = safe_vector_operand (op1, mode1);
34121 
34122   /* Swap operands if we have a comparison that isn't available in
34123      hardware.  */
34124   if (swap)
34125     std::swap (op0, op1);
34126 
34127   if (optimize || !target
34128       || GET_MODE (target) != tmode
34129       || !insn_data[d->icode].operand[0].predicate (target, tmode))
34130     target = gen_reg_rtx (tmode);
34131 
34132   if ((optimize && !register_operand (op0, mode0))
34133       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34134     op0 = copy_to_mode_reg (mode0, op0);
34135   if ((optimize && !register_operand (op1, mode1))
34136       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34137     op1 = copy_to_mode_reg (mode1, op1);
34138 
34139   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34140   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34141   if (! pat)
34142     return 0;
34143   emit_insn (pat);
34144   return target;
34145 }
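/* This is reached from ix86_expand_args_builtin below for two-operand
   signatures whose descriptor carries a comparison code, including the
   *_SWAP signatures; when swap is set the operands are exchanged so that a
   predicate the hardware does not provide directly can be expressed through
   its mirrored form.  */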
34146 
34147 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
34148 
34149 static rtx
34150 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34151 		      rtx target)
34152 {
34153   rtx pat;
34154   tree arg0 = CALL_EXPR_ARG (exp, 0);
34155   tree arg1 = CALL_EXPR_ARG (exp, 1);
34156   rtx op0 = expand_normal (arg0);
34157   rtx op1 = expand_normal (arg1);
34158   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34159   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34160   enum rtx_code comparison = d->comparison;
34161 
34162   if (VECTOR_MODE_P (mode0))
34163     op0 = safe_vector_operand (op0, mode0);
34164   if (VECTOR_MODE_P (mode1))
34165     op1 = safe_vector_operand (op1, mode1);
34166 
34167   /* Swap operands if we have a comparison that isn't available in
34168      hardware.  */
34169   if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34170     std::swap (op0, op1);
34171 
34172   target = gen_reg_rtx (SImode);
34173   emit_move_insn (target, const0_rtx);
34174   target = gen_rtx_SUBREG (QImode, target, 0);
34175 
34176   if ((optimize && !register_operand (op0, mode0))
34177       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34178     op0 = copy_to_mode_reg (mode0, op0);
34179   if ((optimize && !register_operand (op1, mode1))
34180       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34181     op1 = copy_to_mode_reg (mode1, op1);
34182 
34183   pat = GEN_FCN (d->icode) (op0, op1);
34184   if (! pat)
34185     return 0;
34186   emit_insn (pat);
34187   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34188 			  gen_rtx_fmt_ee (comparison, QImode,
34189 					  SET_DEST (pat),
34190 					  const0_rtx)));
34191 
34192   return SUBREG_REG (target);
34193 }
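/* Note the return-value idiom used above (and repeated in the ptest and
   pcmp[ei]str expanders below): an SImode pseudo is zeroed and only its low
   QImode part is set from the flags comparison via STRICT_LOW_PART, so the
   builtin's int result is already zero-extended.  */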
34194 
34195 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
34196 
34197 static rtx
34198 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34199 		       rtx target)
34200 {
34201   rtx pat;
34202   tree arg0 = CALL_EXPR_ARG (exp, 0);
34203   rtx op1, op0 = expand_normal (arg0);
34204   machine_mode tmode = insn_data[d->icode].operand[0].mode;
34205   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34206 
34207   if (optimize || target == 0
34208       || GET_MODE (target) != tmode
34209       || !insn_data[d->icode].operand[0].predicate (target, tmode))
34210     target = gen_reg_rtx (tmode);
34211 
34212   if (VECTOR_MODE_P (mode0))
34213     op0 = safe_vector_operand (op0, mode0);
34214 
34215   if ((optimize && !register_operand (op0, mode0))
34216       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34217     op0 = copy_to_mode_reg (mode0, op0);
34218 
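  /* For the round builtins the descriptor's comparison field carries the
     rounding-mode immediate rather than an rtx comparison code.  */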
34219   op1 = GEN_INT (d->comparison);
34220 
34221   pat = GEN_FCN (d->icode) (target, op0, op1);
34222   if (! pat)
34223     return 0;
34224   emit_insn (pat);
34225   return target;
34226 }
34227 
34228 static rtx
34229 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34230 				     tree exp, rtx target)
34231 {
34232   rtx pat;
34233   tree arg0 = CALL_EXPR_ARG (exp, 0);
34234   tree arg1 = CALL_EXPR_ARG (exp, 1);
34235   rtx op0 = expand_normal (arg0);
34236   rtx op1 = expand_normal (arg1);
34237   rtx op2;
34238   machine_mode tmode = insn_data[d->icode].operand[0].mode;
34239   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34240   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34241 
34242   if (optimize || target == 0
34243       || GET_MODE (target) != tmode
34244       || !insn_data[d->icode].operand[0].predicate (target, tmode))
34245     target = gen_reg_rtx (tmode);
34246 
34247   op0 = safe_vector_operand (op0, mode0);
34248   op1 = safe_vector_operand (op1, mode1);
34249 
34250   if ((optimize && !register_operand (op0, mode0))
34251       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34252     op0 = copy_to_mode_reg (mode0, op0);
34253   if ((optimize && !register_operand (op1, mode1))
34254       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34255     op1 = copy_to_mode_reg (mode1, op1);
34256 
34257   op2 = GEN_INT (d->comparison);
34258 
34259   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34260   if (! pat)
34261     return 0;
34262   emit_insn (pat);
34263   return target;
34264 }
34265 
34266 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
34267 
34268 static rtx
34269 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34270 		       rtx target)
34271 {
34272   rtx pat;
34273   tree arg0 = CALL_EXPR_ARG (exp, 0);
34274   tree arg1 = CALL_EXPR_ARG (exp, 1);
34275   rtx op0 = expand_normal (arg0);
34276   rtx op1 = expand_normal (arg1);
34277   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34278   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34279   enum rtx_code comparison = d->comparison;
34280 
34281   if (VECTOR_MODE_P (mode0))
34282     op0 = safe_vector_operand (op0, mode0);
34283   if (VECTOR_MODE_P (mode1))
34284     op1 = safe_vector_operand (op1, mode1);
34285 
34286   target = gen_reg_rtx (SImode);
34287   emit_move_insn (target, const0_rtx);
34288   target = gen_rtx_SUBREG (QImode, target, 0);
34289 
34290   if ((optimize && !register_operand (op0, mode0))
34291       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34292     op0 = copy_to_mode_reg (mode0, op0);
34293   if ((optimize && !register_operand (op1, mode1))
34294       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34295     op1 = copy_to_mode_reg (mode1, op1);
34296 
34297   pat = GEN_FCN (d->icode) (op0, op1);
34298   if (! pat)
34299     return 0;
34300   emit_insn (pat);
34301   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34302 			  gen_rtx_fmt_ee (comparison, QImode,
34303 					  SET_DEST (pat),
34304 					  const0_rtx)));
34305 
34306   return SUBREG_REG (target);
34307 }
34308 
34309 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
34310 
34311 static rtx
34312 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34313 			  tree exp, rtx target)
34314 {
34315   rtx pat;
34316   tree arg0 = CALL_EXPR_ARG (exp, 0);
34317   tree arg1 = CALL_EXPR_ARG (exp, 1);
34318   tree arg2 = CALL_EXPR_ARG (exp, 2);
34319   tree arg3 = CALL_EXPR_ARG (exp, 3);
34320   tree arg4 = CALL_EXPR_ARG (exp, 4);
34321   rtx scratch0, scratch1;
34322   rtx op0 = expand_normal (arg0);
34323   rtx op1 = expand_normal (arg1);
34324   rtx op2 = expand_normal (arg2);
34325   rtx op3 = expand_normal (arg3);
34326   rtx op4 = expand_normal (arg4);
34327   machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34328 
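  /* Operands 0 and 1 of the pattern are the two results (the index and the
     mask); operands 2 and 4 are the string operands, 3 and 5 their explicit
     lengths, and operand 6 is the control immediate.  */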
34329   tmode0 = insn_data[d->icode].operand[0].mode;
34330   tmode1 = insn_data[d->icode].operand[1].mode;
34331   modev2 = insn_data[d->icode].operand[2].mode;
34332   modei3 = insn_data[d->icode].operand[3].mode;
34333   modev4 = insn_data[d->icode].operand[4].mode;
34334   modei5 = insn_data[d->icode].operand[5].mode;
34335   modeimm = insn_data[d->icode].operand[6].mode;
34336 
34337   if (VECTOR_MODE_P (modev2))
34338     op0 = safe_vector_operand (op0, modev2);
34339   if (VECTOR_MODE_P (modev4))
34340     op2 = safe_vector_operand (op2, modev4);
34341 
34342   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34343     op0 = copy_to_mode_reg (modev2, op0);
34344   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34345     op1 = copy_to_mode_reg (modei3, op1);
34346   if ((optimize && !register_operand (op2, modev4))
34347       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34348     op2 = copy_to_mode_reg (modev4, op2);
34349   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34350     op3 = copy_to_mode_reg (modei5, op3);
34351 
34352   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34353     {
34354       error ("the fifth argument must be an 8-bit immediate");
34355       return const0_rtx;
34356     }
34357 
34358   if (d->code == IX86_BUILTIN_PCMPESTRI128)
34359     {
34360       if (optimize || !target
34361 	  || GET_MODE (target) != tmode0
34362 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34363 	target = gen_reg_rtx (tmode0);
34364 
34365       scratch1 = gen_reg_rtx (tmode1);
34366 
34367       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34368     }
34369   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34370     {
34371       if (optimize || !target
34372 	  || GET_MODE (target) != tmode1
34373 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34374 	target = gen_reg_rtx (tmode1);
34375 
34376       scratch0 = gen_reg_rtx (tmode0);
34377 
34378       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34379     }
34380   else
34381     {
34382       gcc_assert (d->flag);
34383 
34384       scratch0 = gen_reg_rtx (tmode0);
34385       scratch1 = gen_reg_rtx (tmode1);
34386 
34387       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34388     }
34389 
34390   if (! pat)
34391     return 0;
34392 
34393   emit_insn (pat);
34394 
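  /* A nonzero d->flag means the builtin returns a condition flag rather
     than the index or mask: d->flag is reused as the machine mode in which
     the flags register is read, and the result byte is set from that flags
     value compared (EQ) against zero.  */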
34395   if (d->flag)
34396     {
34397       target = gen_reg_rtx (SImode);
34398       emit_move_insn (target, const0_rtx);
34399       target = gen_rtx_SUBREG (QImode, target, 0);
34400 
34401       emit_insn
34402 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34403 		      gen_rtx_fmt_ee (EQ, QImode,
34404 				      gen_rtx_REG ((machine_mode) d->flag,
34405 						   FLAGS_REG),
34406 				      const0_rtx)));
34407       return SUBREG_REG (target);
34408     }
34409   else
34410     return target;
34411 }
34412 
34413 
34414 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
34415 
34416 static rtx
34417 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34418 			  tree exp, rtx target)
34419 {
34420   rtx pat;
34421   tree arg0 = CALL_EXPR_ARG (exp, 0);
34422   tree arg1 = CALL_EXPR_ARG (exp, 1);
34423   tree arg2 = CALL_EXPR_ARG (exp, 2);
34424   rtx scratch0, scratch1;
34425   rtx op0 = expand_normal (arg0);
34426   rtx op1 = expand_normal (arg1);
34427   rtx op2 = expand_normal (arg2);
34428   machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34429 
34430   tmode0 = insn_data[d->icode].operand[0].mode;
34431   tmode1 = insn_data[d->icode].operand[1].mode;
34432   modev2 = insn_data[d->icode].operand[2].mode;
34433   modev3 = insn_data[d->icode].operand[3].mode;
34434   modeimm = insn_data[d->icode].operand[4].mode;
34435 
34436   if (VECTOR_MODE_P (modev2))
34437     op0 = safe_vector_operand (op0, modev2);
34438   if (VECTOR_MODE_P (modev3))
34439     op1 = safe_vector_operand (op1, modev3);
34440 
34441   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34442     op0 = copy_to_mode_reg (modev2, op0);
34443   if ((optimize && !register_operand (op1, modev3))
34444       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34445     op1 = copy_to_mode_reg (modev3, op1);
34446 
34447   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34448     {
34449       error ("the third argument must be an 8-bit immediate");
34450       return const0_rtx;
34451     }
34452 
34453   if (d->code == IX86_BUILTIN_PCMPISTRI128)
34454     {
34455       if (optimize || !target
34456 	  || GET_MODE (target) != tmode0
34457 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34458 	target = gen_reg_rtx (tmode0);
34459 
34460       scratch1 = gen_reg_rtx (tmode1);
34461 
34462       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34463     }
34464   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34465     {
34466       if (optimize || !target
34467 	  || GET_MODE (target) != tmode1
34468 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34469 	target = gen_reg_rtx (tmode1);
34470 
34471       scratch0 = gen_reg_rtx (tmode0);
34472 
34473       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34474     }
34475   else
34476     {
34477       gcc_assert (d->flag);
34478 
34479       scratch0 = gen_reg_rtx (tmode0);
34480       scratch1 = gen_reg_rtx (tmode1);
34481 
34482       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34483     }
34484 
34485   if (! pat)
34486     return 0;
34487 
34488   emit_insn (pat);
34489 
34490   if (d->flag)
34491     {
34492       target = gen_reg_rtx (SImode);
34493       emit_move_insn (target, const0_rtx);
34494       target = gen_rtx_SUBREG (QImode, target, 0);
34495 
34496       emit_insn
34497 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34498 		      gen_rtx_fmt_ee (EQ, QImode,
34499 				      gen_rtx_REG ((machine_mode) d->flag,
34500 						   FLAGS_REG),
34501 				      const0_rtx)));
34502       return SUBREG_REG (target);
34503     }
34504   else
34505     return target;
34506 }
34507 
34508 /* Subroutine of ix86_expand_builtin to take care of insns with
34509    variable number of operands.  */
34510 
34511 static rtx
34512 ix86_expand_args_builtin (const struct builtin_description *d,
34513 			  tree exp, rtx target)
34514 {
34515   rtx pat, real_target;
34516   unsigned int i, nargs;
34517   unsigned int nargs_constant = 0;
34518   unsigned int mask_pos = 0;
34519   int num_memory = 0;
34520   struct
34521     {
34522       rtx op;
34523       machine_mode mode;
34524     } args[6];
34525   bool second_arg_count = false;
34526   enum insn_code icode = d->icode;
34527   const struct insn_data_d *insn_p = &insn_data[icode];
34528   machine_mode tmode = insn_p->operand[0].mode;
34529   machine_mode rmode = VOIDmode;
34530   bool swap = false;
34531   enum rtx_code comparison = d->comparison;
34532 
34533   switch ((enum ix86_builtin_func_type) d->flag)
34534     {
34535     case V2DF_FTYPE_V2DF_ROUND:
34536     case V4DF_FTYPE_V4DF_ROUND:
34537     case V8DF_FTYPE_V8DF_ROUND:
34538     case V4SF_FTYPE_V4SF_ROUND:
34539     case V8SF_FTYPE_V8SF_ROUND:
34540     case V16SF_FTYPE_V16SF_ROUND:
34541     case V4SI_FTYPE_V4SF_ROUND:
34542     case V8SI_FTYPE_V8SF_ROUND:
34543     case V16SI_FTYPE_V16SF_ROUND:
34544       return ix86_expand_sse_round (d, exp, target);
34545     case V4SI_FTYPE_V2DF_V2DF_ROUND:
34546     case V8SI_FTYPE_V4DF_V4DF_ROUND:
34547     case V16SI_FTYPE_V8DF_V8DF_ROUND:
34548       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34549     case INT_FTYPE_V8SF_V8SF_PTEST:
34550     case INT_FTYPE_V4DI_V4DI_PTEST:
34551     case INT_FTYPE_V4DF_V4DF_PTEST:
34552     case INT_FTYPE_V4SF_V4SF_PTEST:
34553     case INT_FTYPE_V2DI_V2DI_PTEST:
34554     case INT_FTYPE_V2DF_V2DF_PTEST:
34555       return ix86_expand_sse_ptest (d, exp, target);
34556     case FLOAT128_FTYPE_FLOAT128:
34557     case FLOAT_FTYPE_FLOAT:
34558     case INT_FTYPE_INT:
34559     case UINT_FTYPE_UINT:
34560     case UINT16_FTYPE_UINT16:
34561     case UINT64_FTYPE_INT:
34562     case UINT64_FTYPE_UINT64:
34563     case INT64_FTYPE_INT64:
34564     case INT64_FTYPE_V4SF:
34565     case INT64_FTYPE_V2DF:
34566     case INT_FTYPE_V16QI:
34567     case INT_FTYPE_V8QI:
34568     case INT_FTYPE_V8SF:
34569     case INT_FTYPE_V4DF:
34570     case INT_FTYPE_V4SF:
34571     case INT_FTYPE_V2DF:
34572     case INT_FTYPE_V32QI:
34573     case V16QI_FTYPE_V16QI:
34574     case V8SI_FTYPE_V8SF:
34575     case V8SI_FTYPE_V4SI:
34576     case V8HI_FTYPE_V8HI:
34577     case V8HI_FTYPE_V16QI:
34578     case V8QI_FTYPE_V8QI:
34579     case V8SF_FTYPE_V8SF:
34580     case V8SF_FTYPE_V8SI:
34581     case V8SF_FTYPE_V4SF:
34582     case V8SF_FTYPE_V8HI:
34583     case V4SI_FTYPE_V4SI:
34584     case V4SI_FTYPE_V16QI:
34585     case V4SI_FTYPE_V4SF:
34586     case V4SI_FTYPE_V8SI:
34587     case V4SI_FTYPE_V8HI:
34588     case V4SI_FTYPE_V4DF:
34589     case V4SI_FTYPE_V2DF:
34590     case V4HI_FTYPE_V4HI:
34591     case V4DF_FTYPE_V4DF:
34592     case V4DF_FTYPE_V4SI:
34593     case V4DF_FTYPE_V4SF:
34594     case V4DF_FTYPE_V2DF:
34595     case V4SF_FTYPE_V4SF:
34596     case V4SF_FTYPE_V4SI:
34597     case V4SF_FTYPE_V8SF:
34598     case V4SF_FTYPE_V4DF:
34599     case V4SF_FTYPE_V8HI:
34600     case V4SF_FTYPE_V2DF:
34601     case V2DI_FTYPE_V2DI:
34602     case V2DI_FTYPE_V16QI:
34603     case V2DI_FTYPE_V8HI:
34604     case V2DI_FTYPE_V4SI:
34605     case V2DF_FTYPE_V2DF:
34606     case V2DF_FTYPE_V4SI:
34607     case V2DF_FTYPE_V4DF:
34608     case V2DF_FTYPE_V4SF:
34609     case V2DF_FTYPE_V2SI:
34610     case V2SI_FTYPE_V2SI:
34611     case V2SI_FTYPE_V4SF:
34612     case V2SI_FTYPE_V2SF:
34613     case V2SI_FTYPE_V2DF:
34614     case V2SF_FTYPE_V2SF:
34615     case V2SF_FTYPE_V2SI:
34616     case V32QI_FTYPE_V32QI:
34617     case V32QI_FTYPE_V16QI:
34618     case V16HI_FTYPE_V16HI:
34619     case V16HI_FTYPE_V8HI:
34620     case V8SI_FTYPE_V8SI:
34621     case V16HI_FTYPE_V16QI:
34622     case V8SI_FTYPE_V16QI:
34623     case V4DI_FTYPE_V16QI:
34624     case V8SI_FTYPE_V8HI:
34625     case V4DI_FTYPE_V8HI:
34626     case V4DI_FTYPE_V4SI:
34627     case V4DI_FTYPE_V2DI:
34628     case UQI_FTYPE_UQI:
34629     case UHI_FTYPE_UHI:
34630     case USI_FTYPE_USI:
34631     case USI_FTYPE_UQI:
34632     case USI_FTYPE_UHI:
34633     case UDI_FTYPE_UDI:
34634     case UHI_FTYPE_V16QI:
34635     case USI_FTYPE_V32QI:
34636     case UDI_FTYPE_V64QI:
34637     case V16QI_FTYPE_UHI:
34638     case V32QI_FTYPE_USI:
34639     case V64QI_FTYPE_UDI:
34640     case V8HI_FTYPE_UQI:
34641     case V16HI_FTYPE_UHI:
34642     case V32HI_FTYPE_USI:
34643     case V4SI_FTYPE_UQI:
34644     case V8SI_FTYPE_UQI:
34645     case V4SI_FTYPE_UHI:
34646     case V8SI_FTYPE_UHI:
34647     case UQI_FTYPE_V8HI:
34648     case UHI_FTYPE_V16HI:
34649     case USI_FTYPE_V32HI:
34650     case UQI_FTYPE_V4SI:
34651     case UQI_FTYPE_V8SI:
34652     case UHI_FTYPE_V16SI:
34653     case UQI_FTYPE_V2DI:
34654     case UQI_FTYPE_V4DI:
34655     case UQI_FTYPE_V8DI:
34656     case V16SI_FTYPE_UHI:
34657     case V2DI_FTYPE_UQI:
34658     case V4DI_FTYPE_UQI:
34659     case V16SI_FTYPE_INT:
34660     case V16SF_FTYPE_V8SF:
34661     case V16SI_FTYPE_V8SI:
34662     case V16SF_FTYPE_V4SF:
34663     case V16SI_FTYPE_V4SI:
34664     case V16SI_FTYPE_V16SF:
34665     case V16SI_FTYPE_V16SI:
34666     case V64QI_FTYPE_V64QI:
34667     case V32HI_FTYPE_V32HI:
34668     case V16SF_FTYPE_V16SF:
34669     case V8DI_FTYPE_UQI:
34670     case V8DI_FTYPE_V8DI:
34671     case V8DF_FTYPE_V4DF:
34672     case V8DF_FTYPE_V2DF:
34673     case V8DF_FTYPE_V8DF:
34674     case V4DI_FTYPE_V4DI:
34675       nargs = 1;
34676       break;
34677     case V4SF_FTYPE_V4SF_VEC_MERGE:
34678     case V2DF_FTYPE_V2DF_VEC_MERGE:
34679       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34680     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34681     case V16QI_FTYPE_V16QI_V16QI:
34682     case V16QI_FTYPE_V8HI_V8HI:
34683     case V16SF_FTYPE_V16SF_V16SF:
34684     case V8QI_FTYPE_V8QI_V8QI:
34685     case V8QI_FTYPE_V4HI_V4HI:
34686     case V8HI_FTYPE_V8HI_V8HI:
34687     case V8HI_FTYPE_V16QI_V16QI:
34688     case V8HI_FTYPE_V4SI_V4SI:
34689     case V8SF_FTYPE_V8SF_V8SF:
34690     case V8SF_FTYPE_V8SF_V8SI:
34691     case V8DF_FTYPE_V8DF_V8DF:
34692     case V4SI_FTYPE_V4SI_V4SI:
34693     case V4SI_FTYPE_V8HI_V8HI:
34694     case V4SI_FTYPE_V2DF_V2DF:
34695     case V4HI_FTYPE_V4HI_V4HI:
34696     case V4HI_FTYPE_V8QI_V8QI:
34697     case V4HI_FTYPE_V2SI_V2SI:
34698     case V4DF_FTYPE_V4DF_V4DF:
34699     case V4DF_FTYPE_V4DF_V4DI:
34700     case V4SF_FTYPE_V4SF_V4SF:
34701     case V4SF_FTYPE_V4SF_V4SI:
34702     case V4SF_FTYPE_V4SF_V2SI:
34703     case V4SF_FTYPE_V4SF_V2DF:
34704     case V4SF_FTYPE_V4SF_UINT:
34705     case V4SF_FTYPE_V4SF_DI:
34706     case V4SF_FTYPE_V4SF_SI:
34707     case V2DI_FTYPE_V2DI_V2DI:
34708     case V2DI_FTYPE_V16QI_V16QI:
34709     case V2DI_FTYPE_V4SI_V4SI:
34710     case V2DI_FTYPE_V2DI_V16QI:
34711     case V2SI_FTYPE_V2SI_V2SI:
34712     case V2SI_FTYPE_V4HI_V4HI:
34713     case V2SI_FTYPE_V2SF_V2SF:
34714     case V2DF_FTYPE_V2DF_V2DF:
34715     case V2DF_FTYPE_V2DF_V4SF:
34716     case V2DF_FTYPE_V2DF_V2DI:
34717     case V2DF_FTYPE_V2DF_DI:
34718     case V2DF_FTYPE_V2DF_SI:
34719     case V2DF_FTYPE_V2DF_UINT:
34720     case V2SF_FTYPE_V2SF_V2SF:
34721     case V1DI_FTYPE_V1DI_V1DI:
34722     case V1DI_FTYPE_V8QI_V8QI:
34723     case V1DI_FTYPE_V2SI_V2SI:
34724     case V32QI_FTYPE_V16HI_V16HI:
34725     case V16HI_FTYPE_V8SI_V8SI:
34726     case V64QI_FTYPE_V64QI_V64QI:
34727     case V32QI_FTYPE_V32QI_V32QI:
34728     case V16HI_FTYPE_V32QI_V32QI:
34729     case V16HI_FTYPE_V16HI_V16HI:
34730     case V8SI_FTYPE_V4DF_V4DF:
34731     case V8SI_FTYPE_V8SI_V8SI:
34732     case V8SI_FTYPE_V16HI_V16HI:
34733     case V4DI_FTYPE_V4DI_V4DI:
34734     case V4DI_FTYPE_V8SI_V8SI:
34735     case V8DI_FTYPE_V64QI_V64QI:
34736       if (comparison == UNKNOWN)
34737 	return ix86_expand_binop_builtin (icode, exp, target);
34738       nargs = 2;
34739       break;
34740     case V4SF_FTYPE_V4SF_V4SF_SWAP:
34741     case V2DF_FTYPE_V2DF_V2DF_SWAP:
34742       gcc_assert (comparison != UNKNOWN);
34743       nargs = 2;
34744       swap = true;
34745       break;
34746     case V16HI_FTYPE_V16HI_V8HI_COUNT:
34747     case V16HI_FTYPE_V16HI_SI_COUNT:
34748     case V8SI_FTYPE_V8SI_V4SI_COUNT:
34749     case V8SI_FTYPE_V8SI_SI_COUNT:
34750     case V4DI_FTYPE_V4DI_V2DI_COUNT:
34751     case V4DI_FTYPE_V4DI_INT_COUNT:
34752     case V8HI_FTYPE_V8HI_V8HI_COUNT:
34753     case V8HI_FTYPE_V8HI_SI_COUNT:
34754     case V4SI_FTYPE_V4SI_V4SI_COUNT:
34755     case V4SI_FTYPE_V4SI_SI_COUNT:
34756     case V4HI_FTYPE_V4HI_V4HI_COUNT:
34757     case V4HI_FTYPE_V4HI_SI_COUNT:
34758     case V2DI_FTYPE_V2DI_V2DI_COUNT:
34759     case V2DI_FTYPE_V2DI_SI_COUNT:
34760     case V2SI_FTYPE_V2SI_V2SI_COUNT:
34761     case V2SI_FTYPE_V2SI_SI_COUNT:
34762     case V1DI_FTYPE_V1DI_V1DI_COUNT:
34763     case V1DI_FTYPE_V1DI_SI_COUNT:
34764       nargs = 2;
34765       second_arg_count = true;
34766       break;
34767     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
34768     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
34769     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
34770     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
34771     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
34772     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
34773     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
34774     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
34775     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
34776     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
34777     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
34778     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
34779     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
34780     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
34781     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
34782     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
34783     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
34784     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
34785       nargs = 4;
34786       second_arg_count = true;
34787       break;
34788     case UINT64_FTYPE_UINT64_UINT64:
34789     case UINT_FTYPE_UINT_UINT:
34790     case UINT_FTYPE_UINT_USHORT:
34791     case UINT_FTYPE_UINT_UCHAR:
34792     case UINT16_FTYPE_UINT16_INT:
34793     case UINT8_FTYPE_UINT8_INT:
34794     case UQI_FTYPE_UQI_UQI:
34795     case UHI_FTYPE_UHI_UHI:
34796     case USI_FTYPE_USI_USI:
34797     case UDI_FTYPE_UDI_UDI:
34798     case V16SI_FTYPE_V8DF_V8DF:
34799       nargs = 2;
34800       break;
34801     case V2DI_FTYPE_V2DI_INT_CONVERT:
34802       nargs = 2;
34803       rmode = V1TImode;
34804       nargs_constant = 1;
34805       break;
34806     case V4DI_FTYPE_V4DI_INT_CONVERT:
34807       nargs = 2;
34808       rmode = V2TImode;
34809       nargs_constant = 1;
34810       break;
34811     case V8DI_FTYPE_V8DI_INT_CONVERT:
34812       nargs = 2;
34813       rmode = V4TImode;
34814       nargs_constant = 1;
34815       break;
34816     case V8HI_FTYPE_V8HI_INT:
34817     case V8HI_FTYPE_V8SF_INT:
34818     case V16HI_FTYPE_V16SF_INT:
34819     case V8HI_FTYPE_V4SF_INT:
34820     case V8SF_FTYPE_V8SF_INT:
34821     case V4SF_FTYPE_V16SF_INT:
34822     case V16SF_FTYPE_V16SF_INT:
34823     case V4SI_FTYPE_V4SI_INT:
34824     case V4SI_FTYPE_V8SI_INT:
34825     case V4HI_FTYPE_V4HI_INT:
34826     case V4DF_FTYPE_V4DF_INT:
34827     case V4DF_FTYPE_V8DF_INT:
34828     case V4SF_FTYPE_V4SF_INT:
34829     case V4SF_FTYPE_V8SF_INT:
34830     case V2DI_FTYPE_V2DI_INT:
34831     case V2DF_FTYPE_V2DF_INT:
34832     case V2DF_FTYPE_V4DF_INT:
34833     case V16HI_FTYPE_V16HI_INT:
34834     case V8SI_FTYPE_V8SI_INT:
34835     case V16SI_FTYPE_V16SI_INT:
34836     case V4SI_FTYPE_V16SI_INT:
34837     case V4DI_FTYPE_V4DI_INT:
34838     case V2DI_FTYPE_V4DI_INT:
34839     case V4DI_FTYPE_V8DI_INT:
34840     case QI_FTYPE_V4SF_INT:
34841     case QI_FTYPE_V2DF_INT:
34842     case UQI_FTYPE_UQI_UQI_CONST:
34843     case UHI_FTYPE_UHI_UQI:
34844     case USI_FTYPE_USI_UQI:
34845     case UDI_FTYPE_UDI_UQI:
34846       nargs = 2;
34847       nargs_constant = 1;
34848       break;
34849     case V16QI_FTYPE_V16QI_V16QI_V16QI:
34850     case V8SF_FTYPE_V8SF_V8SF_V8SF:
34851     case V4DF_FTYPE_V4DF_V4DF_V4DF:
34852     case V4SF_FTYPE_V4SF_V4SF_V4SF:
34853     case V2DF_FTYPE_V2DF_V2DF_V2DF:
34854     case V32QI_FTYPE_V32QI_V32QI_V32QI:
34855     case UHI_FTYPE_V16SI_V16SI_UHI:
34856     case UQI_FTYPE_V8DI_V8DI_UQI:
34857     case V16HI_FTYPE_V16SI_V16HI_UHI:
34858     case V16QI_FTYPE_V16SI_V16QI_UHI:
34859     case V16QI_FTYPE_V8DI_V16QI_UQI:
34860     case V16SF_FTYPE_V16SF_V16SF_UHI:
34861     case V16SF_FTYPE_V4SF_V16SF_UHI:
34862     case V16SI_FTYPE_SI_V16SI_UHI:
34863     case V16SI_FTYPE_V16HI_V16SI_UHI:
34864     case V16SI_FTYPE_V16QI_V16SI_UHI:
34865     case V8SF_FTYPE_V4SF_V8SF_UQI:
34866     case V4DF_FTYPE_V2DF_V4DF_UQI:
34867     case V8SI_FTYPE_V4SI_V8SI_UQI:
34868     case V8SI_FTYPE_SI_V8SI_UQI:
34869     case V4SI_FTYPE_V4SI_V4SI_UQI:
34870     case V4SI_FTYPE_SI_V4SI_UQI:
34871     case V4DI_FTYPE_V2DI_V4DI_UQI:
34872     case V4DI_FTYPE_DI_V4DI_UQI:
34873     case V2DI_FTYPE_V2DI_V2DI_UQI:
34874     case V2DI_FTYPE_DI_V2DI_UQI:
34875     case V64QI_FTYPE_V64QI_V64QI_UDI:
34876     case V64QI_FTYPE_V16QI_V64QI_UDI:
34877     case V64QI_FTYPE_QI_V64QI_UDI:
34878     case V32QI_FTYPE_V32QI_V32QI_USI:
34879     case V32QI_FTYPE_V16QI_V32QI_USI:
34880     case V32QI_FTYPE_QI_V32QI_USI:
34881     case V16QI_FTYPE_V16QI_V16QI_UHI:
34882     case V16QI_FTYPE_QI_V16QI_UHI:
34883     case V32HI_FTYPE_V8HI_V32HI_USI:
34884     case V32HI_FTYPE_HI_V32HI_USI:
34885     case V16HI_FTYPE_V8HI_V16HI_UHI:
34886     case V16HI_FTYPE_HI_V16HI_UHI:
34887     case V8HI_FTYPE_V8HI_V8HI_UQI:
34888     case V8HI_FTYPE_HI_V8HI_UQI:
34889     case V8SF_FTYPE_V8HI_V8SF_UQI:
34890     case V4SF_FTYPE_V8HI_V4SF_UQI:
34891     case V8SI_FTYPE_V8SF_V8SI_UQI:
34892     case V4SI_FTYPE_V4SF_V4SI_UQI:
34893     case V4DI_FTYPE_V4SF_V4DI_UQI:
34894     case V2DI_FTYPE_V4SF_V2DI_UQI:
34895     case V4SF_FTYPE_V4DI_V4SF_UQI:
34896     case V4SF_FTYPE_V2DI_V4SF_UQI:
34897     case V4DF_FTYPE_V4DI_V4DF_UQI:
34898     case V2DF_FTYPE_V2DI_V2DF_UQI:
34899     case V16QI_FTYPE_V8HI_V16QI_UQI:
34900     case V16QI_FTYPE_V16HI_V16QI_UHI:
34901     case V16QI_FTYPE_V4SI_V16QI_UQI:
34902     case V16QI_FTYPE_V8SI_V16QI_UQI:
34903     case V8HI_FTYPE_V4SI_V8HI_UQI:
34904     case V8HI_FTYPE_V8SI_V8HI_UQI:
34905     case V16QI_FTYPE_V2DI_V16QI_UQI:
34906     case V16QI_FTYPE_V4DI_V16QI_UQI:
34907     case V8HI_FTYPE_V2DI_V8HI_UQI:
34908     case V8HI_FTYPE_V4DI_V8HI_UQI:
34909     case V4SI_FTYPE_V2DI_V4SI_UQI:
34910     case V4SI_FTYPE_V4DI_V4SI_UQI:
34911     case V32QI_FTYPE_V32HI_V32QI_USI:
34912     case UHI_FTYPE_V16QI_V16QI_UHI:
34913     case USI_FTYPE_V32QI_V32QI_USI:
34914     case UDI_FTYPE_V64QI_V64QI_UDI:
34915     case UQI_FTYPE_V8HI_V8HI_UQI:
34916     case UHI_FTYPE_V16HI_V16HI_UHI:
34917     case USI_FTYPE_V32HI_V32HI_USI:
34918     case UQI_FTYPE_V4SI_V4SI_UQI:
34919     case UQI_FTYPE_V8SI_V8SI_UQI:
34920     case UQI_FTYPE_V2DI_V2DI_UQI:
34921     case UQI_FTYPE_V4DI_V4DI_UQI:
34922     case V4SF_FTYPE_V2DF_V4SF_UQI:
34923     case V4SF_FTYPE_V4DF_V4SF_UQI:
34924     case V16SI_FTYPE_V16SI_V16SI_UHI:
34925     case V16SI_FTYPE_V4SI_V16SI_UHI:
34926     case V2DI_FTYPE_V4SI_V2DI_UQI:
34927     case V2DI_FTYPE_V8HI_V2DI_UQI:
34928     case V2DI_FTYPE_V16QI_V2DI_UQI:
34929     case V4DI_FTYPE_V4DI_V4DI_UQI:
34930     case V4DI_FTYPE_V4SI_V4DI_UQI:
34931     case V4DI_FTYPE_V8HI_V4DI_UQI:
34932     case V4DI_FTYPE_V16QI_V4DI_UQI:
34933     case V4DI_FTYPE_V4DF_V4DI_UQI:
34934     case V2DI_FTYPE_V2DF_V2DI_UQI:
34935     case V4SI_FTYPE_V4DF_V4SI_UQI:
34936     case V4SI_FTYPE_V2DF_V4SI_UQI:
34937     case V4SI_FTYPE_V8HI_V4SI_UQI:
34938     case V4SI_FTYPE_V16QI_V4SI_UQI:
34939     case V4DI_FTYPE_V4DI_V4DI_V4DI:
34940     case V8DF_FTYPE_V2DF_V8DF_UQI:
34941     case V8DF_FTYPE_V4DF_V8DF_UQI:
34942     case V8DF_FTYPE_V8DF_V8DF_UQI:
34943     case V8SF_FTYPE_V8SF_V8SF_UQI:
34944     case V8SF_FTYPE_V8SI_V8SF_UQI:
34945     case V4DF_FTYPE_V4DF_V4DF_UQI:
34946     case V4SF_FTYPE_V4SF_V4SF_UQI:
34947     case V2DF_FTYPE_V2DF_V2DF_UQI:
34948     case V2DF_FTYPE_V4SF_V2DF_UQI:
34949     case V2DF_FTYPE_V4SI_V2DF_UQI:
34950     case V4SF_FTYPE_V4SI_V4SF_UQI:
34951     case V4DF_FTYPE_V4SF_V4DF_UQI:
34952     case V4DF_FTYPE_V4SI_V4DF_UQI:
34953     case V8SI_FTYPE_V8SI_V8SI_UQI:
34954     case V8SI_FTYPE_V8HI_V8SI_UQI:
34955     case V8SI_FTYPE_V16QI_V8SI_UQI:
34956     case V8DF_FTYPE_V8SI_V8DF_UQI:
34957     case V8DI_FTYPE_DI_V8DI_UQI:
34958     case V16SF_FTYPE_V8SF_V16SF_UHI:
34959     case V16SI_FTYPE_V8SI_V16SI_UHI:
34960     case V16HI_FTYPE_V16HI_V16HI_UHI:
34961     case V8HI_FTYPE_V16QI_V8HI_UQI:
34962     case V16HI_FTYPE_V16QI_V16HI_UHI:
34963     case V32HI_FTYPE_V32HI_V32HI_USI:
34964     case V32HI_FTYPE_V32QI_V32HI_USI:
34965     case V8DI_FTYPE_V16QI_V8DI_UQI:
34966     case V8DI_FTYPE_V2DI_V8DI_UQI:
34967     case V8DI_FTYPE_V4DI_V8DI_UQI:
34968     case V8DI_FTYPE_V8DI_V8DI_UQI:
34969     case V8DI_FTYPE_V8HI_V8DI_UQI:
34970     case V8DI_FTYPE_V8SI_V8DI_UQI:
34971     case V8HI_FTYPE_V8DI_V8HI_UQI:
34972     case V8SI_FTYPE_V8DI_V8SI_UQI:
34973     case V4SI_FTYPE_V4SI_V4SI_V4SI:
34974     case V16SI_FTYPE_V16SI_V16SI_V16SI:
34975     case V8DI_FTYPE_V8DI_V8DI_V8DI:
34976     case V32HI_FTYPE_V32HI_V32HI_V32HI:
34977     case V2DI_FTYPE_V2DI_V2DI_V2DI:
34978     case V16HI_FTYPE_V16HI_V16HI_V16HI:
34979     case V8SI_FTYPE_V8SI_V8SI_V8SI:
34980     case V8HI_FTYPE_V8HI_V8HI_V8HI:
34981       nargs = 3;
34982       break;
34983     case V32QI_FTYPE_V32QI_V32QI_INT:
34984     case V16HI_FTYPE_V16HI_V16HI_INT:
34985     case V16QI_FTYPE_V16QI_V16QI_INT:
34986     case V4DI_FTYPE_V4DI_V4DI_INT:
34987     case V8HI_FTYPE_V8HI_V8HI_INT:
34988     case V8SI_FTYPE_V8SI_V8SI_INT:
34989     case V8SI_FTYPE_V8SI_V4SI_INT:
34990     case V8SF_FTYPE_V8SF_V8SF_INT:
34991     case V8SF_FTYPE_V8SF_V4SF_INT:
34992     case V4SI_FTYPE_V4SI_V4SI_INT:
34993     case V4DF_FTYPE_V4DF_V4DF_INT:
34994     case V16SF_FTYPE_V16SF_V16SF_INT:
34995     case V16SF_FTYPE_V16SF_V4SF_INT:
34996     case V16SI_FTYPE_V16SI_V4SI_INT:
34997     case V4DF_FTYPE_V4DF_V2DF_INT:
34998     case V4SF_FTYPE_V4SF_V4SF_INT:
34999     case V2DI_FTYPE_V2DI_V2DI_INT:
35000     case V4DI_FTYPE_V4DI_V2DI_INT:
35001     case V2DF_FTYPE_V2DF_V2DF_INT:
35002     case UQI_FTYPE_V8DI_V8UDI_INT:
35003     case UQI_FTYPE_V8DF_V8DF_INT:
35004     case UQI_FTYPE_V2DF_V2DF_INT:
35005     case UQI_FTYPE_V4SF_V4SF_INT:
35006     case UHI_FTYPE_V16SI_V16SI_INT:
35007     case UHI_FTYPE_V16SF_V16SF_INT:
35008     case V64QI_FTYPE_V64QI_V64QI_INT:
35009     case V32HI_FTYPE_V32HI_V32HI_INT:
35010     case V16SI_FTYPE_V16SI_V16SI_INT:
35011     case V8DI_FTYPE_V8DI_V8DI_INT:
35012       nargs = 3;
35013       nargs_constant = 1;
35014       break;
35015     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35016       nargs = 3;
35017       rmode = V4DImode;
35018       nargs_constant = 1;
35019       break;
35020     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35021       nargs = 3;
35022       rmode = V2DImode;
35023       nargs_constant = 1;
35024       break;
35025     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35026       nargs = 3;
35027       rmode = DImode;
35028       nargs_constant = 1;
35029       break;
35030     case V2DI_FTYPE_V2DI_UINT_UINT:
35031       nargs = 3;
35032       nargs_constant = 2;
35033       break;
35034     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35035       nargs = 3;
35036       rmode = V8DImode;
35037       nargs_constant = 1;
35038       break;
35039     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35040       nargs = 5;
35041       rmode = V8DImode;
35042       mask_pos = 2;
35043       nargs_constant = 1;
35044       break;
35045     case QI_FTYPE_V8DF_INT_UQI:
35046     case QI_FTYPE_V4DF_INT_UQI:
35047     case QI_FTYPE_V2DF_INT_UQI:
35048     case HI_FTYPE_V16SF_INT_UHI:
35049     case QI_FTYPE_V8SF_INT_UQI:
35050     case QI_FTYPE_V4SF_INT_UQI:
35051     case V4SI_FTYPE_V4SI_V4SI_UHI:
35052     case V8SI_FTYPE_V8SI_V8SI_UHI:
35053       nargs = 3;
35054       mask_pos = 1;
35055       nargs_constant = 1;
35056       break;
35057     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35058       nargs = 5;
35059       rmode = V4DImode;
35060       mask_pos = 2;
35061       nargs_constant = 1;
35062       break;
35063     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35064       nargs = 5;
35065       rmode = V2DImode;
35066       mask_pos = 2;
35067       nargs_constant = 1;
35068       break;
35069     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35070     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35071     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35072     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35073     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35074     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35075     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35076     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35077     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35078     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35079     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35080     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35081     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35082     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35083     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35084     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35085     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35086     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35087     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35088     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35089     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35090     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35091     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35092     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35093     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35094     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35095     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35096     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35097     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35098     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35099     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35100     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35101     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35102     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35103     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35104     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35105     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35106     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35107     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35108     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35109     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35110     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35111     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35112     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35113     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35114     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35115     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35116     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35117     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35118     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35119     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35120       nargs = 4;
35121       break;
35122     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35123     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35124     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35125     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35126     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35127       nargs = 4;
35128       nargs_constant = 1;
35129       break;
35130     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35131     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35132     case QI_FTYPE_V4DF_V4DF_INT_UQI:
35133     case QI_FTYPE_V8SF_V8SF_INT_UQI:
35134     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35135     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35136     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35137     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35138     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35139     case USI_FTYPE_V32QI_V32QI_INT_USI:
35140     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35141     case USI_FTYPE_V32HI_V32HI_INT_USI:
35142     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35143     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35144     case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
35145     case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
35146     case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
35147     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
35148     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
35149     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
35150     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
35151     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
35152     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
35153       nargs = 4;
35154       mask_pos = 1;
35155       nargs_constant = 1;
35156       break;
35157     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35158       nargs = 4;
35159       nargs_constant = 2;
35160       break;
35161     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35162     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35163       nargs = 4;
35164       break;
35165     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35166     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35167       mask_pos = 1;
35168       nargs = 4;
35169       nargs_constant = 1;
35170       break;
35171     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35172     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35173     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35174     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35175     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35176     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35177     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35178     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35179     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35180     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35181     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35182     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35183     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35184     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35185     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35186     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35187     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35188     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35189     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35190     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35191     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35192     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35193     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35194     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35195     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35196     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35197     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35198     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35199     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35200     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35201       nargs = 4;
35202       mask_pos = 2;
35203       nargs_constant = 1;
35204       break;
35205     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35206     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35207     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35208     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35209     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35210     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35211     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35212     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35213     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35214     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35215     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35216     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35217     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35218     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35219     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35220     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35221     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35222     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35223     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35224     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35225     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35226     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35227     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35228     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35229     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35230     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35231     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35232       nargs = 5;
35233       mask_pos = 2;
35234       nargs_constant = 1;
35235       break;
35236     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35237     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35238     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35239     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35240     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35241     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35242     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35243     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35244     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35245     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35246       nargs = 5;
35247       mask_pos = 1;
35248       nargs_constant = 1;
35249       break;
35250     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
35251     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
35252     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
35253     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
35254     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
35255     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
35256     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
35257     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
35258     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
35259     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
35260     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
35261     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
35262       nargs = 5;
35263       mask_pos = 1;
35264       nargs_constant = 2;
35265       break;
35266 
35267     default:
35268       gcc_unreachable ();
35269     }
35270 
35271   gcc_assert (nargs <= ARRAY_SIZE (args));
35272 
35273   if (comparison != UNKNOWN)
35274     {
35275       gcc_assert (nargs == 2);
35276       return ix86_expand_sse_compare (d, exp, target, swap);
35277     }
35278 
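  /* rmode, when set by the *_CONVERT cases above, is the mode in which the
     builtin's result is returned while the insn pattern itself produces
     tmode; in that case generate into a tmode pseudo and return a lowpart
     subreg of it viewed in rmode.  */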
35279   if (rmode == VOIDmode || rmode == tmode)
35280     {
35281       if (optimize
35282 	  || target == 0
35283 	  || GET_MODE (target) != tmode
35284 	  || !insn_p->operand[0].predicate (target, tmode))
35285 	target = gen_reg_rtx (tmode);
35286       else if (memory_operand (target, tmode))
35287 	num_memory++;
35288       real_target = target;
35289     }
35290   else
35291     {
35292       real_target = gen_reg_rtx (tmode);
35293       target = lowpart_subreg (rmode, real_target, tmode);
35294     }
35295 
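  /* Load and legitimize the arguments.  The last nargs_constant arguments
     (shifted inwards by mask_pos when a mask operand follows them) must
     satisfy the pattern's immediate predicate; any other operand is forced
     into a register when optimizing, when its predicate rejects it, or when
     it would be a second memory operand.  */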
35296   for (i = 0; i < nargs; i++)
35297     {
35298       tree arg = CALL_EXPR_ARG (exp, i);
35299       rtx op = expand_normal (arg);
35300       machine_mode mode = insn_p->operand[i + 1].mode;
35301       bool match = insn_p->operand[i + 1].predicate (op, mode);
35302 
35303       if (second_arg_count && i == 1)
35304 	{
35305 	  /* SIMD shift insns take either an 8-bit immediate or a
35306 	     register as the count, but the builtin functions take an
35307 	     int; if the count does not match the predicate, put it in
35308 	     a register.  The instructions use a 64-bit count, so if op
35309 	     is only 32-bit, zero-extend it: negative shift counts are
35310 	     undefined behavior and zero-extension is more efficient
35311 	     than sign-extension.  */
35312 	  if (!match)
35313 	    {
35314 	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
35315 		op = convert_modes (mode, GET_MODE (op), op, 1);
35316 	      else
35317 		op = lowpart_subreg (mode, op, GET_MODE (op));
35318 	      if (!insn_p->operand[i + 1].predicate (op, mode))
35319 		op = copy_to_reg (op);
35320 	    }
35321 	}
35322       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
35323 	       || (!mask_pos && (nargs - i) <= nargs_constant))
35324 	{
35325 	  if (!match)
35326 	    switch (icode)
35327 	      {
35328 	      case CODE_FOR_avx_vinsertf128v4di:
35329 	      case CODE_FOR_avx_vextractf128v4di:
35330 		error ("the last argument must be a 1-bit immediate");
35331 		return const0_rtx;
35332 
35333 	      case CODE_FOR_avx512f_cmpv8di3_mask:
35334 	      case CODE_FOR_avx512f_cmpv16si3_mask:
35335 	      case CODE_FOR_avx512f_ucmpv8di3_mask:
35336 	      case CODE_FOR_avx512f_ucmpv16si3_mask:
35337 	      case CODE_FOR_avx512vl_cmpv4di3_mask:
35338 	      case CODE_FOR_avx512vl_cmpv8si3_mask:
35339 	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
35340 	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
35341 	      case CODE_FOR_avx512vl_cmpv2di3_mask:
35342 	      case CODE_FOR_avx512vl_cmpv4si3_mask:
35343 	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
35344 	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
35345 		error ("the last argument must be a 3-bit immediate");
35346 		return const0_rtx;
35347 
35348 	      case CODE_FOR_sse4_1_roundsd:
35349 	      case CODE_FOR_sse4_1_roundss:
35350 
35351 	      case CODE_FOR_sse4_1_roundpd:
35352 	      case CODE_FOR_sse4_1_roundps:
35353 	      case CODE_FOR_avx_roundpd256:
35354 	      case CODE_FOR_avx_roundps256:
35355 
35356 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35357 	      case CODE_FOR_sse4_1_roundps_sfix:
35358 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35359 	      case CODE_FOR_avx_roundps_sfix256:
35360 
35361 	      case CODE_FOR_sse4_1_blendps:
35362 	      case CODE_FOR_avx_blendpd256:
35363 	      case CODE_FOR_avx_vpermilv4df:
35364 	      case CODE_FOR_avx_vpermilv4df_mask:
35365 	      case CODE_FOR_avx512f_getmantv8df_mask:
35366 	      case CODE_FOR_avx512f_getmantv16sf_mask:
35367 	      case CODE_FOR_avx512vl_getmantv8sf_mask:
35368 	      case CODE_FOR_avx512vl_getmantv4df_mask:
35369 	      case CODE_FOR_avx512vl_getmantv4sf_mask:
35370 	      case CODE_FOR_avx512vl_getmantv2df_mask:
35371 	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
35372 	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35373 	      case CODE_FOR_avx512dq_rangepv4df_mask:
35374 	      case CODE_FOR_avx512dq_rangepv8sf_mask:
35375 	      case CODE_FOR_avx512dq_rangepv2df_mask:
35376 	      case CODE_FOR_avx512dq_rangepv4sf_mask:
35377 	      case CODE_FOR_avx_shufpd256_mask:
35378 		error ("the last argument must be a 4-bit immediate");
35379 		return const0_rtx;
35380 
35381 	      case CODE_FOR_sha1rnds4:
35382 	      case CODE_FOR_sse4_1_blendpd:
35383 	      case CODE_FOR_avx_vpermilv2df:
35384 	      case CODE_FOR_avx_vpermilv2df_mask:
35385 	      case CODE_FOR_xop_vpermil2v2df3:
35386 	      case CODE_FOR_xop_vpermil2v4sf3:
35387 	      case CODE_FOR_xop_vpermil2v4df3:
35388 	      case CODE_FOR_xop_vpermil2v8sf3:
35389 	      case CODE_FOR_avx512f_vinsertf32x4_mask:
35390 	      case CODE_FOR_avx512f_vinserti32x4_mask:
35391 	      case CODE_FOR_avx512f_vextractf32x4_mask:
35392 	      case CODE_FOR_avx512f_vextracti32x4_mask:
35393 	      case CODE_FOR_sse2_shufpd:
35394 	      case CODE_FOR_sse2_shufpd_mask:
35395 	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
35396 	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
35397 	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
35398 	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
35399 		error ("the last argument must be a 2-bit immediate");
35400 		return const0_rtx;
35401 
35402 	      case CODE_FOR_avx_vextractf128v4df:
35403 	      case CODE_FOR_avx_vextractf128v8sf:
35404 	      case CODE_FOR_avx_vextractf128v8si:
35405 	      case CODE_FOR_avx_vinsertf128v4df:
35406 	      case CODE_FOR_avx_vinsertf128v8sf:
35407 	      case CODE_FOR_avx_vinsertf128v8si:
35408 	      case CODE_FOR_avx512f_vinsertf64x4_mask:
35409 	      case CODE_FOR_avx512f_vinserti64x4_mask:
35410 	      case CODE_FOR_avx512f_vextractf64x4_mask:
35411 	      case CODE_FOR_avx512f_vextracti64x4_mask:
35412 	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
35413 	      case CODE_FOR_avx512dq_vinserti32x8_mask:
35414 	      case CODE_FOR_avx512vl_vinsertv4df:
35415 	      case CODE_FOR_avx512vl_vinsertv4di:
35416 	      case CODE_FOR_avx512vl_vinsertv8sf:
35417 	      case CODE_FOR_avx512vl_vinsertv8si:
35418 		error ("the last argument must be a 1-bit immediate");
35419 		return const0_rtx;
35420 
35421 	      case CODE_FOR_avx_vmcmpv2df3:
35422 	      case CODE_FOR_avx_vmcmpv4sf3:
35423 	      case CODE_FOR_avx_cmpv2df3:
35424 	      case CODE_FOR_avx_cmpv4sf3:
35425 	      case CODE_FOR_avx_cmpv4df3:
35426 	      case CODE_FOR_avx_cmpv8sf3:
35427 	      case CODE_FOR_avx512f_cmpv8df3_mask:
35428 	      case CODE_FOR_avx512f_cmpv16sf3_mask:
35429 	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
35430 	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35431 		error ("the last argument must be a 5-bit immediate");
35432 		return const0_rtx;
35433 
35434 	      default:
35435 		switch (nargs_constant)
35436 		  {
35437 		  case 2:
35438 		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
35439 			|| (!mask_pos && (nargs - i) == nargs_constant))
35440 		      {
35441 			error ("the next to last argument must be an 8-bit immediate");
35442 			break;
35443 		      }
35444 		    /* FALLTHRU */
35445 		  case 1:
35446 		    error ("the last argument must be an 8-bit immediate");
35447 		    break;
35448 		  default:
35449 		    gcc_unreachable ();
35450 		  }
35451 		return const0_rtx;
35452 	      }
35453 	}
35454       else
35455 	{
35456 	  if (VECTOR_MODE_P (mode))
35457 	    op = safe_vector_operand (op, mode);
35458 
35459 	  /* If we aren't optimizing, only allow one memory operand to
35460 	     be generated.  */
35461 	  if (memory_operand (op, mode))
35462 	    num_memory++;
35463 
35464 	  op = fixup_modeless_constant (op, mode);
35465 
35466 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35467 	    {
35468 	      if (optimize || !match || num_memory > 1)
35469 		op = copy_to_mode_reg (mode, op);
35470 	    }
35471 	  else
35472 	    {
35473 	      op = copy_to_reg (op);
35474 	      op = lowpart_subreg (mode, op, GET_MODE (op));
35475 	    }
35476 	}
35477 
35478       args[i].op = op;
35479       args[i].mode = mode;
35480     }
35481 
35482   switch (nargs)
35483     {
35484     case 1:
35485       pat = GEN_FCN (icode) (real_target, args[0].op);
35486       break;
35487     case 2:
35488       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35489       break;
35490     case 3:
35491       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35492 			     args[2].op);
35493       break;
35494     case 4:
35495       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35496 			     args[2].op, args[3].op);
35497       break;
35498     case 5:
35499       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35500 			     args[2].op, args[3].op, args[4].op);
35501       break;
35502     case 6:
35503       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35504 			     args[2].op, args[3].op, args[4].op,
35505 			     args[5].op);
35506       break;
35507     default:
35508       gcc_unreachable ();
35509     }
35510 
35511   if (! pat)
35512     return 0;
35513 
35514   emit_insn (pat);
35515   return target;
35516 }
35517 
35518 /* Transform a pattern of the following layout:
35519      (set A
35520        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
35521    into:
35522      (set A B)
35523    i.e. strip the embedded-rounding wrapper from the source.  */
35524 
35525 static rtx
35526 ix86_erase_embedded_rounding (rtx pat)
35527 {
35528   if (GET_CODE (pat) == INSN)
35529     pat = PATTERN (pat);
35530 
35531   gcc_assert (GET_CODE (pat) == SET);
35532   rtx src = SET_SRC (pat);
35533   gcc_assert (XVECLEN (src, 0) == 2);
35534   rtx p0 = XVECEXP (src, 0, 0);
35535   gcc_assert (GET_CODE (src) == UNSPEC
35536 	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
35537   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
35538   return res;
35539 }
35540 
35541 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35542    with rounding.  */
35543 static rtx
35544 ix86_expand_sse_comi_round (const struct builtin_description *d,
35545 			    tree exp, rtx target)
35546 {
35547   rtx pat, set_dst;
35548   tree arg0 = CALL_EXPR_ARG (exp, 0);
35549   tree arg1 = CALL_EXPR_ARG (exp, 1);
35550   tree arg2 = CALL_EXPR_ARG (exp, 2);
35551   tree arg3 = CALL_EXPR_ARG (exp, 3);
35552   rtx op0 = expand_normal (arg0);
35553   rtx op1 = expand_normal (arg1);
35554   rtx op2 = expand_normal (arg2);
35555   rtx op3 = expand_normal (arg3);
35556   enum insn_code icode = d->icode;
35557   const struct insn_data_d *insn_p = &insn_data[icode];
35558   machine_mode mode0 = insn_p->operand[0].mode;
35559   machine_mode mode1 = insn_p->operand[1].mode;
35560   enum rtx_code comparison = UNEQ;
35561   bool need_ucomi = false;
35562 
35563   /* See avxintrin.h for values.  */
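  /* comi_comparisons maps each _CMP_* predicate immediate (0..31) to the
     rtx code used on the comparison result; need_ucomi_values selects the
     non-signaling ucomi form for the quiet predicates.  */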
35564   enum rtx_code comi_comparisons[32] =
35565     {
35566       UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35567       UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35568       UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35569     };
35570   bool need_ucomi_values[32] =
35571     {
35572       true,  false, false, true,  true,  false, false, true,
35573       true,  false, false, true,  true,  false, false, true,
35574       false, true,  true,  false, false, true,  true,  false,
35575       false, true,  true,  false, false, true,  true,  false
35576     };
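  /* Illustrative reading of the tables above, assuming the usual predicate
     encodings from avxintrin.h: predicate 0 (_CMP_EQ_OQ) maps to UNEQ and
     uses the quiet UCOMI form, while predicate 14 (_CMP_GT_OS) maps to GT
     and uses the signalling COMI form.  */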
35577 
35578   if (!CONST_INT_P (op2))
35579     {
35580       error ("the third argument must be a comparison constant");
35581       return const0_rtx;
35582     }
35583   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35584     {
35585       error ("incorrect comparison mode");
35586       return const0_rtx;
35587     }
35588 
35589   if (!insn_p->operand[2].predicate (op3, SImode))
35590     {
35591       error ("incorrect rounding operand");
35592       return const0_rtx;
35593     }
35594 
35595   comparison = comi_comparisons[INTVAL (op2)];
35596   need_ucomi = need_ucomi_values[INTVAL (op2)];
35597 
35598   if (VECTOR_MODE_P (mode0))
35599     op0 = safe_vector_operand (op0, mode0);
35600   if (VECTOR_MODE_P (mode1))
35601     op1 = safe_vector_operand (op1, mode1);
35602 
35603   target = gen_reg_rtx (SImode);
35604   emit_move_insn (target, const0_rtx);
35605   target = gen_rtx_SUBREG (QImode, target, 0);
35606 
35607   if ((optimize && !register_operand (op0, mode0))
35608       || !insn_p->operand[0].predicate (op0, mode0))
35609     op0 = copy_to_mode_reg (mode0, op0);
35610   if ((optimize && !register_operand (op1, mode1))
35611       || !insn_p->operand[1].predicate (op1, mode1))
35612     op1 = copy_to_mode_reg (mode1, op1);
35613 
35614   if (need_ucomi)
35615     icode = icode == CODE_FOR_sse_comi_round
35616 		     ? CODE_FOR_sse_ucomi_round
35617 		     : CODE_FOR_sse2_ucomi_round;
35618 
35619   pat = GEN_FCN (icode) (op0, op1, op3);
35620   if (! pat)
35621     return 0;
35622 
35623   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
35624   if (INTVAL (op3) == NO_ROUND)
35625     {
35626       pat = ix86_erase_embedded_rounding (pat);
35627       if (! pat)
35628 	return 0;
35629 
35630       set_dst = SET_DEST (pat);
35631     }
35632   else
35633     {
35634       gcc_assert (GET_CODE (pat) == SET);
35635       set_dst = SET_DEST (pat);
35636     }
35637 
35638   emit_insn (pat);
35639   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35640 			  gen_rtx_fmt_ee (comparison, QImode,
35641 					  set_dst,
35642 					  const0_rtx)));
35643 
35644   return SUBREG_REG (target);
35645 }
35646 
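/* Expand a builtin whose last argument is an embedded rounding / SAE
   immediate.  The rounding operand is validated against the insn
   predicate; when it is NO_ROUND the embedded rounding is redundant and
   is stripped again via ix86_erase_embedded_rounding.  */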
35647 static rtx
35648 ix86_expand_round_builtin (const struct builtin_description *d,
35649 			   tree exp, rtx target)
35650 {
35651   rtx pat;
35652   unsigned int i, nargs;
35653   struct
35654     {
35655       rtx op;
35656       machine_mode mode;
35657     } args[6];
35658   enum insn_code icode = d->icode;
35659   const struct insn_data_d *insn_p = &insn_data[icode];
35660   machine_mode tmode = insn_p->operand[0].mode;
35661   unsigned int nargs_constant = 0;
35662   unsigned int redundant_embed_rnd = 0;
35663 
35664   switch ((enum ix86_builtin_func_type) d->flag)
35665     {
35666     case UINT64_FTYPE_V2DF_INT:
35667     case UINT64_FTYPE_V4SF_INT:
35668     case UINT_FTYPE_V2DF_INT:
35669     case UINT_FTYPE_V4SF_INT:
35670     case INT64_FTYPE_V2DF_INT:
35671     case INT64_FTYPE_V4SF_INT:
35672     case INT_FTYPE_V2DF_INT:
35673     case INT_FTYPE_V4SF_INT:
35674       nargs = 2;
35675       break;
35676     case V4SF_FTYPE_V4SF_UINT_INT:
35677     case V4SF_FTYPE_V4SF_UINT64_INT:
35678     case V2DF_FTYPE_V2DF_UINT64_INT:
35679     case V4SF_FTYPE_V4SF_INT_INT:
35680     case V4SF_FTYPE_V4SF_INT64_INT:
35681     case V2DF_FTYPE_V2DF_INT64_INT:
35682     case V4SF_FTYPE_V4SF_V4SF_INT:
35683     case V2DF_FTYPE_V2DF_V2DF_INT:
35684     case V4SF_FTYPE_V4SF_V2DF_INT:
35685     case V2DF_FTYPE_V2DF_V4SF_INT:
35686       nargs = 3;
35687       break;
35688     case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35689     case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35690     case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35691     case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35692     case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35693     case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35694     case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35695     case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35696     case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35697     case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35698     case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35699     case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35700     case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35701     case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35702       nargs = 4;
35703       break;
35704     case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35705     case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35706       nargs_constant = 2;
35707       nargs = 4;
35708       break;
35709     case INT_FTYPE_V4SF_V4SF_INT_INT:
35710     case INT_FTYPE_V2DF_V2DF_INT_INT:
35711       return ix86_expand_sse_comi_round (d, exp, target);
35712     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35713     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
35714     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
35715     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35716     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35717     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35718     case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35719     case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35720       nargs = 5;
35721       break;
35722     case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35723     case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35724       nargs_constant = 4;
35725       nargs = 5;
35726       break;
35727     case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35728     case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35729     case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35730     case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35731       nargs_constant = 3;
35732       nargs = 5;
35733       break;
35734     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35735     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35736     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35737     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35738     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
35739     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
35740       nargs = 6;
35741       nargs_constant = 4;
35742       break;
35743     case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35744     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35745     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35746     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35747       nargs = 6;
35748       nargs_constant = 3;
35749       break;
35750     default:
35751       gcc_unreachable ();
35752     }
35753   gcc_assert (nargs <= ARRAY_SIZE (args));
35754 
35755   if (optimize
35756       || target == 0
35757       || GET_MODE (target) != tmode
35758       || !insn_p->operand[0].predicate (target, tmode))
35759     target = gen_reg_rtx (tmode);
35760 
35761   for (i = 0; i < nargs; i++)
35762     {
35763       tree arg = CALL_EXPR_ARG (exp, i);
35764       rtx op = expand_normal (arg);
35765       machine_mode mode = insn_p->operand[i + 1].mode;
35766       bool match = insn_p->operand[i + 1].predicate (op, mode);
35767 
35768       if (i == nargs - nargs_constant)
35769 	{
35770 	  if (!match)
35771 	    {
35772 	      switch (icode)
35773 		{
35774 		case CODE_FOR_avx512f_getmantv8df_mask_round:
35775 		case CODE_FOR_avx512f_getmantv16sf_mask_round:
35776 		case CODE_FOR_avx512f_vgetmantv2df_round:
35777 		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
35778 		case CODE_FOR_avx512f_vgetmantv4sf_round:
35779 		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
35780 		  error ("the immediate argument must be a 4-bit immediate");
35781 		  return const0_rtx;
35782 		case CODE_FOR_avx512f_cmpv8df3_mask_round:
35783 		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35784 		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35785 		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35786 		  error ("the immediate argument must be a 5-bit immediate");
35787 		  return const0_rtx;
35788 		default:
35789 		  error ("the immediate argument must be an 8-bit immediate");
35790 		  return const0_rtx;
35791 		}
35792 	    }
35793 	}
35794       else if (i == nargs - 1)
35795 	{
35796 	  if (!insn_p->operand[nargs].predicate (op, SImode))
35797 	    {
35798 	      error ("incorrect rounding operand");
35799 	      return const0_rtx;
35800 	    }
35801 
35802 	  /* If there is no rounding, use the normal version of the pattern.  */
35803 	  if (INTVAL (op) == NO_ROUND)
35804 	    redundant_embed_rnd = 1;
35805 	}
35806       else
35807 	{
35808 	  if (VECTOR_MODE_P (mode))
35809 	    op = safe_vector_operand (op, mode);
35810 
35811 	  op = fixup_modeless_constant (op, mode);
35812 
35813 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35814 	    {
35815 	      if (optimize || !match)
35816 		op = copy_to_mode_reg (mode, op);
35817 	    }
35818 	  else
35819 	    {
35820 	      op = copy_to_reg (op);
35821 	      op = lowpart_subreg (mode, op, GET_MODE (op));
35822 	    }
35823 	}
35824 
35825       args[i].op = op;
35826       args[i].mode = mode;
35827     }
35828 
35829   switch (nargs)
35830     {
35831     case 1:
35832       pat = GEN_FCN (icode) (target, args[0].op);
35833       break;
35834     case 2:
35835       pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35836       break;
35837     case 3:
35838       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35839 			     args[2].op);
35840       break;
35841     case 4:
35842       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35843 			     args[2].op, args[3].op);
35844       break;
35845     case 5:
35846       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35847 			     args[2].op, args[3].op, args[4].op);
35848       break;
35849     case 6:
35850       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35851 			     args[2].op, args[3].op, args[4].op,
35852 			     args[5].op);
35853       break;
35854     default:
35855       gcc_unreachable ();
35856     }
35857 
35858   if (!pat)
35859     return 0;
35860 
35861   if (redundant_embed_rnd)
35862     pat = ix86_erase_embedded_rounding (pat);
35863 
35864   emit_insn (pat);
35865   return target;
35866 }
35867 
35868 /* Subroutine of ix86_expand_builtin to take care of special insns
35869    with variable number of operands.  */
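/* Roughly speaking, these are builtins that load from or store to memory:
   one argument is a pointer that is turned into a MEM of the insn's mode
   (see the "load"/"store" classification below).  An illustrative case,
   assuming the usual emmintrin.h definition, is the non-temporal store
   behind _mm_stream_si128, which is handled as a VOID_FTYPE_PV2DI_V2DI
   store.  */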
35870 
35871 static rtx
35872 ix86_expand_special_args_builtin (const struct builtin_description *d,
35873 				  tree exp, rtx target)
35874 {
35875   tree arg;
35876   rtx pat, op;
35877   unsigned int i, nargs, arg_adjust, memory;
35878   bool aligned_mem = false;
35879   struct
35880     {
35881       rtx op;
35882       machine_mode mode;
35883     } args[3];
35884   enum insn_code icode = d->icode;
35885   bool last_arg_constant = false;
35886   const struct insn_data_d *insn_p = &insn_data[icode];
35887   machine_mode tmode = insn_p->operand[0].mode;
35888   enum { load, store } klass;
35889 
35890   switch ((enum ix86_builtin_func_type) d->flag)
35891     {
35892     case VOID_FTYPE_VOID:
35893       emit_insn (GEN_FCN (icode) (target));
35894       return 0;
35895     case VOID_FTYPE_UINT64:
35896     case VOID_FTYPE_UNSIGNED:
35897       nargs = 0;
35898       klass = store;
35899       memory = 0;
35900       break;
35901 
35902     case INT_FTYPE_VOID:
35903     case USHORT_FTYPE_VOID:
35904     case UINT64_FTYPE_VOID:
35905     case UINT_FTYPE_VOID:
35906     case UNSIGNED_FTYPE_VOID:
35907       nargs = 0;
35908       klass = load;
35909       memory = 0;
35910       break;
35911     case UINT64_FTYPE_PUNSIGNED:
35912     case V2DI_FTYPE_PV2DI:
35913     case V4DI_FTYPE_PV4DI:
35914     case V32QI_FTYPE_PCCHAR:
35915     case V16QI_FTYPE_PCCHAR:
35916     case V8SF_FTYPE_PCV4SF:
35917     case V8SF_FTYPE_PCFLOAT:
35918     case V4SF_FTYPE_PCFLOAT:
35919     case V4DF_FTYPE_PCV2DF:
35920     case V4DF_FTYPE_PCDOUBLE:
35921     case V2DF_FTYPE_PCDOUBLE:
35922     case VOID_FTYPE_PVOID:
35923     case V8DI_FTYPE_PV8DI:
35924       nargs = 1;
35925       klass = load;
35926       memory = 0;
35927       switch (icode)
35928 	{
35929 	case CODE_FOR_sse4_1_movntdqa:
35930 	case CODE_FOR_avx2_movntdqa:
35931 	case CODE_FOR_avx512f_movntdqa:
35932 	  aligned_mem = true;
35933 	  break;
35934 	default:
35935 	  break;
35936 	}
35937       break;
35938     case VOID_FTYPE_PV2SF_V4SF:
35939     case VOID_FTYPE_PV8DI_V8DI:
35940     case VOID_FTYPE_PV4DI_V4DI:
35941     case VOID_FTYPE_PV2DI_V2DI:
35942     case VOID_FTYPE_PCHAR_V32QI:
35943     case VOID_FTYPE_PCHAR_V16QI:
35944     case VOID_FTYPE_PFLOAT_V16SF:
35945     case VOID_FTYPE_PFLOAT_V8SF:
35946     case VOID_FTYPE_PFLOAT_V4SF:
35947     case VOID_FTYPE_PDOUBLE_V8DF:
35948     case VOID_FTYPE_PDOUBLE_V4DF:
35949     case VOID_FTYPE_PDOUBLE_V2DF:
35950     case VOID_FTYPE_PLONGLONG_LONGLONG:
35951     case VOID_FTYPE_PULONGLONG_ULONGLONG:
35952     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
35953     case VOID_FTYPE_PINT_INT:
35954       nargs = 1;
35955       klass = store;
35956       /* Reserve memory operand for target.  */
35957       memory = ARRAY_SIZE (args);
35958       switch (icode)
35959 	{
35960 	/* These builtins and instructions require the memory
35961 	   to be properly aligned.  */
35962 	case CODE_FOR_avx_movntv4di:
35963 	case CODE_FOR_sse2_movntv2di:
35964 	case CODE_FOR_avx_movntv8sf:
35965 	case CODE_FOR_sse_movntv4sf:
35966 	case CODE_FOR_sse4a_vmmovntv4sf:
35967 	case CODE_FOR_avx_movntv4df:
35968 	case CODE_FOR_sse2_movntv2df:
35969 	case CODE_FOR_sse4a_vmmovntv2df:
35970 	case CODE_FOR_sse2_movntidi:
35971 	case CODE_FOR_sse_movntq:
35972 	case CODE_FOR_sse2_movntisi:
35973 	case CODE_FOR_avx512f_movntv16sf:
35974 	case CODE_FOR_avx512f_movntv8df:
35975 	case CODE_FOR_avx512f_movntv8di:
35976 	  aligned_mem = true;
35977 	  break;
35978 	default:
35979 	  break;
35980 	}
35981       break;
35982     case VOID_FTYPE_PVOID_PCVOID:
35983       nargs = 1;
35984       klass = store;
35985       memory = 0;
35986       break;
35988     case V4SF_FTYPE_V4SF_PCV2SF:
35989     case V2DF_FTYPE_V2DF_PCDOUBLE:
35990       nargs = 2;
35991       klass = load;
35992       memory = 1;
35993       break;
35994     case V8SF_FTYPE_PCV8SF_V8SI:
35995     case V4DF_FTYPE_PCV4DF_V4DI:
35996     case V4SF_FTYPE_PCV4SF_V4SI:
35997     case V2DF_FTYPE_PCV2DF_V2DI:
35998     case V8SI_FTYPE_PCV8SI_V8SI:
35999     case V4DI_FTYPE_PCV4DI_V4DI:
36000     case V4SI_FTYPE_PCV4SI_V4SI:
36001     case V2DI_FTYPE_PCV2DI_V2DI:
36002     case VOID_FTYPE_INT_INT64:
36003       nargs = 2;
36004       klass = load;
36005       memory = 0;
36006       break;
36007     case VOID_FTYPE_PV8DF_V8DF_UQI:
36008     case VOID_FTYPE_PV4DF_V4DF_UQI:
36009     case VOID_FTYPE_PV2DF_V2DF_UQI:
36010     case VOID_FTYPE_PV16SF_V16SF_UHI:
36011     case VOID_FTYPE_PV8SF_V8SF_UQI:
36012     case VOID_FTYPE_PV4SF_V4SF_UQI:
36013     case VOID_FTYPE_PV8DI_V8DI_UQI:
36014     case VOID_FTYPE_PV4DI_V4DI_UQI:
36015     case VOID_FTYPE_PV2DI_V2DI_UQI:
36016     case VOID_FTYPE_PV16SI_V16SI_UHI:
36017     case VOID_FTYPE_PV8SI_V8SI_UQI:
36018     case VOID_FTYPE_PV4SI_V4SI_UQI:
36019     case VOID_FTYPE_PV64QI_V64QI_UDI:
36020     case VOID_FTYPE_PV32HI_V32HI_USI:
36021     case VOID_FTYPE_PV32QI_V32QI_USI:
36022     case VOID_FTYPE_PV16QI_V16QI_UHI:
36023     case VOID_FTYPE_PV16HI_V16HI_UHI:
36024     case VOID_FTYPE_PV8HI_V8HI_UQI:
36025       switch (icode)
36026 	{
36027 	/* These builtins and instructions require the memory
36028 	   to be properly aligned.  */
36029 	case CODE_FOR_avx512f_storev16sf_mask:
36030 	case CODE_FOR_avx512f_storev16si_mask:
36031 	case CODE_FOR_avx512f_storev8df_mask:
36032 	case CODE_FOR_avx512f_storev8di_mask:
36033 	case CODE_FOR_avx512vl_storev8sf_mask:
36034 	case CODE_FOR_avx512vl_storev8si_mask:
36035 	case CODE_FOR_avx512vl_storev4df_mask:
36036 	case CODE_FOR_avx512vl_storev4di_mask:
36037 	case CODE_FOR_avx512vl_storev4sf_mask:
36038 	case CODE_FOR_avx512vl_storev4si_mask:
36039 	case CODE_FOR_avx512vl_storev2df_mask:
36040 	case CODE_FOR_avx512vl_storev2di_mask:
36041 	  aligned_mem = true;
36042 	  break;
36043 	default:
36044 	  break;
36045 	}
36046       /* FALLTHRU */
36047     case VOID_FTYPE_PV8SF_V8SI_V8SF:
36048     case VOID_FTYPE_PV4DF_V4DI_V4DF:
36049     case VOID_FTYPE_PV4SF_V4SI_V4SF:
36050     case VOID_FTYPE_PV2DF_V2DI_V2DF:
36051     case VOID_FTYPE_PV8SI_V8SI_V8SI:
36052     case VOID_FTYPE_PV4DI_V4DI_V4DI:
36053     case VOID_FTYPE_PV4SI_V4SI_V4SI:
36054     case VOID_FTYPE_PV2DI_V2DI_V2DI:
36055     case VOID_FTYPE_PV8SI_V8DI_UQI:
36056     case VOID_FTYPE_PV8HI_V8DI_UQI:
36057     case VOID_FTYPE_PV16HI_V16SI_UHI:
36058     case VOID_FTYPE_PV16QI_V8DI_UQI:
36059     case VOID_FTYPE_PV16QI_V16SI_UHI:
36060     case VOID_FTYPE_PV4SI_V4DI_UQI:
36061     case VOID_FTYPE_PV4SI_V2DI_UQI:
36062     case VOID_FTYPE_PV8HI_V4DI_UQI:
36063     case VOID_FTYPE_PV8HI_V2DI_UQI:
36064     case VOID_FTYPE_PV8HI_V8SI_UQI:
36065     case VOID_FTYPE_PV8HI_V4SI_UQI:
36066     case VOID_FTYPE_PV16QI_V4DI_UQI:
36067     case VOID_FTYPE_PV16QI_V2DI_UQI:
36068     case VOID_FTYPE_PV16QI_V8SI_UQI:
36069     case VOID_FTYPE_PV16QI_V4SI_UQI:
36070     case VOID_FTYPE_PCHAR_V64QI_UDI:
36071     case VOID_FTYPE_PCHAR_V32QI_USI:
36072     case VOID_FTYPE_PCHAR_V16QI_UHI:
36073     case VOID_FTYPE_PSHORT_V32HI_USI:
36074     case VOID_FTYPE_PSHORT_V16HI_UHI:
36075     case VOID_FTYPE_PSHORT_V8HI_UQI:
36076     case VOID_FTYPE_PINT_V16SI_UHI:
36077     case VOID_FTYPE_PINT_V8SI_UQI:
36078     case VOID_FTYPE_PINT_V4SI_UQI:
36079     case VOID_FTYPE_PINT64_V8DI_UQI:
36080     case VOID_FTYPE_PINT64_V4DI_UQI:
36081     case VOID_FTYPE_PINT64_V2DI_UQI:
36082     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36083     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36084     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36085     case VOID_FTYPE_PFLOAT_V16SF_UHI:
36086     case VOID_FTYPE_PFLOAT_V8SF_UQI:
36087     case VOID_FTYPE_PFLOAT_V4SF_UQI:
36088     case VOID_FTYPE_PV32QI_V32HI_USI:
36089     case VOID_FTYPE_PV16QI_V16HI_UHI:
36090     case VOID_FTYPE_PV8QI_V8HI_UQI:
36091       nargs = 2;
36092       klass = store;
36093       /* Reserve memory operand for target.  */
36094       memory = ARRAY_SIZE (args);
36095       break;
36096     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36097     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36098     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36099     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36100     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36101     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36102     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36103     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36104     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36105     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36106     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36107     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36108     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
36109     case V32HI_FTYPE_PCV32HI_V32HI_USI:
36110     case V32QI_FTYPE_PCV32QI_V32QI_USI:
36111     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
36112     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
36113     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
36114       switch (icode)
36115 	{
36116 	/* These builtins and instructions require the memory
36117 	   to be properly aligned.  */
36118 	case CODE_FOR_avx512f_loadv16sf_mask:
36119 	case CODE_FOR_avx512f_loadv16si_mask:
36120 	case CODE_FOR_avx512f_loadv8df_mask:
36121 	case CODE_FOR_avx512f_loadv8di_mask:
36122 	case CODE_FOR_avx512vl_loadv8sf_mask:
36123 	case CODE_FOR_avx512vl_loadv8si_mask:
36124 	case CODE_FOR_avx512vl_loadv4df_mask:
36125 	case CODE_FOR_avx512vl_loadv4di_mask:
36126 	case CODE_FOR_avx512vl_loadv4sf_mask:
36127 	case CODE_FOR_avx512vl_loadv4si_mask:
36128 	case CODE_FOR_avx512vl_loadv2df_mask:
36129 	case CODE_FOR_avx512vl_loadv2di_mask:
36130 	case CODE_FOR_avx512bw_loadv64qi_mask:
36131 	case CODE_FOR_avx512vl_loadv32qi_mask:
36132 	case CODE_FOR_avx512vl_loadv16qi_mask:
36133 	case CODE_FOR_avx512bw_loadv32hi_mask:
36134 	case CODE_FOR_avx512vl_loadv16hi_mask:
36135 	case CODE_FOR_avx512vl_loadv8hi_mask:
36136 	  aligned_mem = true;
36137 	  break;
36138 	default:
36139 	  break;
36140 	}
36141       /* FALLTHRU */
36142     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36143     case V32QI_FTYPE_PCCHAR_V32QI_USI:
36144     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36145     case V32HI_FTYPE_PCSHORT_V32HI_USI:
36146     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36147     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36148     case V16SI_FTYPE_PCINT_V16SI_UHI:
36149     case V8SI_FTYPE_PCINT_V8SI_UQI:
36150     case V4SI_FTYPE_PCINT_V4SI_UQI:
36151     case V8DI_FTYPE_PCINT64_V8DI_UQI:
36152     case V4DI_FTYPE_PCINT64_V4DI_UQI:
36153     case V2DI_FTYPE_PCINT64_V2DI_UQI:
36154     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36155     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36156     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36157     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36158     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36159     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36160       nargs = 3;
36161       klass = load;
36162       memory = 0;
36163       break;
36164     case VOID_FTYPE_UINT_UINT_UINT:
36165     case VOID_FTYPE_UINT64_UINT_UINT:
36166     case UCHAR_FTYPE_UINT_UINT_UINT:
36167     case UCHAR_FTYPE_UINT64_UINT_UINT:
36168       nargs = 3;
36169       klass = load;
36170       memory = ARRAY_SIZE (args);
36171       last_arg_constant = true;
36172       break;
36173     default:
36174       gcc_unreachable ();
36175     }
36176 
36177   gcc_assert (nargs <= ARRAY_SIZE (args));
36178 
36179   if (klass == store)
36180     {
36181       arg = CALL_EXPR_ARG (exp, 0);
36182       op = expand_normal (arg);
36183       gcc_assert (target == 0);
36184       if (memory)
36185 	{
36186 	  op = ix86_zero_extend_to_Pmode (op);
36187 	  target = gen_rtx_MEM (tmode, op);
36188 	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36189 	     on it.  Try to improve it using get_pointer_alignment,
36190 	     and if the special builtin is one that requires strict
36191 	     mode alignment, also from its GET_MODE_ALIGNMENT.
36192 	     Failure to do so could lead to ix86_legitimate_combined_insn
36193 	     rejecting all changes to such insns.  */
36194 	  unsigned int align = get_pointer_alignment (arg);
36195 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36196 	    align = GET_MODE_ALIGNMENT (tmode);
36197 	  if (MEM_ALIGN (target) < align)
36198 	    set_mem_align (target, align);
36199 	}
36200       else
36201 	target = force_reg (tmode, op);
36202       arg_adjust = 1;
36203     }
36204   else
36205     {
36206       arg_adjust = 0;
36207       if (optimize
36208 	  || target == 0
36209 	  || !register_operand (target, tmode)
36210 	  || GET_MODE (target) != tmode)
36211 	target = gen_reg_rtx (tmode);
36212     }
36213 
36214   for (i = 0; i < nargs; i++)
36215     {
36216       machine_mode mode = insn_p->operand[i + 1].mode;
36217       bool match;
36218 
36219       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36220       op = expand_normal (arg);
36221       match = insn_p->operand[i + 1].predicate (op, mode);
36222 
36223       if (last_arg_constant && (i + 1) == nargs)
36224 	{
36225 	  if (!match)
36226 	    {
36227 	      if (icode == CODE_FOR_lwp_lwpvalsi3
36228 		  || icode == CODE_FOR_lwp_lwpinssi3
36229 		  || icode == CODE_FOR_lwp_lwpvaldi3
36230 		  || icode == CODE_FOR_lwp_lwpinsdi3)
36231 		error ("the last argument must be a 32-bit immediate");
36232 	      else
36233 		error ("the last argument must be an 8-bit immediate");
36234 	      return const0_rtx;
36235 	    }
36236 	}
36237       else
36238 	{
36239 	  if (i == memory)
36240 	    {
36241 	      /* This must be the memory operand.  */
36242 	      op = ix86_zero_extend_to_Pmode (op);
36243 	      op = gen_rtx_MEM (mode, op);
36244 	      /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36245 		 on it.  Try to improve it using get_pointer_alignment,
36246 		 and if the special builtin is one that requires strict
36247 		 mode alignment, also from its GET_MODE_ALIGNMENT.
36248 		 Failure to do so could lead to ix86_legitimate_combined_insn
36249 		 rejecting all changes to such insns.  */
36250 	      unsigned int align = get_pointer_alignment (arg);
36251 	      if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36252 		align = GET_MODE_ALIGNMENT (mode);
36253 	      if (MEM_ALIGN (op) < align)
36254 		set_mem_align (op, align);
36255 	    }
36256 	  else
36257 	    {
36258 	      /* This must be a register.  */
36259 	      if (VECTOR_MODE_P (mode))
36260 		op = safe_vector_operand (op, mode);
36261 
36262 	      op = fixup_modeless_constant (op, mode);
36263 
36264 	      if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36265 		op = copy_to_mode_reg (mode, op);
36266 	      else
36267 	        {
36268 	          op = copy_to_reg (op);
36269 	          op = lowpart_subreg (mode, op, GET_MODE (op));
36270 	        }
36271 	    }
36272 	}
36273 
36274       args[i].op = op;
36275       args[i].mode = mode;
36276     }
36277 
36278   switch (nargs)
36279     {
36280     case 0:
36281       pat = GEN_FCN (icode) (target);
36282       break;
36283     case 1:
36284       pat = GEN_FCN (icode) (target, args[0].op);
36285       break;
36286     case 2:
36287       pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36288       break;
36289     case 3:
36290       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36291       break;
36292     default:
36293       gcc_unreachable ();
36294     }
36295 
36296   if (! pat)
36297     return 0;
36298   emit_insn (pat);
36299   return klass == store ? 0 : target;
36300 }
36301 
36302 /* Return the integer constant in ARG.  Constrain it to be in the range
36303    of the subparts of VEC_TYPE; issue an error if not.  */
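/* For example, for a V4SF vector type there are four subparts, so the only
   valid selectors are 0 .. 3.  */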
36304 
36305 static int
36306 get_element_number (tree vec_type, tree arg)
36307 {
36308   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36309 
36310   if (!tree_fits_uhwi_p (arg)
36311       || (elt = tree_to_uhwi (arg), elt > max))
36312     {
36313       error ("selector must be an integer constant in the range 0..%wi", max);
36314       return 0;
36315     }
36316 
36317   return elt;
36318 }
36319 
36320 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
36321    ix86_expand_vector_init.  We DO have language-level syntax for this, in
36322    the form of  (type){ init-list }.  Except that since we can't place emms
36323    instructions from inside the compiler, we can't allow the use of MMX
36324    registers unless the user explicitly asks for it.  So we do *not* define
36325    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
36326    we have builtins invoked by mmintrin.h that give us license to emit
36327    these sorts of instructions.  */
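/* For instance, assuming the usual mmintrin.h definition, _mm_set_pi32 is
   implemented via __builtin_ia32_vec_init_v2si and reaches this function
   through IX86_BUILTIN_VEC_INIT_V2SI.  */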
36328 
36329 static rtx
36330 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36331 {
36332   machine_mode tmode = TYPE_MODE (type);
36333   machine_mode inner_mode = GET_MODE_INNER (tmode);
36334   int i, n_elt = GET_MODE_NUNITS (tmode);
36335   rtvec v = rtvec_alloc (n_elt);
36336 
36337   gcc_assert (VECTOR_MODE_P (tmode));
36338   gcc_assert (call_expr_nargs (exp) == n_elt);
36339 
36340   for (i = 0; i < n_elt; ++i)
36341     {
36342       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36343       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36344     }
36345 
36346   if (!target || !register_operand (target, tmode))
36347     target = gen_reg_rtx (tmode);
36348 
36349   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36350   return target;
36351 }
36352 
36353 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
36354    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
36355    had a language-level syntax for referencing vector elements.  */
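/* For instance, assuming the usual xmmintrin.h definition, _mm_extract_pi16
   is implemented via __builtin_ia32_vec_ext_v4hi and reaches this function
   through IX86_BUILTIN_VEC_EXT_V4HI.  */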
36356 
36357 static rtx
36358 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36359 {
36360   machine_mode tmode, mode0;
36361   tree arg0, arg1;
36362   int elt;
36363   rtx op0;
36364 
36365   arg0 = CALL_EXPR_ARG (exp, 0);
36366   arg1 = CALL_EXPR_ARG (exp, 1);
36367 
36368   op0 = expand_normal (arg0);
36369   elt = get_element_number (TREE_TYPE (arg0), arg1);
36370 
36371   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36372   mode0 = TYPE_MODE (TREE_TYPE (arg0));
36373   gcc_assert (VECTOR_MODE_P (mode0));
36374 
36375   op0 = force_reg (mode0, op0);
36376 
36377   if (optimize || !target || !register_operand (target, tmode))
36378     target = gen_reg_rtx (tmode);
36379 
36380   ix86_expand_vector_extract (true, target, op0, elt);
36381 
36382   return target;
36383 }
36384 
36385 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
36386    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
36387    a language-level syntax for referencing vector elements.  */
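/* These take (vector, scalar, selector) and return the updated vector;
   for instance, assuming the usual xmmintrin.h definition, _mm_insert_pi16
   maps to __builtin_ia32_vec_set_v4hi and IX86_BUILTIN_VEC_SET_V4HI.  */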
36388 
36389 static rtx
36390 ix86_expand_vec_set_builtin (tree exp)
36391 {
36392   machine_mode tmode, mode1;
36393   tree arg0, arg1, arg2;
36394   int elt;
36395   rtx op0, op1, target;
36396 
36397   arg0 = CALL_EXPR_ARG (exp, 0);
36398   arg1 = CALL_EXPR_ARG (exp, 1);
36399   arg2 = CALL_EXPR_ARG (exp, 2);
36400 
36401   tmode = TYPE_MODE (TREE_TYPE (arg0));
36402   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36403   gcc_assert (VECTOR_MODE_P (tmode));
36404 
36405   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36406   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36407   elt = get_element_number (TREE_TYPE (arg0), arg2);
36408 
36409   if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36410     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36411 
36412   op0 = force_reg (tmode, op0);
36413   op1 = force_reg (mode1, op1);
36414 
36415   /* OP0 is the source of these builtin functions and shouldn't be
36416      modified.  Create a copy, use it and return it as target.  */
36417   target = gen_reg_rtx (tmode);
36418   emit_move_insn (target, op0);
36419   ix86_expand_vector_set (true, target, op1, elt);
36420 
36421   return target;
36422 }
36423 
36424 /* Emit conditional move of SRC to DST with condition
36425    OP1 CODE OP2.  */
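/* Without CMOV the same effect is obtained with a compare-and-branch;
   the fallback below is equivalent to this sketch:

       if (!(OP1 CODE OP2)) goto nomove;
       DST = SRC;
     nomove:;  */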
36426 static void
36427 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36428 {
36429   rtx t;
36430 
36431   if (TARGET_CMOVE)
36432     {
36433       t = ix86_expand_compare (code, op1, op2);
36434       emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36435 							 src, dst)));
36436     }
36437   else
36438     {
36439       rtx_code_label *nomove = gen_label_rtx ();
36440       emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36441 			       const0_rtx, GET_MODE (op1), 1, nomove);
36442       emit_move_insn (dst, src);
36443       emit_label (nomove);
36444     }
36445 }
36446 
36447 /* Choose the unsigned maximum of DST and SRC and put it in DST.  */
36448 static void
36449 ix86_emit_move_max (rtx dst, rtx src)
36450 {
36451   ix86_emit_cmove (dst, src, LTU, dst, src);
36452 }
36453 
36454 /* Expand an expression EXP that calls a built-in function,
36455    with result going to TARGET if that's convenient
36456    (and in mode MODE if that's convenient).
36457    SUBTARGET may be used as the target for computing one of EXP's operands.
36458    IGNORE is nonzero if the value is to be ignored.  */
36459 
36460 static rtx
36461 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36462 		     machine_mode mode, int ignore)
36463 {
36464   size_t i;
36465   enum insn_code icode, icode2;
36466   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36467   tree arg0, arg1, arg2, arg3, arg4;
36468   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
36469   machine_mode mode0, mode1, mode2, mode3, mode4;
36470   unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36471 
36472   /* For CPU builtins that can be folded, fold first and expand the fold.  */
36473   switch (fcode)
36474     {
36475     case IX86_BUILTIN_CPU_INIT:
36476       {
36477 	/* Make it call __cpu_indicator_init in libgcc. */
36478 	tree call_expr, fndecl, type;
36479         type = build_function_type_list (integer_type_node, NULL_TREE);
36480 	fndecl = build_fn_decl ("__cpu_indicator_init", type);
36481 	call_expr = build_call_expr (fndecl, 0);
36482 	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36483       }
36484     case IX86_BUILTIN_CPU_IS:
36485     case IX86_BUILTIN_CPU_SUPPORTS:
36486       {
36487 	tree arg0 = CALL_EXPR_ARG (exp, 0);
36488 	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36489 	gcc_assert (fold_expr != NULL_TREE);
36490 	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36491       }
36492     }
36493 
36494   HOST_WIDE_INT isa = ix86_isa_flags;
36495   HOST_WIDE_INT isa2 = ix86_isa_flags2;
36496   HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
36497   HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
36498   /* The general case is we require all the ISAs specified in bisa{,2}
36499      to be enabled.
36500      The exceptions are:
36501      OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
36502      OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
36503      OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
36504      where for each such pair it is sufficient if either of the ISAs is
36505      enabled; any other options ored in with the pair must still be enabled.  */
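  /* For instance, a builtin whose mask includes
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 is considered available
     when either -mfma or -mfma4 is enabled.  */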
36506   if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36507        == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36508       && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
36509     isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
36510   if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36511        == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36512       && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
36513     isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
36514   if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36515        == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36516       && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
36517     isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
36518   if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
36519     {
36520       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
36521 				       (enum fpmath_unit) 0, false);
36522       if (!opts)
36523 	error ("%qE needs unknown isa option", fndecl);
36524       else
36525 	{
36526 	  gcc_assert (opts != NULL);
36527 	  error ("%qE needs isa option %s", fndecl, opts);
36528 	  free (opts);
36529 	}
36530       return expand_call (exp, target, ignore);
36531     }
36532 
36533   switch (fcode)
36534     {
36535     case IX86_BUILTIN_BNDMK:
36536       if (!target
36537 	  || GET_MODE (target) != BNDmode
36538 	  || !register_operand (target, BNDmode))
36539 	target = gen_reg_rtx (BNDmode);
36540 
36541       arg0 = CALL_EXPR_ARG (exp, 0);
36542       arg1 = CALL_EXPR_ARG (exp, 1);
36543 
36544       op0 = expand_normal (arg0);
36545       op1 = expand_normal (arg1);
36546 
36547       if (!register_operand (op0, Pmode))
36548 	op0 = ix86_zero_extend_to_Pmode (op0);
36549       if (!register_operand (op1, Pmode))
36550 	op1 = ix86_zero_extend_to_Pmode (op1);
36551 
36552       /* Builtin arg1 is the size of the block, but instruction op1
36553 	 should be (size - 1).  */
36554       op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36555 				 NULL_RTX, 1, OPTAB_DIRECT);
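      /* E.g. for a 16-byte object the instruction operand becomes 15,
	 i.e. the created bounds cover [op0, op0 + 15].  */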
36556 
36557       emit_insn (BNDmode == BND64mode
36558                  ? gen_bnd64_mk (target, op0, op1)
36559                  : gen_bnd32_mk (target, op0, op1));
36560       return target;
36561 
36562     case IX86_BUILTIN_BNDSTX:
36563       arg0 = CALL_EXPR_ARG (exp, 0);
36564       arg1 = CALL_EXPR_ARG (exp, 1);
36565       arg2 = CALL_EXPR_ARG (exp, 2);
36566 
36567       op0 = expand_normal (arg0);
36568       op1 = expand_normal (arg1);
36569       op2 = expand_normal (arg2);
36570 
36571       if (!register_operand (op0, Pmode))
36572 	op0 = ix86_zero_extend_to_Pmode (op0);
36573       if (!register_operand (op1, BNDmode))
36574 	op1 = copy_to_mode_reg (BNDmode, op1);
36575       if (!register_operand (op2, Pmode))
36576 	op2 = ix86_zero_extend_to_Pmode (op2);
36577 
36578       emit_insn (BNDmode == BND64mode
36579                  ? gen_bnd64_stx (op2, op0, op1)
36580                  : gen_bnd32_stx (op2, op0, op1));
36581       return 0;
36582 
36583     case IX86_BUILTIN_BNDLDX:
36584       if (!target
36585 	  || GET_MODE (target) != BNDmode
36586 	  || !register_operand (target, BNDmode))
36587 	target = gen_reg_rtx (BNDmode);
36588 
36589       arg0 = CALL_EXPR_ARG (exp, 0);
36590       arg1 = CALL_EXPR_ARG (exp, 1);
36591 
36592       op0 = expand_normal (arg0);
36593       op1 = expand_normal (arg1);
36594 
36595       if (!register_operand (op0, Pmode))
36596 	op0 = ix86_zero_extend_to_Pmode (op0);
36597       if (!register_operand (op1, Pmode))
36598 	op1 = ix86_zero_extend_to_Pmode (op1);
36599 
36600       emit_insn (BNDmode == BND64mode
36601 		 ? gen_bnd64_ldx (target, op0, op1)
36602 		 : gen_bnd32_ldx (target, op0, op1));
36603       return target;
36604 
36605     case IX86_BUILTIN_BNDCL:
36606       arg0 = CALL_EXPR_ARG (exp, 0);
36607       arg1 = CALL_EXPR_ARG (exp, 1);
36608 
36609       op0 = expand_normal (arg0);
36610       op1 = expand_normal (arg1);
36611 
36612       if (!register_operand (op0, Pmode))
36613 	op0 = ix86_zero_extend_to_Pmode (op0);
36614       if (!register_operand (op1, BNDmode))
36615 	op1 = copy_to_mode_reg (BNDmode, op1);
36616 
36617       emit_insn (BNDmode == BND64mode
36618                  ? gen_bnd64_cl (op1, op0)
36619                  : gen_bnd32_cl (op1, op0));
36620       return 0;
36621 
36622     case IX86_BUILTIN_BNDCU:
36623       arg0 = CALL_EXPR_ARG (exp, 0);
36624       arg1 = CALL_EXPR_ARG (exp, 1);
36625 
36626       op0 = expand_normal (arg0);
36627       op1 = expand_normal (arg1);
36628 
36629       if (!register_operand (op0, Pmode))
36630 	op0 = ix86_zero_extend_to_Pmode (op0);
36631       if (!register_operand (op1, BNDmode))
36632 	op1 = copy_to_mode_reg (BNDmode, op1);
36633 
36634       emit_insn (BNDmode == BND64mode
36635                  ? gen_bnd64_cu (op1, op0)
36636                  : gen_bnd32_cu (op1, op0));
36637       return 0;
36638 
36639     case IX86_BUILTIN_BNDRET:
36640       arg0 = CALL_EXPR_ARG (exp, 0);
36641       target = chkp_get_rtl_bounds (arg0);
36642 
36643       /* If no bounds were specified for the returned value,
36644 	 then use INIT bounds.  This usually happens when
36645 	 some built-in function is expanded.  */
36646       if (!target)
36647 	{
36648 	  rtx t1 = gen_reg_rtx (Pmode);
36649 	  rtx t2 = gen_reg_rtx (Pmode);
36650 	  target = gen_reg_rtx (BNDmode);
36651 	  emit_move_insn (t1, const0_rtx);
36652 	  emit_move_insn (t2, constm1_rtx);
36653 	  emit_insn (BNDmode == BND64mode
36654 		     ? gen_bnd64_mk (target, t1, t2)
36655 		     : gen_bnd32_mk (target, t1, t2));
36656 	}
36657 
36658       gcc_assert (target && REG_P (target));
36659       return target;
36660 
36661     case IX86_BUILTIN_BNDNARROW:
36662       {
36663 	rtx m1, m1h1, m1h2, lb, ub, t1;
36664 
36665 	/* Return value and lb.  */
36666 	arg0 = CALL_EXPR_ARG (exp, 0);
36667 	/* Bounds.  */
36668 	arg1 = CALL_EXPR_ARG (exp, 1);
36669 	/* Size.  */
36670 	arg2 = CALL_EXPR_ARG (exp, 2);
36671 
36672 	lb = expand_normal (arg0);
36673 	op1 = expand_normal (arg1);
36674 	op2 = expand_normal (arg2);
36675 
36676 	/* Size was passed but we need to use (size - 1) as for bndmk.  */
36677 	op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36678 				   NULL_RTX, 1, OPTAB_DIRECT);
36679 
36680 	/* Add LB to the size and invert to get UB.  */
36681 	op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36682 				   op2, 1, OPTAB_DIRECT);
36683 	ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36684 
36685 	if (!register_operand (lb, Pmode))
36686 	  lb = ix86_zero_extend_to_Pmode (lb);
36687 	if (!register_operand (ub, Pmode))
36688 	  ub = ix86_zero_extend_to_Pmode (ub);
36689 
36690 	/* We need to move bounds to memory before any computations.  */
36691 	if (MEM_P (op1))
36692 	  m1 = op1;
36693 	else
36694 	  {
36695 	    m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36696 	    emit_move_insn (m1, op1);
36697 	  }
36698 
36699 	/* Generate mem expression to be used for access to LB and UB.  */
36700 	m1h1 = adjust_address (m1, Pmode, 0);
36701 	m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36702 
36703 	t1 = gen_reg_rtx (Pmode);
36704 
36705 	/* Compute LB.  */
36706 	emit_move_insn (t1, m1h1);
36707 	ix86_emit_move_max (t1, lb);
36708 	emit_move_insn (m1h1, t1);
36709 
36710 	/* Compute UB.  UB is stored in 1's complement form, so the max of
36711 	   the complemented values is the min of the actual upper bounds.  */
36712 	emit_move_insn (t1, m1h2);
36713 	ix86_emit_move_max (t1, ub);
36714 	emit_move_insn (m1h2, t1);
36715 
36716 	op2 = gen_reg_rtx (BNDmode);
36717 	emit_move_insn (op2, m1);
36718 
36719 	return chkp_join_splitted_slot (lb, op2);
36720       }
36721 
36722     case IX86_BUILTIN_BNDINT:
36723       {
36724 	rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36725 
36726 	if (!target
36727 	    || GET_MODE (target) != BNDmode
36728 	    || !register_operand (target, BNDmode))
36729 	  target = gen_reg_rtx (BNDmode);
36730 
36731 	arg0 = CALL_EXPR_ARG (exp, 0);
36732 	arg1 = CALL_EXPR_ARG (exp, 1);
36733 
36734 	op0 = expand_normal (arg0);
36735 	op1 = expand_normal (arg1);
36736 
36737 	res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36738 	rh1 = adjust_address (res, Pmode, 0);
36739 	rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36740 
36741 	/* Put first bounds to temporaries.  */
36742 	lb1 = gen_reg_rtx (Pmode);
36743 	ub1 = gen_reg_rtx (Pmode);
36744 	if (MEM_P (op0))
36745 	  {
36746 	    emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36747 	    emit_move_insn (ub1, adjust_address (op0, Pmode,
36748 						 GET_MODE_SIZE (Pmode)));
36749 	  }
36750 	else
36751 	  {
36752 	    emit_move_insn (res, op0);
36753 	    emit_move_insn (lb1, rh1);
36754 	    emit_move_insn (ub1, rh2);
36755 	  }
36756 
36757 	/* Put second bounds to temporaries.  */
36758 	lb2 = gen_reg_rtx (Pmode);
36759 	ub2 = gen_reg_rtx (Pmode);
36760 	if (MEM_P (op1))
36761 	  {
36762 	    emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36763 	    emit_move_insn (ub2, adjust_address (op1, Pmode,
36764 						 GET_MODE_SIZE (Pmode)));
36765 	  }
36766 	else
36767 	  {
36768 	    emit_move_insn (res, op1);
36769 	    emit_move_insn (lb2, rh1);
36770 	    emit_move_insn (ub2, rh2);
36771 	  }
36772 
36773 	/* Compute LB.  */
36774 	ix86_emit_move_max (lb1, lb2);
36775 	emit_move_insn (rh1, lb1);
36776 
36777 	/* Compute UB.  UB is stored in 1's complement form, so the max of
36778 	   the complemented values is the min of the actual upper bounds.  */
36779 	ix86_emit_move_max (ub1, ub2);
36780 	emit_move_insn (rh2, ub1);
36781 
36782 	emit_move_insn (target, res);
36783 
36784 	return target;
36785       }
36786 
36787     case IX86_BUILTIN_SIZEOF:
36788       {
36789 	tree name;
36790 	rtx symbol;
36791 
36792 	if (!target
36793 	    || GET_MODE (target) != Pmode
36794 	    || !register_operand (target, Pmode))
36795 	  target = gen_reg_rtx (Pmode);
36796 
36797 	arg0 = CALL_EXPR_ARG (exp, 0);
36798 	gcc_assert (VAR_P (arg0));
36799 
36800 	name = DECL_ASSEMBLER_NAME (arg0);
36801 	symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36802 
36803 	emit_insn (Pmode == SImode
36804 		   ? gen_move_size_reloc_si (target, symbol)
36805 		   : gen_move_size_reloc_di (target, symbol));
36806 
36807 	return target;
36808       }
36809 
36810     case IX86_BUILTIN_BNDLOWER:
36811       {
36812 	rtx mem, hmem;
36813 
36814 	if (!target
36815 	    || GET_MODE (target) != Pmode
36816 	    || !register_operand (target, Pmode))
36817 	  target = gen_reg_rtx (Pmode);
36818 
36819 	arg0 = CALL_EXPR_ARG (exp, 0);
36820 	op0 = expand_normal (arg0);
36821 
36822 	/* We need to move bounds to memory first.  */
36823 	if (MEM_P (op0))
36824 	  mem = op0;
36825 	else
36826 	  {
36827 	    mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36828 	    emit_move_insn (mem, op0);
36829 	  }
36830 
36831 	/* Generate mem expression to access LB and load it.  */
36832 	hmem = adjust_address (mem, Pmode, 0);
36833 	emit_move_insn (target, hmem);
36834 
36835 	return target;
36836       }
36837 
36838     case IX86_BUILTIN_BNDUPPER:
36839       {
36840 	rtx mem, hmem, res;
36841 
36842 	if (!target
36843 	    || GET_MODE (target) != Pmode
36844 	    || !register_operand (target, Pmode))
36845 	  target = gen_reg_rtx (Pmode);
36846 
36847 	arg0 = CALL_EXPR_ARG (exp, 0);
36848 	op0 = expand_normal (arg0);
36849 
36850 	/* We need to move bounds to memory first.  */
36851 	if (MEM_P (op0))
36852 	  mem = op0;
36853 	else
36854 	  {
36855 	    mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36856 	    emit_move_insn (mem, op0);
36857 	  }
36858 
36859 	/* Generate mem expression to access UB.  */
36860 	hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36861 
36862 	/* We need to invert all bits of UB.  */
36863 	res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36864 
36865 	if (res != target)
36866 	  emit_move_insn (target, res);
36867 
36868 	return target;
36869       }
36870 
36871     case IX86_BUILTIN_MASKMOVQ:
36872     case IX86_BUILTIN_MASKMOVDQU:
36873       icode = (fcode == IX86_BUILTIN_MASKMOVQ
36874 	       ? CODE_FOR_mmx_maskmovq
36875 	       : CODE_FOR_sse2_maskmovdqu);
36876       /* Note the arg order is different from the operand order.  */
36877       arg1 = CALL_EXPR_ARG (exp, 0);
36878       arg2 = CALL_EXPR_ARG (exp, 1);
36879       arg0 = CALL_EXPR_ARG (exp, 2);
36880       op0 = expand_normal (arg0);
36881       op1 = expand_normal (arg1);
36882       op2 = expand_normal (arg2);
36883       mode0 = insn_data[icode].operand[0].mode;
36884       mode1 = insn_data[icode].operand[1].mode;
36885       mode2 = insn_data[icode].operand[2].mode;
36886 
36887       op0 = ix86_zero_extend_to_Pmode (op0);
36888       op0 = gen_rtx_MEM (mode1, op0);
36889 
36890       if (!insn_data[icode].operand[0].predicate (op0, mode0))
36891 	op0 = copy_to_mode_reg (mode0, op0);
36892       if (!insn_data[icode].operand[1].predicate (op1, mode1))
36893 	op1 = copy_to_mode_reg (mode1, op1);
36894       if (!insn_data[icode].operand[2].predicate (op2, mode2))
36895 	op2 = copy_to_mode_reg (mode2, op2);
36896       pat = GEN_FCN (icode) (op0, op1, op2);
36897       if (! pat)
36898 	return 0;
36899       emit_insn (pat);
36900       return 0;
36901 
36902     case IX86_BUILTIN_LDMXCSR:
36903       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36904       target = assign_386_stack_local (SImode, SLOT_TEMP);
36905       emit_move_insn (target, op0);
36906       emit_insn (gen_sse_ldmxcsr (target));
36907       return 0;
36908 
36909     case IX86_BUILTIN_STMXCSR:
36910       target = assign_386_stack_local (SImode, SLOT_TEMP);
36911       emit_insn (gen_sse_stmxcsr (target));
36912       return copy_to_mode_reg (SImode, target);
36913 
36914     case IX86_BUILTIN_CLFLUSH:
36915       arg0 = CALL_EXPR_ARG (exp, 0);
36916       op0 = expand_normal (arg0);
36917       icode = CODE_FOR_sse2_clflush;
36918       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36919 	op0 = ix86_zero_extend_to_Pmode (op0);
36920 
36921       emit_insn (gen_sse2_clflush (op0));
36922       return 0;
36923 
36924     case IX86_BUILTIN_CLWB:
36925       arg0 = CALL_EXPR_ARG (exp, 0);
36926       op0 = expand_normal (arg0);
36927       icode = CODE_FOR_clwb;
36928       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36929 	op0 = ix86_zero_extend_to_Pmode (op0);
36930 
36931       emit_insn (gen_clwb (op0));
36932       return 0;
36933 
36934     case IX86_BUILTIN_CLFLUSHOPT:
36935       arg0 = CALL_EXPR_ARG (exp, 0);
36936       op0 = expand_normal (arg0);
36937       icode = CODE_FOR_clflushopt;
36938       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36939 	op0 = ix86_zero_extend_to_Pmode (op0);
36940 
36941       emit_insn (gen_clflushopt (op0));
36942       return 0;
36943 
36944     case IX86_BUILTIN_MONITOR:
36945     case IX86_BUILTIN_MONITORX:
36946       arg0 = CALL_EXPR_ARG (exp, 0);
36947       arg1 = CALL_EXPR_ARG (exp, 1);
36948       arg2 = CALL_EXPR_ARG (exp, 2);
36949       op0 = expand_normal (arg0);
36950       op1 = expand_normal (arg1);
36951       op2 = expand_normal (arg2);
36952       if (!REG_P (op0))
36953 	op0 = ix86_zero_extend_to_Pmode (op0);
36954       if (!REG_P (op1))
36955 	op1 = copy_to_mode_reg (SImode, op1);
36956       if (!REG_P (op2))
36957 	op2 = copy_to_mode_reg (SImode, op2);
36958 
36959       emit_insn (fcode == IX86_BUILTIN_MONITOR
36960 		 ? ix86_gen_monitor (op0, op1, op2)
36961 		 : ix86_gen_monitorx (op0, op1, op2));
36962       return 0;
36963 
36964     case IX86_BUILTIN_MWAIT:
36965       arg0 = CALL_EXPR_ARG (exp, 0);
36966       arg1 = CALL_EXPR_ARG (exp, 1);
36967       op0 = expand_normal (arg0);
36968       op1 = expand_normal (arg1);
36969       if (!REG_P (op0))
36970 	op0 = copy_to_mode_reg (SImode, op0);
36971       if (!REG_P (op1))
36972 	op1 = copy_to_mode_reg (SImode, op1);
36973       emit_insn (gen_sse3_mwait (op0, op1));
36974       return 0;
36975 
36976     case IX86_BUILTIN_MWAITX:
36977       arg0 = CALL_EXPR_ARG (exp, 0);
36978       arg1 = CALL_EXPR_ARG (exp, 1);
36979       arg2 = CALL_EXPR_ARG (exp, 2);
36980       op0 = expand_normal (arg0);
36981       op1 = expand_normal (arg1);
36982       op2 = expand_normal (arg2);
36983       if (!REG_P (op0))
36984 	op0 = copy_to_mode_reg (SImode, op0);
36985       if (!REG_P (op1))
36986 	op1 = copy_to_mode_reg (SImode, op1);
36987       if (!REG_P (op2))
36988 	op2 = copy_to_mode_reg (SImode, op2);
36989       emit_insn (gen_mwaitx (op0, op1, op2));
36990       return 0;
36991 
36992     case IX86_BUILTIN_CLZERO:
36993       arg0 = CALL_EXPR_ARG (exp, 0);
36994       op0 = expand_normal (arg0);
36995       if (!REG_P (op0))
36996 	op0 = ix86_zero_extend_to_Pmode (op0);
36997       emit_insn (ix86_gen_clzero (op0));
36998       return 0;
36999 
37000     case IX86_BUILTIN_VEC_INIT_V2SI:
37001     case IX86_BUILTIN_VEC_INIT_V4HI:
37002     case IX86_BUILTIN_VEC_INIT_V8QI:
37003       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37004 
37005     case IX86_BUILTIN_VEC_EXT_V2DF:
37006     case IX86_BUILTIN_VEC_EXT_V2DI:
37007     case IX86_BUILTIN_VEC_EXT_V4SF:
37008     case IX86_BUILTIN_VEC_EXT_V4SI:
37009     case IX86_BUILTIN_VEC_EXT_V8HI:
37010     case IX86_BUILTIN_VEC_EXT_V2SI:
37011     case IX86_BUILTIN_VEC_EXT_V4HI:
37012     case IX86_BUILTIN_VEC_EXT_V16QI:
37013       return ix86_expand_vec_ext_builtin (exp, target);
37014 
37015     case IX86_BUILTIN_VEC_SET_V2DI:
37016     case IX86_BUILTIN_VEC_SET_V4SF:
37017     case IX86_BUILTIN_VEC_SET_V4SI:
37018     case IX86_BUILTIN_VEC_SET_V8HI:
37019     case IX86_BUILTIN_VEC_SET_V4HI:
37020     case IX86_BUILTIN_VEC_SET_V16QI:
37021       return ix86_expand_vec_set_builtin (exp);
37022 
37023     case IX86_BUILTIN_NANQ:
37024     case IX86_BUILTIN_NANSQ:
37025       return expand_call (exp, target, ignore);
37026 
37027     case IX86_BUILTIN_RDPID:
37028 
37029       op0 = gen_reg_rtx (word_mode);
37030 
37031       if (TARGET_64BIT)
37032 	{
37033 	  insn = gen_rdpid_rex64 (op0);
37034 	  op0 = convert_to_mode (SImode, op0, 1);
37035 	}
37036       else
37037 	insn = gen_rdpid (op0);
37038 
37039       emit_insn (insn);
37040 
37041       if (target == 0
37042 	  || !register_operand (target, SImode))
37043 	target = gen_reg_rtx (SImode);
37044 
37045       emit_move_insn (target, op0);
37046       return target;
37047 
37048     case IX86_BUILTIN_RDPMC:
37049     case IX86_BUILTIN_RDTSC:
37050     case IX86_BUILTIN_RDTSCP:
37051     case IX86_BUILTIN_XGETBV:
37052 
37053       op0 = gen_reg_rtx (DImode);
37054       op1 = gen_reg_rtx (DImode);
37055 
37056       if (fcode == IX86_BUILTIN_RDPMC)
37057 	{
37058 	  arg0 = CALL_EXPR_ARG (exp, 0);
37059 	  op2 = expand_normal (arg0);
37060 	  if (!register_operand (op2, SImode))
37061 	    op2 = copy_to_mode_reg (SImode, op2);
37062 
37063 	  insn = (TARGET_64BIT
37064 		  ? gen_rdpmc_rex64 (op0, op1, op2)
37065 		  : gen_rdpmc (op0, op2));
37066 	  emit_insn (insn);
37067 	}
37068       else if (fcode == IX86_BUILTIN_XGETBV)
37069 	{
37070 	  arg0 = CALL_EXPR_ARG (exp, 0);
37071 	  op2 = expand_normal (arg0);
37072 	  if (!register_operand (op2, SImode))
37073 	    op2 = copy_to_mode_reg (SImode, op2);
37074 
37075 	  insn = (TARGET_64BIT
37076 		  ? gen_xgetbv_rex64 (op0, op1, op2)
37077 		  : gen_xgetbv (op0, op2));
37078 	  emit_insn (insn);
37079 	}
37080       else if (fcode == IX86_BUILTIN_RDTSC)
37081 	{
37082 	  insn = (TARGET_64BIT
37083 		  ? gen_rdtsc_rex64 (op0, op1)
37084 		  : gen_rdtsc (op0));
37085 	  emit_insn (insn);
37086 	}
37087       else
37088 	{
37089 	  op2 = gen_reg_rtx (SImode);
37090 
37091 	  insn = (TARGET_64BIT
37092 		  ? gen_rdtscp_rex64 (op0, op1, op2)
37093 		  : gen_rdtscp (op0, op2));
37094 	  emit_insn (insn);
37095 
37096 	  arg0 = CALL_EXPR_ARG (exp, 0);
37097 	  op4 = expand_normal (arg0);
37098 	  if (!address_operand (op4, VOIDmode))
37099 	    {
37100 	      op4 = convert_memory_address (Pmode, op4);
37101 	      op4 = copy_addr_to_reg (op4);
37102 	    }
37103 	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37104 	}
37105 
37106       if (target == 0
37107 	  || !register_operand (target, DImode))
37108         target = gen_reg_rtx (DImode);
37109 
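      /* On 64-bit targets the patterns above return the result split across
	 two registers (low 32 bits in op0, high 32 bits in op1); combine
	 them into a single DImode value.  On 32-bit targets op0 already
	 holds the full DImode result.  */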
37110       if (TARGET_64BIT)
37111 	{
37112 	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37113 				     op1, 1, OPTAB_DIRECT);
37114 	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
37115 				     op0, 1, OPTAB_DIRECT);
37116 	}
37117 
37118       emit_move_insn (target, op0);
37119       return target;
37120 
37121     case IX86_BUILTIN_MOVDIR64B:
37122 
37123       arg0 = CALL_EXPR_ARG (exp, 0);
37124       arg1 = CALL_EXPR_ARG (exp, 1);
37125       op0 = expand_normal (arg0);
37126       op1 = expand_normal (arg1);
37127 
37128       op0 = ix86_zero_extend_to_Pmode (op0);
37129       if (!address_operand (op1, VOIDmode))
37130 	{
37131 	  op1 = convert_memory_address (Pmode, op1);
37132 	  op1 = copy_addr_to_reg (op1);
37133 	}
37134       op1 = gen_rtx_MEM (XImode, op1);
37135 
37136       insn = (TARGET_64BIT
37137 		? gen_movdir64b_di (op0, op1)
37138 		: gen_movdir64b_si (op0, op1));
37139       emit_insn (insn);
37140       return 0;
37141 
37142     case IX86_BUILTIN_FXSAVE:
37143     case IX86_BUILTIN_FXRSTOR:
37144     case IX86_BUILTIN_FXSAVE64:
37145     case IX86_BUILTIN_FXRSTOR64:
37146     case IX86_BUILTIN_FNSTENV:
37147     case IX86_BUILTIN_FLDENV:
37148       mode0 = BLKmode;
37149       switch (fcode)
37150 	{
37151 	case IX86_BUILTIN_FXSAVE:
37152 	  icode = CODE_FOR_fxsave;
37153 	  break;
37154 	case IX86_BUILTIN_FXRSTOR:
37155 	  icode = CODE_FOR_fxrstor;
37156 	  break;
37157 	case IX86_BUILTIN_FXSAVE64:
37158 	  icode = CODE_FOR_fxsave64;
37159 	  break;
37160 	case IX86_BUILTIN_FXRSTOR64:
37161 	  icode = CODE_FOR_fxrstor64;
37162 	  break;
37163 	case IX86_BUILTIN_FNSTENV:
37164 	  icode = CODE_FOR_fnstenv;
37165 	  break;
37166 	case IX86_BUILTIN_FLDENV:
37167 	  icode = CODE_FOR_fldenv;
37168 	  break;
37169 	default:
37170 	  gcc_unreachable ();
37171 	}
37172 
37173       arg0 = CALL_EXPR_ARG (exp, 0);
37174       op0 = expand_normal (arg0);
37175 
37176       if (!address_operand (op0, VOIDmode))
37177 	{
37178 	  op0 = convert_memory_address (Pmode, op0);
37179 	  op0 = copy_addr_to_reg (op0);
37180 	}
37181       op0 = gen_rtx_MEM (mode0, op0);
37182 
37183       pat = GEN_FCN (icode) (op0);
37184       if (pat)
37185 	emit_insn (pat);
37186       return 0;
37187 
37188     case IX86_BUILTIN_XSETBV:
37189       arg0 = CALL_EXPR_ARG (exp, 0);
37190       arg1 = CALL_EXPR_ARG (exp, 1);
37191       op0 = expand_normal (arg0);
37192       op1 = expand_normal (arg1);
37193 
37194       if (!REG_P (op0))
37195 	op0 = copy_to_mode_reg (SImode, op0);
37196 
37197       op1 = force_reg (DImode, op1);
37198 
37199       if (TARGET_64BIT)
37200 	{
37201 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37202 				     NULL, 1, OPTAB_DIRECT);
37203 
37204 	  icode = CODE_FOR_xsetbv_rex64;
37205 
37206 	  op2 = gen_lowpart (SImode, op2);
37207 	  op1 = gen_lowpart (SImode, op1);
37208 	  pat = GEN_FCN (icode) (op0, op1, op2);
37209 	}
37210       else
37211 	{
37212 	  icode = CODE_FOR_xsetbv;
37213 
37214 	  pat = GEN_FCN (icode) (op0, op1);
37215 	}
37216       if (pat)
37217 	emit_insn (pat);
37218       return 0;
37219 
37220     case IX86_BUILTIN_XSAVE:
37221     case IX86_BUILTIN_XRSTOR:
37222     case IX86_BUILTIN_XSAVE64:
37223     case IX86_BUILTIN_XRSTOR64:
37224     case IX86_BUILTIN_XSAVEOPT:
37225     case IX86_BUILTIN_XSAVEOPT64:
37226     case IX86_BUILTIN_XSAVES:
37227     case IX86_BUILTIN_XRSTORS:
37228     case IX86_BUILTIN_XSAVES64:
37229     case IX86_BUILTIN_XRSTORS64:
37230     case IX86_BUILTIN_XSAVEC:
37231     case IX86_BUILTIN_XSAVEC64:
37232       arg0 = CALL_EXPR_ARG (exp, 0);
37233       arg1 = CALL_EXPR_ARG (exp, 1);
37234       op0 = expand_normal (arg0);
37235       op1 = expand_normal (arg1);
37236 
37237       if (!address_operand (op0, VOIDmode))
37238 	{
37239 	  op0 = convert_memory_address (Pmode, op0);
37240 	  op0 = copy_addr_to_reg (op0);
37241 	}
37242       op0 = gen_rtx_MEM (BLKmode, op0);
37243 
37244       op1 = force_reg (DImode, op1);
37245 
37246       if (TARGET_64BIT)
37247 	{
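	  /* Likewise split the 64-bit requested-feature mask into the
	     EDX:EAX halves expected by the rex64 save/restore patterns.  */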
37248 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37249 				     NULL, 1, OPTAB_DIRECT);
37250 	  switch (fcode)
37251 	    {
37252 	    case IX86_BUILTIN_XSAVE:
37253 	      icode = CODE_FOR_xsave_rex64;
37254 	      break;
37255 	    case IX86_BUILTIN_XRSTOR:
37256 	      icode = CODE_FOR_xrstor_rex64;
37257 	      break;
37258 	    case IX86_BUILTIN_XSAVE64:
37259 	      icode = CODE_FOR_xsave64;
37260 	      break;
37261 	    case IX86_BUILTIN_XRSTOR64:
37262 	      icode = CODE_FOR_xrstor64;
37263 	      break;
37264 	    case IX86_BUILTIN_XSAVEOPT:
37265 	      icode = CODE_FOR_xsaveopt_rex64;
37266 	      break;
37267 	    case IX86_BUILTIN_XSAVEOPT64:
37268 	      icode = CODE_FOR_xsaveopt64;
37269 	      break;
37270 	    case IX86_BUILTIN_XSAVES:
37271 	      icode = CODE_FOR_xsaves_rex64;
37272 	      break;
37273 	    case IX86_BUILTIN_XRSTORS:
37274 	      icode = CODE_FOR_xrstors_rex64;
37275 	      break;
37276 	    case IX86_BUILTIN_XSAVES64:
37277 	      icode = CODE_FOR_xsaves64;
37278 	      break;
37279 	    case IX86_BUILTIN_XRSTORS64:
37280 	      icode = CODE_FOR_xrstors64;
37281 	      break;
37282 	    case IX86_BUILTIN_XSAVEC:
37283 	      icode = CODE_FOR_xsavec_rex64;
37284 	      break;
37285 	    case IX86_BUILTIN_XSAVEC64:
37286 	      icode = CODE_FOR_xsavec64;
37287 	      break;
37288 	    default:
37289 	      gcc_unreachable ();
37290 	    }
37291 
37292 	  op2 = gen_lowpart (SImode, op2);
37293 	  op1 = gen_lowpart (SImode, op1);
37294 	  pat = GEN_FCN (icode) (op0, op1, op2);
37295 	}
37296       else
37297 	{
37298 	  switch (fcode)
37299 	    {
37300 	    case IX86_BUILTIN_XSAVE:
37301 	      icode = CODE_FOR_xsave;
37302 	      break;
37303 	    case IX86_BUILTIN_XRSTOR:
37304 	      icode = CODE_FOR_xrstor;
37305 	      break;
37306 	    case IX86_BUILTIN_XSAVEOPT:
37307 	      icode = CODE_FOR_xsaveopt;
37308 	      break;
37309 	    case IX86_BUILTIN_XSAVES:
37310 	      icode = CODE_FOR_xsaves;
37311 	      break;
37312 	    case IX86_BUILTIN_XRSTORS:
37313 	      icode = CODE_FOR_xrstors;
37314 	      break;
37315 	    case IX86_BUILTIN_XSAVEC:
37316 	      icode = CODE_FOR_xsavec;
37317 	      break;
37318 	    default:
37319 	      gcc_unreachable ();
37320 	    }
37321 	  pat = GEN_FCN (icode) (op0, op1);
37322 	}
37323 
37324       if (pat)
37325 	emit_insn (pat);
37326       return 0;
37327 
37328     case IX86_BUILTIN_LLWPCB:
37329       arg0 = CALL_EXPR_ARG (exp, 0);
37330       op0 = expand_normal (arg0);
37331       icode = CODE_FOR_lwp_llwpcb;
37332       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37333 	op0 = ix86_zero_extend_to_Pmode (op0);
37334       emit_insn (gen_lwp_llwpcb (op0));
37335       return 0;
37336 
37337     case IX86_BUILTIN_SLWPCB:
37338       icode = CODE_FOR_lwp_slwpcb;
37339       if (!target
37340 	  || !insn_data[icode].operand[0].predicate (target, Pmode))
37341 	target = gen_reg_rtx (Pmode);
37342       emit_insn (gen_lwp_slwpcb (target));
37343       return target;
37344 
37345     case IX86_BUILTIN_BEXTRI32:
37346     case IX86_BUILTIN_BEXTRI64:
37347       arg0 = CALL_EXPR_ARG (exp, 0);
37348       arg1 = CALL_EXPR_ARG (exp, 1);
37349       op0 = expand_normal (arg0);
37350       op1 = expand_normal (arg1);
37351       icode = (fcode == IX86_BUILTIN_BEXTRI32
37352 	  ? CODE_FOR_tbm_bextri_si
37353 	  : CODE_FOR_tbm_bextri_di);
37354       if (!CONST_INT_P (op1))
37355         {
37356           error ("last argument must be an immediate");
37357           return const0_rtx;
37358         }
37359       else
37360         {
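          /* The control immediate packs the bit-field start position in
             bits 7:0 and the field length in bits 15:8; split it into the
             two separate operands of the bextri pattern.  */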
37361           unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37362           unsigned char lsb_index = INTVAL (op1) & 0xFF;
37363           op1 = GEN_INT (length);
37364           op2 = GEN_INT (lsb_index);
37365 
37366 	  mode1 = insn_data[icode].operand[1].mode;
37367 	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
37368 	    op0 = copy_to_mode_reg (mode1, op0);
37369 
37370 	  mode0 = insn_data[icode].operand[0].mode;
37371 	  if (target == 0
37372 	      || !register_operand (target, mode0))
37373 	    target = gen_reg_rtx (mode0);
37374 
37375           pat = GEN_FCN (icode) (target, op0, op1, op2);
37376           if (pat)
37377             emit_insn (pat);
37378           return target;
37379         }
37380 
37381     case IX86_BUILTIN_RDRAND16_STEP:
37382       icode = CODE_FOR_rdrandhi_1;
37383       mode0 = HImode;
37384       goto rdrand_step;
37385 
37386     case IX86_BUILTIN_RDRAND32_STEP:
37387       icode = CODE_FOR_rdrandsi_1;
37388       mode0 = SImode;
37389       goto rdrand_step;
37390 
37391     case IX86_BUILTIN_RDRAND64_STEP:
37392       icode = CODE_FOR_rdranddi_1;
37393       mode0 = DImode;
37394 
37395 rdrand_step:
37396       arg0 = CALL_EXPR_ARG (exp, 0);
37397       op1 = expand_normal (arg0);
37398       if (!address_operand (op1, VOIDmode))
37399 	{
37400 	  op1 = convert_memory_address (Pmode, op1);
37401 	  op1 = copy_addr_to_reg (op1);
37402 	}
37403 
37404       op0 = gen_reg_rtx (mode0);
37405       emit_insn (GEN_FCN (icode) (op0));
37406 
37407       emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37408 
37409       op1 = gen_reg_rtx (SImode);
37410       emit_move_insn (op1, CONST1_RTX (SImode));
37411 
37412       /* Emit SImode conditional move.  */
37413       if (mode0 == HImode)
37414 	{
37415 	  if (TARGET_ZERO_EXTEND_WITH_AND
37416 	      && optimize_function_for_speed_p (cfun))
37417 	    {
37418 	      op2 = force_reg (SImode, const0_rtx);
37419 
37420 	      emit_insn (gen_movstricthi
37421 			 (gen_lowpart (HImode, op2), op0));
37422 	    }
37423 	  else
37424 	    {
37425 	      op2 = gen_reg_rtx (SImode);
37426 
37427 	      emit_insn (gen_zero_extendhisi2 (op2, op0));
37428 	    }
37429 	}
37430       else if (mode0 == SImode)
37431 	op2 = op0;
37432       else
37433 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
37434 
37435       if (target == 0
37436 	  || !register_operand (target, SImode))
37437 	target = gen_reg_rtx (SImode);
37438 
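      /* RDRAND sets CF on success and clears the destination on failure,
	 so selecting the (zero) result when CF is clear and the constant 1
	 when CF is set gives the 0/1 return value of the _step builtin.  */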
37439       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37440 			 const0_rtx);
37441       emit_insn (gen_rtx_SET (target,
37442 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37443       return target;
37444 
37445     case IX86_BUILTIN_RDSEED16_STEP:
37446       icode = CODE_FOR_rdseedhi_1;
37447       mode0 = HImode;
37448       goto rdseed_step;
37449 
37450     case IX86_BUILTIN_RDSEED32_STEP:
37451       icode = CODE_FOR_rdseedsi_1;
37452       mode0 = SImode;
37453       goto rdseed_step;
37454 
37455     case IX86_BUILTIN_RDSEED64_STEP:
37456       icode = CODE_FOR_rdseeddi_1;
37457       mode0 = DImode;
37458 
37459 rdseed_step:
37460       arg0 = CALL_EXPR_ARG (exp, 0);
37461       op1 = expand_normal (arg0);
37462       if (!address_operand (op1, VOIDmode))
37463 	{
37464 	  op1 = convert_memory_address (Pmode, op1);
37465 	  op1 = copy_addr_to_reg (op1);
37466 	}
37467 
37468       op0 = gen_reg_rtx (mode0);
37469       emit_insn (GEN_FCN (icode) (op0));
37470 
37471       emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37472 
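      /* The _step return value is just the carry flag set by RDSEED:
	 materialize CF as a QImode 0/1 value and zero-extend it.  */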
37473       op2 = gen_reg_rtx (QImode);
37474 
37475       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37476                          const0_rtx);
37477       emit_insn (gen_rtx_SET (op2, pat));
37478 
37479       if (target == 0
37480 	  || !register_operand (target, SImode))
37481         target = gen_reg_rtx (SImode);
37482 
37483       emit_insn (gen_zero_extendqisi2 (target, op2));
37484       return target;
37485 
37486     case IX86_BUILTIN_SBB32:
37487       icode = CODE_FOR_subborrowsi;
37488       icode2 = CODE_FOR_subborrowsi_0;
37489       mode0 = SImode;
37490       mode1 = DImode;
37491       mode2 = CCmode;
37492       goto handlecarry;
37493 
37494     case IX86_BUILTIN_SBB64:
37495       icode = CODE_FOR_subborrowdi;
37496       icode2 = CODE_FOR_subborrowdi_0;
37497       mode0 = DImode;
37498       mode1 = TImode;
37499       mode2 = CCmode;
37500       goto handlecarry;
37501 
37502     case IX86_BUILTIN_ADDCARRYX32:
37503       icode = CODE_FOR_addcarrysi;
37504       icode2 = CODE_FOR_addcarrysi_0;
37505       mode0 = SImode;
37506       mode1 = DImode;
37507       mode2 = CCCmode;
37508       goto handlecarry;
37509 
37510     case IX86_BUILTIN_ADDCARRYX64:
37511       icode = CODE_FOR_addcarrydi;
37512       icode2 = CODE_FOR_addcarrydi_0;
37513       mode0 = DImode;
37514       mode1 = TImode;
37515       mode2 = CCCmode;
37516 
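      /* Common expansion for the add-with-carry / subtract-with-borrow
	 builtins: the sum or difference is stored through sum_out and the
	 resulting carry/borrow flag is returned.  */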
37517     handlecarry:
37518       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
37519       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
37520       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
37521       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
37522 
37523       op1 = expand_normal (arg0);
37524       if (!integer_zerop (arg0))
37525 	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37526 
37527       op2 = expand_normal (arg1);
37528       if (!register_operand (op2, mode0))
37529 	op2 = copy_to_mode_reg (mode0, op2);
37530 
37531       op3 = expand_normal (arg2);
37532       if (!register_operand (op3, mode0))
37533 	op3 = copy_to_mode_reg (mode0, op3);
37534 
37535       op4 = expand_normal (arg3);
37536       if (!address_operand (op4, VOIDmode))
37537 	{
37538 	  op4 = convert_memory_address (Pmode, op4);
37539 	  op4 = copy_addr_to_reg (op4);
37540 	}
37541 
37542       op0 = gen_reg_rtx (mode0);
37543       if (integer_zerop (arg0))
37544 	{
37545 	  /* If arg0 is 0, optimize right away into an add or sub
37546 	     instruction that sets the flags (CCmode or CCCmode) directly.  */
37547 	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
37548 	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
37549 	}
37550       else
37551 	{
37552 	  /* Generate CF from input operand.  */
37553 	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37554 
37555 	  /* Generate instruction that consumes CF.  */
37556 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37557 	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
37558 	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
37559 	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
37560 	}
37561 
37562       /* Return current CF value.  */
37563       if (target == 0)
37564         target = gen_reg_rtx (QImode);
37565 
37566       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
37567       emit_insn (gen_rtx_SET (target, pat));
37568 
37569       /* Store the result.  */
37570       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37571 
37572       return target;
37573 
37574     case IX86_BUILTIN_READ_FLAGS:
37575       emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37576 
37577       if (optimize
37578 	  || target == NULL_RTX
37579 	  || !nonimmediate_operand (target, word_mode)
37580 	  || GET_MODE (target) != word_mode)
37581 	target = gen_reg_rtx (word_mode);
37582 
37583       emit_insn (gen_pop (target));
37584       return target;
37585 
37586     case IX86_BUILTIN_WRITE_FLAGS:
37587 
37588       arg0 = CALL_EXPR_ARG (exp, 0);
37589       op0 = expand_normal (arg0);
37590       if (!general_no_elim_operand (op0, word_mode))
37591 	op0 = copy_to_mode_reg (word_mode, op0);
37592 
37593       emit_insn (gen_push (op0));
37594       emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37595       return 0;
37596 
37597     case IX86_BUILTIN_KTESTC8:
37598       icode = CODE_FOR_ktestqi;
37599       mode3 = CCCmode;
37600       goto kortest;
37601 
37602     case IX86_BUILTIN_KTESTZ8:
37603       icode = CODE_FOR_ktestqi;
37604       mode3 = CCZmode;
37605       goto kortest;
37606 
37607     case IX86_BUILTIN_KTESTC16:
37608       icode = CODE_FOR_ktesthi;
37609       mode3 = CCCmode;
37610       goto kortest;
37611 
37612     case IX86_BUILTIN_KTESTZ16:
37613       icode = CODE_FOR_ktesthi;
37614       mode3 = CCZmode;
37615       goto kortest;
37616 
37617     case IX86_BUILTIN_KTESTC32:
37618       icode = CODE_FOR_ktestsi;
37619       mode3 = CCCmode;
37620       goto kortest;
37621 
37622     case IX86_BUILTIN_KTESTZ32:
37623       icode = CODE_FOR_ktestsi;
37624       mode3 = CCZmode;
37625       goto kortest;
37626 
37627     case IX86_BUILTIN_KTESTC64:
37628       icode = CODE_FOR_ktestdi;
37629       mode3 = CCCmode;
37630       goto kortest;
37631 
37632     case IX86_BUILTIN_KTESTZ64:
37633       icode = CODE_FOR_ktestdi;
37634       mode3 = CCZmode;
37635       goto kortest;
37636 
37637     case IX86_BUILTIN_KORTESTC8:
37638       icode = CODE_FOR_kortestqi;
37639       mode3 = CCCmode;
37640       goto kortest;
37641 
37642     case IX86_BUILTIN_KORTESTZ8:
37643       icode = CODE_FOR_kortestqi;
37644       mode3 = CCZmode;
37645       goto kortest;
37646 
37647     case IX86_BUILTIN_KORTESTC16:
37648       icode = CODE_FOR_kortesthi;
37649       mode3 = CCCmode;
37650       goto kortest;
37651 
37652     case IX86_BUILTIN_KORTESTZ16:
37653       icode = CODE_FOR_kortesthi;
37654       mode3 = CCZmode;
37655       goto kortest;
37656 
37657     case IX86_BUILTIN_KORTESTC32:
37658       icode = CODE_FOR_kortestsi;
37659       mode3 = CCCmode;
37660       goto kortest;
37661 
37662     case IX86_BUILTIN_KORTESTZ32:
37663       icode = CODE_FOR_kortestsi;
37664       mode3 = CCZmode;
37665       goto kortest;
37666 
37667     case IX86_BUILTIN_KORTESTC64:
37668       icode = CODE_FOR_kortestdi;
37669       mode3 = CCCmode;
37670       goto kortest;
37671 
37672     case IX86_BUILTIN_KORTESTZ64:
37673       icode = CODE_FOR_kortestdi;
37674       mode3 = CCZmode;
37675 
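      /* mode3 (CCCmode or CCZmode) records whether the builtin returns
	 the carry or the zero flag produced by the ktest/kortest
	 instruction.  */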
37676     kortest:
37677       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
37678       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
37679       op0 = expand_normal (arg0);
37680       op1 = expand_normal (arg1);
37681 
37682       mode0 = insn_data[icode].operand[0].mode;
37683       mode1 = insn_data[icode].operand[1].mode;
37684 
37685       if (GET_MODE (op0) != VOIDmode)
37686 	op0 = force_reg (GET_MODE (op0), op0);
37687 
37688       op0 = gen_lowpart (mode0, op0);
37689 
37690       if (!insn_data[icode].operand[0].predicate (op0, mode0))
37691 	op0 = copy_to_mode_reg (mode0, op0);
37692 
37693       if (GET_MODE (op1) != VOIDmode)
37694 	op1 = force_reg (GET_MODE (op1), op1);
37695 
37696       op1 = gen_lowpart (mode1, op1);
37697 
37698       if (!insn_data[icode].operand[1].predicate (op1, mode1))
37699 	op1 = copy_to_mode_reg (mode1, op1);
37700 
37701       target = gen_reg_rtx (QImode);
37702 
37703       /* Emit kortest.  */
37704       emit_insn (GEN_FCN (icode) (op0, op1));
37705       /* And use setcc to return result from flags.  */
37706       ix86_expand_setcc (target, EQ,
37707 			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
37708       return target;
37709 
37710     case IX86_BUILTIN_GATHERSIV2DF:
37711       icode = CODE_FOR_avx2_gathersiv2df;
37712       goto gather_gen;
37713     case IX86_BUILTIN_GATHERSIV4DF:
37714       icode = CODE_FOR_avx2_gathersiv4df;
37715       goto gather_gen;
37716     case IX86_BUILTIN_GATHERDIV2DF:
37717       icode = CODE_FOR_avx2_gatherdiv2df;
37718       goto gather_gen;
37719     case IX86_BUILTIN_GATHERDIV4DF:
37720       icode = CODE_FOR_avx2_gatherdiv4df;
37721       goto gather_gen;
37722     case IX86_BUILTIN_GATHERSIV4SF:
37723       icode = CODE_FOR_avx2_gathersiv4sf;
37724       goto gather_gen;
37725     case IX86_BUILTIN_GATHERSIV8SF:
37726       icode = CODE_FOR_avx2_gathersiv8sf;
37727       goto gather_gen;
37728     case IX86_BUILTIN_GATHERDIV4SF:
37729       icode = CODE_FOR_avx2_gatherdiv4sf;
37730       goto gather_gen;
37731     case IX86_BUILTIN_GATHERDIV8SF:
37732       icode = CODE_FOR_avx2_gatherdiv8sf;
37733       goto gather_gen;
37734     case IX86_BUILTIN_GATHERSIV2DI:
37735       icode = CODE_FOR_avx2_gathersiv2di;
37736       goto gather_gen;
37737     case IX86_BUILTIN_GATHERSIV4DI:
37738       icode = CODE_FOR_avx2_gathersiv4di;
37739       goto gather_gen;
37740     case IX86_BUILTIN_GATHERDIV2DI:
37741       icode = CODE_FOR_avx2_gatherdiv2di;
37742       goto gather_gen;
37743     case IX86_BUILTIN_GATHERDIV4DI:
37744       icode = CODE_FOR_avx2_gatherdiv4di;
37745       goto gather_gen;
37746     case IX86_BUILTIN_GATHERSIV4SI:
37747       icode = CODE_FOR_avx2_gathersiv4si;
37748       goto gather_gen;
37749     case IX86_BUILTIN_GATHERSIV8SI:
37750       icode = CODE_FOR_avx2_gathersiv8si;
37751       goto gather_gen;
37752     case IX86_BUILTIN_GATHERDIV4SI:
37753       icode = CODE_FOR_avx2_gatherdiv4si;
37754       goto gather_gen;
37755     case IX86_BUILTIN_GATHERDIV8SI:
37756       icode = CODE_FOR_avx2_gatherdiv8si;
37757       goto gather_gen;
37758     case IX86_BUILTIN_GATHERALTSIV4DF:
37759       icode = CODE_FOR_avx2_gathersiv4df;
37760       goto gather_gen;
37761     case IX86_BUILTIN_GATHERALTDIV8SF:
37762       icode = CODE_FOR_avx2_gatherdiv8sf;
37763       goto gather_gen;
37764     case IX86_BUILTIN_GATHERALTSIV4DI:
37765       icode = CODE_FOR_avx2_gathersiv4di;
37766       goto gather_gen;
37767     case IX86_BUILTIN_GATHERALTDIV8SI:
37768       icode = CODE_FOR_avx2_gatherdiv8si;
37769       goto gather_gen;
37770     case IX86_BUILTIN_GATHER3SIV16SF:
37771       icode = CODE_FOR_avx512f_gathersiv16sf;
37772       goto gather_gen;
37773     case IX86_BUILTIN_GATHER3SIV8DF:
37774       icode = CODE_FOR_avx512f_gathersiv8df;
37775       goto gather_gen;
37776     case IX86_BUILTIN_GATHER3DIV16SF:
37777       icode = CODE_FOR_avx512f_gatherdiv16sf;
37778       goto gather_gen;
37779     case IX86_BUILTIN_GATHER3DIV8DF:
37780       icode = CODE_FOR_avx512f_gatherdiv8df;
37781       goto gather_gen;
37782     case IX86_BUILTIN_GATHER3SIV16SI:
37783       icode = CODE_FOR_avx512f_gathersiv16si;
37784       goto gather_gen;
37785     case IX86_BUILTIN_GATHER3SIV8DI:
37786       icode = CODE_FOR_avx512f_gathersiv8di;
37787       goto gather_gen;
37788     case IX86_BUILTIN_GATHER3DIV16SI:
37789       icode = CODE_FOR_avx512f_gatherdiv16si;
37790       goto gather_gen;
37791     case IX86_BUILTIN_GATHER3DIV8DI:
37792       icode = CODE_FOR_avx512f_gatherdiv8di;
37793       goto gather_gen;
37794     case IX86_BUILTIN_GATHER3ALTSIV8DF:
37795       icode = CODE_FOR_avx512f_gathersiv8df;
37796       goto gather_gen;
37797     case IX86_BUILTIN_GATHER3ALTDIV16SF:
37798       icode = CODE_FOR_avx512f_gatherdiv16sf;
37799       goto gather_gen;
37800     case IX86_BUILTIN_GATHER3ALTSIV8DI:
37801       icode = CODE_FOR_avx512f_gathersiv8di;
37802       goto gather_gen;
37803     case IX86_BUILTIN_GATHER3ALTDIV16SI:
37804       icode = CODE_FOR_avx512f_gatherdiv16si;
37805       goto gather_gen;
37806     case IX86_BUILTIN_GATHER3SIV2DF:
37807       icode = CODE_FOR_avx512vl_gathersiv2df;
37808       goto gather_gen;
37809     case IX86_BUILTIN_GATHER3SIV4DF:
37810       icode = CODE_FOR_avx512vl_gathersiv4df;
37811       goto gather_gen;
37812     case IX86_BUILTIN_GATHER3DIV2DF:
37813       icode = CODE_FOR_avx512vl_gatherdiv2df;
37814       goto gather_gen;
37815     case IX86_BUILTIN_GATHER3DIV4DF:
37816       icode = CODE_FOR_avx512vl_gatherdiv4df;
37817       goto gather_gen;
37818     case IX86_BUILTIN_GATHER3SIV4SF:
37819       icode = CODE_FOR_avx512vl_gathersiv4sf;
37820       goto gather_gen;
37821     case IX86_BUILTIN_GATHER3SIV8SF:
37822       icode = CODE_FOR_avx512vl_gathersiv8sf;
37823       goto gather_gen;
37824     case IX86_BUILTIN_GATHER3DIV4SF:
37825       icode = CODE_FOR_avx512vl_gatherdiv4sf;
37826       goto gather_gen;
37827     case IX86_BUILTIN_GATHER3DIV8SF:
37828       icode = CODE_FOR_avx512vl_gatherdiv8sf;
37829       goto gather_gen;
37830     case IX86_BUILTIN_GATHER3SIV2DI:
37831       icode = CODE_FOR_avx512vl_gathersiv2di;
37832       goto gather_gen;
37833     case IX86_BUILTIN_GATHER3SIV4DI:
37834       icode = CODE_FOR_avx512vl_gathersiv4di;
37835       goto gather_gen;
37836     case IX86_BUILTIN_GATHER3DIV2DI:
37837       icode = CODE_FOR_avx512vl_gatherdiv2di;
37838       goto gather_gen;
37839     case IX86_BUILTIN_GATHER3DIV4DI:
37840       icode = CODE_FOR_avx512vl_gatherdiv4di;
37841       goto gather_gen;
37842     case IX86_BUILTIN_GATHER3SIV4SI:
37843       icode = CODE_FOR_avx512vl_gathersiv4si;
37844       goto gather_gen;
37845     case IX86_BUILTIN_GATHER3SIV8SI:
37846       icode = CODE_FOR_avx512vl_gathersiv8si;
37847       goto gather_gen;
37848     case IX86_BUILTIN_GATHER3DIV4SI:
37849       icode = CODE_FOR_avx512vl_gatherdiv4si;
37850       goto gather_gen;
37851     case IX86_BUILTIN_GATHER3DIV8SI:
37852       icode = CODE_FOR_avx512vl_gatherdiv8si;
37853       goto gather_gen;
37854     case IX86_BUILTIN_GATHER3ALTSIV4DF:
37855       icode = CODE_FOR_avx512vl_gathersiv4df;
37856       goto gather_gen;
37857     case IX86_BUILTIN_GATHER3ALTDIV8SF:
37858       icode = CODE_FOR_avx512vl_gatherdiv8sf;
37859       goto gather_gen;
37860     case IX86_BUILTIN_GATHER3ALTSIV4DI:
37861       icode = CODE_FOR_avx512vl_gathersiv4di;
37862       goto gather_gen;
37863     case IX86_BUILTIN_GATHER3ALTDIV8SI:
37864       icode = CODE_FOR_avx512vl_gatherdiv8si;
37865       goto gather_gen;
37866     case IX86_BUILTIN_SCATTERSIV16SF:
37867       icode = CODE_FOR_avx512f_scattersiv16sf;
37868       goto scatter_gen;
37869     case IX86_BUILTIN_SCATTERSIV8DF:
37870       icode = CODE_FOR_avx512f_scattersiv8df;
37871       goto scatter_gen;
37872     case IX86_BUILTIN_SCATTERDIV16SF:
37873       icode = CODE_FOR_avx512f_scatterdiv16sf;
37874       goto scatter_gen;
37875     case IX86_BUILTIN_SCATTERDIV8DF:
37876       icode = CODE_FOR_avx512f_scatterdiv8df;
37877       goto scatter_gen;
37878     case IX86_BUILTIN_SCATTERSIV16SI:
37879       icode = CODE_FOR_avx512f_scattersiv16si;
37880       goto scatter_gen;
37881     case IX86_BUILTIN_SCATTERSIV8DI:
37882       icode = CODE_FOR_avx512f_scattersiv8di;
37883       goto scatter_gen;
37884     case IX86_BUILTIN_SCATTERDIV16SI:
37885       icode = CODE_FOR_avx512f_scatterdiv16si;
37886       goto scatter_gen;
37887     case IX86_BUILTIN_SCATTERDIV8DI:
37888       icode = CODE_FOR_avx512f_scatterdiv8di;
37889       goto scatter_gen;
37890     case IX86_BUILTIN_SCATTERSIV8SF:
37891       icode = CODE_FOR_avx512vl_scattersiv8sf;
37892       goto scatter_gen;
37893     case IX86_BUILTIN_SCATTERSIV4SF:
37894       icode = CODE_FOR_avx512vl_scattersiv4sf;
37895       goto scatter_gen;
37896     case IX86_BUILTIN_SCATTERSIV4DF:
37897       icode = CODE_FOR_avx512vl_scattersiv4df;
37898       goto scatter_gen;
37899     case IX86_BUILTIN_SCATTERSIV2DF:
37900       icode = CODE_FOR_avx512vl_scattersiv2df;
37901       goto scatter_gen;
37902     case IX86_BUILTIN_SCATTERDIV8SF:
37903       icode = CODE_FOR_avx512vl_scatterdiv8sf;
37904       goto scatter_gen;
37905     case IX86_BUILTIN_SCATTERDIV4SF:
37906       icode = CODE_FOR_avx512vl_scatterdiv4sf;
37907       goto scatter_gen;
37908     case IX86_BUILTIN_SCATTERDIV4DF:
37909       icode = CODE_FOR_avx512vl_scatterdiv4df;
37910       goto scatter_gen;
37911     case IX86_BUILTIN_SCATTERDIV2DF:
37912       icode = CODE_FOR_avx512vl_scatterdiv2df;
37913       goto scatter_gen;
37914     case IX86_BUILTIN_SCATTERSIV8SI:
37915       icode = CODE_FOR_avx512vl_scattersiv8si;
37916       goto scatter_gen;
37917     case IX86_BUILTIN_SCATTERSIV4SI:
37918       icode = CODE_FOR_avx512vl_scattersiv4si;
37919       goto scatter_gen;
37920     case IX86_BUILTIN_SCATTERSIV4DI:
37921       icode = CODE_FOR_avx512vl_scattersiv4di;
37922       goto scatter_gen;
37923     case IX86_BUILTIN_SCATTERSIV2DI:
37924       icode = CODE_FOR_avx512vl_scattersiv2di;
37925       goto scatter_gen;
37926     case IX86_BUILTIN_SCATTERDIV8SI:
37927       icode = CODE_FOR_avx512vl_scatterdiv8si;
37928       goto scatter_gen;
37929     case IX86_BUILTIN_SCATTERDIV4SI:
37930       icode = CODE_FOR_avx512vl_scatterdiv4si;
37931       goto scatter_gen;
37932     case IX86_BUILTIN_SCATTERDIV4DI:
37933       icode = CODE_FOR_avx512vl_scatterdiv4di;
37934       goto scatter_gen;
37935     case IX86_BUILTIN_SCATTERDIV2DI:
37936       icode = CODE_FOR_avx512vl_scatterdiv2di;
37937       goto scatter_gen;
37938     case IX86_BUILTIN_GATHERPFDPD:
37939       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37940       goto vec_prefetch_gen;
37941     case IX86_BUILTIN_SCATTERALTSIV8DF:
37942       icode = CODE_FOR_avx512f_scattersiv8df;
37943       goto scatter_gen;
37944     case IX86_BUILTIN_SCATTERALTDIV16SF:
37945       icode = CODE_FOR_avx512f_scatterdiv16sf;
37946       goto scatter_gen;
37947     case IX86_BUILTIN_SCATTERALTSIV8DI:
37948       icode = CODE_FOR_avx512f_scattersiv8di;
37949       goto scatter_gen;
37950     case IX86_BUILTIN_SCATTERALTDIV16SI:
37951       icode = CODE_FOR_avx512f_scatterdiv16si;
37952       goto scatter_gen;
37953     case IX86_BUILTIN_GATHERPFDPS:
37954       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37955       goto vec_prefetch_gen;
37956     case IX86_BUILTIN_GATHERPFQPD:
37957       icode = CODE_FOR_avx512pf_gatherpfv8didf;
37958       goto vec_prefetch_gen;
37959     case IX86_BUILTIN_GATHERPFQPS:
37960       icode = CODE_FOR_avx512pf_gatherpfv8disf;
37961       goto vec_prefetch_gen;
37962     case IX86_BUILTIN_SCATTERPFDPD:
37963       icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37964       goto vec_prefetch_gen;
37965     case IX86_BUILTIN_SCATTERPFDPS:
37966       icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37967       goto vec_prefetch_gen;
37968     case IX86_BUILTIN_SCATTERPFQPD:
37969       icode = CODE_FOR_avx512pf_scatterpfv8didf;
37970       goto vec_prefetch_gen;
37971     case IX86_BUILTIN_SCATTERPFQPS:
37972       icode = CODE_FOR_avx512pf_scatterpfv8disf;
37973       goto vec_prefetch_gen;
37974 
37975     gather_gen:
37976       rtx half;
37977       rtx (*gen) (rtx, rtx);
37978 
37979       arg0 = CALL_EXPR_ARG (exp, 0);
37980       arg1 = CALL_EXPR_ARG (exp, 1);
37981       arg2 = CALL_EXPR_ARG (exp, 2);
37982       arg3 = CALL_EXPR_ARG (exp, 3);
37983       arg4 = CALL_EXPR_ARG (exp, 4);
37984       op0 = expand_normal (arg0);
37985       op1 = expand_normal (arg1);
37986       op2 = expand_normal (arg2);
37987       op3 = expand_normal (arg3);
37988       op4 = expand_normal (arg4);
37989       /* Note the arg order is different from the operand order.  */
37990       mode0 = insn_data[icode].operand[1].mode;
37991       mode2 = insn_data[icode].operand[3].mode;
37992       mode3 = insn_data[icode].operand[4].mode;
37993       mode4 = insn_data[icode].operand[5].mode;
37994 
37995       if (target == NULL_RTX
37996 	  || GET_MODE (target) != insn_data[icode].operand[0].mode
37997 	  || !insn_data[icode].operand[0].predicate (target,
37998 						     GET_MODE (target)))
37999 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38000       else
38001 	subtarget = target;
38002 
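      /* For the *ALT* variants the index vector and the source/mask
	 vectors have different element counts; use only the low half of
	 whichever side is wider.  */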
38003       switch (fcode)
38004 	{
38005 	case IX86_BUILTIN_GATHER3ALTSIV8DF:
38006 	case IX86_BUILTIN_GATHER3ALTSIV8DI:
38007 	  half = gen_reg_rtx (V8SImode);
38008 	  if (!nonimmediate_operand (op2, V16SImode))
38009 	    op2 = copy_to_mode_reg (V16SImode, op2);
38010 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
38011 	  op2 = half;
38012 	  break;
38013 	case IX86_BUILTIN_GATHER3ALTSIV4DF:
38014 	case IX86_BUILTIN_GATHER3ALTSIV4DI:
38015 	case IX86_BUILTIN_GATHERALTSIV4DF:
38016 	case IX86_BUILTIN_GATHERALTSIV4DI:
38017 	  half = gen_reg_rtx (V4SImode);
38018 	  if (!nonimmediate_operand (op2, V8SImode))
38019 	    op2 = copy_to_mode_reg (V8SImode, op2);
38020 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
38021 	  op2 = half;
38022 	  break;
38023 	case IX86_BUILTIN_GATHER3ALTDIV16SF:
38024 	case IX86_BUILTIN_GATHER3ALTDIV16SI:
38025 	  half = gen_reg_rtx (mode0);
38026 	  if (mode0 == V8SFmode)
38027 	    gen = gen_vec_extract_lo_v16sf;
38028 	  else
38029 	    gen = gen_vec_extract_lo_v16si;
38030 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
38031 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38032 	  emit_insn (gen (half, op0));
38033 	  op0 = half;
38034 	  if (GET_MODE (op3) != VOIDmode)
38035 	    {
38036 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
38037 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38038 	      emit_insn (gen (half, op3));
38039 	      op3 = half;
38040 	    }
38041 	  break;
38042 	case IX86_BUILTIN_GATHER3ALTDIV8SF:
38043 	case IX86_BUILTIN_GATHER3ALTDIV8SI:
38044 	case IX86_BUILTIN_GATHERALTDIV8SF:
38045 	case IX86_BUILTIN_GATHERALTDIV8SI:
38046 	  half = gen_reg_rtx (mode0);
38047 	  if (mode0 == V4SFmode)
38048 	    gen = gen_vec_extract_lo_v8sf;
38049 	  else
38050 	    gen = gen_vec_extract_lo_v8si;
38051 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
38052 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38053 	  emit_insn (gen (half, op0));
38054 	  op0 = half;
38055 	  if (GET_MODE (op3) != VOIDmode)
38056 	    {
38057 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
38058 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38059 	      emit_insn (gen (half, op3));
38060 	      op3 = half;
38061 	    }
38062 	  break;
38063 	default:
38064 	  break;
38065 	}
38066 
38067       /* Force memory operand only with base register here.  But we
38068 	 don't want to do it on memory operand for other builtin
38069 	 functions.  */
38070       op1 = ix86_zero_extend_to_Pmode (op1);
38071 
38072       if (!insn_data[icode].operand[1].predicate (op0, mode0))
38073 	op0 = copy_to_mode_reg (mode0, op0);
38074       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38075 	op1 = copy_to_mode_reg (Pmode, op1);
38076       if (!insn_data[icode].operand[3].predicate (op2, mode2))
38077 	op2 = copy_to_mode_reg (mode2, op2);
38078 
38079       op3 = fixup_modeless_constant (op3, mode3);
38080 
38081       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38082 	{
38083 	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
38084 	    op3 = copy_to_mode_reg (mode3, op3);
38085 	}
38086       else
38087 	{
38088 	  op3 = copy_to_reg (op3);
38089 	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38090 	}
38091       if (!insn_data[icode].operand[5].predicate (op4, mode4))
38092 	{
38093           error ("the last argument must be scale 1, 2, 4, 8");
38094           return const0_rtx;
38095 	}
38096 
38097       /* Optimize.  If mask is known to have all high bits set,
38098 	 replace op0 with pc_rtx to signal that the instruction
38099 	 overwrites the whole destination and doesn't use its
38100 	 previous contents.  */
38101       if (optimize)
38102 	{
38103 	  if (TREE_CODE (arg3) == INTEGER_CST)
38104 	    {
38105 	      if (integer_all_onesp (arg3))
38106 		op0 = pc_rtx;
38107 	    }
38108 	  else if (TREE_CODE (arg3) == VECTOR_CST)
38109 	    {
38110 	      unsigned int negative = 0;
38111 	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38112 		{
38113 		  tree cst = VECTOR_CST_ELT (arg3, i);
38114 		  if (TREE_CODE (cst) == INTEGER_CST
38115 		      && tree_int_cst_sign_bit (cst))
38116 		    negative++;
38117 		  else if (TREE_CODE (cst) == REAL_CST
38118 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38119 		    negative++;
38120 		}
38121 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38122 		op0 = pc_rtx;
38123 	    }
38124 	  else if (TREE_CODE (arg3) == SSA_NAME
38125 		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38126 	    {
38127 	      /* Recognize also when mask is like:
38128 		 __v2df src = _mm_setzero_pd ();
38129 		 __v2df mask = _mm_cmpeq_pd (src, src);
38130 		 or
38131 		 __v8sf src = _mm256_setzero_ps ();
38132 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38133 		 as that is a cheaper way to load all ones into
38134 		 a register than having to load a constant from
38135 		 memory.  */
38136 	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38137 	      if (is_gimple_call (def_stmt))
38138 		{
38139 		  tree fndecl = gimple_call_fndecl (def_stmt);
38140 		  if (fndecl
38141 		      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38142 		    switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38143 		      {
38144 		      case IX86_BUILTIN_CMPPD:
38145 		      case IX86_BUILTIN_CMPPS:
38146 		      case IX86_BUILTIN_CMPPD256:
38147 		      case IX86_BUILTIN_CMPPS256:
38148 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38149 			  break;
38150 			/* FALLTHRU */
38151 		      case IX86_BUILTIN_CMPEQPD:
38152 		      case IX86_BUILTIN_CMPEQPS:
38153 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38154 			    && initializer_zerop (gimple_call_arg (def_stmt,
38155 								   1)))
38156 			  op0 = pc_rtx;
38157 			break;
38158 		      default:
38159 			break;
38160 		      }
38161 		}
38162 	    }
38163 	}
38164 
38165       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38166       if (! pat)
38167 	return const0_rtx;
38168       emit_insn (pat);
38169 
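      /* Some DIV gathers produce a full-width vector of which the builtin
	 only returns the low half; extract it into TARGET.  */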
38170       switch (fcode)
38171 	{
38172 	case IX86_BUILTIN_GATHER3DIV16SF:
38173 	  if (target == NULL_RTX)
38174 	    target = gen_reg_rtx (V8SFmode);
38175 	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38176 	  break;
38177 	case IX86_BUILTIN_GATHER3DIV16SI:
38178 	  if (target == NULL_RTX)
38179 	    target = gen_reg_rtx (V8SImode);
38180 	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38181 	  break;
38182 	case IX86_BUILTIN_GATHER3DIV8SF:
38183 	case IX86_BUILTIN_GATHERDIV8SF:
38184 	  if (target == NULL_RTX)
38185 	    target = gen_reg_rtx (V4SFmode);
38186 	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38187 	  break;
38188 	case IX86_BUILTIN_GATHER3DIV8SI:
38189 	case IX86_BUILTIN_GATHERDIV8SI:
38190 	  if (target == NULL_RTX)
38191 	    target = gen_reg_rtx (V4SImode);
38192 	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38193 	  break;
38194 	default:
38195 	  target = subtarget;
38196 	  break;
38197 	}
38198       return target;
38199 
38200     scatter_gen:
38201       arg0 = CALL_EXPR_ARG (exp, 0);
38202       arg1 = CALL_EXPR_ARG (exp, 1);
38203       arg2 = CALL_EXPR_ARG (exp, 2);
38204       arg3 = CALL_EXPR_ARG (exp, 3);
38205       arg4 = CALL_EXPR_ARG (exp, 4);
38206       op0 = expand_normal (arg0);
38207       op1 = expand_normal (arg1);
38208       op2 = expand_normal (arg2);
38209       op3 = expand_normal (arg3);
38210       op4 = expand_normal (arg4);
38211       mode1 = insn_data[icode].operand[1].mode;
38212       mode2 = insn_data[icode].operand[2].mode;
38213       mode3 = insn_data[icode].operand[3].mode;
38214       mode4 = insn_data[icode].operand[4].mode;
38215 
38216       /* Scatter instruction stores operand op3 to memory with
38217 	 indices from op2 and scale from op4 under writemask op1.
38218 	 If index operand op2 has more elements than source operand
38219 	 op3, only its low half needs to be used, and vice versa.  */
38220       switch (fcode)
38221 	{
38222 	case IX86_BUILTIN_SCATTERALTSIV8DF:
38223 	case IX86_BUILTIN_SCATTERALTSIV8DI:
38224 	  half = gen_reg_rtx (V8SImode);
38225 	  if (!nonimmediate_operand (op2, V16SImode))
38226 	    op2 = copy_to_mode_reg (V16SImode, op2);
38227 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
38228 	  op2 = half;
38229 	  break;
38230 	case IX86_BUILTIN_SCATTERALTDIV16SF:
38231 	case IX86_BUILTIN_SCATTERALTDIV16SI:
38232 	  half = gen_reg_rtx (mode3);
38233 	  if (mode3 == V8SFmode)
38234 	    gen = gen_vec_extract_lo_v16sf;
38235 	  else
38236 	    gen = gen_vec_extract_lo_v16si;
38237 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
38238 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38239 	  emit_insn (gen (half, op3));
38240 	  op3 = half;
38241 	  break;
38242 	default:
38243 	  break;
38244 	}
38245 
38246       /* Force memory operand only with base register here.  But we
38247 	 don't want to do it on memory operand for other builtin
38248 	 functions.  */
38249       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38250 
38251       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38252 	op0 = copy_to_mode_reg (Pmode, op0);
38253 
38254       op1 = fixup_modeless_constant (op1, mode1);
38255 
38256       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38257 	{
38258 	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
38259 	    op1 = copy_to_mode_reg (mode1, op1);
38260 	}
38261       else
38262 	{
38263 	  op1 = copy_to_reg (op1);
38264 	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38265 	}
38266 
38267       if (!insn_data[icode].operand[2].predicate (op2, mode2))
38268 	op2 = copy_to_mode_reg (mode2, op2);
38269 
38270       if (!insn_data[icode].operand[3].predicate (op3, mode3))
38271 	op3 = copy_to_mode_reg (mode3, op3);
38272 
38273       if (!insn_data[icode].operand[4].predicate (op4, mode4))
38274 	{
38275 	  error ("the last argument must be scale 1, 2, 4, 8");
38276 	  return const0_rtx;
38277 	}
38278 
38279       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38280       if (! pat)
38281 	return const0_rtx;
38282 
38283       emit_insn (pat);
38284       return 0;
38285 
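      /* Expansion for the gather/scatter prefetch builtins; the operands
	 are (mask, index vector, base address, scale, locality hint).  */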
38286     vec_prefetch_gen:
38287       arg0 = CALL_EXPR_ARG (exp, 0);
38288       arg1 = CALL_EXPR_ARG (exp, 1);
38289       arg2 = CALL_EXPR_ARG (exp, 2);
38290       arg3 = CALL_EXPR_ARG (exp, 3);
38291       arg4 = CALL_EXPR_ARG (exp, 4);
38292       op0 = expand_normal (arg0);
38293       op1 = expand_normal (arg1);
38294       op2 = expand_normal (arg2);
38295       op3 = expand_normal (arg3);
38296       op4 = expand_normal (arg4);
38297       mode0 = insn_data[icode].operand[0].mode;
38298       mode1 = insn_data[icode].operand[1].mode;
38299       mode3 = insn_data[icode].operand[3].mode;
38300       mode4 = insn_data[icode].operand[4].mode;
38301 
38302       op0 = fixup_modeless_constant (op0, mode0);
38303 
38304       if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38305 	{
38306 	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
38307 	    op0 = copy_to_mode_reg (mode0, op0);
38308 	}
38309       else
38310 	{
38311 	  op0 = copy_to_reg (op0);
38312 	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38313 	}
38314 
38315       if (!insn_data[icode].operand[1].predicate (op1, mode1))
38316 	op1 = copy_to_mode_reg (mode1, op1);
38317 
38318       /* Force memory operand only with base register here.  But we
38319 	 don't want to do it on memory operand for other builtin
38320 	 functions.  */
38321       op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38322 
38323       if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38324 	op2 = copy_to_mode_reg (Pmode, op2);
38325 
38326       if (!insn_data[icode].operand[3].predicate (op3, mode3))
38327 	{
38328 	  error ("the forth argument must be scale 1, 2, 4, 8");
38329 	  return const0_rtx;
38330 	}
38331 
38332       if (!insn_data[icode].operand[4].predicate (op4, mode4))
38333 	{
38334 	  error ("incorrect hint operand");
38335 	  return const0_rtx;
38336 	}
38337 
38338       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38339       if (! pat)
38340 	return const0_rtx;
38341 
38342       emit_insn (pat);
38343 
38344       return 0;
38345 
38346     case IX86_BUILTIN_XABORT:
38347       icode = CODE_FOR_xabort;
38348       arg0 = CALL_EXPR_ARG (exp, 0);
38349       op0 = expand_normal (arg0);
38350       mode0 = insn_data[icode].operand[0].mode;
38351       if (!insn_data[icode].operand[0].predicate (op0, mode0))
38352 	{
38353 	  error ("the xabort's argument must be an 8-bit immediate");
38354 	  return const0_rtx;
38355 	}
38356       emit_insn (gen_xabort (op0));
38357       return 0;
38358 
38359     case IX86_BUILTIN_RSTORSSP:
38360     case IX86_BUILTIN_CLRSSBSY:
38361       arg0 = CALL_EXPR_ARG (exp, 0);
38362       op0 = expand_normal (arg0);
38363       icode = (fcode == IX86_BUILTIN_RSTORSSP
38364 	  ? CODE_FOR_rstorssp
38365 	  : CODE_FOR_clrssbsy);
38366       if (!address_operand (op0, VOIDmode))
38367 	{
38368 	  op1 = convert_memory_address (Pmode, op0);
38369 	  op0 = copy_addr_to_reg (op1);
38370 	}
38371       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
38372       return 0;
38373 
38374     case IX86_BUILTIN_WRSSD:
38375     case IX86_BUILTIN_WRSSQ:
38376     case IX86_BUILTIN_WRUSSD:
38377     case IX86_BUILTIN_WRUSSQ:
38378       arg0 = CALL_EXPR_ARG (exp, 0);
38379       op0 = expand_normal (arg0);
38380       arg1 = CALL_EXPR_ARG (exp, 1);
38381       op1 = expand_normal (arg1);
38382       switch (fcode)
38383 	{
38384 	case IX86_BUILTIN_WRSSD:
38385 	  icode = CODE_FOR_wrsssi;
38386 	  mode = SImode;
38387 	  break;
38388 	case IX86_BUILTIN_WRSSQ:
38389 	  icode = CODE_FOR_wrssdi;
38390 	  mode = DImode;
38391 	  break;
38392 	case IX86_BUILTIN_WRUSSD:
38393 	  icode = CODE_FOR_wrusssi;
38394 	  mode = SImode;
38395 	  break;
38396 	case IX86_BUILTIN_WRUSSQ:
38397 	  icode = CODE_FOR_wrussdi;
38398 	  mode = DImode;
38399 	  break;
38400 	}
38401       op0 = force_reg (mode, op0);
38402       if (!address_operand (op1, VOIDmode))
38403 	{
38404 	  op2 = convert_memory_address (Pmode, op1);
38405 	  op1 = copy_addr_to_reg (op2);
38406 	}
38407       emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
38408       return 0;
38409 
38410     default:
38411       break;
38412     }
38413 
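  /* Builtins not handled above are expanded through the descriptor
     tables; the index within each table is FCODE minus the table's
     first code.  */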
38414   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38415       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38416     {
38417       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38418       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38419 					       target);
38420     }
38421 
38422   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
38423       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
38424     {
38425       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
38426       return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
38427 					       target);
38428     }
38429 
38430   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38431       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38432     {
38433       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38434       switch (fcode)
38435 	{
38436 	case IX86_BUILTIN_FABSQ:
38437 	case IX86_BUILTIN_COPYSIGNQ:
38438 	  if (!TARGET_SSE)
38439 	    /* Emit a normal call if SSE isn't available.  */
38440 	    return expand_call (exp, target, ignore);
38441 	  /* FALLTHRU */
38442 	default:
38443 	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38444 	}
38445     }
38446 
38447   if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38448       && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38449     {
38450       i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38451       rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38452       rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38453       rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38454       int masked = 1;
38455       machine_mode mode, wide_mode, nar_mode;
38456 
38457       nar_mode  = V4SFmode;
38458       mode      = V16SFmode;
38459       wide_mode = V64SFmode;
38460       fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
38461       fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38462 
38463       switch (fcode)
38464 	{
38465 	case IX86_BUILTIN_4FMAPS:
38466 	  fcn = gen_avx5124fmaddps_4fmaddps;
38467 	  masked = 0;
38468 	  goto v4fma_expand;
38469 
38470 	case IX86_BUILTIN_4DPWSSD:
38471 	  nar_mode  = V4SImode;
38472 	  mode      = V16SImode;
38473 	  wide_mode = V64SImode;
38474 	  fcn = gen_avx5124vnniw_vp4dpwssd;
38475 	  masked = 0;
38476 	  goto v4fma_expand;
38477 
38478 	case IX86_BUILTIN_4DPWSSDS:
38479 	  nar_mode  = V4SImode;
38480 	  mode      = V16SImode;
38481 	  wide_mode = V64SImode;
38482 	  fcn = gen_avx5124vnniw_vp4dpwssds;
38483 	  masked = 0;
38484 	  goto v4fma_expand;
38485 
38486 	case IX86_BUILTIN_4FNMAPS:
38487 	  fcn = gen_avx5124fmaddps_4fnmaddps;
38488 	  masked = 0;
38489 	  goto v4fma_expand;
38490 
38491 	case IX86_BUILTIN_4FNMAPS_MASK:
38492 	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
38493 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38494 	  goto v4fma_expand;
38495 
38496 	case IX86_BUILTIN_4DPWSSD_MASK:
38497 	  nar_mode  = V4SImode;
38498 	  mode      = V16SImode;
38499 	  wide_mode = V64SImode;
38500 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
38501 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38502 	  goto v4fma_expand;
38503 
38504 	case IX86_BUILTIN_4DPWSSDS_MASK:
38505 	  nar_mode  = V4SImode;
38506 	  mode      = V16SImode;
38507 	  wide_mode = V64SImode;
38508 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
38509 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38510 	  goto v4fma_expand;
38511 
38512 	case IX86_BUILTIN_4FMAPS_MASK:
38513 	  {
38514 	    tree args[4];
38515 	    rtx ops[4];
38516 	    rtx wide_reg;
38517 	    rtx accum;
38518 	    rtx addr;
38519 	    rtx mem;
38520 
38521 v4fma_expand:
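	    /* The four vector sources form one operand group (a block of
	       four consecutive registers); model it as a single wide
	       register filled 64 bytes at a time.  */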
38522 	    wide_reg = gen_reg_rtx (wide_mode);
38523 	    for (i = 0; i < 4; i++)
38524 	      {
38525 		args[i] = CALL_EXPR_ARG (exp, i);
38526 		ops[i] = expand_normal (args[i]);
38527 
38528 		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38529 				ops[i]);
38530 	      }
38531 
38532 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38533 	    accum = force_reg (mode, accum);
38534 
38535 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38536 	    addr = force_reg (Pmode, addr);
38537 
38538 	    mem = gen_rtx_MEM (nar_mode, addr);
38539 
38540 	    target = gen_reg_rtx (mode);
38541 
38542 	    emit_move_insn (target, accum);
38543 
38544 	    if (! masked)
38545 	      emit_insn (fcn (target, accum, wide_reg, mem));
38546 	    else
38547 	      {
38548 		rtx merge, mask;
38549 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38550 
38551 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38552 
38553 		if (CONST_INT_P (mask))
38554 		  mask = fixup_modeless_constant (mask, HImode);
38555 
38556 		mask = force_reg (HImode, mask);
38557 
38558 		if (GET_MODE (mask) != HImode)
38559 		  mask = gen_rtx_SUBREG (HImode, mask, 0);
38560 
38561 		/* If merge is 0 then we're about to emit z-masked variant.  */
38562 		if (const0_operand (merge, mode))
38563 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38564 		/* If merge is the same as accum then emit merge-masked variant.  */
38565 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38566 		  {
38567 		    merge = force_reg (mode, merge);
38568 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38569 		  }
38570 		/* Merge with something unknown might happen if we z-mask w/ -O0.  */
38571 		else
38572 		  {
38573 		    target = gen_reg_rtx (mode);
38574 		    emit_move_insn (target, merge);
38575 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38576 		  }
38577 	      }
38578 	    return target;
38579 	  }
38580 
38581 	case IX86_BUILTIN_4FNMASS:
38582 	  fcn = gen_avx5124fmaddps_4fnmaddss;
38583 	  masked = 0;
38584 	  goto s4fma_expand;
38585 
38586 	case IX86_BUILTIN_4FMASS:
38587 	  fcn = gen_avx5124fmaddps_4fmaddss;
38588 	  masked = 0;
38589 	  goto s4fma_expand;
38590 
38591 	case IX86_BUILTIN_4FNMASS_MASK:
38592 	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38593 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38594 	  goto s4fma_expand;
38595 
38596 	case IX86_BUILTIN_4FMASS_MASK:
38597 	  {
38598 	    tree args[4];
38599 	    rtx ops[4];
38600 	    rtx wide_reg;
38601 	    rtx accum;
38602 	    rtx addr;
38603 	    rtx mem;
38604 
38605 	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38606 	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38607 
38608 s4fma_expand:
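	    /* Scalar variant: only the low SFmode element of each source
	       contributes, but the operand group is still modelled as one
	       V64SFmode register.  */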
38609 	    mode = V4SFmode;
38610 	    wide_reg = gen_reg_rtx (V64SFmode);
38611 	    for (i = 0; i < 4; i++)
38612 	      {
38613 		rtx tmp;
38614 		args[i] = CALL_EXPR_ARG (exp, i);
38615 		ops[i] = expand_normal (args[i]);
38616 
38617 		tmp = gen_reg_rtx (SFmode);
38618 		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38619 
38620 		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38621 				gen_rtx_SUBREG (V16SFmode, tmp, 0));
38622 	      }
38623 
38624 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38625 	    accum = force_reg (V4SFmode, accum);
38626 
38627 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38628 	    addr = force_reg (Pmode, addr);
38629 
38630 	    mem = gen_rtx_MEM (V4SFmode, addr);
38631 
38632 	    target = gen_reg_rtx (V4SFmode);
38633 
38634 	    emit_move_insn (target, accum);
38635 
38636 	    if (! masked)
38637 	      emit_insn (fcn (target, accum, wide_reg, mem));
38638 	    else
38639 	      {
38640 		rtx merge, mask;
38641 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38642 
38643 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38644 
38645 		if (CONST_INT_P (mask))
38646 		  mask = fixup_modeless_constant (mask, QImode);
38647 
38648 		mask = force_reg (QImode, mask);
38649 
38650 		if (GET_MODE (mask) != QImode)
38651 		  mask = gen_rtx_SUBREG (QImode, mask, 0);
38652 
38653 		/* If merge is 0 then we're about to emit z-masked variant.  */
38654 		if (const0_operand (merge, mode))
38655 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38656 		/* If merge is the same as accum then emit merge-masked
38657 		   variant.  */
38658 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38659 		  {
38660 		    merge = force_reg (mode, merge);
38661 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38662 		  }
38663 		/* Merge with something unknown might happen if we z-mask
38664 		   w/ -O0.  */
38665 		else
38666 		  {
38667 		    target = gen_reg_rtx (mode);
38668 		    emit_move_insn (target, merge);
38669 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38670 		  }
38671 	      }
38672 	    return target;
38673 	  }
38674 	case IX86_BUILTIN_RDPID:
38675 	  return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38676 						    target);
38677 	default:
38678 	  return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38679 	}
38680     }
38681 
38682   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38683       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38684     {
38685       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38686       return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38687     }
38688 
38689   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38690       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38691     {
38692       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38693       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38694     }
38695 
38696   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38697       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38698     {
38699       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38700       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38701     }
38702 
38703   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38704       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38705     {
38706       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38707       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38708     }
38709 
38710   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38711       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38712     {
38713       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38714       const struct builtin_description *d = bdesc_multi_arg + i;
38715       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38716 					    (enum ix86_builtin_func_type)
38717 					    d->flag, d->comparison);
38718     }
38719 
38720   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
38721       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
38722     {
38723       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
38724       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
38725 					       target);
38726     }
38727 
38728   if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
38729       && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
38730     {
38731       i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
38732       return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
38733 				       target);
38734     }
38735 
38736   gcc_unreachable ();
38737 }
38738 
38739 /* This returns the target-specific builtin with code CODE if
38740    current_function_decl has visibility on this builtin, which is checked
38741    using isa flags.  Returns NULL_TREE otherwise.  */
38742 
38743 static tree ix86_get_builtin (enum ix86_builtins code)
38744 {
38745   struct cl_target_option *opts;
38746   tree target_tree = NULL_TREE;
38747 
38748   /* Determine the isa flags of current_function_decl.  */
38749 
38750   if (current_function_decl)
38751     target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38752 
38753   if (target_tree == NULL)
38754     target_tree = target_option_default_node;
38755 
38756   opts = TREE_TARGET_OPTION (target_tree);
38757 
38758   if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38759       || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38760     return ix86_builtin_decl (code, true);
38761   else
38762     return NULL_TREE;
38763 }
38764 
38765 /* Return the function decl for the target-specific builtin
38766    corresponding to the MPX builtin passed in FCODE.  */
38767 static tree
38768 ix86_builtin_mpx_function (unsigned fcode)
38769 {
38770   switch (fcode)
38771     {
38772     case BUILT_IN_CHKP_BNDMK:
38773       return ix86_builtins[IX86_BUILTIN_BNDMK];
38774 
38775     case BUILT_IN_CHKP_BNDSTX:
38776       return ix86_builtins[IX86_BUILTIN_BNDSTX];
38777 
38778     case BUILT_IN_CHKP_BNDLDX:
38779       return ix86_builtins[IX86_BUILTIN_BNDLDX];
38780 
38781     case BUILT_IN_CHKP_BNDCL:
38782       return ix86_builtins[IX86_BUILTIN_BNDCL];
38783 
38784     case BUILT_IN_CHKP_BNDCU:
38785       return ix86_builtins[IX86_BUILTIN_BNDCU];
38786 
38787     case BUILT_IN_CHKP_BNDRET:
38788       return ix86_builtins[IX86_BUILTIN_BNDRET];
38789 
38790     case BUILT_IN_CHKP_INTERSECT:
38791       return ix86_builtins[IX86_BUILTIN_BNDINT];
38792 
38793     case BUILT_IN_CHKP_NARROW:
38794       return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38795 
38796     case BUILT_IN_CHKP_SIZEOF:
38797       return ix86_builtins[IX86_BUILTIN_SIZEOF];
38798 
38799     case BUILT_IN_CHKP_EXTRACT_LOWER:
38800       return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38801 
38802     case BUILT_IN_CHKP_EXTRACT_UPPER:
38803       return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38804 
38805     default:
38806       return NULL_TREE;
38807     }
38808 
38809   gcc_unreachable ();
38810 }
38811 
38812 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38813 
38814    Return an address to be used to load/store bounds for pointer
38815    passed in SLOT.
38816 
38817    SLOT_NO is an integer constant holding the number of a target
38818    dependent special slot to be used in case SLOT is not a memory.
38819 
38820    SPECIAL_BASE is a pointer to be used as a base of fake address
38821    to access special slots in Bounds Table.  SPECIAL_BASE[-1],
38822    SPECIAL_BASE[-2] etc. will be used as fake pointer locations.  */
38823 
38824 static rtx
38825 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38826 {
38827   rtx addr = NULL;
38828 
38829   /* NULL slot means we pass bounds for pointer not passed to the
38830      function at all.  Register slot means we pass pointer in a
38831      register.  In both these cases bounds are passed via Bounds
38832      Table.  Since we do not have actual pointer stored in memory,
38833      we have to use fake addresses to access Bounds Table.  We
38834      start with (special_base - sizeof (void*)) and decrease this
38835      address by pointer size to get addresses for other slots.  */
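  /* For example, with 64-bit pointers slot 0 maps to special_base - 8,
     slot 1 to special_base - 16, and so on.  */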
38836   if (!slot || REG_P (slot))
38837     {
38838       gcc_assert (CONST_INT_P (slot_no));
38839       addr = plus_constant (Pmode, special_base,
38840 			    -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38841     }
38842   /* If pointer is passed in a memory then its address is used to
38843      access Bounds Table.  */
38844   else if (MEM_P (slot))
38845     {
38846       addr = XEXP (slot, 0);
38847       if (!register_operand (addr, Pmode))
38848 	addr = copy_addr_to_reg (addr);
38849     }
38850   else
38851     gcc_unreachable ();
38852 
38853   return addr;
38854 }
38855 
38856 /* Expand pass uses this hook to load bounds for function parameter
38857    PTR passed in SLOT in case its bounds are not passed in a register.
38858 
38859    If SLOT is a memory, then bounds are loaded as for regular pointer
38860    loaded from memory.  PTR may be NULL in case SLOT is a memory.
38861    In such case value of PTR (if required) may be loaded from SLOT.
38862 
38863    If SLOT is NULL or a register then SLOT_NO is an integer constant
38864    holding number of the target dependent special slot which should be
38865    used to obtain bounds.
38866 
38867    Return loaded bounds.  */
38868 
38869 static rtx
38870 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38871 {
38872   rtx reg = gen_reg_rtx (BNDmode);
38873   rtx addr;
38874 
38875   /* Get address to be used to access Bounds Table.  Special slots start
38876      at the location of return address of the current function.  */
38877   addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38878 
38879   /* Load pointer value from a memory if we don't have it.  */
38880   if (!ptr)
38881     {
38882       gcc_assert (MEM_P (slot));
38883       ptr = copy_addr_to_reg (slot);
38884     }
38885 
38886   if (!register_operand (ptr, Pmode))
38887     ptr = ix86_zero_extend_to_Pmode (ptr);
38888 
38889   emit_insn (BNDmode == BND64mode
38890 	     ? gen_bnd64_ldx (reg, addr, ptr)
38891 	     : gen_bnd32_ldx (reg, addr, ptr));
38892 
38893   return reg;
38894 }
38895 
38896 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38897    passed in SLOT in case BOUNDS are not passed in a register.
38898 
38899    If SLOT is a memory, then BOUNDS are stored as for regular pointer
38900    stored in memory.  PTR may be NULL in case SLOT is a memory.
38901    In such case value of PTR (if required) may be loaded from SLOT.
38902 
38903    If SLOT is NULL or a register then SLOT_NO is an integer constant
38904    holding number of the target dependent special slot which should be
38905    used to store BOUNDS.  */
38906 
38907 static void
38908 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38909 {
38910   rtx addr;
38911 
38912   /* Get address to be used to access Bounds Table.  Special slots start
38913      at the location of return address of a called function.  */
38914   addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38915 
38916   /* Load pointer value from a memory if we don't have it.  */
38917   if (!ptr)
38918     {
38919       gcc_assert (MEM_P (slot));
38920       ptr = copy_addr_to_reg (slot);
38921     }
38922 
38923   if (!register_operand (ptr, Pmode))
38924     ptr = ix86_zero_extend_to_Pmode (ptr);
38925 
38926   gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38927   if (!register_operand (bounds, BNDmode))
38928     bounds = copy_to_mode_reg (BNDmode, bounds);
38929 
38930   emit_insn (BNDmode == BND64mode
38931 	     ? gen_bnd64_stx (addr, ptr, bounds)
38932 	     : gen_bnd32_stx (addr, ptr, bounds));
38933 }
38934 
38935 /* Load and return bounds returned by function in SLOT.  */
38936 
38937 static rtx
38938 ix86_load_returned_bounds (rtx slot)
38939 {
38940   rtx res;
38941 
38942   gcc_assert (REG_P (slot));
38943   res = gen_reg_rtx (BNDmode);
38944   emit_move_insn (res, slot);
38945 
38946   return res;
38947 }
38948 
38949 /* Store BOUNDS returned by function into SLOT.  */
38950 
38951 static void
38952 ix86_store_returned_bounds (rtx slot, rtx bounds)
38953 {
38954   gcc_assert (REG_P (slot));
38955   emit_move_insn (slot, bounds);
38956 }
38957 
38958 /* Returns a function decl for a vectorized version of the combined function
38959    with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38960    if it is not available.  */
38961 
38962 static tree
38963 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38964 				  tree type_in)
38965 {
38966   machine_mode in_mode, out_mode;
38967   int in_n, out_n;
38968 
38969   if (TREE_CODE (type_out) != VECTOR_TYPE
38970       || TREE_CODE (type_in) != VECTOR_TYPE)
38971     return NULL_TREE;
38972 
38973   out_mode = TYPE_MODE (TREE_TYPE (type_out));
38974   out_n = TYPE_VECTOR_SUBPARTS (type_out);
38975   in_mode = TYPE_MODE (TREE_TYPE (type_in));
38976   in_n = TYPE_VECTOR_SUBPARTS (type_in);
38977 
38978   switch (fn)
38979     {
38980     CASE_CFN_EXP2:
38981       if (out_mode == SFmode && in_mode == SFmode)
38982 	{
38983 	  if (out_n == 16 && in_n == 16)
38984 	    return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38985 	}
38986       break;
38987 
38988     CASE_CFN_IFLOOR:
38989     CASE_CFN_LFLOOR:
38990     CASE_CFN_LLFLOOR:
38991       /* The round insn does not trap on denormals.  */
38992       if (flag_trapping_math || !TARGET_SSE4_1)
38993 	break;
38994 
38995       if (out_mode == SImode && in_mode == DFmode)
38996 	{
38997 	  if (out_n == 4 && in_n == 2)
38998 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38999 	  else if (out_n == 8 && in_n == 4)
39000 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39001 	  else if (out_n == 16 && in_n == 8)
39002 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39003 	}
39004       if (out_mode == SImode && in_mode == SFmode)
39005 	{
39006 	  if (out_n == 4 && in_n == 4)
39007 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39008 	  else if (out_n == 8 && in_n == 8)
39009 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39010 	  else if (out_n == 16 && in_n == 16)
39011 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39012 	}
39013       break;
39014 
39015     CASE_CFN_ICEIL:
39016     CASE_CFN_LCEIL:
39017     CASE_CFN_LLCEIL:
39018       /* The round insn does not trap on denormals.  */
39019       if (flag_trapping_math || !TARGET_SSE4_1)
39020 	break;
39021 
39022       if (out_mode == SImode && in_mode == DFmode)
39023 	{
39024 	  if (out_n == 4 && in_n == 2)
39025 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39026 	  else if (out_n == 8 && in_n == 4)
39027 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39028 	  else if (out_n == 16 && in_n == 8)
39029 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39030 	}
39031       if (out_mode == SImode && in_mode == SFmode)
39032 	{
39033 	  if (out_n == 4 && in_n == 4)
39034 	    return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39035 	  else if (out_n == 8 && in_n == 8)
39036 	    return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39037 	  else if (out_n == 16 && in_n == 16)
39038 	    return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39039 	}
39040       break;
39041 
39042     CASE_CFN_IRINT:
39043     CASE_CFN_LRINT:
39044     CASE_CFN_LLRINT:
39045       if (out_mode == SImode && in_mode == DFmode)
39046 	{
39047 	  if (out_n == 4 && in_n == 2)
39048 	    return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39049 	  else if (out_n == 8 && in_n == 4)
39050 	    return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39051 	  else if (out_n == 16 && in_n == 8)
39052 	    return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39053 	}
39054       if (out_mode == SImode && in_mode == SFmode)
39055 	{
39056 	  if (out_n == 4 && in_n == 4)
39057 	    return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39058 	  else if (out_n == 8 && in_n == 8)
39059 	    return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39060 	  else if (out_n == 16 && in_n == 16)
39061 	    return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39062 	}
39063       break;
39064 
39065     CASE_CFN_IROUND:
39066     CASE_CFN_LROUND:
39067     CASE_CFN_LLROUND:
39068       /* The round insn does not trap on denormals.  */
39069       if (flag_trapping_math || !TARGET_SSE4_1)
39070 	break;
39071 
39072       if (out_mode == SImode && in_mode == DFmode)
39073 	{
39074 	  if (out_n == 4 && in_n == 2)
39075 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39076 	  else if (out_n == 8 && in_n == 4)
39077 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39078 	  else if (out_n == 16 && in_n == 8)
39079 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39080 	}
39081       if (out_mode == SImode && in_mode == SFmode)
39082 	{
39083 	  if (out_n == 4 && in_n == 4)
39084 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39085 	  else if (out_n == 8 && in_n == 8)
39086 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39087 	  else if (out_n == 16 && in_n == 16)
39088 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39089 	}
39090       break;
39091 
39092     CASE_CFN_FLOOR:
39093       /* The round insn does not trap on denormals.  */
39094       if (flag_trapping_math || !TARGET_SSE4_1)
39095 	break;
39096 
39097       if (out_mode == DFmode && in_mode == DFmode)
39098 	{
39099 	  if (out_n == 2 && in_n == 2)
39100 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39101 	  else if (out_n == 4 && in_n == 4)
39102 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39103 	  else if (out_n == 8 && in_n == 8)
39104 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39105 	}
39106       if (out_mode == SFmode && in_mode == SFmode)
39107 	{
39108 	  if (out_n == 4 && in_n == 4)
39109 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39110 	  else if (out_n == 8 && in_n == 8)
39111 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39112 	  else if (out_n == 16 && in_n == 16)
39113 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39114 	}
39115       break;
39116 
39117     CASE_CFN_CEIL:
39118       /* The round insn does not trap on denormals.  */
39119       if (flag_trapping_math || !TARGET_SSE4_1)
39120 	break;
39121 
39122       if (out_mode == DFmode && in_mode == DFmode)
39123 	{
39124 	  if (out_n == 2 && in_n == 2)
39125 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39126 	  else if (out_n == 4 && in_n == 4)
39127 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39128 	  else if (out_n == 8 && in_n == 8)
39129 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39130 	}
39131       if (out_mode == SFmode && in_mode == SFmode)
39132 	{
39133 	  if (out_n == 4 && in_n == 4)
39134 	    return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39135 	  else if (out_n == 8 && in_n == 8)
39136 	    return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39137 	  else if (out_n == 16 && in_n == 16)
39138 	    return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39139 	}
39140       break;
39141 
39142     CASE_CFN_TRUNC:
39143       /* The round insn does not trap on denormals.  */
39144       if (flag_trapping_math || !TARGET_SSE4_1)
39145 	break;
39146 
39147       if (out_mode == DFmode && in_mode == DFmode)
39148 	{
39149 	  if (out_n == 2 && in_n == 2)
39150 	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39151 	  else if (out_n == 4 && in_n == 4)
39152 	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39153 	  else if (out_n == 8 && in_n == 8)
39154 	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39155 	}
39156       if (out_mode == SFmode && in_mode == SFmode)
39157 	{
39158 	  if (out_n == 4 && in_n == 4)
39159 	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39160 	  else if (out_n == 8 && in_n == 8)
39161 	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39162 	  else if (out_n == 16 && in_n == 16)
39163 	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39164 	}
39165       break;
39166 
39167     CASE_CFN_RINT:
39168       /* The round insn does not trap on denormals.  */
39169       if (flag_trapping_math || !TARGET_SSE4_1)
39170 	break;
39171 
39172       if (out_mode == DFmode && in_mode == DFmode)
39173 	{
39174 	  if (out_n == 2 && in_n == 2)
39175 	    return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39176 	  else if (out_n == 4 && in_n == 4)
39177 	    return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39178 	}
39179       if (out_mode == SFmode && in_mode == SFmode)
39180 	{
39181 	  if (out_n == 4 && in_n == 4)
39182 	    return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39183 	  else if (out_n == 8 && in_n == 8)
39184 	    return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39185 	}
39186       break;
39187 
39188     CASE_CFN_FMA:
39189       if (out_mode == DFmode && in_mode == DFmode)
39190 	{
39191 	  if (out_n == 2 && in_n == 2)
39192 	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39193 	  if (out_n == 4 && in_n == 4)
39194 	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39195 	}
39196       if (out_mode == SFmode && in_mode == SFmode)
39197 	{
39198 	  if (out_n == 4 && in_n == 4)
39199 	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39200 	  if (out_n == 8 && in_n == 8)
39201 	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39202 	}
39203       break;
39204 
39205     default:
39206       break;
39207     }
39208 
39209   /* Dispatch to a handler for a vectorization library.  */
39210   if (ix86_veclib_handler)
39211     return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39212 
39213   return NULL_TREE;
39214 }
39215 
39216 /* Handler for an SVML-style interface to
39217    a library with vectorized intrinsics.  */
39218 
39219 static tree
39220 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39221 {
39222   char name[20];
39223   tree fntype, new_fndecl, args;
39224   unsigned arity;
39225   const char *bname;
39226   machine_mode el_mode, in_mode;
39227   int n, in_n;
39228 
39229   /* The SVML is suitable for unsafe math only.  */
39230   if (!flag_unsafe_math_optimizations)
39231     return NULL_TREE;
39232 
39233   el_mode = TYPE_MODE (TREE_TYPE (type_out));
39234   n = TYPE_VECTOR_SUBPARTS (type_out);
39235   in_mode = TYPE_MODE (TREE_TYPE (type_in));
39236   in_n = TYPE_VECTOR_SUBPARTS (type_in);
39237   if (el_mode != in_mode
39238       || n != in_n)
39239     return NULL_TREE;
39240 
39241   switch (fn)
39242     {
39243     CASE_CFN_EXP:
39244     CASE_CFN_LOG:
39245     CASE_CFN_LOG10:
39246     CASE_CFN_POW:
39247     CASE_CFN_TANH:
39248     CASE_CFN_TAN:
39249     CASE_CFN_ATAN:
39250     CASE_CFN_ATAN2:
39251     CASE_CFN_ATANH:
39252     CASE_CFN_CBRT:
39253     CASE_CFN_SINH:
39254     CASE_CFN_SIN:
39255     CASE_CFN_ASINH:
39256     CASE_CFN_ASIN:
39257     CASE_CFN_COSH:
39258     CASE_CFN_COS:
39259     CASE_CFN_ACOSH:
39260     CASE_CFN_ACOS:
39261       if ((el_mode != DFmode || n != 2)
39262 	  && (el_mode != SFmode || n != 4))
39263 	return NULL_TREE;
39264       break;
39265 
39266     default:
39267       return NULL_TREE;
39268     }
39269 
39270   tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39271   bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39272 
39273   if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39274     strcpy (name, "vmlsLn4");
39275   else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39276     strcpy (name, "vmldLn2");
39277   else if (n == 4)
39278     {
39279       sprintf (name, "vmls%s", bname+10);
39280       name[strlen (name)-1] = '4';
39281     }
39282   else
39283     sprintf (name, "vmld%s2", bname+10);
39284 
39285   /* Convert the first letter of the math function name to uppercase.  */
39286   name[4] &= ~0x20;
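  /* E.g. this yields "vmlsSin4" for a V4SFmode sinf and "vmldSin2" for a
     V2DFmode sin.  */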
39287 
39288   arity = 0;
39289   for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39290     arity++;
39291 
39292   if (arity == 1)
39293     fntype = build_function_type_list (type_out, type_in, NULL);
39294   else
39295     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39296 
39297   /* Build a function declaration for the vectorized function.  */
39298   new_fndecl = build_decl (BUILTINS_LOCATION,
39299 			   FUNCTION_DECL, get_identifier (name), fntype);
39300   TREE_PUBLIC (new_fndecl) = 1;
39301   DECL_EXTERNAL (new_fndecl) = 1;
39302   DECL_IS_NOVOPS (new_fndecl) = 1;
39303   TREE_READONLY (new_fndecl) = 1;
39304 
39305   return new_fndecl;
39306 }
39307 
39308 /* Handler for an ACML-style interface to
39309    a library with vectorized intrinsics.  */
39310 
39311 static tree
39312 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39313 {
39314   char name[20] = "__vr.._";
39315   tree fntype, new_fndecl, args;
39316   unsigned arity;
39317   const char *bname;
39318   machine_mode el_mode, in_mode;
39319   int n, in_n;
39320 
39321   /* The ACML is 64-bit only and suitable for unsafe math only, as
39322      it does not correctly support parts of IEEE with the required
39323      precision such as denormals.  */
39324   if (!TARGET_64BIT
39325       || !flag_unsafe_math_optimizations)
39326     return NULL_TREE;
39327 
39328   el_mode = TYPE_MODE (TREE_TYPE (type_out));
39329   n = TYPE_VECTOR_SUBPARTS (type_out);
39330   in_mode = TYPE_MODE (TREE_TYPE (type_in));
39331   in_n = TYPE_VECTOR_SUBPARTS (type_in);
39332   if (el_mode != in_mode
39333       || n != in_n)
39334     return NULL_TREE;
39335 
39336   switch (fn)
39337     {
39338     CASE_CFN_SIN:
39339     CASE_CFN_COS:
39340     CASE_CFN_EXP:
39341     CASE_CFN_LOG:
39342     CASE_CFN_LOG2:
39343     CASE_CFN_LOG10:
39344       if (el_mode == DFmode && n == 2)
39345 	{
39346 	  name[4] = 'd';
39347 	  name[5] = '2';
39348 	}
39349       else if (el_mode == SFmode && n == 4)
39350 	{
39351 	  name[4] = 's';
39352 	  name[5] = '4';
39353 	}
39354       else
39355 	return NULL_TREE;
39356       break;
39357 
39358     default:
39359       return NULL_TREE;
39360     }
39361 
39362   tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39363   bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39364   sprintf (name + 7, "%s", bname+10);
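  /* E.g. this yields "__vrd2_sin" for a V2DFmode sin and "__vrs4_sinf" for
     a V4SFmode sinf.  */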
39365 
39366   arity = 0;
39367   for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39368     arity++;
39369 
39370   if (arity == 1)
39371     fntype = build_function_type_list (type_out, type_in, NULL);
39372   else
39373     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39374 
39375   /* Build a function declaration for the vectorized function.  */
39376   new_fndecl = build_decl (BUILTINS_LOCATION,
39377 			   FUNCTION_DECL, get_identifier (name), fntype);
39378   TREE_PUBLIC (new_fndecl) = 1;
39379   DECL_EXTERNAL (new_fndecl) = 1;
39380   DECL_IS_NOVOPS (new_fndecl) = 1;
39381   TREE_READONLY (new_fndecl) = 1;
39382 
39383   return new_fndecl;
39384 }
39385 
39386 /* Returns a decl of a function that implements gather load with
39387    memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
39388    Return NULL_TREE if it is not available.  */
39389 
39390 static tree
39391 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39392 			       const_tree index_type, int scale)
39393 {
39394   bool si;
39395   enum ix86_builtins code;
39396 
39397   if (! TARGET_AVX2 || !TARGET_USE_GATHER)
39398     return NULL_TREE;
39399 
39400   if ((TREE_CODE (index_type) != INTEGER_TYPE
39401        && !POINTER_TYPE_P (index_type))
39402       || (TYPE_MODE (index_type) != SImode
39403 	  && TYPE_MODE (index_type) != DImode))
39404     return NULL_TREE;
39405 
39406   if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39407     return NULL_TREE;
39408 
39409   /* v*gather* insn sign extends index to pointer mode.  */
39410   if (TYPE_PRECISION (index_type) < POINTER_SIZE
39411       && TYPE_UNSIGNED (index_type))
39412     return NULL_TREE;
39413 
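  /* Scale can be 1, 2, 4 or 8.  */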
39414   if (scale <= 0
39415       || scale > 8
39416       || (scale & (scale - 1)) != 0)
39417     return NULL_TREE;
39418 
39419   si = TYPE_MODE (index_type) == SImode;
39420   switch (TYPE_MODE (mem_vectype))
39421     {
39422     case E_V2DFmode:
39423       if (TARGET_AVX512VL)
39424 	code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39425       else
39426 	code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39427       break;
39428     case E_V4DFmode:
39429       if (TARGET_AVX512VL)
39430 	code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39431       else
39432 	code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39433       break;
39434     case E_V2DImode:
39435       if (TARGET_AVX512VL)
39436 	code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39437       else
39438 	code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39439       break;
39440     case E_V4DImode:
39441       if (TARGET_AVX512VL)
39442 	code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39443       else
39444 	code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39445       break;
39446     case E_V4SFmode:
39447       if (TARGET_AVX512VL)
39448 	code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39449       else
39450 	code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39451       break;
39452     case E_V8SFmode:
39453       if (TARGET_AVX512VL)
39454 	code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39455       else
39456 	code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39457       break;
39458     case E_V4SImode:
39459       if (TARGET_AVX512VL)
39460 	code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39461       else
39462 	code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39463       break;
39464     case E_V8SImode:
39465       if (TARGET_AVX512VL)
39466 	code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39467       else
39468 	code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39469       break;
39470     case E_V8DFmode:
39471       if (TARGET_AVX512F)
39472 	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39473       else
39474 	return NULL_TREE;
39475       break;
39476     case E_V8DImode:
39477       if (TARGET_AVX512F)
39478 	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39479       else
39480 	return NULL_TREE;
39481       break;
39482     case E_V16SFmode:
39483       if (TARGET_AVX512F)
39484 	code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39485       else
39486 	return NULL_TREE;
39487       break;
39488     case E_V16SImode:
39489       if (TARGET_AVX512F)
39490 	code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39491       else
39492 	return NULL_TREE;
39493       break;
39494     default:
39495       return NULL_TREE;
39496     }
39497 
39498   return ix86_get_builtin (code);
39499 }
39500 
39501 /* Returns a decl of a function that implements scatter store with
39502    register type VECTYPE and index type INDEX_TYPE and SCALE.
39503    Return NULL_TREE if it is not available.  */
39504 
39505 static tree
39506 ix86_vectorize_builtin_scatter (const_tree vectype,
39507 				const_tree index_type, int scale)
39508 {
39509   bool si;
39510   enum ix86_builtins code;
39511 
39512   if (!TARGET_AVX512F)
39513     return NULL_TREE;
39514 
39515   if ((TREE_CODE (index_type) != INTEGER_TYPE
39516        && !POINTER_TYPE_P (index_type))
39517       || (TYPE_MODE (index_type) != SImode
39518 	  && TYPE_MODE (index_type) != DImode))
39519     return NULL_TREE;
39520 
39521   if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39522     return NULL_TREE;
39523 
39524   /* v*scatter* insn sign extends index to pointer mode.  */
39525   if (TYPE_PRECISION (index_type) < POINTER_SIZE
39526       && TYPE_UNSIGNED (index_type))
39527     return NULL_TREE;
39528 
39529   /* Scale can be 1, 2, 4 or 8.  */
39530   if (scale <= 0
39531       || scale > 8
39532       || (scale & (scale - 1)) != 0)
39533     return NULL_TREE;
39534 
39535   si = TYPE_MODE (index_type) == SImode;
39536   switch (TYPE_MODE (vectype))
39537     {
39538     case E_V8DFmode:
39539       code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39540       break;
39541     case E_V8DImode:
39542       code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39543       break;
39544     case E_V16SFmode:
39545       code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39546       break;
39547     case E_V16SImode:
39548       code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39549       break;
39550     default:
39551       return NULL_TREE;
39552     }
39553 
39554   return ix86_builtins[code];
39555 }
39556 
39557 /* Return true if it is safe to use the rsqrt optabs to optimize
39558    1.0/sqrt.  */
39559 
39560 static bool
39561 use_rsqrt_p ()
39562 {
39563   return (TARGET_SSE && TARGET_SSE_MATH
39564 	  && flag_finite_math_only
39565 	  && !flag_trapping_math
39566 	  && flag_unsafe_math_optimizations);
39567 }
39568 
39569 /* Returns a decl of a target-specific builtin that implements the
39570    reciprocal of the function, or NULL_TREE if not available.  */
39571 
39572 static tree
39573 ix86_builtin_reciprocal (tree fndecl)
39574 {
39575   switch (DECL_FUNCTION_CODE (fndecl))
39576     {
39577       /* Vectorized version of sqrt to rsqrt conversion.  */
39578     case IX86_BUILTIN_SQRTPS_NR:
39579       return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39580 
39581     case IX86_BUILTIN_SQRTPS_NR256:
39582       return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39583 
39584     default:
39585       return NULL_TREE;
39586     }
39587 }
39588 
39589 /* Helper for avx_vpermilps256_operand et al.  This is also used by
39590    the expansion functions to turn the parallel back into a mask.
39591    The return value is 0 for no match and the imm8+1 for a match.  */
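/* E.g. for V4SFmode the parallel (0 3 2 1) encodes as imm8 0x6C, so the
   return value is 0x6D.  */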
39592 
39593 int
39594 avx_vpermilp_parallel (rtx par, machine_mode mode)
39595 {
39596   unsigned i, nelt = GET_MODE_NUNITS (mode);
39597   unsigned mask = 0;
39598   unsigned char ipar[16] = {};  /* Silence -Wuninitialized warning.  */
39599 
39600   if (XVECLEN (par, 0) != (int) nelt)
39601     return 0;
39602 
39603   /* Validate that all of the elements are constants, and not totally
39604      out of range.  Copy the data into an integral array to make the
39605      subsequent checks easier.  */
39606   for (i = 0; i < nelt; ++i)
39607     {
39608       rtx er = XVECEXP (par, 0, i);
39609       unsigned HOST_WIDE_INT ei;
39610 
39611       if (!CONST_INT_P (er))
39612 	return 0;
39613       ei = INTVAL (er);
39614       if (ei >= nelt)
39615 	return 0;
39616       ipar[i] = ei;
39617     }
39618 
39619   switch (mode)
39620     {
39621     case E_V8DFmode:
39622       /* In the 512-bit DFmode case, we can only move elements within
39623          a 128-bit lane.  First fill the second part of the mask,
39624 	 then fallthru.  */
39625       for (i = 4; i < 6; ++i)
39626 	{
39627 	  if (ipar[i] < 4 || ipar[i] >= 6)
39628 	    return 0;
39629 	  mask |= (ipar[i] - 4) << i;
39630 	}
39631       for (i = 6; i < 8; ++i)
39632 	{
39633 	  if (ipar[i] < 6)
39634 	    return 0;
39635 	  mask |= (ipar[i] - 6) << i;
39636 	}
39637       /* FALLTHRU */
39638 
39639     case E_V4DFmode:
39640       /* In the 256-bit DFmode case, we can only move elements within
39641          a 128-bit lane.  */
39642       for (i = 0; i < 2; ++i)
39643 	{
39644 	  if (ipar[i] >= 2)
39645 	    return 0;
39646 	  mask |= ipar[i] << i;
39647 	}
39648       for (i = 2; i < 4; ++i)
39649 	{
39650 	  if (ipar[i] < 2)
39651 	    return 0;
39652 	  mask |= (ipar[i] - 2) << i;
39653 	}
39654       break;
39655 
39656     case E_V16SFmode:
39657       /* In the 512-bit SFmode case, permutation in the upper 256 bits
39658	 must mirror the permutation in the lower 256 bits.  */
39659       for (i = 0; i < 8; ++i)
39660 	if (ipar[i] + 8 != ipar[i + 8])
39661 	  return 0;
39662       /* FALLTHRU */
39663 
39664     case E_V8SFmode:
39665       /* In the 256-bit SFmode case, we have full freedom of
39666          movement within the low 128-bit lane, but the high 128-bit
39667          lane must mirror the exact same pattern.  */
39668       for (i = 0; i < 4; ++i)
39669 	if (ipar[i] + 4 != ipar[i + 4])
39670 	  return 0;
39671       nelt = 4;
39672       /* FALLTHRU */
39673 
39674     case E_V2DFmode:
39675     case E_V4SFmode:
39676       /* In the 128-bit case, we've full freedom in the placement of
39677 	 the elements from the source operand.  */
39678       for (i = 0; i < nelt; ++i)
39679 	mask |= ipar[i] << (i * (nelt / 2));
39680       break;
39681 
39682     default:
39683       gcc_unreachable ();
39684     }
39685 
39686   /* Make sure success has a non-zero value by adding one.  */
39687   return mask + 1;
39688 }
39689 
39690 /* Helper for avx_vperm2f128_v4df_operand et al.  This is also used by
39691    the expansion functions to turn the parallel back into a mask.
39692    The return value is 0 for no match and the imm8+1 for a match.  */
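/* E.g. for V4DFmode the parallel (0 1 6 7) encodes as imm8 0x30, so the
   return value is 0x31.  */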
39693 
39694 int
39695 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39696 {
39697   unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39698   unsigned mask = 0;
39699   unsigned char ipar[8] = {};  /* Silence -Wuninitialized warning.  */
39700 
39701   if (XVECLEN (par, 0) != (int) nelt)
39702     return 0;
39703 
39704   /* Validate that all of the elements are constants, and not totally
39705      out of range.  Copy the data into an integral array to make the
39706      subsequent checks easier.  */
39707   for (i = 0; i < nelt; ++i)
39708     {
39709       rtx er = XVECEXP (par, 0, i);
39710       unsigned HOST_WIDE_INT ei;
39711 
39712       if (!CONST_INT_P (er))
39713 	return 0;
39714       ei = INTVAL (er);
39715       if (ei >= 2 * nelt)
39716 	return 0;
39717       ipar[i] = ei;
39718     }
39719 
39720   /* Validate that each half of the permute selects consecutive elements.  */
39721   for (i = 0; i < nelt2 - 1; ++i)
39722     if (ipar[i] + 1 != ipar[i + 1])
39723       return 0;
39724   for (i = nelt2; i < nelt - 1; ++i)
39725     if (ipar[i] + 1 != ipar[i + 1])
39726       return 0;
39727 
39728   /* Reconstruct the mask.  */
39729   for (i = 0; i < 2; ++i)
39730     {
39731       unsigned e = ipar[i * nelt2];
39732       if (e % nelt2)
39733 	return 0;
39734       e /= nelt2;
39735       mask |= e << (i * 4);
39736     }
39737 
39738   /* Make sure success has a non-zero value by adding one.  */
39739   return mask + 1;
39740 }
39741 
39742 /* Return a register priority for hard reg REGNO.  */
39743 static int
39744 ix86_register_priority (int hard_regno)
39745 {
39746   /* ebp and r13 as the base always want a displacement, and r12 as the
39747      base always wants an index.  So discourage their use in an
39748      address.  */
39749   if (hard_regno == R12_REG || hard_regno == R13_REG)
39750     return 0;
39751   if (hard_regno == BP_REG)
39752     return 1;
39753   /* New x86-64 int registers result in bigger code size.  Discourage
39754      them.  */
39755   if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39756     return 2;
39757   /* New x86-64 SSE registers result in bigger code size.  Discourage
39758      them.  */
39759   if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39760     return 2;
39761   /* Usage of AX register results in smaller code.  Prefer it.  */
39762   if (hard_regno == AX_REG)
39763     return 4;
39764   return 3;
39765 }
39766 
39767 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39768 
39769    Put float CONST_DOUBLE in the constant pool instead of fp regs.
39770    QImode must go into class Q_REGS.
39771    Narrow ALL_REGS to GENERAL_REGS.  This supports allowing movsf and
39772    movdf to do mem-to-mem moves through integer regs.  */
39773 
39774 static reg_class_t
39775 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39776 {
39777   machine_mode mode = GET_MODE (x);
39778 
39779   /* We're only allowed to return a subclass of CLASS.  Many of the
39780      following checks fail for NO_REGS, so eliminate that early.  */
39781   if (regclass == NO_REGS)
39782     return NO_REGS;
39783 
39784   /* All classes can load zeros.  */
39785   if (x == CONST0_RTX (mode))
39786     return regclass;
39787 
39788   /* Force constants into memory if we are loading a (nonzero) constant into
39789      an MMX, SSE or MASK register.  This is because there are no MMX/SSE/MASK
39790      instructions to load from a constant.  */
39791   if (CONSTANT_P (x)
39792       && (MAYBE_MMX_CLASS_P (regclass)
39793 	  || MAYBE_SSE_CLASS_P (regclass)
39794 	  || MAYBE_MASK_CLASS_P (regclass)))
39795     return NO_REGS;
39796 
39797   /* Floating-point constants need more complex checks.  */
39798   if (CONST_DOUBLE_P (x))
39799     {
39800       /* General regs can load everything.  */
39801       if (INTEGER_CLASS_P (regclass))
39802         return regclass;
39803 
39804       /* Floats can load 0 and 1 plus some others.  Note that we eliminated
39805 	 zero above.  We only want to wind up preferring 80387 registers if
39806 	 we plan on doing computation with them.  */
39807       if (IS_STACK_MODE (mode)
39808 	  && standard_80387_constant_p (x) > 0)
39809 	{
39810 	  /* Limit class to FP regs.  */
39811 	  if (FLOAT_CLASS_P (regclass))
39812 	    return FLOAT_REGS;
39813 	  else if (regclass == FP_TOP_SSE_REGS)
39814 	    return FP_TOP_REG;
39815 	  else if (regclass == FP_SECOND_SSE_REGS)
39816 	    return FP_SECOND_REG;
39817 	}
39818 
39819       return NO_REGS;
39820     }
39821 
39822   /* Prefer SSE regs only, if we can use them for math.  */
39823   if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39824     return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39825 
39826   /* Generally when we see PLUS here, it's the function invariant
39827      (plus soft-fp const_int), which can only be computed into general
39828      regs.  */
39829   if (GET_CODE (x) == PLUS)
39830     return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39831 
39832   /* QImode constants are easy to load, but non-constant QImode data
39833      must go into Q_REGS.  */
39834   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39835     {
39836       if (Q_CLASS_P (regclass))
39837 	return regclass;
39838       else if (reg_class_subset_p (Q_REGS, regclass))
39839 	return Q_REGS;
39840       else
39841 	return NO_REGS;
39842     }
39843 
39844   return regclass;
39845 }
39846 
39847 /* Discourage putting floating-point values in SSE registers unless
39848    SSE math is being used, and likewise for the 387 registers.  */
39849 static reg_class_t
39850 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39851 {
39852   machine_mode mode = GET_MODE (x);
39853 
39854   /* Restrict the output reload class to the register bank that we are doing
39855      math on.  If we would like not to return a subset of CLASS, reject this
39856      alternative: if reload cannot do this, it will still use its choice.  */
39858   if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39859     return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39860 
39861   if (IS_STACK_MODE (mode))
39862     {
39863       if (regclass == FP_TOP_SSE_REGS)
39864 	return FP_TOP_REG;
39865       else if (regclass == FP_SECOND_SSE_REGS)
39866 	return FP_SECOND_REG;
39867       else
39868 	return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39869     }
39870 
39871   return regclass;
39872 }
39873 
39874 static reg_class_t
39875 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39876 		       machine_mode mode, secondary_reload_info *sri)
39877 {
39878   /* Double-word spills from general registers to non-offsettable memory
39879      references (zero-extended addresses) require special handling.  */
39880   if (TARGET_64BIT
39881       && MEM_P (x)
39882       && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39883       && INTEGER_CLASS_P (rclass)
39884       && !offsettable_memref_p (x))
39885     {
39886       sri->icode = (in_p
39887 		    ? CODE_FOR_reload_noff_load
39888 		    : CODE_FOR_reload_noff_store);
39889       /* Add the cost of moving address to a temporary.  */
39890       sri->extra_cost = 1;
39891 
39892       return NO_REGS;
39893     }
39894 
39895   /* QImode spills from non-QI registers require
39896      an intermediate register on 32-bit targets.  */
39897   if (mode == QImode
39898       && ((!TARGET_64BIT && !in_p
39899 	   && INTEGER_CLASS_P (rclass)
39900 	   && MAYBE_NON_Q_CLASS_P (rclass))
39901 	  || (!TARGET_AVX512DQ
39902 	      && MAYBE_MASK_CLASS_P (rclass))))
39903     {
39904       int regno = true_regnum (x);
39905 
39906       /* Return Q_REGS if the operand is in memory.  */
39907       if (regno == -1)
39908 	return Q_REGS;
39909 
39910       return NO_REGS;
39911     }
39912 
39913   /* This condition handles corner case where an expression involving
39914      pointers gets vectorized.  We're trying to use the address of a
39915      stack slot as a vector initializer.
39916 
39917      (set (reg:V2DI 74 [ vect_cst_.2 ])
39918           (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39919 
39920      Eventually frame gets turned into sp+offset like this:
39921 
39922      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39923           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39924 	                               (const_int 392 [0x188]))))
39925 
39926      That later gets turned into:
39927 
39928      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39929           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39930 	    (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39931 
39932      We'll have the following reload recorded:
39933 
39934      Reload 0: reload_in (DI) =
39935            (plus:DI (reg/f:DI 7 sp)
39936             (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39937      reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39938      SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39939      reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39940      reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39941      reload_reg_rtx: (reg:V2DI 22 xmm1)
39942 
39943      Which isn't going to work since SSE instructions can't handle scalar
39944      additions.  Returning GENERAL_REGS forces the addition into integer
39945      register and reload can handle subsequent reloads without problems.  */
39946 
39947   if (in_p && GET_CODE (x) == PLUS
39948       && SSE_CLASS_P (rclass)
39949       && SCALAR_INT_MODE_P (mode))
39950     return GENERAL_REGS;
39951 
39952   return NO_REGS;
39953 }
39954 
39955 /* Implement TARGET_CLASS_LIKELY_SPILLED_P.  */
39956 
39957 static bool
39958 ix86_class_likely_spilled_p (reg_class_t rclass)
39959 {
39960   switch (rclass)
39961     {
39962       case AREG:
39963       case DREG:
39964       case CREG:
39965       case BREG:
39966       case AD_REGS:
39967       case SIREG:
39968       case DIREG:
39969       case SSE_FIRST_REG:
39970       case FP_TOP_REG:
39971       case FP_SECOND_REG:
39972       case BND_REGS:
39973 	return true;
39974 
39975       default:
39976 	break;
39977     }
39978 
39979   return false;
39980 }
39981 
39982 /* If we are copying between registers from different register sets
39983    (e.g. FP and integer), we may need a memory location.
39984 
39985    The function can't work reliably when one of the CLASSES is a class
39986    containing registers from multiple sets.  We avoid this by never combining
39987    different sets in a single alternative in the machine description.
39988    Ensure that this constraint holds to avoid unexpected surprises.
39989 
39990    When STRICT is false, we are being called from REGISTER_MOVE_COST,
39991    so do not enforce these sanity checks.
39992 
39993    To optimize register_move_cost performance, define inline variant.  */
39994 
39995 static inline bool
39996 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39997 				reg_class_t class2, int strict)
39998 {
39999   if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40000     return false;
40001 
40002   if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40003       || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40004       || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40005       || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40006       || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40007       || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40008       || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40009       || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40010     {
40011       gcc_assert (!strict || lra_in_progress);
40012       return true;
40013     }
40014 
40015   if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40016     return true;
40017 
40018   /* Between mask and general, we have moves no larger than word size.  */
40019   if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40020       && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40021     return true;
40022 
40023   /* ??? This is a lie.  We do have moves between mmx/general, and for
40024      mmx/sse2.  But by saying we need secondary memory we discourage the
40025      register allocator from using the mmx registers unless needed.  */
40026   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40027     return true;
40028 
40029   if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40030     {
40031       /* SSE1 doesn't have any direct moves from other classes.  */
40032       if (!TARGET_SSE2)
40033 	return true;
40034 
40035       /* If the target says that inter-unit moves are more expensive
40036 	 than moving through memory, then don't generate them.  */
40037       if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40038 	  || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40039 	return true;
40040 
40041       /* Between SSE and general, we have moves no larger than word size.  */
40042       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40043 	return true;
40044     }
40045 
40046   return false;
40047 }
40048 
40049 /* Implement TARGET_SECONDARY_MEMORY_NEEDED.  */
40050 
40051 static bool
40052 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
40053 			      reg_class_t class2)
40054 {
40055   return inline_secondary_memory_needed (mode, class1, class2, true);
40056 }
40057 
40058 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
40059 
40060    get_secondary_mem widens integral modes to BITS_PER_WORD.
40061    There is no need to emit a full 64-bit move on 64-bit targets
40062    for integral modes that can be moved using a 32-bit move.  */
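/* E.g. QImode and HImode values going through secondary memory are moved
   as SImode.  */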
40063 
40064 static machine_mode
40065 ix86_secondary_memory_needed_mode (machine_mode mode)
40066 {
40067   if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
40068     return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
40069   return mode;
40070 }
40071 
40072 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40073 
40074    On the 80386, this is the size of MODE in words,
40075    except in the FP regs, where a single reg is always enough.  */
40076 
40077 static unsigned char
40078 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40079 {
40080   if (MAYBE_INTEGER_CLASS_P (rclass))
40081     {
40082       if (mode == XFmode)
40083 	return (TARGET_64BIT ? 2 : 3);
40084       else if (mode == XCmode)
40085 	return (TARGET_64BIT ? 4 : 6);
40086       else
40087 	return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40088     }
40089   else
40090     {
40091       if (COMPLEX_MODE_P (mode))
40092 	return 2;
40093       else
40094 	return 1;
40095     }
40096 }
40097 
40098 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
40099 
40100 static bool
40101 ix86_can_change_mode_class (machine_mode from, machine_mode to,
40102 			    reg_class_t regclass)
40103 {
40104   if (from == to)
40105     return true;
40106 
40107   /* x87 registers can't do subreg at all, as all values are reformatted
40108      to extended precision.  */
40109   if (MAYBE_FLOAT_CLASS_P (regclass))
40110     return false;
40111 
40112   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40113     {
40114       /* Vector registers do not support QI or HImode loads.  If we don't
40115 	 disallow a change to these modes, reload will assume it's ok to
40116 	 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
40117 	 the vec_dupv4hi pattern.  */
40118       if (GET_MODE_SIZE (from) < 4)
40119 	return false;
40120     }
40121 
40122   return true;
40123 }
40124 
40125 /* Return index of MODE in the sse load/store tables.  */
40126 
40127 static inline int
40128 sse_store_index (machine_mode mode)
40129 {
40130   switch (GET_MODE_SIZE (mode))
40131     {
40132     case 4:
40133       return 0;
40134     case 8:
40135       return 1;
40136     case 16:
40137       return 2;
40138     case 32:
40139       return 3;
40140     case 64:
40141       return 4;
40142     default:
40143       return -1;
40144     }
40145 }
40146 
40147 /* Return the cost of moving data of mode M between a
40148    register and memory.  A value of 2 is the default; this cost is
40149    relative to those in `REGISTER_MOVE_COST'.
40150 
40151    This function is used extensively by register_move_cost that is used to
40152    build tables at startup.  Make it inline in this case.
40153    When IN is 2, return maximum of in and out move cost.
40154 
40155    If moving between registers and memory is more expensive than
40156    between two registers, you should define this macro to express the
40157    relative cost.
40158 
40159    Also model the increased cost of moving QImode registers in non
40160    Q_REGS classes.
40161  */
40162 static inline int
40163 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40164 			 int in)
40165 {
40166   int cost;
40167   if (FLOAT_CLASS_P (regclass))
40168     {
40169       int index;
40170       switch (mode)
40171 	{
40172 	  case E_SFmode:
40173 	    index = 0;
40174 	    break;
40175 	  case E_DFmode:
40176 	    index = 1;
40177 	    break;
40178 	  case E_XFmode:
40179 	    index = 2;
40180 	    break;
40181 	  default:
40182 	    return 100;
40183 	}
40184       if (in == 2)
40185         return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40186       return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40187     }
40188   if (SSE_CLASS_P (regclass))
40189     {
40190       int index = sse_store_index (mode);
40191       if (index == -1)
40192 	return 100;
40193       if (in == 2)
40194         return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40195       return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40196     }
40197   if (MMX_CLASS_P (regclass))
40198     {
40199       int index;
40200       switch (GET_MODE_SIZE (mode))
40201 	{
40202 	  case 4:
40203 	    index = 0;
40204 	    break;
40205 	  case 8:
40206 	    index = 1;
40207 	    break;
40208 	  default:
40209 	    return 100;
40210 	}
40211       if (in == 2)
40212         return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40213       return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40214     }
40215   switch (GET_MODE_SIZE (mode))
40216     {
40217       case 1:
40218 	if (Q_CLASS_P (regclass) || TARGET_64BIT)
40219 	  {
40220 	    if (!in)
40221 	      return ix86_cost->int_store[0];
40222 	    if (TARGET_PARTIAL_REG_DEPENDENCY
40223 	        && optimize_function_for_speed_p (cfun))
40224 	      cost = ix86_cost->movzbl_load;
40225 	    else
40226 	      cost = ix86_cost->int_load[0];
40227 	    if (in == 2)
40228 	      return MAX (cost, ix86_cost->int_store[0]);
40229 	    return cost;
40230 	  }
40231 	else
40232 	  {
40233 	   if (in == 2)
40234 	     return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40235 	   if (in)
40236 	     return ix86_cost->movzbl_load;
40237 	   else
40238 	     return ix86_cost->int_store[0] + 4;
40239 	  }
40240 	break;
40241       case 2:
40242 	if (in == 2)
40243 	  return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40244 	return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40245       default:
40246 	/* Compute number of 32bit moves needed.  TFmode is moved as XFmode.  */
40247 	if (mode == TFmode)
40248 	  mode = XFmode;
40249 	if (in == 2)
40250 	  cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40251 	else if (in)
40252 	  cost = ix86_cost->int_load[2];
40253 	else
40254 	  cost = ix86_cost->int_store[2];
40255 	return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
40256     }
40257 }
40258 
40259 static int
40260 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40261 		       bool in)
40262 {
40263   return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40264 }
40265 
40266 
40267 /* Return the cost of moving data from a register in class CLASS1 to
40268    one in class CLASS2.
40269 
40270    It is not required that the cost always equal 2 when FROM is the same as TO;
40271    on some machines it is expensive to move between registers if they are not
40272    general registers.  */
40273 
40274 static int
40275 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40276 			 reg_class_t class2_i)
40277 {
40278   enum reg_class class1 = (enum reg_class) class1_i;
40279   enum reg_class class2 = (enum reg_class) class2_i;
40280 
40281   /* In case we require secondary memory, compute cost of the store followed
40282      by load.  In order to avoid bad register allocation choices, we need
40283      for this to be *at least* as high as the symmetric MEMORY_MOVE_COST.  */
40284 
40285   if (inline_secondary_memory_needed (mode, class1, class2, false))
40286     {
40287       int cost = 1;
40288 
40289       cost += inline_memory_move_cost (mode, class1, 2);
40290       cost += inline_memory_move_cost (mode, class2, 2);
40291 
40292       /* When copying from a general purpose register we may emit multiple
40293          stores followed by a single load, causing a memory size mismatch
40294          stall.  Count this as an arbitrarily high cost of 20.  */
40295       if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
40296 	  && TARGET_MEMORY_MISMATCH_STALL
40297 	  && targetm.class_max_nregs (class1, mode)
40298 	     > targetm.class_max_nregs (class2, mode))
40299 	cost += 20;
40300 
40301       /* In the case of FP/MMX moves, the registers actually overlap, and we
40302 	 have to switch modes in order to treat them differently.  */
40303       if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40304           || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40305 	cost += 20;
40306 
40307       return cost;
40308     }
40309 
40310   /* Moves between SSE/MMX and integer unit are expensive.  */
40311   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40312       || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40313 
40314     /* ??? By keeping returned value relatively high, we limit the number
40315        of moves between integer and MMX/SSE registers for all targets.
40316        Additionally, high value prevents problem with x86_modes_tieable_p(),
40317        where integer modes in MMX/SSE registers are not tieable
40318        because of missing QImode and HImode moves to, from or between
40319        MMX/SSE registers.  */
40320     return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
40321 		? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
40322 
40323   if (MAYBE_FLOAT_CLASS_P (class1))
40324     return ix86_cost->fp_move;
40325   if (MAYBE_SSE_CLASS_P (class1))
40326     {
40327       if (GET_MODE_BITSIZE (mode) <= 128)
40328 	return ix86_cost->xmm_move;
40329       if (GET_MODE_BITSIZE (mode) <= 256)
40330 	return ix86_cost->ymm_move;
40331       return ix86_cost->zmm_move;
40332     }
40333   if (MAYBE_MMX_CLASS_P (class1))
40334     return ix86_cost->mmx_move;
40335   return 2;
40336 }
40337 
40338 /* Implement TARGET_HARD_REGNO_NREGS.  This is ordinarily the length in
40339    words of a value of mode MODE but can be less for certain modes in
40340    special long registers.
40341 
40342    Actually there are no two word move instructions for consecutive
40343    registers.  And only registers 0-3 may have mov byte instructions
40344    applied to them.  */
40345 
40346 static unsigned int
40347 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
40348 {
40349   if (GENERAL_REGNO_P (regno))
40350     {
40351       if (mode == XFmode)
40352 	return TARGET_64BIT ? 2 : 3;
40353       if (mode == XCmode)
40354 	return TARGET_64BIT ? 4 : 6;
40355       return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40356     }
40357   if (COMPLEX_MODE_P (mode))
40358     return 2;
40359   if (mode == V64SFmode || mode == V64SImode)
40360     return 4;
40361   return 1;
40362 }
40363 
40364 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
40365 
40366 static bool
40367 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
40368 {
40369   /* Flags and only flags can only hold CCmode values.  */
40370   if (CC_REGNO_P (regno))
40371     return GET_MODE_CLASS (mode) == MODE_CC;
40372   if (GET_MODE_CLASS (mode) == MODE_CC
40373       || GET_MODE_CLASS (mode) == MODE_RANDOM
40374       || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40375     return false;
40376   if (STACK_REGNO_P (regno))
40377     return VALID_FP_MODE_P (mode);
40378   if (MASK_REGNO_P (regno))
40379     return (VALID_MASK_REG_MODE (mode)
40380 	    || (TARGET_AVX512BW
40381 		&& VALID_MASK_AVX512BW_MODE (mode)));
40382   if (BND_REGNO_P (regno))
40383     return VALID_BND_REG_MODE (mode);
40384   if (SSE_REGNO_P (regno))
40385     {
40386       /* We implement the move patterns for all vector modes into and
40387 	 out of SSE registers, even when no operation instructions
40388 	 are available.  */
40389 
40390       /* For AVX-512 we allow, regardless of regno:
40391 	  - XI mode
40392 	  - any 512-bit wide vector mode
40393 	  - any scalar mode.  */
40394       if (TARGET_AVX512F
40395 	  && (mode == XImode
40396 	      || VALID_AVX512F_REG_MODE (mode)
40397 	      || VALID_AVX512F_SCALAR_MODE (mode)))
40398 	return true;
40399 
40400       /* For AVX-5124FMAPS allow V64SFmode for special regnos.  */
40401       if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40402 	  && MOD4_SSE_REGNO_P (regno)
40403 	  && mode == V64SFmode)
40404 	return true;
40405 
40406       /* For AVX-5124VNNIW allow V64SImode for special regnos.  */
40407       if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40408 	  && MOD4_SSE_REGNO_P (regno)
40409 	  && mode == V64SImode)
40410 	return true;
40411 
40412       /* TODO check for QI/HI scalars.  */
40413       /* AVX512VL allows SSE regs 16+ for 128/256-bit modes.  */
40414       if (TARGET_AVX512VL
40415 	  && (mode == OImode
40416 	      || mode == TImode
40417 	      || VALID_AVX256_REG_MODE (mode)
40418 	      || VALID_AVX512VL_128_REG_MODE (mode)))
40419 	return true;
40420 
40421       /* xmm16-xmm31 are only available for AVX-512.  */
40422       if (EXT_REX_SSE_REGNO_P (regno))
40423 	return false;
40424 
40425       /* OImode and AVX modes are available only when AVX is enabled.  */
40426       return ((TARGET_AVX
40427 	       && VALID_AVX256_REG_OR_OI_MODE (mode))
40428 	      || VALID_SSE_REG_MODE (mode)
40429 	      || VALID_SSE2_REG_MODE (mode)
40430 	      || VALID_MMX_REG_MODE (mode)
40431 	      || VALID_MMX_REG_MODE_3DNOW (mode));
40432     }
40433   if (MMX_REGNO_P (regno))
40434     {
40435       /* We implement the move patterns for 3DNOW modes even in MMX mode,
40436 	 so if the register is available at all, then we can move data of
40437 	 the given mode into or out of it.  */
40438       return (VALID_MMX_REG_MODE (mode)
40439 	      || VALID_MMX_REG_MODE_3DNOW (mode));
40440     }
40441 
40442   if (mode == QImode)
40443     {
40444       /* Take care for QImode values - they can be in non-QI regs,
40445 	 but then they do cause partial register stalls.  */
40446       if (ANY_QI_REGNO_P (regno))
40447 	return true;
40448       if (!TARGET_PARTIAL_REG_STALL)
40449 	return true;
40450       /* LRA checks if the hard register is OK for the given mode.
40451 	 QImode values can live in non-QI regs, so we allow all
40452 	 registers here.  */
40453       if (lra_in_progress)
40454 	return true;
40455       return !can_create_pseudo_p ();
40456     }
40457   /* We handle both integer and floats in the general purpose registers.  */
40458   else if (VALID_INT_MODE_P (mode))
40459     return true;
40460   else if (VALID_FP_MODE_P (mode))
40461     return true;
40462   else if (VALID_DFP_MODE_P (mode))
40463     return true;
40464   /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
40465      on to use that value in smaller contexts, this can easily force a
40466      pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
40467      supporting DImode, allow it.  */
40468   else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40469     return true;
40470 
40471   return false;
40472 }
40473 
40474 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The only ABI that
40475    saves SSE registers across calls is Win64 (thus no need to check the
40476    current ABI here), and with AVX enabled Win64 only guarantees that
40477    the low 16 bytes are saved.  */
40478 
40479 static bool
40480 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
40481 {
40482   return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
40483 }
40484 
40485 /* A subroutine of ix86_modes_tieable_p.  Return true if MODE is a
40486    tieable integer mode.  */
40487 
40488 static bool
40489 ix86_tieable_integer_mode_p (machine_mode mode)
40490 {
40491   switch (mode)
40492     {
40493     case E_HImode:
40494     case E_SImode:
40495       return true;
40496 
40497     case E_QImode:
40498       return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40499 
40500     case E_DImode:
40501       return TARGET_64BIT;
40502 
40503     default:
40504       return false;
40505     }
40506 }
40507 
40508 /* Implement TARGET_MODES_TIEABLE_P.
40509 
40510    Return true if MODE1 is accessible in a register that can hold MODE2
40511    without copying.  That is, all register classes that can hold MODE2
40512    can also hold MODE1.  */
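/* For example (illustrative): HImode and SImode tie with each other; two
   16-byte vector modes such as V4SFmode and V2DImode tie because both are
   acceptable to the same SSE registers; and SFmode ties with XFmode, since
   every register class that can hold XFmode can also hold SFmode.  */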
40513 
40514 static bool
40515 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40516 {
40517   if (mode1 == mode2)
40518     return true;
40519 
40520   if (ix86_tieable_integer_mode_p (mode1)
40521       && ix86_tieable_integer_mode_p (mode2))
40522     return true;
40523 
40524   /* MODE2 being XFmode implies fp stack or general regs, which means we
40525      can tie any smaller floating point modes to it.  Note that we do not
40526      tie this with TFmode.  */
40527   if (mode2 == XFmode)
40528     return mode1 == SFmode || mode1 == DFmode;
40529 
40530   /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40531      that we can tie it with SFmode.  */
40532   if (mode2 == DFmode)
40533     return mode1 == SFmode;
40534 
40535   /* If MODE2 is only appropriate for an SSE register, then tie with
40536      any other mode acceptable to SSE registers.  */
40537   if (GET_MODE_SIZE (mode2) == 32
40538       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40539     return (GET_MODE_SIZE (mode1) == 32
40540 	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40541   if (GET_MODE_SIZE (mode2) == 16
40542       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40543     return (GET_MODE_SIZE (mode1) == 16
40544 	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40545 
40546   /* If MODE2 is appropriate for an MMX register, then tie
40547      with any other mode acceptable to MMX registers.  */
40548   if (GET_MODE_SIZE (mode2) == 8
40549       && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40550     return (GET_MODE_SIZE (mode1) == 8
40551 	    && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40552 
40553   return false;
40554 }
40555 
40556 /* Return the cost of moving between two registers of mode MODE.  */
40557 
40558 static int
40559 ix86_set_reg_reg_cost (machine_mode mode)
40560 {
40561   unsigned int units = UNITS_PER_WORD;
40562 
40563   switch (GET_MODE_CLASS (mode))
40564     {
40565     default:
40566       break;
40567 
40568     case MODE_CC:
40569       units = GET_MODE_SIZE (CCmode);
40570       break;
40571 
40572     case MODE_FLOAT:
40573       if ((TARGET_SSE && mode == TFmode)
40574 	  || (TARGET_80387 && mode == XFmode)
40575 	  || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40576 	  || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40577 	units = GET_MODE_SIZE (mode);
40578       break;
40579 
40580     case MODE_COMPLEX_FLOAT:
40581       if ((TARGET_SSE && mode == TCmode)
40582 	  || (TARGET_80387 && mode == XCmode)
40583 	  || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40584 	  || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40585 	units = GET_MODE_SIZE (mode);
40586       break;
40587 
40588     case MODE_VECTOR_INT:
40589     case MODE_VECTOR_FLOAT:
40590       if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40591 	  || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40592 	  || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40593 	  || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40594 	  || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40595 	units = GET_MODE_SIZE (mode);
40596     }
40597 
40598   /* Return the cost of moving between two registers of mode MODE,
40599      assuming that the move will be in pieces of at most UNITS bytes.  */
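  /* Illustrative example (assuming 64-bit, UNITS_PER_WORD == 8): a V8SFmode
     register copy costs COSTS_N_INSNS (1) when AVX is enabled (UNITS == 32
     above), but COSTS_N_INSNS (4) when the 32-byte value has to be moved in
     word-sized pieces, e.g. with only SSE2 enabled.  */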
40600   return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
40601 }
40602 
40603 /* Return the cost of a vector operation in MODE given that the scalar
40604    version has cost COST.  If PARALLEL is true, assume that the CPU has
40605    more than one unit performing the operation.  */
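/* Illustrative example: with TARGET_AVX128_OPTIMAL set, a parallel V4DFmode
   (256-bit) operation is charged twice the scalar COST, since the operation
   is assumed to be split into two 128-bit halves.  */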
40606 
40607 static int
40608 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
40609 {
40610   if (!VECTOR_MODE_P (mode))
40611     return cost;
40612 
40613   if (!parallel)
40614     return cost * GET_MODE_NUNITS (mode);
40615   if (GET_MODE_BITSIZE (mode) == 128
40616       && TARGET_SSE_SPLIT_REGS)
40617     return cost * 2;
40618   if (GET_MODE_BITSIZE (mode) > 128
40619       && TARGET_AVX128_OPTIMAL)
40620     return cost * GET_MODE_BITSIZE (mode) / 128;
40621   return cost;
40622 }
40623 
40624 /* Return cost of multiplication in MODE.  */
40625 
40626 static int
40627 ix86_multiplication_cost (const struct processor_costs *cost,
40628 			  enum machine_mode mode)
40629 {
40630   machine_mode inner_mode = mode;
40631   if (VECTOR_MODE_P (mode))
40632     inner_mode = GET_MODE_INNER (mode);
40633 
40634   if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40635     return inner_mode == DFmode ? cost->mulsd : cost->mulss;
40636   else if (X87_FLOAT_MODE_P (mode))
40637     return cost->fmul;
40638   else if (FLOAT_MODE_P (mode))
40639     return  ix86_vec_cost (mode,
40640 			   inner_mode == DFmode
40641 			   ? cost->mulsd : cost->mulss, true);
40642   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40643     {
40644       /* vpmullq is used in this case. No emulation is needed.  */
40645       if (TARGET_AVX512DQ)
40646 	return ix86_vec_cost (mode, cost->mulss, true);
40647 
40648       /* V*QImode is emulated with 7-13 insns.  */
40649       if (mode == V16QImode || mode == V32QImode)
40650 	{
40651 	  int extra = 11;
40652 	  if (TARGET_XOP && mode == V16QImode)
40653 	    extra = 5;
40654 	  else if (TARGET_SSSE3)
40655 	    extra = 6;
40656 	  return ix86_vec_cost (mode,
40657 				cost->mulss * 2 + cost->sse_op * extra,
40658 				true);
40659 	}
40660       /* V*DImode is emulated with 5-8 insns.  */
40661       else if (mode == V2DImode || mode == V4DImode)
40662 	{
40663 	  if (TARGET_XOP && mode == V2DImode)
40664 	    return ix86_vec_cost (mode,
40665 				  cost->mulss * 2 + cost->sse_op * 3,
40666 				  true);
40667 	  else
40668 	    return ix86_vec_cost (mode,
40669 				  cost->mulss * 3 + cost->sse_op * 5,
40670 				  true);
40671 	}
40672       /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40673 	 insns, including two PMULUDQ.  */
40674       else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40675 	return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
40676 				true);
40677       else
40678 	return ix86_vec_cost (mode, cost->mulss, true);
40679     }
40680   else
40681     return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
40682 }
40683 
40684 /* Return cost of multiplication in MODE.  */
40685 
40686 static int
40687 ix86_division_cost (const struct processor_costs *cost,
40688 			  enum machine_mode mode)
40689 {
40690   machine_mode inner_mode = mode;
40691   if (VECTOR_MODE_P (mode))
40692     inner_mode = GET_MODE_INNER (mode);
40693 
40694   if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40695     return inner_mode == DFmode ? cost->divsd : cost->divss;
40696   else if (X87_FLOAT_MODE_P (mode))
40697     return cost->fdiv;
40698   else if (FLOAT_MODE_P (mode))
40699     return ix86_vec_cost (mode,
40700 			    inner_mode == DFmode ? cost->divsd : cost->divss,
40701 			    true);
40702   else
40703     return cost->divide[MODE_INDEX (mode)];
40704 }
40705 
40706 /* Return cost of shift in MODE.
40707    If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
40708    AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE
40709    if op1 is a result of subreg.
40710 
40711    SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored.  */
40712 
40713 static int
40714 ix86_shift_rotate_cost (const struct processor_costs *cost,
40715 			enum machine_mode mode, bool constant_op1,
40716 			HOST_WIDE_INT op1_val,
40717 			bool speed,
40718 			bool and_in_op1,
40719 			bool shift_and_truncate,
40720 			bool *skip_op0, bool *skip_op1)
40721 {
40722   if (skip_op0)
40723     *skip_op0 = *skip_op1 = false;
40724   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40725     {
40726       /* V*QImode is emulated with 1-11 insns.  */
40727       if (mode == V16QImode || mode == V32QImode)
40728 	{
40729 	  int count = 11;
40730 	  if (TARGET_XOP && mode == V16QImode)
40731 	    {
40732 	      /* For XOP we use vpshab, which requires a broadcast of the
40733 		 value to the variable shift insn.  For constants this
40734 		 means a V16QImode constant in memory; even when we can perform
40735 		 the shift with one insn, set the cost so as to prefer paddb.  */
40736 	      if (constant_op1)
40737 		{
40738 		  if (skip_op1)
40739 		    *skip_op1 = true;
40740 		  return ix86_vec_cost (mode,
40741 			    cost->sse_op
40742 			    + (speed
40743 			       ? 2
40744 			       : COSTS_N_BYTES
40745 				 (GET_MODE_UNIT_SIZE (mode))), true);
40746 		}
40747 	      count = 3;
40748 	    }
40749 	  else if (TARGET_SSSE3)
40750 	    count = 7;
40751 	  return ix86_vec_cost (mode, cost->sse_op * count, true);
40752 	}
40753       else
40754 	return ix86_vec_cost (mode, cost->sse_op, true);
40755     }
40756   if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40757     {
40758       if (constant_op1)
40759 	{
40760 	  if (op1_val > 32)
40761 	    return cost->shift_const + COSTS_N_INSNS (2);
40762 	  else
40763 	    return cost->shift_const * 2;
40764 	}
40765       else
40766 	{
40767 	  if (and_in_op1)
40768 	    return cost->shift_var * 2;
40769 	  else
40770 	    return cost->shift_var * 6 + COSTS_N_INSNS (2);
40771 	}
40772     }
40773   else
40774     {
40775       if (constant_op1)
40776 	return cost->shift_const;
40777       else if (shift_and_truncate)
40778 	{
40779 	  if (skip_op0)
40780 	    *skip_op0 = *skip_op1 = true;
40781 	  /* Return the cost after shift-and truncation.  */
40782 	  return cost->shift_var;
40783 	}
40784       else
40785 	return cost->shift_var;
40786     }
40787   return cost->shift_const;
40788 }
40789 
40790 /* Compute a (partial) cost for rtx X.  Return true if the complete
40791    cost has been computed, and false if subexpressions should be
40792    scanned.  In either case, *TOTAL contains the cost result.  */
40793 
40794 static bool
40795 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40796 		int *total, bool speed)
40797 {
40798   rtx mask;
40799   enum rtx_code code = GET_CODE (x);
40800   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40801   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40802   int src_cost;
40803 
40804   switch (code)
40805     {
40806     case SET:
40807       if (register_operand (SET_DEST (x), VOIDmode)
40808 	  && reg_or_0_operand (SET_SRC (x), VOIDmode))
40809 	{
40810 	  *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40811 	  return true;
40812 	}
40813 
40814       if (register_operand (SET_SRC (x), VOIDmode))
40815 	/* Avoid potentially incorrect high cost from rtx_costs
40816 	   for non-tieable SUBREGs.  */
40817 	src_cost = 0;
40818       else
40819 	{
40820 	  src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40821 
40822 	  if (CONSTANT_P (SET_SRC (x)))
40823 	    /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40824 	       a small value, possibly zero for cheap constants.  */
40825 	    src_cost += COSTS_N_INSNS (1);
40826 	}
40827 
40828       *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40829       return true;
40830 
40831     case CONST_INT:
40832     case CONST:
40833     case LABEL_REF:
40834     case SYMBOL_REF:
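      /* These small values rank constants roughly by how hard they are to
	 materialize (illustrative): 3 for 64-bit constants that cannot be
	 used as immediates at all, 2 for those usable only as sign-extended
	 32-bit immediates, 1 for symbolic constants that need PIC address
	 arithmetic, and 0 for anything that can be used directly.  */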
40835       if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40836 	*total = 3;
40837       else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40838 	*total = 2;
40839       else if (flag_pic && SYMBOLIC_CONST (x)
40840 	       && !(TARGET_64BIT
40841 		    && (GET_CODE (x) == LABEL_REF
40842 			|| (GET_CODE (x) == SYMBOL_REF
40843 			    && SYMBOL_REF_LOCAL_P (x))))
40844 	       /* Use 0 cost for CONST to improve its propagation.  */
40845 	       && (TARGET_64BIT || GET_CODE (x) != CONST))
40846 	*total = 1;
40847       else
40848 	*total = 0;
40849       return true;
40850 
40851     case CONST_DOUBLE:
40852       if (IS_STACK_MODE (mode))
40853 	switch (standard_80387_constant_p (x))
40854 	  {
40855 	  case -1:
40856 	  case 0:
40857 	    break;
40858 	  case 1: /* 0.0 */
40859 	    *total = 1;
40860 	    return true;
40861 	  default: /* Other constants */
40862 	    *total = 2;
40863 	    return true;
40864 	  }
40865       /* FALLTHRU */
40866 
40867     case CONST_VECTOR:
40868       switch (standard_sse_constant_p (x, mode))
40869 	{
40870 	case 0:
40871 	  break;
40872 	case 1:  /* 0: xor eliminates false dependency */
40873 	  *total = 0;
40874 	  return true;
40875 	default: /* -1: cmp contains false dependency */
40876 	  *total = 1;
40877 	  return true;
40878 	}
40879       /* FALLTHRU */
40880 
40881     case CONST_WIDE_INT:
40882       /* Fall back to (MEM (SYMBOL_REF)), since that's where
40883 	 it'll probably end up.  Add a penalty for size.  */
40884       *total = (COSTS_N_INSNS (1)
40885 		+ (!TARGET_64BIT && flag_pic)
40886 		+ (GET_MODE_SIZE (mode) <= 4
40887 		   ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40888       return true;
40889 
40890     case ZERO_EXTEND:
40891       /* The zero extension is often completely free on x86_64, so make
40892 	 it as cheap as possible.  */
40893       if (TARGET_64BIT && mode == DImode
40894 	  && GET_MODE (XEXP (x, 0)) == SImode)
40895 	*total = 1;
40896       else if (TARGET_ZERO_EXTEND_WITH_AND)
40897 	*total = cost->add;
40898       else
40899 	*total = cost->movzx;
40900       return false;
40901 
40902     case SIGN_EXTEND:
40903       *total = cost->movsx;
40904       return false;
40905 
40906     case ASHIFT:
40907       if (SCALAR_INT_MODE_P (mode)
40908 	  && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40909 	  && CONST_INT_P (XEXP (x, 1)))
40910 	{
40911 	  HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40912 	  if (value == 1)
40913 	    {
40914 	      *total = cost->add;
40915 	      return false;
40916 	    }
40917 	  if ((value == 2 || value == 3)
40918 	      && cost->lea <= cost->shift_const)
40919 	    {
40920 	      *total = cost->lea;
40921 	      return false;
40922 	    }
40923 	}
40924       /* FALLTHRU */
40925 
40926     case ROTATE:
40927     case ASHIFTRT:
40928     case LSHIFTRT:
40929     case ROTATERT:
40930       bool skip_op0, skip_op1;
40931       *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
40932 				       CONST_INT_P (XEXP (x, 1))
40933 					 ? INTVAL (XEXP (x, 1)) : -1,
40934 				       speed,
40935 				       GET_CODE (XEXP (x, 1)) == AND,
40936 				       SUBREG_P (XEXP (x, 1))
40937 				       && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
40938 				       &skip_op0, &skip_op1);
40939       if (skip_op0 || skip_op1)
40940 	{
40941 	  if (!skip_op0)
40942 	    *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
40943 	  if (!skip_op1)
40944 	    *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
40945 	  return true;
40946 	}
40947       return false;
40948 
40949     case FMA:
40950       {
40951 	rtx sub;
40952 
40953         gcc_assert (FLOAT_MODE_P (mode));
40954         gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40955 
40956         *total = ix86_vec_cost (mode,
40957 				mode == SFmode ? cost->fmass : cost->fmasd,
40958 				true);
40959 	*total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40960 
40961         /* Negate in op0 or op2 is free: FMS, FNMA, FNMS.  */
40962 	sub = XEXP (x, 0);
40963 	if (GET_CODE (sub) == NEG)
40964 	  sub = XEXP (sub, 0);
40965 	*total += rtx_cost (sub, mode, FMA, 0, speed);
40966 
40967 	sub = XEXP (x, 2);
40968 	if (GET_CODE (sub) == NEG)
40969 	  sub = XEXP (sub, 0);
40970 	*total += rtx_cost (sub, mode, FMA, 2, speed);
40971 	return true;
40972       }
40973 
40974     case MULT:
40975       if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
40976 	{
40977 	  rtx op0 = XEXP (x, 0);
40978 	  rtx op1 = XEXP (x, 1);
40979 	  int nbits;
40980 	  if (CONST_INT_P (XEXP (x, 1)))
40981 	    {
40982 	      unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
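	      /* Count the set bits in the multiplier (Kernighan's method:
		 each iteration clears the lowest set bit), used below as a
		 rough per-bit measure of the multiplication's work.  */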
40983 	      for (nbits = 0; value != 0; value &= value - 1)
40984 	        nbits++;
40985 	    }
40986 	  else
40987 	    /* This is arbitrary.  */
40988 	    nbits = 7;
40989 
40990 	  /* Compute costs correctly for widening multiplication.  */
40991 	  if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40992 	      && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40993 	         == GET_MODE_SIZE (mode))
40994 	    {
40995 	      int is_mulwiden = 0;
40996 	      machine_mode inner_mode = GET_MODE (op0);
40997 
40998 	      if (GET_CODE (op0) == GET_CODE (op1))
40999 		is_mulwiden = 1, op1 = XEXP (op1, 0);
41000 	      else if (CONST_INT_P (op1))
41001 		{
41002 		  if (GET_CODE (op0) == SIGN_EXTEND)
41003 		    is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41004 			          == INTVAL (op1);
41005 		  else
41006 		    is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41007 	        }
41008 
41009 	      if (is_mulwiden)
41010 	        op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41011 	    }
41012 
41013   	  *total = (cost->mult_init[MODE_INDEX (mode)]
41014 		    + nbits * cost->mult_bit
41015 	            + rtx_cost (op0, mode, outer_code, opno, speed)
41016 		    + rtx_cost (op1, mode, outer_code, opno, speed));
41017 
41018           return true;
41019 	}
41020       *total = ix86_multiplication_cost (cost, mode);
41021       return false;
41022 
41023     case DIV:
41024     case UDIV:
41025     case MOD:
41026     case UMOD:
41027       *total = ix86_division_cost (cost, mode);
41028       return false;
41029 
41030     case PLUS:
41031       if (GET_MODE_CLASS (mode) == MODE_INT
41032 	  && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41033 	{
41034 	  if (GET_CODE (XEXP (x, 0)) == PLUS
41035 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41036 	      && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41037 	      && CONSTANT_P (XEXP (x, 1)))
41038 	    {
41039 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41040 	      if (val == 2 || val == 4 || val == 8)
41041 		{
41042 		  *total = cost->lea;
41043 		  *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41044 				      outer_code, opno, speed);
41045 		  *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41046 				      outer_code, opno, speed);
41047 		  *total += rtx_cost (XEXP (x, 1), mode,
41048 				      outer_code, opno, speed);
41049 		  return true;
41050 		}
41051 	    }
41052 	  else if (GET_CODE (XEXP (x, 0)) == MULT
41053 		   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41054 	    {
41055 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41056 	      if (val == 2 || val == 4 || val == 8)
41057 		{
41058 		  *total = cost->lea;
41059 		  *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41060 				      outer_code, opno, speed);
41061 		  *total += rtx_cost (XEXP (x, 1), mode,
41062 				      outer_code, opno, speed);
41063 		  return true;
41064 		}
41065 	    }
41066 	  else if (GET_CODE (XEXP (x, 0)) == PLUS)
41067 	    {
41068 	      /* Add with carry, ignore the cost of adding a carry flag.  */
41069 	      if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41070 		*total = cost->add;
41071 	      else
41072 		{
41073 		  *total = cost->lea;
41074 		  *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41075 				      outer_code, opno, speed);
41076 		}
41077 
41078 	      *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41079 				  outer_code, opno, speed);
41080 	      *total += rtx_cost (XEXP (x, 1), mode,
41081 				  outer_code, opno, speed);
41082 	      return true;
41083 	    }
41084 	}
41085       /* FALLTHRU */
41086 
41087     case MINUS:
41088       /* Subtract with borrow, ignore the cost of subtracting a carry flag.  */
41089       if (GET_MODE_CLASS (mode) == MODE_INT
41090 	  && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41091 	  && GET_CODE (XEXP (x, 0)) == MINUS
41092 	  && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41093 	{
41094 	  *total = cost->add;
41095 	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41096 			      outer_code, opno, speed);
41097 	  *total += rtx_cost (XEXP (x, 1), mode,
41098 			      outer_code, opno, speed);
41099 	  return true;
41100 	}
41101 
41102       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41103 	{
41104 	  *total = cost->addss;
41105 	  return false;
41106 	}
41107       else if (X87_FLOAT_MODE_P (mode))
41108 	{
41109 	  *total = cost->fadd;
41110 	  return false;
41111 	}
41112       else if (FLOAT_MODE_P (mode))
41113 	{
41114 	  *total = ix86_vec_cost (mode, cost->addss, true);
41115 	  return false;
41116 	}
41117       /* FALLTHRU */
41118 
41119     case AND:
41120     case IOR:
41121     case XOR:
41122       if (GET_MODE_CLASS (mode) == MODE_INT
41123 	  && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41124 	{
41125 	  *total = (cost->add * 2
41126 		    + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41127 		       << (GET_MODE (XEXP (x, 0)) != DImode))
41128 		    + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41129 	               << (GET_MODE (XEXP (x, 1)) != DImode)));
41130 	  return true;
41131 	}
41132       /* FALLTHRU */
41133 
41134     case NEG:
41135       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41136 	{
41137 	  *total = cost->sse_op;
41138 	  return false;
41139 	}
41140       else if (X87_FLOAT_MODE_P (mode))
41141 	{
41142 	  *total = cost->fchs;
41143 	  return false;
41144 	}
41145       else if (FLOAT_MODE_P (mode))
41146 	{
41147 	  *total = ix86_vec_cost (mode, cost->sse_op, true);
41148 	  return false;
41149 	}
41150       /* FALLTHRU */
41151 
41152     case NOT:
41153       if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41154 	*total = ix86_vec_cost (mode, cost->sse_op, true);
41155       else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41156 	*total = cost->add * 2;
41157       else
41158 	*total = cost->add;
41159       return false;
41160 
41161     case COMPARE:
41162       if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41163 	  && XEXP (XEXP (x, 0), 1) == const1_rtx
41164 	  && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41165 	  && XEXP (x, 1) == const0_rtx)
41166 	{
41167 	  /* This kind of construct is implemented using test[bwl].
41168 	     Treat it as if we had an AND.  */
41169 	  mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41170 	  *total = (cost->add
41171 		    + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41172 				opno, speed)
41173 		    + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41174 	  return true;
41175 	}
41176 
41177       /* The embedded comparison operand is completely free.  */
41178       if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41179 	  && XEXP (x, 1) == const0_rtx)
41180 	*total = 0;
41181 
41182       return false;
41183 
41184     case FLOAT_EXTEND:
41185       if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41186 	*total = 0;
41187       else
41188         *total = ix86_vec_cost (mode, cost->addss, true);
41189       return false;
41190 
41191     case FLOAT_TRUNCATE:
41192       if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41193 	*total = cost->fadd;
41194       else
41195         *total = ix86_vec_cost (mode, cost->addss, true);
41196       return false;
41197 
41198     case ABS:
41199       /* SSE requires memory load for the constant operand. It may make
41200 	 sense to account for this.  Of course the constant operand may or
41201 	 may not be reused. */
41202       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41203 	*total = cost->sse_op;
41204       else if (X87_FLOAT_MODE_P (mode))
41205 	*total = cost->fabs;
41206       else if (FLOAT_MODE_P (mode))
41207 	*total = ix86_vec_cost (mode, cost->sse_op, true);
41208       return false;
41209 
41210     case SQRT:
41211       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41212 	*total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
41213       else if (X87_FLOAT_MODE_P (mode))
41214 	*total = cost->fsqrt;
41215       else if (FLOAT_MODE_P (mode))
41216 	*total = ix86_vec_cost (mode,
41217 				mode == SFmode ? cost->sqrtss : cost->sqrtsd,
41218 				true);
41219       return false;
41220 
41221     case UNSPEC:
41222       if (XINT (x, 1) == UNSPEC_TP)
41223 	*total = 0;
41224       return false;
41225 
41226     case VEC_SELECT:
41227     case VEC_CONCAT:
41228     case VEC_DUPLICATE:
41229       /* ??? Assume all of these vector manipulation patterns are
41230 	 recognizable.  In which case they all pretty much have the
41231 	 same cost.  */
41232      *total = cost->sse_op;
41233      return true;
41234     case VEC_MERGE:
41235       mask = XEXP (x, 2);
41236       /* This is masked instruction, assume the same cost,
41237 	 as nonmasked variant.  */
41238       if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41239 	*total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41240       else
41241 	*total = cost->sse_op;
41242       return true;
41243 
41244     default:
41245       return false;
41246     }
41247 }
41248 
41249 #if TARGET_MACHO
41250 
41251 static int current_machopic_label_num;
41252 
41253 /* Given a symbol name and its associated stub, write out the
41254    definition of the stub.  */
41255 
41256 void
41257 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41258 {
41259   unsigned int length;
41260   char *binder_name, *symbol_name, lazy_ptr_name[32];
41261   int label = ++current_machopic_label_num;
41262 
41263   /* For 64-bit we shouldn't get here.  */
41264   gcc_assert (!TARGET_64BIT);
41265 
41266   /* Lose our funky encoding stuff so it doesn't contaminate the stub.  */
41267   symb = targetm.strip_name_encoding (symb);
41268 
41269   length = strlen (stub);
41270   binder_name = XALLOCAVEC (char, length + 32);
41271   GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41272 
41273   length = strlen (symb);
41274   symbol_name = XALLOCAVEC (char, length + 32);
41275   GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41276 
41277   sprintf (lazy_ptr_name, "L%d$lz", label);
41278 
41279   if (MACHOPIC_ATT_STUB)
41280     switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41281   else if (MACHOPIC_PURE)
41282     switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41283   else
41284     switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41285 
41286   fprintf (file, "%s:\n", stub);
41287   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41288 
41289   if (MACHOPIC_ATT_STUB)
41290     {
41291       fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41292     }
41293   else if (MACHOPIC_PURE)
41294     {
41295       /* PIC stub.  */
41296       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
41297       rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41298       output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
41299       fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41300 	       label, lazy_ptr_name, label);
41301       fprintf (file, "\tjmp\t*%%ecx\n");
41302     }
41303   else
41304     fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41305 
41306   /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41307      it needs no stub-binding-helper.  */
41308   if (MACHOPIC_ATT_STUB)
41309     return;
41310 
41311   fprintf (file, "%s:\n", binder_name);
41312 
41313   if (MACHOPIC_PURE)
41314     {
41315       fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41316       fprintf (file, "\tpushl\t%%ecx\n");
41317     }
41318   else
41319     fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41320 
41321   fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41322 
41323   /* N.B. Keep the correspondence of these
41324      'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41325      old-pic/new-pic/non-pic stubs; altering this will break
41326      compatibility with existing dylibs.  */
41327   if (MACHOPIC_PURE)
41328     {
41329       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
41330       switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41331     }
41332   else
41333     /* 16-byte -mdynamic-no-pic stub.  */
41334     switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41335 
41336   fprintf (file, "%s:\n", lazy_ptr_name);
41337   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41338   fprintf (file, ASM_LONG "%s\n", binder_name);
41339 }
41340 #endif /* TARGET_MACHO */
41341 
41342 /* Order the registers for register allocator.  */
41343 
41344 void
41345 x86_order_regs_for_local_alloc (void)
41346 {
41347    int pos = 0;
41348    int i;
41349 
41350    /* First allocate the local general purpose registers.  */
41351    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41352      if (GENERAL_REGNO_P (i) && call_used_regs[i])
41353 	reg_alloc_order [pos++] = i;
41354 
41355    /* Global general purpose registers.  */
41356    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41357      if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41358 	reg_alloc_order [pos++] = i;
41359 
41360    /* x87 registers come first in case we are doing FP math
41361       using them.  */
41362    if (!TARGET_SSE_MATH)
41363      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41364        reg_alloc_order [pos++] = i;
41365 
41366    /* SSE registers.  */
41367    for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41368      reg_alloc_order [pos++] = i;
41369    for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41370      reg_alloc_order [pos++] = i;
41371 
41372    /* Extended REX SSE registers.  */
41373    for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41374      reg_alloc_order [pos++] = i;
41375 
41376    /* Mask register.  */
41377    for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41378      reg_alloc_order [pos++] = i;
41379 
41380    /* MPX bound registers.  */
41381    for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41382      reg_alloc_order [pos++] = i;
41383 
41384    /* x87 registers.  */
41385    if (TARGET_SSE_MATH)
41386      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41387        reg_alloc_order [pos++] = i;
41388 
41389    for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41390      reg_alloc_order [pos++] = i;
41391 
41392    /* Initialize the rest of array as we do not allocate some registers
41393       at all.  */
41394    while (pos < FIRST_PSEUDO_REGISTER)
41395      reg_alloc_order [pos++] = 0;
41396 }
41397 
41398 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41399    in struct attribute_spec handler.  */
41400 static tree
41401 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
41402 					 bool *no_add_attrs)
41403 {
41404   if (TREE_CODE (*node) != FUNCTION_TYPE
41405       && TREE_CODE (*node) != METHOD_TYPE
41406       && TREE_CODE (*node) != FIELD_DECL
41407       && TREE_CODE (*node) != TYPE_DECL)
41408     {
41409       warning (OPT_Wattributes, "%qE attribute only applies to functions",
41410 	       name);
41411       *no_add_attrs = true;
41412       return NULL_TREE;
41413     }
41414   if (TARGET_64BIT)
41415     {
41416       warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41417 	       name);
41418       *no_add_attrs = true;
41419       return NULL_TREE;
41420     }
41421   if (is_attribute_p ("callee_pop_aggregate_return", name))
41422     {
41423       tree cst;
41424 
41425       cst = TREE_VALUE (args);
41426       if (TREE_CODE (cst) != INTEGER_CST)
41427 	{
41428 	  warning (OPT_Wattributes,
41429 		   "%qE attribute requires an integer constant argument",
41430 		   name);
41431 	  *no_add_attrs = true;
41432 	}
41433       else if (compare_tree_int (cst, 0) != 0
41434 	       && compare_tree_int (cst, 1) != 0)
41435 	{
41436 	  warning (OPT_Wattributes,
41437 		   "argument to %qE attribute is neither zero nor one",
41438 		   name);
41439 	  *no_add_attrs = true;
41440 	}
41441 
41442       return NULL_TREE;
41443     }
41444 
41445   return NULL_TREE;
41446 }
41447 
41448 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
41449    struct attribute_spec.handler.  */
41450 static tree
41451 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41452 			   bool *no_add_attrs)
41453 {
41454   if (TREE_CODE (*node) != FUNCTION_TYPE
41455       && TREE_CODE (*node) != METHOD_TYPE
41456       && TREE_CODE (*node) != FIELD_DECL
41457       && TREE_CODE (*node) != TYPE_DECL)
41458     {
41459       warning (OPT_Wattributes, "%qE attribute only applies to functions",
41460 	       name);
41461       *no_add_attrs = true;
41462       return NULL_TREE;
41463     }
41464 
41465   /* Can combine regparm with all attributes but fastcall.  */
41466   if (is_attribute_p ("ms_abi", name))
41467     {
41468       if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41469         {
41470 	  error ("ms_abi and sysv_abi attributes are not compatible");
41471 	}
41472 
41473       return NULL_TREE;
41474     }
41475   else if (is_attribute_p ("sysv_abi", name))
41476     {
41477       if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41478         {
41479 	  error ("ms_abi and sysv_abi attributes are not compatible");
41480 	}
41481 
41482       return NULL_TREE;
41483     }
41484 
41485   return NULL_TREE;
41486 }
41487 
41488 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41489    struct attribute_spec.handler.  */
41490 static tree
41491 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41492 			      bool *no_add_attrs)
41493 {
41494   tree *type = NULL;
41495   if (DECL_P (*node))
41496     {
41497       if (TREE_CODE (*node) == TYPE_DECL)
41498 	type = &TREE_TYPE (*node);
41499     }
41500   else
41501     type = node;
41502 
41503   if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41504     {
41505       warning (OPT_Wattributes, "%qE attribute ignored",
41506 	       name);
41507       *no_add_attrs = true;
41508     }
41509 
41510   else if ((is_attribute_p ("ms_struct", name)
41511 	    && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41512 	   || ((is_attribute_p ("gcc_struct", name)
41513 		&& lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41514     {
41515       warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41516                name);
41517       *no_add_attrs = true;
41518     }
41519 
41520   return NULL_TREE;
41521 }
41522 
41523 static tree
41524 ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
41525 			      bool *no_add_attrs)
41526 {
41527   if (TREE_CODE (*node) != FUNCTION_DECL)
41528     {
41529       warning (OPT_Wattributes, "%qE attribute only applies to functions",
41530                name);
41531       *no_add_attrs = true;
41532     }
41533 
41534   if (is_attribute_p ("indirect_branch", name))
41535     {
41536       tree cst = TREE_VALUE (args);
41537       if (TREE_CODE (cst) != STRING_CST)
41538 	{
41539 	  warning (OPT_Wattributes,
41540 		   "%qE attribute requires a string constant argument",
41541 		   name);
41542 	  *no_add_attrs = true;
41543 	}
41544       else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41545 	       && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41546 	       && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41547 	       && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41548 	{
41549 	  warning (OPT_Wattributes,
41550 		   "argument to %qE attribute is not "
41551 		   "(keep|thunk|thunk-inline|thunk-extern)", name);
41552 	  *no_add_attrs = true;
41553 	}
41554     }
41555 
41556   if (is_attribute_p ("function_return", name))
41557     {
41558       tree cst = TREE_VALUE (args);
41559       if (TREE_CODE (cst) != STRING_CST)
41560 	{
41561 	  warning (OPT_Wattributes,
41562 		   "%qE attribute requires a string constant argument",
41563 		   name);
41564 	  *no_add_attrs = true;
41565 	}
41566       else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41567 	       && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41568 	       && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41569 	       && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41570 	{
41571 	  warning (OPT_Wattributes,
41572 		   "argument to %qE attribute is not "
41573 		   "(keep|thunk|thunk-inline|thunk-extern)", name);
41574 	  *no_add_attrs = true;
41575 	}
41576     }
41577 
41578   return NULL_TREE;
41579 }
41580 
41581 static tree
41582 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41583 						 int, bool *)
41584 {
41585   return NULL_TREE;
41586 }
41587 
41588 static tree
41589 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41590 {
41591   /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
41592      but the function type contains args and return type data.  */
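  /* In outline, the checks below accept handlers of the form (illustrative;
     the pointer type is not checked beyond being a pointer):

	void handler (struct interrupt_frame *frame);
	void handler (struct interrupt_frame *frame, uword_t error_code);

     where the optional second argument must be an integer of word_mode.  */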
41593   tree func_type = *node;
41594   tree return_type = TREE_TYPE (func_type);
41595 
41596   int nargs = 0;
41597   tree current_arg_type = TYPE_ARG_TYPES (func_type);
41598   while (current_arg_type
41599 	 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41600     {
41601       if (nargs == 0)
41602 	{
41603 	  if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41604 	    error ("interrupt service routine should have a pointer "
41605 		   "as the first argument");
41606 	}
41607       else if (nargs == 1)
41608 	{
41609 	  if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41610 	      || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41611 	    error ("interrupt service routine should have unsigned %s"
41612 		   "int as the second argument",
41613 		   TARGET_64BIT
41614 		   ? (TARGET_X32 ? "long long " : "long ")
41615 		   : "");
41616 	}
41617       nargs++;
41618       current_arg_type = TREE_CHAIN (current_arg_type);
41619     }
41620   if (!nargs || nargs > 2)
41621     error ("interrupt service routine can only have a pointer argument "
41622 	   "and an optional integer argument");
41623   if (! VOID_TYPE_P (return_type))
41624     error ("interrupt service routine can't have non-void return value");
41625 
41626   return NULL_TREE;
41627 }
41628 
41629 static bool
41630 ix86_ms_bitfield_layout_p (const_tree record_type)
41631 {
41632   return ((TARGET_MS_BITFIELD_LAYOUT
41633 	   && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41634           || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41635 }
41636 
41637 /* Returns an expression indicating where the this parameter is
41638    located on entry to the FUNCTION.  */
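/* For instance (illustrative): under the 64-bit SysV ABI this is the first
   integer parameter register, or the second one when a hidden aggregate
   return pointer occupies the first; for 32-bit fastcall it is %ecx (or
   %edx when such a hidden pointer is passed); otherwise it usually lives
   on the stack just above the return address.  */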
41639 
41640 static rtx
41641 x86_this_parameter (tree function)
41642 {
41643   tree type = TREE_TYPE (function);
41644   bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41645   int nregs;
41646 
41647   if (TARGET_64BIT)
41648     {
41649       const int *parm_regs;
41650 
41651       if (ix86_function_type_abi (type) == MS_ABI)
41652         parm_regs = x86_64_ms_abi_int_parameter_registers;
41653       else
41654         parm_regs = x86_64_int_parameter_registers;
41655       return gen_rtx_REG (Pmode, parm_regs[aggr]);
41656     }
41657 
41658   nregs = ix86_function_regparm (type, function);
41659 
41660   if (nregs > 0 && !stdarg_p (type))
41661     {
41662       int regno;
41663       unsigned int ccvt = ix86_get_callcvt (type);
41664 
41665       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41666 	regno = aggr ? DX_REG : CX_REG;
41667       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41668         {
41669 	  regno = CX_REG;
41670 	  if (aggr)
41671 	    return gen_rtx_MEM (SImode,
41672 				plus_constant (Pmode, stack_pointer_rtx, 4));
41673 	}
41674       else
41675         {
41676 	  regno = AX_REG;
41677 	  if (aggr)
41678 	    {
41679 	      regno = DX_REG;
41680 	      if (nregs == 1)
41681 		return gen_rtx_MEM (SImode,
41682 				    plus_constant (Pmode,
41683 						   stack_pointer_rtx, 4));
41684 	    }
41685 	}
41686       return gen_rtx_REG (SImode, regno);
41687     }
41688 
41689   return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41690 					     aggr ? 8 : 4));
41691 }
41692 
41693 /* Determine whether x86_output_mi_thunk can succeed.  */
41694 
41695 static bool
41696 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41697 			 const_tree function)
41698 {
41699   /* 64-bit can handle anything.  */
41700   if (TARGET_64BIT)
41701     return true;
41702 
41703   /* For 32-bit, everything's fine if we have one free register.  */
41704   if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41705     return true;
41706 
41707   /* Need a free register for vcall_offset.  */
41708   if (vcall_offset)
41709     return false;
41710 
41711   /* Need a free register for GOT references.  */
41712   if (flag_pic && !targetm.binds_local_p (function))
41713     return false;
41714 
41715   /* Otherwise ok.  */
41716   return true;
41717 }
41718 
41719 /* Output the assembler code for a thunk function.  THUNK_DECL is the
41720    declaration for the thunk function itself, FUNCTION is the decl for
41721    the target function.  DELTA is an immediate constant offset to be
41722    added to THIS.  If VCALL_OFFSET is nonzero, the word at
41723    *(*this + vcall_offset) should be added to THIS.  */
41724 
41725 static void
41726 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41727 		     HOST_WIDE_INT vcall_offset, tree function)
41728 {
41729   rtx this_param = x86_this_parameter (function);
41730   rtx this_reg, tmp, fnaddr;
41731   unsigned int tmp_regno;
41732   rtx_insn *insn;
41733 
41734   if (TARGET_64BIT)
41735     tmp_regno = R10_REG;
41736   else
41737     {
41738       unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41739       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41740 	tmp_regno = AX_REG;
41741       else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41742 	tmp_regno = DX_REG;
41743       else
41744 	tmp_regno = CX_REG;
41745     }
41746 
41747   emit_note (NOTE_INSN_PROLOGUE_END);
41748 
41749   /* If CET is enabled, insert an ENDBR instruction.  */
41750   if ((flag_cf_protection & CF_BRANCH))
41751     emit_insn (gen_nop_endbr ());
41752 
41753   /* If VCALL_OFFSET, we'll need THIS in a register.  Might as well
41754      pull it in now and let DELTA benefit.  */
41755   if (REG_P (this_param))
41756     this_reg = this_param;
41757   else if (vcall_offset)
41758     {
41759       /* Put the this parameter into %eax.  */
41760       this_reg = gen_rtx_REG (Pmode, AX_REG);
41761       emit_move_insn (this_reg, this_param);
41762     }
41763   else
41764     this_reg = NULL_RTX;
41765 
41766   /* Adjust the this parameter by a fixed constant.  */
41767   if (delta)
41768     {
41769       rtx delta_rtx = GEN_INT (delta);
41770       rtx delta_dst = this_reg ? this_reg : this_param;
41771 
41772       if (TARGET_64BIT)
41773 	{
41774 	  if (!x86_64_general_operand (delta_rtx, Pmode))
41775 	    {
41776 	      tmp = gen_rtx_REG (Pmode, tmp_regno);
41777 	      emit_move_insn (tmp, delta_rtx);
41778 	      delta_rtx = tmp;
41779 	    }
41780 	}
41781 
41782       ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41783     }
41784 
41785   /* Adjust the this parameter by a value stored in the vtable.  */
41786   if (vcall_offset)
41787     {
41788       rtx vcall_addr, vcall_mem, this_mem;
41789 
41790       tmp = gen_rtx_REG (Pmode, tmp_regno);
41791 
41792       this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41793       if (Pmode != ptr_mode)
41794 	this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41795       emit_move_insn (tmp, this_mem);
41796 
41797       /* Adjust the this parameter.  */
41798       vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41799       if (TARGET_64BIT
41800 	  && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41801 	{
41802 	  rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41803 	  emit_move_insn (tmp2, GEN_INT (vcall_offset));
41804 	  vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41805 	}
41806 
41807       vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41808       if (Pmode != ptr_mode)
41809 	emit_insn (gen_addsi_1_zext (this_reg,
41810 				     gen_rtx_REG (ptr_mode,
41811 						  REGNO (this_reg)),
41812 				     vcall_mem));
41813       else
41814 	ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41815     }
41816 
41817   /* If necessary, drop THIS back to its stack slot.  */
41818   if (this_reg && this_reg != this_param)
41819     emit_move_insn (this_param, this_reg);
41820 
41821   fnaddr = XEXP (DECL_RTL (function), 0);
41822   if (TARGET_64BIT)
41823     {
41824       if (!flag_pic || targetm.binds_local_p (function)
41825 	  || TARGET_PECOFF)
41826 	;
41827       else
41828 	{
41829 	  tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41830 	  tmp = gen_rtx_CONST (Pmode, tmp);
41831 	  fnaddr = gen_const_mem (Pmode, tmp);
41832 	}
41833     }
41834   else
41835     {
41836       if (!flag_pic || targetm.binds_local_p (function))
41837 	;
41838 #if TARGET_MACHO
41839       else if (TARGET_MACHO)
41840 	{
41841 	  fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41842 	  fnaddr = XEXP (fnaddr, 0);
41843 	}
41844 #endif /* TARGET_MACHO */
41845       else
41846 	{
41847 	  tmp = gen_rtx_REG (Pmode, CX_REG);
41848 	  output_set_got (tmp, NULL_RTX);
41849 
41850 	  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41851 	  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41852 	  fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41853 	  fnaddr = gen_const_mem (Pmode, fnaddr);
41854 	}
41855     }
41856 
41857   /* Our sibling call patterns do not allow memories, because we have no
41858      predicate that can distinguish between frame and non-frame memory.
41859      For our purposes here, we can get away with (ab)using a jump pattern,
41860      because we're going to do no optimization.  */
41861   if (MEM_P (fnaddr))
41862     {
41863       if (sibcall_insn_operand (fnaddr, word_mode))
41864 	{
41865 	  fnaddr = XEXP (DECL_RTL (function), 0);
41866 	  tmp = gen_rtx_MEM (QImode, fnaddr);
41867 	  tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41868 	  tmp = emit_call_insn (tmp);
41869 	  SIBLING_CALL_P (tmp) = 1;
41870 	}
41871       else
41872 	emit_jump_insn (gen_indirect_jump (fnaddr));
41873     }
41874   else
41875     {
41876       if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41877 	{
41878 	  // CM_LARGE_PIC always uses pseudo PIC register which is
41879 	  // uninitialized.  Since FUNCTION is local and calling it
41880 	  // doesn't go through PLT, we use scratch register %r11 as
41881 	  // PIC register and initialize it here.
41882 	  pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41883 	  ix86_init_large_pic_reg (tmp_regno);
41884 	  fnaddr = legitimize_pic_address (fnaddr,
41885 					   gen_rtx_REG (Pmode, tmp_regno));
41886 	}
41887 
41888       if (!sibcall_insn_operand (fnaddr, word_mode))
41889 	{
41890 	  tmp = gen_rtx_REG (word_mode, tmp_regno);
41891 	  if (GET_MODE (fnaddr) != word_mode)
41892 	    fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41893 	  emit_move_insn (tmp, fnaddr);
41894 	  fnaddr = tmp;
41895 	}
41896 
41897       tmp = gen_rtx_MEM (QImode, fnaddr);
41898       tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41899       tmp = emit_call_insn (tmp);
41900       SIBLING_CALL_P (tmp) = 1;
41901     }
41902   emit_barrier ();
41903 
41904   /* Emit just enough of rest_of_compilation to get the insns emitted.
41905      Note that use_thunk calls assemble_start_function et al.  */
41906   insn = get_insns ();
41907   shorten_branches (insn);
41908   final_start_function (insn, file, 1);
41909   final (insn, file, 1);
41910   final_end_function ();
41911 }
41912 
41913 static void
41914 x86_file_start (void)
41915 {
41916   default_file_start ();
41917   if (TARGET_16BIT)
41918     fputs ("\t.code16gcc\n", asm_out_file);
41919 #if TARGET_MACHO
41920   darwin_file_start ();
41921 #endif
41922   if (X86_FILE_START_VERSION_DIRECTIVE)
41923     fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41924   if (X86_FILE_START_FLTUSED)
41925     fputs ("\t.global\t__fltused\n", asm_out_file);
41926   if (ix86_asm_dialect == ASM_INTEL)
41927     fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41928 }
41929 
41930 int
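/* Return the alignment, in bits, to use for a field of TYPE whose natural
   alignment is COMPUTED.  On ia32 without -malign-double this caps integer,
   DFmode and DCmode fields at 32-bit alignment (so, for example, a double
   member of a struct is only 4-byte aligned).  */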
41931 x86_field_alignment (tree type, int computed)
41932 {
41933   machine_mode mode;
41934 
41935   if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41936     return computed;
41937   if (TARGET_IAMCU)
41938     return iamcu_alignment (type, computed);
41939   mode = TYPE_MODE (strip_array_types (type));
41940   if (mode == DFmode || mode == DCmode
41941       || GET_MODE_CLASS (mode) == MODE_INT
41942       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41943     return MIN (32, computed);
41944   return computed;
41945 }
41946 
41947 /* Print call to TARGET to FILE.  */
41948 
41949 static void
41950 x86_print_call_or_nop (FILE *file, const char *target)
41951 {
41952   if (flag_nop_mcount)
41953     /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
41954     fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
41955   else
41956     fprintf (file, "1:\tcall\t%s\n", target);
41957 }
41958 
41959 /* Output assembler code to FILE to increment profiler label # LABELNO
41960    for profiling a function entry.  */
41961 void
41962 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41963 {
41964   const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41965 					 : MCOUNT_NAME);
41966 
41967   if (cfun->machine->endbr_queued_at_entrance)
41968     fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32");
41969 
41970   if (TARGET_64BIT)
41971     {
41972 #ifndef NO_PROFILE_COUNTERS
41973       fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41974 #endif
41975 
41976       if (!TARGET_PECOFF && flag_pic)
41977 	fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41978       else
41979 	x86_print_call_or_nop (file, mcount_name);
41980     }
41981   else if (flag_pic)
41982     {
41983 #ifndef NO_PROFILE_COUNTERS
41984       fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41985 	       LPREFIX, labelno);
41986 #endif
41987       fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41988     }
41989   else
41990     {
41991 #ifndef NO_PROFILE_COUNTERS
41992       fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41993 	       LPREFIX, labelno);
41994 #endif
41995       x86_print_call_or_nop (file, mcount_name);
41996     }
41997 
41998   if (flag_record_mcount)
41999     {
42000       fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42001       fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42002       fprintf (file, "\t.previous\n");
42003     }
42004 }
42005 
42006 /* We don't have exact information about the insn sizes, but we may assume
42007    quite safely that we are informed about all 1 byte insns and memory
42008    address sizes.  This is enough to eliminate unnecessary padding in
42009    99% of cases.  */
42010 
42011 int
42012 ix86_min_insn_size (rtx_insn *insn)
42013 {
42014   int l = 0, len;
42015 
42016   if (!INSN_P (insn) || !active_insn_p (insn))
42017     return 0;
42018 
42019   /* Discard alignments we've emitted and jump instructions.  */
42020   if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42021       && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42022     return 0;
42023 
42024   /* Important case - calls are always 5 bytes.
42025      It is common to have many calls in a row.  */
42026   if (CALL_P (insn)
42027       && symbolic_reference_mentioned_p (PATTERN (insn))
42028       && !SIBLING_CALL_P (insn))
42029     return 5;
42030   len = get_attr_length (insn);
42031   if (len <= 1)
42032     return 1;
42033 
42034   /* For normal instructions we rely on get_attr_length being exact,
42035      with a few exceptions.  */
42036   if (!JUMP_P (insn))
42037     {
42038       enum attr_type type = get_attr_type (insn);
42039 
42040       switch (type)
42041 	{
42042 	case TYPE_MULTI:
42043 	  if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42044 	      || asm_noperands (PATTERN (insn)) >= 0)
42045 	    return 0;
42046 	  break;
42047 	case TYPE_OTHER:
42048 	case TYPE_FCMP:
42049 	  break;
42050 	default:
42051 	  /* Otherwise trust get_attr_length.  */
42052 	  return len;
42053 	}
42054 
42055       l = get_attr_length_address (insn);
42056       if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42057 	l = 4;
42058     }
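  /* Conservative lower bound (illustrative reasoning): at least one opcode
     byte plus the address bytes when the address length is known, otherwise
     assume two bytes.  */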
42059   if (l)
42060     return 1+l;
42061   else
42062     return 2;
42063 }
42064 
42065 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42066 
42067 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
42068    window.  */
42069 
42070 static void
42071 ix86_avoid_jump_mispredicts (void)
42072 {
42073   rtx_insn *insn, *start = get_insns ();
42074   int nbytes = 0, njumps = 0;
42075   bool isjump = false;
42076 
42077   /* Look for all minimal intervals of instructions containing 4 jumps.
42078      The intervals are bounded by START and INSN.  NBYTES is the total
42079      size of instructions in the interval including INSN and not including
42080      START.  When the NBYTES is smaller than 16 bytes, it is possible
42081      that the end of START and INSN ends up in the same 16byte page.
42082 
42083      The smallest offset in the page INSN can start is the case where START
42084      ends on the offset 0.  Offset of INSN is then NBYTES - sizeof (INSN).
42085      We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
42086 
42087      Don't consider asm goto as jump, while it can contain a jump, it doesn't
42088      have to, control transfer to label(s) can be performed through other
42089      means, and also we estimate minimum length of all asm stmts as 0.  */
42090   for (insn = start; insn; insn = NEXT_INSN (insn))
42091     {
42092       int min_size;
42093 
42094       if (LABEL_P (insn))
42095 	{
42096 	  int align = label_to_alignment (insn);
42097 	  int max_skip = label_to_max_skip (insn);
42098 
42099 	  if (max_skip > 15)
42100 	    max_skip = 15;
42101 	  /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42102 	     already in the current 16 byte page, because otherwise
42103 	     ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42104 	     bytes to reach 16 byte boundary.  */
42105 	  if (align <= 0
42106 	      || (align <= 3 && max_skip != (1 << align) - 1))
42107 	    max_skip = 0;
42108 	  if (dump_file)
42109 	    fprintf (dump_file, "Label %i with max_skip %i\n",
42110 		     INSN_UID (insn), max_skip);
42111 	  if (max_skip)
42112 	    {
42113 	      while (nbytes + max_skip >= 16)
42114 		{
42115 		  start = NEXT_INSN (start);
42116 		  if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42117 		      || CALL_P (start))
42118 		    njumps--, isjump = true;
42119 		  else
42120 		    isjump = false;
42121 		  nbytes -= ix86_min_insn_size (start);
42122 		}
42123 	    }
42124 	  continue;
42125 	}
42126 
42127       min_size = ix86_min_insn_size (insn);
42128       nbytes += min_size;
42129       if (dump_file)
42130 	fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42131 		 INSN_UID (insn), min_size);
42132       if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42133 	  || CALL_P (insn))
42134 	njumps++;
42135       else
42136 	continue;
42137 
42138       while (njumps > 3)
42139 	{
42140 	  start = NEXT_INSN (start);
42141 	  if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42142 	      || CALL_P (start))
42143 	    njumps--, isjump = true;
42144 	  else
42145 	    isjump = false;
42146 	  nbytes -= ix86_min_insn_size (start);
42147 	}
42148       gcc_assert (njumps >= 0);
42149       if (dump_file)
42150         fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42151 		 INSN_UID (start), INSN_UID (insn), nbytes);
42152 
42153       if (njumps == 3 && isjump && nbytes < 16)
42154 	{
42155 	  int padsize = 15 - nbytes + ix86_min_insn_size (insn);
42156 
42157 	  if (dump_file)
42158 	    fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42159 		     INSN_UID (insn), padsize);
42160           emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42161 	}
42162     }
42163 }
42164 #endif
42165 
42166 /* The AMD Athlon runs faster when a RET instruction is not the destination
42167    of a conditional jump and is not directly preceded by another jump
42168    instruction.  We avoid the penalty by inserting a NOP just before the
42169    RET instruction in such cases.  */
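      /* (The simple_return_internal_long pattern used below presumably expands
         to the prefixed `rep ret' form of the return, which is what sidesteps
         the misprediction.)  */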
42170 static void
42171 ix86_pad_returns (void)
42172 {
42173   edge e;
42174   edge_iterator ei;
42175 
42176   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42177     {
42178       basic_block bb = e->src;
42179       rtx_insn *ret = BB_END (bb);
42180       rtx_insn *prev;
42181       bool replace = false;
42182 
42183       if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42184 	  || optimize_bb_for_size_p (bb))
42185 	continue;
42186       for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42187 	if (active_insn_p (prev) || LABEL_P (prev))
42188 	  break;
42189       if (prev && LABEL_P (prev))
42190 	{
42191 	  edge e;
42192 	  edge_iterator ei;
42193 
42194 	  FOR_EACH_EDGE (e, ei, bb->preds)
42195 	    if (EDGE_FREQUENCY (e) && e->src->index >= 0
42196 		&& !(e->flags & EDGE_FALLTHRU))
42197 	      {
42198 		replace = true;
42199 		break;
42200 	      }
42201 	}
42202       if (!replace)
42203 	{
42204 	  prev = prev_active_insn (ret);
42205 	  if (prev
42206 	      && ((JUMP_P (prev) && any_condjump_p (prev))
42207 		  || CALL_P (prev)))
42208 	    replace = true;
42209 	  /* Empty functions get a branch mispredict even when
42210 	     the jump destination is not visible to us.  */
42211 	  if (!prev && !optimize_function_for_size_p (cfun))
42212 	    replace = true;
42213 	}
42214       if (replace)
42215 	{
42216 	  emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42217 	  delete_insn (ret);
42218 	}
42219     }
42220 }
42221 
42222 /* Count the minimum number of instructions in BB.  Return 4 if the
42223    number of instructions >= 4.  */
42224 
42225 static int
42226 ix86_count_insn_bb (basic_block bb)
42227 {
42228   rtx_insn *insn;
42229   int insn_count = 0;
42230 
42231   /* Count number of instructions in this block.  Return 4 if the number
42232      of instructions >= 4.  */
42233   FOR_BB_INSNS (bb, insn)
42234     {
42235       /* Only happen in exit blocks.  */
42236       /* This only happens in exit blocks.  */
42237 	  && ANY_RETURN_P (PATTERN (insn)))
42238 	break;
42239 
42240       if (NONDEBUG_INSN_P (insn)
42241 	  && GET_CODE (PATTERN (insn)) != USE
42242 	  && GET_CODE (PATTERN (insn)) != CLOBBER)
42243 	{
42244 	  insn_count++;
42245 	  if (insn_count >= 4)
42246 	    return insn_count;
42247 	}
42248     }
42249 
42250   return insn_count;
42251 }
42252 
42253 
42254 /* Count the minimum number of instructions in a code path through BB.
42255    Return 4 if the number of instructions >= 4.  */
42256 
42257 static int
42258 ix86_count_insn (basic_block bb)
42259 {
42260   edge e;
42261   edge_iterator ei;
42262   int min_prev_count;
42263 
42264   /* Only bother counting instructions along paths with no
42265      more than 2 basic blocks between entry and exit.  Given
42266      that BB has an edge to exit, determine if a predecessor
42267      of BB has an edge from entry.  If so, compute the number
42268      of instructions in the predecessor block.  If there
42269      happen to be multiple such blocks, compute the minimum.  */
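        /* E.g. for a path ENTRY -> PRED -> BB -> EXIT, the result is PRED's
           instruction count (capped at 4) plus, if that is still below 4,
           the count for BB itself.  */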
42270   min_prev_count = 4;
42271   FOR_EACH_EDGE (e, ei, bb->preds)
42272     {
42273       edge prev_e;
42274       edge_iterator prev_ei;
42275 
42276       if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42277 	{
42278 	  min_prev_count = 0;
42279 	  break;
42280 	}
42281       FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42282 	{
42283 	  if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42284 	    {
42285 	      int count = ix86_count_insn_bb (e->src);
42286 	      if (count < min_prev_count)
42287 		min_prev_count = count;
42288 	      break;
42289 	    }
42290 	}
42291     }
42292 
42293   if (min_prev_count < 4)
42294     min_prev_count += ix86_count_insn_bb (bb);
42295 
42296   return min_prev_count;
42297 }
42298 
42299 /* Pad short function to 4 instructions.   */
42300 /* Pad a short function to 4 instructions.  */
42301 static void
42302 ix86_pad_short_function (void)
42303 {
42304   edge e;
42305   edge_iterator ei;
42306 
42307   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42308     {
42309       rtx_insn *ret = BB_END (e->src);
42310       if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42311 	{
42312 	  int insn_count = ix86_count_insn (e->src);
42313 
42314 	  /* Pad short function.  */
42315 	  if (insn_count < 4)
42316 	    {
42317 	      rtx_insn *insn = ret;
42318 
42319 	      /* Find epilogue.  */
42320 	      while (insn
42321 		     && (!NOTE_P (insn)
42322 			 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42323 		insn = PREV_INSN (insn);
42324 
42325 	      if (!insn)
42326 		insn = ret;
42327 
42328 	      /* Two NOPs count as one instruction.  */
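      	      /* E.g. a body with only 2 countable instructions gets
      		 2 * (4 - 2) = 4 NOPs, the equivalent of the 2 missing
      		 instructions.  */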
42329 	      insn_count = 2 * (4 - insn_count);
42330 	      emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42331 	    }
42332 	}
42333     }
42334 }
42335 
42336 /* Fix up a Windows system unwinder issue.  If an EH region falls through into
42337    the epilogue, the Windows system unwinder will apply epilogue logic and
42338    produce incorrect offsets.  This can be avoided by adding a nop between
42339    the last insn that can throw and the first insn of the epilogue.  */
42340 
42341 static void
42342 ix86_seh_fixup_eh_fallthru (void)
42343 {
42344   edge e;
42345   edge_iterator ei;
42346 
42347   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42348     {
42349       rtx_insn *insn, *next;
42350 
42351       /* Find the beginning of the epilogue.  */
42352       for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42353 	if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42354 	  break;
42355       if (insn == NULL)
42356 	continue;
42357 
42358       /* We only care about preceding insns that can throw.  */
42359       insn = prev_active_insn (insn);
42360       if (insn == NULL || !can_throw_internal (insn))
42361 	continue;
42362 
42363       /* Do not separate calls from their debug information.  */
42364       for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42365 	if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION)
42366 	  insn = next;
42367 	else
42368 	  break;
42369 
42370       emit_insn_after (gen_nops (const1_rtx), insn);
42371     }
42372 }
42373 
42374 /* Given a register number BASE, the lowest of a group of registers, update
42375    regsets IN and OUT with the registers that should be avoided in input
42376    and output operands respectively when trying to avoid generating a modr/m
42377    byte for -mmitigate-rop.  */
42378 
42379 static void
42380 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42381 {
42382   SET_HARD_REG_BIT (out, base);
42383   SET_HARD_REG_BIT (out, base + 1);
42384   SET_HARD_REG_BIT (in, base + 2);
42385   SET_HARD_REG_BIT (in, base + 3);
42386 }
42387 
42388 /* Called if -mmitigate-rop is in effect.  Try to rewrite instructions so
42389    that certain encodings of modr/m bytes do not occur.  */
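      /* The modr/m values being avoided are presumably those that decode as
         return opcodes (0xc2, 0xc3, 0xca, 0xcb) and so make convenient ROP
         gadgets; with mod == 11 these pair %eax/%ecx in the reg field with
         %edx/%ebx in the r/m field, which is why those registers are marked
         risky below.  */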
42390 static void
42391 ix86_mitigate_rop (void)
42392 {
42393   HARD_REG_SET input_risky;
42394   HARD_REG_SET output_risky;
42395   HARD_REG_SET inout_risky;
42396 
42397   CLEAR_HARD_REG_SET (output_risky);
42398   CLEAR_HARD_REG_SET (input_risky);
42399   SET_HARD_REG_BIT (output_risky, AX_REG);
42400   SET_HARD_REG_BIT (output_risky, CX_REG);
42401   SET_HARD_REG_BIT (input_risky, BX_REG);
42402   SET_HARD_REG_BIT (input_risky, DX_REG);
42403   set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42404   set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42405   set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42406   set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42407   set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42408   set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42409   COPY_HARD_REG_SET (inout_risky, input_risky);
42410   IOR_HARD_REG_SET (inout_risky, output_risky);
42411 
42412   df_note_add_problem ();
42413   /* Fix up what stack-regs did.  */
42414   df_insn_rescan_all ();
42415   df_analyze ();
42416 
42417   regrename_init (true);
42418   regrename_analyze (NULL);
42419 
42420   auto_vec<du_head_p> cands;
42421 
42422   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42423     {
42424       if (!NONDEBUG_INSN_P (insn))
42425 	continue;
42426 
42427       if (GET_CODE (PATTERN (insn)) == USE
42428 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
42429 	continue;
42430 
42431       extract_insn (insn);
42432 
42433       int opno0, opno1;
42434       int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42435 					  recog_data.n_operands, &opno0,
42436 					  &opno1);
42437 
42438       if (!ix86_rop_should_change_byte_p (modrm))
42439 	continue;
42440 
42441       insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42442 
42443       /* This happens when regrename has to fail a block.  */
42444       if (!info->op_info)
42445 	continue;
42446 
42447       if (info->op_info[opno0].n_chains != 0)
42448 	{
42449 	  gcc_assert (info->op_info[opno0].n_chains == 1);
42450 	  du_head_p op0c;
42451 	  op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42452 	  if (op0c->target_data_1 + op0c->target_data_2 == 0
42453 	      && !op0c->cannot_rename)
42454 	    cands.safe_push (op0c);
42455 
42456 	  op0c->target_data_1++;
42457 	}
42458       if (info->op_info[opno1].n_chains != 0)
42459 	{
42460 	  gcc_assert (info->op_info[opno1].n_chains == 1);
42461 	  du_head_p op1c;
42462 	  op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42463 	  if (op1c->target_data_1 + op1c->target_data_2 == 0
42464 	      && !op1c->cannot_rename)
42465 	    cands.safe_push (op1c);
42466 
42467 	  op1c->target_data_2++;
42468 	}
42469     }
42470 
42471   int i;
42472   du_head_p head;
42473   FOR_EACH_VEC_ELT (cands, i, head)
42474     {
42475       int old_reg, best_reg;
42476       HARD_REG_SET unavailable;
42477 
42478       CLEAR_HARD_REG_SET (unavailable);
42479       if (head->target_data_1)
42480 	IOR_HARD_REG_SET (unavailable, output_risky);
42481       if (head->target_data_2)
42482 	IOR_HARD_REG_SET (unavailable, input_risky);
42483 
42484       int n_uses;
42485       reg_class superclass = regrename_find_superclass (head, &n_uses,
42486 							&unavailable);
42487       old_reg = head->regno;
42488       best_reg = find_rename_reg (head, superclass, &unavailable,
42489 				  old_reg, false);
42490       bool ok = regrename_do_replace (head, best_reg);
42491       gcc_assert (ok);
42492       if (dump_file)
42493 	fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42494 		 reg_names[best_reg], reg_class_names[superclass]);
42495 
42496     }
42497 
42498   regrename_finish ();
42499 
42500   df_analyze ();
42501 
42502   basic_block bb;
42503   regset_head live;
42504 
42505   INIT_REG_SET (&live);
42506 
42507   FOR_EACH_BB_FN (bb, cfun)
42508     {
42509       rtx_insn *insn;
42510 
42511       COPY_REG_SET (&live, DF_LR_OUT (bb));
42512       df_simulate_initialize_backwards (bb, &live);
42513 
42514       FOR_BB_INSNS_REVERSE (bb, insn)
42515 	{
42516 	  if (!NONDEBUG_INSN_P (insn))
42517 	    continue;
42518 
42519 	  df_simulate_one_insn_backwards (bb, insn, &live);
42520 
42521 	  if (GET_CODE (PATTERN (insn)) == USE
42522 	      || GET_CODE (PATTERN (insn)) == CLOBBER)
42523 	    continue;
42524 
42525 	  extract_insn (insn);
42526 	  constrain_operands_cached (insn, reload_completed);
42527 	  int opno0, opno1;
42528 	  int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42529 					      recog_data.n_operands, &opno0,
42530 					      &opno1);
42531 	  if (modrm < 0
42532 	      || !ix86_rop_should_change_byte_p (modrm)
42533 	      || opno0 == opno1)
42534 	    continue;
42535 
42536 	  rtx oldreg = recog_data.operand[opno1];
42537 	  preprocess_constraints (insn);
42538 	  const operand_alternative *alt = which_op_alt ();
42539 
42540 	  int i;
42541 	  for (i = 0; i < recog_data.n_operands; i++)
42542 	    if (i != opno1
42543 		&& alt[i].earlyclobber
42544 		&& reg_overlap_mentioned_p (recog_data.operand[i],
42545 					    oldreg))
42546 	      break;
42547 
42548 	  if (i < recog_data.n_operands)
42549 	    continue;
42550 
42551 	  if (dump_file)
42552 	    fprintf (dump_file,
42553 		     "attempting to fix modrm byte in insn %d:"
42554 		     " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42555 		     reg_class_names[alt[opno1].cl]);
42556 
42557 	  HARD_REG_SET unavailable;
42558 	  REG_SET_TO_HARD_REG_SET (unavailable, &live);
42559 	  SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42560 	  IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42561 	  IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42562 	  IOR_HARD_REG_SET (unavailable, output_risky);
42563 	  IOR_COMPL_HARD_REG_SET (unavailable,
42564 				  reg_class_contents[alt[opno1].cl]);
42565 
42566 	  for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42567 	      if (!TEST_HARD_REG_BIT (unavailable, i))
42568 		break;
42569 	  if (i == FIRST_PSEUDO_REGISTER)
42570 	    {
42571 	      if (dump_file)
42572 		fprintf (dump_file, ", none available\n");
42573 	      continue;
42574 	    }
42575 	  if (dump_file)
42576 	    fprintf (dump_file, " -> %d\n", i);
42577 	  rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42578 	  validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42579 	  insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42580 	}
42581     }
42582 }
42583 
42584 /* Implement machine specific optimizations.  We implement padding of returns
42585 /* Implement machine specific optimizations.  We implement padding of returns
42586    for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window.  */
42587 ix86_reorg (void)
42588 {
42589   /* We are freeing block_for_insn in the toplev to keep compatibility
42590      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
42591   compute_bb_for_insn ();
42592 
42593   if (flag_mitigate_rop)
42594     ix86_mitigate_rop ();
42595 
42596   if (TARGET_SEH && current_function_has_exception_handlers ())
42597     ix86_seh_fixup_eh_fallthru ();
42598 
42599   if (optimize && optimize_function_for_speed_p (cfun))
42600     {
42601       if (TARGET_PAD_SHORT_FUNCTION)
42602 	ix86_pad_short_function ();
42603       else if (TARGET_PAD_RETURNS)
42604 	ix86_pad_returns ();
42605 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42606       if (TARGET_FOUR_JUMP_LIMIT)
42607 	ix86_avoid_jump_mispredicts ();
42608 #endif
42609     }
42610 }
42611 
42612 /* Return nonzero when QImode register that must be represented via REX prefix
42613 /* Return nonzero when a QImode register that must be represented via a REX
42614    prefix is used.  */
42615 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42616 {
42617   int i;
42618   extract_insn_cached (insn);
42619   for (i = 0; i < recog_data.n_operands; i++)
42620     if (GENERAL_REG_P (recog_data.operand[i])
42621 	&& !QI_REGNO_P (REGNO (recog_data.operand[i])))
42622        return true;
42623   return false;
42624 }
42625 
42626 /* Return true when INSN mentions register that must be encoded using REX
42627 /* Return true when INSN mentions a register that must be encoded using a
42628    REX prefix.  */
42629 x86_extended_reg_mentioned_p (rtx insn)
42630 {
42631   subrtx_iterator::array_type array;
42632   FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42633     {
42634       const_rtx x = *iter;
42635       if (REG_P (x)
42636 	  && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42637 	return true;
42638     }
42639   return false;
42640 }
42641 
42642 /* If profitable, negate (without causing overflow) integer constant
42643 /* If profitable, negate (without causing overflow) the integer constant
42644    of mode MODE at location LOC.  Return true if it was negated.  */
42645 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42646 {
42647   HOST_WIDE_INT val;
42648 
42649   if (!CONST_INT_P (*loc))
42650     return false;
42651 
42652   switch (mode)
42653     {
42654     case E_DImode:
42655       /* DImode x86_64 constants must fit in 32 bits.  */
42656       gcc_assert (x86_64_immediate_operand (*loc, mode));
42657 
42658       mode = SImode;
42659       break;
42660 
42661     case E_SImode:
42662     case E_HImode:
42663     case E_QImode:
42664       break;
42665 
42666     default:
42667       gcc_unreachable ();
42668     }
42669 
42670   /* Avoid overflows.  */
42671   if (mode_signbit_p (mode, *loc))
42672     return false;
42673 
42674   val = INTVAL (*loc);
42675 
42676   /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
42677   /* Make things pretty: use `subl $4,%eax' rather than `addl $-4,%eax'.
42678      Exception: -128 encodes in fewer bytes than 128, so swap sign and op.  */
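        /* For instance, `addl $128, %eax' needs a 32-bit immediate, whereas the
           equivalent `subl $-128, %eax' fits the immediate in a single
           sign-extended byte, so 128 is negated even though it is positive.  */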
42679       || val == 128)
42680     {
42681       *loc = GEN_INT (-val);
42682       return true;
42683     }
42684 
42685   return false;
42686 }
42687 
42688 /* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
42689    optabs would emit if we didn't have TFmode patterns.  */
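      /* The expansion below converts a non-negative input directly as a signed
         value.  A negative input is logically shifted right by one, its lost
         low bit is IORed back in so the final rounding is unaffected, the
         halved value is converted, and the result is then doubled.  */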
42690 
42691 void
42692 x86_emit_floatuns (rtx operands[2])
42693 {
42694   rtx_code_label *neglab, *donelab;
42695   rtx i0, i1, f0, in, out;
42696   machine_mode mode, inmode;
42697 
42698   inmode = GET_MODE (operands[1]);
42699   gcc_assert (inmode == SImode || inmode == DImode);
42700 
42701   out = operands[0];
42702   in = force_reg (inmode, operands[1]);
42703   mode = GET_MODE (out);
42704   neglab = gen_label_rtx ();
42705   donelab = gen_label_rtx ();
42706   f0 = gen_reg_rtx (mode);
42707 
42708   emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42709 
42710   expand_float (out, in, 0);
42711 
42712   emit_jump_insn (gen_jump (donelab));
42713   emit_barrier ();
42714 
42715   emit_label (neglab);
42716 
42717   i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42718 			    1, OPTAB_DIRECT);
42719   i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42720 			    1, OPTAB_DIRECT);
42721   i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42722 
42723   expand_float (f0, i0, 0);
42724 
42725   emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42726 
42727   emit_label (donelab);
42728 }
42729 
42730 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42731 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42732 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42733 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42734 
42735 /* Get a vector mode of the same size as the original but with elements
42736    twice as wide.  This is only guaranteed to apply to integral vectors.  */
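      /* For example, V8HImode is expected to map to V4SImode.  */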
42737 
42738 static inline machine_mode
42739 get_mode_wider_vector (machine_mode o)
42740 {
42741   /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
42742   machine_mode n = GET_MODE_WIDER_MODE (o).require ();
42743   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42744   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42745   return n;
42746 }
42747 
42748 /* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
42749    fill target with val via vec_duplicate.  */
42750 
42751 static bool
42752 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42753 {
42754   bool ok;
42755   rtx_insn *insn;
42756   rtx dup;
42757 
42758   /* First attempt to recognize VAL as-is.  */
42759   dup = gen_vec_duplicate (mode, val);
42760   insn = emit_insn (gen_rtx_SET (target, dup));
42761   if (recog_memoized (insn) < 0)
42762     {
42763       rtx_insn *seq;
42764       machine_mode innermode = GET_MODE_INNER (mode);
42765       rtx reg;
42766 
42767       /* If that fails, force VAL into a register.  */
42768 
42769       start_sequence ();
42770       reg = force_reg (innermode, val);
42771       if (GET_MODE (reg) != innermode)
42772 	reg = gen_lowpart (innermode, reg);
42773       SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
42774       seq = get_insns ();
42775       end_sequence ();
42776       if (seq)
42777 	emit_insn_before (seq, insn);
42778 
42779       ok = recog_memoized (insn) >= 0;
42780       gcc_assert (ok);
42781     }
42782   return true;
42783 }
42784 
42785 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
42786    with all elements equal to VAL.  Return true if successful.  */
42787 
42788 static bool
42789 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42790 				   rtx target, rtx val)
42791 {
42792   bool ok;
42793 
42794   switch (mode)
42795     {
42796     case E_V2SImode:
42797     case E_V2SFmode:
42798       if (!mmx_ok)
42799 	return false;
42800       /* FALLTHRU */
42801 
42802     case E_V4DFmode:
42803     case E_V4DImode:
42804     case E_V8SFmode:
42805     case E_V8SImode:
42806     case E_V2DFmode:
42807     case E_V2DImode:
42808     case E_V4SFmode:
42809     case E_V4SImode:
42810     case E_V16SImode:
42811     case E_V8DImode:
42812     case E_V16SFmode:
42813     case E_V8DFmode:
42814       return ix86_vector_duplicate_value (mode, target, val);
42815 
42816     case E_V4HImode:
42817       if (!mmx_ok)
42818 	return false;
42819       if (TARGET_SSE || TARGET_3DNOW_A)
42820 	{
42821 	  rtx x;
42822 
42823 	  val = gen_lowpart (SImode, val);
42824 	  x = gen_rtx_TRUNCATE (HImode, val);
42825 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
42826 	  emit_insn (gen_rtx_SET (target, x));
42827 	  return true;
42828 	}
42829       goto widen;
42830 
42831     case E_V8QImode:
42832       if (!mmx_ok)
42833 	return false;
42834       goto widen;
42835 
42836     case E_V8HImode:
42837       if (TARGET_AVX2)
42838 	return ix86_vector_duplicate_value (mode, target, val);
42839 
42840       if (TARGET_SSE2)
42841 	{
42842 	  struct expand_vec_perm_d dperm;
42843 	  rtx tmp1, tmp2;
42844 
42845 	permute:
42846 	  memset (&dperm, 0, sizeof (dperm));
42847 	  dperm.target = target;
42848 	  dperm.vmode = mode;
42849 	  dperm.nelt = GET_MODE_NUNITS (mode);
42850 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42851 	  dperm.one_operand_p = true;
42852 
42853 	  /* Extend to SImode using a paradoxical SUBREG.  */
42854 	  tmp1 = gen_reg_rtx (SImode);
42855 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
42856 
42857 	  /* Insert the SImode value as the low element of a V4SImode vector.  */
42858 	  tmp2 = gen_reg_rtx (V4SImode);
42859 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42860 	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42861 
42862 	  ok = (expand_vec_perm_1 (&dperm)
42863 		|| expand_vec_perm_broadcast_1 (&dperm));
42864 	  gcc_assert (ok);
42865 	  return ok;
42866 	}
42867       goto widen;
42868 
42869     case E_V16QImode:
42870       if (TARGET_AVX2)
42871 	return ix86_vector_duplicate_value (mode, target, val);
42872 
42873       if (TARGET_SSE2)
42874 	goto permute;
42875       goto widen;
42876 
42877     widen:
42878       /* Replicate the value once into the next wider mode and recurse.  */
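            /* For instance, broadcasting a QImode value forms the HImode value
      	 (val << 8) | val and then recurses with half as many elements that
      	 are twice as wide.  */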
42879       {
42880 	machine_mode smode, wsmode, wvmode;
42881 	rtx x;
42882 
42883 	smode = GET_MODE_INNER (mode);
42884 	wvmode = get_mode_wider_vector (mode);
42885 	wsmode = GET_MODE_INNER (wvmode);
42886 
42887 	val = convert_modes (wsmode, smode, val, true);
42888 	x = expand_simple_binop (wsmode, ASHIFT, val,
42889 				 GEN_INT (GET_MODE_BITSIZE (smode)),
42890 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42891 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42892 
42893 	x = gen_reg_rtx (wvmode);
42894 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42895 	gcc_assert (ok);
42896 	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42897 	return ok;
42898       }
42899 
42900     case E_V16HImode:
42901     case E_V32QImode:
42902       if (TARGET_AVX2)
42903 	return ix86_vector_duplicate_value (mode, target, val);
42904       else
42905 	{
42906 	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42907 	  rtx x = gen_reg_rtx (hvmode);
42908 
42909 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42910 	  gcc_assert (ok);
42911 
42912 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
42913 	  emit_insn (gen_rtx_SET (target, x));
42914 	}
42915       return true;
42916 
42917     case E_V64QImode:
42918     case E_V32HImode:
42919       if (TARGET_AVX512BW)
42920 	return ix86_vector_duplicate_value (mode, target, val);
42921       else
42922 	{
42923 	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42924 	  rtx x = gen_reg_rtx (hvmode);
42925 
42926 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42927 	  gcc_assert (ok);
42928 
42929 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
42930 	  emit_insn (gen_rtx_SET (target, x));
42931 	}
42932       return true;
42933 
42934     default:
42935       return false;
42936     }
42937 }
42938 
42939 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
42940    whose ONE_VAR element is VAR, and other elements are zero.  Return true
42941    if successful.  */
42942 
42943 static bool
42944 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42945 				     rtx target, rtx var, int one_var)
42946 {
42947   machine_mode vsimode;
42948   rtx new_target;
42949   rtx x, tmp;
42950   bool use_vector_set = false;
42951   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
42952 
42953   switch (mode)
42954     {
42955     case E_V2DImode:
42956       /* For SSE4.1, we normally use vector set.  But if the second
42957 	 element is zero and inter-unit moves are OK, we use movq
42958 	 instead.  */
42959       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42960 			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
42961 			     && one_var == 0));
42962       break;
42963     case E_V16QImode:
42964     case E_V4SImode:
42965     case E_V4SFmode:
42966       use_vector_set = TARGET_SSE4_1;
42967       break;
42968     case E_V8HImode:
42969       use_vector_set = TARGET_SSE2;
42970       break;
42971     case E_V4HImode:
42972       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42973       break;
42974     case E_V32QImode:
42975     case E_V16HImode:
42976       use_vector_set = TARGET_AVX;
42977       break;
42978     case E_V8SImode:
42979       use_vector_set = TARGET_AVX;
42980       gen_vec_set_0 = gen_vec_setv8si_0;
42981       break;
42982     case E_V8SFmode:
42983       use_vector_set = TARGET_AVX;
42984       gen_vec_set_0 = gen_vec_setv8sf_0;
42985       break;
42986     case E_V4DFmode:
42987       use_vector_set = TARGET_AVX;
42988       gen_vec_set_0 = gen_vec_setv4df_0;
42989       break;
42990     case E_V4DImode:
42991       /* Use ix86_expand_vector_set in 64bit mode only.  */
42992       use_vector_set = TARGET_AVX && TARGET_64BIT;
42993       gen_vec_set_0 = gen_vec_setv4di_0;
42994       break;
42995     case E_V16SImode:
42996       use_vector_set = TARGET_AVX512F && one_var == 0;
42997       gen_vec_set_0 = gen_vec_setv16si_0;
42998       break;
42999     case E_V16SFmode:
43000       use_vector_set = TARGET_AVX512F && one_var == 0;
43001       gen_vec_set_0 = gen_vec_setv16sf_0;
43002       break;
43003     case E_V8DFmode:
43004       use_vector_set = TARGET_AVX512F && one_var == 0;
43005       gen_vec_set_0 = gen_vec_setv8df_0;
43006       break;
43007     case E_V8DImode:
43008       /* Use ix86_expand_vector_set in 64bit mode only.  */
43009       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
43010       gen_vec_set_0 = gen_vec_setv8di_0;
43011       break;
43012     default:
43013       break;
43014     }
43015 
43016   if (use_vector_set)
43017     {
43018       if (gen_vec_set_0 && one_var == 0)
43019 	{
43020 	  var = force_reg (GET_MODE_INNER (mode), var);
43021 	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
43022 	  return true;
43023 	}
43024       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43025       var = force_reg (GET_MODE_INNER (mode), var);
43026       ix86_expand_vector_set (mmx_ok, target, var, one_var);
43027       return true;
43028     }
43029 
43030   switch (mode)
43031     {
43032     case E_V2SFmode:
43033     case E_V2SImode:
43034       if (!mmx_ok)
43035 	return false;
43036       /* FALLTHRU */
43037 
43038     case E_V2DFmode:
43039     case E_V2DImode:
43040       if (one_var != 0)
43041 	return false;
43042       var = force_reg (GET_MODE_INNER (mode), var);
43043       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43044       emit_insn (gen_rtx_SET (target, x));
43045       return true;
43046 
43047     case E_V4SFmode:
43048     case E_V4SImode:
43049       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43050 	new_target = gen_reg_rtx (mode);
43051       else
43052 	new_target = target;
43053       var = force_reg (GET_MODE_INNER (mode), var);
43054       x = gen_rtx_VEC_DUPLICATE (mode, var);
43055       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43056       emit_insn (gen_rtx_SET (new_target, x));
43057       if (one_var != 0)
43058 	{
43059 	  /* We need to shuffle the value to the correct position, so
43060 	     create a new pseudo to store the intermediate result.  */
43061 
43062 	  /* With SSE2, we can use the integer shuffle insns.  */
43063 	  if (mode != V4SFmode && TARGET_SSE2)
43064 	    {
43065 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43066 					    const1_rtx,
43067 					    GEN_INT (one_var == 1 ? 0 : 1),
43068 					    GEN_INT (one_var == 2 ? 0 : 1),
43069 					    GEN_INT (one_var == 3 ? 0 : 1)));
43070 	      if (target != new_target)
43071 		emit_move_insn (target, new_target);
43072 	      return true;
43073 	    }
43074 
43075 	  /* Otherwise convert the intermediate result to V4SFmode and
43076 	     use the SSE1 shuffle instructions.  */
43077 	  if (mode != V4SFmode)
43078 	    {
43079 	      tmp = gen_reg_rtx (V4SFmode);
43080 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43081 	    }
43082 	  else
43083 	    tmp = new_target;
43084 
43085 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43086 				       const1_rtx,
43087 				       GEN_INT (one_var == 1 ? 0 : 1),
43088 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
43089 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43090 
43091 	  if (mode != V4SFmode)
43092 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43093 	  else if (tmp != target)
43094 	    emit_move_insn (target, tmp);
43095 	}
43096       else if (target != new_target)
43097 	emit_move_insn (target, new_target);
43098       return true;
43099 
43100     case E_V8HImode:
43101     case E_V16QImode:
43102       vsimode = V4SImode;
43103       goto widen;
43104     case E_V4HImode:
43105     case E_V8QImode:
43106       if (!mmx_ok)
43107 	return false;
43108       vsimode = V2SImode;
43109       goto widen;
43110     widen:
43111       if (one_var != 0)
43112 	return false;
43113 
43114       /* Zero extend the variable element to SImode and recurse.  */
43115       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43116 
43117       x = gen_reg_rtx (vsimode);
43118       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43119 						var, one_var))
43120 	gcc_unreachable ();
43121 
43122       emit_move_insn (target, gen_lowpart (mode, x));
43123       return true;
43124 
43125     default:
43126       return false;
43127     }
43128 }
43129 
43130 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
43131    consisting of the values in VALS.  It is known that all elements
43132    except ONE_VAR are constants.  Return true if successful.  */
43133 
43134 static bool
43135 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43136 				 rtx target, rtx vals, int one_var)
43137 {
43138   rtx var = XVECEXP (vals, 0, one_var);
43139   machine_mode wmode;
43140   rtx const_vec, x;
43141 
43142   const_vec = copy_rtx (vals);
43143   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43144   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43145 
43146   switch (mode)
43147     {
43148     case E_V2DFmode:
43149     case E_V2DImode:
43150     case E_V2SFmode:
43151     case E_V2SImode:
43152       /* For the two element vectors, it's just as easy to use
43153 	 the general case.  */
43154       return false;
43155 
43156     case E_V4DImode:
43157       /* Use ix86_expand_vector_set in 64bit mode only.  */
43158       if (!TARGET_64BIT)
43159 	return false;
43160       /* FALLTHRU */
43161     case E_V4DFmode:
43162     case E_V8SFmode:
43163     case E_V8SImode:
43164     case E_V16HImode:
43165     case E_V32QImode:
43166     case E_V4SFmode:
43167     case E_V4SImode:
43168     case E_V8HImode:
43169     case E_V4HImode:
43170       break;
43171 
43172     case E_V16QImode:
43173       if (TARGET_SSE4_1)
43174 	break;
43175       wmode = V8HImode;
43176       goto widen;
43177     case E_V8QImode:
43178       wmode = V4HImode;
43179       goto widen;
43180     widen:
43181       /* There's no way to set one QImode entry easily.  Combine
43182 	 the variable value with its adjacent constant value, and
43183 	 promote to an HImode set.  */
43184       x = XVECEXP (vals, 0, one_var ^ 1);
43185       if (one_var & 1)
43186 	{
43187 	  var = convert_modes (HImode, QImode, var, true);
43188 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43189 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
43190 	  x = GEN_INT (INTVAL (x) & 0xff);
43191 	}
43192       else
43193 	{
43194 	  var = convert_modes (HImode, QImode, var, true);
43195 	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
43196 	}
43197       if (x != const0_rtx)
43198 	var = expand_simple_binop (HImode, IOR, var, x, var,
43199 				   1, OPTAB_LIB_WIDEN);
43200 
43201       x = gen_reg_rtx (wmode);
43202       emit_move_insn (x, gen_lowpart (wmode, const_vec));
43203       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43204 
43205       emit_move_insn (target, gen_lowpart (mode, x));
43206       return true;
43207 
43208     default:
43209       return false;
43210     }
43211 
43212   emit_move_insn (target, const_vec);
43213   ix86_expand_vector_set (mmx_ok, target, var, one_var);
43214   return true;
43215 }
43216 
43217 /* A subroutine of ix86_expand_vector_init_general.  Use vector
43218    concatenate to handle the most general case: all values variable,
43219    and none identical.  */
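      /* For example, a V8SImode build from eight scalar operands first forms
         four V2SImode pairs, then two V4SImode halves, and finally
         concatenates those into the V8SImode result.  */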
43220 
43221 static void
43222 ix86_expand_vector_init_concat (machine_mode mode,
43223 				rtx target, rtx *ops, int n)
43224 {
43225   machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43226   rtx first[16], second[8], third[4];
43227   rtvec v;
43228   int i, j;
43229 
43230   switch (n)
43231     {
43232     case 2:
43233       switch (mode)
43234 	{
43235 	case E_V16SImode:
43236 	  cmode = V8SImode;
43237 	  break;
43238 	case E_V16SFmode:
43239 	  cmode = V8SFmode;
43240 	  break;
43241 	case E_V8DImode:
43242 	  cmode = V4DImode;
43243 	  break;
43244 	case E_V8DFmode:
43245 	  cmode = V4DFmode;
43246 	  break;
43247 	case E_V8SImode:
43248 	  cmode = V4SImode;
43249 	  break;
43250 	case E_V8SFmode:
43251 	  cmode = V4SFmode;
43252 	  break;
43253 	case E_V4DImode:
43254 	  cmode = V2DImode;
43255 	  break;
43256 	case E_V4DFmode:
43257 	  cmode = V2DFmode;
43258 	  break;
43259 	case E_V4SImode:
43260 	  cmode = V2SImode;
43261 	  break;
43262 	case E_V4SFmode:
43263 	  cmode = V2SFmode;
43264 	  break;
43265 	case E_V2DImode:
43266 	  cmode = DImode;
43267 	  break;
43268 	case E_V2SImode:
43269 	  cmode = SImode;
43270 	  break;
43271 	case E_V2DFmode:
43272 	  cmode = DFmode;
43273 	  break;
43274 	case E_V2SFmode:
43275 	  cmode = SFmode;
43276 	  break;
43277 	default:
43278 	  gcc_unreachable ();
43279 	}
43280 
43281       if (!register_operand (ops[1], cmode))
43282 	ops[1] = force_reg (cmode, ops[1]);
43283       if (!register_operand (ops[0], cmode))
43284 	ops[0] = force_reg (cmode, ops[0]);
43285       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43286 							  ops[1])));
43287       break;
43288 
43289     case 4:
43290       switch (mode)
43291 	{
43292 	case E_V4DImode:
43293 	  cmode = V2DImode;
43294 	  break;
43295 	case E_V4DFmode:
43296 	  cmode = V2DFmode;
43297 	  break;
43298 	case E_V4SImode:
43299 	  cmode = V2SImode;
43300 	  break;
43301 	case E_V4SFmode:
43302 	  cmode = V2SFmode;
43303 	  break;
43304 	default:
43305 	  gcc_unreachable ();
43306 	}
43307       goto half;
43308 
43309     case 8:
43310       switch (mode)
43311 	{
43312 	case E_V8DImode:
43313 	  cmode = V2DImode;
43314 	  hmode = V4DImode;
43315 	  break;
43316 	case E_V8DFmode:
43317 	  cmode = V2DFmode;
43318 	  hmode = V4DFmode;
43319 	  break;
43320 	case E_V8SImode:
43321 	  cmode = V2SImode;
43322 	  hmode = V4SImode;
43323 	  break;
43324 	case E_V8SFmode:
43325 	  cmode = V2SFmode;
43326 	  hmode = V4SFmode;
43327 	  break;
43328 	default:
43329 	  gcc_unreachable ();
43330 	}
43331       goto half;
43332 
43333     case 16:
43334       switch (mode)
43335 	{
43336 	case E_V16SImode:
43337 	  cmode = V2SImode;
43338 	  hmode = V4SImode;
43339 	  gmode = V8SImode;
43340 	  break;
43341 	case E_V16SFmode:
43342 	  cmode = V2SFmode;
43343 	  hmode = V4SFmode;
43344 	  gmode = V8SFmode;
43345 	  break;
43346 	default:
43347 	  gcc_unreachable ();
43348 	}
43349       goto half;
43350 
43351 half:
43352       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
43353       i = n - 1;
43354       j = (n >> 1) - 1;
43355       for (; i > 0; i -= 2, j--)
43356 	{
43357 	  first[j] = gen_reg_rtx (cmode);
43358 	  v = gen_rtvec (2, ops[i - 1], ops[i]);
43359 	  ix86_expand_vector_init (false, first[j],
43360 				   gen_rtx_PARALLEL (cmode, v));
43361 	}
43362 
43363       n >>= 1;
43364       if (n > 4)
43365 	{
43366 	  gcc_assert (hmode != VOIDmode);
43367 	  gcc_assert (gmode != VOIDmode);
43368 	  for (i = j = 0; i < n; i += 2, j++)
43369 	    {
43370 	      second[j] = gen_reg_rtx (hmode);
43371 	      ix86_expand_vector_init_concat (hmode, second [j],
43372 					      &first [i], 2);
43373 	    }
43374 	  n >>= 1;
43375 	  for (i = j = 0; i < n; i += 2, j++)
43376 	    {
43377 	      third[j] = gen_reg_rtx (gmode);
43378 	      ix86_expand_vector_init_concat (gmode, third[j],
43379 					      &second[i], 2);
43380 	    }
43381 	  n >>= 1;
43382 	  ix86_expand_vector_init_concat (mode, target, third, n);
43383 	}
43384       else if (n > 2)
43385 	{
43386 	  gcc_assert (hmode != VOIDmode);
43387 	  for (i = j = 0; i < n; i += 2, j++)
43388 	    {
43389 	      second[j] = gen_reg_rtx (hmode);
43390 	      ix86_expand_vector_init_concat (hmode, second [j],
43391 					      &first [i], 2);
43392 	    }
43393 	  n >>= 1;
43394 	  ix86_expand_vector_init_concat (mode, target, second, n);
43395 	}
43396       else
43397 	ix86_expand_vector_init_concat (mode, target, first, n);
43398       break;
43399 
43400     default:
43401       gcc_unreachable ();
43402     }
43403 }
43404 
43405 /* A subroutine of ix86_expand_vector_init_general.  Use vector
43406    interleave to handle the most general case: all values variable,
43407    and none identical.  */
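      /* Each pair of scalar elements is first packed into the low part of a
         wider integer vector; the partial vectors are then combined pairwise
         with interleave-low (punpckl-style) operations until one full-width
         vector remains.  */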
43408 
43409 static void
43410 ix86_expand_vector_init_interleave (machine_mode mode,
43411 				    rtx target, rtx *ops, int n)
43412 {
43413   machine_mode first_imode, second_imode, third_imode, inner_mode;
43414   int i, j;
43415   rtx op0, op1;
43416   rtx (*gen_load_even) (rtx, rtx, rtx);
43417   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43418   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43419 
43420   switch (mode)
43421     {
43422     case E_V8HImode:
43423       gen_load_even = gen_vec_setv8hi;
43424       gen_interleave_first_low = gen_vec_interleave_lowv4si;
43425       gen_interleave_second_low = gen_vec_interleave_lowv2di;
43426       inner_mode = HImode;
43427       first_imode = V4SImode;
43428       second_imode = V2DImode;
43429       third_imode = VOIDmode;
43430       break;
43431     case E_V16QImode:
43432       gen_load_even = gen_vec_setv16qi;
43433       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43434       gen_interleave_second_low = gen_vec_interleave_lowv4si;
43435       inner_mode = QImode;
43436       first_imode = V8HImode;
43437       second_imode = V4SImode;
43438       third_imode = V2DImode;
43439       break;
43440     default:
43441       gcc_unreachable ();
43442     }
43443 
43444   for (i = 0; i < n; i++)
43445     {
43446       /* Extend the odd elment to SImode using a paradoxical SUBREG.  */
43447       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
43448       emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43449 
43450       /* Insert the SImode value as the low element of a V4SImode vector.  */
43451       op1 = gen_reg_rtx (V4SImode);
43452       op0 = gen_rtx_VEC_MERGE (V4SImode,
43453 			       gen_rtx_VEC_DUPLICATE (V4SImode,
43454 						      op0),
43455 			       CONST0_RTX (V4SImode),
43456 			       const1_rtx);
43457       emit_insn (gen_rtx_SET (op1, op0));
43458 
43459       /* Cast the V4SImode vector back to a vector in orignal mode.  */
43460       /* Cast the V4SImode vector back to a vector in the original mode.  */
43461       emit_move_insn (op0, gen_lowpart (mode, op1));
43462 
43463       /* Load even elements into the second position.  */
43464       emit_insn (gen_load_even (op0,
43465 				force_reg (inner_mode,
43466 					   ops [i + i + 1]),
43467 				const1_rtx));
43468 
43469       /* Cast vector to FIRST_IMODE vector.  */
43470       ops[i] = gen_reg_rtx (first_imode);
43471       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43472     }
43473 
43474   /* Interleave low FIRST_IMODE vectors.  */
43475   for (i = j = 0; i < n; i += 2, j++)
43476     {
43477       op0 = gen_reg_rtx (first_imode);
43478       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43479 
43480       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
43481       ops[j] = gen_reg_rtx (second_imode);
43482       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43483     }
43484 
43485   /* Interleave low SECOND_IMODE vectors.  */
43486   switch (second_imode)
43487     {
43488     case E_V4SImode:
43489       for (i = j = 0; i < n / 2; i += 2, j++)
43490 	{
43491 	  op0 = gen_reg_rtx (second_imode);
43492 	  emit_insn (gen_interleave_second_low (op0, ops[i],
43493 						ops[i + 1]));
43494 
43495 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43496 	     vector.  */
43497 	  ops[j] = gen_reg_rtx (third_imode);
43498 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43499 	}
43500       second_imode = V2DImode;
43501       gen_interleave_second_low = gen_vec_interleave_lowv2di;
43502       /* FALLTHRU */
43503 
43504     case E_V2DImode:
43505       op0 = gen_reg_rtx (second_imode);
43506       emit_insn (gen_interleave_second_low (op0, ops[0],
43507 					    ops[1]));
43508 
43509       /* Cast the SECOND_IMODE vector back to a vector on original
43510       /* Cast the SECOND_IMODE vector back to a vector in the original
43511 	 mode.  */
43512       break;
43513 
43514     default:
43515       gcc_unreachable ();
43516     }
43517 }
43518 
43519 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
43520    all values variable, and none identical.  */
43521 
43522 static void
43523 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43524 				 rtx target, rtx vals)
43525 {
43526   rtx ops[64], op0, op1, op2, op3, op4, op5;
43527   machine_mode half_mode = VOIDmode;
43528   machine_mode quarter_mode = VOIDmode;
43529   int n, i;
43530 
43531   switch (mode)
43532     {
43533     case E_V2SFmode:
43534     case E_V2SImode:
43535       if (!mmx_ok && !TARGET_SSE)
43536 	break;
43537       /* FALLTHRU */
43538 
43539     case E_V16SImode:
43540     case E_V16SFmode:
43541     case E_V8DFmode:
43542     case E_V8DImode:
43543     case E_V8SFmode:
43544     case E_V8SImode:
43545     case E_V4DFmode:
43546     case E_V4DImode:
43547     case E_V4SFmode:
43548     case E_V4SImode:
43549     case E_V2DFmode:
43550     case E_V2DImode:
43551       n = GET_MODE_NUNITS (mode);
43552       for (i = 0; i < n; i++)
43553 	ops[i] = XVECEXP (vals, 0, i);
43554       ix86_expand_vector_init_concat (mode, target, ops, n);
43555       return;
43556 
43557     case E_V2TImode:
43558       for (i = 0; i < 2; i++)
43559 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43560       op0 = gen_reg_rtx (V4DImode);
43561       ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
43562       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43563       return;
43564 
43565     case E_V4TImode:
43566       for (i = 0; i < 4; i++)
43567 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43568       ops[4] = gen_reg_rtx (V4DImode);
43569       ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
43570       ops[5] = gen_reg_rtx (V4DImode);
43571       ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
43572       op0 = gen_reg_rtx (V8DImode);
43573       ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
43574       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43575       return;
43576 
43577     case E_V32QImode:
43578       half_mode = V16QImode;
43579       goto half;
43580 
43581     case E_V16HImode:
43582       half_mode = V8HImode;
43583       goto half;
43584 
43585 half:
43586       n = GET_MODE_NUNITS (mode);
43587       for (i = 0; i < n; i++)
43588 	ops[i] = XVECEXP (vals, 0, i);
43589       op0 = gen_reg_rtx (half_mode);
43590       op1 = gen_reg_rtx (half_mode);
43591       ix86_expand_vector_init_interleave (half_mode, op0, ops,
43592 					  n >> 2);
43593       ix86_expand_vector_init_interleave (half_mode, op1,
43594 					  &ops [n >> 1], n >> 2);
43595       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43596       return;
43597 
43598     case E_V64QImode:
43599       quarter_mode = V16QImode;
43600       half_mode = V32QImode;
43601       goto quarter;
43602 
43603     case E_V32HImode:
43604       quarter_mode = V8HImode;
43605       half_mode = V16HImode;
43606       goto quarter;
43607 
43608 quarter:
43609       n = GET_MODE_NUNITS (mode);
43610       for (i = 0; i < n; i++)
43611 	ops[i] = XVECEXP (vals, 0, i);
43612       op0 = gen_reg_rtx (quarter_mode);
43613       op1 = gen_reg_rtx (quarter_mode);
43614       op2 = gen_reg_rtx (quarter_mode);
43615       op3 = gen_reg_rtx (quarter_mode);
43616       op4 = gen_reg_rtx (half_mode);
43617       op5 = gen_reg_rtx (half_mode);
43618       ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43619 					  n >> 3);
43620       ix86_expand_vector_init_interleave (quarter_mode, op1,
43621 					  &ops [n >> 2], n >> 3);
43622       ix86_expand_vector_init_interleave (quarter_mode, op2,
43623 					  &ops [n >> 1], n >> 3);
43624       ix86_expand_vector_init_interleave (quarter_mode, op3,
43625 					  &ops [(n >> 1) | (n >> 2)], n >> 3);
43626       emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43627       emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43628       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43629       return;
43630 
43631     case E_V16QImode:
43632       if (!TARGET_SSE4_1)
43633 	break;
43634       /* FALLTHRU */
43635 
43636     case E_V8HImode:
43637       if (!TARGET_SSE2)
43638 	break;
43639 
43640       /* Don't use ix86_expand_vector_init_interleave if we can't
43641 	 move from GPR to SSE register directly.  */
43642       if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43643 	break;
43644 
43645       n = GET_MODE_NUNITS (mode);
43646       for (i = 0; i < n; i++)
43647 	ops[i] = XVECEXP (vals, 0, i);
43648       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43649       return;
43650 
43651     case E_V4HImode:
43652     case E_V8QImode:
43653       break;
43654 
43655     default:
43656       gcc_unreachable ();
43657     }
43658 
43659     {
43660       int i, j, n_elts, n_words, n_elt_per_word;
43661       machine_mode inner_mode;
43662       rtx words[4], shift;
43663 
43664       inner_mode = GET_MODE_INNER (mode);
43665       n_elts = GET_MODE_NUNITS (mode);
43666       n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43667       n_elt_per_word = n_elts / n_words;
43668       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
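            /* E.g. for V8HImode on a 64-bit target this packs four HImode
      	 elements into each of two DImode words with shift/IOR and then
      	 assembles the vector from those words.  */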
43669 
43670       for (i = 0; i < n_words; ++i)
43671 	{
43672 	  rtx word = NULL_RTX;
43673 
43674 	  for (j = 0; j < n_elt_per_word; ++j)
43675 	    {
43676 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43677 	      elt = convert_modes (word_mode, inner_mode, elt, true);
43678 
43679 	      if (j == 0)
43680 		word = elt;
43681 	      else
43682 		{
43683 		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43684 					      word, 1, OPTAB_LIB_WIDEN);
43685 		  word = expand_simple_binop (word_mode, IOR, word, elt,
43686 					      word, 1, OPTAB_LIB_WIDEN);
43687 		}
43688 	    }
43689 
43690 	  words[i] = word;
43691 	}
43692 
43693       if (n_words == 1)
43694 	emit_move_insn (target, gen_lowpart (mode, words[0]));
43695       else if (n_words == 2)
43696 	{
43697 	  rtx tmp = gen_reg_rtx (mode);
43698 	  emit_clobber (tmp);
43699 	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43700 	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43701 	  emit_move_insn (target, tmp);
43702 	}
43703       else if (n_words == 4)
43704 	{
43705 	  rtx tmp = gen_reg_rtx (V4SImode);
43706 	  gcc_assert (word_mode == SImode);
43707 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43708 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43709 	  emit_move_insn (target, gen_lowpart (mode, tmp));
43710 	}
43711       else
43712 	gcc_unreachable ();
43713     }
43714 }
43715 
43716 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
43717    instructions unless MMX_OK is true.  */
43718 
43719 void
43720 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43721 {
43722   machine_mode mode = GET_MODE (target);
43723   machine_mode inner_mode = GET_MODE_INNER (mode);
43724   int n_elts = GET_MODE_NUNITS (mode);
43725   int n_var = 0, one_var = -1;
43726   bool all_same = true, all_const_zero = true;
43727   int i;
43728   rtx x;
43729 
43730   /* First handle initialization from elements that are themselves vectors.  */
43731   if (n_elts != XVECLEN (vals, 0))
43732     {
43733       rtx subtarget = target;
43734       x = XVECEXP (vals, 0, 0);
43735       gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
43736       if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
43737 	{
43738 	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
43739 	  if (inner_mode == QImode || inner_mode == HImode)
43740 	    {
43741 	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
43742 	      mode = mode_for_vector (SImode, n_bits / 4).require ();
43743 	      inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
43744 	      ops[0] = gen_lowpart (inner_mode, ops[0]);
43745 	      ops[1] = gen_lowpart (inner_mode, ops[1]);
43746 	      subtarget = gen_reg_rtx (mode);
43747 	    }
43748 	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
43749 	  if (subtarget != target)
43750 	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
43751 	  return;
43752 	}
43753       gcc_unreachable ();
43754     }
43755 
43756   for (i = 0; i < n_elts; ++i)
43757     {
43758       x = XVECEXP (vals, 0, i);
43759       if (!(CONST_SCALAR_INT_P (x)
43760 	    || CONST_DOUBLE_P (x)
43761 	    || CONST_FIXED_P (x)))
43762 	n_var++, one_var = i;
43763       else if (x != CONST0_RTX (inner_mode))
43764 	all_const_zero = false;
43765       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43766 	all_same = false;
43767     }
43768 
43769   /* Constants are best loaded from the constant pool.  */
43770   if (n_var == 0)
43771     {
43772       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43773       return;
43774     }
43775 
43776   /* If all values are identical, broadcast the value.  */
43777   if (all_same
43778       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43779 					    XVECEXP (vals, 0, 0)))
43780     return;
43781 
43782   /* Values where only one field is non-constant are best loaded from
43783      the pool and overwritten via move later.  */
43784   if (n_var == 1)
43785     {
43786       if (all_const_zero
43787 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43788 						  XVECEXP (vals, 0, one_var),
43789 						  one_var))
43790 	return;
43791 
43792       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43793 	return;
43794     }
43795 
43796   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43797 }
43798 
43799 void
43800 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43801 {
43802   machine_mode mode = GET_MODE (target);
43803   machine_mode inner_mode = GET_MODE_INNER (mode);
43804   machine_mode half_mode;
43805   bool use_vec_merge = false;
43806   rtx tmp;
43807   static rtx (*gen_extract[6][2]) (rtx, rtx)
43808     = {
43809 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43810 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43811 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43812 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43813 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43814 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43815       };
43816   static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43817     = {
43818 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43819 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43820 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43821 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43822 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43823 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43824       };
43825   int i, j, n;
43826   machine_mode mmode = VOIDmode;
43827   rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43828 
43829   switch (mode)
43830     {
43831     case E_V2SFmode:
43832     case E_V2SImode:
43833       if (mmx_ok)
43834 	{
43835 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43836 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43837 	  if (elt == 0)
43838 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43839 	  else
43840 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43841 	  emit_insn (gen_rtx_SET (target, tmp));
43842 	  return;
43843 	}
43844       break;
43845 
43846     case E_V2DImode:
43847       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43848       if (use_vec_merge)
43849 	break;
43850 
43851       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43852       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43853       if (elt == 0)
43854 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43855       else
43856 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43857       emit_insn (gen_rtx_SET (target, tmp));
43858       return;
43859 
43860     case E_V2DFmode:
43861       {
43862 	rtx op0, op1;
43863 
43864 	/* For the two element vectors, we implement a VEC_CONCAT with
43865 	   the extraction of the other element.  */
43866 
43867 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43868 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43869 
43870 	if (elt == 0)
43871 	  op0 = val, op1 = tmp;
43872 	else
43873 	  op0 = tmp, op1 = val;
43874 
43875 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43876 	emit_insn (gen_rtx_SET (target, tmp));
43877       }
43878       return;
43879 
43880     case E_V4SFmode:
43881       use_vec_merge = TARGET_SSE4_1;
43882       if (use_vec_merge)
43883 	break;
43884 
43885       switch (elt)
43886 	{
43887 	case 0:
43888 	  use_vec_merge = true;
43889 	  break;
43890 
43891 	case 1:
43892 	  /* tmp = target = A B C D */
43893 	  tmp = copy_to_reg (target);
43894 	  /* target = A A B B */
43895 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43896 	  /* target = X A B B */
43897 	  ix86_expand_vector_set (false, target, val, 0);
43898 	  /* target = A X C D  */
43899 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43900 					  const1_rtx, const0_rtx,
43901 					  GEN_INT (2+4), GEN_INT (3+4)));
43902 	  return;
43903 
43904 	case 2:
43905 	  /* tmp = target = A B C D */
43906 	  tmp = copy_to_reg (target);
43907 	  /* tmp = X B C D */
43908 	  ix86_expand_vector_set (false, tmp, val, 0);
43909 	  /* target = A B X D */
43910 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43911 					  const0_rtx, const1_rtx,
43912 					  GEN_INT (0+4), GEN_INT (3+4)));
43913 	  return;
43914 
43915 	case 3:
43916 	  /* tmp = target = A B C D */
43917 	  tmp = copy_to_reg (target);
43918 	  /* tmp = X B C D */
43919 	  ix86_expand_vector_set (false, tmp, val, 0);
43920 	  /* target = A B C X */
43921 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43922 					  const0_rtx, const1_rtx,
43923 					  GEN_INT (2+4), GEN_INT (0+4)));
43924 	  return;
43925 
43926 	default:
43927 	  gcc_unreachable ();
43928 	}
43929       break;
43930 
43931     case E_V4SImode:
43932       use_vec_merge = TARGET_SSE4_1;
43933       if (use_vec_merge)
43934 	break;
43935 
43936       /* Element 0 handled by vec_merge below.  */
43937       if (elt == 0)
43938 	{
43939 	  use_vec_merge = true;
43940 	  break;
43941 	}
43942 
43943       if (TARGET_SSE2)
43944 	{
43945 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
43946 	     store into element 0, then shuffle them back.  */
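	  /* For example, with ELT == 2 the order below is {2, 1, 0, 3}:
	     {A,B,C,D} becomes {C,B,A,D}, element 0 is then overwritten
	     with X, and reapplying the same (involutive) shuffle restores
	     the lane order, giving {A,B,X,D}.  */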
43947 
43948 	  rtx order[4];
43949 
43950 	  order[0] = GEN_INT (elt);
43951 	  order[1] = const1_rtx;
43952 	  order[2] = const2_rtx;
43953 	  order[3] = GEN_INT (3);
43954 	  order[elt] = const0_rtx;
43955 
43956 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43957 					order[1], order[2], order[3]));
43958 
43959 	  ix86_expand_vector_set (false, target, val, 0);
43960 
43961 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43962 					order[1], order[2], order[3]));
43963 	}
43964       else
43965 	{
43966 	  /* For SSE1, we have to reuse the V4SF code.  */
43967 	  rtx t = gen_reg_rtx (V4SFmode);
43968 	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
43969 	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43970 	  emit_move_insn (target, gen_lowpart (mode, t));
43971 	}
43972       return;
43973 
43974     case E_V8HImode:
43975       use_vec_merge = TARGET_SSE2;
43976       break;
43977     case E_V4HImode:
43978       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43979       break;
43980 
43981     case E_V16QImode:
43982       use_vec_merge = TARGET_SSE4_1;
43983       break;
43984 
43985     case E_V8QImode:
43986       break;
43987 
43988     case E_V32QImode:
43989       half_mode = V16QImode;
43990       j = 0;
43991       n = 16;
43992       goto half;
43993 
43994     case E_V16HImode:
43995       half_mode = V8HImode;
43996       j = 1;
43997       n = 8;
43998       goto half;
43999 
44000     case E_V8SImode:
44001       half_mode = V4SImode;
44002       j = 2;
44003       n = 4;
44004       goto half;
44005 
44006     case E_V4DImode:
44007       half_mode = V2DImode;
44008       j = 3;
44009       n = 2;
44010       goto half;
44011 
44012     case E_V8SFmode:
44013       half_mode = V4SFmode;
44014       j = 4;
44015       n = 4;
44016       goto half;
44017 
44018     case E_V4DFmode:
44019       half_mode = V2DFmode;
44020       j = 5;
44021       n = 2;
44022       goto half;
44023 
44024 half:
44025       /* Compute offset.  */
44026       i = elt / n;
44027       elt %= n;
44028 
44029       gcc_assert (i <= 1);
44030 
44031       /* Extract the half.  */
44032       tmp = gen_reg_rtx (half_mode);
44033       emit_insn (gen_extract[j][i] (tmp, target));
44034 
44035       /* Put val in tmp at elt.  */
44036       ix86_expand_vector_set (false, tmp, val, elt);
44037 
44038       /* Put it back.  */
44039       emit_insn (gen_insert[j][i] (target, target, tmp));
44040       return;
44041 
44042     case E_V8DFmode:
44043       if (TARGET_AVX512F)
44044 	{
44045 	  mmode = QImode;
44046 	  gen_blendm = gen_avx512f_blendmv8df;
44047 	}
44048       break;
44049 
44050     case E_V8DImode:
44051       if (TARGET_AVX512F)
44052 	{
44053 	  mmode = QImode;
44054 	  gen_blendm = gen_avx512f_blendmv8di;
44055 	}
44056       break;
44057 
44058     case E_V16SFmode:
44059       if (TARGET_AVX512F)
44060 	{
44061 	  mmode = HImode;
44062 	  gen_blendm = gen_avx512f_blendmv16sf;
44063 	}
44064       break;
44065 
44066     case E_V16SImode:
44067       if (TARGET_AVX512F)
44068 	{
44069 	  mmode = HImode;
44070 	  gen_blendm = gen_avx512f_blendmv16si;
44071 	}
44072       break;
44073 
44074     case E_V32HImode:
44075       if (TARGET_AVX512BW)
44076 	{
44077 	  mmode = SImode;
44078 	  gen_blendm = gen_avx512bw_blendmv32hi;
44079 	}
44080       else if (TARGET_AVX512F)
44081 	{
44082 	  half_mode = E_V8HImode;
44083 	  n = 8;
44084 	  goto quarter;
44085 	}
44086       break;
44087 
44088     case E_V64QImode:
44089       if (TARGET_AVX512BW)
44090 	{
44091 	  mmode = DImode;
44092 	  gen_blendm = gen_avx512bw_blendmv64qi;
44093 	}
44094       else if (TARGET_AVX512F)
44095 	{
44096 	  half_mode = E_V16QImode;
44097 	  n = 16;
44098 	  goto quarter;
44099 	}
44100       break;
44101 
44102 quarter:
44103       /* Compute offset.  */
44104       i = elt / n;
44105       elt %= n;
44106 
44107       gcc_assert (i <= 3);
44108 
44109       {
44110 	/* Extract the quarter.  */
44111 	tmp = gen_reg_rtx (V4SImode);
44112 	rtx tmp2 = gen_lowpart (V16SImode, target);
44113 	rtx mask = gen_reg_rtx (QImode);
44114 
44115 	emit_move_insn (mask, constm1_rtx);
44116 	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
44117 						   tmp, mask));
44118 
44119 	tmp2 = gen_reg_rtx (half_mode);
44120 	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
44121 	tmp = tmp2;
44122 
44123 	/* Put val in tmp at elt.  */
44124 	ix86_expand_vector_set (false, tmp, val, elt);
44125 
44126 	/* Put it back.  */
44127 	tmp2 = gen_reg_rtx (V16SImode);
44128 	rtx tmp3 = gen_lowpart (V16SImode, target);
44129 	mask = gen_reg_rtx (HImode);
44130 	emit_move_insn (mask, constm1_rtx);
44131 	tmp = gen_lowpart (V4SImode, tmp);
44132 	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
44133 						  tmp3, mask));
44134 	emit_move_insn (target, gen_lowpart (mode, tmp2));
44135       }
44136       return;
44137 
44138     default:
44139       break;
44140     }
44141 
44142   if (mmode != VOIDmode)
44143     {
44144       tmp = gen_reg_rtx (mode);
44145       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44146       /* The avx512*_blendm<mode> expanders have different operand order
44147 	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
44148 	 elements where the mask is set and the second input operand otherwise;
44149 	 in {sse,avx}*_*blend* the first input operand is used for elements
44150 	 where the mask is clear and second input operand otherwise.  */
44151       emit_insn (gen_blendm (target, target, tmp,
44152 			     force_reg (mmode,
44153 					gen_int_mode (HOST_WIDE_INT_1U << elt,
44154 						      mmode))));
44155     }
44156   else if (use_vec_merge)
44157     {
44158       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44159       tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
44160 			       GEN_INT (HOST_WIDE_INT_1U << elt));
44161       emit_insn (gen_rtx_SET (target, tmp));
44162     }
44163   else
44164     {
44165       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44166 
44167       emit_move_insn (mem, target);
44168 
44169       tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
44170       emit_move_insn (tmp, val);
44171 
44172       emit_move_insn (target, mem);
44173     }
44174 }
44175 
44176 void
44177 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44178 {
44179   machine_mode mode = GET_MODE (vec);
44180   machine_mode inner_mode = GET_MODE_INNER (mode);
44181   bool use_vec_extr = false;
44182   rtx tmp;
44183 
44184   switch (mode)
44185     {
44186     case E_V2SImode:
44187     case E_V2SFmode:
44188       if (!mmx_ok)
44189 	break;
44190       /* FALLTHRU */
44191 
44192     case E_V2DFmode:
44193     case E_V2DImode:
44194     case E_V2TImode:
44195     case E_V4TImode:
44196       use_vec_extr = true;
44197       break;
44198 
44199     case E_V4SFmode:
44200       use_vec_extr = TARGET_SSE4_1;
44201       if (use_vec_extr)
44202 	break;
44203 
44204       switch (elt)
44205 	{
44206 	case 0:
44207 	  tmp = vec;
44208 	  break;
44209 
44210 	case 1:
44211 	case 3:
44212 	  tmp = gen_reg_rtx (mode);
44213 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44214 				       GEN_INT (elt), GEN_INT (elt),
44215 				       GEN_INT (elt+4), GEN_INT (elt+4)));
44216 	  break;
44217 
44218 	case 2:
44219 	  tmp = gen_reg_rtx (mode);
44220 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44221 	  break;
44222 
44223 	default:
44224 	  gcc_unreachable ();
44225 	}
44226       vec = tmp;
44227       use_vec_extr = true;
44228       elt = 0;
44229       break;
44230 
44231     case E_V4SImode:
44232       use_vec_extr = TARGET_SSE4_1;
44233       if (use_vec_extr)
44234 	break;
44235 
44236       if (TARGET_SSE2)
44237 	{
44238 	  switch (elt)
44239 	    {
44240 	    case 0:
44241 	      tmp = vec;
44242 	      break;
44243 
44244 	    case 1:
44245 	    case 3:
44246 	      tmp = gen_reg_rtx (mode);
44247 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44248 					    GEN_INT (elt), GEN_INT (elt),
44249 					    GEN_INT (elt), GEN_INT (elt)));
44250 	      break;
44251 
44252 	    case 2:
44253 	      tmp = gen_reg_rtx (mode);
44254 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44255 	      break;
44256 
44257 	    default:
44258 	      gcc_unreachable ();
44259 	    }
44260 	  vec = tmp;
44261 	  use_vec_extr = true;
44262 	  elt = 0;
44263 	}
44264       else
44265 	{
44266 	  /* For SSE1, we have to reuse the V4SF code.  */
44267 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44268 				      gen_lowpart (V4SFmode, vec), elt);
44269 	  return;
44270 	}
44271       break;
44272 
44273     case E_V8HImode:
44274       use_vec_extr = TARGET_SSE2;
44275       break;
44276     case E_V4HImode:
44277       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44278       break;
44279 
44280     case E_V16QImode:
44281       use_vec_extr = TARGET_SSE4_1;
44282       break;
44283 
44284     case E_V8SFmode:
44285       if (TARGET_AVX)
44286 	{
44287 	  tmp = gen_reg_rtx (V4SFmode);
44288 	  if (elt < 4)
44289 	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44290 	  else
44291 	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44292 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
44293 	  return;
44294 	}
44295       break;
44296 
44297     case E_V4DFmode:
44298       if (TARGET_AVX)
44299 	{
44300 	  tmp = gen_reg_rtx (V2DFmode);
44301 	  if (elt < 2)
44302 	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44303 	  else
44304 	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44305 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
44306 	  return;
44307 	}
44308       break;
44309 
44310     case E_V32QImode:
44311       if (TARGET_AVX)
44312 	{
44313 	  tmp = gen_reg_rtx (V16QImode);
44314 	  if (elt < 16)
44315 	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44316 	  else
44317 	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44318 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
44319 	  return;
44320 	}
44321       break;
44322 
44323     case E_V16HImode:
44324       if (TARGET_AVX)
44325 	{
44326 	  tmp = gen_reg_rtx (V8HImode);
44327 	  if (elt < 8)
44328 	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44329 	  else
44330 	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44331 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
44332 	  return;
44333 	}
44334       break;
44335 
44336     case E_V8SImode:
44337       if (TARGET_AVX)
44338 	{
44339 	  tmp = gen_reg_rtx (V4SImode);
44340 	  if (elt < 4)
44341 	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44342 	  else
44343 	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44344 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
44345 	  return;
44346 	}
44347       break;
44348 
44349     case E_V4DImode:
44350       if (TARGET_AVX)
44351 	{
44352 	  tmp = gen_reg_rtx (V2DImode);
44353 	  if (elt < 2)
44354 	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44355 	  else
44356 	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44357 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
44358 	  return;
44359 	}
44360       break;
44361 
44362     case E_V32HImode:
44363       if (TARGET_AVX512BW)
44364 	{
44365 	  tmp = gen_reg_rtx (V16HImode);
44366 	  if (elt < 16)
44367 	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44368 	  else
44369 	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44370 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
44371 	  return;
44372 	}
44373       break;
44374 
44375     case E_V64QImode:
44376       if (TARGET_AVX512BW)
44377 	{
44378 	  tmp = gen_reg_rtx (V32QImode);
44379 	  if (elt < 32)
44380 	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44381 	  else
44382 	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44383 	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
44384 	  return;
44385 	}
44386       break;
44387 
44388     case E_V16SFmode:
44389       tmp = gen_reg_rtx (V8SFmode);
44390       if (elt < 8)
44391 	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44392       else
44393 	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44394       ix86_expand_vector_extract (false, target, tmp, elt & 7);
44395       return;
44396 
44397     case E_V8DFmode:
44398       tmp = gen_reg_rtx (V4DFmode);
44399       if (elt < 4)
44400 	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44401       else
44402 	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44403       ix86_expand_vector_extract (false, target, tmp, elt & 3);
44404       return;
44405 
44406     case E_V16SImode:
44407       tmp = gen_reg_rtx (V8SImode);
44408       if (elt < 8)
44409 	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44410       else
44411 	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44412       ix86_expand_vector_extract (false, target, tmp, elt & 7);
44413       return;
44414 
44415     case E_V8DImode:
44416       tmp = gen_reg_rtx (V4DImode);
44417       if (elt < 4)
44418 	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44419       else
44420 	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44421       ix86_expand_vector_extract (false, target, tmp, elt & 3);
44422       return;
44423 
44424     case E_V8QImode:
44425       /* ??? Could extract the appropriate HImode element and shift.  */
44426     default:
44427       break;
44428     }
44429 
44430   if (use_vec_extr)
44431     {
44432       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44433       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44434 
44435       /* Let the rtl optimizers know about the zero extension performed.  */
44436       if (inner_mode == QImode || inner_mode == HImode)
44437 	{
44438 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44439 	  target = gen_lowpart (SImode, target);
44440 	}
44441 
44442       emit_insn (gen_rtx_SET (target, tmp));
44443     }
44444   else
44445     {
44446       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44447 
44448       emit_move_insn (mem, vec);
44449 
44450       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44451       emit_move_insn (target, tmp);
44452     }
44453 }
44454 
44455 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44456    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44457    The upper bits of DEST are undefined, though they shouldn't cause
44458    exceptions (some bits from src or all zeros are ok).  */
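/* For example, for V4SFmode and I == 128 the movhlps below copies the high
   two floats into the low half, turning {a,b,c,d} into {c,d,c,d}; for
   I == 64 the shufps path instead broadcasts element 1, giving {b,b,b,b}.  */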
44459 
44460 static void
44461 emit_reduc_half (rtx dest, rtx src, int i)
44462 {
44463   rtx tem, d = dest;
44464   switch (GET_MODE (src))
44465     {
44466     case E_V4SFmode:
44467       if (i == 128)
44468 	tem = gen_sse_movhlps (dest, src, src);
44469       else
44470 	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44471 				   GEN_INT (1 + 4), GEN_INT (1 + 4));
44472       break;
44473     case E_V2DFmode:
44474       tem = gen_vec_interleave_highv2df (dest, src, src);
44475       break;
44476     case E_V16QImode:
44477     case E_V8HImode:
44478     case E_V4SImode:
44479     case E_V2DImode:
44480       d = gen_reg_rtx (V1TImode);
44481       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44482 				GEN_INT (i / 2));
44483       break;
44484     case E_V8SFmode:
44485       if (i == 256)
44486 	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44487       else
44488 	tem = gen_avx_shufps256 (dest, src, src,
44489 				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44490       break;
44491     case E_V4DFmode:
44492       if (i == 256)
44493 	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44494       else
44495 	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44496       break;
44497     case E_V32QImode:
44498     case E_V16HImode:
44499     case E_V8SImode:
44500     case E_V4DImode:
44501       if (i == 256)
44502 	{
44503 	  if (GET_MODE (dest) != V4DImode)
44504 	    d = gen_reg_rtx (V4DImode);
44505 	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44506 				   gen_lowpart (V4DImode, src),
44507 				   const1_rtx);
44508 	}
44509       else
44510 	{
44511 	  d = gen_reg_rtx (V2TImode);
44512 	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44513 				    GEN_INT (i / 2));
44514 	}
44515       break;
44516     case E_V64QImode:
44517     case E_V32HImode:
44518     case E_V16SImode:
44519     case E_V16SFmode:
44520     case E_V8DImode:
44521     case E_V8DFmode:
44522       if (i > 128)
44523 	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44524 				      gen_lowpart (V16SImode, src),
44525 				      gen_lowpart (V16SImode, src),
44526 				      GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44527 				      GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44528 				      GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44529 				      GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44530 				      GEN_INT (0xC), GEN_INT (0xD),
44531 				      GEN_INT (0xE), GEN_INT (0xF),
44532 				      GEN_INT (0x10), GEN_INT (0x11),
44533 				      GEN_INT (0x12), GEN_INT (0x13),
44534 				      GEN_INT (0x14), GEN_INT (0x15),
44535 				      GEN_INT (0x16), GEN_INT (0x17));
44536       else
44537 	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44538 				   gen_lowpart (V16SImode, src),
44539 				   GEN_INT (i == 128 ? 0x2 : 0x1),
44540 				   GEN_INT (0x3),
44541 				   GEN_INT (0x3),
44542 				   GEN_INT (0x3),
44543 				   GEN_INT (i == 128 ? 0x6 : 0x5),
44544 				   GEN_INT (0x7),
44545 				   GEN_INT (0x7),
44546 				   GEN_INT (0x7),
44547 				   GEN_INT (i == 128 ? 0xA : 0x9),
44548 				   GEN_INT (0xB),
44549 				   GEN_INT (0xB),
44550 				   GEN_INT (0xB),
44551 				   GEN_INT (i == 128 ? 0xE : 0xD),
44552 				   GEN_INT (0xF),
44553 				   GEN_INT (0xF),
44554 				   GEN_INT (0xF));
44555       break;
44556     default:
44557       gcc_unreachable ();
44558     }
44559   emit_insn (tem);
44560   if (d != dest)
44561     emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44562 }
44563 
44564 /* Expand a vector reduction.  FN is the binary pattern to reduce;
44565    DEST is the destination; IN is the input vector.  */
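/* For instance, a V4SImode ADD reduction takes two halving steps: IN is
   first combined with its upper half shifted down ({a,b,c,d} + {c,d,0,0}),
   then the partial sums are combined after a further 32-bit shift, leaving
   the full sum in element 0 of DEST; the remaining elements are don't-cares.  */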
44566 
44567 void
44568 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44569 {
44570   rtx half, dst, vec = in;
44571   machine_mode mode = GET_MODE (in);
44572   int i;
44573 
44574   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
44575   if (TARGET_SSE4_1
44576       && mode == V8HImode
44577       && fn == gen_uminv8hi3)
44578     {
44579       emit_insn (gen_sse4_1_phminposuw (dest, in));
44580       return;
44581     }
44582 
44583   for (i = GET_MODE_BITSIZE (mode);
44584        i > GET_MODE_UNIT_BITSIZE (mode);
44585        i >>= 1)
44586     {
44587       half = gen_reg_rtx (mode);
44588       emit_reduc_half (half, vec, i);
44589       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44590 	dst = dest;
44591       else
44592 	dst = gen_reg_rtx (mode);
44593       emit_insn (fn (dst, half, vec));
44594       vec = dst;
44595     }
44596 }
44597 
44598 /* Target hook for scalar_mode_supported_p.  */
44599 static bool
44600 ix86_scalar_mode_supported_p (scalar_mode mode)
44601 {
44602   if (DECIMAL_FLOAT_MODE_P (mode))
44603     return default_decimal_float_supported_p ();
44604   else if (mode == TFmode)
44605     return true;
44606   else
44607     return default_scalar_mode_supported_p (mode);
44608 }
44609 
44610 /* Implements target hook vector_mode_supported_p.  */
44611 static bool
44612 ix86_vector_mode_supported_p (machine_mode mode)
44613 {
44614   if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44615     return true;
44616   if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44617     return true;
44618   if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44619     return true;
44620   if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44621     return true;
44622   if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44623     return true;
44624   if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44625     return true;
44626   return false;
44627 }
44628 
44629 /* Target hook for c_mode_for_suffix.  */
44630 static machine_mode
44631 ix86_c_mode_for_suffix (char suffix)
44632 {
44633   if (suffix == 'q')
44634     return TFmode;
44635   if (suffix == 'w')
44636     return XFmode;
44637 
44638   return VOIDmode;
44639 }
44640 
44641 /* Worker function for TARGET_MD_ASM_ADJUST.
44642 
44643    We implement asm flag outputs, and maintain source compatibility
44644    with the old cc0-based compiler.  */
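/* An illustrative (hypothetical) use of a flag output constraint:

     bool z;
     asm ("test %1, %1" : "=@ccz" (z) : "r" (x));

   sets Z to whether X is zero.  The loop below rewrites the first such
   output into the flags register ("=Bf") and emits a setcc-style
   extraction of the named condition into the user's variable.  */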
44645 
44646 static rtx_insn *
44647 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44648 		    vec<const char *> &constraints,
44649 		    vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44650 {
44651   clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44652   SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44653 
44654   bool saw_asm_flag = false;
44655 
44656   start_sequence ();
44657   for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44658     {
44659       const char *con = constraints[i];
44660       if (strncmp (con, "=@cc", 4) != 0)
44661 	continue;
44662       con += 4;
44663       if (strchr (con, ',') != NULL)
44664 	{
44665 	  error ("alternatives not allowed in asm flag output");
44666 	  continue;
44667 	}
44668 
44669       bool invert = false;
44670       if (con[0] == 'n')
44671 	invert = true, con++;
44672 
44673       machine_mode mode = CCmode;
44674       rtx_code code = UNKNOWN;
44675 
44676       switch (con[0])
44677 	{
44678 	case 'a':
44679 	  if (con[1] == 0)
44680 	    mode = CCAmode, code = EQ;
44681 	  else if (con[1] == 'e' && con[2] == 0)
44682 	    mode = CCCmode, code = NE;
44683 	  break;
44684 	case 'b':
44685 	  if (con[1] == 0)
44686 	    mode = CCCmode, code = EQ;
44687 	  else if (con[1] == 'e' && con[2] == 0)
44688 	    mode = CCAmode, code = NE;
44689 	  break;
44690 	case 'c':
44691 	  if (con[1] == 0)
44692 	    mode = CCCmode, code = EQ;
44693 	  break;
44694 	case 'e':
44695 	  if (con[1] == 0)
44696 	    mode = CCZmode, code = EQ;
44697 	  break;
44698 	case 'g':
44699 	  if (con[1] == 0)
44700 	    mode = CCGCmode, code = GT;
44701 	  else if (con[1] == 'e' && con[2] == 0)
44702 	    mode = CCGCmode, code = GE;
44703 	  break;
44704 	case 'l':
44705 	  if (con[1] == 0)
44706 	    mode = CCGCmode, code = LT;
44707 	  else if (con[1] == 'e' && con[2] == 0)
44708 	    mode = CCGCmode, code = LE;
44709 	  break;
44710 	case 'o':
44711 	  if (con[1] == 0)
44712 	    mode = CCOmode, code = EQ;
44713 	  break;
44714 	case 'p':
44715 	  if (con[1] == 0)
44716 	    mode = CCPmode, code = EQ;
44717 	  break;
44718 	case 's':
44719 	  if (con[1] == 0)
44720 	    mode = CCSmode, code = EQ;
44721 	  break;
44722 	case 'z':
44723 	  if (con[1] == 0)
44724 	    mode = CCZmode, code = EQ;
44725 	  break;
44726 	}
44727       if (code == UNKNOWN)
44728 	{
44729 	  error ("unknown asm flag output %qs", constraints[i]);
44730 	  continue;
44731 	}
44732       if (invert)
44733 	code = reverse_condition (code);
44734 
44735       rtx dest = outputs[i];
44736       if (!saw_asm_flag)
44737 	{
44738 	  /* This is the first asm flag output.  Here we put the flags
44739 	     register in as the real output and adjust the condition to
44740 	     allow it.  */
44741 	  constraints[i] = "=Bf";
44742 	  outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44743 	  saw_asm_flag = true;
44744 	}
44745       else
44746 	{
44747 	  /* We don't need the flags register as output twice.  */
44748 	  constraints[i] = "=X";
44749 	  outputs[i] = gen_rtx_SCRATCH (SImode);
44750 	}
44751 
44752       rtx x = gen_rtx_REG (mode, FLAGS_REG);
44753       x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44754 
44755       machine_mode dest_mode = GET_MODE (dest);
44756       if (!SCALAR_INT_MODE_P (dest_mode))
44757 	{
44758 	  error ("invalid type for asm flag output");
44759 	  continue;
44760 	}
44761 
44762       if (dest_mode == DImode && !TARGET_64BIT)
44763 	dest_mode = SImode;
44764 
44765       if (dest_mode != QImode)
44766 	{
44767 	  rtx destqi = gen_reg_rtx (QImode);
44768 	  emit_insn (gen_rtx_SET (destqi, x));
44769 
44770 	  if (TARGET_ZERO_EXTEND_WITH_AND
44771 	      && optimize_function_for_speed_p (cfun))
44772 	    {
44773 	      x = force_reg (dest_mode, const0_rtx);
44774 
44775 	      emit_insn (gen_movstrictqi
44776 			 (gen_lowpart (QImode, x), destqi));
44777 	    }
44778 	  else
44779 	    x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44780 	}
44781 
44782       if (dest_mode != GET_MODE (dest))
44783 	{
44784 	  rtx tmp = gen_reg_rtx (SImode);
44785 
44786 	  emit_insn (gen_rtx_SET (tmp, x));
44787 	  emit_insn (gen_zero_extendsidi2 (dest, tmp));
44788 	}
44789       else
44790 	emit_insn (gen_rtx_SET (dest, x));
44791     }
44792   rtx_insn *seq = get_insns ();
44793   end_sequence ();
44794 
44795   if (saw_asm_flag)
44796     return seq;
44797   else
44798     {
44799       /* If we had no asm flag outputs, clobber the flags.  */
44800       clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44801       SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44802       return NULL;
44803     }
44804 }
44805 
44806 /* Implements target vector targetm.asm.encode_section_info.  */
44807 
44808 static void ATTRIBUTE_UNUSED
44809 ix86_encode_section_info (tree decl, rtx rtl, int first)
44810 {
44811   default_encode_section_info (decl, rtl, first);
44812 
44813   if (ix86_in_large_data_p (decl))
44814     SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44815 }
44816 
44817 /* Worker function for REVERSE_CONDITION.  */
44818 
44819 enum rtx_code
44820 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44821 {
44822   return (mode == CCFPmode
44823 	  ? reverse_condition_maybe_unordered (code)
44824 	  : reverse_condition (code));
44825 }
44826 
44827 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44828    to OPERANDS[0].  */
44829 
44830 const char *
44831 output_387_reg_move (rtx_insn *insn, rtx *operands)
44832 {
44833   if (REG_P (operands[0]))
44834     {
44835       if (REG_P (operands[1])
44836 	  && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44837 	{
44838 	  if (REGNO (operands[0]) == FIRST_STACK_REG)
44839 	    return output_387_ffreep (operands, 0);
44840 	  return "fstp\t%y0";
44841 	}
44842       if (STACK_TOP_P (operands[0]))
44843 	return "fld%Z1\t%y1";
44844       return "fst\t%y0";
44845     }
44846   else if (MEM_P (operands[0]))
44847     {
44848       gcc_assert (REG_P (operands[1]));
44849       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44850 	return "fstp%Z0\t%y0";
44851       else
44852 	{
44853 	  /* There is no non-popping store to memory for XFmode.
44854 	     So if we need one, follow the store with a load.  */
44855 	  if (GET_MODE (operands[0]) == XFmode)
44856 	    return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44857 	  else
44858 	    return "fst%Z0\t%y0";
44859 	}
44860     }
44861   else
44862     gcc_unreachable ();
44863 }
44864 
44865 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
44866    the FP status register is set.  */
44867 
44868 void
44869 ix86_emit_fp_unordered_jump (rtx label)
44870 {
44871   rtx reg = gen_reg_rtx (HImode);
44872   rtx temp;
44873 
44874   emit_insn (gen_x86_fnstsw_1 (reg));
44875 
44876   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44877     {
44878       emit_insn (gen_x86_sahf_1 (reg));
44879 
44880       temp = gen_rtx_REG (CCmode, FLAGS_REG);
44881       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44882     }
44883   else
44884     {
44885       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44886 
44887       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44888       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44889     }
44890 
44891   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44892 			      gen_rtx_LABEL_REF (VOIDmode, label),
44893 			      pc_rtx);
44894   temp = gen_rtx_SET (pc_rtx, temp);
44895 
44896   emit_jump_insn (temp);
44897   predict_jump (REG_BR_PROB_BASE * 10 / 100);
44898 }
44899 
44900 /* Output code to perform a log1p XFmode calculation.  */
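/* log1p (x) is computed as ln (2) * log2 (1 + x).  fyl2xp1 is only defined
   for |x| below 1 - sqrt (2) / 2 ~= 0.2929 (the constant compared against
   below); larger inputs fall back to computing fyl2x on 1 + x instead.  */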
44901 
44902 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44903 {
44904   rtx_code_label *label1 = gen_label_rtx ();
44905   rtx_code_label *label2 = gen_label_rtx ();
44906 
44907   rtx tmp = gen_reg_rtx (XFmode);
44908   rtx tmp2 = gen_reg_rtx (XFmode);
44909   rtx test;
44910 
44911   emit_insn (gen_absxf2 (tmp, op1));
44912   test = gen_rtx_GE (VOIDmode, tmp,
44913     const_double_from_real_value (
44914        REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44915        XFmode));
44916   emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44917 
44918   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44919   emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44920   emit_jump (label2);
44921 
44922   emit_label (label1);
44923   emit_move_insn (tmp, CONST1_RTX (XFmode));
44924   emit_insn (gen_addxf3 (tmp, op1, tmp));
44925   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44926   emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44927 
44928   emit_label (label2);
44929 }
44930 
44931 /* Emit i387 code for computing round (OP1), storing the result in OP0.  */
44932 void ix86_emit_i387_round (rtx op0, rtx op1)
44933 {
44934   machine_mode inmode = GET_MODE (op1);
44935   machine_mode outmode = GET_MODE (op0);
44936   rtx e1, e2, res, tmp, tmp1, half;
44937   rtx scratch = gen_reg_rtx (HImode);
44938   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44939   rtx_code_label *jump_label = gen_label_rtx ();
44940   rtx insn;
44941   rtx (*gen_abs) (rtx, rtx);
44942   rtx (*gen_neg) (rtx, rtx);
44943 
44944   switch (inmode)
44945     {
44946     case E_SFmode:
44947       gen_abs = gen_abssf2;
44948       break;
44949     case E_DFmode:
44950       gen_abs = gen_absdf2;
44951       break;
44952     case E_XFmode:
44953       gen_abs = gen_absxf2;
44954       break;
44955     default:
44956       gcc_unreachable ();
44957     }
44958 
44959   switch (outmode)
44960     {
44961     case E_SFmode:
44962       gen_neg = gen_negsf2;
44963       break;
44964     case E_DFmode:
44965       gen_neg = gen_negdf2;
44966       break;
44967     case E_XFmode:
44968       gen_neg = gen_negxf2;
44969       break;
44970     case E_HImode:
44971       gen_neg = gen_neghi2;
44972       break;
44973     case E_SImode:
44974       gen_neg = gen_negsi2;
44975       break;
44976     case E_DImode:
44977       gen_neg = gen_negdi2;
44978       break;
44979     default:
44980       gcc_unreachable ();
44981     }
44982 
44983   e1 = gen_reg_rtx (inmode);
44984   e2 = gen_reg_rtx (inmode);
44985   res = gen_reg_rtx (outmode);
44986 
44987   half = const_double_from_real_value (dconsthalf, inmode);
44988 
44989   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
44990 
44991   /* scratch = fxam(op1) */
44992   emit_insn (gen_rtx_SET (scratch,
44993 			  gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44994 					  UNSPEC_FXAM)));
44995   /* e1 = fabs(op1) */
44996   emit_insn (gen_abs (e1, op1));
44997 
44998   /* e2 = e1 + 0.5 */
44999   half = force_reg (inmode, half);
45000   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45001 
45002   /* res = floor(e2) */
45003   if (inmode != XFmode)
45004     {
45005       tmp1 = gen_reg_rtx (XFmode);
45006 
45007       emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45008     }
45009   else
45010     tmp1 = e2;
45011 
45012   switch (outmode)
45013     {
45014     case E_SFmode:
45015     case E_DFmode:
45016       {
45017 	rtx tmp0 = gen_reg_rtx (XFmode);
45018 
45019 	emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45020 
45021 	emit_insn (gen_rtx_SET (res,
45022 				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45023 						UNSPEC_TRUNC_NOOP)));
45024       }
45025       break;
45026     case E_XFmode:
45027       emit_insn (gen_frndintxf2_floor (res, tmp1));
45028       break;
45029     case E_HImode:
45030       emit_insn (gen_lfloorxfhi2 (res, tmp1));
45031       break;
45032     case E_SImode:
45033       emit_insn (gen_lfloorxfsi2 (res, tmp1));
45034       break;
45035     case E_DImode:
45036       emit_insn (gen_lfloorxfdi2 (res, tmp1));
45037       break;
45038     default:
45039       gcc_unreachable ();
45040     }
45041 
45042   /* flags = signbit(a) */
45043   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45044 
45045   /* if (flags) then res = -res */
45046   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45047 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45048 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
45049 			      pc_rtx);
45050   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45051   predict_jump (REG_BR_PROB_BASE * 50 / 100);
45052   JUMP_LABEL (insn) = jump_label;
45053 
45054   emit_insn (gen_neg (res, res));
45055 
45056   emit_label (jump_label);
45057   LABEL_NUSES (jump_label) = 1;
45058 
45059   emit_move_insn (op0, res);
45060 }
45061 
45062 /* Output code to perform a Newton-Raphson approximation of a single precision
45063    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
45064 
45065 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45066 {
45067   rtx x0, x1, e0, e1;
45068 
45069   x0 = gen_reg_rtx (mode);
45070   e0 = gen_reg_rtx (mode);
45071   e1 = gen_reg_rtx (mode);
45072   x1 = gen_reg_rtx (mode);
45073 
45074   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
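  /* This is one Newton-Raphson step for the reciprocal: with x0 ~= 1/b,
     x1 = x0 * (2 - b * x0) = (x0 + x0) - b * x0 * x0, so a / b ~= a * x1.  */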
45075 
45076   b = force_reg (mode, b);
45077 
45078   /* x0 = rcp(b) estimate */
45079   if (mode == V16SFmode || mode == V8DFmode)
45080     {
45081       if (TARGET_AVX512ER)
45082 	{
45083 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45084 						      UNSPEC_RCP28)));
45085 	  /* res = a * x0 */
45086 	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45087 	  return;
45088 	}
45089       else
45090 	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45091 						    UNSPEC_RCP14)));
45092     }
45093   else
45094     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45095 						UNSPEC_RCP)));
45096 
45097   /* e0 = x0 * b */
45098   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45099 
45100   /* e0 = x0 * e0 */
45101   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45102 
45103   /* e1 = x0 + x0 */
45104   emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45105 
45106   /* x1 = e1 - e0 */
45107   emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45108 
45109   /* res = a * x1 */
45110   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
45111 }
45112 
45113 /* Output code to perform a Newton-Raphson approximation of a
45114    single precision floating point [reciprocal] square root.  */
45115 
45116 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45117 {
45118   rtx x0, e0, e1, e2, e3, mthree, mhalf;
45119   REAL_VALUE_TYPE r;
45120   int unspec;
45121 
45122   x0 = gen_reg_rtx (mode);
45123   e0 = gen_reg_rtx (mode);
45124   e1 = gen_reg_rtx (mode);
45125   e2 = gen_reg_rtx (mode);
45126   e3 = gen_reg_rtx (mode);
45127 
45128   if (TARGET_AVX512ER && mode == V16SFmode)
45129     {
45130       if (recip)
45131 	/* res = rsqrt28(a) estimate */
45132 	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45133 						     UNSPEC_RSQRT28)));
45134       else
45135 	{
45136 	  /* x0 = rsqrt28(a) estimate */
45137 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45138 						      UNSPEC_RSQRT28)));
45139 	  /* res = rcp28(x0) estimate */
45140 	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45141 						       UNSPEC_RCP28)));
45142 	}
45143       return;
45144     }
45145 
45146   real_from_integer (&r, VOIDmode, -3, SIGNED);
45147   mthree = const_double_from_real_value (r, SFmode);
45148 
45149   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45150   mhalf = const_double_from_real_value (r, SFmode);
45151   unspec = UNSPEC_RSQRT;
45152 
45153   if (VECTOR_MODE_P (mode))
45154     {
45155       mthree = ix86_build_const_vector (mode, true, mthree);
45156       mhalf = ix86_build_const_vector (mode, true, mhalf);
45157       /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
45158       if (GET_MODE_SIZE (mode) == 64)
45159 	unspec = UNSPEC_RSQRT14;
45160     }
45161 
45162   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45163      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
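  /* Both are one Newton-Raphson step for 1/sqrt(a): with x0 ~= rsqrt(a),
     x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3),
     and sqrt(a) ~= a * x1.  */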
45164 
45165   a = force_reg (mode, a);
45166 
45167   /* x0 = rsqrt(a) estimate */
45168   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45169 					      unspec)));
45170 
45171   /* If (a == 0.0), mask out the infinite rsqrt estimate to avoid NaN for sqrt (0.0).  */
45172   if (!recip)
45173     {
45174       rtx zero = force_reg (mode, CONST0_RTX(mode));
45175       rtx mask;
45176 
45177       /* Handle masked compare.  */
45178       if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45179 	{
45180 	  mask = gen_reg_rtx (HImode);
45181 	  /* Imm value 0x4 corresponds to not-equal comparison.  */
45182 	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45183 	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45184 	}
45185       else
45186 	{
45187 	  mask = gen_reg_rtx (mode);
45188 	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45189 	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45190 	}
45191     }
45192 
45193   /* e0 = x0 * a */
45194   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45195   /* e1 = e0 * x0 */
45196   emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45197 
45198   /* e2 = e1 - 3. */
45199   mthree = force_reg (mode, mthree);
45200   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45201 
45202   mhalf = force_reg (mode, mhalf);
45203   if (recip)
45204     /* e3 = -.5 * x0 */
45205     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45206   else
45207     /* e3 = -.5 * e0 */
45208     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45209   /* ret = e2 * e3 */
45210   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
45211 }
45212 
45213 #ifdef TARGET_SOLARIS
45214 /* Solaris implementation of TARGET_ASM_NAMED_SECTION.  */
45215 
45216 static void
45217 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45218 				tree decl)
45219 {
45220   /* With Binutils 2.15, the "@unwind" marker must be specified on
45221      every occurrence of the ".eh_frame" section, not just the first
45222      one.  */
45223   if (TARGET_64BIT
45224       && strcmp (name, ".eh_frame") == 0)
45225     {
45226       fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45227 	       flags & SECTION_WRITE ? "aw" : "a");
45228       return;
45229     }
45230 
45231 #ifndef USE_GAS
45232   if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45233     {
45234       solaris_elf_asm_comdat_section (name, flags, decl);
45235       return;
45236     }
45237 #endif
45238 
45239   default_elf_asm_named_section (name, flags, decl);
45240 }
45241 #endif /* TARGET_SOLARIS */
45242 
45243 /* Return the mangling of TYPE if it is an extended fundamental type.  */
45244 
45245 static const char *
45246 ix86_mangle_type (const_tree type)
45247 {
45248   type = TYPE_MAIN_VARIANT (type);
45249 
45250   if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45251       && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45252     return NULL;
45253 
45254   switch (TYPE_MODE (type))
45255     {
45256     case E_TFmode:
45257       /* __float128 is "g".  */
45258       return "g";
45259     case E_XFmode:
45260       /* "long double" or __float80 is "e".  */
45261       return "e";
45262     default:
45263       return NULL;
45264     }
45265 }
45266 
45267 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
45268 
45269 static tree
45270 ix86_stack_protect_guard (void)
45271 {
45272   if (TARGET_SSP_TLS_GUARD)
45273     {
45274       tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
45275       int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
45276       tree type = build_qualified_type (type_node, qual);
45277       tree t;
45278 
45279       if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
45280 	{
45281 	  t = ix86_tls_stack_chk_guard_decl;
45282 
45283 	  if (t == NULL)
45284 	    {
45285 	      rtx x;
45286 
45287 	      t = build_decl
45288 		(UNKNOWN_LOCATION, VAR_DECL,
45289 		 get_identifier (ix86_stack_protector_guard_symbol_str),
45290 		 type);
45291 	      TREE_STATIC (t) = 1;
45292 	      TREE_PUBLIC (t) = 1;
45293 	      DECL_EXTERNAL (t) = 1;
45294 	      TREE_USED (t) = 1;
45295 	      TREE_THIS_VOLATILE (t) = 1;
45296 	      DECL_ARTIFICIAL (t) = 1;
45297 	      DECL_IGNORED_P (t) = 1;
45298 
45299 	      /* Do not share RTL as the declaration is visible outside of
45300 		 current function.  */
45301 	      x = DECL_RTL (t);
45302 	      RTX_FLAG (x, used) = 1;
45303 
45304 	      ix86_tls_stack_chk_guard_decl = t;
45305 	    }
45306 	}
45307       else
45308 	{
45309 	  tree asptrtype = build_pointer_type (type);
45310 
45311 	  t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45312 	  t = build2 (MEM_REF, asptrtype, t,
45313 		      build_int_cst (asptrtype, 0));
45314 	}
45315 
45316       return t;
45317     }
45318 
45319   return default_stack_protect_guard ();
45320 }
45321 
45322 /* For 32-bit code we can save PIC register setup by using
45323    __stack_chk_fail_local hidden function instead of calling
45324    __stack_chk_fail directly.  64-bit code doesn't need to setup any PIC
45325    register, so it is better to call __stack_chk_fail directly.  */
45326 
45327 static tree ATTRIBUTE_UNUSED
45328 ix86_stack_protect_fail (void)
45329 {
45330   return TARGET_64BIT
45331 	 ? default_external_stack_protect_fail ()
45332 	 : default_hidden_stack_protect_fail ();
45333 }
45334 
45335 /* Select a format to encode pointers in exception handling data.  CODE
45336    is 0 for data, 1 for code labels, 2 for function pointers.  GLOBAL is
45337    true if the symbol may be affected by dynamic relocations.
45338 
45339    ??? All x86 object file formats are capable of representing this.
45340    After all, the relocation needed is the same as for the call insn.
45341    Whether or not a particular assembler allows us to enter such, I
45342    guess we'll have to see.  */
45343 int
45344 asm_preferred_eh_data_format (int code, int global)
45345 {
45346   if (flag_pic)
45347     {
45348       int type = DW_EH_PE_sdata8;
45349       if (!TARGET_64BIT
45350 	  || ix86_cmodel == CM_SMALL_PIC
45351 	  || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45352 	type = DW_EH_PE_sdata4;
45353       return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45354     }
45355   if (ix86_cmodel == CM_SMALL
45356       || (ix86_cmodel == CM_MEDIUM && code))
45357     return DW_EH_PE_udata4;
45358   return DW_EH_PE_absptr;
45359 }
45360 
45361 /* Expand copysign from SIGN to the positive value ABS_VALUE
45362    storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
45363    the sign-bit.  */
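/* I.e. RESULT = ABS_VALUE | (SIGN & sign-bit mask), where ABS_VALUE is
   assumed to already have its sign bit clear.  */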
45364 static void
45365 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45366 {
45367   machine_mode mode = GET_MODE (sign);
45368   rtx sgn = gen_reg_rtx (mode);
45369   if (mask == NULL_RTX)
45370     {
45371       machine_mode vmode;
45372 
45373       if (mode == SFmode)
45374 	vmode = V4SFmode;
45375       else if (mode == DFmode)
45376 	vmode = V2DFmode;
45377       else
45378 	vmode = mode;
45379 
45380       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45381       if (!VECTOR_MODE_P (mode))
45382 	{
45383 	  /* We need to generate a scalar mode mask in this case.  */
45384 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45385 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45386 	  mask = gen_reg_rtx (mode);
45387 	  emit_insn (gen_rtx_SET (mask, tmp));
45388 	}
45389     }
45390   else
45391     mask = gen_rtx_NOT (mode, mask);
45392   emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45393   emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45394 }
45395 
45396 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
45397    mask for masking out the sign-bit is stored in *SMASK, if that is
45398    non-null.  */
45399 static rtx
45400 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45401 {
45402   machine_mode vmode, mode = GET_MODE (op0);
45403   rtx xa, mask;
45404 
45405   xa = gen_reg_rtx (mode);
45406   if (mode == SFmode)
45407     vmode = V4SFmode;
45408   else if (mode == DFmode)
45409     vmode = V2DFmode;
45410   else
45411     vmode = mode;
45412   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45413   if (!VECTOR_MODE_P (mode))
45414     {
45415       /* We need to generate a scalar mode mask in this case.  */
45416       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45417       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45418       mask = gen_reg_rtx (mode);
45419       emit_insn (gen_rtx_SET (mask, tmp));
45420     }
45421   emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45422 
45423   if (smask)
45424     *smask = mask;
45425 
45426   return xa;
45427 }
45428 
45429 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45430    swapping the operands if SWAP_OPERANDS is true.  The expanded
45431    code is a forward jump to a newly created label in case the
45432    comparison is true.  The generated label rtx is returned.  */
45433 static rtx_code_label *
45434 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45435                                   bool swap_operands)
45436 {
45437   bool unordered_compare = ix86_unordered_fp_compare (code);
45438   rtx_code_label *label;
45439   rtx tmp, reg;
45440 
45441   if (swap_operands)
45442     std::swap (op0, op1);
45443 
45444   label = gen_label_rtx ();
45445   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
45446   if (unordered_compare)
45447     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
45448   reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
45449   emit_insn (gen_rtx_SET (reg, tmp));
45450   tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
45451   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45452 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45453   tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45454   JUMP_LABEL (tmp) = label;
45455 
45456   return label;
45457 }
45458 
45459 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45460    using comparison code CODE.  Operands are swapped for the comparison if
45461    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
45462 static rtx
45463 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45464 			      bool swap_operands)
45465 {
45466   rtx (*insn)(rtx, rtx, rtx, rtx);
45467   machine_mode mode = GET_MODE (op0);
45468   rtx mask = gen_reg_rtx (mode);
45469 
45470   if (swap_operands)
45471     std::swap (op0, op1);
45472 
45473   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45474 
45475   emit_insn (insn (mask, op0, op1,
45476 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
45477   return mask;
45478 }
45479 
45480 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45481    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
45482 static rtx
45483 ix86_gen_TWO52 (machine_mode mode)
45484 {
45485   REAL_VALUE_TYPE TWO52r;
45486   rtx TWO52;
45487 
45488   real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45489   TWO52 = const_double_from_real_value (TWO52r, mode);
45490   TWO52 = force_reg (mode, TWO52);
45491 
45492   return TWO52;
45493 }
45494 
45495 /* Expand SSE sequence for computing lround from OP1 storing
45496    into OP0.  */
45497 void
45498 ix86_expand_lround (rtx op0, rtx op1)
45499 {
45500   /* C code for the stuff we're doing below:
45501        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45502        return (long)tmp;
45503    */
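  /* nextafter (0.5, 0.0) is used rather than 0.5 so that an op1 just below
     0.5, for which op1 + 0.5 would round up to exactly 1.0, still converts
     to 0 instead of 1.  */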
45504   machine_mode mode = GET_MODE (op1);
45505   const struct real_format *fmt;
45506   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45507   rtx adj;
45508 
45509   /* load nextafter (0.5, 0.0) */
45510   fmt = REAL_MODE_FORMAT (mode);
45511   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45512   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45513 
45514   /* adj = copysign (0.5, op1) */
45515   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45516   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45517 
45518   /* adj = op1 + adj */
45519   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45520 
45521   /* op0 = (imode)adj */
45522   expand_fix (op0, adj, 0);
45523 }
45524 
45525 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
45526    into OPERAND0.  */
45527 void
45528 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45529 {
45530   /* C code for the stuff we're doing below (for do_floor):
45531 	xi = (long)op1;
45532         xi -= (double)xi > op1 ? 1 : 0;
45533         return xi;
45534    */
45535   machine_mode fmode = GET_MODE (op1);
45536   machine_mode imode = GET_MODE (op0);
45537   rtx ireg, freg, tmp;
45538   rtx_code_label *label;
45539 
45540   /* reg = (long)op1 */
45541   ireg = gen_reg_rtx (imode);
45542   expand_fix (ireg, op1, 0);
45543 
45544   /* freg = (double)reg */
45545   freg = gen_reg_rtx (fmode);
45546   expand_float (freg, ireg, 0);
45547 
45548   /* ireg = (freg > op1) ? ireg - 1 : ireg */
45549   label = ix86_expand_sse_compare_and_jump (UNLE,
45550 					    freg, op1, !do_floor);
45551   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45552 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45553   emit_move_insn (ireg, tmp);
45554 
45555   emit_label (label);
45556   LABEL_NUSES (label) = 1;
45557 
45558   emit_move_insn (op0, ireg);
45559 }
45560 
45561 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
45562 void
45563 ix86_expand_rint (rtx operand0, rtx operand1)
45564 {
45565   /* C code for the stuff we're doing below:
45566 	xa = fabs (operand1);
45567         if (!isless (xa, 2**52))
45568 	  return operand1;
45569         two52 = 2**52;
45570         if (flag_rounding_math)
45571 	  {
45572 	    two52 = copysign (two52, operand1);
45573 	    xa = operand1;
45574 	  }
45575         xa = xa + two52 - two52;
45576         return copysign (xa, operand1);
45577    */
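  /* Adding and then subtracting 2**52 (2**23 for SFmode) rounds XA to an
     integer in the current rounding mode, since values of that magnitude
     have no fractional bits; the copysign calls keep the sign of the input
     (including -0.0) intact.  */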
45578   machine_mode mode = GET_MODE (operand0);
45579   rtx res, xa, TWO52, two52, mask;
45580   rtx_code_label *label;
45581 
45582   res = gen_reg_rtx (mode);
45583   emit_move_insn (res, operand1);
45584 
45585   /* xa = abs (operand1) */
45586   xa = ix86_expand_sse_fabs (res, &mask);
45587 
45588   /* if (!isless (xa, TWO52)) goto label; */
45589   TWO52 = ix86_gen_TWO52 (mode);
45590   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45591 
45592   two52 = TWO52;
45593   if (flag_rounding_math)
45594     {
45595       two52 = gen_reg_rtx (mode);
45596       ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
45597       xa = res;
45598     }
45599 
45600   xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
45601   xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
45602 
45603   ix86_sse_copysign_to_positive (res, xa, res, mask);
45604 
45605   emit_label (label);
45606   LABEL_NUSES (label) = 1;
45607 
45608   emit_move_insn (operand0, res);
45609 }
45610 
45611 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45612    into OPERAND0.  */
45613 void
45614 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45615 {
45616   /* C code for the stuff we expand below.
45617         double xa = fabs (x), x2;
45618         if (!isless (xa, TWO52))
45619           return x;
45620         xa = xa + TWO52 - TWO52;
45621         x2 = copysign (xa, x);
45622      Compensate.  Floor:
45623         if (x2 > x)
45624           x2 -= 1;
45625      Compensate.  Ceil:
45626         if (x2 < x)
45627           x2 -= -1;
45628         return x2;
45629    */
45630   machine_mode mode = GET_MODE (operand0);
45631   rtx xa, TWO52, tmp, one, res, mask;
45632   rtx_code_label *label;
45633 
45634   TWO52 = ix86_gen_TWO52 (mode);
45635 
45636   /* Temporary for holding the result, initialized to the input
45637      operand to ease control flow.  */
45638   res = gen_reg_rtx (mode);
45639   emit_move_insn (res, operand1);
45640 
45641   /* xa = abs (operand1) */
45642   xa = ix86_expand_sse_fabs (res, &mask);
45643 
45644   /* if (!isless (xa, TWO52)) goto label; */
45645   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45646 
45647   /* xa = xa + TWO52 - TWO52; */
45648   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45649   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45650 
45651   /* xa = copysign (xa, operand1) */
45652   ix86_sse_copysign_to_positive (xa, xa, res, mask);
45653 
45654   /* generate 1.0 or -1.0 */
45655   one = force_reg (mode,
45656 	           const_double_from_real_value (do_floor
45657 						 ? dconst1 : dconstm1, mode));
45658 
45659   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45660   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45661   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45662   /* We always need to subtract here to preserve signed zero.  */
45663   tmp = expand_simple_binop (mode, MINUS,
45664 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45665   emit_move_insn (res, tmp);
45666 
45667   emit_label (label);
45668   LABEL_NUSES (label) = 1;
45669 
45670   emit_move_insn (operand0, res);
45671 }
45672 
45673 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45674    into OPERAND0.  */
45675 void
45676 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45677 {
45678   /* C code for the stuff we expand below.
45679 	double xa = fabs (x), x2;
45680         if (!isless (xa, TWO52))
45681           return x;
45682 	x2 = (double)(long)x;
45683      Compensate.  Floor:
45684 	if (x2 > x)
45685 	  x2 -= 1;
45686      Compensate.  Ceil:
45687 	if (x2 < x)
45688 	  x2 += 1;
45689 	if (HONOR_SIGNED_ZEROS (mode))
45690 	  return copysign (x2, x);
45691 	return x2;
45692    */
45693   machine_mode mode = GET_MODE (operand0);
45694   rtx xa, xi, TWO52, tmp, one, res, mask;
45695   rtx_code_label *label;
45696 
45697   TWO52 = ix86_gen_TWO52 (mode);
45698 
45699   /* Temporary for holding the result, initialized to the input
45700      operand to ease control flow.  */
45701   res = gen_reg_rtx (mode);
45702   emit_move_insn (res, operand1);
45703 
45704   /* xa = abs (operand1) */
45705   xa = ix86_expand_sse_fabs (res, &mask);
45706 
45707   /* if (!isless (xa, TWO52)) goto label; */
45708   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45709 
45710   /* xa = (double)(long)x */
45711   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45712   expand_fix (xi, res, 0);
45713   expand_float (xa, xi, 0);
45714 
45715   /* generate 1.0 */
45716   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45717 
45718   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45719   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45720   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45721   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45722 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45723   emit_move_insn (res, tmp);
45724 
45725   if (HONOR_SIGNED_ZEROS (mode))
45726     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45727 
45728   emit_label (label);
45729   LABEL_NUSES (label) = 1;
45730 
45731   emit_move_insn (operand0, res);
45732 }
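
/* Worked examples for the integer-conversion based variant above
   (a sketch):
	floor (2.7):  (long) 2.7 = 2, x2 = 2.0;  2.0 > 2.7 is false -> 2.0
	floor (-2.7): (long) -2.7 = -2 (truncation toward zero), x2 = -2.0;
		      -2.0 > -2.7 -> subtract 1.0 -> -3.0
	ceil (2.3):   (long) 2.3 = 2, x2 = 2.0;  2.0 < 2.3 -> add 1.0 -> 3.0
   The trailing copysign matters only when signed zeros are honored,
   e.g. so that ceil (-0.3) yields -0.0 rather than +0.0.  */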
45733 
45734 /* Expand SSE sequence for computing round from OPERAND1 storing
45735    into OPERAND0.  Sequence that works without relying on DImode truncation
45736    via cvttsd2siq, which is only available on 64-bit targets.  */
45737 void
45738 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45739 {
45740   /* C code for the stuff we expand below.
45741         double xa = fabs (x), xa2, x2;
45742         if (!isless (xa, TWO52))
45743           return x;
45744      Using the absolute value and copying back sign makes
45745      -0.0 -> -0.0 correct.
45746         xa2 = xa + TWO52 - TWO52;
45747      Compensate.
45748 	dxa = xa2 - xa;
45749         if (dxa <= -0.5)
45750           xa2 += 1;
45751         else if (dxa > 0.5)
45752           xa2 -= 1;
45753         x2 = copysign (xa2, x);
45754         return x2;
45755    */
45756   machine_mode mode = GET_MODE (operand0);
45757   rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45758   rtx_code_label *label;
45759 
45760   TWO52 = ix86_gen_TWO52 (mode);
45761 
45762   /* Temporary for holding the result, initialized to the input
45763      operand to ease control flow.  */
45764   res = gen_reg_rtx (mode);
45765   emit_move_insn (res, operand1);
45766 
45767   /* xa = abs (operand1) */
45768   xa = ix86_expand_sse_fabs (res, &mask);
45769 
45770   /* if (!isless (xa, TWO52)) goto label; */
45771   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45772 
45773   /* xa2 = xa + TWO52 - TWO52; */
45774   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45775   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45776 
45777   /* dxa = xa2 - xa; */
45778   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45779 
45780   /* generate 0.5, 1.0 and -0.5 */
45781   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45782   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45783   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45784 			       0, OPTAB_DIRECT);
45785 
45786   /* Compensate.  */
45787   tmp = gen_reg_rtx (mode);
45788   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45789   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45790   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45791   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45792   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45793   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45794   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45795   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45796 
45797   /* res = copysign (xa2, operand1) */
45798   ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45799 
45800   emit_label (label);
45801   LABEL_NUSES (label) = 1;
45802 
45803   emit_move_insn (operand0, res);
45804 }
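
/* A trace of the dxa compensation above (a sketch, assuming
   round-to-nearest-even for the TWO52 addition):
	round (2.5): xa2 = 2.5 + TWO52 - TWO52 = 2.0 (tie rounded to even),
		     dxa = -0.5;  dxa <= -0.5 -> xa2 += 1.0 -> 3.0
	round (3.5): xa2 = 4.0, dxa = 0.5;  neither test fires -> 4.0
	round (2.6): xa2 = 3.0, dxa = 0.4;  neither test fires -> 3.0
   so halfway cases end up rounded away from zero, as round () requires,
   even though the intermediate TWO52 step rounds ties to even.  */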
45805 
45806 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45807    into OPERAND0.  */
45808 void
45809 ix86_expand_trunc (rtx operand0, rtx operand1)
45810 {
45811   /* C code for SSE variant we expand below.
45812         double xa = fabs (x), x2;
45813         if (!isless (xa, TWO52))
45814           return x;
45815         x2 = (double)(long)x;
45816 	if (HONOR_SIGNED_ZEROS (mode))
45817 	  return copysign (x2, x);
45818 	return x2;
45819    */
45820   machine_mode mode = GET_MODE (operand0);
45821   rtx xa, xi, TWO52, res, mask;
45822   rtx_code_label *label;
45823 
45824   TWO52 = ix86_gen_TWO52 (mode);
45825 
45826   /* Temporary for holding the result, initialized to the input
45827      operand to ease control flow.  */
45828   res = gen_reg_rtx (mode);
45829   emit_move_insn (res, operand1);
45830 
45831   /* xa = abs (operand1) */
45832   xa = ix86_expand_sse_fabs (res, &mask);
45833 
45834   /* if (!isless (xa, TWO52)) goto label; */
45835   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45836 
45837   /* x = (double)(long)x */
45838   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45839   expand_fix (xi, res, 0);
45840   expand_float (res, xi, 0);
45841 
45842   if (HONOR_SIGNED_ZEROS (mode))
45843     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45844 
45845   emit_label (label);
45846   LABEL_NUSES (label) = 1;
45847 
45848   emit_move_insn (operand0, res);
45849 }
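
/* Worked examples for the truncation sequence above (a sketch):
	trunc (2.7):  (long) 2.7 = 2 -> 2.0
	trunc (-2.7): (long) -2.7 = -2 (cvttsd2si truncates toward zero)
		      -> -2.0
	trunc (-0.3): (long) -0.3 = 0 -> 0.0; with HONOR_SIGNED_ZEROS the
		      final copysign turns this into -0.0.  */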
45850 
45851 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45852    into OPERAND0.  */
45853 void
45854 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45855 {
45856   machine_mode mode = GET_MODE (operand0);
45857   rtx xa, mask, TWO52, one, res, smask, tmp;
45858   rtx_code_label *label;
45859 
45860   /* C code for SSE variant we expand below.
45861         double xa = fabs (x), x2;
45862         if (!isless (xa, TWO52))
45863           return x;
45864         xa2 = xa + TWO52 - TWO52;
45865      Compensate:
45866         if (xa2 > xa)
45867           xa2 -= 1.0;
45868         x2 = copysign (xa2, x);
45869         return x2;
45870    */
45871 
45872   TWO52 = ix86_gen_TWO52 (mode);
45873 
45874   /* Temporary for holding the result, initialized to the input
45875      operand to ease control flow.  */
45876   res = gen_reg_rtx (mode);
45877   emit_move_insn (res, operand1);
45878 
45879   /* xa = abs (operand1) */
45880   xa = ix86_expand_sse_fabs (res, &smask);
45881 
45882   /* if (!isless (xa, TWO52)) goto label; */
45883   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45884 
45885   /* res = xa + TWO52 - TWO52; */
45886   tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45887   tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45888   emit_move_insn (res, tmp);
45889 
45890   /* generate 1.0 */
45891   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45892 
45893   /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
45894   mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45895   emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45896   tmp = expand_simple_binop (mode, MINUS,
45897 			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45898   emit_move_insn (res, tmp);
45899 
45900   /* res = copysign (res, operand1) */
45901   ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45902 
45903   emit_label (label);
45904   LABEL_NUSES (label) = 1;
45905 
45906   emit_move_insn (operand0, res);
45907 }
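
/* A trace of the 32-bit-safe variant above (a sketch, assuming
   round-to-nearest):
	trunc (2.7):  xa = 2.7 -> xa + TWO52 - TWO52 = 3.0;
		      3.0 > 2.7 -> subtract 1.0 -> 2.0; copysign -> 2.0
	trunc (-2.7): xa = 2.7 (fabs) -> 2.0 as above; copysign -> -2.0
	trunc (2.3):  xa -> 2.0; 2.0 > 2.3 is false -> 2.0
   The compensation undoes the upward rounding the TWO52 trick may
   introduce, and the final copysign restores the original sign.  */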
45908 
45909 /* Expand SSE sequence for computing round from OPERAND1 storing
45910    into OPERAND0.  */
45911 void
45912 ix86_expand_round (rtx operand0, rtx operand1)
45913 {
45914   /* C code for the stuff we're doing below:
45915         double xa = fabs (x);
45916         if (!isless (xa, TWO52))
45917           return x;
45918         xa = (double)(long)(xa + nextafter (0.5, 0.0));
45919         return copysign (xa, x);
45920    */
45921   machine_mode mode = GET_MODE (operand0);
45922   rtx res, TWO52, xa, xi, half, mask;
45923   rtx_code_label *label;
45924   const struct real_format *fmt;
45925   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45926 
45927   /* Temporary for holding the result, initialized to the input
45928      operand to ease control flow.  */
45929   res = gen_reg_rtx (mode);
45930   emit_move_insn (res, operand1);
45931 
45932   TWO52 = ix86_gen_TWO52 (mode);
45933   xa = ix86_expand_sse_fabs (res, &mask);
45934   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45935 
45936   /* load nextafter (0.5, 0.0) */
45937   fmt = REAL_MODE_FORMAT (mode);
45938   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45939   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45940 
45941   /* xa = xa + 0.5 */
45942   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45943   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45944 
45945   /* xa = (double)(int64_t)xa */
45946   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45947   expand_fix (xi, xa, 0);
45948   expand_float (xa, xi, 0);
45949 
45950   /* res = copysign (xa, operand1) */
45951   ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45952 
45953   emit_label (label);
45954   LABEL_NUSES (label) = 1;
45955 
45956   emit_move_insn (operand0, res);
45957 }
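
/* Why the sequence above loads nextafter (0.5, 0.0) rather than 0.5
   (a sketch for DFmode, where pred_half == 0.5 - 0x1p-54):
	x = 0.49999999999999994		(the largest double below 0.5)
	x + 0.5       rounds up to 1.0, so the conversion gives 1 -- wrong
	x + pred_half == 1.0 - 0x1p-53 exactly, which converts to 0 -- right
   Using the predecessor of 0.5 keeps values just below one half from
   being pushed across the integer boundary by the addition itself.  */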
45958 
45959 /* Expand SSE sequence for computing round
45960    from OP1 storing into OP0 using sse4 round insn.  */
45961 void
45962 ix86_expand_round_sse4 (rtx op0, rtx op1)
45963 {
45964   machine_mode mode = GET_MODE (op0);
45965   rtx e1, e2, res, half;
45966   const struct real_format *fmt;
45967   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45968   rtx (*gen_copysign) (rtx, rtx, rtx);
45969   rtx (*gen_round) (rtx, rtx, rtx);
45970 
45971   switch (mode)
45972     {
45973     case E_SFmode:
45974       gen_copysign = gen_copysignsf3;
45975       gen_round = gen_sse4_1_roundsf2;
45976       break;
45977     case E_DFmode:
45978       gen_copysign = gen_copysigndf3;
45979       gen_round = gen_sse4_1_rounddf2;
45980       break;
45981     default:
45982       gcc_unreachable ();
45983     }
45984 
45985   /* round (a) = trunc (a + copysign (0.5, a)) */
45986 
45987   /* load nextafter (0.5, 0.0) */
45988   fmt = REAL_MODE_FORMAT (mode);
45989   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45990   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45991   half = const_double_from_real_value (pred_half, mode);
45992 
45993   /* e1 = copysign (0.5, op1) */
45994   e1 = gen_reg_rtx (mode);
45995   emit_insn (gen_copysign (e1, half, op1));
45996 
45997   /* e2 = op1 + e1 */
45998   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45999 
46000   /* res = trunc (e2) */
46001   res = gen_reg_rtx (mode);
46002   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46003 
46004   emit_move_insn (op0, res);
46005 }
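
/* The same round-half-away-from-zero identity, implemented with the
   SSE4.1 round instruction in ROUND_TRUNC mode instead of a compare
   and branch over TWO52.  For example (a sketch):
	round (2.5):  e1 = copysign (pred_half, 2.5) ~= 0.5,
		      e2 = 2.5 + e1 -> 3.0, trunc (3.0) = 3.0
	round (-2.5): e1 ~= -0.5, e2 -> -3.0, trunc (-3.0) = -3.0
   No TWO52 guard is needed: for |op1| >= 2^52 the value is already an
   integer, adding pred_half (strictly below half an ulp there) rounds
   back to the same integer, and the truncation returns it unchanged.  */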
46006 
46007 
46008 /* Table of valid machine attributes.  */
46009 static const struct attribute_spec ix86_attribute_table[] =
46010 {
46011   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
46012        affects_type_identity, handler, exclude } */
46013   /* Stdcall attribute says callee is responsible for popping arguments
46014      if they are not variable.  */
46015   { "stdcall",   0, 0, false, true,  true,  true, ix86_handle_cconv_attribute,
46016     NULL },
46017   /* Fastcall attribute says callee is responsible for popping arguments
46018      if they are not variable.  */
46019   { "fastcall",  0, 0, false, true,  true,  true, ix86_handle_cconv_attribute,
46020     NULL },
46021   /* Thiscall attribute says callee is responsible for popping arguments
46022      if they are not variable.  */
46023   { "thiscall",  0, 0, false, true,  true,  true, ix86_handle_cconv_attribute,
46024     NULL },
46025   /* Cdecl attribute says the callee is a normal C declaration */
46026   { "cdecl",     0, 0, false, true,  true,  true, ix86_handle_cconv_attribute,
46027     NULL },
46028   /* Regparm attribute specifies how many integer arguments are to be
46029      passed in registers.  */
46030   { "regparm",   1, 1, false, true,  true,  true, ix86_handle_cconv_attribute,
46031     NULL },
46032   /* Sseregparm attribute says we are using x86_64 calling conventions
46033      for FP arguments.  */
46034   { "sseregparm", 0, 0, false, true, true,  true, ix86_handle_cconv_attribute,
46035     NULL },
46036   /* The transactional memory builtins are implicitly regparm or fastcall
46037      depending on the ABI.  Override the generic do-nothing attribute that
46038      these builtins were declared with.  */
46039   { "*tm regparm", 0, 0, false, true, true, true,
46040     ix86_handle_tm_regparm_attribute, NULL },
46041   /* force_align_arg_pointer says this function realigns the stack at entry.  */
46042   { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46043     false, true,  true, false, ix86_handle_force_align_arg_pointer_attribute,
46044     NULL },
46045 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46046   { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
46047     NULL },
46048   { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
46049     NULL },
46050   { "shared",    0, 0, true,  false, false, false,
46051     ix86_handle_shared_attribute, NULL },
46052 #endif
46053   { "ms_struct", 0, 0, false, false,  false, false,
46054     ix86_handle_struct_attribute, NULL },
46055   { "gcc_struct", 0, 0, false, false,  false, false,
46056     ix86_handle_struct_attribute, NULL },
46057 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46058   SUBTARGET_ATTRIBUTE_TABLE,
46059 #endif
46060   /* ms_abi and sysv_abi calling convention function attributes.  */
46061   { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
46062   { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
46063     NULL },
46064   { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
46065   { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
46066   { "ms_hook_prologue", 0, 0, true, false, false, false,
46067     ix86_handle_fndecl_attribute, NULL },
46068   { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
46069     ix86_handle_callee_pop_aggregate_return, NULL },
46070   { "interrupt", 0, 0, false, true, true, false,
46071     ix86_handle_interrupt_attribute, NULL },
46072   { "no_caller_saved_registers", 0, 0, false, true, true, false,
46073     ix86_handle_no_caller_saved_registers_attribute, NULL },
46074   { "naked", 0, 0, true, false, false, false,
46075     ix86_handle_fndecl_attribute, NULL },
46076   { "indirect_branch", 1, 1, true, false, false, false,
46077     ix86_handle_fndecl_attribute, NULL },
46078   { "function_return", 1, 1, true, false, false, false,
46079     ix86_handle_fndecl_attribute, NULL },
46080 
46081   /* End element.  */
46082   { NULL, 0, 0, false, false, false, false, NULL, NULL }
46083 };
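
/* For illustration, the calling-convention entries above correspond to
   user-level declarations such as (a sketch, not code from this file):

     int  __attribute__ ((regparm (3))) f (int a, int b, int c);
     int  __attribute__ ((fastcall))    g (int a, int b);
     void __attribute__ ((ms_abi))      h (void);

   The second and third fields of each entry give the minimum and maximum
   number of attribute arguments (e.g. regparm requires exactly one),
   and the handlers diagnose uses that do not fit the selected ABI.  */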
46084 
46085 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
46086 static int
46087 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46088                                  tree vectype, int)
46089 {
46090   bool fp = false;
46091   machine_mode mode = TImode;
46092   int index;
46093   if (vectype != NULL)
46094     {
46095       fp = FLOAT_TYPE_P (vectype);
46096       mode = TYPE_MODE (vectype);
46097     }
46098 
46099   switch (type_of_cost)
46100     {
46101       case scalar_stmt:
46102         return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
46103 
46104       case scalar_load:
46105 	/* Load/store costs are relative to a register move, which is 2.  Recompute
46106 	   it to COSTS_N_INSNS so everything has the same base.  */
46107         return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
46108 			      : ix86_cost->int_load [2]) / 2;
46109 
46110       case scalar_store:
46111         return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
46112 			      : ix86_cost->int_store [2]) / 2;
46113 
46114       case vector_stmt:
46115         return ix86_vec_cost (mode,
46116 			      fp ? ix86_cost->addss : ix86_cost->sse_op,
46117 			      true);
46118 
46119       case vector_load:
46120 	index = sse_store_index (mode);
46121 	/* See PR82713 - we may end up being called on a non-vector type.  */
46122 	if (index < 0)
46123 	  index = 2;
46124         return ix86_vec_cost (mode,
46125 			      COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
46126 			      true);
46127 
46128       case vector_store:
46129 	index = sse_store_index (mode);
46130 	/* See PR82713 - we may end up being called on a non-vector type.  */
46131 	if (index < 0)
46132 	  index = 2;
46133         return ix86_vec_cost (mode,
46134 			      COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
46135 			      true);
46136 
46137       case vec_to_scalar:
46138       case scalar_to_vec:
46139         return ix86_vec_cost (mode, ix86_cost->sse_op, true);
46140 
46141       /* We should have separate costs for unaligned loads and gather/scatter.
46142 	 Do that incrementally.  */
46143       case unaligned_load:
46144 	index = sse_store_index (mode);
46145 	/* See PR82713 - we may end up being called on a non-vector type.  */
46146 	if (index < 0)
46147 	  index = 2;
46148         return ix86_vec_cost (mode,
46149 			      COSTS_N_INSNS
46150 				 (ix86_cost->sse_unaligned_load[index]) / 2,
46151 			      true);
46152 
46153       case unaligned_store:
46154 	index = sse_store_index (mode);
46155 	/* See PR82713 - we may end up being called on a non-vector type.  */
46156 	if (index < 0)
46157 	  index = 2;
46158         return ix86_vec_cost (mode,
46159 			      COSTS_N_INSNS
46160 				 (ix86_cost->sse_unaligned_store[index]) / 2,
46161 			      true);
46162 
46163       case vector_gather_load:
46164         return ix86_vec_cost (mode,
46165 			      COSTS_N_INSNS
46166 				 (ix86_cost->gather_static
46167 				  + ix86_cost->gather_per_elt
46168 				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46169 			      true);
46170 
46171       case vector_scatter_store:
46172         return ix86_vec_cost (mode,
46173 			      COSTS_N_INSNS
46174 				 (ix86_cost->scatter_static
46175 				  + ix86_cost->scatter_per_elt
46176 				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46177 			      true);
46178 
46179       case cond_branch_taken:
46180         return ix86_cost->cond_taken_branch_cost;
46181 
46182       case cond_branch_not_taken:
46183         return ix86_cost->cond_not_taken_branch_cost;
46184 
46185       case vec_perm:
46186       case vec_promote_demote:
46187         return ix86_vec_cost (mode,
46188 			      ix86_cost->sse_op, true);
46189 
46190       case vec_construct:
46191 	{
46192 	  /* N element inserts.  */
46193 	  int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false);
46194 	  /* One vinserti128 for combining two SSE vectors for AVX256.  */
46195 	  if (GET_MODE_BITSIZE (mode) == 256)
46196 	    cost += ix86_vec_cost (mode, ix86_cost->addss, true);
46197 	  /* One vinserti64x4 and two vinserti128 for combining SSE
46198 	     and AVX256 vectors to AVX512.  */
46199 	  else if (GET_MODE_BITSIZE (mode) == 512)
46200 	    cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true);
46201 	  return cost;
46202 	}
46203 
46204       default:
46205         gcc_unreachable ();
46206     }
46207 }
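
/* A small example of the normalization above (a sketch, assuming the
   usual COSTS_N_INSNS scale of four cost units per instruction): if a
   cost table gives sse_load[0] == 4, i.e. twice the register-move cost
   of 2, then COSTS_N_INSNS (4) / 2 == COSTS_N_INSNS (2), so the scalar
   load is reported to the vectorizer as costing two instructions rather
   than in register-move-relative units.  */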
46208 
46209 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46210    insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46211    insn every time.  */
46212 
46213 static GTY(()) rtx_insn *vselect_insn;
46214 
46215 /* Initialize vselect_insn.  */
46216 
46217 static void
46218 init_vselect_insn (void)
46219 {
46220   unsigned i;
46221   rtx x;
46222 
46223   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46224   for (i = 0; i < MAX_VECT_LEN; ++i)
46225     XVECEXP (x, 0, i) = const0_rtx;
46226   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46227 							const0_rtx), x);
46228   x = gen_rtx_SET (const0_rtx, x);
46229   start_sequence ();
46230   vselect_insn = emit_insn (x);
46231   end_sequence ();
46232 }
46233 
46234 /* Construct (set target (vec_select op0 (parallel perm))) and
46235    return true if that's a valid instruction in the active ISA.  */
46236 
46237 static bool
46238 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46239 		unsigned nelt, bool testing_p)
46240 {
46241   unsigned int i;
46242   rtx x, save_vconcat;
46243   int icode;
46244 
46245   if (vselect_insn == NULL_RTX)
46246     init_vselect_insn ();
46247 
46248   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46249   PUT_NUM_ELEM (XVEC (x, 0), nelt);
46250   for (i = 0; i < nelt; ++i)
46251     XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46252   save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46253   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46254   PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46255   SET_DEST (PATTERN (vselect_insn)) = target;
46256   icode = recog_memoized (vselect_insn);
46257 
46258   if (icode >= 0 && !testing_p)
46259     emit_insn (copy_rtx (PATTERN (vselect_insn)));
46260 
46261   SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46262   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46263   INSN_CODE (vselect_insn) = -1;
46264 
46265   return icode >= 0;
46266 }
46267 
46268 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
46269 
46270 static bool
46271 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46272 			const unsigned char *perm, unsigned nelt,
46273 			bool testing_p)
46274 {
46275   machine_mode v2mode;
46276   rtx x;
46277   bool ok;
46278 
46279   if (vselect_insn == NULL_RTX)
46280     init_vselect_insn ();
46281 
46282   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
46283     return false;
46284   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46285   PUT_MODE (x, v2mode);
46286   XEXP (x, 0) = op0;
46287   XEXP (x, 1) = op1;
46288   ok = expand_vselect (target, x, perm, nelt, testing_p);
46289   XEXP (x, 0) = const0_rtx;
46290   XEXP (x, 1) = const0_rtx;
46291   return ok;
46292 }
46293 
46294 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
46295    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
46296 
46297 static bool
46298 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46299 {
46300   machine_mode mmode, vmode = d->vmode;
46301   unsigned i, mask, nelt = d->nelt;
46302   rtx target, op0, op1, maskop, x;
46303   rtx rperm[32], vperm;
46304 
46305   if (d->one_operand_p)
46306     return false;
46307   if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46308       && (TARGET_AVX512BW
46309 	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
46310     ;
46311   else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46312     ;
46313   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46314     ;
46315   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46316     ;
46317   else
46318     return false;
46319 
46320   /* This is a blend, not a permute.  Elements must stay in their
46321      respective lanes.  */
46322   for (i = 0; i < nelt; ++i)
46323     {
46324       unsigned e = d->perm[i];
46325       if (!(e == i || e == i + nelt))
46326 	return false;
46327     }
46328 
46329   if (d->testing_p)
46330     return true;
46331 
46332   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
46333      decision should be extracted elsewhere, so that we only try that
46334      sequence once all budget==3 options have been tried.  */
46335   target = d->target;
46336   op0 = d->op0;
46337   op1 = d->op1;
46338   mask = 0;
46339 
46340   switch (vmode)
46341     {
46342     case E_V8DFmode:
46343     case E_V16SFmode:
46344     case E_V4DFmode:
46345     case E_V8SFmode:
46346     case E_V2DFmode:
46347     case E_V4SFmode:
46348     case E_V8HImode:
46349     case E_V8SImode:
46350     case E_V32HImode:
46351     case E_V64QImode:
46352     case E_V16SImode:
46353     case E_V8DImode:
46354       for (i = 0; i < nelt; ++i)
46355 	mask |= (d->perm[i] >= nelt) << i;
46356       break;
46357 
46358     case E_V2DImode:
46359       for (i = 0; i < 2; ++i)
46360 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46361       vmode = V8HImode;
46362       goto do_subreg;
46363 
46364     case E_V4SImode:
46365       for (i = 0; i < 4; ++i)
46366 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46367       vmode = V8HImode;
46368       goto do_subreg;
46369 
46370     case E_V16QImode:
46371       /* See if bytes move in pairs so we can use pblendw with
46372 	 an immediate argument, rather than pblendvb with a vector
46373 	 argument.  */
46374       for (i = 0; i < 16; i += 2)
46375 	if (d->perm[i] + 1 != d->perm[i + 1])
46376 	  {
46377 	  use_pblendvb:
46378 	    for (i = 0; i < nelt; ++i)
46379 	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46380 
46381 	  finish_pblendvb:
46382 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46383 	    vperm = force_reg (vmode, vperm);
46384 
46385 	    if (GET_MODE_SIZE (vmode) == 16)
46386 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46387 	    else
46388 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46389 	    if (target != d->target)
46390 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46391 	    return true;
46392 	  }
46393 
46394       for (i = 0; i < 8; ++i)
46395 	mask |= (d->perm[i * 2] >= 16) << i;
46396       vmode = V8HImode;
46397       /* FALLTHRU */
46398 
46399     do_subreg:
46400       target = gen_reg_rtx (vmode);
46401       op0 = gen_lowpart (vmode, op0);
46402       op1 = gen_lowpart (vmode, op1);
46403       break;
46404 
46405     case E_V32QImode:
46406       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
46407       for (i = 0; i < 32; i += 2)
46408 	if (d->perm[i] + 1 != d->perm[i + 1])
46409 	  goto use_pblendvb;
46410       /* See if bytes move in quadruplets.  If yes, vpblendd
46411 	 with immediate can be used.  */
46412       for (i = 0; i < 32; i += 4)
46413 	if (d->perm[i] + 2 != d->perm[i + 2])
46414 	  break;
46415       if (i < 32)
46416 	{
46417 	  /* See if bytes move the same in both lanes.  If yes,
46418 	     vpblendw with immediate can be used.  */
46419 	  for (i = 0; i < 16; i += 2)
46420 	    if (d->perm[i] + 16 != d->perm[i + 16])
46421 	      goto use_pblendvb;
46422 
46423 	  /* Use vpblendw.  */
46424 	  for (i = 0; i < 16; ++i)
46425 	    mask |= (d->perm[i * 2] >= 32) << i;
46426 	  vmode = V16HImode;
46427 	  goto do_subreg;
46428 	}
46429 
46430       /* Use vpblendd.  */
46431       for (i = 0; i < 8; ++i)
46432 	mask |= (d->perm[i * 4] >= 32) << i;
46433       vmode = V8SImode;
46434       goto do_subreg;
46435 
46436     case E_V16HImode:
46437       /* See if words move in pairs.  If yes, vpblendd can be used.  */
46438       for (i = 0; i < 16; i += 2)
46439 	if (d->perm[i] + 1 != d->perm[i + 1])
46440 	  break;
46441       if (i < 16)
46442 	{
46443 	  /* See if words move the same in both lanes.  If not,
46444 	     vpblendvb must be used.  */
46445 	  for (i = 0; i < 8; i++)
46446 	    if (d->perm[i] + 8 != d->perm[i + 8])
46447 	      {
46448 		/* Use vpblendvb.  */
46449 		for (i = 0; i < 32; ++i)
46450 		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46451 
46452 		vmode = V32QImode;
46453 		nelt = 32;
46454 		target = gen_reg_rtx (vmode);
46455 		op0 = gen_lowpart (vmode, op0);
46456 		op1 = gen_lowpart (vmode, op1);
46457 		goto finish_pblendvb;
46458 	      }
46459 
46460 	  /* Use vpblendw.  */
46461 	  for (i = 0; i < 16; ++i)
46462 	    mask |= (d->perm[i] >= 16) << i;
46463 	  break;
46464 	}
46465 
46466       /* Use vpblendd.  */
46467       for (i = 0; i < 8; ++i)
46468 	mask |= (d->perm[i * 2] >= 16) << i;
46469       vmode = V8SImode;
46470       goto do_subreg;
46471 
46472     case E_V4DImode:
46473       /* Use vpblendd.  */
46474       for (i = 0; i < 4; ++i)
46475 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46476       vmode = V8SImode;
46477       goto do_subreg;
46478 
46479     default:
46480       gcc_unreachable ();
46481     }
46482 
46483   switch (vmode)
46484     {
46485     case E_V8DFmode:
46486     case E_V8DImode:
46487       mmode = QImode;
46488       break;
46489     case E_V16SFmode:
46490     case E_V16SImode:
46491       mmode = HImode;
46492       break;
46493     case E_V32HImode:
46494       mmode = SImode;
46495       break;
46496     case E_V64QImode:
46497       mmode = DImode;
46498       break;
46499     default:
46500       mmode = VOIDmode;
46501     }
46502 
46503   if (mmode != VOIDmode)
46504     maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46505   else
46506     maskop = GEN_INT (mask);
46507 
46508   /* This matches five different patterns with the different modes.  */
46509   x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46510   x = gen_rtx_SET (target, x);
46511   emit_insn (x);
46512   if (target != d->target)
46513     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46514 
46515   return true;
46516 }
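
/* A worked mask computation for the blend expansion above (a sketch):
   a two-operand V4SImode permutation {0, 5, 2, 7} takes elements 1 and 3
   from the second operand.  V4SImode is rewritten as V8HImode, so each
   dword contributes two mask bits:
	mask = (3 << 2) | (3 << 6) = 0xcc
   and the emitted VEC_MERGE matches pblendw with immediate 0xcc.  The
   float modes keep one bit per element instead; a V4SFmode {0, 5, 2, 7}
   would give blendps with immediate 0xa.  */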
46517 
46518 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
46519    in terms of the variable form of vpermilps.
46520 
46521    Note that we will have already failed the immediate input vpermilps,
46522    which requires that the high and low part shuffle be identical; the
46523    variable form doesn't require that.  */
46524 
46525 static bool
46526 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46527 {
46528   rtx rperm[8], vperm;
46529   unsigned i;
46530 
46531   if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46532     return false;
46533 
46534   /* We can only permute within the 128-bit lane.  */
46535   for (i = 0; i < 8; ++i)
46536     {
46537       unsigned e = d->perm[i];
46538       if (i < 4 ? e >= 4 : e < 4)
46539 	return false;
46540     }
46541 
46542   if (d->testing_p)
46543     return true;
46544 
46545   for (i = 0; i < 8; ++i)
46546     {
46547       unsigned e = d->perm[i];
46548 
46549       /* Within each 128-bit lane, the elements of op0 are numbered
46550 	 from 0 and the elements of op1 are numbered from 4.  */
46551       if (e >= 8 + 4)
46552 	e -= 8;
46553       else if (e >= 4)
46554 	e -= 4;
46555 
46556       rperm[i] = GEN_INT (e);
46557     }
46558 
46559   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46560   vperm = force_reg (V8SImode, vperm);
46561   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46562 
46563   return true;
46564 }
46565 
46566 /* Return true if permutation D can be performed as VMODE permutation
46567    instead.  */
46568 
46569 static bool
46570 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46571 {
46572   unsigned int i, j, chunk;
46573 
46574   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46575       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46576       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46577     return false;
46578 
46579   if (GET_MODE_NUNITS (vmode) >= d->nelt)
46580     return true;
46581 
46582   chunk = d->nelt / GET_MODE_NUNITS (vmode);
46583   for (i = 0; i < d->nelt; i += chunk)
46584     if (d->perm[i] & (chunk - 1))
46585       return false;
46586     else
46587       for (j = 1; j < chunk; ++j)
46588 	if (d->perm[i] + j != d->perm[i + j])
46589 	  return false;
46590 
46591   return true;
46592 }
46593 
46594 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
46595    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
46596 
46597 static bool
46598 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46599 {
46600   unsigned i, nelt, eltsz, mask;
46601   unsigned char perm[64];
46602   machine_mode vmode = V16QImode;
46603   rtx rperm[64], vperm, target, op0, op1;
46604 
46605   nelt = d->nelt;
46606 
46607   if (!d->one_operand_p)
46608     {
46609       if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46610 	{
46611 	  if (TARGET_AVX2
46612 	      && valid_perm_using_mode_p (V2TImode, d))
46613 	    {
46614 	      if (d->testing_p)
46615 		return true;
46616 
46617 	      /* Use vperm2i128 insn.  The pattern uses
46618 		 V4DImode instead of V2TImode.  */
46619 	      target = d->target;
46620 	      if (d->vmode != V4DImode)
46621 		target = gen_reg_rtx (V4DImode);
46622 	      op0 = gen_lowpart (V4DImode, d->op0);
46623 	      op1 = gen_lowpart (V4DImode, d->op1);
46624 	      rperm[0]
46625 		= GEN_INT ((d->perm[0] / (nelt / 2))
46626 			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46627 	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46628 	      if (target != d->target)
46629 		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46630 	      return true;
46631 	    }
46632 	  return false;
46633 	}
46634     }
46635   else
46636     {
46637       if (GET_MODE_SIZE (d->vmode) == 16)
46638 	{
46639 	  if (!TARGET_SSSE3)
46640 	    return false;
46641 	}
46642       else if (GET_MODE_SIZE (d->vmode) == 32)
46643 	{
46644 	  if (!TARGET_AVX2)
46645 	    return false;
46646 
46647 	  /* V4DImode should be already handled through
46648 	     expand_vselect by vpermq instruction.  */
46649 	  gcc_assert (d->vmode != V4DImode);
46650 
46651 	  vmode = V32QImode;
46652 	  if (d->vmode == V8SImode
46653 	      || d->vmode == V16HImode
46654 	      || d->vmode == V32QImode)
46655 	    {
46656 	      /* First see if vpermq can be used for
46657 		 V8SImode/V16HImode/V32QImode.  */
46658 	      if (valid_perm_using_mode_p (V4DImode, d))
46659 		{
46660 		  for (i = 0; i < 4; i++)
46661 		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46662 		  if (d->testing_p)
46663 		    return true;
46664 		  target = gen_reg_rtx (V4DImode);
46665 		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46666 				      perm, 4, false))
46667 		    {
46668 		      emit_move_insn (d->target,
46669 				      gen_lowpart (d->vmode, target));
46670 		      return true;
46671 		    }
46672 		  return false;
46673 		}
46674 
46675 	      /* Next see if vpermd can be used.  */
46676 	      if (valid_perm_using_mode_p (V8SImode, d))
46677 		vmode = V8SImode;
46678 	    }
46679 	  /* Or if vpermps can be used.  */
46680 	  else if (d->vmode == V8SFmode)
46681 	    vmode = V8SImode;
46682 
46683 	  if (vmode == V32QImode)
46684 	    {
46685 	      /* vpshufb only works within 128-bit lanes; it is not
46686 		 possible to shuffle bytes between the lanes.  */
46687 	      for (i = 0; i < nelt; ++i)
46688 		if ((d->perm[i] ^ i) & (nelt / 2))
46689 		  return false;
46690 	    }
46691 	}
46692       else if (GET_MODE_SIZE (d->vmode) == 64)
46693 	{
46694 	  if (!TARGET_AVX512BW)
46695 	    return false;
46696 
46697 	  /* If vpermq didn't work, vpshufb won't work either.  */
46698 	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
46699 	    return false;
46700 
46701 	  vmode = V64QImode;
46702 	  if (d->vmode == V16SImode
46703 	      || d->vmode == V32HImode
46704 	      || d->vmode == V64QImode)
46705 	    {
46706 	      /* First see if vpermq can be used for
46707 		 V16SImode/V32HImode/V64QImode.  */
46708 	      if (valid_perm_using_mode_p (V8DImode, d))
46709 		{
46710 		  for (i = 0; i < 8; i++)
46711 		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46712 		  if (d->testing_p)
46713 		    return true;
46714 		  target = gen_reg_rtx (V8DImode);
46715 		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46716 				      perm, 8, false))
46717 		    {
46718 		      emit_move_insn (d->target,
46719 				      gen_lowpart (d->vmode, target));
46720 		      return true;
46721 		    }
46722 		  return false;
46723 		}
46724 
46725 	      /* Next see if vpermd can be used.  */
46726 	      if (valid_perm_using_mode_p (V16SImode, d))
46727 		vmode = V16SImode;
46728 	    }
46729 	  /* Or if vpermps can be used.  */
46730 	  else if (d->vmode == V16SFmode)
46731 	    vmode = V16SImode;
46732 	  if (vmode == V64QImode)
46733 	    {
46734 	      /* vpshufb only works within 128-bit lanes; it is not
46735 		 possible to shuffle bytes between the lanes.  */
46736 	      for (i = 0; i < nelt; ++i)
46737 		if ((d->perm[i] ^ i) & (nelt / 4))
46738 		  return false;
46739 	    }
46740 	}
46741       else
46742 	return false;
46743     }
46744 
46745   if (d->testing_p)
46746     return true;
46747 
46748   if (vmode == V8SImode)
46749     for (i = 0; i < 8; ++i)
46750       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46751   else if (vmode == V16SImode)
46752     for (i = 0; i < 16; ++i)
46753       rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46754   else
46755     {
46756       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46757       if (!d->one_operand_p)
46758 	mask = 2 * nelt - 1;
46759       else if (vmode == V16QImode)
46760 	mask = nelt - 1;
46761       else if (vmode == V64QImode)
46762 	mask = nelt / 4 - 1;
46763       else
46764 	mask = nelt / 2 - 1;
46765 
46766       for (i = 0; i < nelt; ++i)
46767 	{
46768 	  unsigned j, e = d->perm[i] & mask;
46769 	  for (j = 0; j < eltsz; ++j)
46770 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46771 	}
46772     }
46773 
46774   vperm = gen_rtx_CONST_VECTOR (vmode,
46775 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46776   vperm = force_reg (vmode, vperm);
46777 
46778   target = d->target;
46779   if (d->vmode != vmode)
46780     target = gen_reg_rtx (vmode);
46781   op0 = gen_lowpart (vmode, d->op0);
46782   if (d->one_operand_p)
46783     {
46784       if (vmode == V16QImode)
46785 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46786       else if (vmode == V32QImode)
46787 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46788       else if (vmode == V64QImode)
46789 	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46790       else if (vmode == V8SFmode)
46791 	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46792       else if (vmode == V8SImode)
46793 	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46794       else if (vmode == V16SFmode)
46795 	emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46796       else if (vmode == V16SImode)
46797 	emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46798       else
46799 	gcc_unreachable ();
46800     }
46801   else
46802     {
46803       op1 = gen_lowpart (vmode, d->op1);
46804       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46805     }
46806   if (target != d->target)
46807     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46808 
46809   return true;
46810 }
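
/* A sketch of how the pshufb control vector is built above: for a
   one-operand V8HImode permutation {1, 0, 3, 2, 5, 4, 7, 6}, eltsz is 2,
   so element index e expands to byte indices 2*e and 2*e + 1 and the
   V16QImode control vector becomes
	{ 2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13 }
   which is the operand pshufb expects.  */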
46811 
46812 /* For V*[QHS]Imode permutations, check whether the same permutation
46813    can be performed in a 2x, 4x or 8x wider inner mode; fill ND if so.  */
46814 
46815 static bool
46816 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46817 			      struct expand_vec_perm_d *nd)
46818 {
46819   int i;
46820   machine_mode mode = VOIDmode;
46821 
46822   switch (d->vmode)
46823     {
46824     case E_V16QImode: mode = V8HImode; break;
46825     case E_V32QImode: mode = V16HImode; break;
46826     case E_V64QImode: mode = V32HImode; break;
46827     case E_V8HImode: mode = V4SImode; break;
46828     case E_V16HImode: mode = V8SImode; break;
46829     case E_V32HImode: mode = V16SImode; break;
46830     case E_V4SImode: mode = V2DImode; break;
46831     case E_V8SImode: mode = V4DImode; break;
46832     case E_V16SImode: mode = V8DImode; break;
46833     default: return false;
46834     }
46835   for (i = 0; i < d->nelt; i += 2)
46836     if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46837       return false;
46838   nd->vmode = mode;
46839   nd->nelt = d->nelt / 2;
46840   for (i = 0; i < nd->nelt; i++)
46841     nd->perm[i] = d->perm[2 * i] / 2;
46842   if (GET_MODE_INNER (mode) != DImode)
46843     canonicalize_vector_int_perm (nd, nd);
46844   if (nd != d)
46845     {
46846       nd->one_operand_p = d->one_operand_p;
46847       nd->testing_p = d->testing_p;
46848       if (d->op0 == d->op1)
46849 	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46850       else
46851 	{
46852 	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
46853 	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
46854 	}
46855       if (d->testing_p)
46856 	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46857       else
46858 	nd->target = gen_reg_rtx (nd->vmode);
46859     }
46860   return true;
46861 }
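
/* An example of the canonicalization above (a sketch): the V16QImode
   permutation { 2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13 } moves
   bytes in aligned pairs, so it is rewritten as the V8HImode permutation
   { 1, 0, 3, 2, 5, 4, 7, 6 }.  That permutation no longer moves elements
   in aligned pairs, so the recursion stops there.  */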
46862 
46863 /* Try to expand one-operand permutation with constant mask.  */
46864 
46865 static bool
46866 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46867 {
46868   machine_mode mode = GET_MODE (d->op0);
46869   machine_mode maskmode = mode;
46870   rtx (*gen) (rtx, rtx, rtx) = NULL;
46871   rtx target, op0, mask;
46872   rtx vec[64];
46873 
46874   if (!rtx_equal_p (d->op0, d->op1))
46875     return false;
46876 
46877   if (!TARGET_AVX512F)
46878     return false;
46879 
46880   switch (mode)
46881     {
46882     case E_V16SImode:
46883       gen = gen_avx512f_permvarv16si;
46884       break;
46885     case E_V16SFmode:
46886       gen = gen_avx512f_permvarv16sf;
46887       maskmode = V16SImode;
46888       break;
46889     case E_V8DImode:
46890       gen = gen_avx512f_permvarv8di;
46891       break;
46892     case E_V8DFmode:
46893       gen = gen_avx512f_permvarv8df;
46894       maskmode = V8DImode;
46895       break;
46896     default:
46897       return false;
46898     }
46899 
46900   target = d->target;
46901   op0 = d->op0;
46902   for (int i = 0; i < d->nelt; ++i)
46903     vec[i] = GEN_INT (d->perm[i]);
46904   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46905   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46906   return true;
46907 }
46908 
46909 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
46910    in a single instruction.  */
46911 
46912 static bool
46913 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46914 {
46915   unsigned i, nelt = d->nelt;
46916   struct expand_vec_perm_d nd;
46917 
46918   /* Check plain VEC_SELECT first, because AVX has instructions that could
46919      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46920      input where SEL+CONCAT may not.  */
46921   if (d->one_operand_p)
46922     {
46923       int mask = nelt - 1;
46924       bool identity_perm = true;
46925       bool broadcast_perm = true;
46926 
46927       for (i = 0; i < nelt; i++)
46928 	{
46929 	  nd.perm[i] = d->perm[i] & mask;
46930 	  if (nd.perm[i] != i)
46931 	    identity_perm = false;
46932 	  if (nd.perm[i])
46933 	    broadcast_perm = false;
46934 	}
46935 
46936       if (identity_perm)
46937 	{
46938 	  if (!d->testing_p)
46939 	    emit_move_insn (d->target, d->op0);
46940 	  return true;
46941 	}
46942       else if (broadcast_perm && TARGET_AVX2)
46943 	{
46944 	  /* Use vpbroadcast{b,w,d}.  */
46945 	  rtx (*gen) (rtx, rtx) = NULL;
46946 	  switch (d->vmode)
46947 	    {
46948 	    case E_V64QImode:
46949 	      if (TARGET_AVX512BW)
46950 		gen = gen_avx512bw_vec_dupv64qi_1;
46951 	      break;
46952 	    case E_V32QImode:
46953 	      gen = gen_avx2_pbroadcastv32qi_1;
46954 	      break;
46955 	    case E_V32HImode:
46956 	      if (TARGET_AVX512BW)
46957 		gen = gen_avx512bw_vec_dupv32hi_1;
46958 	      break;
46959 	    case E_V16HImode:
46960 	      gen = gen_avx2_pbroadcastv16hi_1;
46961 	      break;
46962 	    case E_V16SImode:
46963 	      if (TARGET_AVX512F)
46964 		gen = gen_avx512f_vec_dupv16si_1;
46965 	      break;
46966 	    case E_V8SImode:
46967 	      gen = gen_avx2_pbroadcastv8si_1;
46968 	      break;
46969 	    case E_V16QImode:
46970 	      gen = gen_avx2_pbroadcastv16qi;
46971 	      break;
46972 	    case E_V8HImode:
46973 	      gen = gen_avx2_pbroadcastv8hi;
46974 	      break;
46975 	    case E_V16SFmode:
46976 	      if (TARGET_AVX512F)
46977 		gen = gen_avx512f_vec_dupv16sf_1;
46978 	      break;
46979 	    case E_V8SFmode:
46980 	      gen = gen_avx2_vec_dupv8sf_1;
46981 	      break;
46982 	    case E_V8DFmode:
46983 	      if (TARGET_AVX512F)
46984 		gen = gen_avx512f_vec_dupv8df_1;
46985 	      break;
46986 	    case E_V8DImode:
46987 	      if (TARGET_AVX512F)
46988 		gen = gen_avx512f_vec_dupv8di_1;
46989 	      break;
46990 	    /* For other modes prefer other shuffles this function creates.  */
46991 	    default: break;
46992 	    }
46993 	  if (gen != NULL)
46994 	    {
46995 	      if (!d->testing_p)
46996 		emit_insn (gen (d->target, d->op0));
46997 	      return true;
46998 	    }
46999 	}
47000 
47001       if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47002 	return true;
47003 
47004       /* There are plenty of patterns in sse.md that are written for
47005 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
47006 	 that should be changed, to avoid the nastiness here.  */
47007 
47008       /* Recognize interleave style patterns, which means incrementing
47009 	 every other permutation operand.  */
47010       for (i = 0; i < nelt; i += 2)
47011 	{
47012 	  nd.perm[i] = d->perm[i] & mask;
47013 	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47014 	}
47015       if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47016 				  d->testing_p))
47017 	return true;
47018 
47019       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
47020       if (nelt >= 4)
47021 	{
47022 	  for (i = 0; i < nelt; i += 4)
47023 	    {
47024 	      nd.perm[i + 0] = d->perm[i + 0] & mask;
47025 	      nd.perm[i + 1] = d->perm[i + 1] & mask;
47026 	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47027 	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47028 	    }
47029 
47030 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47031 				      d->testing_p))
47032 	    return true;
47033 	}
47034     }
47035 
47036   /* Finally, try the fully general two operand permute.  */
47037   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47038 			      d->testing_p))
47039     return true;
47040 
47041   /* Recognize interleave style patterns with reversed operands.  */
47042   if (!d->one_operand_p)
47043     {
47044       for (i = 0; i < nelt; ++i)
47045 	{
47046 	  unsigned e = d->perm[i];
47047 	  if (e >= nelt)
47048 	    e -= nelt;
47049 	  else
47050 	    e += nelt;
47051 	  nd.perm[i] = e;
47052 	}
47053 
47054       if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47055 				  d->testing_p))
47056 	return true;
47057     }
47058 
47059   /* Try the SSE4.1 blend variable merge instructions.  */
47060   if (expand_vec_perm_blend (d))
47061     return true;
47062 
47063   /* Try one of the AVX vpermil variable permutations.  */
47064   if (expand_vec_perm_vpermil (d))
47065     return true;
47066 
47067   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47068      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
47069   if (expand_vec_perm_pshufb (d))
47070     return true;
47071 
47072   /* Try the AVX2 vpalignr instruction.  */
47073   if (expand_vec_perm_palignr (d, true))
47074     return true;
47075 
47076   /* Try the AVX512F vperm{s,d} instructions.  */
47077   if (ix86_expand_vec_one_operand_perm_avx512 (d))
47078     return true;
47079 
47080   /* Try the AVX512F vpermt2/vpermi2 instructions.  */
47081   if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47082     return true;
47083 
47084   /* See if we can get the same permutation in different vector integer
47085      mode.  */
47086   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47087     {
47088       if (!d->testing_p)
47089 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47090       return true;
47091     }
47092   return false;
47093 }
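
/* A few examples of the single-insn cases tried above (a sketch):
     - a one-operand permutation of all zeros becomes a broadcast
       (vpbroadcast / vec_dup when available);
     - a one-operand V4SImode permutation { 1, 0, 3, 2 } is a plain
       VEC_SELECT and matches pshufd;
     - a two-operand V4SImode permutation { 0, 4, 1, 5 } is matched by
       the SEL+CONCAT attempt as the interleave-low pattern (punpckldq).
   Anything not matched here falls through to the multi-insn strategies
   that follow.  */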
47094 
47095 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
47096    in terms of a pair of pshuflw + pshufhw instructions.  */
47097 
47098 static bool
47099 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47100 {
47101   unsigned char perm2[MAX_VECT_LEN];
47102   unsigned i;
47103   bool ok;
47104 
47105   if (d->vmode != V8HImode || !d->one_operand_p)
47106     return false;
47107 
47108   /* The two permutations only operate in 64-bit lanes.  */
47109   for (i = 0; i < 4; ++i)
47110     if (d->perm[i] >= 4)
47111       return false;
47112   for (i = 4; i < 8; ++i)
47113     if (d->perm[i] < 4)
47114       return false;
47115 
47116   if (d->testing_p)
47117     return true;
47118 
47119   /* Emit the pshuflw.  */
47120   memcpy (perm2, d->perm, 4);
47121   for (i = 4; i < 8; ++i)
47122     perm2[i] = i;
47123   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47124   gcc_assert (ok);
47125 
47126   /* Emit the pshufhw.  */
47127   memcpy (perm2 + 4, d->perm + 4, 4);
47128   for (i = 0; i < 4; ++i)
47129     perm2[i] = i;
47130   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47131   gcc_assert (ok);
47132 
47133   return true;
47134 }
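
/* Example for the split above (a sketch): the V8HImode permutation
   { 3, 2, 1, 0, 7, 6, 5, 4 } keeps the low four elements within the low
   quadword and the high four within the high quadword, so it expands to
	pshuflw $0x1b ; pshufhw $0x1b
   where each immediate encodes the reversed order 3,2,1,0 within its
   own 64-bit half.  */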
47135 
47136 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
47137    the permutation using the SSSE3 palignr instruction.  This succeeds
47138    when all of the elements in PERM fit within one vector and we merely
47139    need to shift them down so that a single vector permutation has a
47140    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
47141    the vpalignr instruction itself can perform the requested permutation.  */
47142 
47143 static bool
47144 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47145 {
47146   unsigned i, nelt = d->nelt;
47147   unsigned min, max, minswap, maxswap;
47148   bool in_order, ok, swap = false;
47149   rtx shift, target;
47150   struct expand_vec_perm_d dcopy;
47151 
47152   /* Even with AVX, palignr only operates on 128-bit vectors;
47153      with AVX2, palignr operates on both 128-bit lanes.  */
47154   if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47155       && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47156     return false;
47157 
47158   min = 2 * nelt;
47159   max = 0;
47160   minswap = 2 * nelt;
47161   maxswap = 0;
47162   for (i = 0; i < nelt; ++i)
47163     {
47164       unsigned e = d->perm[i];
47165       unsigned eswap = d->perm[i] ^ nelt;
47166       if (GET_MODE_SIZE (d->vmode) == 32)
47167 	{
47168 	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47169 	  eswap = e ^ (nelt / 2);
47170 	}
47171       if (e < min)
47172 	min = e;
47173       if (e > max)
47174 	max = e;
47175       if (eswap < minswap)
47176 	minswap = eswap;
47177       if (eswap > maxswap)
47178 	maxswap = eswap;
47179     }
47180   if (min == 0
47181       || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47182     {
47183       if (d->one_operand_p
47184 	  || minswap == 0
47185 	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47186 				   ? nelt / 2 : nelt))
47187 	return false;
47188       swap = true;
47189       min = minswap;
47190       max = maxswap;
47191     }
47192 
47193   /* Given that we have SSSE3, we know we'll be able to implement the
47194      single operand permutation after the palignr with pshufb for
47195      128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
47196      first.  */
47197   if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47198     return true;
47199 
47200   dcopy = *d;
47201   if (swap)
47202     {
47203       dcopy.op0 = d->op1;
47204       dcopy.op1 = d->op0;
47205       for (i = 0; i < nelt; ++i)
47206 	dcopy.perm[i] ^= nelt;
47207     }
47208 
47209   in_order = true;
47210   for (i = 0; i < nelt; ++i)
47211     {
47212       unsigned e = dcopy.perm[i];
47213       if (GET_MODE_SIZE (d->vmode) == 32
47214 	  && e >= nelt
47215 	  && (e & (nelt / 2 - 1)) < min)
47216 	e = e - min - (nelt / 2);
47217       else
47218 	e = e - min;
47219       if (e != i)
47220 	in_order = false;
47221       dcopy.perm[i] = e;
47222     }
47223   dcopy.one_operand_p = true;
47224 
47225   if (single_insn_only_p && !in_order)
47226     return false;
47227 
47228   /* For AVX2, test whether we can permute the result in one instruction.  */
47229   if (d->testing_p)
47230     {
47231       if (in_order)
47232 	return true;
47233       dcopy.op1 = dcopy.op0;
47234       return expand_vec_perm_1 (&dcopy);
47235     }
47236 
47237   shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47238   if (GET_MODE_SIZE (d->vmode) == 16)
47239     {
47240       target = gen_reg_rtx (TImode);
47241       emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47242 				      gen_lowpart (TImode, dcopy.op0), shift));
47243     }
47244   else
47245     {
47246       target = gen_reg_rtx (V2TImode);
47247       emit_insn (gen_avx2_palignrv2ti (target,
47248 				       gen_lowpart (V2TImode, dcopy.op1),
47249 				       gen_lowpart (V2TImode, dcopy.op0),
47250 				       shift));
47251     }
47252 
47253   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47254 
47255   /* Test for the degenerate case where the alignment by itself
47256      produces the desired permutation.  */
47257   if (in_order)
47258     {
47259       emit_move_insn (d->target, dcopy.op0);
47260       return true;
47261     }
47262 
47263   ok = expand_vec_perm_1 (&dcopy);
47264   gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47265 
47266   return ok;
47267 }
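
/* A worked example for the palignr path above (a sketch): the
   two-operand V16QImode permutation { 3, 4, ..., 17, 18 } has min == 3
   and max == 18, so every selected byte lies in one 16-byte window of
   the op1:op0 concatenation.  palignr with an immediate of 3 shifts that
   concatenation right by three bytes, after which dcopy.perm is the
   identity, in_order is true, and the single palignr already implements
   the whole permutation.  */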
47268 
47269 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
47270    the permutation using the SSE4_1 pblendv instruction.  Potentially
47271    reduces the permutation from 2 pshufb plus an OR to 1 pshufb and a pblendv.  */
47272 
47273 static bool
47274 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47275 {
47276   unsigned i, which, nelt = d->nelt;
47277   struct expand_vec_perm_d dcopy, dcopy1;
47278   machine_mode vmode = d->vmode;
47279   bool ok;
47280 
47281   /* Use the same checks as in expand_vec_perm_blend.  */
47282   if (d->one_operand_p)
47283     return false;
47284   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47285     ;
47286   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47287     ;
47288   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47289     ;
47290   else
47291     return false;
47292 
47293   /* Figure out where permutation elements stay not in their
47294      respective lanes.  */
47295   for (i = 0, which = 0; i < nelt; ++i)
47296     {
47297       unsigned e = d->perm[i];
47298       if (e != i)
47299 	which |= (e < nelt ? 1 : 2);
47300     }
47301   /* We can pblend the part where elements stay not in their
47302      respective lanes only when these elements are all in one
47303      half of a permutation.
47304      {0 1 8 3 4 5 9 7} is ok as 8, 9 are not at their respective
47305      lanes, but both 8 and 9 >= 8.
47306      {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
47307      respective lanes and 8 >= 8, but 2 is not.  */
47308   if (which != 1 && which != 2)
47309     return false;
47310   if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47311     return true;
47312 
47313   /* First we apply one operand permutation to the part where
47314      elements stay not in their respective lanes.  */
47315   dcopy = *d;
47316   if (which == 2)
47317     dcopy.op0 = dcopy.op1 = d->op1;
47318   else
47319     dcopy.op0 = dcopy.op1 = d->op0;
47320   if (!d->testing_p)
47321     dcopy.target = gen_reg_rtx (vmode);
47322   dcopy.one_operand_p = true;
47323 
47324   for (i = 0; i < nelt; ++i)
47325     dcopy.perm[i] = d->perm[i] & (nelt - 1);
47326 
47327   ok = expand_vec_perm_1 (&dcopy);
47328   if (GET_MODE_SIZE (vmode) != 16 && !ok)
47329     return false;
47330   else
47331     gcc_assert (ok);
47332   if (d->testing_p)
47333     return true;
47334 
47335   /* Next we put permuted elements into their positions.  */
47336   dcopy1 = *d;
47337   if (which == 2)
47338     dcopy1.op1 = dcopy.target;
47339   else
47340     dcopy1.op0 = dcopy.target;
47341 
47342   for (i = 0; i < nelt; ++i)
47343     dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
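  /* E.g. with nelt == 8 and d->perm == { 0 1 8 3 4 5 9 7 } (which == 2),
     dcopy permutes op1 with { 0 1 0 3 4 5 1 7 } so that elements 8 and 9
     land in positions 2 and 6, and dcopy1.perm becomes
     { 0 1 10 3 4 5 14 7 }, blending those two positions of dcopy's
     result into op0.  */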
47344 
47345   ok = expand_vec_perm_blend (&dcopy1);
47346   gcc_assert (ok);
47347 
47348   return true;
47349 }
47350 
47351 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47352 
47353 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
47354    a two vector permutation into a single vector permutation by using
47355    an interleave operation to merge the vectors.  */
47356 
47357 static bool
47358 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47359 {
47360   struct expand_vec_perm_d dremap, dfinal;
47361   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47362   unsigned HOST_WIDE_INT contents;
47363   unsigned char remap[2 * MAX_VECT_LEN];
47364   rtx_insn *seq;
47365   bool ok, same_halves = false;
47366 
47367   if (GET_MODE_SIZE (d->vmode) == 16)
47368     {
47369       if (d->one_operand_p)
47370 	return false;
47371     }
47372   else if (GET_MODE_SIZE (d->vmode) == 32)
47373     {
47374       if (!TARGET_AVX)
47375 	return false;
47376       /* For 32-byte modes allow even d->one_operand_p.
47377 	 The lack of cross-lane shuffling in some instructions
47378 	 might prevent a single insn shuffle.  */
47379       dfinal = *d;
47380       dfinal.testing_p = true;
47381       /* If expand_vec_perm_interleave3 can expand this into
47382 	 a 3 insn sequence, give up and let it be expanded that
47383 	 way.  While that is one insn longer, it doesn't need a
47384 	 memory operand, and in the common case that the
47385 	 interleave low and interleave high permutations with the
47386 	 same operands are adjacent, both need only 4 insns
47387 	 total after CSE.  */
47388       if (expand_vec_perm_interleave3 (&dfinal))
47389 	return false;
47390     }
47391   else
47392     return false;
47393 
47394   /* Examine from whence the elements come.  */
47395   contents = 0;
47396   for (i = 0; i < nelt; ++i)
47397     contents |= HOST_WIDE_INT_1U << d->perm[i];
47398 
47399   memset (remap, 0xff, sizeof (remap));
47400   dremap = *d;
47401 
47402   if (GET_MODE_SIZE (d->vmode) == 16)
47403     {
47404       unsigned HOST_WIDE_INT h1, h2, h3, h4;
47405 
47406       /* Split the two input vectors into 4 halves.  */
47407       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47408       h2 = h1 << nelt2;
47409       h3 = h2 << nelt2;
47410       h4 = h3 << nelt2;
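      /* E.g. for V8HImode nelt2 == 4, so h1 == 0x000f (low half of op0),
	 h2 == 0x00f0 (high half of op0), h3 == 0x0f00 (low half of op1)
	 and h4 == 0xf000 (high half of op1).  */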
47411 
47412       /* If all elements are from the low halves, use interleave low; likewise
47413 	 interleave high for the high halves.  If the elements are from mismatched
47414 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
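      /* E.g. for V4SImode, when all elements come from the low halves,
	 the interleave low of op0 and op1 is { op0[0] op1[0] op0[1] op1[1] },
	 so dremap.perm becomes { 0 4 1 5 } and remap records where each
	 original element ends up in that intermediate vector.  */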
47415       if ((contents & (h1 | h3)) == contents)
47416 	{
47417 	  /* punpckl* */
47418 	  for (i = 0; i < nelt2; ++i)
47419 	    {
47420 	      remap[i] = i * 2;
47421 	      remap[i + nelt] = i * 2 + 1;
47422 	      dremap.perm[i * 2] = i;
47423 	      dremap.perm[i * 2 + 1] = i + nelt;
47424 	    }
47425 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
47426 	    dremap.vmode = V4SFmode;
47427 	}
47428       else if ((contents & (h2 | h4)) == contents)
47429 	{
47430 	  /* punpckh* */
47431 	  for (i = 0; i < nelt2; ++i)
47432 	    {
47433 	      remap[i + nelt2] = i * 2;
47434 	      remap[i + nelt + nelt2] = i * 2 + 1;
47435 	      dremap.perm[i * 2] = i + nelt2;
47436 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47437 	    }
47438 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
47439 	    dremap.vmode = V4SFmode;
47440 	}
47441       else if ((contents & (h1 | h4)) == contents)
47442 	{
47443 	  /* shufps */
47444 	  for (i = 0; i < nelt2; ++i)
47445 	    {
47446 	      remap[i] = i;
47447 	      remap[i + nelt + nelt2] = i + nelt2;
47448 	      dremap.perm[i] = i;
47449 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
47450 	    }
47451 	  if (nelt != 4)
47452 	    {
47453 	      /* shufpd */
47454 	      dremap.vmode = V2DImode;
47455 	      dremap.nelt = 2;
47456 	      dremap.perm[0] = 0;
47457 	      dremap.perm[1] = 3;
47458 	    }
47459 	}
47460       else if ((contents & (h2 | h3)) == contents)
47461 	{
47462 	  /* shufps */
47463 	  for (i = 0; i < nelt2; ++i)
47464 	    {
47465 	      remap[i + nelt2] = i;
47466 	      remap[i + nelt] = i + nelt2;
47467 	      dremap.perm[i] = i + nelt2;
47468 	      dremap.perm[i + nelt2] = i + nelt;
47469 	    }
47470 	  if (nelt != 4)
47471 	    {
47472 	      /* shufpd */
47473 	      dremap.vmode = V2DImode;
47474 	      dremap.nelt = 2;
47475 	      dremap.perm[0] = 1;
47476 	      dremap.perm[1] = 2;
47477 	    }
47478 	}
47479       else
47480 	return false;
47481     }
47482   else
47483     {
47484       unsigned int nelt4 = nelt / 4, nzcnt = 0;
47485       unsigned HOST_WIDE_INT q[8];
47486       unsigned int nonzero_halves[4];
47487 
47488       /* Split the two input vectors into 8 quarters.  */
47489       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47490       for (i = 1; i < 8; ++i)
47491 	q[i] = q[0] << (nelt4 * i);
47492       for (i = 0; i < 4; ++i)
47493 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47494 	  {
47495 	    nonzero_halves[nzcnt] = i;
47496 	    ++nzcnt;
47497 	  }
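      /* E.g. for V32QImode nelt4 == 8, so q[0] == 0xff covers bytes 0-7
	 of op0 and q[7] covers bytes 24-31 of op1; nonzero_halves records
	 which of the four 16-byte lanes (0 and 1 in op0, 2 and 3 in op1)
	 supply at least one element of the permutation.  */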
47498 
47499       if (nzcnt == 1)
47500 	{
47501 	  gcc_assert (d->one_operand_p);
47502 	  nonzero_halves[1] = nonzero_halves[0];
47503 	  same_halves = true;
47504 	}
47505       else if (d->one_operand_p)
47506 	{
47507 	  gcc_assert (nonzero_halves[0] == 0);
47508 	  gcc_assert (nonzero_halves[1] == 1);
47509 	}
47510 
47511       if (nzcnt <= 2)
47512 	{
47513 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
47514 	    {
47515 	      /* Attempt to increase the likelihood that dfinal
47516 		 shuffle will be intra-lane.  */
47517 	      std::swap (nonzero_halves[0], nonzero_halves[1]);
47518 	    }
47519 
47520 	  /* vperm2f128 or vperm2i128.  */
47521 	  for (i = 0; i < nelt2; ++i)
47522 	    {
47523 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47524 	      remap[i + nonzero_halves[0] * nelt2] = i;
47525 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47526 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47527 	    }
47528 
47529 	  if (d->vmode != V8SFmode
47530 	      && d->vmode != V4DFmode
47531 	      && d->vmode != V8SImode)
47532 	    {
47533 	      dremap.vmode = V8SImode;
47534 	      dremap.nelt = 8;
47535 	      for (i = 0; i < 4; ++i)
47536 		{
47537 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
47538 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47539 		}
47540 	    }
47541 	}
47542       else if (d->one_operand_p)
47543 	return false;
47544       else if (TARGET_AVX2
47545 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47546 	{
47547 	  /* vpunpckl* */
47548 	  for (i = 0; i < nelt4; ++i)
47549 	    {
47550 	      remap[i] = i * 2;
47551 	      remap[i + nelt] = i * 2 + 1;
47552 	      remap[i + nelt2] = i * 2 + nelt2;
47553 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47554 	      dremap.perm[i * 2] = i;
47555 	      dremap.perm[i * 2 + 1] = i + nelt;
47556 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
47557 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47558 	    }
47559 	}
47560       else if (TARGET_AVX2
47561 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47562 	{
47563 	  /* vpunpckh* */
47564 	  for (i = 0; i < nelt4; ++i)
47565 	    {
47566 	      remap[i + nelt4] = i * 2;
47567 	      remap[i + nelt + nelt4] = i * 2 + 1;
47568 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47569 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47570 	      dremap.perm[i * 2] = i + nelt4;
47571 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47572 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47573 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47574 	    }
47575 	}
47576       else
47577 	return false;
47578     }
47579 
47580   /* Use the remapping array set up above to move the elements from their
47581      swizzled locations into their final destinations.  */
47582   dfinal = *d;
47583   for (i = 0; i < nelt; ++i)
47584     {
47585       unsigned e = remap[d->perm[i]];
47586       gcc_assert (e < nelt);
47587       /* If same_halves is true, both halves of the remapped vector are the
47588 	 same.  Avoid cross-lane accesses if possible.  */
47589       if (same_halves && i >= nelt2)
47590 	{
47591 	  gcc_assert (e < nelt2);
47592 	  dfinal.perm[i] = e + nelt2;
47593 	}
47594       else
47595 	dfinal.perm[i] = e;
47596     }
47597   if (!d->testing_p)
47598     {
47599       dremap.target = gen_reg_rtx (dremap.vmode);
47600       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47601     }
47602   dfinal.op1 = dfinal.op0;
47603   dfinal.one_operand_p = true;
47604 
47605   /* Test if the final remap can be done with a single insn.  For V4SFmode or
47606      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
47607   start_sequence ();
47608   ok = expand_vec_perm_1 (&dfinal);
47609   seq = get_insns ();
47610   end_sequence ();
47611 
47612   if (!ok)
47613     return false;
47614 
47615   if (d->testing_p)
47616     return true;
47617 
47618   if (dremap.vmode != dfinal.vmode)
47619     {
47620       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47621       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47622     }
47623 
47624   ok = expand_vec_perm_1 (&dremap);
47625   gcc_assert (ok);
47626 
47627   emit_insn (seq);
47628   return true;
47629 }
47630 
47631 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
47632    a single vector cross-lane permutation into vpermq followed
47633    by any of the single insn permutations.  */
47634 
47635 static bool
47636 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47637 {
47638   struct expand_vec_perm_d dremap, dfinal;
47639   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47640   unsigned contents[2];
47641   bool ok;
47642 
47643   if (!(TARGET_AVX2
47644 	&& (d->vmode == V32QImode || d->vmode == V16HImode)
47645 	&& d->one_operand_p))
47646     return false;
47647 
47648   contents[0] = 0;
47649   contents[1] = 0;
47650   for (i = 0; i < nelt2; ++i)
47651     {
47652       contents[0] |= 1u << (d->perm[i] / nelt4);
47653       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47654     }
47655 
47656   for (i = 0; i < 2; ++i)
47657     {
47658       unsigned int cnt = 0;
47659       for (j = 0; j < 4; ++j)
47660 	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47661 	  return false;
47662     }
47663 
47664   if (d->testing_p)
47665     return true;
47666 
47667   dremap = *d;
47668   dremap.vmode = V4DImode;
47669   dremap.nelt = 4;
47670   dremap.target = gen_reg_rtx (V4DImode);
47671   dremap.op0 = gen_lowpart (V4DImode, d->op0);
47672   dremap.op1 = dremap.op0;
47673   dremap.one_operand_p = true;
47674   for (i = 0; i < 2; ++i)
47675     {
47676       unsigned int cnt = 0;
47677       for (j = 0; j < 4; ++j)
47678 	if ((contents[i] & (1u << j)) != 0)
47679 	  dremap.perm[2 * i + cnt++] = j;
47680       for (; cnt < 2; ++cnt)
47681 	dremap.perm[2 * i + cnt] = 0;
47682     }
47683 
47684   dfinal = *d;
47685   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47686   dfinal.op1 = dfinal.op0;
47687   dfinal.one_operand_p = true;
47688   for (i = 0, j = 0; i < nelt; ++i)
47689     {
47690       if (i == nelt2)
47691 	j = 2;
47692       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47693       if ((d->perm[i] / nelt4) == dremap.perm[j])
47694 	;
47695       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47696 	dfinal.perm[i] |= nelt4;
47697       else
47698 	gcc_unreachable ();
47699     }
47700 
47701   ok = expand_vec_perm_1 (&dremap);
47702   gcc_assert (ok);
47703 
47704   ok = expand_vec_perm_1 (&dfinal);
47705   gcc_assert (ok);
47706 
47707   return true;
47708 }
47709 
47710 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
47711    a vector permutation using two instructions: vperm2f128 (resp.
47712    vperm2i128) followed by any single in-lane permutation.  */
47713 
47714 static bool
47715 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47716 {
47717   struct expand_vec_perm_d dfirst, dsecond;
47718   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47719   bool ok;
47720 
47721   if (!TARGET_AVX
47722       || GET_MODE_SIZE (d->vmode) != 32
47723       || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47724     return false;
47725 
47726   dsecond = *d;
47727   dsecond.one_operand_p = false;
47728   dsecond.testing_p = true;
47729 
47730   /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47731      immediate.  For perm < 16 the second permutation uses
47732      d->op0 as first operand, for perm >= 16 it uses d->op1
47733      as first operand.  The second operand is the result of
47734      vperm2[fi]128.  */
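  /* The low four bits of PERM hold two two-bit lane selectors: bits 0-1
     choose the source lane of the low result lane and bits 2-3 that of
     the high result lane, where 0/1 are the low/high lane of d->op0 and
     2/3 the low/high lane of d->op1.  E.g. perm == 1 describes the lane
     swap of d->op0 and yields the immediate 0x01.  */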
47735   for (perm = 0; perm < 32; perm++)
47736     {
47737       /* Ignore permutations which do not move anything cross-lane.  */
47738       if (perm < 16)
47739 	{
47740 	  /* The second shuffle for e.g. V4DFmode has
47741 	     0123 and ABCD operands.
47742 	     Ignore AB23, as 23 is already in the second lane
47743 	     of the first operand.  */
47744 	  if ((perm & 0xc) == (1 << 2)) continue;
47745 	  /* And 01CD, as 01 is in the first lane of the first
47746 	     operand.  */
47747 	  if ((perm & 3) == 0) continue;
47748 	  /* And 4567, as then the vperm2[fi]128 doesn't change
47749 	     anything on the original 4567 second operand.  */
47750 	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47751 	}
47752       else
47753 	{
47754 	  /* The second shuffle for e.g. V4DFmode has
47755 	     4567 and ABCD operands.
47756 	     Ignore AB67, as 67 is already in the second lane
47757 	     of the first operand.  */
47758 	  if ((perm & 0xc) == (3 << 2)) continue;
47759 	  /* And 45CD, as 45 is in the first lane of the first
47760 	     operand.  */
47761 	  if ((perm & 3) == 2) continue;
47762 	  /* And 0123, as then the vperm2[fi]128 doesn't change
47763 	     anything on the original 0123 first operand.  */
47764 	  if ((perm & 0xf) == (1 << 2)) continue;
47765 	}
47766 
47767       for (i = 0; i < nelt; i++)
47768 	{
47769 	  j = d->perm[i] / nelt2;
47770 	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47771 	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47772 	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47773 	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
47774 	  else
47775 	    break;
47776 	}
47777 
47778       if (i == nelt)
47779 	{
47780 	  start_sequence ();
47781 	  ok = expand_vec_perm_1 (&dsecond);
47782 	  end_sequence ();
47783 	}
47784       else
47785 	ok = false;
47786 
47787       if (ok)
47788 	{
47789 	  if (d->testing_p)
47790 	    return true;
47791 
47792 	  /* Found a usable second shuffle.  dfirst will be
47793 	     vperm2f128 on d->op0 and d->op1.  */
47794 	  dsecond.testing_p = false;
47795 	  dfirst = *d;
47796 	  dfirst.target = gen_reg_rtx (d->vmode);
47797 	  for (i = 0; i < nelt; i++)
47798 	    dfirst.perm[i] = (i & (nelt2 - 1))
47799 			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47800 
47801 	  canonicalize_perm (&dfirst);
47802 	  ok = expand_vec_perm_1 (&dfirst);
47803 	  gcc_assert (ok);
47804 
47805 	  /* And dsecond is some single insn shuffle, taking
47806 	     d->op0 and result of vperm2f128 (if perm < 16) or
47807 	     d->op1 and result of vperm2f128 (otherwise).  */
47808 	  if (perm >= 16)
47809 	    dsecond.op0 = dsecond.op1;
47810 	  dsecond.op1 = dfirst.target;
47811 
47812 	  ok = expand_vec_perm_1 (&dsecond);
47813 	  gcc_assert (ok);
47814 
47815 	  return true;
47816 	}
47817 
47818       /* For one operand, the only useful vperm2f128 permutation is 0x01
47819 	 aka lanes swap.  */
47820       if (d->one_operand_p)
47821 	return false;
47822     }
47823 
47824   return false;
47825 }
47826 
47827 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
47828    a two vector permutation using 2 intra-lane interleave insns
47829    and cross-lane shuffle for 32-byte vectors.  */
47830 
47831 static bool
47832 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47833 {
47834   unsigned i, nelt;
47835   rtx (*gen) (rtx, rtx, rtx);
47836 
47837   if (d->one_operand_p)
47838     return false;
47839   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47840     ;
47841   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47842     ;
47843   else
47844     return false;
47845 
47846   nelt = d->nelt;
47847   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47848     return false;
47849   for (i = 0; i < nelt; i += 2)
47850     if (d->perm[i] != d->perm[0] + i / 2
47851 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47852       return false;
47853 
47854   if (d->testing_p)
47855     return true;
47856 
47857   switch (d->vmode)
47858     {
47859     case E_V32QImode:
47860       if (d->perm[0])
47861 	gen = gen_vec_interleave_highv32qi;
47862       else
47863 	gen = gen_vec_interleave_lowv32qi;
47864       break;
47865     case E_V16HImode:
47866       if (d->perm[0])
47867 	gen = gen_vec_interleave_highv16hi;
47868       else
47869 	gen = gen_vec_interleave_lowv16hi;
47870       break;
47871     case E_V8SImode:
47872       if (d->perm[0])
47873 	gen = gen_vec_interleave_highv8si;
47874       else
47875 	gen = gen_vec_interleave_lowv8si;
47876       break;
47877     case E_V4DImode:
47878       if (d->perm[0])
47879 	gen = gen_vec_interleave_highv4di;
47880       else
47881 	gen = gen_vec_interleave_lowv4di;
47882       break;
47883     case E_V8SFmode:
47884       if (d->perm[0])
47885 	gen = gen_vec_interleave_highv8sf;
47886       else
47887 	gen = gen_vec_interleave_lowv8sf;
47888       break;
47889     case E_V4DFmode:
47890       if (d->perm[0])
47891 	gen = gen_vec_interleave_highv4df;
47892       else
47893 	gen = gen_vec_interleave_lowv4df;
47894       break;
47895     default:
47896       gcc_unreachable ();
47897     }
47898 
47899   emit_insn (gen (d->target, d->op0, d->op1));
47900   return true;
47901 }
47902 
47903 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
47904    a single vector permutation using a single intra-lane vector
47905    permutation, vperm2f128 swapping the lanes and vblend* insn blending
47906    the non-swapped and swapped vectors together.  */
47907 
47908 static bool
47909 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47910 {
47911   struct expand_vec_perm_d dfirst, dsecond;
47912   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47913   rtx_insn *seq;
47914   bool ok;
47915   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47916 
47917   if (!TARGET_AVX
47918       || TARGET_AVX2
47919       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47920       || !d->one_operand_p)
47921     return false;
47922 
47923   dfirst = *d;
47924   for (i = 0; i < nelt; i++)
47925     dfirst.perm[i] = 0xff;
47926   for (i = 0, msk = 0; i < nelt; i++)
47927     {
47928       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47929       if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47930 	return false;
47931       dfirst.perm[j] = d->perm[i];
47932       if (j != i)
47933 	msk |= (1 << i);
47934     }
47935   for (i = 0; i < nelt; i++)
47936     if (dfirst.perm[i] == 0xff)
47937       dfirst.perm[i] = i;
47938 
47939   if (!d->testing_p)
47940     dfirst.target = gen_reg_rtx (dfirst.vmode);
47941 
47942   start_sequence ();
47943   ok = expand_vec_perm_1 (&dfirst);
47944   seq = get_insns ();
47945   end_sequence ();
47946 
47947   if (!ok)
47948     return false;
47949 
47950   if (d->testing_p)
47951     return true;
47952 
47953   emit_insn (seq);
47954 
47955   dsecond = *d;
47956   dsecond.op0 = dfirst.target;
47957   dsecond.op1 = dfirst.target;
47958   dsecond.one_operand_p = true;
47959   dsecond.target = gen_reg_rtx (dsecond.vmode);
47960   for (i = 0; i < nelt; i++)
47961     dsecond.perm[i] = i ^ nelt2;
47962 
47963   ok = expand_vec_perm_1 (&dsecond);
47964   gcc_assert (ok);
47965 
47966   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47967   emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47968   return true;
47969 }
47970 
47971 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
47972    permutation using two vperm2f128, followed by a vshufpd insn blending
47973    the two vectors together.  */
47974 
47975 static bool
47976 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47977 {
47978   struct expand_vec_perm_d dfirst, dsecond, dthird;
47979   bool ok;
47980 
47981   if (!TARGET_AVX || (d->vmode != V4DFmode))
47982     return false;
47983 
47984   if (d->testing_p)
47985     return true;
47986 
47987   dfirst = *d;
47988   dsecond = *d;
47989   dthird = *d;
47990 
47991   dfirst.perm[0] = (d->perm[0] & ~1);
47992   dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47993   dfirst.perm[2] = (d->perm[2] & ~1);
47994   dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47995   dsecond.perm[0] = (d->perm[1] & ~1);
47996   dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47997   dsecond.perm[2] = (d->perm[3] & ~1);
47998   dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47999   dthird.perm[0] = (d->perm[0] % 2);
48000   dthird.perm[1] = (d->perm[1] % 2) + 4;
48001   dthird.perm[2] = (d->perm[2] % 2) + 2;
48002   dthird.perm[3] = (d->perm[3] % 2) + 6;
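  /* E.g. for d->perm == { 2 7 1 4 }, dfirst selects { 2 3 0 1 } and
     dsecond selects { 6 7 4 5 } (each a single vperm2f128), and dthird
     then picks { 0 5 3 6 } from those two intermediates, which vshufpd
     can do.  */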
48003 
48004   dfirst.target = gen_reg_rtx (dfirst.vmode);
48005   dsecond.target = gen_reg_rtx (dsecond.vmode);
48006   dthird.op0 = dfirst.target;
48007   dthird.op1 = dsecond.target;
48008   dthird.one_operand_p = false;
48009 
48010   canonicalize_perm (&dfirst);
48011   canonicalize_perm (&dsecond);
48012 
48013   ok = expand_vec_perm_1 (&dfirst)
48014        && expand_vec_perm_1 (&dsecond)
48015        && expand_vec_perm_1 (&dthird);
48016 
48017   gcc_assert (ok);
48018 
48019   return true;
48020 }
48021 
48022 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
48023    permutation with two pshufb insns and an ior.  We should have already
48024    failed all two instruction sequences.  */
48025 
48026 static bool
48027 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48028 {
48029   rtx rperm[2][16], vperm, l, h, op, m128;
48030   unsigned int i, nelt, eltsz;
48031 
48032   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48033     return false;
48034   gcc_assert (!d->one_operand_p);
48035 
48036   if (d->testing_p)
48037     return true;
48038 
48039   nelt = d->nelt;
48040   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48041 
48042   /* Generate two permutation masks.  If the required element is within
48043      the given vector it is shuffled into the proper lane.  If the required
48044      element is in the other vector, force a zero into the lane by setting
48045      bit 7 in the permutation mask.  */
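  /* E.g. for V16QImode, if d->perm[0] == 20, the required byte is element 4
     of op1, so rperm[1][0] selects byte 4 while rperm[0][0] is -128; the
     final ior then picks that byte from the second pshufb result.  */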
48046   m128 = GEN_INT (-128);
48047   for (i = 0; i < nelt; ++i)
48048     {
48049       unsigned j, e = d->perm[i];
48050       unsigned which = (e >= nelt);
48051       if (e >= nelt)
48052 	e -= nelt;
48053 
48054       for (j = 0; j < eltsz; ++j)
48055 	{
48056 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48057 	  rperm[1-which][i*eltsz + j] = m128;
48058 	}
48059     }
48060 
48061   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48062   vperm = force_reg (V16QImode, vperm);
48063 
48064   l = gen_reg_rtx (V16QImode);
48065   op = gen_lowpart (V16QImode, d->op0);
48066   emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48067 
48068   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48069   vperm = force_reg (V16QImode, vperm);
48070 
48071   h = gen_reg_rtx (V16QImode);
48072   op = gen_lowpart (V16QImode, d->op1);
48073   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48074 
48075   op = d->target;
48076   if (d->vmode != V16QImode)
48077     op = gen_reg_rtx (V16QImode);
48078   emit_insn (gen_iorv16qi3 (op, l, h));
48079   if (op != d->target)
48080     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48081 
48082   return true;
48083 }
48084 
48085 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
48086    with two vpshufb insns, a vpermq and a vpor.  We should have already failed
48087    all two or three instruction sequences.  */
48088 
48089 static bool
48090 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48091 {
48092   rtx rperm[2][32], vperm, l, h, hp, op, m128;
48093   unsigned int i, nelt, eltsz;
48094 
48095   if (!TARGET_AVX2
48096       || !d->one_operand_p
48097       || (d->vmode != V32QImode && d->vmode != V16HImode))
48098     return false;
48099 
48100   if (d->testing_p)
48101     return true;
48102 
48103   nelt = d->nelt;
48104   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48105 
48106   /* Generate two permutation masks.  If the required element is within
48107      the same lane, it is shuffled in.  If the required element is from
48108      the other lane, force a zero by setting bit 7 in the permutation mask.
48109      The other mask has a non-negative element wherever an element is
48110      requested from the other lane, and it also moves that element to
48111      the other lane, so that the result of vpshufb can have its two
48112      V2TImode halves swapped.  */
48113   m128 = GEN_INT (-128);
48114   for (i = 0; i < nelt; ++i)
48115     {
48116       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48117       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48118 
48119       for (j = 0; j < eltsz; ++j)
48120 	{
48121 	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48122 	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48123 	}
48124     }
48125 
48126   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48127   vperm = force_reg (V32QImode, vperm);
48128 
48129   h = gen_reg_rtx (V32QImode);
48130   op = gen_lowpart (V32QImode, d->op0);
48131   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48132 
48133   /* Swap the 128-bit lanes of h into hp.  */
48134   hp = gen_reg_rtx (V4DImode);
48135   op = gen_lowpart (V4DImode, h);
48136   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48137 				  const1_rtx));
48138 
48139   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48140   vperm = force_reg (V32QImode, vperm);
48141 
48142   l = gen_reg_rtx (V32QImode);
48143   op = gen_lowpart (V32QImode, d->op0);
48144   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48145 
48146   op = d->target;
48147   if (d->vmode != V32QImode)
48148     op = gen_reg_rtx (V32QImode);
48149   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48150   if (op != d->target)
48151     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48152 
48153   return true;
48154 }
48155 
48156 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
48157    and extract-odd permutations of two V32QImode or V16HImode operands
48158    with two vpshufb insns, a vpor and a vpermq.  We should have already
48159    failed all two or three instruction sequences.  */
48160 
48161 static bool
48162 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48163 {
48164   rtx rperm[2][32], vperm, l, h, ior, op, m128;
48165   unsigned int i, nelt, eltsz;
48166 
48167   if (!TARGET_AVX2
48168       || d->one_operand_p
48169       || (d->vmode != V32QImode && d->vmode != V16HImode))
48170     return false;
48171 
48172   for (i = 0; i < d->nelt; ++i)
48173     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48174       return false;
48175 
48176   if (d->testing_p)
48177     return true;
48178 
48179   nelt = d->nelt;
48180   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48181 
48182   /* Generate two permutation masks.  In the first permutation mask
48183      the first quarter will contain indexes for the first half
48184      of the op0, the second quarter will contain bit 7 set, third quarter
48185      will contain indexes for the second half of the op0 and the
48186      last quarter bit 7 set.  In the second permutation mask
48187      the first quarter will contain bit 7 set, the second quarter
48188      indexes for the first half of the op1, the third quarter bit 7 set
48189      and last quarter indexes for the second half of the op1.
48190      I.e. the first mask e.g. for V32QImode extract even will be:
48191      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48192      (all values masked with 0xf except for -128) and second mask
48193      for extract even will be
48194      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
48195   m128 = GEN_INT (-128);
48196   for (i = 0; i < nelt; ++i)
48197     {
48198       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48199       unsigned which = d->perm[i] >= nelt;
48200       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48201 
48202       for (j = 0; j < eltsz; ++j)
48203 	{
48204 	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48205 	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48206 	}
48207     }
48208 
48209   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48210   vperm = force_reg (V32QImode, vperm);
48211 
48212   l = gen_reg_rtx (V32QImode);
48213   op = gen_lowpart (V32QImode, d->op0);
48214   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48215 
48216   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48217   vperm = force_reg (V32QImode, vperm);
48218 
48219   h = gen_reg_rtx (V32QImode);
48220   op = gen_lowpart (V32QImode, d->op1);
48221   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48222 
48223   ior = gen_reg_rtx (V32QImode);
48224   emit_insn (gen_iorv32qi3 (ior, l, h));
48225 
48226   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
48227   op = gen_reg_rtx (V4DImode);
48228   ior = gen_lowpart (V4DImode, ior);
48229   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48230 				  const1_rtx, GEN_INT (3)));
48231   emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48232 
48233   return true;
48234 }
48235 
48236 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
48237    and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48238    with two "and" and "pack" or two "shift" and "pack" insns.  We should
48239    have already failed all two instruction sequences.  */
48240 
48241 static bool
48242 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48243 {
48244   rtx op, dop0, dop1, t;
48245   unsigned i, odd, c, s, nelt = d->nelt;
48246   bool end_perm = false;
48247   machine_mode half_mode;
48248   rtx (*gen_and) (rtx, rtx, rtx);
48249   rtx (*gen_pack) (rtx, rtx, rtx);
48250   rtx (*gen_shift) (rtx, rtx, rtx);
48251 
48252   if (d->one_operand_p)
48253     return false;
48254 
48255   switch (d->vmode)
48256     {
48257     case E_V8HImode:
48258       /* Required for "pack".  */
48259       if (!TARGET_SSE4_1)
48260         return false;
48261       c = 0xffff;
48262       s = 16;
48263       half_mode = V4SImode;
48264       gen_and = gen_andv4si3;
48265       gen_pack = gen_sse4_1_packusdw;
48266       gen_shift = gen_lshrv4si3;
48267       break;
48268     case E_V16QImode:
48269       /* No check as all instructions are SSE2.  */
48270       c = 0xff;
48271       s = 8;
48272       half_mode = V8HImode;
48273       gen_and = gen_andv8hi3;
48274       gen_pack = gen_sse2_packuswb;
48275       gen_shift = gen_lshrv8hi3;
48276       break;
48277     case E_V16HImode:
48278       if (!TARGET_AVX2)
48279         return false;
48280       c = 0xffff;
48281       s = 16;
48282       half_mode = V8SImode;
48283       gen_and = gen_andv8si3;
48284       gen_pack = gen_avx2_packusdw;
48285       gen_shift = gen_lshrv8si3;
48286       end_perm = true;
48287       break;
48288     case E_V32QImode:
48289       if (!TARGET_AVX2)
48290         return false;
48291       c = 0xff;
48292       s = 8;
48293       half_mode = V16HImode;
48294       gen_and = gen_andv16hi3;
48295       gen_pack = gen_avx2_packuswb;
48296       gen_shift = gen_lshrv16hi3;
48297       end_perm = true;
48298       break;
48299     default:
48300       /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48301 	 general shuffles.  */
48302       return false;
48303     }
48304 
48305   /* Check that permutation is even or odd.  */
48306   odd = d->perm[0];
48307   if (odd > 1)
48308     return false;
48309 
48310   for (i = 1; i < nelt; ++i)
48311     if (d->perm[i] != 2 * i + odd)
48312       return false;
48313 
48314   if (d->testing_p)
48315     return true;
48316 
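  /* E.g. for V16QImode extract-even (d->perm == { 0 2 4 ... 30 }) each
     16-bit element of both operands is masked with 0x00ff and packuswb
     packs the surviving low bytes; for extract-odd the elements are
     instead shifted right by 8 before the pack.  */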
48317   dop0 = gen_reg_rtx (half_mode);
48318   dop1 = gen_reg_rtx (half_mode);
48319   if (odd == 0)
48320     {
48321       t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
48322       t = force_reg (half_mode, t);
48323       emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48324       emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48325     }
48326   else
48327     {
48328       emit_insn (gen_shift (dop0,
48329 			    gen_lowpart (half_mode, d->op0),
48330 			    GEN_INT (s)));
48331       emit_insn (gen_shift (dop1,
48332 			    gen_lowpart (half_mode, d->op1),
48333 			    GEN_INT (s)));
48334     }
48335   /* In AVX2 for 256 bit case we need to permute pack result.  */
48336   if (TARGET_AVX2 && end_perm)
48337     {
48338       op = gen_reg_rtx (d->vmode);
48339       t = gen_reg_rtx (V4DImode);
48340       emit_insn (gen_pack (op, dop0, dop1));
48341       emit_insn (gen_avx2_permv4di_1 (t,
48342 				      gen_lowpart (V4DImode, op),
48343 				      const0_rtx,
48344 				      const2_rtx,
48345 				      const1_rtx,
48346 				      GEN_INT (3)));
48347       emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48348     }
48349   else
48350     emit_insn (gen_pack (d->target, dop0, dop1));
48351 
48352   return true;
48353 }
48354 
48355 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
48356    and extract-odd permutations of two V64QI operands
48357    with two "shifts", two "truncs" and one "concat" insns for "odd"
48358    and two "truncs" and one "concat" insn for "even".
48359    We should have already failed all two-instruction sequences.  */
48360 
48361 static bool
48362 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48363 {
48364   rtx t1, t2, t3, t4;
48365   unsigned i, odd, nelt = d->nelt;
48366 
48367   if (!TARGET_AVX512BW
48368       || d->one_operand_p
48369       || d->vmode != V64QImode)
48370     return false;
48371 
48372   /* Check that permutation is even or odd.  */
48373   odd = d->perm[0];
48374   if (odd > 1)
48375     return false;
48376 
48377   for (i = 1; i < nelt; ++i)
48378     if (d->perm[i] != 2 * i + odd)
48379       return false;
48380 
48381   if (d->testing_p)
48382     return true;
48383 
48384 
48385   if (odd)
48386     {
48387       t1 = gen_reg_rtx (V32HImode);
48388       t2 = gen_reg_rtx (V32HImode);
48389       emit_insn (gen_lshrv32hi3 (t1,
48390 				 gen_lowpart (V32HImode, d->op0),
48391 				 GEN_INT (8)));
48392       emit_insn (gen_lshrv32hi3 (t2,
48393 				 gen_lowpart (V32HImode, d->op1),
48394 				 GEN_INT (8)));
48395     }
48396   else
48397     {
48398       t1 = gen_lowpart (V32HImode, d->op0);
48399       t2 = gen_lowpart (V32HImode, d->op1);
48400     }
48401 
48402   t3 = gen_reg_rtx (V32QImode);
48403   t4 = gen_reg_rtx (V32QImode);
48404   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48405   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48406   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48407 
48408   return true;
48409 }
48410 
48411 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
48412    and extract-odd permutations.  */
48413 
48414 static bool
48415 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48416 {
48417   rtx t1, t2, t3, t4, t5;
48418 
48419   switch (d->vmode)
48420     {
48421     case E_V4DFmode:
48422       if (d->testing_p)
48423 	break;
48424       t1 = gen_reg_rtx (V4DFmode);
48425       t2 = gen_reg_rtx (V4DFmode);
48426 
48427       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
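      /* The 0x20 immediate takes the low lane of each operand and 0x31
	 the high lane of each.  */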
48428       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48429       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48430 
48431       /* Now an unpck[lh]pd will produce the result required.  */
48432       if (odd)
48433 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48434       else
48435 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48436       emit_insn (t3);
48437       break;
48438 
48439     case E_V8SFmode:
48440       {
48441 	int mask = odd ? 0xdd : 0x88;
48442 
48443 	if (d->testing_p)
48444 	  break;
48445 	t1 = gen_reg_rtx (V8SFmode);
48446 	t2 = gen_reg_rtx (V8SFmode);
48447 	t3 = gen_reg_rtx (V8SFmode);
48448 
48449 	/* Shuffle within the 128-bit lanes to produce:
48450 	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
48451 	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48452 				      GEN_INT (mask)));
48453 
48454 	/* Shuffle the lanes around to produce:
48455 	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
48456 	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48457 					    GEN_INT (0x3)));
48458 
48459 	/* Shuffle within the 128-bit lanes to produce:
48460 	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
48461 	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48462 
48463 	/* Shuffle within the 128-bit lanes to produce:
48464 	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
48465 	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48466 
48467 	/* Shuffle the lanes around to produce:
48468 	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
48469 	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48470 					    GEN_INT (0x20)));
48471       }
48472       break;
48473 
48474     case E_V2DFmode:
48475     case E_V4SFmode:
48476     case E_V2DImode:
48477     case E_V4SImode:
48478       /* These are always directly implementable by expand_vec_perm_1.  */
48479       gcc_unreachable ();
48480 
48481     case E_V8HImode:
48482       if (TARGET_SSE4_1)
48483 	return expand_vec_perm_even_odd_pack (d);
48484       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48485 	return expand_vec_perm_pshufb2 (d);
48486       else
48487 	{
48488 	  if (d->testing_p)
48489 	    break;
48490 	  /* We need 2*log2(N)-1 operations to achieve odd/even
48491 	     with interleave. */
48492 	  t1 = gen_reg_rtx (V8HImode);
48493 	  t2 = gen_reg_rtx (V8HImode);
48494 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48495 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48496 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48497 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48498 	  if (odd)
48499 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48500 	  else
48501 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48502 	  emit_insn (t3);
48503 	}
48504       break;
48505 
48506     case E_V16QImode:
48507       return expand_vec_perm_even_odd_pack (d);
48508 
48509     case E_V16HImode:
48510     case E_V32QImode:
48511       return expand_vec_perm_even_odd_pack (d);
48512 
48513     case E_V64QImode:
48514       return expand_vec_perm_even_odd_trunc (d);
48515 
48516     case E_V4DImode:
48517       if (!TARGET_AVX2)
48518 	{
48519 	  struct expand_vec_perm_d d_copy = *d;
48520 	  d_copy.vmode = V4DFmode;
48521 	  if (d->testing_p)
48522 	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48523 	  else
48524 	    d_copy.target = gen_reg_rtx (V4DFmode);
48525 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48526 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48527 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48528 	    {
48529 	      if (!d->testing_p)
48530 		emit_move_insn (d->target,
48531 				gen_lowpart (V4DImode, d_copy.target));
48532 	      return true;
48533 	    }
48534 	  return false;
48535 	}
48536 
48537       if (d->testing_p)
48538 	break;
48539 
48540       t1 = gen_reg_rtx (V4DImode);
48541       t2 = gen_reg_rtx (V4DImode);
48542 
48543       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
48544       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48545       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48546 
48547       /* Now an vpunpck[lh]qdq will produce the result required.  */
48548       if (odd)
48549 	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48550       else
48551 	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48552       emit_insn (t3);
48553       break;
48554 
48555     case E_V8SImode:
48556       if (!TARGET_AVX2)
48557 	{
48558 	  struct expand_vec_perm_d d_copy = *d;
48559 	  d_copy.vmode = V8SFmode;
48560 	  if (d->testing_p)
48561 	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48562 	  else
48563 	    d_copy.target = gen_reg_rtx (V8SFmode);
48564 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48565 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48566 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48567 	    {
48568 	      if (!d->testing_p)
48569 		emit_move_insn (d->target,
48570 				gen_lowpart (V8SImode, d_copy.target));
48571 	      return true;
48572 	    }
48573 	  return false;
48574 	}
48575 
48576       if (d->testing_p)
48577 	break;
48578 
48579       t1 = gen_reg_rtx (V8SImode);
48580       t2 = gen_reg_rtx (V8SImode);
48581       t3 = gen_reg_rtx (V4DImode);
48582       t4 = gen_reg_rtx (V4DImode);
48583       t5 = gen_reg_rtx (V4DImode);
48584 
48585       /* Shuffle the lanes around into
48586 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
48587       emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48588 				    gen_lowpart (V4DImode, d->op1),
48589 				    GEN_INT (0x20)));
48590       emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48591 				    gen_lowpart (V4DImode, d->op1),
48592 				    GEN_INT (0x31)));
48593 
48594       /* Swap the 2nd and 3rd position in each lane into
48595 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
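      /* The immediate 2*4 + 1*16 + 3*64 == 0xd8 selects the element
	 order { 0 2 1 3 } within each 128-bit lane.  */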
48596       emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48597 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48598       emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48599 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48600 
48601       /* Now an vpunpck[lh]qdq will produce
48602 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
48603       if (odd)
48604 	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48605 					   gen_lowpart (V4DImode, t2));
48606       else
48607 	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48608 					  gen_lowpart (V4DImode, t2));
48609       emit_insn (t3);
48610       emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48611       break;
48612 
48613     default:
48614       gcc_unreachable ();
48615     }
48616 
48617   return true;
48618 }
48619 
48620 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
48621    extract-even and extract-odd permutations.  */
48622 
48623 static bool
48624 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48625 {
48626   unsigned i, odd, nelt = d->nelt;
48627 
48628   odd = d->perm[0];
48629   if (odd != 0 && odd != 1)
48630     return false;
48631 
48632   for (i = 1; i < nelt; ++i)
48633     if (d->perm[i] != 2 * i + odd)
48634       return false;
48635 
48636   return expand_vec_perm_even_odd_1 (d, odd);
48637 }
48638 
48639 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
48640    permutations.  We assume that expand_vec_perm_1 has already failed.  */
48641 
48642 static bool
48643 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48644 {
48645   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48646   machine_mode vmode = d->vmode;
48647   unsigned char perm2[4];
48648   rtx op0 = d->op0, dest;
48649   bool ok;
48650 
48651   switch (vmode)
48652     {
48653     case E_V4DFmode:
48654     case E_V8SFmode:
48655       /* These are special-cased in sse.md so that we can optionally
48656 	 use the vbroadcast instruction.  They expand to two insns
48657 	 if the input happens to be in a register.  */
48658       gcc_unreachable ();
48659 
48660     case E_V2DFmode:
48661     case E_V2DImode:
48662     case E_V4SFmode:
48663     case E_V4SImode:
48664       /* These are always implementable using standard shuffle patterns.  */
48665       gcc_unreachable ();
48666 
48667     case E_V8HImode:
48668     case E_V16QImode:
48669       /* These can be implemented via interleave.  We save one insn by
48670 	 stopping once we have promoted to V4SImode and then use pshufd.  */
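      /* E.g. broadcasting byte 5 of a V16QImode vector: punpcklbw makes
	 it word 5, punpckhwd makes it dword 1, and a pshufd with all
	 selectors equal to 1 then replicates that dword.  */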
48671       if (d->testing_p)
48672 	return true;
48673       do
48674 	{
48675 	  rtx dest;
48676 	  rtx (*gen) (rtx, rtx, rtx)
48677 	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48678 				 : gen_vec_interleave_lowv8hi;
48679 
48680 	  if (elt >= nelt2)
48681 	    {
48682 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48683 				       : gen_vec_interleave_highv8hi;
48684 	      elt -= nelt2;
48685 	    }
48686 	  nelt2 /= 2;
48687 
48688 	  dest = gen_reg_rtx (vmode);
48689 	  emit_insn (gen (dest, op0, op0));
48690 	  vmode = get_mode_wider_vector (vmode);
48691 	  op0 = gen_lowpart (vmode, dest);
48692 	}
48693       while (vmode != V4SImode);
48694 
48695       memset (perm2, elt, 4);
48696       dest = gen_reg_rtx (V4SImode);
48697       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48698       gcc_assert (ok);
48699       if (!d->testing_p)
48700 	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48701       return true;
48702 
48703     case E_V64QImode:
48704     case E_V32QImode:
48705     case E_V16HImode:
48706     case E_V8SImode:
48707     case E_V4DImode:
48708       /* For AVX2 broadcasts of the first element vpbroadcast* or
48709 	 vpermq should be used by expand_vec_perm_1.  */
48710       gcc_assert (!TARGET_AVX2 || d->perm[0]);
48711       return false;
48712 
48713     default:
48714       gcc_unreachable ();
48715     }
48716 }
48717 
48718 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
48719    broadcast permutations.  */
48720 
48721 static bool
48722 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48723 {
48724   unsigned i, elt, nelt = d->nelt;
48725 
48726   if (!d->one_operand_p)
48727     return false;
48728 
48729   elt = d->perm[0];
48730   for (i = 1; i < nelt; ++i)
48731     if (d->perm[i] != elt)
48732       return false;
48733 
48734   return expand_vec_perm_broadcast_1 (d);
48735 }
48736 
48737 /* Implement arbitrary permutations of two V64QImode operands
48738    with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
48739 static bool
48740 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
48741 {
48742   if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48743     return false;
48744 
48745   if (d->testing_p)
48746     return true;
48747 
48748   struct expand_vec_perm_d ds[2];
48749   rtx rperm[128], vperm, target0, target1;
48750   unsigned int i, nelt;
48751   machine_mode vmode;
48752 
48753   nelt = d->nelt;
48754   vmode = V64QImode;
48755 
48756   for (i = 0; i < 2; i++)
48757     {
48758       ds[i] = *d;
48759       ds[i].vmode = V32HImode;
48760       ds[i].nelt = 32;
48761       ds[i].target = gen_reg_rtx (V32HImode);
48762       ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48763       ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48764     }
48765 
48766   /* Prepare permutations such that the first one takes care of
48767      putting the even bytes into the right positions or one higher
48768      positions (ds[0]) and the second one takes care of
48769      putting the odd bytes into the right positions or one below
48770      (ds[1]).  */
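  /* E.g. an even destination byte i is fetched by ds[0] as the 16-bit
     word d->perm[i] / 2 placed at word position i / 2, and the vpshufb
     mask value (i & 14) + (d->perm[i] & 1) then selects its low or high
     byte within the 16-byte lane; odd destination bytes are taken from
     ds[1]'s result instead.  */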
48771 
48772   for (i = 0; i < nelt; i++)
48773     {
48774       ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48775       if (i & 1)
48776 	{
48777 	  rperm[i] = constm1_rtx;
48778 	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48779 	}
48780       else
48781 	{
48782 	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48783 	  rperm[i + 64] = constm1_rtx;
48784 	}
48785     }
48786 
48787   bool ok = expand_vec_perm_1 (&ds[0]);
48788   gcc_assert (ok);
48789   ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48790 
48791   ok = expand_vec_perm_1 (&ds[1]);
48792   gcc_assert (ok);
48793   ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48794 
48795   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48796   vperm = force_reg (vmode, vperm);
48797   target0 = gen_reg_rtx (V64QImode);
48798   emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48799 
48800   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48801   vperm = force_reg (vmode, vperm);
48802   target1 = gen_reg_rtx (V64QImode);
48803   emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48804 
48805   emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48806   return true;
48807 }
48808 
48809 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
48810    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
48811    all the shorter instruction sequences.  */
48812 
48813 static bool
48814 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48815 {
48816   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48817   unsigned int i, nelt, eltsz;
48818   bool used[4];
48819 
48820   if (!TARGET_AVX2
48821       || d->one_operand_p
48822       || (d->vmode != V32QImode && d->vmode != V16HImode))
48823     return false;
48824 
48825   if (d->testing_p)
48826     return true;
48827 
48828   nelt = d->nelt;
48829   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48830 
48831   /* Generate 4 permutation masks.  If the required element is within
48832      the same lane, it is shuffled in.  If the required element is from
48833      the other lane, force a zero by setting bit 7 in the permutation mask.
48834      The other mask has a non-negative element wherever an element is
48835      requested from the other lane, and it also moves that element to
48836      the other lane, so that the result of vpshufb can have its two
48837      V2TImode halves swapped.  */
48838   m128 = GEN_INT (-128);
48839   for (i = 0; i < 32; ++i)
48840     {
48841       rperm[0][i] = m128;
48842       rperm[1][i] = m128;
48843       rperm[2][i] = m128;
48844       rperm[3][i] = m128;
48845     }
48846   used[0] = false;
48847   used[1] = false;
48848   used[2] = false;
48849   used[3] = false;
48850   for (i = 0; i < nelt; ++i)
48851     {
48852       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48853       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48854       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48855 
48856       for (j = 0; j < eltsz; ++j)
48857 	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48858       used[which] = true;
48859     }
48860 
48861   for (i = 0; i < 2; ++i)
48862     {
48863       if (!used[2 * i + 1])
48864 	{
48865 	  h[i] = NULL_RTX;
48866 	  continue;
48867 	}
48868       vperm = gen_rtx_CONST_VECTOR (V32QImode,
48869 				    gen_rtvec_v (32, rperm[2 * i + 1]));
48870       vperm = force_reg (V32QImode, vperm);
48871       h[i] = gen_reg_rtx (V32QImode);
48872       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48873       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48874     }
48875 
48876   /* Swap the 128-bit lanes of h[X].  */
48877   for (i = 0; i < 2; ++i)
48878    {
48879      if (h[i] == NULL_RTX)
48880        continue;
48881      op = gen_reg_rtx (V4DImode);
48882      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48883 				     const2_rtx, GEN_INT (3), const0_rtx,
48884 				     const1_rtx));
48885      h[i] = gen_lowpart (V32QImode, op);
48886    }
48887 
48888   for (i = 0; i < 2; ++i)
48889     {
48890       if (!used[2 * i])
48891 	{
48892 	  l[i] = NULL_RTX;
48893 	  continue;
48894 	}
48895       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48896       vperm = force_reg (V32QImode, vperm);
48897       l[i] = gen_reg_rtx (V32QImode);
48898       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48899       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48900     }
48901 
48902   for (i = 0; i < 2; ++i)
48903     {
48904       if (h[i] && l[i])
48905 	{
48906 	  op = gen_reg_rtx (V32QImode);
48907 	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48908 	  l[i] = op;
48909 	}
48910       else if (h[i])
48911 	l[i] = h[i];
48912     }
48913 
48914   gcc_assert (l[0] && l[1]);
48915   op = d->target;
48916   if (d->vmode != V32QImode)
48917     op = gen_reg_rtx (V32QImode);
48918   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48919   if (op != d->target)
48920     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48921   return true;
48922 }
48923 
48924 /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
48925    taken care of, perform the expansion in D and return true on success.  */
48926 
48927 static bool
48928 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48929 {
48930   /* Try a single instruction expansion.  */
48931   if (expand_vec_perm_1 (d))
48932     return true;
48933 
48934   /* Try sequences of two instructions.  */
48935 
48936   if (expand_vec_perm_pshuflw_pshufhw (d))
48937     return true;
48938 
48939   if (expand_vec_perm_palignr (d, false))
48940     return true;
48941 
48942   if (expand_vec_perm_interleave2 (d))
48943     return true;
48944 
48945   if (expand_vec_perm_broadcast (d))
48946     return true;
48947 
48948   if (expand_vec_perm_vpermq_perm_1 (d))
48949     return true;
48950 
48951   if (expand_vec_perm_vperm2f128 (d))
48952     return true;
48953 
48954   if (expand_vec_perm_pblendv (d))
48955     return true;
48956 
48957   /* Try sequences of three instructions.  */
48958 
48959   if (expand_vec_perm_even_odd_pack (d))
48960     return true;
48961 
48962   if (expand_vec_perm_2vperm2f128_vshuf (d))
48963     return true;
48964 
48965   if (expand_vec_perm_pshufb2 (d))
48966     return true;
48967 
48968   if (expand_vec_perm_interleave3 (d))
48969     return true;
48970 
48971   if (expand_vec_perm_vperm2f128_vblend (d))
48972     return true;
48973 
48974   /* Try sequences of four instructions.  */
48975 
48976   if (expand_vec_perm_even_odd_trunc (d))
48977     return true;
48978   if (expand_vec_perm_vpshufb2_vpermq (d))
48979     return true;
48980 
48981   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48982     return true;
48983 
48984   if (expand_vec_perm_vpermt2_vpshub2 (d))
48985     return true;
48986 
48987   /* ??? Look for narrow permutations whose element orderings would
48988      allow the promotion to a wider mode.  */
48989 
48990   /* ??? Look for sequences of interleave or a wider permute that place
48991      the data into the correct lanes for a half-vector shuffle like
48992      pshuf[lh]w or vpermilps.  */
48993 
48994   /* ??? Look for sequences of interleave that produce the desired results.
48995      The combinatorics of punpck[lh] get pretty ugly... */
48996 
48997   if (expand_vec_perm_even_odd (d))
48998     return true;
48999 
49000   /* Even longer sequences.  */
49001   if (expand_vec_perm_vpshufb4_vpermq2 (d))
49002     return true;
49003 
49004   /* See if we can get the same permutation in different vector integer
49005      mode.  */
49006   struct expand_vec_perm_d nd;
49007   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49008     {
49009       if (!d->testing_p)
49010 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49011       return true;
49012     }
49013 
49014   return false;
49015 }
49016 
49017 /* If a permutation only uses one operand, make it clear. Returns true
49018    if the permutation references both operands.  */
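/* E.g. with nelt == 4, the permutation { 5 6 4 7 } references only the
   second operand; it is folded to { 1 2 0 3 } with op0 replaced by op1,
   and the function returns false.  */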
49019 
49020 static bool
49021 canonicalize_perm (struct expand_vec_perm_d *d)
49022 {
49023   int i, which, nelt = d->nelt;
49024 
49025   for (i = which = 0; i < nelt; ++i)
49026       which |= (d->perm[i] < nelt ? 1 : 2);
49027 
49028   d->one_operand_p = true;
49029   switch (which)
49030     {
49031     default:
49032       gcc_unreachable();
49033 
49034     case 3:
49035       if (!rtx_equal_p (d->op0, d->op1))
49036         {
49037 	  d->one_operand_p = false;
49038 	  break;
49039         }
49040       /* The elements of PERM do not suggest that only the first operand
49041 	 is used, but both operands are identical.  Allow easier matching
49042 	 of the permutation by folding the permutation into the single
49043 	 input vector.  */
49044       /* FALLTHRU */
49045 
49046     case 2:
49047       for (i = 0; i < nelt; ++i)
49048         d->perm[i] &= nelt - 1;
49049       d->op0 = d->op1;
49050       break;
49051 
49052     case 1:
49053       d->op1 = d->op0;
49054       break;
49055     }
49056 
49057   return (which == 3);
49058 }
49059 
49060 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
49061 
49062 static bool
49063 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
49064 			       rtx op1, const vec_perm_indices &sel)
49065 {
49066   struct expand_vec_perm_d d;
49067   unsigned char perm[MAX_VECT_LEN];
49068   unsigned int i, nelt, which;
49069   bool two_args;
49070 
49071   d.target = target;
49072   d.op0 = op0;
49073   d.op1 = op1;
49074 
49075   d.vmode = vmode;
49076   gcc_assert (VECTOR_MODE_P (d.vmode));
49077   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49078   d.testing_p = !target;
49079 
49080   gcc_assert (sel.length () == nelt);
49081   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49082 
49083   /* Given sufficient ISA support we can just return true here
49084      for selected vector modes.  */
49085   switch (d.vmode)
49086     {
49087     case E_V16SFmode:
49088     case E_V16SImode:
49089     case E_V8DImode:
49090     case E_V8DFmode:
49091       if (!TARGET_AVX512F)
49092 	return false;
49093       /* All implementable with a single vperm[it]2 insn.  */
49094       if (d.testing_p)
49095 	return true;
49096       break;
49097     case E_V32HImode:
49098       if (!TARGET_AVX512BW)
49099 	return false;
49100       if (d.testing_p)
49101 	/* All implementable with a single vperm[it]2 insn.  */
49102 	return true;
49103       break;
49104     case E_V64QImode:
49105       if (!TARGET_AVX512BW)
49106 	return false;
49107       if (d.testing_p)
49108 	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
49109 	return true;
49110       break;
49111     case E_V8SImode:
49112     case E_V8SFmode:
49113     case E_V4DFmode:
49114     case E_V4DImode:
49115       if (!TARGET_AVX)
49116 	return false;
49117       if (d.testing_p && TARGET_AVX512VL)
49118 	/* All implementable with a single vperm[it]2 insn.  */
49119 	return true;
49120       break;
49121     case E_V16HImode:
49122       if (!TARGET_SSE2)
49123 	return false;
49124       if (d.testing_p && TARGET_AVX2)
49125 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
49126 	return true;
49127       break;
49128     case E_V32QImode:
49129       if (!TARGET_SSE2)
49130 	return false;
49131       if (d.testing_p && TARGET_AVX2)
49132 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
49133 	return true;
49134       break;
49135     case E_V8HImode:
49136     case E_V16QImode:
49137       if (!TARGET_SSE2)
49138 	return false;
49139       /* Fall through.  */
49140     case E_V4SImode:
49141     case E_V4SFmode:
49142       if (!TARGET_SSE)
49143 	return false;
49144       /* All implementable with a single vpperm insn.  */
49145       if (d.testing_p && TARGET_XOP)
49146 	return true;
49147       /* All implementable with 2 pshufb + 1 ior.  */
49148       if (d.testing_p && TARGET_SSSE3)
49149 	return true;
49150       break;
49151     case E_V2DImode:
49152     case E_V2DFmode:
49153       if (!TARGET_SSE)
49154 	return false;
49155       /* All implementable with shufpd or unpck[lh]pd.  */
49156       if (d.testing_p)
49157 	return true;
49158       break;
49159     default:
49160       return false;
49161     }
49162 
49163   for (i = which = 0; i < nelt; ++i)
49164     {
49165       unsigned char e = sel[i];
49166       gcc_assert (e < 2 * nelt);
49167       d.perm[i] = e;
49168       perm[i] = e;
49169       which |= (e < nelt ? 1 : 2);
49170     }
49171 
49172   if (d.testing_p)
49173     {
49174       /* If all elements come from the second vector, fold them to the first.  */
49175       if (which == 2)
49176 	for (i = 0; i < nelt; ++i)
49177 	  d.perm[i] -= nelt;
49178 
49179       /* Check whether the mask can be applied to the vector type.  */
49180       d.one_operand_p = (which != 3);
49181 
49182       /* Implementable with shufps or pshufd.  */
49183       if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49184 	return true;
49185 
49186       /* Otherwise we have to go through the motions and see if we can
49187 	 figure out how to generate the requested permutation.  */
49188       d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49189       d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49190       if (!d.one_operand_p)
49191 	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49192 
49193       start_sequence ();
49194       bool ret = ix86_expand_vec_perm_const_1 (&d);
49195       end_sequence ();
49196 
49197       return ret;
49198     }
49199 
49200   two_args = canonicalize_perm (&d);
49201 
49202   if (ix86_expand_vec_perm_const_1 (&d))
49203     return true;
49204 
49205   /* If the selector says both arguments are needed, but the operands are the
49206      same, the above tried to expand with one_operand_p set and a flattened
49207      selector.  If that didn't work, retry without one_operand_p; that is what
49208      we reported success with during the testing pass.  */
49209   if (two_args && d.one_operand_p)
49210     {
49211       d.one_operand_p = false;
49212       memcpy (d.perm, perm, sizeof (perm));
49213       return ix86_expand_vec_perm_const_1 (&d);
49214     }
49215 
49216   return false;
49217 }
49218 
49219 void
49220 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49221 {
49222   struct expand_vec_perm_d d;
49223   unsigned i, nelt;
49224 
49225   d.target = targ;
49226   d.op0 = op0;
49227   d.op1 = op1;
49228   d.vmode = GET_MODE (targ);
49229   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49230   d.one_operand_p = false;
49231   d.testing_p = false;
49232 
49233   for (i = 0; i < nelt; ++i)
49234     d.perm[i] = i * 2 + odd;
49235 
49236   /* We'll either be able to implement the permutation directly...  */
49237   if (expand_vec_perm_1 (&d))
49238     return;
49239 
49240   /* ... or we use the special-case patterns.  */
49241   expand_vec_perm_even_odd_1 (&d, odd);
49242 }
49243 
49244 static void
49245 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49246 {
49247   struct expand_vec_perm_d d;
49248   unsigned i, nelt, base;
49249   bool ok;
49250 
49251   d.target = targ;
49252   d.op0 = op0;
49253   d.op1 = op1;
49254   d.vmode = GET_MODE (targ);
49255   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49256   d.one_operand_p = false;
49257   d.testing_p = false;
49258 
49259   base = high_p ? nelt / 2 : 0;
49260   for (i = 0; i < nelt / 2; ++i)
49261     {
49262       d.perm[i * 2] = i + base;
49263       d.perm[i * 2 + 1] = i + base + nelt;
49264     }
49265 
49266   /* Note that for AVX this isn't one instruction.  */
49267   ok = ix86_expand_vec_perm_const_1 (&d);
49268   gcc_assert (ok);
49269 }
49270 
49271 
49272 /* Expand a vector operation CODE for a V*QImode in terms of the
49273    same operation on V*HImode.  */
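/* For example (illustrative): a V16QImode multiply is done as two V8HImode
   multiplies on byte-interleaved copies of the operands; the low byte of each
   16-bit product is the desired QImode result, and a final constant
   permutation gathers those even bytes back into DEST.  */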
49274 
49275 void
49276 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49277 {
49278   machine_mode qimode = GET_MODE (dest);
49279   machine_mode himode;
49280   rtx (*gen_il) (rtx, rtx, rtx);
49281   rtx (*gen_ih) (rtx, rtx, rtx);
49282   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49283   struct expand_vec_perm_d d;
49284   bool ok, full_interleave;
49285   bool uns_p = false;
49286   int i;
49287 
49288   switch (qimode)
49289     {
49290     case E_V16QImode:
49291       himode = V8HImode;
49292       gen_il = gen_vec_interleave_lowv16qi;
49293       gen_ih = gen_vec_interleave_highv16qi;
49294       break;
49295     case E_V32QImode:
49296       himode = V16HImode;
49297       gen_il = gen_avx2_interleave_lowv32qi;
49298       gen_ih = gen_avx2_interleave_highv32qi;
49299       break;
49300     case E_V64QImode:
49301       himode = V32HImode;
49302       gen_il = gen_avx512bw_interleave_lowv64qi;
49303       gen_ih = gen_avx512bw_interleave_highv64qi;
49304       break;
49305     default:
49306       gcc_unreachable ();
49307     }
49308 
49309   op2_l = op2_h = op2;
49310   switch (code)
49311     {
49312     case MULT:
49313       /* Unpack data such that we've got a source byte in each low byte of
49314 	 each word.  We don't care what goes into the high byte of each word.
49315 	 Rather than trying to get zero in there, it is most convenient to
49316 	 let it be a copy of the low byte.  */
49317       op2_l = gen_reg_rtx (qimode);
49318       op2_h = gen_reg_rtx (qimode);
49319       emit_insn (gen_il (op2_l, op2, op2));
49320       emit_insn (gen_ih (op2_h, op2, op2));
49321 
49322       op1_l = gen_reg_rtx (qimode);
49323       op1_h = gen_reg_rtx (qimode);
49324       emit_insn (gen_il (op1_l, op1, op1));
49325       emit_insn (gen_ih (op1_h, op1, op1));
49326       full_interleave = qimode == V16QImode;
49327       break;
49328 
49329     case ASHIFT:
49330     case LSHIFTRT:
49331       uns_p = true;
49332       /* FALLTHRU */
49333     case ASHIFTRT:
49334       op1_l = gen_reg_rtx (himode);
49335       op1_h = gen_reg_rtx (himode);
49336       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49337       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49338       full_interleave = true;
49339       break;
49340     default:
49341       gcc_unreachable ();
49342     }
49343 
49344   /* Perform the operation.  */
49345   res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49346 			       1, OPTAB_DIRECT);
49347   res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49348 			       1, OPTAB_DIRECT);
49349   gcc_assert (res_l && res_h);
49350 
49351   /* Merge the data back into the right place.  */
49352   d.target = dest;
49353   d.op0 = gen_lowpart (qimode, res_l);
49354   d.op1 = gen_lowpart (qimode, res_h);
49355   d.vmode = qimode;
49356   d.nelt = GET_MODE_NUNITS (qimode);
49357   d.one_operand_p = false;
49358   d.testing_p = false;
49359 
49360   if (full_interleave)
49361     {
49362       /* For SSE2, we used a full interleave, so the desired
49363 	 results are in the even elements.  */
49364       for (i = 0; i < d.nelt; ++i)
49365 	d.perm[i] = i * 2;
49366     }
49367   else
49368     {
49369       /* For AVX, the interleave used above was not cross-lane.  So the
49370 	 extraction is evens but with the second and third quarter swapped.
49371 	 Happily, that is even one insn shorter than even extraction.
49372 	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
49373 	 always first from the first and then from the second source operand,
49374 	 the index bits above the low 4 bits remain the same.
49375 	 Thus, for d.nelt == 32 we want permutation
49376 	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49377 	 and for d.nelt == 64 we want permutation
49378 	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49379 	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
49380       for (i = 0; i < d.nelt; ++i)
49381 	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49382     }
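  /* Sanity check of the formula above for d.nelt == 32 (worked by hand):
       i = 7  -> (14 & 14) + 0      + 0  = 14
       i = 8  -> (16 & 14) + d.nelt + 0  = 32
       i = 16 -> (32 & 14) + 0      + 16 = 16
     which matches the permutation listed in the comment.  */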
49383 
49384   ok = ix86_expand_vec_perm_const_1 (&d);
49385   gcc_assert (ok);
49386 
49387   set_unique_reg_note (get_last_insn (), REG_EQUAL,
49388 		       gen_rtx_fmt_ee (code, qimode, op1, op2));
49389 }
49390 
49391 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
49392    if op is CONST_VECTOR with all odd elements equal to their
49393    preceding element.  */
49394 
49395 static bool
49396 const_vector_equal_evenodd_p (rtx op)
49397 {
49398   machine_mode mode = GET_MODE (op);
49399   int i, nunits = GET_MODE_NUNITS (mode);
49400   if (GET_CODE (op) != CONST_VECTOR
49401       || nunits != CONST_VECTOR_NUNITS (op))
49402     return false;
49403   for (i = 0; i < nunits; i += 2)
49404     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49405       return false;
49406   return true;
49407 }
49408 
49409 void
49410 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49411 			       bool uns_p, bool odd_p)
49412 {
49413   machine_mode mode = GET_MODE (op1);
49414   machine_mode wmode = GET_MODE (dest);
49415   rtx x;
49416   rtx orig_op1 = op1, orig_op2 = op2;
49417 
49418   if (!nonimmediate_operand (op1, mode))
49419     op1 = force_reg (mode, op1);
49420   if (!nonimmediate_operand (op2, mode))
49421     op2 = force_reg (mode, op2);
49422 
49423   /* We only play even/odd games with vectors of SImode.  */
49424   gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49425 
49426   /* If we're looking for the odd results, shift those members down to
49427      the even slots.  For some CPUs this is faster than a PSHUFD.  */
49428   if (odd_p)
49429     {
49430       /* For XOP use vpmacsdqh, but only for smult, as it is only
49431 	 signed.  */
49432       if (TARGET_XOP && mode == V4SImode && !uns_p)
49433 	{
49434 	  x = force_reg (wmode, CONST0_RTX (wmode));
49435 	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49436 	  return;
49437 	}
49438 
49439       x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49440       if (!const_vector_equal_evenodd_p (orig_op1))
49441 	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49442 			    x, NULL, 1, OPTAB_DIRECT);
49443       if (!const_vector_equal_evenodd_p (orig_op2))
49444 	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49445 			    x, NULL, 1, OPTAB_DIRECT);
49446       op1 = gen_lowpart (mode, op1);
49447       op2 = gen_lowpart (mode, op2);
49448     }
49449 
49450   if (mode == V16SImode)
49451     {
49452       if (uns_p)
49453 	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49454       else
49455 	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49456     }
49457   else if (mode == V8SImode)
49458     {
49459       if (uns_p)
49460 	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49461       else
49462 	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49463     }
49464   else if (uns_p)
49465     x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49466   else if (TARGET_SSE4_1)
49467     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49468   else
49469     {
49470       rtx s1, s2, t0, t1, t2;
49471 
49472       /* The easiest way to implement this without PMULDQ is to go through
49473 	 the motions as if we are performing a full 64-bit multiply, except
49474 	 that we need to do less shuffling of the elements.  */
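      /* Sketch of the identity being used (derived, not from the original
	 comment): for 32-bit signed A and B,
	   A * B = (unsigned) A * (unsigned) B
		   - ((A < 0) ? ((unsigned) B << 32) : 0)
		   - ((B < 0) ? ((unsigned) A << 32) : 0)   (mod 2^64).
	 The all-ones comparison masks S1/S2 below, once multiplied and
	 shifted, supply exactly those two correction terms.  */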
49475 
49476       /* Compute the sign-extension, aka highparts, of the two operands.  */
49477       s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49478 				op1, pc_rtx, pc_rtx);
49479       s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49480 				op2, pc_rtx, pc_rtx);
49481 
49482       /* Multiply LO(A) * HI(B), and vice-versa.  */
49483       t1 = gen_reg_rtx (wmode);
49484       t2 = gen_reg_rtx (wmode);
49485       emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49486       emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49487 
49488       /* Multiply LO(A) * LO(B).  */
49489       t0 = gen_reg_rtx (wmode);
49490       emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49491 
49492       /* Combine and shift the highparts into place.  */
49493       t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49494       t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49495 			 1, OPTAB_DIRECT);
49496 
49497       /* Combine high and low parts.  */
49498       force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49499       return;
49500     }
49501   emit_insn (x);
49502 }
49503 
49504 void
49505 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49506 			    bool uns_p, bool high_p)
49507 {
49508   machine_mode wmode = GET_MODE (dest);
49509   machine_mode mode = GET_MODE (op1);
49510   rtx t1, t2, t3, t4, mask;
49511 
49512   switch (mode)
49513     {
49514     case E_V4SImode:
49515       t1 = gen_reg_rtx (mode);
49516       t2 = gen_reg_rtx (mode);
49517       if (TARGET_XOP && !uns_p)
49518 	{
49519 	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
49520 	     shuffle the elements once so that all elements are in the right
49521 	     place for immediate use: { A C B D }.  */
49522 	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49523 					const1_rtx, GEN_INT (3)));
49524 	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49525 					const1_rtx, GEN_INT (3)));
49526 	}
49527       else
49528 	{
49529 	  /* Put the elements into place for the multiply.  */
49530 	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
49531 	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
49532 	  high_p = false;
49533 	}
49534       ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49535       break;
49536 
49537     case E_V8SImode:
49538       /* Shuffle the elements between the lanes.  After this we
49539 	 have { A B E F | C D G H } for each operand.  */
49540       t1 = gen_reg_rtx (V4DImode);
49541       t2 = gen_reg_rtx (V4DImode);
49542       emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49543 				      const0_rtx, const2_rtx,
49544 				      const1_rtx, GEN_INT (3)));
49545       emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49546 				      const0_rtx, const2_rtx,
49547 				      const1_rtx, GEN_INT (3)));
49548 
49549       /* Shuffle the elements within the lanes.  After this we
49550 	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
49551       t3 = gen_reg_rtx (V8SImode);
49552       t4 = gen_reg_rtx (V8SImode);
49553       mask = GEN_INT (high_p
49554 		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49555 		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49556       emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49557       emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49558 
49559       ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49560       break;
49561 
49562     case E_V8HImode:
49563     case E_V16HImode:
49564       t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49565 			 uns_p, OPTAB_DIRECT);
49566       t2 = expand_binop (mode,
49567 			 uns_p ? umul_highpart_optab : smul_highpart_optab,
49568 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49569       gcc_assert (t1 && t2);
49570 
49571       t3 = gen_reg_rtx (mode);
49572       ix86_expand_vec_interleave (t3, t1, t2, high_p);
49573       emit_move_insn (dest, gen_lowpart (wmode, t3));
49574       break;
49575 
49576     case E_V16QImode:
49577     case E_V32QImode:
49578     case E_V32HImode:
49579     case E_V16SImode:
49580     case E_V64QImode:
49581       t1 = gen_reg_rtx (wmode);
49582       t2 = gen_reg_rtx (wmode);
49583       ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49584       ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49585 
49586       emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49587       break;
49588 
49589     default:
49590       gcc_unreachable ();
49591     }
49592 }
49593 
49594 void
49595 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49596 {
49597   rtx res_1, res_2, res_3, res_4;
49598 
49599   res_1 = gen_reg_rtx (V4SImode);
49600   res_2 = gen_reg_rtx (V4SImode);
49601   res_3 = gen_reg_rtx (V2DImode);
49602   res_4 = gen_reg_rtx (V2DImode);
49603   ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49604   ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49605 
49606   /* Move the results in element 2 down to element 1; we don't care
49607      what goes in elements 2 and 3.  Then we can merge the parts
49608      back together with an interleave.
49609 
49610      Note that two other sequences were tried:
49611      (1) Use interleaves at the start instead of psrldq, which allows
49612      us to use a single shufps to merge things back at the end.
49613      (2) Use shufps here to combine the two vectors, then pshufd to
49614      put the elements in the correct order.
49615      In both cases the cost of the reformatting stall was too high
49616      and the overall sequence slower.  */
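  /* Illustrative data flow: with op1 = {a0,a1,a2,a3} and op2 = {b0,b1,b2,b3},
       res_3 = {a0*b0, a2*b2}   (V2DI, even products)
       res_4 = {a1*b1, a3*b3}   (V2DI, odd products)
     The pshufd copies below move the low halves of those 64-bit products
     into elements 0 and 1, and the final interleave-low produces
       {lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3)}.  */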
49617 
49618   emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49619 				const0_rtx, const2_rtx,
49620 				const0_rtx, const0_rtx));
49621   emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49622 				const0_rtx, const2_rtx,
49623 				const0_rtx, const0_rtx));
49624   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49625 
49626   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49627 }
49628 
49629 void
49630 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49631 {
49632   machine_mode mode = GET_MODE (op0);
49633   rtx t1, t2, t3, t4, t5, t6;
49634 
49635   if (TARGET_AVX512DQ && mode == V8DImode)
49636     emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49637   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49638     emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49639   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49640     emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49641   else if (TARGET_XOP && mode == V2DImode)
49642     {
49643       /* op1: A,B,C,D, op2: E,F,G,H */
49644       op1 = gen_lowpart (V4SImode, op1);
49645       op2 = gen_lowpart (V4SImode, op2);
49646 
49647       t1 = gen_reg_rtx (V4SImode);
49648       t2 = gen_reg_rtx (V4SImode);
49649       t3 = gen_reg_rtx (V2DImode);
49650       t4 = gen_reg_rtx (V2DImode);
49651 
49652       /* t1: B,A,D,C */
49653       emit_insn (gen_sse2_pshufd_1 (t1, op1,
49654 				    GEN_INT (1),
49655 				    GEN_INT (0),
49656 				    GEN_INT (3),
49657 				    GEN_INT (2)));
49658 
49659       /* t2: (B*E),(A*F),(D*G),(C*H) */
49660       emit_insn (gen_mulv4si3 (t2, t1, op2));
49661 
49662       /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49663       emit_insn (gen_xop_phadddq (t3, t2));
49664 
49665       /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49666       emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49667 
49668       /* Multiply the lower parts and add all the partial products.  */
49669       t5 = gen_reg_rtx (V2DImode);
49670       emit_insn (gen_vec_widen_umult_even_v4si (t5,
49671 					gen_lowpart (V4SImode, op1),
49672 					gen_lowpart (V4SImode, op2)));
49673       op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49674 
49675     }
49676   else
49677     {
49678       machine_mode nmode;
49679       rtx (*umul) (rtx, rtx, rtx);
49680 
49681       if (mode == V2DImode)
49682 	{
49683 	  umul = gen_vec_widen_umult_even_v4si;
49684 	  nmode = V4SImode;
49685 	}
49686       else if (mode == V4DImode)
49687 	{
49688 	  umul = gen_vec_widen_umult_even_v8si;
49689 	  nmode = V8SImode;
49690 	}
49691       else if (mode == V8DImode)
49692 	{
49693 	  umul = gen_vec_widen_umult_even_v16si;
49694 	  nmode = V16SImode;
49695 	}
49696       else
49697 	gcc_unreachable ();
49698 
49699 
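      /* Identity used by this generic path: writing each 64-bit lane as
	 A = Ah * 2^32 + Al and B = Bh * 2^32 + Bl,
	   A * B = Al * Bl + ((Al * Bh + Ah * Bl) << 32)   (mod 2^64),
	 so three widening 32x32->64 multiplies per lane suffice.  */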
49700       /* Multiply low parts.  */
49701       t1 = gen_reg_rtx (mode);
49702       emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49703 
49704       /* Shift input vectors right 32 bits so we can multiply high parts.  */
49705       t6 = GEN_INT (32);
49706       t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49707       t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49708 
49709       /* Multiply high parts by low parts.  */
49710       t4 = gen_reg_rtx (mode);
49711       t5 = gen_reg_rtx (mode);
49712       emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49713       emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49714 
49715       /* Combine and shift the highparts back.  */
49716       t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49717       t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49718 
49719       /* Combine high and low parts.  */
49720       force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49721     }
49722 
49723   set_unique_reg_note (get_last_insn (), REG_EQUAL,
49724 		       gen_rtx_MULT (mode, op1, op2));
49725 }
49726 
49727 /* Return 1 if control transfer instruction INSN
49728    should be encoded with the bnd prefix.
49729    If INSN is NULL then return 1 when control
49730    transfer instructions should be prefixed with
49731    bnd by default for the current function.  */
49732 
49733 bool
49734 ix86_bnd_prefixed_insn_p (rtx insn)
49735 {
49736   /* For call insns check special flag.  */
49737   if (insn && CALL_P (insn))
49738     {
49739       rtx call = get_call_rtx_from (insn);
49740       if (call)
49741 	return CALL_EXPR_WITH_BOUNDS_P (call);
49742     }
49743 
49744   /* All other insns are prefixed only if the function is instrumented.  */
49745   return chkp_function_instrumented_p (current_function_decl);
49746 }
49747 
49748 /* Return 1 if control transfer instruction INSN
49749    should be encoded with the notrack prefix.  */
49750 
49751 static bool
49752 ix86_notrack_prefixed_insn_p (rtx insn)
49753 {
49754   if (!insn || !((flag_cf_protection & CF_BRANCH)))
49755     return false;
49756 
49757   if (CALL_P (insn))
49758     {
49759       rtx call = get_call_rtx_from (insn);
49760       gcc_assert (call != NULL_RTX);
49761       rtx addr = XEXP (call, 0);
49762 
49763       /* Do not emit 'notrack' if it's not an indirect call.  */
49764       if (MEM_P (addr)
49765 	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49766 	return false;
49767       else
49768 	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
49769     }
49770 
49771   if (JUMP_P (insn) && !flag_cet_switch)
49772     {
49773       rtx target = JUMP_LABEL (insn);
49774       if (target == NULL_RTX || ANY_RETURN_P (target))
49775 	return false;
49776 
49777       /* Check whether the jump is a switch-table jump.  */
49778       rtx_insn *label = as_a<rtx_insn *> (target);
49779       rtx_insn *table = next_insn (label);
49780       if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
49781 	return false;
49782       else
49783 	return true;
49784     }
49785   return false;
49786 }
49787 
49788 /* Calculate integer abs() using only SSE2 instructions.  */
49789 
49790 void
49791 ix86_expand_sse2_abs (rtx target, rtx input)
49792 {
49793   machine_mode mode = GET_MODE (target);
49794   rtx tmp0, tmp1, x;
49795 
49796   switch (mode)
49797     {
49798       /* For 32-bit signed integer X, the best way to calculate the absolute
49799 	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
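      /* For example, with W == 32 and X == -5:  X >> 31 == -1,
	 (-1 ^ -5) == 4, and 4 - (-1) == 5 == |X|.  */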
49800       case E_V4SImode:
49801 	tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49802 				    GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49803 				    NULL, 0, OPTAB_DIRECT);
49804 	tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49805 				    NULL, 0, OPTAB_DIRECT);
49806 	x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49807 				 target, 0, OPTAB_DIRECT);
49808 	break;
49809 
49810       /* For 16-bit signed integer X, the best way to calculate the absolute
49811 	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
49812       case E_V8HImode:
49813 	tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49814 
49815 	x = expand_simple_binop (mode, SMAX, tmp0, input,
49816 				 target, 0, OPTAB_DIRECT);
49817 	break;
49818 
49819       /* For 8-bit signed integer X, the best way to calculate the absolute
49820 	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49821 	 as SSE2 provides the PMINUB insn.  */
49822       case E_V16QImode:
49823 	tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49824 
49825 	x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49826 				 target, 0, OPTAB_DIRECT);
49827 	break;
49828 
49829       default:
49830 	gcc_unreachable ();
49831     }
49832 
49833   if (x != target)
49834     emit_move_insn (target, x);
49835 }
49836 
49837 /* Expand an extract from a vector register through pextr insn.
49838    Return true if successful.  */
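/* For example (illustrative): extracting bits [32,64) of a V4SImode register
   as an SImode value yields a vec_select of element 1, matching pextrd, and
   therefore requires TARGET_SSE4_1; misaligned positions or unsupported
   element sizes simply return false.  */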
49839 
49840 bool
49841 ix86_expand_pextr (rtx *operands)
49842 {
49843   rtx dst = operands[0];
49844   rtx src = operands[1];
49845 
49846   unsigned int size = INTVAL (operands[2]);
49847   unsigned int pos = INTVAL (operands[3]);
49848 
49849   if (SUBREG_P (dst))
49850     {
49851       /* Reject non-lowpart subregs.  */
49852       if (SUBREG_BYTE (dst) > 0)
49853 	return false;
49854       dst = SUBREG_REG (dst);
49855     }
49856 
49857   if (SUBREG_P (src))
49858     {
49859       pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49860       src = SUBREG_REG (src);
49861     }
49862 
49863   switch (GET_MODE (src))
49864     {
49865     case E_V16QImode:
49866     case E_V8HImode:
49867     case E_V4SImode:
49868     case E_V2DImode:
49869     case E_V1TImode:
49870     case E_TImode:
49871       {
49872 	machine_mode srcmode, dstmode;
49873 	rtx d, pat;
49874 
49875 	if (!int_mode_for_size (size, 0).exists (&dstmode))
49876 	  return false;
49877 
49878 	switch (dstmode)
49879 	  {
49880 	  case E_QImode:
49881 	    if (!TARGET_SSE4_1)
49882 	      return false;
49883 	    srcmode = V16QImode;
49884 	    break;
49885 
49886 	  case E_HImode:
49887 	    if (!TARGET_SSE2)
49888 	      return false;
49889 	    srcmode = V8HImode;
49890 	    break;
49891 
49892 	  case E_SImode:
49893 	    if (!TARGET_SSE4_1)
49894 	      return false;
49895 	    srcmode = V4SImode;
49896 	    break;
49897 
49898 	  case E_DImode:
49899 	    gcc_assert (TARGET_64BIT);
49900 	    if (!TARGET_SSE4_1)
49901 	      return false;
49902 	    srcmode = V2DImode;
49903 	    break;
49904 
49905 	  default:
49906 	    return false;
49907 	  }
49908 
49909 	/* Reject extractions from misaligned positions.  */
49910 	if (pos & (size-1))
49911 	  return false;
49912 
49913 	if (GET_MODE (dst) == dstmode)
49914 	  d = dst;
49915 	else
49916 	  d = gen_reg_rtx (dstmode);
49917 
49918 	/* Construct insn pattern.  */
49919 	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49920 	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49921 
49922 	/* Let the rtl optimizers know about the zero extension performed.  */
49923 	if (dstmode == QImode || dstmode == HImode)
49924 	  {
49925 	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49926 	    d = gen_lowpart (SImode, d);
49927 	  }
49928 
49929 	emit_insn (gen_rtx_SET (d, pat));
49930 
49931 	if (d != dst)
49932 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49933 	return true;
49934       }
49935 
49936     default:
49937       return false;
49938     }
49939 }
49940 
49941 /* Expand an insert into a vector register through pinsr insn.
49942    Return true if successful.  */
49943 
49944 bool
49945 ix86_expand_pinsr (rtx *operands)
49946 {
49947   rtx dst = operands[0];
49948   rtx src = operands[3];
49949 
49950   unsigned int size = INTVAL (operands[1]);
49951   unsigned int pos = INTVAL (operands[2]);
49952 
49953   if (SUBREG_P (dst))
49954     {
49955       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49956       dst = SUBREG_REG (dst);
49957     }
49958 
49959   switch (GET_MODE (dst))
49960     {
49961     case E_V16QImode:
49962     case E_V8HImode:
49963     case E_V4SImode:
49964     case E_V2DImode:
49965     case E_V1TImode:
49966     case E_TImode:
49967       {
49968 	machine_mode srcmode, dstmode;
49969 	rtx (*pinsr)(rtx, rtx, rtx, rtx);
49970 	rtx d;
49971 
49972 	if (!int_mode_for_size (size, 0).exists (&srcmode))
49973 	  return false;
49974 
49975 	switch (srcmode)
49976 	  {
49977 	  case E_QImode:
49978 	    if (!TARGET_SSE4_1)
49979 	      return false;
49980 	    dstmode = V16QImode;
49981 	    pinsr = gen_sse4_1_pinsrb;
49982 	    break;
49983 
49984 	  case E_HImode:
49985 	    if (!TARGET_SSE2)
49986 	      return false;
49987 	    dstmode = V8HImode;
49988 	    pinsr = gen_sse2_pinsrw;
49989 	    break;
49990 
49991 	  case E_SImode:
49992 	    if (!TARGET_SSE4_1)
49993 	      return false;
49994 	    dstmode = V4SImode;
49995 	    pinsr = gen_sse4_1_pinsrd;
49996 	    break;
49997 
49998 	  case E_DImode:
49999 	    gcc_assert (TARGET_64BIT);
50000 	    if (!TARGET_SSE4_1)
50001 	      return false;
50002 	    dstmode = V2DImode;
50003 	    pinsr = gen_sse4_1_pinsrq;
50004 	    break;
50005 
50006 	  default:
50007 	    return false;
50008 	  }
50009 
50010 	/* Reject insertions to misaligned positions.  */
50011 	if (pos & (size-1))
50012 	  return false;
50013 
50014 	if (SUBREG_P (src))
50015 	  {
50016 	    unsigned int srcpos = SUBREG_BYTE (src);
50017 
50018 	    if (srcpos > 0)
50019 	      {
50020 		rtx extr_ops[4];
50021 
50022 		extr_ops[0] = gen_reg_rtx (srcmode);
50023 		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50024 		extr_ops[2] = GEN_INT (size);
50025 		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50026 
50027 		if (!ix86_expand_pextr (extr_ops))
50028 		  return false;
50029 
50030 		src = extr_ops[0];
50031 	      }
50032 	    else
50033 	      src = gen_lowpart (srcmode, SUBREG_REG (src));
50034 	  }
50035 
50036 	if (GET_MODE (dst) == dstmode)
50037 	  d = dst;
50038 	else
50039 	  d = gen_reg_rtx (dstmode);
50040 
50041 	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50042 			  gen_lowpart (srcmode, src),
50043 			  GEN_INT (1 << (pos / size))));
50044 	if (d != dst)
50045 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50046 	return true;
50047       }
50048 
50049     default:
50050       return false;
50051     }
50052 }
50053 
50054 /* Return the va_list type node specific to the calling ABI of FNDECL
50055    (the MS or SysV variant for 64-bit targets).  */
50056 
50057 static tree
50058 ix86_fn_abi_va_list (tree fndecl)
50059 {
50060   if (!TARGET_64BIT)
50061     return va_list_type_node;
50062   gcc_assert (fndecl != NULL_TREE);
50063 
50064   if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50065     return ms_va_list_type_node;
50066   else
50067     return sysv_va_list_type_node;
50068 }
50069 
50070 /* Return the canonical va_list type specified by TYPE.  If there
50071    is no valid TYPE provided, return NULL_TREE.  */
50072 
50073 static tree
50074 ix86_canonical_va_list_type (tree type)
50075 {
50076   if (TARGET_64BIT)
50077     {
50078       if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50079 	return ms_va_list_type_node;
50080 
50081       if ((TREE_CODE (type) == ARRAY_TYPE
50082 	   && integer_zerop (array_type_nelts (type)))
50083 	  || POINTER_TYPE_P (type))
50084 	{
50085 	  tree elem_type = TREE_TYPE (type);
50086 	  if (TREE_CODE (elem_type) == RECORD_TYPE
50087 	      && lookup_attribute ("sysv_abi va_list",
50088 				   TYPE_ATTRIBUTES (elem_type)))
50089 	    return sysv_va_list_type_node;
50090 	}
50091 
50092       return NULL_TREE;
50093     }
50094 
50095   return std_canonical_va_list_type (type);
50096 }
50097 
50098 /* Iterate through the target-specific builtin types for va_list.
50099    IDX denotes the iterator, *PTREE is set to the result type of
50100    the va_list builtin, and *PNAME to its internal type.
50101    Returns zero if there is no element for this index, otherwise
50102    IDX should be increased upon the next call.
50103    Note, do not iterate a base builtin's name like __builtin_va_list.
50104    Used from c_common_nodes_and_builtins.  */
50105 
50106 static int
50107 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50108 {
50109   if (TARGET_64BIT)
50110     {
50111       switch (idx)
50112 	{
50113 	default:
50114 	  break;
50115 
50116 	case 0:
50117 	  *ptree = ms_va_list_type_node;
50118 	  *pname = "__builtin_ms_va_list";
50119 	  return 1;
50120 
50121 	case 1:
50122 	  *ptree = sysv_va_list_type_node;
50123 	  *pname = "__builtin_sysv_va_list";
50124 	  return 1;
50125 	}
50126     }
50127 
50128   return 0;
50129 }
50130 
50131 #undef TARGET_SCHED_DISPATCH
50132 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
50133 #undef TARGET_SCHED_DISPATCH_DO
50134 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
50135 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50136 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50137 #undef TARGET_SCHED_REORDER
50138 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
50139 #undef TARGET_SCHED_ADJUST_PRIORITY
50140 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50141 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50142 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50143   ix86_dependencies_evaluation_hook
50144 
50145 
50146 /* Implementation of the reassociation_width target hook, used by the
50147    reassoc phase to identify the parallelism level of a reassociated
50148    tree.  The statement's tree_code is passed in OP.  The arguments'
50149    type is passed in MODE.  */
50150 
50151 static int
50152 ix86_reassociation_width (unsigned int op, machine_mode mode)
50153 {
50154   int width = 1;
50155   /* Vector part.  */
50156   if (VECTOR_MODE_P (mode))
50157     {
50158       int div = 1;
50159       if (INTEGRAL_MODE_P (mode))
50160 	width = ix86_cost->reassoc_vec_int;
50161       else if (FLOAT_MODE_P (mode))
50162 	width = ix86_cost->reassoc_vec_fp;
50163 
50164       if (width == 1)
50165 	return 1;
50166 
50167       /* Integer vector instructions execute in the FP unit
50168 	 and can execute 3 additions and one multiplication per cycle.  */
50169       if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
50170 	  && op != PLUS && op != MINUS)
50171 	return 1;
50172 
50173       /* Account for targets that split wide vectors into multiple parts.  */
50174       if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
50175 	div = GET_MODE_BITSIZE (mode) / 128;
50176       else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
50177 	div = GET_MODE_BITSIZE (mode) / 64;
50178       width = (width + div - 1) / div;
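      /* E.g. (illustrative, assuming reassoc_vec_fp were 2): on a
	 TARGET_AVX128_OPTIMAL tuning with a 256-bit FP vector mode,
	 DIV becomes 2 and the resulting width is (2 + 2 - 1) / 2 == 1.  */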
50179     }
50180   /* Scalar part.  */
50181   else if (INTEGRAL_MODE_P (mode))
50182     width = ix86_cost->reassoc_int;
50183   else if (FLOAT_MODE_P (mode))
50184     width = ix86_cost->reassoc_fp;
50185 
50186   /* Avoid using too many registers in 32-bit mode.  */
50187   if (!TARGET_64BIT && width > 2)
50188     width = 2;
50189   return width;
50190 }
50191 
50192 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50193    place emms and femms instructions.  */
50194 
50195 static machine_mode
50196 ix86_preferred_simd_mode (scalar_mode mode)
50197 {
50198   if (!TARGET_SSE)
50199     return word_mode;
50200 
50201   switch (mode)
50202     {
50203     case E_QImode:
50204       if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50205 	return V64QImode;
50206       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50207 	return V32QImode;
50208       else
50209 	return V16QImode;
50210 
50211     case E_HImode:
50212       if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50213 	return V32HImode;
50214       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50215 	return V16HImode;
50216       else
50217 	return V8HImode;
50218 
50219     case E_SImode:
50220       if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50221 	return V16SImode;
50222       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50223 	return V8SImode;
50224       else
50225 	return V4SImode;
50226 
50227     case E_DImode:
50228       if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50229 	return V8DImode;
50230       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50231 	return V4DImode;
50232       else
50233 	return V2DImode;
50234 
50235     case E_SFmode:
50236       if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50237 	return V16SFmode;
50238       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50239 	return V8SFmode;
50240       else
50241 	return V4SFmode;
50242 
50243     case E_DFmode:
50244       if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50245 	return V8DFmode;
50246       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50247 	return V4DFmode;
50248       else if (TARGET_SSE2)
50249 	return V2DFmode;
50250       /* FALLTHRU */
50251 
50252     default:
50253       return word_mode;
50254     }
50255 }
50256 
50257 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
50258    of the upper halves against the lower halves, down to SSE reg size.  */
50259 
50260 static machine_mode
50261 ix86_split_reduction (machine_mode mode)
50262 {
50263   /* Reduce lowpart against highpart until we reach SSE reg width to
50264      avoid cross-lane operations.  */
50265   switch (mode)
50266     {
50267     case E_V8DImode:
50268     case E_V4DImode:
50269       return V2DImode;
50270     case E_V16SImode:
50271     case E_V8SImode:
50272       return V4SImode;
50273     case E_V32HImode:
50274     case E_V16HImode:
50275       return V8HImode;
50276     case E_V64QImode:
50277     case E_V32QImode:
50278       return V16QImode;
50279     case E_V16SFmode:
50280     case E_V8SFmode:
50281       return V4SFmode;
50282     case E_V8DFmode:
50283     case E_V4DFmode:
50284       return V2DFmode;
50285     default:
50286       return mode;
50287     }
50288 }
50289 
50290 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50291    vectors.  If AVX512F is enabled then try vectorizing with 512bit,
50292    256bit and 128bit vectors.  */
50293 
50294 static void
50295 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
50296 {
50297   if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50298     {
50299       sizes->safe_push (64);
50300       sizes->safe_push (32);
50301       sizes->safe_push (16);
50302     }
50303   else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50304     {
50305       sizes->safe_push (32);
50306       sizes->safe_push (16);
50307     }
50308 }
50309 
50310 /* Implementation of targetm.vectorize.get_mask_mode.  */
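/* For example (illustrative): with AVX512F a 64-byte V16SFmode vector gets
   the scalar HImode mask (one bit per element), while without AVX512VL a
   16-byte V4SFmode vector gets a V4SImode vector mask.  */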
50311 
50312 static opt_machine_mode
50313 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
50314 {
50315   unsigned elem_size = vector_size / nunits;
50316 
50317   /* Scalar mask case.  */
50318   if ((TARGET_AVX512F && vector_size == 64)
50319       || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50320     {
50321       if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50322 	return smallest_int_mode_for_size (nunits);
50323     }
50324 
50325   scalar_int_mode elem_mode
50326     = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
50327 
50328   gcc_assert (elem_size * nunits == vector_size);
50329 
50330   return mode_for_vector (elem_mode, nunits);
50331 }
50332 
50333 
50334 
50335 /* Return the class of registers which could be used for a pseudo of MODE
50336    and of class RCLASS for spilling instead of memory.  Return NO_REGS
50337    if that is not possible or not profitable.  */
50338 
50339 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
50340 
50341 static reg_class_t
50342 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50343 {
50344   if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50345       && TARGET_SSE2
50346       && TARGET_INTER_UNIT_MOVES_TO_VEC
50347       && TARGET_INTER_UNIT_MOVES_FROM_VEC
50348       && (mode == SImode || (TARGET_64BIT && mode == DImode))
50349       && INTEGER_CLASS_P (rclass))
50350     return ALL_SSE_REGS;
50351   return NO_REGS;
50352 }
50353 
50354 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST.  Like the default implementation,
50355    but returns a lower bound.  */
50356 
50357 static unsigned int
50358 ix86_max_noce_ifcvt_seq_cost (edge e)
50359 {
50360   bool predictable_p = predictable_edge_p (e);
50361 
50362   enum compiler_param param
50363     = (predictable_p
50364        ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50365        : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50366 
50367   /* If we have a parameter set, use that, otherwise take a guess using
50368      BRANCH_COST.  */
50369   if (global_options_set.x_param_values[param])
50370     return PARAM_VALUE (param);
50371   else
50372     return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50373 }
50374 
50375 /* Return true if SEQ is a good candidate as a replacement for the
50376    if-convertible sequence described in IF_INFO.  */
50377 
50378 static bool
50379 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
50380 {
50381   if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
50382     {
50383       int cmov_cnt = 0;
50384       /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
50385 	 Maybe we should allow even more conditional moves as long as they
50386 	 are used far enough apart not to stall the CPU, or also consider
50387 	 IF_INFO->TEST_BB succ edge probabilities.  */
50388       for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
50389 	{
50390 	  rtx set = single_set (insn);
50391 	  if (!set)
50392 	    continue;
50393 	  if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
50394 	    continue;
50395 	  rtx src = SET_SRC (set);
50396 	  machine_mode mode = GET_MODE (src);
50397 	  if (GET_MODE_CLASS (mode) != MODE_INT
50398 	      && GET_MODE_CLASS (mode) != MODE_FLOAT)
50399 	    continue;
50400 	  if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
50401 	      || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
50402 	    continue;
50403 	  /* insn is CMOV or FCMOV.  */
50404 	  if (++cmov_cnt > 1)
50405 	    return false;
50406 	}
50407     }
50408   return default_noce_conversion_profitable_p (seq, if_info);
50409 }
50410 
50411 /* Implement targetm.vectorize.init_cost.  */
50412 
50413 static void *
50414 ix86_init_cost (struct loop *)
50415 {
50416   unsigned *cost = XNEWVEC (unsigned, 3);
50417   cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50418   return cost;
50419 }
50420 
50421 /* Implement targetm.vectorize.add_stmt_cost.  */
50422 
50423 static unsigned
50424 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50425 		    struct _stmt_vec_info *stmt_info, int misalign,
50426 		    enum vect_cost_model_location where)
50427 {
50428   unsigned *cost = (unsigned *) data;
50429   unsigned retval = 0;
50430 
50431   tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50432   int stmt_cost = - 1;
50433 
50434   if ((kind == vector_stmt || kind == scalar_stmt)
50435       && stmt_info
50436       && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
50437     {
50438       tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
50439       bool fp = false;
50440       machine_mode mode = TImode;
50441 
50442       if (vectype != NULL)
50443 	{
50444 	  fp = FLOAT_TYPE_P (vectype);
50445 	  mode = TYPE_MODE (vectype);
50446 	}
50447       /*machine_mode inner_mode = mode;
50448       if (VECTOR_MODE_P (mode))
50449 	inner_mode = GET_MODE_INNER (mode);*/
50450 
50451       switch (subcode)
50452 	{
50453 	case PLUS_EXPR:
50454 	case POINTER_PLUS_EXPR:
50455 	case MINUS_EXPR:
50456 	  if (kind == scalar_stmt)
50457 	    {
50458 	      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50459 		stmt_cost = ix86_cost->addss;
50460 	      else if (X87_FLOAT_MODE_P (mode))
50461 		stmt_cost = ix86_cost->fadd;
50462 	      else
50463 	        stmt_cost = ix86_cost->add;
50464 	    }
50465 	  else
50466 	    stmt_cost = ix86_vec_cost (mode,
50467 				       fp ? ix86_cost->addss
50468 				       : ix86_cost->sse_op,
50469 				       true);
50470 	  break;
50471 
50472 	case MULT_EXPR:
50473 	case WIDEN_MULT_EXPR:
50474 	case MULT_HIGHPART_EXPR:
50475 	  stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
50476 	  break;
50477 	case FMA_EXPR:
50478           stmt_cost = ix86_vec_cost (mode,
50479 				     mode == SFmode ? ix86_cost->fmass
50480 				     : ix86_cost->fmasd,
50481 				     true);
50482 	  break;
50483 	case NEGATE_EXPR:
50484 	  if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50485 	    stmt_cost = ix86_cost->sse_op;
50486 	  else if (X87_FLOAT_MODE_P (mode))
50487 	    stmt_cost = ix86_cost->fchs;
50488 	  else if (VECTOR_MODE_P (mode))
50489 	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50490 	  else
50491 	    stmt_cost = ix86_cost->add;
50492 	  break;
50493 	case TRUNC_DIV_EXPR:
50494 	case CEIL_DIV_EXPR:
50495 	case FLOOR_DIV_EXPR:
50496 	case ROUND_DIV_EXPR:
50497 	case TRUNC_MOD_EXPR:
50498 	case CEIL_MOD_EXPR:
50499 	case FLOOR_MOD_EXPR:
50500 	case RDIV_EXPR:
50501 	case ROUND_MOD_EXPR:
50502 	case EXACT_DIV_EXPR:
50503 	  stmt_cost = ix86_division_cost (ix86_cost, mode);
50504 	  break;
50505 
50506 	case RSHIFT_EXPR:
50507 	case LSHIFT_EXPR:
50508 	case LROTATE_EXPR:
50509 	case RROTATE_EXPR:
50510 	  {
50511 	    tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
50512 	    stmt_cost = ix86_shift_rotate_cost
50513 			   (ix86_cost, mode,
50514 		            TREE_CODE (op2) == INTEGER_CST,
50515 			    cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
50516 		            true, false, false, NULL, NULL);
50517 	  }
50518 	  break;
50519 	case NOP_EXPR:
50520 	  /* Only sign-conversions are free.  */
50521 	  if (tree_nop_conversion_p
50522 	        (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
50523 		 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
50524 	    stmt_cost = 0;
50525 	  break;
50526 
50527 	case BIT_IOR_EXPR:
50528 	case ABS_EXPR:
50529 	case MIN_EXPR:
50530 	case MAX_EXPR:
50531 	case BIT_XOR_EXPR:
50532 	case BIT_AND_EXPR:
50533 	case BIT_NOT_EXPR:
50534 	  if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50535 	    stmt_cost = ix86_cost->sse_op;
50536 	  else if (VECTOR_MODE_P (mode))
50537 	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50538 	  else
50539 	    stmt_cost = ix86_cost->add;
50540 	  break;
50541 	default:
50542 	  break;
50543 	}
50544     }
50545   /* If we do elementwise loads into a vector then we are bound by
50546      latency and execution resources for the many scalar loads
50547      (AGU and load ports).  Try to account for this by scaling the
50548      construction cost by the number of elements involved.  */
50549   if (kind == vec_construct
50550       && stmt_info
50551       && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
50552       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
50553       && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
50554     {
50555       stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50556       stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
50557     }
50558   if (stmt_cost == -1)
50559     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50560 
50561   /* Penalize DFmode vector operations for Bonnell.  */
50562   if (TARGET_BONNELL && kind == vector_stmt
50563       && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50564     stmt_cost *= 5;  /* FIXME: The value here is arbitrary.  */
50565 
50566   /* Statements in an inner loop relative to the loop being
50567      vectorized are weighted more heavily.  The value here is
50568      arbitrary and could potentially be improved with analysis.  */
50569   if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50570     count *= 50;  /* FIXME.  */
50571 
50572   retval = (unsigned) (count * stmt_cost);
50573 
50574   /* We need to multiply all vector stmt costs by 1.7 (estimated cost) for
50575      Silvermont, as it has an out-of-order integer pipeline that can execute
50576      2 scalar instructions per tick, but an in-order SIMD pipeline.  */
50577   if ((TARGET_SILVERMONT || TARGET_INTEL)
50578       && stmt_info && stmt_info->stmt)
50579     {
50580       tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50581       if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50582 	retval = (retval * 17) / 10;
50583     }
50584 
50585   cost[where] += retval;
50586 
50587   return retval;
50588 }
50589 
50590 /* Implement targetm.vectorize.finish_cost.  */
50591 
50592 static void
50593 ix86_finish_cost (void *data, unsigned *prologue_cost,
50594 		  unsigned *body_cost, unsigned *epilogue_cost)
50595 {
50596   unsigned *cost = (unsigned *) data;
50597   *prologue_cost = cost[vect_prologue];
50598   *body_cost     = cost[vect_body];
50599   *epilogue_cost = cost[vect_epilogue];
50600 }
50601 
50602 /* Implement targetm.vectorize.destroy_cost_data.  */
50603 
50604 static void
50605 ix86_destroy_cost_data (void *data)
50606 {
50607   free (data);
50608 }
50609 
50610 /* Validate target specific memory model bits in VAL. */
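/* These extra bits correspond to the __ATOMIC_HLE_ACQUIRE and
   __ATOMIC_HLE_RELEASE flags a user may OR into the memory model argument
   of the __atomic_* builtins, e.g. (illustrative usage):
     __atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);  */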
50611 
50612 static unsigned HOST_WIDE_INT
50613 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50614 {
50615   enum memmodel model = memmodel_from_int (val);
50616   bool strong;
50617 
50618   if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50619 				      |MEMMODEL_MASK)
50620       || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50621     {
50622       warning (OPT_Winvalid_memory_model,
50623 	       "unknown architecture specific memory model");
50624       return MEMMODEL_SEQ_CST;
50625     }
50626   strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50627   if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50628     {
50629       warning (OPT_Winvalid_memory_model,
50630               "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50631       return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50632     }
50633   if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50634     {
50635       warning (OPT_Winvalid_memory_model,
50636               "HLE_RELEASE not used with RELEASE or stronger memory model");
50637       return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50638     }
50639   return val;
50640 }
50641 
50642 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50643    CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50644    CLONEI->simdlen.  Return 0 if SIMD clones shouldn't be emitted,
50645    or the number of vecsize_mangle variants that should be emitted.  */
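/* The vecsize_mangle letters map to ISA levels, as also used by
   ix86_simd_clone_adjust below: 'b' = SSE2, 'c' = AVX, 'd' = AVX2,
   'e' = AVX512F.  */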
50646 
50647 static int
50648 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50649 					     struct cgraph_simd_clone *clonei,
50650 					     tree base_type, int num)
50651 {
50652   int ret = 1;
50653 
50654   if (clonei->simdlen
50655       && (clonei->simdlen < 2
50656 	  || clonei->simdlen > 1024
50657 	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50658     {
50659       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50660 		  "unsupported simdlen %d", clonei->simdlen);
50661       return 0;
50662     }
50663 
50664   tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50665   if (TREE_CODE (ret_type) != VOID_TYPE)
50666     switch (TYPE_MODE (ret_type))
50667       {
50668       case E_QImode:
50669       case E_HImode:
50670       case E_SImode:
50671       case E_DImode:
50672       case E_SFmode:
50673       case E_DFmode:
50674       /* case E_SCmode: */
50675       /* case E_DCmode: */
50676 	if (!AGGREGATE_TYPE_P (ret_type))
50677 	  break;
50678 	/* FALLTHRU */
50679       default:
50680 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50681 		    "unsupported return type %qT for simd", ret_type);
50682 	return 0;
50683       }
50684 
50685   tree t;
50686   int i;
50687   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
50688   bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
50689 
50690   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
50691        t && t != void_list_node; t = TREE_CHAIN (t), i++)
50692     {
50693       tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
50694       switch (TYPE_MODE (arg_type))
50695 	{
50696 	case E_QImode:
50697 	case E_HImode:
50698 	case E_SImode:
50699 	case E_DImode:
50700 	case E_SFmode:
50701 	case E_DFmode:
50702 	/* case E_SCmode: */
50703 	/* case E_DCmode: */
50704 	  if (!AGGREGATE_TYPE_P (arg_type))
50705 	    break;
50706 	  /* FALLTHRU */
50707 	default:
50708 	  if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_UNIFORM)
50709 	    break;
50710 	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50711 		      "unsupported argument type %qT for simd", arg_type);
50712 	  return 0;
50713 	}
50714     }
50715 
50716   if (!TREE_PUBLIC (node->decl))
50717     {
50718       /* If the function isn't exported, we can pick up just one ISA
50719 	 for the clones.  */
50720       if (TARGET_AVX512F)
50721 	clonei->vecsize_mangle = 'e';
50722       else if (TARGET_AVX2)
50723 	clonei->vecsize_mangle = 'd';
50724       else if (TARGET_AVX)
50725 	clonei->vecsize_mangle = 'c';
50726       else
50727 	clonei->vecsize_mangle = 'b';
50728       ret = 1;
50729     }
50730   else
50731     {
50732       clonei->vecsize_mangle = "bcde"[num];
50733       ret = 4;
50734     }
50735   clonei->mask_mode = VOIDmode;
50736   switch (clonei->vecsize_mangle)
50737     {
50738     case 'b':
50739       clonei->vecsize_int = 128;
50740       clonei->vecsize_float = 128;
50741       break;
50742     case 'c':
50743       clonei->vecsize_int = 128;
50744       clonei->vecsize_float = 256;
50745       break;
50746     case 'd':
50747       clonei->vecsize_int = 256;
50748       clonei->vecsize_float = 256;
50749       break;
50750     case 'e':
50751       clonei->vecsize_int = 512;
50752       clonei->vecsize_float = 512;
50753       if (TYPE_MODE (base_type) == QImode)
50754 	clonei->mask_mode = DImode;
50755       else
50756 	clonei->mask_mode = SImode;
50757       break;
50758     }
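  /* If the user did not specify a simdlen, derive it below from the vector
     width divided by the width of the characteristic type: e.g. the 'd'
     (AVX2) variant with a double base type gives 256 / 64 = 4 lanes, and
     the 'e' (AVX-512F) variant with float gives 512 / 32 = 16.  */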
50759   if (clonei->simdlen == 0)
50760     {
50761       if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50762 	clonei->simdlen = clonei->vecsize_int;
50763       else
50764 	clonei->simdlen = clonei->vecsize_float;
50765       clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50766     }
50767   else if (clonei->simdlen > 16)
50768     {
50769       /* For compatibility with ICC, use the same upper bounds
50770 	 for simdlen.  In particular, for CTYPE below, use the return type,
50771 	 unless the function returns void, in which case use the characteristic
50772 	 type.  If it is possible for given SIMDLEN to pass CTYPE value
50773 	 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50774 	 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50775 	 emit corresponding clone.  */
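      /* For example, simdlen 32 with a double characteristic type and
	 256-bit vectors needs 64 * 32 / 256 = 8 vector registers, which is
	 acceptable for both 32-bit and 64-bit code; simdlen 64 would need
	 16 registers and is therefore rejected for 32-bit code.  */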
50776       tree ctype = ret_type;
50777       if (TREE_CODE (ret_type) == VOID_TYPE)
50778 	ctype = base_type;
50779       int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50780       if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50781 	cnt /= clonei->vecsize_int;
50782       else
50783 	cnt /= clonei->vecsize_float;
50784       if (cnt > (TARGET_64BIT ? 16 : 8))
50785 	{
50786 	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50787 		      "unsupported simdlen %d", clonei->simdlen);
50788 	  return 0;
50789 	}
50790     }
50791   return ret;
50792 }
50793 
50794 /* Add target attribute to SIMD clone NODE if needed.  */
50795 
50796 static void
50797 ix86_simd_clone_adjust (struct cgraph_node *node)
50798 {
50799   const char *str = NULL;
50800   gcc_assert (node->decl == cfun->decl);
50801   switch (node->simdclone->vecsize_mangle)
50802     {
50803     case 'b':
50804       if (!TARGET_SSE2)
50805 	str = "sse2";
50806       break;
50807     case 'c':
50808       if (!TARGET_AVX)
50809 	str = "avx";
50810       break;
50811     case 'd':
50812       if (!TARGET_AVX2)
50813 	str = "avx2";
50814       break;
50815     case 'e':
50816       if (!TARGET_AVX512F)
50817 	str = "avx512f";
50818       break;
50819     default:
50820       gcc_unreachable ();
50821     }
50822   if (str == NULL)
50823     return;
50824   push_cfun (NULL);
50825   tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50826   bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50827   gcc_assert (ok);
50828   pop_cfun ();
50829   ix86_reset_previous_fndecl ();
50830   ix86_set_current_function (node->decl);
50831 }
50832 
50833 /* If SIMD clone NODE can't be used in a vectorized loop
50834    in current function, return -1, otherwise return a badness of using it
50835    (0 if it is most desirable from vecsize_mangle point of view, 1
50836    slightly less desirable, etc.).  */
50837 
50838 static int
50839 ix86_simd_clone_usable (struct cgraph_node *node)
50840 {
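  /* Lower return values mean a closer match between the clone's ISA and
     the ISA in effect; e.g. an SSE2 ('b') clone gets badness 2 when AVX2
     is enabled, because both the 'c' and 'd' clones would be better
     fits.  */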
50841   switch (node->simdclone->vecsize_mangle)
50842     {
50843     case 'b':
50844       if (!TARGET_SSE2)
50845 	return -1;
50846       if (!TARGET_AVX)
50847 	return 0;
50848       return TARGET_AVX2 ? 2 : 1;
50849     case 'c':
50850       if (!TARGET_AVX)
50851 	return -1;
50852       return TARGET_AVX2 ? 1 : 0;
50853     case 'd':
50854       if (!TARGET_AVX2)
50855 	return -1;
50856       return 0;
50857     case 'e':
50858       if (!TARGET_AVX512F)
50859 	return -1;
50860       return 0;
50861     default:
50862       gcc_unreachable ();
50863     }
50864 }
50865 
50866 /* This function adjusts the unroll factor based on
50867    the hardware capabilities.  For example, bdver3 has
50868    a loop buffer which makes unrolling of smaller
50869    loops less important.  This function decides the
50870    unroll factor using the number of memory references
50871    (the value 32 is used) as a heuristic.  */
50872 
50873 static unsigned
50874 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50875 {
50876   basic_block *bbs;
50877   rtx_insn *insn;
50878   unsigned i;
50879   unsigned mem_count = 0;
50880 
50881   if (!TARGET_ADJUST_UNROLL)
50882     return nunroll;
50883 
50884   /* Count the number of memory references within the loop body.
50885      This value determines the unrolling factor for bdver3 and bdver4
50886      architectures. */
50887   subrtx_iterator::array_type array;
50888   bbs = get_loop_body (loop);
50889   for (i = 0; i < loop->num_nodes; i++)
50890     FOR_BB_INSNS (bbs[i], insn)
50891       if (NONDEBUG_INSN_P (insn))
50892 	FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50893 	  if (const_rtx x = *iter)
50894 	    if (MEM_P (x))
50895 	      {
50896 		machine_mode mode = GET_MODE (x);
50897 		unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50898 		if (n_words > 4)
50899 		  mem_count += 2;
50900 		else
50901 		  mem_count += 1;
50902 	      }
50903   free (bbs);
50904 
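  /* For example, 8 single-word memory references cap the unroll factor at
     32 / 8 = 4 and 32 references cap it at 1, while loops with more than
     32 references are left to the generic heuristics.  */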
50905   if (mem_count && mem_count <= 32)
50906     return MIN (nunroll, 32 / mem_count);
50907 
50908   return nunroll;
50909 }
50910 
50911 
50912 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P.  */
50913 
50914 static bool
50915 ix86_float_exceptions_rounding_supported_p (void)
50916 {
50917   /* For x87 floating point with standard excess precision handling,
50918      there is no adddf3 pattern (since x87 floating point only has
50919      XFmode operations) so the default hook implementation gets this
50920      wrong.  */
50921   return TARGET_80387 || (TARGET_SSE && TARGET_SSE_MATH);
50922 }
50923 
50924 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV.  */
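/* The HOLD sequence saves the FP environment and masks/clears pending
   exceptions, CLEAR just clears them again, and UPDATE restores the saved
   environment and re-raises (via __atomic_feraiseexcept) any exceptions
   raised in the meantime.  */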
50925 
50926 static void
50927 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50928 {
50929   if (!TARGET_80387 && !(TARGET_SSE && TARGET_SSE_MATH))
50930     return;
50931   tree exceptions_var = create_tmp_var_raw (integer_type_node);
50932   if (TARGET_80387)
50933     {
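      /* fnstenv stores the 28-byte protected-mode x87 environment, modelled
	 here as an array of seven 32-bit unsigned ints.  */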
50934       tree fenv_index_type = build_index_type (size_int (6));
50935       tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50936       tree fenv_var = create_tmp_var_raw (fenv_type);
50937       TREE_ADDRESSABLE (fenv_var) = 1;
50938       tree fenv_ptr = build_pointer_type (fenv_type);
50939       tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50940       fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50941       tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50942       tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50943       tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50944       tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50945       tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50946       tree hold_fnclex = build_call_expr (fnclex, 0);
50947       fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50948 			 NULL_TREE, NULL_TREE);
50949       *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50950 		      hold_fnclex);
50951       *clear = build_call_expr (fnclex, 0);
50952       tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50953       tree fnstsw_call = build_call_expr (fnstsw, 0);
50954       tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50955 			    sw_var, fnstsw_call);
50956       tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50957       tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50958 				exceptions_var, exceptions_x87);
50959       *update = build2 (COMPOUND_EXPR, integer_type_node,
50960 			sw_mod, update_mod);
50961       tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50962       *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50963     }
50964   if (TARGET_SSE && TARGET_SSE_MATH)
50965     {
50966       tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50967       tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50968       tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50969       tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50970       tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50971       tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50972 				      mxcsr_orig_var, stmxcsr_hold_call);
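      /* In the MXCSR, bits 0-5 are the exception status flags and bits 7-12
	 the corresponding exception masks, so OR-ing in 0x1f80 masks all
	 exceptions and AND-ing with 0xffffffc0 clears any pending flags in
	 the value loaded for the "hold" state.  */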
50973       tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50974 				  mxcsr_orig_var,
50975 				  build_int_cst (unsigned_type_node, 0x1f80));
50976       hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50977 			     build_int_cst (unsigned_type_node, 0xffffffc0));
50978       tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50979 				     mxcsr_mod_var, hold_mod_val);
50980       tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50981       tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50982 			      hold_assign_orig, hold_assign_mod);
50983       hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50984 			 ldmxcsr_hold_call);
50985       if (*hold)
50986 	*hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50987       else
50988 	*hold = hold_all;
50989       tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50990       if (*clear)
50991 	*clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50992 			 ldmxcsr_clear_call);
50993       else
50994 	*clear = ldmxcsr_clear_call;
50995       tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50996       tree exceptions_sse = fold_convert (integer_type_node,
50997 					  stxmcsr_update_call);
50998       if (*update)
50999 	{
51000 	  tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51001 					exceptions_var, exceptions_sse);
51002 	  tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51003 					   exceptions_var, exceptions_mod);
51004 	  *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51005 			    exceptions_assign);
51006 	}
51007       else
51008 	*update = build2 (MODIFY_EXPR, integer_type_node,
51009 			  exceptions_var, exceptions_sse);
51010       tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51011       *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51012 			ldmxcsr_update_call);
51013     }
51014   tree atomic_feraiseexcept
51015     = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51016   tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51017 						    1, exceptions_var);
51018   *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51019 		    atomic_feraiseexcept_call);
51020 }
51021 
51022 /* Return the mode to be used for bounds, or VOIDmode
51023    if bounds are not supported.  */
51024 
51025 static machine_mode
51026 ix86_mpx_bound_mode ()
51027 {
51028   /* Do not support pointer checker if MPX
51029      is not enabled.  */
51030   if (!TARGET_MPX)
51031     {
51032       if (flag_check_pointer_bounds)
51033 	warning (0, "Pointer Checker requires MPX support on this target."
51034 		 " Use the -mmpx option to enable MPX.");
51035       return VOIDmode;
51036     }
51037 
51038   return BNDmode;
51039 }
51040 
51041 /* Return the constant used to statically initialize constant bounds.
51042 
51043     This function is used to create special bound values.  For now
51044     only INIT bounds and NONE bounds are expected.  More special
51045     values may be added later.  */
51046 
51047 static tree
51048 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
51049 {
51050   tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
51051     : build_zero_cst (pointer_sized_int_node);
51052   tree high = ub ? build_zero_cst (pointer_sized_int_node)
51053     : build_minus_one_cst (pointer_sized_int_node);
51054 
51055   /* This function is supposed to be used to create INIT and
51056      NONE bounds only.  */
51057   gcc_assert ((lb == 0 && ub == -1)
51058 	      || (lb == -1 && ub == 0));
51059 
51060   return build_complex (NULL, low, high);
51061 }
51062 
51063 /* Generate a list of statements STMTS to initialize pointer bounds
51064    variable VAR with bounds LB and UB.  Return the number of generated
51065    statements.  */
51066 
51067 static int
51068 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
51069 {
51070   tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
51071   tree lhs, modify, var_p;
51072 
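  /* The upper bound is kept in one's complement form (which is why the
     INIT bounds above are represented by an all-zero value), so store ~UB
     rather than UB itself.  */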
51073   ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
51074   var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
51075 
51076   lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
51077   modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
51078   append_to_statement_list (modify, stmts);
51079 
51080   lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
51081 		build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
51082 			TYPE_SIZE_UNIT (pointer_sized_int_node)));
51083   modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
51084   append_to_statement_list (modify, stmts);
51085 
51086   return 2;
51087 }
51088 
51089 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
51090 /* For i386, a common symbol is local only for non-PIE binaries.  For
51091    x86-64, a common symbol is local only for non-PIE binaries, or when
51092    the linker supports copy relocations in PIE binaries.  */
51093 
51094 static bool
51095 ix86_binds_local_p (const_tree exp)
51096 {
51097   return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
51098 				  (!flag_pic
51099 				   || (TARGET_64BIT
51100 				       && HAVE_LD_PIE_COPYRELOC != 0)));
51101 }
51102 #endif
51103 
51104 /* If MEM is in the form of [base+offset], extract the two parts of the
51105    address into *BASE and *OFFSET and return true; otherwise return false.  */
51106 
51107 static bool
51108 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
51109 {
51110   rtx addr;
51111 
51112   gcc_assert (MEM_P (mem));
51113 
51114   addr = XEXP (mem, 0);
51115 
51116   if (GET_CODE (addr) == CONST)
51117     addr = XEXP (addr, 0);
51118 
51119   if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
51120     {
51121       *base = addr;
51122       *offset = const0_rtx;
51123       return true;
51124     }
51125 
51126   if (GET_CODE (addr) == PLUS
51127       && (REG_P (XEXP (addr, 0))
51128 	  || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
51129       && CONST_INT_P (XEXP (addr, 1)))
51130     {
51131       *base = XEXP (addr, 0);
51132       *offset = XEXP (addr, 1);
51133       return true;
51134     }
51135 
51136   return false;
51137 }
51138 
51139 /* Given OPERANDS of a consecutive load/store pair, check if we can merge
51140    them into a move-multiple.  LOAD is true if they are load instructions.
51141    MODE is the mode of the memory operands.  */
51142 
51143 bool
51144 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
51145 				    machine_mode mode)
51146 {
51147   HOST_WIDE_INT offval_1, offval_2, msize;
51148   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
51149 
51150   if (load)
51151     {
51152       mem_1 = operands[1];
51153       mem_2 = operands[3];
51154       reg_1 = operands[0];
51155       reg_2 = operands[2];
51156     }
51157   else
51158     {
51159       mem_1 = operands[0];
51160       mem_2 = operands[2];
51161       reg_1 = operands[1];
51162       reg_2 = operands[3];
51163     }
51164 
51165   gcc_assert (REG_P (reg_1) && REG_P (reg_2));
51166 
51167   if (REGNO (reg_1) != REGNO (reg_2))
51168     return false;
51169 
51170   /* Check if the addresses are in the form of [base+offset].  */
51171   if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
51172     return false;
51173   if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
51174     return false;
51175 
51176   /* Check if the bases are the same.  */
51177   if (!rtx_equal_p (base_1, base_2))
51178     return false;
51179 
51180   offval_1 = INTVAL (offset_1);
51181   offval_2 = INTVAL (offset_2);
51182   msize = GET_MODE_SIZE (mode);
51183   /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address.  */
51184   if (offval_1 + msize != offval_2)
51185     return false;
51186 
51187   return true;
51188 }
51189 
51190 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
51191 
51192 static bool
51193 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
51194 			optimization_type opt_type)
51195 {
51196   switch (op)
51197     {
51198     case asin_optab:
51199     case acos_optab:
51200     case log1p_optab:
51201     case exp_optab:
51202     case exp10_optab:
51203     case exp2_optab:
51204     case expm1_optab:
51205     case ldexp_optab:
51206     case scalb_optab:
51207     case round_optab:
51208       return opt_type == OPTIMIZE_FOR_SPEED;
51209 
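    /* SSE4.1 provides the ROUNDS[SD] instructions, so the rint, floor, ceil
       and trunc expansions for SSE scalar modes below are single
       instructions with it; without it the open-coded sequences are long
       enough to be worthwhile only when optimizing for speed.  */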
51210     case rint_optab:
51211       if (SSE_FLOAT_MODE_P (mode1)
51212 	  && TARGET_SSE_MATH
51213 	  && !flag_trapping_math
51214 	  && !TARGET_SSE4_1)
51215 	return opt_type == OPTIMIZE_FOR_SPEED;
51216       return true;
51217 
51218     case floor_optab:
51219     case ceil_optab:
51220     case btrunc_optab:
51221       if (SSE_FLOAT_MODE_P (mode1)
51222 	  && TARGET_SSE_MATH
51223 	  && !flag_trapping_math
51224 	  && TARGET_SSE4_1)
51225 	return true;
51226       return opt_type == OPTIMIZE_FOR_SPEED;
51227 
51228     case rsqrt_optab:
51229       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51230 
51231     default:
51232       return true;
51233     }
51234 }
51235 
51236 /* Address space support.
51237 
51238    This is not "far pointers" in the 16-bit sense, but an easy way
51239    to use %fs and %gs segment prefixes.  Therefore:
51240 
51241     (a) All address spaces have the same modes,
51242     (b) All address spaces have the same address forms,
51243     (c) While %fs and %gs are technically subsets of the generic
51244         address space, they are probably not subsets of each other.
51245     (d) Since we have no access to the segment base register values
51246         without resorting to a system call, we cannot convert a
51247         non-default address space to a default address space.
51248         Therefore we do not claim %fs or %gs are subsets of generic.
51249 
51250    Therefore we can (mostly) use the default hooks.  */
51251 
51252 /* All use of segmentation is assumed to make address 0 valid.  */
51253 
51254 static bool
51255 ix86_addr_space_zero_address_valid (addr_space_t as)
51256 {
51257   return as != ADDR_SPACE_GENERIC;
51258 }
51259 
51260 static void
51261 ix86_init_libfuncs (void)
51262 {
51263   if (TARGET_64BIT)
51264     {
51265       set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51266       set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51267     }
51268   else
51269     {
51270       set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51271       set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51272     }
51273 
51274 #if TARGET_MACHO
51275   darwin_rename_builtins ();
51276 #endif
51277 }
51278 
51279 /* Generate call to __divmoddi4.  */
51280 
51281 static void
51282 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51283 			    rtx op0, rtx op1,
51284 			    rtx *quot_p, rtx *rem_p)
51285 {
51286   rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51287 
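  /* The libcall returns the quotient in the usual return registers and
     stores the remainder through the pointer passed as its third argument,
     hence the stack temporary for REM.  */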
51288   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51289 				      mode, op0, mode, op1, mode,
51290 				      XEXP (rem, 0), Pmode);
51291   *quot_p = quot;
51292   *rem_p = rem;
51293 }
51294 
51295 /* Set the value of FLT_EVAL_METHOD in float.h.  When using only the
51296    FPU, assume that the fpcw is set to extended precision; when using
51297    only SSE, rounding is correct; when using both SSE and the FPU,
51298    the rounding precision is indeterminate, since either may be chosen
51299    apparently at random.  */
51300 
51301 static enum flt_eval_method
51302 ix86_excess_precision (enum excess_precision_type type)
51303 {
51304   switch (type)
51305     {
51306       case EXCESS_PRECISION_TYPE_FAST:
51307 	/* The fastest type to promote to will always be the native type,
51308 	   whether that occurs with implicit excess precision or
51309 	   otherwise.  */
51310 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51311       case EXCESS_PRECISION_TYPE_STANDARD:
51312       case EXCESS_PRECISION_TYPE_IMPLICIT:
51313 	/* Otherwise, the excess precision we want when we are in a
51314 	   standards-compliant mode and the implicit precision we actually
51315 	   provide would be identical, were it not for the unpredictable
51316 	   cases.  */
51317 	if (!TARGET_80387)
51318 	  return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51319 	else if (!TARGET_MIX_SSE_I387)
51320 	  {
51321 	    if (!(TARGET_SSE && TARGET_SSE_MATH))
51322 	      return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51323 	    else if (TARGET_SSE2)
51324 	      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51325 	  }
51326 
51327 	/* If we are in standards-compliant mode, but we know we will
51328 	   calculate in unpredictable precision, return
51329 	   FLT_EVAL_METHOD_FLOAT.  There is no reason to introduce explicit
51330 	   excess precision if the target can't guarantee it will honor
51331 	   it.  */
51332 	return (type == EXCESS_PRECISION_TYPE_STANDARD
51333 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51334 		: FLT_EVAL_METHOD_UNPREDICTABLE);
51335       default:
51336 	gcc_unreachable ();
51337     }
51338 
51339   return FLT_EVAL_METHOD_UNPREDICTABLE;
51340 }
51341 
51342 /* Implement PUSH_ROUNDING.  On 386, we have a pushw instruction that
51343    always decrements the stack pointer by exactly 2, and there is no pushb.
51344 
51345    But as the CIE data alignment factor on this arch is -4 for 32-bit targets
51346    and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
51347    are in multiples of 4 for 32-bit targets and 8 for 64-bit targets.  */
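/* For example, pushing a 2-byte value therefore reserves 4 bytes of stack
   on 32-bit targets and 8 bytes on 64-bit targets.  */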
51348 
51349 poly_int64
51350 ix86_push_rounding (poly_int64 bytes)
51351 {
51352   return ROUND_UP (bytes, UNITS_PER_WORD);
51353 }
51354 
51355 /* Target-specific selftests.  */
51356 
51357 #if CHECKING_P
51358 
51359 namespace selftest {
51360 
51361 /* Verify that hard regs are dumped as expected (in compact mode).  */
51362 
51363 static void
51364 ix86_test_dumping_hard_regs ()
51365 {
51366   ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51367   ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51368 }
51369 
51370 /* Test dumping an insn with repeated references to the same SCRATCH,
51371    to verify the rtx_reuse code.  */
51372 
51373 static void
51374 ix86_test_dumping_memory_blockage ()
51375 {
51376   set_new_first_and_last_insn (NULL, NULL);
51377 
51378   rtx pat = gen_memory_blockage ();
51379   rtx_reuse_manager r;
51380   r.preprocess (pat);
51381 
51382   /* Verify that the repeated references to the SCRATCH use
51383      reuse IDs.  The first should be prefixed with a reuse ID,
51384      and the second should be dumped as a "reuse_rtx" of that ID.
51385      The expected string assumes Pmode == DImode.  */
51386   if (Pmode == DImode)
51387     ASSERT_RTL_DUMP_EQ_WITH_REUSE
51388       ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0  A8])\n"
51389        "        (unspec:BLK [\n"
51390        "                (mem/v:BLK (reuse_rtx 0) [0  A8])\n"
51391        "            ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51392 }
51393 
51394 /* Verify loading an RTL dump; specifically a dump of copying
51395    a param on x86_64 from a hard reg into the frame.
51396    This test is target-specific since the dump contains target-specific
51397    hard reg names.  */
51398 
51399 static void
51400 ix86_test_loading_dump_fragment_1 ()
51401 {
51402   rtl_dump_test t (SELFTEST_LOCATION,
51403 		   locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51404 
51405   rtx_insn *insn = get_insn_by_uid (1);
51406 
51407   /* The block structure and indentation here is purely for
51408      readability; it mirrors the structure of the rtx.  */
51409   tree mem_expr;
51410   {
51411     rtx pat = PATTERN (insn);
51412     ASSERT_EQ (SET, GET_CODE (pat));
51413     {
51414       rtx dest = SET_DEST (pat);
51415       ASSERT_EQ (MEM, GET_CODE (dest));
51416       /* Verify the "/c" was parsed.  */
51417       ASSERT_TRUE (RTX_FLAG (dest, call));
51418       ASSERT_EQ (SImode, GET_MODE (dest));
51419       {
51420 	rtx addr = XEXP (dest, 0);
51421 	ASSERT_EQ (PLUS, GET_CODE (addr));
51422 	ASSERT_EQ (DImode, GET_MODE (addr));
51423 	{
51424 	  rtx lhs = XEXP (addr, 0);
51425 	  /* Verify that the "frame" REG was consolidated.  */
51426 	  ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51427 	}
51428 	{
51429 	  rtx rhs = XEXP (addr, 1);
51430 	  ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51431 	  ASSERT_EQ (-4, INTVAL (rhs));
51432 	}
51433       }
51434       /* Verify the "[1 i+0 S4 A32]" was parsed.  */
51435       ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51436       /* "i" should have been handled by synthesizing a global int
51437 	 variable named "i".  */
51438       mem_expr = MEM_EXPR (dest);
51439       ASSERT_NE (mem_expr, NULL);
51440       ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51441       ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51442       ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51443       ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51444       /* "+0".  */
51445       ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51446       ASSERT_EQ (0, MEM_OFFSET (dest));
51447       /* "S4".  */
51448       ASSERT_EQ (4, MEM_SIZE (dest));
51449       /* "A32".  */
51450       ASSERT_EQ (32, MEM_ALIGN (dest));
51451     }
51452     {
51453       rtx src = SET_SRC (pat);
51454       ASSERT_EQ (REG, GET_CODE (src));
51455       ASSERT_EQ (SImode, GET_MODE (src));
51456       ASSERT_EQ (5, REGNO (src));
51457       tree reg_expr = REG_EXPR (src);
51458       /* "i" here should point to the same var as for the MEM_EXPR.  */
51459       ASSERT_EQ (reg_expr, mem_expr);
51460     }
51461   }
51462 }
51463 
51464 /* Verify that the RTL loader copes with a call_insn dump.
51465    This test is target-specific since the dump contains a target-specific
51466    hard reg name.  */
51467 
51468 static void
51469 ix86_test_loading_call_insn ()
51470 {
51471   /* The test dump includes register "xmm0", which requires TARGET_SSE
51472      to exist.  */
51473   if (!TARGET_SSE)
51474     return;
51475 
51476   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51477 
51478   rtx_insn *insn = get_insns ();
51479   ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51480 
51481   /* "/j".  */
51482   ASSERT_TRUE (RTX_FLAG (insn, jump));
51483 
51484   rtx pat = PATTERN (insn);
51485   ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51486 
51487   /* Verify REG_NOTES.  */
51488   {
51489     /* "(expr_list:REG_CALL_DECL".   */
51490     ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51491     rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51492     ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51493 
51494     /* "(expr_list:REG_EH_REGION (const_int 0 [0])".  */
51495     rtx_expr_list *note1 = note0->next ();
51496     ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51497 
51498     ASSERT_EQ (NULL, note1->next ());
51499   }
51500 
51501   /* Verify CALL_INSN_FUNCTION_USAGE.  */
51502   {
51503     /* "(expr_list:DF (use (reg:DF 21 xmm0))".  */
51504     rtx_expr_list *usage
51505       = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51506     ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51507     ASSERT_EQ (DFmode, GET_MODE (usage));
51508     ASSERT_EQ (USE, GET_CODE (usage->element ()));
51509     ASSERT_EQ (NULL, usage->next ());
51510   }
51511 }
51512 
51513 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51514    This test is target-specific since the dump contains target-specific
51515    hard reg names.  */
51516 
51517 static void
51518 ix86_test_loading_full_dump ()
51519 {
51520   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51521 
51522   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51523 
51524   rtx_insn *insn_1 = get_insn_by_uid (1);
51525   ASSERT_EQ (NOTE, GET_CODE (insn_1));
51526 
51527   rtx_insn *insn_7 = get_insn_by_uid (7);
51528   ASSERT_EQ (INSN, GET_CODE (insn_7));
51529   ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51530 
51531   rtx_insn *insn_15 = get_insn_by_uid (15);
51532   ASSERT_EQ (INSN, GET_CODE (insn_15));
51533   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51534 
51535   /* Verify crtl->return_rtx.  */
51536   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51537   ASSERT_EQ (0, REGNO (crtl->return_rtx));
51538   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51539 }
51540 
51541 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51542    In particular, verify that it correctly loads the 2nd operand.
51543    This test is target-specific since these are machine-specific
51544    operands (and enums).  */
51545 
51546 static void
51547 ix86_test_loading_unspec ()
51548 {
51549   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51550 
51551   ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51552 
51553   ASSERT_TRUE (cfun);
51554 
51555   /* Test of an UNSPEC.  */
51556   rtx_insn *insn = get_insns ();
51557   ASSERT_EQ (INSN, GET_CODE (insn));
51558   rtx set = single_set (insn);
51559   ASSERT_NE (NULL, set);
51560   rtx dst = SET_DEST (set);
51561   ASSERT_EQ (MEM, GET_CODE (dst));
51562   rtx src = SET_SRC (set);
51563   ASSERT_EQ (UNSPEC, GET_CODE (src));
51564   ASSERT_EQ (BLKmode, GET_MODE (src));
51565   ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51566 
51567   rtx v0 = XVECEXP (src, 0, 0);
51568 
51569   /* Verify that the two uses of the first SCRATCH have pointer
51570      equality.  */
51571   rtx scratch_a = XEXP (dst, 0);
51572   ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51573 
51574   rtx scratch_b = XEXP (v0, 0);
51575   ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51576 
51577   ASSERT_EQ (scratch_a, scratch_b);
51578 
51579   /* Verify that the two mems are thus treated as equal.  */
51580   ASSERT_TRUE (rtx_equal_p (dst, v0));
51581 
51582   /* Verify that the insn is recognized.  */
51583   ASSERT_NE (-1, recog_memoized (insn));
51584 
51585   /* Test of an UNSPEC_VOLATILE, which has its own enum values.  */
51586   insn = NEXT_INSN (insn);
51587   ASSERT_EQ (INSN, GET_CODE (insn));
51588 
51589   set = single_set (insn);
51590   ASSERT_NE (NULL, set);
51591 
51592   src = SET_SRC (set);
51593   ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51594   ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51595 }
51596 
51597 /* Run all target-specific selftests.  */
51598 
51599 static void
51600 ix86_run_selftests (void)
51601 {
51602   ix86_test_dumping_hard_regs ();
51603   ix86_test_dumping_memory_blockage ();
51604 
51605   /* Various tests of loading RTL dumps, here because they contain
51606      ix86-isms (e.g. names of hard regs).  */
51607   ix86_test_loading_dump_fragment_1 ();
51608   ix86_test_loading_call_insn ();
51609   ix86_test_loading_full_dump ();
51610   ix86_test_loading_unspec ();
51611 }
51612 
51613 } // namespace selftest
51614 
51615 #endif /* CHECKING_P */
51616 
51617 /* Initialize the GCC target structure.  */
51618 #undef TARGET_RETURN_IN_MEMORY
51619 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51620 
51621 #undef TARGET_LEGITIMIZE_ADDRESS
51622 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51623 
51624 #undef TARGET_ATTRIBUTE_TABLE
51625 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51626 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51627 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51628 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51629 #  undef TARGET_MERGE_DECL_ATTRIBUTES
51630 #  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51631 #endif
51632 
51633 #undef TARGET_COMP_TYPE_ATTRIBUTES
51634 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51635 
51636 #undef TARGET_INIT_BUILTINS
51637 #define TARGET_INIT_BUILTINS ix86_init_builtins
51638 #undef TARGET_BUILTIN_DECL
51639 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51640 #undef TARGET_EXPAND_BUILTIN
51641 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51642 
51643 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51644 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51645   ix86_builtin_vectorized_function
51646 
51647 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51648 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51649 
51650 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51651 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51652 
51653 #undef TARGET_BUILTIN_RECIPROCAL
51654 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51655 
51656 #undef TARGET_ASM_FUNCTION_EPILOGUE
51657 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51658 
51659 #undef TARGET_ENCODE_SECTION_INFO
51660 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51661 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51662 #else
51663 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51664 #endif
51665 
51666 #undef TARGET_ASM_OPEN_PAREN
51667 #define TARGET_ASM_OPEN_PAREN ""
51668 #undef TARGET_ASM_CLOSE_PAREN
51669 #define TARGET_ASM_CLOSE_PAREN ""
51670 
51671 #undef TARGET_ASM_BYTE_OP
51672 #define TARGET_ASM_BYTE_OP ASM_BYTE
51673 
51674 #undef TARGET_ASM_ALIGNED_HI_OP
51675 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51676 #undef TARGET_ASM_ALIGNED_SI_OP
51677 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51678 #ifdef ASM_QUAD
51679 #undef TARGET_ASM_ALIGNED_DI_OP
51680 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51681 #endif
51682 
51683 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51684 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51685 
51686 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51687 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51688 
51689 #undef TARGET_ASM_UNALIGNED_HI_OP
51690 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51691 #undef TARGET_ASM_UNALIGNED_SI_OP
51692 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51693 #undef TARGET_ASM_UNALIGNED_DI_OP
51694 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51695 
51696 #undef TARGET_PRINT_OPERAND
51697 #define TARGET_PRINT_OPERAND ix86_print_operand
51698 #undef TARGET_PRINT_OPERAND_ADDRESS
51699 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51700 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51701 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51702 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51703 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
51704 
51705 #undef TARGET_SCHED_INIT_GLOBAL
51706 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
51707 #undef TARGET_SCHED_ADJUST_COST
51708 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
51709 #undef TARGET_SCHED_ISSUE_RATE
51710 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
51711 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
51712 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
51713   ia32_multipass_dfa_lookahead
51714 #undef TARGET_SCHED_MACRO_FUSION_P
51715 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
51716 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
51717 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
51718 
51719 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
51720 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
51721 
51722 #undef TARGET_MEMMODEL_CHECK
51723 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
51724 
51725 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
51726 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
51727 
51728 #ifdef HAVE_AS_TLS
51729 #undef TARGET_HAVE_TLS
51730 #define TARGET_HAVE_TLS true
51731 #endif
51732 #undef TARGET_CANNOT_FORCE_CONST_MEM
51733 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
51734 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
51735 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
51736 
51737 #undef TARGET_DELEGITIMIZE_ADDRESS
51738 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
51739 
51740 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
51741 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
51742 
51743 #undef TARGET_MS_BITFIELD_LAYOUT_P
51744 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
51745 
51746 #if TARGET_MACHO
51747 #undef TARGET_BINDS_LOCAL_P
51748 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
51749 #else
51750 #undef TARGET_BINDS_LOCAL_P
51751 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
51752 #endif
51753 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51754 #undef TARGET_BINDS_LOCAL_P
51755 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
51756 #endif
51757 
51758 #undef TARGET_ASM_OUTPUT_MI_THUNK
51759 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
51760 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
51761 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
51762 
51763 #undef TARGET_ASM_FILE_START
51764 #define TARGET_ASM_FILE_START x86_file_start
51765 
51766 #undef TARGET_OPTION_OVERRIDE
51767 #define TARGET_OPTION_OVERRIDE ix86_option_override
51768 
51769 #undef TARGET_REGISTER_MOVE_COST
51770 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
51771 #undef TARGET_MEMORY_MOVE_COST
51772 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
51773 #undef TARGET_RTX_COSTS
51774 #define TARGET_RTX_COSTS ix86_rtx_costs
51775 #undef TARGET_ADDRESS_COST
51776 #define TARGET_ADDRESS_COST ix86_address_cost
51777 
51778 #undef TARGET_FLAGS_REGNUM
51779 #define TARGET_FLAGS_REGNUM FLAGS_REG
51780 #undef TARGET_FIXED_CONDITION_CODE_REGS
51781 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
51782 #undef TARGET_CC_MODES_COMPATIBLE
51783 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
51784 
51785 #undef TARGET_MACHINE_DEPENDENT_REORG
51786 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
51787 
51788 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
51789 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
51790 
51791 #undef TARGET_BUILD_BUILTIN_VA_LIST
51792 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
51793 
51794 #undef TARGET_FOLD_BUILTIN
51795 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
51796 
51797 #undef TARGET_GIMPLE_FOLD_BUILTIN
51798 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
51799 
51800 #undef TARGET_COMPARE_VERSION_PRIORITY
51801 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
51802 
51803 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
51804 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
51805   ix86_generate_version_dispatcher_body
51806 
51807 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
51808 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
51809   ix86_get_function_versions_dispatcher
51810 
51811 #undef TARGET_ENUM_VA_LIST_P
51812 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
51813 
51814 #undef TARGET_FN_ABI_VA_LIST
51815 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
51816 
51817 #undef TARGET_CANONICAL_VA_LIST_TYPE
51818 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
51819 
51820 #undef TARGET_EXPAND_BUILTIN_VA_START
51821 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
51822 
51823 #undef TARGET_MD_ASM_ADJUST
51824 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
51825 
51826 #undef TARGET_C_EXCESS_PRECISION
51827 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
51828 #undef TARGET_PROMOTE_PROTOTYPES
51829 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
51830 #undef TARGET_SETUP_INCOMING_VARARGS
51831 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
51832 #undef TARGET_MUST_PASS_IN_STACK
51833 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
51834 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
51835 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
51836 #undef TARGET_FUNCTION_ARG_ADVANCE
51837 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
51838 #undef TARGET_FUNCTION_ARG
51839 #define TARGET_FUNCTION_ARG ix86_function_arg
51840 #undef TARGET_INIT_PIC_REG
51841 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
51842 #undef TARGET_USE_PSEUDO_PIC_REG
51843 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
51844 #undef TARGET_FUNCTION_ARG_BOUNDARY
51845 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
51846 #undef TARGET_PASS_BY_REFERENCE
51847 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
51848 #undef TARGET_INTERNAL_ARG_POINTER
51849 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
51850 #undef TARGET_UPDATE_STACK_BOUNDARY
51851 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
51852 #undef TARGET_GET_DRAP_RTX
51853 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
51854 #undef TARGET_STRICT_ARGUMENT_NAMING
51855 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
51856 #undef TARGET_STATIC_CHAIN
51857 #define TARGET_STATIC_CHAIN ix86_static_chain
51858 #undef TARGET_TRAMPOLINE_INIT
51859 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
51860 #undef TARGET_RETURN_POPS_ARGS
51861 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
51862 
51863 #undef TARGET_WARN_FUNC_RETURN
51864 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
51865 
51866 #undef TARGET_LEGITIMATE_COMBINED_INSN
51867 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
51868 
51869 #undef TARGET_ASAN_SHADOW_OFFSET
51870 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
51871 
51872 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
51873 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
51874 
51875 #undef TARGET_SCALAR_MODE_SUPPORTED_P
51876 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
51877 
51878 #undef TARGET_VECTOR_MODE_SUPPORTED_P
51879 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
51880 
51881 #undef TARGET_C_MODE_FOR_SUFFIX
51882 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
51883 
51884 #ifdef HAVE_AS_TLS
51885 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
51886 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
51887 #endif
51888 
51889 #ifdef SUBTARGET_INSERT_ATTRIBUTES
51890 #undef TARGET_INSERT_ATTRIBUTES
51891 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
51892 #endif
51893 
51894 #undef TARGET_MANGLE_TYPE
51895 #define TARGET_MANGLE_TYPE ix86_mangle_type
51896 
51897 #undef TARGET_STACK_PROTECT_GUARD
51898 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
51899 
51900 #if !TARGET_MACHO
51901 #undef TARGET_STACK_PROTECT_FAIL
51902 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
51903 #endif
51904 
51905 #undef TARGET_FUNCTION_VALUE
51906 #define TARGET_FUNCTION_VALUE ix86_function_value
51907 
51908 #undef TARGET_FUNCTION_VALUE_REGNO_P
51909 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
51910 
51911 #undef TARGET_PROMOTE_FUNCTION_MODE
51912 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
51913 
51914 #undef  TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
51915 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
51916 
51917 #undef TARGET_MEMBER_TYPE_FORCES_BLK
51918 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
51919 
51920 #undef TARGET_INSTANTIATE_DECLS
51921 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
51922 
51923 #undef TARGET_SECONDARY_RELOAD
51924 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
51925 #undef TARGET_SECONDARY_MEMORY_NEEDED
51926 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
51927 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
51928 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
51929 
51930 #undef TARGET_CLASS_MAX_NREGS
51931 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
51932 
51933 #undef TARGET_PREFERRED_RELOAD_CLASS
51934 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
51935 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
51936 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
51937 #undef TARGET_CLASS_LIKELY_SPILLED_P
51938 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
51939 
51940 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
51941 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
51942   ix86_builtin_vectorization_cost
51943 #undef TARGET_VECTORIZE_VEC_PERM_CONST
51944 #define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
51945 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
51946 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
51947   ix86_preferred_simd_mode
51948 #undef TARGET_VECTORIZE_SPLIT_REDUCTION
51949 #define TARGET_VECTORIZE_SPLIT_REDUCTION \
51950   ix86_split_reduction
51951 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
51952 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
51953   ix86_autovectorize_vector_sizes
51954 #undef TARGET_VECTORIZE_GET_MASK_MODE
51955 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
51956 #undef TARGET_VECTORIZE_INIT_COST
51957 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
51958 #undef TARGET_VECTORIZE_ADD_STMT_COST
51959 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
51960 #undef TARGET_VECTORIZE_FINISH_COST
51961 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
51962 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
51963 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
51964 
51965 #undef TARGET_SET_CURRENT_FUNCTION
51966 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
51967 
51968 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
51969 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
51970 
51971 #undef TARGET_OPTION_SAVE
51972 #define TARGET_OPTION_SAVE ix86_function_specific_save
51973 
51974 #undef TARGET_OPTION_RESTORE
51975 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
51976 
51977 #undef TARGET_OPTION_POST_STREAM_IN
51978 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
51979 
51980 #undef TARGET_OPTION_PRINT
51981 #define TARGET_OPTION_PRINT ix86_function_specific_print
51982 
51983 #undef TARGET_OPTION_FUNCTION_VERSIONS
51984 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
51985 
51986 #undef TARGET_CAN_INLINE_P
51987 #define TARGET_CAN_INLINE_P ix86_can_inline_p
51988 
51989 #undef TARGET_LEGITIMATE_ADDRESS_P
51990 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
51991 
51992 #undef TARGET_REGISTER_PRIORITY
51993 #define TARGET_REGISTER_PRIORITY ix86_register_priority
51994 
51995 #undef TARGET_REGISTER_USAGE_LEVELING_P
51996 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
51997 
51998 #undef TARGET_LEGITIMATE_CONSTANT_P
51999 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
52000 
52001 #undef TARGET_COMPUTE_FRAME_LAYOUT
52002 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
52003 
52004 #undef TARGET_FRAME_POINTER_REQUIRED
52005 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
52006 
52007 #undef TARGET_CAN_ELIMINATE
52008 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
52009 
52010 #undef TARGET_EXTRA_LIVE_ON_ENTRY
52011 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
52012 
52013 #undef TARGET_ASM_CODE_END
52014 #define TARGET_ASM_CODE_END ix86_code_end
52015 
52016 #undef TARGET_CONDITIONAL_REGISTER_USAGE
52017 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
52018 
52019 #undef TARGET_CANONICALIZE_COMPARISON
52020 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
52021 
52022 #undef TARGET_LOOP_UNROLL_ADJUST
52023 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
52024 
52025 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
52026 #undef TARGET_SPILL_CLASS
52027 #define TARGET_SPILL_CLASS ix86_spill_class
52028 
52029 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
52030 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
52031   ix86_simd_clone_compute_vecsize_and_simdlen
52032 
52033 #undef TARGET_SIMD_CLONE_ADJUST
52034 #define TARGET_SIMD_CLONE_ADJUST \
52035   ix86_simd_clone_adjust
52036 
52037 #undef TARGET_SIMD_CLONE_USABLE
52038 #define TARGET_SIMD_CLONE_USABLE \
52039   ix86_simd_clone_usable
52040 
52041 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
52042 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
52043   ix86_float_exceptions_rounding_supported_p
52044 
52045 #undef TARGET_MODE_EMIT
52046 #define TARGET_MODE_EMIT ix86_emit_mode_set
52047 
52048 #undef TARGET_MODE_NEEDED
52049 #define TARGET_MODE_NEEDED ix86_mode_needed
52050 
52051 #undef TARGET_MODE_AFTER
52052 #define TARGET_MODE_AFTER ix86_mode_after
52053 
52054 #undef TARGET_MODE_ENTRY
52055 #define TARGET_MODE_ENTRY ix86_mode_entry
52056 
52057 #undef TARGET_MODE_EXIT
52058 #define TARGET_MODE_EXIT ix86_mode_exit
52059 
52060 #undef TARGET_MODE_PRIORITY
52061 #define TARGET_MODE_PRIORITY ix86_mode_priority
52062 
52063 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
52064 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
52065 
52066 #undef TARGET_LOAD_BOUNDS_FOR_ARG
52067 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
52068 
52069 #undef TARGET_STORE_BOUNDS_FOR_ARG
52070 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
52071 
52072 #undef TARGET_LOAD_RETURNED_BOUNDS
52073 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
52074 
52075 #undef TARGET_STORE_RETURNED_BOUNDS
52076 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
52077 
52078 #undef TARGET_CHKP_BOUND_MODE
52079 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
52080 
52081 #undef TARGET_BUILTIN_CHKP_FUNCTION
52082 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
52083 
52084 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
52085 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
52086 
52087 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
52088 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
52089 
52090 #undef TARGET_CHKP_INITIALIZE_BOUNDS
52091 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
52092 
52093 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
52094 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
52095 
52096 #undef TARGET_OFFLOAD_OPTIONS
52097 #define TARGET_OFFLOAD_OPTIONS \
52098   ix86_offload_options
52099 
52100 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
52101 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
52102 
52103 #undef TARGET_OPTAB_SUPPORTED_P
52104 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
52105 
52106 #undef TARGET_HARD_REGNO_SCRATCH_OK
52107 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
52108 
52109 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
52110 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
52111 
52112 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
52113 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
52114 
52115 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
52116 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
52117 
52118 #undef TARGET_INIT_LIBFUNCS
52119 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
52120 
52121 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
52122 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
52123 
52124 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
52125 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
52126 
52127 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
52128 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
52129 
52130 #undef TARGET_HARD_REGNO_NREGS
52131 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
52132 #undef TARGET_HARD_REGNO_MODE_OK
52133 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
52134 
52135 #undef TARGET_MODES_TIEABLE_P
52136 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
52137 
52138 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
52139 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
52140   ix86_hard_regno_call_part_clobbered
52141 
52142 #undef TARGET_CAN_CHANGE_MODE_CLASS
52143 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
52144 
52145 #undef TARGET_STATIC_RTX_ALIGNMENT
52146 #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
52147 #undef TARGET_CONSTANT_ALIGNMENT
52148 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
52149 
52150 #undef TARGET_EMPTY_RECORD_P
52151 #define TARGET_EMPTY_RECORD_P ix86_is_empty_record
52152 
52153 #undef TARGET_WARN_PARAMETER_PASSING_ABI
52154 #define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
52155 
52156 #if CHECKING_P
52157 #undef TARGET_RUN_TARGET_SELFTESTS
52158 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
52159 #endif /* #if CHECKING_P */
52160 
52161 struct gcc_target targetm = TARGET_INITIALIZER;
52162 
52163 #include "gt-i386.h"
52164