/* Definitions of x86 tunable features.
   Copyright (C) 2013-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Tuning for a given CPU XXXX consists of:
    - adding the new CPU into:
	- adding PROCESSOR_XXX to processor_type (in i386.h)
	- possibly adding XXX into the CPU attribute in i386.md
	- adding XXX to processor_alias_table (in i386.c)
    - introducing ix86_XXX_cost in i386.c
	- the stringop generation table can be built based on the
	  test_stringop script (once the rest of the tuning is complete)
    - designing a scheduler model in
	- an XXXX.md file
	- updating ix86_issue_rate and ix86_adjust_cost in i386.c
	- possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
	  and ix86_sched_init_global if those tricks are needed
    - tuning the flags below.  These are split into sections and each
      section is very roughly ordered by importance.  */

/*****************************************************************************/
/* Scheduling flags.                                                         */
/*****************************************************************************/

/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example, prefer MOVZBL or MOVQ over MOVB
   to load an 8-bit value.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
          | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC)
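
/* An illustrative (assumed) example of the above: with this tuning enabled,
   an 8-bit load is emitted as

	movzbl	(%rsi), %eax	# writes the whole of %eax

   rather than

	movb	(%rsi), %al	# merges into %eax, keeping a dependency
				# on the old value of the register  */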

/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER1 | m_GENERIC)

/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in the proper format, leaving
   the upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)

/* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register,
   and such sequences introduce a false dependency on a previous instruction
   setting the full flags.

   This flag does not affect generation of INC and DEC, which is controlled
   by X86_TUNE_USE_INCDEC.  */

DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2)

/* X86_TUNE_MOVX: Enable zero extending integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
          | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_AVX2
          | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   full sized loads.  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32-bit targets.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER | m_ZNVER1 | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)

/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse an ALU instruction with a subsequent
   conditional jump instruction when the ALU instruction produces the flags
   consumed by the conditional jump instruction.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
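
/* A sketch (not from the original sources) of the instruction pairs the
   fusion flags above are about; on cores with macro-fusion, pairs such as

	cmpl	$10, %eax	# compare ...
	jne	.L2		# ... fuses with the conditional jump

   and, for X86_TUNE_FUSE_ALU_AND_BRANCH,

	addl	$1, %eax	# ALU op producing the flags ...
	jnz	.L3		# ... consumed by the fused jump

   decode into a single uop.  */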


/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences.               */
/*****************************************************************************/

/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in the prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable
   win for targets without a stack engine, where multiple push operations
   cannot happen in parallel.  */

DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
          | m_ATHLON_K8)

/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered on the critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered on the critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
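
/* Illustration (assumed example): "leave" collapses the usual two-insn
   frame teardown

	movl	%ebp, %esp
	popl	%ebp

   into the single instruction

	leave  */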

/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like the 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_GENERIC)
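
/* Example (illustrative only): with this tuning, a value already in memory
   is pushed directly,

	pushl	4(%esi)

   instead of the separate load and push that chips such as the 486 and
   Pentium prefer:

	movl	4(%esi), %ecx
	pushl	%ecx  */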

/* X86_TUNE_SINGLE_PUSH: Enable if a single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_K6_GEODE)

/* X86_TUNE_DOUBLE_PUSH: Enable if a double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
          | m_K6_GEODE)

/* X86_TUNE_SINGLE_POP: Enable if a single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_PPRO)

/* X86_TUNE_DOUBLE_POP: Enable if a double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)

/*****************************************************************************/
/* Branch predictor tuning                                                   */
/*****************************************************************************/

/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)

/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
   of a conditional jump or directly preceded by another jump instruction.
   This is important for AMD K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2 byte window.  Failing to
   pad returns leads to a misaligned return stack.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)

/*****************************************************************************/
/* Integer instruction selection tuning                                      */
/*****************************************************************************/

/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)

/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Core i7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)

/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))

/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.

   Core2 and Nehalem have a 7 cycle stall on partial flag register accesses.
   Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell this extra
   uop is output only when the flag values really need to be merged, which
   is not the case for GCC generated code.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
            | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GENERIC))

/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))

/* X86_TUNE_OPT_AGU: Optimize for the Address Generation Unit.  This flag
   impacts LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
          | m_KNM | m_INTEL)

/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM)

/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is
   vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL)

/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)

/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
          | m_BTVER | m_ZNVER1 | m_GENERIC)
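
/* Sketch of the classic use of SAHF (an assumed example, not from the
   original sources): branching on an x87 compare by copying the FP status
   word into the CPU flags,

	fcomp	%st(1)		# compare st(0) with st(1) and pop
	fnstsw	%ax		# store the FP status word into %ax
	sahf			# load %ah into EFLAGS
	ja	.L4		# branch on the result

   instead of testing bits of %ah by hand.  */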

/* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
            | m_K6))
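
/* Illustration (assumed example): CLTD sign-extends %eax into %edx in a
   single byte before a signed division,

	cltd
	idivl	%ecx

   whereas tunings that avoid it materialize the extension explicitly:

	movl	%eax, %edx
	sarl	$31, %edx
	idivl	%ecx  */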

/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
          | m_LAKEMONT | m_AMD_MULTIPLE | m_GENERIC)
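
/* Example (illustrative only): testing a variable bit with BT,

	btl	%ecx, %eax	# CF = bit %ecx of %eax
	jc	.L5

   versus the shift-and-mask sequence used when BT is avoided:

	movl	%eax, %edx
	shrl	%cl, %edx
	andl	$1, %edx  */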

/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid a false dependency
   for bit-manipulation instructions.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
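
/* Sketch of the workaround (assumed example; the flag also covers
   instructions such as popcnt/lzcnt/tzcnt that are not strictly BMI):
   the destination is zeroed first to break the false output dependency,

	xorl	%eax, %eax	# dependency-breaking idiom
	popcntl	%ecx, %eax  */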

/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)

/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
   an if-converted sequence to one.  */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
          m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GENERIC)

/*****************************************************************************/
/* 387 instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with a 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)

/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with a 32bit
   integer operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
            | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))

/* X86_TUNE_USE_FFREEP: Use the ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)

/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)

/*****************************************************************************/
/* SSE instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)

/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
   of a sequence loading registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_AMDFAM10 | m_BDVER | m_BTVER
          | m_ZNVER1 | m_GENERIC)

/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_BDVER | m_ZNVER1 | m_GENERIC)

/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single precision
   instructions where possible, e.g. movups instead of movupd.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER | m_ZNVER1)

/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)

/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load 0, as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER1
          | m_GENERIC)
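
/* For illustration: the integer idiom for clearing an SSE register,

	pxor	%xmm0, %xmm0

   as opposed to the FP variants

	xorps	%xmm0, %xmm0
	xorpd	%xmm0, %xmm0  */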

/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC))

/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)

/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))

/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split the memory operand
   for FP converts to the destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_KNL | m_KNM | m_INTEL)

/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  This form of the instructions avoids a partial write to
   the destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)

/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)

/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with a slow pshufb instruction.  */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL)

/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
          m_SILVERMONT | m_INTEL)

/* X86_TUNE_USE_GATHER: Use gather instructions.  */
DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
          ~(m_ZNVER1 | m_GENERIC))

/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with a tight 128bit or
   smaller FMA chain.  */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1)

/*****************************************************************************/
/* AVX instruction selection tuning (some SSE flags affect AVX, too)         */
/*****************************************************************************/

/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC))

/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1 | m_GENERIC))

/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
   instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)

/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs        */
/*****************************************************************************/

/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double a value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
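
/* Illustration (assumed example): doubling a register as

	addl	%eax, %eax

   instead of the shift form

	sall	$1, %eax  */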

/* X86_TUNE_ALWAYS_FANCY_MATH_387: Controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486 | m_LAKEMONT))

/* X86_TUNE_UNROLL_STRLEN: Produce a (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is bad.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)

/* X86_TUNE_SHIFT1: Enables use of the short encoding of "sal reg" instead of
   the longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
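
/* Encoding sketch (byte values as defined by the x86 ISA): the implicit
   shift-by-one form is one byte shorter,

	sall	%eax		# D1 E0
	sall	$1, %eax	# C1 E0 01  */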

/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of movzbl/movzwl.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
          m_486 | m_PENT)
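
/* Illustrative example: zero extending in place via AND,

	andl	$0xff, %eax

   instead of the dedicated zero-extending move

	movzbl	%al, %eax  */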

/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
   and SImode multiply, but the 386 and 486 do HImode multiply faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))

/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   "and $-65536, reg" becomes a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
          ~(m_386 | m_486 | m_PENT | m_LAKEMONT))

/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read-modify-write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
          ~(m_PENT | m_LAKEMONT))

/* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
   than a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
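
/* Example (illustrative): loading -1 as

	orl	$-1, %eax	# 3 bytes: 83 C8 FF

   instead of

	movl	$-1, %eax	# 5 bytes: B8 FF FF FF FF  */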

/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but is one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)

/* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not
   eliminated very well - they can be introduced via subregs synthesized
   by combine and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)

/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic into
   the corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          ~m_PPRO)

/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)

/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
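
/* Sketch (assumed example): swapping the two low bytes of a register with

	xchgb	%ah, %al

   instead of a rotate such as

	rolw	$8, %ax  */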

/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)

/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)

/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)

/*****************************************************************************/
/* This never worked well before.                                            */
/*****************************************************************************/

/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in the P4 based
   on simulation results, but after the P4 was made no performance benefit
   was observed from branch hints, and they increase code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0U)

/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0U)

/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
   arithmetic to 32bit via the PROMOTE_MODE macro.  This code generation scheme
   is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U)

/* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
   before a transfer of control flow out of the function.  */
DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)