/* Definitions of x86 tunable features.
   Copyright (C) 2013-2014 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Tuning for a given CPU XXXX consists of:
    - adding new CPU into:
       - adding PROCESSOR_XXX to processor_type (in i386.h)
       - possibly adding XXX into CPU attribute in i386.md
       - adding XXX to processor_alias_table (in i386.c)
    - introducing ix86_XXX_cost in i386.c
       - Stringop generation table can be built based on the test_stringop
         script (once the rest of tuning is complete)
    - designing a scheduler model in
       - XXXX.md file
       - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
       - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
         and ix86_sched_init_global if those tricks are needed.
    - Tuning the flags below.  Those are split into sections and each
      section is very roughly ordered by importance.  */

/*****************************************************************************/
/* Scheduling flags.                                                         */
/*****************************************************************************/

/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example prefer MOVZBL or MOVQ to load an 8bit
   value over movb (see the illustration below).  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_AMDFAM10 | m_BDVER | m_GENERIC)

/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in proper format, leaving the
   upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
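/* Illustrative sketch (an addition for clarity, not an upstream tuning
   note): with X86_TUNE_PARTIAL_REG_DEPENDENCY an 8bit load such as

       movb    mem, %al      # writes only %al, so it must merge with the
                             # old value of %eax (false dependency)

   is emitted instead as

       movzbl  mem, %eax     # writes the whole register, no merge needed

   trading a wider write for independence from the previous %eax value.  */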
/* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on the previous
   instruction setting full flags.

   The flag does not affect generation of INC and DEC, which is controlled
   by X86_TUNE_USE_INCDEC.

   This flag may be dropped from generic once core2-corei5 machines are
   rare enough.  */
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2 | m_GENERIC)

/* X86_TUNE_MOVX: Enable zero extension of integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   full sized loads.  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32 bit TARGET.  See the fusion
   sketch below.
   FIXME: revisit for generic.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.
   FIXME: revisit for generic.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER)

/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
   jump instruction when the alu instruction produces the CCFLAG consumed by
   the conditional jump instruction.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_HASWELL)

/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
   during reassociation of integer computation.  */
DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
          m_BONNELL)

/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
   during reassociation of fp computation.  */
DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
          m_BONNELL | m_SILVERMONT | m_HASWELL | m_INTEL | m_BDVER1
          | m_BDVER2 | m_GENERIC)
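/* Illustrative sketch (an addition for clarity): the fuse_cmp_and_branch
   flags ask the compiler to keep the compare and the branch adjacent,

       cmpl    %esi, %edi
       jne     .Lmismatch    # can decode as one fused macro-op with the cmp

   since any instruction scheduled between the two defeats the fusion.  */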
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences.               */
/*****************************************************************************/

/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable
   win for targets without a stack engine that prevents multiple push
   operations from happening in parallel.

   FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
   Bobcat and Generic.  This is because disabling it causes a large
   regression on mgrid due to an IRA limitation leading to unnecessary
   use of the frame pointer in 32bit mode.  */
DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_ATHLON_K8)

/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_GENERIC)

/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_K6_GEODE)

/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)

/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)

/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)

/*****************************************************************************/
/* Branch predictor tuning                                                   */
/*****************************************************************************/

/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)

/* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
   of a conditional jump or directly preceded by another jump instruction.
   This is important for K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2 byte window.  Failing to
   pad returns leads to a misaligned return stack; see the sketch below.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)

/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_ATHLON_K8 | m_AMDFAM10)
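/* Illustrative sketch (an addition for clarity): with pad_returns a return
   reached by a branch, e.g.

       je      .L1
       ...
   .L1:
       ret

   is padded (commonly via the "rep ret" idiom on AMD chips) so the RET
   does not share a 2 byte window with a preceding jump target.  */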
/*****************************************************************************/
/* Integer instruction selection tuning                                      */
/*****************************************************************************/

/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_AMD_MULTIPLE)

/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Corei7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)

/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))

/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
            | m_GENERIC))

/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))

/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_INTEL)

/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT)

/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
   vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_INTEL)

/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)

/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
          | m_GENERIC)

/* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  See
   the sketch below.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_BONNELL | m_SILVERMONT | m_INTEL | m_K6))
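/* Illustrative sketch (an addition for clarity): cltd sign-extends %eax
   into %edx in a single short instruction, so a signed 32bit division
   can be emitted as

       movl    %ecx, %eax
       cltd                  # %edx:%eax = sign-extended %eax
       idivl   %ebx

   while chips with use_cltd disabled typically fall back to the longer
   "movl %eax, %edx" followed by "sarl $31, %edx".  */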
/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE
          | m_GENERIC)

/*****************************************************************************/
/* 387 instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)

/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
   integer operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))

/* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)

/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)

/*****************************************************************************/
/* SSE instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
   instructions.  */
DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_BONNELL)

/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)

/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads
   instead of a sequence loading registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_AMDFAM10 | m_BDVER
          | m_BTVER | m_SILVERMONT | m_INTEL | m_GENERIC)

/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_SILVERMONT
          | m_INTEL | m_GENERIC)

/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single precision
   instructions where possible, i.e. movups instead of movupd.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER)

/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit
   stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)

/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_GENERIC)
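/* Illustrative sketch (an addition for clarity): with sse_load0_by_pxor a
   zeroed SSE register is always materialized as

       pxor    %xmm0, %xmm0

   even in floating point code where "xorps %xmm0, %xmm0" would otherwise
   be the natural choice.  */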
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_AMD_MULTIPLE | m_GENERIC))

/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)

/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.
   FIXME: revisit importance of this for generic.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))

/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
   fp converts to destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_INTEL)

/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  This form of instructions avoids partial write to the
   destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)

/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)

/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
/*****************************************************************************/

/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC))

/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_GENERIC))

/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2)

/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs.      */
/*****************************************************************************/

/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)

/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486))

/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is bad.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)

/* X86_TUNE_SHIFT1: Enables use of the short encoding of "sal reg" instead
   of the longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)

/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of movzbl/movzwl, as sketched below.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | m_PENT)
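/* Illustrative sketch (an addition for clarity): zero_extend_with_and
   replaces the zero extension

       movzbl  %al, %eax

   with

       andl    $255, %eax

   which was the faster form on 486 and Pentium.  */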
/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
   and SImode multiply, but 386 and 486 do HImode multiply faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))

/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example
   turn "and $-65536, reg" into a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))

/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)

/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
   than a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)

/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)

/* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not
   eliminated very well - they can be introduced via subregs synthesized
   by combine and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)

/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic into
   corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          ~m_PPRO)

/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)

/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)

/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)

/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)

/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)

/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
   for bit-manipulation instructions.  See the dependency-breaking
   sketch below.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_HASWELL | m_INTEL | m_GENERIC)
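/* Illustrative sketch (an addition for clarity): instructions such as
   popcnt/lzcnt/tzcnt only write their destination, yet some cores treat
   the destination as a source as well.  Clearing it first breaks the
   false dependency:

       xorl     %eax, %eax    # dependency-breaking idiom
       popcntl  %ecx, %eax

   so the popcnt no longer waits on the previous producer of %eax.  */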
/*****************************************************************************/
/* This never worked well before.                                            */
/*****************************************************************************/

/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
   on simulation result.  But after P4 was made, no performance benefit
   was observed with branch hints.  They also increase the code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0)

/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)

/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
   arithmetic to 32bit via the PROMOTE_MODE macro.  This code generation
   scheme is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)

/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
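/* Illustrative sketch (an addition for clarity): promote_qi_regs would
   carry out 8bit arithmetic such as

       addb    %bl, %al

   on the full registers instead,

       addl    %ebx, %eax     # only the low byte of the result is used

   mirroring RISC-style code generation; no current tuning enables it.  */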