1 /* Scheduler hooks for IA-32 which implement CPU specific logic. 2 Copyright (C) 1988-2018 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GCC is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GCC; see the file COPYING3. If not see 18 <http://www.gnu.org/licenses/>. */ 19 20 #define IN_TARGET_CODE 1 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "backend.h" 26 #include "rtl.h" 27 #include "tree.h" 28 #include "cfghooks.h" 29 #include "tm_p.h" 30 #include "insn-config.h" 31 #include "insn-attr.h" 32 #include "recog.h" 33 #include "target.h" 34 35 /* Return the maximum number of instructions a cpu can issue. 
*/ 36 37 int 38 ix86_issue_rate (void) 39 { 40 switch (ix86_tune) 41 { 42 case PROCESSOR_PENTIUM: 43 case PROCESSOR_LAKEMONT: 44 case PROCESSOR_BONNELL: 45 case PROCESSOR_SILVERMONT: 46 case PROCESSOR_KNL: 47 case PROCESSOR_KNM: 48 case PROCESSOR_INTEL: 49 case PROCESSOR_K6: 50 case PROCESSOR_BTVER2: 51 case PROCESSOR_PENTIUM4: 52 case PROCESSOR_NOCONA: 53 return 2; 54 55 case PROCESSOR_PENTIUMPRO: 56 case PROCESSOR_ATHLON: 57 case PROCESSOR_K8: 58 case PROCESSOR_AMDFAM10: 59 case PROCESSOR_BTVER1: 60 return 3; 61 62 case PROCESSOR_BDVER1: 63 case PROCESSOR_BDVER2: 64 case PROCESSOR_BDVER3: 65 case PROCESSOR_BDVER4: 66 case PROCESSOR_ZNVER1: 67 case PROCESSOR_CORE2: 68 case PROCESSOR_NEHALEM: 69 case PROCESSOR_SANDYBRIDGE: 70 case PROCESSOR_HASWELL: 71 case PROCESSOR_GENERIC: 72 return 4; 73 74 default: 75 return 1; 76 } 77 } 78 79 /* Return true iff USE_INSN has a memory address with operands set by 80 SET_INSN. */ 81 82 bool 83 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn) 84 { 85 int i; 86 extract_insn_cached (use_insn); 87 for (i = recog_data.n_operands - 1; i >= 0; --i) 88 if (MEM_P (recog_data.operand[i])) 89 { 90 rtx addr = XEXP (recog_data.operand[i], 0); 91 if (modified_in_p (addr, set_insn) != 0) 92 { 93 /* No AGI stall if SET_INSN is a push or pop and USE_INSN 94 has SP based memory (unless index reg is modified in a pop). 
*/ 95 rtx set = single_set (set_insn); 96 if (set 97 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))) 98 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set))))) 99 { 100 struct ix86_address parts; 101 if (ix86_decompose_address (addr, &parts) 102 && parts.base == stack_pointer_rtx 103 && (parts.index == NULL_RTX 104 || MEM_P (SET_DEST (set)) 105 || !modified_in_p (parts.index, set_insn))) 106 return false; 107 } 108 return true; 109 } 110 return false; 111 } 112 return false; 113 } 114 115 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set 116 by DEP_INSN and nothing set by DEP_INSN. */ 117 118 static bool 119 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type) 120 { 121 rtx set, set2; 122 123 /* Simplify the test for uninteresting insns. */ 124 if (insn_type != TYPE_SETCC 125 && insn_type != TYPE_ICMOV 126 && insn_type != TYPE_FCMOV 127 && insn_type != TYPE_IBR) 128 return false; 129 130 if ((set = single_set (dep_insn)) != 0) 131 { 132 set = SET_DEST (set); 133 set2 = NULL_RTX; 134 } 135 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL 136 && XVECLEN (PATTERN (dep_insn), 0) == 2 137 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET 138 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET) 139 { 140 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); 141 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); 142 } 143 else 144 return false; 145 146 if (!REG_P (set) || REGNO (set) != FLAGS_REG) 147 return false; 148 149 /* This test is true if the dependent insn reads the flags but 150 not any other potentially set register. */ 151 if (!reg_overlap_mentioned_p (set, PATTERN (insn))) 152 return false; 153 154 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn))) 155 return false; 156 157 return true; 158 } 159 160 /* Helper function for exact_store_load_dependency. 161 Return true if addr is found in insn. 
*/ 162 static bool 163 exact_dependency_1 (rtx addr, rtx insn) 164 { 165 enum rtx_code code; 166 const char *format_ptr; 167 int i, j; 168 169 code = GET_CODE (insn); 170 switch (code) 171 { 172 case MEM: 173 if (rtx_equal_p (addr, insn)) 174 return true; 175 break; 176 case REG: 177 CASE_CONST_ANY: 178 case SYMBOL_REF: 179 case CODE_LABEL: 180 case PC: 181 case CC0: 182 case EXPR_LIST: 183 return false; 184 default: 185 break; 186 } 187 188 format_ptr = GET_RTX_FORMAT (code); 189 for (i = 0; i < GET_RTX_LENGTH (code); i++) 190 { 191 switch (*format_ptr++) 192 { 193 case 'e': 194 if (exact_dependency_1 (addr, XEXP (insn, i))) 195 return true; 196 break; 197 case 'E': 198 for (j = 0; j < XVECLEN (insn, i); j++) 199 if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) 200 return true; 201 break; 202 } 203 } 204 return false; 205 } 206 207 /* Return true if there exists exact dependency for store & load, i.e. 208 the same memory address is used in them. */ 209 static bool 210 exact_store_load_dependency (rtx_insn *store, rtx_insn *load) 211 { 212 rtx set1, set2; 213 214 set1 = single_set (store); 215 if (!set1) 216 return false; 217 if (!MEM_P (SET_DEST (set1))) 218 return false; 219 set2 = single_set (load); 220 if (!set2) 221 return false; 222 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) 223 return true; 224 return false; 225 } 226 227 228 /* This function corrects the value of COST (latency) based on the relationship 229 between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength 230 DW. It should return the new value. 231 232 On x86 CPUs this is most commonly used to model the fact that valus of 233 registers used to compute address of memory operand needs to be ready 234 earlier than values of registers used in the actual operation. 
*/

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
		  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  /* Per-CPU latency adjustments; each case models one microarchitecture
     family's pipeline quirks.  */
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
	{
	  /* LEA computes its result in the address-generation stage, so
	     look at the SET_SRC of the (possibly PARALLEL-wrapped)
	     pattern rather than at memory operands.  */
	  rtx addr = PATTERN (insn);

	  if (GET_CODE (addr) == PARALLEL)
	    addr = XVECEXP (addr, 0, 0);

	  gcc_assert (GET_CODE (addr) == SET);

	  addr = SET_SRC (addr);
	  if (modified_in_p (addr, dep_insn))
	    cost += 1;
	}
      else if (ix86_agi_dependent (dep_insn, insn))
	cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
	cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
	  && get_attr_memory (insn) == MEMORY_STORE
	  && !ix86_agi_dependent (dep_insn, insn))
	cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
	  && (set = single_set (dep_insn)) != NULL_RTX
	  && (set2 = single_set (insn)) != NULL_RTX
	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
	  && MEM_P (SET_DEST (set2)))
	cost += 1;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves to take one cycle, as core can issue one load
	     at time and the next load can start cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 1)
	    cost--;
	}
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
	 the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves to take one cycle, as core can issue one load
	     at time and the next load can start cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 2)
	    cost -= 2;
	  else
	    cost = 1;
	}
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* Stack engine allows to execute push&pop instructions in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost = 3;

	  /* Because of the difference between the length of integer and
	     floating unit pipeline preparation stages, the memory operands
	     for floating point are cheaper.

	     ??? For Athlon it the difference is most probably 2.  */
	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 3;
	  else
	    loadcost = TARGET_ATHLON ? 2 : 0;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_ZNVER1:
      /* Stack engine allows to execute push&pop instructions in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost;

	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 4;
	  else
	    loadcost = 7;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* Stack engine allows to execute push&pop instructions in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  if (cost >= 4)
	    cost -= 4;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
	return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	{
	  enum attr_unit unit = get_attr_unit (dep_insn);
	  if (unit == UNIT_INTEGER && cost == 1)
	    {
	      if (memory == MEMORY_LOAD)
		cost = 3;
	      else
		{
		  /* Increase cost of ld/st for short int types only
		     because of store forwarding issue.  */
		  rtx set = single_set (dep_insn);
		  if (set && (GET_MODE (SET_DEST (set)) == QImode
			      || GET_MODE (SET_DEST (set)) == HImode))
		    {
		      /* Increase cost of store/load insn if exact
			 dependence exists and it is load insn.  */
		      enum attr_memory insn_memory = get_attr_memory (insn);
		      if (insn_memory == MEMORY_LOAD
			  && exact_store_load_dependency (dep_insn, insn))
			cost = 3;
		    }
		}
	    }
	}
      /* FALLTHRU to default, which makes no further adjustment.  */

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far
     as many instructions can be executed on a cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether current microarchitecture support macro fusion
   for insn pair "CONDGEN + CONDJMP".  Refer to
   "Intel Architectures Optimization Reference Manual".
*/

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;

  if (!any_condjump_p (condjmp))
    return false;

  /* CONDJMP must actually consume the condition code CONDGEN produces.  */
  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  /* Hoist the attribute lookup: it is used repeatedly below and is
     invariant for a given insn.  */
  enum attr_type condgen_type = get_attr_type (condgen);
  if (condgen_type != TYPE_TEST
      && condgen_type != TYPE_ICMP
      && condgen_type != TYPE_INCDEC
      && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX
      && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      /* CONDGEN is an ALU op that sets the flags as a side effect; dig the
	 COMPARE and the ALU SET out of its PARALLEL pattern.  */
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
	  {
	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
	    if (GET_CODE (set_src) == COMPARE)
	      compare_set = XVECEXP (pat, 0, i);
	    else
	      alu_set = XVECEXP (pat, 0, i);
	  }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0))
       && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1))
	  && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
	return false;
    }

  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether conditional jump use Sign or Overflow Flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE
	  || ccode == GT
	  || ccode == LE
	  || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST
      || condgen_type == TYPE_ICMP)
    return true;

  /* The following is the case that macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU
	  || ccode == GTU
	  || ccode == LEU
	  || ccode == LTU))
    return false;

  return true;
}