/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
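  /* Scan the operands of USE_INSN for a memory reference whose address
     uses something written by SET_INSN.  */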
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
	rtx addr = XEXP (recog_data.operand[i], 0);
	if (modified_in_p (addr, set_insn) != 0)
	  {
	    /* No AGI stall if SET_INSN is a push or pop and USE_INSN
	       has SP based memory (unless index reg is modified in a pop).  */
	    rtx set = single_set (set_insn);
	    if (set
		&& (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
		    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
	      {
		struct ix86_address parts;
		if (ix86_decompose_address (addr, &parts)
		    && parts.base == stack_pointer_rtx
		    && (parts.index == NULL_RTX
			|| MEM_P (SET_DEST (set))
			|| !modified_in_p (parts.index, set_insn)))
		  return false;
	      }
	    return true;
	  }
	return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing set by DEP_INSN.  */

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
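  /* Also handle a two-SET PARALLEL, e.g. an insn that sets both the flags
     and a result register; the code below expects the flags SET first.  */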
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
	   && XVECLEN (PATTERN (dep_insn), 0) == 2
	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
	return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

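  /* Recurse through the rest of the pattern, following 'e' (expression)
     and 'E' (vector) operands.  */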
  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
	{
	case 'e':
	  if (exact_dependency_1 (addr, XEXP (insn, i)))
	    return true;
	  break;
	case 'E':
	  for (j = 0; j < XVECLEN (insn, i); j++)
	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
	      return true;
	  break;
	}
    }
  return false;
}

/* Return true if there is an exact dependency between STORE and LOAD, i.e.
   the same memory address is used in both.  */
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}

/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */
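
/* For example, the Core 2 through generic cases below subtract up to four
   cycles from the latency of a dependence that feeds a load but not the
   load's address computation, modelling the out-of-order core hiding load
   latency behind the producing instruction.  */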

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
		  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
	{
	  rtx addr = PATTERN (insn);

	  if (GET_CODE (addr) == PARALLEL)
	    addr = XVECEXP (addr, 0, 0);

	  gcc_assert (GET_CODE (addr) == SET);

	  addr = SET_SRC (addr);
	  if (modified_in_p (addr, dep_insn))
	    cost += 1;
	}
      else if (ix86_agi_dependent (dep_insn, insn))
	cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
	cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
	  && get_attr_memory (insn) == MEMORY_STORE
	  && !ix86_agi_dependent (dep_insn, insn))
	cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
	  && (set = single_set (dep_insn)) != NULL_RTX
	  && (set2 = single_set (insn)) != NULL_RTX
	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
	  && MEM_P (SET_DEST (set2)))
	cost += 1;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 when the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves take one cycle, as the core can issue one load
	     at a time and the next load can start a cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 1)
	    cost--;
	}
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
	 the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 when the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves take one cycle, as the core can issue one load
	     at a time and the next load can start a cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 2)
	    cost -= 2;
	  else
	    cost = 1;
	}
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 when the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost = 3;

	  /* Because of the difference in length between the integer and
	     floating point unit pipeline preparation stages, the memory
	     operands for floating point are cheaper.

	     ??? For Athlon the difference is most probably 2.  */
	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 3;
	  else
	    loadcost = TARGET_ATHLON ? 2 : 0;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_ZNVER1:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 when the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost;

	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 4;
	  else
	    loadcost = 7;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 when the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  if (cost >= 4)
	    cost -= 4;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
	return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	{
	  enum attr_unit unit = get_attr_unit (dep_insn);
	  if (unit == UNIT_INTEGER && cost == 1)
	    {
	      if (memory == MEMORY_LOAD)
		cost = 3;
	      else
		{
		  /* Increase cost of ld/st for short int types only
		     because of the store forwarding issue.  */
		  rtx set = single_set (dep_insn);
		  if (set && (GET_MODE (SET_DEST (set)) == QImode
			      || GET_MODE (SET_DEST (set)) == HImode))
		    {
		      /* Increase cost of the store/load insn pair if an
			 exact dependence exists and INSN is the load.  */
		      enum attr_memory insn_memory = get_attr_memory (insn);
		      if (insn_memory == MEMORY_LOAD
			  && exact_store_load_dependency (dep_insn, insn))
			cost = 3;
		    }
		}
	    }
	}
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as
     the number of instructions that can be executed in a cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  if (get_attr_type (condgen) != TYPE_TEST
      && get_attr_type (condgen) != TYPE_ICMP
      && get_attr_type (condgen) != TYPE_INCDEC
      && get_attr_type (condgen) != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX
      && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

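  /* CONDGEN has no single SET, so it is a PARALLEL (e.g. an ALU op that
     also sets the flags); pick out the COMPARE SET and the ALU SET.  */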
  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
	  {
	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
	    if (GET_CODE (set_src) == COMPARE)
	      compare_set = XVECEXP (pat, 0, i);
	    else
	      alu_set = XVECEXP (pat, 0, i);
	  }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0))
       && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1))
	  && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
	return false;
    }

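  /* Look at the condition code used by the conditional jump.  */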
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE
	  || ccode == GT
	  || ccode == LE
	  || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (get_attr_type (condgen) == TYPE_TEST
      || get_attr_type (condgen) == TYPE_ICMP)
    return true;

  /* The following handles the case of macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (get_attr_type (condgen) == TYPE_INCDEC
      && (ccode == GEU
	  || ccode == GTU
	  || ccode == LEU
	  || ccode == LTU))
    return false;

  return true;
}