1 /* Basic block reordering routines for the GNU compiler.
2    Copyright (C) 2000-2018 Free Software Foundation, Inc.
3 
4    This file is part of GCC.
5 
6    GCC is free software; you can redistribute it and/or modify it
7    under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    GCC is distributed in the hope that it will be useful, but WITHOUT
12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
14    License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with GCC; see the file COPYING3.  If not see
18    <http://www.gnu.org/licenses/>.  */
19 
20 /* This file contains the "reorder blocks" pass, which changes the control
21    flow of a function to encounter fewer branches; the "partition blocks"
22    pass, which divides the basic blocks into "hot" and "cold" partitions,
23    which are kept separate; and the "duplicate computed gotos" pass, which
24    duplicates blocks ending in an indirect jump.
25 
26    There are two algorithms for "reorder blocks": the "simple" algorithm,
27    which just rearranges blocks, trying to minimize the number of executed
28    unconditional branches; and the "software trace cache" algorithm, which
29    also copies code, and in general tries a lot harder to have long linear
30    pieces of machine code executed.  This algorithm is described next.  */
31 
32 /* This (greedy) algorithm constructs traces in several rounds.
33    The construction starts from "seeds".  The seed for the first round
34    is the entry point of the function.  When there is more than one seed,
35    the one with the lowest key in the heap is selected first (see bb_to_key).
36    Then the algorithm repeatedly adds the most probable successor to the end
37    of a trace.  Finally it connects the traces.
38 
39    There are two parameters: Branch Threshold and Exec Threshold.
40    If the probability of an edge to a successor of the current basic block is
41    lower than Branch Threshold or its count is lower than Exec Threshold,
42    then the successor will be the seed in one of the next rounds.
43    Each round has these parameters lower than the previous one.
44    The last round has to have these parameters set to zero so that the
45    remaining blocks are picked up.
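   As a rough worked illustration (using the default per-mille thresholds
   defined later in this file, {400, 200, 100, 0, 0} for branches): an edge
   taken with probability 30% falls below the first-round Branch Threshold
   of 40%, so its destination becomes a seed for a later round, but it is
   above the second-round threshold of 20% and so can extend a trace there.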
46 
47    The algorithm selects the most probable successor from all unvisited
48    successors and successors that have been added to this trace.
49    The other successors (that have not been "sent" to the next round) will be
50    other seeds for this round and the secondary traces will start from them.
51    If the successor has not been visited in this trace, it is added to the
52    trace (however, there is some heuristic for simple branches).
53    If the successor has been visited in this trace, a loop has been found.
54    If the loop has many iterations, the loop is rotated so that the source
55    block of the most probable edge going out of the loop is the last block
56    of the trace.
57    If the loop has few iterations and there is no edge from the last block of
58    the loop going out of the loop, the loop header is duplicated.
59 
60    When connecting traces, the algorithm first checks whether there is an edge
61    from the last block of a trace to the first block of another trace.
62    When there are still some unconnected traces it checks whether there exists
63    a basic block BB such that BB is a successor of the last block of a trace
64    and BB is a predecessor of the first block of another trace.  In this case,
65    BB is duplicated, added at the end of the first trace and the traces are
66    connected through it.
67    The remaining traces are simply concatenated, so there will be a jump to
68    the beginning of each of them.
69 
70    The above description is for the full algorithm, which is used when the
71    function is optimized for speed.  When the function is optimized for size,
72    in order to reduce long jumps and connect more fallthru edges, the
73    algorithm is modified as follows:
74    (1) Break long traces into short ones.  A trace is broken at a block that
75    has multiple predecessors/successors during trace discovery.  When connecting
76    traces, only connect Trace n with Trace n + 1.  This change reduces most
77    long jumps compared with the above algorithm.
78    (2) Ignore the edge probability and count for fallthru edges.
79    (3) Keep the original order of blocks when there is no chance to fall
80    through.  We rely on the results of cfg_cleanup.
81 
82    To implement the change for code size optimization, the block's index is
83    used as the key and all traces are found in one round.
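   Putting the above together, an illustrative sketch only of the full (speed)
   algorithm; the real implementation below differs in many details:

     for each round (with decreasing Branch and Exec Thresholds):
       while the seed heap is not empty:
         take the block with the lowest key (see bb_to_key);
         if it is too cold for this round, push it to the next round's heap;
         otherwise start a new trace and repeatedly append the most probable
           suitable successor, handling loops by rotation or duplication;
         non-selected successors become seeds for this or a later round.
     finally connect_traces chains the traces, duplicating a block where that
       lets two traces be joined by a fallthru path.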
84 
85    References:
86 
87    "Software Trace Cache"
88    A. Ramirez, J. Larriba-Pey, C. Navarro, J. Torrellas and M. Valero; 1999
89    http://citeseer.nj.nec.com/15361.html
90 
91 */
92 
93 #include "config.h"
94 #define INCLUDE_ALGORITHM /* stable_sort */
95 #include "system.h"
96 #include "coretypes.h"
97 #include "backend.h"
98 #include "target.h"
99 #include "rtl.h"
100 #include "tree.h"
101 #include "cfghooks.h"
102 #include "df.h"
103 #include "memmodel.h"
104 #include "optabs.h"
105 #include "regs.h"
106 #include "emit-rtl.h"
107 #include "output.h"
108 #include "expr.h"
109 #include "params.h"
110 #include "tree-pass.h"
111 #include "cfgrtl.h"
112 #include "cfganal.h"
113 #include "cfgbuild.h"
114 #include "cfgcleanup.h"
115 #include "bb-reorder.h"
116 #include "except.h"
117 #include "fibonacci_heap.h"
118 #include "stringpool.h"
119 #include "attribs.h"
120 #include "common/common-target.h"
121 
122 /* The number of rounds.  In most cases there will only be 4 rounds, but
123    when partitioning hot and cold basic blocks into separate sections of
124    the object file there will be an extra round.  */
125 #define N_ROUNDS 5
126 
127 struct target_bb_reorder default_target_bb_reorder;
128 #if SWITCHABLE_TARGET
129 struct target_bb_reorder *this_target_bb_reorder = &default_target_bb_reorder;
130 #endif
131 
132 #define uncond_jump_length \
133   (this_target_bb_reorder->x_uncond_jump_length)
134 
135 /* Branch thresholds in thousandths (per mille) of the REG_BR_PROB_BASE.  */
136 static const int branch_threshold[N_ROUNDS] = {400, 200, 100, 0, 0};
137 
138 /* Exec thresholds in thousandths (per mille) of the count of bb 0.  */
139 static const int exec_threshold[N_ROUNDS] = {500, 200, 50, 0, 0};
140 
141 /* If edge count is lower than DUPLICATION_THRESHOLD per mille of entry
142    block the edge destination is not duplicated while connecting traces.  */
143 #define DUPLICATION_THRESHOLD 100
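/* For instance, with the value above an edge must carry at least 10%
   (100 / 1000) of max_entry_count before connect_traces will consider
   duplicating anything bigger than a trivially small block.  */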
144 
145 typedef fibonacci_heap <long, basic_block_def> bb_heap_t;
146 typedef fibonacci_node <long, basic_block_def> bb_heap_node_t;
147 
148 /* Structure to hold needed information for each basic block.  */
149 struct bbro_basic_block_data
150 {
151   /* Which trace is the bb start of (-1 means it is not a start of any).  */
152   int start_of_trace;
153 
154   /* Which trace is the bb end of (-1 means it is not an end of any).  */
155   int end_of_trace;
156 
157   /* Which trace is the bb in?  */
158   int in_trace;
159 
160   /* Which trace was this bb visited in?  */
161   int visited;
162 
163   /* Cached maximum frequency of interesting incoming edges.
164      Minus one means not yet computed.  */
165   int priority;
166 
167   /* Which heap is BB in (if any)?  */
168   bb_heap_t *heap;
169 
170   /* Which heap node is BB in (if any)?  */
171   bb_heap_node_t *node;
172 };
173 
174 /* The current size of the following dynamic array.  */
175 static int array_size;
176 
177 /* The array which holds needed information for basic blocks.  */
178 static bbro_basic_block_data *bbd;
179 
180 /* To avoid frequent reallocation, the size of the array is greater than
181    needed: the number of elements is (not less than) 1.25 * size_wanted.  */
182 #define GET_ARRAY_SIZE(X) ((((X) / 4) + 1) * 5)
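/* A worked example of the macro above: GET_ARRAY_SIZE (100) expands to
   ((100 / 4) + 1) * 5 = 130, which is at least 1.25 * 100 = 125.  */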
183 
184 /* Free the memory and set the pointer to NULL.  */
185 #define FREE(P) (gcc_assert (P), free (P), P = 0)
186 
187 /* Structure for holding information about a trace.  */
188 struct trace
189 {
190   /* First and last basic block of the trace.  */
191   basic_block first, last;
192 
193   /* The round of the STC creation which this trace was found in.  */
194   int round;
195 
196   /* The length (i.e. the number of basic blocks) of the trace.  */
197   int length;
198 };
199 
200 /* Maximum count of one of the entry blocks.  */
201 static profile_count max_entry_count;
202 
203 /* Local function prototypes.  */
204 static void find_traces_1_round (int, profile_count, struct trace *, int *,
205 				 int, bb_heap_t **, int);
206 static basic_block copy_bb (basic_block, edge, basic_block, int);
207 static long bb_to_key (basic_block);
208 static bool better_edge_p (const_basic_block, const_edge, profile_probability,
209 			   profile_count, profile_probability, profile_count,
210 			   const_edge);
211 static bool copy_bb_p (const_basic_block, int);
212 
213 /* Return the trace number in which BB was visited.  */
214 
215 static int
216 bb_visited_trace (const_basic_block bb)
217 {
218   gcc_assert (bb->index < array_size);
219   return bbd[bb->index].visited;
220 }
221 
222 /* This function marks BB that it was visited in trace number TRACE.  */
223 
224 static void
225 mark_bb_visited (basic_block bb, int trace)
226 {
227   bbd[bb->index].visited = trace;
228   if (bbd[bb->index].heap)
229     {
230       bbd[bb->index].heap->delete_node (bbd[bb->index].node);
231       bbd[bb->index].heap = NULL;
232       bbd[bb->index].node = NULL;
233     }
234 }
235 
236 /* Check to see if bb should be pushed into the next round of trace
237    collections or not.  Reasons for pushing the block forward are:
238    1) the block is cold, we are doing partitioning, and there will be
239    another round (cold partition blocks are not supposed to be
240    collected into traces until the very last round); or 2) there will
241    be another round, and the basic block is not "hot enough" for the
242    current round of trace collection.  */
243 
244 static bool
245 push_to_next_round_p (const_basic_block bb, int round, int number_of_rounds,
246 		      profile_count count_th)
247 {
248   bool there_exists_another_round;
249   bool block_not_hot_enough;
250 
251   there_exists_another_round = round < number_of_rounds - 1;
252 
253   block_not_hot_enough = (bb->count < count_th
254 			  || probably_never_executed_bb_p (cfun, bb));
255 
256   if (there_exists_another_round
257       && block_not_hot_enough)
258     return true;
259   else
260     return false;
261 }
262 
263 /* Find the traces for Software Trace Cache.  Chain each trace through
264    the basic blocks' aux fields.  Store the number of traces in *N_TRACES and
265    the descriptions of the traces in TRACES.  */
266 
267 static void
268 find_traces (int *n_traces, struct trace *traces)
269 {
270   int i;
271   int number_of_rounds;
272   edge e;
273   edge_iterator ei;
274   bb_heap_t *heap = new bb_heap_t (LONG_MIN);
275 
276   /* Add one extra round of trace collection when partitioning hot/cold
277      basic blocks into separate sections.  The last round is for all the
278      cold blocks (and ONLY the cold blocks).  */
279 
280   number_of_rounds = N_ROUNDS - 1;
281 
282   /* Insert entry points of function into heap.  */
283   max_entry_count = profile_count::zero ();
284   FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR_FOR_FN (cfun)->succs)
285     {
286       bbd[e->dest->index].heap = heap;
287       bbd[e->dest->index].node = heap->insert (bb_to_key (e->dest), e->dest);
288       if (e->dest->count > max_entry_count)
289 	max_entry_count = e->dest->count;
290     }
291 
292   /* Find the traces.  */
293   for (i = 0; i < number_of_rounds; i++)
294     {
295       profile_count count_threshold;
296 
297       if (dump_file)
298 	fprintf (dump_file, "STC - round %d\n", i + 1);
299 
300       count_threshold = max_entry_count.apply_scale (exec_threshold[i], 1000);
301 
302       find_traces_1_round (REG_BR_PROB_BASE * branch_threshold[i] / 1000,
303 			   count_threshold, traces, n_traces, i, &heap,
304 			   number_of_rounds);
305     }
306   delete heap;
307 
308   if (dump_file)
309     {
310       for (i = 0; i < *n_traces; i++)
311 	{
312 	  basic_block bb;
313 	  fprintf (dump_file, "Trace %d (round %d):  ", i + 1,
314 		   traces[i].round + 1);
315 	  for (bb = traces[i].first;
316 	       bb != traces[i].last;
317 	       bb = (basic_block) bb->aux)
318 	    {
319 	      fprintf (dump_file, "%d [", bb->index);
320 	      bb->count.dump (dump_file);
321 	      fprintf (dump_file, "] ");
322 	    }
323 	  fprintf (dump_file, "%d [", bb->index);
324 	  bb->count.dump (dump_file);
325 	  fprintf (dump_file, "]\n");
326 	}
327       fflush (dump_file);
328     }
329 }
330 
331 /* Rotate loop whose back edge is BACK_EDGE in the tail of trace TRACE
332    (with sequential number TRACE_N).  */
333 
334 static basic_block
335 rotate_loop (edge back_edge, struct trace *trace, int trace_n)
336 {
337   basic_block bb;
338 
339   /* Information about the best end (end after rotation) of the loop.  */
340   basic_block best_bb = NULL;
341   edge best_edge = NULL;
342   profile_count best_count = profile_count::uninitialized ();
343   /* The best edge is preferred when its destination is not visited yet
344      or is a start block of some trace.  */
345   bool is_preferred = false;
346 
347   /* Find the most frequent edge that goes out from current trace.  */
348   bb = back_edge->dest;
349   do
350     {
351       edge e;
352       edge_iterator ei;
353 
354       FOR_EACH_EDGE (e, ei, bb->succs)
355 	if (e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
356 	    && bb_visited_trace (e->dest) != trace_n
357 	    && (e->flags & EDGE_CAN_FALLTHRU)
358 	    && !(e->flags & EDGE_COMPLEX))
359 	{
360 	  if (is_preferred)
361 	    {
362 	      /* The best edge is preferred.  */
363 	      if (!bb_visited_trace (e->dest)
364 		  || bbd[e->dest->index].start_of_trace >= 0)
365 		{
366 		  /* The current edge E is also preferred.  */
367 		  if (e->count () > best_count)
368 		    {
369 		      best_count = e->count ();
370 		      best_edge = e;
371 		      best_bb = bb;
372 		    }
373 		}
374 	    }
375 	  else
376 	    {
377 	      if (!bb_visited_trace (e->dest)
378 		  || bbd[e->dest->index].start_of_trace >= 0)
379 		{
380 		  /* The current edge E is preferred.  */
381 		  is_preferred = true;
382 		  best_count = e->count ();
383 		  best_edge = e;
384 		  best_bb = bb;
385 		}
386 	      else
387 		{
388 		  if (!best_edge || e->count () > best_count)
389 		    {
390 		      best_count = e->count ();
391 		      best_edge = e;
392 		      best_bb = bb;
393 		    }
394 		}
395 	    }
396 	}
397       bb = (basic_block) bb->aux;
398     }
399   while (bb != back_edge->dest);
400 
401   if (best_bb)
402     {
403       /* Rotate the loop so that the BEST_EDGE goes out from the last block of
404 	 the trace.  */
405       if (back_edge->dest == trace->first)
406 	{
407 	  trace->first = (basic_block) best_bb->aux;
408 	}
409       else
410 	{
411 	  basic_block prev_bb;
412 
413 	  for (prev_bb = trace->first;
414 	       prev_bb->aux != back_edge->dest;
415 	       prev_bb = (basic_block) prev_bb->aux)
416 	    ;
417 	  prev_bb->aux = best_bb->aux;
418 
419 	  /* Try to get rid of uncond jump to cond jump.  */
420 	  if (single_succ_p (prev_bb))
421 	    {
422 	      basic_block header = single_succ (prev_bb);
423 
424 	      /* Duplicate HEADER if it is a small block containing cond jump
425 		 in the end.  */
426 	      if (any_condjump_p (BB_END (header)) && copy_bb_p (header, 0)
427 		  && !CROSSING_JUMP_P (BB_END (header)))
428 		copy_bb (header, single_succ_edge (prev_bb), prev_bb, trace_n);
429 	    }
430 	}
431     }
432   else
433     {
434       /* We have not found a suitable loop tail, so do no rotation.  */
435       best_bb = back_edge->src;
436     }
437   best_bb->aux = NULL;
438   return best_bb;
439 }
440 
441 /* One round of finding traces.  Find traces for BRANCH_TH and COUNT_TH, i.e.
442    do not include basic blocks whose probability is lower than BRANCH_TH or
443    whose count is lower than COUNT_TH into traces.  Store the new traces into
444    TRACES and modify the number of
445    traces *N_TRACES.  Set the round (which the trace belongs to) to ROUND.
446    The function expects starting basic blocks to be in *HEAP and will delete
447    *HEAP and store starting points for the next round into new *HEAP.  */
448 
449 static void
450 find_traces_1_round (int branch_th, profile_count count_th,
451 		     struct trace *traces, int *n_traces, int round,
452 		     bb_heap_t **heap, int number_of_rounds)
453 {
454   /* Heap for discarded basic blocks which are possible starting points for
455      the next round.  */
456   bb_heap_t *new_heap = new bb_heap_t (LONG_MIN);
457   bool for_size = optimize_function_for_size_p (cfun);
458 
459   while (!(*heap)->empty ())
460     {
461       basic_block bb;
462       struct trace *trace;
463       edge best_edge, e;
464       long key;
465       edge_iterator ei;
466 
467       bb = (*heap)->extract_min ();
468       bbd[bb->index].heap = NULL;
469       bbd[bb->index].node = NULL;
470 
471       if (dump_file)
472 	fprintf (dump_file, "Getting bb %d\n", bb->index);
473 
474       /* If the BB's count is too low, send BB to the next round.  When
475 	 partitioning hot/cold blocks into separate sections, make sure all
476 	 the cold blocks (and ONLY the cold blocks) go into the (extra) final
477 	 round.  When optimizing for size, do not push to next round.  */
478 
479       if (!for_size
480 	  && push_to_next_round_p (bb, round, number_of_rounds,
481 				   count_th))
482 	{
483 	  int key = bb_to_key (bb);
484 	  bbd[bb->index].heap = new_heap;
485 	  bbd[bb->index].node = new_heap->insert (key, bb);
486 
487 	  if (dump_file)
488 	    fprintf (dump_file,
489 		     "  Possible start point of next round: %d (key: %d)\n",
490 		     bb->index, key);
491 	  continue;
492 	}
493 
494       trace = traces + *n_traces;
495       trace->first = bb;
496       trace->round = round;
497       trace->length = 0;
498       bbd[bb->index].in_trace = *n_traces;
499       (*n_traces)++;
500 
501       do
502 	{
503 	  bool ends_in_call;
504 
505 	  /* The probability and count of the best edge.  */
506 	  profile_probability best_prob = profile_probability::uninitialized ();
507 	  profile_count best_count = profile_count::uninitialized ();
508 
509 	  best_edge = NULL;
510 	  mark_bb_visited (bb, *n_traces);
511 	  trace->length++;
512 
513 	  if (dump_file)
514 	    fprintf (dump_file, "Basic block %d was visited in trace %d\n",
515 		     bb->index, *n_traces);
516 
517 	  ends_in_call = block_ends_with_call_p (bb);
518 
519 	  /* Select the successor that will be placed after BB.  */
520 	  FOR_EACH_EDGE (e, ei, bb->succs)
521 	    {
522 	      gcc_assert (!(e->flags & EDGE_FAKE));
523 
524 	      if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
525 		continue;
526 
527 	      if (bb_visited_trace (e->dest)
528 		  && bb_visited_trace (e->dest) != *n_traces)
529 		continue;
530 
531 	      /* If partitioning hot/cold basic blocks, don't consider edges
532 		 that cross section boundaries.  */
533 	      if (BB_PARTITION (e->dest) != BB_PARTITION (bb))
534 		continue;
535 
536 	      profile_probability prob = e->probability;
537 	      profile_count count = e->dest->count;
538 
539 	      /* The only sensible preference for a call instruction is the
540 		 fallthru edge.  Don't bother selecting anything else.  */
541 	      if (ends_in_call)
542 		{
543 		  if (e->flags & EDGE_CAN_FALLTHRU)
544 		    {
545 		      best_edge = e;
546 		      best_prob = prob;
547 		      best_count = count;
548 		    }
549 		  continue;
550 		}
551 
552 	      /* An edge that cannot be a fallthru edge, or an improbable or
553 		 infrequent successor (i.e. an unsuitable successor).  When
554 		 optimizing for size, ignore the probability and count.  */
555 	      if (!(e->flags & EDGE_CAN_FALLTHRU) || (e->flags & EDGE_COMPLEX)
556 		  || !prob.initialized_p ()
557 		  || ((prob.to_reg_br_prob_base () < branch_th
558 		      || e->count () < count_th) && (!for_size)))
559 		continue;
560 
561 	      if (better_edge_p (bb, e, prob, count, best_prob, best_count,
562 				 best_edge))
563 		{
564 		  best_edge = e;
565 		  best_prob = prob;
566 		  best_count = count;
567 		}
568 	    }
569 
570 	  /* If the best destination has multiple predecessors and can be
571 	     duplicated cheaper than a jump, don't allow it to be added to
572 	     a trace; we'll duplicate it when connecting the traces later.
573 	     However, we need to check that this duplication wouldn't leave
574 	     the best destination with only crossing predecessors, because
575 	     this would change its effective partition from hot to cold.  */
576 	  if (best_edge
577 	      && EDGE_COUNT (best_edge->dest->preds) >= 2
578 	      && copy_bb_p (best_edge->dest, 0))
579 	    {
580 	      bool only_crossing_preds = true;
581 	      edge e;
582 	      edge_iterator ei;
583 	      FOR_EACH_EDGE (e, ei, best_edge->dest->preds)
584 		if (e != best_edge && !(e->flags & EDGE_CROSSING))
585 		  {
586 		    only_crossing_preds = false;
587 		    break;
588 		  }
589 	      if (!only_crossing_preds)
590 		best_edge = NULL;
591 	    }
592 
593 	  /* If the best destination has multiple successors or predecessors,
594 	     don't allow it to be added when optimizing for size.  This makes
595 	     sure predecessors with a smaller index are handled before the best
596 	     destination.  It breaks long traces and reduces long jumps.
597 
598 	     Take if-then-else as an example.
599 		A
600 	       / \
601 	      B   C
602 	       \ /
603 		D
604 	     If we do not remove the best edge B->D/C->D, the final order might
605 	     be A B D ... C.  C is at the end of the program.  If D and D's
606 	     successors are complicated, long jumps might be needed for A->C
607 	     and C->D.  The order A C D ... B has a similar issue.
608 
609 	     After removing the best edge, the final result will be A B C D or
610 	     A C B D.  It does not add a jump compared with the previous order,
611 	     but it reduces the possibility of long jumps.  */
612 	  if (best_edge && for_size
613 	      && (EDGE_COUNT (best_edge->dest->succs) > 1
614 		 || EDGE_COUNT (best_edge->dest->preds) > 1))
615 	    best_edge = NULL;
616 
617 	  /* Add all non-selected successors to the heaps.  */
618 	  FOR_EACH_EDGE (e, ei, bb->succs)
619 	    {
620 	      if (e == best_edge
621 		  || e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)
622 		  || bb_visited_trace (e->dest))
623 		continue;
624 
625 	      key = bb_to_key (e->dest);
626 
627 	      if (bbd[e->dest->index].heap)
628 		{
629 		  /* E->DEST is already in some heap.  */
630 		  if (key != bbd[e->dest->index].node->get_key ())
631 		    {
632 		      if (dump_file)
633 			{
634 			  fprintf (dump_file,
635 				   "Changing key for bb %d from %ld to %ld.\n",
636 				   e->dest->index,
637 				   (long) bbd[e->dest->index].node->get_key (),
638 				   key);
639 			}
640 		      bbd[e->dest->index].heap->replace_key
641 		        (bbd[e->dest->index].node, key);
642 		    }
643 		}
644 	      else
645 		{
646 		  bb_heap_t *which_heap = *heap;
647 
648 		  profile_probability prob = e->probability;
649 
650 		  if (!(e->flags & EDGE_CAN_FALLTHRU)
651 		      || (e->flags & EDGE_COMPLEX)
652 		      || !prob.initialized_p ()
653 		      || prob.to_reg_br_prob_base () < branch_th
654 		      || e->count () < count_th)
655 		    {
656 		      /* When partitioning hot/cold basic blocks, make sure
657 			 the cold blocks (and only the cold blocks) all get
658 			 pushed to the last round of trace collection.  When
659 			 optimizing for size, do not push to next round.  */
660 
661 		      if (!for_size && push_to_next_round_p (e->dest, round,
662 							     number_of_rounds,
663 							     count_th))
664 			which_heap = new_heap;
665 		    }
666 
667 		  bbd[e->dest->index].heap = which_heap;
668 		  bbd[e->dest->index].node = which_heap->insert (key, e->dest);
669 
670 		  if (dump_file)
671 		    {
672 		      fprintf (dump_file,
673 			       "  Possible start of %s round: %d (key: %ld)\n",
674 			       (which_heap == new_heap) ? "next" : "this",
675 			       e->dest->index, (long) key);
676 		    }
677 
678 		}
679 	    }
680 
681 	  if (best_edge) /* Suitable successor was found.  */
682 	    {
683 	      if (bb_visited_trace (best_edge->dest) == *n_traces)
684 		{
685 		  /* We do nothing with one-basic-block loops.  */
686 		  if (best_edge->dest != bb)
687 		    {
688 		      if (best_edge->count ()
689 			  > best_edge->dest->count.apply_scale (4, 5))
690 			{
691 			  /* The loop has at least 4 iterations.  If the loop
692 			     header is not the first block of the function
693 			     we can rotate the loop.  */
694 
695 			  if (best_edge->dest
696 			      != ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb)
697 			    {
698 			      if (dump_file)
699 				{
700 				  fprintf (dump_file,
701 					   "Rotating loop %d - %d\n",
702 					   best_edge->dest->index, bb->index);
703 				}
704 			      bb->aux = best_edge->dest;
705 			      bbd[best_edge->dest->index].in_trace =
706 							     (*n_traces) - 1;
707 			      bb = rotate_loop (best_edge, trace, *n_traces);
708 			    }
709 			}
710 		      else
711 			{
712 			  /* The loop has fewer than 4 iterations.  */
713 
714 			  if (single_succ_p (bb)
715 			      && copy_bb_p (best_edge->dest,
716 			      		    optimize_edge_for_speed_p
717 			      		    (best_edge)))
718 			    {
719 			      bb = copy_bb (best_edge->dest, best_edge, bb,
720 					    *n_traces);
721 			      trace->length++;
722 			    }
723 			}
724 		    }
725 
726 		  /* Terminate the trace.  */
727 		  break;
728 		}
729 	      else
730 		{
731 		  /* Check for a situation
732 
733 		    A
734 		   /|
735 		  B |
736 		   \|
737 		    C
738 
739 		  where
740 		  AB->count () + BC->count () >= AC->count ().
741 		  (i.e. 2 * B->count >= AC->count )
742 		  Best ordering is then A B C.
743 
744 		  When optimizing for size, A B C is always the best order.
745 
746 		  This situation is created for example by:
747 
748 		  if (A) B;
749 		  C;
750 
751 		  */
752 
753 		  FOR_EACH_EDGE (e, ei, bb->succs)
754 		    if (e != best_edge
755 			&& (e->flags & EDGE_CAN_FALLTHRU)
756 			&& !(e->flags & EDGE_COMPLEX)
757 			&& !bb_visited_trace (e->dest)
758 			&& single_pred_p (e->dest)
759 			&& !(e->flags & EDGE_CROSSING)
760 			&& single_succ_p (e->dest)
761 			&& (single_succ_edge (e->dest)->flags
762 			    & EDGE_CAN_FALLTHRU)
763 			&& !(single_succ_edge (e->dest)->flags & EDGE_COMPLEX)
764 			&& single_succ (e->dest) == best_edge->dest
765 			&& (e->dest->count.apply_scale (2, 1)
766 			    >= best_edge->count () || for_size))
767 		      {
768 			best_edge = e;
769 			if (dump_file)
770 			  fprintf (dump_file, "Selecting BB %d\n",
771 				   best_edge->dest->index);
772 			break;
773 		      }
774 
775 		  bb->aux = best_edge->dest;
776 		  bbd[best_edge->dest->index].in_trace = (*n_traces) - 1;
777 		  bb = best_edge->dest;
778 		}
779 	    }
780 	}
781       while (best_edge);
782       trace->last = bb;
783       bbd[trace->first->index].start_of_trace = *n_traces - 1;
784       if (bbd[trace->last->index].end_of_trace != *n_traces - 1)
785 	{
786 	  bbd[trace->last->index].end_of_trace = *n_traces - 1;
787 	  /* Update the cached maximum frequency for interesting predecessor
788 	     edges for successors of the new trace end.  */
789 	  FOR_EACH_EDGE (e, ei, trace->last->succs)
790 	    if (EDGE_FREQUENCY (e) > bbd[e->dest->index].priority)
791 	      bbd[e->dest->index].priority = EDGE_FREQUENCY (e);
792 	}
793 
794       /* The trace is terminated so we have to recount the keys in heap
795 	 (some block can have a lower key because now one of its predecessors
796 	 is an end of the trace).  */
797       FOR_EACH_EDGE (e, ei, bb->succs)
798 	{
799 	  if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)
800 	      || bb_visited_trace (e->dest))
801 	    continue;
802 
803 	  if (bbd[e->dest->index].heap)
804 	    {
805 	      key = bb_to_key (e->dest);
806 	      if (key != bbd[e->dest->index].node->get_key ())
807 		{
808 		  if (dump_file)
809 		    {
810 		      fprintf (dump_file,
811 			       "Changing key for bb %d from %ld to %ld.\n",
812 			       e->dest->index,
813 			       (long) bbd[e->dest->index].node->get_key (), key);
814 		    }
815 		  bbd[e->dest->index].heap->replace_key
816 		    (bbd[e->dest->index].node, key);
817 		}
818 	    }
819 	}
820     }
821 
822   delete (*heap);
823 
824   /* "Return" the new heap.  */
825   *heap = new_heap;
826 }
827 
828 /* Create a duplicate of the basic block OLD_BB and redirect edge E to it, add
829    it to the trace after BB, mark OLD_BB visited and update the pass's data
830    structures (TRACE is the number of the trace OLD_BB is duplicated to).  */
831 
832 static basic_block
833 copy_bb (basic_block old_bb, edge e, basic_block bb, int trace)
834 {
835   basic_block new_bb;
836 
837   new_bb = duplicate_block (old_bb, e, bb);
838   BB_COPY_PARTITION (new_bb, old_bb);
839 
840   gcc_assert (e->dest == new_bb);
841 
842   if (dump_file)
843     fprintf (dump_file,
844 	     "Duplicated bb %d (created bb %d)\n",
845 	     old_bb->index, new_bb->index);
846 
847   if (new_bb->index >= array_size
848       || last_basic_block_for_fn (cfun) > array_size)
849     {
850       int i;
851       int new_size;
852 
853       new_size = MAX (last_basic_block_for_fn (cfun), new_bb->index + 1);
854       new_size = GET_ARRAY_SIZE (new_size);
855       bbd = XRESIZEVEC (bbro_basic_block_data, bbd, new_size);
856       for (i = array_size; i < new_size; i++)
857 	{
858 	  bbd[i].start_of_trace = -1;
859 	  bbd[i].end_of_trace = -1;
860 	  bbd[i].in_trace = -1;
861 	  bbd[i].visited = 0;
862 	  bbd[i].priority = -1;
863 	  bbd[i].heap = NULL;
864 	  bbd[i].node = NULL;
865 	}
866       array_size = new_size;
867 
868       if (dump_file)
869 	{
870 	  fprintf (dump_file,
871 		   "Growing the dynamic array to %d elements.\n",
872 		   array_size);
873 	}
874     }
875 
876   gcc_assert (!bb_visited_trace (e->dest));
877   mark_bb_visited (new_bb, trace);
878   new_bb->aux = bb->aux;
879   bb->aux = new_bb;
880 
881   bbd[new_bb->index].in_trace = trace;
882 
883   return new_bb;
884 }
885 
886 /* Compute and return the key (for the heap) of the basic block BB.  */
887 
888 static long
889 bb_to_key (basic_block bb)
890 {
891   edge e;
892   edge_iterator ei;
893 
894   /* Use index as key to align with its original order.  */
895   if (optimize_function_for_size_p (cfun))
896     return bb->index;
897 
898   /* Do not start in probably never executed blocks.  */
899 
900   if (BB_PARTITION (bb) == BB_COLD_PARTITION
901       || probably_never_executed_bb_p (cfun, bb))
902     return BB_FREQ_MAX;
903 
904   /* Prefer blocks whose predecessor is an end of some trace
905      or whose predecessor edge is EDGE_DFS_BACK.  */
906   int priority = bbd[bb->index].priority;
907   if (priority == -1)
908     {
909       priority = 0;
910       FOR_EACH_EDGE (e, ei, bb->preds)
911 	{
912 	  if ((e->src != ENTRY_BLOCK_PTR_FOR_FN (cfun)
913 	       && bbd[e->src->index].end_of_trace >= 0)
914 	      || (e->flags & EDGE_DFS_BACK))
915 	    {
916 	      int edge_freq = EDGE_FREQUENCY (e);
917 
918 	      if (edge_freq > priority)
919 		priority = edge_freq;
920 	    }
921 	}
922       bbd[bb->index].priority = priority;
923     }
924 
925   if (priority)
926     /* A block with priority should have a significantly lower key.  */
927     return -(100 * BB_FREQ_MAX + 100 * priority + bb->count.to_frequency (cfun));
928 
929   return -bb->count.to_frequency (cfun);
930 }
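/* A rough worked example of bb_to_key, assuming BB_FREQ_MAX is 10000:
   a block with frequency 500 and no "interesting" predecessor edges gets
   key -500, while a block whose best interesting incoming edge has
   frequency 300 gets key -(100 * 10000 + 100 * 300 + 500) = -1030500,
   so the min-heap hands it out much earlier.  */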
931 
932 /* Return true when the edge E from basic block BB is better than the temporary
933    best edge (details are in function).  The probability of edge E is PROB. The
934    count of the successor is COUNT.  The current best probability is
935    BEST_PROB, the best count is BEST_COUNT.
936    The edge is considered to be equivalent when PROB does not differ much from
937    BEST_PROB; similarly for count.  */
938 
939 static bool
940 better_edge_p (const_basic_block bb, const_edge e, profile_probability prob,
941 	       profile_count count, profile_probability best_prob,
942 	       profile_count best_count, const_edge cur_best_edge)
943 {
944   bool is_better_edge;
945 
946   /* The BEST_* values do not have to be best, but can be a bit smaller than
947      maximum values.  */
948   profile_probability diff_prob = best_prob.apply_scale (1, 10);
949 
950   /* The smaller one is better to keep the original order.  */
951   if (optimize_function_for_size_p (cfun))
952     return !cur_best_edge
953 	   || cur_best_edge->dest->index > e->dest->index;
954 
955   /* Those edges are so expensive that continuing a trace is not useful
956      performance wise.  */
957   if (e->flags & (EDGE_ABNORMAL | EDGE_EH))
958     return false;
959 
960   if (prob > best_prob + diff_prob
961       || (!best_prob.initialized_p ()
962 	  && prob > profile_probability::guessed_never ()))
963     /* The edge has higher probability than the temporary best edge.  */
964     is_better_edge = true;
965   else if (prob < best_prob - diff_prob)
966     /* The edge has lower probability than the temporary best edge.  */
967     is_better_edge = false;
968   else
969     {
970       profile_count diff_count = best_count.apply_scale (1, 10);
971       if (count < best_count - diff_count
972 	  || (!best_count.initialized_p ()
973 	      && count.nonzero_p ()))
974 	/* The edge and the temporary best edge have almost equivalent
975 	   probabilities.  The higher count of a successor now means
976 	   that there is another edge going into that successor.
977 	   This successor has a lower count so it is better.  */
978 	is_better_edge = true;
979       else if (count > best_count + diff_count)
980 	/* This successor has a higher count so it is worse.  */
981 	is_better_edge = false;
982       else if (e->dest->prev_bb == bb)
983 	/* The edges have equivalent probabilities and the successors
984 	   have equivalent frequencies.  Select the previous successor.  */
985 	is_better_edge = true;
986       else
987 	is_better_edge = false;
988     }
989 
990   return is_better_edge;
991 }
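/* A small illustration of the tolerance used in better_edge_p: if the
   temporary best edge has probability 60%, diff_prob is 6%, so a candidate
   edge above 66% wins outright, one below 54% loses outright, and anything
   in between is decided by the successor-count comparison (or, failing
   that, by whether the successor immediately follows BB).  */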
992 
993 /* Return true when the edge E is better than the temporary best edge
994    CUR_BEST_EDGE.  If SRC_INDEX_P is true, the function compares the src bb of
995    E and CUR_BEST_EDGE; otherwise it will compare the dest bb.
996    BEST_LEN is the trace length of src (or dest) bb in CUR_BEST_EDGE.
997    TRACES record the information about traces.
998    When optimizing for size, the edge with smaller index is better.
999    When optimizing for speed, the edge with bigger probability or longer trace
1000    is better.  */
1001 
1002 static bool
1003 connect_better_edge_p (const_edge e, bool src_index_p, int best_len,
1004 		       const_edge cur_best_edge, struct trace *traces)
1005 {
1006   int e_index;
1007   int b_index;
1008   bool is_better_edge;
1009 
1010   if (!cur_best_edge)
1011     return true;
1012 
1013   if (optimize_function_for_size_p (cfun))
1014     {
1015       e_index = src_index_p ? e->src->index : e->dest->index;
1016       b_index = src_index_p ? cur_best_edge->src->index
1017 			      : cur_best_edge->dest->index;
1018       /* The smaller one is better to keep the original order.  */
1019       return b_index > e_index;
1020     }
1021 
1022   if (src_index_p)
1023     {
1024       e_index = e->src->index;
1025 
1026       /* We are looking for a predecessor, so probabilities are not that
1027 	 informative.  We do not want to connect A to B because A has
1028 	 only one successor (probability is 100%) while there is an edge
1029 	 A' to B where the probability is 90% but which is much more frequent.  */
1030       if (e->count () > cur_best_edge->count ())
1031 	/* The edge has a higher count than the temporary best edge.  */
1032 	is_better_edge = true;
1033       else if (e->count () < cur_best_edge->count ())
1034 	/* The edge has a lower count than the temporary best edge.  */
1035 	is_better_edge = false;
1036       else if (e->probability > cur_best_edge->probability)
1037 	/* The edge has higher probability than the temporary best edge.  */
1038 	is_better_edge = true;
1039       else if (e->probability < cur_best_edge->probability)
1040 	/* The edge has lower probability than the temporary best edge.  */
1041 	is_better_edge = false;
1042       else if (traces[bbd[e_index].end_of_trace].length > best_len)
1043 	/* The edge and the temporary best edge have equivalent probabilities.
1044 	   The edge with longer trace is better.  */
1045 	is_better_edge = true;
1046       else
1047 	is_better_edge = false;
1048     }
1049   else
1050     {
1051       e_index = e->dest->index;
1052 
1053       if (e->probability > cur_best_edge->probability)
1054 	/* The edge has higher probability than the temporary best edge.  */
1055 	is_better_edge = true;
1056       else if (e->probability < cur_best_edge->probability)
1057 	/* The edge has lower probability than the temporary best edge.  */
1058 	is_better_edge = false;
1059       else if (traces[bbd[e_index].start_of_trace].length > best_len)
1060 	/* The edge and the temporary best edge have equivalent probabilities.
1061 	   The edge with longer trace is better.  */
1062 	is_better_edge = true;
1063       else
1064 	is_better_edge = false;
1065     }
1066 
1067   return is_better_edge;
1068 }
1069 
1070 /* Connect traces in array TRACES, N_TRACES is the count of traces.  */
1071 
1072 static void
1073 connect_traces (int n_traces, struct trace *traces)
1074 {
1075   int i;
1076   bool *connected;
1077   bool two_passes;
1078   int last_trace;
1079   int current_pass;
1080   int current_partition;
1081   profile_count count_threshold;
1082   bool for_size = optimize_function_for_size_p (cfun);
1083 
1084   count_threshold = max_entry_count.apply_scale (DUPLICATION_THRESHOLD, 1000);
1085 
1086   connected = XCNEWVEC (bool, n_traces);
1087   last_trace = -1;
1088   current_pass = 1;
1089   current_partition = BB_PARTITION (traces[0].first);
1090   two_passes = false;
1091 
1092   if (crtl->has_bb_partition)
1093     for (i = 0; i < n_traces && !two_passes; i++)
1094       if (BB_PARTITION (traces[0].first)
1095 	  != BB_PARTITION (traces[i].first))
1096 	two_passes = true;
1097 
1098   for (i = 0; i < n_traces || (two_passes && current_pass == 1) ; i++)
1099     {
1100       int t = i;
1101       int t2;
1102       edge e, best;
1103       int best_len;
1104 
1105       if (i >= n_traces)
1106 	{
1107 	  gcc_assert (two_passes && current_pass == 1);
1108 	  i = 0;
1109 	  t = i;
1110 	  current_pass = 2;
1111 	  if (current_partition == BB_HOT_PARTITION)
1112 	    current_partition = BB_COLD_PARTITION;
1113 	  else
1114 	    current_partition = BB_HOT_PARTITION;
1115 	}
1116 
1117       if (connected[t])
1118 	continue;
1119 
1120       if (two_passes
1121 	  && BB_PARTITION (traces[t].first) != current_partition)
1122 	continue;
1123 
1124       connected[t] = true;
1125 
1126       /* Find the predecessor traces.  */
1127       for (t2 = t; t2 > 0;)
1128 	{
1129 	  edge_iterator ei;
1130 	  best = NULL;
1131 	  best_len = 0;
1132 	  FOR_EACH_EDGE (e, ei, traces[t2].first->preds)
1133 	    {
1134 	      int si = e->src->index;
1135 
1136 	      if (e->src != ENTRY_BLOCK_PTR_FOR_FN (cfun)
1137 		  && (e->flags & EDGE_CAN_FALLTHRU)
1138 		  && !(e->flags & EDGE_COMPLEX)
1139 		  && bbd[si].end_of_trace >= 0
1140 		  && !connected[bbd[si].end_of_trace]
1141 		  && (BB_PARTITION (e->src) == current_partition)
1142 		  && connect_better_edge_p (e, true, best_len, best, traces))
1143 		{
1144 		  best = e;
1145 		  best_len = traces[bbd[si].end_of_trace].length;
1146 		}
1147 	    }
1148 	  if (best)
1149 	    {
1150 	      best->src->aux = best->dest;
1151 	      t2 = bbd[best->src->index].end_of_trace;
1152 	      connected[t2] = true;
1153 
1154 	      if (dump_file)
1155 		{
1156 		  fprintf (dump_file, "Connection: %d %d\n",
1157 			   best->src->index, best->dest->index);
1158 		}
1159 	    }
1160 	  else
1161 	    break;
1162 	}
1163 
1164       if (last_trace >= 0)
1165 	traces[last_trace].last->aux = traces[t2].first;
1166       last_trace = t;
1167 
1168       /* Find the successor traces.  */
1169       while (1)
1170 	{
1171 	  /* Find the continuation of the chain.  */
1172 	  edge_iterator ei;
1173 	  best = NULL;
1174 	  best_len = 0;
1175 	  FOR_EACH_EDGE (e, ei, traces[t].last->succs)
1176 	    {
1177 	      int di = e->dest->index;
1178 
1179 	      if (e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
1180 		  && (e->flags & EDGE_CAN_FALLTHRU)
1181 		  && !(e->flags & EDGE_COMPLEX)
1182 		  && bbd[di].start_of_trace >= 0
1183 		  && !connected[bbd[di].start_of_trace]
1184 		  && (BB_PARTITION (e->dest) == current_partition)
1185 		  && connect_better_edge_p (e, false, best_len, best, traces))
1186 		{
1187 		  best = e;
1188 		  best_len = traces[bbd[di].start_of_trace].length;
1189 		}
1190 	    }
1191 
1192 	  if (for_size)
1193 	    {
1194 	      if (!best)
1195 		/* Stop finding the successor traces.  */
1196 		break;
1197 
1198 	      /* It is OK to connect block n with block n + 1 or a block
1199 		 before n.  For others, only connect to the loop header.  */
1200 	      if (best->dest->index > (traces[t].last->index + 1))
1201 		{
1202 		  int count = EDGE_COUNT (best->dest->preds);
1203 
1204 		  FOR_EACH_EDGE (e, ei, best->dest->preds)
1205 		    if (e->flags & EDGE_DFS_BACK)
1206 		      count--;
1207 
1208 		  /* If dest has multiple predecessors, skip it.  We expect
1209 		     that one predecessor with smaller index connects with it
1210 		     later.  */
1211 		  if (count != 1)
1212 		    break;
1213 		}
1214 
1215 	      /* Only connect Trace n with Trace n + 1.  It is conservative
1216 		 to keep the order as close as possible to the original order.
1217 		 It also helps to reduce long jumps.  */
1218 	      if (last_trace != bbd[best->dest->index].start_of_trace - 1)
1219 		break;
1220 
1221 	      if (dump_file)
1222 		fprintf (dump_file, "Connection: %d %d\n",
1223 			 best->src->index, best->dest->index);
1224 
1225 	      t = bbd[best->dest->index].start_of_trace;
1226 	      traces[last_trace].last->aux = traces[t].first;
1227 	      connected[t] = true;
1228 	      last_trace = t;
1229 	    }
1230 	  else if (best)
1231 	    {
1232 	      if (dump_file)
1233 		{
1234 		  fprintf (dump_file, "Connection: %d %d\n",
1235 			   best->src->index, best->dest->index);
1236 		}
1237 	      t = bbd[best->dest->index].start_of_trace;
1238 	      traces[last_trace].last->aux = traces[t].first;
1239 	      connected[t] = true;
1240 	      last_trace = t;
1241 	    }
1242 	  else
1243 	    {
1244 	      /* Try to connect the traces by duplication of 1 block.  */
1245 	      edge e2;
1246 	      basic_block next_bb = NULL;
1247 	      bool try_copy = false;
1248 
1249 	      FOR_EACH_EDGE (e, ei, traces[t].last->succs)
1250 		if (e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
1251 		    && (e->flags & EDGE_CAN_FALLTHRU)
1252 		    && !(e->flags & EDGE_COMPLEX)
1253 		    && (!best || e->probability > best->probability))
1254 		  {
1255 		    edge_iterator ei;
1256 		    edge best2 = NULL;
1257 		    int best2_len = 0;
1258 
1259 		    /* If the destination is a start of a trace which is only
1260 		       one block long, then no need to search the successor
1261 		       blocks of the trace.  Accept it.  */
1262 		    if (bbd[e->dest->index].start_of_trace >= 0
1263 			&& traces[bbd[e->dest->index].start_of_trace].length
1264 			   == 1)
1265 		      {
1266 			best = e;
1267 			try_copy = true;
1268 			continue;
1269 		      }
1270 
1271 		    FOR_EACH_EDGE (e2, ei, e->dest->succs)
1272 		      {
1273 			int di = e2->dest->index;
1274 
1275 			if (e2->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)
1276 			    || ((e2->flags & EDGE_CAN_FALLTHRU)
1277 				&& !(e2->flags & EDGE_COMPLEX)
1278 				&& bbd[di].start_of_trace >= 0
1279 				&& !connected[bbd[di].start_of_trace]
1280 				&& BB_PARTITION (e2->dest) == current_partition
1281 				&& e2->count () >= count_threshold
1282 				&& (!best2
1283 				    || e2->probability > best2->probability
1284 				    || (e2->probability == best2->probability
1285 					&& traces[bbd[di].start_of_trace].length
1286 					   > best2_len))))
1287 			  {
1288 			    best = e;
1289 			    best2 = e2;
1290 			    if (e2->dest != EXIT_BLOCK_PTR_FOR_FN (cfun))
1291 			      best2_len = traces[bbd[di].start_of_trace].length;
1292 			    else
1293 			      best2_len = INT_MAX;
1294 			    next_bb = e2->dest;
1295 			    try_copy = true;
1296 			  }
1297 		      }
1298 		  }
1299 
1300 	      /* Copy tiny blocks always; copy larger blocks only when the
1301 		 edge is traversed frequently enough.  */
1302 	      if (try_copy
1303 		  && BB_PARTITION (best->src) == BB_PARTITION (best->dest)
1304 		  && copy_bb_p (best->dest,
1305 				optimize_edge_for_speed_p (best)
1306 				&& (!best->count ().initialized_p ()
1307 				    || best->count () >= count_threshold)))
1308 		{
1309 		  basic_block new_bb;
1310 
1311 		  if (dump_file)
1312 		    {
1313 		      fprintf (dump_file, "Connection: %d %d ",
1314 			       traces[t].last->index, best->dest->index);
1315 		      if (!next_bb)
1316 			fputc ('\n', dump_file);
1317 		      else if (next_bb == EXIT_BLOCK_PTR_FOR_FN (cfun))
1318 			fprintf (dump_file, "exit\n");
1319 		      else
1320 			fprintf (dump_file, "%d\n", next_bb->index);
1321 		    }
1322 
1323 		  new_bb = copy_bb (best->dest, best, traces[t].last, t);
1324 		  traces[t].last = new_bb;
1325 		  if (next_bb && next_bb != EXIT_BLOCK_PTR_FOR_FN (cfun))
1326 		    {
1327 		      t = bbd[next_bb->index].start_of_trace;
1328 		      traces[last_trace].last->aux = traces[t].first;
1329 		      connected[t] = true;
1330 		      last_trace = t;
1331 		    }
1332 		  else
1333 		    break;	/* Stop finding the successor traces.  */
1334 		}
1335 	      else
1336 		break;	/* Stop finding the successor traces.  */
1337 	    }
1338 	}
1339     }
1340 
1341   if (dump_file)
1342     {
1343       basic_block bb;
1344 
1345       fprintf (dump_file, "Final order:\n");
1346       for (bb = traces[0].first; bb; bb = (basic_block) bb->aux)
1347 	fprintf (dump_file, "%d ", bb->index);
1348       fprintf (dump_file, "\n");
1349       fflush (dump_file);
1350     }
1351 
1352   FREE (connected);
1353 }
1354 
1355 /* Return true when BB can and should be copied. CODE_MAY_GROW is true
1356    when code size is allowed to grow by duplication.  */
1357 
1358 static bool
1359 copy_bb_p (const_basic_block bb, int code_may_grow)
1360 {
1361   int size = 0;
1362   int max_size = uncond_jump_length;
1363   rtx_insn *insn;
1364 
1365   if (EDGE_COUNT (bb->preds) < 2)
1366     return false;
1367   if (!can_duplicate_block_p (bb))
1368     return false;
1369 
1370   /* Avoid duplicating blocks which have many successors (PR/13430).  */
1371   if (EDGE_COUNT (bb->succs) > 8)
1372     return false;
1373 
1374   if (code_may_grow && optimize_bb_for_speed_p (bb))
1375     max_size *= PARAM_VALUE (PARAM_MAX_GROW_COPY_BB_INSNS);
1376 
1377   FOR_BB_INSNS (bb, insn)
1378     {
1379       if (INSN_P (insn))
1380 	size += get_attr_min_length (insn);
1381     }
1382 
1383   if (size <= max_size)
1384     return true;
1385 
1386   if (dump_file)
1387     {
1388       fprintf (dump_file,
1389 	       "Block %d can't be copied because its size = %d.\n",
1390 	       bb->index, size);
1391     }
1392 
1393   return false;
1394 }
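/* A rough sizing example for copy_bb_p, assuming a target whose
   unconditional jump has a minimum length of 4 and the default
   max-grow-copy-bb-insns value of 8: a block whose insns total at most
   4 length units is considered cheap enough to copy (subject to the other
   checks above), and up to 32 units may be copied when the block is
   optimized for speed and code may grow.  */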
1395 
1396 /* Return the length of an unconditional jump instruction.  */
1397 
1398 int
1399 get_uncond_jump_length (void)
1400 {
1401   int length;
1402 
1403   start_sequence ();
1404   rtx_code_label *label = emit_label (gen_label_rtx ());
1405   rtx_insn *jump = emit_jump_insn (targetm.gen_jump (label));
1406   length = get_attr_min_length (jump);
1407   end_sequence ();
1408 
1409   return length;
1410 }
1411 
1412 /* Create a forwarder block to OLD_BB starting with NEW_LABEL and in the
1413    other partition wrt OLD_BB.  */
1414 
1415 static basic_block
1416 create_eh_forwarder_block (rtx_code_label *new_label, basic_block old_bb)
1417 {
1418   /* Split OLD_BB, so that EH pads always have only incoming EH edges;
1419      bb_has_eh_pred bbs are treated specially by the DF infrastructure.  */
1420   old_bb = split_block_after_labels (old_bb)->dest;
1421 
1422   /* Put the new label and a jump in the new basic block.  */
1423   rtx_insn *label = emit_label (new_label);
1424   rtx_code_label *old_label = block_label (old_bb);
1425   rtx_insn *jump = emit_jump_insn (targetm.gen_jump (old_label));
1426   JUMP_LABEL (jump) = old_label;
1427 
1428   /* Create the new basic block and put it in last position.  */
1429   basic_block last_bb = EXIT_BLOCK_PTR_FOR_FN (cfun)->prev_bb;
1430   basic_block new_bb = create_basic_block (label, jump, last_bb);
1431   new_bb->aux = last_bb->aux;
1432   new_bb->count = old_bb->count;
1433   last_bb->aux = new_bb;
1434 
1435   emit_barrier_after_bb (new_bb);
1436 
1437   make_single_succ_edge (new_bb, old_bb, 0);
1438 
1439   /* Make sure the new basic block is in the other partition.  */
1440   unsigned new_partition = BB_PARTITION (old_bb);
1441   new_partition ^= BB_HOT_PARTITION | BB_COLD_PARTITION;
1442   BB_SET_PARTITION (new_bb, new_partition);
1443 
1444   return new_bb;
1445 }
1446 
1447 /* The common landing pad in block OLD_BB has edges from both partitions.
1448    Add a new landing pad that will just jump to the old one and split the
1449    edges so that no EH edge crosses partitions.  */
1450 
1451 static void
1452 sjlj_fix_up_crossing_landing_pad (basic_block old_bb)
1453 {
1454   const unsigned lp_len = cfun->eh->lp_array->length ();
1455   edge_iterator ei;
1456   edge e;
1457 
1458   /* Generate the new common landing-pad label.  */
1459   rtx_code_label *new_label = gen_label_rtx ();
1460   LABEL_PRESERVE_P (new_label) = 1;
1461 
1462   /* Create the forwarder block.  */
1463   basic_block new_bb = create_eh_forwarder_block (new_label, old_bb);
1464 
1465   /* Create the map from old to new lp index and initialize it.  */
1466   unsigned *index_map = (unsigned *) alloca (lp_len * sizeof (unsigned));
1467   memset (index_map, 0, lp_len * sizeof (unsigned));
1468 
1469   /* Fix up the edges.  */
1470   for (ei = ei_start (old_bb->preds); (e = ei_safe_edge (ei)) != NULL; )
1471     if (e->src != new_bb && BB_PARTITION (e->src) == BB_PARTITION (new_bb))
1472       {
1473 	rtx_insn *insn = BB_END (e->src);
1474 	rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
1475 
1476 	gcc_assert (note != NULL);
1477 	const unsigned old_index = INTVAL (XEXP (note, 0));
1478 
1479 	/* Generate the new landing-pad structure.  */
1480 	if (index_map[old_index] == 0)
1481 	  {
1482 	    eh_landing_pad old_lp = (*cfun->eh->lp_array)[old_index];
1483 	    eh_landing_pad new_lp = gen_eh_landing_pad (old_lp->region);
1484 	    new_lp->post_landing_pad = old_lp->post_landing_pad;
1485 	    new_lp->landing_pad = new_label;
1486 	    index_map[old_index] = new_lp->index;
1487 	  }
1488 	XEXP (note, 0) = GEN_INT (index_map[old_index]);
1489 
1490 	/* Adjust the edge to the new destination.  */
1491 	redirect_edge_succ (e, new_bb);
1492       }
1493     else
1494       ei_next (&ei);
1495 }
1496 
1497 /* The landing pad OLD_LP, in block OLD_BB, has edges from both partitions.
1498    Add a new landing pad that will just jump to the old one and split the
1499    edges so that no EH edge crosses partitions.  */
1500 
1501 static void
1502 dw2_fix_up_crossing_landing_pad (eh_landing_pad old_lp, basic_block old_bb)
1503 {
1504   eh_landing_pad new_lp;
1505   edge_iterator ei;
1506   edge e;
1507 
1508   /* Generate the new landing-pad structure.  */
1509   new_lp = gen_eh_landing_pad (old_lp->region);
1510   new_lp->post_landing_pad = old_lp->post_landing_pad;
1511   new_lp->landing_pad = gen_label_rtx ();
1512   LABEL_PRESERVE_P (new_lp->landing_pad) = 1;
1513 
1514   /* Create the forwarder block.  */
1515   basic_block new_bb = create_eh_forwarder_block (new_lp->landing_pad, old_bb);
1516 
1517   /* Fix up the edges.  */
1518   for (ei = ei_start (old_bb->preds); (e = ei_safe_edge (ei)) != NULL; )
1519     if (e->src != new_bb && BB_PARTITION (e->src) == BB_PARTITION (new_bb))
1520       {
1521 	rtx_insn *insn = BB_END (e->src);
1522 	rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
1523 
1524 	gcc_assert (note != NULL);
1525 	gcc_checking_assert (INTVAL (XEXP (note, 0)) == old_lp->index);
1526 	XEXP (note, 0) = GEN_INT (new_lp->index);
1527 
1528 	/* Adjust the edge to the new destination.  */
1529 	redirect_edge_succ (e, new_bb);
1530       }
1531     else
1532       ei_next (&ei);
1533 }
1534 
1535 
1536 /* Ensure that all hot bbs are included in a hot path through the
1537    procedure. This is done by calling this function twice, once
1538    with WALK_UP true (to look for paths from the entry to hot bbs) and
1539    once with WALK_UP false (to look for paths from hot bbs to the exit).
1540    Returns the updated value of COLD_BB_COUNT and adds newly-hot bbs
1541    to BBS_IN_HOT_PARTITION.  */
1542 
1543 static unsigned int
1544 sanitize_hot_paths (bool walk_up, unsigned int cold_bb_count,
1545                     vec<basic_block> *bbs_in_hot_partition)
1546 {
1547   /* Callers check this.  */
1548   gcc_checking_assert (cold_bb_count);
1549 
1550   /* Keep examining hot bbs while we still have some left to check
1551      and there are remaining cold bbs.  */
1552   vec<basic_block> hot_bbs_to_check = bbs_in_hot_partition->copy ();
1553   while (! hot_bbs_to_check.is_empty ()
1554          && cold_bb_count)
1555     {
1556       basic_block bb = hot_bbs_to_check.pop ();
1557       vec<edge, va_gc> *edges = walk_up ? bb->preds : bb->succs;
1558       edge e;
1559       edge_iterator ei;
1560       profile_probability highest_probability
1561 				 = profile_probability::uninitialized ();
1562       profile_count highest_count = profile_count::uninitialized ();
1563       bool found = false;
1564 
1565       /* Walk the preds/succs and check if there is at least one already
1566          marked hot. Keep track of the most frequent pred/succ so that we
1567          can mark it hot if we don't find one.  */
1568       FOR_EACH_EDGE (e, ei, edges)
1569         {
1570           basic_block reach_bb = walk_up ? e->src : e->dest;
1571 
1572           if (e->flags & EDGE_DFS_BACK)
1573             continue;
1574 
1575 	  /* Do not expect profile insanities when profile was not adjusted.  */
1576 	  if (e->probability == profile_probability::never ()
1577 	      || e->count () == profile_count::zero ())
1578 	    continue;
1579 
1580           if (BB_PARTITION (reach_bb) != BB_COLD_PARTITION)
1581           {
1582             found = true;
1583             break;
1584           }
1585           /* The following code looks for the hottest edge via
1586              the edge count if it is non-zero, otherwise it falls
1587              back to the edge probability.  */
1588           if (!(e->count () > highest_count))
1589             highest_count = e->count ();
1590           if (!highest_probability.initialized_p ()
1591 	      || e->probability > highest_probability)
1592             highest_probability = e->probability;
1593         }
1594 
1595       /* If bb is reached by (or reaches, in the case of !WALK_UP) another hot
1596          block (or unpartitioned, e.g. the entry block) then it is ok. If not,
1597          then the most frequent pred (or succ) needs to be adjusted.  In the
1598          case where multiple preds/succs have the same frequency (e.g. a
1599          50-50 branch), then both will be adjusted.  */
1600       if (found)
1601         continue;
1602 
1603       FOR_EACH_EDGE (e, ei, edges)
1604         {
1605           if (e->flags & EDGE_DFS_BACK)
1606             continue;
1607 	  /* Do not expect profile insanities when profile was not adjusted.  */
1608 	  if (e->probability == profile_probability::never ()
1609 	      || e->count () == profile_count::zero ())
1610 	    continue;
1611           /* Select the hottest edge using the edge count if one is
1612              available; otherwise fall back to the edge probability.  */
1613           if (highest_count.initialized_p ())
1614             {
1615               if (!(e->count () >= highest_count))
1616                 continue;
1617             }
1618           else if (!(e->probability >= highest_probability))
1619             continue;
1620 
1621           basic_block reach_bb = walk_up ? e->src : e->dest;
1622 
1623           /* BB is reached (or reaches) only via cold blocks; re-mark
1624              the hottest such pred/succ hot.  */
1625           BB_SET_PARTITION (reach_bb, BB_HOT_PARTITION);
1626 	  if (dump_file)
1627 	    fprintf (dump_file, "Promoting bb %i to hot partition to sanitize "
1628 		     "profile of bb %i in %s walk\n", reach_bb->index,
1629 		     bb->index, walk_up ? "backward" : "forward");
1630           cold_bb_count--;
1631 
1632           /* Now we need to examine newly-hot reach_bb to see if it is also
1633              dominated by a cold bb.  */
1634           bbs_in_hot_partition->safe_push (reach_bb);
1635           hot_bbs_to_check.safe_push (reach_bb);
1636         }
1637     }
1638   hot_bbs_to_check.release ();
1639 
1640   return cold_bb_count;
1641 }
1642 
1643 
1644 /* Find the basic blocks that are rarely executed and need to be moved to
1645    a separate section of the .o file (to cut down on paging and improve
1646    cache locality).  Return a vector of all edges that cross.  */
1647 
1648 static vec<edge>
1649 find_rarely_executed_basic_blocks_and_crossing_edges (void)
1650 {
1651   vec<edge> crossing_edges = vNULL;
1652   basic_block bb;
1653   edge e;
1654   edge_iterator ei;
1655   unsigned int cold_bb_count = 0;
1656   auto_vec<basic_block> bbs_in_hot_partition;
1657 
1658   propagate_unlikely_bbs_forward ();
1659 
1660   /* Mark which partition (hot/cold) each basic block belongs in.  */
1661   FOR_EACH_BB_FN (bb, cfun)
1662     {
1663       bool cold_bb = false;
1664 
1665       if (probably_never_executed_bb_p (cfun, bb))
1666         {
1667           /* Handle profile insanities created by upstream optimizations
1668              by also checking the incoming edge weights. If there is a non-cold
1669              incoming edge, conservatively prevent this block from being split
1670              into the cold section.  */
1671           cold_bb = true;
1672           FOR_EACH_EDGE (e, ei, bb->preds)
1673             if (!probably_never_executed_edge_p (cfun, e))
1674               {
1675                 cold_bb = false;
1676                 break;
1677               }
1678         }
1679       if (cold_bb)
1680         {
1681           BB_SET_PARTITION (bb, BB_COLD_PARTITION);
1682           cold_bb_count++;
1683         }
1684       else
1685         {
1686           BB_SET_PARTITION (bb, BB_HOT_PARTITION);
1687           bbs_in_hot_partition.safe_push (bb);
1688         }
1689     }
1690 
1691   /* Ensure that hot bbs are included along a hot path from the entry to exit.
1692      Several different situations can leave cold bbs along all paths
1693      to/from a hot bb.  One is that there are edge weight insanities
1694      due to optimization phases that do not properly update basic block profile
1695      counts. The second is that the entry of the function may not be hot, because
1696      it is entered fewer times than the number of profile training runs, but there
1697      is a loop inside the function that causes blocks within the function to be
1698      above the threshold for hotness. This is fixed by walking up from hot bbs
1699      to the entry block, and then down from hot bbs to the exit, performing
1700      partitioning fixups as necessary.  */
1701   if (cold_bb_count)
1702     {
1703       mark_dfs_back_edges ();
1704       cold_bb_count = sanitize_hot_paths (true, cold_bb_count,
1705                                           &bbs_in_hot_partition);
1706       if (cold_bb_count)
1707         sanitize_hot_paths (false, cold_bb_count, &bbs_in_hot_partition);
1708 
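      /* Finally, demote any block that is not reachable along a hot path
	 to the cold partition, even if it was marked hot above.  */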
1709       hash_set <basic_block> set;
1710       find_bbs_reachable_by_hot_paths (&set);
1711       FOR_EACH_BB_FN (bb, cfun)
1712 	if (!set.contains (bb))
1713 	  BB_SET_PARTITION (bb, BB_COLD_PARTITION);
1714     }
1715 
1716   /* The format of .gcc_except_table does not allow landing pads to
1717      be in a different partition than the throw.  Fix this by either
1718      moving the landing pads or inserting forwarder landing pads.  */
1719   if (cfun->eh->lp_array)
1720     {
1721       const bool sjlj
1722 	= (targetm_common.except_unwind_info (&global_options) == UI_SJLJ);
1723       unsigned i;
1724       eh_landing_pad lp;
1725 
1726       FOR_EACH_VEC_ELT (*cfun->eh->lp_array, i, lp)
1727 	{
1728 	  bool all_same, all_diff;
1729 
1730 	  if (lp == NULL
1731 	      || lp->landing_pad == NULL_RTX
1732 	      || !LABEL_P (lp->landing_pad))
1733 	    continue;
1734 
1735 	  all_same = all_diff = true;
1736 	  bb = BLOCK_FOR_INSN (lp->landing_pad);
1737 	  FOR_EACH_EDGE (e, ei, bb->preds)
1738 	    {
1739 	      gcc_assert (e->flags & EDGE_EH);
1740 	      if (BB_PARTITION (bb) == BB_PARTITION (e->src))
1741 		all_diff = false;
1742 	      else
1743 		all_same = false;
1744 	    }
1745 
1746 	  if (all_same)
1747 	    ;
1748 	  else if (all_diff)
1749 	    {
1750 	      int which = BB_PARTITION (bb);
1751 	      which ^= BB_HOT_PARTITION | BB_COLD_PARTITION;
1752 	      BB_SET_PARTITION (bb, which);
1753 	    }
1754 	  else if (sjlj)
1755 	    sjlj_fix_up_crossing_landing_pad (bb);
1756 	  else
1757 	    dw2_fix_up_crossing_landing_pad (lp, bb);
1758 
1759 	  /* There is a single, common landing pad in SJLJ mode.  */
1760 	  if (sjlj)
1761 	    break;
1762 	}
1763     }
1764 
1765   /* Mark every edge that crosses between sections.  */
1766   FOR_EACH_BB_FN (bb, cfun)
1767     FOR_EACH_EDGE (e, ei, bb->succs)
1768       {
1769 	unsigned int flags = e->flags;
1770 
1771         /* We should never have EDGE_CROSSING set yet.  */
1772 	gcc_checking_assert ((flags & EDGE_CROSSING) == 0);
1773 
1774 	if (e->src != ENTRY_BLOCK_PTR_FOR_FN (cfun)
1775 	    && e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
1776 	    && BB_PARTITION (e->src) != BB_PARTITION (e->dest))
1777 	  {
1778 	    crossing_edges.safe_push (e);
1779 	    flags |= EDGE_CROSSING;
1780 	  }
1781 
1782 	/* Now that we've split eh edges as appropriate, allow landing pads
1783 	   to be merged with the post-landing pads.  */
1784 	flags &= ~EDGE_PRESERVE;
1785 
1786 	e->flags = flags;
1787       }
1788 
1789   return crossing_edges;
1790 }
1791 
1792 /* Set the flag EDGE_CAN_FALLTHRU for edges that can be fallthru.  */
1793 
1794 static void
1795 set_edge_can_fallthru_flag (void)
1796 {
1797   basic_block bb;
1798 
1799   FOR_EACH_BB_FN (bb, cfun)
1800     {
1801       edge e;
1802       edge_iterator ei;
1803 
1804       FOR_EACH_EDGE (e, ei, bb->succs)
1805 	{
1806 	  e->flags &= ~EDGE_CAN_FALLTHRU;
1807 
1808 	  /* The FALLTHRU edge is also CAN_FALLTHRU edge.  */
1809 	  if (e->flags & EDGE_FALLTHRU)
1810 	    e->flags |= EDGE_CAN_FALLTHRU;
1811 	}
1812 
1813       /* If the BB ends with an invertible condjump all (2) edges are
1814 	 CAN_FALLTHRU edges.  */
1815       if (EDGE_COUNT (bb->succs) != 2)
1816 	continue;
1817       if (!any_condjump_p (BB_END (bb)))
1818 	continue;
1819 
1820       rtx_jump_insn *bb_end_jump = as_a <rtx_jump_insn *> (BB_END (bb));
1821       if (!invert_jump (bb_end_jump, JUMP_LABEL (bb_end_jump), 0))
1822 	continue;
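      /* The jump is invertible: undo the inversion so the insn is left
	 unchanged; only the invertibility itself matters here.  */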
1823       invert_jump (bb_end_jump, JUMP_LABEL (bb_end_jump), 0);
1824       EDGE_SUCC (bb, 0)->flags |= EDGE_CAN_FALLTHRU;
1825       EDGE_SUCC (bb, 1)->flags |= EDGE_CAN_FALLTHRU;
1826     }
1827 }
1828 
1829 /* If any destination of a crossing edge does not have a label, add a label;
1830    convert any easy fall-through crossing edges to unconditional jumps.  */
1831 
1832 static void
1833 add_labels_and_missing_jumps (vec<edge> crossing_edges)
1834 {
1835   size_t i;
1836   edge e;
1837 
1838   FOR_EACH_VEC_ELT (crossing_edges, i, e)
1839     {
1840       basic_block src = e->src;
1841       basic_block dest = e->dest;
1842       rtx_jump_insn *new_jump;
1843 
1844       if (dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1845 	continue;
1846 
1847       /* Make sure dest has a label.  */
1848       rtx_code_label *label = block_label (dest);
1849 
1850       /* Nothing to do for non-fallthru edges.  */
1851       if (src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
1852 	continue;
1853       if ((e->flags & EDGE_FALLTHRU) == 0)
1854 	continue;
1855 
1856       /* If the block does not end with a control flow insn, then we
1857 	 can trivially add a jump to the end to fixup the crossing.
1858 	 Otherwise the jump will have to go in a new bb, which will
1859 	 be handled by fix_up_fall_thru_edges function.  */
1860       if (control_flow_insn_p (BB_END (src)))
1861 	continue;
1862 
1863       /* Make sure there's only one successor.  */
1864       gcc_assert (single_succ_p (src));
1865 
1866       new_jump = emit_jump_insn_after (targetm.gen_jump (label), BB_END (src));
1867       BB_END (src) = new_jump;
1868       JUMP_LABEL (new_jump) = label;
1869       LABEL_NUSES (label) += 1;
1870 
1871       emit_barrier_after_bb (src);
1872 
1873       /* Mark edge as non-fallthru.  */
1874       e->flags &= ~EDGE_FALLTHRU;
1875     }
1876 }
1877 
1878 /* Find any bb's where the fall-through edge is a crossing edge (note that
1879    these bb's must also contain a conditional jump or end with a call
1880    instruction; we've already dealt with fall-through edges for blocks
1881    that didn't have a conditional jump or didn't end with a call instruction
1882    in the call to add_labels_and_missing_jumps).  Convert the fall-through
1883    edge to a non-crossing edge by inserting a new bb to fall through into.
1884    The new bb will contain an unconditional jump (crossing edge) to the
1885    original fall-through destination.  */
1886 
1887 static void
1888 fix_up_fall_thru_edges (void)
1889 {
1890   basic_block cur_bb;
1891 
1892   FOR_EACH_BB_FN (cur_bb, cfun)
1893     {
1894       edge succ1;
1895       edge succ2;
1896       edge fall_thru = NULL;
1897       edge cond_jump = NULL;
1898 
1899       fall_thru = NULL;
1900       if (EDGE_COUNT (cur_bb->succs) > 0)
1901 	succ1 = EDGE_SUCC (cur_bb, 0);
1902       else
1903 	succ1 = NULL;
1904 
1905       if (EDGE_COUNT (cur_bb->succs) > 1)
1906 	succ2 = EDGE_SUCC (cur_bb, 1);
1907       else
1908 	succ2 = NULL;
1909 
1910       /* Find the fall-through edge.  */
1911 
1912       if (succ1
1913 	  && (succ1->flags & EDGE_FALLTHRU))
1914 	{
1915 	  fall_thru = succ1;
1916 	  cond_jump = succ2;
1917 	}
1918       else if (succ2
1919 	       && (succ2->flags & EDGE_FALLTHRU))
1920 	{
1921 	  fall_thru = succ2;
1922 	  cond_jump = succ1;
1923 	}
1924       else if (succ2 && EDGE_COUNT (cur_bb->succs) > 2)
1925 	fall_thru = find_fallthru_edge (cur_bb->succs);
1926 
1927       if (fall_thru && (fall_thru->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)))
1928 	{
1929 	  /* Check to see if the fall-thru edge is a crossing edge.  */
1930 
1931 	  if (fall_thru->flags & EDGE_CROSSING)
1932 	    {
1933 	      /* The fall_thru edge crosses; now check the cond jump edge, if
1934 		 it exists.  */
1935 
1936 	      bool cond_jump_crosses = true;
1937 	      int invert_worked = 0;
1938 	      rtx_insn *old_jump = BB_END (cur_bb);
1939 
1940 	      /* Find the jump instruction, if there is one.  */
1941 
1942 	      if (cond_jump)
1943 		{
1944 		  if (!(cond_jump->flags & EDGE_CROSSING))
1945 		    cond_jump_crosses = false;
1946 
1947 		  /* We know the fall-thru edge crosses; if the cond
1948 		     jump edge does NOT cross, and its destination is the
1949 		     next block in the bb order, invert the jump
1950 		     (i.e. fix it so the fall through does not cross and
1951 		     the cond jump does).  */
1952 
1953 		  if (!cond_jump_crosses)
1954 		    {
1955 		      /* Find label in fall_thru block. We've already added
1956 			 any missing labels, so there must be one.  */
1957 
1958 		      rtx_code_label *fall_thru_label
1959 			= block_label (fall_thru->dest);
1960 
1961 		      if (old_jump && fall_thru_label)
1962 			{
1963 			  rtx_jump_insn *old_jump_insn
1964 			    = dyn_cast <rtx_jump_insn *> (old_jump);
1965 			  if (old_jump_insn)
1966 			    invert_worked = invert_jump (old_jump_insn,
1967 							 fall_thru_label, 0);
1968 			}
1969 
1970 		      if (invert_worked)
1971 			{
1972 			  fall_thru->flags &= ~EDGE_FALLTHRU;
1973 			  cond_jump->flags |= EDGE_FALLTHRU;
1974 			  update_br_prob_note (cur_bb);
1975 			  std::swap (fall_thru, cond_jump);
1976 			  cond_jump->flags |= EDGE_CROSSING;
1977 			  fall_thru->flags &= ~EDGE_CROSSING;
1978 			}
1979 		    }
1980 		}
1981 
1982 	      if (cond_jump_crosses || !invert_worked)
1983 		{
1984 		  /* This is the case where both edges out of the basic
1985 		     block are crossing edges. Here we will fix up the
1986 		     fall through edge. The jump edge will be taken care
1987 		     of later.  The EDGE_CROSSING flag of fall_thru edge
1988 		     is unset before the call to force_nonfallthru
1989 		     function because if a new basic-block is created
1990 		     this edge remains in the current section boundary
1991 		     while the edge between new_bb and the fall_thru->dest
1992 		     becomes EDGE_CROSSING.  */
1993 
1994 		  fall_thru->flags &= ~EDGE_CROSSING;
1995 		  basic_block new_bb = force_nonfallthru (fall_thru);
1996 
1997 		  if (new_bb)
1998 		    {
1999 		      new_bb->aux = cur_bb->aux;
2000 		      cur_bb->aux = new_bb;
2001 
2002                       /* This is done by force_nonfallthru_and_redirect.  */
2003 		      gcc_assert (BB_PARTITION (new_bb)
2004                                   == BB_PARTITION (cur_bb));
2005 
2006 		      single_succ_edge (new_bb)->flags |= EDGE_CROSSING;
2007 		    }
2008 		  else
2009 		    {
2010 		      /* If a new basic-block was not created; restore
2011 			 the EDGE_CROSSING flag.  */
2012 		      fall_thru->flags |= EDGE_CROSSING;
2013 		    }
2014 
2015 		  /* Add a barrier after the new jump.  */
2016 		  emit_barrier_after_bb (new_bb ? new_bb : cur_bb);
2017 		}
2018 	    }
2019 	}
2020     }
2021 }
2022 
2023 /* This function checks the destination block of a "crossing jump" to
2024    see if it has any crossing predecessors that begin with a code label
2025    and end with an unconditional jump.  If so, it returns that predecessor
2026    block.  (This is to avoid creating lots of new basic blocks that all
2027    contain unconditional jumps to the same destination).  */
2028 
2029 static basic_block
2030 find_jump_block (basic_block jump_dest)
2031 {
2032   basic_block source_bb = NULL;
2033   edge e;
2034   rtx_insn *insn;
2035   edge_iterator ei;
2036 
2037   FOR_EACH_EDGE (e, ei, jump_dest->preds)
2038     if (e->flags & EDGE_CROSSING)
2039       {
2040 	basic_block src = e->src;
2041 
2042 	/* Check each predecessor to see if it has a label, and contains
2043 	   only one executable instruction, which is an unconditional jump.
2044 	   If so, we can use it.  */
2045 
2046 	if (LABEL_P (BB_HEAD (src)))
2047 	  for (insn = BB_HEAD (src);
2048 	       !INSN_P (insn) && insn != NEXT_INSN (BB_END (src));
2049 	       insn = NEXT_INSN (insn))
2050 	    {
2051 	      if (INSN_P (insn)
2052 		  && insn == BB_END (src)
2053 		  && JUMP_P (insn)
2054 		  && !any_condjump_p (insn))
2055 		{
2056 		  source_bb = src;
2057 		  break;
2058 		}
2059 	    }
2060 
2061 	if (source_bb)
2062 	  break;
2063       }
2064 
2065   return source_bb;
2066 }
2067 
2068 /* Find all BB's with conditional jumps that are crossing edges;
2069    insert a new bb and make the conditional jump branch to the new
2070    bb instead (put the new bb in the same partition so the conditional
2071    branch won't be a 'crossing' edge).  Insert an unconditional jump
2072    from the new bb to the original destination of the conditional jump.  */
2073 
2074 static void
2075 fix_crossing_conditional_branches (void)
2076 {
2077   basic_block cur_bb;
2078   basic_block new_bb;
2079   basic_block dest;
2080   edge succ1;
2081   edge succ2;
2082   edge crossing_edge;
2083   edge new_edge;
2084   rtx set_src;
2085   rtx old_label = NULL_RTX;
2086   rtx_code_label *new_label;
2087 
2088   FOR_EACH_BB_FN (cur_bb, cfun)
2089     {
2090       crossing_edge = NULL;
2091       if (EDGE_COUNT (cur_bb->succs) > 0)
2092 	succ1 = EDGE_SUCC (cur_bb, 0);
2093       else
2094 	succ1 = NULL;
2095 
2096       if (EDGE_COUNT (cur_bb->succs) > 1)
2097 	succ2 = EDGE_SUCC (cur_bb, 1);
2098       else
2099 	succ2 = NULL;
2100 
2101       /* We already took care of fall-through edges, so only one successor
2102 	 can be a crossing edge.  */
2103 
2104       if (succ1 && (succ1->flags & EDGE_CROSSING))
2105 	crossing_edge = succ1;
2106       else if (succ2 && (succ2->flags & EDGE_CROSSING))
2107 	crossing_edge = succ2;
2108 
2109       if (crossing_edge)
2110 	{
2111 	  rtx_insn *old_jump = BB_END (cur_bb);
2112 
2113 	  /* Check to make sure the jump instruction is a
2114 	     conditional jump.  */
2115 
2116 	  set_src = NULL_RTX;
2117 
2118 	  if (any_condjump_p (old_jump))
2119 	    {
2120 	      if (GET_CODE (PATTERN (old_jump)) == SET)
2121 		set_src = SET_SRC (PATTERN (old_jump));
2122 	      else if (GET_CODE (PATTERN (old_jump)) == PARALLEL)
2123 		{
2124 		  set_src = XVECEXP (PATTERN (old_jump), 0, 0);
2125 		  if (GET_CODE (set_src) == SET)
2126 		    set_src = SET_SRC (set_src);
2127 		  else
2128 		    set_src = NULL_RTX;
2129 		}
2130 	    }
2131 
2132 	  if (set_src && (GET_CODE (set_src) == IF_THEN_ELSE))
2133 	    {
2134 	      rtx_jump_insn *old_jump_insn =
2135 			as_a <rtx_jump_insn *> (old_jump);
2136 
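	      /* One arm of the IF_THEN_ELSE is (pc), i.e. the fall through;
		 the other arm is the label the conditional jump targets.  */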
2137 	      if (GET_CODE (XEXP (set_src, 1)) == PC)
2138 		old_label = XEXP (set_src, 2);
2139 	      else if (GET_CODE (XEXP (set_src, 2)) == PC)
2140 		old_label = XEXP (set_src, 1);
2141 
2142 	      /* Check to see if new bb for jumping to that dest has
2143 		 already been created; if so, use it; if not, create
2144 		 a new one.  */
2145 
2146 	      new_bb = find_jump_block (crossing_edge->dest);
2147 
2148 	      if (new_bb)
2149 		new_label = block_label (new_bb);
2150 	      else
2151 		{
2152 		  basic_block last_bb;
2153 		  rtx_code_label *old_jump_target;
2154 		  rtx_jump_insn *new_jump;
2155 
2156 		  /* Create new basic block to be dest for
2157 		     conditional jump.  */
2158 
2159 		  /* Put appropriate instructions in new bb.  */
2160 
2161 		  new_label = gen_label_rtx ();
2162 		  emit_label (new_label);
2163 
2164 		  gcc_assert (GET_CODE (old_label) == LABEL_REF);
2165 		  old_jump_target = old_jump_insn->jump_target ();
2166 		  new_jump = as_a <rtx_jump_insn *>
2167 		    (emit_jump_insn (targetm.gen_jump (old_jump_target)));
2168 		  new_jump->set_jump_target (old_jump_target);
2169 
2170 		  last_bb = EXIT_BLOCK_PTR_FOR_FN (cfun)->prev_bb;
2171 		  new_bb = create_basic_block (new_label, new_jump, last_bb);
2172 		  new_bb->aux = last_bb->aux;
2173 		  last_bb->aux = new_bb;
2174 
2175 		  emit_barrier_after_bb (new_bb);
2176 
2177 		  /* Make sure new bb is in same partition as source
2178 		     of conditional branch.  */
2179 		  BB_COPY_PARTITION (new_bb, cur_bb);
2180 		}
2181 
2182 	      /* Make old jump branch to new bb.  */
2183 
2184 	      redirect_jump (old_jump_insn, new_label, 0);
2185 
2186 	      /* Remove crossing_edge as predecessor of 'dest'.  */
2187 
2188 	      dest = crossing_edge->dest;
2189 
2190 	      redirect_edge_succ (crossing_edge, new_bb);
2191 
2192 	      /* Make a new edge from new_bb to old dest; new edge
2193 		 will be a successor for new_bb and a predecessor
2194 		 for 'dest'.  */
2195 
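	      /* If new_bb was reused from find_jump_block it already has
		 this outgoing edge; otherwise create it.  */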
2196 	      if (EDGE_COUNT (new_bb->succs) == 0)
2197 		new_edge = make_single_succ_edge (new_bb, dest, 0);
2198 	      else
2199 		new_edge = EDGE_SUCC (new_bb, 0);
2200 
2201 	      crossing_edge->flags &= ~EDGE_CROSSING;
2202 	      new_edge->flags |= EDGE_CROSSING;
2203 	    }
2204 	}
2205     }
2206 }
2207 
2208 /* Find any unconditional branches that cross between hot and cold
2209    sections.  Convert them into indirect jumps instead.  */
2210 
2211 static void
2212 fix_crossing_unconditional_branches (void)
2213 {
2214   basic_block cur_bb;
2215   rtx_insn *last_insn;
2216   rtx label;
2217   rtx label_addr;
2218   rtx_insn *indirect_jump_sequence;
2219   rtx_insn *jump_insn = NULL;
2220   rtx new_reg;
2221   rtx_insn *cur_insn;
2222   edge succ;
2223 
2224   FOR_EACH_BB_FN (cur_bb, cfun)
2225     {
2226       last_insn = BB_END (cur_bb);
2227 
2228       if (EDGE_COUNT (cur_bb->succs) < 1)
2229 	continue;
2230 
2231       succ = EDGE_SUCC (cur_bb, 0);
2232 
2233       /* Check to see if bb ends in a crossing (unconditional) jump.  At
2234 	 this point, no crossing jumps should be conditional.  */
2235 
2236       if (JUMP_P (last_insn)
2237 	  && (succ->flags & EDGE_CROSSING))
2238 	{
2239 	  gcc_assert (!any_condjump_p (last_insn));
2240 
2241 	  /* Make sure the jump is not already an indirect or table jump.  */
2242 
2243 	  if (!computed_jump_p (last_insn)
2244 	      && !tablejump_p (last_insn, NULL, NULL))
2245 	    {
2246 	      /* We have found a "crossing" unconditional branch.  Now
2247 		 we must convert it to an indirect jump.  First create
2248 		 a reference to the label, as the target for the jump.  */
2249 
2250 	      label = JUMP_LABEL (last_insn);
2251 	      label_addr = gen_rtx_LABEL_REF (Pmode, label);
2252 	      LABEL_NUSES (label) += 1;
2253 
2254 	      /* Get a register to use for the indirect jump.  */
2255 
2256 	      new_reg = gen_reg_rtx (Pmode);
2257 
2258 	      /* Generate the indirect jump sequence.  */
2259 
2260 	      start_sequence ();
2261 	      emit_move_insn (new_reg, label_addr);
2262 	      emit_indirect_jump (new_reg);
2263 	      indirect_jump_sequence = get_insns ();
2264 	      end_sequence ();
2265 
2266 	      /* Make sure every instruction in the new jump sequence has
2267 		 its basic block set to be cur_bb.  */
2268 
2269 	      for (cur_insn = indirect_jump_sequence; cur_insn;
2270 		   cur_insn = NEXT_INSN (cur_insn))
2271 		{
2272 		  if (!BARRIER_P (cur_insn))
2273 		    BLOCK_FOR_INSN (cur_insn) = cur_bb;
2274 		  if (JUMP_P (cur_insn))
2275 		    jump_insn = cur_insn;
2276 		}
2277 
2278 	      /* Insert the new (indirect) jump sequence immediately before
2279 		 the unconditional jump, then delete the unconditional jump.  */
2280 
2281 	      emit_insn_before (indirect_jump_sequence, last_insn);
2282 	      delete_insn (last_insn);
2283 
2284 	      JUMP_LABEL (jump_insn) = label;
2285 	      LABEL_NUSES (label)++;
2286 
2287 	      /* Make BB_END for cur_bb be the jump instruction (NOT the
2288 		 barrier instruction at the end of the sequence...).  */
2289 
2290 	      BB_END (cur_bb) = jump_insn;
2291 	    }
2292 	}
2293     }
2294 }
2295 
2296 /* Update CROSSING_JUMP_P flags on all jump insns.  */
2297 
2298 static void
2299 update_crossing_jump_flags (void)
2300 {
2301   basic_block bb;
2302   edge e;
2303   edge_iterator ei;
2304 
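  /* A single crossing successor edge is enough to mark the jump that
     ends the block.  */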
2305   FOR_EACH_BB_FN (bb, cfun)
2306     FOR_EACH_EDGE (e, ei, bb->succs)
2307       if (e->flags & EDGE_CROSSING)
2308 	{
2309 	  if (JUMP_P (BB_END (bb)))
2310 	    CROSSING_JUMP_P (BB_END (bb)) = 1;
2311 	  break;
2312 	}
2313 }
2314 
2315 /* Reorder basic blocks using the software trace cache (STC) algorithm.  */
2316 
2317 static void
2318 reorder_basic_blocks_software_trace_cache (void)
2319 {
2320   if (dump_file)
2321     fprintf (dump_file, "\nReordering with the STC algorithm.\n\n");
2322 
2323   int n_traces;
2324   int i;
2325   struct trace *traces;
2326 
2327   /* We are estimating the length of uncond jump insn only once since the code
2328      for getting the insn length always returns the minimal length now.  */
2329   if (uncond_jump_length == 0)
2330     uncond_jump_length = get_uncond_jump_length ();
2331 
2332   /* We need to know some information for each basic block.  */
2333   array_size = GET_ARRAY_SIZE (last_basic_block_for_fn (cfun));
2334   bbd = XNEWVEC (bbro_basic_block_data, array_size);
2335   for (i = 0; i < array_size; i++)
2336     {
2337       bbd[i].start_of_trace = -1;
2338       bbd[i].end_of_trace = -1;
2339       bbd[i].in_trace = -1;
2340       bbd[i].visited = 0;
2341       bbd[i].priority = -1;
2342       bbd[i].heap = NULL;
2343       bbd[i].node = NULL;
2344     }
2345 
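  /* Build the traces greedily, then connect them into the final order.  */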
2346   traces = XNEWVEC (struct trace, n_basic_blocks_for_fn (cfun));
2347   n_traces = 0;
2348   find_traces (&n_traces, traces);
2349   connect_traces (n_traces, traces);
2350   FREE (traces);
2351   FREE (bbd);
2352 }
2353 
2354 /* Return true if edge E1 is more desirable as a fallthrough edge than
2355    edge E2 is.  */
2356 
2357 static bool
2358 edge_order (edge e1, edge e2)
2359 {
2360   return e1->count () > e2->count ();
2361 }
2362 
2363 /* Reorder basic blocks using the "simple" algorithm.  This tries to
2364    maximize the dynamic number of branches that are fallthrough, without
2365    copying instructions.  The algorithm is greedy, looking at the most
2366    frequently executed branch first.  */
2367 
2368 static void
2369 reorder_basic_blocks_simple (void)
2370 {
2371   if (dump_file)
2372     fprintf (dump_file, "\nReordering with the \"simple\" algorithm.\n\n");
2373 
2374   edge *edges = new edge[2 * n_basic_blocks_for_fn (cfun)];
2375 
2376   /* First, collect all edges that can be optimized by reordering blocks:
2377      simple jumps and conditional jumps, as well as the function entry edge.  */
2378 
2379   int n = 0;
2380   edges[n++] = EDGE_SUCC (ENTRY_BLOCK_PTR_FOR_FN (cfun), 0);
2381 
2382   basic_block bb;
2383   FOR_EACH_BB_FN (bb, cfun)
2384     {
2385       rtx_insn *end = BB_END (bb);
2386 
2387       if (computed_jump_p (end) || tablejump_p (end, NULL, NULL))
2388 	continue;
2389 
2390       /* We cannot optimize asm goto.  */
2391       if (JUMP_P (end) && extract_asm_operands (end))
2392 	continue;
2393 
2394       if (single_succ_p (bb))
2395 	edges[n++] = EDGE_SUCC (bb, 0);
2396       else if (any_condjump_p (end))
2397 	{
2398 	  edge e0 = EDGE_SUCC (bb, 0);
2399 	  edge e1 = EDGE_SUCC (bb, 1);
2400 	  /* When optimizing for size it is best to keep the original
2401 	     fallthrough edges.  */
2402 	  if (e1->flags & EDGE_FALLTHRU)
2403 	    std::swap (e0, e1);
2404 	  edges[n++] = e0;
2405 	  edges[n++] = e1;
2406 	}
2407     }
2408 
2409   /* Sort the edges, the most desirable first.  When optimizing for size
2410      all edges are equally desirable.  */
2411 
2412   if (optimize_function_for_speed_p (cfun))
2413     std::stable_sort (edges, edges + n, edge_order);
2414 
2415   /* Now decide which of those edges to make fallthrough edges.  We set
2416      BB_VISITED if a block already has a fallthrough successor assigned
2417      to it.  We make ->AUX of an endpoint point to the opposite endpoint
2418      of a sequence of blocks that fall through, and ->AUX will be NULL
2419      for a block that is in such a sequence but not an endpoint anymore.
2420 
2421      To start with, everything points to itself, nothing is assigned yet.  */
2422 
2423   FOR_ALL_BB_FN (bb, cfun)
2424     {
2425       bb->aux = bb;
2426       bb->flags &= ~BB_VISITED;
2427     }
2428 
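  /* Clearing the exit block's AUX makes every edge into it fail the
     "dest is a current endpoint" check below, so no block is chosen to
     fall through into the exit block.  */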
2429   EXIT_BLOCK_PTR_FOR_FN (cfun)->aux = 0;
2430 
2431   /* Now for all edges, the most desirable first, see if that edge can
2432      connect two sequences.  If it can, update AUX and BB_VISITED; if it
2433      cannot, zero out the edge in the table.  */
2434 
2435   for (int j = 0; j < n; j++)
2436     {
2437       edge e = edges[j];
2438 
2439       basic_block tail_a = e->src;
2440       basic_block head_b = e->dest;
2441       basic_block head_a = (basic_block) tail_a->aux;
2442       basic_block tail_b = (basic_block) head_b->aux;
2443 
2444       /* An edge cannot connect two sequences if:
2445 	 - it crosses partitions;
2446 	 - its src is not a current endpoint;
2447 	 - its dest is not a current endpoint;
2448 	 - or, it would create a loop.  */
2449 
2450       if (e->flags & EDGE_CROSSING
2451 	  || tail_a->flags & BB_VISITED
2452 	  || !tail_b
2453 	  || (!(head_b->flags & BB_VISITED) && head_b != tail_b)
2454 	  || tail_a == tail_b)
2455 	{
2456 	  edges[j] = 0;
2457 	  continue;
2458 	}
2459 
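      /* Join the two sequences: TAIL_A and HEAD_B become interior blocks,
	 and the endpoints HEAD_A and TAIL_B now point at each other.  */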
2460       tail_a->aux = 0;
2461       head_b->aux = 0;
2462       head_a->aux = tail_b;
2463       tail_b->aux = head_a;
2464       tail_a->flags |= BB_VISITED;
2465     }
2466 
2467   /* Put the pieces together, in the same order that the start blocks of
2468      the sequences already had.  The hot/cold partitioning gives a little
2469      complication: as a first pass only do this for blocks in the same
2470      partition as the start block, and (if there is anything left to do)
2471      in a second pass handle the other partition.  */
2472 
2473   basic_block last_tail = (basic_block) ENTRY_BLOCK_PTR_FOR_FN (cfun)->aux;
2474 
2475   int current_partition
2476     = BB_PARTITION (last_tail == ENTRY_BLOCK_PTR_FOR_FN (cfun)
2477 		    ? EDGE_SUCC (ENTRY_BLOCK_PTR_FOR_FN (cfun), 0)->dest
2478 		    : last_tail);
2479   bool need_another_pass = true;
2480 
2481   for (int pass = 0; pass < 2 && need_another_pass; pass++)
2482     {
2483       need_another_pass = false;
2484 
2485       FOR_EACH_BB_FN (bb, cfun)
2486 	if ((bb->flags & BB_VISITED && bb->aux) || bb->aux == bb)
2487 	  {
2488 	    if (BB_PARTITION (bb) != current_partition)
2489 	      {
2490 		need_another_pass = true;
2491 		continue;
2492 	      }
2493 
2494 	    last_tail->aux = bb;
2495 	    last_tail = (basic_block) bb->aux;
2496 	  }
2497 
2498       current_partition ^= BB_HOT_PARTITION | BB_COLD_PARTITION;
2499     }
2500 
2501   last_tail->aux = 0;
2502 
2503   /* Finally, link all the chosen fallthrough edges.  */
2504 
2505   for (int j = 0; j < n; j++)
2506     if (edges[j])
2507       edges[j]->src->aux = edges[j]->dest;
2508 
2509   delete[] edges;
2510 
2511   /* If the entry edge no longer falls through we have to make a new
2512      block so it can do so again.  */
2513 
2514   edge e = EDGE_SUCC (ENTRY_BLOCK_PTR_FOR_FN (cfun), 0);
2515   if (e->dest != ENTRY_BLOCK_PTR_FOR_FN (cfun)->aux)
2516     {
2517       force_nonfallthru (e);
2518       e->src->aux = ENTRY_BLOCK_PTR_FOR_FN (cfun)->aux;
2519     }
2520 }
2521 
2522 /* Reorder basic blocks.  The main entry point to this file.  */
2523 
2524 static void
2525 reorder_basic_blocks (void)
2526 {
2527   gcc_assert (current_ir_type () == IR_RTL_CFGLAYOUT);
2528 
2529   if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1)
2530     return;
2531 
2532   set_edge_can_fallthru_flag ();
2533   mark_dfs_back_edges ();
2534 
2535   switch (flag_reorder_blocks_algorithm)
2536     {
2537     case REORDER_BLOCKS_ALGORITHM_SIMPLE:
2538       reorder_basic_blocks_simple ();
2539       break;
2540 
2541     case REORDER_BLOCKS_ALGORITHM_STC:
2542       reorder_basic_blocks_software_trace_cache ();
2543       break;
2544 
2545     default:
2546       gcc_unreachable ();
2547     }
2548 
2549   relink_block_chain (/*stay_in_cfglayout_mode=*/true);
2550 
2551   if (dump_file)
2552     {
2553       if (dump_flags & TDF_DETAILS)
2554 	dump_reg_info (dump_file);
2555       dump_flow_info (dump_file, dump_flags);
2556     }
2557 
2558   /* Signal that rtl_verify_flow_info_1 can now verify that there
2559      is at most one switch between hot/cold sections.  */
2560   crtl->bb_reorder_complete = true;
2561 }
2562 
2563 /* Determine which partition the first basic block in the function
2564    belongs to, then find the first basic block in the current function
2565    that belongs to a different section, and insert a
2566    NOTE_INSN_SWITCH_TEXT_SECTIONS note immediately before it in the
2567    instruction stream.  When writing out the assembly code,
2568    encountering this note will make the compiler switch between the
2569    hot and cold text sections.  */
2570 
2571 void
2572 insert_section_boundary_note (void)
2573 {
2574   basic_block bb;
2575   bool switched_sections = false;
2576   int current_partition = 0;
2577 
2578   if (!crtl->has_bb_partition)
2579     return;
2580 
2581   FOR_EACH_BB_FN (bb, cfun)
2582     {
2583       if (!current_partition)
2584 	current_partition = BB_PARTITION (bb);
2585       if (BB_PARTITION (bb) != current_partition)
2586 	{
2587 	  gcc_assert (!switched_sections);
2588           switched_sections = true;
2589           emit_note_before (NOTE_INSN_SWITCH_TEXT_SECTIONS, BB_HEAD (bb));
2590           current_partition = BB_PARTITION (bb);
2591 	}
2592     }
2593 
2594   /* Make sure crtl->has_bb_partition matches reality even if bbpart finds
2595      some hot and some cold basic blocks, but later one of those kinds is
2596      optimized away.  */
2597   crtl->has_bb_partition = switched_sections;
2598 }
2599 
2600 namespace {
2601 
2602 const pass_data pass_data_reorder_blocks =
2603 {
2604   RTL_PASS, /* type */
2605   "bbro", /* name */
2606   OPTGROUP_NONE, /* optinfo_flags */
2607   TV_REORDER_BLOCKS, /* tv_id */
2608   0, /* properties_required */
2609   0, /* properties_provided */
2610   0, /* properties_destroyed */
2611   0, /* todo_flags_start */
2612   0, /* todo_flags_finish */
2613 };
2614 
2615 class pass_reorder_blocks : public rtl_opt_pass
2616 {
2617 public:
2618   pass_reorder_blocks (gcc::context *ctxt)
2619     : rtl_opt_pass (pass_data_reorder_blocks, ctxt)
2620   {}
2621 
2622   /* opt_pass methods: */
2623   virtual bool gate (function *)
2624     {
2625       if (targetm.cannot_modify_jumps_p ())
2626 	return false;
2627       return (optimize > 0
2628 	      && (flag_reorder_blocks || flag_reorder_blocks_and_partition));
2629     }
2630 
2631   virtual unsigned int execute (function *);
2632 
2633 }; // class pass_reorder_blocks
2634 
2635 unsigned int
2636 pass_reorder_blocks::execute (function *fun)
2637 {
2638   basic_block bb;
2639 
2640   /* Last attempt to optimize CFG, as scheduling, peepholing and insn
2641      splitting possibly introduced more crossjumping opportunities.  */
2642   cfg_layout_initialize (CLEANUP_EXPENSIVE);
2643 
2644   reorder_basic_blocks ();
2645   cleanup_cfg (CLEANUP_EXPENSIVE | CLEANUP_NO_PARTITIONING);
2646 
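  /* Record the chosen block order in the AUX fields so that
     cfg_layout_finalize preserves it when leaving cfglayout mode.  */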
2647   FOR_EACH_BB_FN (bb, fun)
2648     if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
2649       bb->aux = bb->next_bb;
2650   cfg_layout_finalize ();
2651 
2652   return 0;
2653 }
2654 
2655 } // anon namespace
2656 
2657 rtl_opt_pass *
2658 make_pass_reorder_blocks (gcc::context *ctxt)
2659 {
2660   return new pass_reorder_blocks (ctxt);
2661 }
2662 
2663 /* Duplicate a block (that we already know ends in a computed jump) into its
2664    predecessors, where possible.  Return whether anything is changed.  */
2665 static bool
2666 maybe_duplicate_computed_goto (basic_block bb, int max_size)
2667 {
2668   if (single_pred_p (bb))
2669     return false;
2670 
2671   /* Make sure that the block is small enough.  */
2672   rtx_insn *insn;
2673   FOR_BB_INSNS (bb, insn)
2674     if (INSN_P (insn))
2675       {
2676 	max_size -= get_attr_min_length (insn);
2677 	if (max_size < 0)
2678 	   return false;
2679       }
2680 
2681   bool changed = false;
2682   edge e;
2683   edge_iterator ei;
2684   for (ei = ei_start (bb->preds); (e = ei_safe_edge (ei)); )
2685     {
2686       basic_block pred = e->src;
2687 
2688       /* Do not duplicate BB into PRED if that is the last predecessor, or if
2689 	 we cannot merge a copy of BB with PRED.  */
2690       if (single_pred_p (bb)
2691 	  || !single_succ_p (pred)
2692 	  || e->flags & EDGE_COMPLEX
2693 	  || pred->index < NUM_FIXED_BLOCKS
2694 	  || (JUMP_P (BB_END (pred)) && !simplejump_p (BB_END (pred)))
2695 	  || (JUMP_P (BB_END (pred)) && CROSSING_JUMP_P (BB_END (pred))))
2696 	{
2697 	  ei_next (&ei);
2698 	  continue;
2699 	}
2700 
2701       if (dump_file)
2702 	fprintf (dump_file, "Duplicating computed goto bb %d into bb %d\n",
2703 		 bb->index, e->src->index);
2704 
2705       /* Remember if PRED can be duplicated; if so, the copy of BB merged
2706 	 with PRED can be duplicated as well.  */
2707       bool can_dup_more = can_duplicate_block_p (pred);
2708 
2709       /* Make a copy of BB, merge it into PRED.  */
2710       basic_block copy = duplicate_block (bb, e, NULL);
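      /* duplicate_block redirected E to the copy, removing it from
	 BB->preds, so the edge iterator must not be advanced here.  */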
2711       emit_barrier_after_bb (copy);
2712       reorder_insns_nobb (BB_HEAD (copy), BB_END (copy), BB_END (pred));
2713       merge_blocks (pred, copy);
2714 
2715       changed = true;
2716 
2717       /* Try to merge the resulting merged PRED into further predecessors.  */
2718       if (can_dup_more)
2719 	maybe_duplicate_computed_goto (pred, max_size);
2720     }
2721 
2722   return changed;
2723 }
2724 
2725 /* Duplicate the blocks containing computed gotos.  This basically unfactors
2726    computed gotos that were factored early on in the compilation process to
2727    speed up edge based data flow.  We used to not unfactor them again, which
2728    can seriously pessimize code with many computed jumps in the source code,
2729    such as interpreters.  See e.g. PR15242.  */
2730 static void
2731 duplicate_computed_gotos (function *fun)
2732 {
2733   /* We are estimating the length of uncond jump insn only once
2734      since the code for getting the insn length always returns
2735      the minimal length now.  */
2736   if (uncond_jump_length == 0)
2737     uncond_jump_length = get_uncond_jump_length ();
2738 
2739   /* Never copy a block larger than this.  */
2740   int max_size
2741     = uncond_jump_length * PARAM_VALUE (PARAM_MAX_GOTO_DUPLICATION_INSNS);
2742 
2743   bool changed = false;
2744 
2745   /* Try to duplicate all blocks that end in a computed jump and that
2746      can be duplicated at all.  */
2747   basic_block bb;
2748   FOR_EACH_BB_FN (bb, fun)
2749     if (computed_jump_p (BB_END (bb)) && can_duplicate_block_p (bb))
2750       changed |= maybe_duplicate_computed_goto (bb, max_size);
2751 
2752   /* Duplicating blocks will redirect edges and may cause hot blocks
2753     previously reached by both hot and cold blocks to become dominated
2754     only by cold blocks.  */
2755   if (changed)
2756     fixup_partitions ();
2757 }
2758 
2759 namespace {
2760 
2761 const pass_data pass_data_duplicate_computed_gotos =
2762 {
2763   RTL_PASS, /* type */
2764   "compgotos", /* name */
2765   OPTGROUP_NONE, /* optinfo_flags */
2766   TV_REORDER_BLOCKS, /* tv_id */
2767   0, /* properties_required */
2768   0, /* properties_provided */
2769   0, /* properties_destroyed */
2770   0, /* todo_flags_start */
2771   0, /* todo_flags_finish */
2772 };
2773 
2774 class pass_duplicate_computed_gotos : public rtl_opt_pass
2775 {
2776 public:
2777   pass_duplicate_computed_gotos (gcc::context *ctxt)
2778     : rtl_opt_pass (pass_data_duplicate_computed_gotos, ctxt)
2779   {}
2780 
2781   /* opt_pass methods: */
2782   virtual bool gate (function *);
2783   virtual unsigned int execute (function *);
2784 
2785 }; // class pass_duplicate_computed_gotos
2786 
2787 bool
2788 pass_duplicate_computed_gotos::gate (function *fun)
2789 {
2790   if (targetm.cannot_modify_jumps_p ())
2791     return false;
2792   return (optimize > 0
2793 	  && flag_expensive_optimizations
2794 	  && ! optimize_function_for_size_p (fun));
2795 }
2796 
2797 unsigned int
2798 pass_duplicate_computed_gotos::execute (function *fun)
2799 {
2800   duplicate_computed_gotos (fun);
2801 
2802   return 0;
2803 }
2804 
2805 } // anon namespace
2806 
2807 rtl_opt_pass *
2808 make_pass_duplicate_computed_gotos (gcc::context *ctxt)
2809 {
2810   return new pass_duplicate_computed_gotos (ctxt);
2811 }
2812 
2813 /* This function is the main 'entrance' for the optimization that
2814    partitions hot and cold basic blocks into separate sections of the
2815    .o file (to improve performance and cache locality).  Ideally it
2816    would be called after all optimizations that rearrange the CFG have
2817    been called.  However part of this optimization may introduce new
2818    register usage, so it must be called before register allocation has
2819    occurred.  This means that this optimization is actually called
2820    well before the optimization that reorders basic blocks (see
2821    function above).
2822 
2823    This optimization checks the feedback information to determine
2824    which basic blocks are hot/cold and updates flags on the basic blocks
2825    to indicate which section they belong in.  This information is
2826    later used for writing out sections in the .o file.  Because hot
2827    and cold sections can be arbitrarily large (within the bounds of
2828    memory), far beyond the size of a single function, it is necessary
2829    to fix up all edges that cross section boundaries, to make sure the
2830    instructions used can actually span the required distance.  The
2831    fixes are described below.
2832 
2833    Fall-through edges must be changed into jumps; it is not safe or
2834    legal to fall through across a section boundary.  Whenever a
2835    fall-through edge crossing a section boundary is encountered, a new
2836    basic block is inserted (in the same section as the fall-through
2837    source), and the fall through edge is redirected to the new basic
2838    block.  The new basic block contains an unconditional jump to the
2839    original fall-through target.  (If the unconditional jump is
2840    insufficient to cross section boundaries, that is dealt with a
2841    little later, see below).
2842 
2843    In order to deal with architectures that have short conditional
2844    branches (which cannot span all of memory) we take any conditional
2845    jump that attempts to cross a section boundary and add a level of
2846    indirection: it becomes a conditional jump to a new basic block, in
2847    the same section.  The new basic block contains an unconditional
2848    jump to the original target, in the other section.
2849 
2850    For those architectures whose unconditional branch is also
2851    incapable of reaching all of memory, those unconditional jumps are
2852    converted into indirect jumps, through a register.
2853 
2854    IMPORTANT NOTE: This optimization causes some messy interactions
2855    with the cfg cleanup optimizations; those optimizations want to
2856    merge blocks wherever possible, and to collapse indirect jump
2857    sequences (change "A jumps to B jumps to C" directly into "A jumps
2858    to C").  Those optimizations can undo the jump fixes that
2859    partitioning is required to make (see above), in order to ensure
2860    that jumps attempting to cross section boundaries are really able
2861    to cover whatever distance the jump requires (on many architectures
2862    conditional or unconditional jumps are not able to reach all of
2863    memory).  Therefore tests have to be inserted into each such
2864    optimization to make sure that it does not undo stuff necessary to
2865    cross partition boundaries.  This would be much less of a problem
2866    if we could perform this optimization later in the compilation, but
2867    unfortunately the fact that we may need to create indirect jumps
2868    (through registers) requires that this optimization be performed
2869    before register allocation.
2870 
2871    Hot and cold basic blocks are partitioned and put in separate
2872    sections of the .o file, to reduce paging and improve cache
2873    performance (hopefully).  This can result in bits of code from the
2874    same function being widely separated in the .o file.  However this
2875    is not obvious to the current bb structure.  Therefore we must take
2876    care to ensure that: 1). There are no fall_thru edges that cross
2877    between sections; 2). For those architectures which have "short"
2878    conditional branches, all conditional branches that attempt to
2879    cross between sections are converted to unconditional branches;
2880    and, 3). For those architectures which have "short" unconditional
2881    branches, all unconditional branches that attempt to cross between
2882    sections are converted to indirect jumps.
2883 
2884    The code for fixing up fall_thru edges that cross between hot and
2885    cold basic blocks does so by creating new basic blocks containing
2886    unconditional branches to the appropriate label in the "other"
2887    section.  The new basic block is then put in the same (hot or cold)
2888    section as the original conditional branch, and the fall_thru edge
2889    is modified to fall into the new basic block instead.  By adding
2890    this level of indirection we end up with only unconditional branches
2891    crossing between hot and cold sections.
2892 
2893    Conditional branches are dealt with by adding a level of indirection.
2894    A new basic block is added in the same (hot/cold) section as the
2895    conditional branch, and the conditional branch is retargeted to the
2896    new basic block.  The new basic block contains an unconditional branch
2897    to the original target of the conditional branch (in the other section).
2898 
2899    Unconditional branches are dealt with by converting them into
2900    indirect jumps.  */
2901 
2902 namespace {
2903 
2904 const pass_data pass_data_partition_blocks =
2905 {
2906   RTL_PASS, /* type */
2907   "bbpart", /* name */
2908   OPTGROUP_NONE, /* optinfo_flags */
2909   TV_REORDER_BLOCKS, /* tv_id */
2910   PROP_cfglayout, /* properties_required */
2911   0, /* properties_provided */
2912   0, /* properties_destroyed */
2913   0, /* todo_flags_start */
2914   0, /* todo_flags_finish */
2915 };
2916 
2917 class pass_partition_blocks : public rtl_opt_pass
2918 {
2919 public:
2920   pass_partition_blocks (gcc::context *ctxt)
2921     : rtl_opt_pass (pass_data_partition_blocks, ctxt)
2922   {}
2923 
2924   /* opt_pass methods: */
2925   virtual bool gate (function *);
2926   virtual unsigned int execute (function *);
2927 
2928 }; // class pass_partition_blocks
2929 
2930 bool
2931 pass_partition_blocks::gate (function *fun)
2932 {
2933   /* The optimization to partition hot/cold basic blocks into separate
2934      sections of the .o file does not work well with linkonce, with
2935      user-defined section attributes, or with the naked attribute.  Don't
2936      call it if any of these cases arises.  */
2937   return (flag_reorder_blocks_and_partition
2938 	  && optimize
2939 	  /* See pass_reorder_blocks::gate.  We should not partition if
2940 	     we are going to omit the reordering.  */
2941 	  && optimize_function_for_speed_p (fun)
2942 	  && !DECL_COMDAT_GROUP (current_function_decl)
2943 	  && !lookup_attribute ("section", DECL_ATTRIBUTES (fun->decl))
2944 	  && !lookup_attribute ("naked", DECL_ATTRIBUTES (fun->decl))
2945 	  /* Workaround a bug in GDB where read_partial_die doesn't cope
2946 	     with DIEs with DW_AT_ranges, see PR81115.  */
2947 	  && !(in_lto_p && MAIN_NAME_P (DECL_NAME (fun->decl))));
2948 }
2949 
2950 unsigned
2951 pass_partition_blocks::execute (function *fun)
2952 {
2953   vec<edge> crossing_edges;
2954 
2955   if (n_basic_blocks_for_fn (fun) <= NUM_FIXED_BLOCKS + 1)
2956     return 0;
2957 
2958   df_set_flags (DF_DEFER_INSN_RESCAN);
2959 
2960   crossing_edges = find_rarely_executed_basic_blocks_and_crossing_edges ();
2961   if (!crossing_edges.exists ())
2962     /* Make sure to process deferred rescans and clear changeable df flags.  */
2963     return TODO_df_finish;
2964 
2965   crtl->has_bb_partition = true;
2966 
2967   /* Make sure the source of any crossing edge ends in a jump and the
2968      destination of any crossing edge has a label.  */
2969   add_labels_and_missing_jumps (crossing_edges);
2970 
2971   /* Convert all crossing fall_thru edges to non-crossing fall
2972      thrus to unconditional jumps (that jump to the original fall
2973      through dest).  */
2974   fix_up_fall_thru_edges ();
2975 
2976   /* If the architecture does not have conditional branches that can
2977      span all of memory, convert crossing conditional branches into
2978      crossing unconditional branches.  */
2979   if (!HAS_LONG_COND_BRANCH)
2980     fix_crossing_conditional_branches ();
2981 
2982   /* If the architecture does not have unconditional branches that
2983      can span all of memory, convert crossing unconditional branches
2984      into indirect jumps.  Since adding an indirect jump also adds
2985      a new register usage, update the register usage information as
2986      well.  */
2987   if (!HAS_LONG_UNCOND_BRANCH)
2988     fix_crossing_unconditional_branches ();
2989 
2990   update_crossing_jump_flags ();
2991 
2992   /* Clear bb->aux fields that the above routines were using.  */
2993   clear_aux_for_blocks ();
2994 
2995   crossing_edges.release ();
2996 
2997   /* ??? FIXME: DF generates the bb info for a block immediately.
2998      And by immediately, I mean *during* creation of the block.
2999 
3000 	#0  df_bb_refs_collect
3001 	#1  in df_bb_refs_record
3002 	#2  in create_basic_block_structure
3003 
3004      Which means that the bb_has_eh_pred test in df_bb_refs_collect
3005      will *always* fail, because no edges can have been added to the
3006      block yet.  Which of course means we don't add the right
3007      artificial refs, which means we fail df_verify (much) later.
3008 
3009      Cleanest solution would seem to make DF_DEFER_INSN_RESCAN imply
3010      that we also shouldn't grab data from the new blocks those new
3011      insns are in either.  In this way one can create the block, link
3012      it up properly, and have everything Just Work later, when deferred
3013      insns are processed.
3014 
3015      In the meantime, we have no other option but to throw away all
3016      of the DF data and recompute it all.  */
3017   if (fun->eh->lp_array)
3018     {
3019       df_finish_pass (true);
3020       df_scan_alloc (NULL);
3021       df_scan_blocks ();
3022       /* Not all post-landing pads use all of the EH_RETURN_DATA_REGNO
3023 	 data.  We blindly generated all of them when creating the new
3024 	 landing pad.  Delete those assignments we don't use.  */
3025       df_set_flags (DF_LR_RUN_DCE);
3026       df_analyze ();
3027     }
3028 
3029   /* Make sure to process deferred rescans and clear changeable df flags.  */
3030   return TODO_df_finish;
3031 }
3032 
3033 } // anon namespace
3034 
3035 rtl_opt_pass *
3036 make_pass_partition_blocks (gcc::context *ctxt)
3037 {
3038   return new pass_partition_blocks (ctxt);
3039 }
3040