1 /* Basic block reordering routines for the GNU compiler.
2    Copyright (C) 2000-2016 Free Software Foundation, Inc.
3 
4    This file is part of GCC.
5 
6    GCC is free software; you can redistribute it and/or modify it
7    under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    GCC is distributed in the hope that it will be useful, but WITHOUT
12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
14    License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with GCC; see the file COPYING3.  If not see
18    <http://www.gnu.org/licenses/>.  */
19 
20 /* This file contains the "reorder blocks" pass, which changes the control
21    flow of a function to encounter fewer branches; the "partition blocks"
22    pass, which divides the basic blocks into "hot" and "cold" partitions,
23    which are kept separate; and the "duplicate computed gotos" pass, which
24    duplicates blocks ending in an indirect jump.
25 
26    There are two algorithms for "reorder blocks": the "simple" algorithm,
27    which just rearranges blocks, trying to minimize the number of executed
28    unconditional branches; and the "software trace cache" algorithm, which
29    also copies code, and in general tries a lot harder to have long linear
30    pieces of machine code executed.  This algorithm is described next.  */
31 
32 /* This (greedy) algorithm constructs traces in several rounds.
33    The construction starts from "seeds".  The seed for the first round
34    is the entry point of the function.  When there is more than one seed,
35    the one with the lowest key in the heap is selected first (see bb_to_key).
36    Then the algorithm repeatedly adds the most probable successor to the end
37    of a trace.  Finally it connects the traces.
38 
39    There are two parameters: Branch Threshold and Exec Threshold.
40    If the probability of an edge to a successor of the current basic block is
41    lower than Branch Threshold or its frequency is lower than Exec Threshold,
42    then the successor will be the seed in one of the next rounds.
43    Each round uses lower values of these parameters than the previous one.
44    The last round has to have these parameters set to zero so that the
45    remaining blocks are picked up.
46 
47    The algorithm selects the most probable successor from all unvisited
48    successors and successors that have been added to this trace.
49    The other successors (that have not been "sent" to the next round) will be
50    other seeds for this round and the secondary traces will start from them.
51    If the successor has not been visited in this trace, it is added to the
52    trace (however, there is some heuristic for simple branches).
53    If the successor has been visited in this trace, a loop has been found.
54    If the loop has many iterations, the loop is rotated so that the source
55    block of the most probable edge going out of the loop is the last block
56    of the trace.
57    If the loop has few iterations and there is no edge from the last block of
58    the loop going out of the loop, the loop header is duplicated.
59 
60    When connecting traces, the algorithm first checks whether there is an edge
61    from the last block of a trace to the first block of another trace.
62    When there are still some unconnected traces it checks whether there exists
63    a basic block BB such that BB is a successor of the last block of a trace
64    and BB is a predecessor of the first block of another trace.  In this case,
65    BB is duplicated, added at the end of the first trace and the traces are
66    connected through it.
67    The remaining traces are simply concatenated, so there will be a jump to
68    the beginning of each remaining trace.
69 
70    The above description is for the full algorithm, which is used when the
71    function is optimized for speed.  When the function is optimized for size,
72    in order to reduce long jumps and connect more fallthru edges, the
73    algorithm is modified as follows:
74    (1) Break long traces into short ones.  A trace is broken at a block that has
75    multiple predecessors/successors during trace discovery.  When connecting
76    traces, only connect Trace n with Trace n + 1.  This change reduces most
77    long jumps compared with the above algorithm.
78    (2) Ignore the edge probability and frequency for fallthru edges.
79    (3) Keep the original order of blocks when there is no chance to fall
80    through.  We rely on the results of cfg_cleanup.
81 
82    To implement the change for code size optimization, the block's index is
83    selected as the key and all traces are found in one round.
84 
85    References:
86 
87    "Software Trace Cache"
88    A. Ramirez, J. Larriba-Pey, C. Navarro, J. Torrellas and M. Valero; 1999
89    http://citeseer.nj.nec.com/15361.html
90 
91 */
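
/* As an illustration only (not part of the pass itself), the greedy trace
   formation described above can be sketched roughly like this, assuming
   hypothetical helpers heap_empty, heap_extract_min, too_cold_for_this_round,
   push_to_next_round, add_to_trace, best_suitable_successor and
   visited_in_this_trace:

       while (!heap_empty (heap))
	 {
	   basic_block bb = heap_extract_min (heap);
	   if (too_cold_for_this_round (bb))
	     {
	       push_to_next_round (bb);
	       continue;
	     }
	   do
	     {
	       add_to_trace (trace, bb);
	       edge e = best_suitable_successor (bb);
	       if (!e)
		 break;			/* The trace ends here.  */
	       bb = e->dest;
	     }
	   while (!visited_in_this_trace (bb));
	 }

   The real implementation (find_traces and find_traces_1_round below) adds
   the threshold tests, loop rotation, block duplication and the final
   trace-connection step on top of this skeleton.  */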
92 
93 #include "config.h"
94 #define INCLUDE_ALGORITHM /* stable_sort */
95 #include "system.h"
96 #include "coretypes.h"
97 #include "backend.h"
98 #include "target.h"
99 #include "rtl.h"
100 #include "tree.h"
101 #include "cfghooks.h"
102 #include "df.h"
103 #include "optabs.h"
104 #include "regs.h"
105 #include "emit-rtl.h"
106 #include "output.h"
107 #include "expr.h"
108 #include "params.h"
109 #include "toplev.h" /* user_defined_section_attribute */
110 #include "tree-pass.h"
111 #include "cfgrtl.h"
112 #include "cfganal.h"
113 #include "cfgbuild.h"
114 #include "cfgcleanup.h"
115 #include "bb-reorder.h"
116 #include "except.h"
117 #include "fibonacci_heap.h"
118 
119 /* The number of rounds.  In most cases there will only be 4 rounds, but
120    when partitioning hot and cold basic blocks into separate sections of
121    the object file there will be an extra round.  */
122 #define N_ROUNDS 5
123 
124 struct target_bb_reorder default_target_bb_reorder;
125 #if SWITCHABLE_TARGET
126 struct target_bb_reorder *this_target_bb_reorder = &default_target_bb_reorder;
127 #endif
128 
129 #define uncond_jump_length \
130   (this_target_bb_reorder->x_uncond_jump_length)
131 
132 /* Branch thresholds in thousandths (per mille) of the REG_BR_PROB_BASE.  */
133 static const int branch_threshold[N_ROUNDS] = {400, 200, 100, 0, 0};
134 
135 /* Exec thresholds in thousandths (per mille) of the frequency of bb 0.  */
136 static const int exec_threshold[N_ROUNDS] = {500, 200, 50, 0, 0};
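
/* As a worked example of the thresholds above (assuming the usual
   REG_BR_PROB_BASE of 10000): in round 0 an edge is only followed if its
   probability is at least 4000/10000 (40%) and its destination's frequency
   is at least 50% of the hottest entry block; round 2 relaxes this to 10%
   and 5%, and the final rounds use 0 so that every remaining block is
   eventually picked up.  */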
137 
138 /* If an edge's frequency is lower than DUPLICATION_THRESHOLD per mille of the
139    entry block's frequency, its destination is not duplicated while connecting traces.  */
140 #define DUPLICATION_THRESHOLD 100
141 
142 typedef fibonacci_heap <long, basic_block_def> bb_heap_t;
143 typedef fibonacci_node <long, basic_block_def> bb_heap_node_t;
144 
145 /* Structure to hold needed information for each basic block.  */
146 struct bbro_basic_block_data
147 {
148   /* Which trace is the bb start of (-1 means it is not a start of any).  */
149   int start_of_trace;
150 
151   /* Which trace is the bb end of (-1 means it is not an end of any).  */
152   int end_of_trace;
153 
154   /* Which trace is the bb in?  */
155   int in_trace;
156 
157   /* Which trace was this bb visited in?  */
158   int visited;
159 
160   /* Cached maximum frequency of interesting incoming edges.
161      Minus one means not yet computed.  */
162   int priority;
163 
164   /* Which heap is BB in (if any)?  */
165   bb_heap_t *heap;
166 
167   /* Which heap node is BB in (if any)?  */
168   bb_heap_node_t *node;
169 };
170 
171 /* The current size of the following dynamic array.  */
172 static int array_size;
173 
174 /* The array which holds needed information for basic blocks.  */
175 static bbro_basic_block_data *bbd;
176 
177 /* To avoid frequent reallocation the size of the array is greater than needed;
178    the number of elements is (not less than) 1.25 * size_wanted.  */
179 #define GET_ARRAY_SIZE(X) ((((X) / 4) + 1) * 5)
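
/* For example, GET_ARRAY_SIZE (100) == ((100 / 4) + 1) * 5 == 130, i.e.
   roughly 1.25 * 100, rounded to a multiple of 5.  */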
180 
181 /* Free the memory and set the pointer to NULL.  */
182 #define FREE(P) (gcc_assert (P), free (P), P = 0)
183 
184 /* Structure for holding information about a trace.  */
185 struct trace
186 {
187   /* First and last basic block of the trace.  */
188   basic_block first, last;
189 
190   /* The round of the STC creation which this trace was found in.  */
191   int round;
192 
193   /* The length (i.e. the number of basic blocks) of the trace.  */
194   int length;
195 };
196 
197 /* Maximum frequency and count of one of the entry blocks.  */
198 static int max_entry_frequency;
199 static gcov_type max_entry_count;
200 
201 /* Local function prototypes.  */
202 static void find_traces (int *, struct trace *);
203 static basic_block rotate_loop (edge, struct trace *, int);
204 static void mark_bb_visited (basic_block, int);
205 static void find_traces_1_round (int, int, gcov_type, struct trace *, int *,
206 				 int, bb_heap_t **, int);
207 static basic_block copy_bb (basic_block, edge, basic_block, int);
208 static long bb_to_key (basic_block);
209 static bool better_edge_p (const_basic_block, const_edge, int, int, int, int,
210 			   const_edge);
211 static bool connect_better_edge_p (const_edge, bool, int, const_edge,
212 				   struct trace *);
213 static void connect_traces (int, struct trace *);
214 static bool copy_bb_p (const_basic_block, int);
215 static bool push_to_next_round_p (const_basic_block, int, int, int, gcov_type);
216 
217 /* Return the trace number in which BB was visited.  */
218 
219 static int
220 bb_visited_trace (const_basic_block bb)
221 {
222   gcc_assert (bb->index < array_size);
223   return bbd[bb->index].visited;
224 }
225 
226 /* Mark BB as visited in trace number TRACE.  */
227 
228 static void
229 mark_bb_visited (basic_block bb, int trace)
230 {
231   bbd[bb->index].visited = trace;
232   if (bbd[bb->index].heap)
233     {
234       bbd[bb->index].heap->delete_node (bbd[bb->index].node);
235       bbd[bb->index].heap = NULL;
236       bbd[bb->index].node = NULL;
237     }
238 }
239 
240 /* Check to see if bb should be pushed into the next round of trace
241    collections or not.  Reasons for pushing the block forward are:
242    1) the block is cold, we are doing partitioning, and there will be
243    another round (cold partition blocks are not supposed to be
244    collected into traces until the very last round); or 2) there will
245    be another round, and the basic block is not "hot enough" for the
246    current round of trace collection.  */
247 
248 static bool
249 push_to_next_round_p (const_basic_block bb, int round, int number_of_rounds,
250 		      int exec_th, gcov_type count_th)
251 {
252   bool there_exists_another_round;
253   bool block_not_hot_enough;
254 
255   there_exists_another_round = round < number_of_rounds - 1;
256 
257   block_not_hot_enough = (bb->frequency < exec_th
258 			  || bb->count < count_th
259 			  || probably_never_executed_bb_p (cfun, bb));
260 
261   if (there_exists_another_round
262       && block_not_hot_enough)
263     return true;
264   else
265     return false;
266 }
267 
268 /* Find the traces for Software Trace Cache.  Chain each trace through
269    RBI()->next.  Store the number of traces in *N_TRACES and the descriptions
270    of the traces in TRACES.  */
271 
272 static void
273 find_traces (int *n_traces, struct trace *traces)
274 {
275   int i;
276   int number_of_rounds;
277   edge e;
278   edge_iterator ei;
279   bb_heap_t *heap = new bb_heap_t (LONG_MIN);
280 
281   /* Add one extra round of trace collection when partitioning hot/cold
282      basic blocks into separate sections.  The last round is for all the
283      cold blocks (and ONLY the cold blocks).  */
284 
285   number_of_rounds = N_ROUNDS - 1;
286 
287   /* Insert entry points of function into heap.  */
288   max_entry_frequency = 0;
289   max_entry_count = 0;
290   FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR_FOR_FN (cfun)->succs)
291     {
292       bbd[e->dest->index].heap = heap;
293       bbd[e->dest->index].node = heap->insert (bb_to_key (e->dest), e->dest);
294       if (e->dest->frequency > max_entry_frequency)
295 	max_entry_frequency = e->dest->frequency;
296       if (e->dest->count > max_entry_count)
297 	max_entry_count = e->dest->count;
298     }
299 
300   /* Find the traces.  */
301   for (i = 0; i < number_of_rounds; i++)
302     {
303       gcov_type count_threshold;
304 
305       if (dump_file)
306 	fprintf (dump_file, "STC - round %d\n", i + 1);
307 
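      /* Scale the count threshold for this round; the second branch avoids
	 overflow when max_entry_count is very large.  */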
308       if (max_entry_count < INT_MAX / 1000)
309 	count_threshold = max_entry_count * exec_threshold[i] / 1000;
310       else
311 	count_threshold = max_entry_count / 1000 * exec_threshold[i];
312 
313       find_traces_1_round (REG_BR_PROB_BASE * branch_threshold[i] / 1000,
314 			   max_entry_frequency * exec_threshold[i] / 1000,
315 			   count_threshold, traces, n_traces, i, &heap,
316 			   number_of_rounds);
317     }
318   delete heap;
319 
320   if (dump_file)
321     {
322       for (i = 0; i < *n_traces; i++)
323 	{
324 	  basic_block bb;
325 	  fprintf (dump_file, "Trace %d (round %d):  ", i + 1,
326 		   traces[i].round + 1);
327 	  for (bb = traces[i].first;
328 	       bb != traces[i].last;
329 	       bb = (basic_block) bb->aux)
330 	    fprintf (dump_file, "%d [%d] ", bb->index, bb->frequency);
331 	  fprintf (dump_file, "%d [%d]\n", bb->index, bb->frequency);
332 	}
333       fflush (dump_file);
334     }
335 }
336 
337 /* Rotate the loop whose back edge is BACK_EDGE, which lies in the tail of
338    trace TRACE (with sequential number TRACE_N).  */
339 
340 static basic_block
341 rotate_loop (edge back_edge, struct trace *trace, int trace_n)
342 {
343   basic_block bb;
344 
345   /* Information about the best end (end after rotation) of the loop.  */
346   basic_block best_bb = NULL;
347   edge best_edge = NULL;
348   int best_freq = -1;
349   gcov_type best_count = -1;
350   /* The best edge is preferred when its destination is not visited yet
351      or is a start block of some trace.  */
352   bool is_preferred = false;
353 
354   /* Find the most frequent edge that goes out from the current trace.  */
355   bb = back_edge->dest;
356   do
357     {
358       edge e;
359       edge_iterator ei;
360 
361       FOR_EACH_EDGE (e, ei, bb->succs)
362 	if (e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
363 	    && bb_visited_trace (e->dest) != trace_n
364 	    && (e->flags & EDGE_CAN_FALLTHRU)
365 	    && !(e->flags & EDGE_COMPLEX))
366 	{
367 	  if (is_preferred)
368 	    {
369 	      /* The best edge is preferred.  */
370 	      if (!bb_visited_trace (e->dest)
371 		  || bbd[e->dest->index].start_of_trace >= 0)
372 		{
373 		  /* The current edge E is also preferred.  */
374 		  int freq = EDGE_FREQUENCY (e);
375 		  if (freq > best_freq || e->count > best_count)
376 		    {
377 		      best_freq = freq;
378 		      best_count = e->count;
379 		      best_edge = e;
380 		      best_bb = bb;
381 		    }
382 		}
383 	    }
384 	  else
385 	    {
386 	      if (!bb_visited_trace (e->dest)
387 		  || bbd[e->dest->index].start_of_trace >= 0)
388 		{
389 		  /* The current edge E is preferred.  */
390 		  is_preferred = true;
391 		  best_freq = EDGE_FREQUENCY (e);
392 		  best_count = e->count;
393 		  best_edge = e;
394 		  best_bb = bb;
395 		}
396 	      else
397 		{
398 		  int freq = EDGE_FREQUENCY (e);
399 		  if (!best_edge || freq > best_freq || e->count > best_count)
400 		    {
401 		      best_freq = freq;
402 		      best_count = e->count;
403 		      best_edge = e;
404 		      best_bb = bb;
405 		    }
406 		}
407 	    }
408 	}
409       bb = (basic_block) bb->aux;
410     }
411   while (bb != back_edge->dest);
412 
413   if (best_bb)
414     {
415       /* Rotate the loop so that the BEST_EDGE goes out from the last block of
416 	 the trace.  */
417       if (back_edge->dest == trace->first)
418 	{
419 	  trace->first = (basic_block) best_bb->aux;
420 	}
421       else
422 	{
423 	  basic_block prev_bb;
424 
425 	  for (prev_bb = trace->first;
426 	       prev_bb->aux != back_edge->dest;
427 	       prev_bb = (basic_block) prev_bb->aux)
428 	    ;
429 	  prev_bb->aux = best_bb->aux;
430 
431 	  /* Try to get rid of uncond jump to cond jump.  */
432 	  if (single_succ_p (prev_bb))
433 	    {
434 	      basic_block header = single_succ (prev_bb);
435 
436 	      /* Duplicate HEADER if it is a small block containing a cond jump
437 		 at the end.  */
438 	      if (any_condjump_p (BB_END (header)) && copy_bb_p (header, 0)
439 		  && !CROSSING_JUMP_P (BB_END (header)))
440 		copy_bb (header, single_succ_edge (prev_bb), prev_bb, trace_n);
441 	    }
442 	}
443     }
444   else
445     {
446       /* We have not found a suitable loop tail, so do not rotate.  */
447       best_bb = back_edge->src;
448     }
449   best_bb->aux = NULL;
450   return best_bb;
451 }
452 
453 /* One round of finding traces.  Find traces for BRANCH_TH and EXEC_TH, i.e. do
454    not include in traces basic blocks whose probability is lower than BRANCH_TH,
455    whose frequency is lower than EXEC_TH, or whose count is lower than COUNT_TH.
456    Store the new traces in TRACES and modify the number of traces *N_TRACES.
457    Set the round (which the trace belongs to) to ROUND.  The function expects
458    the starting basic blocks to be in *HEAP; it will delete *HEAP and store the
459    starting points for the next round in a new *HEAP.  */
460 
461 static void
462 find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
463 		     struct trace *traces, int *n_traces, int round,
464 		     bb_heap_t **heap, int number_of_rounds)
465 {
466   /* Heap for discarded basic blocks which are possible starting points for
467      the next round.  */
468   bb_heap_t *new_heap = new bb_heap_t (LONG_MIN);
469   bool for_size = optimize_function_for_size_p (cfun);
470 
471   while (!(*heap)->empty ())
472     {
473       basic_block bb;
474       struct trace *trace;
475       edge best_edge, e;
476       long key;
477       edge_iterator ei;
478 
479       bb = (*heap)->extract_min ();
480       bbd[bb->index].heap = NULL;
481       bbd[bb->index].node = NULL;
482 
483       if (dump_file)
484 	fprintf (dump_file, "Getting bb %d\n", bb->index);
485 
486       /* If the BB's frequency is too low, send BB to the next round.  When
487 	 partitioning hot/cold blocks into separate sections, make sure all
488 	 the cold blocks (and ONLY the cold blocks) go into the (extra) final
489 	 round.  When optimizing for size, do not push to next round.  */
490 
491       if (!for_size
492 	  && push_to_next_round_p (bb, round, number_of_rounds, exec_th,
493 				   count_th))
494 	{
495 	  int key = bb_to_key (bb);
496 	  bbd[bb->index].heap = new_heap;
497 	  bbd[bb->index].node = new_heap->insert (key, bb);
498 
499 	  if (dump_file)
500 	    fprintf (dump_file,
501 		     "  Possible start point of next round: %d (key: %d)\n",
502 		     bb->index, key);
503 	  continue;
504 	}
505 
506       trace = traces + *n_traces;
507       trace->first = bb;
508       trace->round = round;
509       trace->length = 0;
510       bbd[bb->index].in_trace = *n_traces;
511       (*n_traces)++;
512 
513       do
514 	{
515 	  int prob, freq;
516 	  bool ends_in_call;
517 
518 	  /* The probability and frequency of the best edge.  */
519 	  int best_prob = INT_MIN / 2;
520 	  int best_freq = INT_MIN / 2;
521 
522 	  best_edge = NULL;
523 	  mark_bb_visited (bb, *n_traces);
524 	  trace->length++;
525 
526 	  if (dump_file)
527 	    fprintf (dump_file, "Basic block %d was visited in trace %d\n",
528 		     bb->index, *n_traces - 1);
529 
530 	  ends_in_call = block_ends_with_call_p (bb);
531 
532 	  /* Select the successor that will be placed after BB.  */
533 	  FOR_EACH_EDGE (e, ei, bb->succs)
534 	    {
535 	      gcc_assert (!(e->flags & EDGE_FAKE));
536 
537 	      if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
538 		continue;
539 
540 	      if (bb_visited_trace (e->dest)
541 		  && bb_visited_trace (e->dest) != *n_traces)
542 		continue;
543 
544 	      if (BB_PARTITION (e->dest) != BB_PARTITION (bb))
545 		continue;
546 
547 	      prob = e->probability;
548 	      freq = e->dest->frequency;
549 
550 	      /* The only sensible preference for a call instruction is the
551 		 fallthru edge.  Don't bother selecting anything else.  */
552 	      if (ends_in_call)
553 		{
554 		  if (e->flags & EDGE_CAN_FALLTHRU)
555 		    {
556 		      best_edge = e;
557 		      best_prob = prob;
558 		      best_freq = freq;
559 		    }
560 		  continue;
561 		}
562 
563 	      /* Skip an edge that cannot be a fallthru, or an improbable or
564 		 infrequent successor (i.e. an unsuitable successor).  When
565 		 optimizing for size, ignore the probability and frequency.  */
566 	      if (!(e->flags & EDGE_CAN_FALLTHRU) || (e->flags & EDGE_COMPLEX)
567 		  || ((prob < branch_th || EDGE_FREQUENCY (e) < exec_th
568 		      || e->count < count_th) && (!for_size)))
569 		continue;
570 
571 	      /* If partitioning hot/cold basic blocks, don't consider edges
572 		 that cross section boundaries.  */
573 
574 	      if (better_edge_p (bb, e, prob, freq, best_prob, best_freq,
575 				 best_edge))
576 		{
577 		  best_edge = e;
578 		  best_prob = prob;
579 		  best_freq = freq;
580 		}
581 	    }
582 
583 	  /* If the best destination has multiple predecessors, and can be
584 	     duplicated cheaper than a jump, don't allow it to be added
585 	     to a trace.  We'll duplicate it when connecting traces.  */
586 	  if (best_edge && EDGE_COUNT (best_edge->dest->preds) >= 2
587 	      && copy_bb_p (best_edge->dest, 0))
588 	    best_edge = NULL;
589 
590 	  /* If the best destination has multiple successors or predecessors,
591 	     don't allow it to be added when optimizing for size.  This makes
592 	     sure predecessors with a smaller index are handled before the best
593 	     destination.  It breaks long traces and reduces long jumps.
594 
595 	     Take if-then-else as an example.
596 		A
597 	       / \
598 	      B   C
599 	       \ /
600 		D
601 	     If we do not remove the best edge B->D/C->D, the final order might
602 	     be A B D ... C.  C is at the end of the program.  If D's successors
603 	     and D are complicated, long jumps might be needed for A->C and C->D.
604 	     A similar issue arises for the order A C D ... B.
605 
606 	     After removing the best edge, the final result will be A B C D or
607 	     A C B D.  It does not add a jump compared with the previous order,
608 	     but it reduces the possibility of long jumps.  */
609 	  if (best_edge && for_size
610 	      && (EDGE_COUNT (best_edge->dest->succs) > 1
611 		 || EDGE_COUNT (best_edge->dest->preds) > 1))
612 	    best_edge = NULL;
613 
614 	  /* Add all non-selected successors to the heaps.  */
615 	  FOR_EACH_EDGE (e, ei, bb->succs)
616 	    {
617 	      if (e == best_edge
618 		  || e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)
619 		  || bb_visited_trace (e->dest))
620 		continue;
621 
622 	      key = bb_to_key (e->dest);
623 
624 	      if (bbd[e->dest->index].heap)
625 		{
626 		  /* E->DEST is already in some heap.  */
627 		  if (key != bbd[e->dest->index].node->get_key ())
628 		    {
629 		      if (dump_file)
630 			{
631 			  fprintf (dump_file,
632 				   "Changing key for bb %d from %ld to %ld.\n",
633 				   e->dest->index,
634 				   (long) bbd[e->dest->index].node->get_key (),
635 				   key);
636 			}
637 		      bbd[e->dest->index].heap->replace_key
638 		        (bbd[e->dest->index].node, key);
639 		    }
640 		}
641 	      else
642 		{
643 		  bb_heap_t *which_heap = *heap;
644 
645 		  prob = e->probability;
646 		  freq = EDGE_FREQUENCY (e);
647 
648 		  if (!(e->flags & EDGE_CAN_FALLTHRU)
649 		      || (e->flags & EDGE_COMPLEX)
650 		      || prob < branch_th || freq < exec_th
651 		      || e->count < count_th)
652 		    {
653 		      /* When partitioning hot/cold basic blocks, make sure
654 			 the cold blocks (and only the cold blocks) all get
655 			 pushed to the last round of trace collection.  When
656 			 optimizing for size, do not push to next round.  */
657 
658 		      if (!for_size && push_to_next_round_p (e->dest, round,
659 							     number_of_rounds,
660 							     exec_th, count_th))
661 			which_heap = new_heap;
662 		    }
663 
664 		  bbd[e->dest->index].heap = which_heap;
665 		  bbd[e->dest->index].node = which_heap->insert (key, e->dest);
666 
667 		  if (dump_file)
668 		    {
669 		      fprintf (dump_file,
670 			       "  Possible start of %s round: %d (key: %ld)\n",
671 			       (which_heap == new_heap) ? "next" : "this",
672 			       e->dest->index, (long) key);
673 		    }
674 
675 		}
676 	    }
677 
678 	  if (best_edge) /* Suitable successor was found.  */
679 	    {
680 	      if (bb_visited_trace (best_edge->dest) == *n_traces)
681 		{
682 		  /* We do nothing with one-basic-block loops.  */
683 		  if (best_edge->dest != bb)
684 		    {
685 		      if (EDGE_FREQUENCY (best_edge)
686 			  > 4 * best_edge->dest->frequency / 5)
687 			{
688 			  /* The loop has at least 4 iterations.  If the loop
689 			     header is not the first block of the function
690 			     we can rotate the loop.  */
691 
692 			  if (best_edge->dest
693 			      != ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb)
694 			    {
695 			      if (dump_file)
696 				{
697 				  fprintf (dump_file,
698 					   "Rotating loop %d - %d\n",
699 					   best_edge->dest->index, bb->index);
700 				}
701 			      bb->aux = best_edge->dest;
702 			      bbd[best_edge->dest->index].in_trace =
703 							     (*n_traces) - 1;
704 			      bb = rotate_loop (best_edge, trace, *n_traces);
705 			    }
706 			}
707 		      else
708 			{
709 			  /* The loop has less than 4 iterations.  */
710 
711 			  if (single_succ_p (bb)
712 			      && copy_bb_p (best_edge->dest,
713 			      		    optimize_edge_for_speed_p
714 			      		    (best_edge)))
715 			    {
716 			      bb = copy_bb (best_edge->dest, best_edge, bb,
717 					    *n_traces);
718 			      trace->length++;
719 			    }
720 			}
721 		    }
722 
723 		  /* Terminate the trace.  */
724 		  break;
725 		}
726 	      else
727 		{
728 		  /* Check for a situation
729 
730 		    A
731 		   /|
732 		  B |
733 		   \|
734 		    C
735 
736 		  where
737 		  EDGE_FREQUENCY (AB) + EDGE_FREQUENCY (BC)
738 		    >= EDGE_FREQUENCY (AC).
739 		  (i.e. 2 * B->frequency >= EDGE_FREQUENCY (AC) )
740 		  Best ordering is then A B C.
741 
742 		  When optimizing for size, A B C is always the best order.
743 
744 		  This situation is created for example by:
745 
746 		  if (A) B;
747 		  C;
748 
749 		  */
750 
751 		  FOR_EACH_EDGE (e, ei, bb->succs)
752 		    if (e != best_edge
753 			&& (e->flags & EDGE_CAN_FALLTHRU)
754 			&& !(e->flags & EDGE_COMPLEX)
755 			&& !bb_visited_trace (e->dest)
756 			&& single_pred_p (e->dest)
757 			&& !(e->flags & EDGE_CROSSING)
758 			&& single_succ_p (e->dest)
759 			&& (single_succ_edge (e->dest)->flags
760 			    & EDGE_CAN_FALLTHRU)
761 			&& !(single_succ_edge (e->dest)->flags & EDGE_COMPLEX)
762 			&& single_succ (e->dest) == best_edge->dest
763 			&& (2 * e->dest->frequency >= EDGE_FREQUENCY (best_edge)
764 			    || for_size))
765 		      {
766 			best_edge = e;
767 			if (dump_file)
768 			  fprintf (dump_file, "Selecting BB %d\n",
769 				   best_edge->dest->index);
770 			break;
771 		      }
772 
773 		  bb->aux = best_edge->dest;
774 		  bbd[best_edge->dest->index].in_trace = (*n_traces) - 1;
775 		  bb = best_edge->dest;
776 		}
777 	    }
778 	}
779       while (best_edge);
780       trace->last = bb;
781       bbd[trace->first->index].start_of_trace = *n_traces - 1;
782       if (bbd[trace->last->index].end_of_trace != *n_traces - 1)
783 	{
784 	  bbd[trace->last->index].end_of_trace = *n_traces - 1;
785 	  /* Update the cached maximum frequency for interesting predecessor
786 	     edges for successors of the new trace end.  */
787 	  FOR_EACH_EDGE (e, ei, trace->last->succs)
788 	    if (EDGE_FREQUENCY (e) > bbd[e->dest->index].priority)
789 	      bbd[e->dest->index].priority = EDGE_FREQUENCY (e);
790 	}
791 
792       /* The trace is terminated so we have to recount the keys in the heap
793 	 (some blocks can have a lower key because one of their predecessors
794 	 is now an end of a trace).  */
795       FOR_EACH_EDGE (e, ei, bb->succs)
796 	{
797 	  if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)
798 	      || bb_visited_trace (e->dest))
799 	    continue;
800 
801 	  if (bbd[e->dest->index].heap)
802 	    {
803 	      key = bb_to_key (e->dest);
804 	      if (key != bbd[e->dest->index].node->get_key ())
805 		{
806 		  if (dump_file)
807 		    {
808 		      fprintf (dump_file,
809 			       "Changing key for bb %d from %ld to %ld.\n",
810 			       e->dest->index,
811 			       (long) bbd[e->dest->index].node->get_key (), key);
812 		    }
813 		  bbd[e->dest->index].heap->replace_key
814 		    (bbd[e->dest->index].node, key);
815 		}
816 	    }
817 	}
818     }
819 
820   delete (*heap);
821 
822   /* "Return" the new heap.  */
823   *heap = new_heap;
824 }
825 
826 /* Create a duplicate of the basic block OLD_BB and redirect edge E to it, add
827    it to the trace after BB, mark OLD_BB visited and update the pass's data
828    structures (TRACE is the number of the trace which OLD_BB is duplicated to).  */
829 
830 static basic_block
831 copy_bb (basic_block old_bb, edge e, basic_block bb, int trace)
832 {
833   basic_block new_bb;
834 
835   new_bb = duplicate_block (old_bb, e, bb);
836   BB_COPY_PARTITION (new_bb, old_bb);
837 
838   gcc_assert (e->dest == new_bb);
839 
840   if (dump_file)
841     fprintf (dump_file,
842 	     "Duplicated bb %d (created bb %d)\n",
843 	     old_bb->index, new_bb->index);
844 
845   if (new_bb->index >= array_size
846       || last_basic_block_for_fn (cfun) > array_size)
847     {
848       int i;
849       int new_size;
850 
851       new_size = MAX (last_basic_block_for_fn (cfun), new_bb->index + 1);
852       new_size = GET_ARRAY_SIZE (new_size);
853       bbd = XRESIZEVEC (bbro_basic_block_data, bbd, new_size);
854       for (i = array_size; i < new_size; i++)
855 	{
856 	  bbd[i].start_of_trace = -1;
857 	  bbd[i].end_of_trace = -1;
858 	  bbd[i].in_trace = -1;
859 	  bbd[i].visited = 0;
860 	  bbd[i].priority = -1;
861 	  bbd[i].heap = NULL;
862 	  bbd[i].node = NULL;
863 	}
864       array_size = new_size;
865 
866       if (dump_file)
867 	{
868 	  fprintf (dump_file,
869 		   "Growing the dynamic array to %d elements.\n",
870 		   array_size);
871 	}
872     }
873 
874   gcc_assert (!bb_visited_trace (e->dest));
875   mark_bb_visited (new_bb, trace);
876   new_bb->aux = bb->aux;
877   bb->aux = new_bb;
878 
879   bbd[new_bb->index].in_trace = trace;
880 
881   return new_bb;
882 }
883 
884 /* Compute and return the key (for the heap) of the basic block BB.  */
885 
886 static long
887 bb_to_key (basic_block bb)
888 {
889   edge e;
890   edge_iterator ei;
891 
892   /* Use index as key to align with its original order.  */
893   if (optimize_function_for_size_p (cfun))
894     return bb->index;
895 
896   /* Do not start in probably never executed blocks.  */
897 
898   if (BB_PARTITION (bb) == BB_COLD_PARTITION
899       || probably_never_executed_bb_p (cfun, bb))
900     return BB_FREQ_MAX;
901 
902   /* Prefer blocks whose predecessor is an end of some trace
903      or whose predecessor edge is EDGE_DFS_BACK.  */
904   int priority = bbd[bb->index].priority;
905   if (priority == -1)
906     {
907       priority = 0;
908       FOR_EACH_EDGE (e, ei, bb->preds)
909 	{
910 	  if ((e->src != ENTRY_BLOCK_PTR_FOR_FN (cfun)
911 	       && bbd[e->src->index].end_of_trace >= 0)
912 	      || (e->flags & EDGE_DFS_BACK))
913 	    {
914 	      int edge_freq = EDGE_FREQUENCY (e);
915 
916 	      if (edge_freq > priority)
917 		priority = edge_freq;
918 	    }
919 	}
920       bbd[bb->index].priority = priority;
921     }
922 
923   if (priority)
924     /* The block with priority should have significantly lower key.  */
925     return -(100 * BB_FREQ_MAX + 100 * priority + bb->frequency);
926 
927   return -bb->frequency;
928 }
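
/* For example (illustrative numbers only): a block with frequency 900 and no
   interesting predecessor edges gets the key -900, while a block with
   frequency 100 whose predecessor ends some trace through an edge with
   EDGE_FREQUENCY 300 gets -(100 * BB_FREQ_MAX + 100 * 300 + 100).  Since the
   heaps used here are min-heaps, the second block is extracted first and is
   therefore preferred as a seed.  */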
929 
930 /* Return true when the edge E from basic block BB is better than the temporary
931    best edge (details are in the function).  The probability of edge E is PROB.
932    The frequency of the successor is FREQ.  The current best probability is
933    BEST_PROB, the best frequency is BEST_FREQ.
934    The edge is considered to be equivalent when PROB does not differ much from
935    BEST_PROB; similarly for frequency.  */
936 
937 static bool
938 better_edge_p (const_basic_block bb, const_edge e, int prob, int freq,
939 	       int best_prob, int best_freq, const_edge cur_best_edge)
940 {
941   bool is_better_edge;
942 
943   /* The BEST_* values do not have to be best, but can be a bit smaller than
944      maximum values.  */
945   int diff_prob = best_prob / 10;
946   int diff_freq = best_freq / 10;
947 
948   /* The smaller one is better to keep the original order.  */
949   if (optimize_function_for_size_p (cfun))
950     return !cur_best_edge
951 	   || cur_best_edge->dest->index > e->dest->index;
952 
953   if (prob > best_prob + diff_prob)
954     /* The edge has higher probability than the temporary best edge.  */
955     is_better_edge = true;
956   else if (prob < best_prob - diff_prob)
957     /* The edge has lower probability than the temporary best edge.  */
958     is_better_edge = false;
959   else if (freq < best_freq - diff_freq)
960     /* The edge and the temporary best edge  have almost equivalent
961        probabilities.  The higher frequency of a successor now means
962        that there is another edge going into that successor.
963        This successor has lower frequency so it is better.  */
964     is_better_edge = true;
965   else if (freq > best_freq + diff_freq)
966     /* This successor has higher frequency so it is worse.  */
967     is_better_edge = false;
968   else if (e->dest->prev_bb == bb)
969     /* The edges have equivalent probabilities and the successors
970        have equivalent frequencies.  Select the previous successor.  */
971     is_better_edge = true;
972   else
973     is_better_edge = false;
974 
975   /* If we are doing hot/cold partitioning, make sure that we always favor
976      non-crossing edges over crossing edges.  */
977 
978   if (!is_better_edge
979       && flag_reorder_blocks_and_partition
980       && cur_best_edge
981       && (cur_best_edge->flags & EDGE_CROSSING)
982       && !(e->flags & EDGE_CROSSING))
983     is_better_edge = true;
984 
985   return is_better_edge;
986 }
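
/* For instance, with BEST_PROB == 6000 the 10% slack above treats any PROB
   between 5400 and 6600 as equivalent; the tie is then broken on the
   successor frequencies (lower is better) and finally by preferring the
   successor that immediately follows BB in the current block order.
   (Illustrative numbers only.)  */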
987 
988 /* Return true when the edge E is better than the temporary best edge
989    CUR_BEST_EDGE.  If SRC_INDEX_P is true, the function compares the src bb of
990    E and CUR_BEST_EDGE; otherwise it will compare the dest bb.
991    BEST_LEN is the trace length of src (or dest) bb in CUR_BEST_EDGE.
992    TRACES record the information about traces.
993    When optimizing for size, the edge with smaller index is better.
994    When optimizing for speed, the edge with bigger probability or longer trace
995    is better.  */
996 
997 static bool
998 connect_better_edge_p (const_edge e, bool src_index_p, int best_len,
999 		       const_edge cur_best_edge, struct trace *traces)
1000 {
1001   int e_index;
1002   int b_index;
1003   bool is_better_edge;
1004 
1005   if (!cur_best_edge)
1006     return true;
1007 
1008   if (optimize_function_for_size_p (cfun))
1009     {
1010       e_index = src_index_p ? e->src->index : e->dest->index;
1011       b_index = src_index_p ? cur_best_edge->src->index
1012 			      : cur_best_edge->dest->index;
1013       /* The smaller one is better to keep the original order.  */
1014       return b_index > e_index;
1015     }
1016 
1017   if (src_index_p)
1018     {
1019       e_index = e->src->index;
1020 
1021       if (e->probability > cur_best_edge->probability)
1022 	/* The edge has higher probability than the temporary best edge.  */
1023 	is_better_edge = true;
1024       else if (e->probability < cur_best_edge->probability)
1025 	/* The edge has lower probability than the temporary best edge.  */
1026 	is_better_edge = false;
1027       else if (traces[bbd[e_index].end_of_trace].length > best_len)
1028 	/* The edge and the temporary best edge have equivalent probabilities.
1029 	   The edge with longer trace is better.  */
1030 	is_better_edge = true;
1031       else
1032 	is_better_edge = false;
1033     }
1034   else
1035     {
1036       e_index = e->dest->index;
1037 
1038       if (e->probability > cur_best_edge->probability)
1039 	/* The edge has higher probability than the temporary best edge.  */
1040 	is_better_edge = true;
1041       else if (e->probability < cur_best_edge->probability)
1042 	/* The edge has lower probability than the temporary best edge.  */
1043 	is_better_edge = false;
1044       else if (traces[bbd[e_index].start_of_trace].length > best_len)
1045 	/* The edge and the temporary best edge have equivalent probabilities.
1046 	   The edge with longer trace is better.  */
1047 	is_better_edge = true;
1048       else
1049 	is_better_edge = false;
1050     }
1051 
1052   return is_better_edge;
1053 }
1054 
1055 /* Connect traces in array TRACES, N_TRACES is the count of traces.  */
1056 
1057 static void
1058 connect_traces (int n_traces, struct trace *traces)
1059 {
1060   int i;
1061   bool *connected;
1062   bool two_passes;
1063   int last_trace;
1064   int current_pass;
1065   int current_partition;
1066   int freq_threshold;
1067   gcov_type count_threshold;
1068   bool for_size = optimize_function_for_size_p (cfun);
1069 
1070   freq_threshold = max_entry_frequency * DUPLICATION_THRESHOLD / 1000;
1071   if (max_entry_count < INT_MAX / 1000)
1072     count_threshold = max_entry_count * DUPLICATION_THRESHOLD / 1000;
1073   else
1074     count_threshold = max_entry_count / 1000 * DUPLICATION_THRESHOLD;
1075 
1076   connected = XCNEWVEC (bool, n_traces);
1077   last_trace = -1;
1078   current_pass = 1;
1079   current_partition = BB_PARTITION (traces[0].first);
1080   two_passes = false;
1081 
1082   if (crtl->has_bb_partition)
1083     for (i = 0; i < n_traces && !two_passes; i++)
1084       if (BB_PARTITION (traces[0].first)
1085 	  != BB_PARTITION (traces[i].first))
1086 	two_passes = true;
1087 
1088   for (i = 0; i < n_traces || (two_passes && current_pass == 1) ; i++)
1089     {
1090       int t = i;
1091       int t2;
1092       edge e, best;
1093       int best_len;
1094 
1095       if (i >= n_traces)
1096 	{
1097 	  gcc_assert (two_passes && current_pass == 1);
1098 	  i = 0;
1099 	  t = i;
1100 	  current_pass = 2;
1101 	  if (current_partition == BB_HOT_PARTITION)
1102 	    current_partition = BB_COLD_PARTITION;
1103 	  else
1104 	    current_partition = BB_HOT_PARTITION;
1105 	}
1106 
1107       if (connected[t])
1108 	continue;
1109 
1110       if (two_passes
1111 	  && BB_PARTITION (traces[t].first) != current_partition)
1112 	continue;
1113 
1114       connected[t] = true;
1115 
1116       /* Find the predecessor traces.  */
1117       for (t2 = t; t2 > 0;)
1118 	{
1119 	  edge_iterator ei;
1120 	  best = NULL;
1121 	  best_len = 0;
1122 	  FOR_EACH_EDGE (e, ei, traces[t2].first->preds)
1123 	    {
1124 	      int si = e->src->index;
1125 
1126 	      if (e->src != ENTRY_BLOCK_PTR_FOR_FN (cfun)
1127 		  && (e->flags & EDGE_CAN_FALLTHRU)
1128 		  && !(e->flags & EDGE_COMPLEX)
1129 		  && bbd[si].end_of_trace >= 0
1130 		  && !connected[bbd[si].end_of_trace]
1131 		  && (BB_PARTITION (e->src) == current_partition)
1132 		  && connect_better_edge_p (e, true, best_len, best, traces))
1133 		{
1134 		  best = e;
1135 		  best_len = traces[bbd[si].end_of_trace].length;
1136 		}
1137 	    }
1138 	  if (best)
1139 	    {
1140 	      best->src->aux = best->dest;
1141 	      t2 = bbd[best->src->index].end_of_trace;
1142 	      connected[t2] = true;
1143 
1144 	      if (dump_file)
1145 		{
1146 		  fprintf (dump_file, "Connection: %d %d\n",
1147 			   best->src->index, best->dest->index);
1148 		}
1149 	    }
1150 	  else
1151 	    break;
1152 	}
1153 
1154       if (last_trace >= 0)
1155 	traces[last_trace].last->aux = traces[t2].first;
1156       last_trace = t;
1157 
1158       /* Find the successor traces.  */
1159       while (1)
1160 	{
1161 	  /* Find the continuation of the chain.  */
1162 	  edge_iterator ei;
1163 	  best = NULL;
1164 	  best_len = 0;
1165 	  FOR_EACH_EDGE (e, ei, traces[t].last->succs)
1166 	    {
1167 	      int di = e->dest->index;
1168 
1169 	      if (e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
1170 		  && (e->flags & EDGE_CAN_FALLTHRU)
1171 		  && !(e->flags & EDGE_COMPLEX)
1172 		  && bbd[di].start_of_trace >= 0
1173 		  && !connected[bbd[di].start_of_trace]
1174 		  && (BB_PARTITION (e->dest) == current_partition)
1175 		  && connect_better_edge_p (e, false, best_len, best, traces))
1176 		{
1177 		  best = e;
1178 		  best_len = traces[bbd[di].start_of_trace].length;
1179 		}
1180 	    }
1181 
1182 	  if (for_size)
1183 	    {
1184 	      if (!best)
1185 		/* Stop finding the successor traces.  */
1186 		break;
1187 
1188 	      /* It is OK to connect block n with block n + 1 or a block
1189 		 before n.  For others, only connect to the loop header.  */
1190 	      if (best->dest->index > (traces[t].last->index + 1))
1191 		{
1192 		  int count = EDGE_COUNT (best->dest->preds);
1193 
1194 		  FOR_EACH_EDGE (e, ei, best->dest->preds)
1195 		    if (e->flags & EDGE_DFS_BACK)
1196 		      count--;
1197 
1198 		  /* If dest has multiple predecessors, skip it.  We expect
1199 		     that one predecessor with smaller index connects with it
1200 		     later.  */
1201 		  if (count != 1)
1202 		    break;
1203 		}
1204 
1205 	      /* Only connect Trace n with Trace n + 1.  It is conservative
1206 		 to keep the order as close as possible to the original order.
1207 		 It also helps to reduce long jumps.  */
1208 	      if (last_trace != bbd[best->dest->index].start_of_trace - 1)
1209 		break;
1210 
1211 	      if (dump_file)
1212 		fprintf (dump_file, "Connection: %d %d\n",
1213 			 best->src->index, best->dest->index);
1214 
1215 	      t = bbd[best->dest->index].start_of_trace;
1216 	      traces[last_trace].last->aux = traces[t].first;
1217 	      connected[t] = true;
1218 	      last_trace = t;
1219 	    }
1220 	  else if (best)
1221 	    {
1222 	      if (dump_file)
1223 		{
1224 		  fprintf (dump_file, "Connection: %d %d\n",
1225 			   best->src->index, best->dest->index);
1226 		}
1227 	      t = bbd[best->dest->index].start_of_trace;
1228 	      traces[last_trace].last->aux = traces[t].first;
1229 	      connected[t] = true;
1230 	      last_trace = t;
1231 	    }
1232 	  else
1233 	    {
1234 	      /* Try to connect the traces by duplication of 1 block.  */
1235 	      edge e2;
1236 	      basic_block next_bb = NULL;
1237 	      bool try_copy = false;
1238 
1239 	      FOR_EACH_EDGE (e, ei, traces[t].last->succs)
1240 		if (e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
1241 		    && (e->flags & EDGE_CAN_FALLTHRU)
1242 		    && !(e->flags & EDGE_COMPLEX)
1243 		    && (!best || e->probability > best->probability))
1244 		  {
1245 		    edge_iterator ei;
1246 		    edge best2 = NULL;
1247 		    int best2_len = 0;
1248 
1249 		    /* If the destination is a start of a trace which is only
1250 		       one block long, then no need to search the successor
1251 		       blocks of the trace.  Accept it.  */
1252 		    if (bbd[e->dest->index].start_of_trace >= 0
1253 			&& traces[bbd[e->dest->index].start_of_trace].length
1254 			   == 1)
1255 		      {
1256 			best = e;
1257 			try_copy = true;
1258 			continue;
1259 		      }
1260 
1261 		    FOR_EACH_EDGE (e2, ei, e->dest->succs)
1262 		      {
1263 			int di = e2->dest->index;
1264 
1265 			if (e2->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)
1266 			    || ((e2->flags & EDGE_CAN_FALLTHRU)
1267 				&& !(e2->flags & EDGE_COMPLEX)
1268 				&& bbd[di].start_of_trace >= 0
1269 				&& !connected[bbd[di].start_of_trace]
1270 				&& BB_PARTITION (e2->dest) == current_partition
1271 				&& EDGE_FREQUENCY (e2) >= freq_threshold
1272 				&& e2->count >= count_threshold
1273 				&& (!best2
1274 				    || e2->probability > best2->probability
1275 				    || (e2->probability == best2->probability
1276 					&& traces[bbd[di].start_of_trace].length
1277 					   > best2_len))))
1278 			  {
1279 			    best = e;
1280 			    best2 = e2;
1281 			    if (e2->dest != EXIT_BLOCK_PTR_FOR_FN (cfun))
1282 			      best2_len = traces[bbd[di].start_of_trace].length;
1283 			    else
1284 			      best2_len = INT_MAX;
1285 			    next_bb = e2->dest;
1286 			    try_copy = true;
1287 			  }
1288 		      }
1289 		  }
1290 
1291 	      if (crtl->has_bb_partition)
1292 		try_copy = false;
1293 
1294 	      /* Copy tiny blocks always; copy larger blocks only when the
1295 		 edge is traversed frequently enough.  */
1296 	      if (try_copy
1297 		  && copy_bb_p (best->dest,
1298 				optimize_edge_for_speed_p (best)
1299 				&& EDGE_FREQUENCY (best) >= freq_threshold
1300 				&& best->count >= count_threshold))
1301 		{
1302 		  basic_block new_bb;
1303 
1304 		  if (dump_file)
1305 		    {
1306 		      fprintf (dump_file, "Connection: %d %d ",
1307 			       traces[t].last->index, best->dest->index);
1308 		      if (!next_bb)
1309 			fputc ('\n', dump_file);
1310 		      else if (next_bb == EXIT_BLOCK_PTR_FOR_FN (cfun))
1311 			fprintf (dump_file, "exit\n");
1312 		      else
1313 			fprintf (dump_file, "%d\n", next_bb->index);
1314 		    }
1315 
1316 		  new_bb = copy_bb (best->dest, best, traces[t].last, t);
1317 		  traces[t].last = new_bb;
1318 		  if (next_bb && next_bb != EXIT_BLOCK_PTR_FOR_FN (cfun))
1319 		    {
1320 		      t = bbd[next_bb->index].start_of_trace;
1321 		      traces[last_trace].last->aux = traces[t].first;
1322 		      connected[t] = true;
1323 		      last_trace = t;
1324 		    }
1325 		  else
1326 		    break;	/* Stop finding the successor traces.  */
1327 		}
1328 	      else
1329 		break;	/* Stop finding the successor traces.  */
1330 	    }
1331 	}
1332     }
1333 
1334   if (dump_file)
1335     {
1336       basic_block bb;
1337 
1338       fprintf (dump_file, "Final order:\n");
1339       for (bb = traces[0].first; bb; bb = (basic_block) bb->aux)
1340 	fprintf (dump_file, "%d ", bb->index);
1341       fprintf (dump_file, "\n");
1342       fflush (dump_file);
1343     }
1344 
1345   FREE (connected);
1346 }
1347 
1348 /* Return true when BB can and should be copied. CODE_MAY_GROW is true
1349    when code size is allowed to grow by duplication.  */
1350 
1351 static bool
1352 copy_bb_p (const_basic_block bb, int code_may_grow)
1353 {
1354   int size = 0;
1355   int max_size = uncond_jump_length;
1356   rtx_insn *insn;
1357 
1358   if (!bb->frequency)
1359     return false;
1360   if (EDGE_COUNT (bb->preds) < 2)
1361     return false;
1362   if (!can_duplicate_block_p (bb))
1363     return false;
1364 
1365   /* Avoid duplicating blocks which have many successors (PR/13430).  */
1366   if (EDGE_COUNT (bb->succs) > 8)
1367     return false;
1368 
1369   if (code_may_grow && optimize_bb_for_speed_p (bb))
1370     max_size *= PARAM_VALUE (PARAM_MAX_GROW_COPY_BB_INSNS);
1371 
1372   FOR_BB_INSNS (bb, insn)
1373     {
1374       if (INSN_P (insn))
1375 	size += get_attr_min_length (insn);
1376     }
1377 
1378   if (size <= max_size)
1379     return true;
1380 
1381   if (dump_file)
1382     {
1383       fprintf (dump_file,
1384 	       "Block %d can't be copied because its size = %d.\n",
1385 	       bb->index, size);
1386     }
1387 
1388   return false;
1389 }
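
/* As a rough example (illustrative values): if the unconditional jump length
   is 4 and PARAM_MAX_GROW_COPY_BB_INSNS is 8, a block on a hot path may be
   duplicated as long as the sum of its insns' minimum lengths is at most 32;
   when CODE_MAY_GROW is false it must not be larger than the jump it would
   replace.  */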
1390 
1391 /* Return the length of an unconditional jump instruction.  */
1392 
1393 int
1394 get_uncond_jump_length (void)
1395 {
1396   int length;
1397 
1398   start_sequence ();
1399   rtx_code_label *label = emit_label (gen_label_rtx ());
1400   rtx_insn *jump = emit_jump_insn (targetm.gen_jump (label));
1401   length = get_attr_min_length (jump);
1402   end_sequence ();
1403 
1404   return length;
1405 }
1406 
1407 /* The landing pad OLD_LP, in block OLD_BB, has edges from both partitions.
1408    Duplicate the landing pad and split the edges so that no EH edge
1409    crosses partitions.  */
1410 
1411 static void
1412 fix_up_crossing_landing_pad (eh_landing_pad old_lp, basic_block old_bb)
1413 {
1414   eh_landing_pad new_lp;
1415   basic_block new_bb, last_bb, post_bb;
1416   rtx_insn *jump;
1417   unsigned new_partition;
1418   edge_iterator ei;
1419   edge e;
1420 
1421   /* Generate the new landing-pad structure.  */
1422   new_lp = gen_eh_landing_pad (old_lp->region);
1423   new_lp->post_landing_pad = old_lp->post_landing_pad;
1424   new_lp->landing_pad = gen_label_rtx ();
1425   LABEL_PRESERVE_P (new_lp->landing_pad) = 1;
1426 
1427   /* Put appropriate instructions in new bb.  */
1428   rtx_code_label *new_label = emit_label (new_lp->landing_pad);
1429 
1430   expand_dw2_landing_pad_for_region (old_lp->region);
1431 
1432   post_bb = BLOCK_FOR_INSN (old_lp->landing_pad);
1433   post_bb = single_succ (post_bb);
1434   rtx_code_label *post_label = block_label (post_bb);
1435   jump = emit_jump_insn (targetm.gen_jump (post_label));
1436   JUMP_LABEL (jump) = post_label;
1437 
1438   /* Create new basic block to be dest for lp.  */
1439   last_bb = EXIT_BLOCK_PTR_FOR_FN (cfun)->prev_bb;
1440   new_bb = create_basic_block (new_label, jump, last_bb);
1441   new_bb->aux = last_bb->aux;
1442   last_bb->aux = new_bb;
1443 
1444   emit_barrier_after_bb (new_bb);
1445 
1446   make_edge (new_bb, post_bb, 0);
1447 
1448   /* Make sure new bb is in the other partition.  */
1449   new_partition = BB_PARTITION (old_bb);
1450   new_partition ^= BB_HOT_PARTITION | BB_COLD_PARTITION;
1451   BB_SET_PARTITION (new_bb, new_partition);
1452 
1453   /* Fix up the edges.  */
1454   for (ei = ei_start (old_bb->preds); (e = ei_safe_edge (ei)) != NULL; )
1455     if (BB_PARTITION (e->src) == new_partition)
1456       {
1457 	rtx_insn *insn = BB_END (e->src);
1458 	rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
1459 
1460 	gcc_assert (note != NULL);
1461 	gcc_checking_assert (INTVAL (XEXP (note, 0)) == old_lp->index);
1462 	XEXP (note, 0) = GEN_INT (new_lp->index);
1463 
1464 	/* Adjust the edge to the new destination.  */
1465 	redirect_edge_succ (e, new_bb);
1466       }
1467     else
1468       ei_next (&ei);
1469 }
1470 
1471 
1472 /* Ensure that all hot bbs are included in a hot path through the
1473    procedure. This is done by calling this function twice, once
1474    with WALK_UP true (to look for paths from the entry to hot bbs) and
1475    once with WALK_UP false (to look for paths from hot bbs to the exit).
1476    Returns the updated value of COLD_BB_COUNT and adds newly-hot bbs
1477    to BBS_IN_HOT_PARTITION.  */
1478 
1479 static unsigned int
1480 sanitize_hot_paths (bool walk_up, unsigned int cold_bb_count,
1481                     vec<basic_block> *bbs_in_hot_partition)
1482 {
1483   /* Callers check this.  */
1484   gcc_checking_assert (cold_bb_count);
1485 
1486   /* Keep examining hot bbs while we still have some left to check
1487      and there are remaining cold bbs.  */
1488   vec<basic_block> hot_bbs_to_check = bbs_in_hot_partition->copy ();
1489   while (! hot_bbs_to_check.is_empty ()
1490          && cold_bb_count)
1491     {
1492       basic_block bb = hot_bbs_to_check.pop ();
1493       vec<edge, va_gc> *edges = walk_up ? bb->preds : bb->succs;
1494       edge e;
1495       edge_iterator ei;
1496       int highest_probability = 0;
1497       int highest_freq = 0;
1498       gcov_type highest_count = 0;
1499       bool found = false;
1500 
1501       /* Walk the preds/succs and check if there is at least one already
1502          marked hot. Keep track of the most frequent pred/succ so that we
1503          can mark it hot if we don't find one.  */
1504       FOR_EACH_EDGE (e, ei, edges)
1505         {
1506           basic_block reach_bb = walk_up ? e->src : e->dest;
1507 
1508           if (e->flags & EDGE_DFS_BACK)
1509             continue;
1510 
1511           if (BB_PARTITION (reach_bb) != BB_COLD_PARTITION)
1512           {
1513             found = true;
1514             break;
1515           }
1516           /* The following loop will look for the hottest edge via
1517              the edge count; if that is zero, it falls back to the edge
1518              frequency and finally to the edge probability.  */
1519           if (e->count > highest_count)
1520             highest_count = e->count;
1521           int edge_freq = EDGE_FREQUENCY (e);
1522           if (edge_freq > highest_freq)
1523             highest_freq = edge_freq;
1524           if (e->probability > highest_probability)
1525             highest_probability = e->probability;
1526         }
1527 
1528       /* If bb is reached by (or reaches, in the case of !WALK_UP) another hot
1529          block (or unpartitioned, e.g. the entry block) then it is ok. If not,
1530          then the most frequent pred (or succ) needs to be adjusted.  In the
1531          case where multiple preds/succs have the same frequency (e.g. a
1532          50-50 branch), then both will be adjusted.  */
1533       if (found)
1534         continue;
1535 
1536       FOR_EACH_EDGE (e, ei, edges)
1537         {
1538           if (e->flags & EDGE_DFS_BACK)
1539             continue;
1540           /* Select the hottest edge using the edge count; if that is zero,
1541              then fall back to the edge frequency and finally to the edge
1542              probability.  */
1543           if (highest_count)
1544             {
1545               if (e->count < highest_count)
1546                 continue;
1547             }
1548           else if (highest_freq)
1549             {
1550               if (EDGE_FREQUENCY (e) < highest_freq)
1551                 continue;
1552             }
1553           else if (e->probability < highest_probability)
1554             continue;
1555 
1556           basic_block reach_bb = walk_up ? e->src : e->dest;
1557 
1558           /* We have a hot bb with an immediate dominator that is cold.
1559              The dominator needs to be re-marked hot.  */
1560           BB_SET_PARTITION (reach_bb, BB_HOT_PARTITION);
1561           cold_bb_count--;
1562 
1563           /* Now we need to examine newly-hot reach_bb to see if it is also
1564              dominated by a cold bb.  */
1565           bbs_in_hot_partition->safe_push (reach_bb);
1566           hot_bbs_to_check.safe_push (reach_bb);
1567         }
1568     }
1569 
1570   return cold_bb_count;
1571 }
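
/* For example (hypothetical CFG): if a hot block H's only non-back-edge
   predecessor P was marked cold because of a profile insanity, the WALK_UP
   pass above re-marks P as hot and then pushes P back onto the worklist, so
   that P's own predecessors are examined in turn and the hot path from the
   entry down to H ends up entirely in the hot partition.  */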
1572 
1573 
1574 /* Find the basic blocks that are rarely executed and need to be moved to
1575    a separate section of the .o file (to cut down on paging and improve
1576    cache locality).  Return a vector of all edges that cross.  */
1577 
1578 static vec<edge>
1579 find_rarely_executed_basic_blocks_and_crossing_edges (void)
1580 {
1581   vec<edge> crossing_edges = vNULL;
1582   basic_block bb;
1583   edge e;
1584   edge_iterator ei;
1585   unsigned int cold_bb_count = 0;
1586   auto_vec<basic_block> bbs_in_hot_partition;
1587 
1588   /* Mark which partition (hot/cold) each basic block belongs in.  */
1589   FOR_EACH_BB_FN (bb, cfun)
1590     {
1591       bool cold_bb = false;
1592 
1593       if (probably_never_executed_bb_p (cfun, bb))
1594         {
1595           /* Handle profile insanities created by upstream optimizations
1596              by also checking the incoming edge weights. If there is a non-cold
1597              incoming edge, conservatively prevent this block from being split
1598              into the cold section.  */
1599           cold_bb = true;
1600           FOR_EACH_EDGE (e, ei, bb->preds)
1601             if (!probably_never_executed_edge_p (cfun, e))
1602               {
1603                 cold_bb = false;
1604                 break;
1605               }
1606         }
1607       if (cold_bb)
1608         {
1609           BB_SET_PARTITION (bb, BB_COLD_PARTITION);
1610           cold_bb_count++;
1611         }
1612       else
1613         {
1614           BB_SET_PARTITION (bb, BB_HOT_PARTITION);
1615           bbs_in_hot_partition.safe_push (bb);
1616         }
1617     }
1618 
1619   /* Ensure that hot bbs are included along a hot path from the entry to exit.
1620      Cold bbs can end up along all paths to/from a hot bb for several reasons.
1621      One is that there are edge weight insanities due to optimization phases
1622      that do not properly update basic block profile counts.  Another is that
1623      the entry of the function may not be hot, because it is entered fewer
1624      times than the number of profile training runs, but there is a loop
1625      inside the function that causes blocks within the function to be above
1626      the threshold for hotness.  This is fixed by walking up from hot bbs to
1627      the entry block, and then down from hot bbs to the exit, performing
1628      partitioning fixups as necessary.  */
1629   if (cold_bb_count)
1630     {
1631       mark_dfs_back_edges ();
1632       cold_bb_count = sanitize_hot_paths (true, cold_bb_count,
1633                                           &bbs_in_hot_partition);
1634       if (cold_bb_count)
1635         sanitize_hot_paths (false, cold_bb_count, &bbs_in_hot_partition);
1636     }
1637 
1638   /* The format of .gcc_except_table does not allow landing pads to
1639      be in a different partition from the throw.  Fix this by either
1640      moving or duplicating the landing pads.  */
1641   if (cfun->eh->lp_array)
1642     {
1643       unsigned i;
1644       eh_landing_pad lp;
1645 
1646       FOR_EACH_VEC_ELT (*cfun->eh->lp_array, i, lp)
1647 	{
1648 	  bool all_same, all_diff;
1649 
1650 	  if (lp == NULL
1651 	      || lp->landing_pad == NULL_RTX
1652 	      || !LABEL_P (lp->landing_pad))
1653 	    continue;
1654 
1655 	  all_same = all_diff = true;
1656 	  bb = BLOCK_FOR_INSN (lp->landing_pad);
1657 	  FOR_EACH_EDGE (e, ei, bb->preds)
1658 	    {
1659 	      gcc_assert (e->flags & EDGE_EH);
1660 	      if (BB_PARTITION (bb) == BB_PARTITION (e->src))
1661 		all_diff = false;
1662 	      else
1663 		all_same = false;
1664 	    }
1665 
1666 	  if (all_same)
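	    /* Every throwing predecessor is already in the same partition
	       as the landing pad, so nothing needs to change.  */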
1667 	    ;
1668 	  else if (all_diff)
1669 	    {
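	      /* All throwing predecessors are in the other partition, so
		 move the landing pad there; XOR-ing with both partition
		 bits toggles hot <-> cold.  */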
1670 	      int which = BB_PARTITION (bb);
1671 	      which ^= BB_HOT_PARTITION | BB_COLD_PARTITION;
1672 	      BB_SET_PARTITION (bb, which);
1673 	    }
1674 	  else
1675 	    fix_up_crossing_landing_pad (lp, bb);
1676 	}
1677     }
1678 
1679   /* Mark every edge that crosses between sections.  */
1680 
1681   FOR_EACH_BB_FN (bb, cfun)
1682     FOR_EACH_EDGE (e, ei, bb->succs)
1683       {
1684 	unsigned int flags = e->flags;
1685 
1686         /* We should never have EDGE_CROSSING set yet.  */
1687 	gcc_checking_assert ((flags & EDGE_CROSSING) == 0);
1688 
1689 	if (e->src != ENTRY_BLOCK_PTR_FOR_FN (cfun)
1690 	    && e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
1691 	    && BB_PARTITION (e->src) != BB_PARTITION (e->dest))
1692 	  {
1693 	    crossing_edges.safe_push (e);
1694 	    flags |= EDGE_CROSSING;
1695 	  }
1696 
1697 	/* Now that we've split eh edges as appropriate, allow landing pads
1698 	   to be merged with the post-landing pads.  */
1699 	flags &= ~EDGE_PRESERVE;
1700 
1701 	e->flags = flags;
1702       }
1703 
1704   return crossing_edges;
1705 }
1706 
1707 /* Set the flag EDGE_CAN_FALLTHRU for edges that can be fallthru.  */
1708 
1709 static void
1710 set_edge_can_fallthru_flag (void)
1711 {
1712   basic_block bb;
1713 
1714   FOR_EACH_BB_FN (bb, cfun)
1715     {
1716       edge e;
1717       edge_iterator ei;
1718 
1719       FOR_EACH_EDGE (e, ei, bb->succs)
1720 	{
1721 	  e->flags &= ~EDGE_CAN_FALLTHRU;
1722 
1723 	  /* The FALLTHRU edge is also a CAN_FALLTHRU edge.  */
1724 	  if (e->flags & EDGE_FALLTHRU)
1725 	    e->flags |= EDGE_CAN_FALLTHRU;
1726 	}
1727 
1728       /* If the BB ends with an invertible condjump, all (2) edges are
1729 	 CAN_FALLTHRU edges.  */
1730       if (EDGE_COUNT (bb->succs) != 2)
1731 	continue;
1732       if (!any_condjump_p (BB_END (bb)))
1733 	continue;
1734 
1735       rtx_jump_insn *bb_end_jump = as_a <rtx_jump_insn *> (BB_END (bb));
1736       if (!invert_jump (bb_end_jump, JUMP_LABEL (bb_end_jump), 0))
1737 	continue;
1738       invert_jump (bb_end_jump, JUMP_LABEL (bb_end_jump), 0);
1739       EDGE_SUCC (bb, 0)->flags |= EDGE_CAN_FALLTHRU;
1740       EDGE_SUCC (bb, 1)->flags |= EDGE_CAN_FALLTHRU;
1741     }
1742 }
1743 
1744 /* If any destination of a crossing edge does not have a label, add one;
1745    convert any easy fall-through crossing edges to unconditional jumps.  */
1746 
1747 static void
1748 add_labels_and_missing_jumps (vec<edge> crossing_edges)
1749 {
1750   size_t i;
1751   edge e;
1752 
1753   FOR_EACH_VEC_ELT (crossing_edges, i, e)
1754     {
1755       basic_block src = e->src;
1756       basic_block dest = e->dest;
1757       rtx_jump_insn *new_jump;
1758 
1759       if (dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1760 	continue;
1761 
1762       /* Make sure dest has a label.  */
1763       rtx_code_label *label = block_label (dest);
1764 
1765       /* Nothing to do for non-fallthru edges.  */
1766       if (src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
1767 	continue;
1768       if ((e->flags & EDGE_FALLTHRU) == 0)
1769 	continue;
1770 
1771       /* If the block does not end with a control flow insn, then we
1772 	 can trivially add a jump to the end to fix up the crossing.
1773 	 Otherwise the jump will have to go in a new bb, which will
1774 	 be handled by the fix_up_fall_thru_edges function.  */
1775       if (control_flow_insn_p (BB_END (src)))
1776 	continue;
1777 
1778       /* Make sure there's only one successor.  */
1779       gcc_assert (single_succ_p (src));
1780 
1781       new_jump = emit_jump_insn_after (targetm.gen_jump (label), BB_END (src));
1782       BB_END (src) = new_jump;
1783       JUMP_LABEL (new_jump) = label;
1784       LABEL_NUSES (label) += 1;
1785 
1786       emit_barrier_after_bb (src);
1787 
1788       /* Mark edge as non-fallthru.  */
1789       e->flags &= ~EDGE_FALLTHRU;
1790     }
1791 }
1792 
1793 /* Find any bb's where the fall-through edge is a crossing edge (note that
1794    these bb's must also contain a conditional jump or end with a call
1795    instruction; we've already dealt with fall-through edges for blocks
1796    that didn't have a conditional jump or didn't end with a call instruction
1797    in the call to add_labels_and_missing_jumps).  Convert the fall-through
1798    edge to non-crossing edge by inserting a new bb to fall-through into.
1799    The new bb will contain an unconditional jump (crossing edge) to the
1800    original fall through destination.  */
1801 
1802 static void
1803 fix_up_fall_thru_edges (void)
1804 {
1805   basic_block cur_bb;
1806   basic_block new_bb;
1807   edge succ1;
1808   edge succ2;
1809   edge fall_thru;
1810   edge cond_jump = NULL;
1811   bool cond_jump_crosses;
1812   int invert_worked;
1813   rtx_insn *old_jump;
1814   rtx_code_label *fall_thru_label;
1815 
1816   FOR_EACH_BB_FN (cur_bb, cfun)
1817     {
1818       fall_thru = NULL;
1819       if (EDGE_COUNT (cur_bb->succs) > 0)
1820 	succ1 = EDGE_SUCC (cur_bb, 0);
1821       else
1822 	succ1 = NULL;
1823 
1824       if (EDGE_COUNT (cur_bb->succs) > 1)
1825 	succ2 = EDGE_SUCC (cur_bb, 1);
1826       else
1827 	succ2 = NULL;
1828 
1829       /* Find the fall-through edge.  */
1830 
1831       if (succ1
1832 	  && (succ1->flags & EDGE_FALLTHRU))
1833 	{
1834 	  fall_thru = succ1;
1835 	  cond_jump = succ2;
1836 	}
1837       else if (succ2
1838 	       && (succ2->flags & EDGE_FALLTHRU))
1839 	{
1840 	  fall_thru = succ2;
1841 	  cond_jump = succ1;
1842 	}
1843       else if (succ1
1844 	       && (block_ends_with_call_p (cur_bb)
1845 		   || can_throw_internal (BB_END (cur_bb))))
1846 	{
1847 	  edge e;
1848 	  edge_iterator ei;
1849 
1850 	  FOR_EACH_EDGE (e, ei, cur_bb->succs)
1851 	    if (e->flags & EDGE_FALLTHRU)
1852 	      {
1853 		fall_thru = e;
1854 		break;
1855 	      }
1856 	}
1857 
1858       if (fall_thru && (fall_thru->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)))
1859 	{
1860 	  /* Check to see if the fall-thru edge is a crossing edge.  */
1861 
1862 	  if (fall_thru->flags & EDGE_CROSSING)
1863 	    {
1864 	      /* The fall_thru edge crosses; now check the cond jump edge, if
1865 		 it exists.  */
1866 
1867 	      cond_jump_crosses = true;
1868 	      invert_worked  = 0;
1869 	      old_jump = BB_END (cur_bb);
1870 
1871 	      /* Find the jump instruction, if there is one.  */
1872 
1873 	      if (cond_jump)
1874 		{
1875 		  if (!(cond_jump->flags & EDGE_CROSSING))
1876 		    cond_jump_crosses = false;
1877 
1878 		  /* We know the fall-thru edge crosses; if the cond
1879 		     jump edge does NOT cross, and its destination is the
1880 		     next block in the bb order, invert the jump
1881 		     (i.e. fix it so the fall through does not cross and
1882 		     the cond jump does).  */
1883 
1884 		  if (!cond_jump_crosses)
1885 		    {
1886 		      /* Find label in fall_thru block. We've already added
1887 			 any missing labels, so there must be one.  */
1888 
1889 		      fall_thru_label = block_label (fall_thru->dest);
1890 
1891 		      if (old_jump && fall_thru_label)
1892 			{
1893 			  rtx_jump_insn *old_jump_insn =
1894 				dyn_cast <rtx_jump_insn *> (old_jump);
1895 			  if (old_jump_insn)
1896 			    invert_worked = invert_jump (old_jump_insn,
1897 							 fall_thru_label, 0);
1898 			}
1899 
1900 		      if (invert_worked)
1901 			{
1902 			  fall_thru->flags &= ~EDGE_FALLTHRU;
1903 			  cond_jump->flags |= EDGE_FALLTHRU;
1904 			  update_br_prob_note (cur_bb);
1905 			  std::swap (fall_thru, cond_jump);
1906 			  cond_jump->flags |= EDGE_CROSSING;
1907 			  fall_thru->flags &= ~EDGE_CROSSING;
1908 			}
1909 		    }
1910 		}
1911 
1912 	      if (cond_jump_crosses || !invert_worked)
1913 		{
1914 		  /* This is the case where both edges out of the basic
1915 		     block are crossing edges. Here we will fix up the
1916 		     fall through edge. The jump edge will be taken care
1917 		     of later.  The EDGE_CROSSING flag of fall_thru edge
1918 		     is unset before the call to force_nonfallthru
1919 		     function because if a new basic-block is created
1920 		     this edge remains in the current section boundary
1921 		     while the edge between new_bb and the fall_thru->dest
1922 		     becomes EDGE_CROSSING.  */
1923 
1924 		  fall_thru->flags &= ~EDGE_CROSSING;
1925 		  new_bb = force_nonfallthru (fall_thru);
1926 
1927 		  if (new_bb)
1928 		    {
1929 		      new_bb->aux = cur_bb->aux;
1930 		      cur_bb->aux = new_bb;
1931 
1932                       /* This is done by force_nonfallthru_and_redirect.  */
1933 		      gcc_assert (BB_PARTITION (new_bb)
1934                                   == BB_PARTITION (cur_bb));
1935 
1936 		      single_succ_edge (new_bb)->flags |= EDGE_CROSSING;
1937 		    }
1938 		  else
1939 		    {
1940 		      /* If a new basic-block was not created; restore
1941 			 the EDGE_CROSSING flag.  */
1942 		      fall_thru->flags |= EDGE_CROSSING;
1943 		    }
1944 
1945 		  /* Add a barrier after the new jump.  */
1946 		  emit_barrier_after_bb (new_bb ? new_bb : cur_bb);
1947 		}
1948 	    }
1949 	}
1950     }
1951 }
1952 
1953 /* This function checks the destination block of a "crossing jump" to
1954    see if it has any crossing predecessors that begin with a code label
1955    and end with an unconditional jump.  If so, it returns that predecessor
1956    block.  (This is to avoid creating lots of new basic blocks that all
1957    contain unconditional jumps to the same destination).  */
1958 
1959 static basic_block
1960 find_jump_block (basic_block jump_dest)
1961 {
1962   basic_block source_bb = NULL;
1963   edge e;
1964   rtx_insn *insn;
1965   edge_iterator ei;
1966 
1967   FOR_EACH_EDGE (e, ei, jump_dest->preds)
1968     if (e->flags & EDGE_CROSSING)
1969       {
1970 	basic_block src = e->src;
1971 
1972 	/* Check each predecessor to see if it has a label, and contains
1973 	   only one executable instruction, which is an unconditional jump.
1974 	   If so, we can use it.  */
1975 
1976 	if (LABEL_P (BB_HEAD (src)))
1977 	  for (insn = BB_HEAD (src);
1978 	       !INSN_P (insn) && insn != NEXT_INSN (BB_END (src));
1979 	       insn = NEXT_INSN (insn))
1980 	    {
1981 	      if (INSN_P (insn)
1982 		  && insn == BB_END (src)
1983 		  && JUMP_P (insn)
1984 		  && !any_condjump_p (insn))
1985 		{
1986 		  source_bb = src;
1987 		  break;
1988 		}
1989 	    }
1990 
1991 	if (source_bb)
1992 	  break;
1993       }
1994 
1995   return source_bb;
1996 }
1997 
1998 /* Find all BB's with conditional jumps that are crossing edges;
1999    insert a new bb and make the conditional jump branch to the new
2000    bb instead (the new bb is given the same partition so the conditional
2001    branch won't be a 'crossing' edge).  Insert an unconditional jump from
2002    the new bb to the original destination of the conditional jump.  */
2003 
2004 static void
2005 fix_crossing_conditional_branches (void)
2006 {
2007   basic_block cur_bb;
2008   basic_block new_bb;
2009   basic_block dest;
2010   edge succ1;
2011   edge succ2;
2012   edge crossing_edge;
2013   edge new_edge;
2014   rtx set_src;
2015   rtx old_label = NULL_RTX;
2016   rtx_code_label *new_label;
2017 
2018   FOR_EACH_BB_FN (cur_bb, cfun)
2019     {
2020       crossing_edge = NULL;
2021       if (EDGE_COUNT (cur_bb->succs) > 0)
2022 	succ1 = EDGE_SUCC (cur_bb, 0);
2023       else
2024 	succ1 = NULL;
2025 
2026       if (EDGE_COUNT (cur_bb->succs) > 1)
2027 	succ2 = EDGE_SUCC (cur_bb, 1);
2028       else
2029 	succ2 = NULL;
2030 
2031       /* We already took care of fall-through edges, so only one successor
2032 	 can be a crossing edge.  */
2033 
2034       if (succ1 && (succ1->flags & EDGE_CROSSING))
2035 	crossing_edge = succ1;
2036       else if (succ2 && (succ2->flags & EDGE_CROSSING))
2037 	crossing_edge = succ2;
2038 
2039       if (crossing_edge)
2040 	{
2041 	  rtx_insn *old_jump = BB_END (cur_bb);
2042 
2043 	  /* Check to make sure the jump instruction is a
2044 	     conditional jump.  */
2045 
2046 	  set_src = NULL_RTX;
2047 
2048 	  if (any_condjump_p (old_jump))
2049 	    {
2050 	      if (GET_CODE (PATTERN (old_jump)) == SET)
2051 		set_src = SET_SRC (PATTERN (old_jump));
2052 	      else if (GET_CODE (PATTERN (old_jump)) == PARALLEL)
2053 		{
2054 		  set_src = XVECEXP (PATTERN (old_jump), 0,0);
2055 		  if (GET_CODE (set_src) == SET)
2056 		    set_src = SET_SRC (set_src);
2057 		  else
2058 		    set_src = NULL_RTX;
2059 		}
2060 	    }
2061 
2062 	  if (set_src && (GET_CODE (set_src) == IF_THEN_ELSE))
2063 	    {
2064 	      rtx_jump_insn *old_jump_insn =
2065 			as_a <rtx_jump_insn *> (old_jump);
2066 
2067 	      if (GET_CODE (XEXP (set_src, 1)) == PC)
2068 		old_label = XEXP (set_src, 2);
2069 	      else if (GET_CODE (XEXP (set_src, 2)) == PC)
2070 		old_label = XEXP (set_src, 1);
2071 
2072 	      /* Check to see if new bb for jumping to that dest has
2073 		 already been created; if so, use it; if not, create
2074 		 a new one.  */
2075 
2076 	      new_bb = find_jump_block (crossing_edge->dest);
2077 
2078 	      if (new_bb)
2079 		new_label = block_label (new_bb);
2080 	      else
2081 		{
2082 		  basic_block last_bb;
2083 		  rtx_code_label *old_jump_target;
2084 		  rtx_jump_insn *new_jump;
2085 
2086 		  /* Create new basic block to be dest for
2087 		     conditional jump.  */
2088 
2089 		  /* Put appropriate instructions in new bb.  */
2090 
2091 		  new_label = gen_label_rtx ();
2092 		  emit_label (new_label);
2093 
2094 		  gcc_assert (GET_CODE (old_label) == LABEL_REF);
2095 		  old_jump_target = old_jump_insn->jump_target ();
2096 		  new_jump = as_a <rtx_jump_insn *>
2097 		    (emit_jump_insn (targetm.gen_jump (old_jump_target)));
2098 		  new_jump->set_jump_target (old_jump_target);
2099 
2100 		  last_bb = EXIT_BLOCK_PTR_FOR_FN (cfun)->prev_bb;
2101 		  new_bb = create_basic_block (new_label, new_jump, last_bb);
2102 		  new_bb->aux = last_bb->aux;
2103 		  last_bb->aux = new_bb;
2104 
2105 		  emit_barrier_after_bb (new_bb);
2106 
2107 		  /* Make sure new bb is in same partition as source
2108 		     of conditional branch.  */
2109 		  BB_COPY_PARTITION (new_bb, cur_bb);
2110 		}
2111 
2112 	      /* Make old jump branch to new bb.  */
2113 
2114 	      redirect_jump (old_jump_insn, new_label, 0);
2115 
2116 	      /* Remove crossing_edge as predecessor of 'dest'.  */
2117 
2118 	      dest = crossing_edge->dest;
2119 
2120 	      redirect_edge_succ (crossing_edge, new_bb);
2121 
2122 	      /* Make a new edge from new_bb to old dest; new edge
2123 		 will be a successor for new_bb and a predecessor
2124 		 for 'dest'.  */
2125 
2126 	      if (EDGE_COUNT (new_bb->succs) == 0)
2127 		new_edge = make_edge (new_bb, dest, 0);
2128 	      else
2129 		new_edge = EDGE_SUCC (new_bb, 0);
2130 
2131 	      crossing_edge->flags &= ~EDGE_CROSSING;
2132 	      new_edge->flags |= EDGE_CROSSING;
2133 	    }
2134 	}
2135     }
2136 }
2137 
2138 /* Find any unconditional branches that cross between hot and cold
2139    sections.  Convert them into indirect jumps instead.  */
2140 
2141 static void
2142 fix_crossing_unconditional_branches (void)
2143 {
2144   basic_block cur_bb;
2145   rtx_insn *last_insn;
2146   rtx label;
2147   rtx label_addr;
2148   rtx_insn *indirect_jump_sequence;
2149   rtx_insn *jump_insn = NULL;
2150   rtx new_reg;
2151   rtx_insn *cur_insn;
2152   edge succ;
2153 
2154   FOR_EACH_BB_FN (cur_bb, cfun)
2155     {
2156       last_insn = BB_END (cur_bb);
2157 
2158       if (EDGE_COUNT (cur_bb->succs) < 1)
2159 	continue;
2160 
2161       succ = EDGE_SUCC (cur_bb, 0);
2162 
2163       /* Check to see if bb ends in a crossing (unconditional) jump.  At
2164 	 this point, no crossing jumps should be conditional.  */
2165 
2166       if (JUMP_P (last_insn)
2167 	  && (succ->flags & EDGE_CROSSING))
2168 	{
2169 	  gcc_assert (!any_condjump_p (last_insn));
2170 
2171 	  /* Make sure the jump is not already an indirect or table jump.  */
2172 
2173 	  if (!computed_jump_p (last_insn)
2174 	      && !tablejump_p (last_insn, NULL, NULL))
2175 	    {
2176 	      /* We have found a "crossing" unconditional branch.  Now
2177 		 we must convert it to an indirect jump.  First create a
2178 		 reference to the label, as the target for the jump.  */
2179 
2180 	      label = JUMP_LABEL (last_insn);
2181 	      label_addr = gen_rtx_LABEL_REF (Pmode, label);
2182 	      LABEL_NUSES (label) += 1;
2183 
2184 	      /* Get a register to use for the indirect jump.  */
2185 
2186 	      new_reg = gen_reg_rtx (Pmode);
2187 
2188 	      /* Generate the indirect jump sequence.  */
2189 
2190 	      start_sequence ();
2191 	      emit_move_insn (new_reg, label_addr);
2192 	      emit_indirect_jump (new_reg);
2193 	      indirect_jump_sequence = get_insns ();
2194 	      end_sequence ();
2195 
2196 	      /* Make sure every instruction in the new jump sequence has
2197 		 its basic block set to be cur_bb.  */
2198 
2199 	      for (cur_insn = indirect_jump_sequence; cur_insn;
2200 		   cur_insn = NEXT_INSN (cur_insn))
2201 		{
2202 		  if (!BARRIER_P (cur_insn))
2203 		    BLOCK_FOR_INSN (cur_insn) = cur_bb;
2204 		  if (JUMP_P (cur_insn))
2205 		    jump_insn = cur_insn;
2206 		}
2207 
2208 	      /* Insert the new (indirect) jump sequence immediately before
2209 		 the unconditional jump, then delete the unconditional jump.  */
2210 
2211 	      emit_insn_before (indirect_jump_sequence, last_insn);
2212 	      delete_insn (last_insn);
2213 
2214 	      JUMP_LABEL (jump_insn) = label;
2215 	      LABEL_NUSES (label)++;
2216 
2217 	      /* Make BB_END for cur_bb be the jump instruction (NOT the
2218 		 barrier instruction at the end of the sequence...).  */
2219 
2220 	      BB_END (cur_bb) = jump_insn;
2221 	    }
2222 	}
2223     }
2224 }
2225 
2226 /* Update CROSSING_JUMP_P flags on all jump insns.  */
2227 
2228 static void
2229 update_crossing_jump_flags (void)
2230 {
2231   basic_block bb;
2232   edge e;
2233   edge_iterator ei;
2234 
2235   FOR_EACH_BB_FN (bb, cfun)
2236     FOR_EACH_EDGE (e, ei, bb->succs)
2237       if (e->flags & EDGE_CROSSING)
2238 	{
2239 	  if (JUMP_P (BB_END (bb))
2240 	      /* Some flags were added during fix_up_fall_thru_edges, via
2241 		 force_nonfallthru_and_redirect.  */
2242 	      && !CROSSING_JUMP_P (BB_END (bb)))
2243 	    CROSSING_JUMP_P (BB_END (bb)) = 1;
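	  /* One crossing successor is enough to know the jump crosses.  */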
2244 	  break;
2245 	}
2246 }
2247 
2248 /* Reorder basic blocks using the software trace cache (STC) algorithm.  */
2249 
2250 static void
2251 reorder_basic_blocks_software_trace_cache (void)
2252 {
2253   if (dump_file)
2254     fprintf (dump_file, "\nReordering with the STC algorithm.\n\n");
2255 
2256   int n_traces;
2257   int i;
2258   struct trace *traces;
2259 
2260   /* We estimate the length of the uncond jump insn only once, since the
2261      code for getting the insn length always returns the minimal length now.  */
2262   if (uncond_jump_length == 0)
2263     uncond_jump_length = get_uncond_jump_length ();
2264 
2265   /* We need to know some information for each basic block.  */
2266   array_size = GET_ARRAY_SIZE (last_basic_block_for_fn (cfun));
2267   bbd = XNEWVEC (bbro_basic_block_data, array_size);
2268   for (i = 0; i < array_size; i++)
2269     {
2270       bbd[i].start_of_trace = -1;
2271       bbd[i].end_of_trace = -1;
2272       bbd[i].in_trace = -1;
2273       bbd[i].visited = 0;
2274       bbd[i].priority = -1;
2275       bbd[i].heap = NULL;
2276       bbd[i].node = NULL;
2277     }
2278 
2279   traces = XNEWVEC (struct trace, n_basic_blocks_for_fn (cfun));
2280   n_traces = 0;
2281   find_traces (&n_traces, traces);
2282   connect_traces (n_traces, traces);
2283   FREE (traces);
2284   FREE (bbd);
2285 }
2286 
2287 /* Return true if edge E1 is more desirable as a fallthrough edge than
2288    edge E2 is.  */
2289 
2290 static bool
2291 edge_order (edge e1, edge e2)
2292 {
2293   return EDGE_FREQUENCY (e1) > EDGE_FREQUENCY (e2);
2294 }
2295 
2296 /* Reorder basic blocks using the "simple" algorithm.  This tries to
2297    maximize the dynamic number of branches that are fallthrough, without
2298    copying instructions.  The algorithm is greedy, looking at the most
2299    frequently executed branch first.  */
2300 
2301 static void
2302 reorder_basic_blocks_simple (void)
2303 {
2304   if (dump_file)
2305     fprintf (dump_file, "\nReordering with the \"simple\" algorithm.\n\n");
2306 
2307   edge *edges = new edge[2 * n_basic_blocks_for_fn (cfun)];
2308 
2309   /* First, collect all edges that can be optimized by reordering blocks:
2310      simple jumps and conditional jumps, as well as the function entry edge.  */
2311 
2312   int n = 0;
2313   edges[n++] = EDGE_SUCC (ENTRY_BLOCK_PTR_FOR_FN (cfun), 0);
2314 
2315   basic_block bb;
2316   FOR_EACH_BB_FN (bb, cfun)
2317     {
2318       rtx_insn *end = BB_END (bb);
2319 
2320       if (computed_jump_p (end) || tablejump_p (end, NULL, NULL))
2321 	continue;
2322 
2323       /* We cannot optimize asm goto.  */
2324       if (JUMP_P (end) && extract_asm_operands (end))
2325 	continue;
2326 
2327       if (single_succ_p (bb))
2328 	edges[n++] = EDGE_SUCC (bb, 0);
2329       else if (any_condjump_p (end))
2330 	{
2331 	  edge e0 = EDGE_SUCC (bb, 0);
2332 	  edge e1 = EDGE_SUCC (bb, 1);
2333 	  /* When optimizing for size it is best to keep the original
2334 	     fallthrough edges.  */
2335 	  if (e1->flags & EDGE_FALLTHRU)
2336 	    std::swap (e0, e1);
2337 	  edges[n++] = e0;
2338 	  edges[n++] = e1;
2339 	}
2340     }
2341 
2342   /* Sort the edges, the most desirable first.  When optimizing for size
2343      all edges are equally desirable.  */
2344 
2345   if (optimize_function_for_speed_p (cfun))
2346     std::stable_sort (edges, edges + n, edge_order);
2347 
2348   /* Now decide which of those edges to make fallthrough edges.  We set
2349      BB_VISITED if a block already has a fallthrough successor assigned
2350      to it.  We make ->AUX of an endpoint point to the opposite endpoint
2351      of a sequence of blocks that fall through, and ->AUX will be NULL
2352      for a block that is in such a sequence but not an endpoint anymore.
2353 
2354      To start with, everything points to itself, nothing is assigned yet.  */
2355 
2356   FOR_ALL_BB_FN (bb, cfun)
2357     bb->aux = bb;
2358 
2359   EXIT_BLOCK_PTR_FOR_FN (cfun)->aux = 0;
2360 
2361   /* Now for all edges, the most desirable first, see if that edge can
2362      connect two sequences.  If it can, update AUX and BB_VISITED; if it
2363      cannot, zero out the edge in the table.  */
2364 
2365   for (int j = 0; j < n; j++)
2366     {
2367       edge e = edges[j];
2368 
2369       basic_block tail_a = e->src;
2370       basic_block head_b = e->dest;
2371       basic_block head_a = (basic_block) tail_a->aux;
2372       basic_block tail_b = (basic_block) head_b->aux;
2373 
2374       /* An edge cannot connect two sequences if:
2375 	 - it crosses partitions;
2376 	 - its src is not a current endpoint;
2377 	 - its dest is not a current endpoint;
2378 	 - or, it would create a loop.  */
2379 
2380       if (e->flags & EDGE_CROSSING
2381 	  || tail_a->flags & BB_VISITED
2382 	  || !tail_b
2383 	  || (!(head_b->flags & BB_VISITED) && head_b != tail_b)
2384 	  || tail_a == tail_b)
2385 	{
2386 	  edges[j] = 0;
2387 	  continue;
2388 	}
2389 
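      /* Splice the two sequences: HEAD_A..TAIL_A now falls through to
	 HEAD_B..TAIL_B, so HEAD_A and TAIL_B become the endpoints of the
	 combined chain and the two join points stop being endpoints.  */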
2390       tail_a->aux = 0;
2391       head_b->aux = 0;
2392       head_a->aux = tail_b;
2393       tail_b->aux = head_a;
2394       tail_a->flags |= BB_VISITED;
2395     }
2396 
2397   /* Put the pieces together, in the same order that the start blocks of
2398      the sequences already had.  The hot/cold partitioning gives a little
2399      complication: as a first pass only do this for blocks in the same
2400      partition as the start block, and (if there is anything left to do)
2401      in a second pass handle the other partition.  */
2402 
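  /* LAST_TAIL is the tail of the sequence containing the entry block;
     each chosen sequence is appended after it.  */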
2403   basic_block last_tail = (basic_block) ENTRY_BLOCK_PTR_FOR_FN (cfun)->aux;
2404 
2405   int current_partition = BB_PARTITION (last_tail);
2406   bool need_another_pass = true;
2407 
2408   for (int pass = 0; pass < 2 && need_another_pass; pass++)
2409     {
2410       need_another_pass = false;
2411 
2412       FOR_EACH_BB_FN (bb, cfun)
2413 	if ((bb->flags & BB_VISITED && bb->aux) || bb->aux == bb)
2414 	  {
2415 	    if (BB_PARTITION (bb) != current_partition)
2416 	      {
2417 		need_another_pass = true;
2418 		continue;
2419 	      }
2420 
2421 	    last_tail->aux = bb;
2422 	    last_tail = (basic_block) bb->aux;
2423 	  }
2424 
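      /* Toggle to the other partition for the second pass.  */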
2425       current_partition ^= BB_HOT_PARTITION | BB_COLD_PARTITION;
2426     }
2427 
2428   last_tail->aux = 0;
2429 
2430   /* Finally, link all the chosen fallthrough edges.  */
2431 
2432   for (int j = 0; j < n; j++)
2433     if (edges[j])
2434       edges[j]->src->aux = edges[j]->dest;
2435 
2436   delete[] edges;
2437 
2438   /* If the entry edge no longer falls through we have to make a new
2439      block so it can do so again.  */
2440 
2441   edge e = EDGE_SUCC (ENTRY_BLOCK_PTR_FOR_FN (cfun), 0);
2442   if (e->dest != ENTRY_BLOCK_PTR_FOR_FN (cfun)->aux)
2443     {
2444       force_nonfallthru (e);
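      /* After force_nonfallthru, E's source is the new block created on
	 the entry edge; chain it to the block the entry block points to
	 in the layout order and copy its successor's partition.  */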
2445       e->src->aux = ENTRY_BLOCK_PTR_FOR_FN (cfun)->aux;
2446       BB_COPY_PARTITION (e->src, e->dest);
2447     }
2448 }
2449 
2450 /* Reorder basic blocks.  The main entry point to this file.  */
2451 
2452 static void
2453 reorder_basic_blocks (void)
2454 {
2455   gcc_assert (current_ir_type () == IR_RTL_CFGLAYOUT);
2456 
2457   if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1)
2458     return;
2459 
2460   set_edge_can_fallthru_flag ();
2461   mark_dfs_back_edges ();
2462 
2463   switch (flag_reorder_blocks_algorithm)
2464     {
2465     case REORDER_BLOCKS_ALGORITHM_SIMPLE:
2466       reorder_basic_blocks_simple ();
2467       break;
2468 
2469     case REORDER_BLOCKS_ALGORITHM_STC:
2470       reorder_basic_blocks_software_trace_cache ();
2471       break;
2472 
2473     default:
2474       gcc_unreachable ();
2475     }
2476 
2477   relink_block_chain (/*stay_in_cfglayout_mode=*/true);
2478 
2479   if (dump_file)
2480     {
2481       if (dump_flags & TDF_DETAILS)
2482 	dump_reg_info (dump_file);
2483       dump_flow_info (dump_file, dump_flags);
2484     }
2485 
2486   /* Signal that rtl_verify_flow_info_1 can now verify that there
2487      is at most one switch between hot/cold sections.  */
2488   crtl->bb_reorder_complete = true;
2489 }
2490 
2491 /* Determine which partition the first basic block in the function
2492    belongs to, then find the first basic block in the current function
2493    that belongs to a different section, and insert a
2494    NOTE_INSN_SWITCH_TEXT_SECTIONS note immediately before it in the
2495    instruction stream.  When writing out the assembly code,
2496    encountering this note will make the compiler switch between the
2497    hot and cold text sections.  */
2498 
2499 void
2500 insert_section_boundary_note (void)
2501 {
2502   basic_block bb;
2503   bool switched_sections = false;
2504   int current_partition = 0;
2505 
2506   if (!crtl->has_bb_partition)
2507     return;
2508 
2509   FOR_EACH_BB_FN (bb, cfun)
2510     {
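      /* Remember the partition of the first block; the note is emitted
	 immediately before the first block found in the other partition.  */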
2511       if (!current_partition)
2512 	current_partition = BB_PARTITION (bb);
2513       if (BB_PARTITION (bb) != current_partition)
2514 	{
2515 	  gcc_assert (!switched_sections);
2516           switched_sections = true;
2517           emit_note_before (NOTE_INSN_SWITCH_TEXT_SECTIONS, BB_HEAD (bb));
2518           current_partition = BB_PARTITION (bb);
2519 	}
2520     }
2521 }
2522 
2523 namespace {
2524 
2525 const pass_data pass_data_reorder_blocks =
2526 {
2527   RTL_PASS, /* type */
2528   "bbro", /* name */
2529   OPTGROUP_NONE, /* optinfo_flags */
2530   TV_REORDER_BLOCKS, /* tv_id */
2531   0, /* properties_required */
2532   0, /* properties_provided */
2533   0, /* properties_destroyed */
2534   0, /* todo_flags_start */
2535   0, /* todo_flags_finish */
2536 };
2537 
2538 class pass_reorder_blocks : public rtl_opt_pass
2539 {
2540 public:
2541   pass_reorder_blocks (gcc::context *ctxt)
2542     : rtl_opt_pass (pass_data_reorder_blocks, ctxt)
2543   {}
2544 
2545   /* opt_pass methods: */
2546   virtual bool gate (function *)
2547     {
2548       if (targetm.cannot_modify_jumps_p ())
2549 	return false;
2550       return (optimize > 0
2551 	      && (flag_reorder_blocks || flag_reorder_blocks_and_partition));
2552     }
2553 
2554   virtual unsigned int execute (function *);
2555 
2556 }; // class pass_reorder_blocks
2557 
2558 unsigned int
2559 pass_reorder_blocks::execute (function *fun)
2560 {
2561   basic_block bb;
2562 
2563   /* Last attempt to optimize CFG, as scheduling, peepholing and insn
2564      splitting possibly introduced more crossjumping opportunities.  */
2565   cfg_layout_initialize (CLEANUP_EXPENSIVE);
2566 
2567   reorder_basic_blocks ();
2568   cleanup_cfg (CLEANUP_EXPENSIVE);
2569 
2570   FOR_EACH_BB_FN (bb, fun)
2571     if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
2572       bb->aux = bb->next_bb;
2573   cfg_layout_finalize ();
2574 
2575   return 0;
2576 }
2577 
2578 } // anon namespace
2579 
2580 rtl_opt_pass *
2581 make_pass_reorder_blocks (gcc::context *ctxt)
2582 {
2583   return new pass_reorder_blocks (ctxt);
2584 }
2585 
2586 /* Duplicate the blocks containing computed gotos.  This basically unfactors
2587    computed gotos that were factored early on in the compilation process to
2588    speed up edge based data flow.  Not unfactoring them again can seriously
2589    pessimize code with many computed jumps in the source code, such as
2590    interpreters.  See e.g. PR15242.  */
2591 
2592 namespace {
2593 
2594 const pass_data pass_data_duplicate_computed_gotos =
2595 {
2596   RTL_PASS, /* type */
2597   "compgotos", /* name */
2598   OPTGROUP_NONE, /* optinfo_flags */
2599   TV_REORDER_BLOCKS, /* tv_id */
2600   0, /* properties_required */
2601   0, /* properties_provided */
2602   0, /* properties_destroyed */
2603   0, /* todo_flags_start */
2604   0, /* todo_flags_finish */
2605 };
2606 
2607 class pass_duplicate_computed_gotos : public rtl_opt_pass
2608 {
2609 public:
2610   pass_duplicate_computed_gotos (gcc::context *ctxt)
2611     : rtl_opt_pass (pass_data_duplicate_computed_gotos, ctxt)
2612   {}
2613 
2614   /* opt_pass methods: */
2615   virtual bool gate (function *);
2616   virtual unsigned int execute (function *);
2617 
2618 }; // class pass_duplicate_computed_gotos
2619 
2620 bool
2621 pass_duplicate_computed_gotos::gate (function *fun)
2622 {
2623   if (targetm.cannot_modify_jumps_p ())
2624     return false;
2625   return (optimize > 0
2626 	  && flag_expensive_optimizations
2627 	  && ! optimize_function_for_size_p (fun));
2628 }
2629 
2630 unsigned int
2631 pass_duplicate_computed_gotos::execute (function *fun)
2632 {
2633   basic_block bb, new_bb;
2634   bitmap candidates;
2635   int max_size;
2636   bool changed = false;
2637 
2638   if (n_basic_blocks_for_fn (fun) <= NUM_FIXED_BLOCKS + 1)
2639     return 0;
2640 
2641   clear_bb_flags ();
2642   cfg_layout_initialize (0);
2643 
2644   /* We estimate the length of the uncond jump insn only once,
2645      since the code for getting the insn length always returns
2646      the minimal length now.  */
2647   if (uncond_jump_length == 0)
2648     uncond_jump_length = get_uncond_jump_length ();
2649 
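  /* A block is a duplication candidate only if its estimated length stays
     within PARAM_MAX_GOTO_DUPLICATION_INSNS unconditional-jump lengths.  */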
2650   max_size
2651     = uncond_jump_length * PARAM_VALUE (PARAM_MAX_GOTO_DUPLICATION_INSNS);
2652   candidates = BITMAP_ALLOC (NULL);
2653 
2654   /* Look for blocks that end in a computed jump, and see if such blocks
2655      are suitable for unfactoring.  If a block is a candidate for unfactoring,
2656      mark it in the candidates.  */
2657   FOR_EACH_BB_FN (bb, fun)
2658     {
2659       rtx_insn *insn;
2660       edge e;
2661       edge_iterator ei;
2662       int size, all_flags;
2663 
2664       /* Build the reorder chain for the original order of blocks.  */
2665       if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
2666 	bb->aux = bb->next_bb;
2667 
2668       /* Obviously the block has to end in a computed jump.  */
2669       if (!computed_jump_p (BB_END (bb)))
2670 	continue;
2671 
2672       /* Only consider blocks that can be duplicated.  */
2673       if (CROSSING_JUMP_P (BB_END (bb))
2674 	  || !can_duplicate_block_p (bb))
2675 	continue;
2676 
2677       /* Make sure that the block is small enough.  */
2678       size = 0;
2679       FOR_BB_INSNS (bb, insn)
2680 	if (INSN_P (insn))
2681 	  {
2682 	    size += get_attr_min_length (insn);
2683 	    if (size > max_size)
2684 	       break;
2685 	  }
2686       if (size > max_size)
2687 	continue;
2688 
2689       /* Final check: there must not be any incoming abnormal edges.  */
2690       all_flags = 0;
2691       FOR_EACH_EDGE (e, ei, bb->preds)
2692 	all_flags |= e->flags;
2693       if (all_flags & EDGE_COMPLEX)
2694 	continue;
2695 
2696       bitmap_set_bit (candidates, bb->index);
2697     }
2698 
2699   /* Nothing to do if there is no computed jump here.  */
2700   if (bitmap_empty_p (candidates))
2701     goto done;
2702 
2703   /* Duplicate computed gotos.  */
2704   FOR_EACH_BB_FN (bb, fun)
2705     {
2706       if (bb->flags & BB_VISITED)
2707 	continue;
2708 
2709       bb->flags |= BB_VISITED;
2710 
2711       /* BB must have one outgoing edge.  That edge must not lead to
2712 	 the exit block or the next block.
2713 	 The destination must have more than one predecessor.  */
2714       if (!single_succ_p (bb)
2715 	  || single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (fun)
2716 	  || single_succ (bb) == bb->next_bb
2717 	  || single_pred_p (single_succ (bb)))
2718 	continue;
2719 
2720       /* The successor block has to be a duplication candidate.  */
2721       if (!bitmap_bit_p (candidates, single_succ (bb)->index))
2722 	continue;
2723 
2724       /* Don't duplicate a partition crossing edge, which requires difficult
2725          fixup.  */
2726       if (JUMP_P (BB_END (bb)) && CROSSING_JUMP_P (BB_END (bb)))
2727 	continue;
2728 
2729       new_bb = duplicate_block (single_succ (bb), single_succ_edge (bb), bb);
2730       new_bb->aux = bb->aux;
2731       bb->aux = new_bb;
2732       new_bb->flags |= BB_VISITED;
2733       changed = true;
2734     }
2735 
2736  done:
2737   if (changed)
2738     {
2739       /* Duplicating blocks above will redirect edges and may cause hot
2740 	 blocks previously reached by both hot and cold blocks to become
2741 	 dominated only by cold blocks.  */
2742       fixup_partitions ();
2743 
2744       /* Merge the duplicated blocks into predecessors, when possible.  */
2745       cfg_layout_finalize ();
2746       cleanup_cfg (0);
2747     }
2748   else
2749     cfg_layout_finalize ();
2750 
2751   BITMAP_FREE (candidates);
2752   return 0;
2753 }
2754 
2755 } // anon namespace
2756 
2757 rtl_opt_pass *
2758 make_pass_duplicate_computed_gotos (gcc::context *ctxt)
2759 {
2760   return new pass_duplicate_computed_gotos (ctxt);
2761 }
2762 
2763 /* This function is the main 'entrance' for the optimization that
2764    partitions hot and cold basic blocks into separate sections of the
2765    .o file (to improve performance and cache locality).  Ideally it
2766    would be called after all optimizations that rearrange the CFG have
2767    been called.  However part of this optimization may introduce new
2768    register usage, so it must be called before register allocation has
2769    occurred.  This means that this optimization is actually called
2770    well before the optimization that reorders basic blocks (see
2771    function above).
2772 
2773    This optimization checks the feedback information to determine
2774    which basic blocks are hot/cold, and updates flags on the basic blocks
2775    to indicate which section they belong in.  This information is
2776    later used for writing out sections in the .o file.  Because hot
2777    and cold sections can be arbitrarily large (within the bounds of
2778    memory), far beyond the size of a single function, it is necessary
2779    to fix up all edges that cross section boundaries, to make sure the
2780    instructions used can actually span the required distance.  The
2781    fixes are described below.
2782 
2783    Fall-through edges must be changed into jumps; it is not safe or
2784    legal to fall through across a section boundary.  Whenever a
2785    fall-through edge crossing a section boundary is encountered, a new
2786    basic block is inserted (in the same section as the fall-through
2787    source), and the fall through edge is redirected to the new basic
2788    block.  The new basic block contains an unconditional jump to the
2789    original fall-through target.  (If the unconditional jump is
2790    insufficient to cross section boundaries, that is dealt with a
2791    little later, see below).
2792 
2793    In order to deal with architectures that have short conditional
2794    branches (which cannot span all of memory) we take any conditional
2795    jump that attempts to cross a section boundary and add a level of
2796    indirection: it becomes a conditional jump to a new basic block, in
2797    the same section.  The new basic block contains an unconditional
2798    jump to the original target, in the other section.
2799 
2800    For those architectures whose unconditional branch is also
2801    incapable of reaching all of memory, those unconditional jumps are
2802    converted into indirect jumps, through a register.
2803 
2804    IMPORTANT NOTE: This optimization causes some messy interactions
2805    with the cfg cleanup optimizations; those optimizations want to
2806    merge blocks wherever possible, and to collapse indirect jump
2807    sequences (change "A jumps to B jumps to C" directly into "A jumps
2808    to C").  Those optimizations can undo the jump fixes that
2809    partitioning is required to make (see above), in order to ensure
2810    that jumps attempting to cross section boundaries are really able
2811    to cover whatever distance the jump requires (on many architectures
2812    conditional or unconditional jumps are not able to reach all of
2813    memory).  Therefore tests have to be inserted into each such
2814    optimization to make sure that it does not undo stuff necessary to
2815    cross partition boundaries.  This would be much less of a problem
2816    if we could perform this optimization later in the compilation, but
2817    unfortunately the fact that we may need to create indirect jumps
2818    (through registers) requires that this optimization be performed
2819    before register allocation.
2820 
2821    Hot and cold basic blocks are partitioned and put in separate
2822    sections of the .o file, to reduce paging and improve cache
2823    performance (hopefully).  This can result in bits of code from the
2824    same function being widely separated in the .o file.  However this
2825    separation is not visible to the current bb structure.  Therefore we must take
2826    care to ensure that: 1). There are no fall_thru edges that cross
2827    between sections; 2). For those architectures which have "short"
2828    conditional branches, all conditional branches that attempt to
2829    cross between sections are converted to unconditional branches;
2830    and, 3). For those architectures which have "short" unconditional
2831    branches, all unconditional branches that attempt to cross between
2832    sections are converted to indirect jumps.
2833 
2834    The code for fixing up fall_thru edges that cross between hot and
2835    cold basic blocks does so by creating new basic blocks containing
2836    unconditional branches to the appropriate label in the "other"
2837    section.  The new basic block is then put in the same (hot or cold)
2838    section as the original conditional branch, and the fall_thru edge
2839    is modified to fall into the new basic block instead.  By adding
2840    this level of indirection we end up with only unconditional branches
2841    crossing between hot and cold sections.
2842 
2843    Conditional branches are dealt with by adding a level of indirection.
2844    A new basic block is added in the same (hot/cold) section as the
2845    conditional branch, and the conditional branch is retargeted to the
2846    new basic block.  The new basic block contains an unconditional branch
2847    to the original target of the conditional branch (in the other section).
2848 
2849    Unconditional branches are dealt with by converting them into
2850    indirect jumps.  */
2851 
2852 namespace {
2853 
2854 const pass_data pass_data_partition_blocks =
2855 {
2856   RTL_PASS, /* type */
2857   "bbpart", /* name */
2858   OPTGROUP_NONE, /* optinfo_flags */
2859   TV_REORDER_BLOCKS, /* tv_id */
2860   PROP_cfglayout, /* properties_required */
2861   0, /* properties_provided */
2862   0, /* properties_destroyed */
2863   0, /* todo_flags_start */
2864   0, /* todo_flags_finish */
2865 };
2866 
2867 class pass_partition_blocks : public rtl_opt_pass
2868 {
2869 public:
2870   pass_partition_blocks (gcc::context *ctxt)
2871     : rtl_opt_pass (pass_data_partition_blocks, ctxt)
2872   {}
2873 
2874   /* opt_pass methods: */
2875   virtual bool gate (function *);
2876   virtual unsigned int execute (function *);
2877 
2878 }; // class pass_partition_blocks
2879 
2880 bool
2881 pass_partition_blocks::gate (function *fun)
2882 {
2883   /* The optimization to partition hot/cold basic blocks into separate
2884      sections of the .o file does not work well with linkonce or with
2885      user defined section attributes.  Don't call it if either case
2886      arises.  */
2887   return (flag_reorder_blocks_and_partition
2888 	  && optimize
2889 	  /* See gate_handle_reorder_blocks.  We should not partition if
2890 	     we are going to omit the reordering.  */
2891 	  && optimize_function_for_speed_p (fun)
2892 	  && !DECL_COMDAT_GROUP (current_function_decl)
2893 	  && !user_defined_section_attribute);
2894 }
2895 
2896 unsigned
2897 pass_partition_blocks::execute (function *fun)
2898 {
2899   vec<edge> crossing_edges;
2900 
2901   if (n_basic_blocks_for_fn (fun) <= NUM_FIXED_BLOCKS + 1)
2902     return 0;
2903 
2904   df_set_flags (DF_DEFER_INSN_RESCAN);
2905 
2906   crossing_edges = find_rarely_executed_basic_blocks_and_crossing_edges ();
2907   if (!crossing_edges.exists ())
2908     /* Make sure to process deferred rescans and clear changeable df flags.  */
2909     return TODO_df_finish;
2910 
2911   crtl->has_bb_partition = true;
2912 
2913   /* Make sure the source of any crossing edge ends in a jump and the
2914      destination of any crossing edge has a label.  */
2915   add_labels_and_missing_jumps (crossing_edges);
2916 
2917   /* Convert all crossing fall_thru edges into non-crossing fall-thrus
2918      to new blocks that contain unconditional jumps to the original
2919      fall-through destinations.  */
2920   fix_up_fall_thru_edges ();
2921 
2922   /* If the architecture does not have conditional branches that can
2923      span all of memory, convert crossing conditional branches into
2924      crossing unconditional branches.  */
2925   if (!HAS_LONG_COND_BRANCH)
2926     fix_crossing_conditional_branches ();
2927 
2928   /* If the architecture does not have unconditional branches that
2929      can span all of memory, convert crossing unconditional branches
2930      into indirect jumps.  Since adding an indirect jump also adds
2931      a new register usage, update the register usage information as
2932      well.  */
2933   if (!HAS_LONG_UNCOND_BRANCH)
2934     fix_crossing_unconditional_branches ();
2935 
2936   update_crossing_jump_flags ();
2937 
2938   /* Clear bb->aux fields that the above routines were using.  */
2939   clear_aux_for_blocks ();
2940 
2941   crossing_edges.release ();
2942 
2943   /* ??? FIXME: DF generates the bb info for a block immediately.
2944      And by immediately, I mean *during* creation of the block.
2945 
2946 	#0  df_bb_refs_collect
2947 	#1  in df_bb_refs_record
2948 	#2  in create_basic_block_structure
2949 
2950      Which means that the bb_has_eh_pred test in df_bb_refs_collect
2951      will *always* fail, because no edges can have been added to the
2952      block yet.  Which of course means we don't add the right
2953      artificial refs, which means we fail df_verify (much) later.
2954 
2955      Cleanest solution would seem to make DF_DEFER_INSN_RESCAN imply
2956      that we also shouldn't grab data from the new blocks those new
2957      insns are in either.  In this way one can create the block, link
2958      it up properly, and have everything Just Work later, when deferred
2959      insns are processed.
2960 
2961      In the meantime, we have no other option but to throw away all
2962      of the DF data and recompute it all.  */
2963   if (fun->eh->lp_array)
2964     {
2965       df_finish_pass (true);
2966       df_scan_alloc (NULL);
2967       df_scan_blocks ();
2968       /* Not all post-landing pads use all of the EH_RETURN_DATA_REGNO
2969 	 data.  We blindly generated all of them when creating the new
2970 	 landing pad.  Delete those assignments we don't use.  */
2971       df_set_flags (DF_LR_RUN_DCE);
2972       df_analyze ();
2973     }
2974 
2975   /* Make sure to process deferred rescans and clear changeable df flags.  */
2976   return TODO_df_finish;
2977 }
2978 
2979 } // anon namespace
2980 
2981 rtl_opt_pass *
2982 make_pass_partition_blocks (gcc::context *ctxt)
2983 {
2984   return new pass_partition_blocks (ctxt);
2985 }
2986