/* Loop unrolling.
   Copyright (C) 2002-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "memmodel.h"
#include "optabs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "profile.h"
#include "cfgrtl.h"
#include "cfgloop.h"
#include "params.h"
#include "dojump.h"
#include "expr.h"
#include "dumpfile.h"

/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater
   sequentiality of the code, better opportunities for subsequent
   optimization passes and, in some cases, from fewer tests of the
   exit condition.  The main problem is code growth, which impacts
   performance negatively due to its effect on caches.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops whose number of iterations can be computed at
      runtime; we also get rid of the exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of the remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are many parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several %.  */
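
/* As a sketch of how the unroll factor is bounded (these are the
   parameters actually consulted by the decide_unroll_* routines below):

     nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
     nunroll_by_av
       = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
     nunroll = MIN (nunroll, nunroll_by_av);
     nunroll = MIN (nunroll, PARAM_VALUE (PARAM_MAX_UNROLL_TIMES));

   i.e. the candidate number of body copies is capped both by the total
   and average insn-count limits and by an absolute limit on the number
   of unroll times.  */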

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	           /* The insn in which the variable expansion occurs.  */
  rtx reg;                         /* The accumulator which is expanded.  */
  vec<rtx> var_expansions;         /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;	   /* Next entry in walking order.  */
  enum rtx_code op;                /* The type of the accumulation - addition, subtraction
                                      or multiplication.  */
  int expansion_count;             /* Count the number of expansions generated so far.  */
  int reuse_expansion;             /* The expansion we intend to reuse to expand
                                      the accumulator.  If REUSE_EXPANSION is 0 reuse
                                      the original accumulator.  Else use
                                      var_expansions[REUSE_EXPANSION - 1].  */
};
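
/* For example (a sketch of how these fields are used during unrolling,
   assuming two expansions were created for an accumulator SUM):
   successive copies of the accumulation insn use SUM,
   var_expansions[0], var_expansions[1], SUM, ... in a round-robin
   fashion, and the partial results are combined back into SUM at the
   loop exit.  */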

/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};


/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}

/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;        /* The first basic block that was
                                      duplicated.  */
  basic_block loop_exit;           /* The loop exit basic block.  */
  basic_block loop_preheader;      /* The loop preheader basic block.  */
};

static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop *, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (struct loop *loop, location_t locus)
{
  dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_printf_loc (report_flags, locus,
                   "loop unrolled %d times",
                   loop->lpt_decision.times);
  if (profile_info && loop->header->count.initialized_p ())
    dump_printf (report_flags,
                 " (header execution count %d)",
                 (int)loop->header->count.to_gcov_type ());

  dump_printf (report_flags, "\n");
}

/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, locus,
			 "considering unrolling loop %d at BB %d\n",
			 loop->num, loop->header->index);

      if (loop->unroll == 1)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not unrolling loop, user didn't want it unrolled\n");
	  continue;
	}

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try the transformations one by one in decreasing order of
	 priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}

/* Unroll LOOPS.  */
void
unroll_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* First decide how much to unroll.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}

/* Check whether the exit of LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn) && active_insn_p (insn))
	return false;
    }

  return true;
}
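
/* In source terms (an illustrative sketch, not code from this file):

     do { body; } while (cond);   // exit test at the end of the body:
                                  // loop_exit_at_end_p returns true

     while (cond) { body; }       // exit test before the body:
                                  // loop_exit_at_end_p returns false
   */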

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with constant "
		 "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check the number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check for an explicit unrolling factor.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    {
      /* However we cannot unroll completely at the RTL level a loop with
	 a constant number of iterations; it should have been peeled
	 instead.  */
      if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Loop should have been peeled\n");
	}
      else
	{
	  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
	  loop->lpt_decision.times = loop->unroll - 1;
	}
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also the loop bounds and profile; in case the loop has more
     than one exit it may well iterate fewer times than the determined
     maximal number of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_likely_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We adjust
     nunroll so that as few copies of the loop body as possible are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
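  /* For instance (an illustrative sketch, assuming the exit is not at the
     end of the body, so that n_copies == exit_mod + i + 1 below): with
     desc->niter == 102 and nunroll == 8, the search tries i from 18 down
     to 7 and finds the minimum of 12 body copies at i == 9, i.e. the body
     is duplicated 9 times and 102 % 10 == 2 peeled copies are needed.  */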
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i > desc->niter - 2)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}

/* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  auto_sbitmap wont_exit (max_unroll + 2);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a contiguous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						   : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as
	 above, since the loop tests the condition at the end of the
	 loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care
	 about the case when we would exit before reaching the loop.  So
	 just peel exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						   : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					   : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find the new in and out edges; they are in the last copy we
	 have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  desc->niter_expr = gen_int_mode (desc->niter, desc->mode);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime-computable number
   of times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with runtime-"
		 "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as the code
     generation requires it: we are unable to cope with overflows in
     the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
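  /* For example: with nunroll == 6 the loop above stops with i == 4, so
     lpt_decision.times below becomes 3 and the unrolled loop will
     contain 4 copies of the body, a power of two.  */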

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

      * If do_compare_rtx_and_jump needs several branches to do comparison
	in a mode that needs comparison by parts, we cannot analyze the
	number of iterations of the loop, and we never get to unrolling it.

      * The code in expand_divmod that was suspected to cause creation of
	branching code seems to be only accessed for signed division.  The
	divisions used by the # of iterations analysis are always unsigned.
	Problems might arise on architectures that emit branching code
	for some operations that may appear in the unroller (especially
	for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, profile_probability prob,
		      rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label,
			       profile_probability::uninitialized ());
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  if (prob.initialized_p ())
    add_reg_br_prob_note (jump, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}

/* Unroll LOOP, for which we are able to count the number of iterations
   at runtime, LOOP->LPT_DECISION.TIMES times.  The times value must be a
   power of two.  The transformation does this (with some extra care
   for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j;
  profile_probability p;
  basic_block preheader, *body, swtch, ezc_swtch = NULL;
  int may_exit_copy;
  profile_count iter_count, new_count;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave the exit in the first copy (for an explanation why, see
	 the comment in unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave the exit in the last copy (for an explanation why, see
	 the comment in unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get an expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* For loops that exit at the end and whose number of iterations is
     reliable, add one to niter to account for the first pass through the
     loop body before reaching the exit test.  */
  if (exit_at_end && !desc->noloop_assumptions)
    {
      niter = expand_simple_binop (desc->mode, PLUS,
				   niter, const1_rtx,
				   NULL_RTX, 0, OPTAB_LIB_WIDEN);
      old_niter = niter;
    }

  /* Count the modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
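
  /* E.g. with max_unroll == 3 the unrolled loop contains 4 body copies,
     and niter & 3 == niter % 4 for any unsigned niter, even one that
     wrapped around during its computation.  */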

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  auto_sbitmap wont_exit (max_unroll + 2);

  if (extra_zero_check || desc->noloop_assumptions)
    {
      /* Peel the first copy of the loop body.  Leave the exit test if the
	 number of iterations is not reliable.  Also record the place of
	 the extra zero check.  */
      bitmap_clear (wont_exit);
      if (!desc->noloop_assumptions)
	bitmap_set_bit (wont_exit, 1);
      ezc_swtch = loop_preheader_edge (loop)->src;
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);
    }

  /* Record the place where the switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  /* Compute count increments for each switch block and initialize the
     innermost switch block.  Switch blocks and peeled loop copies are
     built from the innermost outward.  */
  iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
  swtch->count = new_count;

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create an item for the switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = profile_probability::always ().apply_scale (1, i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      /* Add in the count of the edge from the switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter),
					  gen_int_mode (j, desc->mode), EQ,
					  block_label (preheader), p, NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      new_count += iter_count;
      swtch->count = new_count;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add a branch for zero iterations.  */
      p = profile_probability::always ().apply_scale (1, max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      /* Recompute count adjustments since the initial peel copy may
	 have exited and reduced those values that were computed above.  */
      iter_count = swtch->count.apply_scale (1, max_unroll + 1);
      /* Add in the count of the edge from the switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll the loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					   : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find the new in and out edges; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     to the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
      if (loop->any_likely_upper_bound
	  && loop->nb_iterations_likely_upper_bound != 0)
	--loop->nb_iterations_likely_upper_bound;
      else
	loop->any_likely_upper_bound = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}

/* Unroll LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  auto_sbitmap wont_exit (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					   : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      /* We may indeed get here, provided that there are nontrivial
	 assumptions for the loop to be really simple.  We could update
	 the counts, but the problem is that we are unable to decide
	 which exit will be taken (not really true in case the number
	 of iterations is constant, but no one will do anything with
	 this information, so we do not worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}

/* Return true if REG is referenced in exactly one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
        return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in the MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for, which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using, which initializes the
	 expansions with zero and sums them at the end of the computation,
	 will yield wrong results for (x = something - x); thus avoid
	 using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  scalar_int_mode mode;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended, for example, by also selecting addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
    return NULL;

  if (!biv_p (insn, mode, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
1566 
1567 /* Determines which of insns in LOOP can be optimized.
1568    Return a OPT_INFO struct with the relevant hash tables filled
1569    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1570    is undefined for the return value.  */
1571 
1572 static struct opt_info *
analyze_insns_in_loop(struct loop * loop)1573 analyze_insns_in_loop (struct loop *loop)
1574 {
1575   basic_block *body, bb;
1576   unsigned i;
1577   struct opt_info *opt_info = XCNEW (struct opt_info);
1578   rtx_insn *insn;
1579   struct iv_to_split *ivts = NULL;
1580   struct var_to_expand *ves = NULL;
1581   iv_to_split **slot1;
1582   var_to_expand **slot2;
1583   vec<edge> edges = get_loop_exit_edges (loop);
1584   edge exit;
1585   bool can_apply = false;
1586 
1587   iv_analysis_loop_init (loop);
1588 
1589   body = get_loop_body (loop);
1590 
1591   if (flag_split_ivs_in_unroller)
1592     {
1593       opt_info->insns_to_split
1594        	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1595       opt_info->iv_to_split_head = NULL;
1596       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1597     }
1598 
1599   /* Record the loop exit bb and loop preheader before the unrolling.  */
1600   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1601 
1602   if (edges.length () == 1)
1603     {
1604       exit = edges[0];
1605       if (!(exit->flags & EDGE_COMPLEX))
1606 	{
1607 	  opt_info->loop_exit = split_edge (exit);
1608 	  can_apply = true;
1609 	}
1610     }
1611 
  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
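      /* Only insns in blocks that dominate the latch execute on every
	 iteration, so only those are candidates for splitting or
	 expansion.  */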
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
      {
        if (!INSN_P (insn))
          continue;

        if (opt_info->insns_to_split)
          ivts = analyze_iv_to_split_insn (insn);

        if (ivts)
          {
            slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	    gcc_assert (*slot1 == NULL);
            *slot1 = ivts;
	    *opt_info->iv_to_split_tail = ivts;
	    opt_info->iv_to_split_tail = &ivts->next;
            continue;
          }

        if (opt_info->insns_with_var_to_expand)
          ves = analyze_insn_to_expand_var (loop, insn);

        if (ves)
          {
            slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	    gcc_assert (*slot2 == NULL);
            *slot2 = ves;
	    *opt_info->var_to_expand_tail = ves;
	    opt_info->var_to_expand_tail = &ves->next;
          }
      }
    }

  edges.release ();
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */
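/* For example, with N_COPIES == 3 and UNROLLING true the original body
   is copy 0 and the new copies are 1, 2 and 3 steps away from the
   initialization; when peeling, the initialization is in copy 1, so
   copies 1, 2 and 3 are 0, 1 and 2 steps away and the original body,
   which runs last, is N_COPIES steps away.  */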

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}

/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */
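/* For instance, if the insn is the IV increment  i = i + 4, this emits
   i0 = i + 4  just before it; split_iv then rewrites the insn itself
   in terms of the base variable i0.  */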

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */
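/* E.g. in copy number 2 of an IV with step 4, a use  i = i + 4  becomes
   i = i0 + 8  (modulo the fallback paths below for when the resulting
   expression is not directly valid in INSN).  */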

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}


/* Return one expansion of the accumulator recorded in struct VE.  */
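/* The expansions are handed out round-robin: first the original
   accumulator VE->reg, then each register in VE->var_expansions in
   turn, wrapping around after the last one.  */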

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}


/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */
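/* Each copy of an accumulation  acc = acc op x  thus becomes
   accN = accN op x  for a fresh register accN, until
   PARAM_MAX_VARIABLE_EXPANSIONS registers have been created; after
   that the existing expansions are reused in turn via
   get_expansion.  */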

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
        ve->var_expansions.safe_push (new_reg);
        ve->expansion_count++;
      }
}

/* Initialize the variable expansions in loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus, in order to honor signed
   zero.  This way we can prevent cases where the sign of the final
   result is affected by the sign of the expansion.  Here is an example
   to demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        {
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
          emit_move_insn (var, zero_init);
        }
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        {
          zero_init = CONST1_RTX (GET_MODE (var));
          emit_move_insn (var, zero_init);
        }
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */
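/* For the PLUS example above this emits, after the basic block note of
   PLACE, code computing  sum2 + (sum1 + sum)  back into the original
   accumulator SUM.  */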

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
     it both here and as the destination of the assignment.  */
  sum = copy_rtx (sum);
  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */
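/* The blocks created by duplicate_loop_to_header_edge are those
   numbered FIRST_NEW_BLOCK and above; they are walked in lockstep with
   their originals so that each copied insn can be looked up in the
   hash tables of OPT_INFO via the corresponding original insn.  */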

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
        {
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
            continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply splitting iv optimization.  */
          if (opt_info->insns_to_split)
            {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

              ivts = opt_info->insns_to_split->find (&ivts_templ);

              if (ivts)
                {
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }
          /* Apply variable expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand)
            {
              ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
              if (ves)
                {
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }
          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Also rewrite the original loop body.  Find its blocks as the
     originals of the blocks in the last copied iteration, i.e. those that
     have get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
	    continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split)
            {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

              ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }

        }
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}