/* Loop unrolling.
   Copyright (C) 2002-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "memmodel.h"
#include "optabs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "profile.h"
#include "cfgrtl.h"
#include "cfgloop.h"
#include "dojump.h"
#include "expr.h"
#include "dumpfile.h"

/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest there, and we want to avoid
   unnecessary code size growth.  The gain comes from greater
   sequentiality of code, better opportunities for subsequent passes to
   optimize, and in some cases fewer tests of exit conditions.  The main
   cost is code growth, which hurts performance through cache effects.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations.
   -- simple unrolling of the remaining loops; this is performed only if
      we are asked to, as the gain is questionable in this case and it
      may even slow down the code.
   For a more detailed description of each of those, see the comments at
   the appropriate functions below.

   There are many parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A major problem is that we do not have a good way to determine
   how many times we should unroll a loop; the experiments made so far
   showed that this choice may affect performance by several percent.
   */
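
/* As an illustration (not taken from the code below, just an example):
   a loop like "for (i = 0; i < 128; i++) a[i] = b[i];" rolls a constant
   number of times and falls under the first case; the same loop with a
   runtime bound "i < n" falls under the second; a loop whose number of
   iterations we cannot analyze at all can only be unrolled "stupidly",
   keeping all of its exit tests.  */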

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	           /* The insn in which the variable expansion occurs.  */
  rtx reg;                         /* The accumulator which is expanded.  */
  vec<rtx> var_expansions;         /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;	   /* Next entry in walking order.  */
  enum rtx_code op;                /* The type of the accumulation - addition, subtraction
                                      or multiplication.  */
  int expansion_count;             /* Count the number of expansions generated so far.  */
  int reuse_expansion;             /* The expansion we intend to reuse to expand
                                      the accumulator.  If REUSE_EXPANSION is 0 reuse
                                      the original accumulator.  Else use
                                      var_expansions[REUSE_EXPANSION - 1].  */
};

/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};


/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}

/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;        /* The first basic block that was
                                      duplicated.  */
  basic_block loop_exit;           /* The loop exit basic block.  */
  basic_block loop_preheader;      /* The loop preheader basic block.  */
};

static void decide_unroll_stupid (class loop *, int);
static void decide_unroll_constant_iterations (class loop *, int);
static void decide_unroll_runtime_iterations (class loop *, int);
static void unroll_loop_stupid (class loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (class loop *);
static void unroll_loop_runtime_iterations (class loop *);
static struct opt_info *analyze_insns_in_loop (class loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (class loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (class loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (class loop *loop, dump_location_t locus)
{
  dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_metadata_t metadata (report_flags, locus.get_impl_location ());
  dump_printf_loc (metadata, locus.get_user_location (),
                   "loop unrolled %d times",
                   loop->lpt_decision.times);
  if (profile_info && loop->header->count.initialized_p ())
    dump_printf (metadata,
                 " (header execution count %d)",
                 (int)loop->header->count.to_gcov_type ());

  dump_printf (metadata, "\n");
}

/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  class loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      dump_user_location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, locus,
			 "considering unrolling loop %d at BB %d\n",
			 loop->num, loop->header->index);

      if (loop->unroll == 1)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not unrolling loop, user didn't want it unrolled\n");
	  continue;
	}

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}

/* Unroll LOOPS.  */
void
unroll_loops (int flags)
{
  class loop *loop;
  bool changed = false;

  /* First decide how much to unroll each loop.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}

/* Check whether the exit of the LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (class loop *loop)
{
  class niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn) && active_insn_p (insn))
	return false;
    }

  return true;
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with constant "
		 "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av
    = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check for an explicit unrolling factor.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    {
      /* However we cannot unroll completely at the RTL level a loop with
	 constant number of iterations; it should have been peeled instead.  */
      if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Loop should have been peeled\n");
	}
      else
	{
	  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
	  loop->lpt_decision.times = loop->unroll - 1;
	}
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in case the loop has more
     than one exit it may well loop less than the determined maximal
     number of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_likely_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
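  /* A worked example with illustrative numbers: if niter is 100, nunroll
     is 8 and the exit test is at the beginning of the loop, the search
     below tries i from 18 down to 7; i == 9 gives
     exit_mod == 100 % 10 == 0 and thus only 10 copies of the body, which
     is minimal, so we decide to unroll 9 times.  */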
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i > desc->niter - 2)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}

/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */
static void
unroll_loop_constant_iterations (class loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  auto_sbitmap wont_exit (max_unroll + 2);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test of the
	 exit condition have a contiguous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						   : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as
	 above, if the loop tests the condition at the end of the loop
	 body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care
	 about the case when we would exit before reaching the loop.  So
	 just peel exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						   : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					   : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find new in and out edges; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  desc->niter_expr = gen_int_mode (desc->niter, desc->mode);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with runtime-"
		 "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted at runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as the code generation
     requires it; we are unable to cope with overflows in the computation
     of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
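  /* For instance (illustrative numbers): nunroll == 6 yields i == 4 here,
     so we unroll 3 times and end up with 4 copies of the loop body.  */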

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

      * If do_compare_rtx_and_jump needs several branches to do comparison
	in a mode that needs comparison by parts, we cannot analyze the
	number of iterations of the loop, and we never get to unrolling it.

      * The code in expand_divmod that was suspected to cause creation of
	branching code seems to be only accessed for signed division.  The
	divisions used by # of iterations analysis are always unsigned.
	Problems might arise on architectures that emit branching code
	for some operations that may appear in the unroller (especially
	for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, profile_probability prob,
		      rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label,
			       profile_probability::uninitialized ());
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  if (prob.initialized_p ())
    add_reg_br_prob_note (jump, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}

/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The times value must be a
   power of two.  The transformation does this (with some extra care
   for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (class loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j;
  profile_probability p;
  basic_block preheader, *body, swtch, ezc_swtch = NULL;
  int may_exit_copy;
  profile_count iter_count, new_count;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* For loops that exit at the end and whose number of iterations is
     reliable, add one to niter to account for the first pass through the
     loop body before reaching the exit test.  */
  if (exit_at_end && !desc->noloop_assumptions)
    {
      niter = expand_simple_binop (desc->mode, PLUS,
				   niter, const1_rtx,
				   NULL_RTX, 0, OPTAB_LIB_WIDEN);
      old_niter = niter;
    }

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
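  /* E.g. (illustrative): with max_unroll == 3, the AND below computes
     niter & 3, which equals niter % 4 for any unsigned value, even if
     the computation of the iteration count wrapped around.  */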
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  auto_sbitmap wont_exit (max_unroll + 2);

  if (extra_zero_check || desc->noloop_assumptions)
    {
      /* Peel the first copy of loop body.  Leave the exit test if the number
	 of iterations is not reliable.  Also record the place of the extra zero
	 check.  */
      bitmap_clear (wont_exit);
      if (!desc->noloop_assumptions)
	bitmap_set_bit (wont_exit, 1);
      ezc_swtch = loop_preheader_edge (loop)->src;
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);
    }

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  /* Compute count increments for each switch block and initialize
     innermost switch block.  Switch blocks and peeled loop copies are built
     from innermost outward.  */
  iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
  swtch->count = new_count;

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = profile_probability::always ().apply_scale (1, i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter),
					  gen_int_mode (j, desc->mode), EQ,
					  block_label (preheader), p, NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      new_count += iter_count;
      swtch->count = new_count;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = profile_probability::always ().apply_scale (1, max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      /* Recompute count adjustments since initial peel copy may
	 have exited and reduced those values that were computed above.  */
      iter_count = swtch->count.apply_scale (1, max_unroll + 1);
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					   : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find new in and out edges; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
      if (loop->any_likely_upper_bound
	  && loop->nb_iterations_likely_upper_bound != 0)
	--loop->nb_iterations_likely_upper_bound;
      else
	loop->any_likely_upper_bound = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av
    = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (class loop *loop)
{
  unsigned nunroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  auto_sbitmap wont_exit (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					   : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}

/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (class loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (class loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of an
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (class loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
        return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using - initializing the
	 expansions with zero and summing them up at the end of the
	 computation - will yield wrong results for (x = something - x);
	 thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace chains like

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  class rtx_iv iv;
  struct iv_to_split *ivts;
  scalar_int_mode mode;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
    return NULL;

  if (!biv_p (insn, mode, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}

1567 /* Determines which of insns in LOOP can be optimized.
1568    Return a OPT_INFO struct with the relevant hash tables filled
1569    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1570    is undefined for the return value.  */
1571 
1572 static struct opt_info *
analyze_insns_in_loop(class loop * loop)1573 analyze_insns_in_loop (class loop *loop)
1574 {
1575   basic_block *body, bb;
1576   unsigned i;
1577   struct opt_info *opt_info = XCNEW (struct opt_info);
1578   rtx_insn *insn;
1579   struct iv_to_split *ivts = NULL;
1580   struct var_to_expand *ves = NULL;
1581   iv_to_split **slot1;
1582   var_to_expand **slot2;
1583   auto_vec<edge> edges = get_loop_exit_edges (loop);
1584   edge exit;
1585   bool can_apply = false;
1586 
1587   iv_analysis_loop_init (loop);
1588 
1589   body = get_loop_body (loop);
1590 
1591   if (flag_split_ivs_in_unroller)
1592     {
1593       opt_info->insns_to_split
1594        	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1595       opt_info->iv_to_split_head = NULL;
1596       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1597     }
1598 
1599   /* Record the loop exit bb and loop preheader before the unrolling.  */
1600   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1601 
1602   if (edges.length () == 1)
1603     {
1604       exit = edges[0];
1605       if (!(exit->flags & EDGE_COMPLEX))
1606 	{
1607 	  opt_info->loop_exit = split_edge (exit);
1608 	  can_apply = true;
1609 	}
1610     }
1611 
1612   if (flag_variable_expansion_in_unroller
1613       && can_apply)
1614     {
1615       opt_info->insns_with_var_to_expand
1616        	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1617       opt_info->var_to_expand_head = NULL;
1618       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1619     }
1620 
1621   for (i = 0; i < loop->num_nodes; i++)
1622     {
1623       bb = body[i];
1624       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1625 	continue;
1626 
1627       FOR_BB_INSNS (bb, insn)
1628       {
1629         if (!INSN_P (insn))
1630           continue;
1631 
1632         if (opt_info->insns_to_split)
1633           ivts = analyze_iv_to_split_insn (insn);
1634 
1635         if (ivts)
1636           {
1637             slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
1638 	    gcc_assert (*slot1 == NULL);
1639             *slot1 = ivts;
1640 	    *opt_info->iv_to_split_tail = ivts;
1641 	    opt_info->iv_to_split_tail = &ivts->next;
1642             continue;
1643           }
1644 
1645         if (opt_info->insns_with_var_to_expand)
1646           ves = analyze_insn_to_expand_var (loop, insn);
1647 
1648         if (ves)
1649           {
1650             slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
1651 	    gcc_assert (*slot2 == NULL);
1652             *slot2 = ves;
1653 	    *opt_info->var_to_expand_tail = ves;
1654 	    opt_info->var_to_expand_tail = &ves->next;
1655           }
1656       }
1657     }
1658 
1659   free (body);
1660   return opt_info;
1661 }
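
/* Note that variable expansion is only considered when the loop has a
   single non-complex exit edge: the partial accumulators have to be
   combined in a block that is executed exactly when the loop is left,
   and splitting that one exit edge provides such a block (recorded in
   OPT_INFO->loop_exit above).  */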

/* Called just before loop duplication.  Records the start of the
   duplicated area in OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
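
/* For example, with N_COPIES == 2:

     unrolling:  copy 0 (the original body) gets delta 0, copies 1 and 2
		 get deltas 1 and 2;
     peeling:    copy 1 (where the initialization lands) gets delta 0,
		 copy 2 gets delta 1, and the original body (copy 0),
		 which runs last, gets delta 2.  */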

/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
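
/* Roughly, if the IV insn in the copy with delta 0 is "i = i + STEP",
   the sequence emitted above amounts to "base = i + STEP" placed just
   before it; split_iv below then rewrites the copies in terms of
   BASE.  */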

/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in INSN
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
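
/* For instance, with a step of 4, the copies of "i = i + 4" become,
   roughly,

     i = base;		(delta 0, preceded by base = i + 4)
     i = base + 4;	(delta 1)
     i = base + 8;	(delta 2)

   so the increments in the unrolled body no longer form a serial
   dependence chain.  */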


/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
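
/* REUSE_EXPANSION cycles through the values 0 .. var_expansions.length (),
   so the original accumulator (position 0) and the recorded expansions
   are handed out in round-robin order once no new registers may be
   created.  */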


/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Otherwise reuse an already existing expansion.  */
  if (param_max_variable_expansions > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}
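
/* In the example below, rewriting the second copy of "sum += something"
   this way produces "sum1 += something", where SUM1 is either a fresh
   register or, once param_max_variable_expansions is exhausted, one of
   the registers returned by get_expansion.  */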

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  /* The neutral element for multiplication is 1.  */
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
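
/* The -0.0 initialization matters because IEEE addition gives
   (-0.0) + (-0.0) == -0.0 while (+0.0) + (-0.0) == +0.0 under the
   default rounding mode, so an expansion started at +0.0 could turn an
   all-negative-zero sum into +0.0.  */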

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
     it both here and as the destination of the assignment.  */
  sum = copy_rtx (sum);
  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
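
/* Continuing the sum example, with two expansions this emits the
   equivalent of "sum = sum2 + (sum1 + sum)" right after the
   basic-block note of the split exit block, restoring the complete
   value in the original accumulator.  */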

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  /* Advance to the insn in the original body that this copied
	     insn corresponds to, skipping the same kind of insns we
	     skipped above.  */
	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find its blocks as the originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}
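
/* Note that the original loop body is rewritten last and, when
   unrolling, acts as copy number 0: determine_split_iv_delta returns 0
   for it, so this is where insert_base_initialization places the
   initialization of the base variables (hence the sanity check at the
   top of this function).  */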

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}