1 /* Loop unrolling and peeling.
2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "basic-block.h"
29 #include "cfgloop.h"
30 #include "params.h"
31 #include "expr.h"
32 #include "hash-table.h"
33 #include "recog.h"
34 #include "target.h"
35 #include "dumpfile.h"
36 
37 /* This pass performs loop unrolling and peeling.  We only perform these
38    optimizations on innermost loops (with a single exception) because
39    the impact on performance is greatest there, and we want to avoid
40    unnecessary code size growth.  The gain comes from greater sequentiality
41    of code, better code for further optimization passes to work on, and in
42    some cases from fewer tests of exit conditions.  The main problem is code
43    growth, which hurts performance through its effect on caches.
44 
45    What we do:
46 
47    -- complete peeling of once-rolling loops; this is the above mentioned
48       exception, as it cancels the loop completely and
49       does not cause code growth
50    -- complete peeling of loops that roll a (small) constant number of times.
51    -- simple peeling of the first iterations of loops that do not roll much
52       (according to profile feedback)
53    -- unrolling of loops that roll a constant number of times; this is almost
54       always a win, as we get rid of the exit condition tests.
55    -- unrolling of loops that roll a number of times that we can compute
56       at runtime; we also get rid of exit condition tests here, but there
57       is the extra expense of calculating the number of iterations
58    -- simple unrolling of the remaining loops; this is performed only if we
59       are asked to, as the gain is questionable in this case and often
60       it may even slow down the code
61    For more detailed descriptions of each of these, see the comments at the
62    appropriate functions below.
63 
64    There are many parameters (defined and described in params.def) that
65    control how much we unroll/peel.
66 
67    ??? A big problem is that we do not have a good way to determine
68    how many times we should unroll the loop; the experiments I have made
69    showed that this choice may affect performance on the order of several %.
70    */
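/* The main knobs used below (all defined in params.def) are
   PARAM_MAX_UNROLLED_INSNS, PARAM_MAX_AVERAGE_UNROLLED_INSNS and
   PARAM_MAX_UNROLL_TIMES for unrolling, PARAM_MAX_PEELED_INSNS and
   PARAM_MAX_PEEL_TIMES for simple peeling, and PARAM_MAX_ONCE_PEELED_INSNS,
   PARAM_MAX_COMPLETELY_PEELED_INSNS and PARAM_MAX_COMPLETELY_PEEL_TIMES
   for complete peeling.  */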
71 
72 /* Information about induction variables to split.  */
73 
74 struct iv_to_split
75 {
76   rtx insn;		/* The insn in which the induction variable occurs.  */
77   rtx orig_var;		/* The variable (register) for the IV before the split.  */
78   rtx base_var;		/* The variable on which the values in the further
79 			   iterations are based.  */
80   rtx step;		/* Step of the induction variable.  */
81   struct iv_to_split *next; /* Next entry in walking order.  */
82   unsigned n_loc;
83   unsigned loc[3];	/* Location where the definition of the induction
84 			   variable occurs in the insn.  For example if
85 			   N_LOC is 2, the expression is located at
86 			   XEXP (XEXP (single_set, loc[0]), loc[1]).  */
87 };
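/* Roughly speaking, splitting an IV means that its use in the K-th copy of
   the unrolled body is rewritten in terms of BASE_VAR plus K times STEP,
   rather than the value carried over from the previous copy; this breaks
   the long dependency chain between the copies (done in apply_opt_in_copies,
   under -fsplit-ivs-in-unroller).  */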
88 
89 /* Information about accumulators to expand.  */
90 
91 struct var_to_expand
92 {
93   rtx insn;		           /* The insn in which the variable expansion occurs.  */
94   rtx reg;                         /* The accumulator which is expanded.  */
95   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
96   struct var_to_expand *next;	   /* Next entry in walking order.  */
97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
98                                       or multiplication.  */
99   int expansion_count;             /* Count the number of expansions generated so far.  */
100   int reuse_expansion;             /* The expansion we intend to reuse to expand
101                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
102                                       the original accumulator.  Else use
103                                       var_expansions[REUSE_EXPANSION - 1].  */
104 };
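/* Roughly speaking, expanding an accumulator gives each copy of the unrolled
   body its own register to accumulate into (VAR_EXPANSIONS); the partial
   results are then combined back together in the loop exit block (see
   insert_var_expansion_initialization and combine_var_copies_in_loop_exit
   below).  */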
105 
106 /* Hashtable helper for iv_to_split.  */
107 
108 struct iv_split_hasher : typed_free_remove <iv_to_split>
109 {
110   typedef iv_to_split value_type;
111   typedef iv_to_split compare_type;
112   static inline hashval_t hash (const value_type *);
113   static inline bool equal (const value_type *, const compare_type *);
114 };
115 
116 
117 /* A hash function for information about insns to split.  */
118 
119 inline hashval_t
120 iv_split_hasher::hash (const value_type *ivts)
121 {
122   return (hashval_t) INSN_UID (ivts->insn);
123 }
124 
125 /* An equality function for information about insns to split.  */
126 
127 inline bool
128 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
129 {
130   return i1->insn == i2->insn;
131 }
132 
133 /* Hashtable helper for var_to_expand.  */
134 
135 struct var_expand_hasher : typed_free_remove <var_to_expand>
136 {
137   typedef var_to_expand value_type;
138   typedef var_to_expand compare_type;
139   static inline hashval_t hash (const value_type *);
140   static inline bool equal (const value_type *, const compare_type *);
141 };
142 
143 /* Return a hash for VES.  */
144 
145 inline hashval_t
146 var_expand_hasher::hash (const value_type *ves)
147 {
148   return (hashval_t) INSN_UID (ves->insn);
149 }
150 
151 /* Return true if I1 and I2 refer to the same instruction.  */
152 
153 inline bool
154 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
155 {
156   return i1->insn == i2->insn;
157 }
158 
159 /* Information about optimization applied in
160    the unrolled loop.  */
161 
162 struct opt_info
163 {
164   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
165 						  split.  */
166   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
167   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
168   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
169 					insns with accumulators to expand.  */
170   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
171   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
172   unsigned first_new_block;        /* The first basic block that was
173                                       duplicated.  */
174   basic_block loop_exit;           /* The loop exit basic block.  */
175   basic_block loop_preheader;      /* The loop preheader basic block.  */
176 };
177 
178 static void decide_unrolling_and_peeling (int);
179 static void peel_loops_completely (int);
180 static void decide_peel_simple (struct loop *, int);
181 static void decide_peel_once_rolling (struct loop *, int);
182 static void decide_peel_completely (struct loop *, int);
183 static void decide_unroll_stupid (struct loop *, int);
184 static void decide_unroll_constant_iterations (struct loop *, int);
185 static void decide_unroll_runtime_iterations (struct loop *, int);
186 static void peel_loop_simple (struct loop *);
187 static void peel_loop_completely (struct loop *);
188 static void unroll_loop_stupid (struct loop *);
189 static void unroll_loop_constant_iterations (struct loop *);
190 static void unroll_loop_runtime_iterations (struct loop *);
191 static struct opt_info *analyze_insns_in_loop (struct loop *);
192 static void opt_info_start_duplication (struct opt_info *);
193 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
194 static void free_opt_info (struct opt_info *);
195 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
196 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
197 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
198 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
199 static void insert_var_expansion_initialization (struct var_to_expand *,
200 						 basic_block);
201 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
202 					     basic_block);
203 static rtx get_expansion (struct var_to_expand *);
204 
205 /* Emit a message summarizing the unroll or peel that will be
206    performed for LOOP, along with the loop's location LOCUS, if
207    appropriate given the dump or -fopt-info settings.  */
208 
209 static void
210 report_unroll_peel (struct loop *loop, location_t locus)
211 {
212   struct niter_desc *desc;
213   int niters = 0;
214   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
215 
216   if (loop->lpt_decision.decision == LPT_NONE)
217     return;
218 
219   if (!dump_enabled_p ())
220     return;
221 
222   /* In the special case where the loop never iterated, emit
223      a different message so that we don't report an unroll by 0.
224      This matches the equivalent message emitted during tree unrolling.  */
225   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
226       && !loop->lpt_decision.times)
227     {
228       dump_printf_loc (report_flags, locus,
229                        "loop turned into non-loop; it never loops.\n");
230       return;
231     }
232 
233   desc = get_simple_loop_desc (loop);
234 
235   if (desc->const_iter)
236     niters = desc->niter;
237   else if (loop->header->count)
238     niters = expected_loop_iterations (loop);
239 
240   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
241     dump_printf_loc (report_flags, locus,
242                      "loop with %d iterations completely unrolled",
243 		     loop->lpt_decision.times + 1);
244   else
245     dump_printf_loc (report_flags, locus,
246                      "loop %s %d times",
247                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
248                        ? "peeled" : "unrolled"),
249                      loop->lpt_decision.times);
250   if (profile_info)
251     dump_printf (report_flags,
252                  " (header execution count %d",
253                  (int)loop->header->count);
254   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
255     dump_printf (report_flags,
256                  "%s%s iterations %d)",
257                  profile_info ? ", " : " (",
258                  desc->const_iter ? "const" : "average",
259                  niters);
260   else if (profile_info)
261     dump_printf (report_flags, ")");
262 
263   dump_printf (report_flags, "\n");
264 }
265 
266 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
267 void
268 unroll_and_peel_loops (int flags)
269 {
270   struct loop *loop;
271   bool changed = false;
272 
273   /* First perform complete loop peeling (it is almost surely a win,
274      and strongly affects the parameters for the remaining decisions).  */
275   peel_loops_completely (flags);
276 
277   /* Now decide rest of unrolling and peeling.  */
278   decide_unrolling_and_peeling (flags);
279 
280   /* Scan the loops, inner ones first.  */
281   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
282     {
283       /* And perform the appropriate transformations.  */
284       switch (loop->lpt_decision.decision)
285 	{
286 	case LPT_PEEL_COMPLETELY:
287 	  /* Already done.  */
288 	  gcc_unreachable ();
289 	case LPT_PEEL_SIMPLE:
290 	  peel_loop_simple (loop);
291 	  changed = true;
292 	  break;
293 	case LPT_UNROLL_CONSTANT:
294 	  unroll_loop_constant_iterations (loop);
295 	  changed = true;
296 	  break;
297 	case LPT_UNROLL_RUNTIME:
298 	  unroll_loop_runtime_iterations (loop);
299 	  changed = true;
300 	  break;
301 	case LPT_UNROLL_STUPID:
302 	  unroll_loop_stupid (loop);
303 	  changed = true;
304 	  break;
305 	case LPT_NONE:
306 	  break;
307 	default:
308 	  gcc_unreachable ();
309 	}
310     }
311 
312     if (changed)
313       {
314 	calculate_dominance_info (CDI_DOMINATORS);
315 	fix_loop_structure (NULL);
316       }
317 
318   iv_analysis_done ();
319 }
320 
321 /* Check whether the exit of LOOP is at the end of the loop body.  */
322 
323 static bool
324 loop_exit_at_end_p (struct loop *loop)
325 {
326   struct niter_desc *desc = get_simple_loop_desc (loop);
327   rtx insn;
328 
329   if (desc->in_edge->dest != loop->latch)
330     return false;
331 
332   /* Check that the latch is empty.  */
333   FOR_BB_INSNS (loop->latch, insn)
334     {
335       if (NONDEBUG_INSN_P (insn))
336 	return false;
337     }
338 
339   return true;
340 }
341 
342 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
343 static void
344 peel_loops_completely (int flags)
345 {
346   struct loop *loop;
347   bool changed = false;
348 
349   /* Scan the loops, the inner ones first.  */
350   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
351     {
352       loop->lpt_decision.decision = LPT_NONE;
353       location_t locus = get_loop_location (loop);
354 
355       if (dump_enabled_p ())
356 	dump_printf_loc (TDF_RTL, locus,
357                          ";; *** Considering loop %d at BB %d for "
358                          "complete peeling ***\n",
359                          loop->num, loop->header->index);
360 
361       loop->ninsns = num_loop_insns (loop);
362 
363       decide_peel_once_rolling (loop, flags);
364       if (loop->lpt_decision.decision == LPT_NONE)
365 	decide_peel_completely (loop, flags);
366 
367       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
368 	{
369 	  report_unroll_peel (loop, locus);
370 	  peel_loop_completely (loop);
371 	  changed = true;
372 	}
373     }
374 
375     if (changed)
376       {
377 	calculate_dominance_info (CDI_DOMINATORS);
378 	fix_loop_structure (NULL);
379       }
380 }
381 
382 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
383 static void
384 decide_unrolling_and_peeling (int flags)
385 {
386   struct loop *loop;
387 
388   /* Scan the loops, inner ones first.  */
389   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
390     {
391       loop->lpt_decision.decision = LPT_NONE;
392       location_t locus = get_loop_location (loop);
393 
394       if (dump_enabled_p ())
395 	dump_printf_loc (TDF_RTL, locus,
396                          ";; *** Considering loop %d at BB %d for "
397                          "unrolling and peeling ***\n",
398                          loop->num, loop->header->index);
399 
400       /* Do not peel cold areas.  */
401       if (optimize_loop_for_size_p (loop))
402 	{
403 	  if (dump_file)
404 	    fprintf (dump_file, ";; Not considering loop, cold area\n");
405 	  continue;
406 	}
407 
408       /* Can the loop be manipulated?  */
409       if (!can_duplicate_loop_p (loop))
410 	{
411 	  if (dump_file)
412 	    fprintf (dump_file,
413 		     ";; Not considering loop, cannot duplicate\n");
414 	  continue;
415 	}
416 
417       /* Skip non-innermost loops.  */
418       if (loop->inner)
419 	{
420 	  if (dump_file)
421 	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
422 	  continue;
423 	}
424 
425       loop->ninsns = num_loop_insns (loop);
426       loop->av_ninsns = average_num_loop_insns (loop);
427 
428       /* Try transformations one by one in decreasing order of
429 	 priority.  */
430 
431       decide_unroll_constant_iterations (loop, flags);
432       if (loop->lpt_decision.decision == LPT_NONE)
433 	decide_unroll_runtime_iterations (loop, flags);
434       if (loop->lpt_decision.decision == LPT_NONE)
435 	decide_unroll_stupid (loop, flags);
436       if (loop->lpt_decision.decision == LPT_NONE)
437 	decide_peel_simple (loop, flags);
438 
439       report_unroll_peel (loop, locus);
440     }
441 }
442 
443 /* Decide whether LOOP rolls exactly once and is thus suitable for complete
444    peeling.  */
445 static void
446 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
447 {
448   struct niter_desc *desc;
449 
450   if (dump_file)
451     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
452 
453   /* Is the loop small enough?  */
454   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
455     {
456       if (dump_file)
457 	fprintf (dump_file, ";; Not considering loop, is too big\n");
458       return;
459     }
460 
461   /* Check for simple loops.  */
462   desc = get_simple_loop_desc (loop);
463 
464   /* Check number of iterations.  */
465   if (!desc->simple_p
466       || desc->assumptions
467       || desc->infinite
468       || !desc->const_iter
469       || (desc->niter != 0
470 	  && get_max_loop_iterations_int (loop) != 0))
471     {
472       if (dump_file)
473 	fprintf (dump_file,
474 		 ";; Unable to prove that the loop rolls exactly once\n");
475       return;
476     }
477 
478   /* Success.  */
479   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
480 }
481 
482 /* Decide whether the LOOP is suitable for complete peeling.  */
483 static void
484 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
485 {
486   unsigned npeel;
487   struct niter_desc *desc;
488 
489   if (dump_file)
490     fprintf (dump_file, "\n;; Considering peeling completely\n");
491 
492   /* Skip non-innermost loops.  */
493   if (loop->inner)
494     {
495       if (dump_file)
496 	fprintf (dump_file, ";; Not considering loop, is not innermost\n");
497       return;
498     }
499 
500   /* Do not peel cold areas.  */
501   if (optimize_loop_for_size_p (loop))
502     {
503       if (dump_file)
504 	fprintf (dump_file, ";; Not considering loop, cold area\n");
505       return;
506     }
507 
508   /* Can the loop be manipulated?  */
509   if (!can_duplicate_loop_p (loop))
510     {
511       if (dump_file)
512 	fprintf (dump_file,
513 		 ";; Not considering loop, cannot duplicate\n");
514       return;
515     }
516 
517   /* npeel = number of iterations to peel.  */
518   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
519   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
520     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
521 
522   /* Is the loop small enough?  */
523   if (!npeel)
524     {
525       if (dump_file)
526 	fprintf (dump_file, ";; Not considering loop, is too big\n");
527       return;
528     }
529 
530   /* Check for simple loops.  */
531   desc = get_simple_loop_desc (loop);
532 
533   /* Check number of iterations.  */
534   if (!desc->simple_p
535       || desc->assumptions
536       || !desc->const_iter
537       || desc->infinite)
538     {
539       if (dump_file)
540 	fprintf (dump_file,
541 		 ";; Unable to prove that the loop iterates constant times\n");
542       return;
543     }
544 
545   if (desc->niter > npeel - 1)
546     {
547       if (dump_file)
548 	{
549 	  fprintf (dump_file,
550 		   ";; Not peeling loop completely, rolls too much (");
551 	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
552 	  fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
553 	}
554       return;
555     }
556 
557   /* Success.  */
558   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
559 }
560 
561 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
562    completely.  The transformation done:
563 
564    for (i = 0; i < 4; i++)
565      body;
566 
567    ==>
568 
569    i = 0;
570    body; i++;
571    body; i++;
572    body; i++;
573    body; i++;
574    */
575 static void
576 peel_loop_completely (struct loop *loop)
577 {
578   sbitmap wont_exit;
579   unsigned HOST_WIDE_INT npeel;
580   unsigned i;
581   edge ein;
582   struct niter_desc *desc = get_simple_loop_desc (loop);
583   struct opt_info *opt_info = NULL;
584 
585   npeel = desc->niter;
586 
587   if (npeel)
588     {
589       bool ok;
590 
591       wont_exit = sbitmap_alloc (npeel + 1);
592       bitmap_ones (wont_exit);
593       bitmap_clear_bit (wont_exit, 0);
594       if (desc->noloop_assumptions)
595 	bitmap_clear_bit (wont_exit, 1);
596 
597       auto_vec<edge> remove_edges;
598       if (flag_split_ivs_in_unroller)
599         opt_info = analyze_insns_in_loop (loop);
600 
601       opt_info_start_duplication (opt_info);
602       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
603 					  npeel,
604 					  wont_exit, desc->out_edge,
605 					  &remove_edges,
606 					  DLTHE_FLAG_UPDATE_FREQ
607 					  | DLTHE_FLAG_COMPLETTE_PEEL
608 					  | (opt_info
609 					     ? DLTHE_RECORD_COPY_NUMBER : 0));
610       gcc_assert (ok);
611 
612       free (wont_exit);
613 
614       if (opt_info)
615  	{
616  	  apply_opt_in_copies (opt_info, npeel, false, true);
617  	  free_opt_info (opt_info);
618  	}
619 
620       /* Remove the exit edges.  */
621       FOR_EACH_VEC_ELT (remove_edges, i, ein)
622 	remove_path (ein);
623     }
624 
625   ein = desc->in_edge;
626   free_simple_loop_desc (loop);
627 
628   /* Now remove the unreachable part of the last iteration and cancel
629      the loop.  */
630   remove_path (ein);
631 
632   if (dump_file)
633     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
634 }
635 
636 /* Decide whether to unroll LOOP, which iterates a constant number of times,
637    and by how much.  */
638 
639 static void
640 decide_unroll_constant_iterations (struct loop *loop, int flags)
641 {
642   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
643   struct niter_desc *desc;
644   double_int iterations;
645 
646   if (!(flags & UAP_UNROLL))
647     {
648       /* We were not asked to; just return silently.  */
649       return;
650     }
651 
652   if (dump_file)
653     fprintf (dump_file,
654 	     "\n;; Considering unrolling loop with constant "
655 	     "number of iterations\n");
656 
657   /* nunroll = total number of copies of the original loop body in
658      the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
659   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
660   nunroll_by_av
661     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
662   if (nunroll > nunroll_by_av)
663     nunroll = nunroll_by_av;
664   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
665     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
666 
667   if (targetm.loop_unroll_adjust)
668     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
669 
670   /* Skip big loops.  */
671   if (nunroll <= 1)
672     {
673       if (dump_file)
674 	fprintf (dump_file, ";; Not considering loop, is too big\n");
675       return;
676     }
677 
678   /* Check for simple loops.  */
679   desc = get_simple_loop_desc (loop);
680 
681   /* Check number of iterations.  */
682   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
683     {
684       if (dump_file)
685 	fprintf (dump_file,
686 		 ";; Unable to prove that the loop iterates constant times\n");
687       return;
688     }
689 
690   /* Check whether the loop rolls enough to consider.
691      Also consult the loop bounds and profile; if the loop has more
692      than one exit it may well loop fewer times than the determined maximal
693      number of iterations.  */
694   if (desc->niter < 2 * nunroll
695       || ((get_estimated_loop_iterations (loop, &iterations)
696 	   || get_max_loop_iterations (loop, &iterations))
697 	  && iterations.ult (double_int::from_shwi (2 * nunroll))))
698     {
699       if (dump_file)
700 	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
701       return;
702     }
703 
704   /* Success; now compute the number of iterations to unroll.  We alter
705      nunroll so that as few copies of the loop body as possible are
706      necessary, while still not decreasing the number of unrollings
707      too much (at most by 1).  */
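  /* Illustration only: with niter == 102 and the exit test at the beginning
     of the loop, the candidate i == 3 needs 102 % 4 + 3 + 1 == 6 copies of
     the body, while i == 5 needs 102 % 6 + 5 + 1 == 6 copies; the loop below
     scans candidates from the largest down and keeps the first one that
     needs the fewest copies.  */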
708   best_copies = 2 * nunroll + 10;
709 
710   i = 2 * nunroll + 2;
711   if (i - 1 >= desc->niter)
712     i = desc->niter - 2;
713 
714   for (; i >= nunroll - 1; i--)
715     {
716       unsigned exit_mod = desc->niter % (i + 1);
717 
718       if (!loop_exit_at_end_p (loop))
719 	n_copies = exit_mod + i + 1;
720       else if (exit_mod != (unsigned) i
721 	       || desc->noloop_assumptions != NULL_RTX)
722 	n_copies = exit_mod + i + 2;
723       else
724 	n_copies = i + 1;
725 
726       if (n_copies < best_copies)
727 	{
728 	  best_copies = n_copies;
729 	  best_unroll = i;
730 	}
731     }
732 
733   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
734   loop->lpt_decision.times = best_unroll;
735 }
736 
737 /* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES times.
738    The transformation does this:
739 
740    for (i = 0; i < 102; i++)
741      body;
742 
743    ==>  (LOOP->LPT_DECISION.TIMES == 3)
744 
745    i = 0;
746    body; i++;
747    body; i++;
748    while (i < 102)
749      {
750        body; i++;
751        body; i++;
752        body; i++;
753        body; i++;
754      }
755   */
756 static void
757 unroll_loop_constant_iterations (struct loop *loop)
758 {
759   unsigned HOST_WIDE_INT niter;
760   unsigned exit_mod;
761   sbitmap wont_exit;
762   unsigned i;
763   edge e;
764   unsigned max_unroll = loop->lpt_decision.times;
765   struct niter_desc *desc = get_simple_loop_desc (loop);
766   bool exit_at_end = loop_exit_at_end_p (loop);
767   struct opt_info *opt_info = NULL;
768   bool ok;
769 
770   niter = desc->niter;
771 
772   /* Should not get here (such a loop should be peeled instead).  */
773   gcc_assert (niter > max_unroll + 1);
774 
775   exit_mod = niter % (max_unroll + 1);
776 
777   wont_exit = sbitmap_alloc (max_unroll + 1);
778   bitmap_ones (wont_exit);
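  /* Roughly speaking, a set bit in WONT_EXIT tells
     duplicate_loop_to_header_edge that the corresponding copy of the body
     never leaves the loop, so its exit edge can be removed; the bits cleared
     below mark the copies that may still exit.  */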
779 
780   auto_vec<edge> remove_edges;
781   if (flag_split_ivs_in_unroller
782       || flag_variable_expansion_in_unroller)
783     opt_info = analyze_insns_in_loop (loop);
784 
785   if (!exit_at_end)
786     {
787       /* The exit is not at the end of the loop; leave the exit test
788 	 in the first copy, so that loops that start with a test
789 	 of the exit condition have a contiguous body after unrolling.  */
790 
791       if (dump_file)
792 	fprintf (dump_file, ";; Condition at beginning of loop.\n");
793 
794       /* Peel exit_mod iterations.  */
795       bitmap_clear_bit (wont_exit, 0);
796       if (desc->noloop_assumptions)
797 	bitmap_clear_bit (wont_exit, 1);
798 
799       if (exit_mod)
800 	{
801 	  opt_info_start_duplication (opt_info);
802           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
803 					      exit_mod,
804 					      wont_exit, desc->out_edge,
805 					      &remove_edges,
806 					      DLTHE_FLAG_UPDATE_FREQ
807 					      | (opt_info && exit_mod > 1
808 						 ? DLTHE_RECORD_COPY_NUMBER
809 						   : 0));
810 	  gcc_assert (ok);
811 
812           if (opt_info && exit_mod > 1)
813  	    apply_opt_in_copies (opt_info, exit_mod, false, false);
814 
815 	  desc->noloop_assumptions = NULL_RTX;
816 	  desc->niter -= exit_mod;
817 	  loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
818 	  if (loop->any_estimate
819 	      && double_int::from_uhwi (exit_mod).ule
820 	           (loop->nb_iterations_estimate))
821 	    loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
822 	  else
823 	    loop->any_estimate = false;
824 	}
825 
826       bitmap_set_bit (wont_exit, 1);
827     }
828   else
829     {
830       /* Leave exit test in last copy, for the same reason as above if
831 	 the loop tests the condition at the end of loop body.  */
832 
833       if (dump_file)
834 	fprintf (dump_file, ";; Condition at end of loop.\n");
835 
836       /* We know that niter >= max_unroll + 2; so we do not need to care about
837 	 the case when we would exit before reaching the loop.  So just peel
838 	 exit_mod + 1 iterations.  */
839       if (exit_mod != max_unroll
840 	  || desc->noloop_assumptions)
841 	{
842 	  bitmap_clear_bit (wont_exit, 0);
843 	  if (desc->noloop_assumptions)
844 	    bitmap_clear_bit (wont_exit, 1);
845 
846           opt_info_start_duplication (opt_info);
847 	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
848 					      exit_mod + 1,
849 					      wont_exit, desc->out_edge,
850 					      &remove_edges,
851 					      DLTHE_FLAG_UPDATE_FREQ
852 					      | (opt_info && exit_mod > 0
853 						 ? DLTHE_RECORD_COPY_NUMBER
854 						   : 0));
855 	  gcc_assert (ok);
856 
857           if (opt_info && exit_mod > 0)
858   	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
859 
860 	  desc->niter -= exit_mod + 1;
861 	  loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
862 	  if (loop->any_estimate
863 	      && double_int::from_uhwi (exit_mod + 1).ule
864 	           (loop->nb_iterations_estimate))
865 	    loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
866 	  else
867 	    loop->any_estimate = false;
868 	  desc->noloop_assumptions = NULL_RTX;
869 
870 	  bitmap_set_bit (wont_exit, 0);
871 	  bitmap_set_bit (wont_exit, 1);
872 	}
873 
874       bitmap_clear_bit (wont_exit, max_unroll);
875     }
876 
877   /* Now unroll the loop.  */
878 
879   opt_info_start_duplication (opt_info);
880   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
881 				      max_unroll,
882 				      wont_exit, desc->out_edge,
883 				      &remove_edges,
884 				      DLTHE_FLAG_UPDATE_FREQ
885 				      | (opt_info
886 					 ? DLTHE_RECORD_COPY_NUMBER
887 					   : 0));
888   gcc_assert (ok);
889 
890   if (opt_info)
891     {
892       apply_opt_in_copies (opt_info, max_unroll, true, true);
893       free_opt_info (opt_info);
894     }
895 
896   free (wont_exit);
897 
898   if (exit_at_end)
899     {
900       basic_block exit_block = get_bb_copy (desc->in_edge->src);
901       /* Find a new in and out edge; they are in the last copy we have made.  */
902 
903       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
904 	{
905 	  desc->out_edge = EDGE_SUCC (exit_block, 0);
906 	  desc->in_edge = EDGE_SUCC (exit_block, 1);
907 	}
908       else
909 	{
910 	  desc->out_edge = EDGE_SUCC (exit_block, 1);
911 	  desc->in_edge = EDGE_SUCC (exit_block, 0);
912 	}
913     }
914 
915   desc->niter /= max_unroll + 1;
916   loop->nb_iterations_upper_bound
917     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
918 								   + 1),
919 					    TRUNC_DIV_EXPR);
920   if (loop->any_estimate)
921     loop->nb_iterations_estimate
922       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
923 							          + 1),
924 				           TRUNC_DIV_EXPR);
925   desc->niter_expr = GEN_INT (desc->niter);
926 
927   /* Remove the edges.  */
928   FOR_EACH_VEC_ELT (remove_edges, i, e)
929     remove_path (e);
930 
931   if (dump_file)
932     fprintf (dump_file,
933 	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
934 	     max_unroll, num_loop_insns (loop));
935 }
936 
937 /* Decide whether to unroll LOOP, which iterates a runtime-computable number
938    of times, and by how much.  */
939 static void
940 decide_unroll_runtime_iterations (struct loop *loop, int flags)
941 {
942   unsigned nunroll, nunroll_by_av, i;
943   struct niter_desc *desc;
944   double_int iterations;
945 
946   if (!(flags & UAP_UNROLL))
947     {
948       /* We were not asked to; just return silently.  */
949       return;
950     }
951 
952   if (dump_file)
953     fprintf (dump_file,
954 	     "\n;; Considering unrolling loop with runtime "
955 	     "computable number of iterations\n");
956 
957   /* nunroll = total number of copies of the original loop body in
958      the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
959   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
960   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
961   if (nunroll > nunroll_by_av)
962     nunroll = nunroll_by_av;
963   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
964     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
965 
966   if (targetm.loop_unroll_adjust)
967     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
968 
969   /* Skip big loops.  */
970   if (nunroll <= 1)
971     {
972       if (dump_file)
973 	fprintf (dump_file, ";; Not considering loop, is too big\n");
974       return;
975     }
976 
977   /* Check for simple loops.  */
978   desc = get_simple_loop_desc (loop);
979 
980   /* Check simpleness.  */
981   if (!desc->simple_p || desc->assumptions)
982     {
983       if (dump_file)
984 	fprintf (dump_file,
985 		 ";; Unable to prove that the number of iterations "
986 		 "can be counted in runtime\n");
987       return;
988     }
989 
990   if (desc->const_iter)
991     {
992       if (dump_file)
993 	fprintf (dump_file, ";; Loop iterates constant times\n");
994       return;
995     }
996 
997   /* Check whether the loop rolls.  */
998   if ((get_estimated_loop_iterations (loop, &iterations)
999        || get_max_loop_iterations (loop, &iterations))
1000       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1001     {
1002       if (dump_file)
1003 	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1004       return;
1005     }
1006 
1007   /* Success; now force nunroll to be a power of 2, as we are unable to
1008      cope with overflows in the computation of the number of iterations.  */
1009   for (i = 1; 2 * i <= nunroll; i *= 2)
1010     continue;
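  /* i is now the largest power of two that does not exceed nunroll.  */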
1011 
1012   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1013   loop->lpt_decision.times = i - 1;
1014 }
1015 
1016 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1017    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1018    and NULL is returned instead.  */
1019 
1020 basic_block
1021 split_edge_and_insert (edge e, rtx insns)
1022 {
1023   basic_block bb;
1024 
1025   if (!insns)
1026     return NULL;
1027   bb = split_edge (e);
1028   emit_insn_after (insns, BB_END (bb));
1029 
1030   /* ??? We used to assume that INSNS can contain control flow insns, and
1031      that we had to try to find sub basic blocks in BB to maintain a valid
1032      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1033      and call break_superblocks when going out of cfglayout mode.  But it
1034      turns out that this never happens; and that if it does ever happen,
1035      the TODO_verify_flow at the end of the RTL loop passes would fail.
1036 
1037      There are two reasons why we expected we could have control flow insns
1038      in INSNS.  The first is when a comparison has to be done in parts, and
1039      the second is when the number of iterations is computed for loops with
1040      the number of iterations known at runtime.  In both cases, test cases
1041      to get control flow in INSNS appear to be impossible to construct:
1042 
1043       * If do_compare_rtx_and_jump needs several branches to do comparison
1044 	in a mode that needs comparison by parts, we cannot analyze the
1045 	number of iterations of the loop, and we never get to unrolling it.
1046 
1047       * The code in expand_divmod that was suspected to cause creation of
1048 	branching code seems to be only accessed for signed division.  The
1049 	divisions used by # of iterations analysis are always unsigned.
1050 	Problems might arise on architectures that emit branching code
1051 	for some operations that may appear in the unroller (especially
1052 	for division), but we are not aware of any such architecture.
1053 
1054      Considering all this, it was decided that we should for now assume
1055      that INSNS can in theory contain control flow insns, but in practice
1056      it never does.  So we don't handle the theoretical case, and should
1057      a real failure ever show up, we have a pretty good clue for how to
1058      fix it.  */
1059 
1060   return bb;
1061 }
1062 
1063 /* Unroll LOOP, for which we are able to count the number of iterations at
1064    runtime, LOOP->LPT_DECISION.TIMES times.  The transformation does this
1065    (with some extra care for the case n < 0):
1066 
1067    for (i = 0; i < n; i++)
1068      body;
1069 
1070    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1071 
1072    i = 0;
1073    mod = n % 4;
1074 
1075    switch (mod)
1076      {
1077        case 3:
1078          body; i++;
1079        case 2:
1080          body; i++;
1081        case 1:
1082          body; i++;
1083        case 0: ;
1084      }
1085 
1086    while (i < n)
1087      {
1088        body; i++;
1089        body; i++;
1090        body; i++;
1091        body; i++;
1092      }
1093    */
1094 static void
1095 unroll_loop_runtime_iterations (struct loop *loop)
1096 {
1097   rtx old_niter, niter, init_code, branch_code, tmp;
1098   unsigned i, j, p;
1099   basic_block preheader, *body, swtch, ezc_swtch;
1100   sbitmap wont_exit;
1101   int may_exit_copy;
1102   unsigned n_peel;
1103   edge e;
1104   bool extra_zero_check, last_may_exit;
1105   unsigned max_unroll = loop->lpt_decision.times;
1106   struct niter_desc *desc = get_simple_loop_desc (loop);
1107   bool exit_at_end = loop_exit_at_end_p (loop);
1108   struct opt_info *opt_info = NULL;
1109   bool ok;
1110 
1111   if (flag_split_ivs_in_unroller
1112       || flag_variable_expansion_in_unroller)
1113     opt_info = analyze_insns_in_loop (loop);
1114 
1115   /* Remember blocks whose dominators will have to be updated.  */
1116   auto_vec<basic_block> dom_bbs;
1117 
1118   body = get_loop_body (loop);
1119   for (i = 0; i < loop->num_nodes; i++)
1120     {
1121       vec<basic_block> ldom;
1122       basic_block bb;
1123 
1124       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1125       FOR_EACH_VEC_ELT (ldom, j, bb)
1126 	if (!flow_bb_inside_loop_p (loop, bb))
1127 	  dom_bbs.safe_push (bb);
1128 
1129       ldom.release ();
1130     }
1131   free (body);
1132 
1133   if (!exit_at_end)
1134     {
1135       /* Leave exit in first copy (for explanation why see comment in
1136 	 unroll_loop_constant_iterations).  */
1137       may_exit_copy = 0;
1138       n_peel = max_unroll - 1;
1139       extra_zero_check = true;
1140       last_may_exit = false;
1141     }
1142   else
1143     {
1144       /* Leave exit in last copy (for explanation why see comment in
1145 	 unroll_loop_constant_iterations).  */
1146       may_exit_copy = max_unroll;
1147       n_peel = max_unroll;
1148       extra_zero_check = false;
1149       last_may_exit = true;
1150     }
1151 
1152   /* Get expression for number of iterations.  */
1153   start_sequence ();
1154   old_niter = niter = gen_reg_rtx (desc->mode);
1155   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1156   if (tmp != niter)
1157     emit_move_insn (niter, tmp);
1158 
1159   /* Count modulo by ANDing it with max_unroll; we use the fact that
1160      the number of unrollings is a power of two, and thus this is correct
1161      even if there is overflow in the computation.  */
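  /* For example, with max_unroll == 3 this computes niter & 3, which is
     niter % 4 in unsigned arithmetic.  */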
1162   niter = expand_simple_binop (desc->mode, AND,
1163 			       niter, gen_int_mode (max_unroll, desc->mode),
1164 			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
1165 
1166   init_code = get_insns ();
1167   end_sequence ();
1168   unshare_all_rtl_in_chain (init_code);
1169 
1170   /* Precondition the loop.  */
1171   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1172 
1173   auto_vec<edge> remove_edges;
1174 
1175   wont_exit = sbitmap_alloc (max_unroll + 2);
1176 
1177   /* Peel the first copy of the loop body (almost always we must leave the exit
1178      test here; the only exception is when we have an extra zero check and the
1179      number of iterations is reliable).  Also record the place of the (possible)
1180      extra zero check.  */
1181   bitmap_clear (wont_exit);
1182   if (extra_zero_check
1183       && !desc->noloop_assumptions)
1184     bitmap_set_bit (wont_exit, 1);
1185   ezc_swtch = loop_preheader_edge (loop)->src;
1186   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1187 				      1, wont_exit, desc->out_edge,
1188 				      &remove_edges,
1189 				      DLTHE_FLAG_UPDATE_FREQ);
1190   gcc_assert (ok);
1191 
1192   /* Record the place where switch will be built for preconditioning.  */
1193   swtch = split_edge (loop_preheader_edge (loop));
1194 
1195   for (i = 0; i < n_peel; i++)
1196     {
1197       /* Peel the copy.  */
1198       bitmap_clear (wont_exit);
1199       if (i != n_peel - 1 || !last_may_exit)
1200 	bitmap_set_bit (wont_exit, 1);
1201       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1202 					  1, wont_exit, desc->out_edge,
1203 					  &remove_edges,
1204 					  DLTHE_FLAG_UPDATE_FREQ);
1205       gcc_assert (ok);
1206 
1207       /* Create item for switch.  */
1208       j = n_peel - i - (extra_zero_check ? 0 : 1);
1209       p = REG_BR_PROB_BASE / (i + 2);
1210 
1211       preheader = split_edge (loop_preheader_edge (loop));
1212       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1213 					  block_label (preheader), p,
1214 					  NULL_RTX);
1215 
1216       /* We rely on the fact that the compare and jump cannot be optimized out,
1217 	 and hence the cfg we create is correct.  */
1218       gcc_assert (branch_code != NULL_RTX);
1219 
1220       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1221       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1222       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1223       e = make_edge (swtch, preheader,
1224 		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1225       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1226       e->probability = p;
1227     }
1228 
1229   if (extra_zero_check)
1230     {
1231       /* Add branch for zero iterations.  */
1232       p = REG_BR_PROB_BASE / (max_unroll + 1);
1233       swtch = ezc_swtch;
1234       preheader = split_edge (loop_preheader_edge (loop));
1235       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1236 					  block_label (preheader), p,
1237 					  NULL_RTX);
1238       gcc_assert (branch_code != NULL_RTX);
1239 
1240       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1241       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1242       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1243       e = make_edge (swtch, preheader,
1244 		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1245       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1246       e->probability = p;
1247     }
1248 
1249   /* Recount dominators for outer blocks.  */
1250   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1251 
1252   /* And unroll loop.  */
1253 
1254   bitmap_ones (wont_exit);
1255   bitmap_clear_bit (wont_exit, may_exit_copy);
1256   opt_info_start_duplication (opt_info);
1257 
1258   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1259 				      max_unroll,
1260 				      wont_exit, desc->out_edge,
1261 				      &remove_edges,
1262 				      DLTHE_FLAG_UPDATE_FREQ
1263 				      | (opt_info
1264 					 ? DLTHE_RECORD_COPY_NUMBER
1265 					   : 0));
1266   gcc_assert (ok);
1267 
1268   if (opt_info)
1269     {
1270       apply_opt_in_copies (opt_info, max_unroll, true, true);
1271       free_opt_info (opt_info);
1272     }
1273 
1274   free (wont_exit);
1275 
1276   if (exit_at_end)
1277     {
1278       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1279       /* Find a new in and out edge; they are in the last copy we have
1280 	 made.  */
1281 
1282       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1283 	{
1284 	  desc->out_edge = EDGE_SUCC (exit_block, 0);
1285 	  desc->in_edge = EDGE_SUCC (exit_block, 1);
1286 	}
1287       else
1288 	{
1289 	  desc->out_edge = EDGE_SUCC (exit_block, 1);
1290 	  desc->in_edge = EDGE_SUCC (exit_block, 0);
1291 	}
1292     }
1293 
1294   /* Remove the edges.  */
1295   FOR_EACH_VEC_ELT (remove_edges, i, e)
1296     remove_path (e);
1297 
1298   /* We must be careful when updating the number of iterations due to
1299      preconditioning and the fact that the value must be valid at entry
1300      of the loop.  After passing through the above code, we see that
1301      the correct new number of iterations is this:  */
1302   gcc_assert (!desc->const_iter);
1303   desc->niter_expr =
1304     simplify_gen_binary (UDIV, desc->mode, old_niter,
1305 			 gen_int_mode (max_unroll + 1, desc->mode));
1306   loop->nb_iterations_upper_bound
1307     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1308 								   + 1),
1309 					    TRUNC_DIV_EXPR);
1310   if (loop->any_estimate)
1311     loop->nb_iterations_estimate
1312       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1313 							          + 1),
1314 				           TRUNC_DIV_EXPR);
1315   if (exit_at_end)
1316     {
1317       desc->niter_expr =
1318 	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1319       desc->noloop_assumptions = NULL_RTX;
1320       --loop->nb_iterations_upper_bound;
1321       if (loop->any_estimate
1322 	  && loop->nb_iterations_estimate != double_int_zero)
1323 	--loop->nb_iterations_estimate;
1324       else
1325 	loop->any_estimate = false;
1326     }
1327 
1328   if (dump_file)
1329     fprintf (dump_file,
1330 	     ";; Unrolled loop %d times, counting # of iterations "
1331 	     "in runtime, %i insns\n",
1332 	     max_unroll, num_loop_insns (loop));
1333 }
1334 
1335 /* Decide whether to simply peel LOOP and how much.  */
1336 static void
1337 decide_peel_simple (struct loop *loop, int flags)
1338 {
1339   unsigned npeel;
1340   double_int iterations;
1341 
1342   if (!(flags & UAP_PEEL))
1343     {
1344       /* We were not asked to; just return silently.  */
1345       return;
1346     }
1347 
1348   if (dump_file)
1349     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1350 
1351   /* npeel = number of iterations to peel.  */
1352   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1353   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1354     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1355 
1356   /* Skip big loops.  */
1357   if (!npeel)
1358     {
1359       if (dump_file)
1360 	fprintf (dump_file, ";; Not considering loop, is too big\n");
1361       return;
1362     }
1363 
1364   /* Do not simply peel loops with branches inside -- it increases the number
1365      of mispredicts.
1366      The exception is when we do have a profile and therefore a good chance
1367      of peeling the number of iterations the loop will iterate in practice.
1368      TODO: this heuristic needs tuning; while for complete unrolling
1369      the branch inside the loop mostly eliminates any improvement, for
1370      peeling this is not the case.  Also, a function call inside the loop is
1371      itself a branch from the branch prediction POV (and probably a better
1372      reason not to unroll/peel).  */
1373   if (num_loop_branches (loop) > 1
1374       && profile_status_for_fn (cfun) != PROFILE_READ)
1375     {
1376       if (dump_file)
1377 	fprintf (dump_file, ";; Not peeling, contains branches\n");
1378       return;
1379     }
1380 
1381   /* If we have a realistic estimate of the number of iterations, use it.  */
1382   if (get_estimated_loop_iterations (loop, &iterations))
1383     {
1384       if (double_int::from_shwi (npeel).ule (iterations))
1385 	{
1386 	  if (dump_file)
1387 	    {
1388 	      fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1389 	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1390 		       (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1391 	      fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1392 		       npeel);
1393 	    }
1394 	  return;
1395 	}
1396       npeel = iterations.to_shwi () + 1;
1397     }
1398   /* If we have a small enough bound on the number of iterations, we can still
1399      peel (completely unroll).  */
1400   else if (get_max_loop_iterations (loop, &iterations)
1401            && iterations.ult (double_int::from_shwi (npeel)))
1402     npeel = iterations.to_shwi () + 1;
1403   else
1404     {
1405       /* For now we have no good heuristics to decide whether loop peeling
1406          will be effective, so disable it.  */
1407       if (dump_file)
1408 	fprintf (dump_file,
1409 		 ";; Not peeling loop, no evidence it will be profitable\n");
1410       return;
1411     }
1412 
1413   /* Success.  */
1414   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1415   loop->lpt_decision.times = npeel;
1416 }
1417 
1418 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1419 
1420    while (cond)
1421      body;
1422 
1423    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1424 
1425    if (!cond) goto end;
1426    body;
1427    if (!cond) goto end;
1428    body;
1429    if (!cond) goto end;
1430    body;
1431    while (cond)
1432      body;
1433    end: ;
1434    */
1435 static void
1436 peel_loop_simple (struct loop *loop)
1437 {
1438   sbitmap wont_exit;
1439   unsigned npeel = loop->lpt_decision.times;
1440   struct niter_desc *desc = get_simple_loop_desc (loop);
1441   struct opt_info *opt_info = NULL;
1442   bool ok;
1443 
1444   if (flag_split_ivs_in_unroller && npeel > 1)
1445     opt_info = analyze_insns_in_loop (loop);
1446 
1447   wont_exit = sbitmap_alloc (npeel + 1);
1448   bitmap_clear (wont_exit);
1449 
1450   opt_info_start_duplication (opt_info);
1451 
1452   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1453 				      npeel, wont_exit, NULL,
1454 				      NULL, DLTHE_FLAG_UPDATE_FREQ
1455 				      | (opt_info
1456 					 ? DLTHE_RECORD_COPY_NUMBER
1457 					   : 0));
1458   gcc_assert (ok);
1459 
1460   free (wont_exit);
1461 
1462   if (opt_info)
1463     {
1464       apply_opt_in_copies (opt_info, npeel, false, false);
1465       free_opt_info (opt_info);
1466     }
1467 
1468   if (desc->simple_p)
1469     {
1470       if (desc->const_iter)
1471 	{
1472 	  desc->niter -= npeel;
1473 	  desc->niter_expr = GEN_INT (desc->niter);
1474 	  desc->noloop_assumptions = NULL_RTX;
1475 	}
1476       else
1477 	{
1478 	  /* We cannot just update niter_expr, as its value might be clobbered
1479 	     inside the loop.  We could handle this by counting the number of
1480 	     iterations into a temporary just as we do in runtime unrolling, but
1481 	     it does not seem worthwhile.  */
1482 	  free_simple_loop_desc (loop);
1483 	}
1484     }
1485   if (dump_file)
1486     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1487 }
1488 
1489 /* Decide whether to unroll LOOP stupidly and how much.  */
1490 static void
1491 decide_unroll_stupid (struct loop *loop, int flags)
1492 {
1493   unsigned nunroll, nunroll_by_av, i;
1494   struct niter_desc *desc;
1495   double_int iterations;
1496 
1497   if (!(flags & UAP_UNROLL_ALL))
1498     {
1499       /* We were not asked to; just return silently.  */
1500       return;
1501     }
1502 
1503   if (dump_file)
1504     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1505 
1506   /* nunroll = total number of copies of the original loop body in
1507      the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
1508   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1509   nunroll_by_av
1510     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1511   if (nunroll > nunroll_by_av)
1512     nunroll = nunroll_by_av;
1513   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1514     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1515 
1516   if (targetm.loop_unroll_adjust)
1517     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1518 
1519   /* Skip big loops.  */
1520   if (nunroll <= 1)
1521     {
1522       if (dump_file)
1523 	fprintf (dump_file, ";; Not considering loop, is too big\n");
1524       return;
1525     }
1526 
1527   /* Check for simple loops.  */
1528   desc = get_simple_loop_desc (loop);
1529 
1530   /* Check simpleness.  */
1531   if (desc->simple_p && !desc->assumptions)
1532     {
1533       if (dump_file)
1534 	fprintf (dump_file, ";; The loop is simple\n");
1535       return;
1536     }
1537 
1538   /* Do not unroll loops with branches inside -- it increases the number
1539      of mispredicts.
1540      TODO: this heuristic needs tuning; a call inside the loop body
1541      is also a relatively good reason not to unroll.  */
1542   if (num_loop_branches (loop) > 1)
1543     {
1544       if (dump_file)
1545 	fprintf (dump_file, ";; Not unrolling, contains branches\n");
1546       return;
1547     }
1548 
1549   /* Check whether the loop rolls.  */
1550   if ((get_estimated_loop_iterations (loop, &iterations)
1551        || get_max_loop_iterations (loop, &iterations))
1552       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1553     {
1554       if (dump_file)
1555 	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1556       return;
1557     }
1558 
1559   /* Success.  Now force nunroll to be a power of 2, as it seems that this
1560      improves results (partly because of better alignment, partly
1561      because of some dark magic).  */
1562   for (i = 1; 2 * i <= nunroll; i *= 2)
1563     continue;
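  /* As above, i is now the largest power of two that does not exceed
     nunroll.  */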
1564 
1565   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1566   loop->lpt_decision.times = i - 1;
1567 }
1568 
1569 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1570 
1571    while (cond)
1572      body;
1573 
1574    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1575 
1576    while (cond)
1577      {
1578        body;
1579        if (!cond) break;
1580        body;
1581        if (!cond) break;
1582        body;
1583        if (!cond) break;
1584        body;
1585      }
1586    */
1587 static void
1588 unroll_loop_stupid (struct loop *loop)
1589 {
1590   sbitmap wont_exit;
1591   unsigned nunroll = loop->lpt_decision.times;
1592   struct niter_desc *desc = get_simple_loop_desc (loop);
1593   struct opt_info *opt_info = NULL;
1594   bool ok;
1595 
1596   if (flag_split_ivs_in_unroller
1597       || flag_variable_expansion_in_unroller)
1598     opt_info = analyze_insns_in_loop (loop);
1599 
1600 
1601   wont_exit = sbitmap_alloc (nunroll + 1);
1602   bitmap_clear (wont_exit);
1603   opt_info_start_duplication (opt_info);
1604 
1605   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1606 				      nunroll, wont_exit,
1607 				      NULL, NULL,
1608 				      DLTHE_FLAG_UPDATE_FREQ
1609 				      | (opt_info
1610 					 ? DLTHE_RECORD_COPY_NUMBER
1611 					   : 0));
1612   gcc_assert (ok);
1613 
1614   if (opt_info)
1615     {
1616       apply_opt_in_copies (opt_info, nunroll, true, true);
1617       free_opt_info (opt_info);
1618     }
1619 
1620   free (wont_exit);
1621 
1622   if (desc->simple_p)
1623     {
1624       /* We may indeed get here, provided that there are nontrivial assumptions
1625 	 for the loop to be really simple.  We could update the counts, but the
1626 	 problem is that we are unable to decide which exit will be taken
1627 	 (not really true when the number of iterations is constant,
1628 	 but no one will do anything with this information, so we do not
1629 	 worry about it).  */
1630       desc->simple_p = false;
1631     }
1632 
1633   if (dump_file)
1634     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1635 	     nunroll, num_loop_insns (loop));
1636 }
1637 
1638 /* Returns true if REG is referenced in exactly one nondebug insn in LOOP.
1639    Set *DEBUG_USES to the number of debug insns that reference the
1640    variable.  */
1641 
1642 bool
1643 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1644 				  int *debug_uses)
1645 {
1646   basic_block *body, bb;
1647   unsigned i;
1648   int count_ref = 0;
1649   rtx insn;
1650 
1651   body = get_loop_body (loop);
1652   for (i = 0; i < loop->num_nodes; i++)
1653     {
1654       bb = body[i];
1655 
1656       FOR_BB_INSNS (bb, insn)
1657 	if (!rtx_referenced_p (reg, insn))
1658 	  continue;
1659 	else if (DEBUG_INSN_P (insn))
1660 	  ++*debug_uses;
1661 	else if (++count_ref > 1)
1662 	  break;
1663     }
1664   free (body);
1665   return (count_ref == 1);
1666 }
1667 
1668 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1669 
1670 static void
1671 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1672 {
1673   basic_block *body, bb;
1674   unsigned i;
1675   rtx insn;
1676 
1677   body = get_loop_body (loop);
1678   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1679     {
1680       bb = body[i];
1681 
1682       FOR_BB_INSNS (bb, insn)
1683 	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1684 	  continue;
1685 	else
1686 	  {
1687 	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1688 			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
1689 	    if (!--debug_uses)
1690 	      break;
1691 	  }
1692     }
1693   free (body);
1694 }
1695 
1696 /* Determine whether INSN contains an accumulator
1697    which can be expanded into separate copies,
1698    one for each copy of the LOOP body.
1699 
1700    for (i = 0 ; i < n; i++)
1701      sum += a[i];
1702 
1703    ==>
1704 
1705    sum += a[i]
1706    ....
1707    i = i+1;
1708    sum1 += a[i]
1709    ....
1710    i = i+1
1711    sum2 += a[i];
1712    ....
1713 
1714    Return NULL if INSN contains no opportunity for expansion of an accumulator.
1715    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1716    information and return a pointer to it.
1717 */
1718 
1719 static struct var_to_expand *
1720 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1721 {
1722   rtx set, dest, src;
1723   struct var_to_expand *ves;
1724   unsigned accum_pos;
1725   enum rtx_code code;
1726   int debug_uses = 0;
1727 
1728   set = single_set (insn);
1729   if (!set)
1730     return NULL;
1731 
1732   dest = SET_DEST (set);
1733   src = SET_SRC (set);
1734   code = GET_CODE (src);
1735 
1736   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1737     return NULL;
1738 
1739   if (FLOAT_MODE_P (GET_MODE (dest)))
1740     {
1741       if (!flag_associative_math)
1742         return NULL;
1743       /* In the case of FMA, we're also changing the rounding.  */
1744       if (code == FMA && !flag_unsafe_math_optimizations)
1745 	return NULL;
1746     }
1747 
1748   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1749      in MD.  But if there is no optab to generate the insn, we cannot
1750      perform the variable expansion.  This can happen if an MD provides
1751      an insn but not a named pattern to generate it, for example to avoid
1752      producing code that needs additional mode switches like for x87/mmx.
1753 
1754      So we check have_insn_for, which looks for an optab for the operation
1755      in SRC.  If it doesn't exist, we can't perform the expansion even
1756      though INSN is valid.  */
1757   if (!have_insn_for (code, GET_MODE (src)))
1758     return NULL;
1759 
1760   if (!REG_P (dest)
1761       && !(GET_CODE (dest) == SUBREG
1762            && REG_P (SUBREG_REG (dest))))
1763     return NULL;
1764 
1765   /* Find the accumulator use within the operation.  */
1766   if (code == FMA)
1767     {
1768       /* We only support accumulation via FMA in the ADD position.  */
1769       if (!rtx_equal_p (dest, XEXP (src, 2)))
1770 	return NULL;
1771       accum_pos = 2;
1772     }
1773   else if (rtx_equal_p (dest, XEXP (src, 0)))
1774     accum_pos = 0;
1775   else if (rtx_equal_p (dest, XEXP (src, 1)))
1776     {
1777       /* The method of expansion that we are using, which initializes the
1778 	 expansions with zero and sums them at the end of the computation,
1779 	 yields wrong results for (x = something - x); thus avoid using it
1780 	 in that case.  */
1781       if (code == MINUS)
1782 	return NULL;
1783       accum_pos = 1;
1784     }
1785   else
1786     return NULL;
1787 
1788   /* It must not otherwise be used.  */
1789   if (code == FMA)
1790     {
1791       if (rtx_referenced_p (dest, XEXP (src, 0))
1792 	  || rtx_referenced_p (dest, XEXP (src, 1)))
1793 	return NULL;
1794     }
1795   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1796     return NULL;
1797 
1798   /* It must be used in exactly one insn.  */
1799   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1800     return NULL;
1801 
1802   if (dump_file)
1803     {
1804       fprintf (dump_file, "\n;; Expanding Accumulator ");
1805       print_rtl (dump_file, dest);
1806       fprintf (dump_file, "\n");
1807     }
1808 
1809   if (debug_uses)
1810     /* Instead of resetting the debug insns, we could replace each
1811        debug use in the loop with the sum or product of all expanded
1812        accumulators.  Since we'll only know of all expansions at the
1813        end, we'd have to keep track of which vars_to_expand a debug
1814        insn in the loop references, take note of each copy of the
1815        debug insn during unrolling, and when it's all done, compute
1816        the sum or product of each variable and adjust the original
1817        debug insn and each copy thereof.  What a pain!  */
1818     reset_debug_uses_in_loop (loop, dest, debug_uses);
1819 
1820   /* Record the accumulator to expand.  */
1821   ves = XNEW (struct var_to_expand);
1822   ves->insn = insn;
1823   ves->reg = copy_rtx (dest);
1824   ves->var_expansions.create (1);
1825   ves->next = NULL;
1826   ves->op = GET_CODE (src);
1827   ves->expansion_count = 0;
1828   ves->reuse_expansion = 0;
1829   return ves;
1830 }
1831 
1832 /* Determine whether there is an induction variable in INSN that
1833    we would like to split during unrolling.
1834 
1835    I.e. replace
1836 
1837    i = i + 1;
1838    ...
1839    i = i + 1;
1840    ...
1841    i = i + 1;
1842    ...
1843 
1844    type chains by
1845 
1846    i0 = i + 1
1847    ...
1848    i = i0 + 1
1849    ...
1850    i = i0 + 2
1851    ...
1852 
1853    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1854    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1855    pointer to it.  */
1856 
1857 static struct iv_to_split *
1858 analyze_iv_to_split_insn (rtx insn)
1859 {
1860   rtx set, dest;
1861   struct rtx_iv iv;
1862   struct iv_to_split *ivts;
1863   bool ok;
1864 
1865   /* For now we just split the basic induction variables.  Later this may be
1866      extended, for example, to also handle addresses of memory references.  */
1867   set = single_set (insn);
1868   if (!set)
1869     return NULL;
1870 
1871   dest = SET_DEST (set);
1872   if (!REG_P (dest))
1873     return NULL;
1874 
1875   if (!biv_p (insn, dest))
1876     return NULL;
1877 
1878   ok = iv_analyze_result (insn, dest, &iv);
1879 
1880   /* This used to be an assert under the assumption that if biv_p returns
1881      true, then iv_analyze_result must also return true.  However, that
1882      assumption is not strictly correct as evidenced by pr25569.
1883 
1884      Returning NULL when iv_analyze_result returns false is safe and
1885      avoids the problems in pr25569 until the iv_analyze_* routines
1886      can be fixed, which is apparently hard and time consuming
1887      according to their author.  */
1888   if (! ok)
1889     return NULL;
1890 
1891   if (iv.step == const0_rtx
1892       || iv.mode != iv.extend_mode)
1893     return NULL;
1894 
1895   /* Record the insn to split.  */
1896   ivts = XNEW (struct iv_to_split);
1897   ivts->insn = insn;
1898   ivts->orig_var = dest;
1899   ivts->base_var = NULL_RTX;
1900   ivts->step = iv.step;
1901   ivts->next = NULL;
1902   ivts->n_loc = 1;
1903   ivts->loc[0] = 1;
1904 
1905   return ivts;
1906 }
1907 
1908 /* Determines which insns in LOOP can be optimized.
1909    Return an OPT_INFO struct with the relevant hash tables filled
1910    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1911    is undefined for the return value.  */
1912 
1913 static struct opt_info *
1914 analyze_insns_in_loop (struct loop *loop)
1915 {
1916   basic_block *body, bb;
1917   unsigned i;
1918   struct opt_info *opt_info = XCNEW (struct opt_info);
1919   rtx insn;
1920   struct iv_to_split *ivts = NULL;
1921   struct var_to_expand *ves = NULL;
1922   iv_to_split **slot1;
1923   var_to_expand **slot2;
1924   vec<edge> edges = get_loop_exit_edges (loop);
1925   edge exit;
1926   bool can_apply = false;
1927 
1928   iv_analysis_loop_init (loop);
1929 
1930   body = get_loop_body (loop);
1931 
1932   if (flag_split_ivs_in_unroller)
1933     {
1934       opt_info->insns_to_split.create (5 * loop->num_nodes);
1935       opt_info->iv_to_split_head = NULL;
1936       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1937     }
1938 
1939   /* Record the loop exit bb and loop preheader before the unrolling.  */
1940   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1941 
1942   if (edges.length () == 1)
1943     {
1944       exit = edges[0];
1945       if (!(exit->flags & EDGE_COMPLEX))
1946 	{
1947 	  opt_info->loop_exit = split_edge (exit);
1948 	  can_apply = true;
1949 	}
1950     }
1951 
1952   if (flag_variable_expansion_in_unroller
1953       && can_apply)
1954     {
1955       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1956       opt_info->var_to_expand_head = NULL;
1957       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1958     }
1959 
1960   for (i = 0; i < loop->num_nodes; i++)
1961     {
1962       bb = body[i];
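      /* Only blocks that dominate the loop latch are executed on every
	 iteration, so insns elsewhere are not candidates for IV splitting
	 or variable expansion.  */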
1963       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1964 	continue;
1965 
1966       FOR_BB_INSNS (bb, insn)
1967       {
1968         if (!INSN_P (insn))
1969           continue;
1970 
1971         if (opt_info->insns_to_split.is_created ())
1972           ivts = analyze_iv_to_split_insn (insn);
1973 
1974         if (ivts)
1975           {
1976             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1977 	    gcc_assert (*slot1 == NULL);
1978             *slot1 = ivts;
1979 	    *opt_info->iv_to_split_tail = ivts;
1980 	    opt_info->iv_to_split_tail = &ivts->next;
1981             continue;
1982           }
1983 
1984         if (opt_info->insns_with_var_to_expand.is_created ())
1985           ves = analyze_insn_to_expand_var (loop, insn);
1986 
1987         if (ves)
1988           {
1989             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
1990 	    gcc_assert (*slot2 == NULL);
1991             *slot2 = ves;
1992 	    *opt_info->var_to_expand_tail = ves;
1993 	    opt_info->var_to_expand_tail = &ves->next;
1994           }
1995       }
1996     }
1997 
1998   edges.release ();
1999   free (body);
2000   return opt_info;
2001 }
2002 
2003 /* Called just before loop duplication.  Records start of duplicated area
2004    to OPT_INFO.  */
2005 
2006 static void
2007 opt_info_start_duplication (struct opt_info *opt_info)
2008 {
2009   if (opt_info)
2010     opt_info->first_new_block = last_basic_block_for_fn (cfun);
2011 }
2012 
2013 /* Determine the number of iterations between initialization of the base
2014    variable and the current copy (N_COPY).  N_COPIES is the total number
2015    of newly created copies.  UNROLLING is true if we are unrolling
2016    (not peeling) the loop.  */
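/* For example (illustrative): when peeling with N_COPIES == 3, the peeled
   copies are numbered 1..3 and get deltas 0..2, while the original body
   (N_COPY == 0) runs last and gets delta 3.  For unrolling, the delta is
   simply N_COPY, since the initialization stays in the original body.  */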
2017 
2018 static unsigned
2019 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
2020 {
2021   if (unrolling)
2022     {
2023       /* If we are unrolling, initialization is done in the original loop
2024 	 body (number 0).  */
2025       return n_copy;
2026     }
2027   else
2028     {
2029       /* If we are peeling, the copy in which the initialization occurs has
2030 	 number 1.  The original loop (number 0) is the last.  */
2031       if (n_copy)
2032 	return n_copy - 1;
2033       else
2034 	return n_copies;
2035     }
2036 }
2037 
2038 /* Locate in EXPR the expression corresponding to the location recorded
2039    in IVTS, and return a pointer to the RTX for this location.  */
2040 
2041 static rtx *
2042 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2043 {
2044   unsigned i;
2045   rtx *ret = &expr;
2046 
2047   for (i = 0; i < ivts->n_loc; i++)
2048     ret = &XEXP (*ret, ivts->loc[i]);
2049 
2050   return ret;
2051 }
2052 
2053 /* Allocate basic variable for the induction variable chain.  */
2054 
2055 static void
2056 allocate_basic_variable (struct iv_to_split *ivts)
2057 {
2058   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2059 
2060   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2061 }
2062 
2063 /* Insert initialization of basic variable of IVTS before INSN, taking
2064    the initial value from INSN.  */
2065 
2066 static void
2067 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2068 {
2069   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2070   rtx seq;
2071 
2072   start_sequence ();
2073   expr = force_operand (expr, ivts->base_var);
2074   if (expr != ivts->base_var)
2075     emit_move_insn (ivts->base_var, expr);
2076   seq = get_insns ();
2077   end_sequence ();
2078 
2079   emit_insn_before (seq, insn);
2080 }
2081 
2082 /* Replace the use of the induction variable described in IVTS in INSN
2083    by base variable + DELTA * step.  */
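/* For instance (illustrative): for an IV with step 4, the use in the copy
   with DELTA == 2 is replaced by base_var + 8.  */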
2084 
2085 static void
2086 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2087 {
2088   rtx expr, *loc, seq, incr, var;
2089   enum machine_mode mode = GET_MODE (ivts->base_var);
2090   rtx src, dest, set;
2091 
2092   /* Construct base + DELTA * step.  */
2093   if (!delta)
2094     expr = ivts->base_var;
2095   else
2096     {
2097       incr = simplify_gen_binary (MULT, mode,
2098 				  ivts->step, gen_int_mode (delta, mode));
2099       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2100 				  ivts->base_var, incr);
2101     }
2102 
2103   /* Figure out where to do the replacement.  */
2104   loc = get_ivts_expr (single_set (insn), ivts);
2105 
2106   /* If we can make the replacement right away, we're done.  */
2107   if (validate_change (insn, loc, expr, 0))
2108     return;
2109 
2110   /* Otherwise, force EXPR into a register and try again.  */
2111   start_sequence ();
2112   var = gen_reg_rtx (mode);
2113   expr = force_operand (expr, var);
2114   if (expr != var)
2115     emit_move_insn (var, expr);
2116   seq = get_insns ();
2117   end_sequence ();
2118   emit_insn_before (seq, insn);
2119 
2120   if (validate_change (insn, loc, var, 0))
2121     return;
2122 
2123   /* The last chance.  Try recreating the assignment in insn
2124      completely from scratch.  */
2125   set = single_set (insn);
2126   gcc_assert (set);
2127 
2128   start_sequence ();
2129   *loc = var;
2130   src = copy_rtx (SET_SRC (set));
2131   dest = copy_rtx (SET_DEST (set));
2132   src = force_operand (src, dest);
2133   if (src != dest)
2134     emit_move_insn (dest, src);
2135   seq = get_insns ();
2136   end_sequence ();
2137 
2138   emit_insn_before (seq, insn);
2139   delete_insn (insn);
2140 }
2141 
2142 
2143 /* Return one expansion of the accumulator recorded in struct VE.  */
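/* The expansions are handed out round-robin: reuse_expansion == 0 yields the
   original accumulator register, after which the recorded expansions are
   returned in turn before cycling back to the original.  */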
2144 
2145 static rtx
2146 get_expansion (struct var_to_expand *ve)
2147 {
2148   rtx reg;
2149 
2150   if (ve->reuse_expansion == 0)
2151     reg = ve->reg;
2152   else
2153     reg = ve->var_expansions[ve->reuse_expansion - 1];
2154 
2155   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2156     ve->reuse_expansion = 0;
2157   else
2158     ve->reuse_expansion++;
2159 
2160   return reg;
2161 }
2162 
2163 
2164 /* Given INSN, replace the uses of the accumulator recorded in VE
2165    with a new register.  */
2166 
2167 static void
2168 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2169 {
2170   rtx new_reg, set;
2171   bool really_new_expansion = false;
2172 
2173   set = single_set (insn);
2174   gcc_assert (set);
2175 
2176   /* Generate a new register only if the expansion limit has not been
2177      reached.  Else reuse an already existing expansion.  */
2178   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2179     {
2180       really_new_expansion = true;
2181       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2182     }
2183   else
2184     new_reg = get_expansion (ve);
2185 
2186   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2187   if (apply_change_group ())
2188     if (really_new_expansion)
2189       {
2190         ve->var_expansions.safe_push (new_reg);
2191         ve->expansion_count++;
2192       }
2193 }
2194 
2195 /* Initialize the variable expansions in loop preheader.  PLACE is the
2196    loop-preheader basic block where the initialization of the
2197    expansions should take place.  The expansions are initialized with
2198    (-0) when the operation is plus or minus to honor signed zero.  This
2199    way we can prevent cases where the sign of the final result is
2200    affected by the sign of the expansion.  Here is an example to
2201    demonstrate this:
2202 
2203    for (i = 0 ; i < n; i++)
2204      sum += something;
2205 
2206    ==>
2207 
2208    sum += something
2209    ....
2210    i = i+1;
2211    sum1 += something
2212    ....
2213    i = i+1
2214    sum2 += something;
2215    ....
2216 
2217    When SUM is initialized with -zero and SOMETHING is also -zero, the
2218    final result of SUM should be -zero; thus the expansions sum1 and sum2
2219    should be initialized with -zero as well (otherwise we would get +zero
2220    as the final result).  */
2221 
2222 static void
2223 insert_var_expansion_initialization (struct var_to_expand *ve,
2224 				     basic_block place)
2225 {
2226   rtx seq, var, zero_init;
2227   unsigned i;
2228   enum machine_mode mode = GET_MODE (ve->reg);
2229   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2230 
2231   if (ve->var_expansions.length () == 0)
2232     return;
2233 
2234   start_sequence ();
2235   switch (ve->op)
2236     {
2237     case FMA:
2238       /* Note that we only accumulate FMA via the ADD operand.  */
2239     case PLUS:
2240     case MINUS:
2241       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2242         {
2243 	  if (honor_signed_zero_p)
2244 	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2245 	  else
2246 	    zero_init = CONST0_RTX (mode);
2247           emit_move_insn (var, zero_init);
2248         }
2249       break;
2250 
2251     case MULT:
2252       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2253         {
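          /* The neutral element for a product is 1, so despite the
             variable's name the expansions are initialized with CONST1_RTX
             here.  */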
2254           zero_init = CONST1_RTX (GET_MODE (var));
2255           emit_move_insn (var, zero_init);
2256         }
2257       break;
2258 
2259     default:
2260       gcc_unreachable ();
2261     }
2262 
2263   seq = get_insns ();
2264   end_sequence ();
2265 
2266   emit_insn_after (seq, BB_END (place));
2267 }
2268 
2269 /* Combine the variable expansions at the loop exit.  PLACE is the
2270    loop exit basic block where the summation of the expansions should
2271    take place.  */
2272 
2273 static void
2274 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2275 {
2276   rtx sum = ve->reg;
2277   rtx expr, seq, var, insn;
2278   unsigned i;
2279 
2280   if (ve->var_expansions.length () == 0)
2281     return;
2282 
2283   start_sequence ();
2284   switch (ve->op)
2285     {
2286     case FMA:
2287       /* Note that we only accumulate FMA via the ADD operand.  */
2288     case PLUS:
2289     case MINUS:
2290       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2291 	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2292       break;
2293 
2294     case MULT:
2295       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2296 	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2297       break;
2298 
2299     default:
2300       gcc_unreachable ();
2301     }
2302 
2303   expr = force_operand (sum, ve->reg);
2304   if (expr != ve->reg)
2305     emit_move_insn (ve->reg, expr);
2306   seq = get_insns ();
2307   end_sequence ();
2308 
2309   insn = BB_HEAD (place);
2310   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2311     insn = NEXT_INSN (insn);
2312 
2313   emit_insn_after (seq, insn);
2314 }
2315 
2316 /* Strip away REG_EQUAL notes for IVs we're splitting.
2317 
2318    Updating REG_EQUAL notes for IVs we split is tricky: We
2319    cannot tell until after unrolling, DF-rescanning, and liveness
2320    updating, whether an EQ_USE is reached by the split IV while
2321    the IV reg is still live.  See PR55006.
2322 
2323    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2324    because RTL loop-iv requires us to defer rescanning insns and
2325    any notes attached to them.  So resort to old techniques...  */
2326 
2327 static void
2328 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2329 {
2330   struct iv_to_split *ivts;
2331   rtx note = find_reg_equal_equiv_note (insn);
2332   if (! note)
2333     return;
2334   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2335     if (reg_mentioned_p (ivts->orig_var, note))
2336       {
2337 	remove_note (insn, note);
2338 	return;
2339       }
2340 }
2341 
2342 /* Apply loop optimizations in loop copies using the
2343    data gathered during the unrolling.  Structure
2344    OPT_INFO records that data.
2345 
2346    UNROLLING is true if we unrolled (not peeled) the loop.
2347    REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
2348    the loop (as it should happen in complete unrolling, but not in ordinary
2349    peeling of the loop).  */
2350 
2351 static void
2352 apply_opt_in_copies (struct opt_info *opt_info,
2353                      unsigned n_copies, bool unrolling,
2354                      bool rewrite_original_loop)
2355 {
2356   unsigned i, delta;
2357   basic_block bb, orig_bb;
2358   rtx insn, orig_insn, next;
2359   struct iv_to_split ivts_templ, *ivts;
2360   struct var_to_expand ve_templ, *ves;
2361 
2362   /* Sanity check -- we need to put initialization in the original loop
2363      body.  */
2364   gcc_assert (!unrolling || rewrite_original_loop);
2365 
2366   /* Allocate the basic variables (i0).  */
2367   if (opt_info->insns_to_split.is_created ())
2368     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2369       allocate_basic_variable (ivts);
2370 
2371   for (i = opt_info->first_new_block;
2372        i < (unsigned) last_basic_block_for_fn (cfun);
2373        i++)
2374     {
2375       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2376       orig_bb = get_bb_original (bb);
2377 
2378       /* bb->aux holds position in copy sequence initialized by
2379 	 duplicate_loop_to_header_edge.  */
2380       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2381 					unrolling);
2382       bb->aux = 0;
2383       orig_insn = BB_HEAD (orig_bb);
2384       FOR_BB_INSNS_SAFE (bb, insn, next)
2385         {
2386 	  if (!INSN_P (insn)
2387 	      || (DEBUG_INSN_P (insn)
2388 		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2389             continue;
2390 
2391 	  while (!INSN_P (orig_insn)
2392 		 || (DEBUG_INSN_P (orig_insn)
2393 		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2394 			 == LABEL_DECL)))
2395             orig_insn = NEXT_INSN (orig_insn);
2396 
2397           ivts_templ.insn = orig_insn;
2398           ve_templ.insn = orig_insn;
2399 
2400           /* Apply splitting iv optimization.  */
2401           if (opt_info->insns_to_split.is_created ())
2402             {
2403 	      maybe_strip_eq_note_for_split_iv (opt_info, insn);
2404 
2405               ivts = opt_info->insns_to_split.find (&ivts_templ);
2406 
2407               if (ivts)
2408                 {
2409 		  gcc_assert (GET_CODE (PATTERN (insn))
2410 			      == GET_CODE (PATTERN (orig_insn)));
2411 
2412                   if (!delta)
2413                     insert_base_initialization (ivts, insn);
2414                   split_iv (ivts, insn, delta);
2415                 }
2416             }
2417           /* Apply variable expansion optimization.  */
2418           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2419             {
2420               ves = (struct var_to_expand *)
2421 		opt_info->insns_with_var_to_expand.find (&ve_templ);
2422               if (ves)
2423                 {
2424 		  gcc_assert (GET_CODE (PATTERN (insn))
2425 			      == GET_CODE (PATTERN (orig_insn)));
2426                   expand_var_during_unrolling (ves, insn);
2427                 }
2428             }
2429           orig_insn = NEXT_INSN (orig_insn);
2430         }
2431     }
2432 
2433   if (!rewrite_original_loop)
2434     return;
2435 
2436   /* Initialize the variable expansions in the loop preheader
2437      and take care of combining them at the loop exit.  */
2438   if (opt_info->insns_with_var_to_expand.is_created ())
2439     {
2440       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2441 	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2442       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2443 	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2444     }
2445 
2446   /* Also rewrite the original loop body.  Find its blocks as the originals
2447      of the blocks in the last copied iteration, i.e. those that have
2448      get_bb_copy (get_bb_original (bb)) == bb.  */
2449   for (i = opt_info->first_new_block;
2450        i < (unsigned) last_basic_block_for_fn (cfun);
2451        i++)
2452     {
2453       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2454       orig_bb = get_bb_original (bb);
2455       if (get_bb_copy (orig_bb) != bb)
2456 	continue;
2457 
2458       delta = determine_split_iv_delta (0, n_copies, unrolling);
2459       for (orig_insn = BB_HEAD (orig_bb);
2460            orig_insn != NEXT_INSN (BB_END (bb));
2461            orig_insn = next)
2462         {
2463           next = NEXT_INSN (orig_insn);
2464 
2465           if (!INSN_P (orig_insn))
2466  	    continue;
2467 
2468           ivts_templ.insn = orig_insn;
2469           if (opt_info->insns_to_split.is_created ())
2470             {
2471 	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2472 
2473               ivts = (struct iv_to_split *)
2474 		opt_info->insns_to_split.find (&ivts_templ);
2475               if (ivts)
2476                 {
2477                   if (!delta)
2478                     insert_base_initialization (ivts, orig_insn);
2479                   split_iv (ivts, orig_insn, delta);
2480                   continue;
2481                 }
2482             }
2483 
2484         }
2485     }
2486 }
2487 
2488 /* Release OPT_INFO.  */
2489 
2490 static void
2491 free_opt_info (struct opt_info *opt_info)
2492 {
2493   if (opt_info->insns_to_split.is_created ())
2494     opt_info->insns_to_split.dispose ();
2495   if (opt_info->insns_with_var_to_expand.is_created ())
2496     {
2497       struct var_to_expand *ves;
2498 
2499       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2500 	ves->var_expansions.release ();
2501       opt_info->insns_with_var_to_expand.dispose ();
2502     }
2503   free (opt_info);
2504 }
2505