/* Loop versioning pass.
   Copyright (C) 2018-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "tree-pass.h"
#include "gimplify-me.h"
#include "cfgloop.h"
#include "tree-ssa-loop.h"
#include "ssa.h"
#include "tree-scalar-evolution.h"
#include "tree-chrec.h"
#include "tree-ssa-loop-ivopts.h"
#include "fold-const.h"
#include "tree-ssa-propagate.h"
#include "tree-inline.h"
#include "domwalk.h"
#include "alloc-pool.h"
#include "vr-values.h"
#include "gimple-ssa-evrp-analyze.h"
#include "tree-vectorizer.h"
#include "omp-general.h"
#include "predict.h"
#include "tree-into-ssa.h"

namespace {

/* This pass looks for loops that could be simplified if certain loop
   invariant conditions were true.  It is effectively a form of loop
   splitting in which the pass produces the split conditions itself,
   instead of using ones that are already present in the IL.

   Versioning for when strides are 1
   ---------------------------------

   At the moment the only thing the pass looks for are memory references
   like:

     for (auto i : ...)
       ...x[i * stride]...

   It considers changing such loops to:

     if (stride == 1)
       for (auto i : ...)    [A]
	 ...x[i]...
     else
       for (auto i : ...)    [B]
	 ...x[i * stride]...

   This can have several benefits:

   (1) [A] is often easier or cheaper to vectorize than [B].

   (2) The scalar code in [A] is simpler than the scalar code in [B]
       (if the loops cannot be vectorized or need an epilogue loop).

   (3) We might recognize [A] as a pattern, such as a memcpy or memset.

   (4) [A] has simpler address evolutions, which can help other passes
       like loop interchange.

   The optimization is particularly useful for assumed-shape arrays in
   Fortran, where the stride of the innermost dimension depends on the
   array descriptor but is often equal to 1 in practice.  For example:

     subroutine f1(x)
       real :: x(:)
       x(:) = 100
     end subroutine f1

   generates the equivalent of:

     raw_stride = *x.dim[0].stride;
     stride = raw_stride != 0 ? raw_stride : 1;
     x_base = *x.data;
     ...
     tmp1 = stride * S;
     tmp2 = tmp1 - stride;
     *x_base[tmp2] = 1.0e+2;

   but in the common case that stride == 1, the last three statements
   simplify to:

     tmp3 = S + -1;
     *x_base[tmp3] = 1.0e+2;

   The optimization is in principle very simple.  The difficult parts are:

   (a) deciding which parts of a general address calculation correspond
       to the inner dimension of an array, since this usually isn't explicit
       in the IL, and for C often isn't even explicit in the source code
       (see the sketch below)

   (b) estimating when the transformation is worthwhile
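
   As a purely illustrative example of (a), a C access such as
   x[i * stride], for a pointer "double *x", typically reaches this
   pass as a chain of gimple statements along the lines of:

     _1 = i_10 * stride_12;
     _2 = (long unsigned int) _1;
     _3 = _2 * 8;
     _4 = x_14(D) + _3;
     ... = *_4;

   (the exact temporaries and casts vary), with nothing left in the IL
   to say which multiplication corresponds to which array dimension.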

   Structure
   ---------

   The pass has four phases:

   (1) Walk through the statements looking for and recording potential
       versioning opportunities.  Stop if there are none.

   (2) Use context-sensitive range information to see whether any versioning
       conditions are impossible in practice.  Remove them if so, and stop
       if no opportunities remain.

       (We do this only after (1) to keep compile time down when no
       versioning opportunities exist.)

   (3) Apply the cost model.  Decide which versioning opportunities are
       worthwhile and at which nesting level they should be applied.

   (4) Attempt to version all the loops selected by (3), so that:

	 for (...)
	   ...

       becomes:

	 if (!cond)
	   for (...) // Original loop
	     ...
	 else
	   for (...) // New loop
	     ...

       Use the version condition COND to simplify the new loop.  */

/* Enumerates the likelihood that a particular value indexes the inner
   dimension of an array.  */
enum inner_likelihood {
  INNER_UNLIKELY,
  INNER_DONT_KNOW,
  INNER_LIKELY
};
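
/* Note that the enumerators above are ordered from least to most likely,
   so that code can compare likelihoods with the usual integer comparisons;
   analyze_arbitrary_term relies on this when choosing between two
   candidate strides.  */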

/* Information about one term of an address_info.  */
struct address_term_info
{
  /* The value of the term is EXPR * MULTIPLIER.  */
  tree expr;
  unsigned HOST_WIDE_INT multiplier;

  /* The stride applied by EXPR in each iteration of some unrecorded loop,
     or null if no stride has been identified.  */
  tree stride;

  /* Enumerates the likelihood that EXPR indexes the inner dimension
     of an array.  */
  enum inner_likelihood inner_likelihood;

  /* True if STRIDE == 1 is a versioning opportunity when considered
     in isolation.  */
  bool versioning_opportunity_p;
};

/* Information about an address calculation, and the range of constant
   offsets applied to it.  */
class address_info
{
public:
  static const unsigned int MAX_TERMS = 8;

  /* One statement that calculates the address.  If multiple statements
     share the same address, we only record the first.  */
  gimple *stmt;

  /* The loop containing STMT (cached for convenience).  If multiple
     statements share the same address, they all belong to this loop.  */
  class loop *loop;

  /* A decomposition of the calculation into a sum of terms plus an
     optional base.  When BASE is provided, it is never an SSA name.
     Once initialization is complete, all members of TERMS are SSA names.  */
  tree base;
  auto_vec<address_term_info, MAX_TERMS> terms;

  /* All bytes accessed from the address fall in the offset range
     [MIN_OFFSET, MAX_OFFSET).  */
  HOST_WIDE_INT min_offset, max_offset;
};
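
/* A hypothetical example: for a 4-byte access a[i * stride + 4],
   record_address_fragment below would build an address_info with no
   BASE, a single term whose EXPR is the SSA name holding i * stride
   and whose MULTIPLIER is 4, and the offset range [16, 20).  */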

/* Stores addresses based on their base and terms (ignoring the offsets).  */
struct address_info_hasher : nofree_ptr_hash <address_info>
{
  static hashval_t hash (const address_info *);
  static bool equal (const address_info *, const address_info *);
};

/* Information about the versioning we'd like to apply to a loop.  */
class loop_info
{
public:
  bool worth_versioning_p () const;

  /* True if we've decided not to version this loop.  The remaining
     fields are meaningless if so.  */
  bool rejected_p;

  /* True if at least one subloop of this loop benefits from versioning.  */
  bool subloops_benefit_p;

  /* An estimate of the total number of instructions in the loop,
     excluding those in subloops that benefit from versioning.  */
  unsigned int num_insns;

  /* The outermost loop that can handle all the version checks
     described below.  */
  class loop *outermost;

  /* The first entry in the list of blocks that belong to this loop
     (and not to subloops).  m_next_block_in_loop provides the chain
     pointers for the list.  */
  basic_block block_list;

  /* We'd like to version the loop for the case in which these SSA names
     (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
  bitmap_head unity_names;

  /* If versioning succeeds, this points to the version of the loop that
     assumes that the version conditions hold.  */
  class loop *optimized_loop;
};

/* The main pass structure.  */
class loop_versioning
{
public:
  loop_versioning (function *);
  ~loop_versioning ();
  unsigned int run ();

private:
  /* Used to walk the dominator tree to find loop versioning conditions
     that are always false.  */
  class lv_dom_walker : public dom_walker
  {
  public:
    lv_dom_walker (loop_versioning &);

    edge before_dom_children (basic_block) FINAL OVERRIDE;
    void after_dom_children (basic_block) FINAL OVERRIDE;

  private:
    /* The parent pass.  */
    loop_versioning &m_lv;

    /* Used to build context-dependent range information.  */
    evrp_range_analyzer m_range_analyzer;
  };

  /* Used to simplify statements based on conditions that are established
     by the version checks.  */
  class name_prop : public substitute_and_fold_engine
  {
  public:
    name_prop (loop_info &li) : m_li (li) {}
    tree value_of_expr (tree name, gimple *) FINAL OVERRIDE;

  private:
    /* Information about the versioning we've performed on the loop.  */
    loop_info &m_li;
  };

  loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; }

  unsigned int max_insns_for_loop (class loop *);
  bool expensive_stmt_p (gimple *);

  void version_for_unity (gimple *, tree);
  bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
				unsigned HOST_WIDE_INT * = 0);
  bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
  bool multiply_term_by (address_term_info &, tree);
  inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
  void dump_inner_likelihood (address_info &, address_term_info &);
  void analyze_stride (address_info &, address_term_info &,
		       tree, class loop *);
  bool find_per_loop_multiplication (address_info &, address_term_info &);
  bool analyze_term_using_scevs (address_info &, address_term_info &);
  void analyze_arbitrary_term (address_info &, address_term_info &);
  void analyze_address_fragment (address_info &);
  void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
				tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
  void analyze_expr (gimple *, tree);
  bool analyze_block (basic_block);
  bool analyze_blocks ();

  void prune_loop_conditions (class loop *, vr_values *);
  bool prune_conditions ();

  void merge_loop_info (class loop *, class loop *);
  void add_loop_to_queue (class loop *);
  bool decide_whether_loop_is_versionable (class loop *);
  bool make_versioning_decisions ();

  bool version_loop (class loop *);
  void implement_versioning_decisions ();

  /* The function we're optimizing.  */
  function *m_fn;

  /* The obstack to use for all pass-specific bitmaps.  */
  bitmap_obstack m_bitmap_obstack;

  /* An obstack to use for general allocation.  */
  obstack m_obstack;

  /* The number of loops in the function.  */
  unsigned int m_nloops;

  /* The total number of loop version conditions we've found.  */
  unsigned int m_num_conditions;

  /* Assume that an address fragment of the form i * stride * scale
     (for variable stride and constant scale) will not benefit from
     versioning for stride == 1 when scale is greater than this value.  */
  unsigned HOST_WIDE_INT m_maximum_scale;

  /* Information about each loop.  */
  auto_vec<loop_info> m_loops;

  /* Used to form a linked list of blocks that belong to a loop,
     started by loop_info::block_list.  */
  auto_vec<basic_block> m_next_block_in_loop;

  /* The list of loops that we've decided to version.  */
  auto_vec<class loop *> m_loops_to_version;

  /* A table of addresses in the current loop, keyed off their values
     but not their offsets.  */
  hash_table <address_info_hasher> m_address_table;

  /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
  auto_vec <address_info *, 32> m_address_list;
};

/* If EXPR is an SSA name and not a default definition, return the
   defining statement, otherwise return null.  */

static gimple *
maybe_get_stmt (tree expr)
{
  if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
    return SSA_NAME_DEF_STMT (expr);
  return NULL;
}

/* Like maybe_get_stmt, but also return null if the defining
   statement isn't an assignment.  */

static gassign *
maybe_get_assign (tree expr)
{
  return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
}

/* Return true if this pass should look through a cast of expression FROM
   to type TYPE when analyzing pieces of an address.  */

static bool
look_through_cast_p (tree type, tree from)
{
  return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
	  && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
}
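
/* For example, this allows the pass to look through a conversion from
   int to size_t or from one pointer type to another, but not through
   a conversion between a pointer and an integer.  */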

/* Strip all conversions of integers or pointers from EXPR, regardless
   of whether the conversions are nops.  This is useful in the context
   of this pass because we're not trying to fold or simulate the
   expression; we just want to see how it's structured.  */

static tree
strip_casts (tree expr)
{
  const unsigned int MAX_NITERS = 4;

  tree type = TREE_TYPE (expr);
  while (CONVERT_EXPR_P (expr)
	 && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
    expr = TREE_OPERAND (expr, 0);

  for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
    {
      gassign *assign = maybe_get_assign (expr);
      if (assign
	  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
	  && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
	expr = gimple_assign_rhs1 (assign);
      else
	break;
    }
  return expr;
}

/* Compare two address_term_infos in the same address_info.  */

static int
compare_address_terms (const void *a_uncast, const void *b_uncast)
{
  const address_term_info *a = (const address_term_info *) a_uncast;
  const address_term_info *b = (const address_term_info *) b_uncast;

  if (a->expr != b->expr)
    return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;

  if (a->multiplier != b->multiplier)
    return a->multiplier < b->multiplier ? -1 : 1;

  return 0;
}

/* Dump ADDRESS using flags FLAGS.  */

static void
dump_address_info (dump_flags_t flags, address_info &address)
{
  if (address.base)
    dump_printf (flags, "%T + ", address.base);
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    {
      if (i != 0)
	dump_printf (flags, " + ");
      dump_printf (flags, "%T", address.terms[i].expr);
      if (address.terms[i].multiplier != 1)
	dump_printf (flags, " * %wd", address.terms[i].multiplier);
    }
  dump_printf (flags, " + [%wd, %wd]",
	       address.min_offset, address.max_offset - 1);
}

/* Hash an address_info based on its base and terms.  */

hashval_t
address_info_hasher::hash (const address_info *info)
{
  inchash::hash hash;
  hash.add_int (info->base ? TREE_CODE (info->base) : 0);
  hash.add_int (info->terms.length ());
  for (unsigned int i = 0; i < info->terms.length (); ++i)
    {
      hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
      hash.add_hwi (info->terms[i].multiplier);
    }
  return hash.end ();
}

/* Return true if two address_infos have equal bases and terms.  Other
   properties might be different (such as the statement or constant
   offset range).  */

bool
address_info_hasher::equal (const address_info *a, const address_info *b)
{
  if (a->base != b->base
      && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
    return false;

  if (a->terms.length () != b->terms.length ())
    return false;

  for (unsigned int i = 0; i < a->terms.length (); ++i)
    if (a->terms[i].expr != b->terms[i].expr
	|| a->terms[i].multiplier != b->terms[i].multiplier)
      return false;

  return true;
}

/* Return true if we want to version the loop, i.e. if we have a
   specific reason for doing so and no specific reason not to.  */

bool
loop_info::worth_versioning_p () const
{
  return (!rejected_p
	  && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
}

loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
{
}

/* Process BB before processing the blocks it dominates.  */

edge
loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
{
  m_range_analyzer.enter (bb);

  if (bb == bb->loop_father->header)
    m_lv.prune_loop_conditions (bb->loop_father, &m_range_analyzer);

  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
       gsi_next (&si))
    m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);

  return NULL;
}

/* Process BB after processing the blocks it dominates.  */

void
loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
{
  m_range_analyzer.leave (bb);
}

/* Decide whether to replace VAL with a new value in a versioned loop.
   Return the new value if so, otherwise return null.  */

tree
loop_versioning::name_prop::value_of_expr (tree val, gimple *)
{
  if (TREE_CODE (val) == SSA_NAME
      && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
    return build_one_cst (TREE_TYPE (val));
  return NULL_TREE;
}
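
/* A sketch of the effect: if a loop has been versioned for
   stride_7 == 1, each use of stride_7 in the optimized copy folds to
   constant 1, so substitute_and_fold can simplify a statement such as
   _5 = i_3 * stride_7 to _5 = i_3.  */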

/* Initialize the structure to optimize FN.  */

loop_versioning::loop_versioning (function *fn)
  : m_fn (fn),
    m_nloops (number_of_loops (fn)),
    m_num_conditions (0),
    m_address_table (31)
{
  bitmap_obstack_initialize (&m_bitmap_obstack);
  gcc_obstack_init (&m_obstack);

  /* Initialize the loop information.  */
  m_loops.safe_grow_cleared (m_nloops, true);
  for (unsigned int i = 0; i < m_nloops; ++i)
    {
      m_loops[i].outermost = get_loop (m_fn, 0);
      bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
    }

  /* Initialize the list of blocks that belong to each loop.  */
  unsigned int nbbs = last_basic_block_for_fn (fn);
  m_next_block_in_loop.safe_grow (nbbs, true);
  basic_block bb;
  FOR_EACH_BB_FN (bb, fn)
    {
      loop_info &li = get_loop_info (bb->loop_father);
      m_next_block_in_loop[bb->index] = li.block_list;
      li.block_list = bb;
    }

  /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
     unvectorizable code, since it is the largest size that can be
     handled efficiently by scalar code.  omp_max_vf calculates the
     maximum number of bytes in a vector, when such a value is relevant
     to loop optimization.  */
  m_maximum_scale = estimated_poly_value (omp_max_vf ());
  m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
}

loop_versioning::~loop_versioning ()
{
  bitmap_obstack_release (&m_bitmap_obstack);
  obstack_free (&m_obstack, NULL);
}

/* Return the maximum number of instructions allowed in LOOP before
   it becomes too big for versioning.

   There are separate limits for inner and outer loops.  The limit for
   inner loops applies only to loops that benefit directly from versioning.
   The limit for outer loops applies to all code in the outer loop and
   its subloops that *doesn't* benefit directly from versioning; such code
   would be "taken along for the ride".  The idea is that if the cost of
   the latter is small, it is better to version outer loops rather than
   inner loops, both to reduce the number of repeated checks and to enable
   more of the loop nest to be optimized as a natural nest (e.g. by loop
   interchange or outer-loop vectorization).  */

unsigned int
loop_versioning::max_insns_for_loop (class loop *loop)
{
  return (loop->inner
	  ? param_loop_versioning_max_outer_insns
	  : param_loop_versioning_max_inner_insns);
}

/* Return true if for cost reasons we should avoid versioning any loop
   that contains STMT.

   Note that we don't need to check whether versioning is invalid for
   correctness reasons, since the versioning process does that for us.
   The conditions involved are too rare to be worth duplicating here.  */

bool
loop_versioning::expensive_stmt_p (gimple *stmt)
{
  if (gcall *call = dyn_cast <gcall *> (stmt))
    /* Assume for now that the time spent in an "expensive" call would
       overwhelm any saving from versioning.  */
    return !gimple_inexpensive_call_p (call);
  return false;
}

/* Record that we want to version the loop that contains STMT for the
   case in which SSA name NAME is equal to 1.  We already know that NAME
   is invariant in the loop.  */

void
loop_versioning::version_for_unity (gimple *stmt, tree name)
{
  class loop *loop = loop_containing_stmt (stmt);
  loop_info &li = get_loop_info (loop);

  if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
    {
      /* This is the first time we've wanted to version LOOP for NAME.
	 Keep track of the outermost loop that can handle all versioning
	 checks in LI.  */
      class loop *outermost
	= outermost_invariant_loop_for_expr (loop, name);
      if (loop_depth (li.outermost) < loop_depth (outermost))
	li.outermost = outermost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
			   " for when %T == 1", name);
	  if (outermost == loop)
	    dump_printf (MSG_NOTE, "; cannot hoist check further");
	  else
	    {
	      dump_printf (MSG_NOTE, "; could implement the check at loop"
			   " depth %d", loop_depth (outermost));
	      if (loop_depth (li.outermost) > loop_depth (outermost))
		dump_printf (MSG_NOTE, ", but other checks only allow"
			     " a depth of %d", loop_depth (li.outermost));
	    }
	  dump_printf (MSG_NOTE, "\n");
	}

      m_num_conditions += 1;
    }
  else
    {
      /* This is a duplicate request.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
			 " loop for when %T == 1\n", name);
    }
}

/* Return true if OP1_TREE is constant and if in principle it is worth
   versioning an address fragment of the form:

     i * OP1_TREE * OP2 * stride

   for the case in which stride == 1.  This in practice means testing
   whether:

     OP1_TREE * OP2 <= M_MAXIMUM_SCALE.

   If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */

bool
loop_versioning::acceptable_multiplier_p (tree op1_tree,
					  unsigned HOST_WIDE_INT op2,
					  unsigned HOST_WIDE_INT *result)
{
  if (tree_fits_uhwi_p (op1_tree))
    {
      unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
      /* The first part checks for overflow.  */
      if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
	{
	  if (result)
	    *result = op1 * op2;
	  return true;
	}
    }
  return false;
}
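
/* For example (hypothetical values): with M_MAXIMUM_SCALE == 16, an
   OP1_TREE of 4 combined with an OP2 of 2 gives 4 * 2 == 8 <= 16 and
   so is acceptable, whereas an OP1_TREE of 32 is not.  The
   "op1 * op2 >= op2" test above rejects products that wrap around in
   the unsigned multiplication.  */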

/* Return true if it is worth using loop versioning on a memory access
   of type TYPE.  Store the size of the access in *SIZE if so.  */

bool
loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
{
  return (TYPE_SIZE_UNIT (type)
	  && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
}

/* See whether OP is constant and whether we can multiply TERM by that
   constant without exceeding M_MAXIMUM_SCALE.  Return true and update
   TERM if so.  */

bool
loop_versioning::multiply_term_by (address_term_info &term, tree op)
{
  return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
}

/* Decide whether an address fragment of the form STRIDE * MULTIPLIER
   is likely to be indexing an innermost dimension, returning the result
   as an INNER_* probability.  */

inner_likelihood
loop_versioning::get_inner_likelihood (tree stride,
				       unsigned HOST_WIDE_INT multiplier)
{
  const unsigned int MAX_NITERS = 8;

  /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
     least one of those values is likely to be for the innermost dimension.
     Record in UNLIKELY_P if at least one of those values is unlikely to be
     for the innermost dimension.

     E.g. for:

       stride = cond ? a * b : 1

     we should treat STRIDE as being a likely inner dimension, since
     we know that it is 1 under at least some circumstances.  (See the
     Fortran example below.)  However:

       stride = a * b

     on its own is unlikely to be for the innermost dimension, since
     that would require both a and b to be 1 at runtime.  */
  bool unlikely_p = false;
  tree worklist[MAX_NITERS];
  unsigned int length = 0;
  worklist[length++] = stride;
  for (unsigned int i = 0; i < length; ++i)
    {
      tree expr = worklist[i];

      if (CONSTANT_CLASS_P (expr))
	{
	  /* See if EXPR * MULTIPLIER would be consistent with an individual
	     access or a small grouped access.  */
	  if (acceptable_multiplier_p (expr, multiplier))
	    return INNER_LIKELY;
	  else
	    unlikely_p = true;
	}
      else if (gimple *stmt = maybe_get_stmt (expr))
	{
	  /* If EXPR is set by a PHI node, queue its arguments in case
	     we find one that is consistent with an inner dimension.

	     An important instance of this is the Fortran handling of array
	     descriptors, which calculates the stride of the inner dimension
	     using a PHI equivalent of:

		raw_stride = a.dim[0].stride;
		stride = raw_stride != 0 ? raw_stride : 1;

	     (Strides for outer dimensions do not treat 0 specially.)  */
	  if (gphi *phi = dyn_cast <gphi *> (stmt))
	    {
	      unsigned int nargs = gimple_phi_num_args (phi);
	      for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
		worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
	    }
	  /* If the value is set by an assignment, expect it to be read
	     from memory (such as an array descriptor) rather than be
	     calculated.  */
	  else if (gassign *assign = dyn_cast <gassign *> (stmt))
	    {
	      if (!gimple_assign_load_p (assign))
		unlikely_p = true;
	    }
	  /* Things like calls don't really tell us anything.  */
	}
    }

  /* We didn't find any possible values of STRIDE that were likely to be
     for the innermost dimension.  If we found one that was actively
     unlikely to be for the innermost dimension, assume that that applies
     to STRIDE too.  */
  return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
}

/* Dump the likelihood that TERM's stride is for the innermost dimension.
   ADDRESS is the address that contains TERM.  */

void
loop_versioning::dump_inner_likelihood (address_info &address,
					address_term_info &term)
{
  if (term.inner_likelihood == INNER_LIKELY)
    dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
		     " innermost dimension\n", term.stride);
  else if (term.inner_likelihood == INNER_UNLIKELY)
    dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
		     " innermost dimension\n", term.stride);
  else
    dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
		     " is the innermost dimension\n", term.stride);
}

/* The caller has identified that STRIDE is the stride of interest
   in TERM, and that the stride is applied in OP_LOOP.  Record this
   information in TERM, deciding whether STRIDE is likely to be for
   the innermost dimension of an array and whether it represents a
   versioning opportunity.  ADDRESS is the address that contains TERM.  */

void
loop_versioning::analyze_stride (address_info &address,
				 address_term_info &term,
				 tree stride, class loop *op_loop)
{
  term.stride = stride;

  term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
  if (dump_enabled_p ())
    dump_inner_likelihood (address, term);

  /* To be a versioning opportunity we require:

     - The multiplier applied by TERM is equal to the access size,
       so that when STRIDE is 1, the accesses in successive loop
       iterations are consecutive.

       This is deliberately conservative.  We could relax it to handle
       other cases (such as those with gaps between iterations) if we
       find any real testcases for which it's useful.

     - the stride is applied in the same loop as STMT rather than
       in an outer loop.  Although versioning for strides applied in
       outer loops could help in some cases -- such as enabling
       more loop interchange -- the savings are much lower than for
       inner loops.

     - the stride is an SSA name that is invariant in STMT's loop,
       since otherwise versioning isn't possible.  */
  unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
  if (term.multiplier == access_size
      && address.loop == op_loop
      && TREE_CODE (stride) == SSA_NAME
      && expr_invariant_in_loop_p (address.loop, stride))
    {
      term.versioning_opportunity_p = true;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
			 " opportunity\n", stride);
    }
}
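
/* An illustrative example: for an 8-byte access a[i * stride] whose
   term carries MULTIPLIER 8, the multiplier matches the access size,
   so stride == 1 makes successive iterations access consecutive bytes
   and STRIDE == 1 is recorded as a versioning opportunity.  With a
   multiplier of 16, the accesses would have gaps even when stride == 1,
   so nothing would be recorded.  */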

/* See whether address term TERM (which belongs to ADDRESS) is the result
   of multiplying a varying SSA name by a loop-invariant SSA name.
   Return true and update TERM if so.

   This handles both cases that SCEV might handle, such as:

     for (int i = 0; i < n; ++i)
       res += a[i * stride];

   and ones in which the term varies arbitrarily between iterations, such as:

     for (int i = 0; i < n; ++i)
       res += a[index[i] * stride];  */

bool
loop_versioning::find_per_loop_multiplication (address_info &address,
					       address_term_info &term)
{
  gassign *mult = maybe_get_assign (term.expr);
  if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
    return false;

  class loop *mult_loop = loop_containing_stmt (mult);
  if (!loop_outer (mult_loop))
    return false;

  tree op1 = strip_casts (gimple_assign_rhs1 (mult));
  tree op2 = strip_casts (gimple_assign_rhs2 (mult));
  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
    return false;

  bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
  bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
  if (invariant1_p == invariant2_p)
    return false;

  /* Make sure that the loop invariant is OP2 rather than OP1.  */
  if (invariant1_p)
    std::swap (op1, op2);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
		     " * loop-invariant %T\n", term.expr, op1, op2);
  analyze_stride (address, term, op2, mult_loop);
  return true;
}

/* Try to use scalar evolutions to find an address stride for TERM,
   which belongs to ADDRESS.  Return true and update TERM if so.

   Here we are interested in any evolution information we can find,
   not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
   an outer loop obviously iterates over the inner dimension of an array,
   that information can help us eliminate worthless versioning opportunities
   in inner loops.  */

bool
loop_versioning::analyze_term_using_scevs (address_info &address,
					   address_term_info &term)
{
  gimple *setter = maybe_get_stmt (term.expr);
  if (!setter)
    return false;

  class loop *wrt_loop = loop_containing_stmt (setter);
  if (!loop_outer (wrt_loop))
    return false;

  tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
  if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt,
			 "address term %T = %T\n", term.expr, chrec);

      /* Peel casts and accumulate constant multiplications, up to the
	 limit allowed by M_MAXIMUM_SCALE.  */
      tree stride = strip_casts (CHREC_RIGHT (chrec));
      while (TREE_CODE (stride) == MULT_EXPR
	     && multiply_term_by (term, TREE_OPERAND (stride, 1)))
	stride = strip_casts (TREE_OPERAND (stride, 0));

      gassign *assign;
      while ((assign = maybe_get_assign (stride))
	     && gimple_assign_rhs_code (assign) == MULT_EXPR
	     && multiply_term_by (term, gimple_assign_rhs2 (assign)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, address.stmt,
			     "looking through %G", assign);
	  stride = strip_casts (gimple_assign_rhs1 (assign));
	}

      analyze_stride (address, term, stride, get_chrec_loop (chrec));
      return true;
    }

  return false;
}
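
/* An illustrative example: in

     for (int i = 0; i < n; ++i)
       res += a[i * stride];

   the index term typically has an evolution of the form {0, +, stride},
   so CHREC_RIGHT yields STRIDE directly.  If the step were instead
   something like stride * 4, the loops above would fold the constant 4
   into TERM's multiplier before passing STRIDE to analyze_stride.  */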

/* Address term TERM is an arbitrary term that provides no versioning
   opportunities.  Analyze it to see whether it contains any likely
   inner strides, so that we don't mistakenly version for other
   (less likely) candidates.

   This copes with invariant innermost indices such as:

     x(i, :) = 100

   where the "i" component of the address is invariant in the loop
   but provides the real inner stride.

   ADDRESS is the address that contains TERM.  */

void
loop_versioning::analyze_arbitrary_term (address_info &address,
					 address_term_info &term)
{
  /* A multiplication offers two potential strides.  Pick the one that
     is most likely to be an innermost stride.  */
  tree expr = term.expr, alt = NULL_TREE;
  gassign *mult = maybe_get_assign (expr);
  if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
    {
      expr = strip_casts (gimple_assign_rhs1 (mult));
      alt = strip_casts (gimple_assign_rhs2 (mult));
    }
  term.stride = expr;
  term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
  if (alt)
    {
      inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
      if (alt_l > term.inner_likelihood)
	{
	  term.stride = alt;
	  term.inner_likelihood = alt_l;
	}
    }
  if (dump_enabled_p ())
    dump_inner_likelihood (address, term);
}

/* Try to identify loop strides in ADDRESS and try to choose realistic
   versioning opportunities based on these strides.

   The main difficulty here isn't finding strides that could be used
   in a version check (that's pretty easy).  The problem instead is to
   avoid versioning for some stride S that is unlikely ever to be 1 at
   runtime.  Versioning for S == 1 on its own would lead to unnecessary
   code bloat, while adding S == 1 to more realistic version conditions
   would lose the optimisation opportunity offered by those other conditions.

   For example, versioning for a stride of 1 in the Fortran code:

     integer :: a(:,:)
     a(1,:) = 1

   is not usually a good idea, since the assignment is iterating over
   an outer dimension and is relatively unlikely to have a stride of 1.
   (It isn't impossible, since the inner dimension might be 1, or the
   array might be transposed.)  Similarly, in:

     integer :: a(:,:), b(:,:)
     b(:,1) = a(1,:)

   b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
   Versioning for when both strides are 1 would lose most of the benefit
   of versioning for b's access.

   The approach we take is as follows:

   - Analyze each term to see whether it has an identifiable stride,
     regardless of which loop applies the stride.

   - Evaluate the likelihood that each such stride is for the innermost
     dimension of an array, on the scale "likely", "don't know" or "unlikely".

   - If there is a single "likely" innermost stride, and that stride is
     applied in the loop that contains STMT, version the loop for when the
     stride is 1.  This deals with the cases in which we're fairly
     confident of doing the right thing, such as the b(:,1) reference above.

   - If there are no "likely" innermost strides, and the loop that contains
     STMT uses a stride that we rated as "don't know", version for when
     that stride is 1.  This is principally used for C code such as:

       for (int i = 0; i < n; ++i)
	 a[i * x] = ...;

     and:

       for (int j = 0; j < n; ++j)
	 for (int i = 0; i < n; ++i)
	   a[i * x + j * y] = ...;

     where nothing in the way "x" and "y" are set gives a hint as to
     whether "i" iterates over the innermost dimension of the array.
     In these situations it seems reasonable to assume the
     programmer has nested the loops appropriately (although of course
     there are examples like GEMM in which this assumption doesn't hold
     for all accesses in the loop).

     This case is also useful for the Fortran equivalent of the
     above C code.  */

void
loop_versioning::analyze_address_fragment (address_info &address)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
      dump_address_info (MSG_NOTE, address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Analyze each component of the sum to see whether it involves an
     apparent stride.

     There is an overlap between the addresses that
     find_per_loop_multiplication and analyze_term_using_scevs can handle,
     but the former is much cheaper than SCEV analysis, so try it first.  */
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (!find_per_loop_multiplication (address, address.terms[i])
	&& !analyze_term_using_scevs (address, address.terms[i])
	&& !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
      analyze_arbitrary_term (address, address.terms[i]);

  /* Check for strides that are likely to be for the innermost dimension.

     1. If there is a single likely inner stride, if it is an SSA name,
	and if it is worth versioning the loop for when the SSA name
	equals 1, record that we want to do so.

     2. Otherwise, if there are any likely inner strides, bail out.
	This means one of:

	(a) There are multiple likely inner strides.  This suggests we're
	    confused and can't be confident of doing the right thing.

	(b) There is a single likely inner stride and it is a constant
	    rather than an SSA name.  This can mean either that the access
	    is a natural one without any variable strides, such as:

	      for (int i = 0; i < n; ++i)
		a[i] += 1;

	    or that a variable stride is applied to an outer dimension,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j * stride][i] += 1;

	(c) There is a single likely inner stride, and it is an SSA name,
	    but it isn't a worthwhile versioning opportunity.  This usually
	    means that the variable stride is applied by an outer loop,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j][i * stride] += 1;

	    or (using an example with a more natural loop nesting):

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[i][j] += b[i * stride];

	    in cases where b[i * stride] cannot (yet) be hoisted for
	    aliasing reasons.

     3. If there are no likely inner strides, fall through to the next
	set of checks.

     Pointer equality is enough to check for uniqueness in (1), since we
     only care about SSA names.  */
  tree chosen_stride = NULL_TREE;
  tree version_stride = NULL_TREE;
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (chosen_stride != address.terms[i].stride
	&& address.terms[i].inner_likelihood == INNER_LIKELY)
      {
	if (chosen_stride)
	  return;
	chosen_stride = address.terms[i].stride;
	if (address.terms[i].versioning_opportunity_p)
	  version_stride = chosen_stride;
      }

  /* If there are no likely inner strides, see if there is a single
     versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
     See the comment above the function for the cases that this code
     handles.  */
  if (!chosen_stride)
    for (unsigned int i = 0; i < address.terms.length (); ++i)
      if (version_stride != address.terms[i].stride
	  && address.terms[i].inner_likelihood == INNER_DONT_KNOW
	  && address.terms[i].versioning_opportunity_p)
	{
	  if (version_stride)
	    return;
	  version_stride = address.terms[i].stride;
	}

  if (version_stride)
    version_for_unity (address.stmt, version_stride);
}

/* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
   TYPE_SIZE bytes and record this address fragment for later processing.
   STMT is the statement that contains the address.  */

void
loop_versioning::record_address_fragment (gimple *stmt,
					  unsigned HOST_WIDE_INT type_size,
					  tree expr,
					  unsigned HOST_WIDE_INT multiplier,
					  HOST_WIDE_INT offset)
{
  /* We're only interested in computed values.  */
  if (TREE_CODE (expr) != SSA_NAME)
    return;

  /* Quick exit if no part of the address is calculated in STMT's loop,
     since such addresses have no versioning opportunities.  */
  class loop *loop = loop_containing_stmt (stmt);
  if (expr_invariant_in_loop_p (loop, expr))
    return;

  /* Set up an address_info for EXPR * MULTIPLIER.  */
  address_info *address = XOBNEW (&m_obstack, address_info);
  new (address) address_info;
  address->stmt = stmt;
  address->loop = loop;
  address->base = NULL_TREE;
  address->terms.quick_grow (1);
  address->terms[0].expr = expr;
  address->terms[0].multiplier = multiplier;
  address->terms[0].stride = NULL_TREE;
  address->terms[0].inner_likelihood = INNER_UNLIKELY;
  address->terms[0].versioning_opportunity_p = false;
  address->min_offset = offset;

  /* Peel apart the expression into a sum of address_terms, where each
     term is multiplied by a constant.  Treat a + b and a - b the same,
     since it doesn't matter for our purposes whether an address is
     increasing or decreasing.  Distribute (a + b) * constant into
     a * constant + b * constant.

     We don't care which loop each term belongs to, since we want to
     examine as many candidate strides as possible when determining
     which is likely to be for the innermost dimension.  We therefore
     don't limit the search to statements in STMT's loop.  */
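
  /* A hypothetical example: if EXPR is _6 in:

       _4 = i_1 * stride_2;
       _5 = j_3 * 8;
       _6 = _4 + _5;

     the single starting term _6 * MULTIPLIER is split into
     _4 * MULTIPLIER and _5 * MULTIPLIER; the latter then folds to
     j_3 * (MULTIPLIER * 8) via multiply_term_by, while _4 is left
     for the stride analysis in analyze_address_fragment.  */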
  for (unsigned int i = 0; i < address->terms.length (); )
    {
      if (gassign *assign = maybe_get_assign (address->terms[i].expr))
	{
	  tree_code code = gimple_assign_rhs_code (assign);
	  if (code == PLUS_EXPR
	      || code == POINTER_PLUS_EXPR
	      || code == MINUS_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (TREE_CODE (op2) == INTEGER_CST)
		{
		  address->terms[i].expr = strip_casts (op1);
		  /* This is heuristic only, so don't worry about truncation
		     or overflow.  */
		  address->min_offset += (TREE_INT_CST_LOW (op2)
					  * address->terms[i].multiplier);
		  continue;
		}
	      else if (address->terms.length () < address_info::MAX_TERMS)
		{
		  unsigned int j = address->terms.length ();
		  address->terms.quick_push (address->terms[i]);
		  address->terms[i].expr = strip_casts (op1);
		  address->terms[j].expr = strip_casts (op2);
		  continue;
		}
	    }
	  if (code == MULT_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (multiply_term_by (address->terms[i], op2))
		{
		  address->terms[i].expr = strip_casts (op1);
		  continue;
		}
	    }
	  if (CONVERT_EXPR_CODE_P (code))
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      address->terms[i].expr = strip_casts (op1);
	      continue;
	    }
	}
      i += 1;
    }

  /* Peel off any symbolic pointer.  */
  if (TREE_CODE (address->terms[0].expr) != SSA_NAME
      && address->terms[0].multiplier == 1)
    {
      if (address->terms.length () == 1)
	{
	  obstack_free (&m_obstack, address);
	  return;
	}
      address->base = address->terms[0].expr;
      address->terms.ordered_remove (0);
    }

  /* Require all remaining terms to be SSA names.  (This could be false
     for unfolded statements, but they aren't worth dealing with.)  */
  for (unsigned int i = 0; i < address->terms.length (); ++i)
    if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
      {
	obstack_free (&m_obstack, address);
	return;
      }

  /* The loop above set MIN_OFFSET based on the first byte of the
     referenced data.  Calculate the end + 1.  */
  address->max_offset = address->min_offset + type_size;

  /* Put the terms into a canonical order for the hash table lookup below.  */
  address->terms.qsort (compare_address_terms);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
      if (multiplier != 1)
	dump_printf (MSG_NOTE, " * %wd", multiplier);
      dump_printf (MSG_NOTE, " = ");
      dump_address_info (MSG_NOTE, *address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Pool address information with the same terms (but potentially
     different offsets).  */
  address_info **slot = m_address_table.find_slot (address, INSERT);
  if (address_info *old_address = *slot)
    {
      /* We've already seen an address with the same terms.  Extend the
	 offset range to account for this access.  Doing this can paper
	 over gaps, such as in:

	   a[i * stride * 4] + a[i * stride * 4 + 3];

	 where nothing references "+ 1" or "+ 2".  However, the vectorizer
	 handles such gapped accesses without problems, so it's not worth
	 trying to exclude them.  */
      if (old_address->min_offset > address->min_offset)
	old_address->min_offset = address->min_offset;
      if (old_address->max_offset < address->max_offset)
	old_address->max_offset = address->max_offset;
      obstack_free (&m_obstack, address);
    }
  else
    {
      /* This is the first time we've seen an address with these terms.  */
      *slot = address;
      m_address_list.safe_push (address);
    }
}

/* Analyze expression EXPR, which occurs in STMT.  */

void
loop_versioning::analyze_expr (gimple *stmt, tree expr)
{
  unsigned HOST_WIDE_INT type_size;

  while (handled_component_p (expr))
    {
      /* See whether we can use versioning to avoid a multiplication
	 in an array index.  */
      if (TREE_CODE (expr) == ARRAY_REF
	  && acceptable_type_p (TREE_TYPE (expr), &type_size))
	record_address_fragment (stmt, type_size,
				 TREE_OPERAND (expr, 1), type_size, 0);
      expr = TREE_OPERAND (expr, 0);
    }

  /* See whether we can use versioning to avoid a multiplication
     in the pointer calculation of a MEM_REF.  */
  if (TREE_CODE (expr) == MEM_REF
      && acceptable_type_p (TREE_TYPE (expr), &type_size))
    record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
			     /* This is heuristic only, so don't worry
				about truncation or overflow.  */
			     TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));

  /* These would be easy to handle if they existed at this stage.  */
  gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
}

/* Analyze all the statements in BB looking for useful version checks.
   Return true on success, false if something prevents the block from
   being versioned.  */

bool
loop_versioning::analyze_block (basic_block bb)
{
  class loop *loop = bb->loop_father;
  loop_info &li = get_loop_info (loop);
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      if (is_gimple_debug (stmt))
	continue;

      if (expensive_stmt_p (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
			     " prevents versioning: %G", stmt);
	  return false;
	}

      /* Only look for direct versioning opportunities in inner loops
	 since the benefit tends to be much smaller for outer loops.  */
      if (!loop->inner)
	{
	  unsigned int nops = gimple_num_ops (stmt);
	  for (unsigned int i = 0; i < nops; ++i)
	    if (tree op = gimple_op (stmt, i))
	      analyze_expr (stmt, op);
	}

      /* The point of the instruction limit is to prevent excessive
	 code growth, so this is a size-based estimate even though
	 the optimization is aimed at speed.  */
      li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
    }

  return true;
}

/* Analyze all the blocks in the function, looking for useful version checks.
   Return true if we found one.  */

bool
loop_versioning::analyze_blocks ()
{
  AUTO_DUMP_SCOPE ("analyze_blocks",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  /* For now we don't try to version the whole function, although
     versioning at that level could be useful in some cases.  */
  get_loop_info (get_loop (m_fn, 0)).rejected_p = true;

  class loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);

      /* Ignore cold loops.  */
      if (!optimize_loop_for_speed_p (loop))
	linfo.rejected_p = true;

      /* See whether an inner loop prevents versioning of this loop.  */
      if (!linfo.rejected_p)
	for (class loop *inner = loop->inner; inner; inner = inner->next)
	  if (get_loop_info (inner).rejected_p)
	    {
	      linfo.rejected_p = true;
	      break;
	    }

      /* If versioning the loop is still a possibility, examine the
	 statements in the loop to look for versioning opportunities.  */
      if (!linfo.rejected_p)
	{
	  void *start_point = obstack_alloc (&m_obstack, 0);

	  for (basic_block bb = linfo.block_list; bb;
	       bb = m_next_block_in_loop[bb->index])
	    if (!analyze_block (bb))
	      {
		linfo.rejected_p = true;
		break;
	      }

	  if (!linfo.rejected_p)
	    {
	      /* Process any queued address fragments, now that we have
		 complete grouping information.  */
	      address_info *address;
	      unsigned int i;
	      FOR_EACH_VEC_ELT (m_address_list, i, address)
		analyze_address_fragment (*address);
	    }

	  m_address_table.empty ();
	  m_address_list.truncate (0);
	  obstack_free (&m_obstack, start_point);
	}
    }

  return m_num_conditions != 0;
}

/* Use the ranges in VRS to remove impossible versioning conditions from
   LOOP.  */

void
loop_versioning::prune_loop_conditions (class loop *loop, vr_values *vrs)
{
  loop_info &li = get_loop_info (loop);

  int to_remove = -1;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      const value_range_equiv *vr = vrs->get_value_range (name);
      if (vr && !vr->may_contain_p (build_one_cst (TREE_TYPE (name))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "%T can never be 1 in this loop\n", name);

	  if (to_remove >= 0)
	    bitmap_clear_bit (&li.unity_names, to_remove);
	  to_remove = i;
	  m_num_conditions -= 1;
	}
    }
  if (to_remove >= 0)
    bitmap_clear_bit (&li.unity_names, to_remove);
}
1512 
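/* A hypothetical example of the pruning above: if the loop is only
   reachable via a guard such as

     if (stride >= 2)
       for (int i = 0; i < n; ++i)
	 ...x[i * stride]...

   then the recorded range for STRIDE within the loop is [2, +INF],
   may_contain_p (1) is false, and the scheduled stride == 1 check is
   dropped as dead before any code is duplicated.  */
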
/* Remove any scheduled loop version conditions that will never be true.
   Return true if any remain.  */

bool
loop_versioning::prune_conditions ()
{
  AUTO_DUMP_SCOPE ("prune_loop_conditions",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  calculate_dominance_info (CDI_DOMINATORS);
  lv_dom_walker dom_walker (*this);
  dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
  return m_num_conditions != 0;
}

/* Merge the version checks for INNER into immediately-enclosing loop
   OUTER.  */

void
loop_versioning::merge_loop_info (class loop *outer, class loop *inner)
{
  loop_info &inner_li = get_loop_info (inner);
  loop_info &outer_li = get_loop_info (outer);

  if (dump_enabled_p ())
    {
      bitmap_iterator bi;
      unsigned int i;
      EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
	if (!bitmap_bit_p (&outer_li.unity_names, i))
	  dump_printf_loc (MSG_NOTE, find_loop_location (inner),
			   "hoisting check that %T == 1 to outer loop\n",
			   ssa_name (i));
    }

  bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
  if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
    outer_li.outermost = inner_li.outermost;
}

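/* For instance, in a nest like

     for (int j = 0; j < m; ++j)       [outer]
       for (int i = 0; i < n; ++i)     [inner]
	 ...x[j][i * stride]...

   the stride == 1 check collected for the inner loop is merged into the
   outer loop's unity_names, so that a single guard can version the
   whole nest instead of re-testing the invariant on every outer
   iteration.  (A sketch only; the real bookkeeping is the bitmap union
   above.)  */
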
/* Add LOOP to the queue of loops to version.  */

void
loop_versioning::add_loop_to_queue (class loop *loop)
{
  loop_info &li = get_loop_info (loop);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
		     "queuing this loop for versioning\n");
  m_loops_to_version.safe_push (loop);

  /* Don't try to version superloops.  */
  li.rejected_p = true;
}

/* Decide whether the cost model would allow us to version LOOP,
   either directly or as part of a parent loop, and return true if so.
   This does not imply that the loop is actually worth versioning in its
   own right, just that it would be valid to version it if something
   benefited.

   We have already made this decision for all inner loops of LOOP.  */

bool
loop_versioning::decide_whether_loop_is_versionable (class loop *loop)
{
  loop_info &li = get_loop_info (loop);

  if (li.rejected_p)
    return false;

  /* Examine the decisions made for inner loops.  */
  for (class loop *inner = loop->inner; inner; inner = inner->next)
    {
      loop_info &inner_li = get_loop_info (inner);
      if (inner_li.rejected_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "not versioning this loop because one of its"
			     " inner loops should not be versioned\n");
	  return false;
	}

      if (inner_li.worth_versioning_p ())
	li.subloops_benefit_p = true;

      /* Accumulate the number of instructions from subloops that are not
	 the innermost, or that don't benefit from versioning.  Only the
	 instructions from innermost loops that benefit from versioning
	 should be weighed against loop-versioning-max-inner-insns;
	 everything else should be weighed against
	 loop-versioning-max-outer-insns.  */
      if (!inner_li.worth_versioning_p () || inner->inner)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "counting %d instructions from this loop"
			     " against its parent loop\n", inner_li.num_insns);
	  li.num_insns += inner_li.num_insns;
	}
    }

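  /* A hypothetical example of the accounting above: in

       for (j ...)			[outer]
	 {
	   a = ...;			counted against
					loop-versioning-max-outer-insns
	   for (i ...)			[innermost, benefits from versioning]
	     b[i * stride] = ...;	counted against
					loop-versioning-max-inner-insns
	 }

     only the outer-loop statements, and any subloops that are
     non-innermost or don't benefit, are added to the outer loop's
     num_insns here.  */
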
  /* Enforce the size limits.  */
  if (li.worth_versioning_p ())
    {
      unsigned int max_num_insns = max_insns_for_loop (loop);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			 "this loop has %d instructions, against"
			 " a versioning limit of %d\n",
			 li.num_insns, max_num_insns);
      if (li.num_insns > max_num_insns)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION
			     | MSG_PRIORITY_USER_FACING,
			     find_loop_location (loop),
			     "this loop is too big to version\n");
	  return false;
	}
    }

  /* Hoist all version checks from subloops to this loop.  */
  for (class loop *subloop = loop->inner; subloop; subloop = subloop->next)
    merge_loop_info (loop, subloop);

  return true;
}

/* Decide which loops to version and add them to the versioning queue.
   Return true if there are any loops to version.  */

bool
loop_versioning::make_versioning_decisions ()
{
  AUTO_DUMP_SCOPE ("make_versioning_decisions",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  class loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);
      if (decide_whether_loop_is_versionable (loop))
	{
	  /* Commit to versioning LOOP directly if we can't hoist the
	     version checks any further.  */
	  if (linfo.worth_versioning_p ()
	      && (loop_depth (loop) == 1 || linfo.outermost == loop))
	    add_loop_to_queue (loop);
	}
      else
	{
	  /* We can't version this loop, so individually version any
	     subloops that would benefit and haven't been versioned yet.  */
	  linfo.rejected_p = true;
	  for (class loop *subloop = loop->inner; subloop;
	       subloop = subloop->next)
	    if (get_loop_info (subloop).worth_versioning_p ())
	      add_loop_to_queue (subloop);
	}
    }

  return !m_loops_to_version.is_empty ();
}

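/* For example (hypothetically), if an outer loop is rejected because it
   is cold or too big, the walk above still queues each profitable inner
   loop individually, so

     for (j ...)		-- rejected
       for (i ...)		-- worth versioning
	 ...

   ends up versioning just the inner loop rather than losing the
   opportunity altogether.  */
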
/* Attempt to implement loop versioning for LOOP, using the information
   cached in the associated loop_info.  Return true on success.  */

bool
loop_versioning::version_loop (class loop *loop)
{
  loop_info &li = get_loop_info (loop);

  /* Build up a condition that selects the original loop instead of
     the simplified loop.  */
  tree cond = boolean_false_node;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
				 build_one_cst (TREE_TYPE (name)));
      cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
    }

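  /* For two queued names S1 and S2, the result is equivalent to
     S1 != 1 || S2 != 1: fold_build2 folds away the boolean_false_node
     seed, so no degenerate "false || ..." term survives.  */
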
  /* Convert the condition into a suitable gcond.  */
  gimple_seq stmts = NULL;
  cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);

  /* Version the loop.  */
  initialize_original_copy_tables ();
  basic_block cond_bb;
  li.optimized_loop = loop_version (loop, cond, &cond_bb,
				    profile_probability::unlikely (),
				    profile_probability::likely (),
				    profile_probability::unlikely (),
				    profile_probability::likely (), true);
  free_original_copy_tables ();
  if (!li.optimized_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
			 "tried but failed to version this loop for when"
			 " certain strides are 1\n");
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
		     "versioned this loop for when certain strides are 1\n");

  /* Insert the statements that feed COND.  */
  if (stmts)
    {
      gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }

  return true;
}

/* Attempt to version all loops in the versioning queue.  */

void
loop_versioning::implement_versioning_decisions ()
{
  /* No AUTO_DUMP_SCOPE here since all messages are top-level and
     user-facing at this point.  */

  bool any_succeeded_p = false;
  class loop *loop;
  unsigned int i;
  FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
    if (version_loop (loop))
      any_succeeded_p = true;
  if (!any_succeeded_p)
    return;

  update_ssa (TODO_update_ssa);

  /* Simplify the new loop, which is used when COND is false.  */
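  /* E.g. for a single unity name STRIDE, the versioned code has the
     shape

       if (stride != 1)
	 ... original loop, unchanged ...
       else
	 ... copy in which STRIDE is propagated as 1, so that
	     x[i * stride] folds to x[i] ...

     (illustrative only; the substitute_and_fold call below does the
     actual propagation and folding in the copy).  */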
  FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
    {
      loop_info &linfo = get_loop_info (loop);
      if (linfo.optimized_loop)
	name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
    }
}

/* Run the pass and return a set of TODO_* flags.  */

unsigned int
loop_versioning::run ()
{
  gcc_assert (scev_initialized_p ());

  if (analyze_blocks ()
      && prune_conditions ()
      && make_versioning_decisions ())
    implement_versioning_decisions ();

  return 0;
}

/* Loop versioning pass.  */

const pass_data pass_data_loop_versioning =
{
  GIMPLE_PASS, /* type */
  "lversion", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_VERSIONING, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_versioning : public gimple_opt_pass
{
public:
  pass_loop_versioning (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_versioning, ctxt)
  {}

  /* opt_pass methods: */
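  /* Gate on the -fversion-loops-for-strides flag, which
     flag_version_loops_for_strides reflects.  */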
  virtual bool gate (function *) { return flag_version_loops_for_strides; }
  virtual unsigned int execute (function *);
};

unsigned int
pass_loop_versioning::execute (function *fn)
{
  if (number_of_loops (fn) <= 1)
    return 0;

  return loop_versioning (fn).run ();
}

} // anon namespace

gimple_opt_pass *
make_pass_loop_versioning (gcc::context *ctxt)
{
  return new pass_loop_versioning (ctxt);
}