/* Array prefetching.
   Copyright (C) 2005, 2007, 2008, 2009, 2010, 2011
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "tm_p.h"
#include "basic-block.h"
#include "output.h"
#include "tree-pretty-print.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "tree-pass.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "diagnostic-core.h"
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"


/* FIXME: Needed for optabs, but this should all be moved to a TBD interface
   between the GIMPLE and RTL worlds.  */
#include "expr.h"
#include "optabs.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do so, we evaluate the reuse among the accesses and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
           location 64 iterations before it, and PREFETCH_MOD 64 (since
	   it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
           the cache line accessed by (5) is the same with probability only
	   7/32.
       (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance till the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We prefetch as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   A cost model is implemented to determine whether or not prefetching is
   profitable for a given loop.  The cost model has three heuristics:

   1. Function trip_count_to_ahead_ratio_too_small_p implements a
      heuristic that determines whether or not the loop has too few
      iterations (compared to ahead).  Prefetching is not likely to be
      beneficial if the trip count to ahead ratio is below a certain
      minimum.

   2. Function mem_ref_count_reasonable_p implements a heuristic that
      determines whether the given loop has enough CPU ops that can be
      overlapped with cache missing memory ops.  If not, the loop
      won't benefit from prefetching.  In the implementation,
      prefetching is not considered beneficial if the ratio between
      the instruction count and the mem ref count is below a certain
      minimum.

   3. Function insn_to_prefetch_ratio_too_small_p implements a
      heuristic that disables prefetching in a loop if the prefetching
      cost is above a certain limit.  The relative prefetching cost is
      estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).

   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
	 in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */
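
/* For illustration only (the ahead distance and unroll factor below are
   made-up values): for the access a[16*i] above, which has PREFETCH_MOD 4,
   the pass would emit roughly the equivalent of

      for (i = 0; i < max; i += 4)
	{
	  __builtin_prefetch (&a[16 * (i + 8)], 1, 3);
	  a[16*i] = ...;
	  a[16*(i+1)] = ...;
	  a[16*(i+2)] = ...;
	  a[16*(i+3)] = ...;
	}

   i.e. the loop is unrolled so that a single prefetch per cache line
   suffices, and the prefetched address lies the ahead distance (here 8
   original iterations) in front of the current access.  The actual
   write/locality arguments of the emitted prefetch depend on the reference
   and on the computed reuse distance.  */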

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif
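
/* For instance, the accesses (4) and (5) in the example loop at the top of
   this file fall into the same cache line with probability only 7/32, i.e.
   the miss rate is about 78%, well above the 5% implied by the default
   ACCEPTABLE_MISS_RATE of 50 per thousand, so both references keep their
   own prefetch.  */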

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16
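
/* For example, with a (hypothetical) 32 kB L1 and 512 kB L2 cache, a
   reference is treated as nontemporal if its reuse distance exceeds 512 kB,
   unless the reuse happens within 32 kB / 16 = 2 kB of accessed data.  */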

/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* It is not profitable to prefetch when the trip count is not at
   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
   For example, in a loop with a prefetch ahead distance of 10,
   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
   profitable to prefetch when the trip count is greater than or equal
   to 40.  In that case, 30 out of the 40 iterations will benefit from
   prefetching.  */

#ifndef TRIP_COUNT_TO_AHEAD_RATIO
#define TRIP_COUNT_TO_AHEAD_RATIO 4
#endif

/* A group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  tree step;			/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)

/* Do not generate a prefetch if the unroll factor is significantly less
   than what is required by the prefetch.  This is to avoid redundant
   prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
   2, prefetching requires unrolling the loop 16 times, but
   the loop is actually unrolled twice.  In this case (ratio = 8),
   prefetching is not likely to be beneficial.  */

#ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
#define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
#endif

/* Some of the prefetch computations have quadratic complexity.  We want to
   avoid huge compile times and, therefore, want to limit the amount of
   memory references per loop where we consider prefetching.  */

#ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
#define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
#endif

/* The memory reference.  */

struct mem_ref
{
  gimple stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  unsigned reuse_distance;	/* The amount of data accessed before the first
				   reuse of this value.  */
  struct mem_ref *next;		/* The next reference in the group.  */
  unsigned write_p : 1;		/* Is it a write?  */
  unsigned independent_p : 1;	/* True if the reference is independent of
				   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
  unsigned storent_p : 1;	/* True if we changed the store to a
				   nontemporal one.  */
};

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  if (cst_and_fits_in_hwi (ref->group->step))
    fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (ref->group->step));
  else
    print_generic_expr (file, ref->group->step, TDF_TREE);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if (operand_equal_p ((*groups)->step, step, 0)
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* If step is an integer constant, keep the list of groups sorted
	 by decreasing step.  */
      if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
	  && int_cst_value ((*groups)->step) < int_cst_value (step))
	break;
    }

  group = XNEW (struct mem_ref_group);
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It may not be possible for a write reference to reuse the read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = XNEW (struct mem_ref);
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->reuse_distance = 0;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;
  (*aref)->independent_p = false;
  (*aref)->storent_p = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;			/* Loop of the reference.  */
  gimple stmt;				/* Statement of the reference.  */
  tree *step;				/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = (struct ar_data *) data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT idelta = 0, imult = 1;
  affine_iv iv;

  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
		  *index, &iv, true))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);
      step = fold_build2 (MULT_EXPR, sizetype,
			  fold_convert (sizetype, step),
			  fold_convert (sizetype, stepsize));
      idelta *= imult;
    }

  if (*ar_data->step == NULL_TREE)
    *ar_data->step = step;
  else
    *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
				  fold_convert (sizetype, *ar_data->step),
				  fold_convert (sizetype, step));
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     tree *step, HOST_WIDE_INT *delta,
	     gimple stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = NULL_TREE;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.
     Also strip off the real and imaginary parts of a complex, so that
     they can have the same base.  */
  if (TREE_CODE (ref) == REALPART_EXPR
      || TREE_CODE (ref) == IMAGPART_EXPR
      || (TREE_CODE (ref) == COMPONENT_REF
          && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
    {
      if (TREE_CODE (ref) == IMAGPART_EXPR)
        *delta += int_size_in_bytes (TREE_TYPE (ref));
      ref = TREE_OPERAND (ref, 0);
    }

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}
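
/* For illustration (the array and its element type are hypothetical): for a
   reference a[i + 4] in a loop with induction variable i starting at 0 and
   stepping by 1, and a 4-byte element type, the decomposition above yields
   roughly &BASE = &a[0], STEP = 4 and DELTA = 16, i.e. the address
   &a[0] + 4 * iter + 16.  The exact trees depend on how the front end
   lowered the access.  */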

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
   reference was recorded, false otherwise.  */

static bool
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, gimple stmt)
{
  tree base, step;
  HOST_WIDE_INT delta;
  struct mem_ref_group *agrp;

  if (get_base_address (ref) == NULL)
    return false;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return false;
  /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
  if (step == NULL_TREE)
    return false;

  /* Stop if the address of BASE could not be taken.  */
  if (may_be_nonaddressable_p (base))
    return false;

  /* Limit non-constant step prefetching only to the innermost loops.  */
  if (!cst_and_fits_in_hwi (step) && loop->inner != NULL)
    return false;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);

  return true;
}

/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
   true if there are no other memory references inside the loop.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  gimple_stmt_iterator bsi;
  gimple stmt;
  tree lhs, rhs;
  struct mem_ref_group *refs = NULL;

  *no_other_refs = true;
  *ref_count = 0;

  /* Scan the loop body in order, so that the earlier references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	{
	  stmt = gsi_stmt (bsi);

	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
	    {
	      if (gimple_vuse (stmt)
		  || (is_gimple_call (stmt)
		      && !(gimple_call_flags (stmt) & ECF_CONST)))
		*no_other_refs = false;
	      continue;
	    }

	  lhs = gimple_assign_lhs (stmt);
	  rhs = gimple_assign_rhs1 (stmt);

	  if (REFERENCE_CLASS_P (rhs))
	    {
	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
							      rhs, false, stmt);
	      *ref_count += 1;
	    }
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
							      lhs, true, stmt);
	      *ref_count += 1;
	    }
	}
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step;
  bool backward;

  /* If the step size is non-constant, we cannot calculate prefetch_mod.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;

  step = int_cst_value (ref->group->step);

  backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to an invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}
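
/* E.g. for the access a[16*i] from the example loop in the initial comment,
   the step is 16 bytes; with a 64-byte PREFETCH_BLOCK this yields
   prefetch_mod = 64 / 16 = 4, i.e. one prefetch covers four consecutive
   iterations, matching the PREFETCH_MOD 4 stated there.  */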

/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Given a CACHE_LINE_SIZE and two inductive memory references
   with a common STEP greater than CACHE_LINE_SIZE and an address
   difference DELTA, compute the probability that they will fall
   in different cache lines.  Return true if the computed miss rate
   is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
   number of distinct iterations after which the pattern repeats itself.
   ALIGN_UNIT is the unit of alignment in bytes.  */

static bool
is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
			 HOST_WIDE_INT step, HOST_WIDE_INT delta,
			 unsigned HOST_WIDE_INT distinct_iters,
			 int align_unit)
{
  unsigned align, iter;
  int total_positions, miss_positions, max_allowed_miss_positions;
  int address1, address2, cache_line1, cache_line2;

  /* It always misses if delta is greater than or equal to the cache
     line size.  */
  if (delta >= (HOST_WIDE_INT) cache_line_size)
    return false;

  miss_positions = 0;
  total_positions = (cache_line_size / align_unit) * distinct_iters;
  max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;

  /* Iterate through all possible alignments of the first
     memory reference within its cache line.  */
  for (align = 0; align < cache_line_size; align += align_unit)

    /* Iterate through all distinct iterations.  */
    for (iter = 0; iter < distinct_iters; iter++)
      {
	address1 = align + step * iter;
	address2 = address1 + delta;
	cache_line1 = address1 / cache_line_size;
	cache_line2 = address2 / cache_line_size;
	if (cache_line1 != cache_line2)
	  {
	    miss_positions += 1;
	    if (miss_positions > max_allowed_miss_positions)
	      return false;
	  }
      }
  return true;
}
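
/* A small illustration (numbers not taken from any particular target): with
   a 64-byte cache line, DELTA of 32, ALIGN_UNIT of 4 and a single distinct
   iteration, the two addresses fall into different cache lines for 8 of the
   16 possible alignments, a miss rate of 50%, far above the 5% implied by
   the default ACCEPTABLE_MISS_RATE, so the function returns false and the
   caller keeps prefetching both references.  */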

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step;
  bool backward;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
  HOST_WIDE_INT reduced_step;
  unsigned HOST_WIDE_INT reduced_prefetch_block;
  tree ref_type;
  int align_unit;

  /* If the step is non-constant we cannot calculate prefetch_before.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;

  step = int_cst_value (ref->group->step);

  backward = step < 0;


  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how distant from each other the iterations in which that
     happens are.  */

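  /* For illustration, consider the accesses a[i] and a[i + 64] from the
     example loop in the initial comment, assuming a 64-byte PREFETCH_BLOCK:
     REF = a[i] has delta_r = 0, BY = a[i + 64] has delta_b = 64 and the
     common step is 1 byte, so below hit_from = 64 and prefetch_before = 64,
     matching the PREFETCH_BEFORE 64 stated there.  */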
  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      /* Do not reduce prefetch_before if we meet beyond cache size.  */
      if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
        prefetch_before = PREFETCH_ALL;
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case with step > prefetch_block.  First reduce
     the ratio between the step and the cache line size to its simplest
     terms.  The resulting denominator will then represent the number of
     distinct iterations after which each address will go back to its
     initial location within the cache line.  This computation assumes
     that PREFETCH_BLOCK is a power of two.  */
  prefetch_block = PREFETCH_BLOCK;
  reduced_prefetch_block = prefetch_block;
  reduced_step = step;
  while ((reduced_step & 1) == 0
	 && reduced_prefetch_block > 1)
    {
      reduced_step >>= 1;
      reduced_prefetch_block >>= 1;
    }

  prefetch_before = delta / step;
  delta %= step;
  ref_type = TREE_TYPE (ref->mem);
  align_unit = TYPE_ALIGN (ref_type) / 8;
  if (is_miss_rate_acceptable (prefetch_block, step, delta,
			       reduced_prefetch_block, align_unit))
    {
      /* Do not reduce prefetch_before if we meet beyond cache size.  */
      if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
        prefetch_before = PREFETCH_ALL;
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if (is_miss_rate_acceptable (prefetch_block, step, delta,
			       reduced_prefetch_block, align_unit))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* The ref probably does not reuse BY.  */
  return;
}

/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for only the first few iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
		 (void *) ref);
      return false;
    }

  /* Do not prefetch nontemporal stores.  */
  if (ref->storent_p)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
      return false;
    }

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
  unsigned slots_per_prefetch;
  struct mem_ref *ref;
  bool any = false;

  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
     it will need a prefetch slot.  */
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
	     slots_per_prefetch);

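  /* For example (values purely illustrative): with AHEAD = 8 and
     UNROLL_FACTOR = 2, each prefetch instruction occupies
     (8 + 2 / 2) / 2 = 4 of the available prefetch slots.  */
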
989*e4b17023SJohn Marino   /* For now we just take memory references one by one and issue
990*e4b17023SJohn Marino      prefetches for as many as possible.  The groups are sorted
991*e4b17023SJohn Marino      starting with the largest step, since the references with
992*e4b17023SJohn Marino      large step are more likely to cause many cache misses.  */
993*e4b17023SJohn Marino 
994*e4b17023SJohn Marino   for (; groups; groups = groups->next)
995*e4b17023SJohn Marino     for (ref = groups->refs; ref; ref = ref->next)
996*e4b17023SJohn Marino       {
997*e4b17023SJohn Marino 	if (!should_issue_prefetch_p (ref))
998*e4b17023SJohn Marino 	  continue;
999*e4b17023SJohn Marino 
1000*e4b17023SJohn Marino         /* The loop is far from being sufficiently unrolled for this
1001*e4b17023SJohn Marino            prefetch.  Do not generate the prefetch to avoid many redundant
1002*e4b17023SJohn Marino            prefetches.  */
1003*e4b17023SJohn Marino         if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1004*e4b17023SJohn Marino           continue;
1005*e4b17023SJohn Marino 
1006*e4b17023SJohn Marino 	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
1007*e4b17023SJohn Marino 	   and we unroll the loop UNROLL_FACTOR times, we need to insert
1008*e4b17023SJohn Marino 	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1009*e4b17023SJohn Marino 	   iteration.  */
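	/* For example (hypothetical values): with UNROLL_FACTOR = 8 and
	   PREFETCH_MOD = 3 the computation below gives
	   (8 + 3 - 1) / 3 = 3, i.e. ceil (8 / 3) prefetch instructions
	   per unrolled iteration.  */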
1010*e4b17023SJohn Marino 	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1011*e4b17023SJohn Marino 			/ ref->prefetch_mod);
1012*e4b17023SJohn Marino 	prefetch_slots = n_prefetches * slots_per_prefetch;
1013*e4b17023SJohn Marino 
1014*e4b17023SJohn Marino 	/* If more than half of the prefetches would be lost anyway, do not
1015*e4b17023SJohn Marino 	   issue the prefetch.  */
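	/* E.g. (hypothetical): with 3 slots remaining and a reference that
	   needs 8 slots, 2 * 3 < 8 holds and the reference is skipped
	   instead of issuing prefetches that would mostly be dropped.  */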
1016*e4b17023SJohn Marino 	if (2 * remaining_prefetch_slots < prefetch_slots)
1017*e4b17023SJohn Marino 	  continue;
1018*e4b17023SJohn Marino 
1019*e4b17023SJohn Marino 	ref->issue_prefetch_p = true;
1020*e4b17023SJohn Marino 
1021*e4b17023SJohn Marino 	if (remaining_prefetch_slots <= prefetch_slots)
1022*e4b17023SJohn Marino 	  return true;
1023*e4b17023SJohn Marino 	remaining_prefetch_slots -= prefetch_slots;
1024*e4b17023SJohn Marino 	any = true;
1025*e4b17023SJohn Marino       }
1026*e4b17023SJohn Marino 
1027*e4b17023SJohn Marino   return any;
1028*e4b17023SJohn Marino }
1029*e4b17023SJohn Marino 
1030*e4b17023SJohn Marino /* Return TRUE if no prefetch is going to be generated in the given
1031*e4b17023SJohn Marino    GROUPS.  */
1032*e4b17023SJohn Marino 
1033*e4b17023SJohn Marino static bool
1034*e4b17023SJohn Marino nothing_to_prefetch_p (struct mem_ref_group *groups)
1035*e4b17023SJohn Marino {
1036*e4b17023SJohn Marino   struct mem_ref *ref;
1037*e4b17023SJohn Marino 
1038*e4b17023SJohn Marino   for (; groups; groups = groups->next)
1039*e4b17023SJohn Marino     for (ref = groups->refs; ref; ref = ref->next)
1040*e4b17023SJohn Marino       if (should_issue_prefetch_p (ref))
1041*e4b17023SJohn Marino 	return false;
1042*e4b17023SJohn Marino 
1043*e4b17023SJohn Marino   return true;
1044*e4b17023SJohn Marino }
1045*e4b17023SJohn Marino 
1046*e4b17023SJohn Marino /* Estimate the number of prefetches in the given GROUPS.
1047*e4b17023SJohn Marino    UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
1048*e4b17023SJohn Marino 
1049*e4b17023SJohn Marino static int
1050*e4b17023SJohn Marino estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1051*e4b17023SJohn Marino {
1052*e4b17023SJohn Marino   struct mem_ref *ref;
1053*e4b17023SJohn Marino   unsigned n_prefetches;
1054*e4b17023SJohn Marino   int prefetch_count = 0;
1055*e4b17023SJohn Marino 
1056*e4b17023SJohn Marino   for (; groups; groups = groups->next)
1057*e4b17023SJohn Marino     for (ref = groups->refs; ref; ref = ref->next)
1058*e4b17023SJohn Marino       if (should_issue_prefetch_p (ref))
1059*e4b17023SJohn Marino 	{
1060*e4b17023SJohn Marino 	  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1061*e4b17023SJohn Marino 			  / ref->prefetch_mod);
1062*e4b17023SJohn Marino 	  prefetch_count += n_prefetches;
1063*e4b17023SJohn Marino 	}
1064*e4b17023SJohn Marino 
1065*e4b17023SJohn Marino   return prefetch_count;
1066*e4b17023SJohn Marino }
1067*e4b17023SJohn Marino 
1068*e4b17023SJohn Marino /* Issue prefetches for the reference REF into loop as decided before.
1069*e4b17023SJohn Marino    AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
1070*e4b17023SJohn Marino    is the factor by which LOOP was unrolled.  */
1071*e4b17023SJohn Marino 
1072*e4b17023SJohn Marino static void
1073*e4b17023SJohn Marino issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1074*e4b17023SJohn Marino {
1075*e4b17023SJohn Marino   HOST_WIDE_INT delta;
1076*e4b17023SJohn Marino   tree addr, addr_base, write_p, local, forward;
1077*e4b17023SJohn Marino   gimple prefetch;
1078*e4b17023SJohn Marino   gimple_stmt_iterator bsi;
1079*e4b17023SJohn Marino   unsigned n_prefetches, ap;
1080*e4b17023SJohn Marino   bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1081*e4b17023SJohn Marino 
1082*e4b17023SJohn Marino   if (dump_file && (dump_flags & TDF_DETAILS))
1083*e4b17023SJohn Marino     fprintf (dump_file, "Issued%s prefetch for %p.\n",
1084*e4b17023SJohn Marino 	     nontemporal ? " nontemporal" : "",
1085*e4b17023SJohn Marino 	     (void *) ref);
1086*e4b17023SJohn Marino 
1087*e4b17023SJohn Marino   bsi = gsi_for_stmt (ref->stmt);
1088*e4b17023SJohn Marino 
1089*e4b17023SJohn Marino   n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1090*e4b17023SJohn Marino 		  / ref->prefetch_mod);
1091*e4b17023SJohn Marino   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1092*e4b17023SJohn Marino   addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1093*e4b17023SJohn Marino 					true, NULL, true, GSI_SAME_STMT);
1094*e4b17023SJohn Marino   write_p = ref->write_p ? integer_one_node : integer_zero_node;
1095*e4b17023SJohn Marino   local = nontemporal ? integer_zero_node : integer_three_node;
1096*e4b17023SJohn Marino 
1097*e4b17023SJohn Marino   for (ap = 0; ap < n_prefetches; ap++)
1098*e4b17023SJohn Marino     {
1099*e4b17023SJohn Marino       if (cst_and_fits_in_hwi (ref->group->step))
1100*e4b17023SJohn Marino         {
1101*e4b17023SJohn Marino           /* Determine the address to prefetch.  */
1102*e4b17023SJohn Marino           delta = (ahead + ap * ref->prefetch_mod) *
1103*e4b17023SJohn Marino 		   int_cst_value (ref->group->step);
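          /* Hypothetical example: with AHEAD = 4, PREFETCH_MOD = 2 and a
             constant step of 16 bytes, the iterations AP = 0 and AP = 1
             prefetch ADDR_BASE + 64 and ADDR_BASE + 96, respectively.  */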
1104*e4b17023SJohn Marino           addr = fold_build_pointer_plus_hwi (addr_base, delta);
1105*e4b17023SJohn Marino           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1106*e4b17023SJohn Marino                                            true, GSI_SAME_STMT);
1107*e4b17023SJohn Marino         }
1108*e4b17023SJohn Marino       else
1109*e4b17023SJohn Marino         {
1110*e4b17023SJohn Marino           /* The step size is non-constant but loop-invariant.  We use the
1111*e4b17023SJohn Marino              heuristic of simply prefetching AHEAD iterations ahead.  */
1112*e4b17023SJohn Marino           forward = fold_build2 (MULT_EXPR, sizetype,
1113*e4b17023SJohn Marino                                  fold_convert (sizetype, ref->group->step),
1114*e4b17023SJohn Marino                                  fold_convert (sizetype, size_int (ahead)));
1115*e4b17023SJohn Marino           addr = fold_build_pointer_plus (addr_base, forward);
1116*e4b17023SJohn Marino           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1117*e4b17023SJohn Marino 					   NULL, true, GSI_SAME_STMT);
1118*e4b17023SJohn Marino         }
1119*e4b17023SJohn Marino       /* Create the prefetch instruction.  */
1120*e4b17023SJohn Marino       prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1121*e4b17023SJohn Marino 				    3, addr, write_p, local);
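      /* The call built here corresponds to source of the form
	 __builtin_prefetch (addr, 0, 3) for a read expected to be reused,
	 or __builtin_prefetch (addr, 1, 0) for a nontemporal write; the
	 actual operands are the WRITE_P and LOCAL values computed above.  */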
1122*e4b17023SJohn Marino       gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1123*e4b17023SJohn Marino     }
1124*e4b17023SJohn Marino }
1125*e4b17023SJohn Marino 
1126*e4b17023SJohn Marino /* Issue prefetches for the references in GROUPS into loop as decided before.
1127*e4b17023SJohn Marino    AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
1128*e4b17023SJohn Marino    factor by which LOOP was unrolled.  */
1129*e4b17023SJohn Marino 
1130*e4b17023SJohn Marino static void
1131*e4b17023SJohn Marino issue_prefetches (struct mem_ref_group *groups,
1132*e4b17023SJohn Marino 		  unsigned unroll_factor, unsigned ahead)
1133*e4b17023SJohn Marino {
1134*e4b17023SJohn Marino   struct mem_ref *ref;
1135*e4b17023SJohn Marino 
1136*e4b17023SJohn Marino   for (; groups; groups = groups->next)
1137*e4b17023SJohn Marino     for (ref = groups->refs; ref; ref = ref->next)
1138*e4b17023SJohn Marino       if (ref->issue_prefetch_p)
1139*e4b17023SJohn Marino 	issue_prefetch_ref (ref, unroll_factor, ahead);
1140*e4b17023SJohn Marino }
1141*e4b17023SJohn Marino 
1142*e4b17023SJohn Marino /* Returns true if REF is a memory write for which a nontemporal store insn
1143*e4b17023SJohn Marino    can be used.  */
1144*e4b17023SJohn Marino 
1145*e4b17023SJohn Marino static bool
1146*e4b17023SJohn Marino nontemporal_store_p (struct mem_ref *ref)
1147*e4b17023SJohn Marino {
1148*e4b17023SJohn Marino   enum machine_mode mode;
1149*e4b17023SJohn Marino   enum insn_code code;
1150*e4b17023SJohn Marino 
1151*e4b17023SJohn Marino   /* REF must be a write that is not reused.  We require it to be independent
1152*e4b17023SJohn Marino      of all other memory references in the loop, as the nontemporal stores may
1153*e4b17023SJohn Marino      be reordered with respect to other memory references.  */
1154*e4b17023SJohn Marino   if (!ref->write_p
1155*e4b17023SJohn Marino       || !ref->independent_p
1156*e4b17023SJohn Marino       || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1157*e4b17023SJohn Marino     return false;
1158*e4b17023SJohn Marino 
1159*e4b17023SJohn Marino   /* Check that we have the storent instruction for the mode.  */
1160*e4b17023SJohn Marino   mode = TYPE_MODE (TREE_TYPE (ref->mem));
1161*e4b17023SJohn Marino   if (mode == BLKmode)
1162*e4b17023SJohn Marino     return false;
1163*e4b17023SJohn Marino 
1164*e4b17023SJohn Marino   code = optab_handler (storent_optab, mode);
1165*e4b17023SJohn Marino   return code != CODE_FOR_nothing;
1166*e4b17023SJohn Marino }
1167*e4b17023SJohn Marino 
1168*e4b17023SJohn Marino /* If REF is a nontemporal store, we mark the corresponding modify statement
1169*e4b17023SJohn Marino    and return true.  Otherwise, we return false.  */
1170*e4b17023SJohn Marino 
1171*e4b17023SJohn Marino static bool
1172*e4b17023SJohn Marino mark_nontemporal_store (struct mem_ref *ref)
1173*e4b17023SJohn Marino {
1174*e4b17023SJohn Marino   if (!nontemporal_store_p (ref))
1175*e4b17023SJohn Marino     return false;
1176*e4b17023SJohn Marino 
1177*e4b17023SJohn Marino   if (dump_file && (dump_flags & TDF_DETAILS))
1178*e4b17023SJohn Marino     fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1179*e4b17023SJohn Marino 	     (void *) ref);
1180*e4b17023SJohn Marino 
1181*e4b17023SJohn Marino   gimple_assign_set_nontemporal_move (ref->stmt, true);
1182*e4b17023SJohn Marino   ref->storent_p = true;
1183*e4b17023SJohn Marino 
1184*e4b17023SJohn Marino   return true;
1185*e4b17023SJohn Marino }
1186*e4b17023SJohn Marino 
1187*e4b17023SJohn Marino /* Issue a memory fence instruction after LOOP.  */
1188*e4b17023SJohn Marino 
1189*e4b17023SJohn Marino static void
1190*e4b17023SJohn Marino emit_mfence_after_loop (struct loop *loop)
1191*e4b17023SJohn Marino {
1192*e4b17023SJohn Marino   VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1193*e4b17023SJohn Marino   edge exit;
1194*e4b17023SJohn Marino   gimple call;
1195*e4b17023SJohn Marino   gimple_stmt_iterator bsi;
1196*e4b17023SJohn Marino   unsigned i;
1197*e4b17023SJohn Marino 
1198*e4b17023SJohn Marino   FOR_EACH_VEC_ELT (edge, exits, i, exit)
1199*e4b17023SJohn Marino     {
1200*e4b17023SJohn Marino       call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1201*e4b17023SJohn Marino 
1202*e4b17023SJohn Marino       if (!single_pred_p (exit->dest)
1203*e4b17023SJohn Marino 	  /* If possible, we prefer not to insert the fence on other paths
1204*e4b17023SJohn Marino 	     in cfg.  */
1205*e4b17023SJohn Marino 	  && !(exit->flags & EDGE_ABNORMAL))
1206*e4b17023SJohn Marino 	split_loop_exit_edge (exit);
1207*e4b17023SJohn Marino       bsi = gsi_after_labels (exit->dest);
1208*e4b17023SJohn Marino 
1209*e4b17023SJohn Marino       gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1210*e4b17023SJohn Marino       mark_virtual_ops_for_renaming (call);
1211*e4b17023SJohn Marino     }
1212*e4b17023SJohn Marino 
1213*e4b17023SJohn Marino   VEC_free (edge, heap, exits);
1214*e4b17023SJohn Marino   update_ssa (TODO_update_ssa_only_virtuals);
1215*e4b17023SJohn Marino }
1216*e4b17023SJohn Marino 
1217*e4b17023SJohn Marino /* Returns true if we can use storent in loop, false otherwise.  */
1218*e4b17023SJohn Marino 
1219*e4b17023SJohn Marino static bool
1220*e4b17023SJohn Marino may_use_storent_in_loop_p (struct loop *loop)
1221*e4b17023SJohn Marino {
1222*e4b17023SJohn Marino   bool ret = true;
1223*e4b17023SJohn Marino 
1224*e4b17023SJohn Marino   if (loop->inner != NULL)
1225*e4b17023SJohn Marino     return false;
1226*e4b17023SJohn Marino 
1227*e4b17023SJohn Marino   /* If we must issue a mfence insn after using storent, check that there
1228*e4b17023SJohn Marino      is a suitable place for it at each of the loop exits.  */
1229*e4b17023SJohn Marino   if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1230*e4b17023SJohn Marino     {
1231*e4b17023SJohn Marino       VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1232*e4b17023SJohn Marino       unsigned i;
1233*e4b17023SJohn Marino       edge exit;
1234*e4b17023SJohn Marino 
1235*e4b17023SJohn Marino       FOR_EACH_VEC_ELT (edge, exits, i, exit)
1236*e4b17023SJohn Marino 	if ((exit->flags & EDGE_ABNORMAL)
1237*e4b17023SJohn Marino 	    && exit->dest == EXIT_BLOCK_PTR)
1238*e4b17023SJohn Marino 	  ret = false;
1239*e4b17023SJohn Marino 
1240*e4b17023SJohn Marino       VEC_free (edge, heap, exits);
1241*e4b17023SJohn Marino     }
1242*e4b17023SJohn Marino 
1243*e4b17023SJohn Marino   return ret;
1244*e4b17023SJohn Marino }
1245*e4b17023SJohn Marino 
1246*e4b17023SJohn Marino /* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1247*e4b17023SJohn Marino    references in the loop.  */
1248*e4b17023SJohn Marino 
1249*e4b17023SJohn Marino static void
1250*e4b17023SJohn Marino mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1251*e4b17023SJohn Marino {
1252*e4b17023SJohn Marino   struct mem_ref *ref;
1253*e4b17023SJohn Marino   bool any = false;
1254*e4b17023SJohn Marino 
1255*e4b17023SJohn Marino   if (!may_use_storent_in_loop_p (loop))
1256*e4b17023SJohn Marino     return;
1257*e4b17023SJohn Marino 
1258*e4b17023SJohn Marino   for (; groups; groups = groups->next)
1259*e4b17023SJohn Marino     for (ref = groups->refs; ref; ref = ref->next)
1260*e4b17023SJohn Marino       any |= mark_nontemporal_store (ref);
1261*e4b17023SJohn Marino 
1262*e4b17023SJohn Marino   if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1263*e4b17023SJohn Marino     emit_mfence_after_loop (loop);
1264*e4b17023SJohn Marino }
1265*e4b17023SJohn Marino 
1266*e4b17023SJohn Marino /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1267*e4b17023SJohn Marino    this is the case, fill in DESC by the description of number of
1268*e4b17023SJohn Marino    iterations.  */
1269*e4b17023SJohn Marino 
1270*e4b17023SJohn Marino static bool
1271*e4b17023SJohn Marino should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1272*e4b17023SJohn Marino 		      unsigned factor)
1273*e4b17023SJohn Marino {
1274*e4b17023SJohn Marino   if (!can_unroll_loop_p (loop, factor, desc))
1275*e4b17023SJohn Marino     return false;
1276*e4b17023SJohn Marino 
1277*e4b17023SJohn Marino   /* We only consider loops without control flow for unrolling.  This is not
1278*e4b17023SJohn Marino      a hard restriction -- tree_unroll_loop works with arbitrary loops
1279*e4b17023SJohn Marino      as well; but the unrolling/prefetching is usually more profitable for
1280*e4b17023SJohn Marino      loops consisting of a single basic block, and we want to limit the
1281*e4b17023SJohn Marino      code growth.  */
1282*e4b17023SJohn Marino   if (loop->num_nodes > 2)
1283*e4b17023SJohn Marino     return false;
1284*e4b17023SJohn Marino 
1285*e4b17023SJohn Marino   return true;
1286*e4b17023SJohn Marino }
1287*e4b17023SJohn Marino 
1288*e4b17023SJohn Marino /* Determine the factor by which to unroll LOOP, from the information
1289*e4b17023SJohn Marino    contained in the list of memory references REFS.  The description of the
1290*e4b17023SJohn Marino    number of iterations of LOOP is stored to DESC.  NINSNS is the number of
1291*e4b17023SJohn Marino    insns of the LOOP.  EST_NITER is the estimated number of iterations of
1292*e4b17023SJohn Marino    the loop, or -1 if no estimate is available.  */
1293*e4b17023SJohn Marino 
1294*e4b17023SJohn Marino static unsigned
1295*e4b17023SJohn Marino determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1296*e4b17023SJohn Marino 			 unsigned ninsns, struct tree_niter_desc *desc,
1297*e4b17023SJohn Marino 			 HOST_WIDE_INT est_niter)
1298*e4b17023SJohn Marino {
1299*e4b17023SJohn Marino   unsigned upper_bound;
1300*e4b17023SJohn Marino   unsigned nfactor, factor, mod_constraint;
1301*e4b17023SJohn Marino   struct mem_ref_group *agp;
1302*e4b17023SJohn Marino   struct mem_ref *ref;
1303*e4b17023SJohn Marino 
1304*e4b17023SJohn Marino   /* First check whether the loop is not too large to unroll.  We ignore
1305*e4b17023SJohn Marino      PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1306*e4b17023SJohn Marino      from unrolling them enough to make exactly one cache line covered by each
1307*e4b17023SJohn Marino      iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1308*e4b17023SJohn Marino      us from unrolling the loops too many times in cases where we only expect
1309*e4b17023SJohn Marino      gains from better scheduling and decreasing loop overhead, which is not
1310*e4b17023SJohn Marino      the case here.  */
1311*e4b17023SJohn Marino   upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1312*e4b17023SJohn Marino 
1313*e4b17023SJohn Marino   /* If we unrolled the loop more times than it iterates, the unrolled version
1314*e4b17023SJohn Marino      of the loop would never be entered.  */
1315*e4b17023SJohn Marino   if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1316*e4b17023SJohn Marino     upper_bound = est_niter;
1317*e4b17023SJohn Marino 
1318*e4b17023SJohn Marino   if (upper_bound <= 1)
1319*e4b17023SJohn Marino     return 1;
1320*e4b17023SJohn Marino 
1321*e4b17023SJohn Marino   /* Choose the factor so that we may prefetch each cache line just once,
1322*e4b17023SJohn Marino      but bound the unrolling by UPPER_BOUND.  */
1323*e4b17023SJohn Marino   factor = 1;
1324*e4b17023SJohn Marino   for (agp = refs; agp; agp = agp->next)
1325*e4b17023SJohn Marino     for (ref = agp->refs; ref; ref = ref->next)
1326*e4b17023SJohn Marino       if (should_issue_prefetch_p (ref))
1327*e4b17023SJohn Marino 	{
1328*e4b17023SJohn Marino 	  mod_constraint = ref->prefetch_mod;
1329*e4b17023SJohn Marino 	  nfactor = least_common_multiple (mod_constraint, factor);
1330*e4b17023SJohn Marino 	  if (nfactor <= upper_bound)
1331*e4b17023SJohn Marino 	    factor = nfactor;
1332*e4b17023SJohn Marino 	}
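  /* A hedged illustration with made-up values: two references with
     PREFETCH_MOD 2 and 3 and UPPER_BOUND 16 give factor = lcm (1, 2) = 2
     and then lcm (2, 3) = 6; a third reference with PREFETCH_MOD 32 is
     ignored because lcm (6, 32) = 96 exceeds the bound, so FACTOR
     remains 6.  */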
1333*e4b17023SJohn Marino 
1334*e4b17023SJohn Marino   if (!should_unroll_loop_p (loop, desc, factor))
1335*e4b17023SJohn Marino     return 1;
1336*e4b17023SJohn Marino 
1337*e4b17023SJohn Marino   return factor;
1338*e4b17023SJohn Marino }
1339*e4b17023SJohn Marino 
1340*e4b17023SJohn Marino /* Returns the total volume of the memory references REFS, taking into account
1341*e4b17023SJohn Marino    reuses in the innermost loop and cache line size.  TODO -- we should also
1342*e4b17023SJohn Marino    take into account reuses across the iterations of the loops in the loop
1343*e4b17023SJohn Marino    nest.  */
1344*e4b17023SJohn Marino 
1345*e4b17023SJohn Marino static unsigned
1346*e4b17023SJohn Marino volume_of_references (struct mem_ref_group *refs)
1347*e4b17023SJohn Marino {
1348*e4b17023SJohn Marino   unsigned volume = 0;
1349*e4b17023SJohn Marino   struct mem_ref_group *gr;
1350*e4b17023SJohn Marino   struct mem_ref *ref;
1351*e4b17023SJohn Marino 
1352*e4b17023SJohn Marino   for (gr = refs; gr; gr = gr->next)
1353*e4b17023SJohn Marino     for (ref = gr->refs; ref; ref = ref->next)
1354*e4b17023SJohn Marino       {
1355*e4b17023SJohn Marino 	/* Almost always reuses another value?  */
1356*e4b17023SJohn Marino 	if (ref->prefetch_before != PREFETCH_ALL)
1357*e4b17023SJohn Marino 	  continue;
1358*e4b17023SJohn Marino 
1359*e4b17023SJohn Marino 	/* If several iterations access the same cache line, use the size of
1360*e4b17023SJohn Marino 	   the line divided by this number.  Otherwise, a cache line is
1361*e4b17023SJohn Marino 	   accessed in each iteration.  TODO -- in the latter case, we should
1362*e4b17023SJohn Marino 	   take the size of the reference into account, rounding it up to a
1363*e4b17023SJohn Marino 	   multiple of the cache line size.  */
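	/* Illustration (hypothetical): with a 64-byte L1 line and
	   PREFETCH_MOD = 4 (the same line serves four iterations), the
	   reference contributes 64 / 4 = 16 bytes per iteration.  */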
1364*e4b17023SJohn Marino 	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1365*e4b17023SJohn Marino       }
1366*e4b17023SJohn Marino   return volume;
1367*e4b17023SJohn Marino }
1368*e4b17023SJohn Marino 
1369*e4b17023SJohn Marino /* Returns the volume of memory references accessed across VEC iterations of
1370*e4b17023SJohn Marino    loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1371*e4b17023SJohn Marino    of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1372*e4b17023SJohn Marino 
1373*e4b17023SJohn Marino static unsigned
1374*e4b17023SJohn Marino volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1375*e4b17023SJohn Marino {
1376*e4b17023SJohn Marino   unsigned i;
1377*e4b17023SJohn Marino 
1378*e4b17023SJohn Marino   for (i = 0; i < n; i++)
1379*e4b17023SJohn Marino     if (vec[i] != 0)
1380*e4b17023SJohn Marino       break;
1381*e4b17023SJohn Marino 
1382*e4b17023SJohn Marino   if (i == n)
1383*e4b17023SJohn Marino     return 0;
1384*e4b17023SJohn Marino 
1385*e4b17023SJohn Marino   gcc_assert (vec[i] > 0);
1386*e4b17023SJohn Marino 
1387*e4b17023SJohn Marino   /* We ignore the parts of the distance vector in subloops, since usually
1388*e4b17023SJohn Marino      the numbers of iterations are much smaller.  */
1389*e4b17023SJohn Marino   return loop_sizes[i] * vec[i];
1390*e4b17023SJohn Marino }
1391*e4b17023SJohn Marino 
1392*e4b17023SJohn Marino /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1393*e4b17023SJohn Marino    at the position corresponding to the loop of the step.  N is the depth
1394*e4b17023SJohn Marino    of the considered loop nest, and LOOP is its innermost loop.  */
1395*e4b17023SJohn Marino 
1396*e4b17023SJohn Marino static void
1397*e4b17023SJohn Marino add_subscript_strides (tree access_fn, unsigned stride,
1398*e4b17023SJohn Marino 		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1399*e4b17023SJohn Marino {
1400*e4b17023SJohn Marino   struct loop *aloop;
1401*e4b17023SJohn Marino   tree step;
1402*e4b17023SJohn Marino   HOST_WIDE_INT astep;
1403*e4b17023SJohn Marino   unsigned min_depth = loop_depth (loop) - n;
1404*e4b17023SJohn Marino 
1405*e4b17023SJohn Marino   while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1406*e4b17023SJohn Marino     {
1407*e4b17023SJohn Marino       aloop = get_chrec_loop (access_fn);
1408*e4b17023SJohn Marino       step = CHREC_RIGHT (access_fn);
1409*e4b17023SJohn Marino       access_fn = CHREC_LEFT (access_fn);
1410*e4b17023SJohn Marino 
1411*e4b17023SJohn Marino       if ((unsigned) loop_depth (aloop) <= min_depth)
1412*e4b17023SJohn Marino 	continue;
1413*e4b17023SJohn Marino 
1414*e4b17023SJohn Marino       if (host_integerp (step, 0))
1415*e4b17023SJohn Marino 	astep = tree_low_cst (step, 0);
1416*e4b17023SJohn Marino       else
1417*e4b17023SJohn Marino 	astep = L1_CACHE_LINE_SIZE;
1418*e4b17023SJohn Marino 
1419*e4b17023SJohn Marino       strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1420*e4b17023SJohn Marino 
1421*e4b17023SJohn Marino     }
1422*e4b17023SJohn Marino }
1423*e4b17023SJohn Marino 
1424*e4b17023SJohn Marino /* Returns the volume of memory references accessed between two consecutive
1425*e4b17023SJohn Marino    self-reuses of the reference DR.  We consider the subscripts of DR in N
1426*e4b17023SJohn Marino    loops, and LOOP_SIZES contains the volumes of accesses in each of the
1427*e4b17023SJohn Marino    loops.  LOOP is the innermost loop of the current loop nest.  */
1428*e4b17023SJohn Marino 
1429*e4b17023SJohn Marino static unsigned
1430*e4b17023SJohn Marino self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1431*e4b17023SJohn Marino 		     struct loop *loop)
1432*e4b17023SJohn Marino {
1433*e4b17023SJohn Marino   tree stride, access_fn;
1434*e4b17023SJohn Marino   HOST_WIDE_INT *strides, astride;
1435*e4b17023SJohn Marino   VEC (tree, heap) *access_fns;
1436*e4b17023SJohn Marino   tree ref = DR_REF (dr);
1437*e4b17023SJohn Marino   unsigned i, ret = ~0u;
1438*e4b17023SJohn Marino 
1439*e4b17023SJohn Marino   /* In the following example:
1440*e4b17023SJohn Marino 
1441*e4b17023SJohn Marino      for (i = 0; i < N; i++)
1442*e4b17023SJohn Marino        for (j = 0; j < N; j++)
1443*e4b17023SJohn Marino          use (a[j][i]);
1444*e4b17023SJohn Marino      the same cache line is accessed each N steps (except if the change from
1445*e4b17023SJohn Marino      i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1446*e4b17023SJohn Marino      we cannot rely purely on the results of the data dependence analysis.
1447*e4b17023SJohn Marino 
1448*e4b17023SJohn Marino      Instead, we compute the stride of the reference in each loop, and consider
1449*e4b17023SJohn Marino      the innermost loop in which the stride is less than the cache size.  */
1450*e4b17023SJohn Marino 
1451*e4b17023SJohn Marino   strides = XCNEWVEC (HOST_WIDE_INT, n);
1452*e4b17023SJohn Marino   access_fns = DR_ACCESS_FNS (dr);
1453*e4b17023SJohn Marino 
1454*e4b17023SJohn Marino   FOR_EACH_VEC_ELT (tree, access_fns, i, access_fn)
1455*e4b17023SJohn Marino     {
1456*e4b17023SJohn Marino       /* Keep track of the reference corresponding to the subscript, so that we
1457*e4b17023SJohn Marino 	 know its stride.  */
1458*e4b17023SJohn Marino       while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1459*e4b17023SJohn Marino 	ref = TREE_OPERAND (ref, 0);
1460*e4b17023SJohn Marino 
1461*e4b17023SJohn Marino       if (TREE_CODE (ref) == ARRAY_REF)
1462*e4b17023SJohn Marino 	{
1463*e4b17023SJohn Marino 	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1464*e4b17023SJohn Marino 	  if (host_integerp (stride, 1))
1465*e4b17023SJohn Marino 	    astride = tree_low_cst (stride, 1);
1466*e4b17023SJohn Marino 	  else
1467*e4b17023SJohn Marino 	    astride = L1_CACHE_LINE_SIZE;
1468*e4b17023SJohn Marino 
1469*e4b17023SJohn Marino 	  ref = TREE_OPERAND (ref, 0);
1470*e4b17023SJohn Marino 	}
1471*e4b17023SJohn Marino       else
1472*e4b17023SJohn Marino 	astride = 1;
1473*e4b17023SJohn Marino 
1474*e4b17023SJohn Marino       add_subscript_strides (access_fn, astride, strides, n, loop);
1475*e4b17023SJohn Marino     }
1476*e4b17023SJohn Marino 
1477*e4b17023SJohn Marino   for (i = n; i-- > 0; )
1478*e4b17023SJohn Marino     {
1479*e4b17023SJohn Marino       unsigned HOST_WIDE_INT s;
1480*e4b17023SJohn Marino 
1481*e4b17023SJohn Marino       s = strides[i] < 0 ?  -strides[i] : strides[i];
1482*e4b17023SJohn Marino 
1483*e4b17023SJohn Marino       if (s < (unsigned) L1_CACHE_LINE_SIZE
1484*e4b17023SJohn Marino 	  && (loop_sizes[i]
1485*e4b17023SJohn Marino 	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1486*e4b17023SJohn Marino 	{
1487*e4b17023SJohn Marino 	  ret = loop_sizes[i];
1488*e4b17023SJohn Marino 	  break;
1489*e4b17023SJohn Marino 	}
1490*e4b17023SJohn Marino     }
1491*e4b17023SJohn Marino 
1492*e4b17023SJohn Marino   free (strides);
1493*e4b17023SJohn Marino   return ret;
1494*e4b17023SJohn Marino }
1495*e4b17023SJohn Marino 
1496*e4b17023SJohn Marino /* Determines the distance until the first reuse of each reference in REFS
1497*e4b17023SJohn Marino    in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1498*e4b17023SJohn Marino    memory references in the loop.  */
1499*e4b17023SJohn Marino 
1500*e4b17023SJohn Marino static void
1501*e4b17023SJohn Marino determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1502*e4b17023SJohn Marino 			   bool no_other_refs)
1503*e4b17023SJohn Marino {
1504*e4b17023SJohn Marino   struct loop *nest, *aloop;
1505*e4b17023SJohn Marino   VEC (data_reference_p, heap) *datarefs = NULL;
1506*e4b17023SJohn Marino   VEC (ddr_p, heap) *dependences = NULL;
1507*e4b17023SJohn Marino   struct mem_ref_group *gr;
1508*e4b17023SJohn Marino   struct mem_ref *ref, *refb;
1509*e4b17023SJohn Marino   VEC (loop_p, heap) *vloops = NULL;
1510*e4b17023SJohn Marino   unsigned *loop_data_size;
1511*e4b17023SJohn Marino   unsigned i, j, n;
1512*e4b17023SJohn Marino   unsigned volume, dist, adist;
1513*e4b17023SJohn Marino   HOST_WIDE_INT vol;
1514*e4b17023SJohn Marino   data_reference_p dr;
1515*e4b17023SJohn Marino   ddr_p dep;
1516*e4b17023SJohn Marino 
1517*e4b17023SJohn Marino   if (loop->inner)
1518*e4b17023SJohn Marino     return;
1519*e4b17023SJohn Marino 
1520*e4b17023SJohn Marino   /* Find the outermost loop of the loop nest of loop (we require that
1521*e4b17023SJohn Marino      there are no sibling loops inside the nest).  */
1522*e4b17023SJohn Marino   nest = loop;
1523*e4b17023SJohn Marino   while (1)
1524*e4b17023SJohn Marino     {
1525*e4b17023SJohn Marino       aloop = loop_outer (nest);
1526*e4b17023SJohn Marino 
1527*e4b17023SJohn Marino       if (aloop == current_loops->tree_root
1528*e4b17023SJohn Marino 	  || aloop->inner->next)
1529*e4b17023SJohn Marino 	break;
1530*e4b17023SJohn Marino 
1531*e4b17023SJohn Marino       nest = aloop;
1532*e4b17023SJohn Marino     }
1533*e4b17023SJohn Marino 
1534*e4b17023SJohn Marino   /* For each loop, determine the amount of data accessed in each iteration.
1535*e4b17023SJohn Marino      We use this to estimate whether the reference is evicted from the
1536*e4b17023SJohn Marino      cache before its reuse.  */
1537*e4b17023SJohn Marino   find_loop_nest (nest, &vloops);
1538*e4b17023SJohn Marino   n = VEC_length (loop_p, vloops);
1539*e4b17023SJohn Marino   loop_data_size = XNEWVEC (unsigned, n);
1540*e4b17023SJohn Marino   volume = volume_of_references (refs);
1541*e4b17023SJohn Marino   i = n;
1542*e4b17023SJohn Marino   while (i-- != 0)
1543*e4b17023SJohn Marino     {
1544*e4b17023SJohn Marino       loop_data_size[i] = volume;
1545*e4b17023SJohn Marino       /* Bound the volume by the L2 cache size, since above this bound,
1546*e4b17023SJohn Marino 	 all dependence distances are equivalent.  */
1547*e4b17023SJohn Marino       if (volume > L2_CACHE_SIZE_BYTES)
1548*e4b17023SJohn Marino 	continue;
1549*e4b17023SJohn Marino 
1550*e4b17023SJohn Marino       aloop = VEC_index (loop_p, vloops, i);
1551*e4b17023SJohn Marino       vol = max_stmt_executions_int (aloop, false);
1552*e4b17023SJohn Marino       if (vol < 0)
1553*e4b17023SJohn Marino 	vol = expected_loop_iterations (aloop);
1554*e4b17023SJohn Marino       volume *= vol;
1555*e4b17023SJohn Marino     }
1556*e4b17023SJohn Marino 
1557*e4b17023SJohn Marino   /* Prepare the references in the form suitable for data dependence
1558*e4b17023SJohn Marino      analysis.  We ignore unanalyzable data references (the results
1559*e4b17023SJohn Marino      are used just as a heuristic to estimate temporality of the
1560*e4b17023SJohn Marino      references, hence we do not need to worry about correctness).  */
1561*e4b17023SJohn Marino   for (gr = refs; gr; gr = gr->next)
1562*e4b17023SJohn Marino     for (ref = gr->refs; ref; ref = ref->next)
1563*e4b17023SJohn Marino       {
1564*e4b17023SJohn Marino 	dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1565*e4b17023SJohn Marino 			      ref->mem, ref->stmt, !ref->write_p);
1566*e4b17023SJohn Marino 
1567*e4b17023SJohn Marino 	if (dr)
1568*e4b17023SJohn Marino 	  {
1569*e4b17023SJohn Marino 	    ref->reuse_distance = volume;
1570*e4b17023SJohn Marino 	    dr->aux = ref;
1571*e4b17023SJohn Marino 	    VEC_safe_push (data_reference_p, heap, datarefs, dr);
1572*e4b17023SJohn Marino 	  }
1573*e4b17023SJohn Marino 	else
1574*e4b17023SJohn Marino 	  no_other_refs = false;
1575*e4b17023SJohn Marino       }
1576*e4b17023SJohn Marino 
1577*e4b17023SJohn Marino   FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1578*e4b17023SJohn Marino     {
1579*e4b17023SJohn Marino       dist = self_reuse_distance (dr, loop_data_size, n, loop);
1580*e4b17023SJohn Marino       ref = (struct mem_ref *) dr->aux;
1581*e4b17023SJohn Marino       if (ref->reuse_distance > dist)
1582*e4b17023SJohn Marino 	ref->reuse_distance = dist;
1583*e4b17023SJohn Marino 
1584*e4b17023SJohn Marino       if (no_other_refs)
1585*e4b17023SJohn Marino 	ref->independent_p = true;
1586*e4b17023SJohn Marino     }
1587*e4b17023SJohn Marino 
1588*e4b17023SJohn Marino   compute_all_dependences (datarefs, &dependences, vloops, true);
1589*e4b17023SJohn Marino 
1590*e4b17023SJohn Marino   FOR_EACH_VEC_ELT (ddr_p, dependences, i, dep)
1591*e4b17023SJohn Marino     {
1592*e4b17023SJohn Marino       if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1593*e4b17023SJohn Marino 	continue;
1594*e4b17023SJohn Marino 
1595*e4b17023SJohn Marino       ref = (struct mem_ref *) DDR_A (dep)->aux;
1596*e4b17023SJohn Marino       refb = (struct mem_ref *) DDR_B (dep)->aux;
1597*e4b17023SJohn Marino 
1598*e4b17023SJohn Marino       if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1599*e4b17023SJohn Marino 	  || DDR_NUM_DIST_VECTS (dep) == 0)
1600*e4b17023SJohn Marino 	{
1601*e4b17023SJohn Marino 	  /* If the dependence cannot be analyzed, assume that there might be
1602*e4b17023SJohn Marino 	     a reuse.  */
1603*e4b17023SJohn Marino 	  dist = 0;
1604*e4b17023SJohn Marino 
1605*e4b17023SJohn Marino 	  ref->independent_p = false;
1606*e4b17023SJohn Marino 	  refb->independent_p = false;
1607*e4b17023SJohn Marino 	}
1608*e4b17023SJohn Marino       else
1609*e4b17023SJohn Marino 	{
1610*e4b17023SJohn Marino 	  /* The distance vectors are normalized to be always lexicographically
1611*e4b17023SJohn Marino 	     positive, hence we cannot tell just from them whether DDR_A comes
1612*e4b17023SJohn Marino 	     before DDR_B or vice versa.  However, it is not important,
1613*e4b17023SJohn Marino 	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
1614*e4b17023SJohn Marino 	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1615*e4b17023SJohn Marino 	     in cache (and marking it as nontemporal would not affect
1616*e4b17023SJohn Marino 	     anything).  */
1617*e4b17023SJohn Marino 
1618*e4b17023SJohn Marino 	  dist = volume;
1619*e4b17023SJohn Marino 	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1620*e4b17023SJohn Marino 	    {
1621*e4b17023SJohn Marino 	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1622*e4b17023SJohn Marino 					     loop_data_size, n);
1623*e4b17023SJohn Marino 
1624*e4b17023SJohn Marino 	      /* If this is a dependence in the innermost loop (i.e., the
1625*e4b17023SJohn Marino 		 distances in all superloops are zero) and it is not
1626*e4b17023SJohn Marino 		 the trivial self-dependence with distance zero, record that
1627*e4b17023SJohn Marino 		 the references are not completely independent.  */
1628*e4b17023SJohn Marino 	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1629*e4b17023SJohn Marino 		  && (ref != refb
1630*e4b17023SJohn Marino 		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1631*e4b17023SJohn Marino 		{
1632*e4b17023SJohn Marino 		  ref->independent_p = false;
1633*e4b17023SJohn Marino 		  refb->independent_p = false;
1634*e4b17023SJohn Marino 		}
1635*e4b17023SJohn Marino 
1636*e4b17023SJohn Marino 	      /* Ignore accesses closer than
1637*e4b17023SJohn Marino 		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1638*e4b17023SJohn Marino 	      	 so that we use nontemporal prefetches e.g. if a single memory
1639*e4b17023SJohn Marino 		 location is accessed several times in a single iteration of
1640*e4b17023SJohn Marino 		 the loop.  */
1641*e4b17023SJohn Marino 	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1642*e4b17023SJohn Marino 		continue;
1643*e4b17023SJohn Marino 
1644*e4b17023SJohn Marino 	      if (adist < dist)
1645*e4b17023SJohn Marino 		dist = adist;
1646*e4b17023SJohn Marino 	    }
1647*e4b17023SJohn Marino 	}
1648*e4b17023SJohn Marino 
1649*e4b17023SJohn Marino       if (ref->reuse_distance > dist)
1650*e4b17023SJohn Marino 	ref->reuse_distance = dist;
1651*e4b17023SJohn Marino       if (refb->reuse_distance > dist)
1652*e4b17023SJohn Marino 	refb->reuse_distance = dist;
1653*e4b17023SJohn Marino     }
1654*e4b17023SJohn Marino 
1655*e4b17023SJohn Marino   free_dependence_relations (dependences);
1656*e4b17023SJohn Marino   free_data_refs (datarefs);
1657*e4b17023SJohn Marino   free (loop_data_size);
1658*e4b17023SJohn Marino 
1659*e4b17023SJohn Marino   if (dump_file && (dump_flags & TDF_DETAILS))
1660*e4b17023SJohn Marino     {
1661*e4b17023SJohn Marino       fprintf (dump_file, "Reuse distances:\n");
1662*e4b17023SJohn Marino       for (gr = refs; gr; gr = gr->next)
1663*e4b17023SJohn Marino 	for (ref = gr->refs; ref; ref = ref->next)
1664*e4b17023SJohn Marino 	  fprintf (dump_file, " ref %p distance %u\n",
1665*e4b17023SJohn Marino 		   (void *) ref, ref->reuse_distance);
1666*e4b17023SJohn Marino     }
1667*e4b17023SJohn Marino }
1668*e4b17023SJohn Marino 
1669*e4b17023SJohn Marino /* Determine whether or not the trip count to ahead ratio is too small based
1670*e4b17023SJohn Marino    on profitability considerations.
1671*e4b17023SJohn Marino    AHEAD: the iteration ahead distance,
1672*e4b17023SJohn Marino    EST_NITER: the estimated trip count.  */
1673*e4b17023SJohn Marino 
1674*e4b17023SJohn Marino static bool
1675*e4b17023SJohn Marino trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1676*e4b17023SJohn Marino {
1677*e4b17023SJohn Marino   /* Assume trip count to ahead ratio is big enough if the trip count could not
1678*e4b17023SJohn Marino      be estimated at compile time.  */
1679*e4b17023SJohn Marino   if (est_niter < 0)
1680*e4b17023SJohn Marino     return false;
1681*e4b17023SJohn Marino 
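  /* For instance (hypothetical values): with AHEAD = 5 and
     TRIP_COUNT_TO_AHEAD_RATIO = 4, a loop estimated to roll fewer than
     20 times is considered too short for the prefetches to pay off.  */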
1682*e4b17023SJohn Marino   if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1683*e4b17023SJohn Marino     {
1684*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1685*e4b17023SJohn Marino 	fprintf (dump_file,
1686*e4b17023SJohn Marino 		 "Not prefetching -- loop estimated to roll only %d times\n",
1687*e4b17023SJohn Marino 		 (int) est_niter);
1688*e4b17023SJohn Marino       return true;
1689*e4b17023SJohn Marino     }
1690*e4b17023SJohn Marino 
1691*e4b17023SJohn Marino   return false;
1692*e4b17023SJohn Marino }
1693*e4b17023SJohn Marino 
1694*e4b17023SJohn Marino /* Determine whether or not the number of memory references in the loop is
1695*e4b17023SJohn Marino    reasonable based on the profitability and compilation time considerations.
1696*e4b17023SJohn Marino    NINSNS: estimated number of instructions in the loop,
1697*e4b17023SJohn Marino    MEM_REF_COUNT: total number of memory references in the loop.  */
1698*e4b17023SJohn Marino 
1699*e4b17023SJohn Marino static bool
1700*e4b17023SJohn Marino mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1701*e4b17023SJohn Marino {
1702*e4b17023SJohn Marino   int insn_to_mem_ratio;
1703*e4b17023SJohn Marino 
1704*e4b17023SJohn Marino   if (mem_ref_count == 0)
1705*e4b17023SJohn Marino     return false;
1706*e4b17023SJohn Marino 
1707*e4b17023SJohn Marino   /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1708*e4b17023SJohn Marino      (compute_all_dependences) have high costs based on quadratic complexity.
1709*e4b17023SJohn Marino      To avoid huge compilation time, we give up prefetching if mem_ref_count
1710*e4b17023SJohn Marino      is too large.  */
1711*e4b17023SJohn Marino   if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1712*e4b17023SJohn Marino     return false;
1713*e4b17023SJohn Marino 
1714*e4b17023SJohn Marino   /* Prefetching improves performance by overlapping cache missing
1715*e4b17023SJohn Marino      memory accesses with CPU operations.  If the loop does not have
1716*e4b17023SJohn Marino      enough CPU operations to overlap with memory operations, prefetching
1717*e4b17023SJohn Marino      won't give a significant benefit.  One approximate way of checking
1718*e4b17023SJohn Marino      this is to require the ratio of instructions to memory references to
1719*e4b17023SJohn Marino      be above a certain limit.  This approximation works well in practice.
1720*e4b17023SJohn Marino      TODO: Implement a more precise computation by estimating the time
1721*e4b17023SJohn Marino      for each CPU or memory op in the loop. Time estimates for memory ops
1722*e4b17023SJohn Marino      should account for cache misses.  */
1723*e4b17023SJohn Marino   insn_to_mem_ratio = ninsns / mem_ref_count;
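  /* A made-up example: a loop body of 40 insns containing 5 memory
     references gives a ratio of 8; whether that is enough depends on the
     target's PREFETCH_MIN_INSN_TO_MEM_RATIO checked below.  */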
1724*e4b17023SJohn Marino 
1725*e4b17023SJohn Marino   if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1726*e4b17023SJohn Marino     {
1727*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1728*e4b17023SJohn Marino         fprintf (dump_file,
1729*e4b17023SJohn Marino 		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1730*e4b17023SJohn Marino 		 insn_to_mem_ratio);
1731*e4b17023SJohn Marino       return false;
1732*e4b17023SJohn Marino     }
1733*e4b17023SJohn Marino 
1734*e4b17023SJohn Marino   return true;
1735*e4b17023SJohn Marino }
1736*e4b17023SJohn Marino 
1737*e4b17023SJohn Marino /* Determine whether or not the instruction to prefetch ratio in the loop is
1738*e4b17023SJohn Marino    too small based on the profitability consideration.
1739*e4b17023SJohn Marino    NINSNS: estimated number of instructions in the loop,
1740*e4b17023SJohn Marino    PREFETCH_COUNT: an estimate of the number of prefetches,
1741*e4b17023SJohn Marino    UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
1742*e4b17023SJohn Marino 
1743*e4b17023SJohn Marino static bool
1744*e4b17023SJohn Marino insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1745*e4b17023SJohn Marino                                      unsigned unroll_factor)
1746*e4b17023SJohn Marino {
1747*e4b17023SJohn Marino   int insn_to_prefetch_ratio;
1748*e4b17023SJohn Marino 
1749*e4b17023SJohn Marino   /* Prefetching most likely causes performance degradation when the instruction
1750*e4b17023SJohn Marino      to prefetch ratio is too small.  Too many prefetch instructions in a loop
1751*e4b17023SJohn Marino      may reduce the I-cache performance.
1752*e4b17023SJohn Marino      (unroll_factor * ninsns) is used to estimate the number of instructions in
1753*e4b17023SJohn Marino      the unrolled loop.  This implementation is a bit simplistic -- the number
1754*e4b17023SJohn Marino      of issued prefetch instructions is also affected by unrolling.  So,
1755*e4b17023SJohn Marino      prefetch_mod and the unroll factor should be taken into account when
1756*e4b17023SJohn Marino      determining prefetch_count.  Also, the number of insns of the unrolled
1757*e4b17023SJohn Marino      loop will usually be significantly smaller than the number of insns of the
1758*e4b17023SJohn Marino      original loop * unroll_factor (at least the induction variable increases
1759*e4b17023SJohn Marino      and the exit branches will get eliminated), so it might be better to use
1760*e4b17023SJohn Marino      tree_estimate_loop_size + estimated_unrolled_size.  */
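  /* As a rough, hypothetical example: NINSNS = 20, UNROLL_FACTOR = 4 and
     PREFETCH_COUNT = 10 give (4 * 20) / 10 = 8, which is rejected only if
     8 is below the target's MIN_INSN_TO_PREFETCH_RATIO.  */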
1761*e4b17023SJohn Marino   insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1762*e4b17023SJohn Marino   if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1763*e4b17023SJohn Marino     {
1764*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1765*e4b17023SJohn Marino         fprintf (dump_file,
1766*e4b17023SJohn Marino 		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1767*e4b17023SJohn Marino 		 insn_to_prefetch_ratio);
1768*e4b17023SJohn Marino       return true;
1769*e4b17023SJohn Marino     }
1770*e4b17023SJohn Marino 
1771*e4b17023SJohn Marino   return false;
1772*e4b17023SJohn Marino }
1773*e4b17023SJohn Marino 
1774*e4b17023SJohn Marino 
1775*e4b17023SJohn Marino /* Issue prefetch instructions for array references in LOOP.  Returns
1776*e4b17023SJohn Marino    true if the LOOP was unrolled.  */
1777*e4b17023SJohn Marino 
1778*e4b17023SJohn Marino static bool
1779*e4b17023SJohn Marino loop_prefetch_arrays (struct loop *loop)
1780*e4b17023SJohn Marino {
1781*e4b17023SJohn Marino   struct mem_ref_group *refs;
1782*e4b17023SJohn Marino   unsigned ahead, ninsns, time, unroll_factor;
1783*e4b17023SJohn Marino   HOST_WIDE_INT est_niter;
1784*e4b17023SJohn Marino   struct tree_niter_desc desc;
1785*e4b17023SJohn Marino   bool unrolled = false, no_other_refs;
1786*e4b17023SJohn Marino   unsigned prefetch_count;
1787*e4b17023SJohn Marino   unsigned mem_ref_count;
1788*e4b17023SJohn Marino 
1789*e4b17023SJohn Marino   if (optimize_loop_nest_for_size_p (loop))
1790*e4b17023SJohn Marino     {
1791*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1792*e4b17023SJohn Marino 	fprintf (dump_file, "  ignored (cold area)\n");
1793*e4b17023SJohn Marino       return false;
1794*e4b17023SJohn Marino     }
1795*e4b17023SJohn Marino 
1796*e4b17023SJohn Marino   /* FIXME: the time should be weighted by the probabilities of the blocks in
1797*e4b17023SJohn Marino      the loop body.  */
1798*e4b17023SJohn Marino   time = tree_num_loop_insns (loop, &eni_time_weights);
1799*e4b17023SJohn Marino   if (time == 0)
1800*e4b17023SJohn Marino     return false;
1801*e4b17023SJohn Marino 
1802*e4b17023SJohn Marino   ahead = (PREFETCH_LATENCY + time - 1) / time;
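  /* E.g. (hypothetical timings): with PREFETCH_LATENCY = 200 and a loop
     body costing TIME = 30, ahead = (200 + 30 - 1) / 30 = 7, i.e. data is
     prefetched roughly seven iterations before it is used.  */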
1803*e4b17023SJohn Marino   est_niter = max_stmt_executions_int (loop, false);
1804*e4b17023SJohn Marino 
1805*e4b17023SJohn Marino   /* Prefetching is not likely to be profitable if the trip count to ahead
1806*e4b17023SJohn Marino      ratio is too small.  */
1807*e4b17023SJohn Marino   if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1808*e4b17023SJohn Marino     return false;
1809*e4b17023SJohn Marino 
1810*e4b17023SJohn Marino   ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1811*e4b17023SJohn Marino 
1812*e4b17023SJohn Marino   /* Step 1: gather the memory references.  */
1813*e4b17023SJohn Marino   refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1814*e4b17023SJohn Marino 
1815*e4b17023SJohn Marino   /* Give up prefetching if the number of memory references in the
1816*e4b17023SJohn Marino      loop is not reasonable based on profitability and compilation time
1817*e4b17023SJohn Marino      considerations.  */
1818*e4b17023SJohn Marino   if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1819*e4b17023SJohn Marino     goto fail;
1820*e4b17023SJohn Marino 
1821*e4b17023SJohn Marino   /* Step 2: estimate the reuse effects.  */
1822*e4b17023SJohn Marino   prune_by_reuse (refs);
1823*e4b17023SJohn Marino 
1824*e4b17023SJohn Marino   if (nothing_to_prefetch_p (refs))
1825*e4b17023SJohn Marino     goto fail;
1826*e4b17023SJohn Marino 
1827*e4b17023SJohn Marino   determine_loop_nest_reuse (loop, refs, no_other_refs);
1828*e4b17023SJohn Marino 
1829*e4b17023SJohn Marino   /* Step 3: determine unroll factor.  */
1830*e4b17023SJohn Marino   unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1831*e4b17023SJohn Marino 					   est_niter);
1832*e4b17023SJohn Marino 
1833*e4b17023SJohn Marino   /* Estimate prefetch count for the unrolled loop.  */
1834*e4b17023SJohn Marino   prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1835*e4b17023SJohn Marino   if (prefetch_count == 0)
1836*e4b17023SJohn Marino     goto fail;
1837*e4b17023SJohn Marino 
1838*e4b17023SJohn Marino   if (dump_file && (dump_flags & TDF_DETAILS))
1839*e4b17023SJohn Marino     fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1840*e4b17023SJohn Marino 	     HOST_WIDE_INT_PRINT_DEC "\n"
1841*e4b17023SJohn Marino 	     "insn count %d, mem ref count %d, prefetch count %d\n",
1842*e4b17023SJohn Marino 	     ahead, unroll_factor, est_niter,
1843*e4b17023SJohn Marino 	     ninsns, mem_ref_count, prefetch_count);
1844*e4b17023SJohn Marino 
1845*e4b17023SJohn Marino   /* Prefetching is not likely to be profitable if the instruction to prefetch
1846*e4b17023SJohn Marino      ratio is too small.  */
1847*e4b17023SJohn Marino   if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1848*e4b17023SJohn Marino 					  unroll_factor))
1849*e4b17023SJohn Marino     goto fail;
1850*e4b17023SJohn Marino 
1851*e4b17023SJohn Marino   mark_nontemporal_stores (loop, refs);
1852*e4b17023SJohn Marino 
1853*e4b17023SJohn Marino   /* Step 4: what to prefetch?  */
1854*e4b17023SJohn Marino   if (!schedule_prefetches (refs, unroll_factor, ahead))
1855*e4b17023SJohn Marino     goto fail;
1856*e4b17023SJohn Marino 
1857*e4b17023SJohn Marino   /* Step 5: unroll the loop.  TODO -- peeling of first and last few
1858*e4b17023SJohn Marino      iterations so that we do not issue superfluous prefetches.  */
1859*e4b17023SJohn Marino   if (unroll_factor != 1)
1860*e4b17023SJohn Marino     {
1861*e4b17023SJohn Marino       tree_unroll_loop (loop, unroll_factor,
1862*e4b17023SJohn Marino 			single_dom_exit (loop), &desc);
1863*e4b17023SJohn Marino       unrolled = true;
1864*e4b17023SJohn Marino     }
1865*e4b17023SJohn Marino 
1866*e4b17023SJohn Marino   /* Step 6: issue the prefetches.  */
1867*e4b17023SJohn Marino   issue_prefetches (refs, unroll_factor, ahead);
1868*e4b17023SJohn Marino 
1869*e4b17023SJohn Marino fail:
1870*e4b17023SJohn Marino   release_mem_refs (refs);
1871*e4b17023SJohn Marino   return unrolled;
1872*e4b17023SJohn Marino }
1873*e4b17023SJohn Marino 
1874*e4b17023SJohn Marino /* Issue prefetch instructions for array references in loops.  */
1875*e4b17023SJohn Marino 
1876*e4b17023SJohn Marino unsigned int
1877*e4b17023SJohn Marino tree_ssa_prefetch_arrays (void)
1878*e4b17023SJohn Marino {
1879*e4b17023SJohn Marino   loop_iterator li;
1880*e4b17023SJohn Marino   struct loop *loop;
1881*e4b17023SJohn Marino   bool unrolled = false;
1882*e4b17023SJohn Marino   int todo_flags = 0;
1883*e4b17023SJohn Marino 
1884*e4b17023SJohn Marino   if (!HAVE_prefetch
1885*e4b17023SJohn Marino       /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1886*e4b17023SJohn Marino 	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1887*e4b17023SJohn Marino 	 of the processor costs and i486 does not have prefetch, but
1888*e4b17023SJohn Marino 	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
1889*e4b17023SJohn Marino       || PREFETCH_BLOCK == 0)
1890*e4b17023SJohn Marino     return 0;
1891*e4b17023SJohn Marino 
1892*e4b17023SJohn Marino   if (dump_file && (dump_flags & TDF_DETAILS))
1893*e4b17023SJohn Marino     {
1894*e4b17023SJohn Marino       fprintf (dump_file, "Prefetching parameters:\n");
1895*e4b17023SJohn Marino       fprintf (dump_file, "    simultaneous prefetches: %d\n",
1896*e4b17023SJohn Marino 	       SIMULTANEOUS_PREFETCHES);
1897*e4b17023SJohn Marino       fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
1898*e4b17023SJohn Marino       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
1899*e4b17023SJohn Marino       fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
1900*e4b17023SJohn Marino 	       L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1901*e4b17023SJohn Marino       fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1902*e4b17023SJohn Marino       fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
1903*e4b17023SJohn Marino       fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
1904*e4b17023SJohn Marino 	       MIN_INSN_TO_PREFETCH_RATIO);
1905*e4b17023SJohn Marino       fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
1906*e4b17023SJohn Marino 	       PREFETCH_MIN_INSN_TO_MEM_RATIO);
1907*e4b17023SJohn Marino       fprintf (dump_file, "\n");
1908*e4b17023SJohn Marino     }
1909*e4b17023SJohn Marino 
1910*e4b17023SJohn Marino   initialize_original_copy_tables ();
1911*e4b17023SJohn Marino 
1912*e4b17023SJohn Marino   if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1913*e4b17023SJohn Marino     {
1914*e4b17023SJohn Marino       tree type = build_function_type_list (void_type_node,
1915*e4b17023SJohn Marino 					    const_ptr_type_node, NULL_TREE);
1916*e4b17023SJohn Marino       tree decl = add_builtin_function ("__builtin_prefetch", type,
1917*e4b17023SJohn Marino 					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1918*e4b17023SJohn Marino 					NULL, NULL_TREE);
1919*e4b17023SJohn Marino       DECL_IS_NOVOPS (decl) = true;
1920*e4b17023SJohn Marino       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1921*e4b17023SJohn Marino     }
1922*e4b17023SJohn Marino 
1923*e4b17023SJohn Marino   /* We assume that size of cache line is a power of two, so verify this
1924*e4b17023SJohn Marino      here.  */
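  /* For example, PREFETCH_BLOCK = 64 passes since 64 & 63 == 0, whereas a
     hypothetical non-power-of-two value such as 48 would fail because
     48 & 47 == 32.  */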
1925*e4b17023SJohn Marino   gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1926*e4b17023SJohn Marino 
1927*e4b17023SJohn Marino   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
1928*e4b17023SJohn Marino     {
1929*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1930*e4b17023SJohn Marino 	fprintf (dump_file, "Processing loop %d:\n", loop->num);
1931*e4b17023SJohn Marino 
1932*e4b17023SJohn Marino       unrolled |= loop_prefetch_arrays (loop);
1933*e4b17023SJohn Marino 
1934*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1935*e4b17023SJohn Marino 	fprintf (dump_file, "\n\n");
1936*e4b17023SJohn Marino     }
1937*e4b17023SJohn Marino 
1938*e4b17023SJohn Marino   if (unrolled)
1939*e4b17023SJohn Marino     {
1940*e4b17023SJohn Marino       scev_reset ();
1941*e4b17023SJohn Marino       todo_flags |= TODO_cleanup_cfg;
1942*e4b17023SJohn Marino     }
1943*e4b17023SJohn Marino 
1944*e4b17023SJohn Marino   free_original_copy_tables ();
1945*e4b17023SJohn Marino   return todo_flags;
1946*e4b17023SJohn Marino }