1*e4b17023SJohn Marino /* Array prefetching.
2*e4b17023SJohn Marino Copyright (C) 2005, 2007, 2008, 2009, 2010, 2011
3*e4b17023SJohn Marino Free Software Foundation, Inc.
4*e4b17023SJohn Marino
5*e4b17023SJohn Marino This file is part of GCC.
6*e4b17023SJohn Marino
7*e4b17023SJohn Marino GCC is free software; you can redistribute it and/or modify it
8*e4b17023SJohn Marino under the terms of the GNU General Public License as published by the
9*e4b17023SJohn Marino Free Software Foundation; either version 3, or (at your option) any
10*e4b17023SJohn Marino later version.
11*e4b17023SJohn Marino
12*e4b17023SJohn Marino GCC is distributed in the hope that it will be useful, but WITHOUT
13*e4b17023SJohn Marino ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14*e4b17023SJohn Marino FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15*e4b17023SJohn Marino for more details.
16*e4b17023SJohn Marino
17*e4b17023SJohn Marino You should have received a copy of the GNU General Public License
18*e4b17023SJohn Marino along with GCC; see the file COPYING3. If not see
19*e4b17023SJohn Marino <http://www.gnu.org/licenses/>. */
20*e4b17023SJohn Marino
21*e4b17023SJohn Marino #include "config.h"
22*e4b17023SJohn Marino #include "system.h"
23*e4b17023SJohn Marino #include "coretypes.h"
24*e4b17023SJohn Marino #include "tm.h"
25*e4b17023SJohn Marino #include "tree.h"
26*e4b17023SJohn Marino #include "tm_p.h"
27*e4b17023SJohn Marino #include "basic-block.h"
28*e4b17023SJohn Marino #include "output.h"
29*e4b17023SJohn Marino #include "tree-pretty-print.h"
30*e4b17023SJohn Marino #include "tree-flow.h"
31*e4b17023SJohn Marino #include "tree-dump.h"
32*e4b17023SJohn Marino #include "timevar.h"
33*e4b17023SJohn Marino #include "cfgloop.h"
34*e4b17023SJohn Marino #include "tree-pass.h"
35*e4b17023SJohn Marino #include "insn-config.h"
36*e4b17023SJohn Marino #include "recog.h"
37*e4b17023SJohn Marino #include "hashtab.h"
38*e4b17023SJohn Marino #include "tree-chrec.h"
39*e4b17023SJohn Marino #include "tree-scalar-evolution.h"
40*e4b17023SJohn Marino #include "diagnostic-core.h"
41*e4b17023SJohn Marino #include "params.h"
42*e4b17023SJohn Marino #include "langhooks.h"
43*e4b17023SJohn Marino #include "tree-inline.h"
44*e4b17023SJohn Marino #include "tree-data-ref.h"
45*e4b17023SJohn Marino
46*e4b17023SJohn Marino
47*e4b17023SJohn Marino /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
48*e4b17023SJohn Marino between the GIMPLE and RTL worlds. */
49*e4b17023SJohn Marino #include "expr.h"
50*e4b17023SJohn Marino #include "optabs.h"
51*e4b17023SJohn Marino
52*e4b17023SJohn Marino /* This pass inserts prefetch instructions to optimize cache usage during
53*e4b17023SJohn Marino accesses to arrays in loops. It processes loops sequentially and:
54*e4b17023SJohn Marino
55*e4b17023SJohn Marino 1) Gathers all memory references in the single loop.
56*e4b17023SJohn Marino 2) For each of the references it decides when it is profitable to prefetch
57*e4b17023SJohn Marino it. To do it, we evaluate the reuse among the accesses, and determines
58*e4b17023SJohn Marino two values: PREFETCH_BEFORE (meaning that it only makes sense to do
59*e4b17023SJohn Marino prefetching in the first PREFETCH_BEFORE iterations of the loop) and
60*e4b17023SJohn Marino PREFETCH_MOD (meaning that it only makes sense to prefetch in the
61*e4b17023SJohn Marino iterations of the loop that are zero modulo PREFETCH_MOD). For example
62*e4b17023SJohn Marino (assuming cache line size is 64 bytes, char has size 1 byte and there
63*e4b17023SJohn Marino is no hardware sequential prefetch):
64*e4b17023SJohn Marino
65*e4b17023SJohn Marino char *a;
66*e4b17023SJohn Marino for (i = 0; i < max; i++)
67*e4b17023SJohn Marino {
68*e4b17023SJohn Marino a[255] = ...; (0)
69*e4b17023SJohn Marino a[i] = ...; (1)
70*e4b17023SJohn Marino a[i + 64] = ...; (2)
71*e4b17023SJohn Marino a[16*i] = ...; (3)
72*e4b17023SJohn Marino a[187*i] = ...; (4)
73*e4b17023SJohn Marino a[187*i + 50] = ...; (5)
74*e4b17023SJohn Marino }
75*e4b17023SJohn Marino
76*e4b17023SJohn Marino (0) obviously has PREFETCH_BEFORE 1
77*e4b17023SJohn Marino (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
78*e4b17023SJohn Marino location 64 iterations before it, and PREFETCH_MOD 64 (since
79*e4b17023SJohn Marino it hits the same cache line otherwise).
80*e4b17023SJohn Marino (2) has PREFETCH_MOD 64
81*e4b17023SJohn Marino (3) has PREFETCH_MOD 4
82*e4b17023SJohn Marino (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
83*e4b17023SJohn Marino the cache line accessed by (5) is the same with probability only
84*e4b17023SJohn Marino 7/32.
85*e4b17023SJohn Marino (5) has PREFETCH_MOD 1 as well.
86*e4b17023SJohn Marino
87*e4b17023SJohn Marino Additionally, we use data dependence analysis to determine for each
88*e4b17023SJohn Marino reference the distance till the first reuse; this information is used
89*e4b17023SJohn Marino to determine the temporality of the issued prefetch instruction.
90*e4b17023SJohn Marino
91*e4b17023SJohn Marino 3) We determine how much ahead we need to prefetch. The number of
92*e4b17023SJohn Marino iterations needed is time to fetch / time spent in one iteration of
93*e4b17023SJohn Marino the loop. The problem is that we do not know either of these values,
94*e4b17023SJohn Marino so we just make a heuristic guess based on a magic (possibly)
95*e4b17023SJohn Marino target-specific constant and size of the loop.
96*e4b17023SJohn Marino
97*e4b17023SJohn Marino 4) Determine which of the references we prefetch. We take into account
98*e4b17023SJohn Marino that there is a maximum number of simultaneous prefetches (provided
99*e4b17023SJohn Marino by machine description). We prefetch as many prefetches as possible
100*e4b17023SJohn Marino while still within this bound (starting with those with lowest
101*e4b17023SJohn Marino prefetch_mod, since they are responsible for most of the cache
102*e4b17023SJohn Marino misses).
103*e4b17023SJohn Marino
104*e4b17023SJohn Marino 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
105*e4b17023SJohn Marino and PREFETCH_BEFORE requirements (within some bounds), and to avoid
106*e4b17023SJohn Marino prefetching nonaccessed memory.
107*e4b17023SJohn Marino TODO -- actually implement peeling.
108*e4b17023SJohn Marino
109*e4b17023SJohn Marino 6) We actually emit the prefetch instructions. ??? Perhaps emit the
110*e4b17023SJohn Marino prefetch instructions with guards in cases where 5) was not sufficient
111*e4b17023SJohn Marino to satisfy the constraints?
112*e4b17023SJohn Marino
113*e4b17023SJohn Marino A cost model is implemented to determine whether or not prefetching is
114*e4b17023SJohn Marino profitable for a given loop. The cost model has three heuristics:
115*e4b17023SJohn Marino
116*e4b17023SJohn Marino 1. Function trip_count_to_ahead_ratio_too_small_p implements a
117*e4b17023SJohn Marino heuristic that determines whether or not the loop has too few
118*e4b17023SJohn Marino iterations (compared to ahead). Prefetching is not likely to be
119*e4b17023SJohn Marino beneficial if the trip count to ahead ratio is below a certain
120*e4b17023SJohn Marino minimum.
121*e4b17023SJohn Marino
122*e4b17023SJohn Marino 2. Function mem_ref_count_reasonable_p implements a heuristic that
123*e4b17023SJohn Marino determines whether the given loop has enough CPU ops that can be
124*e4b17023SJohn Marino overlapped with cache missing memory ops. If not, the loop
125*e4b17023SJohn Marino won't benefit from prefetching. In the implementation,
126*e4b17023SJohn Marino prefetching is not considered beneficial if the ratio between
127*e4b17023SJohn Marino the instruction count and the mem ref count is below a certain
128*e4b17023SJohn Marino minimum.
129*e4b17023SJohn Marino
130*e4b17023SJohn Marino 3. Function insn_to_prefetch_ratio_too_small_p implements a
131*e4b17023SJohn Marino heuristic that disables prefetching in a loop if the prefetching
132*e4b17023SJohn Marino cost is above a certain limit. The relative prefetching cost is
133*e4b17023SJohn Marino estimated by taking the ratio between the prefetch count and the
134*e4b17023SJohn Marino total intruction count (this models the I-cache cost).
135*e4b17023SJohn Marino
136*e4b17023SJohn Marino The limits used in these heuristics are defined as parameters with
137*e4b17023SJohn Marino reasonable default values. Machine-specific default values will be
138*e4b17023SJohn Marino added later.
139*e4b17023SJohn Marino
140*e4b17023SJohn Marino Some other TODO:
141*e4b17023SJohn Marino -- write and use more general reuse analysis (that could be also used
142*e4b17023SJohn Marino in other cache aimed loop optimizations)
143*e4b17023SJohn Marino -- make it behave sanely together with the prefetches given by user
144*e4b17023SJohn Marino (now we just ignore them; at the very least we should avoid
145*e4b17023SJohn Marino optimizing loops in that user put his own prefetches)
146*e4b17023SJohn Marino -- we assume cache line size alignment of arrays; this could be
147*e4b17023SJohn Marino improved. */
148*e4b17023SJohn Marino
149*e4b17023SJohn Marino /* Magic constants follow. These should be replaced by machine specific
150*e4b17023SJohn Marino numbers. */
151*e4b17023SJohn Marino
152*e4b17023SJohn Marino /* True if write can be prefetched by a read prefetch. */
153*e4b17023SJohn Marino
154*e4b17023SJohn Marino #ifndef WRITE_CAN_USE_READ_PREFETCH
155*e4b17023SJohn Marino #define WRITE_CAN_USE_READ_PREFETCH 1
156*e4b17023SJohn Marino #endif
157*e4b17023SJohn Marino
158*e4b17023SJohn Marino /* True if read can be prefetched by a write prefetch. */
159*e4b17023SJohn Marino
160*e4b17023SJohn Marino #ifndef READ_CAN_USE_WRITE_PREFETCH
161*e4b17023SJohn Marino #define READ_CAN_USE_WRITE_PREFETCH 0
162*e4b17023SJohn Marino #endif
163*e4b17023SJohn Marino
164*e4b17023SJohn Marino /* The size of the block loaded by a single prefetch. Usually, this is
165*e4b17023SJohn Marino the same as cache line size (at the moment, we only consider one level
166*e4b17023SJohn Marino of cache hierarchy). */
167*e4b17023SJohn Marino
168*e4b17023SJohn Marino #ifndef PREFETCH_BLOCK
169*e4b17023SJohn Marino #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
170*e4b17023SJohn Marino #endif
171*e4b17023SJohn Marino
172*e4b17023SJohn Marino /* Do we have a forward hardware sequential prefetching? */
173*e4b17023SJohn Marino
174*e4b17023SJohn Marino #ifndef HAVE_FORWARD_PREFETCH
175*e4b17023SJohn Marino #define HAVE_FORWARD_PREFETCH 0
176*e4b17023SJohn Marino #endif
177*e4b17023SJohn Marino
178*e4b17023SJohn Marino /* Do we have a backward hardware sequential prefetching? */
179*e4b17023SJohn Marino
180*e4b17023SJohn Marino #ifndef HAVE_BACKWARD_PREFETCH
181*e4b17023SJohn Marino #define HAVE_BACKWARD_PREFETCH 0
182*e4b17023SJohn Marino #endif
183*e4b17023SJohn Marino
184*e4b17023SJohn Marino /* In some cases we are only able to determine that there is a certain
185*e4b17023SJohn Marino probability that the two accesses hit the same cache line. In this
186*e4b17023SJohn Marino case, we issue the prefetches for both of them if this probability
187*e4b17023SJohn Marino is less then (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
188*e4b17023SJohn Marino
189*e4b17023SJohn Marino #ifndef ACCEPTABLE_MISS_RATE
190*e4b17023SJohn Marino #define ACCEPTABLE_MISS_RATE 50
191*e4b17023SJohn Marino #endif
192*e4b17023SJohn Marino
193*e4b17023SJohn Marino #ifndef HAVE_prefetch
194*e4b17023SJohn Marino #define HAVE_prefetch 0
195*e4b17023SJohn Marino #endif
196*e4b17023SJohn Marino
197*e4b17023SJohn Marino #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
198*e4b17023SJohn Marino #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
199*e4b17023SJohn Marino
200*e4b17023SJohn Marino /* We consider a memory access nontemporal if it is not reused sooner than
201*e4b17023SJohn Marino after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
202*e4b17023SJohn Marino accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
203*e4b17023SJohn Marino so that we use nontemporal prefetches e.g. if single memory location
204*e4b17023SJohn Marino is accessed several times in a single iteration of the loop. */
205*e4b17023SJohn Marino #define NONTEMPORAL_FRACTION 16
206*e4b17023SJohn Marino
207*e4b17023SJohn Marino /* In case we have to emit a memory fence instruction after the loop that
208*e4b17023SJohn Marino uses nontemporal stores, this defines the builtin to use. */
209*e4b17023SJohn Marino
210*e4b17023SJohn Marino #ifndef FENCE_FOLLOWING_MOVNT
211*e4b17023SJohn Marino #define FENCE_FOLLOWING_MOVNT NULL_TREE
212*e4b17023SJohn Marino #endif
213*e4b17023SJohn Marino
214*e4b17023SJohn Marino /* It is not profitable to prefetch when the trip count is not at
215*e4b17023SJohn Marino least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
216*e4b17023SJohn Marino For example, in a loop with a prefetch ahead distance of 10,
217*e4b17023SJohn Marino supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
218*e4b17023SJohn Marino profitable to prefetch when the trip count is greater or equal to
219*e4b17023SJohn Marino 40. In that case, 30 out of the 40 iterations will benefit from
220*e4b17023SJohn Marino prefetching. */
221*e4b17023SJohn Marino
222*e4b17023SJohn Marino #ifndef TRIP_COUNT_TO_AHEAD_RATIO
223*e4b17023SJohn Marino #define TRIP_COUNT_TO_AHEAD_RATIO 4
224*e4b17023SJohn Marino #endif
225*e4b17023SJohn Marino
226*e4b17023SJohn Marino /* The group of references between that reuse may occur. */
227*e4b17023SJohn Marino
228*e4b17023SJohn Marino struct mem_ref_group
229*e4b17023SJohn Marino {
230*e4b17023SJohn Marino tree base; /* Base of the reference. */
231*e4b17023SJohn Marino tree step; /* Step of the reference. */
232*e4b17023SJohn Marino struct mem_ref *refs; /* References in the group. */
233*e4b17023SJohn Marino struct mem_ref_group *next; /* Next group of references. */
234*e4b17023SJohn Marino };
235*e4b17023SJohn Marino
236*e4b17023SJohn Marino /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
237*e4b17023SJohn Marino
238*e4b17023SJohn Marino #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
239*e4b17023SJohn Marino
240*e4b17023SJohn Marino /* Do not generate a prefetch if the unroll factor is significantly less
241*e4b17023SJohn Marino than what is required by the prefetch. This is to avoid redundant
242*e4b17023SJohn Marino prefetches. For example, when prefetch_mod is 16 and unroll_factor is
243*e4b17023SJohn Marino 2, prefetching requires unrolling the loop 16 times, but
244*e4b17023SJohn Marino the loop is actually unrolled twice. In this case (ratio = 8),
245*e4b17023SJohn Marino prefetching is not likely to be beneficial. */
246*e4b17023SJohn Marino
247*e4b17023SJohn Marino #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
248*e4b17023SJohn Marino #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
249*e4b17023SJohn Marino #endif
250*e4b17023SJohn Marino
251*e4b17023SJohn Marino /* Some of the prefetch computations have quadratic complexity. We want to
252*e4b17023SJohn Marino avoid huge compile times and, therefore, want to limit the amount of
253*e4b17023SJohn Marino memory references per loop where we consider prefetching. */
254*e4b17023SJohn Marino
255*e4b17023SJohn Marino #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
256*e4b17023SJohn Marino #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
257*e4b17023SJohn Marino #endif
258*e4b17023SJohn Marino
259*e4b17023SJohn Marino /* The memory reference. */
260*e4b17023SJohn Marino
261*e4b17023SJohn Marino struct mem_ref
262*e4b17023SJohn Marino {
263*e4b17023SJohn Marino gimple stmt; /* Statement in that the reference appears. */
264*e4b17023SJohn Marino tree mem; /* The reference. */
265*e4b17023SJohn Marino HOST_WIDE_INT delta; /* Constant offset of the reference. */
266*e4b17023SJohn Marino struct mem_ref_group *group; /* The group of references it belongs to. */
267*e4b17023SJohn Marino unsigned HOST_WIDE_INT prefetch_mod;
268*e4b17023SJohn Marino /* Prefetch only each PREFETCH_MOD-th
269*e4b17023SJohn Marino iteration. */
270*e4b17023SJohn Marino unsigned HOST_WIDE_INT prefetch_before;
271*e4b17023SJohn Marino /* Prefetch only first PREFETCH_BEFORE
272*e4b17023SJohn Marino iterations. */
273*e4b17023SJohn Marino unsigned reuse_distance; /* The amount of data accessed before the first
274*e4b17023SJohn Marino reuse of this value. */
275*e4b17023SJohn Marino struct mem_ref *next; /* The next reference in the group. */
276*e4b17023SJohn Marino unsigned write_p : 1; /* Is it a write? */
277*e4b17023SJohn Marino unsigned independent_p : 1; /* True if the reference is independent on
278*e4b17023SJohn Marino all other references inside the loop. */
279*e4b17023SJohn Marino unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
280*e4b17023SJohn Marino unsigned storent_p : 1; /* True if we changed the store to a
281*e4b17023SJohn Marino nontemporal one. */
282*e4b17023SJohn Marino };
283*e4b17023SJohn Marino
284*e4b17023SJohn Marino /* Dumps information about reference REF to FILE. */
285*e4b17023SJohn Marino
286*e4b17023SJohn Marino static void
dump_mem_ref(FILE * file,struct mem_ref * ref)287*e4b17023SJohn Marino dump_mem_ref (FILE *file, struct mem_ref *ref)
288*e4b17023SJohn Marino {
289*e4b17023SJohn Marino fprintf (file, "Reference %p:\n", (void *) ref);
290*e4b17023SJohn Marino
291*e4b17023SJohn Marino fprintf (file, " group %p (base ", (void *) ref->group);
292*e4b17023SJohn Marino print_generic_expr (file, ref->group->base, TDF_SLIM);
293*e4b17023SJohn Marino fprintf (file, ", step ");
294*e4b17023SJohn Marino if (cst_and_fits_in_hwi (ref->group->step))
295*e4b17023SJohn Marino fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (ref->group->step));
296*e4b17023SJohn Marino else
297*e4b17023SJohn Marino print_generic_expr (file, ref->group->step, TDF_TREE);
298*e4b17023SJohn Marino fprintf (file, ")\n");
299*e4b17023SJohn Marino
300*e4b17023SJohn Marino fprintf (file, " delta ");
301*e4b17023SJohn Marino fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
302*e4b17023SJohn Marino fprintf (file, "\n");
303*e4b17023SJohn Marino
304*e4b17023SJohn Marino fprintf (file, " %s\n", ref->write_p ? "write" : "read");
305*e4b17023SJohn Marino
306*e4b17023SJohn Marino fprintf (file, "\n");
307*e4b17023SJohn Marino }
308*e4b17023SJohn Marino
309*e4b17023SJohn Marino /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
310*e4b17023SJohn Marino exist. */
311*e4b17023SJohn Marino
312*e4b17023SJohn Marino static struct mem_ref_group *
find_or_create_group(struct mem_ref_group ** groups,tree base,tree step)313*e4b17023SJohn Marino find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
314*e4b17023SJohn Marino {
315*e4b17023SJohn Marino struct mem_ref_group *group;
316*e4b17023SJohn Marino
317*e4b17023SJohn Marino for (; *groups; groups = &(*groups)->next)
318*e4b17023SJohn Marino {
319*e4b17023SJohn Marino if (operand_equal_p ((*groups)->step, step, 0)
320*e4b17023SJohn Marino && operand_equal_p ((*groups)->base, base, 0))
321*e4b17023SJohn Marino return *groups;
322*e4b17023SJohn Marino
323*e4b17023SJohn Marino /* If step is an integer constant, keep the list of groups sorted
324*e4b17023SJohn Marino by decreasing step. */
325*e4b17023SJohn Marino if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
326*e4b17023SJohn Marino && int_cst_value ((*groups)->step) < int_cst_value (step))
327*e4b17023SJohn Marino break;
328*e4b17023SJohn Marino }
329*e4b17023SJohn Marino
330*e4b17023SJohn Marino group = XNEW (struct mem_ref_group);
331*e4b17023SJohn Marino group->base = base;
332*e4b17023SJohn Marino group->step = step;
333*e4b17023SJohn Marino group->refs = NULL;
334*e4b17023SJohn Marino group->next = *groups;
335*e4b17023SJohn Marino *groups = group;
336*e4b17023SJohn Marino
337*e4b17023SJohn Marino return group;
338*e4b17023SJohn Marino }
339*e4b17023SJohn Marino
340*e4b17023SJohn Marino /* Records a memory reference MEM in GROUP with offset DELTA and write status
341*e4b17023SJohn Marino WRITE_P. The reference occurs in statement STMT. */
342*e4b17023SJohn Marino
343*e4b17023SJohn Marino static void
record_ref(struct mem_ref_group * group,gimple stmt,tree mem,HOST_WIDE_INT delta,bool write_p)344*e4b17023SJohn Marino record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
345*e4b17023SJohn Marino HOST_WIDE_INT delta, bool write_p)
346*e4b17023SJohn Marino {
347*e4b17023SJohn Marino struct mem_ref **aref;
348*e4b17023SJohn Marino
349*e4b17023SJohn Marino /* Do not record the same address twice. */
350*e4b17023SJohn Marino for (aref = &group->refs; *aref; aref = &(*aref)->next)
351*e4b17023SJohn Marino {
352*e4b17023SJohn Marino /* It does not have to be possible for write reference to reuse the read
353*e4b17023SJohn Marino prefetch, or vice versa. */
354*e4b17023SJohn Marino if (!WRITE_CAN_USE_READ_PREFETCH
355*e4b17023SJohn Marino && write_p
356*e4b17023SJohn Marino && !(*aref)->write_p)
357*e4b17023SJohn Marino continue;
358*e4b17023SJohn Marino if (!READ_CAN_USE_WRITE_PREFETCH
359*e4b17023SJohn Marino && !write_p
360*e4b17023SJohn Marino && (*aref)->write_p)
361*e4b17023SJohn Marino continue;
362*e4b17023SJohn Marino
363*e4b17023SJohn Marino if ((*aref)->delta == delta)
364*e4b17023SJohn Marino return;
365*e4b17023SJohn Marino }
366*e4b17023SJohn Marino
367*e4b17023SJohn Marino (*aref) = XNEW (struct mem_ref);
368*e4b17023SJohn Marino (*aref)->stmt = stmt;
369*e4b17023SJohn Marino (*aref)->mem = mem;
370*e4b17023SJohn Marino (*aref)->delta = delta;
371*e4b17023SJohn Marino (*aref)->write_p = write_p;
372*e4b17023SJohn Marino (*aref)->prefetch_before = PREFETCH_ALL;
373*e4b17023SJohn Marino (*aref)->prefetch_mod = 1;
374*e4b17023SJohn Marino (*aref)->reuse_distance = 0;
375*e4b17023SJohn Marino (*aref)->issue_prefetch_p = false;
376*e4b17023SJohn Marino (*aref)->group = group;
377*e4b17023SJohn Marino (*aref)->next = NULL;
378*e4b17023SJohn Marino (*aref)->independent_p = false;
379*e4b17023SJohn Marino (*aref)->storent_p = false;
380*e4b17023SJohn Marino
381*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
382*e4b17023SJohn Marino dump_mem_ref (dump_file, *aref);
383*e4b17023SJohn Marino }
384*e4b17023SJohn Marino
385*e4b17023SJohn Marino /* Release memory references in GROUPS. */
386*e4b17023SJohn Marino
387*e4b17023SJohn Marino static void
release_mem_refs(struct mem_ref_group * groups)388*e4b17023SJohn Marino release_mem_refs (struct mem_ref_group *groups)
389*e4b17023SJohn Marino {
390*e4b17023SJohn Marino struct mem_ref_group *next_g;
391*e4b17023SJohn Marino struct mem_ref *ref, *next_r;
392*e4b17023SJohn Marino
393*e4b17023SJohn Marino for (; groups; groups = next_g)
394*e4b17023SJohn Marino {
395*e4b17023SJohn Marino next_g = groups->next;
396*e4b17023SJohn Marino for (ref = groups->refs; ref; ref = next_r)
397*e4b17023SJohn Marino {
398*e4b17023SJohn Marino next_r = ref->next;
399*e4b17023SJohn Marino free (ref);
400*e4b17023SJohn Marino }
401*e4b17023SJohn Marino free (groups);
402*e4b17023SJohn Marino }
403*e4b17023SJohn Marino }
404*e4b17023SJohn Marino
405*e4b17023SJohn Marino /* A structure used to pass arguments to idx_analyze_ref. */
406*e4b17023SJohn Marino
407*e4b17023SJohn Marino struct ar_data
408*e4b17023SJohn Marino {
409*e4b17023SJohn Marino struct loop *loop; /* Loop of the reference. */
410*e4b17023SJohn Marino gimple stmt; /* Statement of the reference. */
411*e4b17023SJohn Marino tree *step; /* Step of the memory reference. */
412*e4b17023SJohn Marino HOST_WIDE_INT *delta; /* Offset of the memory reference. */
413*e4b17023SJohn Marino };
414*e4b17023SJohn Marino
415*e4b17023SJohn Marino /* Analyzes a single INDEX of a memory reference to obtain information
416*e4b17023SJohn Marino described at analyze_ref. Callback for for_each_index. */
417*e4b17023SJohn Marino
418*e4b17023SJohn Marino static bool
idx_analyze_ref(tree base,tree * index,void * data)419*e4b17023SJohn Marino idx_analyze_ref (tree base, tree *index, void *data)
420*e4b17023SJohn Marino {
421*e4b17023SJohn Marino struct ar_data *ar_data = (struct ar_data *) data;
422*e4b17023SJohn Marino tree ibase, step, stepsize;
423*e4b17023SJohn Marino HOST_WIDE_INT idelta = 0, imult = 1;
424*e4b17023SJohn Marino affine_iv iv;
425*e4b17023SJohn Marino
426*e4b17023SJohn Marino if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
427*e4b17023SJohn Marino *index, &iv, true))
428*e4b17023SJohn Marino return false;
429*e4b17023SJohn Marino ibase = iv.base;
430*e4b17023SJohn Marino step = iv.step;
431*e4b17023SJohn Marino
432*e4b17023SJohn Marino if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
433*e4b17023SJohn Marino && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
434*e4b17023SJohn Marino {
435*e4b17023SJohn Marino idelta = int_cst_value (TREE_OPERAND (ibase, 1));
436*e4b17023SJohn Marino ibase = TREE_OPERAND (ibase, 0);
437*e4b17023SJohn Marino }
438*e4b17023SJohn Marino if (cst_and_fits_in_hwi (ibase))
439*e4b17023SJohn Marino {
440*e4b17023SJohn Marino idelta += int_cst_value (ibase);
441*e4b17023SJohn Marino ibase = build_int_cst (TREE_TYPE (ibase), 0);
442*e4b17023SJohn Marino }
443*e4b17023SJohn Marino
444*e4b17023SJohn Marino if (TREE_CODE (base) == ARRAY_REF)
445*e4b17023SJohn Marino {
446*e4b17023SJohn Marino stepsize = array_ref_element_size (base);
447*e4b17023SJohn Marino if (!cst_and_fits_in_hwi (stepsize))
448*e4b17023SJohn Marino return false;
449*e4b17023SJohn Marino imult = int_cst_value (stepsize);
450*e4b17023SJohn Marino step = fold_build2 (MULT_EXPR, sizetype,
451*e4b17023SJohn Marino fold_convert (sizetype, step),
452*e4b17023SJohn Marino fold_convert (sizetype, stepsize));
453*e4b17023SJohn Marino idelta *= imult;
454*e4b17023SJohn Marino }
455*e4b17023SJohn Marino
456*e4b17023SJohn Marino if (*ar_data->step == NULL_TREE)
457*e4b17023SJohn Marino *ar_data->step = step;
458*e4b17023SJohn Marino else
459*e4b17023SJohn Marino *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
460*e4b17023SJohn Marino fold_convert (sizetype, *ar_data->step),
461*e4b17023SJohn Marino fold_convert (sizetype, step));
462*e4b17023SJohn Marino *ar_data->delta += idelta;
463*e4b17023SJohn Marino *index = ibase;
464*e4b17023SJohn Marino
465*e4b17023SJohn Marino return true;
466*e4b17023SJohn Marino }
467*e4b17023SJohn Marino
468*e4b17023SJohn Marino /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
469*e4b17023SJohn Marino STEP are integer constants and iter is number of iterations of LOOP. The
470*e4b17023SJohn Marino reference occurs in statement STMT. Strips nonaddressable component
471*e4b17023SJohn Marino references from REF_P. */
472*e4b17023SJohn Marino
473*e4b17023SJohn Marino static bool
analyze_ref(struct loop * loop,tree * ref_p,tree * base,tree * step,HOST_WIDE_INT * delta,gimple stmt)474*e4b17023SJohn Marino analyze_ref (struct loop *loop, tree *ref_p, tree *base,
475*e4b17023SJohn Marino tree *step, HOST_WIDE_INT *delta,
476*e4b17023SJohn Marino gimple stmt)
477*e4b17023SJohn Marino {
478*e4b17023SJohn Marino struct ar_data ar_data;
479*e4b17023SJohn Marino tree off;
480*e4b17023SJohn Marino HOST_WIDE_INT bit_offset;
481*e4b17023SJohn Marino tree ref = *ref_p;
482*e4b17023SJohn Marino
483*e4b17023SJohn Marino *step = NULL_TREE;
484*e4b17023SJohn Marino *delta = 0;
485*e4b17023SJohn Marino
486*e4b17023SJohn Marino /* First strip off the component references. Ignore bitfields.
487*e4b17023SJohn Marino Also strip off the real and imagine parts of a complex, so that
488*e4b17023SJohn Marino they can have the same base. */
489*e4b17023SJohn Marino if (TREE_CODE (ref) == REALPART_EXPR
490*e4b17023SJohn Marino || TREE_CODE (ref) == IMAGPART_EXPR
491*e4b17023SJohn Marino || (TREE_CODE (ref) == COMPONENT_REF
492*e4b17023SJohn Marino && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
493*e4b17023SJohn Marino {
494*e4b17023SJohn Marino if (TREE_CODE (ref) == IMAGPART_EXPR)
495*e4b17023SJohn Marino *delta += int_size_in_bytes (TREE_TYPE (ref));
496*e4b17023SJohn Marino ref = TREE_OPERAND (ref, 0);
497*e4b17023SJohn Marino }
498*e4b17023SJohn Marino
499*e4b17023SJohn Marino *ref_p = ref;
500*e4b17023SJohn Marino
501*e4b17023SJohn Marino for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
502*e4b17023SJohn Marino {
503*e4b17023SJohn Marino off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
504*e4b17023SJohn Marino bit_offset = TREE_INT_CST_LOW (off);
505*e4b17023SJohn Marino gcc_assert (bit_offset % BITS_PER_UNIT == 0);
506*e4b17023SJohn Marino
507*e4b17023SJohn Marino *delta += bit_offset / BITS_PER_UNIT;
508*e4b17023SJohn Marino }
509*e4b17023SJohn Marino
510*e4b17023SJohn Marino *base = unshare_expr (ref);
511*e4b17023SJohn Marino ar_data.loop = loop;
512*e4b17023SJohn Marino ar_data.stmt = stmt;
513*e4b17023SJohn Marino ar_data.step = step;
514*e4b17023SJohn Marino ar_data.delta = delta;
515*e4b17023SJohn Marino return for_each_index (base, idx_analyze_ref, &ar_data);
516*e4b17023SJohn Marino }
517*e4b17023SJohn Marino
518*e4b17023SJohn Marino /* Record a memory reference REF to the list REFS. The reference occurs in
519*e4b17023SJohn Marino LOOP in statement STMT and it is write if WRITE_P. Returns true if the
520*e4b17023SJohn Marino reference was recorded, false otherwise. */
521*e4b17023SJohn Marino
522*e4b17023SJohn Marino static bool
gather_memory_references_ref(struct loop * loop,struct mem_ref_group ** refs,tree ref,bool write_p,gimple stmt)523*e4b17023SJohn Marino gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
524*e4b17023SJohn Marino tree ref, bool write_p, gimple stmt)
525*e4b17023SJohn Marino {
526*e4b17023SJohn Marino tree base, step;
527*e4b17023SJohn Marino HOST_WIDE_INT delta;
528*e4b17023SJohn Marino struct mem_ref_group *agrp;
529*e4b17023SJohn Marino
530*e4b17023SJohn Marino if (get_base_address (ref) == NULL)
531*e4b17023SJohn Marino return false;
532*e4b17023SJohn Marino
533*e4b17023SJohn Marino if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
534*e4b17023SJohn Marino return false;
535*e4b17023SJohn Marino /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
536*e4b17023SJohn Marino if (step == NULL_TREE)
537*e4b17023SJohn Marino return false;
538*e4b17023SJohn Marino
539*e4b17023SJohn Marino /* Stop if the address of BASE could not be taken. */
540*e4b17023SJohn Marino if (may_be_nonaddressable_p (base))
541*e4b17023SJohn Marino return false;
542*e4b17023SJohn Marino
543*e4b17023SJohn Marino /* Limit non-constant step prefetching only to the innermost loops. */
544*e4b17023SJohn Marino if (!cst_and_fits_in_hwi (step) && loop->inner != NULL)
545*e4b17023SJohn Marino return false;
546*e4b17023SJohn Marino
547*e4b17023SJohn Marino /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
548*e4b17023SJohn Marino are integer constants. */
549*e4b17023SJohn Marino agrp = find_or_create_group (refs, base, step);
550*e4b17023SJohn Marino record_ref (agrp, stmt, ref, delta, write_p);
551*e4b17023SJohn Marino
552*e4b17023SJohn Marino return true;
553*e4b17023SJohn Marino }
554*e4b17023SJohn Marino
555*e4b17023SJohn Marino /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
556*e4b17023SJohn Marino true if there are no other memory references inside the loop. */
557*e4b17023SJohn Marino
558*e4b17023SJohn Marino static struct mem_ref_group *
gather_memory_references(struct loop * loop,bool * no_other_refs,unsigned * ref_count)559*e4b17023SJohn Marino gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
560*e4b17023SJohn Marino {
561*e4b17023SJohn Marino basic_block *body = get_loop_body_in_dom_order (loop);
562*e4b17023SJohn Marino basic_block bb;
563*e4b17023SJohn Marino unsigned i;
564*e4b17023SJohn Marino gimple_stmt_iterator bsi;
565*e4b17023SJohn Marino gimple stmt;
566*e4b17023SJohn Marino tree lhs, rhs;
567*e4b17023SJohn Marino struct mem_ref_group *refs = NULL;
568*e4b17023SJohn Marino
569*e4b17023SJohn Marino *no_other_refs = true;
570*e4b17023SJohn Marino *ref_count = 0;
571*e4b17023SJohn Marino
572*e4b17023SJohn Marino /* Scan the loop body in order, so that the former references precede the
573*e4b17023SJohn Marino later ones. */
574*e4b17023SJohn Marino for (i = 0; i < loop->num_nodes; i++)
575*e4b17023SJohn Marino {
576*e4b17023SJohn Marino bb = body[i];
577*e4b17023SJohn Marino if (bb->loop_father != loop)
578*e4b17023SJohn Marino continue;
579*e4b17023SJohn Marino
580*e4b17023SJohn Marino for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
581*e4b17023SJohn Marino {
582*e4b17023SJohn Marino stmt = gsi_stmt (bsi);
583*e4b17023SJohn Marino
584*e4b17023SJohn Marino if (gimple_code (stmt) != GIMPLE_ASSIGN)
585*e4b17023SJohn Marino {
586*e4b17023SJohn Marino if (gimple_vuse (stmt)
587*e4b17023SJohn Marino || (is_gimple_call (stmt)
588*e4b17023SJohn Marino && !(gimple_call_flags (stmt) & ECF_CONST)))
589*e4b17023SJohn Marino *no_other_refs = false;
590*e4b17023SJohn Marino continue;
591*e4b17023SJohn Marino }
592*e4b17023SJohn Marino
593*e4b17023SJohn Marino lhs = gimple_assign_lhs (stmt);
594*e4b17023SJohn Marino rhs = gimple_assign_rhs1 (stmt);
595*e4b17023SJohn Marino
596*e4b17023SJohn Marino if (REFERENCE_CLASS_P (rhs))
597*e4b17023SJohn Marino {
598*e4b17023SJohn Marino *no_other_refs &= gather_memory_references_ref (loop, &refs,
599*e4b17023SJohn Marino rhs, false, stmt);
600*e4b17023SJohn Marino *ref_count += 1;
601*e4b17023SJohn Marino }
602*e4b17023SJohn Marino if (REFERENCE_CLASS_P (lhs))
603*e4b17023SJohn Marino {
604*e4b17023SJohn Marino *no_other_refs &= gather_memory_references_ref (loop, &refs,
605*e4b17023SJohn Marino lhs, true, stmt);
606*e4b17023SJohn Marino *ref_count += 1;
607*e4b17023SJohn Marino }
608*e4b17023SJohn Marino }
609*e4b17023SJohn Marino }
610*e4b17023SJohn Marino free (body);
611*e4b17023SJohn Marino
612*e4b17023SJohn Marino return refs;
613*e4b17023SJohn Marino }
614*e4b17023SJohn Marino
615*e4b17023SJohn Marino /* Prune the prefetch candidate REF using the self-reuse. */
616*e4b17023SJohn Marino
617*e4b17023SJohn Marino static void
prune_ref_by_self_reuse(struct mem_ref * ref)618*e4b17023SJohn Marino prune_ref_by_self_reuse (struct mem_ref *ref)
619*e4b17023SJohn Marino {
620*e4b17023SJohn Marino HOST_WIDE_INT step;
621*e4b17023SJohn Marino bool backward;
622*e4b17023SJohn Marino
623*e4b17023SJohn Marino /* If the step size is non constant, we cannot calculate prefetch_mod. */
624*e4b17023SJohn Marino if (!cst_and_fits_in_hwi (ref->group->step))
625*e4b17023SJohn Marino return;
626*e4b17023SJohn Marino
627*e4b17023SJohn Marino step = int_cst_value (ref->group->step);
628*e4b17023SJohn Marino
629*e4b17023SJohn Marino backward = step < 0;
630*e4b17023SJohn Marino
631*e4b17023SJohn Marino if (step == 0)
632*e4b17023SJohn Marino {
633*e4b17023SJohn Marino /* Prefetch references to invariant address just once. */
634*e4b17023SJohn Marino ref->prefetch_before = 1;
635*e4b17023SJohn Marino return;
636*e4b17023SJohn Marino }
637*e4b17023SJohn Marino
638*e4b17023SJohn Marino if (backward)
639*e4b17023SJohn Marino step = -step;
640*e4b17023SJohn Marino
641*e4b17023SJohn Marino if (step > PREFETCH_BLOCK)
642*e4b17023SJohn Marino return;
643*e4b17023SJohn Marino
644*e4b17023SJohn Marino if ((backward && HAVE_BACKWARD_PREFETCH)
645*e4b17023SJohn Marino || (!backward && HAVE_FORWARD_PREFETCH))
646*e4b17023SJohn Marino {
647*e4b17023SJohn Marino ref->prefetch_before = 1;
648*e4b17023SJohn Marino return;
649*e4b17023SJohn Marino }
650*e4b17023SJohn Marino
651*e4b17023SJohn Marino ref->prefetch_mod = PREFETCH_BLOCK / step;
652*e4b17023SJohn Marino }
653*e4b17023SJohn Marino
654*e4b17023SJohn Marino /* Divides X by BY, rounding down. */
655*e4b17023SJohn Marino
656*e4b17023SJohn Marino static HOST_WIDE_INT
ddown(HOST_WIDE_INT x,unsigned HOST_WIDE_INT by)657*e4b17023SJohn Marino ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
658*e4b17023SJohn Marino {
659*e4b17023SJohn Marino gcc_assert (by > 0);
660*e4b17023SJohn Marino
661*e4b17023SJohn Marino if (x >= 0)
662*e4b17023SJohn Marino return x / by;
663*e4b17023SJohn Marino else
664*e4b17023SJohn Marino return (x + by - 1) / by;
665*e4b17023SJohn Marino }
666*e4b17023SJohn Marino
667*e4b17023SJohn Marino /* Given a CACHE_LINE_SIZE and two inductive memory references
668*e4b17023SJohn Marino with a common STEP greater than CACHE_LINE_SIZE and an address
669*e4b17023SJohn Marino difference DELTA, compute the probability that they will fall
670*e4b17023SJohn Marino in different cache lines. Return true if the computed miss rate
671*e4b17023SJohn Marino is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
672*e4b17023SJohn Marino number of distinct iterations after which the pattern repeats itself.
673*e4b17023SJohn Marino ALIGN_UNIT is the unit of alignment in bytes. */
674*e4b17023SJohn Marino
675*e4b17023SJohn Marino static bool
is_miss_rate_acceptable(unsigned HOST_WIDE_INT cache_line_size,HOST_WIDE_INT step,HOST_WIDE_INT delta,unsigned HOST_WIDE_INT distinct_iters,int align_unit)676*e4b17023SJohn Marino is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
677*e4b17023SJohn Marino HOST_WIDE_INT step, HOST_WIDE_INT delta,
678*e4b17023SJohn Marino unsigned HOST_WIDE_INT distinct_iters,
679*e4b17023SJohn Marino int align_unit)
680*e4b17023SJohn Marino {
681*e4b17023SJohn Marino unsigned align, iter;
682*e4b17023SJohn Marino int total_positions, miss_positions, max_allowed_miss_positions;
683*e4b17023SJohn Marino int address1, address2, cache_line1, cache_line2;
684*e4b17023SJohn Marino
685*e4b17023SJohn Marino /* It always misses if delta is greater than or equal to the cache
686*e4b17023SJohn Marino line size. */
687*e4b17023SJohn Marino if (delta >= (HOST_WIDE_INT) cache_line_size)
688*e4b17023SJohn Marino return false;
689*e4b17023SJohn Marino
690*e4b17023SJohn Marino miss_positions = 0;
691*e4b17023SJohn Marino total_positions = (cache_line_size / align_unit) * distinct_iters;
692*e4b17023SJohn Marino max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
693*e4b17023SJohn Marino
694*e4b17023SJohn Marino /* Iterate through all possible alignments of the first
695*e4b17023SJohn Marino memory reference within its cache line. */
696*e4b17023SJohn Marino for (align = 0; align < cache_line_size; align += align_unit)
697*e4b17023SJohn Marino
698*e4b17023SJohn Marino /* Iterate through all distinct iterations. */
699*e4b17023SJohn Marino for (iter = 0; iter < distinct_iters; iter++)
700*e4b17023SJohn Marino {
701*e4b17023SJohn Marino address1 = align + step * iter;
702*e4b17023SJohn Marino address2 = address1 + delta;
703*e4b17023SJohn Marino cache_line1 = address1 / cache_line_size;
704*e4b17023SJohn Marino cache_line2 = address2 / cache_line_size;
705*e4b17023SJohn Marino if (cache_line1 != cache_line2)
706*e4b17023SJohn Marino {
707*e4b17023SJohn Marino miss_positions += 1;
708*e4b17023SJohn Marino if (miss_positions > max_allowed_miss_positions)
709*e4b17023SJohn Marino return false;
710*e4b17023SJohn Marino }
711*e4b17023SJohn Marino }
712*e4b17023SJohn Marino return true;
713*e4b17023SJohn Marino }
714*e4b17023SJohn Marino
715*e4b17023SJohn Marino /* Prune the prefetch candidate REF using the reuse with BY.
716*e4b17023SJohn Marino If BY_IS_BEFORE is true, BY is before REF in the loop. */
717*e4b17023SJohn Marino
718*e4b17023SJohn Marino static void
prune_ref_by_group_reuse(struct mem_ref * ref,struct mem_ref * by,bool by_is_before)719*e4b17023SJohn Marino prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
720*e4b17023SJohn Marino bool by_is_before)
721*e4b17023SJohn Marino {
722*e4b17023SJohn Marino HOST_WIDE_INT step;
723*e4b17023SJohn Marino bool backward;
724*e4b17023SJohn Marino HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
725*e4b17023SJohn Marino HOST_WIDE_INT delta = delta_b - delta_r;
726*e4b17023SJohn Marino HOST_WIDE_INT hit_from;
727*e4b17023SJohn Marino unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
728*e4b17023SJohn Marino HOST_WIDE_INT reduced_step;
729*e4b17023SJohn Marino unsigned HOST_WIDE_INT reduced_prefetch_block;
730*e4b17023SJohn Marino tree ref_type;
731*e4b17023SJohn Marino int align_unit;
732*e4b17023SJohn Marino
733*e4b17023SJohn Marino /* If the step is non constant we cannot calculate prefetch_before. */
734*e4b17023SJohn Marino if (!cst_and_fits_in_hwi (ref->group->step)) {
735*e4b17023SJohn Marino return;
736*e4b17023SJohn Marino }
737*e4b17023SJohn Marino
738*e4b17023SJohn Marino step = int_cst_value (ref->group->step);
739*e4b17023SJohn Marino
740*e4b17023SJohn Marino backward = step < 0;
741*e4b17023SJohn Marino
742*e4b17023SJohn Marino
743*e4b17023SJohn Marino if (delta == 0)
744*e4b17023SJohn Marino {
745*e4b17023SJohn Marino /* If the references has the same address, only prefetch the
746*e4b17023SJohn Marino former. */
747*e4b17023SJohn Marino if (by_is_before)
748*e4b17023SJohn Marino ref->prefetch_before = 0;
749*e4b17023SJohn Marino
750*e4b17023SJohn Marino return;
751*e4b17023SJohn Marino }
752*e4b17023SJohn Marino
753*e4b17023SJohn Marino if (!step)
754*e4b17023SJohn Marino {
755*e4b17023SJohn Marino /* If the reference addresses are invariant and fall into the
756*e4b17023SJohn Marino same cache line, prefetch just the first one. */
757*e4b17023SJohn Marino if (!by_is_before)
758*e4b17023SJohn Marino return;
759*e4b17023SJohn Marino
760*e4b17023SJohn Marino if (ddown (ref->delta, PREFETCH_BLOCK)
761*e4b17023SJohn Marino != ddown (by->delta, PREFETCH_BLOCK))
762*e4b17023SJohn Marino return;
763*e4b17023SJohn Marino
764*e4b17023SJohn Marino ref->prefetch_before = 0;
765*e4b17023SJohn Marino return;
766*e4b17023SJohn Marino }
767*e4b17023SJohn Marino
768*e4b17023SJohn Marino /* Only prune the reference that is behind in the array. */
769*e4b17023SJohn Marino if (backward)
770*e4b17023SJohn Marino {
771*e4b17023SJohn Marino if (delta > 0)
772*e4b17023SJohn Marino return;
773*e4b17023SJohn Marino
774*e4b17023SJohn Marino /* Transform the data so that we may assume that the accesses
775*e4b17023SJohn Marino are forward. */
776*e4b17023SJohn Marino delta = - delta;
777*e4b17023SJohn Marino step = -step;
778*e4b17023SJohn Marino delta_r = PREFETCH_BLOCK - 1 - delta_r;
779*e4b17023SJohn Marino delta_b = PREFETCH_BLOCK - 1 - delta_b;
780*e4b17023SJohn Marino }
781*e4b17023SJohn Marino else
782*e4b17023SJohn Marino {
783*e4b17023SJohn Marino if (delta < 0)
784*e4b17023SJohn Marino return;
785*e4b17023SJohn Marino }
786*e4b17023SJohn Marino
787*e4b17023SJohn Marino /* Check whether the two references are likely to hit the same cache
788*e4b17023SJohn Marino line, and how distant the iterations in that it occurs are from
789*e4b17023SJohn Marino each other. */
790*e4b17023SJohn Marino
791*e4b17023SJohn Marino if (step <= PREFETCH_BLOCK)
792*e4b17023SJohn Marino {
793*e4b17023SJohn Marino /* The accesses are sure to meet. Let us check when. */
794*e4b17023SJohn Marino hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
795*e4b17023SJohn Marino prefetch_before = (hit_from - delta_r + step - 1) / step;
796*e4b17023SJohn Marino
797*e4b17023SJohn Marino /* Do not reduce prefetch_before if we meet beyond cache size. */
798*e4b17023SJohn Marino if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
799*e4b17023SJohn Marino prefetch_before = PREFETCH_ALL;
800*e4b17023SJohn Marino if (prefetch_before < ref->prefetch_before)
801*e4b17023SJohn Marino ref->prefetch_before = prefetch_before;
802*e4b17023SJohn Marino
803*e4b17023SJohn Marino return;
804*e4b17023SJohn Marino }
805*e4b17023SJohn Marino
806*e4b17023SJohn Marino /* A more complicated case with step > prefetch_block. First reduce
807*e4b17023SJohn Marino the ratio between the step and the cache line size to its simplest
808*e4b17023SJohn Marino terms. The resulting denominator will then represent the number of
809*e4b17023SJohn Marino distinct iterations after which each address will go back to its
810*e4b17023SJohn Marino initial location within the cache line. This computation assumes
811*e4b17023SJohn Marino that PREFETCH_BLOCK is a power of two. */
812*e4b17023SJohn Marino prefetch_block = PREFETCH_BLOCK;
813*e4b17023SJohn Marino reduced_prefetch_block = prefetch_block;
814*e4b17023SJohn Marino reduced_step = step;
815*e4b17023SJohn Marino while ((reduced_step & 1) == 0
816*e4b17023SJohn Marino && reduced_prefetch_block > 1)
817*e4b17023SJohn Marino {
818*e4b17023SJohn Marino reduced_step >>= 1;
819*e4b17023SJohn Marino reduced_prefetch_block >>= 1;
820*e4b17023SJohn Marino }
821*e4b17023SJohn Marino
822*e4b17023SJohn Marino prefetch_before = delta / step;
823*e4b17023SJohn Marino delta %= step;
824*e4b17023SJohn Marino ref_type = TREE_TYPE (ref->mem);
825*e4b17023SJohn Marino align_unit = TYPE_ALIGN (ref_type) / 8;
826*e4b17023SJohn Marino if (is_miss_rate_acceptable (prefetch_block, step, delta,
827*e4b17023SJohn Marino reduced_prefetch_block, align_unit))
828*e4b17023SJohn Marino {
829*e4b17023SJohn Marino /* Do not reduce prefetch_before if we meet beyond cache size. */
830*e4b17023SJohn Marino if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
831*e4b17023SJohn Marino prefetch_before = PREFETCH_ALL;
832*e4b17023SJohn Marino if (prefetch_before < ref->prefetch_before)
833*e4b17023SJohn Marino ref->prefetch_before = prefetch_before;
834*e4b17023SJohn Marino
835*e4b17023SJohn Marino return;
836*e4b17023SJohn Marino }
837*e4b17023SJohn Marino
838*e4b17023SJohn Marino /* Try also the following iteration. */
839*e4b17023SJohn Marino prefetch_before++;
840*e4b17023SJohn Marino delta = step - delta;
841*e4b17023SJohn Marino if (is_miss_rate_acceptable (prefetch_block, step, delta,
842*e4b17023SJohn Marino reduced_prefetch_block, align_unit))
843*e4b17023SJohn Marino {
844*e4b17023SJohn Marino if (prefetch_before < ref->prefetch_before)
845*e4b17023SJohn Marino ref->prefetch_before = prefetch_before;
846*e4b17023SJohn Marino
847*e4b17023SJohn Marino return;
848*e4b17023SJohn Marino }
849*e4b17023SJohn Marino
850*e4b17023SJohn Marino /* The ref probably does not reuse by. */
851*e4b17023SJohn Marino return;
852*e4b17023SJohn Marino }
853*e4b17023SJohn Marino
854*e4b17023SJohn Marino /* Prune the prefetch candidate REF using the reuses with other references
855*e4b17023SJohn Marino in REFS. */
856*e4b17023SJohn Marino
857*e4b17023SJohn Marino static void
prune_ref_by_reuse(struct mem_ref * ref,struct mem_ref * refs)858*e4b17023SJohn Marino prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
859*e4b17023SJohn Marino {
860*e4b17023SJohn Marino struct mem_ref *prune_by;
861*e4b17023SJohn Marino bool before = true;
862*e4b17023SJohn Marino
863*e4b17023SJohn Marino prune_ref_by_self_reuse (ref);
864*e4b17023SJohn Marino
865*e4b17023SJohn Marino for (prune_by = refs; prune_by; prune_by = prune_by->next)
866*e4b17023SJohn Marino {
867*e4b17023SJohn Marino if (prune_by == ref)
868*e4b17023SJohn Marino {
869*e4b17023SJohn Marino before = false;
870*e4b17023SJohn Marino continue;
871*e4b17023SJohn Marino }
872*e4b17023SJohn Marino
873*e4b17023SJohn Marino if (!WRITE_CAN_USE_READ_PREFETCH
874*e4b17023SJohn Marino && ref->write_p
875*e4b17023SJohn Marino && !prune_by->write_p)
876*e4b17023SJohn Marino continue;
877*e4b17023SJohn Marino if (!READ_CAN_USE_WRITE_PREFETCH
878*e4b17023SJohn Marino && !ref->write_p
879*e4b17023SJohn Marino && prune_by->write_p)
880*e4b17023SJohn Marino continue;
881*e4b17023SJohn Marino
882*e4b17023SJohn Marino prune_ref_by_group_reuse (ref, prune_by, before);
883*e4b17023SJohn Marino }
884*e4b17023SJohn Marino }
885*e4b17023SJohn Marino
886*e4b17023SJohn Marino /* Prune the prefetch candidates in GROUP using the reuse analysis. */
887*e4b17023SJohn Marino
888*e4b17023SJohn Marino static void
prune_group_by_reuse(struct mem_ref_group * group)889*e4b17023SJohn Marino prune_group_by_reuse (struct mem_ref_group *group)
890*e4b17023SJohn Marino {
891*e4b17023SJohn Marino struct mem_ref *ref_pruned;
892*e4b17023SJohn Marino
893*e4b17023SJohn Marino for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
894*e4b17023SJohn Marino {
895*e4b17023SJohn Marino prune_ref_by_reuse (ref_pruned, group->refs);
896*e4b17023SJohn Marino
897*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
898*e4b17023SJohn Marino {
899*e4b17023SJohn Marino fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
900*e4b17023SJohn Marino
901*e4b17023SJohn Marino if (ref_pruned->prefetch_before == PREFETCH_ALL
902*e4b17023SJohn Marino && ref_pruned->prefetch_mod == 1)
903*e4b17023SJohn Marino fprintf (dump_file, " no restrictions");
904*e4b17023SJohn Marino else if (ref_pruned->prefetch_before == 0)
905*e4b17023SJohn Marino fprintf (dump_file, " do not prefetch");
906*e4b17023SJohn Marino else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
907*e4b17023SJohn Marino fprintf (dump_file, " prefetch once");
908*e4b17023SJohn Marino else
909*e4b17023SJohn Marino {
910*e4b17023SJohn Marino if (ref_pruned->prefetch_before != PREFETCH_ALL)
911*e4b17023SJohn Marino {
912*e4b17023SJohn Marino fprintf (dump_file, " prefetch before ");
913*e4b17023SJohn Marino fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
914*e4b17023SJohn Marino ref_pruned->prefetch_before);
915*e4b17023SJohn Marino }
916*e4b17023SJohn Marino if (ref_pruned->prefetch_mod != 1)
917*e4b17023SJohn Marino {
918*e4b17023SJohn Marino fprintf (dump_file, " prefetch mod ");
919*e4b17023SJohn Marino fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
920*e4b17023SJohn Marino ref_pruned->prefetch_mod);
921*e4b17023SJohn Marino }
922*e4b17023SJohn Marino }
923*e4b17023SJohn Marino fprintf (dump_file, "\n");
924*e4b17023SJohn Marino }
925*e4b17023SJohn Marino }
926*e4b17023SJohn Marino }
927*e4b17023SJohn Marino
928*e4b17023SJohn Marino /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
929*e4b17023SJohn Marino
930*e4b17023SJohn Marino static void
prune_by_reuse(struct mem_ref_group * groups)931*e4b17023SJohn Marino prune_by_reuse (struct mem_ref_group *groups)
932*e4b17023SJohn Marino {
933*e4b17023SJohn Marino for (; groups; groups = groups->next)
934*e4b17023SJohn Marino prune_group_by_reuse (groups);
935*e4b17023SJohn Marino }
936*e4b17023SJohn Marino
937*e4b17023SJohn Marino /* Returns true if we should issue prefetch for REF. */
938*e4b17023SJohn Marino
939*e4b17023SJohn Marino static bool
should_issue_prefetch_p(struct mem_ref * ref)940*e4b17023SJohn Marino should_issue_prefetch_p (struct mem_ref *ref)
941*e4b17023SJohn Marino {
942*e4b17023SJohn Marino /* For now do not issue prefetches for only first few of the
943*e4b17023SJohn Marino iterations. */
944*e4b17023SJohn Marino if (ref->prefetch_before != PREFETCH_ALL)
945*e4b17023SJohn Marino {
946*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
947*e4b17023SJohn Marino fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
948*e4b17023SJohn Marino (void *) ref);
949*e4b17023SJohn Marino return false;
950*e4b17023SJohn Marino }
951*e4b17023SJohn Marino
952*e4b17023SJohn Marino /* Do not prefetch nontemporal stores. */
953*e4b17023SJohn Marino if (ref->storent_p)
954*e4b17023SJohn Marino {
955*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
956*e4b17023SJohn Marino fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
957*e4b17023SJohn Marino return false;
958*e4b17023SJohn Marino }
959*e4b17023SJohn Marino
960*e4b17023SJohn Marino return true;
961*e4b17023SJohn Marino }
962*e4b17023SJohn Marino
963*e4b17023SJohn Marino /* Decide which of the prefetch candidates in GROUPS to prefetch.
964*e4b17023SJohn Marino AHEAD is the number of iterations to prefetch ahead (which corresponds
965*e4b17023SJohn Marino to the number of simultaneous instances of one prefetch running at a
966*e4b17023SJohn Marino time). UNROLL_FACTOR is the factor by that the loop is going to be
967*e4b17023SJohn Marino unrolled. Returns true if there is anything to prefetch. */
968*e4b17023SJohn Marino
969*e4b17023SJohn Marino static bool
schedule_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)970*e4b17023SJohn Marino schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
971*e4b17023SJohn Marino unsigned ahead)
972*e4b17023SJohn Marino {
973*e4b17023SJohn Marino unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
974*e4b17023SJohn Marino unsigned slots_per_prefetch;
975*e4b17023SJohn Marino struct mem_ref *ref;
976*e4b17023SJohn Marino bool any = false;
977*e4b17023SJohn Marino
978*e4b17023SJohn Marino /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
979*e4b17023SJohn Marino remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
980*e4b17023SJohn Marino
981*e4b17023SJohn Marino /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
982*e4b17023SJohn Marino AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
983*e4b17023SJohn Marino it will need a prefetch slot. */
984*e4b17023SJohn Marino slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
985*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
986*e4b17023SJohn Marino fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
987*e4b17023SJohn Marino slots_per_prefetch);
988*e4b17023SJohn Marino
989*e4b17023SJohn Marino /* For now we just take memory references one by one and issue
990*e4b17023SJohn Marino prefetches for as many as possible. The groups are sorted
991*e4b17023SJohn Marino starting with the largest step, since the references with
992*e4b17023SJohn Marino large step are more likely to cause many cache misses. */
993*e4b17023SJohn Marino
994*e4b17023SJohn Marino for (; groups; groups = groups->next)
995*e4b17023SJohn Marino for (ref = groups->refs; ref; ref = ref->next)
996*e4b17023SJohn Marino {
997*e4b17023SJohn Marino if (!should_issue_prefetch_p (ref))
998*e4b17023SJohn Marino continue;
999*e4b17023SJohn Marino
1000*e4b17023SJohn Marino /* The loop is far from being sufficiently unrolled for this
1001*e4b17023SJohn Marino prefetch. Do not generate prefetch to avoid many redudant
1002*e4b17023SJohn Marino prefetches. */
1003*e4b17023SJohn Marino if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1004*e4b17023SJohn Marino continue;
1005*e4b17023SJohn Marino
1006*e4b17023SJohn Marino /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1007*e4b17023SJohn Marino and we unroll the loop UNROLL_FACTOR times, we need to insert
1008*e4b17023SJohn Marino ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1009*e4b17023SJohn Marino iteration. */
1010*e4b17023SJohn Marino n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1011*e4b17023SJohn Marino / ref->prefetch_mod);
1012*e4b17023SJohn Marino prefetch_slots = n_prefetches * slots_per_prefetch;
1013*e4b17023SJohn Marino
1014*e4b17023SJohn Marino /* If more than half of the prefetches would be lost anyway, do not
1015*e4b17023SJohn Marino issue the prefetch. */
1016*e4b17023SJohn Marino if (2 * remaining_prefetch_slots < prefetch_slots)
1017*e4b17023SJohn Marino continue;
1018*e4b17023SJohn Marino
1019*e4b17023SJohn Marino ref->issue_prefetch_p = true;
1020*e4b17023SJohn Marino
1021*e4b17023SJohn Marino if (remaining_prefetch_slots <= prefetch_slots)
1022*e4b17023SJohn Marino return true;
1023*e4b17023SJohn Marino remaining_prefetch_slots -= prefetch_slots;
1024*e4b17023SJohn Marino any = true;
1025*e4b17023SJohn Marino }
1026*e4b17023SJohn Marino
1027*e4b17023SJohn Marino return any;
1028*e4b17023SJohn Marino }
1029*e4b17023SJohn Marino
1030*e4b17023SJohn Marino /* Return TRUE if no prefetch is going to be generated in the given
1031*e4b17023SJohn Marino GROUPS. */
1032*e4b17023SJohn Marino
1033*e4b17023SJohn Marino static bool
nothing_to_prefetch_p(struct mem_ref_group * groups)1034*e4b17023SJohn Marino nothing_to_prefetch_p (struct mem_ref_group *groups)
1035*e4b17023SJohn Marino {
1036*e4b17023SJohn Marino struct mem_ref *ref;
1037*e4b17023SJohn Marino
1038*e4b17023SJohn Marino for (; groups; groups = groups->next)
1039*e4b17023SJohn Marino for (ref = groups->refs; ref; ref = ref->next)
1040*e4b17023SJohn Marino if (should_issue_prefetch_p (ref))
1041*e4b17023SJohn Marino return false;
1042*e4b17023SJohn Marino
1043*e4b17023SJohn Marino return true;
1044*e4b17023SJohn Marino }
1045*e4b17023SJohn Marino
1046*e4b17023SJohn Marino /* Estimate the number of prefetches in the given GROUPS.
1047*e4b17023SJohn Marino UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1048*e4b17023SJohn Marino
1049*e4b17023SJohn Marino static int
estimate_prefetch_count(struct mem_ref_group * groups,unsigned unroll_factor)1050*e4b17023SJohn Marino estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1051*e4b17023SJohn Marino {
1052*e4b17023SJohn Marino struct mem_ref *ref;
1053*e4b17023SJohn Marino unsigned n_prefetches;
1054*e4b17023SJohn Marino int prefetch_count = 0;
1055*e4b17023SJohn Marino
1056*e4b17023SJohn Marino for (; groups; groups = groups->next)
1057*e4b17023SJohn Marino for (ref = groups->refs; ref; ref = ref->next)
1058*e4b17023SJohn Marino if (should_issue_prefetch_p (ref))
1059*e4b17023SJohn Marino {
1060*e4b17023SJohn Marino n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1061*e4b17023SJohn Marino / ref->prefetch_mod);
1062*e4b17023SJohn Marino prefetch_count += n_prefetches;
1063*e4b17023SJohn Marino }
1064*e4b17023SJohn Marino
1065*e4b17023SJohn Marino return prefetch_count;
1066*e4b17023SJohn Marino }
1067*e4b17023SJohn Marino
1068*e4b17023SJohn Marino /* Issue prefetches for the reference REF into loop as decided before.
1069*e4b17023SJohn Marino HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1070*e4b17023SJohn Marino is the factor by which LOOP was unrolled. */
1071*e4b17023SJohn Marino
1072*e4b17023SJohn Marino static void
issue_prefetch_ref(struct mem_ref * ref,unsigned unroll_factor,unsigned ahead)1073*e4b17023SJohn Marino issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1074*e4b17023SJohn Marino {
1075*e4b17023SJohn Marino HOST_WIDE_INT delta;
1076*e4b17023SJohn Marino tree addr, addr_base, write_p, local, forward;
1077*e4b17023SJohn Marino gimple prefetch;
1078*e4b17023SJohn Marino gimple_stmt_iterator bsi;
1079*e4b17023SJohn Marino unsigned n_prefetches, ap;
1080*e4b17023SJohn Marino bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1081*e4b17023SJohn Marino
1082*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1083*e4b17023SJohn Marino fprintf (dump_file, "Issued%s prefetch for %p.\n",
1084*e4b17023SJohn Marino nontemporal ? " nontemporal" : "",
1085*e4b17023SJohn Marino (void *) ref);
1086*e4b17023SJohn Marino
1087*e4b17023SJohn Marino bsi = gsi_for_stmt (ref->stmt);
1088*e4b17023SJohn Marino
1089*e4b17023SJohn Marino n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1090*e4b17023SJohn Marino / ref->prefetch_mod);
1091*e4b17023SJohn Marino addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1092*e4b17023SJohn Marino addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1093*e4b17023SJohn Marino true, NULL, true, GSI_SAME_STMT);
1094*e4b17023SJohn Marino write_p = ref->write_p ? integer_one_node : integer_zero_node;
1095*e4b17023SJohn Marino local = nontemporal ? integer_zero_node : integer_three_node;
1096*e4b17023SJohn Marino
1097*e4b17023SJohn Marino for (ap = 0; ap < n_prefetches; ap++)
1098*e4b17023SJohn Marino {
1099*e4b17023SJohn Marino if (cst_and_fits_in_hwi (ref->group->step))
1100*e4b17023SJohn Marino {
1101*e4b17023SJohn Marino /* Determine the address to prefetch. */
1102*e4b17023SJohn Marino delta = (ahead + ap * ref->prefetch_mod) *
1103*e4b17023SJohn Marino int_cst_value (ref->group->step);
1104*e4b17023SJohn Marino addr = fold_build_pointer_plus_hwi (addr_base, delta);
1105*e4b17023SJohn Marino addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1106*e4b17023SJohn Marino true, GSI_SAME_STMT);
1107*e4b17023SJohn Marino }
1108*e4b17023SJohn Marino else
1109*e4b17023SJohn Marino {
1110*e4b17023SJohn Marino /* The step size is non-constant but loop-invariant. We use the
1111*e4b17023SJohn Marino heuristic to simply prefetch ahead iterations ahead. */
1112*e4b17023SJohn Marino forward = fold_build2 (MULT_EXPR, sizetype,
1113*e4b17023SJohn Marino fold_convert (sizetype, ref->group->step),
1114*e4b17023SJohn Marino fold_convert (sizetype, size_int (ahead)));
1115*e4b17023SJohn Marino addr = fold_build_pointer_plus (addr_base, forward);
1116*e4b17023SJohn Marino addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1117*e4b17023SJohn Marino NULL, true, GSI_SAME_STMT);
1118*e4b17023SJohn Marino }
1119*e4b17023SJohn Marino /* Create the prefetch instruction. */
1120*e4b17023SJohn Marino prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1121*e4b17023SJohn Marino 3, addr, write_p, local);
1122*e4b17023SJohn Marino gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1123*e4b17023SJohn Marino }
1124*e4b17023SJohn Marino }
1125*e4b17023SJohn Marino
1126*e4b17023SJohn Marino /* Issue prefetches for the references in GROUPS into loop as decided before.
1127*e4b17023SJohn Marino HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
1128*e4b17023SJohn Marino factor by that LOOP was unrolled. */
1129*e4b17023SJohn Marino
1130*e4b17023SJohn Marino static void
issue_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)1131*e4b17023SJohn Marino issue_prefetches (struct mem_ref_group *groups,
1132*e4b17023SJohn Marino unsigned unroll_factor, unsigned ahead)
1133*e4b17023SJohn Marino {
1134*e4b17023SJohn Marino struct mem_ref *ref;
1135*e4b17023SJohn Marino
1136*e4b17023SJohn Marino for (; groups; groups = groups->next)
1137*e4b17023SJohn Marino for (ref = groups->refs; ref; ref = ref->next)
1138*e4b17023SJohn Marino if (ref->issue_prefetch_p)
1139*e4b17023SJohn Marino issue_prefetch_ref (ref, unroll_factor, ahead);
1140*e4b17023SJohn Marino }
1141*e4b17023SJohn Marino
1142*e4b17023SJohn Marino /* Returns true if REF is a memory write for that a nontemporal store insn
1143*e4b17023SJohn Marino can be used. */
1144*e4b17023SJohn Marino
1145*e4b17023SJohn Marino static bool
nontemporal_store_p(struct mem_ref * ref)1146*e4b17023SJohn Marino nontemporal_store_p (struct mem_ref *ref)
1147*e4b17023SJohn Marino {
1148*e4b17023SJohn Marino enum machine_mode mode;
1149*e4b17023SJohn Marino enum insn_code code;
1150*e4b17023SJohn Marino
1151*e4b17023SJohn Marino /* REF must be a write that is not reused. We require it to be independent
1152*e4b17023SJohn Marino on all other memory references in the loop, as the nontemporal stores may
1153*e4b17023SJohn Marino be reordered with respect to other memory references. */
1154*e4b17023SJohn Marino if (!ref->write_p
1155*e4b17023SJohn Marino || !ref->independent_p
1156*e4b17023SJohn Marino || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1157*e4b17023SJohn Marino return false;
1158*e4b17023SJohn Marino
1159*e4b17023SJohn Marino /* Check that we have the storent instruction for the mode. */
1160*e4b17023SJohn Marino mode = TYPE_MODE (TREE_TYPE (ref->mem));
1161*e4b17023SJohn Marino if (mode == BLKmode)
1162*e4b17023SJohn Marino return false;
1163*e4b17023SJohn Marino
1164*e4b17023SJohn Marino code = optab_handler (storent_optab, mode);
1165*e4b17023SJohn Marino return code != CODE_FOR_nothing;
1166*e4b17023SJohn Marino }
1167*e4b17023SJohn Marino
1168*e4b17023SJohn Marino /* If REF is a nontemporal store, we mark the corresponding modify statement
1169*e4b17023SJohn Marino and return true. Otherwise, we return false. */
1170*e4b17023SJohn Marino
1171*e4b17023SJohn Marino static bool
mark_nontemporal_store(struct mem_ref * ref)1172*e4b17023SJohn Marino mark_nontemporal_store (struct mem_ref *ref)
1173*e4b17023SJohn Marino {
1174*e4b17023SJohn Marino if (!nontemporal_store_p (ref))
1175*e4b17023SJohn Marino return false;
1176*e4b17023SJohn Marino
1177*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1178*e4b17023SJohn Marino fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1179*e4b17023SJohn Marino (void *) ref);
1180*e4b17023SJohn Marino
1181*e4b17023SJohn Marino gimple_assign_set_nontemporal_move (ref->stmt, true);
1182*e4b17023SJohn Marino ref->storent_p = true;
1183*e4b17023SJohn Marino
1184*e4b17023SJohn Marino return true;
1185*e4b17023SJohn Marino }
1186*e4b17023SJohn Marino
1187*e4b17023SJohn Marino /* Issue a memory fence instruction after LOOP. */
1188*e4b17023SJohn Marino
1189*e4b17023SJohn Marino static void
emit_mfence_after_loop(struct loop * loop)1190*e4b17023SJohn Marino emit_mfence_after_loop (struct loop *loop)
1191*e4b17023SJohn Marino {
1192*e4b17023SJohn Marino VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1193*e4b17023SJohn Marino edge exit;
1194*e4b17023SJohn Marino gimple call;
1195*e4b17023SJohn Marino gimple_stmt_iterator bsi;
1196*e4b17023SJohn Marino unsigned i;
1197*e4b17023SJohn Marino
1198*e4b17023SJohn Marino FOR_EACH_VEC_ELT (edge, exits, i, exit)
1199*e4b17023SJohn Marino {
1200*e4b17023SJohn Marino call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1201*e4b17023SJohn Marino
1202*e4b17023SJohn Marino if (!single_pred_p (exit->dest)
1203*e4b17023SJohn Marino /* If possible, we prefer not to insert the fence on other paths
1204*e4b17023SJohn Marino in cfg. */
1205*e4b17023SJohn Marino && !(exit->flags & EDGE_ABNORMAL))
1206*e4b17023SJohn Marino split_loop_exit_edge (exit);
1207*e4b17023SJohn Marino bsi = gsi_after_labels (exit->dest);
1208*e4b17023SJohn Marino
1209*e4b17023SJohn Marino gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1210*e4b17023SJohn Marino mark_virtual_ops_for_renaming (call);
1211*e4b17023SJohn Marino }
1212*e4b17023SJohn Marino
1213*e4b17023SJohn Marino VEC_free (edge, heap, exits);
1214*e4b17023SJohn Marino update_ssa (TODO_update_ssa_only_virtuals);
1215*e4b17023SJohn Marino }
1216*e4b17023SJohn Marino
1217*e4b17023SJohn Marino /* Returns true if we can use storent in loop, false otherwise. */
1218*e4b17023SJohn Marino
1219*e4b17023SJohn Marino static bool
may_use_storent_in_loop_p(struct loop * loop)1220*e4b17023SJohn Marino may_use_storent_in_loop_p (struct loop *loop)
1221*e4b17023SJohn Marino {
1222*e4b17023SJohn Marino bool ret = true;
1223*e4b17023SJohn Marino
1224*e4b17023SJohn Marino if (loop->inner != NULL)
1225*e4b17023SJohn Marino return false;
1226*e4b17023SJohn Marino
1227*e4b17023SJohn Marino /* If we must issue a mfence insn after using storent, check that there
1228*e4b17023SJohn Marino is a suitable place for it at each of the loop exits. */
1229*e4b17023SJohn Marino if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1230*e4b17023SJohn Marino {
1231*e4b17023SJohn Marino VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1232*e4b17023SJohn Marino unsigned i;
1233*e4b17023SJohn Marino edge exit;
1234*e4b17023SJohn Marino
1235*e4b17023SJohn Marino FOR_EACH_VEC_ELT (edge, exits, i, exit)
1236*e4b17023SJohn Marino if ((exit->flags & EDGE_ABNORMAL)
1237*e4b17023SJohn Marino && exit->dest == EXIT_BLOCK_PTR)
1238*e4b17023SJohn Marino ret = false;
1239*e4b17023SJohn Marino
1240*e4b17023SJohn Marino VEC_free (edge, heap, exits);
1241*e4b17023SJohn Marino }
1242*e4b17023SJohn Marino
1243*e4b17023SJohn Marino return ret;
1244*e4b17023SJohn Marino }
1245*e4b17023SJohn Marino
1246*e4b17023SJohn Marino /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1247*e4b17023SJohn Marino references in the loop. */
1248*e4b17023SJohn Marino
1249*e4b17023SJohn Marino static void
mark_nontemporal_stores(struct loop * loop,struct mem_ref_group * groups)1250*e4b17023SJohn Marino mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1251*e4b17023SJohn Marino {
1252*e4b17023SJohn Marino struct mem_ref *ref;
1253*e4b17023SJohn Marino bool any = false;
1254*e4b17023SJohn Marino
1255*e4b17023SJohn Marino if (!may_use_storent_in_loop_p (loop))
1256*e4b17023SJohn Marino return;
1257*e4b17023SJohn Marino
1258*e4b17023SJohn Marino for (; groups; groups = groups->next)
1259*e4b17023SJohn Marino for (ref = groups->refs; ref; ref = ref->next)
1260*e4b17023SJohn Marino any |= mark_nontemporal_store (ref);
1261*e4b17023SJohn Marino
1262*e4b17023SJohn Marino if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1263*e4b17023SJohn Marino emit_mfence_after_loop (loop);
1264*e4b17023SJohn Marino }
1265*e4b17023SJohn Marino
1266*e4b17023SJohn Marino /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1267*e4b17023SJohn Marino this is the case, fill in DESC by the description of number of
1268*e4b17023SJohn Marino iterations. */
1269*e4b17023SJohn Marino
1270*e4b17023SJohn Marino static bool
should_unroll_loop_p(struct loop * loop,struct tree_niter_desc * desc,unsigned factor)1271*e4b17023SJohn Marino should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1272*e4b17023SJohn Marino unsigned factor)
1273*e4b17023SJohn Marino {
1274*e4b17023SJohn Marino if (!can_unroll_loop_p (loop, factor, desc))
1275*e4b17023SJohn Marino return false;
1276*e4b17023SJohn Marino
1277*e4b17023SJohn Marino /* We only consider loops without control flow for unrolling. This is not
1278*e4b17023SJohn Marino a hard restriction -- tree_unroll_loop works with arbitrary loops
1279*e4b17023SJohn Marino as well; but the unrolling/prefetching is usually more profitable for
1280*e4b17023SJohn Marino loops consisting of a single basic block, and we want to limit the
1281*e4b17023SJohn Marino code growth. */
1282*e4b17023SJohn Marino if (loop->num_nodes > 2)
1283*e4b17023SJohn Marino return false;
1284*e4b17023SJohn Marino
1285*e4b17023SJohn Marino return true;
1286*e4b17023SJohn Marino }
1287*e4b17023SJohn Marino
1288*e4b17023SJohn Marino /* Determine the coefficient by that unroll LOOP, from the information
1289*e4b17023SJohn Marino contained in the list of memory references REFS. Description of
1290*e4b17023SJohn Marino umber of iterations of LOOP is stored to DESC. NINSNS is the number of
1291*e4b17023SJohn Marino insns of the LOOP. EST_NITER is the estimated number of iterations of
1292*e4b17023SJohn Marino the loop, or -1 if no estimate is available. */
1293*e4b17023SJohn Marino
1294*e4b17023SJohn Marino static unsigned
determine_unroll_factor(struct loop * loop,struct mem_ref_group * refs,unsigned ninsns,struct tree_niter_desc * desc,HOST_WIDE_INT est_niter)1295*e4b17023SJohn Marino determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1296*e4b17023SJohn Marino unsigned ninsns, struct tree_niter_desc *desc,
1297*e4b17023SJohn Marino HOST_WIDE_INT est_niter)
1298*e4b17023SJohn Marino {
1299*e4b17023SJohn Marino unsigned upper_bound;
1300*e4b17023SJohn Marino unsigned nfactor, factor, mod_constraint;
1301*e4b17023SJohn Marino struct mem_ref_group *agp;
1302*e4b17023SJohn Marino struct mem_ref *ref;
1303*e4b17023SJohn Marino
1304*e4b17023SJohn Marino /* First check whether the loop is not too large to unroll. We ignore
1305*e4b17023SJohn Marino PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1306*e4b17023SJohn Marino from unrolling them enough to make exactly one cache line covered by each
1307*e4b17023SJohn Marino iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1308*e4b17023SJohn Marino us from unrolling the loops too many times in cases where we only expect
1309*e4b17023SJohn Marino gains from better scheduling and decreasing loop overhead, which is not
1310*e4b17023SJohn Marino the case here. */
1311*e4b17023SJohn Marino upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1312*e4b17023SJohn Marino
1313*e4b17023SJohn Marino /* If we unrolled the loop more times than it iterates, the unrolled version
1314*e4b17023SJohn Marino of the loop would be never entered. */
1315*e4b17023SJohn Marino if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1316*e4b17023SJohn Marino upper_bound = est_niter;
1317*e4b17023SJohn Marino
1318*e4b17023SJohn Marino if (upper_bound <= 1)
1319*e4b17023SJohn Marino return 1;
1320*e4b17023SJohn Marino
1321*e4b17023SJohn Marino /* Choose the factor so that we may prefetch each cache just once,
1322*e4b17023SJohn Marino but bound the unrolling by UPPER_BOUND. */
1323*e4b17023SJohn Marino factor = 1;
1324*e4b17023SJohn Marino for (agp = refs; agp; agp = agp->next)
1325*e4b17023SJohn Marino for (ref = agp->refs; ref; ref = ref->next)
1326*e4b17023SJohn Marino if (should_issue_prefetch_p (ref))
1327*e4b17023SJohn Marino {
1328*e4b17023SJohn Marino mod_constraint = ref->prefetch_mod;
1329*e4b17023SJohn Marino nfactor = least_common_multiple (mod_constraint, factor);
1330*e4b17023SJohn Marino if (nfactor <= upper_bound)
1331*e4b17023SJohn Marino factor = nfactor;
1332*e4b17023SJohn Marino }
1333*e4b17023SJohn Marino
1334*e4b17023SJohn Marino if (!should_unroll_loop_p (loop, desc, factor))
1335*e4b17023SJohn Marino return 1;
1336*e4b17023SJohn Marino
1337*e4b17023SJohn Marino return factor;
1338*e4b17023SJohn Marino }
1339*e4b17023SJohn Marino
1340*e4b17023SJohn Marino /* Returns the total volume of the memory references REFS, taking into account
1341*e4b17023SJohn Marino reuses in the innermost loop and cache line size. TODO -- we should also
1342*e4b17023SJohn Marino take into account reuses across the iterations of the loops in the loop
1343*e4b17023SJohn Marino nest. */
1344*e4b17023SJohn Marino
1345*e4b17023SJohn Marino static unsigned
volume_of_references(struct mem_ref_group * refs)1346*e4b17023SJohn Marino volume_of_references (struct mem_ref_group *refs)
1347*e4b17023SJohn Marino {
1348*e4b17023SJohn Marino unsigned volume = 0;
1349*e4b17023SJohn Marino struct mem_ref_group *gr;
1350*e4b17023SJohn Marino struct mem_ref *ref;
1351*e4b17023SJohn Marino
1352*e4b17023SJohn Marino for (gr = refs; gr; gr = gr->next)
1353*e4b17023SJohn Marino for (ref = gr->refs; ref; ref = ref->next)
1354*e4b17023SJohn Marino {
1355*e4b17023SJohn Marino /* Almost always reuses another value? */
1356*e4b17023SJohn Marino if (ref->prefetch_before != PREFETCH_ALL)
1357*e4b17023SJohn Marino continue;
1358*e4b17023SJohn Marino
1359*e4b17023SJohn Marino /* If several iterations access the same cache line, use the size of
1360*e4b17023SJohn Marino the line divided by this number. Otherwise, a cache line is
1361*e4b17023SJohn Marino accessed in each iteration. TODO -- in the latter case, we should
1362*e4b17023SJohn Marino take the size of the reference into account, rounding it up on cache
1363*e4b17023SJohn Marino line size multiple. */
1364*e4b17023SJohn Marino volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1365*e4b17023SJohn Marino }
1366*e4b17023SJohn Marino return volume;
1367*e4b17023SJohn Marino }
1368*e4b17023SJohn Marino
1369*e4b17023SJohn Marino /* Returns the volume of memory references accessed across VEC iterations of
1370*e4b17023SJohn Marino loops, whose sizes are described in the LOOP_SIZES array. N is the number
1371*e4b17023SJohn Marino of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1372*e4b17023SJohn Marino
1373*e4b17023SJohn Marino static unsigned
volume_of_dist_vector(lambda_vector vec,unsigned * loop_sizes,unsigned n)1374*e4b17023SJohn Marino volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1375*e4b17023SJohn Marino {
1376*e4b17023SJohn Marino unsigned i;
1377*e4b17023SJohn Marino
1378*e4b17023SJohn Marino for (i = 0; i < n; i++)
1379*e4b17023SJohn Marino if (vec[i] != 0)
1380*e4b17023SJohn Marino break;
1381*e4b17023SJohn Marino
1382*e4b17023SJohn Marino if (i == n)
1383*e4b17023SJohn Marino return 0;
1384*e4b17023SJohn Marino
1385*e4b17023SJohn Marino gcc_assert (vec[i] > 0);
1386*e4b17023SJohn Marino
1387*e4b17023SJohn Marino /* We ignore the parts of the distance vector in subloops, since usually
1388*e4b17023SJohn Marino the numbers of iterations are much smaller. */
1389*e4b17023SJohn Marino return loop_sizes[i] * vec[i];
1390*e4b17023SJohn Marino }
1391*e4b17023SJohn Marino
1392*e4b17023SJohn Marino /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1393*e4b17023SJohn Marino at the position corresponding to the loop of the step. N is the depth
1394*e4b17023SJohn Marino of the considered loop nest, and, LOOP is its innermost loop. */
1395*e4b17023SJohn Marino
1396*e4b17023SJohn Marino static void
add_subscript_strides(tree access_fn,unsigned stride,HOST_WIDE_INT * strides,unsigned n,struct loop * loop)1397*e4b17023SJohn Marino add_subscript_strides (tree access_fn, unsigned stride,
1398*e4b17023SJohn Marino HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1399*e4b17023SJohn Marino {
1400*e4b17023SJohn Marino struct loop *aloop;
1401*e4b17023SJohn Marino tree step;
1402*e4b17023SJohn Marino HOST_WIDE_INT astep;
1403*e4b17023SJohn Marino unsigned min_depth = loop_depth (loop) - n;
1404*e4b17023SJohn Marino
1405*e4b17023SJohn Marino while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1406*e4b17023SJohn Marino {
1407*e4b17023SJohn Marino aloop = get_chrec_loop (access_fn);
1408*e4b17023SJohn Marino step = CHREC_RIGHT (access_fn);
1409*e4b17023SJohn Marino access_fn = CHREC_LEFT (access_fn);
1410*e4b17023SJohn Marino
1411*e4b17023SJohn Marino if ((unsigned) loop_depth (aloop) <= min_depth)
1412*e4b17023SJohn Marino continue;
1413*e4b17023SJohn Marino
1414*e4b17023SJohn Marino if (host_integerp (step, 0))
1415*e4b17023SJohn Marino astep = tree_low_cst (step, 0);
1416*e4b17023SJohn Marino else
1417*e4b17023SJohn Marino astep = L1_CACHE_LINE_SIZE;
1418*e4b17023SJohn Marino
1419*e4b17023SJohn Marino strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1420*e4b17023SJohn Marino
1421*e4b17023SJohn Marino }
1422*e4b17023SJohn Marino }
1423*e4b17023SJohn Marino
1424*e4b17023SJohn Marino /* Returns the volume of memory references accessed between two consecutive
1425*e4b17023SJohn Marino self-reuses of the reference DR. We consider the subscripts of DR in N
1426*e4b17023SJohn Marino loops, and LOOP_SIZES contains the volumes of accesses in each of the
1427*e4b17023SJohn Marino loops. LOOP is the innermost loop of the current loop nest. */
1428*e4b17023SJohn Marino
1429*e4b17023SJohn Marino static unsigned
self_reuse_distance(data_reference_p dr,unsigned * loop_sizes,unsigned n,struct loop * loop)1430*e4b17023SJohn Marino self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1431*e4b17023SJohn Marino struct loop *loop)
1432*e4b17023SJohn Marino {
1433*e4b17023SJohn Marino tree stride, access_fn;
1434*e4b17023SJohn Marino HOST_WIDE_INT *strides, astride;
1435*e4b17023SJohn Marino VEC (tree, heap) *access_fns;
1436*e4b17023SJohn Marino tree ref = DR_REF (dr);
1437*e4b17023SJohn Marino unsigned i, ret = ~0u;
1438*e4b17023SJohn Marino
1439*e4b17023SJohn Marino /* In the following example:
1440*e4b17023SJohn Marino
1441*e4b17023SJohn Marino for (i = 0; i < N; i++)
1442*e4b17023SJohn Marino for (j = 0; j < N; j++)
1443*e4b17023SJohn Marino use (a[j][i]);
1444*e4b17023SJohn Marino the same cache line is accessed each N steps (except if the change from
1445*e4b17023SJohn Marino i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1446*e4b17023SJohn Marino we cannot rely purely on the results of the data dependence analysis.
1447*e4b17023SJohn Marino
1448*e4b17023SJohn Marino Instead, we compute the stride of the reference in each loop, and consider
1449*e4b17023SJohn Marino the innermost loop in that the stride is less than cache size. */
1450*e4b17023SJohn Marino
1451*e4b17023SJohn Marino strides = XCNEWVEC (HOST_WIDE_INT, n);
1452*e4b17023SJohn Marino access_fns = DR_ACCESS_FNS (dr);
1453*e4b17023SJohn Marino
1454*e4b17023SJohn Marino FOR_EACH_VEC_ELT (tree, access_fns, i, access_fn)
1455*e4b17023SJohn Marino {
1456*e4b17023SJohn Marino /* Keep track of the reference corresponding to the subscript, so that we
1457*e4b17023SJohn Marino know its stride. */
1458*e4b17023SJohn Marino while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1459*e4b17023SJohn Marino ref = TREE_OPERAND (ref, 0);
1460*e4b17023SJohn Marino
1461*e4b17023SJohn Marino if (TREE_CODE (ref) == ARRAY_REF)
1462*e4b17023SJohn Marino {
1463*e4b17023SJohn Marino stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1464*e4b17023SJohn Marino if (host_integerp (stride, 1))
1465*e4b17023SJohn Marino astride = tree_low_cst (stride, 1);
1466*e4b17023SJohn Marino else
1467*e4b17023SJohn Marino astride = L1_CACHE_LINE_SIZE;
1468*e4b17023SJohn Marino
1469*e4b17023SJohn Marino ref = TREE_OPERAND (ref, 0);
1470*e4b17023SJohn Marino }
1471*e4b17023SJohn Marino else
1472*e4b17023SJohn Marino astride = 1;
1473*e4b17023SJohn Marino
1474*e4b17023SJohn Marino add_subscript_strides (access_fn, astride, strides, n, loop);
1475*e4b17023SJohn Marino }
1476*e4b17023SJohn Marino
1477*e4b17023SJohn Marino for (i = n; i-- > 0; )
1478*e4b17023SJohn Marino {
1479*e4b17023SJohn Marino unsigned HOST_WIDE_INT s;
1480*e4b17023SJohn Marino
1481*e4b17023SJohn Marino s = strides[i] < 0 ? -strides[i] : strides[i];
1482*e4b17023SJohn Marino
1483*e4b17023SJohn Marino if (s < (unsigned) L1_CACHE_LINE_SIZE
1484*e4b17023SJohn Marino && (loop_sizes[i]
1485*e4b17023SJohn Marino > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1486*e4b17023SJohn Marino {
1487*e4b17023SJohn Marino ret = loop_sizes[i];
1488*e4b17023SJohn Marino break;
1489*e4b17023SJohn Marino }
1490*e4b17023SJohn Marino }
1491*e4b17023SJohn Marino
1492*e4b17023SJohn Marino free (strides);
1493*e4b17023SJohn Marino return ret;
1494*e4b17023SJohn Marino }
1495*e4b17023SJohn Marino
1496*e4b17023SJohn Marino /* Determines the distance till the first reuse of each reference in REFS
1497*e4b17023SJohn Marino in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1498*e4b17023SJohn Marino memory references in the loop. */
1499*e4b17023SJohn Marino
1500*e4b17023SJohn Marino static void
determine_loop_nest_reuse(struct loop * loop,struct mem_ref_group * refs,bool no_other_refs)1501*e4b17023SJohn Marino determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1502*e4b17023SJohn Marino bool no_other_refs)
1503*e4b17023SJohn Marino {
1504*e4b17023SJohn Marino struct loop *nest, *aloop;
1505*e4b17023SJohn Marino VEC (data_reference_p, heap) *datarefs = NULL;
1506*e4b17023SJohn Marino VEC (ddr_p, heap) *dependences = NULL;
1507*e4b17023SJohn Marino struct mem_ref_group *gr;
1508*e4b17023SJohn Marino struct mem_ref *ref, *refb;
1509*e4b17023SJohn Marino VEC (loop_p, heap) *vloops = NULL;
1510*e4b17023SJohn Marino unsigned *loop_data_size;
1511*e4b17023SJohn Marino unsigned i, j, n;
1512*e4b17023SJohn Marino unsigned volume, dist, adist;
1513*e4b17023SJohn Marino HOST_WIDE_INT vol;
1514*e4b17023SJohn Marino data_reference_p dr;
1515*e4b17023SJohn Marino ddr_p dep;
1516*e4b17023SJohn Marino
1517*e4b17023SJohn Marino if (loop->inner)
1518*e4b17023SJohn Marino return;
1519*e4b17023SJohn Marino
1520*e4b17023SJohn Marino /* Find the outermost loop of the loop nest of loop (we require that
1521*e4b17023SJohn Marino there are no sibling loops inside the nest). */
1522*e4b17023SJohn Marino nest = loop;
1523*e4b17023SJohn Marino while (1)
1524*e4b17023SJohn Marino {
1525*e4b17023SJohn Marino aloop = loop_outer (nest);
1526*e4b17023SJohn Marino
1527*e4b17023SJohn Marino if (aloop == current_loops->tree_root
1528*e4b17023SJohn Marino || aloop->inner->next)
1529*e4b17023SJohn Marino break;
1530*e4b17023SJohn Marino
1531*e4b17023SJohn Marino nest = aloop;
1532*e4b17023SJohn Marino }
1533*e4b17023SJohn Marino
1534*e4b17023SJohn Marino /* For each loop, determine the amount of data accessed in each iteration.
1535*e4b17023SJohn Marino We use this to estimate whether the reference is evicted from the
1536*e4b17023SJohn Marino cache before its reuse. */
1537*e4b17023SJohn Marino find_loop_nest (nest, &vloops);
1538*e4b17023SJohn Marino n = VEC_length (loop_p, vloops);
1539*e4b17023SJohn Marino loop_data_size = XNEWVEC (unsigned, n);
1540*e4b17023SJohn Marino volume = volume_of_references (refs);
1541*e4b17023SJohn Marino i = n;
1542*e4b17023SJohn Marino while (i-- != 0)
1543*e4b17023SJohn Marino {
1544*e4b17023SJohn Marino loop_data_size[i] = volume;
1545*e4b17023SJohn Marino /* Bound the volume by the L2 cache size, since above this bound,
1546*e4b17023SJohn Marino all dependence distances are equivalent. */
1547*e4b17023SJohn Marino if (volume > L2_CACHE_SIZE_BYTES)
1548*e4b17023SJohn Marino continue;
1549*e4b17023SJohn Marino
1550*e4b17023SJohn Marino aloop = VEC_index (loop_p, vloops, i);
1551*e4b17023SJohn Marino vol = max_stmt_executions_int (aloop, false);
1552*e4b17023SJohn Marino if (vol < 0)
1553*e4b17023SJohn Marino vol = expected_loop_iterations (aloop);
1554*e4b17023SJohn Marino volume *= vol;
1555*e4b17023SJohn Marino }
1556*e4b17023SJohn Marino
1557*e4b17023SJohn Marino /* Prepare the references in the form suitable for data dependence
1558*e4b17023SJohn Marino analysis. We ignore unanalyzable data references (the results
1559*e4b17023SJohn Marino are used just as a heuristics to estimate temporality of the
1560*e4b17023SJohn Marino references, hence we do not need to worry about correctness). */
1561*e4b17023SJohn Marino for (gr = refs; gr; gr = gr->next)
1562*e4b17023SJohn Marino for (ref = gr->refs; ref; ref = ref->next)
1563*e4b17023SJohn Marino {
1564*e4b17023SJohn Marino dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1565*e4b17023SJohn Marino ref->mem, ref->stmt, !ref->write_p);
1566*e4b17023SJohn Marino
1567*e4b17023SJohn Marino if (dr)
1568*e4b17023SJohn Marino {
1569*e4b17023SJohn Marino ref->reuse_distance = volume;
1570*e4b17023SJohn Marino dr->aux = ref;
1571*e4b17023SJohn Marino VEC_safe_push (data_reference_p, heap, datarefs, dr);
1572*e4b17023SJohn Marino }
1573*e4b17023SJohn Marino else
1574*e4b17023SJohn Marino no_other_refs = false;
1575*e4b17023SJohn Marino }
1576*e4b17023SJohn Marino
1577*e4b17023SJohn Marino FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1578*e4b17023SJohn Marino {
1579*e4b17023SJohn Marino dist = self_reuse_distance (dr, loop_data_size, n, loop);
1580*e4b17023SJohn Marino ref = (struct mem_ref *) dr->aux;
1581*e4b17023SJohn Marino if (ref->reuse_distance > dist)
1582*e4b17023SJohn Marino ref->reuse_distance = dist;
1583*e4b17023SJohn Marino
1584*e4b17023SJohn Marino if (no_other_refs)
1585*e4b17023SJohn Marino ref->independent_p = true;
1586*e4b17023SJohn Marino }
1587*e4b17023SJohn Marino
1588*e4b17023SJohn Marino compute_all_dependences (datarefs, &dependences, vloops, true);
1589*e4b17023SJohn Marino
1590*e4b17023SJohn Marino FOR_EACH_VEC_ELT (ddr_p, dependences, i, dep)
1591*e4b17023SJohn Marino {
1592*e4b17023SJohn Marino if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1593*e4b17023SJohn Marino continue;
1594*e4b17023SJohn Marino
1595*e4b17023SJohn Marino ref = (struct mem_ref *) DDR_A (dep)->aux;
1596*e4b17023SJohn Marino refb = (struct mem_ref *) DDR_B (dep)->aux;
1597*e4b17023SJohn Marino
1598*e4b17023SJohn Marino if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1599*e4b17023SJohn Marino || DDR_NUM_DIST_VECTS (dep) == 0)
1600*e4b17023SJohn Marino {
1601*e4b17023SJohn Marino /* If the dependence cannot be analyzed, assume that there might be
1602*e4b17023SJohn Marino a reuse. */
1603*e4b17023SJohn Marino dist = 0;
1604*e4b17023SJohn Marino
1605*e4b17023SJohn Marino ref->independent_p = false;
1606*e4b17023SJohn Marino refb->independent_p = false;
1607*e4b17023SJohn Marino }
1608*e4b17023SJohn Marino else
1609*e4b17023SJohn Marino {
1610*e4b17023SJohn Marino /* The distance vectors are normalized to be always lexicographically
1611*e4b17023SJohn Marino positive, hence we cannot tell just from them whether DDR_A comes
1612*e4b17023SJohn Marino before DDR_B or vice versa. However, it is not important,
1613*e4b17023SJohn Marino anyway -- if DDR_A is close to DDR_B, then it is either reused in
1614*e4b17023SJohn Marino DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1615*e4b17023SJohn Marino in cache (and marking it as nontemporal would not affect
1616*e4b17023SJohn Marino anything). */
1617*e4b17023SJohn Marino
1618*e4b17023SJohn Marino dist = volume;
1619*e4b17023SJohn Marino for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1620*e4b17023SJohn Marino {
1621*e4b17023SJohn Marino adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1622*e4b17023SJohn Marino loop_data_size, n);
1623*e4b17023SJohn Marino
1624*e4b17023SJohn Marino /* If this is a dependence in the innermost loop (i.e., the
1625*e4b17023SJohn Marino distances in all superloops are zero) and it is not
1626*e4b17023SJohn Marino the trivial self-dependence with distance zero, record that
1627*e4b17023SJohn Marino the references are not completely independent. */
1628*e4b17023SJohn Marino if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1629*e4b17023SJohn Marino && (ref != refb
1630*e4b17023SJohn Marino || DDR_DIST_VECT (dep, j)[n-1] != 0))
1631*e4b17023SJohn Marino {
1632*e4b17023SJohn Marino ref->independent_p = false;
1633*e4b17023SJohn Marino refb->independent_p = false;
1634*e4b17023SJohn Marino }
1635*e4b17023SJohn Marino
1636*e4b17023SJohn Marino /* Ignore accesses closer than
1637*e4b17023SJohn Marino L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1638*e4b17023SJohn Marino so that we use nontemporal prefetches e.g. if single memory
1639*e4b17023SJohn Marino location is accessed several times in a single iteration of
1640*e4b17023SJohn Marino the loop. */
1641*e4b17023SJohn Marino if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1642*e4b17023SJohn Marino continue;
1643*e4b17023SJohn Marino
1644*e4b17023SJohn Marino if (adist < dist)
1645*e4b17023SJohn Marino dist = adist;
1646*e4b17023SJohn Marino }
1647*e4b17023SJohn Marino }
1648*e4b17023SJohn Marino
1649*e4b17023SJohn Marino if (ref->reuse_distance > dist)
1650*e4b17023SJohn Marino ref->reuse_distance = dist;
1651*e4b17023SJohn Marino if (refb->reuse_distance > dist)
1652*e4b17023SJohn Marino refb->reuse_distance = dist;
1653*e4b17023SJohn Marino }
1654*e4b17023SJohn Marino
1655*e4b17023SJohn Marino free_dependence_relations (dependences);
1656*e4b17023SJohn Marino free_data_refs (datarefs);
1657*e4b17023SJohn Marino free (loop_data_size);
1658*e4b17023SJohn Marino
1659*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1660*e4b17023SJohn Marino {
1661*e4b17023SJohn Marino fprintf (dump_file, "Reuse distances:\n");
1662*e4b17023SJohn Marino for (gr = refs; gr; gr = gr->next)
1663*e4b17023SJohn Marino for (ref = gr->refs; ref; ref = ref->next)
1664*e4b17023SJohn Marino fprintf (dump_file, " ref %p distance %u\n",
1665*e4b17023SJohn Marino (void *) ref, ref->reuse_distance);
1666*e4b17023SJohn Marino }
1667*e4b17023SJohn Marino }
1668*e4b17023SJohn Marino
1669*e4b17023SJohn Marino /* Determine whether or not the trip count to ahead ratio is too small based
1670*e4b17023SJohn Marino on prefitablility consideration.
1671*e4b17023SJohn Marino AHEAD: the iteration ahead distance,
1672*e4b17023SJohn Marino EST_NITER: the estimated trip count. */
1673*e4b17023SJohn Marino
1674*e4b17023SJohn Marino static bool
trip_count_to_ahead_ratio_too_small_p(unsigned ahead,HOST_WIDE_INT est_niter)1675*e4b17023SJohn Marino trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1676*e4b17023SJohn Marino {
1677*e4b17023SJohn Marino /* Assume trip count to ahead ratio is big enough if the trip count could not
1678*e4b17023SJohn Marino be estimated at compile time. */
1679*e4b17023SJohn Marino if (est_niter < 0)
1680*e4b17023SJohn Marino return false;
1681*e4b17023SJohn Marino
1682*e4b17023SJohn Marino if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1683*e4b17023SJohn Marino {
1684*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1685*e4b17023SJohn Marino fprintf (dump_file,
1686*e4b17023SJohn Marino "Not prefetching -- loop estimated to roll only %d times\n",
1687*e4b17023SJohn Marino (int) est_niter);
1688*e4b17023SJohn Marino return true;
1689*e4b17023SJohn Marino }
1690*e4b17023SJohn Marino
1691*e4b17023SJohn Marino return false;
1692*e4b17023SJohn Marino }
1693*e4b17023SJohn Marino
1694*e4b17023SJohn Marino /* Determine whether or not the number of memory references in the loop is
1695*e4b17023SJohn Marino reasonable based on the profitablity and compilation time considerations.
1696*e4b17023SJohn Marino NINSNS: estimated number of instructions in the loop,
1697*e4b17023SJohn Marino MEM_REF_COUNT: total number of memory references in the loop. */
1698*e4b17023SJohn Marino
1699*e4b17023SJohn Marino static bool
mem_ref_count_reasonable_p(unsigned ninsns,unsigned mem_ref_count)1700*e4b17023SJohn Marino mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1701*e4b17023SJohn Marino {
1702*e4b17023SJohn Marino int insn_to_mem_ratio;
1703*e4b17023SJohn Marino
1704*e4b17023SJohn Marino if (mem_ref_count == 0)
1705*e4b17023SJohn Marino return false;
1706*e4b17023SJohn Marino
1707*e4b17023SJohn Marino /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1708*e4b17023SJohn Marino (compute_all_dependences) have high costs based on quadratic complexity.
1709*e4b17023SJohn Marino To avoid huge compilation time, we give up prefetching if mem_ref_count
1710*e4b17023SJohn Marino is too large. */
1711*e4b17023SJohn Marino if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1712*e4b17023SJohn Marino return false;
1713*e4b17023SJohn Marino
1714*e4b17023SJohn Marino /* Prefetching improves performance by overlapping cache missing
1715*e4b17023SJohn Marino memory accesses with CPU operations. If the loop does not have
1716*e4b17023SJohn Marino enough CPU operations to overlap with memory operations, prefetching
1717*e4b17023SJohn Marino won't give a significant benefit. One approximate way of checking
1718*e4b17023SJohn Marino this is to require the ratio of instructions to memory references to
1719*e4b17023SJohn Marino be above a certain limit. This approximation works well in practice.
1720*e4b17023SJohn Marino TODO: Implement a more precise computation by estimating the time
1721*e4b17023SJohn Marino for each CPU or memory op in the loop. Time estimates for memory ops
1722*e4b17023SJohn Marino should account for cache misses. */
1723*e4b17023SJohn Marino insn_to_mem_ratio = ninsns / mem_ref_count;
1724*e4b17023SJohn Marino
1725*e4b17023SJohn Marino if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1726*e4b17023SJohn Marino {
1727*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1728*e4b17023SJohn Marino fprintf (dump_file,
1729*e4b17023SJohn Marino "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1730*e4b17023SJohn Marino insn_to_mem_ratio);
1731*e4b17023SJohn Marino return false;
1732*e4b17023SJohn Marino }
1733*e4b17023SJohn Marino
1734*e4b17023SJohn Marino return true;
1735*e4b17023SJohn Marino }
1736*e4b17023SJohn Marino
1737*e4b17023SJohn Marino /* Determine whether or not the instruction to prefetch ratio in the loop is
1738*e4b17023SJohn Marino too small based on the profitablity consideration.
1739*e4b17023SJohn Marino NINSNS: estimated number of instructions in the loop,
1740*e4b17023SJohn Marino PREFETCH_COUNT: an estimate of the number of prefetches,
1741*e4b17023SJohn Marino UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1742*e4b17023SJohn Marino
1743*e4b17023SJohn Marino static bool
insn_to_prefetch_ratio_too_small_p(unsigned ninsns,unsigned prefetch_count,unsigned unroll_factor)1744*e4b17023SJohn Marino insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1745*e4b17023SJohn Marino unsigned unroll_factor)
1746*e4b17023SJohn Marino {
1747*e4b17023SJohn Marino int insn_to_prefetch_ratio;
1748*e4b17023SJohn Marino
1749*e4b17023SJohn Marino /* Prefetching most likely causes performance degradation when the instruction
1750*e4b17023SJohn Marino to prefetch ratio is too small. Too many prefetch instructions in a loop
1751*e4b17023SJohn Marino may reduce the I-cache performance.
1752*e4b17023SJohn Marino (unroll_factor * ninsns) is used to estimate the number of instructions in
1753*e4b17023SJohn Marino the unrolled loop. This implementation is a bit simplistic -- the number
1754*e4b17023SJohn Marino of issued prefetch instructions is also affected by unrolling. So,
1755*e4b17023SJohn Marino prefetch_mod and the unroll factor should be taken into account when
1756*e4b17023SJohn Marino determining prefetch_count. Also, the number of insns of the unrolled
1757*e4b17023SJohn Marino loop will usually be significantly smaller than the number of insns of the
1758*e4b17023SJohn Marino original loop * unroll_factor (at least the induction variable increases
1759*e4b17023SJohn Marino and the exit branches will get eliminated), so it might be better to use
1760*e4b17023SJohn Marino tree_estimate_loop_size + estimated_unrolled_size. */
1761*e4b17023SJohn Marino insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1762*e4b17023SJohn Marino if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1763*e4b17023SJohn Marino {
1764*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1765*e4b17023SJohn Marino fprintf (dump_file,
1766*e4b17023SJohn Marino "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1767*e4b17023SJohn Marino insn_to_prefetch_ratio);
1768*e4b17023SJohn Marino return true;
1769*e4b17023SJohn Marino }
1770*e4b17023SJohn Marino
1771*e4b17023SJohn Marino return false;
1772*e4b17023SJohn Marino }
1773*e4b17023SJohn Marino
1774*e4b17023SJohn Marino
1775*e4b17023SJohn Marino /* Issue prefetch instructions for array references in LOOP. Returns
1776*e4b17023SJohn Marino true if the LOOP was unrolled. */
1777*e4b17023SJohn Marino
1778*e4b17023SJohn Marino static bool
loop_prefetch_arrays(struct loop * loop)1779*e4b17023SJohn Marino loop_prefetch_arrays (struct loop *loop)
1780*e4b17023SJohn Marino {
1781*e4b17023SJohn Marino struct mem_ref_group *refs;
1782*e4b17023SJohn Marino unsigned ahead, ninsns, time, unroll_factor;
1783*e4b17023SJohn Marino HOST_WIDE_INT est_niter;
1784*e4b17023SJohn Marino struct tree_niter_desc desc;
1785*e4b17023SJohn Marino bool unrolled = false, no_other_refs;
1786*e4b17023SJohn Marino unsigned prefetch_count;
1787*e4b17023SJohn Marino unsigned mem_ref_count;
1788*e4b17023SJohn Marino
1789*e4b17023SJohn Marino if (optimize_loop_nest_for_size_p (loop))
1790*e4b17023SJohn Marino {
1791*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1792*e4b17023SJohn Marino fprintf (dump_file, " ignored (cold area)\n");
1793*e4b17023SJohn Marino return false;
1794*e4b17023SJohn Marino }
1795*e4b17023SJohn Marino
1796*e4b17023SJohn Marino /* FIXME: the time should be weighted by the probabilities of the blocks in
1797*e4b17023SJohn Marino the loop body. */
1798*e4b17023SJohn Marino time = tree_num_loop_insns (loop, &eni_time_weights);
1799*e4b17023SJohn Marino if (time == 0)
1800*e4b17023SJohn Marino return false;
1801*e4b17023SJohn Marino
1802*e4b17023SJohn Marino ahead = (PREFETCH_LATENCY + time - 1) / time;
1803*e4b17023SJohn Marino est_niter = max_stmt_executions_int (loop, false);
1804*e4b17023SJohn Marino
1805*e4b17023SJohn Marino /* Prefetching is not likely to be profitable if the trip count to ahead
1806*e4b17023SJohn Marino ratio is too small. */
1807*e4b17023SJohn Marino if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1808*e4b17023SJohn Marino return false;
1809*e4b17023SJohn Marino
1810*e4b17023SJohn Marino ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1811*e4b17023SJohn Marino
1812*e4b17023SJohn Marino /* Step 1: gather the memory references. */
1813*e4b17023SJohn Marino refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1814*e4b17023SJohn Marino
1815*e4b17023SJohn Marino /* Give up prefetching if the number of memory references in the
1816*e4b17023SJohn Marino loop is not reasonable based on profitablity and compilation time
1817*e4b17023SJohn Marino considerations. */
1818*e4b17023SJohn Marino if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1819*e4b17023SJohn Marino goto fail;
1820*e4b17023SJohn Marino
1821*e4b17023SJohn Marino /* Step 2: estimate the reuse effects. */
1822*e4b17023SJohn Marino prune_by_reuse (refs);
1823*e4b17023SJohn Marino
1824*e4b17023SJohn Marino if (nothing_to_prefetch_p (refs))
1825*e4b17023SJohn Marino goto fail;
1826*e4b17023SJohn Marino
1827*e4b17023SJohn Marino determine_loop_nest_reuse (loop, refs, no_other_refs);
1828*e4b17023SJohn Marino
1829*e4b17023SJohn Marino /* Step 3: determine unroll factor. */
1830*e4b17023SJohn Marino unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1831*e4b17023SJohn Marino est_niter);
1832*e4b17023SJohn Marino
1833*e4b17023SJohn Marino /* Estimate prefetch count for the unrolled loop. */
1834*e4b17023SJohn Marino prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1835*e4b17023SJohn Marino if (prefetch_count == 0)
1836*e4b17023SJohn Marino goto fail;
1837*e4b17023SJohn Marino
1838*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1839*e4b17023SJohn Marino fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1840*e4b17023SJohn Marino HOST_WIDE_INT_PRINT_DEC "\n"
1841*e4b17023SJohn Marino "insn count %d, mem ref count %d, prefetch count %d\n",
1842*e4b17023SJohn Marino ahead, unroll_factor, est_niter,
1843*e4b17023SJohn Marino ninsns, mem_ref_count, prefetch_count);
1844*e4b17023SJohn Marino
1845*e4b17023SJohn Marino /* Prefetching is not likely to be profitable if the instruction to prefetch
1846*e4b17023SJohn Marino ratio is too small. */
1847*e4b17023SJohn Marino if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1848*e4b17023SJohn Marino unroll_factor))
1849*e4b17023SJohn Marino goto fail;
1850*e4b17023SJohn Marino
1851*e4b17023SJohn Marino mark_nontemporal_stores (loop, refs);
1852*e4b17023SJohn Marino
1853*e4b17023SJohn Marino /* Step 4: what to prefetch? */
1854*e4b17023SJohn Marino if (!schedule_prefetches (refs, unroll_factor, ahead))
1855*e4b17023SJohn Marino goto fail;
1856*e4b17023SJohn Marino
1857*e4b17023SJohn Marino /* Step 5: unroll the loop. TODO -- peeling of first and last few
1858*e4b17023SJohn Marino iterations so that we do not issue superfluous prefetches. */
1859*e4b17023SJohn Marino if (unroll_factor != 1)
1860*e4b17023SJohn Marino {
1861*e4b17023SJohn Marino tree_unroll_loop (loop, unroll_factor,
1862*e4b17023SJohn Marino single_dom_exit (loop), &desc);
1863*e4b17023SJohn Marino unrolled = true;
1864*e4b17023SJohn Marino }
1865*e4b17023SJohn Marino
1866*e4b17023SJohn Marino /* Step 6: issue the prefetches. */
1867*e4b17023SJohn Marino issue_prefetches (refs, unroll_factor, ahead);
1868*e4b17023SJohn Marino
1869*e4b17023SJohn Marino fail:
1870*e4b17023SJohn Marino release_mem_refs (refs);
1871*e4b17023SJohn Marino return unrolled;
1872*e4b17023SJohn Marino }
1873*e4b17023SJohn Marino
1874*e4b17023SJohn Marino /* Issue prefetch instructions for array references in loops. */
1875*e4b17023SJohn Marino
1876*e4b17023SJohn Marino unsigned int
tree_ssa_prefetch_arrays(void)1877*e4b17023SJohn Marino tree_ssa_prefetch_arrays (void)
1878*e4b17023SJohn Marino {
1879*e4b17023SJohn Marino loop_iterator li;
1880*e4b17023SJohn Marino struct loop *loop;
1881*e4b17023SJohn Marino bool unrolled = false;
1882*e4b17023SJohn Marino int todo_flags = 0;
1883*e4b17023SJohn Marino
1884*e4b17023SJohn Marino if (!HAVE_prefetch
1885*e4b17023SJohn Marino /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1886*e4b17023SJohn Marino -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1887*e4b17023SJohn Marino of processor costs and i486 does not have prefetch, but
1888*e4b17023SJohn Marino -march=pentium4 causes HAVE_prefetch to be true. Ugh. */
1889*e4b17023SJohn Marino || PREFETCH_BLOCK == 0)
1890*e4b17023SJohn Marino return 0;
1891*e4b17023SJohn Marino
1892*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1893*e4b17023SJohn Marino {
1894*e4b17023SJohn Marino fprintf (dump_file, "Prefetching parameters:\n");
1895*e4b17023SJohn Marino fprintf (dump_file, " simultaneous prefetches: %d\n",
1896*e4b17023SJohn Marino SIMULTANEOUS_PREFETCHES);
1897*e4b17023SJohn Marino fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1898*e4b17023SJohn Marino fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1899*e4b17023SJohn Marino fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1900*e4b17023SJohn Marino L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1901*e4b17023SJohn Marino fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1902*e4b17023SJohn Marino fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1903*e4b17023SJohn Marino fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1904*e4b17023SJohn Marino MIN_INSN_TO_PREFETCH_RATIO);
1905*e4b17023SJohn Marino fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1906*e4b17023SJohn Marino PREFETCH_MIN_INSN_TO_MEM_RATIO);
1907*e4b17023SJohn Marino fprintf (dump_file, "\n");
1908*e4b17023SJohn Marino }
1909*e4b17023SJohn Marino
1910*e4b17023SJohn Marino initialize_original_copy_tables ();
1911*e4b17023SJohn Marino
1912*e4b17023SJohn Marino if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1913*e4b17023SJohn Marino {
1914*e4b17023SJohn Marino tree type = build_function_type_list (void_type_node,
1915*e4b17023SJohn Marino const_ptr_type_node, NULL_TREE);
1916*e4b17023SJohn Marino tree decl = add_builtin_function ("__builtin_prefetch", type,
1917*e4b17023SJohn Marino BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1918*e4b17023SJohn Marino NULL, NULL_TREE);
1919*e4b17023SJohn Marino DECL_IS_NOVOPS (decl) = true;
1920*e4b17023SJohn Marino set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1921*e4b17023SJohn Marino }
1922*e4b17023SJohn Marino
1923*e4b17023SJohn Marino /* We assume that size of cache line is a power of two, so verify this
1924*e4b17023SJohn Marino here. */
1925*e4b17023SJohn Marino gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1926*e4b17023SJohn Marino
1927*e4b17023SJohn Marino FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
1928*e4b17023SJohn Marino {
1929*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1930*e4b17023SJohn Marino fprintf (dump_file, "Processing loop %d:\n", loop->num);
1931*e4b17023SJohn Marino
1932*e4b17023SJohn Marino unrolled |= loop_prefetch_arrays (loop);
1933*e4b17023SJohn Marino
1934*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS))
1935*e4b17023SJohn Marino fprintf (dump_file, "\n\n");
1936*e4b17023SJohn Marino }
1937*e4b17023SJohn Marino
1938*e4b17023SJohn Marino if (unrolled)
1939*e4b17023SJohn Marino {
1940*e4b17023SJohn Marino scev_reset ();
1941*e4b17023SJohn Marino todo_flags |= TODO_cleanup_cfg;
1942*e4b17023SJohn Marino }
1943*e4b17023SJohn Marino
1944*e4b17023SJohn Marino free_original_copy_tables ();
1945*e4b17023SJohn Marino return todo_flags;
1946*e4b17023SJohn Marino }
1947