/*
 * Copyright © 2021 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef FREEDRENO_AUTOTUNE_H
#define FREEDRENO_AUTOTUNE_H

#include "util/hash_table.h"
#include "util/list.h"

#include "freedreno_util.h"

struct fd_autotune_results;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a glClear()),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      due to the sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But a high draw count with little overdraw can actually be faster in
 *      bypass mode if it is pushing a lot of state change, due to not having
 *      to go thru the state changes per-tile[2]
 *
 * The approach taken is to measure samples-passed for the batch in order to
 * estimate the amount of overdraw, and detect cases where the number of
 * pixels touched is low.
 *
 * Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE}
 * performance countables, which give a more direct measurement of what we want
 * to know (ie. is framebuffer memory access high enough to prefer GMEM), but
 * with the downside of consuming half of the available RB counters, and with
 * the additional complication that external perfcntr collection (fdperf,
 * perfetto) and the driver could be stomping on each other's feet.  (Also,
 * reading the perfcntrs accurately requires a WFI.)
 *
 * [1] ignoring UBWC
 * [2] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
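/*
 * For illustration, the kind of decision this history enables might look
 * like the sketch below.  The threshold, the average_samples_passed()
 * helper, and the fields referenced are hypothetical, not the driver's
 * actual heuristic:
 *
 *    // Average samples-passed over recent renders to this render target:
 *    uint64_t avg = average_samples_passed(history);
 *
 *    // If measured overdraw is too low to amortize the per-tile overhead,
 *    // prefer bypass; otherwise the bandwidth savings favor GMEM:
 *    if (avg < MIN_SAMPLES_PER_TILE * nr_tiles)
 *       return true;    // bypass/sysmem
 *    return false;      // GMEM
 */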
struct fd_autotune {

   /**
    * Cache to map batch->key (also used for batch-cache) to historical
    * information about rendering to that particular render target.
    */
   struct hash_table *ht;

   /**
    * List of recently used historical results (to age out old results)
    */
   struct list_head lru;

   /**
    * GPU buffer used to communicate results back to the CPU
    */
   struct fd_bo *results_mem;
   struct fd_autotune_results *results;

   /**
    * List of per-batch results that we are waiting for the GPU to finish
    * with before reading back the results.
    */
   struct list_head pending_results;

   uint32_t fence_counter;
   uint32_t idx_counter;
};

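/*
 * A sketch of the intended ht/lru access pattern (the batch->hash/key and
 * fd_batch_history::node members referenced here are assumptions for
 * illustration; the real bookkeeping lives in the autotune implementation):
 *
 *    struct hash_entry *entry =
 *       _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key);
 *    struct fd_batch_history *history = entry ? entry->data : NULL;
 *    if (history) {
 *       // Touch the entry so stale render targets age out from the LRU tail:
 *       list_delinit(&history->node);
 *       list_add(&history->node, &at->lru);
 *    }
 */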
/**
 * The layout of the memory used to read back per-batch results from the
 * GPU.
 *
 * Note that this struct is intentionally aligned to 4k, and the hw requires
 * the sample start/stop locations to be 128b aligned.
 */
struct fd_autotune_results {

   /**
    * The GPU writes back a "fence" seqno value from the cmdstream after
    * it finishes writing its result slot, so that the CPU knows when
    * results are valid.
    */
   uint32_t fence;

   uint32_t __pad0;
   uint64_t __pad1;

   /**
    * From the cmdstream, the captured samples-passed values are recorded
    * at the start and end of the batch.
    *
    * Note that we do the math on the CPU to avoid a WFI (see the sketch
    * after this struct).  But pre-emption may force us to revisit that.
    */
   struct {
      uint64_t samples_start;
      uint64_t __pad0;
      uint64_t samples_end;
      uint64_t __pad1;
   } result[127];
};

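/*
 * The CPU-side math referred to above amounts to something like the
 * following (a sketch; the loop over pending results and any fence
 * wraparound handling are elided):
 *
 *    // Only trust a slot once the GPU has advanced the fence past it:
 *    if (at->results->fence >= result->fence) {
 *       result->samples_passed =
 *          at->results->result[result->idx].samples_end -
 *          at->results->result[result->idx].samples_start;
 *    }
 */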
#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))
#define results_ptr(at, member)                                                \
   (at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0

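/*
 * results_ptr expands to the (bo, offset, or, shift) argument list that the
 * cmdstream reloc helpers expect, so emitting the address of e.g. a slot's
 * start counter might look like (illustrative usage):
 *
 *    OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start));
 */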
struct fd_batch_history;

/**
 * Tracks the results from an individual batch.  Initially created per batch,
 * and appended to the tail of at->pending_results.  At a later time, when
 * the GPU has finished writing the results, the result is read back and
 * accumulated into the associated fd_batch_history.
 *
 * ralloc parent is the associated fd_batch_history
 */
struct fd_batch_result {

   /**
    * The index/slot in fd_autotune_results::result[] to write start/end
    * counter to
    */
   unsigned idx;

   /**
    * Fence value to write back to fd_autotune_results::fence after both
    * start/end values are written
    */
   uint32_t fence;

   /*
    * Below here, only used internally within autotune
    */
   struct fd_batch_history *history;
   struct list_head node;
   uint32_t cost;
   uint64_t samples_passed;
};

void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev);
void fd_autotune_fini(struct fd_autotune *at);

struct fd_batch;
bool fd_autotune_use_bypass(struct fd_autotune *at,
                            struct fd_batch *batch) assert_dt;

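/*
 * Expected usage when flushing a batch (a sketch; the ctx->autotune field
 * and the render_sysmem()/render_gmem() names are placeholders for the
 * actual tiling paths):
 *
 *    if (fd_autotune_use_bypass(&ctx->autotune, batch))
 *       render_sysmem(batch);
 *    else
 *       render_gmem(batch);
 */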
#endif /* FREEDRENO_AUTOTUNE_H */