1 /*
2  * Copyright © 2021 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "freedreno_autotune.h"
25 #include "freedreno_batch.h"
26 #include "freedreno_util.h"
27 
28 /**
29  * Tracks, for a given batch key (which maps to a FBO/framebuffer state),
30  *
31  * ralloc parent is fd_autotune::ht
32  */
33 struct fd_batch_history {
34    struct fd_batch_key *key;
35 
36    /* Entry in fd_autotune::lru: */
37    struct list_head node;
38 
39    unsigned num_results;
40 
41    /**
42     * List of recent fd_batch_result's
43     */
44    struct list_head results;
45 #define MAX_RESULTS 5
46 };
47 
48 static struct fd_batch_history *
get_history(struct fd_autotune * at,struct fd_batch * batch)49 get_history(struct fd_autotune *at, struct fd_batch *batch)
50 {
51    struct fd_batch_history *history;
52 
53    /* draw batches should still have their key at this point. */
54    assert(batch->key || batch->nondraw);
55    if (!batch->key)
56       return NULL;
57 
58    struct hash_entry *entry =
59       _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key);
60 
61    if (entry) {
62       history = entry->data;
63       goto found;
64    }
65 
66    history = rzalloc_size(at->ht, sizeof(*history));
67 
68    history->key = fd_batch_key_clone(history, batch->key);
69    list_inithead(&history->node);
70    list_inithead(&history->results);
71 
72    /* Note: We cap # of cached GMEM states at 20.. so assuming double-
73     * buffering, 40 should be a good place to cap cached autotune state
74     */
75    if (at->ht->entries >= 40) {
76       struct fd_batch_history *last =
77          list_last_entry(&at->lru, struct fd_batch_history, node);
78       _mesa_hash_table_remove_key(at->ht, last->key);
79       list_del(&last->node);
80       ralloc_free(last);
81    }
82 
83    _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key,
84                                       history);
85 
86 found:
87    /* Move to the head of the LRU: */
88    list_delinit(&history->node);
89    list_add(&history->node, &at->lru);
90 
91    return history;
92 }
93 
94 static void
result_destructor(void * r)95 result_destructor(void *r)
96 {
97    struct fd_batch_result *result = r;
98 
99    /* Just in case we manage to somehow still be on the pending_results list: */
100    list_del(&result->node);
101 }
102 
103 static struct fd_batch_result *
get_result(struct fd_autotune * at,struct fd_batch_history * history)104 get_result(struct fd_autotune *at, struct fd_batch_history *history)
105 {
106    struct fd_batch_result *result = rzalloc_size(history, sizeof(*result));
107 
108    result->fence =
109       ++at->fence_counter; /* pre-increment so zero isn't valid fence */
110    result->idx = at->idx_counter++;
111 
112    if (at->idx_counter >= ARRAY_SIZE(at->results->result))
113       at->idx_counter = 0;
114 
115    result->history = history;
116    list_addtail(&result->node, &at->pending_results);
117 
118    ralloc_set_destructor(result, result_destructor);
119 
120    return result;
121 }
122 
123 static void
process_results(struct fd_autotune * at)124 process_results(struct fd_autotune *at)
125 {
126    uint32_t current_fence = at->results->fence;
127 
128    list_for_each_entry_safe (struct fd_batch_result, result,
129                              &at->pending_results, node) {
130       if (result->fence > current_fence)
131          break;
132 
133       struct fd_batch_history *history = result->history;
134 
135       result->samples_passed = at->results->result[result->idx].samples_end -
136                                at->results->result[result->idx].samples_start;
137 
138       list_delinit(&result->node);
139       list_add(&result->node, &history->results);
140 
141       if (history->num_results < MAX_RESULTS) {
142          history->num_results++;
143       } else {
144          /* Once above a limit, start popping old results off the
145           * tail of the list:
146           */
147          struct fd_batch_result *old_result =
148             list_last_entry(&history->results, struct fd_batch_result, node);
149          list_delinit(&old_result->node);
150          ralloc_free(old_result);
151       }
152    }
153 }
154 
155 static bool
fallback_use_bypass(struct fd_batch * batch)156 fallback_use_bypass(struct fd_batch *batch)
157 {
158    struct pipe_framebuffer_state *pfb = &batch->framebuffer;
159 
160    /* Fallback logic if we have no historical data about the rendertarget: */
161    if (batch->cleared || batch->gmem_reason ||
162        (batch->num_draws > 5) || (pfb->samples > 1)) {
163       return false;
164    }
165 
166    return true;
167 }
168 
169 /**
170  * A magic 8-ball that tells the gmem code whether we should do bypass mode
171  * for moar fps.
172  */
173 bool
fd_autotune_use_bypass(struct fd_autotune * at,struct fd_batch * batch)174 fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch)
175 {
176    struct pipe_framebuffer_state *pfb = &batch->framebuffer;
177 
178    process_results(at);
179 
180    /* Only enable on gen's that opt-in (and actually have sample-passed
181     * collection wired up:
182     */
183    if (!batch->ctx->screen->gmem_reason_mask)
184       return fallback_use_bypass(batch);
185 
186    if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask)
187       return fallback_use_bypass(batch);
188 
189    for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
190       /* If ms-rtt is involved, force GMEM, as we don't currently
191        * implement a temporary render target that we can MSAA resolve
192        * from
193        */
194       if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples)
195          return fallback_use_bypass(batch);
196    }
197 
198    struct fd_batch_history *history = get_history(at, batch);
199    if (!history)
200       return fallback_use_bypass(batch);
201 
202    batch->autotune_result = get_result(at, history);
203    batch->autotune_result->cost = batch->cost;
204 
205    bool use_bypass = fallback_use_bypass(batch);
206 
207    if (use_bypass)
208       return true;
209 
210    if (history->num_results > 0) {
211       uint32_t total_samples = 0;
212 
213       // TODO we should account for clears somehow
214       // TODO should we try to notice if there is a drastic change from
215       // frame to frame?
216       list_for_each_entry (struct fd_batch_result, result, &history->results,
217                            node) {
218          total_samples += result->samples_passed;
219       }
220 
221       float avg_samples = (float)total_samples / (float)history->num_results;
222 
223       /* Low sample count could mean there was only a clear.. or there was
224        * a clear plus draws that touch no or few samples
225        */
226       if (avg_samples < 500.0)
227          return true;
228 
229       /* Cost-per-sample is an estimate for the average number of reads+
230        * writes for a given passed sample.
231        */
232       float sample_cost = batch->cost;
233       sample_cost /= batch->num_draws;
234 
235       float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws;
236       DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, "
237           "total_draw_cost=%f\n",
238           batch->hash, batch->num_draws, total_samples, avg_samples,
239           sample_cost, total_draw_cost);
240 
241       if (total_draw_cost < 3000.0)
242          return true;
243    }
244 
245    return use_bypass;
246 }
247 
248 void
fd_autotune_init(struct fd_autotune * at,struct fd_device * dev)249 fd_autotune_init(struct fd_autotune *at, struct fd_device *dev)
250 {
251    at->ht =
252       _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals);
253    list_inithead(&at->lru);
254 
255    at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results),
256                                0, "autotune");
257    at->results = fd_bo_map(at->results_mem);
258 
259    list_inithead(&at->pending_results);
260 }
261 
262 void
fd_autotune_fini(struct fd_autotune * at)263 fd_autotune_fini(struct fd_autotune *at)
264 {
265    _mesa_hash_table_destroy(at->ht, NULL);
266    fd_bo_del(at->results_mem);
267 }
268