1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 /** @file brw_program_cache.c
33  *
34  * This file implements a simple program cache for 965.  The consumers can
35  *  query the hash table of programs using a cache_id and program key, and
36  * receive the corresponding program buffer object (plus associated auxiliary
37  *  data) in return.  Objects in the cache may not have relocations
38  * (pointers to other BOs) in them.
39  *
 * The inner workings are a simple hash table based on an XXH32 hash of the
 * cache_id and key data.
42  *
43  * Replacement is not implemented.  Instead, when the cache gets too
44  * big we throw out all of the cache data and let it get regenerated.
45  */
46 
47 #include "main/streaming-load-memcpy.h"
48 #include "x86/common_x86_asm.h"
49 #include "brw_batch.h"
50 #include "brw_state.h"
51 #include "brw_wm.h"
52 #include "brw_gs.h"
53 #include "brw_cs.h"
54 #include "brw_program.h"
55 #include "compiler/brw_eu.h"
56 #include "util/u_memory.h"
57 #define XXH_INLINE_ALL
58 #include "util/xxhash.h"
59 
60 #define FILE_DEBUG_FLAG DEBUG_STATE
61 
/**
 * One entry in the program cache hash table.  Entries that hash to the same
 * bucket are chained together through \c next.
 */
struct brw_cache_item {
   /**
    * Effectively part of the key, cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should be set.
    */
   enum brw_cache_id cache_id;

   /** 32-bit hash of the cache_id and the key data */
   GLuint hash;

   /** Size in bytes of the key (keys are variable-sized) */
   GLuint key_size;
   /** Size in bytes of the prog_data stored directly after the key */
   GLuint prog_data_size;
   /** Heap copy of the key, immediately followed by the prog_data */
   const struct brw_base_prog_key *key;

   /** Byte offset of the program within the cache mapping */
   uint32_t offset;
   /** Size in bytes of the program */
   uint32_t size;

   /** Next item in the same hash bucket, or NULL */
   struct brw_cache_item *next;
};
82 
83 enum brw_cache_id
brw_stage_cache_id(gl_shader_stage stage)84 brw_stage_cache_id(gl_shader_stage stage)
85 {
86    static const enum brw_cache_id stage_ids[] = {
87       BRW_CACHE_VS_PROG,
88       BRW_CACHE_TCS_PROG,
89       BRW_CACHE_TES_PROG,
90       BRW_CACHE_GS_PROG,
91       BRW_CACHE_FS_PROG,
92       BRW_CACHE_CS_PROG,
93    };
94    assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
95    return stage_ids[stage];
96 }
97 
98 static GLuint
hash_key(struct brw_cache_item * item)99 hash_key(struct brw_cache_item *item)
100 {
101     uint32_t hash = 0;
102     hash = XXH32(&item->cache_id, sizeof(item->cache_id), hash);
103     hash = XXH32(item->key, item->key_size, hash);
104 
105    return hash;
106 }
107 
108 static int
brw_cache_item_equals(const struct brw_cache_item * a,const struct brw_cache_item * b)109 brw_cache_item_equals(const struct brw_cache_item *a,
110                       const struct brw_cache_item *b)
111 {
112    return a->cache_id == b->cache_id &&
113       a->hash == b->hash &&
114       a->key_size == b->key_size &&
115       (memcmp(a->key, b->key, a->key_size) == 0);
116 }
117 
118 static struct brw_cache_item *
search_cache(struct brw_cache * cache,GLuint hash,struct brw_cache_item * lookup)119 search_cache(struct brw_cache *cache, GLuint hash,
120              struct brw_cache_item *lookup)
121 {
122    struct brw_cache_item *c;
123 
124 #if 0
125    int bucketcount = 0;
126 
127    for (c = cache->items[hash % cache->size]; c; c = c->next)
128       bucketcount++;
129 
130    fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
131            cache->size, bucketcount, cache->n_items);
132 #endif
133 
134    for (c = cache->items[hash % cache->size]; c; c = c->next) {
135       if (brw_cache_item_equals(lookup, c))
136          return c;
137    }
138 
139    return NULL;
140 }
141 
142 
143 static void
rehash(struct brw_cache * cache)144 rehash(struct brw_cache *cache)
145 {
146    struct brw_cache_item **items;
147    struct brw_cache_item *c, *next;
148    GLuint size, i;
149 
150    size = cache->size * 3;
151    items = calloc(size, sizeof(*items));
152 
153    for (i = 0; i < cache->size; i++)
154       for (c = cache->items[i]; c; c = next) {
155          next = c->next;
156          c->next = items[c->hash % size];
157          items[c->hash % size] = c;
158       }
159 
160    free(cache->items);
161    cache->items = items;
162    cache->size = size;
163 }
164 
165 
166 /**
167  * Returns the buffer object matching cache_id and key, or NULL.
168  */
169 bool
brw_search_cache(struct brw_cache * cache,enum brw_cache_id cache_id,const void * key,GLuint key_size,uint32_t * inout_offset,void * inout_prog_data,bool flag_state)170 brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
171                  const void *key, GLuint key_size, uint32_t *inout_offset,
172                  void *inout_prog_data, bool flag_state)
173 {
174    struct brw_cache_item *item;
175    struct brw_cache_item lookup;
176    GLuint hash;
177 
178    lookup.cache_id = cache_id;
179    lookup.key = key;
180    lookup.key_size = key_size;
181    hash = hash_key(&lookup);
182    lookup.hash = hash;
183 
184    item = search_cache(cache, hash, &lookup);
185 
186    if (item == NULL)
187       return false;
188 
189    void *prog_data = ((char *) item->key) + item->key_size;
190 
191    if (item->offset != *inout_offset ||
192        prog_data != *((void **) inout_prog_data)) {
193       if (likely(flag_state))
194          cache->brw->ctx.NewDriverState |= (1 << cache_id);
195       *inout_offset = item->offset;
196       *((void **) inout_prog_data) = prog_data;
197    }
198 
199    return true;
200 }
201 
202 static void
brw_cache_new_bo(struct brw_cache * cache,uint32_t new_size)203 brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
204 {
205    struct brw_context *brw = cache->brw;
206    struct brw_bo *new_bo;
207 
208    perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
209               (unsigned) cache->bo->size / 1024, new_size / 1024);
210 
211    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
212                          BRW_MEMZONE_SHADER);
213    if (can_do_exec_capture(brw->screen))
214       new_bo->kflags |= EXEC_OBJECT_CAPTURE;
215 
216    void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
217                                        MAP_ASYNC | MAP_PERSISTENT);
218 
219    /* Copy any existing data that needs to be saved. */
220    if (cache->next_offset != 0) {
221 #ifdef USE_SSE41
222       if (!cache->bo->cache_coherent && cpu_has_sse4_1)
223          _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
224       else
225 #endif
226          memcpy(map, cache->map, cache->next_offset);
227    }
228 
229    brw_bo_unmap(cache->bo);
230    brw_bo_unreference(cache->bo);
231    cache->bo = new_bo;
232    cache->map = map;
233 
234    /* Since we have a new BO in place, we need to signal the units
235     * that depend on it (state base address on gfx5+, or unit state before).
236     */
237    brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
238    brw->batch.state_base_address_emitted = false;
239 }
240 
241 /**
242  * Attempts to find an item in the cache with identical data.
243  */
244 static const struct brw_cache_item *
brw_lookup_prog(const struct brw_cache * cache,enum brw_cache_id cache_id,const void * data,unsigned data_size)245 brw_lookup_prog(const struct brw_cache *cache,
246                 enum brw_cache_id cache_id,
247                 const void *data, unsigned data_size)
248 {
249    unsigned i;
250    const struct brw_cache_item *item;
251 
252    for (i = 0; i < cache->size; i++) {
253       for (item = cache->items[i]; item; item = item->next) {
254          if (item->cache_id != cache_id || item->size != data_size ||
255              memcmp(cache->map + item->offset, data, item->size) != 0)
256             continue;
257 
258          return item;
259       }
260    }
261 
262    return NULL;
263 }
264 
265 static uint32_t
brw_alloc_item_data(struct brw_cache * cache,uint32_t size)266 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
267 {
268    uint32_t offset;
269 
270    /* Allocate space in the cache BO for our new program. */
271    if (cache->next_offset + size > cache->bo->size) {
272       uint32_t new_size = cache->bo->size * 2;
273 
274       while (cache->next_offset + size > new_size)
275          new_size *= 2;
276 
277       brw_cache_new_bo(cache, new_size);
278    }
279 
280    offset = cache->next_offset;
281 
282    /* Programs are always 64-byte aligned, so set up the next one now */
283    cache->next_offset = ALIGN(offset + size, 64);
284 
285    return offset;
286 }
287 
288 const void *
brw_find_previous_compile(struct brw_cache * cache,enum brw_cache_id cache_id,unsigned program_string_id)289 brw_find_previous_compile(struct brw_cache *cache,
290                           enum brw_cache_id cache_id,
291                           unsigned program_string_id)
292 {
293    for (unsigned i = 0; i < cache->size; i++) {
294       for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
295          if (c->cache_id == cache_id &&
296              c->key->program_string_id == program_string_id) {
297             return c->key;
298          }
299       }
300    }
301 
302    return NULL;
303 }
304 
305 void
brw_upload_cache(struct brw_cache * cache,enum brw_cache_id cache_id,const void * key,GLuint key_size,const void * data,GLuint data_size,const void * prog_data,GLuint prog_data_size,uint32_t * out_offset,void * out_prog_data)306 brw_upload_cache(struct brw_cache *cache,
307                  enum brw_cache_id cache_id,
308                  const void *key,
309                  GLuint key_size,
310                  const void *data,
311                  GLuint data_size,
312                  const void *prog_data,
313                  GLuint prog_data_size,
314                  uint32_t *out_offset,
315                  void *out_prog_data)
316 {
317    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
318    const struct brw_cache_item *matching_data =
319       brw_lookup_prog(cache, cache_id, data, data_size);
320    GLuint hash;
321    void *tmp;
322 
323    item->cache_id = cache_id;
324    item->size = data_size;
325    item->key = key;
326    item->key_size = key_size;
327    item->prog_data_size = prog_data_size;
328    hash = hash_key(item);
329    item->hash = hash;
330 
331    /* If we can find a matching prog in the cache already, then reuse the
332     * existing stuff without creating new copy into the underlying buffer
333     * object. This is notably useful for programs generating shaders at
334     * runtime, where multiple shaders may compile to the same thing in our
335     * backend.
336     */
337    if (matching_data) {
338       item->offset = matching_data->offset;
339    } else {
340       item->offset = brw_alloc_item_data(cache, data_size);
341 
342       /* Copy data to the buffer */
343       memcpy(cache->map + item->offset, data, data_size);
344    }
345 
346    /* Set up the memory containing the key and prog_data */
347    tmp = malloc(key_size + prog_data_size);
348 
349    memcpy(tmp, key, key_size);
350    memcpy(tmp + key_size, prog_data, prog_data_size);
351 
352    item->key = tmp;
353 
354    if (cache->n_items > cache->size * 1.5f)
355       rehash(cache);
356 
357    hash %= cache->size;
358    item->next = cache->items[hash];
359    cache->items[hash] = item;
360    cache->n_items++;
361 
362    *out_offset = item->offset;
363    *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
364    cache->brw->ctx.NewDriverState |= 1 << cache_id;
365 }
366 
367 void
brw_init_caches(struct brw_context * brw)368 brw_init_caches(struct brw_context *brw)
369 {
370    struct brw_cache *cache = &brw->cache;
371 
372    cache->brw = brw;
373 
374    cache->size = 7;
375    cache->n_items = 0;
376    cache->items =
377       calloc(cache->size, sizeof(struct brw_cache_item *));
378 
379    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
380                             BRW_MEMZONE_SHADER);
381    if (can_do_exec_capture(brw->screen))
382       cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
383 
384    cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
385                                            MAP_ASYNC | MAP_PERSISTENT);
386 }
387 
388 static void
brw_clear_cache(struct brw_context * brw,struct brw_cache * cache)389 brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
390 {
391    struct brw_cache_item *c, *next;
392    GLuint i;
393 
394    DBG("%s\n", __func__);
395 
396    for (i = 0; i < cache->size; i++) {
397       for (c = cache->items[i]; c; c = next) {
398          next = c->next;
399          if (c->cache_id == BRW_CACHE_VS_PROG ||
400              c->cache_id == BRW_CACHE_TCS_PROG ||
401              c->cache_id == BRW_CACHE_TES_PROG ||
402              c->cache_id == BRW_CACHE_GS_PROG ||
403              c->cache_id == BRW_CACHE_FS_PROG ||
404              c->cache_id == BRW_CACHE_CS_PROG) {
405             const void *item_prog_data = ((char *)c->key) + c->key_size;
406             brw_stage_prog_data_free(item_prog_data);
407          }
408          free((void *)c->key);
409          free(c);
410       }
411       cache->items[i] = NULL;
412    }
413 
414    cache->n_items = 0;
415 
416    /* Start putting programs into the start of the BO again, since
417     * we'll never find the old results.
418     */
419    cache->next_offset = 0;
420 
421    /* We need to make sure that the programs get regenerated, since
422     * any offsets leftover in brw_context will no longer be valid.
423     */
424    brw->NewGLState = ~0;
425    brw->ctx.NewDriverState = ~0ull;
426    brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
427    brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
428    brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
429    brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;
430 
431    /* Also, NULL out any stale program pointers. */
432    brw->vs.base.prog_data = NULL;
433    brw->tcs.base.prog_data = NULL;
434    brw->tes.base.prog_data = NULL;
435    brw->gs.base.prog_data = NULL;
436    brw->wm.base.prog_data = NULL;
437    brw->cs.base.prog_data = NULL;
438 
439    brw_batch_flush(brw);
440 }
441 
442 void
brw_program_cache_check_size(struct brw_context * brw)443 brw_program_cache_check_size(struct brw_context *brw)
444 {
445    /* un-tuned guess.  Each object is generally a page, so 2000 of them is 8 MB of
446     * state cache.
447     */
448    if (brw->cache.n_items > 2000) {
449       perf_debug("Exceeded state cache size limit.  Clearing the set "
450                  "of compiled programs, which will trigger recompiles\n");
451       brw_clear_cache(brw, &brw->cache);
452       brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
453    }
454 }
455 
456 
457 static void
brw_destroy_cache(struct brw_context * brw,struct brw_cache * cache)458 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
459 {
460 
461    DBG("%s\n", __func__);
462 
463    /* This can be NULL if context creation failed early on */
464    if (cache->bo) {
465       brw_bo_unmap(cache->bo);
466       brw_bo_unreference(cache->bo);
467       cache->bo = NULL;
468       cache->map = NULL;
469    }
470    brw_clear_cache(brw, cache);
471    free(cache->items);
472    cache->items = NULL;
473    cache->size = 0;
474 }
475 
476 
477 void
brw_destroy_caches(struct brw_context * brw)478 brw_destroy_caches(struct brw_context *brw)
479 {
480    brw_destroy_cache(brw, &brw->cache);
481 }
482 
483 static const char *
cache_name(enum brw_cache_id cache_id)484 cache_name(enum brw_cache_id cache_id)
485 {
486    switch (cache_id) {
487    case BRW_CACHE_VS_PROG:
488       return "VS kernel";
489    case BRW_CACHE_TCS_PROG:
490       return "TCS kernel";
491    case BRW_CACHE_TES_PROG:
492       return "TES kernel";
493    case BRW_CACHE_FF_GS_PROG:
494       return "Fixed-function GS kernel";
495    case BRW_CACHE_GS_PROG:
496       return "GS kernel";
497    case BRW_CACHE_CLIP_PROG:
498       return "CLIP kernel";
499    case BRW_CACHE_SF_PROG:
500       return "SF kernel";
501    case BRW_CACHE_FS_PROG:
502       return "FS kernel";
503    case BRW_CACHE_CS_PROG:
504       return "CS kernel";
505    default:
506       return "unknown";
507    }
508 }
509 
510 void
brw_print_program_cache(struct brw_context * brw)511 brw_print_program_cache(struct brw_context *brw)
512 {
513    const struct brw_cache *cache = &brw->cache;
514    struct brw_cache_item *item;
515 
516    for (unsigned i = 0; i < cache->size; i++) {
517       for (item = cache->items[i]; item; item = item->next) {
518          fprintf(stderr, "%s:\n", cache_name(i));
519          brw_disassemble_with_labels(&brw->screen->devinfo, cache->map,
520                                      item->offset, item->size, stderr);
521       }
522    }
523 }
524