/*
 * Copyright (c) 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * The aux map provides a multi-level lookup of a main surface's address
 * that yields information about the auxiliary surface data, including the
 * address at which the auxiliary data resides.
 *
 * The 48-bit VMA (GPU) address of the main surface is split to do the
 * address lookup:
 *
 *  48 bit address of main surface
 * +--------+--------+--------+------+
 * | 47:36  | 35:24  | 23:16  | 15:0 |
 * | L3-idx | L2-idx | L1-idx | ...  |
 * +--------+--------+--------+------+
 *
 * The GFX_AUX_TABLE_BASE_ADDR points to a buffer. The L3 Table Entry is
 * located by indexing into this buffer as a uint64_t array using the L3-idx
 * value. The 64-bit L3 entry is defined as:
 *
 * +-------+-------------+------+---+
 * | 63:48 |    47:15    | 14:1 | 0 |
 * |  ...  | L2-tbl-addr |  ... | V |
 * +-------+-------------+------+---+
 *
 * If the `V` (valid) bit is set, then the L2-tbl-addr gives the address of
 * the level-2 table, with the lower address bits filled with zero. The L2
 * Table Entry is located by indexing into the L2 table as a uint64_t array
 * using the L2-idx value. The 64-bit L2 entry is similar to the L3 entry,
 * except with 2 additional address bits:
 *
 * +-------+-------------+------+---+
 * | 63:48 |    47:13    | 12:1 | 0 |
 * |  ...  | L1-tbl-addr |  ... | V |
 * +-------+-------------+------+---+
 *
 * If the `V` bit is set, then the L1-tbl-addr gives the address of the
 * level-1 table, with the lower address bits filled with zero. The L1 Table
 * Entry is located by indexing into the L1 table as a uint64_t array using
 * the L1-idx value. The 64-bit L1 entry is defined as:
 *
 * +--------+------+-------+-------+-------+---------------+-----+---+
 * | 63:58  |  57  | 56:54 | 53:52 | 51:48 |      47:8     | 7:1 | 0 |
 * | Format | Y/Cr | Depth |  TM   |  ...  | aux-data-addr | ... | V |
 * +--------+------+-------+-------+-------+---------------+-----+---+
 *
 * Where:
 *  - Format: See `isl_format_get_aux_map_encoding`
 *  - Y/Cr: 0=not-Y/Cr, 1=Y/Cr
 *  - (bit) Depth: See `get_bpp_encoding`
 *  - TM (Tile-mode): 0=Ys, 1=Y, 2=rsvd, 3=rsvd
 *  - aux-data-addr: VMA/GPU address of the aux data
 *  - V: entry is valid
 */
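
/*
 * A sketch of the address extraction implied by the layout above; the masks
 * mirror the ones used in get_aux_entry() and remove_mapping() below:
 *
 *    l2_table_addr = l3_entry & 0xffffffff8000ull;   // bits 47:15, if V set
 *    l1_table_addr = l2_entry & 0xffffffffe000ull;   // bits 47:13, if V set
 *    aux_data_addr = l1_entry & 0xffffffffff00ull;   // bits 47:8,  if V set
 */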

#include "gen_aux_map.h"
#include "gen_gem.h"

#include "dev/gen_device_info.h"
#include "isl/isl.h"

#include "drm-uapi/i915_drm.h"
#include "util/list.h"
#include "util/ralloc.h"
#include "util/u_atomic.h"
#include "main/macros.h"

#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>

static const bool aux_map_debug = false;

struct aux_map_buffer {
   struct list_head link;
   struct gen_buffer *buffer;
};

struct gen_aux_map_context {
   void *driver_ctx;
   pthread_mutex_t mutex;
   struct gen_mapped_pinned_buffer_alloc *buffer_alloc;
   uint32_t num_buffers;
   struct list_head buffers;
   uint64_t level3_base_addr;
   uint64_t *level3_map;
   /* Write position and remaining space within the tail buffer */
   uint32_t tail_offset, tail_remaining;
   /* Incremented whenever the tables change in a way that requires the
    * aux-map to be invalidated; read via gen_aux_map_get_state_num().
    */
   uint32_t state_num;
};

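/* Allocate a new 1 MiB buffer from the driver and make it the tail buffer
 * from which new sub-tables are carved.
 */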
static bool
add_buffer(struct gen_aux_map_context *ctx)
{
   struct aux_map_buffer *buf = ralloc(ctx, struct aux_map_buffer);
   if (!buf)
      return false;

   const uint32_t size = 0x100000;
   buf->buffer = ctx->buffer_alloc->alloc(ctx->driver_ctx, size);
   if (!buf->buffer) {
      ralloc_free(buf);
      return false;
   }

   assert(buf->buffer->map != NULL);

   list_addtail(&buf->link, &ctx->buffers);
   ctx->tail_offset = 0;
   ctx->tail_remaining = size;
   p_atomic_inc(&ctx->num_buffers);

   return true;
}

static void
advance_current_pos(struct gen_aux_map_context *ctx, uint32_t size)
{
   assert(ctx->tail_remaining >= size);
   ctx->tail_remaining -= size;
   ctx->tail_offset += size;
}

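/* Check whether the tail buffer can hold `size` bytes at `align` alignment,
 * consuming any alignment padding from the tail if it can.
 */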
static bool
align_and_verify_space(struct gen_aux_map_context *ctx, uint32_t size,
                       uint32_t align)
{
   if (ctx->tail_remaining < size)
      return false;

   struct aux_map_buffer *tail =
      list_last_entry(&ctx->buffers, struct aux_map_buffer, link);
   uint64_t gpu = tail->buffer->gpu + ctx->tail_offset;
   uint64_t aligned = align64(gpu, align);

   if ((aligned - gpu) + size > ctx->tail_remaining) {
      return false;
   } else {
      if (aligned - gpu > 0)
         advance_current_pos(ctx, aligned - gpu);
      return true;
   }
}

static void
get_current_pos(struct gen_aux_map_context *ctx, uint64_t *gpu, uint64_t **map)
{
   assert(!list_is_empty(&ctx->buffers));
   struct aux_map_buffer *tail =
      list_last_entry(&ctx->buffers, struct aux_map_buffer, link);
   if (gpu)
      *gpu = tail->buffer->gpu + ctx->tail_offset;
   if (map)
      *map = (uint64_t*)((uint8_t*)tail->buffer->map + ctx->tail_offset);
}

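/* Carve a zero-filled, `size`-byte sub-table (an L2 or L1 table) out of the
 * tail buffer, allocating a fresh buffer if the current one lacks room.
 */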
static bool
add_sub_table(struct gen_aux_map_context *ctx, uint32_t size,
              uint32_t align, uint64_t *gpu, uint64_t **map)
{
   if (!align_and_verify_space(ctx, size, align)) {
      if (!add_buffer(ctx))
         return false;
      UNUSED bool aligned = align_and_verify_space(ctx, size, align);
      assert(aligned);
   }
   get_current_pos(ctx, gpu, map);
   memset(*map, 0, size);
   advance_current_pos(ctx, size);
   return true;
}

uint32_t
gen_aux_map_get_state_num(struct gen_aux_map_context *ctx)
{
   return p_atomic_read(&ctx->state_num);
}

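/*
 * A minimal usage sketch (illustrative only: `driver_ctx`, `buffer_alloc`,
 * `devinfo`, `surf`, `main_addr`, and `aux_addr` are hypothetical,
 * driver-supplied values):
 *
 *    struct gen_aux_map_context *ctx =
 *       gen_aux_map_init(driver_ctx, buffer_alloc, devinfo);
 *    if (ctx) {
 *       gen_aux_map_add_image(ctx, &surf, main_addr, aux_addr);
 *       ...
 *       gen_aux_map_unmap_range(ctx, main_addr, surf.size_B);
 *       gen_aux_map_finish(ctx);
 *    }
 */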
struct gen_aux_map_context *
gen_aux_map_init(void *driver_ctx,
                 struct gen_mapped_pinned_buffer_alloc *buffer_alloc,
                 const struct gen_device_info *devinfo)
{
   struct gen_aux_map_context *ctx;
   if (devinfo->gen < 12)
      return NULL;

   ctx = ralloc(NULL, struct gen_aux_map_context);
   if (!ctx)
      return NULL;

   if (pthread_mutex_init(&ctx->mutex, NULL)) {
      ralloc_free(ctx);
      return NULL;
   }

   ctx->driver_ctx = driver_ctx;
   ctx->buffer_alloc = buffer_alloc;
   ctx->num_buffers = 0;
   list_inithead(&ctx->buffers);
   ctx->tail_offset = 0;
   ctx->tail_remaining = 0;
   ctx->state_num = 0;

   if (add_sub_table(ctx, 32 * 1024, 32 * 1024, &ctx->level3_base_addr,
                     &ctx->level3_map)) {
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP L3: 0x%"PRIx64", map=%p\n",
                 ctx->level3_base_addr, ctx->level3_map);
      p_atomic_inc(&ctx->state_num);
      return ctx;
   } else {
      ralloc_free(ctx);
      return NULL;
   }
}

void
gen_aux_map_finish(struct gen_aux_map_context *ctx)
{
   if (!ctx)
      return;

   pthread_mutex_destroy(&ctx->mutex);
   list_for_each_entry_safe(struct aux_map_buffer, buf, &ctx->buffers, link) {
      ctx->buffer_alloc->free(ctx->driver_ctx, buf->buffer);
      list_del(&buf->link);
      p_atomic_dec(&ctx->num_buffers);
      ralloc_free(buf);
   }

   ralloc_free(ctx);
}

uint64_t
gen_aux_map_get_base(struct gen_aux_map_context *ctx)
{
   /**
    * This gets initialized in gen_aux_map_init, and never changes, so there
    * is no need to lock the mutex.
    */
   return ctx->level3_base_addr;
}

static struct aux_map_buffer *
find_buffer(struct gen_aux_map_context *ctx, uint64_t addr)
{
   list_for_each_entry(struct aux_map_buffer, buf, &ctx->buffers, link) {
      if (buf->buffer->gpu <= addr && buf->buffer->gpu_end > addr) {
         return buf;
      }
   }
   return NULL;
}

static uint64_t *
get_u64_entry_ptr(struct gen_aux_map_context *ctx, uint64_t addr)
{
   struct aux_map_buffer *buf = find_buffer(ctx, addr);
   assert(buf);
   uintptr_t map_offset = addr - buf->buffer->gpu;
   return (uint64_t*)((uint8_t*)buf->buffer->map + map_offset);
}

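/* Encode bits-per-pixel for the Depth field (bits 56:54) of an L1 entry. */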
static uint8_t
get_bpp_encoding(uint16_t bpp)
{
   switch (bpp) {
   case 16:  return 0;
   case 10:  return 1;
   case 12:  return 2;
   case 8:   return 4;
   case 32:  return 5;
   case 64:  return 6;
   case 128: return 7;
   default:
      unreachable("Unsupported bpp!");
      return 0;
   }
}

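/* TM (Tile-mode) field, bits 53:52 of an L1 entry: value 1 selects Tile-Y. */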
#define GEN_AUX_MAP_ENTRY_Y_TILED_BIT  (0x1ull << 52)

uint64_t
gen_aux_map_format_bits_for_isl_surf(const struct isl_surf *isl_surf)
{
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(isl_surf->format);

   uint16_t bpp = fmtl->bpb;
   assert(fmtl->bw == 1 && fmtl->bh == 1 && fmtl->bd == 1);
   if (aux_map_debug)
      fprintf(stderr, "AUX-MAP entry %s, bpp=%d\n",
              isl_format_get_name(isl_surf->format), bpp);

   assert(isl_tiling_is_any_y(isl_surf->tiling));

   uint64_t format_bits =
      ((uint64_t)isl_format_get_aux_map_encoding(isl_surf->format) << 58) |
      ((uint64_t)get_bpp_encoding(bpp) << 54) |
      GEN_AUX_MAP_ENTRY_Y_TILED_BIT;

   assert((format_bits & GEN_AUX_MAP_FORMAT_BITS_MASK) == format_bits);

   return format_bits;
}

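/* Walk the translation tables for `address`, building any missing L2/L1
 * sub-tables along the way, and return the location of its L1 entry. The
 * caller must hold ctx->mutex.
 */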
static void
get_aux_entry(struct gen_aux_map_context *ctx, uint64_t address,
              uint32_t *l1_index_out, uint64_t *l1_entry_addr_out,
              uint64_t **l1_entry_map_out)
{
   uint32_t l3_index = (address >> 36) & 0xfff;
   uint64_t *l3_entry = &ctx->level3_map[l3_index];

   uint64_t *l2_map;
   if ((*l3_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      uint64_t l2_gpu;
      if (add_sub_table(ctx, 32 * 1024, 32 * 1024, &l2_gpu, &l2_map)) {
         if (aux_map_debug)
            fprintf(stderr, "AUX-MAP L3[0x%x]: 0x%"PRIx64", map=%p\n",
                    l3_index, l2_gpu, l2_map);
      } else {
         unreachable("Failed to add L2 Aux-Map Page Table!");
      }
      *l3_entry = (l2_gpu & 0xffffffff8000ULL) | 1;
   } else {
      uint64_t l2_addr = gen_canonical_address(*l3_entry & ~0x7fffULL);
      l2_map = get_u64_entry_ptr(ctx, l2_addr);
   }
   uint32_t l2_index = (address >> 24) & 0xfff;
   uint64_t *l2_entry = &l2_map[l2_index];

   uint64_t l1_addr, *l1_map;
   if ((*l2_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      if (add_sub_table(ctx, 8 * 1024, 8 * 1024, &l1_addr, &l1_map)) {
         if (aux_map_debug)
            fprintf(stderr, "AUX-MAP L2[0x%x]: 0x%"PRIx64", map=%p\n",
                    l2_index, l1_addr, l1_map);
      } else {
         unreachable("Failed to add L1 Aux-Map Page Table!");
      }
      *l2_entry = (l1_addr & 0xffffffffe000ULL) | 1;
   } else {
      l1_addr = gen_canonical_address(*l2_entry & ~0x1fffULL);
      l1_map = get_u64_entry_ptr(ctx, l1_addr);
   }
   uint32_t l1_index = (address >> 16) & 0xff;
   if (l1_index_out)
      *l1_index_out = l1_index;
   if (l1_entry_addr_out)
      *l1_entry_addr_out = l1_addr + l1_index * sizeof(*l1_map);
   if (l1_entry_map_out)
      *l1_entry_map_out = &l1_map[l1_index];
}

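/* Record one main-page -> aux-page translation in the L1 table, setting
 * *state_changed if a previously filled entry is overwritten with different
 * contents. The caller must hold ctx->mutex.
 */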
static void
add_mapping(struct gen_aux_map_context *ctx, uint64_t address,
            uint64_t aux_address, uint64_t format_bits,
            bool *state_changed)
{
   if (aux_map_debug)
      fprintf(stderr, "AUX-MAP 0x%"PRIx64" => 0x%"PRIx64"\n", address,
              aux_address);

   uint32_t l1_index;
   uint64_t *l1_entry;
   get_aux_entry(ctx, address, &l1_index, NULL, &l1_entry);

   const uint64_t l1_data =
      (aux_address & GEN_AUX_MAP_ADDRESS_MASK) |
      format_bits |
      GEN_AUX_MAP_ENTRY_VALID_BIT;

   const uint64_t current_l1_data = *l1_entry;
   if ((current_l1_data & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      assert((aux_address & 0xffULL) == 0);
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP L1[0x%x] 0x%"PRIx64" -> 0x%"PRIx64"\n",
                 l1_index, current_l1_data, l1_data);
      /**
       * We use non-zero bits in 63:1 to indicate the entry had been filled
       * previously. If these bits are non-zero and they don't exactly match
       * what we want to program into the entry, then we must force the
       * aux-map tables to be flushed.
       */
      if (current_l1_data != 0 &&
          (current_l1_data | GEN_AUX_MAP_ENTRY_VALID_BIT) != l1_data)
         *state_changed = true;
      *l1_entry = l1_data;
   } else {
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP L1[0x%x] is already marked valid!\n",
                 l1_index);
      assert(*l1_entry == l1_data);
   }
}

uint64_t *
gen_aux_map_get_entry(struct gen_aux_map_context *ctx,
                      uint64_t address,
                      uint64_t *entry_address)
{
   pthread_mutex_lock(&ctx->mutex);
   uint64_t *l1_entry_map;
   get_aux_entry(ctx, address, NULL, entry_address, &l1_entry_map);
   pthread_mutex_unlock(&ctx->mutex);

   return l1_entry_map;
}

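/* Map `main_size_B` bytes of the main surface starting at `address` to the
 * aux data starting at `aux_address`, advancing one main page and one aux
 * page per iteration.
 */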
void
gen_aux_map_add_mapping(struct gen_aux_map_context *ctx, uint64_t address,
                        uint64_t aux_address, uint64_t main_size_B,
                        uint64_t format_bits)
{
   bool state_changed = false;
   pthread_mutex_lock(&ctx->mutex);
   uint64_t map_addr = address;
   uint64_t dest_aux_addr = aux_address;
   assert(align64(address, GEN_AUX_MAP_MAIN_PAGE_SIZE) == address);
   assert(align64(aux_address, GEN_AUX_MAP_AUX_PAGE_SIZE) == aux_address);
   while (map_addr - address < main_size_B) {
      add_mapping(ctx, map_addr, dest_aux_addr, format_bits, &state_changed);
      map_addr += GEN_AUX_MAP_MAIN_PAGE_SIZE;
      dest_aux_addr += GEN_AUX_MAP_AUX_PAGE_SIZE;
   }
   pthread_mutex_unlock(&ctx->mutex);
   if (state_changed)
      p_atomic_inc(&ctx->state_num);
}

void
gen_aux_map_add_image(struct gen_aux_map_context *ctx,
                      const struct isl_surf *isl_surf, uint64_t address,
                      uint64_t aux_address)
{
   gen_aux_map_add_mapping(ctx, address, aux_address, isl_surf->size_B,
                           gen_aux_map_format_bits_for_isl_surf(isl_surf));
}

/**
 * We mark the leaf entry as invalid, but we don't attempt to clean up the
 * other levels of translation mappings. Since we attempt to re-use VMA
 * ranges, hopefully this will not lead to unbounded growth of the
 * translation tables.
 */
static void
remove_mapping(struct gen_aux_map_context *ctx, uint64_t address,
               bool *state_changed)
{
   uint32_t l3_index = (address >> 36) & 0xfff;
   uint64_t *l3_entry = &ctx->level3_map[l3_index];

   uint64_t *l2_map;
   if ((*l3_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      return;
   } else {
      uint64_t l2_addr = gen_canonical_address(*l3_entry & ~0x7fffULL);
      l2_map = get_u64_entry_ptr(ctx, l2_addr);
   }
   uint32_t l2_index = (address >> 24) & 0xfff;
   uint64_t *l2_entry = &l2_map[l2_index];

   uint64_t *l1_map;
   if ((*l2_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      return;
   } else {
      uint64_t l1_addr = gen_canonical_address(*l2_entry & ~0x1fffULL);
      l1_map = get_u64_entry_ptr(ctx, l1_addr);
   }
   uint32_t l1_index = (address >> 16) & 0xff;
   uint64_t *l1_entry = &l1_map[l1_index];

   const uint64_t current_l1_data = *l1_entry;
   const uint64_t l1_data = current_l1_data & ~1ull;

   if ((current_l1_data & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      return;
   } else {
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP [0x%x][0x%x][0x%x] L1 entry removed!\n",
                 l3_index, l2_index, l1_index);
      /**
       * We use non-zero bits in 63:1 to indicate the entry had been filled
       * previously. In the unlikely event that these are all zero, we force
       * a flush of the aux-map tables.
       */
      if (unlikely(l1_data == 0))
         *state_changed = true;
      *l1_entry = l1_data;
   }
}

void
gen_aux_map_unmap_range(struct gen_aux_map_context *ctx, uint64_t address,
                        uint64_t size)
{
   bool state_changed = false;
   pthread_mutex_lock(&ctx->mutex);
   if (aux_map_debug)
      fprintf(stderr, "AUX-MAP remove 0x%"PRIx64"-0x%"PRIx64"\n", address,
              address + size);

   uint64_t map_addr = address;
   assert(align64(address, GEN_AUX_MAP_MAIN_PAGE_SIZE) == address);
   while (map_addr - address < size) {
      remove_mapping(ctx, map_addr, &state_changed);
      map_addr += GEN_AUX_MAP_MAIN_PAGE_SIZE;
   }
   pthread_mutex_unlock(&ctx->mutex);
   if (state_changed)
      p_atomic_inc(&ctx->state_num);
}

uint32_t
gen_aux_map_get_num_buffers(struct gen_aux_map_context *ctx)
{
   return p_atomic_read(&ctx->num_buffers);
}

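/* Fill `driver_bos` with up to `max_bos` of the driver buffer objects that
 * back the translation tables, in allocation order.
 */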
void
gen_aux_map_fill_bos(struct gen_aux_map_context *ctx, void **driver_bos,
                     uint32_t max_bos)
{
   assert(p_atomic_read(&ctx->num_buffers) >= max_bos);
   uint32_t i = 0;
   list_for_each_entry(struct aux_map_buffer, buf, &ctx->buffers, link) {
      if (i >= max_bos)
         return;
      driver_bos[i++] = buf->buffer->driver_bo;
   }
}